{ "best_metric": 0.45736926794052124, "best_model_checkpoint": "models/llama3.2-3b-dpo-vanilla-subset/checkpoint-10000", "epoch": 1.0, "eval_steps": 1000, "global_step": 13020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.680491551459293e-05, "grad_norm": 3.932099221609109, "learning_rate": 3.840245775729647e-10, "logits/chosen": -1.5, "logits/rejected": -0.765625, "logps/chosen": -73.0, "logps/rejected": -68.0, "loss": 1.3828, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0007680491551459293, "grad_norm": 4.687405090225381, "learning_rate": 3.840245775729646e-09, "logits/chosen": -1.2890625, "logits/rejected": -0.88671875, "logps/chosen": -116.0, "logps/rejected": -103.0, "loss": 1.3842, "rewards/accuracies": 0.2708333432674408, "rewards/chosen": -8.344650268554688e-05, "rewards/margins": 0.000225067138671875, "rewards/rejected": -0.0003070831298828125, "step": 10 }, { "epoch": 0.0015360983102918587, "grad_norm": 4.27265772041122, "learning_rate": 7.680491551459292e-09, "logits/chosen": -1.359375, "logits/rejected": -0.9609375, "logps/chosen": -120.0, "logps/rejected": -102.0, "loss": 1.3852, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 0.0003376007080078125, "rewards/margins": 0.000797271728515625, "rewards/rejected": -0.0004596710205078125, "step": 20 }, { "epoch": 0.002304147465437788, "grad_norm": 4.275212448798933, "learning_rate": 1.152073732718894e-08, "logits/chosen": -1.34375, "logits/rejected": -0.86328125, "logps/chosen": -121.5, "logps/rejected": -107.0, "loss": 1.3856, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": -0.00016689300537109375, "rewards/margins": -0.00061798095703125, "rewards/rejected": 0.000453948974609375, "step": 30 }, { "epoch": 0.0030721966205837174, "grad_norm": 3.8644071587570488, "learning_rate": 1.5360983102918585e-08, "logits/chosen": -1.3671875, "logits/rejected": -0.9375, "logps/chosen": -114.0, "logps/rejected": -111.5, "loss": 1.3852, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -2.9802322387695312e-05, "rewards/margins": 1.704692840576172e-05, "rewards/rejected": -4.839897155761719e-05, "step": 40 }, { "epoch": 0.0038402457757296467, "grad_norm": 4.001472479217466, "learning_rate": 1.9201228878648235e-08, "logits/chosen": -1.2890625, "logits/rejected": -0.85546875, "logps/chosen": -126.0, "logps/rejected": -118.0, "loss": 1.3857, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": 3.409385681152344e-05, "rewards/margins": -0.0001697540283203125, "rewards/rejected": 0.0002040863037109375, "step": 50 }, { "epoch": 0.004608294930875576, "grad_norm": 4.146418999663059, "learning_rate": 2.304147465437788e-08, "logits/chosen": -1.21875, "logits/rejected": -0.84765625, "logps/chosen": -120.5, "logps/rejected": -108.5, "loss": 1.3858, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": -0.00092315673828125, "rewards/margins": -0.00058746337890625, "rewards/rejected": -0.0003337860107421875, "step": 60 }, { "epoch": 0.005376344086021506, "grad_norm": 4.513423039092285, "learning_rate": 2.6881720430107527e-08, "logits/chosen": -1.3828125, "logits/rejected": -1.0078125, "logps/chosen": -132.0, "logps/rejected": -109.0, "loss": 1.3852, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 0.000164031982421875, "rewards/margins": 0.000797271728515625, "rewards/rejected": -0.00063323974609375, "step": 70 }, { "epoch": 0.006144393241167435, "grad_norm": 4.028666712604131, "learning_rate": 3.072196620583717e-08, "logits/chosen": -1.3046875, "logits/rejected": -0.93359375, "logps/chosen": -129.0, "logps/rejected": -102.5, "loss": 1.3852, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -6.103515625e-05, "rewards/margins": 4.839897155761719e-05, "rewards/rejected": -0.0001087188720703125, "step": 80 }, { "epoch": 0.0069124423963133645, "grad_norm": 4.1233372095788186, "learning_rate": 3.456221198156682e-08, "logits/chosen": -1.3515625, "logits/rejected": -0.96484375, "logps/chosen": -137.0, "logps/rejected": -120.5, "loss": 1.3849, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -1.952052116394043e-06, "rewards/margins": 0.0004291534423828125, "rewards/rejected": -0.0004291534423828125, "step": 90 }, { "epoch": 0.007680491551459293, "grad_norm": 4.018859443090581, "learning_rate": 3.840245775729647e-08, "logits/chosen": -1.375, "logits/rejected": -0.92578125, "logps/chosen": -111.0, "logps/rejected": -100.0, "loss": 1.385, "rewards/accuracies": 0.35624998807907104, "rewards/chosen": -0.000568389892578125, "rewards/margins": -0.0003681182861328125, "rewards/rejected": -0.0002002716064453125, "step": 100 }, { "epoch": 0.008448540706605223, "grad_norm": 4.082122662913559, "learning_rate": 4.224270353302611e-08, "logits/chosen": -1.3359375, "logits/rejected": -1.015625, "logps/chosen": -108.5, "logps/rejected": -103.0, "loss": 1.3847, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -0.00045013427734375, "rewards/margins": 9.202957153320312e-05, "rewards/rejected": -0.00054168701171875, "step": 110 }, { "epoch": 0.009216589861751152, "grad_norm": 4.882796318118811, "learning_rate": 4.608294930875576e-08, "logits/chosen": -1.3984375, "logits/rejected": -0.90234375, "logps/chosen": -133.0, "logps/rejected": -102.5, "loss": 1.3844, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.000682830810546875, "rewards/margins": 0.001922607421875, "rewards/rejected": -0.0012359619140625, "step": 120 }, { "epoch": 0.009984639016897081, "grad_norm": 4.332613018952837, "learning_rate": 4.9923195084485404e-08, "logits/chosen": -1.3359375, "logits/rejected": -0.9296875, "logps/chosen": -123.0, "logps/rejected": -100.5, "loss": 1.3843, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.000579833984375, "rewards/margins": -0.0001773834228515625, "rewards/rejected": -0.000400543212890625, "step": 130 }, { "epoch": 0.010752688172043012, "grad_norm": 4.201893902181227, "learning_rate": 5.3763440860215054e-08, "logits/chosen": -1.2109375, "logits/rejected": -0.7890625, "logps/chosen": -127.5, "logps/rejected": -93.0, "loss": 1.3841, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00075531005859375, "rewards/margins": 0.00162506103515625, "rewards/rejected": -0.0008697509765625, "step": 140 }, { "epoch": 0.01152073732718894, "grad_norm": 4.167444510566117, "learning_rate": 5.76036866359447e-08, "logits/chosen": -1.21875, "logits/rejected": -0.96875, "logps/chosen": -145.0, "logps/rejected": -107.0, "loss": 1.3839, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 5.793571472167969e-05, "rewards/margins": 0.00152587890625, "rewards/rejected": -0.00146484375, "step": 150 }, { "epoch": 0.01228878648233487, "grad_norm": 4.457481744826243, "learning_rate": 6.144393241167434e-08, "logits/chosen": -1.3359375, "logits/rejected": -1.0234375, "logps/chosen": -112.0, "logps/rejected": -98.5, "loss": 1.3832, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.00018787384033203125, "rewards/margins": 0.0022430419921875, "rewards/rejected": -0.0024261474609375, "step": 160 }, { "epoch": 0.013056835637480798, "grad_norm": 4.291800664907341, "learning_rate": 6.528417818740399e-08, "logits/chosen": -1.3203125, "logits/rejected": -0.9140625, "logps/chosen": -125.0, "logps/rejected": -119.0, "loss": 1.3825, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.00103759765625, "rewards/margins": 0.003326416015625, "rewards/rejected": -0.002288818359375, "step": 170 }, { "epoch": 0.013824884792626729, "grad_norm": 4.44737069976988, "learning_rate": 6.912442396313364e-08, "logits/chosen": -1.21875, "logits/rejected": -0.87890625, "logps/chosen": -126.0, "logps/rejected": -112.5, "loss": 1.3825, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0006256103515625, "rewards/margins": 0.004608154296875, "rewards/rejected": -0.00396728515625, "step": 180 }, { "epoch": 0.014592933947772658, "grad_norm": 4.5218302561335575, "learning_rate": 7.296466973886329e-08, "logits/chosen": -1.375, "logits/rejected": -0.91015625, "logps/chosen": -108.0, "logps/rejected": -103.0, "loss": 1.3818, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.00023174285888671875, "rewards/margins": 0.00372314453125, "rewards/rejected": -0.00396728515625, "step": 190 }, { "epoch": 0.015360983102918587, "grad_norm": 4.1426619833116956, "learning_rate": 7.680491551459294e-08, "logits/chosen": -1.359375, "logits/rejected": -1.0859375, "logps/chosen": -126.5, "logps/rejected": -111.0, "loss": 1.3808, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0002765655517578125, "rewards/margins": 0.0045166015625, "rewards/rejected": -0.004241943359375, "step": 200 }, { "epoch": 0.016129032258064516, "grad_norm": 4.560193477708672, "learning_rate": 8.064516129032257e-08, "logits/chosen": -1.421875, "logits/rejected": -0.96875, "logps/chosen": -116.5, "logps/rejected": -104.0, "loss": 1.3805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.696846008300781e-05, "rewards/margins": 0.005157470703125, "rewards/rejected": -0.005218505859375, "step": 210 }, { "epoch": 0.016897081413210446, "grad_norm": 4.140511737227676, "learning_rate": 8.448540706605222e-08, "logits/chosen": -1.40625, "logits/rejected": -0.98828125, "logps/chosen": -134.0, "logps/rejected": -127.0, "loss": 1.3792, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0003719329833984375, "rewards/margins": 0.00634765625, "rewards/rejected": -0.0067138671875, "step": 220 }, { "epoch": 0.017665130568356373, "grad_norm": 4.466917431271369, "learning_rate": 8.832565284178187e-08, "logits/chosen": -1.515625, "logits/rejected": -0.9765625, "logps/chosen": -134.0, "logps/rejected": -125.5, "loss": 1.3766, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.000469207763671875, "rewards/margins": 0.011962890625, "rewards/rejected": -0.01153564453125, "step": 230 }, { "epoch": 0.018433179723502304, "grad_norm": 4.34873429174038, "learning_rate": 9.216589861751152e-08, "logits/chosen": -1.2890625, "logits/rejected": -0.9765625, "logps/chosen": -137.0, "logps/rejected": -122.0, "loss": 1.3752, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00060272216796875, "rewards/margins": 0.012451171875, "rewards/rejected": -0.0118408203125, "step": 240 }, { "epoch": 0.019201228878648235, "grad_norm": 5.129099092933685, "learning_rate": 9.600614439324116e-08, "logits/chosen": -1.4140625, "logits/rejected": -1.1171875, "logps/chosen": -148.0, "logps/rejected": -113.0, "loss": 1.3737, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0007171630859375, "rewards/margins": 0.01611328125, "rewards/rejected": -0.016845703125, "step": 250 }, { "epoch": 0.019969278033794162, "grad_norm": 3.9605696924835305, "learning_rate": 9.984639016897081e-08, "logits/chosen": -1.375, "logits/rejected": -0.94921875, "logps/chosen": -118.0, "logps/rejected": -114.5, "loss": 1.3723, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00323486328125, "rewards/margins": 0.01361083984375, "rewards/rejected": -0.016845703125, "step": 260 }, { "epoch": 0.020737327188940093, "grad_norm": 4.601972767869401, "learning_rate": 1.0368663594470045e-07, "logits/chosen": -1.359375, "logits/rejected": -0.859375, "logps/chosen": -125.5, "logps/rejected": -102.5, "loss": 1.3691, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.00115203857421875, "rewards/margins": 0.0169677734375, "rewards/rejected": -0.01806640625, "step": 270 }, { "epoch": 0.021505376344086023, "grad_norm": 4.835118596000182, "learning_rate": 1.0752688172043011e-07, "logits/chosen": -1.515625, "logits/rejected": -1.140625, "logps/chosen": -125.0, "logps/rejected": -105.5, "loss": 1.3669, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0040283203125, "rewards/margins": 0.0172119140625, "rewards/rejected": -0.021240234375, "step": 280 }, { "epoch": 0.02227342549923195, "grad_norm": 4.128954992947858, "learning_rate": 1.1136712749615974e-07, "logits/chosen": -1.3828125, "logits/rejected": -1.109375, "logps/chosen": -124.5, "logps/rejected": -103.5, "loss": 1.3622, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.000720977783203125, "rewards/margins": 0.02099609375, "rewards/rejected": -0.021728515625, "step": 290 }, { "epoch": 0.02304147465437788, "grad_norm": 4.2764635649497675, "learning_rate": 1.152073732718894e-07, "logits/chosen": -1.28125, "logits/rejected": -1.1015625, "logps/chosen": -146.0, "logps/rejected": -129.0, "loss": 1.3574, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00836181640625, "rewards/margins": 0.0263671875, "rewards/rejected": -0.03466796875, "step": 300 }, { "epoch": 0.023809523809523808, "grad_norm": 4.442244602454941, "learning_rate": 1.1904761904761903e-07, "logits/chosen": -1.4140625, "logits/rejected": -1.0546875, "logps/chosen": -123.0, "logps/rejected": -105.0, "loss": 1.3507, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0059814453125, "rewards/margins": 0.03466796875, "rewards/rejected": -0.040771484375, "step": 310 }, { "epoch": 0.02457757296466974, "grad_norm": 4.346035026622215, "learning_rate": 1.2288786482334868e-07, "logits/chosen": -1.5, "logits/rejected": -1.0625, "logps/chosen": -131.0, "logps/rejected": -136.0, "loss": 1.3421, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.01007080078125, "rewards/margins": 0.05810546875, "rewards/rejected": -0.068359375, "step": 320 }, { "epoch": 0.02534562211981567, "grad_norm": 4.705856967968319, "learning_rate": 1.2672811059907834e-07, "logits/chosen": -1.4296875, "logits/rejected": -1.1171875, "logps/chosen": -122.5, "logps/rejected": -117.0, "loss": 1.341, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0228271484375, "rewards/margins": 0.038330078125, "rewards/rejected": -0.06103515625, "step": 330 }, { "epoch": 0.026113671274961597, "grad_norm": 4.924597220014136, "learning_rate": 1.3056835637480798e-07, "logits/chosen": -1.546875, "logits/rejected": -1.0625, "logps/chosen": -120.0, "logps/rejected": -110.0, "loss": 1.3287, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.0233154296875, "rewards/margins": 0.05322265625, "rewards/rejected": -0.07666015625, "step": 340 }, { "epoch": 0.026881720430107527, "grad_norm": 4.319711516806044, "learning_rate": 1.3440860215053762e-07, "logits/chosen": -1.546875, "logits/rejected": -1.21875, "logps/chosen": -125.0, "logps/rejected": -119.0, "loss": 1.3176, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.01611328125, "rewards/margins": 0.08349609375, "rewards/rejected": -0.099609375, "step": 350 }, { "epoch": 0.027649769585253458, "grad_norm": 4.784985805779567, "learning_rate": 1.3824884792626728e-07, "logits/chosen": -1.5546875, "logits/rejected": -1.2890625, "logps/chosen": -144.0, "logps/rejected": -130.0, "loss": 1.31, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.03857421875, "rewards/margins": 0.08251953125, "rewards/rejected": -0.12109375, "step": 360 }, { "epoch": 0.028417818740399385, "grad_norm": 5.667282006344584, "learning_rate": 1.4208909370199691e-07, "logits/chosen": -1.4453125, "logits/rejected": -1.15625, "logps/chosen": -135.0, "logps/rejected": -129.0, "loss": 1.3065, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07373046875, "rewards/margins": 0.07763671875, "rewards/rejected": -0.1513671875, "step": 370 }, { "epoch": 0.029185867895545316, "grad_norm": 4.835945068795032, "learning_rate": 1.4592933947772658e-07, "logits/chosen": -1.6171875, "logits/rejected": -1.1875, "logps/chosen": -140.0, "logps/rejected": -131.0, "loss": 1.2998, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0810546875, "rewards/margins": 0.08935546875, "rewards/rejected": -0.1708984375, "step": 380 }, { "epoch": 0.029953917050691243, "grad_norm": 4.8202142924051845, "learning_rate": 1.4976958525345621e-07, "logits/chosen": -1.3828125, "logits/rejected": -0.9921875, "logps/chosen": -126.5, "logps/rejected": -120.5, "loss": 1.2875, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.10400390625, "rewards/margins": 0.10107421875, "rewards/rejected": -0.205078125, "step": 390 }, { "epoch": 0.030721966205837174, "grad_norm": 5.329944205862767, "learning_rate": 1.5360983102918588e-07, "logits/chosen": -1.5703125, "logits/rejected": -1.0703125, "logps/chosen": -131.0, "logps/rejected": -139.0, "loss": 1.2741, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10986328125, "rewards/margins": 0.126953125, "rewards/rejected": -0.236328125, "step": 400 }, { "epoch": 0.0314900153609831, "grad_norm": 6.474743197730298, "learning_rate": 1.574500768049155e-07, "logits/chosen": -1.6953125, "logits/rejected": -1.25, "logps/chosen": -150.0, "logps/rejected": -162.0, "loss": 1.2774, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.091796875, "rewards/margins": 0.1279296875, "rewards/rejected": -0.2197265625, "step": 410 }, { "epoch": 0.03225806451612903, "grad_norm": 5.41823911870074, "learning_rate": 1.6129032258064515e-07, "logits/chosen": -1.5625, "logits/rejected": -1.140625, "logps/chosen": -138.0, "logps/rejected": -133.0, "loss": 1.2718, "rewards/accuracies": 0.65625, "rewards/chosen": -0.12890625, "rewards/margins": 0.1025390625, "rewards/rejected": -0.2314453125, "step": 420 }, { "epoch": 0.03302611367127496, "grad_norm": 6.176435729039058, "learning_rate": 1.6513056835637479e-07, "logits/chosen": -1.625, "logits/rejected": -1.28125, "logps/chosen": -137.0, "logps/rejected": -143.0, "loss": 1.2625, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.1201171875, "rewards/margins": 0.1337890625, "rewards/rejected": -0.25390625, "step": 430 }, { "epoch": 0.03379416282642089, "grad_norm": 6.086477887071314, "learning_rate": 1.6897081413210445e-07, "logits/chosen": -1.5546875, "logits/rejected": -1.1328125, "logps/chosen": -133.0, "logps/rejected": -133.0, "loss": 1.2557, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.150390625, "rewards/margins": 0.134765625, "rewards/rejected": -0.28515625, "step": 440 }, { "epoch": 0.03456221198156682, "grad_norm": 5.350435017307425, "learning_rate": 1.7281105990783408e-07, "logits/chosen": -1.75, "logits/rejected": -1.4140625, "logps/chosen": -147.0, "logps/rejected": -142.0, "loss": 1.2378, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.158203125, "rewards/margins": 0.12451171875, "rewards/rejected": -0.283203125, "step": 450 }, { "epoch": 0.03533026113671275, "grad_norm": 5.771732751634189, "learning_rate": 1.7665130568356375e-07, "logits/chosen": -1.6484375, "logits/rejected": -1.1640625, "logps/chosen": -138.0, "logps/rejected": -163.0, "loss": 1.2516, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.154296875, "rewards/margins": 0.17578125, "rewards/rejected": -0.330078125, "step": 460 }, { "epoch": 0.03609831029185868, "grad_norm": 10.457247681112838, "learning_rate": 1.8049155145929338e-07, "logits/chosen": -1.6640625, "logits/rejected": -1.171875, "logps/chosen": -136.0, "logps/rejected": -159.0, "loss": 1.2312, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1455078125, "rewards/margins": 0.177734375, "rewards/rejected": -0.32421875, "step": 470 }, { "epoch": 0.03686635944700461, "grad_norm": 6.6259821538911945, "learning_rate": 1.8433179723502305e-07, "logits/chosen": -1.6796875, "logits/rejected": -1.328125, "logps/chosen": -135.0, "logps/rejected": -147.0, "loss": 1.2384, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1484375, "rewards/margins": 0.173828125, "rewards/rejected": -0.322265625, "step": 480 }, { "epoch": 0.03763440860215054, "grad_norm": 8.190517954671888, "learning_rate": 1.8817204301075268e-07, "logits/chosen": -1.7109375, "logits/rejected": -1.375, "logps/chosen": -141.0, "logps/rejected": -143.0, "loss": 1.24, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.1796875, "rewards/margins": 0.1728515625, "rewards/rejected": -0.3515625, "step": 490 }, { "epoch": 0.03840245775729647, "grad_norm": 6.114892016379565, "learning_rate": 1.9201228878648232e-07, "logits/chosen": -1.6796875, "logits/rejected": -1.3359375, "logps/chosen": -152.0, "logps/rejected": -151.0, "loss": 1.2238, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.19140625, "rewards/margins": 0.1650390625, "rewards/rejected": -0.35546875, "step": 500 }, { "epoch": 0.03917050691244239, "grad_norm": 7.285118058028228, "learning_rate": 1.9585253456221198e-07, "logits/chosen": -1.6015625, "logits/rejected": -1.3046875, "logps/chosen": -132.0, "logps/rejected": -152.0, "loss": 1.239, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.169921875, "rewards/margins": 0.1376953125, "rewards/rejected": -0.306640625, "step": 510 }, { "epoch": 0.039938556067588324, "grad_norm": 7.498115138055291, "learning_rate": 1.9969278033794162e-07, "logits/chosen": -1.6953125, "logits/rejected": -1.3125, "logps/chosen": -124.0, "logps/rejected": -130.0, "loss": 1.2159, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.130859375, "rewards/margins": 0.208984375, "rewards/rejected": -0.33984375, "step": 520 }, { "epoch": 0.040706605222734255, "grad_norm": 7.38478451418313, "learning_rate": 2.0353302611367125e-07, "logits/chosen": -1.796875, "logits/rejected": -1.375, "logps/chosen": -144.0, "logps/rejected": -127.0, "loss": 1.2155, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2119140625, "rewards/margins": 0.166015625, "rewards/rejected": -0.376953125, "step": 530 }, { "epoch": 0.041474654377880185, "grad_norm": 8.136877346724, "learning_rate": 2.073732718894009e-07, "logits/chosen": -1.875, "logits/rejected": -1.484375, "logps/chosen": -148.0, "logps/rejected": -146.0, "loss": 1.2161, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.189453125, "rewards/margins": 0.2265625, "rewards/rejected": -0.416015625, "step": 540 }, { "epoch": 0.042242703533026116, "grad_norm": 6.757572155207074, "learning_rate": 2.1121351766513058e-07, "logits/chosen": -1.7578125, "logits/rejected": -1.515625, "logps/chosen": -149.0, "logps/rejected": -153.0, "loss": 1.2205, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1904296875, "rewards/margins": 0.1982421875, "rewards/rejected": -0.388671875, "step": 550 }, { "epoch": 0.043010752688172046, "grad_norm": 7.5049904392736995, "learning_rate": 2.1505376344086022e-07, "logits/chosen": -1.84375, "logits/rejected": -1.4375, "logps/chosen": -143.0, "logps/rejected": -154.0, "loss": 1.2011, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.18359375, "rewards/margins": 0.2041015625, "rewards/rejected": -0.388671875, "step": 560 }, { "epoch": 0.04377880184331797, "grad_norm": 7.342660961421785, "learning_rate": 2.1889400921658985e-07, "logits/chosen": -1.75, "logits/rejected": -1.375, "logps/chosen": -145.0, "logps/rejected": -158.0, "loss": 1.2031, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2294921875, "rewards/margins": 0.2041015625, "rewards/rejected": -0.43359375, "step": 570 }, { "epoch": 0.0445468509984639, "grad_norm": 6.96327534778557, "learning_rate": 2.227342549923195e-07, "logits/chosen": -1.8828125, "logits/rejected": -1.3984375, "logps/chosen": -152.0, "logps/rejected": -158.0, "loss": 1.1766, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2314453125, "rewards/margins": 0.2734375, "rewards/rejected": -0.50390625, "step": 580 }, { "epoch": 0.04531490015360983, "grad_norm": 8.239082811287949, "learning_rate": 2.2657450076804915e-07, "logits/chosen": -1.9140625, "logits/rejected": -1.453125, "logps/chosen": -165.0, "logps/rejected": -167.0, "loss": 1.1668, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2353515625, "rewards/margins": 0.248046875, "rewards/rejected": -0.482421875, "step": 590 }, { "epoch": 0.04608294930875576, "grad_norm": 10.09124352570273, "learning_rate": 2.304147465437788e-07, "logits/chosen": -1.90625, "logits/rejected": -1.53125, "logps/chosen": -154.0, "logps/rejected": -149.0, "loss": 1.1828, "rewards/accuracies": 0.71875, "rewards/chosen": -0.248046875, "rewards/margins": 0.255859375, "rewards/rejected": -0.50390625, "step": 600 }, { "epoch": 0.04685099846390169, "grad_norm": 8.225402504286484, "learning_rate": 2.3425499231950842e-07, "logits/chosen": -1.8515625, "logits/rejected": -1.5, "logps/chosen": -138.0, "logps/rejected": -157.0, "loss": 1.1805, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23046875, "rewards/margins": 0.244140625, "rewards/rejected": -0.474609375, "step": 610 }, { "epoch": 0.047619047619047616, "grad_norm": 7.852096147334914, "learning_rate": 2.3809523809523806e-07, "logits/chosen": -1.90625, "logits/rejected": -1.625, "logps/chosen": -162.0, "logps/rejected": -164.0, "loss": 1.1746, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.20703125, "rewards/margins": 0.296875, "rewards/rejected": -0.50390625, "step": 620 }, { "epoch": 0.04838709677419355, "grad_norm": 8.828091659556467, "learning_rate": 2.4193548387096775e-07, "logits/chosen": -1.8671875, "logits/rejected": -1.4609375, "logps/chosen": -150.0, "logps/rejected": -178.0, "loss": 1.1813, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.25, "rewards/margins": 0.294921875, "rewards/rejected": -0.54296875, "step": 630 }, { "epoch": 0.04915514592933948, "grad_norm": 13.002816242908029, "learning_rate": 2.4577572964669736e-07, "logits/chosen": -2.0, "logits/rejected": -1.6875, "logps/chosen": -178.0, "logps/rejected": -183.0, "loss": 1.169, "rewards/accuracies": 0.75, "rewards/chosen": -0.294921875, "rewards/margins": 0.2578125, "rewards/rejected": -0.5546875, "step": 640 }, { "epoch": 0.04992319508448541, "grad_norm": 9.097655952299805, "learning_rate": 2.49615975422427e-07, "logits/chosen": -1.9609375, "logits/rejected": -1.71875, "logps/chosen": -166.0, "logps/rejected": -161.0, "loss": 1.1504, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2255859375, "rewards/margins": 0.267578125, "rewards/rejected": -0.494140625, "step": 650 }, { "epoch": 0.05069124423963134, "grad_norm": 12.423527617063451, "learning_rate": 2.534562211981567e-07, "logits/chosen": -1.953125, "logits/rejected": -1.4921875, "logps/chosen": -143.0, "logps/rejected": -188.0, "loss": 1.1419, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.216796875, "rewards/margins": 0.373046875, "rewards/rejected": -0.58984375, "step": 660 }, { "epoch": 0.05145929339477726, "grad_norm": 9.209210921630307, "learning_rate": 2.572964669738863e-07, "logits/chosen": -2.046875, "logits/rejected": -1.6484375, "logps/chosen": -168.0, "logps/rejected": -171.0, "loss": 1.156, "rewards/accuracies": 0.75, "rewards/chosen": -0.2490234375, "rewards/margins": 0.30859375, "rewards/rejected": -0.55859375, "step": 670 }, { "epoch": 0.05222734254992319, "grad_norm": 12.795432701918582, "learning_rate": 2.6113671274961596e-07, "logits/chosen": -1.953125, "logits/rejected": -1.578125, "logps/chosen": -133.0, "logps/rejected": -175.0, "loss": 1.1515, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.298828125, "rewards/margins": 0.34765625, "rewards/rejected": -0.6484375, "step": 680 }, { "epoch": 0.052995391705069124, "grad_norm": 7.932836299383362, "learning_rate": 2.649769585253456e-07, "logits/chosen": -2.015625, "logits/rejected": -1.7578125, "logps/chosen": -135.0, "logps/rejected": -163.0, "loss": 1.1396, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1953125, "rewards/margins": 0.314453125, "rewards/rejected": -0.5078125, "step": 690 }, { "epoch": 0.053763440860215055, "grad_norm": 9.622117835457317, "learning_rate": 2.6881720430107523e-07, "logits/chosen": -2.140625, "logits/rejected": -1.828125, "logps/chosen": -137.0, "logps/rejected": -167.0, "loss": 1.1495, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.224609375, "rewards/margins": 0.306640625, "rewards/rejected": -0.53125, "step": 700 }, { "epoch": 0.054531490015360985, "grad_norm": 8.327633646933254, "learning_rate": 2.726574500768049e-07, "logits/chosen": -2.171875, "logits/rejected": -1.8828125, "logps/chosen": -157.0, "logps/rejected": -173.0, "loss": 1.1367, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.32421875, "rewards/margins": 0.283203125, "rewards/rejected": -0.60546875, "step": 710 }, { "epoch": 0.055299539170506916, "grad_norm": 10.969226867744595, "learning_rate": 2.7649769585253456e-07, "logits/chosen": -2.046875, "logits/rejected": -1.765625, "logps/chosen": -155.0, "logps/rejected": -171.0, "loss": 1.1432, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.353515625, "rewards/margins": 0.287109375, "rewards/rejected": -0.640625, "step": 720 }, { "epoch": 0.05606758832565284, "grad_norm": 9.582368713157475, "learning_rate": 2.8033794162826417e-07, "logits/chosen": -2.203125, "logits/rejected": -1.8203125, "logps/chosen": -160.0, "logps/rejected": -210.0, "loss": 1.1286, "rewards/accuracies": 0.75, "rewards/chosen": -0.283203125, "rewards/margins": 0.369140625, "rewards/rejected": -0.65234375, "step": 730 }, { "epoch": 0.05683563748079877, "grad_norm": 9.620270097485895, "learning_rate": 2.8417818740399383e-07, "logits/chosen": -2.21875, "logits/rejected": -1.8984375, "logps/chosen": -147.0, "logps/rejected": -172.0, "loss": 1.1312, "rewards/accuracies": 0.75, "rewards/chosen": -0.271484375, "rewards/margins": 0.3359375, "rewards/rejected": -0.60546875, "step": 740 }, { "epoch": 0.0576036866359447, "grad_norm": 10.6851016308631, "learning_rate": 2.8801843317972344e-07, "logits/chosen": -2.265625, "logits/rejected": -1.8515625, "logps/chosen": -171.0, "logps/rejected": -199.0, "loss": 1.1451, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.349609375, "rewards/margins": 0.392578125, "rewards/rejected": -0.7421875, "step": 750 }, { "epoch": 0.05837173579109063, "grad_norm": 15.024732666792339, "learning_rate": 2.9185867895545315e-07, "logits/chosen": -2.015625, "logits/rejected": -1.8671875, "logps/chosen": -153.0, "logps/rejected": -190.0, "loss": 1.1271, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3515625, "rewards/margins": 0.345703125, "rewards/rejected": -0.69921875, "step": 760 }, { "epoch": 0.05913978494623656, "grad_norm": 19.571724077897155, "learning_rate": 2.956989247311828e-07, "logits/chosen": -2.171875, "logits/rejected": -1.828125, "logps/chosen": -157.0, "logps/rejected": -174.0, "loss": 1.096, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3515625, "rewards/margins": 0.33203125, "rewards/rejected": -0.68359375, "step": 770 }, { "epoch": 0.059907834101382486, "grad_norm": 10.15893283140202, "learning_rate": 2.9953917050691243e-07, "logits/chosen": -2.234375, "logits/rejected": -1.953125, "logps/chosen": -173.0, "logps/rejected": -198.0, "loss": 1.1128, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.287109375, "rewards/margins": 0.41015625, "rewards/rejected": -0.69921875, "step": 780 }, { "epoch": 0.060675883256528416, "grad_norm": 11.242795867218142, "learning_rate": 3.033794162826421e-07, "logits/chosen": -2.265625, "logits/rejected": -1.9296875, "logps/chosen": -157.0, "logps/rejected": -170.0, "loss": 1.1179, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.369140625, "rewards/margins": 0.33203125, "rewards/rejected": -0.69921875, "step": 790 }, { "epoch": 0.06144393241167435, "grad_norm": 11.082551485716726, "learning_rate": 3.0721966205837175e-07, "logits/chosen": -2.328125, "logits/rejected": -2.0, "logps/chosen": -166.0, "logps/rejected": -181.0, "loss": 1.0973, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.345703125, "rewards/margins": 0.380859375, "rewards/rejected": -0.7265625, "step": 800 }, { "epoch": 0.06221198156682028, "grad_norm": 9.871343148600326, "learning_rate": 3.1105990783410136e-07, "logits/chosen": -2.171875, "logits/rejected": -1.9765625, "logps/chosen": -177.0, "logps/rejected": -198.0, "loss": 1.1123, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.42578125, "rewards/margins": 0.341796875, "rewards/rejected": -0.76953125, "step": 810 }, { "epoch": 0.0629800307219662, "grad_norm": 15.371923972385046, "learning_rate": 3.14900153609831e-07, "logits/chosen": -2.3125, "logits/rejected": -1.9296875, "logps/chosen": -148.0, "logps/rejected": -179.0, "loss": 1.1217, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.369140625, "rewards/margins": 0.416015625, "rewards/rejected": -0.78515625, "step": 820 }, { "epoch": 0.06374807987711213, "grad_norm": 11.97586086733716, "learning_rate": 3.187403993855607e-07, "logits/chosen": -2.4375, "logits/rejected": -2.125, "logps/chosen": -187.0, "logps/rejected": -238.0, "loss": 1.103, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.443359375, "rewards/margins": 0.5234375, "rewards/rejected": -0.96484375, "step": 830 }, { "epoch": 0.06451612903225806, "grad_norm": 12.627156460815135, "learning_rate": 3.225806451612903e-07, "logits/chosen": -2.375, "logits/rejected": -2.0, "logps/chosen": -152.0, "logps/rejected": -191.0, "loss": 1.0984, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.4140625, "rewards/margins": 0.4765625, "rewards/rejected": -0.890625, "step": 840 }, { "epoch": 0.065284178187404, "grad_norm": 12.885880083551825, "learning_rate": 3.2642089093701996e-07, "logits/chosen": -2.28125, "logits/rejected": -2.125, "logps/chosen": -173.0, "logps/rejected": -182.0, "loss": 1.0992, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4140625, "rewards/margins": 0.33984375, "rewards/rejected": -0.75390625, "step": 850 }, { "epoch": 0.06605222734254992, "grad_norm": 15.279956469867352, "learning_rate": 3.3026113671274957e-07, "logits/chosen": -2.328125, "logits/rejected": -2.078125, "logps/chosen": -158.0, "logps/rejected": -207.0, "loss": 1.076, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.41796875, "rewards/margins": 0.470703125, "rewards/rejected": -0.890625, "step": 860 }, { "epoch": 0.06682027649769585, "grad_norm": 11.784651610599273, "learning_rate": 3.3410138248847923e-07, "logits/chosen": -2.328125, "logits/rejected": -2.046875, "logps/chosen": -182.0, "logps/rejected": -215.0, "loss": 1.0838, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.4453125, "rewards/margins": 0.515625, "rewards/rejected": -0.96484375, "step": 870 }, { "epoch": 0.06758832565284179, "grad_norm": 15.059500034971535, "learning_rate": 3.379416282642089e-07, "logits/chosen": -2.484375, "logits/rejected": -2.265625, "logps/chosen": -158.0, "logps/rejected": -218.0, "loss": 1.0683, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.484375, "rewards/margins": 0.435546875, "rewards/rejected": -0.91796875, "step": 880 }, { "epoch": 0.06835637480798772, "grad_norm": 10.960866456115573, "learning_rate": 3.417818740399385e-07, "logits/chosen": -2.4375, "logits/rejected": -2.203125, "logps/chosen": -180.0, "logps/rejected": -202.0, "loss": 1.0321, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.48046875, "rewards/margins": 0.5, "rewards/rejected": -0.98046875, "step": 890 }, { "epoch": 0.06912442396313365, "grad_norm": 11.876124660014703, "learning_rate": 3.4562211981566817e-07, "logits/chosen": -2.4375, "logits/rejected": -2.296875, "logps/chosen": -157.0, "logps/rejected": -181.0, "loss": 1.0356, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.498046875, "rewards/margins": 0.41015625, "rewards/rejected": -0.91015625, "step": 900 }, { "epoch": 0.06989247311827956, "grad_norm": 13.010931361364886, "learning_rate": 3.4946236559139783e-07, "logits/chosen": -2.5, "logits/rejected": -2.234375, "logps/chosen": -175.0, "logps/rejected": -199.0, "loss": 1.0559, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.490234375, "rewards/margins": 0.47265625, "rewards/rejected": -0.9609375, "step": 910 }, { "epoch": 0.0706605222734255, "grad_norm": 12.28103482323994, "learning_rate": 3.533026113671275e-07, "logits/chosen": -2.59375, "logits/rejected": -2.390625, "logps/chosen": -197.0, "logps/rejected": -232.0, "loss": 1.0292, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.498046875, "rewards/margins": 0.6015625, "rewards/rejected": -1.1015625, "step": 920 }, { "epoch": 0.07142857142857142, "grad_norm": 11.879658337994098, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -2.609375, "logits/rejected": -2.328125, "logps/chosen": -149.0, "logps/rejected": -206.0, "loss": 1.0414, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.50390625, "rewards/margins": 0.5703125, "rewards/rejected": -1.0703125, "step": 930 }, { "epoch": 0.07219662058371736, "grad_norm": 13.306668831870702, "learning_rate": 3.6098310291858677e-07, "logits/chosen": -2.390625, "logits/rejected": -2.109375, "logps/chosen": -177.0, "logps/rejected": -221.0, "loss": 1.0264, "rewards/accuracies": 0.75, "rewards/chosen": -0.52734375, "rewards/margins": 0.54296875, "rewards/rejected": -1.0703125, "step": 940 }, { "epoch": 0.07296466973886329, "grad_norm": 11.963330511524836, "learning_rate": 3.6482334869431643e-07, "logits/chosen": -2.640625, "logits/rejected": -2.34375, "logps/chosen": -177.0, "logps/rejected": -235.0, "loss": 0.9925, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.478515625, "rewards/margins": 0.66015625, "rewards/rejected": -1.140625, "step": 950 }, { "epoch": 0.07373271889400922, "grad_norm": 10.794609707041102, "learning_rate": 3.686635944700461e-07, "logits/chosen": -2.65625, "logits/rejected": -2.5, "logps/chosen": -189.0, "logps/rejected": -219.0, "loss": 1.0469, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5390625, "rewards/margins": 0.54296875, "rewards/rejected": -1.078125, "step": 960 }, { "epoch": 0.07450076804915515, "grad_norm": 12.046600200195309, "learning_rate": 3.725038402457757e-07, "logits/chosen": -2.640625, "logits/rejected": -2.53125, "logps/chosen": -200.0, "logps/rejected": -228.0, "loss": 1.0407, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6796875, "rewards/margins": 0.423828125, "rewards/rejected": -1.1015625, "step": 970 }, { "epoch": 0.07526881720430108, "grad_norm": 11.511215178445752, "learning_rate": 3.7634408602150537e-07, "logits/chosen": -2.640625, "logits/rejected": -2.390625, "logps/chosen": -195.0, "logps/rejected": -225.0, "loss": 1.023, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.61328125, "rewards/margins": 0.53125, "rewards/rejected": -1.1484375, "step": 980 }, { "epoch": 0.07603686635944701, "grad_norm": 12.194892471660435, "learning_rate": 3.8018433179723503e-07, "logits/chosen": -2.671875, "logits/rejected": -2.4375, "logps/chosen": -183.0, "logps/rejected": -232.0, "loss": 1.0334, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.51953125, "rewards/margins": 0.6875, "rewards/rejected": -1.203125, "step": 990 }, { "epoch": 0.07680491551459294, "grad_norm": 9.896487777671018, "learning_rate": 3.8402457757296464e-07, "logits/chosen": -2.625, "logits/rejected": -2.46875, "logps/chosen": -183.0, "logps/rejected": -230.0, "loss": 1.0346, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.451171875, "rewards/margins": 0.63671875, "rewards/rejected": -1.0859375, "step": 1000 }, { "epoch": 0.07680491551459294, "eval_logits/chosen": -2.59375, "eval_logits/rejected": -2.453125, "eval_logps/chosen": -202.0, "eval_logps/rejected": -228.0, "eval_loss": 0.5346714854240417, "eval_rewards/accuracies": 0.7289663553237915, "eval_rewards/chosen": -0.61328125, "eval_rewards/margins": 0.53125, "eval_rewards/rejected": -1.140625, "eval_runtime": 2263.7831, "eval_samples_per_second": 41.141, "eval_steps_per_second": 0.643, "step": 1000 }, { "epoch": 0.07757296466973887, "grad_norm": 10.288747535885566, "learning_rate": 3.878648233486943e-07, "logits/chosen": -2.609375, "logits/rejected": -2.40625, "logps/chosen": -181.0, "logps/rejected": -243.0, "loss": 1.0245, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.65234375, "rewards/margins": 0.64453125, "rewards/rejected": -1.296875, "step": 1010 }, { "epoch": 0.07834101382488479, "grad_norm": 10.731394234095653, "learning_rate": 3.9170506912442396e-07, "logits/chosen": -2.609375, "logits/rejected": -2.328125, "logps/chosen": -182.0, "logps/rejected": -231.0, "loss": 1.0266, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.66796875, "rewards/margins": 0.56640625, "rewards/rejected": -1.234375, "step": 1020 }, { "epoch": 0.07910906298003072, "grad_norm": 9.176632407351965, "learning_rate": 3.9554531490015357e-07, "logits/chosen": -2.578125, "logits/rejected": -2.3125, "logps/chosen": -191.0, "logps/rejected": -214.0, "loss": 1.0253, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6171875, "rewards/margins": 0.349609375, "rewards/rejected": -0.96484375, "step": 1030 }, { "epoch": 0.07987711213517665, "grad_norm": 8.470269640738731, "learning_rate": 3.9938556067588324e-07, "logits/chosen": -2.65625, "logits/rejected": -2.4375, "logps/chosen": -173.0, "logps/rejected": -222.0, "loss": 1.0305, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5546875, "rewards/margins": 0.61328125, "rewards/rejected": -1.1640625, "step": 1040 }, { "epoch": 0.08064516129032258, "grad_norm": 10.438588222664185, "learning_rate": 4.0322580645161285e-07, "logits/chosen": -2.625, "logits/rejected": -2.5, "logps/chosen": -200.0, "logps/rejected": -224.0, "loss": 1.0228, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6015625, "rewards/margins": 0.58984375, "rewards/rejected": -1.1953125, "step": 1050 }, { "epoch": 0.08141321044546851, "grad_norm": 9.398555757696725, "learning_rate": 4.070660522273425e-07, "logits/chosen": -2.609375, "logits/rejected": -2.328125, "logps/chosen": -170.0, "logps/rejected": -220.0, "loss": 1.0043, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.59375, "rewards/margins": 0.625, "rewards/rejected": -1.21875, "step": 1060 }, { "epoch": 0.08218125960061444, "grad_norm": 10.43356889416376, "learning_rate": 4.1090629800307217e-07, "logits/chosen": -2.625, "logits/rejected": -2.46875, "logps/chosen": -174.0, "logps/rejected": -231.0, "loss": 1.014, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.498046875, "rewards/margins": 0.671875, "rewards/rejected": -1.171875, "step": 1070 }, { "epoch": 0.08294930875576037, "grad_norm": 11.771244582891434, "learning_rate": 4.147465437788018e-07, "logits/chosen": -2.546875, "logits/rejected": -2.375, "logps/chosen": -196.0, "logps/rejected": -240.0, "loss": 1.0285, "rewards/accuracies": 0.8125, "rewards/chosen": -0.58203125, "rewards/margins": 0.66796875, "rewards/rejected": -1.25, "step": 1080 }, { "epoch": 0.0837173579109063, "grad_norm": 11.439168916024123, "learning_rate": 4.185867895545315e-07, "logits/chosen": -2.65625, "logits/rejected": -2.53125, "logps/chosen": -169.0, "logps/rejected": -252.0, "loss": 1.0004, "rewards/accuracies": 0.78125, "rewards/chosen": -0.50390625, "rewards/margins": 0.75, "rewards/rejected": -1.2578125, "step": 1090 }, { "epoch": 0.08448540706605223, "grad_norm": 10.844756876718325, "learning_rate": 4.2242703533026116e-07, "logits/chosen": -2.65625, "logits/rejected": -2.5, "logps/chosen": -189.0, "logps/rejected": -237.0, "loss": 1.0153, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6640625, "rewards/margins": 0.69921875, "rewards/rejected": -1.3671875, "step": 1100 }, { "epoch": 0.08525345622119816, "grad_norm": 10.821683374334839, "learning_rate": 4.2626728110599077e-07, "logits/chosen": -2.703125, "logits/rejected": -2.578125, "logps/chosen": -178.0, "logps/rejected": -246.0, "loss": 1.0254, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.609375, "rewards/margins": 0.65234375, "rewards/rejected": -1.265625, "step": 1110 }, { "epoch": 0.08602150537634409, "grad_norm": 9.512336028794486, "learning_rate": 4.3010752688172043e-07, "logits/chosen": -2.640625, "logits/rejected": -2.53125, "logps/chosen": -193.0, "logps/rejected": -260.0, "loss": 0.9928, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5859375, "rewards/margins": 0.78515625, "rewards/rejected": -1.375, "step": 1120 }, { "epoch": 0.08678955453149001, "grad_norm": 10.275430090478245, "learning_rate": 4.339477726574501e-07, "logits/chosen": -2.671875, "logits/rejected": -2.546875, "logps/chosen": -211.0, "logps/rejected": -258.0, "loss": 1.0115, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7109375, "rewards/margins": 0.6328125, "rewards/rejected": -1.34375, "step": 1130 }, { "epoch": 0.08755760368663594, "grad_norm": 10.245196448183021, "learning_rate": 4.377880184331797e-07, "logits/chosen": -2.640625, "logits/rejected": -2.515625, "logps/chosen": -195.0, "logps/rejected": -244.0, "loss": 0.9969, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.52734375, "rewards/margins": 0.703125, "rewards/rejected": -1.234375, "step": 1140 }, { "epoch": 0.08832565284178187, "grad_norm": 11.646417580255404, "learning_rate": 4.4162826420890937e-07, "logits/chosen": -2.71875, "logits/rejected": -2.5, "logps/chosen": -185.0, "logps/rejected": -234.0, "loss": 1.0169, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.55078125, "rewards/margins": 0.6953125, "rewards/rejected": -1.25, "step": 1150 }, { "epoch": 0.0890937019969278, "grad_norm": 9.215640590892086, "learning_rate": 4.45468509984639e-07, "logits/chosen": -2.703125, "logits/rejected": -2.546875, "logps/chosen": -178.0, "logps/rejected": -243.0, "loss": 0.9711, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.578125, "rewards/margins": 0.6796875, "rewards/rejected": -1.2578125, "step": 1160 }, { "epoch": 0.08986175115207373, "grad_norm": 10.966887243583152, "learning_rate": 4.4930875576036864e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -195.0, "logps/rejected": -262.0, "loss": 0.9722, "rewards/accuracies": 0.75, "rewards/chosen": -0.67578125, "rewards/margins": 0.765625, "rewards/rejected": -1.4453125, "step": 1170 }, { "epoch": 0.09062980030721966, "grad_norm": 11.926561434880405, "learning_rate": 4.531490015360983e-07, "logits/chosen": -2.671875, "logits/rejected": -2.65625, "logps/chosen": -183.0, "logps/rejected": -236.0, "loss": 1.0007, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.59765625, "rewards/margins": 0.625, "rewards/rejected": -1.21875, "step": 1180 }, { "epoch": 0.0913978494623656, "grad_norm": 12.002669864097847, "learning_rate": 4.569892473118279e-07, "logits/chosen": -2.734375, "logits/rejected": -2.5, "logps/chosen": -193.0, "logps/rejected": -247.0, "loss": 0.9559, "rewards/accuracies": 0.8125, "rewards/chosen": -0.59765625, "rewards/margins": 0.78515625, "rewards/rejected": -1.3828125, "step": 1190 }, { "epoch": 0.09216589861751152, "grad_norm": 12.0902258207222, "learning_rate": 4.608294930875576e-07, "logits/chosen": -2.734375, "logits/rejected": -2.515625, "logps/chosen": -195.0, "logps/rejected": -254.0, "loss": 0.9747, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.671875, "rewards/margins": 0.796875, "rewards/rejected": -1.46875, "step": 1200 }, { "epoch": 0.09293394777265745, "grad_norm": 11.826790655084196, "learning_rate": 4.6466973886328724e-07, "logits/chosen": -2.78125, "logits/rejected": -2.640625, "logps/chosen": -225.0, "logps/rejected": -284.0, "loss": 0.9812, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.76953125, "rewards/margins": 0.74609375, "rewards/rejected": -1.515625, "step": 1210 }, { "epoch": 0.09370199692780339, "grad_norm": 12.80638344348903, "learning_rate": 4.6850998463901685e-07, "logits/chosen": -2.671875, "logits/rejected": -2.515625, "logps/chosen": -188.0, "logps/rejected": -258.0, "loss": 0.9787, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.546875, "rewards/margins": 0.79296875, "rewards/rejected": -1.34375, "step": 1220 }, { "epoch": 0.0944700460829493, "grad_norm": 14.708069370226704, "learning_rate": 4.723502304147465e-07, "logits/chosen": -2.796875, "logits/rejected": -2.59375, "logps/chosen": -199.0, "logps/rejected": -272.0, "loss": 1.0, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.78125, "rewards/margins": 0.7578125, "rewards/rejected": -1.5390625, "step": 1230 }, { "epoch": 0.09523809523809523, "grad_norm": 11.577408624430776, "learning_rate": 4.761904761904761e-07, "logits/chosen": -2.78125, "logits/rejected": -2.703125, "logps/chosen": -189.0, "logps/rejected": -278.0, "loss": 0.9656, "rewards/accuracies": 0.78125, "rewards/chosen": -0.76953125, "rewards/margins": 0.80078125, "rewards/rejected": -1.5703125, "step": 1240 }, { "epoch": 0.09600614439324116, "grad_norm": 12.660291136339154, "learning_rate": 4.800307219662058e-07, "logits/chosen": -2.78125, "logits/rejected": -2.53125, "logps/chosen": -191.0, "logps/rejected": -260.0, "loss": 0.9847, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.6875, "rewards/margins": 0.70703125, "rewards/rejected": -1.390625, "step": 1250 }, { "epoch": 0.0967741935483871, "grad_norm": 9.78436186448012, "learning_rate": 4.838709677419355e-07, "logits/chosen": -2.8125, "logits/rejected": -2.640625, "logps/chosen": -229.0, "logps/rejected": -272.0, "loss": 0.9778, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.77734375, "rewards/margins": 0.6875, "rewards/rejected": -1.4609375, "step": 1260 }, { "epoch": 0.09754224270353302, "grad_norm": 10.007316809322592, "learning_rate": 4.877112135176651e-07, "logits/chosen": -2.78125, "logits/rejected": -2.671875, "logps/chosen": -212.0, "logps/rejected": -254.0, "loss": 0.999, "rewards/accuracies": 0.75, "rewards/chosen": -0.8359375, "rewards/margins": 0.6484375, "rewards/rejected": -1.484375, "step": 1270 }, { "epoch": 0.09831029185867896, "grad_norm": 11.227227732703716, "learning_rate": 4.915514592933947e-07, "logits/chosen": -2.625, "logits/rejected": -2.40625, "logps/chosen": -193.0, "logps/rejected": -254.0, "loss": 0.9958, "rewards/accuracies": 0.78125, "rewards/chosen": -0.80859375, "rewards/margins": 0.61328125, "rewards/rejected": -1.421875, "step": 1280 }, { "epoch": 0.09907834101382489, "grad_norm": 10.434421727129537, "learning_rate": 4.953917050691244e-07, "logits/chosen": -2.71875, "logits/rejected": -2.625, "logps/chosen": -214.0, "logps/rejected": -268.0, "loss": 1.0017, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.88671875, "rewards/margins": 0.6953125, "rewards/rejected": -1.578125, "step": 1290 }, { "epoch": 0.09984639016897082, "grad_norm": 13.08215788882812, "learning_rate": 4.99231950844854e-07, "logits/chosen": -2.734375, "logits/rejected": -2.53125, "logps/chosen": -211.0, "logps/rejected": -276.0, "loss": 1.002, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.828125, "rewards/margins": 0.8203125, "rewards/rejected": -1.6484375, "step": 1300 }, { "epoch": 0.10061443932411675, "grad_norm": 9.069867651390021, "learning_rate": 4.999994249804922e-07, "logits/chosen": -2.765625, "logits/rejected": -2.546875, "logps/chosen": -195.0, "logps/rejected": -262.0, "loss": 0.9756, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.78125, "rewards/margins": 0.76171875, "rewards/rejected": -1.5390625, "step": 1310 }, { "epoch": 0.10138248847926268, "grad_norm": 9.277196186955432, "learning_rate": 4.999970889682754e-07, "logits/chosen": -2.765625, "logits/rejected": -2.59375, "logps/chosen": -215.0, "logps/rejected": -294.0, "loss": 0.9553, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.6875, "rewards/margins": 0.77734375, "rewards/rejected": -1.46875, "step": 1320 }, { "epoch": 0.10215053763440861, "grad_norm": 10.25101861322067, "learning_rate": 4.999929560414081e-07, "logits/chosen": -2.65625, "logits/rejected": -2.578125, "logps/chosen": -207.0, "logps/rejected": -270.0, "loss": 0.979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.84375, "rewards/margins": 0.77734375, "rewards/rejected": -1.625, "step": 1330 }, { "epoch": 0.10291858678955453, "grad_norm": 9.338711917314816, "learning_rate": 4.999870262295966e-07, "logits/chosen": -2.578125, "logits/rejected": -2.546875, "logps/chosen": -187.0, "logps/rejected": -266.0, "loss": 0.9426, "rewards/accuracies": 0.8125, "rewards/chosen": -0.63671875, "rewards/margins": 0.9140625, "rewards/rejected": -1.546875, "step": 1340 }, { "epoch": 0.10368663594470046, "grad_norm": 9.302970222897207, "learning_rate": 4.999792995754629e-07, "logits/chosen": -2.796875, "logits/rejected": -2.609375, "logps/chosen": -215.0, "logps/rejected": -282.0, "loss": 0.9971, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9140625, "rewards/margins": 0.75, "rewards/rejected": -1.6640625, "step": 1350 }, { "epoch": 0.10445468509984639, "grad_norm": 10.03109623677574, "learning_rate": 4.999697761345443e-07, "logits/chosen": -2.8125, "logits/rejected": -2.65625, "logps/chosen": -206.0, "logps/rejected": -310.0, "loss": 0.9576, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.796875, "rewards/margins": 0.87109375, "rewards/rejected": -1.671875, "step": 1360 }, { "epoch": 0.10522273425499232, "grad_norm": 9.934286407148253, "learning_rate": 4.999584559752927e-07, "logits/chosen": -2.75, "logits/rejected": -2.75, "logps/chosen": -196.0, "logps/rejected": -272.0, "loss": 0.9599, "rewards/accuracies": 0.8125, "rewards/chosen": -0.65234375, "rewards/margins": 0.9296875, "rewards/rejected": -1.578125, "step": 1370 }, { "epoch": 0.10599078341013825, "grad_norm": 12.154093010084933, "learning_rate": 4.999453391790746e-07, "logits/chosen": -2.734375, "logits/rejected": -2.625, "logps/chosen": -194.0, "logps/rejected": -256.0, "loss": 0.9772, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.75390625, "rewards/margins": 0.63671875, "rewards/rejected": -1.390625, "step": 1380 }, { "epoch": 0.10675883256528418, "grad_norm": 12.827697899650481, "learning_rate": 4.999304258401702e-07, "logits/chosen": -2.8125, "logits/rejected": -2.765625, "logps/chosen": -205.0, "logps/rejected": -282.0, "loss": 0.9446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.73828125, "rewards/margins": 0.86328125, "rewards/rejected": -1.6015625, "step": 1390 }, { "epoch": 0.10752688172043011, "grad_norm": 11.03546365927337, "learning_rate": 4.999137160657726e-07, "logits/chosen": -2.78125, "logits/rejected": -2.515625, "logps/chosen": -183.0, "logps/rejected": -268.0, "loss": 0.9674, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.74609375, "rewards/margins": 0.8515625, "rewards/rejected": -1.59375, "step": 1400 }, { "epoch": 0.10829493087557604, "grad_norm": 10.560686842232878, "learning_rate": 4.998952099759873e-07, "logits/chosen": -2.71875, "logits/rejected": -2.5625, "logps/chosen": -219.0, "logps/rejected": -288.0, "loss": 0.9854, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.90625, "rewards/margins": 0.8046875, "rewards/rejected": -1.7109375, "step": 1410 }, { "epoch": 0.10906298003072197, "grad_norm": 11.379182724969192, "learning_rate": 4.998749077038314e-07, "logits/chosen": -2.71875, "logits/rejected": -2.59375, "logps/chosen": -214.0, "logps/rejected": -272.0, "loss": 0.9209, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.73828125, "rewards/margins": 0.77734375, "rewards/rejected": -1.515625, "step": 1420 }, { "epoch": 0.1098310291858679, "grad_norm": 10.749724141668255, "learning_rate": 4.998528093952326e-07, "logits/chosen": -2.765625, "logits/rejected": -2.640625, "logps/chosen": -215.0, "logps/rejected": -264.0, "loss": 0.9452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.953125, "rewards/margins": 0.67578125, "rewards/rejected": -1.6328125, "step": 1430 }, { "epoch": 0.11059907834101383, "grad_norm": 10.315940129867293, "learning_rate": 4.998289152090274e-07, "logits/chosen": -2.765625, "logits/rejected": -2.6875, "logps/chosen": -200.0, "logps/rejected": -272.0, "loss": 0.9775, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7109375, "rewards/margins": 0.8125, "rewards/rejected": -1.5234375, "step": 1440 }, { "epoch": 0.11136712749615975, "grad_norm": 9.904732877453377, "learning_rate": 4.998032253169614e-07, "logits/chosen": -2.828125, "logits/rejected": -2.5625, "logps/chosen": -213.0, "logps/rejected": -270.0, "loss": 0.9704, "rewards/accuracies": 0.75, "rewards/chosen": -0.90234375, "rewards/margins": 0.796875, "rewards/rejected": -1.6953125, "step": 1450 }, { "epoch": 0.11213517665130568, "grad_norm": 11.424349862849619, "learning_rate": 4.997757399036869e-07, "logits/chosen": -2.8125, "logits/rejected": -2.71875, "logps/chosen": -193.0, "logps/rejected": -284.0, "loss": 0.9786, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.75390625, "rewards/margins": 0.890625, "rewards/rejected": -1.6484375, "step": 1460 }, { "epoch": 0.11290322580645161, "grad_norm": 11.939345476174541, "learning_rate": 4.997464591667619e-07, "logits/chosen": -2.78125, "logits/rejected": -2.59375, "logps/chosen": -181.0, "logps/rejected": -284.0, "loss": 0.9447, "rewards/accuracies": 0.78125, "rewards/chosen": -0.76953125, "rewards/margins": 0.9609375, "rewards/rejected": -1.734375, "step": 1470 }, { "epoch": 0.11367127496159754, "grad_norm": 11.023868276145246, "learning_rate": 4.997153833166486e-07, "logits/chosen": -2.75, "logits/rejected": -2.6875, "logps/chosen": -226.0, "logps/rejected": -282.0, "loss": 0.9852, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.88671875, "rewards/margins": 0.69921875, "rewards/rejected": -1.5859375, "step": 1480 }, { "epoch": 0.11443932411674347, "grad_norm": 12.167196253005324, "learning_rate": 4.996825125767126e-07, "logits/chosen": -2.8125, "logits/rejected": -2.75, "logps/chosen": -210.0, "logps/rejected": -264.0, "loss": 0.9376, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8359375, "rewards/margins": 0.734375, "rewards/rejected": -1.5703125, "step": 1490 }, { "epoch": 0.1152073732718894, "grad_norm": 10.838935626201184, "learning_rate": 4.9964784718322e-07, "logits/chosen": -2.8125, "logits/rejected": -2.5625, "logps/chosen": -219.0, "logps/rejected": -294.0, "loss": 0.9353, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9140625, "rewards/margins": 0.984375, "rewards/rejected": -1.8984375, "step": 1500 }, { "epoch": 0.11597542242703533, "grad_norm": 12.25760539492326, "learning_rate": 4.996113873853367e-07, "logits/chosen": -2.734375, "logits/rejected": -2.53125, "logps/chosen": -212.0, "logps/rejected": -280.0, "loss": 0.9137, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0546875, "rewards/margins": 0.8203125, "rewards/rejected": -1.875, "step": 1510 }, { "epoch": 0.11674347158218126, "grad_norm": 12.758509755854877, "learning_rate": 4.995731334451266e-07, "logits/chosen": -2.671875, "logits/rejected": -2.40625, "logps/chosen": -219.0, "logps/rejected": -308.0, "loss": 0.9203, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.90625, "rewards/margins": 1.140625, "rewards/rejected": -2.046875, "step": 1520 }, { "epoch": 0.1175115207373272, "grad_norm": 10.673773697575998, "learning_rate": 4.995330856375489e-07, "logits/chosen": -2.953125, "logits/rejected": -2.703125, "logps/chosen": -196.0, "logps/rejected": -282.0, "loss": 0.9347, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.91015625, "rewards/margins": 0.76171875, "rewards/rejected": -1.671875, "step": 1530 }, { "epoch": 0.11827956989247312, "grad_norm": 11.022988528461976, "learning_rate": 4.994912442504571e-07, "logits/chosen": -2.828125, "logits/rejected": -2.65625, "logps/chosen": -202.0, "logps/rejected": -276.0, "loss": 0.9831, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.921875, "rewards/margins": 0.796875, "rewards/rejected": -1.71875, "step": 1540 }, { "epoch": 0.11904761904761904, "grad_norm": 12.64870423752532, "learning_rate": 4.994476095845961e-07, "logits/chosen": -2.796875, "logits/rejected": -2.625, "logps/chosen": -217.0, "logps/rejected": -294.0, "loss": 0.9481, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.96484375, "rewards/margins": 0.82421875, "rewards/rejected": -1.7890625, "step": 1550 }, { "epoch": 0.11981566820276497, "grad_norm": 11.970951232116713, "learning_rate": 4.994021819536009e-07, "logits/chosen": -2.796875, "logits/rejected": -2.75, "logps/chosen": -209.0, "logps/rejected": -264.0, "loss": 0.938, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.70703125, "rewards/margins": 0.8671875, "rewards/rejected": -1.578125, "step": 1560 }, { "epoch": 0.1205837173579109, "grad_norm": 11.41210644671236, "learning_rate": 4.993549616839935e-07, "logits/chosen": -2.796875, "logits/rejected": -2.625, "logps/chosen": -197.0, "logps/rejected": -294.0, "loss": 0.9156, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.796875, "rewards/margins": 0.9140625, "rewards/rejected": -1.7109375, "step": 1570 }, { "epoch": 0.12135176651305683, "grad_norm": 11.88920663205869, "learning_rate": 4.993059491151809e-07, "logits/chosen": -2.84375, "logits/rejected": -2.6875, "logps/chosen": -221.0, "logps/rejected": -290.0, "loss": 0.9371, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.03125, "rewards/margins": 0.91015625, "rewards/rejected": -1.9375, "step": 1580 }, { "epoch": 0.12211981566820276, "grad_norm": 12.729750913881482, "learning_rate": 4.992551445994531e-07, "logits/chosen": -2.78125, "logits/rejected": -2.703125, "logps/chosen": -230.0, "logps/rejected": -340.0, "loss": 0.9183, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0546875, "rewards/margins": 1.0078125, "rewards/rejected": -2.0625, "step": 1590 }, { "epoch": 0.1228878648233487, "grad_norm": 12.046272499808548, "learning_rate": 4.992025485019795e-07, "logits/chosen": -2.84375, "logits/rejected": -2.546875, "logps/chosen": -195.0, "logps/rejected": -292.0, "loss": 0.9545, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8203125, "rewards/margins": 0.9765625, "rewards/rejected": -1.796875, "step": 1600 }, { "epoch": 0.12365591397849462, "grad_norm": 11.18605823142129, "learning_rate": 4.991481612008075e-07, "logits/chosen": -2.8125, "logits/rejected": -2.71875, "logps/chosen": -192.0, "logps/rejected": -276.0, "loss": 0.9394, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.78515625, "rewards/margins": 0.9140625, "rewards/rejected": -1.6953125, "step": 1610 }, { "epoch": 0.12442396313364056, "grad_norm": 11.295807857356117, "learning_rate": 4.990919830868589e-07, "logits/chosen": -2.8125, "logits/rejected": -2.6875, "logps/chosen": -209.0, "logps/rejected": -304.0, "loss": 0.9358, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.90625, "rewards/margins": 0.9375, "rewards/rejected": -1.84375, "step": 1620 }, { "epoch": 0.1251920122887865, "grad_norm": 10.80498356074713, "learning_rate": 4.990340145639278e-07, "logits/chosen": -2.75, "logits/rejected": -2.5625, "logps/chosen": -236.0, "logps/rejected": -356.0, "loss": 0.9221, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0546875, "rewards/margins": 0.9921875, "rewards/rejected": -2.046875, "step": 1630 }, { "epoch": 0.1259600614439324, "grad_norm": 12.176387324676929, "learning_rate": 4.989742560486767e-07, "logits/chosen": -2.65625, "logits/rejected": -2.515625, "logps/chosen": -230.0, "logps/rejected": -290.0, "loss": 0.9407, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.046875, "rewards/margins": 0.828125, "rewards/rejected": -1.8828125, "step": 1640 }, { "epoch": 0.12672811059907835, "grad_norm": 12.304900532120273, "learning_rate": 4.989127079706345e-07, "logits/chosen": -2.796875, "logits/rejected": -2.78125, "logps/chosen": -229.0, "logps/rejected": -296.0, "loss": 0.9627, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.984375, "rewards/margins": 0.859375, "rewards/rejected": -1.84375, "step": 1650 }, { "epoch": 0.12749615975422426, "grad_norm": 13.262924695005738, "learning_rate": 4.988493707721928e-07, "logits/chosen": -2.875, "logits/rejected": -2.671875, "logps/chosen": -228.0, "logps/rejected": -316.0, "loss": 0.8731, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.91796875, "rewards/margins": 1.0625, "rewards/rejected": -1.9765625, "step": 1660 }, { "epoch": 0.1282642089093702, "grad_norm": 11.939428096923615, "learning_rate": 4.987842449086033e-07, "logits/chosen": -2.8125, "logits/rejected": -2.65625, "logps/chosen": -227.0, "logps/rejected": -302.0, "loss": 0.9057, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0078125, "rewards/margins": 0.86328125, "rewards/rejected": -1.875, "step": 1670 }, { "epoch": 0.12903225806451613, "grad_norm": 13.521400483055606, "learning_rate": 4.987173308479737e-07, "logits/chosen": -2.8125, "logits/rejected": -2.515625, "logps/chosen": -211.0, "logps/rejected": -296.0, "loss": 0.9431, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0078125, "rewards/margins": 0.91015625, "rewards/rejected": -1.9140625, "step": 1680 }, { "epoch": 0.12980030721966207, "grad_norm": 12.132107938504019, "learning_rate": 4.986486290712652e-07, "logits/chosen": -2.625, "logits/rejected": -2.53125, "logps/chosen": -237.0, "logps/rejected": -288.0, "loss": 0.9172, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.953125, "rewards/margins": 0.78125, "rewards/rejected": -1.734375, "step": 1690 }, { "epoch": 0.130568356374808, "grad_norm": 13.892295326894061, "learning_rate": 4.985781400722885e-07, "logits/chosen": -2.828125, "logits/rejected": -2.59375, "logps/chosen": -232.0, "logps/rejected": -320.0, "loss": 0.8968, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.87890625, "rewards/margins": 1.0625, "rewards/rejected": -1.9375, "step": 1700 }, { "epoch": 0.1313364055299539, "grad_norm": 12.00404226783099, "learning_rate": 4.985058643577002e-07, "logits/chosen": -2.828125, "logits/rejected": -2.546875, "logps/chosen": -212.0, "logps/rejected": -332.0, "loss": 0.926, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9140625, "rewards/margins": 1.0703125, "rewards/rejected": -1.9765625, "step": 1710 }, { "epoch": 0.13210445468509985, "grad_norm": 14.67267400441331, "learning_rate": 4.984318024469994e-07, "logits/chosen": -2.859375, "logits/rejected": -2.71875, "logps/chosen": -240.0, "logps/rejected": -290.0, "loss": 0.9579, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.828125, "rewards/margins": 0.88671875, "rewards/rejected": -1.7109375, "step": 1720 }, { "epoch": 0.13287250384024576, "grad_norm": 12.540018099060788, "learning_rate": 4.983559548725244e-07, "logits/chosen": -2.8125, "logits/rejected": -2.59375, "logps/chosen": -236.0, "logps/rejected": -322.0, "loss": 0.8644, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0, "rewards/margins": 0.99609375, "rewards/rejected": -2.0, "step": 1730 }, { "epoch": 0.1336405529953917, "grad_norm": 12.828308620419508, "learning_rate": 4.982783221794477e-07, "logits/chosen": -2.8125, "logits/rejected": -2.484375, "logps/chosen": -244.0, "logps/rejected": -348.0, "loss": 0.8867, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0703125, "rewards/margins": 1.203125, "rewards/rejected": -2.28125, "step": 1740 }, { "epoch": 0.13440860215053763, "grad_norm": 11.231475266769484, "learning_rate": 4.981989049257733e-07, "logits/chosen": -2.765625, "logits/rejected": -2.515625, "logps/chosen": -218.0, "logps/rejected": -336.0, "loss": 0.8328, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.03125, "rewards/margins": 1.2265625, "rewards/rejected": -2.25, "step": 1750 }, { "epoch": 0.13517665130568357, "grad_norm": 15.19480096436614, "learning_rate": 4.981177036823318e-07, "logits/chosen": -2.828125, "logits/rejected": -2.703125, "logps/chosen": -227.0, "logps/rejected": -328.0, "loss": 0.934, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0625, "rewards/margins": 0.96875, "rewards/rejected": -2.03125, "step": 1760 }, { "epoch": 0.1359447004608295, "grad_norm": 10.358931909421898, "learning_rate": 4.980347190327769e-07, "logits/chosen": -2.734375, "logits/rejected": -2.71875, "logps/chosen": -216.0, "logps/rejected": -334.0, "loss": 0.8955, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0234375, "rewards/margins": 1.2421875, "rewards/rejected": -2.265625, "step": 1770 }, { "epoch": 0.13671274961597543, "grad_norm": 14.219657272940166, "learning_rate": 4.979499515735808e-07, "logits/chosen": -2.875, "logits/rejected": -2.640625, "logps/chosen": -249.0, "logps/rejected": -328.0, "loss": 0.902, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0625, "rewards/margins": 1.03125, "rewards/rejected": -2.09375, "step": 1780 }, { "epoch": 0.13748079877112135, "grad_norm": 14.087012355802475, "learning_rate": 4.978634019140302e-07, "logits/chosen": -2.84375, "logits/rejected": -2.609375, "logps/chosen": -220.0, "logps/rejected": -318.0, "loss": 0.9083, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0234375, "rewards/margins": 0.94921875, "rewards/rejected": -1.96875, "step": 1790 }, { "epoch": 0.1382488479262673, "grad_norm": 14.143414361305528, "learning_rate": 4.977750706762218e-07, "logits/chosen": -2.84375, "logits/rejected": -2.765625, "logps/chosen": -218.0, "logps/rejected": -300.0, "loss": 0.9053, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.015625, "rewards/margins": 0.89453125, "rewards/rejected": -1.9140625, "step": 1800 }, { "epoch": 0.1390168970814132, "grad_norm": 11.702828292068595, "learning_rate": 4.976849584950576e-07, "logits/chosen": -2.890625, "logits/rejected": -2.703125, "logps/chosen": -228.0, "logps/rejected": -318.0, "loss": 0.9029, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.984375, "rewards/margins": 1.03125, "rewards/rejected": -2.015625, "step": 1810 }, { "epoch": 0.13978494623655913, "grad_norm": 13.75862908176292, "learning_rate": 4.97593066018241e-07, "logits/chosen": -2.84375, "logits/rejected": -2.71875, "logps/chosen": -246.0, "logps/rejected": -304.0, "loss": 0.9144, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2265625, "rewards/margins": 0.796875, "rewards/rejected": -2.03125, "step": 1820 }, { "epoch": 0.14055299539170507, "grad_norm": 13.40618186579835, "learning_rate": 4.974993939062713e-07, "logits/chosen": -2.96875, "logits/rejected": -2.78125, "logps/chosen": -240.0, "logps/rejected": -324.0, "loss": 0.9275, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.171875, "rewards/margins": 0.98828125, "rewards/rejected": -2.15625, "step": 1830 }, { "epoch": 0.141321044546851, "grad_norm": 11.059233768822216, "learning_rate": 4.974039428324395e-07, "logits/chosen": -2.890625, "logits/rejected": -2.84375, "logps/chosen": -221.0, "logps/rejected": -304.0, "loss": 0.9384, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.125, "rewards/margins": 0.81640625, "rewards/rejected": -1.9453125, "step": 1840 }, { "epoch": 0.14208909370199693, "grad_norm": 13.828446616343584, "learning_rate": 4.973067134828232e-07, "logits/chosen": -2.796875, "logits/rejected": -2.609375, "logps/chosen": -230.0, "logps/rejected": -292.0, "loss": 0.9021, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0390625, "rewards/margins": 0.859375, "rewards/rejected": -1.890625, "step": 1850 }, { "epoch": 0.14285714285714285, "grad_norm": 12.284270864342412, "learning_rate": 4.972077065562821e-07, "logits/chosen": -2.78125, "logits/rejected": -2.75, "logps/chosen": -223.0, "logps/rejected": -314.0, "loss": 0.8937, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9765625, "rewards/margins": 1.078125, "rewards/rejected": -2.0625, "step": 1860 }, { "epoch": 0.1436251920122888, "grad_norm": 21.9808108479774, "learning_rate": 4.971069227644524e-07, "logits/chosen": -2.8125, "logits/rejected": -2.703125, "logps/chosen": -229.0, "logps/rejected": -314.0, "loss": 0.9477, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0546875, "rewards/margins": 0.8984375, "rewards/rejected": -1.953125, "step": 1870 }, { "epoch": 0.1443932411674347, "grad_norm": 12.949415039823924, "learning_rate": 4.970043628317421e-07, "logits/chosen": -2.921875, "logits/rejected": -2.765625, "logps/chosen": -226.0, "logps/rejected": -308.0, "loss": 0.9509, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.015625, "rewards/margins": 0.921875, "rewards/rejected": -1.9375, "step": 1880 }, { "epoch": 0.14516129032258066, "grad_norm": 11.503682352680274, "learning_rate": 4.969000274953254e-07, "logits/chosen": -2.890625, "logits/rejected": -2.78125, "logps/chosen": -230.0, "logps/rejected": -300.0, "loss": 0.9292, "rewards/accuracies": 0.75, "rewards/chosen": -1.140625, "rewards/margins": 0.8046875, "rewards/rejected": -1.9375, "step": 1890 }, { "epoch": 0.14592933947772657, "grad_norm": 12.43131611553382, "learning_rate": 4.96793917505138e-07, "logits/chosen": -2.921875, "logits/rejected": -2.640625, "logps/chosen": -220.0, "logps/rejected": -338.0, "loss": 0.8572, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.97265625, "rewards/margins": 1.1640625, "rewards/rejected": -2.140625, "step": 1900 }, { "epoch": 0.14669738863287252, "grad_norm": 11.410372112354368, "learning_rate": 4.96686033623871e-07, "logits/chosen": -2.765625, "logits/rejected": -2.671875, "logps/chosen": -246.0, "logps/rejected": -334.0, "loss": 0.8731, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1015625, "rewards/margins": 1.0859375, "rewards/rejected": -2.1875, "step": 1910 }, { "epoch": 0.14746543778801843, "grad_norm": 12.558801613957584, "learning_rate": 4.96576376626966e-07, "logits/chosen": -2.796875, "logits/rejected": -2.765625, "logps/chosen": -260.0, "logps/rejected": -358.0, "loss": 0.9011, "rewards/accuracies": 0.8125, "rewards/chosen": -1.125, "rewards/margins": 1.375, "rewards/rejected": -2.5, "step": 1920 }, { "epoch": 0.14823348694316435, "grad_norm": 13.13346681275969, "learning_rate": 4.964649473026092e-07, "logits/chosen": -2.828125, "logits/rejected": -2.796875, "logps/chosen": -282.0, "logps/rejected": -338.0, "loss": 0.8601, "rewards/accuracies": 0.75, "rewards/chosen": -1.2734375, "rewards/margins": 0.91796875, "rewards/rejected": -2.1875, "step": 1930 }, { "epoch": 0.1490015360983103, "grad_norm": 13.437954535628082, "learning_rate": 4.96351746451726e-07, "logits/chosen": -2.90625, "logits/rejected": -2.75, "logps/chosen": -239.0, "logps/rejected": -350.0, "loss": 0.9031, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.15625, "rewards/margins": 1.171875, "rewards/rejected": -2.328125, "step": 1940 }, { "epoch": 0.1497695852534562, "grad_norm": 10.81905592346158, "learning_rate": 4.962367748879748e-07, "logits/chosen": -2.796875, "logits/rejected": -2.609375, "logps/chosen": -234.0, "logps/rejected": -344.0, "loss": 0.8994, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1015625, "rewards/margins": 1.046875, "rewards/rejected": -2.15625, "step": 1950 }, { "epoch": 0.15053763440860216, "grad_norm": 14.947172948637501, "learning_rate": 4.961200334377416e-07, "logits/chosen": -2.9375, "logits/rejected": -2.90625, "logps/chosen": -238.0, "logps/rejected": -324.0, "loss": 0.9, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.09375, "rewards/margins": 1.046875, "rewards/rejected": -2.140625, "step": 1960 }, { "epoch": 0.15130568356374807, "grad_norm": 11.920684900369118, "learning_rate": 4.96001522940134e-07, "logits/chosen": -2.875, "logits/rejected": -2.6875, "logps/chosen": -242.0, "logps/rejected": -342.0, "loss": 0.892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.078125, "rewards/margins": 1.09375, "rewards/rejected": -2.171875, "step": 1970 }, { "epoch": 0.15207373271889402, "grad_norm": 12.34770359578292, "learning_rate": 4.958812442469746e-07, "logits/chosen": -2.90625, "logits/rejected": -2.90625, "logps/chosen": -239.0, "logps/rejected": -332.0, "loss": 0.8816, "rewards/accuracies": 0.78125, "rewards/chosen": -1.109375, "rewards/margins": 1.0234375, "rewards/rejected": -2.140625, "step": 1980 }, { "epoch": 0.15284178187403993, "grad_norm": 12.570829412915463, "learning_rate": 4.95759198222796e-07, "logits/chosen": -2.9375, "logits/rejected": -2.78125, "logps/chosen": -243.0, "logps/rejected": -336.0, "loss": 0.942, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.125, "rewards/margins": 0.9609375, "rewards/rejected": -2.078125, "step": 1990 }, { "epoch": 0.15360983102918588, "grad_norm": 13.675298489357694, "learning_rate": 4.956353857448334e-07, "logits/chosen": -2.921875, "logits/rejected": -2.828125, "logps/chosen": -230.0, "logps/rejected": -308.0, "loss": 0.9369, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1171875, "rewards/margins": 0.9609375, "rewards/rejected": -2.078125, "step": 2000 }, { "epoch": 0.15360983102918588, "eval_logits/chosen": -2.84375, "eval_logits/rejected": -2.765625, "eval_logps/chosen": -250.0, "eval_logps/rejected": -314.0, "eval_loss": 0.47869741916656494, "eval_rewards/accuracies": 0.7510302066802979, "eval_rewards/chosen": -1.0859375, "eval_rewards/margins": 0.921875, "eval_rewards/rejected": -2.015625, "eval_runtime": 2263.0363, "eval_samples_per_second": 41.155, "eval_steps_per_second": 0.643, "step": 2000 }, { "epoch": 0.1543778801843318, "grad_norm": 13.066431288648134, "learning_rate": 4.95509807703019e-07, "logits/chosen": -2.828125, "logits/rejected": -2.84375, "logps/chosen": -224.0, "logps/rejected": -304.0, "loss": 0.9174, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.79296875, "rewards/margins": 1.1171875, "rewards/rejected": -1.90625, "step": 2010 }, { "epoch": 0.15514592933947774, "grad_norm": 14.253162188149206, "learning_rate": 4.953824649999754e-07, "logits/chosen": -2.78125, "logits/rejected": -2.78125, "logps/chosen": -248.0, "logps/rejected": -338.0, "loss": 0.9205, "rewards/accuracies": 0.75, "rewards/chosen": -1.140625, "rewards/margins": 1.0234375, "rewards/rejected": -2.171875, "step": 2020 }, { "epoch": 0.15591397849462366, "grad_norm": 12.492824851523928, "learning_rate": 4.952533585510092e-07, "logits/chosen": -2.890625, "logits/rejected": -2.6875, "logps/chosen": -242.0, "logps/rejected": -340.0, "loss": 0.9399, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1640625, "rewards/margins": 1.0234375, "rewards/rejected": -2.1875, "step": 2030 }, { "epoch": 0.15668202764976957, "grad_norm": 13.23251744022185, "learning_rate": 4.951224892841041e-07, "logits/chosen": -2.84375, "logits/rejected": -2.75, "logps/chosen": -215.0, "logps/rejected": -306.0, "loss": 0.8916, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0078125, "rewards/margins": 1.03125, "rewards/rejected": -2.046875, "step": 2040 }, { "epoch": 0.15745007680491552, "grad_norm": 12.167994706695382, "learning_rate": 4.949898581399149e-07, "logits/chosen": -2.875, "logits/rejected": -2.796875, "logps/chosen": -235.0, "logps/rejected": -312.0, "loss": 0.9514, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1015625, "rewards/margins": 0.84375, "rewards/rejected": -1.9453125, "step": 2050 }, { "epoch": 0.15821812596006143, "grad_norm": 11.451775548773453, "learning_rate": 4.9485546607176e-07, "logits/chosen": -2.796875, "logits/rejected": -2.75, "logps/chosen": -241.0, "logps/rejected": -336.0, "loss": 0.8834, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.171875, "rewards/margins": 0.98046875, "rewards/rejected": -2.15625, "step": 2060 }, { "epoch": 0.15898617511520738, "grad_norm": 14.50082788051648, "learning_rate": 4.947193140456148e-07, "logits/chosen": -2.859375, "logits/rejected": -2.78125, "logps/chosen": -235.0, "logps/rejected": -318.0, "loss": 0.9053, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0234375, "rewards/margins": 1.0, "rewards/rejected": -2.03125, "step": 2070 }, { "epoch": 0.1597542242703533, "grad_norm": 13.214314693565314, "learning_rate": 4.945814030401052e-07, "logits/chosen": -2.890625, "logits/rejected": -2.890625, "logps/chosen": -237.0, "logps/rejected": -334.0, "loss": 0.8968, "rewards/accuracies": 0.875, "rewards/chosen": -0.98828125, "rewards/margins": 1.1953125, "rewards/rejected": -2.1875, "step": 2080 }, { "epoch": 0.16052227342549924, "grad_norm": 13.467501885840683, "learning_rate": 4.944417340464998e-07, "logits/chosen": -2.921875, "logits/rejected": -2.796875, "logps/chosen": -239.0, "logps/rejected": -346.0, "loss": 0.902, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.21875, "rewards/margins": 1.0546875, "rewards/rejected": -2.28125, "step": 2090 }, { "epoch": 0.16129032258064516, "grad_norm": 13.588182800724297, "learning_rate": 4.943003080687035e-07, "logits/chosen": -2.984375, "logits/rejected": -2.96875, "logps/chosen": -223.0, "logps/rejected": -334.0, "loss": 0.8999, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.0546875, "rewards/margins": 1.0703125, "rewards/rejected": -2.125, "step": 2100 }, { "epoch": 0.1620583717357911, "grad_norm": 16.42354035414079, "learning_rate": 4.941571261232496e-07, "logits/chosen": -2.921875, "logits/rejected": -2.84375, "logps/chosen": -249.0, "logps/rejected": -338.0, "loss": 0.8889, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.2421875, "rewards/margins": 1.03125, "rewards/rejected": -2.28125, "step": 2110 }, { "epoch": 0.16282642089093702, "grad_norm": 13.876317692188165, "learning_rate": 4.940121892392932e-07, "logits/chosen": -2.9375, "logits/rejected": -2.6875, "logps/chosen": -276.0, "logps/rejected": -366.0, "loss": 0.8789, "rewards/accuracies": 0.78125, "rewards/chosen": -1.265625, "rewards/margins": 1.03125, "rewards/rejected": -2.296875, "step": 2120 }, { "epoch": 0.16359447004608296, "grad_norm": 13.113712875199626, "learning_rate": 4.938654984586032e-07, "logits/chosen": -3.03125, "logits/rejected": -2.859375, "logps/chosen": -222.0, "logps/rejected": -308.0, "loss": 0.8861, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0, "rewards/margins": 0.95703125, "rewards/rejected": -1.9609375, "step": 2130 }, { "epoch": 0.16436251920122888, "grad_norm": 14.202134678257352, "learning_rate": 4.937170548355551e-07, "logits/chosen": -2.96875, "logits/rejected": -2.9375, "logps/chosen": -253.0, "logps/rejected": -314.0, "loss": 0.9046, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.203125, "rewards/margins": 0.8984375, "rewards/rejected": -2.109375, "step": 2140 }, { "epoch": 0.1651305683563748, "grad_norm": 14.07475068393389, "learning_rate": 4.935668594371233e-07, "logits/chosen": -2.875, "logits/rejected": -2.625, "logps/chosen": -239.0, "logps/rejected": -342.0, "loss": 0.8938, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.078125, "rewards/margins": 1.109375, "rewards/rejected": -2.1875, "step": 2150 }, { "epoch": 0.16589861751152074, "grad_norm": 12.79032408540752, "learning_rate": 4.934149133428738e-07, "logits/chosen": -3.0, "logits/rejected": -2.796875, "logps/chosen": -237.0, "logps/rejected": -358.0, "loss": 0.8779, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.1875, "rewards/margins": 1.2734375, "rewards/rejected": -2.46875, "step": 2160 }, { "epoch": 0.16666666666666666, "grad_norm": 15.892117743457144, "learning_rate": 4.932612176449559e-07, "logits/chosen": -2.9375, "logits/rejected": -2.75, "logps/chosen": -258.0, "logps/rejected": -348.0, "loss": 0.8891, "rewards/accuracies": 0.75, "rewards/chosen": -1.2109375, "rewards/margins": 0.99609375, "rewards/rejected": -2.203125, "step": 2170 }, { "epoch": 0.1674347158218126, "grad_norm": 13.638088131084134, "learning_rate": 4.931057734480947e-07, "logits/chosen": -2.9375, "logits/rejected": -2.828125, "logps/chosen": -241.0, "logps/rejected": -332.0, "loss": 0.8372, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1015625, "rewards/margins": 1.03125, "rewards/rejected": -2.140625, "step": 2180 }, { "epoch": 0.16820276497695852, "grad_norm": 12.802697130265992, "learning_rate": 4.92948581869583e-07, "logits/chosen": -2.984375, "logits/rejected": -2.875, "logps/chosen": -260.0, "logps/rejected": -390.0, "loss": 0.9013, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3046875, "rewards/margins": 1.3515625, "rewards/rejected": -2.65625, "step": 2190 }, { "epoch": 0.16897081413210446, "grad_norm": 13.264734752913222, "learning_rate": 4.927896440392734e-07, "logits/chosen": -2.984375, "logits/rejected": -2.828125, "logps/chosen": -256.0, "logps/rejected": -348.0, "loss": 0.8849, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2421875, "rewards/margins": 1.0078125, "rewards/rejected": -2.25, "step": 2200 }, { "epoch": 0.16973886328725038, "grad_norm": 12.293223121562086, "learning_rate": 4.926289610995701e-07, "logits/chosen": -2.984375, "logits/rejected": -2.9375, "logps/chosen": -235.0, "logps/rejected": -342.0, "loss": 0.8977, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.21875, "rewards/margins": 1.0234375, "rewards/rejected": -2.234375, "step": 2210 }, { "epoch": 0.17050691244239632, "grad_norm": 12.558202197571354, "learning_rate": 4.924665342054204e-07, "logits/chosen": -2.96875, "logits/rejected": -2.75, "logps/chosen": -258.0, "logps/rejected": -330.0, "loss": 0.8925, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2421875, "rewards/margins": 0.94140625, "rewards/rejected": -2.1875, "step": 2220 }, { "epoch": 0.17127496159754224, "grad_norm": 13.984016103525263, "learning_rate": 4.923023645243073e-07, "logits/chosen": -2.90625, "logits/rejected": -2.78125, "logps/chosen": -239.0, "logps/rejected": -344.0, "loss": 0.9135, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.015625, "rewards/margins": 1.15625, "rewards/rejected": -2.171875, "step": 2230 }, { "epoch": 0.17204301075268819, "grad_norm": 12.78354957586987, "learning_rate": 4.921364532362399e-07, "logits/chosen": -3.0, "logits/rejected": -3.015625, "logps/chosen": -249.0, "logps/rejected": -346.0, "loss": 0.9235, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1328125, "rewards/margins": 1.078125, "rewards/rejected": -2.203125, "step": 2240 }, { "epoch": 0.1728110599078341, "grad_norm": 13.802122654934335, "learning_rate": 4.91968801533746e-07, "logits/chosen": -3.03125, "logits/rejected": -2.953125, "logps/chosen": -252.0, "logps/rejected": -338.0, "loss": 0.8709, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.1328125, "rewards/margins": 1.046875, "rewards/rejected": -2.1875, "step": 2250 }, { "epoch": 0.17357910906298002, "grad_norm": 14.281392277256622, "learning_rate": 4.917994106218627e-07, "logits/chosen": -2.890625, "logits/rejected": -2.984375, "logps/chosen": -251.0, "logps/rejected": -324.0, "loss": 0.8775, "rewards/accuracies": 0.8125, "rewards/chosen": -1.203125, "rewards/margins": 1.0234375, "rewards/rejected": -2.21875, "step": 2260 }, { "epoch": 0.17434715821812596, "grad_norm": 15.70923827449078, "learning_rate": 4.916282817181282e-07, "logits/chosen": -2.921875, "logits/rejected": -2.921875, "logps/chosen": -249.0, "logps/rejected": -352.0, "loss": 0.8682, "rewards/accuracies": 0.84375, "rewards/chosen": -1.25, "rewards/margins": 1.125, "rewards/rejected": -2.375, "step": 2270 }, { "epoch": 0.17511520737327188, "grad_norm": 13.48081926485038, "learning_rate": 4.91455416052573e-07, "logits/chosen": -2.984375, "logits/rejected": -3.015625, "logps/chosen": -258.0, "logps/rejected": -348.0, "loss": 0.8849, "rewards/accuracies": 0.78125, "rewards/chosen": -1.25, "rewards/margins": 1.0703125, "rewards/rejected": -2.3125, "step": 2280 }, { "epoch": 0.17588325652841783, "grad_norm": 15.179548826754504, "learning_rate": 4.91280814867711e-07, "logits/chosen": -2.984375, "logits/rejected": -2.84375, "logps/chosen": -244.0, "logps/rejected": -360.0, "loss": 0.8453, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.21875, "rewards/margins": 1.1640625, "rewards/rejected": -2.390625, "step": 2290 }, { "epoch": 0.17665130568356374, "grad_norm": 14.52694970807923, "learning_rate": 4.911044794185305e-07, "logits/chosen": -2.921875, "logits/rejected": -2.859375, "logps/chosen": -254.0, "logps/rejected": -358.0, "loss": 0.8657, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.234375, "rewards/margins": 1.0625, "rewards/rejected": -2.3125, "step": 2300 }, { "epoch": 0.1774193548387097, "grad_norm": 14.28490390514988, "learning_rate": 4.909264109724852e-07, "logits/chosen": -2.890625, "logits/rejected": -2.90625, "logps/chosen": -270.0, "logps/rejected": -346.0, "loss": 0.9076, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3828125, "rewards/margins": 0.9296875, "rewards/rejected": -2.3125, "step": 2310 }, { "epoch": 0.1781874039938556, "grad_norm": 13.692256012733601, "learning_rate": 4.907466108094853e-07, "logits/chosen": -2.84375, "logits/rejected": -2.859375, "logps/chosen": -250.0, "logps/rejected": -344.0, "loss": 0.8562, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.1484375, "rewards/margins": 1.0859375, "rewards/rejected": -2.234375, "step": 2320 }, { "epoch": 0.17895545314900155, "grad_norm": 31.800550688947688, "learning_rate": 4.905650802218879e-07, "logits/chosen": -2.890625, "logits/rejected": -2.78125, "logps/chosen": -234.0, "logps/rejected": -336.0, "loss": 0.8945, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.9765625, "rewards/margins": 1.1328125, "rewards/rejected": -2.109375, "step": 2330 }, { "epoch": 0.17972350230414746, "grad_norm": 12.355845303294258, "learning_rate": 4.903818205144882e-07, "logits/chosen": -2.9375, "logits/rejected": -2.9375, "logps/chosen": -245.0, "logps/rejected": -340.0, "loss": 0.8763, "rewards/accuracies": 0.75, "rewards/chosen": -1.1484375, "rewards/margins": 1.0859375, "rewards/rejected": -2.234375, "step": 2340 }, { "epoch": 0.18049155145929338, "grad_norm": 13.253020383364085, "learning_rate": 4.901968330045098e-07, "logits/chosen": -2.953125, "logits/rejected": -2.890625, "logps/chosen": -260.0, "logps/rejected": -362.0, "loss": 0.8989, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3359375, "rewards/margins": 1.078125, "rewards/rejected": -2.40625, "step": 2350 }, { "epoch": 0.18125960061443933, "grad_norm": 14.247383137099689, "learning_rate": 4.900101190215951e-07, "logits/chosen": -2.9375, "logits/rejected": -2.921875, "logps/chosen": -255.0, "logps/rejected": -362.0, "loss": 0.8593, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.1484375, "rewards/margins": 1.1015625, "rewards/rejected": -2.25, "step": 2360 }, { "epoch": 0.18202764976958524, "grad_norm": 15.167329560091888, "learning_rate": 4.898216799077964e-07, "logits/chosen": -2.921875, "logits/rejected": -2.734375, "logps/chosen": -241.0, "logps/rejected": -356.0, "loss": 0.8986, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2890625, "rewards/margins": 1.078125, "rewards/rejected": -2.375, "step": 2370 }, { "epoch": 0.1827956989247312, "grad_norm": 12.562210992547268, "learning_rate": 4.89631517017565e-07, "logits/chosen": -2.90625, "logits/rejected": -2.84375, "logps/chosen": -239.0, "logps/rejected": -322.0, "loss": 0.8778, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2265625, "rewards/margins": 1.0546875, "rewards/rejected": -2.28125, "step": 2380 }, { "epoch": 0.1835637480798771, "grad_norm": 15.949003095858044, "learning_rate": 4.89439631717743e-07, "logits/chosen": -3.09375, "logits/rejected": -3.0, "logps/chosen": -264.0, "logps/rejected": -364.0, "loss": 0.8794, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1875, "rewards/margins": 1.1328125, "rewards/rejected": -2.3125, "step": 2390 }, { "epoch": 0.18433179723502305, "grad_norm": 15.874229720904866, "learning_rate": 4.892460253875525e-07, "logits/chosen": -2.953125, "logits/rejected": -2.859375, "logps/chosen": -264.0, "logps/rejected": -382.0, "loss": 0.8702, "rewards/accuracies": 0.8125, "rewards/chosen": -1.171875, "rewards/margins": 1.2890625, "rewards/rejected": -2.453125, "step": 2400 }, { "epoch": 0.18509984639016897, "grad_norm": 13.880205306094494, "learning_rate": 4.89050699418586e-07, "logits/chosen": -3.015625, "logits/rejected": -3.03125, "logps/chosen": -246.0, "logps/rejected": -340.0, "loss": 0.8649, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.2734375, "rewards/margins": 1.1328125, "rewards/rejected": -2.40625, "step": 2410 }, { "epoch": 0.1858678955453149, "grad_norm": 18.163237260057027, "learning_rate": 4.88853655214796e-07, "logits/chosen": -3.0625, "logits/rejected": -3.125, "logps/chosen": -253.0, "logps/rejected": -354.0, "loss": 0.8885, "rewards/accuracies": 0.78125, "rewards/chosen": -1.234375, "rewards/margins": 1.21875, "rewards/rejected": -2.453125, "step": 2420 }, { "epoch": 0.18663594470046083, "grad_norm": 16.10034559613234, "learning_rate": 4.886548941924857e-07, "logits/chosen": -3.078125, "logits/rejected": -3.109375, "logps/chosen": -239.0, "logps/rejected": -342.0, "loss": 0.8504, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.265625, "rewards/margins": 1.2421875, "rewards/rejected": -2.515625, "step": 2430 }, { "epoch": 0.18740399385560677, "grad_norm": 23.43888009337736, "learning_rate": 4.88454417780298e-07, "logits/chosen": -3.0625, "logits/rejected": -2.921875, "logps/chosen": -251.0, "logps/rejected": -356.0, "loss": 0.8606, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4296875, "rewards/margins": 1.1328125, "rewards/rejected": -2.5625, "step": 2440 }, { "epoch": 0.1881720430107527, "grad_norm": 12.055284855873023, "learning_rate": 4.882522274192056e-07, "logits/chosen": -3.140625, "logits/rejected": -3.046875, "logps/chosen": -247.0, "logps/rejected": -340.0, "loss": 0.855, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1875, "rewards/margins": 1.1171875, "rewards/rejected": -2.296875, "step": 2450 }, { "epoch": 0.1889400921658986, "grad_norm": 12.460599629159253, "learning_rate": 4.880483245625008e-07, "logits/chosen": -3.078125, "logits/rejected": -2.921875, "logps/chosen": -248.0, "logps/rejected": -350.0, "loss": 0.889, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.359375, "rewards/margins": 1.0390625, "rewards/rejected": -2.390625, "step": 2460 }, { "epoch": 0.18970814132104455, "grad_norm": 14.328351433119291, "learning_rate": 4.878427106757848e-07, "logits/chosen": -2.96875, "logits/rejected": -2.90625, "logps/chosen": -254.0, "logps/rejected": -332.0, "loss": 0.8539, "rewards/accuracies": 0.71875, "rewards/chosen": -1.265625, "rewards/margins": 0.89453125, "rewards/rejected": -2.15625, "step": 2470 }, { "epoch": 0.19047619047619047, "grad_norm": 15.184689595217002, "learning_rate": 4.876353872369572e-07, "logits/chosen": -3.078125, "logits/rejected": -3.015625, "logps/chosen": -284.0, "logps/rejected": -368.0, "loss": 0.8668, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5078125, "rewards/margins": 1.046875, "rewards/rejected": -2.5625, "step": 2480 }, { "epoch": 0.1912442396313364, "grad_norm": 11.828651906098743, "learning_rate": 4.874263557362055e-07, "logits/chosen": -3.046875, "logits/rejected": -2.75, "logps/chosen": -274.0, "logps/rejected": -388.0, "loss": 0.8599, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4140625, "rewards/margins": 1.359375, "rewards/rejected": -2.765625, "step": 2490 }, { "epoch": 0.19201228878648233, "grad_norm": 13.311264302647976, "learning_rate": 4.872156176759942e-07, "logits/chosen": -3.046875, "logits/rejected": -3.03125, "logps/chosen": -262.0, "logps/rejected": -348.0, "loss": 0.8639, "rewards/accuracies": 0.75, "rewards/chosen": -1.3203125, "rewards/margins": 1.1015625, "rewards/rejected": -2.421875, "step": 2500 }, { "epoch": 0.19278033794162827, "grad_norm": 13.648538171454566, "learning_rate": 4.870031745710541e-07, "logits/chosen": -3.046875, "logits/rejected": -2.9375, "logps/chosen": -216.0, "logps/rejected": -340.0, "loss": 0.8761, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9296875, "rewards/margins": 1.203125, "rewards/rejected": -2.140625, "step": 2510 }, { "epoch": 0.1935483870967742, "grad_norm": 12.183692923178004, "learning_rate": 4.867890279483717e-07, "logits/chosen": -3.0625, "logits/rejected": -3.0, "logps/chosen": -228.0, "logps/rejected": -336.0, "loss": 0.8938, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.0703125, "rewards/margins": 1.015625, "rewards/rejected": -2.09375, "step": 2520 }, { "epoch": 0.19431643625192013, "grad_norm": 15.684572272324115, "learning_rate": 4.865731793471776e-07, "logits/chosen": -3.125, "logits/rejected": -3.046875, "logps/chosen": -247.0, "logps/rejected": -378.0, "loss": 0.8379, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1640625, "rewards/margins": 1.421875, "rewards/rejected": -2.578125, "step": 2530 }, { "epoch": 0.19508448540706605, "grad_norm": 19.14762706552057, "learning_rate": 4.863556303189357e-07, "logits/chosen": -3.15625, "logits/rejected": -2.96875, "logps/chosen": -262.0, "logps/rejected": -378.0, "loss": 0.8221, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.1875, "rewards/margins": 1.3828125, "rewards/rejected": -2.578125, "step": 2540 }, { "epoch": 0.195852534562212, "grad_norm": 14.748472558453047, "learning_rate": 4.861363824273329e-07, "logits/chosen": -2.953125, "logits/rejected": -2.890625, "logps/chosen": -233.0, "logps/rejected": -360.0, "loss": 0.8885, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.21875, "rewards/margins": 1.140625, "rewards/rejected": -2.359375, "step": 2550 }, { "epoch": 0.1966205837173579, "grad_norm": 16.02155658051898, "learning_rate": 4.859154372482661e-07, "logits/chosen": -3.0625, "logits/rejected": -2.921875, "logps/chosen": -213.0, "logps/rejected": -340.0, "loss": 0.8547, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.89453125, "rewards/margins": 1.3515625, "rewards/rejected": -2.25, "step": 2560 }, { "epoch": 0.19738863287250383, "grad_norm": 14.335513315031761, "learning_rate": 4.856927963698325e-07, "logits/chosen": -3.140625, "logits/rejected": -2.953125, "logps/chosen": -230.0, "logps/rejected": -340.0, "loss": 0.9002, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.140625, "rewards/margins": 1.1875, "rewards/rejected": -2.328125, "step": 2570 }, { "epoch": 0.19815668202764977, "grad_norm": 17.449968576347366, "learning_rate": 4.854684613923173e-07, "logits/chosen": -3.0625, "logits/rejected": -3.03125, "logps/chosen": -264.0, "logps/rejected": -342.0, "loss": 0.8439, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.2890625, "rewards/margins": 1.0625, "rewards/rejected": -2.34375, "step": 2580 }, { "epoch": 0.1989247311827957, "grad_norm": 15.787980448687197, "learning_rate": 4.852424339281824e-07, "logits/chosen": -3.09375, "logits/rejected": -2.90625, "logps/chosen": -247.0, "logps/rejected": -346.0, "loss": 0.925, "rewards/accuracies": 0.78125, "rewards/chosen": -1.28125, "rewards/margins": 1.015625, "rewards/rejected": -2.296875, "step": 2590 }, { "epoch": 0.19969278033794163, "grad_norm": 15.15259090190103, "learning_rate": 4.850147156020551e-07, "logits/chosen": -3.0625, "logits/rejected": -3.09375, "logps/chosen": -255.0, "logps/rejected": -336.0, "loss": 0.8666, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.140625, "rewards/margins": 1.140625, "rewards/rejected": -2.28125, "step": 2600 }, { "epoch": 0.20046082949308755, "grad_norm": 13.869832800975132, "learning_rate": 4.847853080507161e-07, "logits/chosen": -3.125, "logits/rejected": -2.984375, "logps/chosen": -243.0, "logps/rejected": -356.0, "loss": 0.8932, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1640625, "rewards/margins": 1.171875, "rewards/rejected": -2.34375, "step": 2610 }, { "epoch": 0.2012288786482335, "grad_norm": 12.438931882950325, "learning_rate": 4.845542129230875e-07, "logits/chosen": -2.984375, "logits/rejected": -2.90625, "logps/chosen": -240.0, "logps/rejected": -318.0, "loss": 0.8579, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.09375, "rewards/margins": 1.078125, "rewards/rejected": -2.171875, "step": 2620 }, { "epoch": 0.2019969278033794, "grad_norm": 13.610948295507129, "learning_rate": 4.843214318802219e-07, "logits/chosen": -2.96875, "logits/rejected": -2.84375, "logps/chosen": -268.0, "logps/rejected": -386.0, "loss": 0.8435, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.2578125, "rewards/margins": 1.296875, "rewards/rejected": -2.546875, "step": 2630 }, { "epoch": 0.20276497695852536, "grad_norm": 14.458312266342926, "learning_rate": 4.840869665952891e-07, "logits/chosen": -2.984375, "logits/rejected": -2.984375, "logps/chosen": -241.0, "logps/rejected": -346.0, "loss": 0.871, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.2109375, "rewards/margins": 1.140625, "rewards/rejected": -2.359375, "step": 2640 }, { "epoch": 0.20353302611367127, "grad_norm": 12.47494794873362, "learning_rate": 4.838508187535653e-07, "logits/chosen": -3.203125, "logits/rejected": -3.046875, "logps/chosen": -248.0, "logps/rejected": -370.0, "loss": 0.8535, "rewards/accuracies": 0.84375, "rewards/chosen": -1.15625, "rewards/margins": 1.203125, "rewards/rejected": -2.359375, "step": 2650 }, { "epoch": 0.20430107526881722, "grad_norm": 14.041632857662226, "learning_rate": 4.836129900524205e-07, "logits/chosen": -3.140625, "logits/rejected": -3.078125, "logps/chosen": -260.0, "logps/rejected": -360.0, "loss": 0.887, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.359375, "rewards/margins": 1.1640625, "rewards/rejected": -2.53125, "step": 2660 }, { "epoch": 0.20506912442396313, "grad_norm": 15.077531947605536, "learning_rate": 4.833734822013058e-07, "logits/chosen": -3.046875, "logits/rejected": -2.96875, "logps/chosen": -288.0, "logps/rejected": -380.0, "loss": 0.8893, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5234375, "rewards/margins": 1.1015625, "rewards/rejected": -2.625, "step": 2670 }, { "epoch": 0.20583717357910905, "grad_norm": 11.519578439253102, "learning_rate": 4.83132296921742e-07, "logits/chosen": -3.125, "logits/rejected": -2.984375, "logps/chosen": -258.0, "logps/rejected": -356.0, "loss": 0.8616, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.390625, "rewards/margins": 1.171875, "rewards/rejected": -2.5625, "step": 2680 }, { "epoch": 0.206605222734255, "grad_norm": 16.976793045978372, "learning_rate": 4.828894359473069e-07, "logits/chosen": -3.125, "logits/rejected": -3.03125, "logps/chosen": -249.0, "logps/rejected": -368.0, "loss": 0.8661, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1875, "rewards/margins": 1.3046875, "rewards/rejected": -2.484375, "step": 2690 }, { "epoch": 0.2073732718894009, "grad_norm": 17.57828601544212, "learning_rate": 4.826449010236225e-07, "logits/chosen": -3.0625, "logits/rejected": -2.953125, "logps/chosen": -245.0, "logps/rejected": -370.0, "loss": 0.8662, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.078125, "rewards/margins": 1.3125, "rewards/rejected": -2.390625, "step": 2700 }, { "epoch": 0.20814132104454686, "grad_norm": 15.572432673756335, "learning_rate": 4.823986939083426e-07, "logits/chosen": -3.015625, "logits/rejected": -3.0, "logps/chosen": -272.0, "logps/rejected": -370.0, "loss": 0.8732, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.34375, "rewards/margins": 1.046875, "rewards/rejected": -2.390625, "step": 2710 }, { "epoch": 0.20890937019969277, "grad_norm": 16.800254159746338, "learning_rate": 4.821508163711408e-07, "logits/chosen": -2.96875, "logits/rejected": -3.0, "logps/chosen": -240.0, "logps/rejected": -342.0, "loss": 0.8686, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.28125, "rewards/margins": 1.171875, "rewards/rejected": -2.453125, "step": 2720 }, { "epoch": 0.20967741935483872, "grad_norm": 14.498133387262694, "learning_rate": 4.819012701936969e-07, "logits/chosen": -3.046875, "logits/rejected": -3.0625, "logps/chosen": -251.0, "logps/rejected": -388.0, "loss": 0.8635, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.328125, "rewards/margins": 1.2890625, "rewards/rejected": -2.609375, "step": 2730 }, { "epoch": 0.21044546850998463, "grad_norm": 15.219178288565878, "learning_rate": 4.816500571696844e-07, "logits/chosen": -3.0, "logits/rejected": -3.109375, "logps/chosen": -276.0, "logps/rejected": -354.0, "loss": 0.8743, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3203125, "rewards/margins": 1.1171875, "rewards/rejected": -2.4375, "step": 2740 }, { "epoch": 0.21121351766513058, "grad_norm": 14.503446623121269, "learning_rate": 4.81397179104758e-07, "logits/chosen": -3.0625, "logits/rejected": -3.125, "logps/chosen": -258.0, "logps/rejected": -390.0, "loss": 0.8622, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.328125, "rewards/margins": 1.3984375, "rewards/rejected": -2.71875, "step": 2750 }, { "epoch": 0.2119815668202765, "grad_norm": 13.436271593105287, "learning_rate": 4.811426378165398e-07, "logits/chosen": -3.1875, "logits/rejected": -2.953125, "logps/chosen": -290.0, "logps/rejected": -396.0, "loss": 0.8321, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.5546875, "rewards/margins": 1.1953125, "rewards/rejected": -2.75, "step": 2760 }, { "epoch": 0.21274961597542244, "grad_norm": 13.300139354950735, "learning_rate": 4.80886435134607e-07, "logits/chosen": -3.09375, "logits/rejected": -3.15625, "logps/chosen": -264.0, "logps/rejected": -390.0, "loss": 0.8473, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.3828125, "rewards/margins": 1.359375, "rewards/rejected": -2.734375, "step": 2770 }, { "epoch": 0.21351766513056836, "grad_norm": 15.09961769528304, "learning_rate": 4.806285729004786e-07, "logits/chosen": -3.140625, "logits/rejected": -3.125, "logps/chosen": -254.0, "logps/rejected": -348.0, "loss": 0.8753, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3125, "rewards/margins": 1.0078125, "rewards/rejected": -2.328125, "step": 2780 }, { "epoch": 0.21428571428571427, "grad_norm": 13.005222897202875, "learning_rate": 4.803690529676019e-07, "logits/chosen": -3.203125, "logits/rejected": -3.046875, "logps/chosen": -280.0, "logps/rejected": -396.0, "loss": 0.8439, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5625, "rewards/margins": 1.171875, "rewards/rejected": -2.734375, "step": 2790 }, { "epoch": 0.21505376344086022, "grad_norm": 15.626921781464608, "learning_rate": 4.801078772013392e-07, "logits/chosen": -3.125, "logits/rejected": -3.109375, "logps/chosen": -284.0, "logps/rejected": -380.0, "loss": 0.8282, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.46875, "rewards/margins": 1.1796875, "rewards/rejected": -2.65625, "step": 2800 }, { "epoch": 0.21582181259600614, "grad_norm": 14.677501782633808, "learning_rate": 4.798450474789547e-07, "logits/chosen": -3.125, "logits/rejected": -3.140625, "logps/chosen": -268.0, "logps/rejected": -378.0, "loss": 0.8846, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3125, "rewards/margins": 1.2265625, "rewards/rejected": -2.546875, "step": 2810 }, { "epoch": 0.21658986175115208, "grad_norm": 13.119446379740348, "learning_rate": 4.795805656896005e-07, "logits/chosen": -3.03125, "logits/rejected": -3.125, "logps/chosen": -266.0, "logps/rejected": -366.0, "loss": 0.8947, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.1796875, "rewards/margins": 1.171875, "rewards/rejected": -2.359375, "step": 2820 }, { "epoch": 0.217357910906298, "grad_norm": 13.988549058667006, "learning_rate": 4.793144337343037e-07, "logits/chosen": -3.0625, "logits/rejected": -2.921875, "logps/chosen": -235.0, "logps/rejected": -366.0, "loss": 0.8349, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.1796875, "rewards/margins": 1.40625, "rewards/rejected": -2.59375, "step": 2830 }, { "epoch": 0.21812596006144394, "grad_norm": 16.32325223492421, "learning_rate": 4.790466535259523e-07, "logits/chosen": -3.015625, "logits/rejected": -3.078125, "logps/chosen": -247.0, "logps/rejected": -360.0, "loss": 0.8336, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3046875, "rewards/margins": 1.2734375, "rewards/rejected": -2.578125, "step": 2840 }, { "epoch": 0.21889400921658986, "grad_norm": 15.128503011721632, "learning_rate": 4.787772269892813e-07, "logits/chosen": -3.0, "logits/rejected": -3.078125, "logps/chosen": -247.0, "logps/rejected": -346.0, "loss": 0.8418, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.203125, "rewards/margins": 1.2265625, "rewards/rejected": -2.421875, "step": 2850 }, { "epoch": 0.2196620583717358, "grad_norm": 17.045951069178003, "learning_rate": 4.785061560608592e-07, "logits/chosen": -3.140625, "logits/rejected": -3.15625, "logps/chosen": -250.0, "logps/rejected": -348.0, "loss": 0.8964, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.3359375, "rewards/margins": 1.125, "rewards/rejected": -2.46875, "step": 2860 }, { "epoch": 0.22043010752688172, "grad_norm": 17.460779405047315, "learning_rate": 4.78233442689074e-07, "logits/chosen": -3.046875, "logits/rejected": -2.953125, "logps/chosen": -256.0, "logps/rejected": -362.0, "loss": 0.8505, "rewards/accuracies": 0.78125, "rewards/chosen": -1.25, "rewards/margins": 1.203125, "rewards/rejected": -2.4375, "step": 2870 }, { "epoch": 0.22119815668202766, "grad_norm": 13.464954251175703, "learning_rate": 4.779590888341192e-07, "logits/chosen": -3.203125, "logits/rejected": -3.265625, "logps/chosen": -258.0, "logps/rejected": -372.0, "loss": 0.8806, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3046875, "rewards/margins": 1.2421875, "rewards/rejected": -2.546875, "step": 2880 }, { "epoch": 0.22196620583717358, "grad_norm": 16.01545986224394, "learning_rate": 4.776830964679796e-07, "logits/chosen": -3.125, "logits/rejected": -3.078125, "logps/chosen": -260.0, "logps/rejected": -390.0, "loss": 0.8195, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3203125, "rewards/margins": 1.4609375, "rewards/rejected": -2.78125, "step": 2890 }, { "epoch": 0.2227342549923195, "grad_norm": 15.548853022161012, "learning_rate": 4.774054675744172e-07, "logits/chosen": -3.109375, "logits/rejected": -3.140625, "logps/chosen": -266.0, "logps/rejected": -372.0, "loss": 0.8437, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.421875, "rewards/margins": 1.28125, "rewards/rejected": -2.703125, "step": 2900 }, { "epoch": 0.22350230414746544, "grad_norm": 16.187941952950606, "learning_rate": 4.771262041489569e-07, "logits/chosen": -3.203125, "logits/rejected": -3.203125, "logps/chosen": -249.0, "logps/rejected": -356.0, "loss": 0.8608, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4375, "rewards/margins": 1.2109375, "rewards/rejected": -2.65625, "step": 2910 }, { "epoch": 0.22427035330261136, "grad_norm": 16.06638034877557, "learning_rate": 4.768453081988723e-07, "logits/chosen": -3.109375, "logits/rejected": -3.0, "logps/chosen": -282.0, "logps/rejected": -392.0, "loss": 0.8762, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.28125, "rewards/margins": 1.3046875, "rewards/rejected": -2.59375, "step": 2920 }, { "epoch": 0.2250384024577573, "grad_norm": 13.410443800768205, "learning_rate": 4.765627817431709e-07, "logits/chosen": -3.21875, "logits/rejected": -3.09375, "logps/chosen": -228.0, "logps/rejected": -354.0, "loss": 0.9219, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.203125, "rewards/margins": 1.2734375, "rewards/rejected": -2.46875, "step": 2930 }, { "epoch": 0.22580645161290322, "grad_norm": 14.174715814534345, "learning_rate": 4.7627862681258027e-07, "logits/chosen": -3.203125, "logits/rejected": -2.953125, "logps/chosen": -247.0, "logps/rejected": -390.0, "loss": 0.7857, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.125, "rewards/margins": 1.6015625, "rewards/rejected": -2.734375, "step": 2940 }, { "epoch": 0.22657450076804916, "grad_norm": 15.44466073079211, "learning_rate": 4.759928454495328e-07, "logits/chosen": -3.25, "logits/rejected": -3.09375, "logps/chosen": -274.0, "logps/rejected": -458.0, "loss": 0.8043, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4375, "rewards/margins": 2.015625, "rewards/rejected": -3.453125, "step": 2950 }, { "epoch": 0.22734254992319508, "grad_norm": 13.889523601116933, "learning_rate": 4.7570543970815114e-07, "logits/chosen": -3.203125, "logits/rejected": -3.1875, "logps/chosen": -274.0, "logps/rejected": -368.0, "loss": 0.857, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.46875, "rewards/margins": 1.265625, "rewards/rejected": -2.734375, "step": 2960 }, { "epoch": 0.22811059907834103, "grad_norm": 12.206125706350615, "learning_rate": 4.754164116542339e-07, "logits/chosen": -3.203125, "logits/rejected": -3.171875, "logps/chosen": -232.0, "logps/rejected": -382.0, "loss": 0.8358, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.0703125, "rewards/margins": 1.4921875, "rewards/rejected": -2.5625, "step": 2970 }, { "epoch": 0.22887864823348694, "grad_norm": 17.270252076450138, "learning_rate": 4.751257633652401e-07, "logits/chosen": -3.234375, "logits/rejected": -3.296875, "logps/chosen": -255.0, "logps/rejected": -336.0, "loss": 0.8989, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1875, "rewards/margins": 1.078125, "rewards/rejected": -2.265625, "step": 2980 }, { "epoch": 0.22964669738863286, "grad_norm": 17.19605730502409, "learning_rate": 4.7483349693027474e-07, "logits/chosen": -3.265625, "logits/rejected": -3.09375, "logps/chosen": -250.0, "logps/rejected": -372.0, "loss": 0.8559, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4140625, "rewards/margins": 1.2578125, "rewards/rejected": -2.671875, "step": 2990 }, { "epoch": 0.2304147465437788, "grad_norm": 14.427143284908615, "learning_rate": 4.7453961445007375e-07, "logits/chosen": -3.21875, "logits/rejected": -3.234375, "logps/chosen": -280.0, "logps/rejected": -386.0, "loss": 0.8657, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.578125, "rewards/margins": 1.1875, "rewards/rejected": -2.765625, "step": 3000 }, { "epoch": 0.2304147465437788, "eval_logits/chosen": -3.203125, "eval_logits/rejected": -3.21875, "eval_logps/chosen": -284.0, "eval_logps/rejected": -364.0, "eval_loss": 0.46712711453437805, "eval_rewards/accuracies": 0.7584134340286255, "eval_rewards/chosen": -1.4375, "eval_rewards/margins": 1.078125, "eval_rewards/rejected": -2.515625, "eval_runtime": 2262.5076, "eval_samples_per_second": 41.165, "eval_steps_per_second": 0.644, "step": 3000 }, { "epoch": 0.23118279569892472, "grad_norm": 14.54817963735278, "learning_rate": 4.7424411803698855e-07, "logits/chosen": -3.171875, "logits/rejected": -3.203125, "logps/chosen": -231.0, "logps/rejected": -342.0, "loss": 0.8688, "rewards/accuracies": 0.8125, "rewards/chosen": -1.140625, "rewards/margins": 1.171875, "rewards/rejected": -2.3125, "step": 3010 }, { "epoch": 0.23195084485407066, "grad_norm": 16.52494972069348, "learning_rate": 4.7394700981497125e-07, "logits/chosen": -3.296875, "logits/rejected": -3.328125, "logps/chosen": -247.0, "logps/rejected": -354.0, "loss": 0.8777, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.28125, "rewards/margins": 1.15625, "rewards/rejected": -2.4375, "step": 3020 }, { "epoch": 0.23271889400921658, "grad_norm": 13.575130324364173, "learning_rate": 4.7364829191955925e-07, "logits/chosen": -3.203125, "logits/rejected": -3.109375, "logps/chosen": -274.0, "logps/rejected": -384.0, "loss": 0.8639, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.546875, "rewards/margins": 1.1328125, "rewards/rejected": -2.6875, "step": 3030 }, { "epoch": 0.23348694316436253, "grad_norm": 17.011782351976304, "learning_rate": 4.733479664978596e-07, "logits/chosen": -3.21875, "logits/rejected": -3.28125, "logps/chosen": -270.0, "logps/rejected": -380.0, "loss": 0.8359, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.453125, "rewards/margins": 1.3046875, "rewards/rejected": -2.75, "step": 3040 }, { "epoch": 0.23425499231950844, "grad_norm": 122.2945980370441, "learning_rate": 4.730460357085343e-07, "logits/chosen": -3.390625, "logits/rejected": -3.515625, "logps/chosen": -276.0, "logps/rejected": -408.0, "loss": 0.8925, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.453125, "rewards/margins": 1.5234375, "rewards/rejected": -2.984375, "step": 3050 }, { "epoch": 0.2350230414746544, "grad_norm": 13.498196090474455, "learning_rate": 4.727425017217839e-07, "logits/chosen": -3.21875, "logits/rejected": -3.109375, "logps/chosen": -270.0, "logps/rejected": -392.0, "loss": 0.8543, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3515625, "rewards/margins": 1.2734375, "rewards/rejected": -2.625, "step": 3060 }, { "epoch": 0.2357910906298003, "grad_norm": 14.8703568645001, "learning_rate": 4.7243736671933245e-07, "logits/chosen": -3.1875, "logits/rejected": -3.140625, "logps/chosen": -284.0, "logps/rejected": -398.0, "loss": 0.8528, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5234375, "rewards/margins": 1.2890625, "rewards/rejected": -2.8125, "step": 3070 }, { "epoch": 0.23655913978494625, "grad_norm": 16.80417779869175, "learning_rate": 4.721306328944118e-07, "logits/chosen": -3.203125, "logits/rejected": -3.234375, "logps/chosen": -270.0, "logps/rejected": -394.0, "loss": 0.8511, "rewards/accuracies": 0.75, "rewards/chosen": -1.5078125, "rewards/margins": 1.3125, "rewards/rejected": -2.828125, "step": 3080 }, { "epoch": 0.23732718894009217, "grad_norm": 19.62467303056461, "learning_rate": 4.718223024517456e-07, "logits/chosen": -3.265625, "logits/rejected": -3.171875, "logps/chosen": -282.0, "logps/rejected": -414.0, "loss": 0.8285, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.46875, "rewards/margins": 1.5546875, "rewards/rejected": -3.03125, "step": 3090 }, { "epoch": 0.23809523809523808, "grad_norm": 14.519866902542967, "learning_rate": 4.715123776075336e-07, "logits/chosen": -3.1875, "logits/rejected": -3.109375, "logps/chosen": -284.0, "logps/rejected": -396.0, "loss": 0.7957, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.515625, "rewards/margins": 1.234375, "rewards/rejected": -2.75, "step": 3100 }, { "epoch": 0.23886328725038403, "grad_norm": 14.551438017195222, "learning_rate": 4.7120086058943576e-07, "logits/chosen": -3.328125, "logits/rejected": -3.40625, "logps/chosen": -278.0, "logps/rejected": -412.0, "loss": 0.8386, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4765625, "rewards/margins": 1.546875, "rewards/rejected": -3.03125, "step": 3110 }, { "epoch": 0.23963133640552994, "grad_norm": 14.223319441267623, "learning_rate": 4.70887753636556e-07, "logits/chosen": -3.28125, "logits/rejected": -3.40625, "logps/chosen": -300.0, "logps/rejected": -426.0, "loss": 0.8183, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.53125, "rewards/margins": 1.3515625, "rewards/rejected": -2.875, "step": 3120 }, { "epoch": 0.2403993855606759, "grad_norm": 17.277682504280243, "learning_rate": 4.705730589994266e-07, "logits/chosen": -3.28125, "logits/rejected": -3.34375, "logps/chosen": -276.0, "logps/rejected": -422.0, "loss": 0.8081, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4609375, "rewards/margins": 1.6171875, "rewards/rejected": -3.078125, "step": 3130 }, { "epoch": 0.2411674347158218, "grad_norm": 14.101732782470322, "learning_rate": 4.7025677893999136e-07, "logits/chosen": -3.1875, "logits/rejected": -3.15625, "logps/chosen": -253.0, "logps/rejected": -388.0, "loss": 0.8144, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.328125, "rewards/margins": 1.375, "rewards/rejected": -2.703125, "step": 3140 }, { "epoch": 0.24193548387096775, "grad_norm": 13.978352846751093, "learning_rate": 4.6993891573159e-07, "logits/chosen": -3.28125, "logits/rejected": -3.25, "logps/chosen": -241.0, "logps/rejected": -376.0, "loss": 0.8173, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.2890625, "rewards/margins": 1.375, "rewards/rejected": -2.65625, "step": 3150 }, { "epoch": 0.24270353302611367, "grad_norm": 15.937292595356313, "learning_rate": 4.6961947165894116e-07, "logits/chosen": -3.234375, "logits/rejected": -3.28125, "logps/chosen": -262.0, "logps/rejected": -386.0, "loss": 0.8429, "rewards/accuracies": 0.8125, "rewards/chosen": -1.34375, "rewards/margins": 1.375, "rewards/rejected": -2.71875, "step": 3160 }, { "epoch": 0.2434715821812596, "grad_norm": 15.510913746166223, "learning_rate": 4.6929844901812665e-07, "logits/chosen": -3.15625, "logits/rejected": -3.046875, "logps/chosen": -243.0, "logps/rejected": -378.0, "loss": 0.8645, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.34375, "rewards/margins": 1.2734375, "rewards/rejected": -2.625, "step": 3170 }, { "epoch": 0.24423963133640553, "grad_norm": 18.32361721126205, "learning_rate": 4.689758501165744e-07, "logits/chosen": -3.203125, "logits/rejected": -3.25, "logps/chosen": -249.0, "logps/rejected": -374.0, "loss": 0.8415, "rewards/accuracies": 0.78125, "rewards/chosen": -1.234375, "rewards/margins": 1.4453125, "rewards/rejected": -2.671875, "step": 3180 }, { "epoch": 0.24500768049155147, "grad_norm": 17.234518597147606, "learning_rate": 4.6865167727304223e-07, "logits/chosen": -3.171875, "logits/rejected": -3.078125, "logps/chosen": -258.0, "logps/rejected": -386.0, "loss": 0.8362, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3203125, "rewards/margins": 1.359375, "rewards/rejected": -2.671875, "step": 3190 }, { "epoch": 0.2457757296466974, "grad_norm": 16.284378583620523, "learning_rate": 4.6832593281760096e-07, "logits/chosen": -3.140625, "logits/rejected": -3.140625, "logps/chosen": -270.0, "logps/rejected": -372.0, "loss": 0.8809, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4375, "rewards/margins": 1.1640625, "rewards/rejected": -2.59375, "step": 3200 }, { "epoch": 0.2465437788018433, "grad_norm": 13.204502397280224, "learning_rate": 4.6799861909161786e-07, "logits/chosen": -3.140625, "logits/rejected": -3.078125, "logps/chosen": -266.0, "logps/rejected": -378.0, "loss": 0.8391, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4375, "rewards/margins": 1.125, "rewards/rejected": -2.5625, "step": 3210 }, { "epoch": 0.24731182795698925, "grad_norm": 16.71076941495318, "learning_rate": 4.676697384477395e-07, "logits/chosen": -3.171875, "logits/rejected": -3.171875, "logps/chosen": -244.0, "logps/rejected": -410.0, "loss": 0.8291, "rewards/accuracies": 0.875, "rewards/chosen": -1.2734375, "rewards/margins": 1.6640625, "rewards/rejected": -2.9375, "step": 3220 }, { "epoch": 0.24807987711213517, "grad_norm": 14.05156099024015, "learning_rate": 4.6733929324987544e-07, "logits/chosen": -3.109375, "logits/rejected": -3.015625, "logps/chosen": -238.0, "logps/rejected": -362.0, "loss": 0.8089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1875, "rewards/margins": 1.4296875, "rewards/rejected": -2.625, "step": 3230 }, { "epoch": 0.2488479262672811, "grad_norm": 16.709661332955324, "learning_rate": 4.670072858731803e-07, "logits/chosen": -3.21875, "logits/rejected": -3.0, "logps/chosen": -286.0, "logps/rejected": -482.0, "loss": 0.7926, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6171875, "rewards/margins": 1.375, "rewards/rejected": -2.984375, "step": 3240 }, { "epoch": 0.24961597542242703, "grad_norm": 18.590064325915822, "learning_rate": 4.666737187040378e-07, "logits/chosen": -3.265625, "logits/rejected": -3.09375, "logps/chosen": -286.0, "logps/rejected": -418.0, "loss": 0.8355, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6875, "rewards/margins": 1.4375, "rewards/rejected": -3.125, "step": 3250 }, { "epoch": 0.250384024577573, "grad_norm": 17.89518365493759, "learning_rate": 4.663385941400426e-07, "logits/chosen": -3.171875, "logits/rejected": -3.234375, "logps/chosen": -276.0, "logps/rejected": -394.0, "loss": 0.8672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5546875, "rewards/margins": 1.328125, "rewards/rejected": -2.890625, "step": 3260 }, { "epoch": 0.2511520737327189, "grad_norm": 14.902968075270614, "learning_rate": 4.660019145899839e-07, "logits/chosen": -3.203125, "logits/rejected": -3.0625, "logps/chosen": -286.0, "logps/rejected": -376.0, "loss": 0.8224, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.4296875, "rewards/margins": 1.2109375, "rewards/rejected": -2.640625, "step": 3270 }, { "epoch": 0.2519201228878648, "grad_norm": 15.65187817510133, "learning_rate": 4.656636824738274e-07, "logits/chosen": -3.203125, "logits/rejected": -3.09375, "logps/chosen": -253.0, "logps/rejected": -396.0, "loss": 0.85, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3046875, "rewards/margins": 1.515625, "rewards/rejected": -2.828125, "step": 3280 }, { "epoch": 0.25268817204301075, "grad_norm": 15.70216750453606, "learning_rate": 4.653239002226984e-07, "logits/chosen": -3.203125, "logits/rejected": -3.046875, "logps/chosen": -274.0, "logps/rejected": -398.0, "loss": 0.8754, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4453125, "rewards/margins": 1.2890625, "rewards/rejected": -2.734375, "step": 3290 }, { "epoch": 0.2534562211981567, "grad_norm": 14.771577456731363, "learning_rate": 4.6498257027886424e-07, "logits/chosen": -3.1875, "logits/rejected": -2.90625, "logps/chosen": -282.0, "logps/rejected": -412.0, "loss": 0.8192, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.390625, "rewards/margins": 1.390625, "rewards/rejected": -2.78125, "step": 3300 }, { "epoch": 0.2542242703533026, "grad_norm": 14.099213238804644, "learning_rate": 4.6463969509571643e-07, "logits/chosen": -3.25, "logits/rejected": -3.078125, "logps/chosen": -304.0, "logps/rejected": -406.0, "loss": 0.8398, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.7421875, "rewards/margins": 1.0234375, "rewards/rejected": -2.765625, "step": 3310 }, { "epoch": 0.25499231950844853, "grad_norm": 18.347542098767715, "learning_rate": 4.642952771377536e-07, "logits/chosen": -2.984375, "logits/rejected": -3.015625, "logps/chosen": -292.0, "logps/rejected": -410.0, "loss": 0.8541, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5546875, "rewards/margins": 1.3828125, "rewards/rejected": -2.9375, "step": 3320 }, { "epoch": 0.2557603686635945, "grad_norm": 17.018541965965486, "learning_rate": 4.639493188805632e-07, "logits/chosen": -3.265625, "logits/rejected": -3.140625, "logps/chosen": -302.0, "logps/rejected": -458.0, "loss": 0.8176, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.53125, "rewards/margins": 1.6953125, "rewards/rejected": -3.21875, "step": 3330 }, { "epoch": 0.2565284178187404, "grad_norm": 16.73048755891279, "learning_rate": 4.6360182281080406e-07, "logits/chosen": -3.21875, "logits/rejected": -3.1875, "logps/chosen": -254.0, "logps/rejected": -388.0, "loss": 0.7986, "rewards/accuracies": 0.8125, "rewards/chosen": -1.375, "rewards/margins": 1.4765625, "rewards/rejected": -2.84375, "step": 3340 }, { "epoch": 0.2572964669738863, "grad_norm": 15.484100019845433, "learning_rate": 4.632527914261883e-07, "logits/chosen": -3.25, "logits/rejected": -3.203125, "logps/chosen": -288.0, "logps/rejected": -386.0, "loss": 0.854, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5546875, "rewards/margins": 1.15625, "rewards/rejected": -2.703125, "step": 3350 }, { "epoch": 0.25806451612903225, "grad_norm": 15.66296006289546, "learning_rate": 4.629022272354637e-07, "logits/chosen": -3.140625, "logits/rejected": -3.1875, "logps/chosen": -292.0, "logps/rejected": -390.0, "loss": 0.8264, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.6015625, "rewards/margins": 1.1328125, "rewards/rejected": -2.734375, "step": 3360 }, { "epoch": 0.2588325652841782, "grad_norm": 16.302139979780016, "learning_rate": 4.6255013275839516e-07, "logits/chosen": -3.296875, "logits/rejected": -3.296875, "logps/chosen": -266.0, "logps/rejected": -412.0, "loss": 0.8146, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5703125, "rewards/margins": 1.453125, "rewards/rejected": -3.03125, "step": 3370 }, { "epoch": 0.25960061443932414, "grad_norm": 15.763187890704978, "learning_rate": 4.621965105257472e-07, "logits/chosen": -3.3125, "logits/rejected": -3.1875, "logps/chosen": -250.0, "logps/rejected": -404.0, "loss": 0.8246, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.421875, "rewards/margins": 1.4375, "rewards/rejected": -2.859375, "step": 3380 }, { "epoch": 0.26036866359447003, "grad_norm": 19.90362862569454, "learning_rate": 4.6184136307926527e-07, "logits/chosen": -3.21875, "logits/rejected": -3.34375, "logps/chosen": -296.0, "logps/rejected": -406.0, "loss": 0.8758, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.6953125, "rewards/margins": 1.3671875, "rewards/rejected": -3.0625, "step": 3390 }, { "epoch": 0.261136712749616, "grad_norm": 17.372905455783467, "learning_rate": 4.614846929716575e-07, "logits/chosen": -3.171875, "logits/rejected": -3.078125, "logps/chosen": -258.0, "logps/rejected": -406.0, "loss": 0.8255, "rewards/accuracies": 0.8125, "rewards/chosen": -1.453125, "rewards/margins": 1.40625, "rewards/rejected": -2.859375, "step": 3400 }, { "epoch": 0.2619047619047619, "grad_norm": 13.938629387693796, "learning_rate": 4.6112650276657696e-07, "logits/chosen": -3.265625, "logits/rejected": -3.171875, "logps/chosen": -264.0, "logps/rejected": -410.0, "loss": 0.8398, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5390625, "rewards/margins": 1.3046875, "rewards/rejected": -2.84375, "step": 3410 }, { "epoch": 0.2626728110599078, "grad_norm": 16.724810659671917, "learning_rate": 4.607667950386024e-07, "logits/chosen": -3.140625, "logits/rejected": -3.1875, "logps/chosen": -284.0, "logps/rejected": -384.0, "loss": 0.8087, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6015625, "rewards/margins": 1.2578125, "rewards/rejected": -2.859375, "step": 3420 }, { "epoch": 0.26344086021505375, "grad_norm": 17.968704263295884, "learning_rate": 4.604055723732203e-07, "logits/chosen": -3.140625, "logits/rejected": -3.09375, "logps/chosen": -253.0, "logps/rejected": -380.0, "loss": 0.8433, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3828125, "rewards/margins": 1.21875, "rewards/rejected": -2.609375, "step": 3430 }, { "epoch": 0.2642089093701997, "grad_norm": 16.36463939583393, "learning_rate": 4.6004283736680614e-07, "logits/chosen": -3.25, "logits/rejected": -3.28125, "logps/chosen": -306.0, "logps/rejected": -428.0, "loss": 0.8242, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.71875, "rewards/margins": 1.3828125, "rewards/rejected": -3.09375, "step": 3440 }, { "epoch": 0.26497695852534564, "grad_norm": 17.483357874094864, "learning_rate": 4.5967859262660557e-07, "logits/chosen": -3.296875, "logits/rejected": -3.25, "logps/chosen": -320.0, "logps/rejected": -446.0, "loss": 0.8419, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.765625, "rewards/margins": 1.3671875, "rewards/rejected": -3.140625, "step": 3450 }, { "epoch": 0.26574500768049153, "grad_norm": 16.353143375729818, "learning_rate": 4.59312840770716e-07, "logits/chosen": -3.140625, "logits/rejected": -3.15625, "logps/chosen": -296.0, "logps/rejected": -396.0, "loss": 0.8553, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6640625, "rewards/margins": 1.1875, "rewards/rejected": -2.859375, "step": 3460 }, { "epoch": 0.2665130568356375, "grad_norm": 15.169770677014604, "learning_rate": 4.589455844280675e-07, "logits/chosen": -3.140625, "logits/rejected": -3.046875, "logps/chosen": -272.0, "logps/rejected": -434.0, "loss": 0.8199, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3671875, "rewards/margins": 1.8203125, "rewards/rejected": -3.1875, "step": 3470 }, { "epoch": 0.2672811059907834, "grad_norm": 15.705821767654236, "learning_rate": 4.58576826238404e-07, "logits/chosen": -3.234375, "logits/rejected": -3.125, "logps/chosen": -242.0, "logps/rejected": -378.0, "loss": 0.8773, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.375, "rewards/margins": 1.2734375, "rewards/rejected": -2.65625, "step": 3480 }, { "epoch": 0.26804915514592936, "grad_norm": 15.235889368702642, "learning_rate": 4.5820656885226434e-07, "logits/chosen": -3.09375, "logits/rejected": -3.171875, "logps/chosen": -264.0, "logps/rejected": -360.0, "loss": 0.8721, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4609375, "rewards/margins": 1.0390625, "rewards/rejected": -2.5, "step": 3490 }, { "epoch": 0.26881720430107525, "grad_norm": 17.292914034928828, "learning_rate": 4.5783481493096323e-07, "logits/chosen": -3.203125, "logits/rejected": -3.25, "logps/chosen": -262.0, "logps/rejected": -390.0, "loss": 0.8666, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4296875, "rewards/margins": 1.390625, "rewards/rejected": -2.8125, "step": 3500 }, { "epoch": 0.2695852534562212, "grad_norm": 18.956439714830854, "learning_rate": 4.574615671465719e-07, "logits/chosen": -3.1875, "logits/rejected": -3.25, "logps/chosen": -278.0, "logps/rejected": -418.0, "loss": 0.8047, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.5390625, "rewards/margins": 1.5078125, "rewards/rejected": -3.046875, "step": 3510 }, { "epoch": 0.27035330261136714, "grad_norm": 14.965575073437368, "learning_rate": 4.570868281818995e-07, "logits/chosen": -3.09375, "logits/rejected": -3.140625, "logps/chosen": -310.0, "logps/rejected": -436.0, "loss": 0.8292, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.703125, "rewards/margins": 1.4453125, "rewards/rejected": -3.140625, "step": 3520 }, { "epoch": 0.27112135176651303, "grad_norm": 17.410583880241845, "learning_rate": 4.5671060073047287e-07, "logits/chosen": -3.3125, "logits/rejected": -3.21875, "logps/chosen": -274.0, "logps/rejected": -400.0, "loss": 0.8216, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.59375, "rewards/margins": 1.3828125, "rewards/rejected": -2.96875, "step": 3530 }, { "epoch": 0.271889400921659, "grad_norm": 17.133929743907547, "learning_rate": 4.563328874965182e-07, "logits/chosen": -3.234375, "logits/rejected": -3.28125, "logps/chosen": -274.0, "logps/rejected": -402.0, "loss": 0.8403, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.53125, "rewards/margins": 1.40625, "rewards/rejected": -2.9375, "step": 3540 }, { "epoch": 0.2726574500768049, "grad_norm": 15.610557280437975, "learning_rate": 4.5595369119494077e-07, "logits/chosen": -3.28125, "logits/rejected": -3.15625, "logps/chosen": -262.0, "logps/rejected": -388.0, "loss": 0.8046, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.3046875, "rewards/margins": 1.3671875, "rewards/rejected": -2.671875, "step": 3550 }, { "epoch": 0.27342549923195086, "grad_norm": 14.94968332759758, "learning_rate": 4.555730145513058e-07, "logits/chosen": -3.21875, "logits/rejected": -3.125, "logps/chosen": -288.0, "logps/rejected": -402.0, "loss": 0.8567, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.625, "rewards/margins": 1.3203125, "rewards/rejected": -2.953125, "step": 3560 }, { "epoch": 0.27419354838709675, "grad_norm": 16.734415770700714, "learning_rate": 4.551908603018191e-07, "logits/chosen": -3.171875, "logits/rejected": -3.328125, "logps/chosen": -302.0, "logps/rejected": -426.0, "loss": 0.859, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.640625, "rewards/margins": 1.296875, "rewards/rejected": -2.9375, "step": 3570 }, { "epoch": 0.2749615975422427, "grad_norm": 16.005835378395748, "learning_rate": 4.5480723119330673e-07, "logits/chosen": -3.0625, "logits/rejected": -2.984375, "logps/chosen": -286.0, "logps/rejected": -406.0, "loss": 0.8507, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5234375, "rewards/margins": 1.390625, "rewards/rejected": -2.90625, "step": 3580 }, { "epoch": 0.27572964669738864, "grad_norm": 15.152836905055745, "learning_rate": 4.5442212998319594e-07, "logits/chosen": -3.21875, "logits/rejected": -3.171875, "logps/chosen": -290.0, "logps/rejected": -418.0, "loss": 0.8331, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6015625, "rewards/margins": 1.390625, "rewards/rejected": -2.984375, "step": 3590 }, { "epoch": 0.2764976958525346, "grad_norm": 17.56762189031525, "learning_rate": 4.540355594394951e-07, "logits/chosen": -3.21875, "logits/rejected": -3.28125, "logps/chosen": -286.0, "logps/rejected": -420.0, "loss": 0.8153, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.53125, "rewards/margins": 1.5078125, "rewards/rejected": -3.046875, "step": 3600 }, { "epoch": 0.2772657450076805, "grad_norm": 14.342756176211262, "learning_rate": 4.536475223407735e-07, "logits/chosen": -3.28125, "logits/rejected": -3.171875, "logps/chosen": -296.0, "logps/rejected": -424.0, "loss": 0.8604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.609375, "rewards/margins": 1.3984375, "rewards/rejected": -3.015625, "step": 3610 }, { "epoch": 0.2780337941628264, "grad_norm": 18.56515277678491, "learning_rate": 4.5325802147614183e-07, "logits/chosen": -3.25, "logits/rejected": -3.21875, "logps/chosen": -256.0, "logps/rejected": -374.0, "loss": 0.8548, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3203125, "rewards/margins": 1.3046875, "rewards/rejected": -2.625, "step": 3620 }, { "epoch": 0.27880184331797236, "grad_norm": 17.357509024905056, "learning_rate": 4.52867059645232e-07, "logits/chosen": -3.3125, "logits/rejected": -3.3125, "logps/chosen": -260.0, "logps/rejected": -390.0, "loss": 0.8496, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4375, "rewards/margins": 1.421875, "rewards/rejected": -2.859375, "step": 3630 }, { "epoch": 0.27956989247311825, "grad_norm": 17.69920223314499, "learning_rate": 4.524746396581769e-07, "logits/chosen": -3.34375, "logits/rejected": -3.21875, "logps/chosen": -300.0, "logps/rejected": -416.0, "loss": 0.8376, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7578125, "rewards/margins": 1.359375, "rewards/rejected": -3.109375, "step": 3640 }, { "epoch": 0.2803379416282642, "grad_norm": 17.793985261061312, "learning_rate": 4.5208076433559014e-07, "logits/chosen": -3.28125, "logits/rejected": -3.296875, "logps/chosen": -284.0, "logps/rejected": -422.0, "loss": 0.8353, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.53125, "rewards/margins": 1.5390625, "rewards/rejected": -3.078125, "step": 3650 }, { "epoch": 0.28110599078341014, "grad_norm": 16.938392660671827, "learning_rate": 4.516854365085462e-07, "logits/chosen": -3.25, "logits/rejected": -3.234375, "logps/chosen": -253.0, "logps/rejected": -380.0, "loss": 0.8285, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.40625, "rewards/margins": 1.3046875, "rewards/rejected": -2.71875, "step": 3660 }, { "epoch": 0.2818740399385561, "grad_norm": 15.127620227087771, "learning_rate": 4.5128865901855946e-07, "logits/chosen": -3.171875, "logits/rejected": -3.109375, "logps/chosen": -256.0, "logps/rejected": -380.0, "loss": 0.8359, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4140625, "rewards/margins": 1.328125, "rewards/rejected": -2.75, "step": 3670 }, { "epoch": 0.282642089093702, "grad_norm": 14.405803711768018, "learning_rate": 4.508904347175644e-07, "logits/chosen": -3.1875, "logits/rejected": -3.15625, "logps/chosen": -274.0, "logps/rejected": -412.0, "loss": 0.8136, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.375, "rewards/margins": 1.609375, "rewards/rejected": -2.984375, "step": 3680 }, { "epoch": 0.2834101382488479, "grad_norm": 16.638165482873937, "learning_rate": 4.504907664678946e-07, "logits/chosen": -3.28125, "logits/rejected": -3.15625, "logps/chosen": -274.0, "logps/rejected": -408.0, "loss": 0.8241, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4453125, "rewards/margins": 1.4296875, "rewards/rejected": -2.875, "step": 3690 }, { "epoch": 0.28417818740399386, "grad_norm": 20.16970031151636, "learning_rate": 4.500896571422625e-07, "logits/chosen": -3.484375, "logits/rejected": -3.453125, "logps/chosen": -272.0, "logps/rejected": -428.0, "loss": 0.7895, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5859375, "rewards/margins": 1.53125, "rewards/rejected": -3.125, "step": 3700 }, { "epoch": 0.2849462365591398, "grad_norm": 16.847128709182265, "learning_rate": 4.496871096237386e-07, "logits/chosen": -3.328125, "logits/rejected": -3.28125, "logps/chosen": -292.0, "logps/rejected": -418.0, "loss": 0.8406, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.703125, "rewards/margins": 1.3984375, "rewards/rejected": -3.109375, "step": 3710 }, { "epoch": 0.2857142857142857, "grad_norm": 15.312276564124527, "learning_rate": 4.492831268057306e-07, "logits/chosen": -3.25, "logits/rejected": -3.125, "logps/chosen": -288.0, "logps/rejected": -410.0, "loss": 0.8323, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.59375, "rewards/margins": 1.296875, "rewards/rejected": -2.890625, "step": 3720 }, { "epoch": 0.28648233486943164, "grad_norm": 18.08768249993936, "learning_rate": 4.48877711591963e-07, "logits/chosen": -3.15625, "logits/rejected": -3.0625, "logps/chosen": -294.0, "logps/rejected": -412.0, "loss": 0.8293, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4453125, "rewards/margins": 1.3828125, "rewards/rejected": -2.828125, "step": 3730 }, { "epoch": 0.2872503840245776, "grad_norm": 17.097058077352788, "learning_rate": 4.4847086689645574e-07, "logits/chosen": -3.265625, "logits/rejected": -3.34375, "logps/chosen": -290.0, "logps/rejected": -416.0, "loss": 0.8382, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.65625, "rewards/margins": 1.4453125, "rewards/rejected": -3.09375, "step": 3740 }, { "epoch": 0.2880184331797235, "grad_norm": 16.872468976821935, "learning_rate": 4.480625956435038e-07, "logits/chosen": -3.3125, "logits/rejected": -3.296875, "logps/chosen": -306.0, "logps/rejected": -452.0, "loss": 0.8362, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6328125, "rewards/margins": 1.5859375, "rewards/rejected": -3.21875, "step": 3750 }, { "epoch": 0.2887864823348694, "grad_norm": 16.600937801591574, "learning_rate": 4.4765290076765564e-07, "logits/chosen": -3.3125, "logits/rejected": -3.203125, "logps/chosen": -290.0, "logps/rejected": -402.0, "loss": 0.8388, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5625, "rewards/margins": 1.296875, "rewards/rejected": -2.859375, "step": 3760 }, { "epoch": 0.28955453149001537, "grad_norm": 13.950436045156051, "learning_rate": 4.472417852136925e-07, "logits/chosen": -3.3125, "logits/rejected": -3.359375, "logps/chosen": -282.0, "logps/rejected": -386.0, "loss": 0.8103, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5234375, "rewards/margins": 1.390625, "rewards/rejected": -2.90625, "step": 3770 }, { "epoch": 0.2903225806451613, "grad_norm": 16.8802522061281, "learning_rate": 4.46829251936607e-07, "logits/chosen": -3.421875, "logits/rejected": -3.40625, "logps/chosen": -300.0, "logps/rejected": -448.0, "loss": 0.8261, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7109375, "rewards/margins": 1.640625, "rewards/rejected": -3.359375, "step": 3780 }, { "epoch": 0.2910906298003072, "grad_norm": 14.291740661499095, "learning_rate": 4.4641530390158203e-07, "logits/chosen": -3.296875, "logits/rejected": -3.328125, "logps/chosen": -288.0, "logps/rejected": -418.0, "loss": 0.8213, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6171875, "rewards/margins": 1.3984375, "rewards/rejected": -3.015625, "step": 3790 }, { "epoch": 0.29185867895545314, "grad_norm": 16.405875964191566, "learning_rate": 4.4599994408396945e-07, "logits/chosen": -3.375, "logits/rejected": -3.484375, "logps/chosen": -298.0, "logps/rejected": -388.0, "loss": 0.851, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.6328125, "rewards/margins": 1.1875, "rewards/rejected": -2.8125, "step": 3800 }, { "epoch": 0.2926267281105991, "grad_norm": 15.279153335067463, "learning_rate": 4.455831754692685e-07, "logits/chosen": -3.21875, "logits/rejected": -2.984375, "logps/chosen": -272.0, "logps/rejected": -408.0, "loss": 0.7836, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.59375, "rewards/margins": 1.3828125, "rewards/rejected": -2.984375, "step": 3810 }, { "epoch": 0.29339477726574503, "grad_norm": 19.627232038217517, "learning_rate": 4.451650010531046e-07, "logits/chosen": -3.453125, "logits/rejected": -3.46875, "logps/chosen": -282.0, "logps/rejected": -424.0, "loss": 0.8328, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5234375, "rewards/margins": 1.640625, "rewards/rejected": -3.171875, "step": 3820 }, { "epoch": 0.2941628264208909, "grad_norm": 13.740120000295743, "learning_rate": 4.447454238412077e-07, "logits/chosen": -3.15625, "logits/rejected": -3.28125, "logps/chosen": -292.0, "logps/rejected": -410.0, "loss": 0.8188, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.59375, "rewards/margins": 1.4765625, "rewards/rejected": -3.0625, "step": 3830 }, { "epoch": 0.29493087557603687, "grad_norm": 16.487294434699415, "learning_rate": 4.4432444684939075e-07, "logits/chosen": -3.390625, "logits/rejected": -3.46875, "logps/chosen": -270.0, "logps/rejected": -372.0, "loss": 0.8097, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4765625, "rewards/margins": 1.234375, "rewards/rejected": -2.71875, "step": 3840 }, { "epoch": 0.2956989247311828, "grad_norm": 18.771329149538232, "learning_rate": 4.439020731035278e-07, "logits/chosen": -3.265625, "logits/rejected": -3.453125, "logps/chosen": -244.0, "logps/rejected": -388.0, "loss": 0.7926, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.265625, "rewards/margins": 1.625, "rewards/rejected": -2.890625, "step": 3850 }, { "epoch": 0.2964669738863287, "grad_norm": 19.626453787161285, "learning_rate": 4.4347830563953247e-07, "logits/chosen": -3.40625, "logits/rejected": -3.5, "logps/chosen": -280.0, "logps/rejected": -390.0, "loss": 0.8255, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.546875, "rewards/margins": 1.375, "rewards/rejected": -2.921875, "step": 3860 }, { "epoch": 0.29723502304147464, "grad_norm": 17.625887192668216, "learning_rate": 4.430531475033361e-07, "logits/chosen": -3.359375, "logits/rejected": -3.296875, "logps/chosen": -290.0, "logps/rejected": -468.0, "loss": 0.7895, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5390625, "rewards/margins": 1.796875, "rewards/rejected": -3.34375, "step": 3870 }, { "epoch": 0.2980030721966206, "grad_norm": 18.158301834766917, "learning_rate": 4.4262660175086585e-07, "logits/chosen": -3.375, "logits/rejected": -3.421875, "logps/chosen": -266.0, "logps/rejected": -450.0, "loss": 0.7829, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.46875, "rewards/margins": 1.9375, "rewards/rejected": -3.40625, "step": 3880 }, { "epoch": 0.29877112135176653, "grad_norm": 14.60282202927116, "learning_rate": 4.421986714480226e-07, "logits/chosen": -3.390625, "logits/rejected": -3.28125, "logps/chosen": -284.0, "logps/rejected": -422.0, "loss": 0.8152, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.609375, "rewards/margins": 1.5, "rewards/rejected": -3.109375, "step": 3890 }, { "epoch": 0.2995391705069124, "grad_norm": 16.72853537102892, "learning_rate": 4.417693596706592e-07, "logits/chosen": -3.296875, "logits/rejected": -3.25, "logps/chosen": -272.0, "logps/rejected": -416.0, "loss": 0.7921, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.484375, "rewards/margins": 1.46875, "rewards/rejected": -2.953125, "step": 3900 }, { "epoch": 0.30030721966205837, "grad_norm": 14.375414596020308, "learning_rate": 4.4133866950455793e-07, "logits/chosen": -3.34375, "logits/rejected": -3.21875, "logps/chosen": -274.0, "logps/rejected": -390.0, "loss": 0.8234, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5, "rewards/margins": 1.21875, "rewards/rejected": -2.71875, "step": 3910 }, { "epoch": 0.3010752688172043, "grad_norm": 17.782365605290366, "learning_rate": 4.409066040454088e-07, "logits/chosen": -3.3125, "logits/rejected": -3.171875, "logps/chosen": -249.0, "logps/rejected": -394.0, "loss": 0.874, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4296875, "rewards/margins": 1.359375, "rewards/rejected": -2.796875, "step": 3920 }, { "epoch": 0.30184331797235026, "grad_norm": 15.366582313419237, "learning_rate": 4.4047316639878695e-07, "logits/chosen": -3.359375, "logits/rejected": -3.359375, "logps/chosen": -294.0, "logps/rejected": -402.0, "loss": 0.8907, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6953125, "rewards/margins": 1.1328125, "rewards/rejected": -2.828125, "step": 3930 }, { "epoch": 0.30261136712749614, "grad_norm": 14.52634895326098, "learning_rate": 4.400383596801306e-07, "logits/chosen": -3.328125, "logits/rejected": -3.359375, "logps/chosen": -278.0, "logps/rejected": -402.0, "loss": 0.7916, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.515625, "rewards/margins": 1.2421875, "rewards/rejected": -2.765625, "step": 3940 }, { "epoch": 0.3033794162826421, "grad_norm": 18.14233532869496, "learning_rate": 4.396021870147182e-07, "logits/chosen": -3.4375, "logits/rejected": -3.421875, "logps/chosen": -288.0, "logps/rejected": -422.0, "loss": 0.8625, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.515625, "rewards/margins": 1.546875, "rewards/rejected": -3.0625, "step": 3950 }, { "epoch": 0.30414746543778803, "grad_norm": 13.56792306029721, "learning_rate": 4.391646515376466e-07, "logits/chosen": -3.296875, "logits/rejected": -3.203125, "logps/chosen": -266.0, "logps/rejected": -410.0, "loss": 0.8401, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.546875, "rewards/margins": 1.484375, "rewards/rejected": -3.03125, "step": 3960 }, { "epoch": 0.3049155145929339, "grad_norm": 15.244605943213104, "learning_rate": 4.38725756393808e-07, "logits/chosen": -3.296875, "logits/rejected": -3.328125, "logps/chosen": -278.0, "logps/rejected": -396.0, "loss": 0.8172, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5078125, "rewards/margins": 1.3046875, "rewards/rejected": -2.8125, "step": 3970 }, { "epoch": 0.30568356374807987, "grad_norm": 16.7320565993339, "learning_rate": 4.3828550473786764e-07, "logits/chosen": -3.359375, "logits/rejected": -3.4375, "logps/chosen": -264.0, "logps/rejected": -384.0, "loss": 0.807, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.4375, "rewards/margins": 1.3046875, "rewards/rejected": -2.734375, "step": 3980 }, { "epoch": 0.3064516129032258, "grad_norm": 16.671461387717315, "learning_rate": 4.3784389973424084e-07, "logits/chosen": -3.46875, "logits/rejected": -3.4375, "logps/chosen": -260.0, "logps/rejected": -392.0, "loss": 0.8475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3515625, "rewards/margins": 1.28125, "rewards/rejected": -2.640625, "step": 3990 }, { "epoch": 0.30721966205837176, "grad_norm": 15.69357106035934, "learning_rate": 4.374009445570708e-07, "logits/chosen": -3.546875, "logits/rejected": -3.578125, "logps/chosen": -292.0, "logps/rejected": -428.0, "loss": 0.8038, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.78125, "rewards/margins": 1.3984375, "rewards/rejected": -3.1875, "step": 4000 }, { "epoch": 0.30721966205837176, "eval_logits/chosen": -3.53125, "eval_logits/rejected": -3.625, "eval_logps/chosen": -344.0, "eval_logps/rejected": -446.0, "eval_loss": 0.46870991587638855, "eval_rewards/accuracies": 0.7548935413360596, "eval_rewards/chosen": -2.03125, "eval_rewards/margins": 1.296875, "eval_rewards/rejected": -3.328125, "eval_runtime": 2263.1393, "eval_samples_per_second": 41.153, "eval_steps_per_second": 0.643, "step": 4000 }, { "epoch": 0.30798771121351765, "grad_norm": 16.617617358681475, "learning_rate": 4.369566423902049e-07, "logits/chosen": -3.515625, "logits/rejected": -3.671875, "logps/chosen": -308.0, "logps/rejected": -436.0, "loss": 0.8121, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.796875, "rewards/margins": 1.375, "rewards/rejected": -3.171875, "step": 4010 }, { "epoch": 0.3087557603686636, "grad_norm": 19.87581829707112, "learning_rate": 4.365109964271729e-07, "logits/chosen": -3.6875, "logits/rejected": -3.71875, "logps/chosen": -336.0, "logps/rejected": -464.0, "loss": 0.8237, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.3671875, "rewards/rejected": -3.515625, "step": 4020 }, { "epoch": 0.30952380952380953, "grad_norm": 21.56897559937465, "learning_rate": 4.360640098711629e-07, "logits/chosen": -3.625, "logits/rejected": -3.5, "logps/chosen": -280.0, "logps/rejected": -400.0, "loss": 0.8722, "rewards/accuracies": 0.75, "rewards/chosen": -1.71875, "rewards/margins": 1.09375, "rewards/rejected": -2.8125, "step": 4030 }, { "epoch": 0.3102918586789555, "grad_norm": 14.975397508924509, "learning_rate": 4.356156859349992e-07, "logits/chosen": -3.6875, "logits/rejected": -3.640625, "logps/chosen": -298.0, "logps/rejected": -418.0, "loss": 0.7794, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7421875, "rewards/margins": 1.2421875, "rewards/rejected": -2.984375, "step": 4040 }, { "epoch": 0.31105990783410137, "grad_norm": 20.365584256271053, "learning_rate": 4.351660278411187e-07, "logits/chosen": -3.640625, "logits/rejected": -3.75, "logps/chosen": -310.0, "logps/rejected": -434.0, "loss": 0.8426, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8046875, "rewards/margins": 1.4765625, "rewards/rejected": -3.28125, "step": 4050 }, { "epoch": 0.3118279569892473, "grad_norm": 16.081669513382575, "learning_rate": 4.3471503882154786e-07, "logits/chosen": -3.65625, "logits/rejected": -3.75, "logps/chosen": -280.0, "logps/rejected": -404.0, "loss": 0.7983, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.609375, "rewards/margins": 1.359375, "rewards/rejected": -2.96875, "step": 4060 }, { "epoch": 0.31259600614439326, "grad_norm": 19.71065776474869, "learning_rate": 4.342627221178796e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5625, "logps/chosen": -310.0, "logps/rejected": -420.0, "loss": 0.8116, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8046875, "rewards/margins": 1.3984375, "rewards/rejected": -3.203125, "step": 4070 }, { "epoch": 0.31336405529953915, "grad_norm": 17.44741799382549, "learning_rate": 4.3380908098124975e-07, "logits/chosen": -3.4375, "logits/rejected": -3.421875, "logps/chosen": -306.0, "logps/rejected": -464.0, "loss": 0.8313, "rewards/accuracies": 0.84375, "rewards/chosen": -1.7109375, "rewards/margins": 1.6484375, "rewards/rejected": -3.359375, "step": 4080 }, { "epoch": 0.3141321044546851, "grad_norm": 15.63838227081899, "learning_rate": 4.33354118672314e-07, "logits/chosen": -3.546875, "logits/rejected": -3.421875, "logps/chosen": -296.0, "logps/rejected": -434.0, "loss": 0.8249, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8984375, "rewards/margins": 1.3515625, "rewards/rejected": -3.25, "step": 4090 }, { "epoch": 0.31490015360983103, "grad_norm": 17.178958736997473, "learning_rate": 4.32897838461224e-07, "logits/chosen": -3.296875, "logits/rejected": -3.40625, "logps/chosen": -274.0, "logps/rejected": -394.0, "loss": 0.8433, "rewards/accuracies": 0.75, "rewards/chosen": -1.625, "rewards/margins": 1.265625, "rewards/rejected": -2.90625, "step": 4100 }, { "epoch": 0.315668202764977, "grad_norm": 21.507608941364772, "learning_rate": 4.3244024362760457e-07, "logits/chosen": -3.3125, "logits/rejected": -3.40625, "logps/chosen": -276.0, "logps/rejected": -380.0, "loss": 0.8557, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5078125, "rewards/margins": 1.2421875, "rewards/rejected": -2.75, "step": 4110 }, { "epoch": 0.31643625192012287, "grad_norm": 18.343761678194024, "learning_rate": 4.3198133746052923e-07, "logits/chosen": -3.40625, "logits/rejected": -3.453125, "logps/chosen": -258.0, "logps/rejected": -360.0, "loss": 0.8573, "rewards/accuracies": 0.78125, "rewards/chosen": -1.46875, "rewards/margins": 1.234375, "rewards/rejected": -2.703125, "step": 4120 }, { "epoch": 0.3172043010752688, "grad_norm": 14.882534179888047, "learning_rate": 4.3152112325849734e-07, "logits/chosen": -3.421875, "logits/rejected": -3.453125, "logps/chosen": -326.0, "logps/rejected": -436.0, "loss": 0.7995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8828125, "rewards/margins": 1.3671875, "rewards/rejected": -3.25, "step": 4130 }, { "epoch": 0.31797235023041476, "grad_norm": 15.201976488596957, "learning_rate": 4.310596043294099e-07, "logits/chosen": -3.5, "logits/rejected": -3.65625, "logps/chosen": -310.0, "logps/rejected": -440.0, "loss": 0.8005, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9609375, "rewards/margins": 1.40625, "rewards/rejected": -3.359375, "step": 4140 }, { "epoch": 0.3187403993855607, "grad_norm": 15.102623084879397, "learning_rate": 4.305967839905462e-07, "logits/chosen": -3.375, "logits/rejected": -3.4375, "logps/chosen": -284.0, "logps/rejected": -434.0, "loss": 0.798, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.59375, "rewards/margins": 1.53125, "rewards/rejected": -3.125, "step": 4150 }, { "epoch": 0.3195084485407066, "grad_norm": 16.458793528930567, "learning_rate": 4.3013266556853956e-07, "logits/chosen": -3.359375, "logits/rejected": -3.4375, "logps/chosen": -276.0, "logps/rejected": -400.0, "loss": 0.8478, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5703125, "rewards/margins": 1.1953125, "rewards/rejected": -2.765625, "step": 4160 }, { "epoch": 0.32027649769585254, "grad_norm": 17.75008163057227, "learning_rate": 4.296672523993534e-07, "logits/chosen": -3.609375, "logits/rejected": -3.5625, "logps/chosen": -280.0, "logps/rejected": -446.0, "loss": 0.8294, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.5859375, "rewards/margins": 1.59375, "rewards/rejected": -3.171875, "step": 4170 }, { "epoch": 0.3210445468509985, "grad_norm": 14.741252980617492, "learning_rate": 4.292005478282578e-07, "logits/chosen": -3.5625, "logits/rejected": -3.78125, "logps/chosen": -290.0, "logps/rejected": -418.0, "loss": 0.8361, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.71875, "rewards/margins": 1.4609375, "rewards/rejected": -3.1875, "step": 4180 }, { "epoch": 0.32181259600614437, "grad_norm": 14.696110541728833, "learning_rate": 4.287325552098049e-07, "logits/chosen": -3.4375, "logits/rejected": -3.4375, "logps/chosen": -306.0, "logps/rejected": -428.0, "loss": 0.8308, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.703125, "rewards/margins": 1.4453125, "rewards/rejected": -3.15625, "step": 4190 }, { "epoch": 0.3225806451612903, "grad_norm": 13.82195624234665, "learning_rate": 4.2826327790780505e-07, "logits/chosen": -3.4375, "logits/rejected": -3.609375, "logps/chosen": -250.0, "logps/rejected": -384.0, "loss": 0.8014, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.375, "rewards/margins": 1.4609375, "rewards/rejected": -2.828125, "step": 4200 }, { "epoch": 0.32334869431643626, "grad_norm": 15.875904900906727, "learning_rate": 4.277927192953026e-07, "logits/chosen": -3.3125, "logits/rejected": -3.40625, "logps/chosen": -306.0, "logps/rejected": -400.0, "loss": 0.8314, "rewards/accuracies": 0.75, "rewards/chosen": -1.59375, "rewards/margins": 1.1953125, "rewards/rejected": -2.78125, "step": 4210 }, { "epoch": 0.3241167434715822, "grad_norm": 18.03744098903991, "learning_rate": 4.2732088275455147e-07, "logits/chosen": -3.46875, "logits/rejected": -3.609375, "logps/chosen": -320.0, "logps/rejected": -462.0, "loss": 0.8349, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.890625, "rewards/margins": 1.5234375, "rewards/rejected": -3.40625, "step": 4220 }, { "epoch": 0.3248847926267281, "grad_norm": 18.325239907204132, "learning_rate": 4.268477716769912e-07, "logits/chosen": -3.40625, "logits/rejected": -3.546875, "logps/chosen": -328.0, "logps/rejected": -456.0, "loss": 0.8057, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.921875, "rewards/margins": 1.453125, "rewards/rejected": -3.375, "step": 4230 }, { "epoch": 0.32565284178187404, "grad_norm": 17.499136405986373, "learning_rate": 4.2637338946322234e-07, "logits/chosen": -3.453125, "logits/rejected": -3.390625, "logps/chosen": -300.0, "logps/rejected": -434.0, "loss": 0.8351, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.765625, "rewards/margins": 1.4140625, "rewards/rejected": -3.1875, "step": 4240 }, { "epoch": 0.32642089093702, "grad_norm": 16.575818467671812, "learning_rate": 4.2589773952298194e-07, "logits/chosen": -3.375, "logits/rejected": -3.328125, "logps/chosen": -330.0, "logps/rejected": -452.0, "loss": 0.8933, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.8828125, "rewards/margins": 1.28125, "rewards/rejected": -3.171875, "step": 4250 }, { "epoch": 0.3271889400921659, "grad_norm": 14.311431143614893, "learning_rate": 4.2542082527511914e-07, "logits/chosen": -3.390625, "logits/rejected": -3.578125, "logps/chosen": -262.0, "logps/rejected": -394.0, "loss": 0.8046, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4375, "rewards/margins": 1.4765625, "rewards/rejected": -2.921875, "step": 4260 }, { "epoch": 0.3279569892473118, "grad_norm": 17.512538666952594, "learning_rate": 4.249426501475707e-07, "logits/chosen": -3.375, "logits/rejected": -3.5, "logps/chosen": -260.0, "logps/rejected": -386.0, "loss": 0.8424, "rewards/accuracies": 0.78125, "rewards/chosen": -1.359375, "rewards/margins": 1.3984375, "rewards/rejected": -2.765625, "step": 4270 }, { "epoch": 0.32872503840245776, "grad_norm": 15.49511389263498, "learning_rate": 4.244632175773363e-07, "logits/chosen": -3.359375, "logits/rejected": -3.578125, "logps/chosen": -254.0, "logps/rejected": -394.0, "loss": 0.7972, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3828125, "rewards/margins": 1.578125, "rewards/rejected": -2.96875, "step": 4280 }, { "epoch": 0.3294930875576037, "grad_norm": 17.022274015872846, "learning_rate": 4.2398253101045354e-07, "logits/chosen": -3.578125, "logits/rejected": -3.5625, "logps/chosen": -286.0, "logps/rejected": -408.0, "loss": 0.7767, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6640625, "rewards/margins": 1.328125, "rewards/rejected": -2.984375, "step": 4290 }, { "epoch": 0.3302611367127496, "grad_norm": 13.57937300868101, "learning_rate": 4.235005939019737e-07, "logits/chosen": -3.53125, "logits/rejected": -3.625, "logps/chosen": -298.0, "logps/rejected": -440.0, "loss": 0.7926, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.828125, "rewards/margins": 1.4609375, "rewards/rejected": -3.28125, "step": 4300 }, { "epoch": 0.33102918586789554, "grad_norm": 14.955707714095723, "learning_rate": 4.2301740971593656e-07, "logits/chosen": -3.5625, "logits/rejected": -3.65625, "logps/chosen": -294.0, "logps/rejected": -430.0, "loss": 0.7944, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6953125, "rewards/margins": 1.5, "rewards/rejected": -3.1875, "step": 4310 }, { "epoch": 0.3317972350230415, "grad_norm": 15.795658799377748, "learning_rate": 4.2253298192534535e-07, "logits/chosen": -3.546875, "logits/rejected": -3.578125, "logps/chosen": -306.0, "logps/rejected": -432.0, "loss": 0.8308, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.78125, "rewards/margins": 1.3359375, "rewards/rejected": -3.109375, "step": 4320 }, { "epoch": 0.3325652841781874, "grad_norm": 17.252874658876628, "learning_rate": 4.220473140121424e-07, "logits/chosen": -3.5, "logits/rejected": -3.59375, "logps/chosen": -280.0, "logps/rejected": -430.0, "loss": 0.8031, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.609375, "rewards/margins": 1.5546875, "rewards/rejected": -3.15625, "step": 4330 }, { "epoch": 0.3333333333333333, "grad_norm": 16.312039685038737, "learning_rate": 4.2156040946718343e-07, "logits/chosen": -3.34375, "logits/rejected": -3.359375, "logps/chosen": -312.0, "logps/rejected": -434.0, "loss": 0.8565, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.734375, "rewards/margins": 1.4765625, "rewards/rejected": -3.203125, "step": 4340 }, { "epoch": 0.33410138248847926, "grad_norm": 18.446846185807306, "learning_rate": 4.210722717902127e-07, "logits/chosen": -3.390625, "logits/rejected": -3.5625, "logps/chosen": -268.0, "logps/rejected": -426.0, "loss": 0.814, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5703125, "rewards/margins": 1.7265625, "rewards/rejected": -3.296875, "step": 4350 }, { "epoch": 0.3348694316436252, "grad_norm": 16.730916243141003, "learning_rate": 4.205829044898382e-07, "logits/chosen": -3.4375, "logits/rejected": -3.4375, "logps/chosen": -300.0, "logps/rejected": -422.0, "loss": 0.8341, "rewards/accuracies": 0.8125, "rewards/chosen": -1.734375, "rewards/margins": 1.3359375, "rewards/rejected": -3.078125, "step": 4360 }, { "epoch": 0.33563748079877115, "grad_norm": 17.804495358557233, "learning_rate": 4.20092311083506e-07, "logits/chosen": -3.1875, "logits/rejected": -3.265625, "logps/chosen": -294.0, "logps/rejected": -434.0, "loss": 0.7967, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5859375, "rewards/margins": 1.640625, "rewards/rejected": -3.21875, "step": 4370 }, { "epoch": 0.33640552995391704, "grad_norm": 23.383421955260506, "learning_rate": 4.19600495097475e-07, "logits/chosen": -3.390625, "logits/rejected": -3.453125, "logps/chosen": -286.0, "logps/rejected": -428.0, "loss": 0.8161, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5859375, "rewards/margins": 1.5625, "rewards/rejected": -3.140625, "step": 4380 }, { "epoch": 0.337173579109063, "grad_norm": 17.112674072714327, "learning_rate": 4.1910746006679167e-07, "logits/chosen": -3.421875, "logits/rejected": -3.234375, "logps/chosen": -320.0, "logps/rejected": -456.0, "loss": 0.814, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8046875, "rewards/margins": 1.3984375, "rewards/rejected": -3.203125, "step": 4390 }, { "epoch": 0.3379416282642089, "grad_norm": 15.866946057248768, "learning_rate": 4.186132095352649e-07, "logits/chosen": -3.4375, "logits/rejected": -3.734375, "logps/chosen": -282.0, "logps/rejected": -398.0, "loss": 0.8166, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.640625, "rewards/margins": 1.3515625, "rewards/rejected": -2.984375, "step": 4400 }, { "epoch": 0.3387096774193548, "grad_norm": 17.8673630750488, "learning_rate": 4.181177470554401e-07, "logits/chosen": -3.515625, "logits/rejected": -3.546875, "logps/chosen": -302.0, "logps/rejected": -432.0, "loss": 0.8127, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.734375, "rewards/margins": 1.5390625, "rewards/rejected": -3.265625, "step": 4410 }, { "epoch": 0.33947772657450076, "grad_norm": 28.66314241165034, "learning_rate": 4.176210761885738e-07, "logits/chosen": -3.453125, "logits/rejected": -3.484375, "logps/chosen": -284.0, "logps/rejected": -428.0, "loss": 0.7997, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.640625, "rewards/margins": 1.5078125, "rewards/rejected": -3.15625, "step": 4420 }, { "epoch": 0.3402457757296467, "grad_norm": 19.271933498684852, "learning_rate": 4.171232005046084e-07, "logits/chosen": -3.375, "logits/rejected": -3.453125, "logps/chosen": -298.0, "logps/rejected": -414.0, "loss": 0.8117, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.71875, "rewards/margins": 1.3046875, "rewards/rejected": -3.03125, "step": 4430 }, { "epoch": 0.34101382488479265, "grad_norm": 18.557423996083525, "learning_rate": 4.1662412358214595e-07, "logits/chosen": -3.359375, "logits/rejected": -3.375, "logps/chosen": -310.0, "logps/rejected": -424.0, "loss": 0.8063, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.890625, "rewards/margins": 1.296875, "rewards/rejected": -3.1875, "step": 4440 }, { "epoch": 0.34178187403993854, "grad_norm": 21.932111725543976, "learning_rate": 4.1612384900842256e-07, "logits/chosen": -3.375, "logits/rejected": -3.4375, "logps/chosen": -310.0, "logps/rejected": -448.0, "loss": 0.8956, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.84375, "rewards/margins": 1.6171875, "rewards/rejected": -3.46875, "step": 4450 }, { "epoch": 0.3425499231950845, "grad_norm": 17.099280322720475, "learning_rate": 4.156223803792832e-07, "logits/chosen": -3.3125, "logits/rejected": -3.25, "logps/chosen": -324.0, "logps/rejected": -462.0, "loss": 0.7889, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9375, "rewards/margins": 1.4140625, "rewards/rejected": -3.359375, "step": 4460 }, { "epoch": 0.3433179723502304, "grad_norm": 16.375617140005282, "learning_rate": 4.1511972129915496e-07, "logits/chosen": -3.390625, "logits/rejected": -3.3125, "logps/chosen": -298.0, "logps/rejected": -446.0, "loss": 0.8527, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6484375, "rewards/margins": 1.5859375, "rewards/rejected": -3.234375, "step": 4470 }, { "epoch": 0.34408602150537637, "grad_norm": 17.549361385693885, "learning_rate": 4.1461587538102184e-07, "logits/chosen": -3.4375, "logits/rejected": -3.546875, "logps/chosen": -318.0, "logps/rejected": -452.0, "loss": 0.7798, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.84375, "rewards/margins": 1.5234375, "rewards/rejected": -3.375, "step": 4480 }, { "epoch": 0.34485407066052226, "grad_norm": 14.076053480024115, "learning_rate": 4.141108462463986e-07, "logits/chosen": -3.390625, "logits/rejected": -3.453125, "logps/chosen": -294.0, "logps/rejected": -446.0, "loss": 0.8856, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.65625, "rewards/margins": 1.671875, "rewards/rejected": -3.328125, "step": 4490 }, { "epoch": 0.3456221198156682, "grad_norm": 17.37330677497722, "learning_rate": 4.1360463752530414e-07, "logits/chosen": -3.390625, "logits/rejected": -3.453125, "logps/chosen": -286.0, "logps/rejected": -396.0, "loss": 0.7951, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.671875, "rewards/margins": 1.25, "rewards/rejected": -2.921875, "step": 4500 }, { "epoch": 0.34639016897081415, "grad_norm": 14.599133985303501, "learning_rate": 4.130972528562367e-07, "logits/chosen": -3.328125, "logits/rejected": -3.46875, "logps/chosen": -304.0, "logps/rejected": -416.0, "loss": 0.7845, "rewards/accuracies": 0.75, "rewards/chosen": -1.8515625, "rewards/margins": 1.2734375, "rewards/rejected": -3.125, "step": 4510 }, { "epoch": 0.34715821812596004, "grad_norm": 17.58361639812111, "learning_rate": 4.1258869588614633e-07, "logits/chosen": -3.4375, "logits/rejected": -3.390625, "logps/chosen": -308.0, "logps/rejected": -448.0, "loss": 0.8316, "rewards/accuracies": 0.8125, "rewards/chosen": -1.859375, "rewards/margins": 1.375, "rewards/rejected": -3.234375, "step": 4520 }, { "epoch": 0.347926267281106, "grad_norm": 17.62479103928613, "learning_rate": 4.120789702704095e-07, "logits/chosen": -3.390625, "logits/rejected": -3.4375, "logps/chosen": -308.0, "logps/rejected": -462.0, "loss": 0.8257, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.703125, "rewards/margins": 1.65625, "rewards/rejected": -3.359375, "step": 4530 }, { "epoch": 0.3486943164362519, "grad_norm": 17.33680038396139, "learning_rate": 4.115680796728026e-07, "logits/chosen": -3.34375, "logits/rejected": -3.453125, "logps/chosen": -312.0, "logps/rejected": -408.0, "loss": 0.7842, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.75, "rewards/margins": 1.234375, "rewards/rejected": -2.984375, "step": 4540 }, { "epoch": 0.34946236559139787, "grad_norm": 18.30755840797572, "learning_rate": 4.110560277654755e-07, "logits/chosen": -3.09375, "logits/rejected": -3.40625, "logps/chosen": -300.0, "logps/rejected": -420.0, "loss": 0.8072, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.578125, "rewards/margins": 1.4921875, "rewards/rejected": -3.078125, "step": 4550 }, { "epoch": 0.35023041474654376, "grad_norm": 17.262007662235696, "learning_rate": 4.1054281822892545e-07, "logits/chosen": -3.359375, "logits/rejected": -3.4375, "logps/chosen": -272.0, "logps/rejected": -438.0, "loss": 0.7925, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.609375, "rewards/margins": 1.6015625, "rewards/rejected": -3.21875, "step": 4560 }, { "epoch": 0.3509984639016897, "grad_norm": 16.221087660940555, "learning_rate": 4.100284547519703e-07, "logits/chosen": -3.4375, "logits/rejected": -3.421875, "logps/chosen": -286.0, "logps/rejected": -394.0, "loss": 0.8126, "rewards/accuracies": 0.75, "rewards/chosen": -1.6640625, "rewards/margins": 1.1640625, "rewards/rejected": -2.828125, "step": 4570 }, { "epoch": 0.35176651305683565, "grad_norm": 17.147089834516834, "learning_rate": 4.0951294103172217e-07, "logits/chosen": -3.5, "logits/rejected": -3.65625, "logps/chosen": -288.0, "logps/rejected": -378.0, "loss": 0.8268, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.640625, "rewards/margins": 1.1171875, "rewards/rejected": -2.75, "step": 4580 }, { "epoch": 0.35253456221198154, "grad_norm": 17.043367850978857, "learning_rate": 4.089962807735609e-07, "logits/chosen": -3.421875, "logits/rejected": -3.578125, "logps/chosen": -320.0, "logps/rejected": -432.0, "loss": 0.8083, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.828125, "rewards/margins": 1.4140625, "rewards/rejected": -3.234375, "step": 4590 }, { "epoch": 0.3533026113671275, "grad_norm": 18.924182584571927, "learning_rate": 4.084784776911072e-07, "logits/chosen": -3.453125, "logits/rejected": -3.328125, "logps/chosen": -302.0, "logps/rejected": -448.0, "loss": 0.7821, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.75, "rewards/margins": 1.5859375, "rewards/rejected": -3.34375, "step": 4600 }, { "epoch": 0.35407066052227343, "grad_norm": 16.019466419338407, "learning_rate": 4.079595355061962e-07, "logits/chosen": -3.4375, "logits/rejected": -3.609375, "logps/chosen": -320.0, "logps/rejected": -438.0, "loss": 0.7907, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.796875, "rewards/margins": 1.453125, "rewards/rejected": -3.25, "step": 4610 }, { "epoch": 0.3548387096774194, "grad_norm": 19.146244146960765, "learning_rate": 4.074394579488506e-07, "logits/chosen": -3.375, "logits/rejected": -3.3125, "logps/chosen": -296.0, "logps/rejected": -402.0, "loss": 0.8602, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.625, "rewards/margins": 1.296875, "rewards/rejected": -2.921875, "step": 4620 }, { "epoch": 0.35560675883256526, "grad_norm": 16.759912526310316, "learning_rate": 4.069182487572539e-07, "logits/chosen": -3.46875, "logits/rejected": -3.5, "logps/chosen": -264.0, "logps/rejected": -392.0, "loss": 0.8091, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5234375, "rewards/margins": 1.421875, "rewards/rejected": -2.953125, "step": 4630 }, { "epoch": 0.3563748079877112, "grad_norm": 15.48884147774979, "learning_rate": 4.0639591167772337e-07, "logits/chosen": -3.4375, "logits/rejected": -3.375, "logps/chosen": -302.0, "logps/rejected": -446.0, "loss": 0.7682, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8359375, "rewards/margins": 1.390625, "rewards/rejected": -3.21875, "step": 4640 }, { "epoch": 0.35714285714285715, "grad_norm": 16.35666974857001, "learning_rate": 4.058724504646834e-07, "logits/chosen": -3.421875, "logits/rejected": -3.609375, "logps/chosen": -298.0, "logps/rejected": -458.0, "loss": 0.8237, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.671875, "rewards/margins": 1.765625, "rewards/rejected": -3.4375, "step": 4650 }, { "epoch": 0.3579109062980031, "grad_norm": 17.6684883260653, "learning_rate": 4.053478688806382e-07, "logits/chosen": -3.4375, "logits/rejected": -3.453125, "logps/chosen": -316.0, "logps/rejected": -444.0, "loss": 0.823, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.78125, "rewards/margins": 1.375, "rewards/rejected": -3.15625, "step": 4660 }, { "epoch": 0.358678955453149, "grad_norm": 14.807596766431857, "learning_rate": 4.048221706961451e-07, "logits/chosen": -3.34375, "logits/rejected": -3.296875, "logps/chosen": -276.0, "logps/rejected": -424.0, "loss": 0.7874, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5703125, "rewards/margins": 1.4765625, "rewards/rejected": -3.046875, "step": 4670 }, { "epoch": 0.35944700460829493, "grad_norm": 15.831756348019763, "learning_rate": 4.0429535968978725e-07, "logits/chosen": -3.25, "logits/rejected": -3.34375, "logps/chosen": -300.0, "logps/rejected": -424.0, "loss": 0.8484, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6875, "rewards/margins": 1.390625, "rewards/rejected": -3.078125, "step": 4680 }, { "epoch": 0.3602150537634409, "grad_norm": 15.74327112298826, "learning_rate": 4.0376743964814617e-07, "logits/chosen": -3.328125, "logits/rejected": -3.34375, "logps/chosen": -292.0, "logps/rejected": -412.0, "loss": 0.8377, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.734375, "rewards/margins": 1.265625, "rewards/rejected": -3.0, "step": 4690 }, { "epoch": 0.36098310291858676, "grad_norm": 15.654294668120492, "learning_rate": 4.032384143657753e-07, "logits/chosen": -3.359375, "logits/rejected": -3.359375, "logps/chosen": -280.0, "logps/rejected": -426.0, "loss": 0.8069, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6640625, "rewards/margins": 1.4453125, "rewards/rejected": -3.109375, "step": 4700 }, { "epoch": 0.3617511520737327, "grad_norm": 16.260920132370615, "learning_rate": 4.0270828764517194e-07, "logits/chosen": -3.34375, "logits/rejected": -3.34375, "logps/chosen": -272.0, "logps/rejected": -396.0, "loss": 0.8043, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.484375, "rewards/margins": 1.4453125, "rewards/rejected": -2.921875, "step": 4710 }, { "epoch": 0.36251920122887865, "grad_norm": 20.228695982898365, "learning_rate": 4.0217706329675027e-07, "logits/chosen": -3.40625, "logits/rejected": -3.34375, "logps/chosen": -282.0, "logps/rejected": -400.0, "loss": 0.8507, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6015625, "rewards/margins": 1.296875, "rewards/rejected": -2.90625, "step": 4720 }, { "epoch": 0.3632872503840246, "grad_norm": 17.64953741860056, "learning_rate": 4.016447451388142e-07, "logits/chosen": -3.25, "logits/rejected": -3.265625, "logps/chosen": -276.0, "logps/rejected": -410.0, "loss": 0.8466, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.5234375, "rewards/margins": 1.2890625, "rewards/rejected": -2.8125, "step": 4730 }, { "epoch": 0.3640552995391705, "grad_norm": 16.576472038916126, "learning_rate": 4.0111133699752926e-07, "logits/chosen": -3.265625, "logits/rejected": -3.453125, "logps/chosen": -298.0, "logps/rejected": -408.0, "loss": 0.8239, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.625, "rewards/margins": 1.4375, "rewards/rejected": -3.0625, "step": 4740 }, { "epoch": 0.36482334869431643, "grad_norm": 14.890646519870455, "learning_rate": 4.0057684270689583e-07, "logits/chosen": -3.421875, "logits/rejected": -3.5, "logps/chosen": -278.0, "logps/rejected": -416.0, "loss": 0.8213, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.5703125, "rewards/margins": 1.421875, "rewards/rejected": -3.0, "step": 4750 }, { "epoch": 0.3655913978494624, "grad_norm": 16.083839107245673, "learning_rate": 4.0004126610872114e-07, "logits/chosen": -3.59375, "logits/rejected": -3.609375, "logps/chosen": -288.0, "logps/rejected": -450.0, "loss": 0.7815, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8046875, "rewards/margins": 1.640625, "rewards/rejected": -3.4375, "step": 4760 }, { "epoch": 0.3663594470046083, "grad_norm": 16.975945950258236, "learning_rate": 3.995046110525917e-07, "logits/chosen": -3.546875, "logits/rejected": -3.671875, "logps/chosen": -304.0, "logps/rejected": -460.0, "loss": 0.7683, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.84375, "rewards/margins": 1.609375, "rewards/rejected": -3.453125, "step": 4770 }, { "epoch": 0.3671274961597542, "grad_norm": 17.994967475563172, "learning_rate": 3.9896688139584575e-07, "logits/chosen": -3.578125, "logits/rejected": -3.71875, "logps/chosen": -290.0, "logps/rejected": -428.0, "loss": 0.8122, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8125, "rewards/margins": 1.359375, "rewards/rejected": -3.171875, "step": 4780 }, { "epoch": 0.36789554531490015, "grad_norm": 15.663333264710406, "learning_rate": 3.9842808100354555e-07, "logits/chosen": -3.5, "logits/rejected": -3.421875, "logps/chosen": -286.0, "logps/rejected": -434.0, "loss": 0.8139, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6875, "rewards/margins": 1.4765625, "rewards/rejected": -3.15625, "step": 4790 }, { "epoch": 0.3686635944700461, "grad_norm": 15.79494855287583, "learning_rate": 3.978882137484495e-07, "logits/chosen": -3.359375, "logits/rejected": -3.171875, "logps/chosen": -264.0, "logps/rejected": -408.0, "loss": 0.7456, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.421875, "rewards/margins": 1.5234375, "rewards/rejected": -2.9375, "step": 4800 }, { "epoch": 0.369431643625192, "grad_norm": 18.03991825592209, "learning_rate": 3.973472835109841e-07, "logits/chosen": -3.46875, "logits/rejected": -3.53125, "logps/chosen": -296.0, "logps/rejected": -422.0, "loss": 0.8168, "rewards/accuracies": 0.75, "rewards/chosen": -1.84375, "rewards/margins": 1.296875, "rewards/rejected": -3.140625, "step": 4810 }, { "epoch": 0.37019969278033793, "grad_norm": 16.417213562114863, "learning_rate": 3.968052941792167e-07, "logits/chosen": -3.609375, "logits/rejected": -3.640625, "logps/chosen": -298.0, "logps/rejected": -448.0, "loss": 0.7379, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9296875, "rewards/margins": 1.4375, "rewards/rejected": -3.359375, "step": 4820 }, { "epoch": 0.3709677419354839, "grad_norm": 15.781247727019622, "learning_rate": 3.9626224964882685e-07, "logits/chosen": -3.453125, "logits/rejected": -3.515625, "logps/chosen": -304.0, "logps/rejected": -440.0, "loss": 0.8131, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.84375, "rewards/margins": 1.484375, "rewards/rejected": -3.328125, "step": 4830 }, { "epoch": 0.3717357910906298, "grad_norm": 15.825681127696315, "learning_rate": 3.957181538230788e-07, "logits/chosen": -3.546875, "logits/rejected": -3.75, "logps/chosen": -272.0, "logps/rejected": -438.0, "loss": 0.8086, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.640625, "rewards/margins": 1.6796875, "rewards/rejected": -3.3125, "step": 4840 }, { "epoch": 0.3725038402457757, "grad_norm": 16.684302960454733, "learning_rate": 3.9517301061279305e-07, "logits/chosen": -3.5, "logits/rejected": -3.65625, "logps/chosen": -278.0, "logps/rejected": -494.0, "loss": 0.8493, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6171875, "rewards/margins": 2.328125, "rewards/rejected": -3.953125, "step": 4850 }, { "epoch": 0.37327188940092165, "grad_norm": 17.93198363344499, "learning_rate": 3.946268239363185e-07, "logits/chosen": -3.40625, "logits/rejected": -3.5, "logps/chosen": -260.0, "logps/rejected": -382.0, "loss": 0.8473, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.390625, "rewards/margins": 1.328125, "rewards/rejected": -2.71875, "step": 4860 }, { "epoch": 0.3740399385560676, "grad_norm": 16.407669721437365, "learning_rate": 3.9407959771950416e-07, "logits/chosen": -3.421875, "logits/rejected": -3.5, "logps/chosen": -314.0, "logps/rejected": -424.0, "loss": 0.798, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.734375, "rewards/margins": 1.21875, "rewards/rejected": -2.953125, "step": 4870 }, { "epoch": 0.37480798771121354, "grad_norm": 19.62024090473589, "learning_rate": 3.9353133589567125e-07, "logits/chosen": -3.5, "logits/rejected": -3.28125, "logps/chosen": -282.0, "logps/rejected": -436.0, "loss": 0.7704, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.625, "rewards/margins": 1.53125, "rewards/rejected": -3.15625, "step": 4880 }, { "epoch": 0.37557603686635943, "grad_norm": 16.105661704613873, "learning_rate": 3.9298204240558427e-07, "logits/chosen": -3.546875, "logits/rejected": -3.5, "logps/chosen": -316.0, "logps/rejected": -434.0, "loss": 0.8222, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.03125, "rewards/margins": 1.234375, "rewards/rejected": -3.265625, "step": 4890 }, { "epoch": 0.3763440860215054, "grad_norm": 19.473751166106755, "learning_rate": 3.924317211974234e-07, "logits/chosen": -3.359375, "logits/rejected": -3.40625, "logps/chosen": -318.0, "logps/rejected": -438.0, "loss": 0.8125, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.875, "rewards/margins": 1.3203125, "rewards/rejected": -3.1875, "step": 4900 }, { "epoch": 0.3771121351766513, "grad_norm": 18.281647603192628, "learning_rate": 3.918803762267556e-07, "logits/chosen": -3.25, "logits/rejected": -3.421875, "logps/chosen": -308.0, "logps/rejected": -466.0, "loss": 0.8037, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8125, "rewards/margins": 1.640625, "rewards/rejected": -3.453125, "step": 4910 }, { "epoch": 0.3778801843317972, "grad_norm": 16.81934367599949, "learning_rate": 3.9132801145650654e-07, "logits/chosen": -3.28125, "logits/rejected": -3.40625, "logps/chosen": -296.0, "logps/rejected": -428.0, "loss": 0.8209, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.578125, "rewards/margins": 1.453125, "rewards/rejected": -3.015625, "step": 4920 }, { "epoch": 0.37864823348694315, "grad_norm": 16.523710515296152, "learning_rate": 3.9077463085693165e-07, "logits/chosen": -3.34375, "logits/rejected": -3.390625, "logps/chosen": -328.0, "logps/rejected": -460.0, "loss": 0.8127, "rewards/accuracies": 0.8125, "rewards/chosen": -1.859375, "rewards/margins": 1.5078125, "rewards/rejected": -3.359375, "step": 4930 }, { "epoch": 0.3794162826420891, "grad_norm": 23.71688486076636, "learning_rate": 3.902202384055882e-07, "logits/chosen": -3.390625, "logits/rejected": -3.4375, "logps/chosen": -296.0, "logps/rejected": -438.0, "loss": 0.806, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.6484375, "rewards/margins": 1.5625, "rewards/rejected": -3.21875, "step": 4940 }, { "epoch": 0.38018433179723504, "grad_norm": 14.3881074690772, "learning_rate": 3.8966483808730626e-07, "logits/chosen": -3.578125, "logits/rejected": -3.5, "logps/chosen": -290.0, "logps/rejected": -430.0, "loss": 0.7722, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.7265625, "rewards/margins": 1.484375, "rewards/rejected": -3.203125, "step": 4950 }, { "epoch": 0.38095238095238093, "grad_norm": 17.82596056145954, "learning_rate": 3.891084338941603e-07, "logits/chosen": -3.4375, "logits/rejected": -3.390625, "logps/chosen": -282.0, "logps/rejected": -438.0, "loss": 0.7463, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6015625, "rewards/margins": 1.578125, "rewards/rejected": -3.171875, "step": 4960 }, { "epoch": 0.3817204301075269, "grad_norm": 18.220097270846843, "learning_rate": 3.8855102982543996e-07, "logits/chosen": -3.578125, "logits/rejected": -3.75, "logps/chosen": -324.0, "logps/rejected": -456.0, "loss": 0.7594, "rewards/accuracies": 0.78125, "rewards/chosen": -2.03125, "rewards/margins": 1.5390625, "rewards/rejected": -3.5625, "step": 4970 }, { "epoch": 0.3824884792626728, "grad_norm": 16.18543823668262, "learning_rate": 3.879926298876223e-07, "logits/chosen": -3.65625, "logits/rejected": -3.453125, "logps/chosen": -300.0, "logps/rejected": -484.0, "loss": 0.774, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8515625, "rewards/margins": 1.75, "rewards/rejected": -3.59375, "step": 4980 }, { "epoch": 0.38325652841781876, "grad_norm": 19.367314694643987, "learning_rate": 3.874332380943421e-07, "logits/chosen": -3.453125, "logits/rejected": -3.609375, "logps/chosen": -312.0, "logps/rejected": -436.0, "loss": 0.797, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.84375, "rewards/margins": 1.4296875, "rewards/rejected": -3.265625, "step": 4990 }, { "epoch": 0.38402457757296465, "grad_norm": 19.496978933597784, "learning_rate": 3.8687285846636344e-07, "logits/chosen": -3.609375, "logits/rejected": -3.546875, "logps/chosen": -310.0, "logps/rejected": -460.0, "loss": 0.8334, "rewards/accuracies": 0.78125, "rewards/chosen": -1.84375, "rewards/margins": 1.4453125, "rewards/rejected": -3.28125, "step": 5000 }, { "epoch": 0.38402457757296465, "eval_logits/chosen": -3.453125, "eval_logits/rejected": -3.546875, "eval_logps/chosen": -332.0, "eval_logps/rejected": -426.0, "eval_loss": 0.4594944417476654, "eval_rewards/accuracies": 0.7611607313156128, "eval_rewards/chosen": -1.90625, "eval_rewards/margins": 1.21875, "eval_rewards/rejected": -3.125, "eval_runtime": 2263.8781, "eval_samples_per_second": 41.14, "eval_steps_per_second": 0.643, "step": 5000 }, { "epoch": 0.3847926267281106, "grad_norm": 16.031980592931287, "learning_rate": 3.863114950315507e-07, "logits/chosen": -3.390625, "logits/rejected": -3.3125, "logps/chosen": -308.0, "logps/rejected": -432.0, "loss": 0.8198, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6953125, "rewards/margins": 1.3984375, "rewards/rejected": -3.09375, "step": 5010 }, { "epoch": 0.38556067588325654, "grad_norm": 17.201551579420435, "learning_rate": 3.857491518248395e-07, "logits/chosen": -3.4375, "logits/rejected": -3.359375, "logps/chosen": -304.0, "logps/rejected": -438.0, "loss": 0.7883, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.734375, "rewards/margins": 1.484375, "rewards/rejected": -3.21875, "step": 5020 }, { "epoch": 0.38632872503840243, "grad_norm": 20.469135912750765, "learning_rate": 3.8518583288820785e-07, "logits/chosen": -3.40625, "logits/rejected": -3.515625, "logps/chosen": -278.0, "logps/rejected": -434.0, "loss": 0.771, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.5234375, "rewards/margins": 1.703125, "rewards/rejected": -3.21875, "step": 5030 }, { "epoch": 0.3870967741935484, "grad_norm": 19.556486454492383, "learning_rate": 3.8462154227064724e-07, "logits/chosen": -3.515625, "logits/rejected": -3.5625, "logps/chosen": -300.0, "logps/rejected": -424.0, "loss": 0.8726, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.65625, "rewards/margins": 1.375, "rewards/rejected": -3.03125, "step": 5040 }, { "epoch": 0.3878648233486943, "grad_norm": 16.821560526975947, "learning_rate": 3.840562840281331e-07, "logits/chosen": -3.40625, "logits/rejected": -3.421875, "logps/chosen": -258.0, "logps/rejected": -422.0, "loss": 0.8086, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.4296875, "rewards/margins": 1.53125, "rewards/rejected": -2.953125, "step": 5050 }, { "epoch": 0.38863287250384027, "grad_norm": 25.31090998993558, "learning_rate": 3.834900622235961e-07, "logits/chosen": -3.5, "logits/rejected": -3.6875, "logps/chosen": -280.0, "logps/rejected": -404.0, "loss": 0.7923, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6328125, "rewards/margins": 1.4453125, "rewards/rejected": -3.078125, "step": 5060 }, { "epoch": 0.38940092165898615, "grad_norm": 16.637956674366805, "learning_rate": 3.829228809268926e-07, "logits/chosen": -3.375, "logits/rejected": -3.328125, "logps/chosen": -292.0, "logps/rejected": -436.0, "loss": 0.8295, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5625, "rewards/margins": 1.6015625, "rewards/rejected": -3.171875, "step": 5070 }, { "epoch": 0.3901689708141321, "grad_norm": 17.864952150560214, "learning_rate": 3.823547442147756e-07, "logits/chosen": -3.484375, "logits/rejected": -3.546875, "logps/chosen": -312.0, "logps/rejected": -418.0, "loss": 0.7926, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7265625, "rewards/margins": 1.3671875, "rewards/rejected": -3.09375, "step": 5080 }, { "epoch": 0.39093701996927804, "grad_norm": 16.89639792203692, "learning_rate": 3.8178565617086534e-07, "logits/chosen": -3.453125, "logits/rejected": -3.484375, "logps/chosen": -284.0, "logps/rejected": -410.0, "loss": 0.7737, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.53125, "rewards/margins": 1.2890625, "rewards/rejected": -2.8125, "step": 5090 }, { "epoch": 0.391705069124424, "grad_norm": 15.764565694533212, "learning_rate": 3.812156208856201e-07, "logits/chosen": -3.390625, "logits/rejected": -3.375, "logps/chosen": -292.0, "logps/rejected": -426.0, "loss": 0.784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.65625, "rewards/margins": 1.4609375, "rewards/rejected": -3.125, "step": 5100 }, { "epoch": 0.3924731182795699, "grad_norm": 16.858387958374248, "learning_rate": 3.8064464245630655e-07, "logits/chosen": -3.421875, "logits/rejected": -3.5, "logps/chosen": -264.0, "logps/rejected": -406.0, "loss": 0.8223, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5234375, "rewards/margins": 1.4921875, "rewards/rejected": -3.015625, "step": 5110 }, { "epoch": 0.3932411674347158, "grad_norm": 19.868592430184123, "learning_rate": 3.8007272498697054e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5625, "logps/chosen": -280.0, "logps/rejected": -436.0, "loss": 0.7994, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5390625, "rewards/margins": 1.828125, "rewards/rejected": -3.375, "step": 5120 }, { "epoch": 0.39400921658986177, "grad_norm": 17.23638414531754, "learning_rate": 3.794998725884074e-07, "logits/chosen": -3.421875, "logits/rejected": -3.421875, "logps/chosen": -270.0, "logps/rejected": -440.0, "loss": 0.7829, "rewards/accuracies": 0.875, "rewards/chosen": -1.5703125, "rewards/margins": 1.7734375, "rewards/rejected": -3.34375, "step": 5130 }, { "epoch": 0.39477726574500765, "grad_norm": 18.239928853115348, "learning_rate": 3.7892608937813263e-07, "logits/chosen": -3.5, "logits/rejected": -3.375, "logps/chosen": -292.0, "logps/rejected": -418.0, "loss": 0.7928, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.78125, "rewards/margins": 1.28125, "rewards/rejected": -3.0625, "step": 5140 }, { "epoch": 0.3955453149001536, "grad_norm": 17.84717981479047, "learning_rate": 3.7835137948035204e-07, "logits/chosen": -3.421875, "logits/rejected": -3.40625, "logps/chosen": -296.0, "logps/rejected": -436.0, "loss": 0.795, "rewards/accuracies": 0.84375, "rewards/chosen": -1.71875, "rewards/margins": 1.46875, "rewards/rejected": -3.1875, "step": 5150 }, { "epoch": 0.39631336405529954, "grad_norm": 14.922308100123868, "learning_rate": 3.777757470259323e-07, "logits/chosen": -3.515625, "logits/rejected": -3.53125, "logps/chosen": -328.0, "logps/rejected": -494.0, "loss": 0.7979, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.890625, "rewards/margins": 1.7421875, "rewards/rejected": -3.640625, "step": 5160 }, { "epoch": 0.3970814132104455, "grad_norm": 14.662728247620963, "learning_rate": 3.77199196152371e-07, "logits/chosen": -3.578125, "logits/rejected": -3.84375, "logps/chosen": -306.0, "logps/rejected": -458.0, "loss": 0.8165, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.78125, "rewards/margins": 1.796875, "rewards/rejected": -3.578125, "step": 5170 }, { "epoch": 0.3978494623655914, "grad_norm": 20.077235681389134, "learning_rate": 3.766217310037674e-07, "logits/chosen": -3.453125, "logits/rejected": -3.578125, "logps/chosen": -286.0, "logps/rejected": -438.0, "loss": 0.8304, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6015625, "rewards/margins": 1.640625, "rewards/rejected": -3.25, "step": 5180 }, { "epoch": 0.3986175115207373, "grad_norm": 15.175164461756195, "learning_rate": 3.7604335573079215e-07, "logits/chosen": -3.4375, "logits/rejected": -3.640625, "logps/chosen": -302.0, "logps/rejected": -434.0, "loss": 0.8059, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.796875, "rewards/margins": 1.3671875, "rewards/rejected": -3.171875, "step": 5190 }, { "epoch": 0.39938556067588327, "grad_norm": 19.524717937732504, "learning_rate": 3.7546407449065766e-07, "logits/chosen": -3.5625, "logits/rejected": -3.5, "logps/chosen": -298.0, "logps/rejected": -452.0, "loss": 0.7898, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7265625, "rewards/margins": 1.53125, "rewards/rejected": -3.25, "step": 5200 }, { "epoch": 0.4001536098310292, "grad_norm": 17.54156681989062, "learning_rate": 3.748838914470881e-07, "logits/chosen": -3.515625, "logits/rejected": -3.40625, "logps/chosen": -302.0, "logps/rejected": -458.0, "loss": 0.7753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6796875, "rewards/margins": 1.6796875, "rewards/rejected": -3.359375, "step": 5210 }, { "epoch": 0.4009216589861751, "grad_norm": 16.933602324799025, "learning_rate": 3.743028107702898e-07, "logits/chosen": -3.40625, "logits/rejected": -3.28125, "logps/chosen": -322.0, "logps/rejected": -482.0, "loss": 0.8012, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9609375, "rewards/margins": 1.6015625, "rewards/rejected": -3.5625, "step": 5220 }, { "epoch": 0.40168970814132104, "grad_norm": 16.04295045193142, "learning_rate": 3.7372083663692087e-07, "logits/chosen": -3.546875, "logits/rejected": -3.453125, "logps/chosen": -302.0, "logps/rejected": -430.0, "loss": 0.7859, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8828125, "rewards/margins": 1.296875, "rewards/rejected": -3.171875, "step": 5230 }, { "epoch": 0.402457757296467, "grad_norm": 16.109632949038794, "learning_rate": 3.7313797323006137e-07, "logits/chosen": -3.515625, "logits/rejected": -3.703125, "logps/chosen": -316.0, "logps/rejected": -436.0, "loss": 0.7813, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8046875, "rewards/margins": 1.34375, "rewards/rejected": -3.15625, "step": 5240 }, { "epoch": 0.4032258064516129, "grad_norm": 22.664893257477484, "learning_rate": 3.7255422473918337e-07, "logits/chosen": -3.515625, "logits/rejected": -3.59375, "logps/chosen": -270.0, "logps/rejected": -406.0, "loss": 0.808, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.71875, "rewards/margins": 1.3671875, "rewards/rejected": -3.078125, "step": 5250 }, { "epoch": 0.4039938556067588, "grad_norm": 14.889036104309643, "learning_rate": 3.7196959536012043e-07, "logits/chosen": -3.5, "logits/rejected": -3.59375, "logps/chosen": -272.0, "logps/rejected": -388.0, "loss": 0.8147, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6015625, "rewards/margins": 1.3046875, "rewards/rejected": -2.90625, "step": 5260 }, { "epoch": 0.40476190476190477, "grad_norm": 18.623276804088903, "learning_rate": 3.7138408929503803e-07, "logits/chosen": -3.5, "logits/rejected": -3.53125, "logps/chosen": -272.0, "logps/rejected": -420.0, "loss": 0.8083, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.625, "rewards/margins": 1.5390625, "rewards/rejected": -3.171875, "step": 5270 }, { "epoch": 0.4055299539170507, "grad_norm": 15.485973557122227, "learning_rate": 3.707977107524028e-07, "logits/chosen": -3.5625, "logits/rejected": -3.609375, "logps/chosen": -302.0, "logps/rejected": -440.0, "loss": 0.8043, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7734375, "rewards/margins": 1.4375, "rewards/rejected": -3.21875, "step": 5280 }, { "epoch": 0.4062980030721966, "grad_norm": 15.941783531048765, "learning_rate": 3.7021046394695257e-07, "logits/chosen": -3.453125, "logits/rejected": -3.5, "logps/chosen": -304.0, "logps/rejected": -462.0, "loss": 0.8381, "rewards/accuracies": 0.875, "rewards/chosen": -1.640625, "rewards/margins": 1.7421875, "rewards/rejected": -3.390625, "step": 5290 }, { "epoch": 0.40706605222734255, "grad_norm": 16.561603350176718, "learning_rate": 3.6962235309966604e-07, "logits/chosen": -3.484375, "logits/rejected": -3.59375, "logps/chosen": -272.0, "logps/rejected": -404.0, "loss": 0.7933, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.578125, "rewards/margins": 1.46875, "rewards/rejected": -3.046875, "step": 5300 }, { "epoch": 0.4078341013824885, "grad_norm": 17.33536256407671, "learning_rate": 3.6903338243773244e-07, "logits/chosen": -3.59375, "logits/rejected": -3.6875, "logps/chosen": -296.0, "logps/rejected": -424.0, "loss": 0.7979, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.75, "rewards/margins": 1.4609375, "rewards/rejected": -3.21875, "step": 5310 }, { "epoch": 0.40860215053763443, "grad_norm": 17.380724048885497, "learning_rate": 3.6844355619452124e-07, "logits/chosen": -3.59375, "logits/rejected": -3.78125, "logps/chosen": -318.0, "logps/rejected": -428.0, "loss": 0.7891, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.84375, "rewards/margins": 1.4140625, "rewards/rejected": -3.25, "step": 5320 }, { "epoch": 0.4093701996927803, "grad_norm": 16.564464749257606, "learning_rate": 3.678528786095513e-07, "logits/chosen": -3.71875, "logits/rejected": -3.6875, "logps/chosen": -294.0, "logps/rejected": -460.0, "loss": 0.8011, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7109375, "rewards/margins": 1.6171875, "rewards/rejected": -3.328125, "step": 5330 }, { "epoch": 0.41013824884792627, "grad_norm": 19.27867516677003, "learning_rate": 3.672613539284609e-07, "logits/chosen": -3.59375, "logits/rejected": -3.75, "logps/chosen": -318.0, "logps/rejected": -454.0, "loss": 0.7631, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7890625, "rewards/margins": 1.546875, "rewards/rejected": -3.328125, "step": 5340 }, { "epoch": 0.4109062980030722, "grad_norm": 21.236615452049083, "learning_rate": 3.6666898640297705e-07, "logits/chosen": -3.609375, "logits/rejected": -3.671875, "logps/chosen": -330.0, "logps/rejected": -476.0, "loss": 0.8174, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.9765625, "rewards/margins": 1.6796875, "rewards/rejected": -3.65625, "step": 5350 }, { "epoch": 0.4116743471582181, "grad_norm": 17.45399500526455, "learning_rate": 3.660757802908848e-07, "logits/chosen": -3.546875, "logits/rejected": -3.59375, "logps/chosen": -334.0, "logps/rejected": -472.0, "loss": 0.7902, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.046875, "rewards/margins": 1.4375, "rewards/rejected": -3.484375, "step": 5360 }, { "epoch": 0.41244239631336405, "grad_norm": 17.242788946435322, "learning_rate": 3.65481739855997e-07, "logits/chosen": -3.484375, "logits/rejected": -3.671875, "logps/chosen": -308.0, "logps/rejected": -438.0, "loss": 0.7638, "rewards/accuracies": 0.78125, "rewards/chosen": -1.796875, "rewards/margins": 1.3671875, "rewards/rejected": -3.171875, "step": 5370 }, { "epoch": 0.41321044546851, "grad_norm": 20.796868017295157, "learning_rate": 3.6488686936812306e-07, "logits/chosen": -3.53125, "logits/rejected": -3.46875, "logps/chosen": -308.0, "logps/rejected": -488.0, "loss": 0.7833, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8203125, "rewards/margins": 1.828125, "rewards/rejected": -3.640625, "step": 5380 }, { "epoch": 0.41397849462365593, "grad_norm": 16.876455631447925, "learning_rate": 3.6429117310303876e-07, "logits/chosen": -3.53125, "logits/rejected": -3.515625, "logps/chosen": -310.0, "logps/rejected": -476.0, "loss": 0.7774, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9453125, "rewards/margins": 1.71875, "rewards/rejected": -3.671875, "step": 5390 }, { "epoch": 0.4147465437788018, "grad_norm": 20.218800824356673, "learning_rate": 3.6369465534245536e-07, "logits/chosen": -3.5, "logits/rejected": -3.625, "logps/chosen": -326.0, "logps/rejected": -480.0, "loss": 0.7899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.953125, "rewards/margins": 1.671875, "rewards/rejected": -3.625, "step": 5400 }, { "epoch": 0.41551459293394777, "grad_norm": 20.237752194196474, "learning_rate": 3.6309732037398874e-07, "logits/chosen": -3.46875, "logits/rejected": -3.765625, "logps/chosen": -336.0, "logps/rejected": -476.0, "loss": 0.7772, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.015625, "rewards/margins": 1.703125, "rewards/rejected": -3.71875, "step": 5410 }, { "epoch": 0.4162826420890937, "grad_norm": 19.63700707754733, "learning_rate": 3.624991724911289e-07, "logits/chosen": -3.46875, "logits/rejected": -3.65625, "logps/chosen": -318.0, "logps/rejected": -454.0, "loss": 0.79, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9140625, "rewards/margins": 1.59375, "rewards/rejected": -3.515625, "step": 5420 }, { "epoch": 0.41705069124423966, "grad_norm": 16.52200945295782, "learning_rate": 3.6190021599320846e-07, "logits/chosen": -3.375, "logits/rejected": -3.46875, "logps/chosen": -314.0, "logps/rejected": -436.0, "loss": 0.8159, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8984375, "rewards/margins": 1.3046875, "rewards/rejected": -3.203125, "step": 5430 }, { "epoch": 0.41781874039938555, "grad_norm": 20.36533420941262, "learning_rate": 3.613004551853725e-07, "logits/chosen": -3.46875, "logits/rejected": -3.625, "logps/chosen": -290.0, "logps/rejected": -392.0, "loss": 0.8395, "rewards/accuracies": 0.75, "rewards/chosen": -1.7265625, "rewards/margins": 1.1640625, "rewards/rejected": -2.890625, "step": 5440 }, { "epoch": 0.4185867895545315, "grad_norm": 17.331398112650547, "learning_rate": 3.606998943785471e-07, "logits/chosen": -3.609375, "logits/rejected": -3.53125, "logps/chosen": -294.0, "logps/rejected": -424.0, "loss": 0.7599, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6875, "rewards/margins": 1.5078125, "rewards/rejected": -3.203125, "step": 5450 }, { "epoch": 0.41935483870967744, "grad_norm": 15.516281156546537, "learning_rate": 3.6009853788940856e-07, "logits/chosen": -3.5625, "logits/rejected": -3.515625, "logps/chosen": -314.0, "logps/rejected": -496.0, "loss": 0.8142, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8125, "rewards/margins": 1.890625, "rewards/rejected": -3.703125, "step": 5460 }, { "epoch": 0.4201228878648233, "grad_norm": 14.629329051414318, "learning_rate": 3.5949639004035243e-07, "logits/chosen": -3.453125, "logits/rejected": -3.328125, "logps/chosen": -310.0, "logps/rejected": -462.0, "loss": 0.8001, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8203125, "rewards/margins": 1.5703125, "rewards/rejected": -3.390625, "step": 5470 }, { "epoch": 0.42089093701996927, "grad_norm": 17.791760511220073, "learning_rate": 3.5889345515946226e-07, "logits/chosen": -3.453125, "logits/rejected": -3.453125, "logps/chosen": -292.0, "logps/rejected": -426.0, "loss": 0.8289, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7265625, "rewards/margins": 1.484375, "rewards/rejected": -3.203125, "step": 5480 }, { "epoch": 0.4216589861751152, "grad_norm": 19.73910061404625, "learning_rate": 3.582897375804784e-07, "logits/chosen": -3.53125, "logits/rejected": -3.421875, "logps/chosen": -294.0, "logps/rejected": -446.0, "loss": 0.818, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.7890625, "rewards/margins": 1.4921875, "rewards/rejected": -3.28125, "step": 5490 }, { "epoch": 0.42242703533026116, "grad_norm": 20.450487908117562, "learning_rate": 3.576852416427675e-07, "logits/chosen": -3.453125, "logits/rejected": -3.5625, "logps/chosen": -304.0, "logps/rejected": -442.0, "loss": 0.8441, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.765625, "rewards/margins": 1.5625, "rewards/rejected": -3.328125, "step": 5500 }, { "epoch": 0.42319508448540705, "grad_norm": 18.567522594798643, "learning_rate": 3.570799716912904e-07, "logits/chosen": -3.40625, "logits/rejected": -3.296875, "logps/chosen": -296.0, "logps/rejected": -452.0, "loss": 0.7862, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6171875, "rewards/margins": 1.7109375, "rewards/rejected": -3.328125, "step": 5510 }, { "epoch": 0.423963133640553, "grad_norm": 18.26891433893111, "learning_rate": 3.564739320765716e-07, "logits/chosen": -3.5, "logits/rejected": -3.703125, "logps/chosen": -296.0, "logps/rejected": -412.0, "loss": 0.7973, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.84375, "rewards/margins": 1.296875, "rewards/rejected": -3.140625, "step": 5520 }, { "epoch": 0.42473118279569894, "grad_norm": 16.310092186173854, "learning_rate": 3.558671271546678e-07, "logits/chosen": -3.375, "logits/rejected": -3.46875, "logps/chosen": -294.0, "logps/rejected": -420.0, "loss": 0.7604, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6953125, "rewards/margins": 1.4609375, "rewards/rejected": -3.15625, "step": 5530 }, { "epoch": 0.4254992319508449, "grad_norm": 17.446789708599272, "learning_rate": 3.552595612871362e-07, "logits/chosen": -3.4375, "logits/rejected": -3.59375, "logps/chosen": -290.0, "logps/rejected": -410.0, "loss": 0.8018, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.578125, "rewards/margins": 1.3828125, "rewards/rejected": -2.96875, "step": 5540 }, { "epoch": 0.42626728110599077, "grad_norm": 16.013563640703854, "learning_rate": 3.546512388410037e-07, "logits/chosen": -3.40625, "logits/rejected": -3.53125, "logps/chosen": -286.0, "logps/rejected": -422.0, "loss": 0.765, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.5078125, "rewards/margins": 1.484375, "rewards/rejected": -2.984375, "step": 5550 }, { "epoch": 0.4270353302611367, "grad_norm": 16.79542470114921, "learning_rate": 3.540421641887353e-07, "logits/chosen": -3.5625, "logits/rejected": -3.578125, "logps/chosen": -330.0, "logps/rejected": -454.0, "loss": 0.7783, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.921875, "rewards/margins": 1.3984375, "rewards/rejected": -3.328125, "step": 5560 }, { "epoch": 0.42780337941628266, "grad_norm": 17.51569822785341, "learning_rate": 3.534323417082028e-07, "logits/chosen": -3.546875, "logits/rejected": -3.6875, "logps/chosen": -324.0, "logps/rejected": -458.0, "loss": 0.7987, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9921875, "rewards/margins": 1.5625, "rewards/rejected": -3.5625, "step": 5570 }, { "epoch": 0.42857142857142855, "grad_norm": 16.682102088449923, "learning_rate": 3.528217757826529e-07, "logits/chosen": -3.5625, "logits/rejected": -3.6875, "logps/chosen": -332.0, "logps/rejected": -512.0, "loss": 0.7597, "rewards/accuracies": 0.84375, "rewards/chosen": -2.046875, "rewards/margins": 1.84375, "rewards/rejected": -3.890625, "step": 5580 }, { "epoch": 0.4293394777265745, "grad_norm": 24.502818718372758, "learning_rate": 3.5221047080067606e-07, "logits/chosen": -3.65625, "logits/rejected": -3.5625, "logps/chosen": -308.0, "logps/rejected": -464.0, "loss": 0.7748, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.90625, "rewards/margins": 1.7265625, "rewards/rejected": -3.625, "step": 5590 }, { "epoch": 0.43010752688172044, "grad_norm": 17.891891932043634, "learning_rate": 3.515984311561751e-07, "logits/chosen": -3.609375, "logits/rejected": -3.765625, "logps/chosen": -340.0, "logps/rejected": -520.0, "loss": 0.7738, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0, "rewards/margins": 2.0, "rewards/rejected": -4.0, "step": 5600 }, { "epoch": 0.4308755760368664, "grad_norm": 17.245639290178275, "learning_rate": 3.5098566124833315e-07, "logits/chosen": -3.640625, "logits/rejected": -3.734375, "logps/chosen": -320.0, "logps/rejected": -502.0, "loss": 0.7596, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.921875, "rewards/rejected": -3.953125, "step": 5610 }, { "epoch": 0.43164362519201227, "grad_norm": 17.181338323097346, "learning_rate": 3.5037216548158243e-07, "logits/chosen": -3.578125, "logits/rejected": -3.53125, "logps/chosen": -310.0, "logps/rejected": -442.0, "loss": 0.8171, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.84375, "rewards/margins": 1.296875, "rewards/rejected": -3.140625, "step": 5620 }, { "epoch": 0.4324116743471582, "grad_norm": 17.867738306631768, "learning_rate": 3.4975794826557246e-07, "logits/chosen": -3.5625, "logits/rejected": -3.640625, "logps/chosen": -292.0, "logps/rejected": -460.0, "loss": 0.7704, "rewards/accuracies": 0.84375, "rewards/chosen": -1.75, "rewards/margins": 1.7109375, "rewards/rejected": -3.46875, "step": 5630 }, { "epoch": 0.43317972350230416, "grad_norm": 17.582447905726788, "learning_rate": 3.4914301401513827e-07, "logits/chosen": -3.484375, "logits/rejected": -3.390625, "logps/chosen": -292.0, "logps/rejected": -448.0, "loss": 0.813, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.75, "rewards/margins": 1.6484375, "rewards/rejected": -3.40625, "step": 5640 }, { "epoch": 0.4339477726574501, "grad_norm": 18.40147042323575, "learning_rate": 3.4852736715026877e-07, "logits/chosen": -3.53125, "logits/rejected": -3.625, "logps/chosen": -296.0, "logps/rejected": -464.0, "loss": 0.751, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6640625, "rewards/margins": 1.78125, "rewards/rejected": -3.4375, "step": 5650 }, { "epoch": 0.434715821812596, "grad_norm": 18.156349242323607, "learning_rate": 3.47911012096075e-07, "logits/chosen": -3.546875, "logits/rejected": -3.671875, "logps/chosen": -286.0, "logps/rejected": -450.0, "loss": 0.8415, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.734375, "rewards/margins": 1.65625, "rewards/rejected": -3.390625, "step": 5660 }, { "epoch": 0.43548387096774194, "grad_norm": 15.659128294840436, "learning_rate": 3.4729395328275815e-07, "logits/chosen": -3.328125, "logits/rejected": -3.34375, "logps/chosen": -306.0, "logps/rejected": -446.0, "loss": 0.8047, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6875, "rewards/margins": 1.546875, "rewards/rejected": -3.234375, "step": 5670 }, { "epoch": 0.4362519201228879, "grad_norm": 17.546746600605292, "learning_rate": 3.466761951455781e-07, "logits/chosen": -3.359375, "logits/rejected": -3.421875, "logps/chosen": -322.0, "logps/rejected": -460.0, "loss": 0.7745, "rewards/accuracies": 0.84375, "rewards/chosen": -1.75, "rewards/margins": 1.5, "rewards/rejected": -3.25, "step": 5680 }, { "epoch": 0.43701996927803377, "grad_norm": 18.902482442001343, "learning_rate": 3.4605774212482103e-07, "logits/chosen": -3.453125, "logits/rejected": -3.390625, "logps/chosen": -332.0, "logps/rejected": -474.0, "loss": 0.7889, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9453125, "rewards/margins": 1.5, "rewards/rejected": -3.4375, "step": 5690 }, { "epoch": 0.4377880184331797, "grad_norm": 18.0923119114921, "learning_rate": 3.4543859866576795e-07, "logits/chosen": -3.5625, "logits/rejected": -3.75, "logps/chosen": -356.0, "logps/rejected": -458.0, "loss": 0.8337, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.3125, "rewards/margins": 1.21875, "rewards/rejected": -3.53125, "step": 5700 }, { "epoch": 0.43855606758832566, "grad_norm": 21.009413242189897, "learning_rate": 3.448187692186625e-07, "logits/chosen": -3.421875, "logits/rejected": -3.65625, "logps/chosen": -300.0, "logps/rejected": -414.0, "loss": 0.8044, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8046875, "rewards/margins": 1.34375, "rewards/rejected": -3.140625, "step": 5710 }, { "epoch": 0.4393241167434716, "grad_norm": 14.378983959639722, "learning_rate": 3.441982582386789e-07, "logits/chosen": -3.375, "logits/rejected": -3.53125, "logps/chosen": -284.0, "logps/rejected": -406.0, "loss": 0.7879, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.578125, "rewards/margins": 1.4765625, "rewards/rejected": -3.0625, "step": 5720 }, { "epoch": 0.4400921658986175, "grad_norm": 15.728845765650982, "learning_rate": 3.4357707018589035e-07, "logits/chosen": -3.359375, "logits/rejected": -3.4375, "logps/chosen": -298.0, "logps/rejected": -470.0, "loss": 0.7491, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5859375, "rewards/margins": 1.9375, "rewards/rejected": -3.53125, "step": 5730 }, { "epoch": 0.44086021505376344, "grad_norm": 19.151656747159073, "learning_rate": 3.429552095252362e-07, "logits/chosen": -3.34375, "logits/rejected": -3.453125, "logps/chosen": -314.0, "logps/rejected": -478.0, "loss": 0.8117, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.609375, "rewards/margins": 1.828125, "rewards/rejected": -3.4375, "step": 5740 }, { "epoch": 0.4416282642089094, "grad_norm": 17.87683809599679, "learning_rate": 3.4233268072649095e-07, "logits/chosen": -3.484375, "logits/rejected": -3.328125, "logps/chosen": -296.0, "logps/rejected": -426.0, "loss": 0.7717, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8046875, "rewards/margins": 1.3671875, "rewards/rejected": -3.171875, "step": 5750 }, { "epoch": 0.4423963133640553, "grad_norm": 21.89971687683586, "learning_rate": 3.41709488264231e-07, "logits/chosen": -3.515625, "logits/rejected": -3.65625, "logps/chosen": -300.0, "logps/rejected": -464.0, "loss": 0.7683, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6328125, "rewards/margins": 1.7890625, "rewards/rejected": -3.421875, "step": 5760 }, { "epoch": 0.4431643625192012, "grad_norm": 16.728751425061155, "learning_rate": 3.4108563661780316e-07, "logits/chosen": -3.296875, "logits/rejected": -3.1875, "logps/chosen": -296.0, "logps/rejected": -448.0, "loss": 0.8349, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8203125, "rewards/margins": 1.5390625, "rewards/rejected": -3.359375, "step": 5770 }, { "epoch": 0.44393241167434716, "grad_norm": 19.09682619564883, "learning_rate": 3.4046113027129236e-07, "logits/chosen": -3.4375, "logits/rejected": -3.5625, "logps/chosen": -298.0, "logps/rejected": -438.0, "loss": 0.7832, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.890625, "rewards/margins": 1.5390625, "rewards/rejected": -3.421875, "step": 5780 }, { "epoch": 0.4447004608294931, "grad_norm": 16.703014504291087, "learning_rate": 3.3983597371348926e-07, "logits/chosen": -3.53125, "logits/rejected": -3.515625, "logps/chosen": -300.0, "logps/rejected": -438.0, "loss": 0.791, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7890625, "rewards/margins": 1.515625, "rewards/rejected": -3.3125, "step": 5790 }, { "epoch": 0.445468509984639, "grad_norm": 15.403839416637318, "learning_rate": 3.3921017143785813e-07, "logits/chosen": -3.390625, "logits/rejected": -3.515625, "logps/chosen": -322.0, "logps/rejected": -464.0, "loss": 0.7791, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8125, "rewards/margins": 1.6875, "rewards/rejected": -3.5, "step": 5800 }, { "epoch": 0.44623655913978494, "grad_norm": 19.988635619079393, "learning_rate": 3.385837279425044e-07, "logits/chosen": -3.390625, "logits/rejected": -3.65625, "logps/chosen": -306.0, "logps/rejected": -466.0, "loss": 0.7788, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.828125, "rewards/margins": 1.6640625, "rewards/rejected": -3.5, "step": 5810 }, { "epoch": 0.4470046082949309, "grad_norm": 19.256723130682243, "learning_rate": 3.3795664773014256e-07, "logits/chosen": -3.359375, "logits/rejected": -3.546875, "logps/chosen": -280.0, "logps/rejected": -462.0, "loss": 0.7747, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6171875, "rewards/margins": 1.8125, "rewards/rejected": -3.421875, "step": 5820 }, { "epoch": 0.4477726574500768, "grad_norm": 15.314444927276705, "learning_rate": 3.373289353080634e-07, "logits/chosen": -3.5, "logits/rejected": -3.578125, "logps/chosen": -300.0, "logps/rejected": -448.0, "loss": 0.7831, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6484375, "rewards/margins": 1.71875, "rewards/rejected": -3.375, "step": 5830 }, { "epoch": 0.4485407066052227, "grad_norm": 14.864004440547522, "learning_rate": 3.367005951881022e-07, "logits/chosen": -3.40625, "logits/rejected": -3.546875, "logps/chosen": -312.0, "logps/rejected": -464.0, "loss": 0.8023, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8203125, "rewards/margins": 1.6171875, "rewards/rejected": -3.4375, "step": 5840 }, { "epoch": 0.44930875576036866, "grad_norm": 18.827184567422428, "learning_rate": 3.360716318866058e-07, "logits/chosen": -3.5, "logits/rejected": -3.484375, "logps/chosen": -310.0, "logps/rejected": -434.0, "loss": 0.7664, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8828125, "rewards/margins": 1.2734375, "rewards/rejected": -3.15625, "step": 5850 }, { "epoch": 0.4500768049155146, "grad_norm": 19.151156906129984, "learning_rate": 3.3544204992440026e-07, "logits/chosen": -3.53125, "logits/rejected": -3.53125, "logps/chosen": -310.0, "logps/rejected": -458.0, "loss": 0.808, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8984375, "rewards/margins": 1.6015625, "rewards/rejected": -3.5, "step": 5860 }, { "epoch": 0.4508448540706605, "grad_norm": 18.99727800611593, "learning_rate": 3.348118538267586e-07, "logits/chosen": -3.34375, "logits/rejected": -3.4375, "logps/chosen": -336.0, "logps/rejected": -480.0, "loss": 0.775, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.015625, "rewards/margins": 1.640625, "rewards/rejected": -3.65625, "step": 5870 }, { "epoch": 0.45161290322580644, "grad_norm": 17.75701134056549, "learning_rate": 3.3418104812336784e-07, "logits/chosen": -3.25, "logits/rejected": -3.453125, "logps/chosen": -312.0, "logps/rejected": -428.0, "loss": 0.7722, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.75, "rewards/margins": 1.390625, "rewards/rejected": -3.140625, "step": 5880 }, { "epoch": 0.4523809523809524, "grad_norm": 17.363479740427582, "learning_rate": 3.335496373482969e-07, "logits/chosen": -3.5, "logits/rejected": -3.4375, "logps/chosen": -292.0, "logps/rejected": -452.0, "loss": 0.778, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8046875, "rewards/margins": 1.6484375, "rewards/rejected": -3.453125, "step": 5890 }, { "epoch": 0.45314900153609833, "grad_norm": 20.119465509595983, "learning_rate": 3.3291762603996366e-07, "logits/chosen": -3.390625, "logits/rejected": -3.46875, "logps/chosen": -316.0, "logps/rejected": -444.0, "loss": 0.8041, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.828125, "rewards/margins": 1.4609375, "rewards/rejected": -3.28125, "step": 5900 }, { "epoch": 0.4539170506912442, "grad_norm": 15.507697484416754, "learning_rate": 3.322850187411025e-07, "logits/chosen": -3.484375, "logits/rejected": -3.40625, "logps/chosen": -318.0, "logps/rejected": -464.0, "loss": 0.7693, "rewards/accuracies": 0.75, "rewards/chosen": -1.9140625, "rewards/margins": 1.4375, "rewards/rejected": -3.359375, "step": 5910 }, { "epoch": 0.45468509984639016, "grad_norm": 20.174124371398882, "learning_rate": 3.316518199987318e-07, "logits/chosen": -3.546875, "logits/rejected": -3.28125, "logps/chosen": -314.0, "logps/rejected": -480.0, "loss": 0.782, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.03125, "rewards/margins": 1.65625, "rewards/rejected": -3.6875, "step": 5920 }, { "epoch": 0.4554531490015361, "grad_norm": 17.550601849315267, "learning_rate": 3.310180343641208e-07, "logits/chosen": -3.453125, "logits/rejected": -3.484375, "logps/chosen": -326.0, "logps/rejected": -484.0, "loss": 0.7974, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8125, "rewards/margins": 1.8046875, "rewards/rejected": -3.609375, "step": 5930 }, { "epoch": 0.45622119815668205, "grad_norm": 14.757609243485273, "learning_rate": 3.303836663927574e-07, "logits/chosen": -3.53125, "logits/rejected": -3.59375, "logps/chosen": -304.0, "logps/rejected": -432.0, "loss": 0.7883, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.828125, "rewards/margins": 1.421875, "rewards/rejected": -3.25, "step": 5940 }, { "epoch": 0.45698924731182794, "grad_norm": 16.861908226160686, "learning_rate": 3.297487206443151e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5, "logps/chosen": -292.0, "logps/rejected": -444.0, "loss": 0.8216, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.828125, "rewards/margins": 1.453125, "rewards/rejected": -3.28125, "step": 5950 }, { "epoch": 0.4577572964669739, "grad_norm": 16.15474489673189, "learning_rate": 3.291132016826202e-07, "logits/chosen": -3.296875, "logits/rejected": -3.484375, "logps/chosen": -286.0, "logps/rejected": -446.0, "loss": 0.7793, "rewards/accuracies": 0.8125, "rewards/chosen": -1.703125, "rewards/margins": 1.609375, "rewards/rejected": -3.3125, "step": 5960 }, { "epoch": 0.45852534562211983, "grad_norm": 16.953677186705, "learning_rate": 3.2847711407561934e-07, "logits/chosen": -3.203125, "logits/rejected": -3.296875, "logps/chosen": -292.0, "logps/rejected": -462.0, "loss": 0.7292, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5703125, "rewards/margins": 1.765625, "rewards/rejected": -3.328125, "step": 5970 }, { "epoch": 0.4592933947772657, "grad_norm": 17.849985091938727, "learning_rate": 3.2784046239534626e-07, "logits/chosen": -3.34375, "logits/rejected": -3.5625, "logps/chosen": -334.0, "logps/rejected": -474.0, "loss": 0.791, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.125, "rewards/margins": 1.4921875, "rewards/rejected": -3.609375, "step": 5980 }, { "epoch": 0.46006144393241166, "grad_norm": 20.891646193083393, "learning_rate": 3.272032512178892e-07, "logits/chosen": -3.46875, "logits/rejected": -3.453125, "logps/chosen": -310.0, "logps/rejected": -496.0, "loss": 0.7909, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8671875, "rewards/margins": 1.6875, "rewards/rejected": -3.5625, "step": 5990 }, { "epoch": 0.4608294930875576, "grad_norm": 17.928374601684563, "learning_rate": 3.2656548512335793e-07, "logits/chosen": -3.578125, "logits/rejected": -3.71875, "logps/chosen": -312.0, "logps/rejected": -474.0, "loss": 0.7848, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8828125, "rewards/margins": 1.6640625, "rewards/rejected": -3.546875, "step": 6000 }, { "epoch": 0.4608294930875576, "eval_logits/chosen": -3.40625, "eval_logits/rejected": -3.53125, "eval_logps/chosen": -346.0, "eval_logps/rejected": -452.0, "eval_loss": 0.4593489170074463, "eval_rewards/accuracies": 0.7602163553237915, "eval_rewards/chosen": -2.0625, "eval_rewards/margins": 1.3359375, "eval_rewards/rejected": -3.390625, "eval_runtime": 2264.3288, "eval_samples_per_second": 41.131, "eval_steps_per_second": 0.643, "step": 6000 }, { "epoch": 0.46159754224270355, "grad_norm": 18.822049151897886, "learning_rate": 3.259271686958507e-07, "logits/chosen": -3.359375, "logits/rejected": -3.578125, "logps/chosen": -308.0, "logps/rejected": -426.0, "loss": 0.7926, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8828125, "rewards/margins": 1.34375, "rewards/rejected": -3.21875, "step": 6010 }, { "epoch": 0.46236559139784944, "grad_norm": 17.042977054030523, "learning_rate": 3.2528830652342154e-07, "logits/chosen": -3.453125, "logits/rejected": -3.375, "logps/chosen": -332.0, "logps/rejected": -494.0, "loss": 0.7648, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8515625, "rewards/margins": 1.828125, "rewards/rejected": -3.6875, "step": 6020 }, { "epoch": 0.4631336405529954, "grad_norm": 19.132351253311636, "learning_rate": 3.24648903198047e-07, "logits/chosen": -3.5, "logits/rejected": -3.671875, "logps/chosen": -302.0, "logps/rejected": -434.0, "loss": 0.7707, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.9921875, "rewards/margins": 1.4140625, "rewards/rejected": -3.40625, "step": 6030 }, { "epoch": 0.46390168970814133, "grad_norm": 17.570688552387573, "learning_rate": 3.240089633155936e-07, "logits/chosen": -3.578125, "logits/rejected": -3.671875, "logps/chosen": -334.0, "logps/rejected": -478.0, "loss": 0.7551, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9765625, "rewards/margins": 1.6015625, "rewards/rejected": -3.59375, "step": 6040 }, { "epoch": 0.4646697388632873, "grad_norm": 16.482163007886193, "learning_rate": 3.2336849147578433e-07, "logits/chosen": -3.453125, "logits/rejected": -3.6875, "logps/chosen": -324.0, "logps/rejected": -470.0, "loss": 0.7634, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8984375, "rewards/margins": 1.734375, "rewards/rejected": -3.640625, "step": 6050 }, { "epoch": 0.46543778801843316, "grad_norm": 20.36526549613774, "learning_rate": 3.2272749228216545e-07, "logits/chosen": -3.5, "logits/rejected": -3.4375, "logps/chosen": -314.0, "logps/rejected": -448.0, "loss": 0.7828, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8984375, "rewards/margins": 1.53125, "rewards/rejected": -3.421875, "step": 6060 }, { "epoch": 0.4662058371735791, "grad_norm": 17.101959290470777, "learning_rate": 3.220859703420742e-07, "logits/chosen": -3.296875, "logits/rejected": -3.4375, "logps/chosen": -296.0, "logps/rejected": -470.0, "loss": 0.744, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6171875, "rewards/margins": 1.828125, "rewards/rejected": -3.4375, "step": 6070 }, { "epoch": 0.46697388632872505, "grad_norm": 18.362341758238866, "learning_rate": 3.2144393026660475e-07, "logits/chosen": -3.34375, "logits/rejected": -3.484375, "logps/chosen": -322.0, "logps/rejected": -434.0, "loss": 0.7754, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9375, "rewards/margins": 1.3359375, "rewards/rejected": -3.28125, "step": 6080 }, { "epoch": 0.46774193548387094, "grad_norm": 18.52019522847079, "learning_rate": 3.2080137667057595e-07, "logits/chosen": -3.4375, "logits/rejected": -3.46875, "logps/chosen": -318.0, "logps/rejected": -474.0, "loss": 0.8438, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.828125, "rewards/margins": 1.6953125, "rewards/rejected": -3.53125, "step": 6090 }, { "epoch": 0.4685099846390169, "grad_norm": 17.702160098544777, "learning_rate": 3.2015831417249747e-07, "logits/chosen": -3.375, "logits/rejected": -3.3125, "logps/chosen": -304.0, "logps/rejected": -452.0, "loss": 0.7944, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9453125, "rewards/margins": 1.46875, "rewards/rejected": -3.40625, "step": 6100 }, { "epoch": 0.46927803379416283, "grad_norm": 21.553695407910784, "learning_rate": 3.1951474739453656e-07, "logits/chosen": -3.28125, "logits/rejected": -3.25, "logps/chosen": -330.0, "logps/rejected": -480.0, "loss": 0.7477, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.859375, "rewards/margins": 1.515625, "rewards/rejected": -3.359375, "step": 6110 }, { "epoch": 0.4700460829493088, "grad_norm": 33.97491815173837, "learning_rate": 3.188706809624856e-07, "logits/chosen": -3.359375, "logits/rejected": -3.515625, "logps/chosen": -314.0, "logps/rejected": -464.0, "loss": 0.8148, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8984375, "rewards/margins": 1.484375, "rewards/rejected": -3.390625, "step": 6120 }, { "epoch": 0.47081413210445466, "grad_norm": 14.856427332150107, "learning_rate": 3.1822611950572797e-07, "logits/chosen": -3.265625, "logits/rejected": -3.21875, "logps/chosen": -312.0, "logps/rejected": -468.0, "loss": 0.7774, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.921875, "rewards/margins": 1.5390625, "rewards/rejected": -3.453125, "step": 6130 }, { "epoch": 0.4715821812596006, "grad_norm": 21.737519826941227, "learning_rate": 3.175810676572054e-07, "logits/chosen": -3.296875, "logits/rejected": -3.421875, "logps/chosen": -316.0, "logps/rejected": -470.0, "loss": 0.7667, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.796875, "rewards/margins": 1.7578125, "rewards/rejected": -3.5625, "step": 6140 }, { "epoch": 0.47235023041474655, "grad_norm": 19.559536508090744, "learning_rate": 3.1693553005338453e-07, "logits/chosen": -3.265625, "logits/rejected": -3.265625, "logps/chosen": -306.0, "logps/rejected": -466.0, "loss": 0.7394, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8515625, "rewards/margins": 1.59375, "rewards/rejected": -3.453125, "step": 6150 }, { "epoch": 0.4731182795698925, "grad_norm": 20.069769105119995, "learning_rate": 3.1628951133422296e-07, "logits/chosen": -3.40625, "logits/rejected": -3.46875, "logps/chosen": -320.0, "logps/rejected": -434.0, "loss": 0.7957, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9140625, "rewards/margins": 1.3046875, "rewards/rejected": -3.21875, "step": 6160 }, { "epoch": 0.4738863287250384, "grad_norm": 13.877492861556338, "learning_rate": 3.1564301614313694e-07, "logits/chosen": -3.390625, "logits/rejected": -3.359375, "logps/chosen": -310.0, "logps/rejected": -486.0, "loss": 0.7727, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8125, "rewards/margins": 1.859375, "rewards/rejected": -3.671875, "step": 6170 }, { "epoch": 0.47465437788018433, "grad_norm": 16.930556821069743, "learning_rate": 3.149960491269672e-07, "logits/chosen": -3.328125, "logits/rejected": -3.578125, "logps/chosen": -332.0, "logps/rejected": -476.0, "loss": 0.7345, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0, "rewards/margins": 1.6328125, "rewards/rejected": -3.640625, "step": 6180 }, { "epoch": 0.4754224270353303, "grad_norm": 20.094462969560787, "learning_rate": 3.143486149359461e-07, "logits/chosen": -3.375, "logits/rejected": -3.546875, "logps/chosen": -324.0, "logps/rejected": -498.0, "loss": 0.8115, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.8515625, "rewards/rejected": -3.875, "step": 6190 }, { "epoch": 0.47619047619047616, "grad_norm": 16.78212785252274, "learning_rate": 3.137007182236637e-07, "logits/chosen": -3.453125, "logits/rejected": -3.453125, "logps/chosen": -300.0, "logps/rejected": -468.0, "loss": 0.7796, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.890625, "rewards/margins": 1.6875, "rewards/rejected": -3.578125, "step": 6200 }, { "epoch": 0.4769585253456221, "grad_norm": 17.39600420351786, "learning_rate": 3.1305236364703445e-07, "logits/chosen": -3.359375, "logits/rejected": -3.53125, "logps/chosen": -304.0, "logps/rejected": -474.0, "loss": 0.7551, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.78125, "rewards/margins": 1.859375, "rewards/rejected": -3.640625, "step": 6210 }, { "epoch": 0.47772657450076805, "grad_norm": 17.724996241393537, "learning_rate": 3.1240355586626414e-07, "logits/chosen": -3.484375, "logits/rejected": -3.46875, "logps/chosen": -306.0, "logps/rejected": -496.0, "loss": 0.7857, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8515625, "rewards/margins": 1.90625, "rewards/rejected": -3.765625, "step": 6220 }, { "epoch": 0.478494623655914, "grad_norm": 17.82737127798486, "learning_rate": 3.117542995448158e-07, "logits/chosen": -3.4375, "logits/rejected": -3.265625, "logps/chosen": -300.0, "logps/rejected": -448.0, "loss": 0.7509, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7890625, "rewards/margins": 1.484375, "rewards/rejected": -3.28125, "step": 6230 }, { "epoch": 0.4792626728110599, "grad_norm": 17.275371254388375, "learning_rate": 3.1110459934937667e-07, "logits/chosen": -3.375, "logits/rejected": -3.390625, "logps/chosen": -274.0, "logps/rejected": -416.0, "loss": 0.7954, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.640625, "rewards/margins": 1.515625, "rewards/rejected": -3.15625, "step": 6240 }, { "epoch": 0.48003072196620583, "grad_norm": 19.747587111465716, "learning_rate": 3.104544599498242e-07, "logits/chosen": -3.296875, "logits/rejected": -3.453125, "logps/chosen": -320.0, "logps/rejected": -460.0, "loss": 0.7971, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.890625, "rewards/margins": 1.5546875, "rewards/rejected": -3.4375, "step": 6250 }, { "epoch": 0.4807987711213518, "grad_norm": 15.927345888269715, "learning_rate": 3.0980388601919286e-07, "logits/chosen": -3.421875, "logits/rejected": -3.5625, "logps/chosen": -328.0, "logps/rejected": -464.0, "loss": 0.8098, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.8671875, "rewards/margins": 1.5, "rewards/rejected": -3.375, "step": 6260 }, { "epoch": 0.4815668202764977, "grad_norm": 21.27782850039361, "learning_rate": 3.091528822336405e-07, "logits/chosen": -3.46875, "logits/rejected": -3.484375, "logps/chosen": -312.0, "logps/rejected": -430.0, "loss": 0.746, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9140625, "rewards/margins": 1.3359375, "rewards/rejected": -3.25, "step": 6270 }, { "epoch": 0.4823348694316436, "grad_norm": 16.3318226304912, "learning_rate": 3.0850145327241444e-07, "logits/chosen": -3.359375, "logits/rejected": -3.3125, "logps/chosen": -352.0, "logps/rejected": -504.0, "loss": 0.7536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.015625, "rewards/margins": 1.6640625, "rewards/rejected": -3.6875, "step": 6280 }, { "epoch": 0.48310291858678955, "grad_norm": 19.661991715267302, "learning_rate": 3.0784960381781834e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5625, "logps/chosen": -308.0, "logps/rejected": -460.0, "loss": 0.7549, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9375, "rewards/margins": 1.6015625, "rewards/rejected": -3.53125, "step": 6290 }, { "epoch": 0.4838709677419355, "grad_norm": 20.39589825983406, "learning_rate": 3.07197338555178e-07, "logits/chosen": -3.375, "logits/rejected": -3.671875, "logps/chosen": -342.0, "logps/rejected": -462.0, "loss": 0.8083, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.984375, "rewards/margins": 1.4296875, "rewards/rejected": -3.421875, "step": 6300 }, { "epoch": 0.4846390168970814, "grad_norm": 16.495543498144066, "learning_rate": 3.0654466217280797e-07, "logits/chosen": -3.4375, "logits/rejected": -3.453125, "logps/chosen": -292.0, "logps/rejected": -450.0, "loss": 0.7789, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8515625, "rewards/margins": 1.53125, "rewards/rejected": -3.390625, "step": 6310 }, { "epoch": 0.48540706605222733, "grad_norm": 20.719394036929284, "learning_rate": 3.05891579361978e-07, "logits/chosen": -3.375, "logits/rejected": -3.359375, "logps/chosen": -302.0, "logps/rejected": -424.0, "loss": 0.8197, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.734375, "rewards/margins": 1.3671875, "rewards/rejected": -3.109375, "step": 6320 }, { "epoch": 0.4861751152073733, "grad_norm": 18.404173257465995, "learning_rate": 3.05238094816879e-07, "logits/chosen": -3.390625, "logits/rejected": -3.375, "logps/chosen": -280.0, "logps/rejected": -434.0, "loss": 0.7676, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.71875, "rewards/margins": 1.5859375, "rewards/rejected": -3.296875, "step": 6330 }, { "epoch": 0.4869431643625192, "grad_norm": 19.67051495846302, "learning_rate": 3.045842132345895e-07, "logits/chosen": -3.390625, "logits/rejected": -3.625, "logps/chosen": -302.0, "logps/rejected": -478.0, "loss": 0.8015, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8203125, "rewards/margins": 1.8515625, "rewards/rejected": -3.671875, "step": 6340 }, { "epoch": 0.4877112135176651, "grad_norm": 16.58687861152321, "learning_rate": 3.0392993931504166e-07, "logits/chosen": -3.34375, "logits/rejected": -3.421875, "logps/chosen": -328.0, "logps/rejected": -492.0, "loss": 0.7784, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0625, "rewards/margins": 1.6796875, "rewards/rejected": -3.734375, "step": 6350 }, { "epoch": 0.48847926267281105, "grad_norm": 19.128481916180917, "learning_rate": 3.03275277760988e-07, "logits/chosen": -3.28125, "logits/rejected": -3.390625, "logps/chosen": -338.0, "logps/rejected": -468.0, "loss": 0.7889, "rewards/accuracies": 0.8125, "rewards/chosen": -1.96875, "rewards/margins": 1.46875, "rewards/rejected": -3.4375, "step": 6360 }, { "epoch": 0.489247311827957, "grad_norm": 21.808812221144542, "learning_rate": 3.0262023327796703e-07, "logits/chosen": -3.40625, "logits/rejected": -3.453125, "logps/chosen": -322.0, "logps/rejected": -480.0, "loss": 0.808, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0625, "rewards/margins": 1.5546875, "rewards/rejected": -3.609375, "step": 6370 }, { "epoch": 0.49001536098310294, "grad_norm": 18.084148863460435, "learning_rate": 3.019648105742696e-07, "logits/chosen": -3.515625, "logits/rejected": -3.484375, "logps/chosen": -298.0, "logps/rejected": -452.0, "loss": 0.7776, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8984375, "rewards/margins": 1.484375, "rewards/rejected": -3.375, "step": 6380 }, { "epoch": 0.49078341013824883, "grad_norm": 21.211138568490092, "learning_rate": 3.013090143609053e-07, "logits/chosen": -3.4375, "logits/rejected": -3.46875, "logps/chosen": -356.0, "logps/rejected": -472.0, "loss": 0.7567, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.4375, "rewards/rejected": -3.453125, "step": 6390 }, { "epoch": 0.4915514592933948, "grad_norm": 20.48720553603255, "learning_rate": 3.0065284935156817e-07, "logits/chosen": -3.421875, "logits/rejected": -3.40625, "logps/chosen": -344.0, "logps/rejected": -480.0, "loss": 0.8193, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.15625, "rewards/margins": 1.3984375, "rewards/rejected": -3.546875, "step": 6400 }, { "epoch": 0.4923195084485407, "grad_norm": 18.314130680646276, "learning_rate": 2.9999632026260327e-07, "logits/chosen": -3.4375, "logits/rejected": -3.28125, "logps/chosen": -312.0, "logps/rejected": -460.0, "loss": 0.7903, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0, "rewards/margins": 1.234375, "rewards/rejected": -3.234375, "step": 6410 }, { "epoch": 0.4930875576036866, "grad_norm": 15.054526562490578, "learning_rate": 2.993394318129726e-07, "logits/chosen": -3.578125, "logits/rejected": -3.640625, "logps/chosen": -300.0, "logps/rejected": -454.0, "loss": 0.7714, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.8671875, "rewards/margins": 1.5703125, "rewards/rejected": -3.4375, "step": 6420 }, { "epoch": 0.49385560675883255, "grad_norm": 20.328441455587214, "learning_rate": 2.986821887242209e-07, "logits/chosen": -3.34375, "logits/rejected": -3.71875, "logps/chosen": -340.0, "logps/rejected": -488.0, "loss": 0.7701, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.984375, "rewards/margins": 1.78125, "rewards/rejected": -3.765625, "step": 6430 }, { "epoch": 0.4946236559139785, "grad_norm": 19.228718959653726, "learning_rate": 2.9802459572044224e-07, "logits/chosen": -3.453125, "logits/rejected": -3.59375, "logps/chosen": -340.0, "logps/rejected": -474.0, "loss": 0.7831, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9453125, "rewards/margins": 1.6015625, "rewards/rejected": -3.546875, "step": 6440 }, { "epoch": 0.49539170506912444, "grad_norm": 17.188253768978374, "learning_rate": 2.973666575282456e-07, "logits/chosen": -3.421875, "logits/rejected": -3.578125, "logps/chosen": -340.0, "logps/rejected": -480.0, "loss": 0.8059, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.125, "rewards/margins": 1.5078125, "rewards/rejected": -3.640625, "step": 6450 }, { "epoch": 0.49615975422427033, "grad_norm": 20.517271799808636, "learning_rate": 2.967083788767212e-07, "logits/chosen": -3.453125, "logits/rejected": -3.640625, "logps/chosen": -308.0, "logps/rejected": -462.0, "loss": 0.7729, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.890625, "rewards/margins": 1.7109375, "rewards/rejected": -3.59375, "step": 6460 }, { "epoch": 0.4969278033794163, "grad_norm": 18.647747912556493, "learning_rate": 2.960497644974063e-07, "logits/chosen": -3.375, "logits/rejected": -3.453125, "logps/chosen": -296.0, "logps/rejected": -442.0, "loss": 0.774, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6484375, "rewards/margins": 1.6328125, "rewards/rejected": -3.28125, "step": 6470 }, { "epoch": 0.4976958525345622, "grad_norm": 18.631846458531726, "learning_rate": 2.9539081912425127e-07, "logits/chosen": -3.328125, "logits/rejected": -3.40625, "logps/chosen": -314.0, "logps/rejected": -454.0, "loss": 0.8007, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9453125, "rewards/margins": 1.4375, "rewards/rejected": -3.375, "step": 6480 }, { "epoch": 0.49846390168970817, "grad_norm": 18.37044152280594, "learning_rate": 2.947315474935858e-07, "logits/chosen": -3.3125, "logits/rejected": -3.453125, "logps/chosen": -326.0, "logps/rejected": -470.0, "loss": 0.7648, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8984375, "rewards/margins": 1.5390625, "rewards/rejected": -3.4375, "step": 6490 }, { "epoch": 0.49923195084485406, "grad_norm": 19.744935358183984, "learning_rate": 2.940719543440844e-07, "logits/chosen": -3.296875, "logits/rejected": -3.5, "logps/chosen": -308.0, "logps/rejected": -478.0, "loss": 0.7662, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.765625, "rewards/margins": 1.8515625, "rewards/rejected": -3.625, "step": 6500 }, { "epoch": 0.5, "grad_norm": 18.469616717027677, "learning_rate": 2.934120444167326e-07, "logits/chosen": -3.34375, "logits/rejected": -3.453125, "logps/chosen": -320.0, "logps/rejected": -456.0, "loss": 0.7942, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.78125, "rewards/margins": 1.546875, "rewards/rejected": -3.328125, "step": 6510 }, { "epoch": 0.500768049155146, "grad_norm": 16.67656973421717, "learning_rate": 2.9275182245479297e-07, "logits/chosen": -3.4375, "logits/rejected": -3.53125, "logps/chosen": -320.0, "logps/rejected": -460.0, "loss": 0.7735, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.96875, "rewards/margins": 1.59375, "rewards/rejected": -3.5625, "step": 6520 }, { "epoch": 0.5015360983102919, "grad_norm": 18.017217972110736, "learning_rate": 2.9209129320377076e-07, "logits/chosen": -3.40625, "logits/rejected": -3.59375, "logps/chosen": -336.0, "logps/rejected": -458.0, "loss": 0.7688, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.09375, "rewards/margins": 1.4609375, "rewards/rejected": -3.5625, "step": 6530 }, { "epoch": 0.5023041474654378, "grad_norm": 18.584243556972865, "learning_rate": 2.914304614113801e-07, "logits/chosen": -3.375, "logits/rejected": -3.515625, "logps/chosen": -338.0, "logps/rejected": -476.0, "loss": 0.7772, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9921875, "rewards/margins": 1.5234375, "rewards/rejected": -3.515625, "step": 6540 }, { "epoch": 0.5030721966205837, "grad_norm": 18.966971310193394, "learning_rate": 2.9076933182750954e-07, "logits/chosen": -3.375, "logits/rejected": -3.609375, "logps/chosen": -326.0, "logps/rejected": -458.0, "loss": 0.7345, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.046875, "rewards/margins": 1.5703125, "rewards/rejected": -3.625, "step": 6550 }, { "epoch": 0.5038402457757296, "grad_norm": 21.519988730439284, "learning_rate": 2.901079092041881e-07, "logits/chosen": -3.53125, "logits/rejected": -3.828125, "logps/chosen": -294.0, "logps/rejected": -454.0, "loss": 0.7622, "rewards/accuracies": 0.8125, "rewards/chosen": -1.84375, "rewards/margins": 1.7109375, "rewards/rejected": -3.5625, "step": 6560 }, { "epoch": 0.5046082949308756, "grad_norm": 18.064183475414758, "learning_rate": 2.894461982955514e-07, "logits/chosen": -3.59375, "logits/rejected": -3.609375, "logps/chosen": -338.0, "logps/rejected": -490.0, "loss": 0.741, "rewards/accuracies": 0.8125, "rewards/chosen": -2.15625, "rewards/margins": 1.640625, "rewards/rejected": -3.796875, "step": 6570 }, { "epoch": 0.5053763440860215, "grad_norm": 18.5366727146085, "learning_rate": 2.887842038578066e-07, "logits/chosen": -3.28125, "logits/rejected": -3.609375, "logps/chosen": -362.0, "logps/rejected": -524.0, "loss": 0.79, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.140625, "rewards/margins": 1.8828125, "rewards/rejected": -4.03125, "step": 6580 }, { "epoch": 0.5061443932411674, "grad_norm": 18.659090128669806, "learning_rate": 2.881219306491993e-07, "logits/chosen": -3.515625, "logits/rejected": -3.609375, "logps/chosen": -356.0, "logps/rejected": -516.0, "loss": 0.764, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.171875, "rewards/margins": 1.8359375, "rewards/rejected": -4.0, "step": 6590 }, { "epoch": 0.5069124423963134, "grad_norm": 19.13708541054816, "learning_rate": 2.8745938342997864e-07, "logits/chosen": -3.65625, "logits/rejected": -3.625, "logps/chosen": -302.0, "logps/rejected": -442.0, "loss": 0.8333, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9609375, "rewards/margins": 1.296875, "rewards/rejected": -3.265625, "step": 6600 }, { "epoch": 0.5076804915514593, "grad_norm": 21.632786738820172, "learning_rate": 2.8679656696236324e-07, "logits/chosen": -3.578125, "logits/rejected": -3.75, "logps/chosen": -324.0, "logps/rejected": -482.0, "loss": 0.7472, "rewards/accuracies": 0.8125, "rewards/chosen": -2.046875, "rewards/margins": 1.59375, "rewards/rejected": -3.640625, "step": 6610 }, { "epoch": 0.5084485407066052, "grad_norm": 20.006874673351827, "learning_rate": 2.8613348601050726e-07, "logits/chosen": -3.546875, "logits/rejected": -3.46875, "logps/chosen": -352.0, "logps/rejected": -472.0, "loss": 0.8006, "rewards/accuracies": 0.8125, "rewards/chosen": -2.171875, "rewards/margins": 1.4453125, "rewards/rejected": -3.625, "step": 6620 }, { "epoch": 0.5092165898617511, "grad_norm": 17.299334894786995, "learning_rate": 2.8547014534046534e-07, "logits/chosen": -3.546875, "logits/rejected": -3.5625, "logps/chosen": -304.0, "logps/rejected": -456.0, "loss": 0.7672, "rewards/accuracies": 0.8125, "rewards/chosen": -1.90625, "rewards/margins": 1.6640625, "rewards/rejected": -3.578125, "step": 6630 }, { "epoch": 0.5099846390168971, "grad_norm": 19.794239250496965, "learning_rate": 2.8480654972015953e-07, "logits/chosen": -3.421875, "logits/rejected": -3.484375, "logps/chosen": -328.0, "logps/rejected": -488.0, "loss": 0.7619, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.953125, "rewards/margins": 1.8046875, "rewards/rejected": -3.75, "step": 6640 }, { "epoch": 0.510752688172043, "grad_norm": 18.762730818144416, "learning_rate": 2.841427039193441e-07, "logits/chosen": -3.5, "logits/rejected": -3.578125, "logps/chosen": -314.0, "logps/rejected": -474.0, "loss": 0.7978, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.90625, "rewards/margins": 1.7265625, "rewards/rejected": -3.640625, "step": 6650 }, { "epoch": 0.511520737327189, "grad_norm": 16.74411045701083, "learning_rate": 2.834786127095715e-07, "logits/chosen": -3.515625, "logits/rejected": -3.46875, "logps/chosen": -326.0, "logps/rejected": -468.0, "loss": 0.7609, "rewards/accuracies": 0.78125, "rewards/chosen": -2.015625, "rewards/margins": 1.4921875, "rewards/rejected": -3.515625, "step": 6660 }, { "epoch": 0.5122887864823349, "grad_norm": 19.455889469115807, "learning_rate": 2.828142808641584e-07, "logits/chosen": -3.46875, "logits/rejected": -3.515625, "logps/chosen": -318.0, "logps/rejected": -482.0, "loss": 0.7821, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.046875, "rewards/margins": 1.5546875, "rewards/rejected": -3.59375, "step": 6670 }, { "epoch": 0.5130568356374808, "grad_norm": 17.711504168734624, "learning_rate": 2.821497131581507e-07, "logits/chosen": -3.421875, "logits/rejected": -3.546875, "logps/chosen": -308.0, "logps/rejected": -452.0, "loss": 0.7845, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.890625, "rewards/margins": 1.5390625, "rewards/rejected": -3.421875, "step": 6680 }, { "epoch": 0.5138248847926268, "grad_norm": 19.33224467344747, "learning_rate": 2.8148491436828997e-07, "logits/chosen": -3.640625, "logits/rejected": -3.6875, "logps/chosen": -328.0, "logps/rejected": -478.0, "loss": 0.7314, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.71875, "rewards/rejected": -3.828125, "step": 6690 }, { "epoch": 0.5145929339477726, "grad_norm": 17.687605796109057, "learning_rate": 2.808198892729784e-07, "logits/chosen": -3.5625, "logits/rejected": -3.65625, "logps/chosen": -324.0, "logps/rejected": -488.0, "loss": 0.7795, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9453125, "rewards/margins": 1.7265625, "rewards/rejected": -3.671875, "step": 6700 }, { "epoch": 0.5153609831029186, "grad_norm": 16.479326859423345, "learning_rate": 2.8015464265224516e-07, "logits/chosen": -3.421875, "logits/rejected": -3.6875, "logps/chosen": -324.0, "logps/rejected": -464.0, "loss": 0.7527, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.890625, "rewards/margins": 1.7109375, "rewards/rejected": -3.59375, "step": 6710 }, { "epoch": 0.5161290322580645, "grad_norm": 19.19857110360251, "learning_rate": 2.7948917928771153e-07, "logits/chosen": -3.28125, "logits/rejected": -3.640625, "logps/chosen": -308.0, "logps/rejected": -434.0, "loss": 0.7817, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.6875, "rewards/margins": 1.7421875, "rewards/rejected": -3.4375, "step": 6720 }, { "epoch": 0.5168970814132104, "grad_norm": 18.022320546825583, "learning_rate": 2.7882350396255655e-07, "logits/chosen": -3.40625, "logits/rejected": -3.515625, "logps/chosen": -328.0, "logps/rejected": -452.0, "loss": 0.7685, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.984375, "rewards/margins": 1.3671875, "rewards/rejected": -3.34375, "step": 6730 }, { "epoch": 0.5176651305683564, "grad_norm": 20.15488248553551, "learning_rate": 2.7815762146148303e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5625, "logps/chosen": -318.0, "logps/rejected": -474.0, "loss": 0.774, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.90625, "rewards/margins": 1.75, "rewards/rejected": -3.65625, "step": 6740 }, { "epoch": 0.5184331797235023, "grad_norm": 17.822375332796334, "learning_rate": 2.7749153657068267e-07, "logits/chosen": -3.46875, "logits/rejected": -3.609375, "logps/chosen": -348.0, "logps/rejected": -496.0, "loss": 0.7388, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.875, "rewards/margins": 1.59375, "rewards/rejected": -3.46875, "step": 6750 }, { "epoch": 0.5192012288786483, "grad_norm": 19.324304797408928, "learning_rate": 2.7682525407780216e-07, "logits/chosen": -3.375, "logits/rejected": -3.484375, "logps/chosen": -322.0, "logps/rejected": -494.0, "loss": 0.758, "rewards/accuracies": 0.875, "rewards/chosen": -1.8984375, "rewards/margins": 1.9140625, "rewards/rejected": -3.8125, "step": 6760 }, { "epoch": 0.5199692780337941, "grad_norm": 19.029156641146777, "learning_rate": 2.761587787719083e-07, "logits/chosen": -3.546875, "logits/rejected": -3.6875, "logps/chosen": -360.0, "logps/rejected": -492.0, "loss": 0.7857, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.125, "rewards/margins": 1.5234375, "rewards/rejected": -3.65625, "step": 6770 }, { "epoch": 0.5207373271889401, "grad_norm": 16.072760078610763, "learning_rate": 2.7549211544345376e-07, "logits/chosen": -3.5, "logits/rejected": -3.484375, "logps/chosen": -302.0, "logps/rejected": -470.0, "loss": 0.7531, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8359375, "rewards/margins": 1.640625, "rewards/rejected": -3.46875, "step": 6780 }, { "epoch": 0.521505376344086, "grad_norm": 18.1872178763105, "learning_rate": 2.7482526888424286e-07, "logits/chosen": -3.46875, "logits/rejected": -3.5625, "logps/chosen": -340.0, "logps/rejected": -502.0, "loss": 0.747, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.078125, "rewards/margins": 1.734375, "rewards/rejected": -3.796875, "step": 6790 }, { "epoch": 0.522273425499232, "grad_norm": 17.499155156442274, "learning_rate": 2.7415824388739665e-07, "logits/chosen": -3.484375, "logits/rejected": -3.484375, "logps/chosen": -324.0, "logps/rejected": -444.0, "loss": 0.7973, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8359375, "rewards/margins": 1.4375, "rewards/rejected": -3.265625, "step": 6800 }, { "epoch": 0.5230414746543779, "grad_norm": 19.717336834933235, "learning_rate": 2.7349104524731914e-07, "logits/chosen": -3.53125, "logits/rejected": -3.5625, "logps/chosen": -286.0, "logps/rejected": -424.0, "loss": 0.7862, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.765625, "rewards/margins": 1.4765625, "rewards/rejected": -3.25, "step": 6810 }, { "epoch": 0.5238095238095238, "grad_norm": 17.57519206446158, "learning_rate": 2.728236777596621e-07, "logits/chosen": -3.375, "logits/rejected": -3.515625, "logps/chosen": -328.0, "logps/rejected": -442.0, "loss": 0.7999, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9140625, "rewards/margins": 1.4140625, "rewards/rejected": -3.328125, "step": 6820 }, { "epoch": 0.5245775729646698, "grad_norm": 17.45259408519089, "learning_rate": 2.7215614622129103e-07, "logits/chosen": -3.484375, "logits/rejected": -3.46875, "logps/chosen": -316.0, "logps/rejected": -454.0, "loss": 0.7846, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.875, "rewards/margins": 1.5078125, "rewards/rejected": -3.390625, "step": 6830 }, { "epoch": 0.5253456221198156, "grad_norm": 17.90183493028394, "learning_rate": 2.714884554302506e-07, "logits/chosen": -3.546875, "logits/rejected": -3.5625, "logps/chosen": -314.0, "logps/rejected": -470.0, "loss": 0.7598, "rewards/accuracies": 0.84375, "rewards/chosen": -1.921875, "rewards/margins": 1.671875, "rewards/rejected": -3.59375, "step": 6840 }, { "epoch": 0.5261136712749616, "grad_norm": 24.03998504349876, "learning_rate": 2.7082061018573016e-07, "logits/chosen": -3.28125, "logits/rejected": -3.65625, "logps/chosen": -320.0, "logps/rejected": -470.0, "loss": 0.8035, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8828125, "rewards/margins": 1.8125, "rewards/rejected": -3.703125, "step": 6850 }, { "epoch": 0.5268817204301075, "grad_norm": 18.1906089255028, "learning_rate": 2.701526152880293e-07, "logits/chosen": -3.390625, "logits/rejected": -3.5, "logps/chosen": -312.0, "logps/rejected": -438.0, "loss": 0.8141, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7421875, "rewards/margins": 1.5078125, "rewards/rejected": -3.25, "step": 6860 }, { "epoch": 0.5276497695852534, "grad_norm": 19.54597933868929, "learning_rate": 2.6948447553852304e-07, "logits/chosen": -3.359375, "logits/rejected": -3.328125, "logps/chosen": -286.0, "logps/rejected": -468.0, "loss": 0.7366, "rewards/accuracies": 0.84375, "rewards/chosen": -1.53125, "rewards/margins": 1.75, "rewards/rejected": -3.28125, "step": 6870 }, { "epoch": 0.5284178187403994, "grad_norm": 23.999806456562688, "learning_rate": 2.688161957396279e-07, "logits/chosen": -3.34375, "logits/rejected": -3.4375, "logps/chosen": -290.0, "logps/rejected": -432.0, "loss": 0.7492, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.6015625, "rewards/margins": 1.546875, "rewards/rejected": -3.140625, "step": 6880 }, { "epoch": 0.5291858678955453, "grad_norm": 17.668188969707057, "learning_rate": 2.6814778069476673e-07, "logits/chosen": -3.59375, "logits/rejected": -3.5625, "logps/chosen": -310.0, "logps/rejected": -476.0, "loss": 0.7384, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8515625, "rewards/margins": 1.8125, "rewards/rejected": -3.671875, "step": 6890 }, { "epoch": 0.5299539170506913, "grad_norm": 21.658964045882353, "learning_rate": 2.674792352083347e-07, "logits/chosen": -3.375, "logits/rejected": -3.59375, "logps/chosen": -306.0, "logps/rejected": -468.0, "loss": 0.8043, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.828125, "rewards/margins": 1.671875, "rewards/rejected": -3.5, "step": 6900 }, { "epoch": 0.5307219662058372, "grad_norm": 16.75561014587604, "learning_rate": 2.668105640856644e-07, "logits/chosen": -3.625, "logits/rejected": -3.859375, "logps/chosen": -322.0, "logps/rejected": -444.0, "loss": 0.7883, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9765625, "rewards/margins": 1.453125, "rewards/rejected": -3.4375, "step": 6910 }, { "epoch": 0.5314900153609831, "grad_norm": 17.493016878514776, "learning_rate": 2.6614177213299153e-07, "logits/chosen": -3.5625, "logits/rejected": -3.640625, "logps/chosen": -320.0, "logps/rejected": -466.0, "loss": 0.8084, "rewards/accuracies": 0.84375, "rewards/chosen": -2.015625, "rewards/margins": 1.578125, "rewards/rejected": -3.59375, "step": 6920 }, { "epoch": 0.532258064516129, "grad_norm": 18.528894558772812, "learning_rate": 2.6547286415742046e-07, "logits/chosen": -3.375, "logits/rejected": -3.421875, "logps/chosen": -304.0, "logps/rejected": -458.0, "loss": 0.7752, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8046875, "rewards/margins": 1.7109375, "rewards/rejected": -3.515625, "step": 6930 }, { "epoch": 0.533026113671275, "grad_norm": 17.64193831019922, "learning_rate": 2.648038449668892e-07, "logits/chosen": -3.515625, "logits/rejected": -3.59375, "logps/chosen": -290.0, "logps/rejected": -420.0, "loss": 0.7935, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8515625, "rewards/margins": 1.3828125, "rewards/rejected": -3.234375, "step": 6940 }, { "epoch": 0.5337941628264209, "grad_norm": 17.033818160410732, "learning_rate": 2.641347193701353e-07, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -294.0, "logps/rejected": -426.0, "loss": 0.8094, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.828125, "rewards/margins": 1.3515625, "rewards/rejected": -3.171875, "step": 6950 }, { "epoch": 0.5345622119815668, "grad_norm": 19.134865210703136, "learning_rate": 2.634654921766611e-07, "logits/chosen": -3.421875, "logits/rejected": -3.4375, "logps/chosen": -310.0, "logps/rejected": -462.0, "loss": 0.7916, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9140625, "rewards/margins": 1.484375, "rewards/rejected": -3.390625, "step": 6960 }, { "epoch": 0.5353302611367128, "grad_norm": 16.334249222388447, "learning_rate": 2.627961681966992e-07, "logits/chosen": -3.390625, "logits/rejected": -3.28125, "logps/chosen": -294.0, "logps/rejected": -470.0, "loss": 0.7691, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7890625, "rewards/margins": 1.8046875, "rewards/rejected": -3.59375, "step": 6970 }, { "epoch": 0.5360983102918587, "grad_norm": 20.3514663496486, "learning_rate": 2.6212675224117797e-07, "logits/chosen": -3.390625, "logits/rejected": -3.671875, "logps/chosen": -306.0, "logps/rejected": -442.0, "loss": 0.7473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7890625, "rewards/margins": 1.53125, "rewards/rejected": -3.3125, "step": 6980 }, { "epoch": 0.5368663594470046, "grad_norm": 20.551785381164223, "learning_rate": 2.6145724912168675e-07, "logits/chosen": -3.421875, "logits/rejected": -3.390625, "logps/chosen": -334.0, "logps/rejected": -486.0, "loss": 0.7826, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9765625, "rewards/margins": 1.6796875, "rewards/rejected": -3.65625, "step": 6990 }, { "epoch": 0.5376344086021505, "grad_norm": 17.347776260853887, "learning_rate": 2.6078766365044135e-07, "logits/chosen": -3.546875, "logits/rejected": -3.640625, "logps/chosen": -288.0, "logps/rejected": -448.0, "loss": 0.7582, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.84375, "rewards/margins": 1.6171875, "rewards/rejected": -3.453125, "step": 7000 }, { "epoch": 0.5376344086021505, "eval_logits/chosen": -3.421875, "eval_logits/rejected": -3.515625, "eval_logps/chosen": -352.0, "eval_logps/rejected": -452.0, "eval_loss": 0.4553733468055725, "eval_rewards/accuracies": 0.7620192170143127, "eval_rewards/chosen": -2.09375, "eval_rewards/margins": 1.296875, "eval_rewards/rejected": -3.390625, "eval_runtime": 2264.1153, "eval_samples_per_second": 41.135, "eval_steps_per_second": 0.643, "step": 7000 }, { "epoch": 0.5384024577572964, "grad_norm": 31.840137736363033, "learning_rate": 2.601180006402497e-07, "logits/chosen": -3.46875, "logits/rejected": -3.484375, "logps/chosen": -300.0, "logps/rejected": -460.0, "loss": 0.7785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.765625, "rewards/margins": 1.609375, "rewards/rejected": -3.375, "step": 7010 }, { "epoch": 0.5391705069124424, "grad_norm": 17.46625044068098, "learning_rate": 2.5944826490447683e-07, "logits/chosen": -3.34375, "logits/rejected": -3.546875, "logps/chosen": -310.0, "logps/rejected": -470.0, "loss": 0.7618, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8125, "rewards/margins": 1.7265625, "rewards/rejected": -3.53125, "step": 7020 }, { "epoch": 0.5399385560675883, "grad_norm": 20.707389931848077, "learning_rate": 2.587784612570107e-07, "logits/chosen": -3.4375, "logits/rejected": -3.5, "logps/chosen": -340.0, "logps/rejected": -480.0, "loss": 0.8001, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.890625, "rewards/margins": 1.515625, "rewards/rejected": -3.40625, "step": 7030 }, { "epoch": 0.5407066052227343, "grad_norm": 17.57767555702952, "learning_rate": 2.5810859451222735e-07, "logits/chosen": -3.484375, "logits/rejected": -3.53125, "logps/chosen": -312.0, "logps/rejected": -468.0, "loss": 0.7302, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.984375, "rewards/margins": 1.6953125, "rewards/rejected": -3.671875, "step": 7040 }, { "epoch": 0.5414746543778802, "grad_norm": 18.497440329977465, "learning_rate": 2.574386694849561e-07, "logits/chosen": -3.546875, "logits/rejected": -3.59375, "logps/chosen": -360.0, "logps/rejected": -520.0, "loss": 0.7643, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.375, "rewards/margins": 1.6328125, "rewards/rejected": -4.0, "step": 7050 }, { "epoch": 0.5422427035330261, "grad_norm": 19.248501823299286, "learning_rate": 2.567686909904457e-07, "logits/chosen": -3.546875, "logits/rejected": -3.515625, "logps/chosen": -346.0, "logps/rejected": -498.0, "loss": 0.7502, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.078125, "rewards/margins": 1.59375, "rewards/rejected": -3.671875, "step": 7060 }, { "epoch": 0.543010752688172, "grad_norm": 18.85684879878204, "learning_rate": 2.5609866384432873e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5, "logps/chosen": -318.0, "logps/rejected": -480.0, "loss": 0.7385, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8984375, "rewards/margins": 1.71875, "rewards/rejected": -3.609375, "step": 7070 }, { "epoch": 0.543778801843318, "grad_norm": 18.861092073898867, "learning_rate": 2.554285928625877e-07, "logits/chosen": -3.4375, "logits/rejected": -3.4375, "logps/chosen": -348.0, "logps/rejected": -494.0, "loss": 0.773, "rewards/accuracies": 0.78125, "rewards/chosen": -2.21875, "rewards/margins": 1.546875, "rewards/rejected": -3.75, "step": 7080 }, { "epoch": 0.5445468509984639, "grad_norm": 18.900064275492134, "learning_rate": 2.547584828615201e-07, "logits/chosen": -3.578125, "logits/rejected": -3.703125, "logps/chosen": -316.0, "logps/rejected": -480.0, "loss": 0.7522, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9453125, "rewards/margins": 1.84375, "rewards/rejected": -3.78125, "step": 7090 }, { "epoch": 0.5453149001536098, "grad_norm": 19.488360307933533, "learning_rate": 2.5408833865770396e-07, "logits/chosen": -3.515625, "logits/rejected": -3.640625, "logps/chosen": -320.0, "logps/rejected": -460.0, "loss": 0.8052, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.0, "rewards/margins": 1.4921875, "rewards/rejected": -3.5, "step": 7100 }, { "epoch": 0.5460829493087558, "grad_norm": 18.981672070204343, "learning_rate": 2.5341816506796313e-07, "logits/chosen": -3.515625, "logits/rejected": -3.40625, "logps/chosen": -330.0, "logps/rejected": -508.0, "loss": 0.7666, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.046875, "rewards/margins": 1.6796875, "rewards/rejected": -3.71875, "step": 7110 }, { "epoch": 0.5468509984639017, "grad_norm": 19.371579837710502, "learning_rate": 2.527479669093327e-07, "logits/chosen": -3.375, "logits/rejected": -3.25, "logps/chosen": -328.0, "logps/rejected": -508.0, "loss": 0.7692, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8671875, "rewards/margins": 1.84375, "rewards/rejected": -3.71875, "step": 7120 }, { "epoch": 0.5476190476190477, "grad_norm": 17.83213010465975, "learning_rate": 2.520777489990243e-07, "logits/chosen": -3.53125, "logits/rejected": -3.578125, "logps/chosen": -336.0, "logps/rejected": -476.0, "loss": 0.7553, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.015625, "rewards/margins": 1.5, "rewards/rejected": -3.515625, "step": 7130 }, { "epoch": 0.5483870967741935, "grad_norm": 21.930246197299468, "learning_rate": 2.514075161543915e-07, "logits/chosen": -3.703125, "logits/rejected": -3.875, "logps/chosen": -334.0, "logps/rejected": -508.0, "loss": 0.7707, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.171875, "rewards/margins": 1.890625, "rewards/rejected": -4.0625, "step": 7140 }, { "epoch": 0.5491551459293394, "grad_norm": 19.022008299496058, "learning_rate": 2.507372731928953e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5625, "logps/chosen": -332.0, "logps/rejected": -512.0, "loss": 0.7446, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.953125, "rewards/margins": 1.9296875, "rewards/rejected": -3.890625, "step": 7150 }, { "epoch": 0.5499231950844854, "grad_norm": 19.1513839394623, "learning_rate": 2.5006702493206936e-07, "logits/chosen": -3.546875, "logits/rejected": -3.65625, "logps/chosen": -328.0, "logps/rejected": -494.0, "loss": 0.7625, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.96875, "rewards/margins": 1.71875, "rewards/rejected": -3.6875, "step": 7160 }, { "epoch": 0.5506912442396313, "grad_norm": 19.25399592641843, "learning_rate": 2.493967761894855e-07, "logits/chosen": -3.4375, "logits/rejected": -3.421875, "logps/chosen": -350.0, "logps/rejected": -492.0, "loss": 0.7719, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9921875, "rewards/margins": 1.625, "rewards/rejected": -3.609375, "step": 7170 }, { "epoch": 0.5514592933947773, "grad_norm": 18.21365820258297, "learning_rate": 2.4872653178271895e-07, "logits/chosen": -3.640625, "logits/rejected": -3.453125, "logps/chosen": -298.0, "logps/rejected": -456.0, "loss": 0.759, "rewards/accuracies": 0.8125, "rewards/chosen": -1.75, "rewards/margins": 1.5078125, "rewards/rejected": -3.265625, "step": 7180 }, { "epoch": 0.5522273425499232, "grad_norm": 15.996214909579544, "learning_rate": 2.4805629652931377e-07, "logits/chosen": -3.5, "logits/rejected": -3.578125, "logps/chosen": -284.0, "logps/rejected": -436.0, "loss": 0.7672, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7265625, "rewards/margins": 1.59375, "rewards/rejected": -3.3125, "step": 7190 }, { "epoch": 0.5529953917050692, "grad_norm": 19.02007347923251, "learning_rate": 2.4738607524674826e-07, "logits/chosen": -3.34375, "logits/rejected": -3.5, "logps/chosen": -334.0, "logps/rejected": -466.0, "loss": 0.7305, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8359375, "rewards/margins": 1.5390625, "rewards/rejected": -3.375, "step": 7200 }, { "epoch": 0.553763440860215, "grad_norm": 18.16096341583378, "learning_rate": 2.4671587275240024e-07, "logits/chosen": -3.546875, "logits/rejected": -3.8125, "logps/chosen": -332.0, "logps/rejected": -450.0, "loss": 0.7549, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.046875, "rewards/margins": 1.3984375, "rewards/rejected": -3.4375, "step": 7210 }, { "epoch": 0.554531490015361, "grad_norm": 17.452728423937536, "learning_rate": 2.4604569386351256e-07, "logits/chosen": -3.578125, "logits/rejected": -3.40625, "logps/chosen": -310.0, "logps/rejected": -496.0, "loss": 0.7664, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9765625, "rewards/margins": 1.84375, "rewards/rejected": -3.828125, "step": 7220 }, { "epoch": 0.5552995391705069, "grad_norm": 16.24526691750391, "learning_rate": 2.453755433971583e-07, "logits/chosen": -3.375, "logits/rejected": -3.390625, "logps/chosen": -326.0, "logps/rejected": -492.0, "loss": 0.7394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9765625, "rewards/margins": 1.8203125, "rewards/rejected": -3.796875, "step": 7230 }, { "epoch": 0.5560675883256528, "grad_norm": 17.450089602132117, "learning_rate": 2.447054261702064e-07, "logits/chosen": -3.5, "logits/rejected": -3.703125, "logps/chosen": -336.0, "logps/rejected": -474.0, "loss": 0.7429, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.953125, "rewards/margins": 1.7578125, "rewards/rejected": -3.71875, "step": 7240 }, { "epoch": 0.5568356374807988, "grad_norm": 25.17148793323974, "learning_rate": 2.440353469992868e-07, "logits/chosen": -3.578125, "logits/rejected": -3.5625, "logps/chosen": -342.0, "logps/rejected": -486.0, "loss": 0.8025, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.625, "rewards/rejected": -3.734375, "step": 7250 }, { "epoch": 0.5576036866359447, "grad_norm": 17.220632132685353, "learning_rate": 2.4336531070075583e-07, "logits/chosen": -3.40625, "logits/rejected": -3.4375, "logps/chosen": -330.0, "logps/rejected": -500.0, "loss": 0.7823, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0625, "rewards/margins": 1.765625, "rewards/rejected": -3.828125, "step": 7260 }, { "epoch": 0.5583717357910907, "grad_norm": 17.406888647793647, "learning_rate": 2.4269532209066174e-07, "logits/chosen": -3.578125, "logits/rejected": -3.515625, "logps/chosen": -286.0, "logps/rejected": -450.0, "loss": 0.7455, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8125, "rewards/margins": 1.71875, "rewards/rejected": -3.53125, "step": 7270 }, { "epoch": 0.5591397849462365, "grad_norm": 16.83825473216221, "learning_rate": 2.4202538598471005e-07, "logits/chosen": -3.421875, "logits/rejected": -3.609375, "logps/chosen": -296.0, "logps/rejected": -418.0, "loss": 0.7823, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.859375, "rewards/margins": 1.4140625, "rewards/rejected": -3.28125, "step": 7280 }, { "epoch": 0.5599078341013825, "grad_norm": 21.313316429294325, "learning_rate": 2.413555071982288e-07, "logits/chosen": -3.34375, "logits/rejected": -3.359375, "logps/chosen": -350.0, "logps/rejected": -486.0, "loss": 0.7451, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.15625, "rewards/margins": 1.5234375, "rewards/rejected": -3.671875, "step": 7290 }, { "epoch": 0.5606758832565284, "grad_norm": 18.398788576563607, "learning_rate": 2.4068569054613413e-07, "logits/chosen": -3.484375, "logits/rejected": -3.4375, "logps/chosen": -320.0, "logps/rejected": -460.0, "loss": 0.714, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.859375, "rewards/margins": 1.5, "rewards/rejected": -3.359375, "step": 7300 }, { "epoch": 0.5614439324116743, "grad_norm": 20.344234616095, "learning_rate": 2.400159408428955e-07, "logits/chosen": -3.484375, "logits/rejected": -3.546875, "logps/chosen": -310.0, "logps/rejected": -468.0, "loss": 0.7414, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8671875, "rewards/margins": 1.734375, "rewards/rejected": -3.59375, "step": 7310 }, { "epoch": 0.5622119815668203, "grad_norm": 23.728970627245214, "learning_rate": 2.3934626290250114e-07, "logits/chosen": -3.25, "logits/rejected": -3.28125, "logps/chosen": -360.0, "logps/rejected": -510.0, "loss": 0.7836, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.015625, "rewards/margins": 1.5546875, "rewards/rejected": -3.578125, "step": 7320 }, { "epoch": 0.5629800307219662, "grad_norm": 21.709028284311355, "learning_rate": 2.3867666153842356e-07, "logits/chosen": -3.546875, "logits/rejected": -3.59375, "logps/chosen": -322.0, "logps/rejected": -466.0, "loss": 0.7712, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9375, "rewards/margins": 1.7265625, "rewards/rejected": -3.671875, "step": 7330 }, { "epoch": 0.5637480798771122, "grad_norm": 19.217070283793824, "learning_rate": 2.3800714156358476e-07, "logits/chosen": -3.40625, "logits/rejected": -3.609375, "logps/chosen": -290.0, "logps/rejected": -432.0, "loss": 0.7661, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.796875, "rewards/margins": 1.4609375, "rewards/rejected": -3.25, "step": 7340 }, { "epoch": 0.5645161290322581, "grad_norm": 17.304043579644976, "learning_rate": 2.3733770779032184e-07, "logits/chosen": -3.34375, "logits/rejected": -3.28125, "logps/chosen": -326.0, "logps/rejected": -484.0, "loss": 0.7624, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8671875, "rewards/margins": 1.7109375, "rewards/rejected": -3.578125, "step": 7350 }, { "epoch": 0.565284178187404, "grad_norm": 19.9155113834468, "learning_rate": 2.3666836503035214e-07, "logits/chosen": -3.40625, "logits/rejected": -3.625, "logps/chosen": -328.0, "logps/rejected": -470.0, "loss": 0.7867, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.0, "rewards/margins": 1.5078125, "rewards/rejected": -3.515625, "step": 7360 }, { "epoch": 0.5660522273425499, "grad_norm": 21.951713128268874, "learning_rate": 2.359991180947391e-07, "logits/chosen": -3.515625, "logits/rejected": -3.609375, "logps/chosen": -368.0, "logps/rejected": -536.0, "loss": 0.7642, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.859375, "rewards/rejected": -4.0, "step": 7370 }, { "epoch": 0.5668202764976958, "grad_norm": 20.03675703962159, "learning_rate": 2.353299717938571e-07, "logits/chosen": -3.640625, "logits/rejected": -3.734375, "logps/chosen": -328.0, "logps/rejected": -506.0, "loss": 0.7497, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.046875, "rewards/margins": 1.90625, "rewards/rejected": -3.9375, "step": 7380 }, { "epoch": 0.5675883256528418, "grad_norm": 17.98110443174631, "learning_rate": 2.3466093093735728e-07, "logits/chosen": -3.484375, "logits/rejected": -3.53125, "logps/chosen": -334.0, "logps/rejected": -454.0, "loss": 0.7617, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0625, "rewards/margins": 1.3828125, "rewards/rejected": -3.453125, "step": 7390 }, { "epoch": 0.5683563748079877, "grad_norm": 17.451294695517184, "learning_rate": 2.3399200033413303e-07, "logits/chosen": -3.59375, "logits/rejected": -3.65625, "logps/chosen": -338.0, "logps/rejected": -516.0, "loss": 0.7029, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.15625, "rewards/margins": 1.7265625, "rewards/rejected": -3.875, "step": 7400 }, { "epoch": 0.5691244239631337, "grad_norm": 22.517600493844512, "learning_rate": 2.33323184792285e-07, "logits/chosen": -3.453125, "logits/rejected": -3.609375, "logps/chosen": -352.0, "logps/rejected": -516.0, "loss": 0.7622, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0625, "rewards/margins": 1.765625, "rewards/rejected": -3.828125, "step": 7410 }, { "epoch": 0.5698924731182796, "grad_norm": 18.932244170572396, "learning_rate": 2.3265448911908708e-07, "logits/chosen": -3.46875, "logits/rejected": -3.5, "logps/chosen": -330.0, "logps/rejected": -502.0, "loss": 0.7739, "rewards/accuracies": 0.8125, "rewards/chosen": -2.03125, "rewards/margins": 1.7265625, "rewards/rejected": -3.75, "step": 7420 }, { "epoch": 0.5706605222734255, "grad_norm": 17.130692219655895, "learning_rate": 2.3198591812095143e-07, "logits/chosen": -3.5625, "logits/rejected": -3.6875, "logps/chosen": -334.0, "logps/rejected": -444.0, "loss": 0.8138, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.15625, "rewards/margins": 1.25, "rewards/rejected": -3.40625, "step": 7430 }, { "epoch": 0.5714285714285714, "grad_norm": 20.37717757000149, "learning_rate": 2.3131747660339394e-07, "logits/chosen": -3.53125, "logits/rejected": -3.5625, "logps/chosen": -326.0, "logps/rejected": -464.0, "loss": 0.7428, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.4765625, "rewards/rejected": -3.5, "step": 7440 }, { "epoch": 0.5721966205837173, "grad_norm": 18.174983659924376, "learning_rate": 2.3064916937100013e-07, "logits/chosen": -3.578125, "logits/rejected": -3.671875, "logps/chosen": -334.0, "logps/rejected": -482.0, "loss": 0.7434, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.109375, "rewards/margins": 1.484375, "rewards/rejected": -3.59375, "step": 7450 }, { "epoch": 0.5729646697388633, "grad_norm": 18.871527983071335, "learning_rate": 2.2998100122739007e-07, "logits/chosen": -3.546875, "logits/rejected": -3.703125, "logps/chosen": -320.0, "logps/rejected": -482.0, "loss": 0.7475, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.09375, "rewards/margins": 1.625, "rewards/rejected": -3.71875, "step": 7460 }, { "epoch": 0.5737327188940092, "grad_norm": 17.91703362739198, "learning_rate": 2.2931297697518432e-07, "logits/chosen": -3.46875, "logits/rejected": -3.484375, "logps/chosen": -352.0, "logps/rejected": -498.0, "loss": 0.7289, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.09375, "rewards/margins": 1.5625, "rewards/rejected": -3.640625, "step": 7470 }, { "epoch": 0.5745007680491552, "grad_norm": 20.677224355680472, "learning_rate": 2.2864510141596895e-07, "logits/chosen": -3.71875, "logits/rejected": -3.75, "logps/chosen": -324.0, "logps/rejected": -458.0, "loss": 0.7673, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.046875, "rewards/margins": 1.578125, "rewards/rejected": -3.625, "step": 7480 }, { "epoch": 0.5752688172043011, "grad_norm": 15.691599270067025, "learning_rate": 2.2797737935026133e-07, "logits/chosen": -3.71875, "logits/rejected": -3.578125, "logps/chosen": -320.0, "logps/rejected": -486.0, "loss": 0.8196, "rewards/accuracies": 0.75, "rewards/chosen": -2.015625, "rewards/margins": 1.65625, "rewards/rejected": -3.671875, "step": 7490 }, { "epoch": 0.576036866359447, "grad_norm": 18.012702817763053, "learning_rate": 2.273098155774757e-07, "logits/chosen": -3.5, "logits/rejected": -3.625, "logps/chosen": -300.0, "logps/rejected": -452.0, "loss": 0.7615, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8359375, "rewards/margins": 1.6171875, "rewards/rejected": -3.453125, "step": 7500 }, { "epoch": 0.5768049155145929, "grad_norm": 18.276486656397545, "learning_rate": 2.2664241489588832e-07, "logits/chosen": -3.515625, "logits/rejected": -3.75, "logps/chosen": -308.0, "logps/rejected": -452.0, "loss": 0.7883, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.78125, "rewards/margins": 1.625, "rewards/rejected": -3.40625, "step": 7510 }, { "epoch": 0.5775729646697388, "grad_norm": 19.210917177088742, "learning_rate": 2.259751821026034e-07, "logits/chosen": -3.640625, "logits/rejected": -3.703125, "logps/chosen": -306.0, "logps/rejected": -484.0, "loss": 0.7732, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.765625, "rewards/margins": 1.8828125, "rewards/rejected": -3.65625, "step": 7520 }, { "epoch": 0.5783410138248848, "grad_norm": 21.40574695868662, "learning_rate": 2.2530812199351828e-07, "logits/chosen": -3.53125, "logits/rejected": -3.65625, "logps/chosen": -316.0, "logps/rejected": -484.0, "loss": 0.7884, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8671875, "rewards/margins": 1.7890625, "rewards/rejected": -3.65625, "step": 7530 }, { "epoch": 0.5791090629800307, "grad_norm": 21.39499677122841, "learning_rate": 2.2464123936328904e-07, "logits/chosen": -3.671875, "logits/rejected": -3.84375, "logps/chosen": -312.0, "logps/rejected": -442.0, "loss": 0.7894, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9375, "rewards/margins": 1.5078125, "rewards/rejected": -3.4375, "step": 7540 }, { "epoch": 0.5798771121351767, "grad_norm": 18.54836464407459, "learning_rate": 2.2397453900529627e-07, "logits/chosen": -3.625, "logits/rejected": -3.65625, "logps/chosen": -312.0, "logps/rejected": -458.0, "loss": 0.7442, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8046875, "rewards/margins": 1.6953125, "rewards/rejected": -3.5, "step": 7550 }, { "epoch": 0.5806451612903226, "grad_norm": 21.1983796423513, "learning_rate": 2.233080257116103e-07, "logits/chosen": -3.609375, "logits/rejected": -3.96875, "logps/chosen": -340.0, "logps/rejected": -468.0, "loss": 0.7922, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0, "rewards/margins": 1.6171875, "rewards/rejected": -3.625, "step": 7560 }, { "epoch": 0.5814132104454686, "grad_norm": 19.409201117612415, "learning_rate": 2.226417042729569e-07, "logits/chosen": -3.59375, "logits/rejected": -3.78125, "logps/chosen": -320.0, "logps/rejected": -476.0, "loss": 0.7753, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.921875, "rewards/margins": 1.6953125, "rewards/rejected": -3.625, "step": 7570 }, { "epoch": 0.5821812596006144, "grad_norm": 19.601475936753733, "learning_rate": 2.219755794786829e-07, "logits/chosen": -3.65625, "logits/rejected": -3.65625, "logps/chosen": -316.0, "logps/rejected": -478.0, "loss": 0.7469, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.953125, "rewards/margins": 1.75, "rewards/rejected": -3.703125, "step": 7580 }, { "epoch": 0.5829493087557603, "grad_norm": 20.57182994813945, "learning_rate": 2.213096561167216e-07, "logits/chosen": -3.59375, "logits/rejected": -3.765625, "logps/chosen": -316.0, "logps/rejected": -454.0, "loss": 0.7726, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.96875, "rewards/margins": 1.5234375, "rewards/rejected": -3.5, "step": 7590 }, { "epoch": 0.5837173579109063, "grad_norm": 19.26296550900027, "learning_rate": 2.2064393897355856e-07, "logits/chosen": -3.609375, "logits/rejected": -3.59375, "logps/chosen": -314.0, "logps/rejected": -478.0, "loss": 0.7815, "rewards/accuracies": 0.84375, "rewards/chosen": -1.921875, "rewards/margins": 1.7421875, "rewards/rejected": -3.671875, "step": 7600 }, { "epoch": 0.5844854070660522, "grad_norm": 18.22947130514042, "learning_rate": 2.19978432834197e-07, "logits/chosen": -3.640625, "logits/rejected": -3.75, "logps/chosen": -332.0, "logps/rejected": -480.0, "loss": 0.7421, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.5546875, "rewards/rejected": -3.671875, "step": 7610 }, { "epoch": 0.5852534562211982, "grad_norm": 22.010896578235197, "learning_rate": 2.1931314248212365e-07, "logits/chosen": -3.5625, "logits/rejected": -3.640625, "logps/chosen": -328.0, "logps/rejected": -464.0, "loss": 0.8206, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0625, "rewards/margins": 1.5, "rewards/rejected": -3.5625, "step": 7620 }, { "epoch": 0.5860215053763441, "grad_norm": 18.238420394327985, "learning_rate": 2.1864807269927404e-07, "logits/chosen": -3.8125, "logits/rejected": -3.96875, "logps/chosen": -322.0, "logps/rejected": -458.0, "loss": 0.7288, "rewards/accuracies": 0.78125, "rewards/chosen": -2.171875, "rewards/margins": 1.5078125, "rewards/rejected": -3.671875, "step": 7630 }, { "epoch": 0.5867895545314901, "grad_norm": 16.978580161116227, "learning_rate": 2.1798322826599834e-07, "logits/chosen": -3.765625, "logits/rejected": -3.71875, "logps/chosen": -338.0, "logps/rejected": -476.0, "loss": 0.7517, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.296875, "rewards/margins": 1.4609375, "rewards/rejected": -3.765625, "step": 7640 }, { "epoch": 0.5875576036866359, "grad_norm": 23.100105814415823, "learning_rate": 2.1731861396102713e-07, "logits/chosen": -3.53125, "logits/rejected": -3.6875, "logps/chosen": -374.0, "logps/rejected": -536.0, "loss": 0.7221, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.34375, "rewards/margins": 1.71875, "rewards/rejected": -4.0625, "step": 7650 }, { "epoch": 0.5883256528417818, "grad_norm": 17.940832185577737, "learning_rate": 2.1665423456143658e-07, "logits/chosen": -3.46875, "logits/rejected": -3.828125, "logps/chosen": -340.0, "logps/rejected": -462.0, "loss": 0.7718, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.125, "rewards/margins": 1.4921875, "rewards/rejected": -3.625, "step": 7660 }, { "epoch": 0.5890937019969278, "grad_norm": 23.684803322562026, "learning_rate": 2.1599009484261476e-07, "logits/chosen": -3.515625, "logits/rejected": -3.671875, "logps/chosen": -316.0, "logps/rejected": -490.0, "loss": 0.7733, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8828125, "rewards/margins": 2.0, "rewards/rejected": -3.875, "step": 7670 }, { "epoch": 0.5898617511520737, "grad_norm": 20.445100774063693, "learning_rate": 2.153261995782268e-07, "logits/chosen": -3.59375, "logits/rejected": -3.703125, "logps/chosen": -330.0, "logps/rejected": -464.0, "loss": 0.7419, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.53125, "rewards/rejected": -3.640625, "step": 7680 }, { "epoch": 0.5906298003072197, "grad_norm": 20.84376169094254, "learning_rate": 2.1466255354018064e-07, "logits/chosen": -3.625, "logits/rejected": -3.703125, "logps/chosen": -316.0, "logps/rejected": -486.0, "loss": 0.7483, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.015625, "rewards/margins": 1.8125, "rewards/rejected": -3.828125, "step": 7690 }, { "epoch": 0.5913978494623656, "grad_norm": 22.273207028252145, "learning_rate": 2.1399916149859314e-07, "logits/chosen": -3.4375, "logits/rejected": -3.671875, "logps/chosen": -346.0, "logps/rejected": -474.0, "loss": 0.7729, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.171875, "rewards/margins": 1.4609375, "rewards/rejected": -3.640625, "step": 7700 }, { "epoch": 0.5921658986175116, "grad_norm": 19.095496680872692, "learning_rate": 2.1333602822175526e-07, "logits/chosen": -3.4375, "logits/rejected": -3.484375, "logps/chosen": -336.0, "logps/rejected": -472.0, "loss": 0.7493, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.953125, "rewards/margins": 1.5234375, "rewards/rejected": -3.484375, "step": 7710 }, { "epoch": 0.5929339477726574, "grad_norm": 25.742507678321854, "learning_rate": 2.1267315847609811e-07, "logits/chosen": -3.578125, "logits/rejected": -3.78125, "logps/chosen": -310.0, "logps/rejected": -484.0, "loss": 0.7933, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8046875, "rewards/margins": 1.890625, "rewards/rejected": -3.703125, "step": 7720 }, { "epoch": 0.5937019969278033, "grad_norm": 22.00335670148217, "learning_rate": 2.1201055702615861e-07, "logits/chosen": -3.578125, "logits/rejected": -3.765625, "logps/chosen": -318.0, "logps/rejected": -492.0, "loss": 0.7247, "rewards/accuracies": 0.90625, "rewards/chosen": -1.90625, "rewards/margins": 1.96875, "rewards/rejected": -3.875, "step": 7730 }, { "epoch": 0.5944700460829493, "grad_norm": 19.5545966397924, "learning_rate": 2.1134822863454518e-07, "logits/chosen": -3.53125, "logits/rejected": -3.546875, "logps/chosen": -298.0, "logps/rejected": -490.0, "loss": 0.7944, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7890625, "rewards/margins": 1.828125, "rewards/rejected": -3.625, "step": 7740 }, { "epoch": 0.5952380952380952, "grad_norm": 23.319181539581482, "learning_rate": 2.106861780619037e-07, "logits/chosen": -3.5, "logits/rejected": -3.640625, "logps/chosen": -308.0, "logps/rejected": -442.0, "loss": 0.7564, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9296875, "rewards/margins": 1.46875, "rewards/rejected": -3.40625, "step": 7750 }, { "epoch": 0.5960061443932412, "grad_norm": 23.18600797949107, "learning_rate": 2.1002441006688297e-07, "logits/chosen": -3.578125, "logits/rejected": -3.65625, "logps/chosen": -356.0, "logps/rejected": -494.0, "loss": 0.773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.171875, "rewards/margins": 1.546875, "rewards/rejected": -3.71875, "step": 7760 }, { "epoch": 0.5967741935483871, "grad_norm": 20.197505938075953, "learning_rate": 2.0936292940610092e-07, "logits/chosen": -3.5, "logits/rejected": -3.328125, "logps/chosen": -320.0, "logps/rejected": -474.0, "loss": 0.7565, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.953125, "rewards/margins": 1.6015625, "rewards/rejected": -3.5625, "step": 7770 }, { "epoch": 0.5975422427035331, "grad_norm": 18.940727143271314, "learning_rate": 2.0870174083411e-07, "logits/chosen": -3.578125, "logits/rejected": -3.375, "logps/chosen": -338.0, "logps/rejected": -504.0, "loss": 0.7544, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.09375, "rewards/margins": 1.734375, "rewards/rejected": -3.8125, "step": 7780 }, { "epoch": 0.598310291858679, "grad_norm": 19.97221508982935, "learning_rate": 2.0804084910336323e-07, "logits/chosen": -3.46875, "logits/rejected": -3.703125, "logps/chosen": -328.0, "logps/rejected": -456.0, "loss": 0.7592, "rewards/accuracies": 0.78125, "rewards/chosen": -1.921875, "rewards/margins": 1.671875, "rewards/rejected": -3.59375, "step": 7790 }, { "epoch": 0.5990783410138248, "grad_norm": 20.913887435789885, "learning_rate": 2.0738025896418015e-07, "logits/chosen": -3.453125, "logits/rejected": -3.46875, "logps/chosen": -334.0, "logps/rejected": -496.0, "loss": 0.7306, "rewards/accuracies": 0.78125, "rewards/chosen": -2.109375, "rewards/margins": 1.7109375, "rewards/rejected": -3.8125, "step": 7800 }, { "epoch": 0.5998463901689708, "grad_norm": 22.09750494838704, "learning_rate": 2.067199751647124e-07, "logits/chosen": -3.5625, "logits/rejected": -3.546875, "logps/chosen": -318.0, "logps/rejected": -492.0, "loss": 0.7653, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9921875, "rewards/margins": 1.703125, "rewards/rejected": -3.703125, "step": 7810 }, { "epoch": 0.6006144393241167, "grad_norm": 18.38349404138724, "learning_rate": 2.060600024509098e-07, "logits/chosen": -3.53125, "logits/rejected": -3.625, "logps/chosen": -342.0, "logps/rejected": -490.0, "loss": 0.7898, "rewards/accuracies": 0.8125, "rewards/chosen": -2.15625, "rewards/margins": 1.703125, "rewards/rejected": -3.859375, "step": 7820 }, { "epoch": 0.6013824884792627, "grad_norm": 18.855217132546734, "learning_rate": 2.0540034556648606e-07, "logits/chosen": -3.484375, "logits/rejected": -3.359375, "logps/chosen": -314.0, "logps/rejected": -468.0, "loss": 0.7337, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8828125, "rewards/margins": 1.578125, "rewards/rejected": -3.46875, "step": 7830 }, { "epoch": 0.6021505376344086, "grad_norm": 18.96784597752165, "learning_rate": 2.0474100925288502e-07, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -312.0, "logps/rejected": -464.0, "loss": 0.7127, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8984375, "rewards/margins": 1.7109375, "rewards/rejected": -3.609375, "step": 7840 }, { "epoch": 0.6029185867895546, "grad_norm": 22.88974293629904, "learning_rate": 2.0408199824924604e-07, "logits/chosen": -3.75, "logits/rejected": -3.59375, "logps/chosen": -326.0, "logps/rejected": -462.0, "loss": 0.7944, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.125, "rewards/margins": 1.3671875, "rewards/rejected": -3.5, "step": 7850 }, { "epoch": 0.6036866359447005, "grad_norm": 20.57304081377326, "learning_rate": 2.0342331729237043e-07, "logits/chosen": -3.5625, "logits/rejected": -3.625, "logps/chosen": -312.0, "logps/rejected": -510.0, "loss": 0.7491, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8515625, "rewards/margins": 2.09375, "rewards/rejected": -3.9375, "step": 7860 }, { "epoch": 0.6044546850998463, "grad_norm": 20.054625025173618, "learning_rate": 2.027649711166872e-07, "logits/chosen": -3.5625, "logits/rejected": -3.625, "logps/chosen": -336.0, "logps/rejected": -502.0, "loss": 0.7098, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.6953125, "rewards/rejected": -3.875, "step": 7870 }, { "epoch": 0.6052227342549923, "grad_norm": 23.499392313925103, "learning_rate": 2.0210696445421898e-07, "logits/chosen": -3.671875, "logits/rejected": -3.765625, "logps/chosen": -346.0, "logps/rejected": -520.0, "loss": 0.7727, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.265625, "rewards/margins": 1.8515625, "rewards/rejected": -4.125, "step": 7880 }, { "epoch": 0.6059907834101382, "grad_norm": 19.38245931848447, "learning_rate": 2.0144930203454812e-07, "logits/chosen": -3.53125, "logits/rejected": -3.78125, "logps/chosen": -346.0, "logps/rejected": -474.0, "loss": 0.7826, "rewards/accuracies": 0.84375, "rewards/chosen": -2.171875, "rewards/margins": 1.625, "rewards/rejected": -3.796875, "step": 7890 }, { "epoch": 0.6067588325652842, "grad_norm": 21.114354845388362, "learning_rate": 2.007919885847826e-07, "logits/chosen": -3.5, "logits/rejected": -3.46875, "logps/chosen": -334.0, "logps/rejected": -478.0, "loss": 0.7828, "rewards/accuracies": 0.8125, "rewards/chosen": -2.125, "rewards/margins": 1.5, "rewards/rejected": -3.625, "step": 7900 }, { "epoch": 0.6075268817204301, "grad_norm": 17.576939335189103, "learning_rate": 2.0013502882952203e-07, "logits/chosen": -3.46875, "logits/rejected": -3.5, "logps/chosen": -302.0, "logps/rejected": -458.0, "loss": 0.7378, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8828125, "rewards/margins": 1.6484375, "rewards/rejected": -3.53125, "step": 7910 }, { "epoch": 0.6082949308755761, "grad_norm": 21.91089044435408, "learning_rate": 1.994784274908239e-07, "logits/chosen": -3.453125, "logits/rejected": -3.625, "logps/chosen": -334.0, "logps/rejected": -476.0, "loss": 0.7903, "rewards/accuracies": 0.8125, "rewards/chosen": -2.03125, "rewards/margins": 1.6171875, "rewards/rejected": -3.65625, "step": 7920 }, { "epoch": 0.609062980030722, "grad_norm": 17.323666966653054, "learning_rate": 1.988221892881694e-07, "logits/chosen": -3.46875, "logits/rejected": -3.53125, "logps/chosen": -332.0, "logps/rejected": -470.0, "loss": 0.7562, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0625, "rewards/margins": 1.5859375, "rewards/rejected": -3.640625, "step": 7930 }, { "epoch": 0.6098310291858678, "grad_norm": 20.02681303743133, "learning_rate": 1.9816631893842967e-07, "logits/chosen": -3.734375, "logits/rejected": -3.71875, "logps/chosen": -342.0, "logps/rejected": -506.0, "loss": 0.7785, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.140625, "rewards/margins": 1.609375, "rewards/rejected": -3.75, "step": 7940 }, { "epoch": 0.6105990783410138, "grad_norm": 17.592837231944053, "learning_rate": 1.9751082115583174e-07, "logits/chosen": -3.5, "logits/rejected": -3.546875, "logps/chosen": -334.0, "logps/rejected": -484.0, "loss": 0.7184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.6953125, "rewards/rejected": -3.796875, "step": 7950 }, { "epoch": 0.6113671274961597, "grad_norm": 21.515405143196883, "learning_rate": 1.9685570065192465e-07, "logits/chosen": -3.71875, "logits/rejected": -3.734375, "logps/chosen": -342.0, "logps/rejected": -498.0, "loss": 0.779, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.5390625, "rewards/rejected": -3.734375, "step": 7960 }, { "epoch": 0.6121351766513057, "grad_norm": 18.59096187289719, "learning_rate": 1.9620096213554588e-07, "logits/chosen": -3.640625, "logits/rejected": -3.84375, "logps/chosen": -328.0, "logps/rejected": -472.0, "loss": 0.7273, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.96875, "rewards/margins": 1.71875, "rewards/rejected": -3.6875, "step": 7970 }, { "epoch": 0.6129032258064516, "grad_norm": 17.955192316235753, "learning_rate": 1.955466103127871e-07, "logits/chosen": -3.65625, "logits/rejected": -3.578125, "logps/chosen": -330.0, "logps/rejected": -548.0, "loss": 0.6888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.109375, "rewards/margins": 2.234375, "rewards/rejected": -4.34375, "step": 7980 }, { "epoch": 0.6136712749615976, "grad_norm": 23.945875757964973, "learning_rate": 1.9489264988696065e-07, "logits/chosen": -3.59375, "logits/rejected": -3.84375, "logps/chosen": -364.0, "logps/rejected": -528.0, "loss": 0.7437, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.375, "rewards/margins": 1.796875, "rewards/rejected": -4.1875, "step": 7990 }, { "epoch": 0.6144393241167435, "grad_norm": 19.42967458423542, "learning_rate": 1.9423908555856544e-07, "logits/chosen": -3.703125, "logits/rejected": -3.65625, "logps/chosen": -320.0, "logps/rejected": -508.0, "loss": 0.7661, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.953125, "rewards/rejected": -4.03125, "step": 8000 }, { "epoch": 0.6144393241167435, "eval_logits/chosen": -3.59375, "eval_logits/rejected": -3.765625, "eval_logps/chosen": -392.0, "eval_logps/rejected": -510.0, "eval_loss": 0.46331870555877686, "eval_rewards/accuracies": 0.7572115659713745, "eval_rewards/chosen": -2.5, "eval_rewards/margins": 1.4609375, "eval_rewards/rejected": -3.953125, "eval_runtime": 2263.2047, "eval_samples_per_second": 41.152, "eval_steps_per_second": 0.643, "step": 8000 }, { "epoch": 0.6152073732718893, "grad_norm": 21.832235460672152, "learning_rate": 1.9358592202525347e-07, "logits/chosen": -3.65625, "logits/rejected": -4.03125, "logps/chosen": -310.0, "logps/rejected": -490.0, "loss": 0.7563, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9375, "rewards/margins": 1.9609375, "rewards/rejected": -3.90625, "step": 8010 }, { "epoch": 0.6159754224270353, "grad_norm": 19.679054436096216, "learning_rate": 1.9293316398179598e-07, "logits/chosen": -3.640625, "logits/rejected": -3.59375, "logps/chosen": -354.0, "logps/rejected": -520.0, "loss": 0.7472, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.8046875, "rewards/rejected": -3.96875, "step": 8020 }, { "epoch": 0.6167434715821812, "grad_norm": 20.9085466484652, "learning_rate": 1.9228081612004942e-07, "logits/chosen": -3.65625, "logits/rejected": -3.78125, "logps/chosen": -378.0, "logps/rejected": -508.0, "loss": 0.7336, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.46875, "rewards/margins": 1.484375, "rewards/rejected": -3.96875, "step": 8030 }, { "epoch": 0.6175115207373272, "grad_norm": 22.3978491984857, "learning_rate": 1.9162888312892228e-07, "logits/chosen": -3.484375, "logits/rejected": -3.640625, "logps/chosen": -352.0, "logps/rejected": -492.0, "loss": 0.757, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.265625, "rewards/margins": 1.5390625, "rewards/rejected": -3.796875, "step": 8040 }, { "epoch": 0.6182795698924731, "grad_norm": 16.777786561694697, "learning_rate": 1.9097736969434077e-07, "logits/chosen": -3.609375, "logits/rejected": -3.625, "logps/chosen": -330.0, "logps/rejected": -528.0, "loss": 0.7248, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.09375, "rewards/margins": 2.0625, "rewards/rejected": -4.15625, "step": 8050 }, { "epoch": 0.6190476190476191, "grad_norm": 21.57197346600474, "learning_rate": 1.9032628049921556e-07, "logits/chosen": -3.546875, "logits/rejected": -3.53125, "logps/chosen": -352.0, "logps/rejected": -500.0, "loss": 0.7352, "rewards/accuracies": 0.8125, "rewards/chosen": -2.203125, "rewards/margins": 1.6015625, "rewards/rejected": -3.796875, "step": 8060 }, { "epoch": 0.619815668202765, "grad_norm": 20.26779672786821, "learning_rate": 1.8967562022340807e-07, "logits/chosen": -3.515625, "logits/rejected": -3.578125, "logps/chosen": -340.0, "logps/rejected": -500.0, "loss": 0.7776, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.71875, "rewards/rejected": -3.78125, "step": 8070 }, { "epoch": 0.620583717357911, "grad_norm": 24.10697858572871, "learning_rate": 1.8902539354369661e-07, "logits/chosen": -3.65625, "logits/rejected": -3.765625, "logps/chosen": -350.0, "logps/rejected": -502.0, "loss": 0.7517, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.234375, "rewards/margins": 1.7109375, "rewards/rejected": -3.9375, "step": 8080 }, { "epoch": 0.6213517665130568, "grad_norm": 19.85487439745618, "learning_rate": 1.883756051337431e-07, "logits/chosen": -3.5625, "logits/rejected": -3.625, "logps/chosen": -352.0, "logps/rejected": -498.0, "loss": 0.7769, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1875, "rewards/margins": 1.609375, "rewards/rejected": -3.796875, "step": 8090 }, { "epoch": 0.6221198156682027, "grad_norm": 22.83587378969899, "learning_rate": 1.877262596640591e-07, "logits/chosen": -3.453125, "logits/rejected": -3.5625, "logps/chosen": -340.0, "logps/rejected": -464.0, "loss": 0.767, "rewards/accuracies": 0.78125, "rewards/chosen": -2.046875, "rewards/margins": 1.4375, "rewards/rejected": -3.484375, "step": 8100 }, { "epoch": 0.6228878648233487, "grad_norm": 18.34290034412048, "learning_rate": 1.8707736180197242e-07, "logits/chosen": -3.40625, "logits/rejected": -3.65625, "logps/chosen": -320.0, "logps/rejected": -494.0, "loss": 0.7208, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.765625, "rewards/rejected": -3.8125, "step": 8110 }, { "epoch": 0.6236559139784946, "grad_norm": 20.460336682014788, "learning_rate": 1.8642891621159385e-07, "logits/chosen": -3.453125, "logits/rejected": -3.625, "logps/chosen": -328.0, "logps/rejected": -488.0, "loss": 0.7506, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9453125, "rewards/margins": 1.796875, "rewards/rejected": -3.734375, "step": 8120 }, { "epoch": 0.6244239631336406, "grad_norm": 18.774422805337597, "learning_rate": 1.8578092755378306e-07, "logits/chosen": -3.4375, "logits/rejected": -3.5625, "logps/chosen": -370.0, "logps/rejected": -520.0, "loss": 0.7594, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.28125, "rewards/margins": 1.65625, "rewards/rejected": -3.9375, "step": 8130 }, { "epoch": 0.6251920122887865, "grad_norm": 21.291100144196985, "learning_rate": 1.851334004861156e-07, "logits/chosen": -3.359375, "logits/rejected": -3.609375, "logps/chosen": -314.0, "logps/rejected": -464.0, "loss": 0.7422, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9765625, "rewards/margins": 1.5859375, "rewards/rejected": -3.5625, "step": 8140 }, { "epoch": 0.6259600614439325, "grad_norm": 20.614373189076055, "learning_rate": 1.8448633966284917e-07, "logits/chosen": -3.484375, "logits/rejected": -3.484375, "logps/chosen": -338.0, "logps/rejected": -492.0, "loss": 0.7324, "rewards/accuracies": 0.84375, "rewards/chosen": -2.03125, "rewards/margins": 1.6484375, "rewards/rejected": -3.6875, "step": 8150 }, { "epoch": 0.6267281105990783, "grad_norm": 17.66131129928176, "learning_rate": 1.838397497348901e-07, "logits/chosen": -3.53125, "logits/rejected": -3.296875, "logps/chosen": -338.0, "logps/rejected": -520.0, "loss": 0.735, "rewards/accuracies": 0.84375, "rewards/chosen": -2.125, "rewards/margins": 1.78125, "rewards/rejected": -3.90625, "step": 8160 }, { "epoch": 0.6274961597542242, "grad_norm": 18.86158065873651, "learning_rate": 1.8319363534976036e-07, "logits/chosen": -3.421875, "logits/rejected": -3.453125, "logps/chosen": -338.0, "logps/rejected": -484.0, "loss": 0.7655, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.125, "rewards/margins": 1.5078125, "rewards/rejected": -3.625, "step": 8170 }, { "epoch": 0.6282642089093702, "grad_norm": 17.2371630042241, "learning_rate": 1.825480011515634e-07, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -340.0, "logps/rejected": -472.0, "loss": 0.7404, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.078125, "rewards/margins": 1.59375, "rewards/rejected": -3.671875, "step": 8180 }, { "epoch": 0.6290322580645161, "grad_norm": 18.09934187631728, "learning_rate": 1.8190285178095172e-07, "logits/chosen": -3.4375, "logits/rejected": -3.4375, "logps/chosen": -316.0, "logps/rejected": -476.0, "loss": 0.7262, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.890625, "rewards/margins": 1.8515625, "rewards/rejected": -3.734375, "step": 8190 }, { "epoch": 0.6298003072196621, "grad_norm": 20.442027423108243, "learning_rate": 1.8125819187509255e-07, "logits/chosen": -3.484375, "logits/rejected": -3.703125, "logps/chosen": -320.0, "logps/rejected": -466.0, "loss": 0.7743, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.96875, "rewards/margins": 1.65625, "rewards/rejected": -3.625, "step": 8200 }, { "epoch": 0.630568356374808, "grad_norm": 20.188638661626445, "learning_rate": 1.806140260676352e-07, "logits/chosen": -3.390625, "logits/rejected": -3.359375, "logps/chosen": -312.0, "logps/rejected": -488.0, "loss": 0.7688, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.875, "rewards/margins": 1.8203125, "rewards/rejected": -3.6875, "step": 8210 }, { "epoch": 0.631336405529954, "grad_norm": 20.341791268541428, "learning_rate": 1.7997035898867755e-07, "logits/chosen": -3.515625, "logits/rejected": -3.6875, "logps/chosen": -330.0, "logps/rejected": -502.0, "loss": 0.7463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.8828125, "rewards/rejected": -3.96875, "step": 8220 }, { "epoch": 0.6321044546850998, "grad_norm": 20.156915459155616, "learning_rate": 1.7932719526473262e-07, "logits/chosen": -3.484375, "logits/rejected": -3.4375, "logps/chosen": -352.0, "logps/rejected": -502.0, "loss": 0.7533, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.171875, "rewards/margins": 1.71875, "rewards/rejected": -3.90625, "step": 8230 }, { "epoch": 0.6328725038402457, "grad_norm": 20.680884546036502, "learning_rate": 1.7868453951869556e-07, "logits/chosen": -3.421875, "logits/rejected": -3.484375, "logps/chosen": -328.0, "logps/rejected": -488.0, "loss": 0.771, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.09375, "rewards/margins": 1.6328125, "rewards/rejected": -3.71875, "step": 8240 }, { "epoch": 0.6336405529953917, "grad_norm": 17.766814073929837, "learning_rate": 1.7804239636981028e-07, "logits/chosen": -3.453125, "logits/rejected": -3.4375, "logps/chosen": -324.0, "logps/rejected": -478.0, "loss": 0.7555, "rewards/accuracies": 0.78125, "rewards/chosen": -2.03125, "rewards/margins": 1.6328125, "rewards/rejected": -3.671875, "step": 8250 }, { "epoch": 0.6344086021505376, "grad_norm": 20.748803076341076, "learning_rate": 1.7740077043363614e-07, "logits/chosen": -3.359375, "logits/rejected": -3.25, "logps/chosen": -326.0, "logps/rejected": -502.0, "loss": 0.755, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9609375, "rewards/margins": 1.7109375, "rewards/rejected": -3.671875, "step": 8260 }, { "epoch": 0.6351766513056836, "grad_norm": 18.17378548390649, "learning_rate": 1.7675966632201517e-07, "logits/chosen": -3.46875, "logits/rejected": -3.453125, "logps/chosen": -328.0, "logps/rejected": -484.0, "loss": 0.7614, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9296875, "rewards/margins": 1.71875, "rewards/rejected": -3.640625, "step": 8270 }, { "epoch": 0.6359447004608295, "grad_norm": 20.483162897010818, "learning_rate": 1.7611908864303834e-07, "logits/chosen": -3.40625, "logits/rejected": -3.734375, "logps/chosen": -320.0, "logps/rejected": -468.0, "loss": 0.7168, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9609375, "rewards/margins": 1.6328125, "rewards/rejected": -3.59375, "step": 8280 }, { "epoch": 0.6367127496159755, "grad_norm": 19.174861781290016, "learning_rate": 1.754790420010131e-07, "logits/chosen": -3.328125, "logits/rejected": -3.625, "logps/chosen": -384.0, "logps/rejected": -510.0, "loss": 0.7956, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.328125, "rewards/margins": 1.59375, "rewards/rejected": -3.921875, "step": 8290 }, { "epoch": 0.6374807987711214, "grad_norm": 18.78981855100812, "learning_rate": 1.748395309964297e-07, "logits/chosen": -3.453125, "logits/rejected": -3.5, "logps/chosen": -310.0, "logps/rejected": -472.0, "loss": 0.7563, "rewards/accuracies": 0.8125, "rewards/chosen": -1.921875, "rewards/margins": 1.7890625, "rewards/rejected": -3.703125, "step": 8300 }, { "epoch": 0.6382488479262672, "grad_norm": 15.651771341020954, "learning_rate": 1.742005602259284e-07, "logits/chosen": -3.328125, "logits/rejected": -3.25, "logps/chosen": -336.0, "logps/rejected": -486.0, "loss": 0.7397, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0625, "rewards/margins": 1.5703125, "rewards/rejected": -3.640625, "step": 8310 }, { "epoch": 0.6390168970814132, "grad_norm": 18.1807335920128, "learning_rate": 1.7356213428226646e-07, "logits/chosen": -3.34375, "logits/rejected": -3.59375, "logps/chosen": -318.0, "logps/rejected": -466.0, "loss": 0.7192, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8515625, "rewards/margins": 1.71875, "rewards/rejected": -3.578125, "step": 8320 }, { "epoch": 0.6397849462365591, "grad_norm": 21.900298615892993, "learning_rate": 1.72924257754285e-07, "logits/chosen": -3.5, "logits/rejected": -3.59375, "logps/chosen": -326.0, "logps/rejected": -516.0, "loss": 0.7528, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.8515625, "rewards/rejected": -3.96875, "step": 8330 }, { "epoch": 0.6405529953917051, "grad_norm": 18.61987030219781, "learning_rate": 1.7228693522687625e-07, "logits/chosen": -3.40625, "logits/rejected": -3.59375, "logps/chosen": -362.0, "logps/rejected": -520.0, "loss": 0.7484, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.203125, "rewards/margins": 1.703125, "rewards/rejected": -3.90625, "step": 8340 }, { "epoch": 0.641321044546851, "grad_norm": 18.26259426152838, "learning_rate": 1.7165017128095018e-07, "logits/chosen": -3.40625, "logits/rejected": -3.40625, "logps/chosen": -316.0, "logps/rejected": -488.0, "loss": 0.7145, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9375, "rewards/margins": 1.765625, "rewards/rejected": -3.703125, "step": 8350 }, { "epoch": 0.642089093701997, "grad_norm": 24.035405901979505, "learning_rate": 1.7101397049340193e-07, "logits/chosen": -3.25, "logits/rejected": -3.515625, "logps/chosen": -326.0, "logps/rejected": -458.0, "loss": 0.7587, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.09375, "rewards/margins": 1.4375, "rewards/rejected": -3.53125, "step": 8360 }, { "epoch": 0.6428571428571429, "grad_norm": 20.57630916565714, "learning_rate": 1.7037833743707892e-07, "logits/chosen": -3.46875, "logits/rejected": -3.40625, "logps/chosen": -328.0, "logps/rejected": -512.0, "loss": 0.7755, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.6796875, "rewards/rejected": -3.828125, "step": 8370 }, { "epoch": 0.6436251920122887, "grad_norm": 19.05557625874726, "learning_rate": 1.697432766807476e-07, "logits/chosen": -3.5, "logits/rejected": -3.6875, "logps/chosen": -298.0, "logps/rejected": -446.0, "loss": 0.7529, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.84375, "rewards/margins": 1.609375, "rewards/rejected": -3.453125, "step": 8380 }, { "epoch": 0.6443932411674347, "grad_norm": 21.73603404824002, "learning_rate": 1.691087927890612e-07, "logits/chosen": -3.53125, "logits/rejected": -3.59375, "logps/chosen": -314.0, "logps/rejected": -490.0, "loss": 0.745, "rewards/accuracies": 0.8125, "rewards/chosen": -2.03125, "rewards/margins": 1.78125, "rewards/rejected": -3.8125, "step": 8390 }, { "epoch": 0.6451612903225806, "grad_norm": 18.08892623368941, "learning_rate": 1.6847489032252627e-07, "logits/chosen": -3.328125, "logits/rejected": -3.5625, "logps/chosen": -322.0, "logps/rejected": -472.0, "loss": 0.7164, "rewards/accuracies": 0.84375, "rewards/chosen": -1.96875, "rewards/margins": 1.7109375, "rewards/rejected": -3.671875, "step": 8400 }, { "epoch": 0.6459293394777266, "grad_norm": 21.064708219424137, "learning_rate": 1.6784157383747046e-07, "logits/chosen": -3.5625, "logits/rejected": -3.4375, "logps/chosen": -372.0, "logps/rejected": -516.0, "loss": 0.7585, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.3125, "rewards/margins": 1.65625, "rewards/rejected": -3.953125, "step": 8410 }, { "epoch": 0.6466973886328725, "grad_norm": 21.57793568854146, "learning_rate": 1.6720884788600949e-07, "logits/chosen": -3.5, "logits/rejected": -3.53125, "logps/chosen": -338.0, "logps/rejected": -478.0, "loss": 1.0658, "rewards/accuracies": 0.8125, "rewards/chosen": -2.125, "rewards/margins": 1.59375, "rewards/rejected": -3.71875, "step": 8420 }, { "epoch": 0.6474654377880185, "grad_norm": 17.872654881521576, "learning_rate": 1.6657671701601432e-07, "logits/chosen": -3.484375, "logits/rejected": -3.59375, "logps/chosen": -370.0, "logps/rejected": -540.0, "loss": 0.7519, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.296875, "rewards/margins": 1.8515625, "rewards/rejected": -4.15625, "step": 8430 }, { "epoch": 0.6482334869431644, "grad_norm": 17.342837740986333, "learning_rate": 1.659451857710789e-07, "logits/chosen": -3.4375, "logits/rejected": -3.578125, "logps/chosen": -316.0, "logps/rejected": -446.0, "loss": 0.8176, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.8671875, "rewards/margins": 1.5546875, "rewards/rejected": -3.421875, "step": 8440 }, { "epoch": 0.6490015360983102, "grad_norm": 16.8649670286298, "learning_rate": 1.6531425869048694e-07, "logits/chosen": -3.484375, "logits/rejected": -3.453125, "logps/chosen": -314.0, "logps/rejected": -490.0, "loss": 0.7796, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.984375, "rewards/margins": 1.71875, "rewards/rejected": -3.703125, "step": 8450 }, { "epoch": 0.6497695852534562, "grad_norm": 22.24514328668707, "learning_rate": 1.646839403091798e-07, "logits/chosen": -3.28125, "logits/rejected": -3.375, "logps/chosen": -344.0, "logps/rejected": -470.0, "loss": 0.7485, "rewards/accuracies": 0.84375, "rewards/chosen": -2.03125, "rewards/margins": 1.5625, "rewards/rejected": -3.59375, "step": 8460 }, { "epoch": 0.6505376344086021, "grad_norm": 21.008267888668698, "learning_rate": 1.640542351577235e-07, "logits/chosen": -3.515625, "logits/rejected": -3.484375, "logps/chosen": -330.0, "logps/rejected": -474.0, "loss": 0.7441, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.015625, "rewards/margins": 1.5546875, "rewards/rejected": -3.578125, "step": 8470 }, { "epoch": 0.6513056835637481, "grad_norm": 23.775776273942, "learning_rate": 1.634251477622764e-07, "logits/chosen": -3.328125, "logits/rejected": -3.609375, "logps/chosen": -308.0, "logps/rejected": -460.0, "loss": 0.79, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8984375, "rewards/margins": 1.8203125, "rewards/rejected": -3.71875, "step": 8480 }, { "epoch": 0.652073732718894, "grad_norm": 19.356947477576167, "learning_rate": 1.627966826445566e-07, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -304.0, "logps/rejected": -458.0, "loss": 0.745, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.921875, "rewards/margins": 1.640625, "rewards/rejected": -3.5625, "step": 8490 }, { "epoch": 0.65284178187404, "grad_norm": 20.34839248707096, "learning_rate": 1.621688443218094e-07, "logits/chosen": -3.484375, "logits/rejected": -3.421875, "logps/chosen": -342.0, "logps/rejected": -510.0, "loss": 0.7545, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.828125, "rewards/rejected": -3.9375, "step": 8500 }, { "epoch": 0.6536098310291859, "grad_norm": 17.705289920168894, "learning_rate": 1.6154163730677494e-07, "logits/chosen": -3.53125, "logits/rejected": -3.609375, "logps/chosen": -346.0, "logps/rejected": -492.0, "loss": 0.7815, "rewards/accuracies": 0.78125, "rewards/chosen": -2.21875, "rewards/margins": 1.6640625, "rewards/rejected": -3.875, "step": 8510 }, { "epoch": 0.6543778801843319, "grad_norm": 21.774171351800277, "learning_rate": 1.6091506610765554e-07, "logits/chosen": -3.5, "logits/rejected": -3.546875, "logps/chosen": -346.0, "logps/rejected": -504.0, "loss": 0.8057, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.15625, "rewards/margins": 1.5703125, "rewards/rejected": -3.71875, "step": 8520 }, { "epoch": 0.6551459293394777, "grad_norm": 19.8677015248729, "learning_rate": 1.6028913522808342e-07, "logits/chosen": -3.578125, "logits/rejected": -3.71875, "logps/chosen": -332.0, "logps/rejected": -500.0, "loss": 0.7543, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0625, "rewards/margins": 1.84375, "rewards/rejected": -3.90625, "step": 8530 }, { "epoch": 0.6559139784946236, "grad_norm": 25.19168505161973, "learning_rate": 1.5966384916708863e-07, "logits/chosen": -3.375, "logits/rejected": -3.4375, "logps/chosen": -338.0, "logps/rejected": -484.0, "loss": 0.7267, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.4765625, "rewards/rejected": -3.5625, "step": 8540 }, { "epoch": 0.6566820276497696, "grad_norm": 20.070699338042896, "learning_rate": 1.5903921241906608e-07, "logits/chosen": -3.609375, "logits/rejected": -3.71875, "logps/chosen": -352.0, "logps/rejected": -524.0, "loss": 0.723, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3125, "rewards/margins": 1.734375, "rewards/rejected": -4.03125, "step": 8550 }, { "epoch": 0.6574500768049155, "grad_norm": 21.885302127034578, "learning_rate": 1.5841522947374385e-07, "logits/chosen": -3.421875, "logits/rejected": -3.75, "logps/chosen": -340.0, "logps/rejected": -462.0, "loss": 0.7797, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.125, "rewards/margins": 1.546875, "rewards/rejected": -3.671875, "step": 8560 }, { "epoch": 0.6582181259600615, "grad_norm": 22.26587381060883, "learning_rate": 1.577919048161505e-07, "logits/chosen": -3.5625, "logits/rejected": -3.71875, "logps/chosen": -332.0, "logps/rejected": -490.0, "loss": 0.7431, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.078125, "rewards/margins": 1.65625, "rewards/rejected": -3.734375, "step": 8570 }, { "epoch": 0.6589861751152074, "grad_norm": 22.258985026395226, "learning_rate": 1.57169242926583e-07, "logits/chosen": -3.40625, "logits/rejected": -3.59375, "logps/chosen": -324.0, "logps/rejected": -454.0, "loss": 0.7509, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9375, "rewards/margins": 1.5078125, "rewards/rejected": -3.453125, "step": 8580 }, { "epoch": 0.6597542242703534, "grad_norm": 20.61275728417987, "learning_rate": 1.565472482805747e-07, "logits/chosen": -3.4375, "logits/rejected": -3.515625, "logps/chosen": -314.0, "logps/rejected": -486.0, "loss": 0.7668, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9765625, "rewards/margins": 1.8203125, "rewards/rejected": -3.796875, "step": 8590 }, { "epoch": 0.6605222734254992, "grad_norm": 17.71342850535092, "learning_rate": 1.559259253488626e-07, "logits/chosen": -3.484375, "logits/rejected": -3.75, "logps/chosen": -326.0, "logps/rejected": -486.0, "loss": 0.7501, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.015625, "rewards/margins": 1.7578125, "rewards/rejected": -3.78125, "step": 8600 }, { "epoch": 0.6612903225806451, "grad_norm": 21.481547574476867, "learning_rate": 1.5530527859735599e-07, "logits/chosen": -3.4375, "logits/rejected": -3.390625, "logps/chosen": -298.0, "logps/rejected": -452.0, "loss": 0.768, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.796875, "rewards/margins": 1.703125, "rewards/rejected": -3.5, "step": 8610 }, { "epoch": 0.6620583717357911, "grad_norm": 19.5021358266694, "learning_rate": 1.5468531248710377e-07, "logits/chosen": -3.34375, "logits/rejected": -3.65625, "logps/chosen": -354.0, "logps/rejected": -472.0, "loss": 0.7378, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.125, "rewards/margins": 1.46875, "rewards/rejected": -3.59375, "step": 8620 }, { "epoch": 0.662826420890937, "grad_norm": 17.498980921680907, "learning_rate": 1.540660314742624e-07, "logits/chosen": -3.375, "logits/rejected": -3.484375, "logps/chosen": -314.0, "logps/rejected": -462.0, "loss": 0.7613, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.875, "rewards/margins": 1.5703125, "rewards/rejected": -3.4375, "step": 8630 }, { "epoch": 0.663594470046083, "grad_norm": 18.18813880672762, "learning_rate": 1.5344744001006444e-07, "logits/chosen": -3.40625, "logits/rejected": -3.484375, "logps/chosen": -332.0, "logps/rejected": -470.0, "loss": 0.7618, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0, "rewards/margins": 1.5625, "rewards/rejected": -3.5625, "step": 8640 }, { "epoch": 0.6643625192012289, "grad_norm": 21.092409829165206, "learning_rate": 1.5282954254078576e-07, "logits/chosen": -3.421875, "logits/rejected": -3.359375, "logps/chosen": -366.0, "logps/rejected": -502.0, "loss": 0.7546, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.234375, "rewards/margins": 1.7109375, "rewards/rejected": -3.9375, "step": 8650 }, { "epoch": 0.6651305683563749, "grad_norm": 18.055089635401316, "learning_rate": 1.5221234350771424e-07, "logits/chosen": -3.40625, "logits/rejected": -3.53125, "logps/chosen": -348.0, "logps/rejected": -502.0, "loss": 0.7693, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.109375, "rewards/margins": 1.7265625, "rewards/rejected": -3.84375, "step": 8660 }, { "epoch": 0.6658986175115207, "grad_norm": 21.297120268197617, "learning_rate": 1.5159584734711741e-07, "logits/chosen": -3.421875, "logits/rejected": -3.53125, "logps/chosen": -316.0, "logps/rejected": -468.0, "loss": 0.758, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.046875, "rewards/margins": 1.7265625, "rewards/rejected": -3.765625, "step": 8670 }, { "epoch": 0.6666666666666666, "grad_norm": 22.964284785267143, "learning_rate": 1.5098005849021078e-07, "logits/chosen": -3.3125, "logits/rejected": -3.453125, "logps/chosen": -342.0, "logps/rejected": -508.0, "loss": 0.7487, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9921875, "rewards/margins": 1.7265625, "rewards/rejected": -3.71875, "step": 8680 }, { "epoch": 0.6674347158218126, "grad_norm": 21.551871968862898, "learning_rate": 1.503649813631261e-07, "logits/chosen": -3.515625, "logits/rejected": -3.5, "logps/chosen": -308.0, "logps/rejected": -498.0, "loss": 0.7665, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9375, "rewards/margins": 1.84375, "rewards/rejected": -3.78125, "step": 8690 }, { "epoch": 0.6682027649769585, "grad_norm": 18.33939773002097, "learning_rate": 1.4975062038687902e-07, "logits/chosen": -3.296875, "logits/rejected": -3.453125, "logps/chosen": -356.0, "logps/rejected": -502.0, "loss": 0.7359, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.078125, "rewards/margins": 1.7421875, "rewards/rejected": -3.8125, "step": 8700 }, { "epoch": 0.6689708141321045, "grad_norm": 18.962469554703656, "learning_rate": 1.4913697997733816e-07, "logits/chosen": -3.453125, "logits/rejected": -3.46875, "logps/chosen": -290.0, "logps/rejected": -448.0, "loss": 0.7586, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.796875, "rewards/margins": 1.625, "rewards/rejected": -3.421875, "step": 8710 }, { "epoch": 0.6697388632872504, "grad_norm": 20.237190795636387, "learning_rate": 1.485240645451925e-07, "logits/chosen": -3.328125, "logits/rejected": -3.359375, "logps/chosen": -320.0, "logps/rejected": -452.0, "loss": 0.7308, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.890625, "rewards/margins": 1.53125, "rewards/rejected": -3.421875, "step": 8720 }, { "epoch": 0.6705069124423964, "grad_norm": 18.380111604274845, "learning_rate": 1.4791187849592018e-07, "logits/chosen": -3.46875, "logits/rejected": -3.46875, "logps/chosen": -306.0, "logps/rejected": -468.0, "loss": 0.6833, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.8359375, "rewards/margins": 1.6015625, "rewards/rejected": -3.4375, "step": 8730 }, { "epoch": 0.6712749615975423, "grad_norm": 18.902915087276448, "learning_rate": 1.4730042622975693e-07, "logits/chosen": -3.59375, "logits/rejected": -3.296875, "logps/chosen": -332.0, "logps/rejected": -468.0, "loss": 0.806, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.125, "rewards/margins": 1.375, "rewards/rejected": -3.5, "step": 8740 }, { "epoch": 0.6720430107526881, "grad_norm": 21.269753091633362, "learning_rate": 1.4668971214166392e-07, "logits/chosen": -3.453125, "logits/rejected": -3.453125, "logps/chosen": -352.0, "logps/rejected": -502.0, "loss": 0.7843, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.21875, "rewards/margins": 1.5625, "rewards/rejected": -3.78125, "step": 8750 }, { "epoch": 0.6728110599078341, "grad_norm": 23.006407351059604, "learning_rate": 1.4607974062129669e-07, "logits/chosen": -3.359375, "logits/rejected": -3.5625, "logps/chosen": -344.0, "logps/rejected": -474.0, "loss": 0.7949, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.390625, "rewards/rejected": -3.5625, "step": 8760 }, { "epoch": 0.67357910906298, "grad_norm": 18.223244010064963, "learning_rate": 1.4547051605297335e-07, "logits/chosen": -3.34375, "logits/rejected": -3.296875, "logps/chosen": -326.0, "logps/rejected": -500.0, "loss": 0.7398, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.96875, "rewards/margins": 1.7890625, "rewards/rejected": -3.765625, "step": 8770 }, { "epoch": 0.674347158218126, "grad_norm": 22.42335075700551, "learning_rate": 1.4486204281564295e-07, "logits/chosen": -3.6875, "logits/rejected": -3.609375, "logps/chosen": -374.0, "logps/rejected": -496.0, "loss": 0.7735, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.375, "rewards/margins": 1.34375, "rewards/rejected": -3.71875, "step": 8780 }, { "epoch": 0.6751152073732719, "grad_norm": 18.69599753207403, "learning_rate": 1.442543252828547e-07, "logits/chosen": -3.53125, "logits/rejected": -3.515625, "logps/chosen": -324.0, "logps/rejected": -496.0, "loss": 0.7307, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.84375, "rewards/margins": 1.75, "rewards/rejected": -3.59375, "step": 8790 }, { "epoch": 0.6758832565284179, "grad_norm": 22.29731383830027, "learning_rate": 1.436473678227251e-07, "logits/chosen": -3.53125, "logits/rejected": -3.390625, "logps/chosen": -368.0, "logps/rejected": -540.0, "loss": 0.7515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.265625, "rewards/margins": 1.734375, "rewards/rejected": -4.0, "step": 8800 }, { "epoch": 0.6766513056835638, "grad_norm": 17.782001032280043, "learning_rate": 1.4304117479790833e-07, "logits/chosen": -3.5625, "logits/rejected": -3.34375, "logps/chosen": -356.0, "logps/rejected": -488.0, "loss": 0.7512, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.28125, "rewards/margins": 1.3125, "rewards/rejected": -3.59375, "step": 8810 }, { "epoch": 0.6774193548387096, "grad_norm": 20.13308368327778, "learning_rate": 1.4243575056556354e-07, "logits/chosen": -3.625, "logits/rejected": -3.46875, "logps/chosen": -338.0, "logps/rejected": -494.0, "loss": 0.7433, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.171875, "rewards/margins": 1.5078125, "rewards/rejected": -3.671875, "step": 8820 }, { "epoch": 0.6781874039938556, "grad_norm": 19.636489920430737, "learning_rate": 1.4183109947732393e-07, "logits/chosen": -3.515625, "logits/rejected": -3.5625, "logps/chosen": -312.0, "logps/rejected": -474.0, "loss": 0.7554, "rewards/accuracies": 0.8125, "rewards/chosen": -2.03125, "rewards/margins": 1.6484375, "rewards/rejected": -3.6875, "step": 8830 }, { "epoch": 0.6789554531490015, "grad_norm": 19.943060391549608, "learning_rate": 1.4122722587926606e-07, "logits/chosen": -3.359375, "logits/rejected": -3.46875, "logps/chosen": -344.0, "logps/rejected": -504.0, "loss": 0.7323, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.015625, "rewards/margins": 1.7265625, "rewards/rejected": -3.75, "step": 8840 }, { "epoch": 0.6797235023041475, "grad_norm": 19.481764569114574, "learning_rate": 1.4062413411187734e-07, "logits/chosen": -3.53125, "logits/rejected": -3.515625, "logps/chosen": -336.0, "logps/rejected": -486.0, "loss": 0.7436, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.171875, "rewards/margins": 1.5390625, "rewards/rejected": -3.703125, "step": 8850 }, { "epoch": 0.6804915514592934, "grad_norm": 21.376340675399167, "learning_rate": 1.4002182851002622e-07, "logits/chosen": -3.65625, "logits/rejected": -3.671875, "logps/chosen": -350.0, "logps/rejected": -532.0, "loss": 0.7362, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.21875, "rewards/margins": 1.9296875, "rewards/rejected": -4.15625, "step": 8860 }, { "epoch": 0.6812596006144394, "grad_norm": 22.518200513873623, "learning_rate": 1.394203134029301e-07, "logits/chosen": -3.546875, "logits/rejected": -3.5, "logps/chosen": -302.0, "logps/rejected": -476.0, "loss": 0.7616, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.765625, "rewards/margins": 1.8203125, "rewards/rejected": -3.59375, "step": 8870 }, { "epoch": 0.6820276497695853, "grad_norm": 20.402219135424232, "learning_rate": 1.388195931141245e-07, "logits/chosen": -3.515625, "logits/rejected": -3.484375, "logps/chosen": -328.0, "logps/rejected": -478.0, "loss": 0.7414, "rewards/accuracies": 0.78125, "rewards/chosen": -2.140625, "rewards/margins": 1.4609375, "rewards/rejected": -3.59375, "step": 8880 }, { "epoch": 0.6827956989247311, "grad_norm": 24.597889026522772, "learning_rate": 1.3821967196143213e-07, "logits/chosen": -3.46875, "logits/rejected": -3.59375, "logps/chosen": -354.0, "logps/rejected": -482.0, "loss": 0.7641, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.203125, "rewards/margins": 1.5625, "rewards/rejected": -3.765625, "step": 8890 }, { "epoch": 0.6835637480798771, "grad_norm": 25.411285630510427, "learning_rate": 1.3762055425693147e-07, "logits/chosen": -3.546875, "logits/rejected": -3.75, "logps/chosen": -354.0, "logps/rejected": -506.0, "loss": 0.7674, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.296875, "rewards/margins": 1.734375, "rewards/rejected": -4.03125, "step": 8900 }, { "epoch": 0.684331797235023, "grad_norm": 18.42048196686646, "learning_rate": 1.3702224430692644e-07, "logits/chosen": -3.4375, "logits/rejected": -3.296875, "logps/chosen": -360.0, "logps/rejected": -524.0, "loss": 0.7191, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.203125, "rewards/margins": 1.625, "rewards/rejected": -3.828125, "step": 8910 }, { "epoch": 0.685099846390169, "grad_norm": 24.97829451107868, "learning_rate": 1.3642474641191452e-07, "logits/chosen": -3.5625, "logits/rejected": -3.671875, "logps/chosen": -352.0, "logps/rejected": -516.0, "loss": 0.7447, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.296875, "rewards/margins": 1.734375, "rewards/rejected": -4.03125, "step": 8920 }, { "epoch": 0.6858678955453149, "grad_norm": 19.13494533150056, "learning_rate": 1.358280648665569e-07, "logits/chosen": -3.640625, "logits/rejected": -3.671875, "logps/chosen": -328.0, "logps/rejected": -496.0, "loss": 0.7392, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.09375, "rewards/margins": 1.6953125, "rewards/rejected": -3.796875, "step": 8930 }, { "epoch": 0.6866359447004609, "grad_norm": 22.593548197602157, "learning_rate": 1.352322039596465e-07, "logits/chosen": -3.421875, "logits/rejected": -3.640625, "logps/chosen": -344.0, "logps/rejected": -488.0, "loss": 0.7807, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.109375, "rewards/margins": 1.6484375, "rewards/rejected": -3.765625, "step": 8940 }, { "epoch": 0.6874039938556068, "grad_norm": 17.06202333757221, "learning_rate": 1.3463716797407798e-07, "logits/chosen": -3.40625, "logits/rejected": -3.578125, "logps/chosen": -310.0, "logps/rejected": -482.0, "loss": 0.7826, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.953125, "rewards/margins": 1.9296875, "rewards/rejected": -3.890625, "step": 8950 }, { "epoch": 0.6881720430107527, "grad_norm": 21.519246027036136, "learning_rate": 1.340429611868168e-07, "logits/chosen": -3.453125, "logits/rejected": -3.515625, "logps/chosen": -340.0, "logps/rejected": -504.0, "loss": 0.7153, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.21875, "rewards/margins": 1.6640625, "rewards/rejected": -3.875, "step": 8960 }, { "epoch": 0.6889400921658986, "grad_norm": 22.165819773339173, "learning_rate": 1.3344958786886806e-07, "logits/chosen": -3.421875, "logits/rejected": -3.65625, "logps/chosen": -338.0, "logps/rejected": -488.0, "loss": 0.7432, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.09375, "rewards/margins": 1.7578125, "rewards/rejected": -3.84375, "step": 8970 }, { "epoch": 0.6897081413210445, "grad_norm": 14.54394288450691, "learning_rate": 1.3285705228524656e-07, "logits/chosen": -3.421875, "logits/rejected": -3.40625, "logps/chosen": -352.0, "logps/rejected": -508.0, "loss": 0.7422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.1875, "rewards/margins": 1.71875, "rewards/rejected": -3.90625, "step": 8980 }, { "epoch": 0.6904761904761905, "grad_norm": 56.353660192219024, "learning_rate": 1.3226535869494503e-07, "logits/chosen": -3.4375, "logits/rejected": -3.34375, "logps/chosen": -294.0, "logps/rejected": -466.0, "loss": 0.7351, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.9375, "rewards/margins": 1.796875, "rewards/rejected": -3.734375, "step": 8990 }, { "epoch": 0.6912442396313364, "grad_norm": 22.086802360793858, "learning_rate": 1.3167451135090457e-07, "logits/chosen": -3.34375, "logits/rejected": -3.484375, "logps/chosen": -320.0, "logps/rejected": -460.0, "loss": 0.7677, "rewards/accuracies": 0.84375, "rewards/chosen": -1.953125, "rewards/margins": 1.6015625, "rewards/rejected": -3.5625, "step": 9000 }, { "epoch": 0.6912442396313364, "eval_logits/chosen": -3.3125, "eval_logits/rejected": -3.40625, "eval_logps/chosen": -362.0, "eval_logps/rejected": -470.0, "eval_loss": 0.4556503891944885, "eval_rewards/accuracies": 0.7616758346557617, "eval_rewards/chosen": -2.203125, "eval_rewards/margins": 1.3515625, "eval_rewards/rejected": -3.5625, "eval_runtime": 2263.6017, "eval_samples_per_second": 41.145, "eval_steps_per_second": 0.643, "step": 9000 }, { "epoch": 0.6920122887864824, "grad_norm": 18.681443693852366, "learning_rate": 1.310845144999838e-07, "logits/chosen": -3.28125, "logits/rejected": -3.234375, "logps/chosen": -304.0, "logps/rejected": -458.0, "loss": 0.7568, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9296875, "rewards/margins": 1.5703125, "rewards/rejected": -3.5, "step": 9010 }, { "epoch": 0.6927803379416283, "grad_norm": 21.34776672278449, "learning_rate": 1.3049537238292785e-07, "logits/chosen": -3.265625, "logits/rejected": -3.171875, "logps/chosen": -298.0, "logps/rejected": -476.0, "loss": 0.815, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.859375, "rewards/margins": 1.8046875, "rewards/rejected": -3.65625, "step": 9020 }, { "epoch": 0.6935483870967742, "grad_norm": 19.973964364598086, "learning_rate": 1.2990708923433874e-07, "logits/chosen": -3.3125, "logits/rejected": -3.15625, "logps/chosen": -296.0, "logps/rejected": -462.0, "loss": 0.7173, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6875, "rewards/margins": 1.90625, "rewards/rejected": -3.59375, "step": 9030 }, { "epoch": 0.6943164362519201, "grad_norm": 17.71475302054754, "learning_rate": 1.293196692826438e-07, "logits/chosen": -3.296875, "logits/rejected": -3.390625, "logps/chosen": -304.0, "logps/rejected": -468.0, "loss": 0.747, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.875, "rewards/margins": 1.6328125, "rewards/rejected": -3.5, "step": 9040 }, { "epoch": 0.695084485407066, "grad_norm": 19.174112867261677, "learning_rate": 1.287331167500663e-07, "logits/chosen": -3.4375, "logits/rejected": -3.59375, "logps/chosen": -342.0, "logps/rejected": -494.0, "loss": 0.777, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.7421875, "rewards/rejected": -3.90625, "step": 9050 }, { "epoch": 0.695852534562212, "grad_norm": 20.84745465927402, "learning_rate": 1.2814743585259486e-07, "logits/chosen": -3.34375, "logits/rejected": -3.421875, "logps/chosen": -346.0, "logps/rejected": -484.0, "loss": 0.7915, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.203125, "rewards/margins": 1.4921875, "rewards/rejected": -3.6875, "step": 9060 }, { "epoch": 0.6966205837173579, "grad_norm": 19.47584220851429, "learning_rate": 1.2756263079995268e-07, "logits/chosen": -3.390625, "logits/rejected": -3.59375, "logps/chosen": -290.0, "logps/rejected": -450.0, "loss": 0.7736, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.8515625, "rewards/margins": 1.65625, "rewards/rejected": -3.5, "step": 9070 }, { "epoch": 0.6973886328725039, "grad_norm": 17.195103176515055, "learning_rate": 1.2697870579556805e-07, "logits/chosen": -3.421875, "logits/rejected": -3.484375, "logps/chosen": -322.0, "logps/rejected": -490.0, "loss": 0.8011, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.984375, "rewards/margins": 1.7109375, "rewards/rejected": -3.703125, "step": 9080 }, { "epoch": 0.6981566820276498, "grad_norm": 23.381932687206053, "learning_rate": 1.2639566503654314e-07, "logits/chosen": -3.390625, "logits/rejected": -3.53125, "logps/chosen": -360.0, "logps/rejected": -488.0, "loss": 0.7681, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.515625, "rewards/rejected": -3.625, "step": 9090 }, { "epoch": 0.6989247311827957, "grad_norm": 18.28086899609597, "learning_rate": 1.2581351271362462e-07, "logits/chosen": -3.359375, "logits/rejected": -3.5625, "logps/chosen": -324.0, "logps/rejected": -454.0, "loss": 0.7383, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.9140625, "rewards/margins": 1.609375, "rewards/rejected": -3.53125, "step": 9100 }, { "epoch": 0.6996927803379416, "grad_norm": 19.371074989528278, "learning_rate": 1.2523225301117362e-07, "logits/chosen": -3.453125, "logits/rejected": -3.59375, "logps/chosen": -318.0, "logps/rejected": -436.0, "loss": 0.7276, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.03125, "rewards/margins": 1.375, "rewards/rejected": -3.40625, "step": 9110 }, { "epoch": 0.7004608294930875, "grad_norm": 22.04124476576801, "learning_rate": 1.2465189010713487e-07, "logits/chosen": -3.328125, "logits/rejected": -3.28125, "logps/chosen": -350.0, "logps/rejected": -498.0, "loss": 0.7698, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.703125, "rewards/rejected": -3.8125, "step": 9120 }, { "epoch": 0.7012288786482335, "grad_norm": 21.58000629565727, "learning_rate": 1.2407242817300766e-07, "logits/chosen": -3.4375, "logits/rejected": -3.546875, "logps/chosen": -326.0, "logps/rejected": -454.0, "loss": 0.81, "rewards/accuracies": 0.8125, "rewards/chosen": -2.015625, "rewards/margins": 1.4921875, "rewards/rejected": -3.515625, "step": 9130 }, { "epoch": 0.7019969278033794, "grad_norm": 21.508365974020567, "learning_rate": 1.2349387137381477e-07, "logits/chosen": -3.4375, "logits/rejected": -3.59375, "logps/chosen": -320.0, "logps/rejected": -480.0, "loss": 0.7649, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.953125, "rewards/margins": 1.6796875, "rewards/rejected": -3.625, "step": 9140 }, { "epoch": 0.7027649769585254, "grad_norm": 17.465904705871157, "learning_rate": 1.2291622386807335e-07, "logits/chosen": -3.390625, "logits/rejected": -3.453125, "logps/chosen": -358.0, "logps/rejected": -494.0, "loss": 0.7487, "rewards/accuracies": 0.84375, "rewards/chosen": -2.046875, "rewards/margins": 1.6484375, "rewards/rejected": -3.6875, "step": 9150 }, { "epoch": 0.7035330261136713, "grad_norm": 17.201319973082217, "learning_rate": 1.2233948980776497e-07, "logits/chosen": -3.4375, "logits/rejected": -3.53125, "logps/chosen": -346.0, "logps/rejected": -478.0, "loss": 0.6902, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0625, "rewards/margins": 1.6484375, "rewards/rejected": -3.71875, "step": 9160 }, { "epoch": 0.7043010752688172, "grad_norm": 16.8901068814821, "learning_rate": 1.2176367333830534e-07, "logits/chosen": -3.375, "logits/rejected": -3.453125, "logps/chosen": -316.0, "logps/rejected": -474.0, "loss": 0.7627, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9453125, "rewards/margins": 1.6015625, "rewards/rejected": -3.546875, "step": 9170 }, { "epoch": 0.7050691244239631, "grad_norm": 19.352044456400712, "learning_rate": 1.2118877859851505e-07, "logits/chosen": -3.46875, "logits/rejected": -3.453125, "logps/chosen": -326.0, "logps/rejected": -492.0, "loss": 0.7414, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.03125, "rewards/margins": 1.7109375, "rewards/rejected": -3.75, "step": 9180 }, { "epoch": 0.705837173579109, "grad_norm": 19.192783503335512, "learning_rate": 1.2061480972058908e-07, "logits/chosen": -3.40625, "logits/rejected": -3.5625, "logps/chosen": -324.0, "logps/rejected": -494.0, "loss": 0.7109, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.046875, "rewards/margins": 1.7265625, "rewards/rejected": -3.78125, "step": 9190 }, { "epoch": 0.706605222734255, "grad_norm": 22.44525854333054, "learning_rate": 1.200417708300678e-07, "logits/chosen": -3.453125, "logits/rejected": -3.4375, "logps/chosen": -328.0, "logps/rejected": -494.0, "loss": 0.78, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.140625, "rewards/margins": 1.640625, "rewards/rejected": -3.78125, "step": 9200 }, { "epoch": 0.7073732718894009, "grad_norm": 23.545882013656115, "learning_rate": 1.194696660458073e-07, "logits/chosen": -3.5, "logits/rejected": -3.515625, "logps/chosen": -322.0, "logps/rejected": -498.0, "loss": 0.7239, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.015625, "rewards/margins": 1.796875, "rewards/rejected": -3.8125, "step": 9210 }, { "epoch": 0.7081413210445469, "grad_norm": 22.199137433784923, "learning_rate": 1.1889849947994912e-07, "logits/chosen": -3.46875, "logits/rejected": -3.59375, "logps/chosen": -344.0, "logps/rejected": -510.0, "loss": 0.7568, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.21875, "rewards/margins": 1.7109375, "rewards/rejected": -3.921875, "step": 9220 }, { "epoch": 0.7089093701996928, "grad_norm": 17.57248597044622, "learning_rate": 1.1832827523789163e-07, "logits/chosen": -3.515625, "logits/rejected": -3.390625, "logps/chosen": -346.0, "logps/rejected": -516.0, "loss": 0.7637, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.828125, "rewards/rejected": -3.921875, "step": 9230 }, { "epoch": 0.7096774193548387, "grad_norm": 18.889651080778552, "learning_rate": 1.1775899741825945e-07, "logits/chosen": -3.40625, "logits/rejected": -3.453125, "logps/chosen": -318.0, "logps/rejected": -460.0, "loss": 0.7426, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8984375, "rewards/margins": 1.515625, "rewards/rejected": -3.421875, "step": 9240 }, { "epoch": 0.7104454685099847, "grad_norm": 22.234577483671437, "learning_rate": 1.1719067011287487e-07, "logits/chosen": -3.5, "logits/rejected": -3.546875, "logps/chosen": -338.0, "logps/rejected": -476.0, "loss": 0.7371, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.21875, "rewards/margins": 1.53125, "rewards/rejected": -3.75, "step": 9250 }, { "epoch": 0.7112135176651305, "grad_norm": 23.564018424591215, "learning_rate": 1.1662329740672827e-07, "logits/chosen": -3.5, "logits/rejected": -3.515625, "logps/chosen": -352.0, "logps/rejected": -502.0, "loss": 0.7413, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3125, "rewards/margins": 1.6328125, "rewards/rejected": -3.9375, "step": 9260 }, { "epoch": 0.7119815668202765, "grad_norm": 20.96100084618493, "learning_rate": 1.1605688337794825e-07, "logits/chosen": -3.5, "logits/rejected": -3.625, "logps/chosen": -344.0, "logps/rejected": -504.0, "loss": 0.7388, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.6484375, "rewards/rejected": -3.828125, "step": 9270 }, { "epoch": 0.7127496159754224, "grad_norm": 22.540865941251855, "learning_rate": 1.1549143209777318e-07, "logits/chosen": -3.421875, "logits/rejected": -3.53125, "logps/chosen": -352.0, "logps/rejected": -506.0, "loss": 0.7377, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.046875, "rewards/margins": 1.8046875, "rewards/rejected": -3.859375, "step": 9280 }, { "epoch": 0.7135176651305684, "grad_norm": 24.5570709994553, "learning_rate": 1.1492694763052086e-07, "logits/chosen": -3.59375, "logits/rejected": -3.578125, "logps/chosen": -346.0, "logps/rejected": -494.0, "loss": 0.7355, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.25, "rewards/margins": 1.625, "rewards/rejected": -3.875, "step": 9290 }, { "epoch": 0.7142857142857143, "grad_norm": 20.208197016932, "learning_rate": 1.1436343403356016e-07, "logits/chosen": -3.4375, "logits/rejected": -3.5, "logps/chosen": -336.0, "logps/rejected": -512.0, "loss": 0.7361, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.8984375, "rewards/rejected": -4.0, "step": 9300 }, { "epoch": 0.7150537634408602, "grad_norm": 20.683540240681157, "learning_rate": 1.1380089535728177e-07, "logits/chosen": -3.46875, "logits/rejected": -3.546875, "logps/chosen": -372.0, "logps/rejected": -492.0, "loss": 0.7669, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.359375, "rewards/margins": 1.40625, "rewards/rejected": -3.765625, "step": 9310 }, { "epoch": 0.7158218125960062, "grad_norm": 18.403749879794194, "learning_rate": 1.132393356450686e-07, "logits/chosen": -3.4375, "logits/rejected": -3.546875, "logps/chosen": -326.0, "logps/rejected": -474.0, "loss": 0.7614, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.15625, "rewards/margins": 1.53125, "rewards/rejected": -3.6875, "step": 9320 }, { "epoch": 0.716589861751152, "grad_norm": 23.139281443924833, "learning_rate": 1.1267875893326737e-07, "logits/chosen": -3.578125, "logits/rejected": -3.515625, "logps/chosen": -306.0, "logps/rejected": -472.0, "loss": 0.7663, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9921875, "rewards/margins": 1.6796875, "rewards/rejected": -3.671875, "step": 9330 }, { "epoch": 0.717357910906298, "grad_norm": 19.124821938138098, "learning_rate": 1.1211916925115875e-07, "logits/chosen": -3.4375, "logits/rejected": -3.25, "logps/chosen": -358.0, "logps/rejected": -516.0, "loss": 0.752, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.15625, "rewards/margins": 1.6328125, "rewards/rejected": -3.78125, "step": 9340 }, { "epoch": 0.7181259600614439, "grad_norm": 21.5096308846843, "learning_rate": 1.1156057062092919e-07, "logits/chosen": -3.5, "logits/rejected": -3.515625, "logps/chosen": -324.0, "logps/rejected": -486.0, "loss": 0.7273, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.5625, "rewards/rejected": -3.75, "step": 9350 }, { "epoch": 0.7188940092165899, "grad_norm": 21.592095826636253, "learning_rate": 1.1100296705764184e-07, "logits/chosen": -3.46875, "logits/rejected": -3.453125, "logps/chosen": -320.0, "logps/rejected": -508.0, "loss": 0.7412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.015625, "rewards/margins": 2.0, "rewards/rejected": -4.03125, "step": 9360 }, { "epoch": 0.7196620583717358, "grad_norm": 19.114295188423373, "learning_rate": 1.1044636256920728e-07, "logits/chosen": -3.25, "logits/rejected": -3.421875, "logps/chosen": -358.0, "logps/rejected": -516.0, "loss": 0.7394, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.125, "rewards/margins": 1.8515625, "rewards/rejected": -3.96875, "step": 9370 }, { "epoch": 0.7204301075268817, "grad_norm": 21.646855039634644, "learning_rate": 1.0989076115635537e-07, "logits/chosen": -3.296875, "logits/rejected": -3.359375, "logps/chosen": -318.0, "logps/rejected": -488.0, "loss": 0.7474, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.890625, "rewards/margins": 1.828125, "rewards/rejected": -3.71875, "step": 9380 }, { "epoch": 0.7211981566820277, "grad_norm": 21.613449581627957, "learning_rate": 1.0933616681260562e-07, "logits/chosen": -3.578125, "logits/rejected": -3.484375, "logps/chosen": -336.0, "logps/rejected": -492.0, "loss": 0.7242, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.28125, "rewards/margins": 1.5625, "rewards/rejected": -3.84375, "step": 9390 }, { "epoch": 0.7219662058371735, "grad_norm": 20.537609997972584, "learning_rate": 1.087825835242393e-07, "logits/chosen": -3.3125, "logits/rejected": -3.546875, "logps/chosen": -326.0, "logps/rejected": -478.0, "loss": 0.7406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.96875, "rewards/margins": 1.625, "rewards/rejected": -3.59375, "step": 9400 }, { "epoch": 0.7227342549923195, "grad_norm": 23.034176071962865, "learning_rate": 1.0823001527027053e-07, "logits/chosen": -3.34375, "logits/rejected": -3.15625, "logps/chosen": -352.0, "logps/rejected": -536.0, "loss": 0.7457, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.109375, "rewards/margins": 1.84375, "rewards/rejected": -3.953125, "step": 9410 }, { "epoch": 0.7235023041474654, "grad_norm": 20.331933598953583, "learning_rate": 1.0767846602241739e-07, "logits/chosen": -3.53125, "logits/rejected": -3.546875, "logps/chosen": -342.0, "logps/rejected": -502.0, "loss": 0.7611, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.203125, "rewards/margins": 1.703125, "rewards/rejected": -3.90625, "step": 9420 }, { "epoch": 0.7242703533026114, "grad_norm": 20.094167785814385, "learning_rate": 1.07127939745074e-07, "logits/chosen": -3.40625, "logits/rejected": -3.53125, "logps/chosen": -316.0, "logps/rejected": -488.0, "loss": 0.7491, "rewards/accuracies": 0.875, "rewards/chosen": -1.9375, "rewards/margins": 1.9296875, "rewards/rejected": -3.875, "step": 9430 }, { "epoch": 0.7250384024577573, "grad_norm": 21.054439060531628, "learning_rate": 1.0657844039528108e-07, "logits/chosen": -3.578125, "logits/rejected": -3.3125, "logps/chosen": -324.0, "logps/rejected": -496.0, "loss": 0.7612, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.21875, "rewards/margins": 1.65625, "rewards/rejected": -3.875, "step": 9440 }, { "epoch": 0.7258064516129032, "grad_norm": 21.74894877814645, "learning_rate": 1.0602997192269839e-07, "logits/chosen": -3.34375, "logits/rejected": -3.46875, "logps/chosen": -328.0, "logps/rejected": -486.0, "loss": 0.7709, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9140625, "rewards/margins": 1.7109375, "rewards/rejected": -3.625, "step": 9450 }, { "epoch": 0.7265745007680492, "grad_norm": 18.897535110652445, "learning_rate": 1.054825382695761e-07, "logits/chosen": -3.484375, "logits/rejected": -3.546875, "logps/chosen": -348.0, "logps/rejected": -524.0, "loss": 0.7595, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.234375, "rewards/margins": 1.765625, "rewards/rejected": -4.0, "step": 9460 }, { "epoch": 0.7273425499231951, "grad_norm": 18.916080533094167, "learning_rate": 1.0493614337072607e-07, "logits/chosen": -3.34375, "logits/rejected": -3.46875, "logps/chosen": -314.0, "logps/rejected": -480.0, "loss": 0.7256, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8984375, "rewards/margins": 1.8125, "rewards/rejected": -3.71875, "step": 9470 }, { "epoch": 0.728110599078341, "grad_norm": 23.18369769020788, "learning_rate": 1.043907911534943e-07, "logits/chosen": -3.59375, "logits/rejected": -3.453125, "logps/chosen": -314.0, "logps/rejected": -516.0, "loss": 0.7161, "rewards/accuracies": 0.84375, "rewards/chosen": -2.09375, "rewards/margins": 1.8046875, "rewards/rejected": -3.90625, "step": 9480 }, { "epoch": 0.7288786482334869, "grad_norm": 19.248253164985627, "learning_rate": 1.0384648553773165e-07, "logits/chosen": -3.453125, "logits/rejected": -3.65625, "logps/chosen": -352.0, "logps/rejected": -466.0, "loss": 0.7472, "rewards/accuracies": 0.78125, "rewards/chosen": -2.25, "rewards/margins": 1.453125, "rewards/rejected": -3.703125, "step": 9490 }, { "epoch": 0.7296466973886329, "grad_norm": 23.335349712861305, "learning_rate": 1.0330323043576686e-07, "logits/chosen": -3.578125, "logits/rejected": -3.71875, "logps/chosen": -358.0, "logps/rejected": -520.0, "loss": 0.7578, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.734375, "rewards/rejected": -3.90625, "step": 9500 }, { "epoch": 0.7304147465437788, "grad_norm": 19.041218104080563, "learning_rate": 1.0276102975237752e-07, "logits/chosen": -3.484375, "logits/rejected": -3.625, "logps/chosen": -332.0, "logps/rejected": -468.0, "loss": 0.7359, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.125, "rewards/margins": 1.546875, "rewards/rejected": -3.671875, "step": 9510 }, { "epoch": 0.7311827956989247, "grad_norm": 21.58071323705968, "learning_rate": 1.0221988738476233e-07, "logits/chosen": -3.546875, "logits/rejected": -3.65625, "logps/chosen": -348.0, "logps/rejected": -516.0, "loss": 0.7551, "rewards/accuracies": 0.8125, "rewards/chosen": -2.15625, "rewards/margins": 1.8125, "rewards/rejected": -3.96875, "step": 9520 }, { "epoch": 0.7319508448540707, "grad_norm": 20.506561626962288, "learning_rate": 1.0167980722251346e-07, "logits/chosen": -3.609375, "logits/rejected": -3.578125, "logps/chosen": -310.0, "logps/rejected": -472.0, "loss": 0.7302, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.7109375, "rewards/rejected": -3.78125, "step": 9530 }, { "epoch": 0.7327188940092166, "grad_norm": 20.092961610859586, "learning_rate": 1.0114079314758753e-07, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -326.0, "logps/rejected": -478.0, "loss": 0.7491, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.6953125, "rewards/rejected": -3.796875, "step": 9540 }, { "epoch": 0.7334869431643625, "grad_norm": 22.420650771084116, "learning_rate": 1.0060284903427913e-07, "logits/chosen": -3.453125, "logits/rejected": -3.625, "logps/chosen": -356.0, "logps/rejected": -486.0, "loss": 0.7345, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.5, "rewards/rejected": -3.609375, "step": 9550 }, { "epoch": 0.7342549923195084, "grad_norm": 21.70650760323054, "learning_rate": 1.0006597874919176e-07, "logits/chosen": -3.484375, "logits/rejected": -3.5, "logps/chosen": -352.0, "logps/rejected": -516.0, "loss": 0.7294, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.25, "rewards/margins": 1.59375, "rewards/rejected": -3.84375, "step": 9560 }, { "epoch": 0.7350230414746544, "grad_norm": 24.42369105587417, "learning_rate": 9.953018615121065e-08, "logits/chosen": -3.421875, "logits/rejected": -3.671875, "logps/chosen": -326.0, "logps/rejected": -484.0, "loss": 0.7652, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.03125, "rewards/margins": 1.703125, "rewards/rejected": -3.75, "step": 9570 }, { "epoch": 0.7357910906298003, "grad_norm": 21.01274739454883, "learning_rate": 9.899547509147516e-08, "logits/chosen": -3.484375, "logits/rejected": -3.546875, "logps/chosen": -314.0, "logps/rejected": -460.0, "loss": 0.726, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.078125, "rewards/margins": 1.53125, "rewards/rejected": -3.609375, "step": 9580 }, { "epoch": 0.7365591397849462, "grad_norm": 20.110694802170322, "learning_rate": 9.846184941335026e-08, "logits/chosen": -3.546875, "logits/rejected": -3.703125, "logps/chosen": -342.0, "logps/rejected": -498.0, "loss": 0.7215, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.203125, "rewards/margins": 1.734375, "rewards/rejected": -3.953125, "step": 9590 }, { "epoch": 0.7373271889400922, "grad_norm": 20.899355058830643, "learning_rate": 9.79293129524e-08, "logits/chosen": -3.4375, "logits/rejected": -3.578125, "logps/chosen": -328.0, "logps/rejected": -502.0, "loss": 0.726, "rewards/accuracies": 0.84375, "rewards/chosen": -2.015625, "rewards/margins": 1.78125, "rewards/rejected": -3.796875, "step": 9600 }, { "epoch": 0.7380952380952381, "grad_norm": 19.73145467812417, "learning_rate": 9.739786953635923e-08, "logits/chosen": -3.484375, "logits/rejected": -3.578125, "logps/chosen": -320.0, "logps/rejected": -508.0, "loss": 0.7504, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.03125, "rewards/margins": 1.8515625, "rewards/rejected": -3.875, "step": 9610 }, { "epoch": 0.738863287250384, "grad_norm": 22.085524779738325, "learning_rate": 9.686752298510614e-08, "logits/chosen": -3.5, "logits/rejected": -3.359375, "logps/chosen": -342.0, "logps/rejected": -556.0, "loss": 0.7366, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.21875, "rewards/margins": 2.03125, "rewards/rejected": -4.25, "step": 9620 }, { "epoch": 0.7396313364055299, "grad_norm": 17.655027820961035, "learning_rate": 9.633827711063533e-08, "logits/chosen": -3.421875, "logits/rejected": -3.5, "logps/chosen": -322.0, "logps/rejected": -484.0, "loss": 0.7185, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.921875, "rewards/margins": 1.8046875, "rewards/rejected": -3.734375, "step": 9630 }, { "epoch": 0.7403993855606759, "grad_norm": 20.711184770179575, "learning_rate": 9.58101357170294e-08, "logits/chosen": -3.546875, "logits/rejected": -3.671875, "logps/chosen": -324.0, "logps/rejected": -470.0, "loss": 0.7607, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.03125, "rewards/margins": 1.6171875, "rewards/rejected": -3.65625, "step": 9640 }, { "epoch": 0.7411674347158218, "grad_norm": 23.396415254681685, "learning_rate": 9.528310260043285e-08, "logits/chosen": -3.5625, "logits/rejected": -3.59375, "logps/chosen": -340.0, "logps/rejected": -472.0, "loss": 0.7445, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.140625, "rewards/margins": 1.4921875, "rewards/rejected": -3.640625, "step": 9650 }, { "epoch": 0.7419354838709677, "grad_norm": 19.03480778644027, "learning_rate": 9.475718154902382e-08, "logits/chosen": -3.484375, "logits/rejected": -3.5, "logps/chosen": -316.0, "logps/rejected": -468.0, "loss": 0.7456, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9453125, "rewards/margins": 1.640625, "rewards/rejected": -3.578125, "step": 9660 }, { "epoch": 0.7427035330261137, "grad_norm": 15.953304802603704, "learning_rate": 9.423237634298722e-08, "logits/chosen": -3.46875, "logits/rejected": -3.46875, "logps/chosen": -340.0, "logps/rejected": -524.0, "loss": 0.7239, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.015625, "rewards/margins": 1.8828125, "rewards/rejected": -3.890625, "step": 9670 }, { "epoch": 0.7434715821812596, "grad_norm": 20.01693213610309, "learning_rate": 9.37086907544879e-08, "logits/chosen": -3.390625, "logits/rejected": -3.46875, "logps/chosen": -354.0, "logps/rejected": -502.0, "loss": 0.7423, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.6640625, "rewards/rejected": -3.8125, "step": 9680 }, { "epoch": 0.7442396313364056, "grad_norm": 20.775471955988884, "learning_rate": 9.318612854764252e-08, "logits/chosen": -3.296875, "logits/rejected": -3.53125, "logps/chosen": -364.0, "logps/rejected": -516.0, "loss": 0.7387, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.234375, "rewards/margins": 1.78125, "rewards/rejected": -4.03125, "step": 9690 }, { "epoch": 0.7450076804915514, "grad_norm": 22.243820998470188, "learning_rate": 9.266469347849385e-08, "logits/chosen": -3.375, "logits/rejected": -3.4375, "logps/chosen": -344.0, "logps/rejected": -488.0, "loss": 0.7683, "rewards/accuracies": 0.78125, "rewards/chosen": -2.15625, "rewards/margins": 1.6015625, "rewards/rejected": -3.75, "step": 9700 }, { "epoch": 0.7457757296466974, "grad_norm": 21.522189380776044, "learning_rate": 9.21443892949827e-08, "logits/chosen": -3.453125, "logits/rejected": -3.734375, "logps/chosen": -354.0, "logps/rejected": -512.0, "loss": 0.7409, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.328125, "rewards/margins": 1.8046875, "rewards/rejected": -4.125, "step": 9710 }, { "epoch": 0.7465437788018433, "grad_norm": 21.966210803392734, "learning_rate": 9.162521973692139e-08, "logits/chosen": -3.453125, "logits/rejected": -3.484375, "logps/chosen": -350.0, "logps/rejected": -492.0, "loss": 0.7074, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.21875, "rewards/margins": 1.4921875, "rewards/rejected": -3.71875, "step": 9720 }, { "epoch": 0.7473118279569892, "grad_norm": 19.138860424585445, "learning_rate": 9.11071885359671e-08, "logits/chosen": -3.515625, "logits/rejected": -3.625, "logps/chosen": -344.0, "logps/rejected": -504.0, "loss": 0.7591, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.625, "rewards/rejected": -3.8125, "step": 9730 }, { "epoch": 0.7480798771121352, "grad_norm": 19.275916776590474, "learning_rate": 9.059029941559438e-08, "logits/chosen": -3.5, "logits/rejected": -3.734375, "logps/chosen": -346.0, "logps/rejected": -528.0, "loss": 0.6882, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.171875, "rewards/margins": 1.875, "rewards/rejected": -4.03125, "step": 9740 }, { "epoch": 0.7488479262672811, "grad_norm": 30.382643903319845, "learning_rate": 9.007455609106915e-08, "logits/chosen": -3.40625, "logits/rejected": -3.328125, "logps/chosen": -374.0, "logps/rejected": -532.0, "loss": 0.7871, "rewards/accuracies": 0.8125, "rewards/chosen": -2.21875, "rewards/margins": 1.765625, "rewards/rejected": -3.984375, "step": 9750 }, { "epoch": 0.7496159754224271, "grad_norm": 21.876830324770193, "learning_rate": 8.955996226942152e-08, "logits/chosen": -3.71875, "logits/rejected": -3.640625, "logps/chosen": -334.0, "logps/rejected": -516.0, "loss": 0.7276, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.171875, "rewards/margins": 2.03125, "rewards/rejected": -4.1875, "step": 9760 }, { "epoch": 0.7503840245775729, "grad_norm": 17.381578560352246, "learning_rate": 8.904652164941917e-08, "logits/chosen": -3.34375, "logits/rejected": -3.5625, "logps/chosen": -350.0, "logps/rejected": -506.0, "loss": 0.7565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9921875, "rewards/margins": 1.734375, "rewards/rejected": -3.71875, "step": 9770 }, { "epoch": 0.7511520737327189, "grad_norm": 18.82732762833397, "learning_rate": 8.853423792154118e-08, "logits/chosen": -3.546875, "logits/rejected": -3.65625, "logps/chosen": -342.0, "logps/rejected": -524.0, "loss": 0.7239, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1875, "rewards/margins": 1.875, "rewards/rejected": -4.0625, "step": 9780 }, { "epoch": 0.7519201228878648, "grad_norm": 24.505041737498654, "learning_rate": 8.802311476795065e-08, "logits/chosen": -3.640625, "logits/rejected": -3.625, "logps/chosen": -318.0, "logps/rejected": -512.0, "loss": 0.7686, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.9140625, "rewards/rejected": -4.0, "step": 9790 }, { "epoch": 0.7526881720430108, "grad_norm": 18.482224578144688, "learning_rate": 8.751315586246937e-08, "logits/chosen": -3.5625, "logits/rejected": -3.625, "logps/chosen": -340.0, "logps/rejected": -492.0, "loss": 0.6893, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.296875, "rewards/margins": 1.703125, "rewards/rejected": -4.0, "step": 9800 }, { "epoch": 0.7534562211981567, "grad_norm": 23.183018665070772, "learning_rate": 8.700436487055043e-08, "logits/chosen": -3.609375, "logits/rejected": -3.65625, "logps/chosen": -346.0, "logps/rejected": -484.0, "loss": 0.7273, "rewards/accuracies": 0.8125, "rewards/chosen": -2.203125, "rewards/margins": 1.4609375, "rewards/rejected": -3.671875, "step": 9810 }, { "epoch": 0.7542242703533026, "grad_norm": 21.007513233426874, "learning_rate": 8.649674544925226e-08, "logits/chosen": -3.46875, "logits/rejected": -3.671875, "logps/chosen": -342.0, "logps/rejected": -524.0, "loss": 0.7489, "rewards/accuracies": 0.84375, "rewards/chosen": -2.078125, "rewards/margins": 1.90625, "rewards/rejected": -3.96875, "step": 9820 }, { "epoch": 0.7549923195084486, "grad_norm": 18.550750515065495, "learning_rate": 8.59903012472127e-08, "logits/chosen": -3.484375, "logits/rejected": -3.828125, "logps/chosen": -318.0, "logps/rejected": -446.0, "loss": 0.7958, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.90625, "rewards/margins": 1.59375, "rewards/rejected": -3.5, "step": 9830 }, { "epoch": 0.7557603686635944, "grad_norm": 21.125890055600262, "learning_rate": 8.548503590462187e-08, "logits/chosen": -3.484375, "logits/rejected": -3.78125, "logps/chosen": -316.0, "logps/rejected": -484.0, "loss": 0.7396, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.8515625, "rewards/margins": 1.96875, "rewards/rejected": -3.8125, "step": 9840 }, { "epoch": 0.7565284178187404, "grad_norm": 22.21715561231106, "learning_rate": 8.498095305319706e-08, "logits/chosen": -3.5625, "logits/rejected": -3.640625, "logps/chosen": -324.0, "logps/rejected": -494.0, "loss": 0.7287, "rewards/accuracies": 0.875, "rewards/chosen": -2.078125, "rewards/margins": 1.7578125, "rewards/rejected": -3.828125, "step": 9850 }, { "epoch": 0.7572964669738863, "grad_norm": 18.81029959606183, "learning_rate": 8.447805631615582e-08, "logits/chosen": -3.5, "logits/rejected": -3.6875, "logps/chosen": -320.0, "logps/rejected": -492.0, "loss": 0.7378, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.8984375, "rewards/rejected": -3.921875, "step": 9860 }, { "epoch": 0.7580645161290323, "grad_norm": 18.07548425882418, "learning_rate": 8.39763493081902e-08, "logits/chosen": -3.53125, "logits/rejected": -3.578125, "logps/chosen": -344.0, "logps/rejected": -492.0, "loss": 0.7451, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.21875, "rewards/margins": 1.6171875, "rewards/rejected": -3.828125, "step": 9870 }, { "epoch": 0.7588325652841782, "grad_norm": 19.25195131326682, "learning_rate": 8.347583563544116e-08, "logits/chosen": -3.5, "logits/rejected": -3.625, "logps/chosen": -336.0, "logps/rejected": -528.0, "loss": 0.6879, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0625, "rewards/margins": 1.953125, "rewards/rejected": -4.0, "step": 9880 }, { "epoch": 0.7596006144393241, "grad_norm": 19.584281199730533, "learning_rate": 8.297651889547167e-08, "logits/chosen": -3.40625, "logits/rejected": -3.671875, "logps/chosen": -336.0, "logps/rejected": -504.0, "loss": 0.7287, "rewards/accuracies": 0.84375, "rewards/chosen": -2.109375, "rewards/margins": 1.875, "rewards/rejected": -4.0, "step": 9890 }, { "epoch": 0.7603686635944701, "grad_norm": 20.091348177450378, "learning_rate": 8.247840267724201e-08, "logits/chosen": -3.5, "logits/rejected": -3.546875, "logps/chosen": -314.0, "logps/rejected": -502.0, "loss": 0.7622, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9296875, "rewards/margins": 1.875, "rewards/rejected": -3.796875, "step": 9900 }, { "epoch": 0.761136712749616, "grad_norm": 20.45337968140331, "learning_rate": 8.198149056108311e-08, "logits/chosen": -3.5625, "logits/rejected": -3.75, "logps/chosen": -322.0, "logps/rejected": -486.0, "loss": 0.7293, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.796875, "rewards/rejected": -3.90625, "step": 9910 }, { "epoch": 0.7619047619047619, "grad_norm": 18.28963173639438, "learning_rate": 8.148578611867113e-08, "logits/chosen": -3.421875, "logits/rejected": -3.765625, "logps/chosen": -340.0, "logps/rejected": -488.0, "loss": 0.751, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.03125, "rewards/margins": 1.78125, "rewards/rejected": -3.8125, "step": 9920 }, { "epoch": 0.7626728110599078, "grad_norm": 17.539824773598106, "learning_rate": 8.099129291300208e-08, "logits/chosen": -3.625, "logits/rejected": -3.703125, "logps/chosen": -320.0, "logps/rejected": -472.0, "loss": 0.7663, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.1875, "rewards/margins": 1.578125, "rewards/rejected": -3.765625, "step": 9930 }, { "epoch": 0.7634408602150538, "grad_norm": 22.091159583193804, "learning_rate": 8.049801449836541e-08, "logits/chosen": -3.5, "logits/rejected": -3.734375, "logps/chosen": -334.0, "logps/rejected": -470.0, "loss": 0.7798, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.015625, "rewards/margins": 1.640625, "rewards/rejected": -3.65625, "step": 9940 }, { "epoch": 0.7642089093701997, "grad_norm": 26.52858946093695, "learning_rate": 8.000595442031943e-08, "logits/chosen": -3.5, "logits/rejected": -3.609375, "logps/chosen": -304.0, "logps/rejected": -478.0, "loss": 0.7557, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.8359375, "rewards/margins": 1.8828125, "rewards/rejected": -3.71875, "step": 9950 }, { "epoch": 0.7649769585253456, "grad_norm": 19.566530001649348, "learning_rate": 7.951511621566514e-08, "logits/chosen": -3.515625, "logits/rejected": -3.59375, "logps/chosen": -304.0, "logps/rejected": -468.0, "loss": 0.7057, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.875, "rewards/margins": 1.671875, "rewards/rejected": -3.546875, "step": 9960 }, { "epoch": 0.7657450076804916, "grad_norm": 18.848913161430623, "learning_rate": 7.902550341242098e-08, "logits/chosen": -3.578125, "logits/rejected": -3.515625, "logps/chosen": -322.0, "logps/rejected": -478.0, "loss": 0.7149, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.046875, "rewards/margins": 1.4921875, "rewards/rejected": -3.546875, "step": 9970 }, { "epoch": 0.7665130568356375, "grad_norm": 18.760236391488345, "learning_rate": 7.853711952979786e-08, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -330.0, "logps/rejected": -490.0, "loss": 0.7569, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.7421875, "rewards/rejected": -3.875, "step": 9980 }, { "epoch": 0.7672811059907834, "grad_norm": 16.653865625280446, "learning_rate": 7.804996807817287e-08, "logits/chosen": -3.421875, "logits/rejected": -3.421875, "logps/chosen": -324.0, "logps/rejected": -490.0, "loss": 0.7652, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.7734375, "rewards/rejected": -3.828125, "step": 9990 }, { "epoch": 0.7680491551459293, "grad_norm": 21.852446915595984, "learning_rate": 7.756405255906546e-08, "logits/chosen": -3.53125, "logits/rejected": -3.40625, "logps/chosen": -316.0, "logps/rejected": -454.0, "loss": 0.7502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.078125, "rewards/margins": 1.3359375, "rewards/rejected": -3.40625, "step": 10000 }, { "epoch": 0.7680491551459293, "eval_logits/chosen": -3.4375, "eval_logits/rejected": -3.546875, "eval_logps/chosen": -372.0, "eval_logps/rejected": -480.0, "eval_loss": 0.45736926794052124, "eval_rewards/accuracies": 0.7609890103340149, "eval_rewards/chosen": -2.3125, "eval_rewards/margins": 1.3671875, "eval_rewards/rejected": -3.671875, "eval_runtime": 2263.4665, "eval_samples_per_second": 41.147, "eval_steps_per_second": 0.643, "step": 10000 }, { "epoch": 0.7688172043010753, "grad_norm": 20.974145339113544, "learning_rate": 7.70793764651109e-08, "logits/chosen": -3.578125, "logits/rejected": -3.578125, "logps/chosen": -326.0, "logps/rejected": -476.0, "loss": 0.7725, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.0, "rewards/margins": 1.609375, "rewards/rejected": -3.609375, "step": 10010 }, { "epoch": 0.7695852534562212, "grad_norm": 20.127772060505038, "learning_rate": 7.659594328003624e-08, "logits/chosen": -3.421875, "logits/rejected": -3.59375, "logps/chosen": -358.0, "logps/rejected": -510.0, "loss": 0.7514, "rewards/accuracies": 0.84375, "rewards/chosen": -2.171875, "rewards/margins": 1.7890625, "rewards/rejected": -3.953125, "step": 10020 }, { "epoch": 0.7703533026113671, "grad_norm": 20.875237254779893, "learning_rate": 7.611375647863466e-08, "logits/chosen": -3.484375, "logits/rejected": -3.71875, "logps/chosen": -340.0, "logps/rejected": -510.0, "loss": 0.7564, "rewards/accuracies": 0.8125, "rewards/chosen": -2.109375, "rewards/margins": 1.84375, "rewards/rejected": -3.96875, "step": 10030 }, { "epoch": 0.7711213517665131, "grad_norm": 21.665117688994645, "learning_rate": 7.563281952674039e-08, "logits/chosen": -3.4375, "logits/rejected": -3.484375, "logps/chosen": -332.0, "logps/rejected": -532.0, "loss": 0.7653, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.046875, "rewards/margins": 1.8828125, "rewards/rejected": -3.921875, "step": 10040 }, { "epoch": 0.771889400921659, "grad_norm": 17.41940976025057, "learning_rate": 7.51531358812045e-08, "logits/chosen": -3.328125, "logits/rejected": -3.34375, "logps/chosen": -346.0, "logps/rejected": -520.0, "loss": 0.7238, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.8671875, "rewards/rejected": -4.0, "step": 10050 }, { "epoch": 0.7726574500768049, "grad_norm": 21.23970373012775, "learning_rate": 7.467470898986928e-08, "logits/chosen": -3.53125, "logits/rejected": -3.65625, "logps/chosen": -334.0, "logps/rejected": -488.0, "loss": 0.7498, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.09375, "rewards/margins": 1.703125, "rewards/rejected": -3.78125, "step": 10060 }, { "epoch": 0.7734254992319508, "grad_norm": 20.507600729646473, "learning_rate": 7.419754229154418e-08, "logits/chosen": -3.484375, "logits/rejected": -3.625, "logps/chosen": -332.0, "logps/rejected": -500.0, "loss": 0.7707, "rewards/accuracies": 0.84375, "rewards/chosen": -2.078125, "rewards/margins": 1.7734375, "rewards/rejected": -3.84375, "step": 10070 }, { "epoch": 0.7741935483870968, "grad_norm": 24.85609557100419, "learning_rate": 7.372163921598021e-08, "logits/chosen": -3.453125, "logits/rejected": -3.640625, "logps/chosen": -340.0, "logps/rejected": -462.0, "loss": 0.7672, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.078125, "rewards/margins": 1.515625, "rewards/rejected": -3.59375, "step": 10080 }, { "epoch": 0.7749615975422427, "grad_norm": 20.132776154966614, "learning_rate": 7.324700318384606e-08, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -334.0, "logps/rejected": -498.0, "loss": 0.7201, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.078125, "rewards/margins": 1.7109375, "rewards/rejected": -3.796875, "step": 10090 }, { "epoch": 0.7757296466973886, "grad_norm": 19.71597706877571, "learning_rate": 7.277363760670333e-08, "logits/chosen": -3.484375, "logits/rejected": -3.53125, "logps/chosen": -318.0, "logps/rejected": -480.0, "loss": 0.7822, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.03125, "rewards/margins": 1.65625, "rewards/rejected": -3.6875, "step": 10100 }, { "epoch": 0.7764976958525346, "grad_norm": 21.845252437425856, "learning_rate": 7.230154588698164e-08, "logits/chosen": -3.359375, "logits/rejected": -3.34375, "logps/chosen": -342.0, "logps/rejected": -504.0, "loss": 0.777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.171875, "rewards/margins": 1.6796875, "rewards/rejected": -3.84375, "step": 10110 }, { "epoch": 0.7772657450076805, "grad_norm": 19.85497012662462, "learning_rate": 7.183073141795493e-08, "logits/chosen": -3.4375, "logits/rejected": -3.484375, "logps/chosen": -338.0, "logps/rejected": -498.0, "loss": 0.7494, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.7578125, "rewards/rejected": -3.796875, "step": 10120 }, { "epoch": 0.7780337941628265, "grad_norm": 25.423207704722753, "learning_rate": 7.136119758371595e-08, "logits/chosen": -3.40625, "logits/rejected": -3.375, "logps/chosen": -318.0, "logps/rejected": -494.0, "loss": 0.7775, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.03125, "rewards/margins": 1.6953125, "rewards/rejected": -3.71875, "step": 10130 }, { "epoch": 0.7788018433179723, "grad_norm": 17.05457394668177, "learning_rate": 7.089294775915291e-08, "logits/chosen": -3.46875, "logits/rejected": -3.234375, "logps/chosen": -348.0, "logps/rejected": -516.0, "loss": 0.706, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.640625, "rewards/rejected": -3.828125, "step": 10140 }, { "epoch": 0.7795698924731183, "grad_norm": 25.0882124433243, "learning_rate": 7.04259853099251e-08, "logits/chosen": -3.40625, "logits/rejected": -3.5625, "logps/chosen": -330.0, "logps/rejected": -484.0, "loss": 0.7629, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.75, "rewards/rejected": -3.78125, "step": 10150 }, { "epoch": 0.7803379416282642, "grad_norm": 20.01150579446275, "learning_rate": 6.996031359243804e-08, "logits/chosen": -3.546875, "logits/rejected": -3.671875, "logps/chosen": -328.0, "logps/rejected": -468.0, "loss": 0.7723, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1875, "rewards/margins": 1.5859375, "rewards/rejected": -3.78125, "step": 10160 }, { "epoch": 0.7811059907834101, "grad_norm": 26.10318543410791, "learning_rate": 6.94959359538203e-08, "logits/chosen": -3.484375, "logits/rejected": -3.6875, "logps/chosen": -348.0, "logps/rejected": -468.0, "loss": 0.8186, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.21875, "rewards/margins": 1.4453125, "rewards/rejected": -3.65625, "step": 10170 }, { "epoch": 0.7818740399385561, "grad_norm": 18.680800926887073, "learning_rate": 6.903285573189843e-08, "logits/chosen": -3.4375, "logits/rejected": -3.484375, "logps/chosen": -338.0, "logps/rejected": -524.0, "loss": 0.6788, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.984375, "rewards/margins": 1.984375, "rewards/rejected": -3.96875, "step": 10180 }, { "epoch": 0.782642089093702, "grad_norm": 23.06878599342692, "learning_rate": 6.857107625517375e-08, "logits/chosen": -3.484375, "logits/rejected": -3.59375, "logps/chosen": -334.0, "logps/rejected": -512.0, "loss": 0.8046, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.109375, "rewards/margins": 1.828125, "rewards/rejected": -3.9375, "step": 10190 }, { "epoch": 0.783410138248848, "grad_norm": 21.98768045226996, "learning_rate": 6.811060084279827e-08, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -340.0, "logps/rejected": -496.0, "loss": 0.7466, "rewards/accuracies": 0.78125, "rewards/chosen": -2.140625, "rewards/margins": 1.6015625, "rewards/rejected": -3.75, "step": 10200 }, { "epoch": 0.7841781874039938, "grad_norm": 20.458659774339893, "learning_rate": 6.765143280455044e-08, "logits/chosen": -3.546875, "logits/rejected": -3.578125, "logps/chosen": -314.0, "logps/rejected": -476.0, "loss": 0.734, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.609375, "rewards/rejected": -3.71875, "step": 10210 }, { "epoch": 0.7849462365591398, "grad_norm": 20.74422424708975, "learning_rate": 6.7193575440812e-08, "logits/chosen": -3.5625, "logits/rejected": -3.5, "logps/chosen": -334.0, "logps/rejected": -472.0, "loss": 0.745, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.15625, "rewards/margins": 1.40625, "rewards/rejected": -3.5625, "step": 10220 }, { "epoch": 0.7857142857142857, "grad_norm": 21.743869394767035, "learning_rate": 6.673703204254347e-08, "logits/chosen": -3.59375, "logits/rejected": -3.609375, "logps/chosen": -318.0, "logps/rejected": -476.0, "loss": 0.7441, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.078125, "rewards/margins": 1.6328125, "rewards/rejected": -3.71875, "step": 10230 }, { "epoch": 0.7864823348694316, "grad_norm": 19.937040671652838, "learning_rate": 6.628180589126117e-08, "logits/chosen": -3.484375, "logits/rejected": -3.46875, "logps/chosen": -340.0, "logps/rejected": -528.0, "loss": 0.7077, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.140625, "rewards/margins": 1.90625, "rewards/rejected": -4.03125, "step": 10240 }, { "epoch": 0.7872503840245776, "grad_norm": 20.8301722432378, "learning_rate": 6.58279002590135e-08, "logits/chosen": -3.53125, "logits/rejected": -3.4375, "logps/chosen": -356.0, "logps/rejected": -544.0, "loss": 0.7574, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.9609375, "rewards/rejected": -4.15625, "step": 10250 }, { "epoch": 0.7880184331797235, "grad_norm": 23.93770947359177, "learning_rate": 6.537531840835702e-08, "logits/chosen": -3.46875, "logits/rejected": -3.609375, "logps/chosen": -336.0, "logps/rejected": -508.0, "loss": 0.7333, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.046875, "rewards/margins": 1.828125, "rewards/rejected": -3.875, "step": 10260 }, { "epoch": 0.7887864823348695, "grad_norm": 19.863269927954438, "learning_rate": 6.492406359233366e-08, "logits/chosen": -3.53125, "logits/rejected": -3.46875, "logps/chosen": -352.0, "logps/rejected": -516.0, "loss": 0.681, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.90625, "rewards/rejected": -4.0625, "step": 10270 }, { "epoch": 0.7895545314900153, "grad_norm": 20.89228973989477, "learning_rate": 6.447413905444651e-08, "logits/chosen": -3.484375, "logits/rejected": -3.390625, "logps/chosen": -334.0, "logps/rejected": -502.0, "loss": 0.7463, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.265625, "rewards/margins": 1.7265625, "rewards/rejected": -3.984375, "step": 10280 }, { "epoch": 0.7903225806451613, "grad_norm": 37.992045942759994, "learning_rate": 6.402554802863724e-08, "logits/chosen": -3.546875, "logits/rejected": -3.59375, "logps/chosen": -370.0, "logps/rejected": -498.0, "loss": 0.7656, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.265625, "rewards/margins": 1.546875, "rewards/rejected": -3.8125, "step": 10290 }, { "epoch": 0.7910906298003072, "grad_norm": 20.873211350512857, "learning_rate": 6.357829373926266e-08, "logits/chosen": -3.4375, "logits/rejected": -3.40625, "logps/chosen": -370.0, "logps/rejected": -532.0, "loss": 0.721, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.375, "rewards/margins": 1.8359375, "rewards/rejected": -4.21875, "step": 10300 }, { "epoch": 0.7918586789554531, "grad_norm": 18.999172768823474, "learning_rate": 6.313237940107122e-08, "logits/chosen": -3.46875, "logits/rejected": -3.609375, "logps/chosen": -354.0, "logps/rejected": -516.0, "loss": 0.8059, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.125, "rewards/margins": 1.8515625, "rewards/rejected": -3.984375, "step": 10310 }, { "epoch": 0.7926267281105991, "grad_norm": 20.295343626777687, "learning_rate": 6.268780821918043e-08, "logits/chosen": -3.5625, "logits/rejected": -3.53125, "logps/chosen": -360.0, "logps/rejected": -500.0, "loss": 0.753, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.375, "rewards/margins": 1.578125, "rewards/rejected": -3.953125, "step": 10320 }, { "epoch": 0.793394777265745, "grad_norm": 18.41060602642733, "learning_rate": 6.224458338905325e-08, "logits/chosen": -3.453125, "logits/rejected": -3.5, "logps/chosen": -348.0, "logps/rejected": -528.0, "loss": 0.6881, "rewards/accuracies": 0.84375, "rewards/chosen": -2.203125, "rewards/margins": 1.8828125, "rewards/rejected": -4.09375, "step": 10330 }, { "epoch": 0.794162826420891, "grad_norm": 19.323168047940346, "learning_rate": 6.180270809647545e-08, "logits/chosen": -3.390625, "logits/rejected": -3.5625, "logps/chosen": -348.0, "logps/rejected": -524.0, "loss": 0.6786, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.96875, "rewards/margins": 2.015625, "rewards/rejected": -3.984375, "step": 10340 }, { "epoch": 0.7949308755760369, "grad_norm": 21.51991454645188, "learning_rate": 6.136218551753297e-08, "logits/chosen": -3.484375, "logits/rejected": -3.53125, "logps/chosen": -368.0, "logps/rejected": -524.0, "loss": 0.7111, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.296875, "rewards/margins": 1.6953125, "rewards/rejected": -3.984375, "step": 10350 }, { "epoch": 0.7956989247311828, "grad_norm": 20.72353973878255, "learning_rate": 6.092301881858841e-08, "logits/chosen": -3.46875, "logits/rejected": -3.53125, "logps/chosen": -354.0, "logps/rejected": -494.0, "loss": 0.7309, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.359375, "rewards/margins": 1.3828125, "rewards/rejected": -3.734375, "step": 10360 }, { "epoch": 0.7964669738863287, "grad_norm": 20.66116698207495, "learning_rate": 6.048521115625904e-08, "logits/chosen": -3.546875, "logits/rejected": -3.5, "logps/chosen": -360.0, "logps/rejected": -506.0, "loss": 0.7523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.28125, "rewards/margins": 1.5390625, "rewards/rejected": -3.828125, "step": 10370 }, { "epoch": 0.7972350230414746, "grad_norm": 21.90924332269246, "learning_rate": 6.004876567739334e-08, "logits/chosen": -3.515625, "logits/rejected": -3.578125, "logps/chosen": -340.0, "logps/rejected": -498.0, "loss": 0.7548, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.265625, "rewards/margins": 1.609375, "rewards/rejected": -3.890625, "step": 10380 }, { "epoch": 0.7980030721966206, "grad_norm": 17.66873766087295, "learning_rate": 5.961368551904886e-08, "logits/chosen": -3.5, "logits/rejected": -3.640625, "logps/chosen": -358.0, "logps/rejected": -516.0, "loss": 0.733, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.234375, "rewards/margins": 1.7890625, "rewards/rejected": -4.03125, "step": 10390 }, { "epoch": 0.7987711213517665, "grad_norm": 24.42723517323317, "learning_rate": 5.9179973808469866e-08, "logits/chosen": -3.578125, "logits/rejected": -3.453125, "logps/chosen": -336.0, "logps/rejected": -540.0, "loss": 0.7412, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.859375, "rewards/rejected": -4.03125, "step": 10400 }, { "epoch": 0.7995391705069125, "grad_norm": 25.5220558082019, "learning_rate": 5.8747633663064204e-08, "logits/chosen": -3.59375, "logits/rejected": -3.65625, "logps/chosen": -314.0, "logps/rejected": -474.0, "loss": 0.7942, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.078125, "rewards/margins": 1.765625, "rewards/rejected": -3.84375, "step": 10410 }, { "epoch": 0.8003072196620584, "grad_norm": 19.86662005484429, "learning_rate": 5.831666819038161e-08, "logits/chosen": -3.484375, "logits/rejected": -3.609375, "logps/chosen": -352.0, "logps/rejected": -524.0, "loss": 0.7282, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.90625, "rewards/rejected": -4.09375, "step": 10420 }, { "epoch": 0.8010752688172043, "grad_norm": 20.105314763070112, "learning_rate": 5.7887080488090685e-08, "logits/chosen": -3.421875, "logits/rejected": -3.625, "logps/chosen": -330.0, "logps/rejected": -472.0, "loss": 0.7307, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.046875, "rewards/margins": 1.6171875, "rewards/rejected": -3.65625, "step": 10430 }, { "epoch": 0.8018433179723502, "grad_norm": 23.84046017160219, "learning_rate": 5.745887364395707e-08, "logits/chosen": -3.65625, "logits/rejected": -3.75, "logps/chosen": -338.0, "logps/rejected": -494.0, "loss": 0.7563, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.203125, "rewards/margins": 1.765625, "rewards/rejected": -3.96875, "step": 10440 }, { "epoch": 0.8026113671274961, "grad_norm": 22.22595400073367, "learning_rate": 5.703205073582137e-08, "logits/chosen": -3.453125, "logits/rejected": -3.5625, "logps/chosen": -350.0, "logps/rejected": -520.0, "loss": 0.7692, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1875, "rewards/margins": 1.84375, "rewards/rejected": -4.03125, "step": 10450 }, { "epoch": 0.8033794162826421, "grad_norm": 19.851551996336433, "learning_rate": 5.660661483157653e-08, "logits/chosen": -3.359375, "logits/rejected": -3.4375, "logps/chosen": -348.0, "logps/rejected": -498.0, "loss": 0.7398, "rewards/accuracies": 0.8125, "rewards/chosen": -2.078125, "rewards/margins": 1.7890625, "rewards/rejected": -3.875, "step": 10460 }, { "epoch": 0.804147465437788, "grad_norm": 21.97265041668777, "learning_rate": 5.618256898914642e-08, "logits/chosen": -3.34375, "logits/rejected": -3.5, "logps/chosen": -326.0, "logps/rejected": -468.0, "loss": 0.7654, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.9921875, "rewards/margins": 1.609375, "rewards/rejected": -3.59375, "step": 10470 }, { "epoch": 0.804915514592934, "grad_norm": 26.04671821402376, "learning_rate": 5.5759916256463005e-08, "logits/chosen": -3.5, "logits/rejected": -3.578125, "logps/chosen": -338.0, "logps/rejected": -492.0, "loss": 0.7577, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.078125, "rewards/margins": 1.65625, "rewards/rejected": -3.734375, "step": 10480 }, { "epoch": 0.8056835637480799, "grad_norm": 18.4154421359079, "learning_rate": 5.5338659671445245e-08, "logits/chosen": -3.5, "logits/rejected": -3.40625, "logps/chosen": -348.0, "logps/rejected": -492.0, "loss": 0.7574, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.140625, "rewards/margins": 1.65625, "rewards/rejected": -3.796875, "step": 10490 }, { "epoch": 0.8064516129032258, "grad_norm": 21.779414995266336, "learning_rate": 5.4918802261977067e-08, "logits/chosen": -3.328125, "logits/rejected": -3.484375, "logps/chosen": -328.0, "logps/rejected": -458.0, "loss": 0.7267, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.015625, "rewards/margins": 1.53125, "rewards/rejected": -3.546875, "step": 10500 }, { "epoch": 0.8072196620583717, "grad_norm": 21.01147479087384, "learning_rate": 5.4500347045885184e-08, "logits/chosen": -3.390625, "logits/rejected": -3.453125, "logps/chosen": -340.0, "logps/rejected": -520.0, "loss": 0.7787, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.109375, "rewards/margins": 1.8046875, "rewards/rejected": -3.921875, "step": 10510 }, { "epoch": 0.8079877112135176, "grad_norm": 19.287745225173087, "learning_rate": 5.4083297030918047e-08, "logits/chosen": -3.5, "logits/rejected": -3.578125, "logps/chosen": -340.0, "logps/rejected": -486.0, "loss": 0.7661, "rewards/accuracies": 0.8125, "rewards/chosen": -2.25, "rewards/margins": 1.6171875, "rewards/rejected": -3.859375, "step": 10520 }, { "epoch": 0.8087557603686636, "grad_norm": 20.683051290347155, "learning_rate": 5.3667655214723494e-08, "logits/chosen": -3.515625, "logits/rejected": -3.59375, "logps/chosen": -356.0, "logps/rejected": -512.0, "loss": 0.7434, "rewards/accuracies": 0.84375, "rewards/chosen": -2.09375, "rewards/margins": 1.6015625, "rewards/rejected": -3.703125, "step": 10530 }, { "epoch": 0.8095238095238095, "grad_norm": 24.09604335854086, "learning_rate": 5.325342458482779e-08, "logits/chosen": -3.390625, "logits/rejected": -3.359375, "logps/chosen": -328.0, "logps/rejected": -496.0, "loss": 0.7444, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.0625, "rewards/margins": 1.6640625, "rewards/rejected": -3.71875, "step": 10540 }, { "epoch": 0.8102918586789555, "grad_norm": 20.704145080558014, "learning_rate": 5.284060811861407e-08, "logits/chosen": -3.546875, "logits/rejected": -3.578125, "logps/chosen": -344.0, "logps/rejected": -474.0, "loss": 0.7395, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.546875, "rewards/rejected": -3.734375, "step": 10550 }, { "epoch": 0.8110599078341014, "grad_norm": 18.693519138438024, "learning_rate": 5.242920878330059e-08, "logits/chosen": -3.59375, "logits/rejected": -3.640625, "logps/chosen": -340.0, "logps/rejected": -496.0, "loss": 0.7229, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.28125, "rewards/margins": 1.65625, "rewards/rejected": -3.9375, "step": 10560 }, { "epoch": 0.8118279569892473, "grad_norm": 21.006625775744943, "learning_rate": 5.2019229535919944e-08, "logits/chosen": -3.546875, "logits/rejected": -3.609375, "logps/chosen": -314.0, "logps/rejected": -480.0, "loss": 0.7685, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.0, "rewards/margins": 1.7421875, "rewards/rejected": -3.75, "step": 10570 }, { "epoch": 0.8125960061443932, "grad_norm": 19.776400382325697, "learning_rate": 5.161067332329708e-08, "logits/chosen": -3.53125, "logits/rejected": -3.71875, "logps/chosen": -358.0, "logps/rejected": -470.0, "loss": 0.7299, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.21875, "rewards/margins": 1.4609375, "rewards/rejected": -3.671875, "step": 10580 }, { "epoch": 0.8133640552995391, "grad_norm": 21.53240232921702, "learning_rate": 5.120354308202893e-08, "logits/chosen": -3.5, "logits/rejected": -3.578125, "logps/chosen": -350.0, "logps/rejected": -498.0, "loss": 0.7422, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.234375, "rewards/margins": 1.59375, "rewards/rejected": -3.828125, "step": 10590 }, { "epoch": 0.8141321044546851, "grad_norm": 20.189951433179818, "learning_rate": 5.079784173846269e-08, "logits/chosen": -3.515625, "logits/rejected": -3.65625, "logps/chosen": -326.0, "logps/rejected": -496.0, "loss": 0.7277, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.078125, "rewards/margins": 1.8671875, "rewards/rejected": -3.9375, "step": 10600 }, { "epoch": 0.814900153609831, "grad_norm": 17.296918360048586, "learning_rate": 5.039357220867499e-08, "logits/chosen": -3.546875, "logits/rejected": -3.609375, "logps/chosen": -364.0, "logps/rejected": -516.0, "loss": 0.7293, "rewards/accuracies": 0.8125, "rewards/chosen": -2.21875, "rewards/margins": 1.6875, "rewards/rejected": -3.90625, "step": 10610 }, { "epoch": 0.815668202764977, "grad_norm": 18.90345224571788, "learning_rate": 4.9990737398451246e-08, "logits/chosen": -3.421875, "logits/rejected": -3.375, "logps/chosen": -338.0, "logps/rejected": -510.0, "loss": 0.7651, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0625, "rewards/margins": 1.8203125, "rewards/rejected": -3.875, "step": 10620 }, { "epoch": 0.8164362519201229, "grad_norm": 23.029013838392526, "learning_rate": 4.9589340203263996e-08, "logits/chosen": -3.515625, "logits/rejected": -3.359375, "logps/chosen": -350.0, "logps/rejected": -510.0, "loss": 0.7607, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.234375, "rewards/margins": 1.609375, "rewards/rejected": -3.859375, "step": 10630 }, { "epoch": 0.8172043010752689, "grad_norm": 21.95641376890666, "learning_rate": 4.9189383508253025e-08, "logits/chosen": -3.515625, "logits/rejected": -3.40625, "logps/chosen": -352.0, "logps/rejected": -516.0, "loss": 0.7868, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.296875, "rewards/margins": 1.71875, "rewards/rejected": -4.0, "step": 10640 }, { "epoch": 0.8179723502304147, "grad_norm": 22.908599234654115, "learning_rate": 4.879087018820394e-08, "logits/chosen": -3.453125, "logits/rejected": -3.625, "logps/chosen": -374.0, "logps/rejected": -528.0, "loss": 0.7601, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.453125, "rewards/margins": 1.6953125, "rewards/rejected": -4.15625, "step": 10650 }, { "epoch": 0.8187403993855606, "grad_norm": 23.07938021076233, "learning_rate": 4.839380310752772e-08, "logits/chosen": -3.375, "logits/rejected": -3.484375, "logps/chosen": -340.0, "logps/rejected": -496.0, "loss": 0.7466, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.234375, "rewards/margins": 1.6171875, "rewards/rejected": -3.84375, "step": 10660 }, { "epoch": 0.8195084485407066, "grad_norm": 23.99943525600204, "learning_rate": 4.799818512024037e-08, "logits/chosen": -3.390625, "logits/rejected": -3.46875, "logps/chosen": -346.0, "logps/rejected": -510.0, "loss": 0.7755, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.15625, "rewards/margins": 1.8046875, "rewards/rejected": -3.953125, "step": 10670 }, { "epoch": 0.8202764976958525, "grad_norm": 20.676817915648662, "learning_rate": 4.7604019069941823e-08, "logits/chosen": -3.53125, "logits/rejected": -3.546875, "logps/chosen": -332.0, "logps/rejected": -482.0, "loss": 0.7186, "rewards/accuracies": 0.84375, "rewards/chosen": -2.140625, "rewards/margins": 1.6015625, "rewards/rejected": -3.75, "step": 10680 }, { "epoch": 0.8210445468509985, "grad_norm": 23.03945653837004, "learning_rate": 4.721130778979623e-08, "logits/chosen": -3.46875, "logits/rejected": -3.421875, "logps/chosen": -340.0, "logps/rejected": -502.0, "loss": 0.7405, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.015625, "rewards/margins": 1.796875, "rewards/rejected": -3.8125, "step": 10690 }, { "epoch": 0.8218125960061444, "grad_norm": 21.688932005657374, "learning_rate": 4.682005410251105e-08, "logits/chosen": -3.4375, "logits/rejected": -3.53125, "logps/chosen": -326.0, "logps/rejected": -480.0, "loss": 0.7379, "rewards/accuracies": 0.84375, "rewards/chosen": -2.078125, "rewards/margins": 1.6640625, "rewards/rejected": -3.734375, "step": 10700 }, { "epoch": 0.8225806451612904, "grad_norm": 21.634757553184382, "learning_rate": 4.643026082031684e-08, "logits/chosen": -3.46875, "logits/rejected": -3.5, "logps/chosen": -314.0, "logps/rejected": -476.0, "loss": 0.785, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9609375, "rewards/margins": 1.578125, "rewards/rejected": -3.546875, "step": 10710 }, { "epoch": 0.8233486943164362, "grad_norm": 21.352495805492463, "learning_rate": 4.60419307449475e-08, "logits/chosen": -3.515625, "logits/rejected": -3.671875, "logps/chosen": -350.0, "logps/rejected": -488.0, "loss": 0.7743, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.390625, "rewards/margins": 1.53125, "rewards/rejected": -3.921875, "step": 10720 }, { "epoch": 0.8241167434715821, "grad_norm": 24.654663383675516, "learning_rate": 4.565506666761923e-08, "logits/chosen": -3.515625, "logits/rejected": -3.46875, "logps/chosen": -336.0, "logps/rejected": -490.0, "loss": 0.7321, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.015625, "rewards/margins": 1.5546875, "rewards/rejected": -3.5625, "step": 10730 }, { "epoch": 0.8248847926267281, "grad_norm": 22.842217808221488, "learning_rate": 4.526967136901155e-08, "logits/chosen": -3.40625, "logits/rejected": -3.546875, "logps/chosen": -350.0, "logps/rejected": -496.0, "loss": 0.6933, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.203125, "rewards/margins": 1.671875, "rewards/rejected": -3.890625, "step": 10740 }, { "epoch": 0.825652841781874, "grad_norm": 26.41847434087248, "learning_rate": 4.4885747619246516e-08, "logits/chosen": -3.546875, "logits/rejected": -3.5625, "logps/chosen": -352.0, "logps/rejected": -528.0, "loss": 0.7152, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.171875, "rewards/margins": 1.7890625, "rewards/rejected": -3.96875, "step": 10750 }, { "epoch": 0.82642089093702, "grad_norm": 19.626614291103454, "learning_rate": 4.4503298177868965e-08, "logits/chosen": -3.515625, "logits/rejected": -3.53125, "logps/chosen": -356.0, "logps/rejected": -498.0, "loss": 0.7506, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.21875, "rewards/margins": 1.6484375, "rewards/rejected": -3.875, "step": 10760 }, { "epoch": 0.8271889400921659, "grad_norm": 18.90059868865278, "learning_rate": 4.412232579382716e-08, "logits/chosen": -3.484375, "logits/rejected": -3.5625, "logps/chosen": -330.0, "logps/rejected": -478.0, "loss": 0.7234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.6171875, "rewards/rejected": -3.703125, "step": 10770 }, { "epoch": 0.8279569892473119, "grad_norm": 18.62091665086436, "learning_rate": 4.374283320545216e-08, "logits/chosen": -3.5625, "logits/rejected": -3.625, "logps/chosen": -336.0, "logps/rejected": -488.0, "loss": 0.721, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.171875, "rewards/margins": 1.625, "rewards/rejected": -3.796875, "step": 10780 }, { "epoch": 0.8287250384024577, "grad_norm": 20.23039275881528, "learning_rate": 4.33648231404391e-08, "logits/chosen": -3.546875, "logits/rejected": -3.625, "logps/chosen": -352.0, "logps/rejected": -516.0, "loss": 0.7421, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.7578125, "rewards/rejected": -3.921875, "step": 10790 }, { "epoch": 0.8294930875576036, "grad_norm": 17.755307627976652, "learning_rate": 4.298829831582681e-08, "logits/chosen": -3.578125, "logits/rejected": -3.8125, "logps/chosen": -356.0, "logps/rejected": -506.0, "loss": 0.7182, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.375, "rewards/margins": 1.5703125, "rewards/rejected": -3.9375, "step": 10800 }, { "epoch": 0.8302611367127496, "grad_norm": 19.153260366011878, "learning_rate": 4.261326143797867e-08, "logits/chosen": -3.53125, "logits/rejected": -3.53125, "logps/chosen": -362.0, "logps/rejected": -560.0, "loss": 0.6906, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.9609375, "rewards/rejected": -4.125, "step": 10810 }, { "epoch": 0.8310291858678955, "grad_norm": 21.225628856191285, "learning_rate": 4.223971520256328e-08, "logits/chosen": -3.53125, "logits/rejected": -3.484375, "logps/chosen": -310.0, "logps/rejected": -516.0, "loss": 0.704, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.0, "rewards/margins": 1.9296875, "rewards/rejected": -3.9375, "step": 10820 }, { "epoch": 0.8317972350230415, "grad_norm": 18.584190021341897, "learning_rate": 4.186766229453448e-08, "logits/chosen": -3.46875, "logits/rejected": -3.84375, "logps/chosen": -340.0, "logps/rejected": -504.0, "loss": 0.7496, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.171875, "rewards/margins": 1.734375, "rewards/rejected": -3.921875, "step": 10830 }, { "epoch": 0.8325652841781874, "grad_norm": 20.644540874005326, "learning_rate": 4.1497105388112854e-08, "logits/chosen": -3.640625, "logits/rejected": -3.78125, "logps/chosen": -348.0, "logps/rejected": -516.0, "loss": 0.7061, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.28125, "rewards/margins": 1.859375, "rewards/rejected": -4.15625, "step": 10840 }, { "epoch": 0.8333333333333334, "grad_norm": 22.780253007726607, "learning_rate": 4.112804714676593e-08, "logits/chosen": -3.5, "logits/rejected": -3.5, "logps/chosen": -350.0, "logps/rejected": -488.0, "loss": 0.785, "rewards/accuracies": 0.8125, "rewards/chosen": -2.25, "rewards/margins": 1.4375, "rewards/rejected": -3.6875, "step": 10850 }, { "epoch": 0.8341013824884793, "grad_norm": 18.496243265564782, "learning_rate": 4.0760490223189145e-08, "logits/chosen": -3.4375, "logits/rejected": -3.53125, "logps/chosen": -338.0, "logps/rejected": -498.0, "loss": 0.7059, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.6328125, "rewards/rejected": -3.703125, "step": 10860 }, { "epoch": 0.8348694316436251, "grad_norm": 20.2793097008956, "learning_rate": 4.039443725928712e-08, "logits/chosen": -3.53125, "logits/rejected": -3.875, "logps/chosen": -338.0, "logps/rejected": -494.0, "loss": 0.6811, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.296875, "rewards/margins": 1.78125, "rewards/rejected": -4.0625, "step": 10870 }, { "epoch": 0.8356374807987711, "grad_norm": 22.80536117048858, "learning_rate": 4.0029890886154024e-08, "logits/chosen": -3.421875, "logits/rejected": -3.640625, "logps/chosen": -336.0, "logps/rejected": -516.0, "loss": 0.7209, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9765625, "rewards/margins": 2.03125, "rewards/rejected": -4.0, "step": 10880 }, { "epoch": 0.836405529953917, "grad_norm": 19.429087331552125, "learning_rate": 3.96668537240554e-08, "logits/chosen": -3.65625, "logits/rejected": -3.609375, "logps/chosen": -342.0, "logps/rejected": -504.0, "loss": 0.7022, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.140625, "rewards/margins": 1.78125, "rewards/rejected": -3.921875, "step": 10890 }, { "epoch": 0.837173579109063, "grad_norm": 20.07273453785196, "learning_rate": 3.930532838240874e-08, "logits/chosen": -3.59375, "logits/rejected": -3.546875, "logps/chosen": -360.0, "logps/rejected": -524.0, "loss": 0.776, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.296875, "rewards/margins": 1.6875, "rewards/rejected": -3.984375, "step": 10900 }, { "epoch": 0.8379416282642089, "grad_norm": 26.83932824507004, "learning_rate": 3.894531745976501e-08, "logits/chosen": -3.484375, "logits/rejected": -3.578125, "logps/chosen": -358.0, "logps/rejected": -520.0, "loss": 0.7603, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1875, "rewards/margins": 1.7265625, "rewards/rejected": -3.90625, "step": 10910 }, { "epoch": 0.8387096774193549, "grad_norm": 18.194536588748424, "learning_rate": 3.8586823543790116e-08, "logits/chosen": -3.46875, "logits/rejected": -3.75, "logps/chosen": -342.0, "logps/rejected": -548.0, "loss": 0.7218, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.109375, "rewards/margins": 2.140625, "rewards/rejected": -4.25, "step": 10920 }, { "epoch": 0.8394777265745008, "grad_norm": 20.176347730854477, "learning_rate": 3.822984921124567e-08, "logits/chosen": -3.609375, "logits/rejected": -3.765625, "logps/chosen": -370.0, "logps/rejected": -524.0, "loss": 0.6859, "rewards/accuracies": 0.8125, "rewards/chosen": -2.328125, "rewards/margins": 1.75, "rewards/rejected": -4.09375, "step": 10930 }, { "epoch": 0.8402457757296466, "grad_norm": 19.88253909274576, "learning_rate": 3.787439702797138e-08, "logits/chosen": -3.546875, "logits/rejected": -3.640625, "logps/chosen": -352.0, "logps/rejected": -528.0, "loss": 0.7194, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.078125, "rewards/margins": 2.0, "rewards/rejected": -4.0625, "step": 10940 }, { "epoch": 0.8410138248847926, "grad_norm": 19.493885243880705, "learning_rate": 3.752046954886587e-08, "logits/chosen": -3.5625, "logits/rejected": -3.53125, "logps/chosen": -350.0, "logps/rejected": -516.0, "loss": 0.7218, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.3125, "rewards/margins": 1.6484375, "rewards/rejected": -3.953125, "step": 10950 }, { "epoch": 0.8417818740399385, "grad_norm": 19.18815991336529, "learning_rate": 3.716806931786862e-08, "logits/chosen": -3.578125, "logits/rejected": -3.671875, "logps/chosen": -354.0, "logps/rejected": -482.0, "loss": 0.7573, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.296875, "rewards/margins": 1.46875, "rewards/rejected": -3.765625, "step": 10960 }, { "epoch": 0.8425499231950845, "grad_norm": 20.623885721971156, "learning_rate": 3.681719886794185e-08, "logits/chosen": -3.59375, "logits/rejected": -3.4375, "logps/chosen": -332.0, "logps/rejected": -506.0, "loss": 0.7612, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.21875, "rewards/margins": 1.6171875, "rewards/rejected": -3.84375, "step": 10970 }, { "epoch": 0.8433179723502304, "grad_norm": 20.985806456647612, "learning_rate": 3.6467860721051654e-08, "logits/chosen": -3.46875, "logits/rejected": -3.546875, "logps/chosen": -342.0, "logps/rejected": -502.0, "loss": 0.7461, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.09375, "rewards/margins": 1.8203125, "rewards/rejected": -3.921875, "step": 10980 }, { "epoch": 0.8440860215053764, "grad_norm": 21.04204747855968, "learning_rate": 3.6120057388150827e-08, "logits/chosen": -3.671875, "logits/rejected": -3.5625, "logps/chosen": -294.0, "logps/rejected": -472.0, "loss": 0.7201, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9765625, "rewards/margins": 1.6796875, "rewards/rejected": -3.65625, "step": 10990 }, { "epoch": 0.8448540706605223, "grad_norm": 22.410304023624995, "learning_rate": 3.5773791369160036e-08, "logits/chosen": -3.59375, "logits/rejected": -3.734375, "logps/chosen": -344.0, "logps/rejected": -516.0, "loss": 0.7748, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.9375, "rewards/rejected": -4.09375, "step": 11000 }, { "epoch": 0.8448540706605223, "eval_logits/chosen": -3.46875, "eval_logits/rejected": -3.625, "eval_logps/chosen": -382.0, "eval_logps/rejected": -494.0, "eval_loss": 0.45760753750801086, "eval_rewards/accuracies": 0.7621051073074341, "eval_rewards/chosen": -2.40625, "eval_rewards/margins": 1.4140625, "eval_rewards/rejected": -3.8125, "eval_runtime": 2264.7079, "eval_samples_per_second": 41.125, "eval_steps_per_second": 0.643, "step": 11000 }, { "epoch": 0.8456221198156681, "grad_norm": 21.505370905608842, "learning_rate": 3.5429065152950106e-08, "logits/chosen": -3.5625, "logits/rejected": -3.4375, "logps/chosen": -350.0, "logps/rejected": -528.0, "loss": 0.7634, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.28125, "rewards/margins": 1.8125, "rewards/rejected": -4.09375, "step": 11010 }, { "epoch": 0.8463901689708141, "grad_norm": 18.91387591505148, "learning_rate": 3.508588121732442e-08, "logits/chosen": -3.4375, "logits/rejected": -3.578125, "logps/chosen": -350.0, "logps/rejected": -512.0, "loss": 0.7462, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1875, "rewards/margins": 1.7734375, "rewards/rejected": -3.96875, "step": 11020 }, { "epoch": 0.84715821812596, "grad_norm": 23.35461163994808, "learning_rate": 3.474424202900045e-08, "logits/chosen": -3.515625, "logits/rejected": -3.640625, "logps/chosen": -320.0, "logps/rejected": -482.0, "loss": 0.7108, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.9609375, "rewards/margins": 1.859375, "rewards/rejected": -3.828125, "step": 11030 }, { "epoch": 0.847926267281106, "grad_norm": 20.246713904732324, "learning_rate": 3.440415004359282e-08, "logits/chosen": -3.484375, "logits/rejected": -3.65625, "logps/chosen": -318.0, "logps/rejected": -460.0, "loss": 0.6891, "rewards/accuracies": 0.8125, "rewards/chosen": -1.921875, "rewards/margins": 1.609375, "rewards/rejected": -3.53125, "step": 11040 }, { "epoch": 0.8486943164362519, "grad_norm": 24.14430129291793, "learning_rate": 3.4065607705594995e-08, "logits/chosen": -3.5625, "logits/rejected": -3.640625, "logps/chosen": -352.0, "logps/rejected": -502.0, "loss": 0.7517, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.265625, "rewards/margins": 1.6171875, "rewards/rejected": -3.890625, "step": 11050 }, { "epoch": 0.8494623655913979, "grad_norm": 18.156272566519615, "learning_rate": 3.372861744836206e-08, "logits/chosen": -3.59375, "logits/rejected": -3.421875, "logps/chosen": -348.0, "logps/rejected": -536.0, "loss": 0.7488, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.21875, "rewards/margins": 1.875, "rewards/rejected": -4.09375, "step": 11060 }, { "epoch": 0.8502304147465438, "grad_norm": 21.8540369008214, "learning_rate": 3.339318169409322e-08, "logits/chosen": -3.40625, "logits/rejected": -3.65625, "logps/chosen": -364.0, "logps/rejected": -524.0, "loss": 0.7242, "rewards/accuracies": 0.8125, "rewards/chosen": -2.296875, "rewards/margins": 1.8203125, "rewards/rejected": -4.125, "step": 11070 }, { "epoch": 0.8509984639016898, "grad_norm": 42.563097588477035, "learning_rate": 3.305930285381406e-08, "logits/chosen": -3.53125, "logits/rejected": -3.546875, "logps/chosen": -340.0, "logps/rejected": -508.0, "loss": 0.7818, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.125, "rewards/margins": 1.7265625, "rewards/rejected": -3.84375, "step": 11080 }, { "epoch": 0.8517665130568356, "grad_norm": 19.075315599689336, "learning_rate": 3.272698332735982e-08, "logits/chosen": -3.640625, "logits/rejected": -3.40625, "logps/chosen": -318.0, "logps/rejected": -516.0, "loss": 0.75, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.140625, "rewards/margins": 1.90625, "rewards/rejected": -4.03125, "step": 11090 }, { "epoch": 0.8525345622119815, "grad_norm": 17.614259398872168, "learning_rate": 3.239622550335755e-08, "logits/chosen": -3.625, "logits/rejected": -3.59375, "logps/chosen": -326.0, "logps/rejected": -488.0, "loss": 0.7263, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.203125, "rewards/margins": 1.78125, "rewards/rejected": -3.96875, "step": 11100 }, { "epoch": 0.8533026113671275, "grad_norm": 21.87882490819807, "learning_rate": 3.206703175920919e-08, "logits/chosen": -3.5625, "logits/rejected": -3.4375, "logps/chosen": -328.0, "logps/rejected": -492.0, "loss": 0.7427, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.09375, "rewards/margins": 1.7421875, "rewards/rejected": -3.828125, "step": 11110 }, { "epoch": 0.8540706605222734, "grad_norm": 18.196662514465636, "learning_rate": 3.17394044610747e-08, "logits/chosen": -3.53125, "logits/rejected": -3.546875, "logps/chosen": -354.0, "logps/rejected": -528.0, "loss": 0.7228, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.25, "rewards/margins": 1.7890625, "rewards/rejected": -4.03125, "step": 11120 }, { "epoch": 0.8548387096774194, "grad_norm": 22.41519391985988, "learning_rate": 3.141334596385447e-08, "logits/chosen": -3.453125, "logits/rejected": -3.46875, "logps/chosen": -350.0, "logps/rejected": -544.0, "loss": 0.7421, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.234375, "rewards/margins": 1.96875, "rewards/rejected": -4.1875, "step": 11130 }, { "epoch": 0.8556067588325653, "grad_norm": 22.094383440374468, "learning_rate": 3.1088858611173105e-08, "logits/chosen": -3.53125, "logits/rejected": -3.765625, "logps/chosen": -352.0, "logps/rejected": -528.0, "loss": 0.7461, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.234375, "rewards/margins": 1.984375, "rewards/rejected": -4.21875, "step": 11140 }, { "epoch": 0.8563748079877113, "grad_norm": 17.68312703035504, "learning_rate": 3.076594473536195e-08, "logits/chosen": -3.5, "logits/rejected": -3.296875, "logps/chosen": -364.0, "logps/rejected": -544.0, "loss": 0.7607, "rewards/accuracies": 0.8125, "rewards/chosen": -2.390625, "rewards/margins": 1.7109375, "rewards/rejected": -4.09375, "step": 11150 }, { "epoch": 0.8571428571428571, "grad_norm": 20.216795165982298, "learning_rate": 3.044460665744283e-08, "logits/chosen": -3.46875, "logits/rejected": -3.5, "logps/chosen": -340.0, "logps/rejected": -516.0, "loss": 0.7067, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.75, "rewards/rejected": -3.859375, "step": 11160 }, { "epoch": 0.857910906298003, "grad_norm": 18.48441405634118, "learning_rate": 3.01248466871111e-08, "logits/chosen": -3.484375, "logits/rejected": -3.546875, "logps/chosen": -308.0, "logps/rejected": -474.0, "loss": 0.7156, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.828125, "rewards/margins": 1.75, "rewards/rejected": -3.578125, "step": 11170 }, { "epoch": 0.858678955453149, "grad_norm": 24.938654382907483, "learning_rate": 2.9806667122718815e-08, "logits/chosen": -3.46875, "logits/rejected": -3.40625, "logps/chosen": -316.0, "logps/rejected": -508.0, "loss": 0.7139, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.984375, "rewards/margins": 2.0, "rewards/rejected": -3.984375, "step": 11180 }, { "epoch": 0.8594470046082949, "grad_norm": 19.9133258498516, "learning_rate": 2.9490070251258825e-08, "logits/chosen": -3.453125, "logits/rejected": -3.546875, "logps/chosen": -316.0, "logps/rejected": -480.0, "loss": 0.7334, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.109375, "rewards/margins": 1.65625, "rewards/rejected": -3.765625, "step": 11190 }, { "epoch": 0.8602150537634409, "grad_norm": 19.194891465910217, "learning_rate": 2.91750583483478e-08, "logits/chosen": -3.4375, "logits/rejected": -3.515625, "logps/chosen": -340.0, "logps/rejected": -520.0, "loss": 0.7325, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.09375, "rewards/margins": 1.9609375, "rewards/rejected": -4.0625, "step": 11200 }, { "epoch": 0.8609831029185868, "grad_norm": 19.145922796508, "learning_rate": 2.8861633678210208e-08, "logits/chosen": -3.5, "logits/rejected": -3.8125, "logps/chosen": -366.0, "logps/rejected": -482.0, "loss": 0.7351, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.25, "rewards/margins": 1.453125, "rewards/rejected": -3.703125, "step": 11210 }, { "epoch": 0.8617511520737328, "grad_norm": 24.502947181799183, "learning_rate": 2.8549798493661793e-08, "logits/chosen": -3.453125, "logits/rejected": -3.421875, "logps/chosen": -326.0, "logps/rejected": -500.0, "loss": 0.7994, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.0625, "rewards/margins": 1.75, "rewards/rejected": -3.8125, "step": 11220 }, { "epoch": 0.8625192012288786, "grad_norm": 24.58020320119817, "learning_rate": 2.8239555036093348e-08, "logits/chosen": -3.609375, "logits/rejected": -3.40625, "logps/chosen": -316.0, "logps/rejected": -520.0, "loss": 0.7182, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.09375, "rewards/margins": 2.03125, "rewards/rejected": -4.125, "step": 11230 }, { "epoch": 0.8632872503840245, "grad_norm": 21.419037855221163, "learning_rate": 2.7930905535455047e-08, "logits/chosen": -3.515625, "logits/rejected": -3.796875, "logps/chosen": -354.0, "logps/rejected": -524.0, "loss": 0.7482, "rewards/accuracies": 0.84375, "rewards/chosen": -2.171875, "rewards/margins": 1.875, "rewards/rejected": -4.0625, "step": 11240 }, { "epoch": 0.8640552995391705, "grad_norm": 18.43278639143147, "learning_rate": 2.7623852210239883e-08, "logits/chosen": -3.65625, "logits/rejected": -3.59375, "logps/chosen": -332.0, "logps/rejected": -510.0, "loss": 0.7261, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.21875, "rewards/margins": 1.78125, "rewards/rejected": -4.0, "step": 11250 }, { "epoch": 0.8648233486943164, "grad_norm": 21.259229219584974, "learning_rate": 2.731839726746818e-08, "logits/chosen": -3.5625, "logits/rejected": -3.796875, "logps/chosen": -318.0, "logps/rejected": -504.0, "loss": 0.7518, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.9921875, "rewards/margins": 2.03125, "rewards/rejected": -4.03125, "step": 11260 }, { "epoch": 0.8655913978494624, "grad_norm": 23.71900006873396, "learning_rate": 2.701454290267116e-08, "logits/chosen": -3.65625, "logits/rejected": -3.640625, "logps/chosen": -368.0, "logps/rejected": -500.0, "loss": 0.7474, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.34375, "rewards/margins": 1.53125, "rewards/rejected": -3.875, "step": 11270 }, { "epoch": 0.8663594470046083, "grad_norm": 17.823206963841976, "learning_rate": 2.6712291299875734e-08, "logits/chosen": -3.578125, "logits/rejected": -3.515625, "logps/chosen": -340.0, "logps/rejected": -498.0, "loss": 0.7348, "rewards/accuracies": 0.84375, "rewards/chosen": -2.234375, "rewards/margins": 1.671875, "rewards/rejected": -3.90625, "step": 11280 }, { "epoch": 0.8671274961597543, "grad_norm": 22.94892459166353, "learning_rate": 2.64116446315886e-08, "logits/chosen": -3.609375, "logits/rejected": -3.625, "logps/chosen": -338.0, "logps/rejected": -492.0, "loss": 0.7572, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.203125, "rewards/margins": 1.59375, "rewards/rejected": -3.796875, "step": 11290 }, { "epoch": 0.8678955453149002, "grad_norm": 20.458011142585217, "learning_rate": 2.6112605058780434e-08, "logits/chosen": -3.578125, "logits/rejected": -3.453125, "logps/chosen": -330.0, "logps/rejected": -480.0, "loss": 0.7557, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.5234375, "rewards/rejected": -3.59375, "step": 11300 }, { "epoch": 0.868663594470046, "grad_norm": 20.253174850382393, "learning_rate": 2.5815174730870805e-08, "logits/chosen": -3.453125, "logits/rejected": -3.5625, "logps/chosen": -346.0, "logps/rejected": -504.0, "loss": 0.692, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1875, "rewards/margins": 1.6640625, "rewards/rejected": -3.84375, "step": 11310 }, { "epoch": 0.869431643625192, "grad_norm": 19.908644938927097, "learning_rate": 2.55193557857121e-08, "logits/chosen": -3.640625, "logits/rejected": -3.703125, "logps/chosen": -340.0, "logps/rejected": -516.0, "loss": 0.6955, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.921875, "rewards/rejected": -4.09375, "step": 11320 }, { "epoch": 0.8701996927803379, "grad_norm": 23.28244523038632, "learning_rate": 2.5225150349574647e-08, "logits/chosen": -3.609375, "logits/rejected": -3.75, "logps/chosen": -352.0, "logps/rejected": -496.0, "loss": 0.7461, "rewards/accuracies": 0.8125, "rewards/chosen": -2.171875, "rewards/margins": 1.53125, "rewards/rejected": -3.6875, "step": 11330 }, { "epoch": 0.8709677419354839, "grad_norm": 22.85824317536606, "learning_rate": 2.4932560537131413e-08, "logits/chosen": -3.46875, "logits/rejected": -3.5625, "logps/chosen": -364.0, "logps/rejected": -516.0, "loss": 0.7572, "rewards/accuracies": 0.78125, "rewards/chosen": -2.234375, "rewards/margins": 1.65625, "rewards/rejected": -3.890625, "step": 11340 }, { "epoch": 0.8717357910906298, "grad_norm": 19.243162194310855, "learning_rate": 2.464158845144246e-08, "logits/chosen": -3.65625, "logits/rejected": -3.6875, "logps/chosen": -340.0, "logps/rejected": -512.0, "loss": 0.7475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.140625, "rewards/margins": 1.8203125, "rewards/rejected": -3.96875, "step": 11350 }, { "epoch": 0.8725038402457758, "grad_norm": 19.67127489471382, "learning_rate": 2.4352236183940306e-08, "logits/chosen": -3.390625, "logits/rejected": -3.453125, "logps/chosen": -320.0, "logps/rejected": -504.0, "loss": 0.7168, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9765625, "rewards/margins": 1.9375, "rewards/rejected": -3.90625, "step": 11360 }, { "epoch": 0.8732718894009217, "grad_norm": 20.373253478554304, "learning_rate": 2.4064505814414355e-08, "logits/chosen": -3.609375, "logits/rejected": -3.5625, "logps/chosen": -362.0, "logps/rejected": -504.0, "loss": 0.7416, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.140625, "rewards/margins": 1.609375, "rewards/rejected": -3.75, "step": 11370 }, { "epoch": 0.8740399385560675, "grad_norm": 22.959597888582607, "learning_rate": 2.3778399410996325e-08, "logits/chosen": -3.59375, "logits/rejected": -3.6875, "logps/chosen": -340.0, "logps/rejected": -504.0, "loss": 0.7329, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.7890625, "rewards/rejected": -3.890625, "step": 11380 }, { "epoch": 0.8748079877112135, "grad_norm": 21.140551978231326, "learning_rate": 2.3493919030145442e-08, "logits/chosen": -3.515625, "logits/rejected": -3.59375, "logps/chosen": -340.0, "logps/rejected": -508.0, "loss": 0.7317, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.7578125, "rewards/rejected": -3.90625, "step": 11390 }, { "epoch": 0.8755760368663594, "grad_norm": 21.1284166935775, "learning_rate": 2.3211066716633254e-08, "logits/chosen": -3.515625, "logits/rejected": -3.65625, "logps/chosen": -328.0, "logps/rejected": -500.0, "loss": 0.7461, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0625, "rewards/margins": 1.765625, "rewards/rejected": -3.828125, "step": 11400 }, { "epoch": 0.8763440860215054, "grad_norm": 21.340529426655248, "learning_rate": 2.2929844503529443e-08, "logits/chosen": -3.421875, "logits/rejected": -3.4375, "logps/chosen": -344.0, "logps/rejected": -508.0, "loss": 0.7249, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.75, "rewards/rejected": -3.859375, "step": 11410 }, { "epoch": 0.8771121351766513, "grad_norm": 23.709926398705164, "learning_rate": 2.2650254412186683e-08, "logits/chosen": -3.671875, "logits/rejected": -3.390625, "logps/chosen": -316.0, "logps/rejected": -484.0, "loss": 0.7287, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.125, "rewards/margins": 1.578125, "rewards/rejected": -3.703125, "step": 11420 }, { "epoch": 0.8778801843317973, "grad_norm": 19.681218480020757, "learning_rate": 2.2372298452226463e-08, "logits/chosen": -3.59375, "logits/rejected": -3.390625, "logps/chosen": -336.0, "logps/rejected": -516.0, "loss": 0.7031, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.9609375, "rewards/rejected": -4.0625, "step": 11430 }, { "epoch": 0.8786482334869432, "grad_norm": 21.422981229685682, "learning_rate": 2.2095978621524707e-08, "logits/chosen": -3.671875, "logits/rejected": -3.640625, "logps/chosen": -346.0, "logps/rejected": -516.0, "loss": 0.9161, "rewards/accuracies": 0.8125, "rewards/chosen": -2.390625, "rewards/margins": 1.625, "rewards/rejected": -4.0, "step": 11440 }, { "epoch": 0.879416282642089, "grad_norm": 20.712945698177492, "learning_rate": 2.182129690619708e-08, "logits/chosen": -3.5625, "logits/rejected": -3.65625, "logps/chosen": -366.0, "logps/rejected": -516.0, "loss": 0.7408, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3125, "rewards/margins": 1.875, "rewards/rejected": -4.1875, "step": 11450 }, { "epoch": 0.880184331797235, "grad_norm": 23.333123038719396, "learning_rate": 2.154825528058504e-08, "logits/chosen": -3.40625, "logits/rejected": -3.6875, "logps/chosen": -346.0, "logps/rejected": -524.0, "loss": 0.7642, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.359375, "rewards/margins": 1.859375, "rewards/rejected": -4.21875, "step": 11460 }, { "epoch": 0.8809523809523809, "grad_norm": 21.14033617029676, "learning_rate": 2.127685570724136e-08, "logits/chosen": -3.578125, "logits/rejected": -3.53125, "logps/chosen": -340.0, "logps/rejected": -492.0, "loss": 0.7142, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.171875, "rewards/margins": 1.609375, "rewards/rejected": -3.78125, "step": 11470 }, { "epoch": 0.8817204301075269, "grad_norm": 24.735517204505573, "learning_rate": 2.1007100136916202e-08, "logits/chosen": -3.46875, "logits/rejected": -3.5625, "logps/chosen": -382.0, "logps/rejected": -536.0, "loss": 0.7424, "rewards/accuracies": 0.78125, "rewards/chosen": -2.375, "rewards/margins": 1.71875, "rewards/rejected": -4.09375, "step": 11480 }, { "epoch": 0.8824884792626728, "grad_norm": 20.72480122620671, "learning_rate": 2.073899050854319e-08, "logits/chosen": -3.5, "logits/rejected": -3.390625, "logps/chosen": -368.0, "logps/rejected": -536.0, "loss": 0.724, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.28125, "rewards/margins": 1.921875, "rewards/rejected": -4.1875, "step": 11490 }, { "epoch": 0.8832565284178188, "grad_norm": 19.838565604040785, "learning_rate": 2.0472528749225116e-08, "logits/chosen": -3.6875, "logits/rejected": -3.515625, "logps/chosen": -356.0, "logps/rejected": -532.0, "loss": 0.7264, "rewards/accuracies": 0.78125, "rewards/chosen": -2.359375, "rewards/margins": 1.7890625, "rewards/rejected": -4.15625, "step": 11500 }, { "epoch": 0.8840245775729647, "grad_norm": 22.61321607656573, "learning_rate": 2.020771677422062e-08, "logits/chosen": -3.5625, "logits/rejected": -3.640625, "logps/chosen": -380.0, "logps/rejected": -560.0, "loss": 0.6981, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.3125, "rewards/margins": 1.9140625, "rewards/rejected": -4.21875, "step": 11510 }, { "epoch": 0.8847926267281107, "grad_norm": 20.36848005206582, "learning_rate": 1.9944556486929776e-08, "logits/chosen": -3.546875, "logits/rejected": -3.546875, "logps/chosen": -352.0, "logps/rejected": -510.0, "loss": 0.7544, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.28125, "rewards/margins": 1.796875, "rewards/rejected": -4.0625, "step": 11520 }, { "epoch": 0.8855606758832565, "grad_norm": 21.31681102526964, "learning_rate": 1.9683049778880912e-08, "logits/chosen": -3.46875, "logits/rejected": -3.59375, "logps/chosen": -346.0, "logps/rejected": -516.0, "loss": 0.7496, "rewards/accuracies": 0.8125, "rewards/chosen": -2.140625, "rewards/margins": 1.6875, "rewards/rejected": -3.828125, "step": 11530 }, { "epoch": 0.8863287250384024, "grad_norm": 21.031705535837677, "learning_rate": 1.942319852971694e-08, "logits/chosen": -3.640625, "logits/rejected": -3.609375, "logps/chosen": -346.0, "logps/rejected": -512.0, "loss": 0.7281, "rewards/accuracies": 0.8125, "rewards/chosen": -2.234375, "rewards/margins": 1.6171875, "rewards/rejected": -3.859375, "step": 11540 }, { "epoch": 0.8870967741935484, "grad_norm": 21.611811637661294, "learning_rate": 1.9165004607181597e-08, "logits/chosen": -3.4375, "logits/rejected": -3.40625, "logps/chosen": -356.0, "logps/rejected": -520.0, "loss": 0.7703, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.34375, "rewards/margins": 1.625, "rewards/rejected": -3.96875, "step": 11550 }, { "epoch": 0.8878648233486943, "grad_norm": 20.495972090104992, "learning_rate": 1.8908469867106398e-08, "logits/chosen": -3.578125, "logits/rejected": -3.6875, "logps/chosen": -328.0, "logps/rejected": -482.0, "loss": 0.8502, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.15625, "rewards/margins": 1.6171875, "rewards/rejected": -3.78125, "step": 11560 }, { "epoch": 0.8886328725038403, "grad_norm": 20.39200706123321, "learning_rate": 1.865359615339679e-08, "logits/chosen": -3.609375, "logits/rejected": -3.8125, "logps/chosen": -352.0, "logps/rejected": -524.0, "loss": 0.7482, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.28125, "rewards/margins": 1.7734375, "rewards/rejected": -4.0625, "step": 11570 }, { "epoch": 0.8894009216589862, "grad_norm": 21.634052459555377, "learning_rate": 1.8400385298019377e-08, "logits/chosen": -3.390625, "logits/rejected": -3.46875, "logps/chosen": -340.0, "logps/rejected": -496.0, "loss": 0.6724, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.171875, "rewards/margins": 1.6875, "rewards/rejected": -3.859375, "step": 11580 }, { "epoch": 0.8901689708141322, "grad_norm": 23.882366982371618, "learning_rate": 1.814883912098858e-08, "logits/chosen": -3.59375, "logits/rejected": -3.71875, "logps/chosen": -320.0, "logps/rejected": -488.0, "loss": 0.7281, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.015625, "rewards/margins": 1.78125, "rewards/rejected": -3.796875, "step": 11590 }, { "epoch": 0.890937019969278, "grad_norm": 18.659630900567606, "learning_rate": 1.7898959430353466e-08, "logits/chosen": -3.515625, "logits/rejected": -3.4375, "logps/chosen": -300.0, "logps/rejected": -490.0, "loss": 0.7583, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.921875, "rewards/margins": 1.84375, "rewards/rejected": -3.765625, "step": 11600 }, { "epoch": 0.8917050691244239, "grad_norm": 18.588909962558922, "learning_rate": 1.7650748022184914e-08, "logits/chosen": -3.5, "logits/rejected": -3.40625, "logps/chosen": -346.0, "logps/rejected": -524.0, "loss": 0.7527, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.21875, "rewards/margins": 1.8125, "rewards/rejected": -4.03125, "step": 11610 }, { "epoch": 0.8924731182795699, "grad_norm": 21.24315598421127, "learning_rate": 1.74042066805625e-08, "logits/chosen": -3.453125, "logits/rejected": -3.546875, "logps/chosen": -368.0, "logps/rejected": -524.0, "loss": 0.6859, "rewards/accuracies": 0.84375, "rewards/chosen": -2.171875, "rewards/margins": 1.796875, "rewards/rejected": -3.96875, "step": 11620 }, { "epoch": 0.8932411674347158, "grad_norm": 19.58294652147007, "learning_rate": 1.7159337177561823e-08, "logits/chosen": -3.453125, "logits/rejected": -3.6875, "logps/chosen": -344.0, "logps/rejected": -482.0, "loss": 0.732, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.5859375, "rewards/rejected": -3.765625, "step": 11630 }, { "epoch": 0.8940092165898618, "grad_norm": 25.023874187902788, "learning_rate": 1.691614127324184e-08, "logits/chosen": -3.5625, "logits/rejected": -3.46875, "logps/chosen": -326.0, "logps/rejected": -498.0, "loss": 0.7601, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.09375, "rewards/margins": 1.6015625, "rewards/rejected": -3.6875, "step": 11640 }, { "epoch": 0.8947772657450077, "grad_norm": 25.423594175562922, "learning_rate": 1.6674620715631972e-08, "logits/chosen": -3.359375, "logits/rejected": -3.5625, "logps/chosen": -354.0, "logps/rejected": -486.0, "loss": 0.7712, "rewards/accuracies": 0.78125, "rewards/chosen": -2.234375, "rewards/margins": 1.5390625, "rewards/rejected": -3.765625, "step": 11650 }, { "epoch": 0.8955453149001537, "grad_norm": 22.21924735685701, "learning_rate": 1.6434777240719845e-08, "logits/chosen": -3.46875, "logits/rejected": -3.484375, "logps/chosen": -344.0, "logps/rejected": -498.0, "loss": 0.7615, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.203125, "rewards/margins": 1.59375, "rewards/rejected": -3.796875, "step": 11660 }, { "epoch": 0.8963133640552995, "grad_norm": 21.558388734810467, "learning_rate": 1.619661257243843e-08, "logits/chosen": -3.546875, "logits/rejected": -3.453125, "logps/chosen": -336.0, "logps/rejected": -512.0, "loss": 0.702, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.109375, "rewards/margins": 1.8125, "rewards/rejected": -3.921875, "step": 11670 }, { "epoch": 0.8970814132104454, "grad_norm": 19.48559234745337, "learning_rate": 1.5960128422654095e-08, "logits/chosen": -3.421875, "logits/rejected": -3.78125, "logps/chosen": -302.0, "logps/rejected": -452.0, "loss": 0.7169, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9453125, "rewards/margins": 1.6484375, "rewards/rejected": -3.59375, "step": 11680 }, { "epoch": 0.8978494623655914, "grad_norm": 19.94113429551582, "learning_rate": 1.5725326491153906e-08, "logits/chosen": -3.5, "logits/rejected": -3.640625, "logps/chosen": -350.0, "logps/rejected": -502.0, "loss": 0.746, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.15625, "rewards/margins": 1.703125, "rewards/rejected": -3.859375, "step": 11690 }, { "epoch": 0.8986175115207373, "grad_norm": 22.732863838242412, "learning_rate": 1.5492208465633593e-08, "logits/chosen": -3.46875, "logits/rejected": -3.59375, "logps/chosen": -402.0, "logps/rejected": -524.0, "loss": 0.7604, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.5, "rewards/margins": 1.5, "rewards/rejected": -4.0, "step": 11700 }, { "epoch": 0.8993855606758833, "grad_norm": 17.785495782753326, "learning_rate": 1.52607760216856e-08, "logits/chosen": -3.484375, "logits/rejected": -3.5625, "logps/chosen": -350.0, "logps/rejected": -512.0, "loss": 0.7106, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.125, "rewards/margins": 1.859375, "rewards/rejected": -3.984375, "step": 11710 }, { "epoch": 0.9001536098310292, "grad_norm": 20.993917264639805, "learning_rate": 1.5031030822786507e-08, "logits/chosen": -3.453125, "logits/rejected": -3.625, "logps/chosen": -332.0, "logps/rejected": -484.0, "loss": 0.7465, "rewards/accuracies": 0.84375, "rewards/chosen": -2.125, "rewards/margins": 1.609375, "rewards/rejected": -3.734375, "step": 11720 }, { "epoch": 0.9009216589861752, "grad_norm": 28.684488574020936, "learning_rate": 1.4802974520285726e-08, "logits/chosen": -3.546875, "logits/rejected": -3.625, "logps/chosen": -330.0, "logps/rejected": -474.0, "loss": 0.7685, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.546875, "rewards/rejected": -3.640625, "step": 11730 }, { "epoch": 0.901689708141321, "grad_norm": 19.223937431694722, "learning_rate": 1.4576608753393143e-08, "logits/chosen": -3.484375, "logits/rejected": -3.640625, "logps/chosen": -358.0, "logps/rejected": -506.0, "loss": 0.7458, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.21875, "rewards/margins": 1.8046875, "rewards/rejected": -4.03125, "step": 11740 }, { "epoch": 0.9024577572964669, "grad_norm": 21.397229577195585, "learning_rate": 1.4351935149167549e-08, "logits/chosen": -3.4375, "logits/rejected": -3.453125, "logps/chosen": -370.0, "logps/rejected": -532.0, "loss": 0.7167, "rewards/accuracies": 0.84375, "rewards/chosen": -2.359375, "rewards/margins": 1.7109375, "rewards/rejected": -4.0625, "step": 11750 }, { "epoch": 0.9032258064516129, "grad_norm": 19.68120519281112, "learning_rate": 1.4128955322504965e-08, "logits/chosen": -3.328125, "logits/rejected": -3.390625, "logps/chosen": -338.0, "logps/rejected": -484.0, "loss": 0.7127, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.0, "rewards/margins": 1.7109375, "rewards/rejected": -3.71875, "step": 11760 }, { "epoch": 0.9039938556067588, "grad_norm": 22.01998260453928, "learning_rate": 1.390767087612682e-08, "logits/chosen": -3.4375, "logits/rejected": -3.46875, "logps/chosen": -326.0, "logps/rejected": -498.0, "loss": 0.7208, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9453125, "rewards/margins": 1.9296875, "rewards/rejected": -3.875, "step": 11770 }, { "epoch": 0.9047619047619048, "grad_norm": 21.181862711757983, "learning_rate": 1.368808340056879e-08, "logits/chosen": -3.5, "logits/rejected": -3.625, "logps/chosen": -314.0, "logps/rejected": -486.0, "loss": 0.7539, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0, "rewards/margins": 1.8828125, "rewards/rejected": -3.890625, "step": 11780 }, { "epoch": 0.9055299539170507, "grad_norm": 19.798832333501622, "learning_rate": 1.3470194474169027e-08, "logits/chosen": -3.46875, "logits/rejected": -3.421875, "logps/chosen": -308.0, "logps/rejected": -472.0, "loss": 0.7334, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.9296875, "rewards/margins": 1.703125, "rewards/rejected": -3.625, "step": 11790 }, { "epoch": 0.9062980030721967, "grad_norm": 20.709708986232105, "learning_rate": 1.3254005663056955e-08, "logits/chosen": -3.421875, "logits/rejected": -3.71875, "logps/chosen": -318.0, "logps/rejected": -480.0, "loss": 0.7013, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9921875, "rewards/margins": 1.8046875, "rewards/rejected": -3.796875, "step": 11800 }, { "epoch": 0.9070660522273426, "grad_norm": 20.906899868208292, "learning_rate": 1.3039518521142101e-08, "logits/chosen": -3.421875, "logits/rejected": -3.421875, "logps/chosen": -344.0, "logps/rejected": -504.0, "loss": 0.7752, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.703125, "rewards/rejected": -3.875, "step": 11810 }, { "epoch": 0.9078341013824884, "grad_norm": 25.07083260178045, "learning_rate": 1.2826734590102668e-08, "logits/chosen": -3.46875, "logits/rejected": -3.4375, "logps/chosen": -342.0, "logps/rejected": -512.0, "loss": 0.7431, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.046875, "rewards/margins": 1.7578125, "rewards/rejected": -3.796875, "step": 11820 }, { "epoch": 0.9086021505376344, "grad_norm": 20.927062296659763, "learning_rate": 1.2615655399374813e-08, "logits/chosen": -3.453125, "logits/rejected": -3.5625, "logps/chosen": -340.0, "logps/rejected": -512.0, "loss": 0.7506, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.1875, "rewards/margins": 1.7734375, "rewards/rejected": -3.96875, "step": 11830 }, { "epoch": 0.9093701996927803, "grad_norm": 23.598046269534585, "learning_rate": 1.240628246614131e-08, "logits/chosen": -3.453125, "logits/rejected": -3.546875, "logps/chosen": -336.0, "logps/rejected": -516.0, "loss": 0.7559, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.140625, "rewards/margins": 1.875, "rewards/rejected": -4.0, "step": 11840 }, { "epoch": 0.9101382488479263, "grad_norm": 21.466713850585727, "learning_rate": 1.2198617295320845e-08, "logits/chosen": -3.59375, "logits/rejected": -3.609375, "logps/chosen": -338.0, "logps/rejected": -502.0, "loss": 0.7474, "rewards/accuracies": 0.8125, "rewards/chosen": -2.21875, "rewards/margins": 1.640625, "rewards/rejected": -3.859375, "step": 11850 }, { "epoch": 0.9109062980030722, "grad_norm": 20.591187294829435, "learning_rate": 1.1992661379557234e-08, "logits/chosen": -3.546875, "logits/rejected": -3.546875, "logps/chosen": -360.0, "logps/rejected": -516.0, "loss": 0.7243, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.203125, "rewards/margins": 1.6953125, "rewards/rejected": -3.890625, "step": 11860 }, { "epoch": 0.9116743471582182, "grad_norm": 22.427440390760765, "learning_rate": 1.1788416199208368e-08, "logits/chosen": -3.5, "logits/rejected": -3.65625, "logps/chosen": -344.0, "logps/rejected": -506.0, "loss": 0.718, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.203125, "rewards/margins": 1.8203125, "rewards/rejected": -4.03125, "step": 11870 }, { "epoch": 0.9124423963133641, "grad_norm": 17.725258068577784, "learning_rate": 1.1585883222336113e-08, "logits/chosen": -3.4375, "logits/rejected": -3.375, "logps/chosen": -336.0, "logps/rejected": -502.0, "loss": 0.7238, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.046875, "rewards/margins": 1.8203125, "rewards/rejected": -3.859375, "step": 11880 }, { "epoch": 0.9132104454685099, "grad_norm": 25.894661663994192, "learning_rate": 1.1385063904695264e-08, "logits/chosen": -3.5625, "logits/rejected": -3.546875, "logps/chosen": -344.0, "logps/rejected": -500.0, "loss": 0.743, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.7265625, "rewards/rejected": -3.890625, "step": 11890 }, { "epoch": 0.9139784946236559, "grad_norm": 24.289641232310625, "learning_rate": 1.1185959689723246e-08, "logits/chosen": -3.421875, "logits/rejected": -3.609375, "logps/chosen": -350.0, "logps/rejected": -492.0, "loss": 0.7757, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.1875, "rewards/margins": 1.4921875, "rewards/rejected": -3.6875, "step": 11900 }, { "epoch": 0.9147465437788018, "grad_norm": 22.361607741361127, "learning_rate": 1.0988572008530011e-08, "logits/chosen": -3.515625, "logits/rejected": -3.59375, "logps/chosen": -332.0, "logps/rejected": -512.0, "loss": 0.7448, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.1875, "rewards/margins": 1.671875, "rewards/rejected": -3.859375, "step": 11910 }, { "epoch": 0.9155145929339478, "grad_norm": 23.561411241347265, "learning_rate": 1.0792902279887189e-08, "logits/chosen": -3.59375, "logits/rejected": -3.71875, "logps/chosen": -318.0, "logps/rejected": -466.0, "loss": 0.7827, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.125, "rewards/margins": 1.5703125, "rewards/rejected": -3.703125, "step": 11920 }, { "epoch": 0.9162826420890937, "grad_norm": 20.248781089560445, "learning_rate": 1.0598951910218479e-08, "logits/chosen": -3.46875, "logits/rejected": -3.546875, "logps/chosen": -376.0, "logps/rejected": -520.0, "loss": 0.7417, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3125, "rewards/margins": 1.65625, "rewards/rejected": -3.96875, "step": 11930 }, { "epoch": 0.9170506912442397, "grad_norm": 20.34286378881643, "learning_rate": 1.0406722293589077e-08, "logits/chosen": -3.515625, "logits/rejected": -3.46875, "logps/chosen": -324.0, "logps/rejected": -500.0, "loss": 0.7385, "rewards/accuracies": 0.8125, "rewards/chosen": -2.125, "rewards/margins": 1.75, "rewards/rejected": -3.875, "step": 11940 }, { "epoch": 0.9178187403993856, "grad_norm": 20.64222554589676, "learning_rate": 1.0216214811695933e-08, "logits/chosen": -3.546875, "logits/rejected": -3.578125, "logps/chosen": -348.0, "logps/rejected": -468.0, "loss": 0.7454, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.203125, "rewards/margins": 1.34375, "rewards/rejected": -3.546875, "step": 11950 }, { "epoch": 0.9185867895545314, "grad_norm": 23.292524673658818, "learning_rate": 1.0027430833857819e-08, "logits/chosen": -3.59375, "logits/rejected": -3.5, "logps/chosen": -354.0, "logps/rejected": -508.0, "loss": 0.7325, "rewards/accuracies": 0.8125, "rewards/chosen": -2.234375, "rewards/margins": 1.5390625, "rewards/rejected": -3.765625, "step": 11960 }, { "epoch": 0.9193548387096774, "grad_norm": 24.022387626535817, "learning_rate": 9.84037171700519e-09, "logits/chosen": -3.484375, "logits/rejected": -3.453125, "logps/chosen": -336.0, "logps/rejected": -500.0, "loss": 0.7535, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.7734375, "rewards/rejected": -3.90625, "step": 11970 }, { "epoch": 0.9201228878648233, "grad_norm": 20.43151613461517, "learning_rate": 9.655038805670868e-09, "logits/chosen": -3.390625, "logits/rejected": -3.546875, "logps/chosen": -344.0, "logps/rejected": -532.0, "loss": 0.7495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.078125, "rewards/margins": 2.015625, "rewards/rejected": -4.09375, "step": 11980 }, { "epoch": 0.9208909370199693, "grad_norm": 23.426340671283256, "learning_rate": 9.471433431980037e-09, "logits/chosen": -3.40625, "logits/rejected": -3.46875, "logps/chosen": -352.0, "logps/rejected": -494.0, "loss": 0.7616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.25, "rewards/margins": 1.515625, "rewards/rejected": -3.765625, "step": 11990 }, { "epoch": 0.9216589861751152, "grad_norm": 23.31472295312385, "learning_rate": 9.289556915640761e-09, "logits/chosen": -3.359375, "logits/rejected": -3.484375, "logps/chosen": -354.0, "logps/rejected": -510.0, "loss": 0.7384, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.28125, "rewards/margins": 1.6875, "rewards/rejected": -3.984375, "step": 12000 }, { "epoch": 0.9216589861751152, "eval_logits/chosen": -3.4375, "eval_logits/rejected": -3.578125, "eval_logps/chosen": -380.0, "eval_logps/rejected": -492.0, "eval_loss": 0.4575048089027405, "eval_rewards/accuracies": 0.7609890103340149, "eval_rewards/chosen": -2.375, "eval_rewards/margins": 1.4140625, "eval_rewards/rejected": -3.796875, "eval_runtime": 2264.8937, "eval_samples_per_second": 41.121, "eval_steps_per_second": 0.643, "step": 12000 }, { "epoch": 0.9224270353302612, "grad_norm": 22.57702412384079, "learning_rate": 9.109410563934661e-09, "logits/chosen": -3.375, "logits/rejected": -3.453125, "logps/chosen": -346.0, "logps/rejected": -528.0, "loss": 0.7237, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.140625, "rewards/margins": 1.96875, "rewards/rejected": -4.125, "step": 12010 }, { "epoch": 0.9231950844854071, "grad_norm": 22.25868832438705, "learning_rate": 8.930995671707186e-09, "logits/chosen": -3.453125, "logits/rejected": -3.5, "logps/chosen": -356.0, "logps/rejected": -506.0, "loss": 0.756, "rewards/accuracies": 0.8125, "rewards/chosen": -2.171875, "rewards/margins": 1.640625, "rewards/rejected": -3.8125, "step": 12020 }, { "epoch": 0.923963133640553, "grad_norm": 20.972449394588306, "learning_rate": 8.754313521358691e-09, "logits/chosen": -3.46875, "logits/rejected": -3.53125, "logps/chosen": -360.0, "logps/rejected": -532.0, "loss": 0.7091, "rewards/accuracies": 0.8125, "rewards/chosen": -2.203125, "rewards/margins": 1.90625, "rewards/rejected": -4.09375, "step": 12030 }, { "epoch": 0.9247311827956989, "grad_norm": 20.324871105851287, "learning_rate": 8.579365382834908e-09, "logits/chosen": -3.421875, "logits/rejected": -3.40625, "logps/chosen": -396.0, "logps/rejected": -532.0, "loss": 0.7133, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3125, "rewards/margins": 1.5859375, "rewards/rejected": -3.90625, "step": 12040 }, { "epoch": 0.9254992319508448, "grad_norm": 24.6118940443726, "learning_rate": 8.406152513617926e-09, "logits/chosen": -3.296875, "logits/rejected": -3.640625, "logps/chosen": -348.0, "logps/rejected": -480.0, "loss": 0.7298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.09375, "rewards/margins": 1.59375, "rewards/rejected": -3.671875, "step": 12050 }, { "epoch": 0.9262672811059908, "grad_norm": 22.275948720544534, "learning_rate": 8.234676158717313e-09, "logits/chosen": -3.453125, "logits/rejected": -3.390625, "logps/chosen": -336.0, "logps/rejected": -494.0, "loss": 0.7521, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.671875, "rewards/rejected": -3.828125, "step": 12060 }, { "epoch": 0.9270353302611367, "grad_norm": 23.572550515633075, "learning_rate": 8.064937550660789e-09, "logits/chosen": -3.546875, "logits/rejected": -3.640625, "logps/chosen": -328.0, "logps/rejected": -506.0, "loss": 0.7175, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.015625, "rewards/margins": 1.9140625, "rewards/rejected": -3.9375, "step": 12070 }, { "epoch": 0.9278033794162827, "grad_norm": 19.71612946881046, "learning_rate": 7.89693790948584e-09, "logits/chosen": -3.65625, "logits/rejected": -3.625, "logps/chosen": -320.0, "logps/rejected": -488.0, "loss": 0.7494, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.03125, "rewards/margins": 1.8125, "rewards/rejected": -3.84375, "step": 12080 }, { "epoch": 0.9285714285714286, "grad_norm": 19.18079896671756, "learning_rate": 7.730678442730537e-09, "logits/chosen": -3.53125, "logits/rejected": -3.625, "logps/chosen": -344.0, "logps/rejected": -494.0, "loss": 0.6824, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.65625, "rewards/rejected": -3.765625, "step": 12090 }, { "epoch": 0.9293394777265745, "grad_norm": 18.53726565390834, "learning_rate": 7.566160345425066e-09, "logits/chosen": -3.5625, "logits/rejected": -3.609375, "logps/chosen": -346.0, "logps/rejected": -512.0, "loss": 0.7508, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.25, "rewards/margins": 1.6953125, "rewards/rejected": -3.9375, "step": 12100 }, { "epoch": 0.9301075268817204, "grad_norm": 19.12086788690485, "learning_rate": 7.403384800083179e-09, "logits/chosen": -3.609375, "logits/rejected": -3.46875, "logps/chosen": -332.0, "logps/rejected": -492.0, "loss": 0.7438, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.265625, "rewards/margins": 1.484375, "rewards/rejected": -3.75, "step": 12110 }, { "epoch": 0.9308755760368663, "grad_norm": 23.17066988531768, "learning_rate": 7.242352976693484e-09, "logits/chosen": -3.375, "logits/rejected": -3.34375, "logps/chosen": -362.0, "logps/rejected": -524.0, "loss": 0.7531, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.234375, "rewards/margins": 1.78125, "rewards/rejected": -4.03125, "step": 12120 }, { "epoch": 0.9316436251920123, "grad_norm": 21.28513677723999, "learning_rate": 7.083066032711277e-09, "logits/chosen": -3.5625, "logits/rejected": -3.5, "logps/chosen": -320.0, "logps/rejected": -516.0, "loss": 0.7027, "rewards/accuracies": 0.84375, "rewards/chosen": -2.125, "rewards/margins": 1.90625, "rewards/rejected": -4.03125, "step": 12130 }, { "epoch": 0.9324116743471582, "grad_norm": 21.474190882889264, "learning_rate": 6.925525113050029e-09, "logits/chosen": -3.53125, "logits/rejected": -3.453125, "logps/chosen": -342.0, "logps/rejected": -498.0, "loss": 0.7492, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1875, "rewards/margins": 1.5703125, "rewards/rejected": -3.75, "step": 12140 }, { "epoch": 0.9331797235023042, "grad_norm": 21.279685279053183, "learning_rate": 6.769731350073249e-09, "logits/chosen": -3.46875, "logits/rejected": -3.46875, "logps/chosen": -310.0, "logps/rejected": -468.0, "loss": 0.7258, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.9921875, "rewards/margins": 1.59375, "rewards/rejected": -3.578125, "step": 12150 }, { "epoch": 0.9339477726574501, "grad_norm": 23.924336541006713, "learning_rate": 6.615685863586462e-09, "logits/chosen": -3.328125, "logits/rejected": -3.71875, "logps/chosen": -376.0, "logps/rejected": -504.0, "loss": 0.7412, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.21875, "rewards/margins": 1.6328125, "rewards/rejected": -3.859375, "step": 12160 }, { "epoch": 0.934715821812596, "grad_norm": 19.843429447422587, "learning_rate": 6.4633897608288024e-09, "logits/chosen": -3.28125, "logits/rejected": -3.53125, "logps/chosen": -356.0, "logps/rejected": -548.0, "loss": 0.7276, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.09375, "rewards/margins": 2.046875, "rewards/rejected": -4.15625, "step": 12170 }, { "epoch": 0.9354838709677419, "grad_norm": 25.107195240935628, "learning_rate": 6.3128441364654615e-09, "logits/chosen": -3.453125, "logits/rejected": -3.65625, "logps/chosen": -314.0, "logps/rejected": -482.0, "loss": 0.7756, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.9375, "rewards/margins": 1.7890625, "rewards/rejected": -3.71875, "step": 12180 }, { "epoch": 0.9362519201228878, "grad_norm": 19.847496053499334, "learning_rate": 6.164050072579613e-09, "logits/chosen": -3.546875, "logits/rejected": -3.328125, "logps/chosen": -336.0, "logps/rejected": -536.0, "loss": 0.7054, "rewards/accuracies": 0.875, "rewards/chosen": -2.171875, "rewards/margins": 1.921875, "rewards/rejected": -4.09375, "step": 12190 }, { "epoch": 0.9370199692780338, "grad_norm": 22.44024130425636, "learning_rate": 6.017008638664611e-09, "logits/chosen": -3.34375, "logits/rejected": -3.4375, "logps/chosen": -358.0, "logps/rejected": -520.0, "loss": 0.785, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.796875, "rewards/rejected": -3.984375, "step": 12200 }, { "epoch": 0.9377880184331797, "grad_norm": 21.904932062388554, "learning_rate": 5.871720891616444e-09, "logits/chosen": -3.421875, "logits/rejected": -3.484375, "logps/chosen": -330.0, "logps/rejected": -496.0, "loss": 0.7795, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.125, "rewards/margins": 1.6875, "rewards/rejected": -3.8125, "step": 12210 }, { "epoch": 0.9385560675883257, "grad_norm": 18.40197048158309, "learning_rate": 5.728187875725876e-09, "logits/chosen": -3.40625, "logits/rejected": -3.6875, "logps/chosen": -374.0, "logps/rejected": -512.0, "loss": 0.6994, "rewards/accuracies": 0.8125, "rewards/chosen": -2.390625, "rewards/margins": 1.7109375, "rewards/rejected": -4.09375, "step": 12220 }, { "epoch": 0.9393241167434716, "grad_norm": 20.127808471251196, "learning_rate": 5.5864106226713726e-09, "logits/chosen": -3.578125, "logits/rejected": -3.828125, "logps/chosen": -328.0, "logps/rejected": -476.0, "loss": 0.7297, "rewards/accuracies": 0.8125, "rewards/chosen": -2.171875, "rewards/margins": 1.5390625, "rewards/rejected": -3.71875, "step": 12230 }, { "epoch": 0.9400921658986175, "grad_norm": 22.161187438812743, "learning_rate": 5.446390151511188e-09, "logits/chosen": -3.5625, "logits/rejected": -3.71875, "logps/chosen": -382.0, "logps/rejected": -524.0, "loss": 0.74, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.390625, "rewards/margins": 1.6875, "rewards/rejected": -4.0625, "step": 12240 }, { "epoch": 0.9408602150537635, "grad_norm": 21.965598163529364, "learning_rate": 5.308127468676482e-09, "logits/chosen": -3.5625, "logits/rejected": -3.578125, "logps/chosen": -338.0, "logps/rejected": -520.0, "loss": 0.7603, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.125, "rewards/margins": 1.9140625, "rewards/rejected": -4.03125, "step": 12250 }, { "epoch": 0.9416282642089093, "grad_norm": 22.336513742796694, "learning_rate": 5.1716235679637456e-09, "logits/chosen": -3.46875, "logits/rejected": -3.65625, "logps/chosen": -318.0, "logps/rejected": -480.0, "loss": 0.7653, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.109375, "rewards/margins": 1.828125, "rewards/rejected": -3.921875, "step": 12260 }, { "epoch": 0.9423963133640553, "grad_norm": 24.568292998632987, "learning_rate": 5.036879430527774e-09, "logits/chosen": -3.5, "logits/rejected": -3.5625, "logps/chosen": -330.0, "logps/rejected": -496.0, "loss": 0.7181, "rewards/accuracies": 0.78125, "rewards/chosen": -2.125, "rewards/margins": 1.8046875, "rewards/rejected": -3.9375, "step": 12270 }, { "epoch": 0.9431643625192012, "grad_norm": 19.02657693420966, "learning_rate": 4.903896024874815e-09, "logits/chosen": -3.625, "logits/rejected": -3.671875, "logps/chosen": -324.0, "logps/rejected": -520.0, "loss": 0.7378, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.09375, "rewards/margins": 2.0, "rewards/rejected": -4.09375, "step": 12280 }, { "epoch": 0.9439324116743472, "grad_norm": 19.264394441857792, "learning_rate": 4.7726743068552125e-09, "logits/chosen": -3.359375, "logits/rejected": -3.390625, "logps/chosen": -350.0, "logps/rejected": -524.0, "loss": 0.7198, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.171875, "rewards/margins": 1.7734375, "rewards/rejected": -3.9375, "step": 12290 }, { "epoch": 0.9447004608294931, "grad_norm": 17.873691137293907, "learning_rate": 4.643215219656937e-09, "logits/chosen": -3.59375, "logits/rejected": -3.53125, "logps/chosen": -356.0, "logps/rejected": -524.0, "loss": 0.7367, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.40625, "rewards/margins": 1.609375, "rewards/rejected": -4.0, "step": 12300 }, { "epoch": 0.945468509984639, "grad_norm": 20.86035725012575, "learning_rate": 4.515519693798542e-09, "logits/chosen": -3.609375, "logits/rejected": -3.578125, "logps/chosen": -290.0, "logps/rejected": -462.0, "loss": 0.7266, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9296875, "rewards/margins": 1.578125, "rewards/rejected": -3.515625, "step": 12310 }, { "epoch": 0.946236559139785, "grad_norm": 21.97542246286064, "learning_rate": 4.389588647122522e-09, "logits/chosen": -3.484375, "logits/rejected": -3.6875, "logps/chosen": -336.0, "logps/rejected": -492.0, "loss": 0.7702, "rewards/accuracies": 0.8125, "rewards/chosen": -2.046875, "rewards/margins": 1.7109375, "rewards/rejected": -3.765625, "step": 12320 }, { "epoch": 0.9470046082949308, "grad_norm": 23.7139102809588, "learning_rate": 4.265422984788797e-09, "logits/chosen": -3.5, "logits/rejected": -3.40625, "logps/chosen": -360.0, "logps/rejected": -536.0, "loss": 0.7297, "rewards/accuracies": 0.84375, "rewards/chosen": -2.390625, "rewards/margins": 1.78125, "rewards/rejected": -4.15625, "step": 12330 }, { "epoch": 0.9477726574500768, "grad_norm": 22.189209005271902, "learning_rate": 4.143023599268131e-09, "logits/chosen": -3.390625, "logits/rejected": -3.6875, "logps/chosen": -338.0, "logps/rejected": -482.0, "loss": 0.746, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.203125, "rewards/margins": 1.6484375, "rewards/rejected": -3.84375, "step": 12340 }, { "epoch": 0.9485407066052227, "grad_norm": 19.646348585069152, "learning_rate": 4.022391370335831e-09, "logits/chosen": -3.484375, "logits/rejected": -3.640625, "logps/chosen": -346.0, "logps/rejected": -506.0, "loss": 0.7734, "rewards/accuracies": 0.8125, "rewards/chosen": -2.265625, "rewards/margins": 1.6875, "rewards/rejected": -3.953125, "step": 12350 }, { "epoch": 0.9493087557603687, "grad_norm": 22.12064526825735, "learning_rate": 3.903527165065229e-09, "logits/chosen": -3.453125, "logits/rejected": -3.609375, "logps/chosen": -338.0, "logps/rejected": -512.0, "loss": 0.7339, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.078125, "rewards/margins": 1.8828125, "rewards/rejected": -3.953125, "step": 12360 }, { "epoch": 0.9500768049155146, "grad_norm": 27.486438778528083, "learning_rate": 3.786431837821569e-09, "logits/chosen": -3.265625, "logits/rejected": -3.53125, "logps/chosen": -352.0, "logps/rejected": -512.0, "loss": 0.7447, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.15625, "rewards/margins": 1.765625, "rewards/rejected": -3.921875, "step": 12370 }, { "epoch": 0.9508448540706606, "grad_norm": 23.19052201115476, "learning_rate": 3.6711062302559327e-09, "logits/chosen": -3.515625, "logits/rejected": -3.59375, "logps/chosen": -340.0, "logps/rejected": -524.0, "loss": 0.6723, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.140625, "rewards/margins": 1.921875, "rewards/rejected": -4.0625, "step": 12380 }, { "epoch": 0.9516129032258065, "grad_norm": 20.667115058045106, "learning_rate": 3.5575511712990504e-09, "logits/chosen": -3.53125, "logits/rejected": -3.609375, "logps/chosen": -320.0, "logps/rejected": -496.0, "loss": 0.7843, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.140625, "rewards/margins": 1.7421875, "rewards/rejected": -3.875, "step": 12390 }, { "epoch": 0.9523809523809523, "grad_norm": 17.742652762994872, "learning_rate": 3.4457674771554422e-09, "logits/chosen": -3.484375, "logits/rejected": -3.578125, "logps/chosen": -342.0, "logps/rejected": -486.0, "loss": 0.7204, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.09375, "rewards/margins": 1.6953125, "rewards/rejected": -3.78125, "step": 12400 }, { "epoch": 0.9531490015360983, "grad_norm": 20.956648426480537, "learning_rate": 3.3357559512974808e-09, "logits/chosen": -3.46875, "logits/rejected": -3.375, "logps/chosen": -354.0, "logps/rejected": -520.0, "loss": 0.73, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.171875, "rewards/margins": 1.7578125, "rewards/rejected": -3.9375, "step": 12410 }, { "epoch": 0.9539170506912442, "grad_norm": 23.803914572872984, "learning_rate": 3.2275173844596437e-09, "logits/chosen": -3.421875, "logits/rejected": -3.453125, "logps/chosen": -320.0, "logps/rejected": -480.0, "loss": 0.7407, "rewards/accuracies": 0.84375, "rewards/chosen": -2.015625, "rewards/margins": 1.703125, "rewards/rejected": -3.734375, "step": 12420 }, { "epoch": 0.9546850998463902, "grad_norm": 21.74377166719803, "learning_rate": 3.1210525546328803e-09, "logits/chosen": -3.4375, "logits/rejected": -3.53125, "logps/chosen": -360.0, "logps/rejected": -524.0, "loss": 0.7353, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.34375, "rewards/margins": 1.7578125, "rewards/rejected": -4.09375, "step": 12430 }, { "epoch": 0.9554531490015361, "grad_norm": 18.417471389289243, "learning_rate": 3.0163622270589493e-09, "logits/chosen": -3.4375, "logits/rejected": -3.515625, "logps/chosen": -364.0, "logps/rejected": -520.0, "loss": 0.7389, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.1875, "rewards/margins": 1.6953125, "rewards/rejected": -3.875, "step": 12440 }, { "epoch": 0.956221198156682, "grad_norm": 18.89534164976327, "learning_rate": 2.9134471542249516e-09, "logits/chosen": -3.359375, "logits/rejected": -3.5625, "logps/chosen": -328.0, "logps/rejected": -496.0, "loss": 0.7329, "rewards/accuracies": 0.84375, "rewards/chosen": -2.0625, "rewards/margins": 1.7890625, "rewards/rejected": -3.859375, "step": 12450 }, { "epoch": 0.956989247311828, "grad_norm": 19.509833327833874, "learning_rate": 2.8123080758579166e-09, "logits/chosen": -3.5, "logits/rejected": -3.703125, "logps/chosen": -338.0, "logps/rejected": -486.0, "loss": 0.725, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.046875, "rewards/margins": 1.84375, "rewards/rejected": -3.890625, "step": 12460 }, { "epoch": 0.9577572964669739, "grad_norm": 19.024456280209066, "learning_rate": 2.712945718919418e-09, "logits/chosen": -3.421875, "logits/rejected": -3.53125, "logps/chosen": -336.0, "logps/rejected": -516.0, "loss": 0.7429, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.140625, "rewards/margins": 1.90625, "rewards/rejected": -4.0625, "step": 12470 }, { "epoch": 0.9585253456221198, "grad_norm": 23.029353108930767, "learning_rate": 2.615360797600524e-09, "logits/chosen": -3.453125, "logits/rejected": -3.359375, "logps/chosen": -342.0, "logps/rejected": -496.0, "loss": 0.7273, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.1875, "rewards/margins": 1.5703125, "rewards/rejected": -3.75, "step": 12480 }, { "epoch": 0.9592933947772657, "grad_norm": 21.81403168128203, "learning_rate": 2.519554013316466e-09, "logits/chosen": -3.578125, "logits/rejected": -3.515625, "logps/chosen": -376.0, "logps/rejected": -552.0, "loss": 0.7256, "rewards/accuracies": 0.8125, "rewards/chosen": -2.34375, "rewards/margins": 1.84375, "rewards/rejected": -4.1875, "step": 12490 }, { "epoch": 0.9600614439324117, "grad_norm": 20.531804681911286, "learning_rate": 2.4255260547018096e-09, "logits/chosen": -3.4375, "logits/rejected": -3.5625, "logps/chosen": -350.0, "logps/rejected": -516.0, "loss": 0.729, "rewards/accuracies": 0.875, "rewards/chosen": -2.171875, "rewards/margins": 1.84375, "rewards/rejected": -4.03125, "step": 12500 }, { "epoch": 0.9608294930875576, "grad_norm": 19.35372079173808, "learning_rate": 2.3332775976051824e-09, "logits/chosen": -3.40625, "logits/rejected": -3.65625, "logps/chosen": -338.0, "logps/rejected": -472.0, "loss": 0.7184, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.1875, "rewards/margins": 1.5625, "rewards/rejected": -3.75, "step": 12510 }, { "epoch": 0.9615975422427036, "grad_norm": 24.902542592004412, "learning_rate": 2.2428093050847473e-09, "logits/chosen": -3.578125, "logits/rejected": -3.640625, "logps/chosen": -328.0, "logps/rejected": -490.0, "loss": 0.7198, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.15625, "rewards/margins": 1.6171875, "rewards/rejected": -3.78125, "step": 12520 }, { "epoch": 0.9623655913978495, "grad_norm": 21.677315846382687, "learning_rate": 2.1541218274033213e-09, "logits/chosen": -3.625, "logits/rejected": -3.53125, "logps/chosen": -306.0, "logps/rejected": -464.0, "loss": 0.7102, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0625, "rewards/margins": 1.671875, "rewards/rejected": -3.734375, "step": 12530 }, { "epoch": 0.9631336405529954, "grad_norm": 17.636333628673654, "learning_rate": 2.0672158020235974e-09, "logits/chosen": -3.375, "logits/rejected": -3.53125, "logps/chosen": -384.0, "logps/rejected": -524.0, "loss": 0.7187, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.40625, "rewards/margins": 1.609375, "rewards/rejected": -4.0, "step": 12540 }, { "epoch": 0.9639016897081413, "grad_norm": 20.70423703784766, "learning_rate": 1.982091853603707e-09, "logits/chosen": -3.328125, "logits/rejected": -3.296875, "logps/chosen": -360.0, "logps/rejected": -528.0, "loss": 0.7612, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.15625, "rewards/margins": 1.859375, "rewards/rejected": -4.0, "step": 12550 }, { "epoch": 0.9646697388632872, "grad_norm": 20.37084924895009, "learning_rate": 1.898750593992665e-09, "logits/chosen": -3.453125, "logits/rejected": -3.578125, "logps/chosen": -354.0, "logps/rejected": -486.0, "loss": 0.7466, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.3125, "rewards/margins": 1.390625, "rewards/rejected": -3.703125, "step": 12560 }, { "epoch": 0.9654377880184332, "grad_norm": 24.915397653095134, "learning_rate": 1.8171926222259049e-09, "logits/chosen": -3.5, "logits/rejected": -3.53125, "logps/chosen": -358.0, "logps/rejected": -498.0, "loss": 0.7621, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.296875, "rewards/margins": 1.4375, "rewards/rejected": -3.71875, "step": 12570 }, { "epoch": 0.9662058371735791, "grad_norm": 19.53825497631236, "learning_rate": 1.7374185245211947e-09, "logits/chosen": -3.484375, "logits/rejected": -3.75, "logps/chosen": -332.0, "logps/rejected": -478.0, "loss": 0.7969, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.140625, "rewards/margins": 1.6484375, "rewards/rejected": -3.78125, "step": 12580 }, { "epoch": 0.966973886328725, "grad_norm": 20.542391263260097, "learning_rate": 1.6594288742741436e-09, "logits/chosen": -3.546875, "logits/rejected": -3.453125, "logps/chosen": -344.0, "logps/rejected": -510.0, "loss": 0.7458, "rewards/accuracies": 0.8125, "rewards/chosen": -2.21875, "rewards/margins": 1.7265625, "rewards/rejected": -3.953125, "step": 12590 }, { "epoch": 0.967741935483871, "grad_norm": 21.384417316762224, "learning_rate": 1.5832242320543143e-09, "logits/chosen": -3.375, "logits/rejected": -3.53125, "logps/chosen": -316.0, "logps/rejected": -502.0, "loss": 0.7259, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9609375, "rewards/margins": 1.9140625, "rewards/rejected": -3.875, "step": 12600 }, { "epoch": 0.9685099846390169, "grad_norm": 20.786841404483198, "learning_rate": 1.5088051456009499e-09, "logits/chosen": -3.453125, "logits/rejected": -3.375, "logps/chosen": -378.0, "logps/rejected": -536.0, "loss": 0.7461, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5, "rewards/margins": 1.75, "rewards/rejected": -4.25, "step": 12610 }, { "epoch": 0.9692780337941628, "grad_norm": 19.5959025661614, "learning_rate": 1.436172149819309e-09, "logits/chosen": -3.671875, "logits/rejected": -3.4375, "logps/chosen": -326.0, "logps/rejected": -524.0, "loss": 0.7293, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.15625, "rewards/margins": 1.859375, "rewards/rejected": -4.03125, "step": 12620 }, { "epoch": 0.9700460829493087, "grad_norm": 20.685394205510708, "learning_rate": 1.3653257667766426e-09, "logits/chosen": -3.359375, "logits/rejected": -3.46875, "logps/chosen": -332.0, "logps/rejected": -504.0, "loss": 0.7397, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.171875, "rewards/margins": 1.78125, "rewards/rejected": -3.953125, "step": 12630 }, { "epoch": 0.9708141321044547, "grad_norm": 19.360263466758184, "learning_rate": 1.296266505698529e-09, "logits/chosen": -3.5, "logits/rejected": -3.53125, "logps/chosen": -344.0, "logps/rejected": -510.0, "loss": 0.7585, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.171875, "rewards/margins": 1.546875, "rewards/rejected": -3.71875, "step": 12640 }, { "epoch": 0.9715821812596006, "grad_norm": 20.61969175896087, "learning_rate": 1.2289948629650993e-09, "logits/chosen": -3.3125, "logits/rejected": -3.296875, "logps/chosen": -350.0, "logps/rejected": -494.0, "loss": 0.7676, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1875, "rewards/margins": 1.4765625, "rewards/rejected": -3.671875, "step": 12650 }, { "epoch": 0.9723502304147466, "grad_norm": 23.854942899445863, "learning_rate": 1.1635113221075966e-09, "logits/chosen": -3.46875, "logits/rejected": -3.609375, "logps/chosen": -322.0, "logps/rejected": -476.0, "loss": 0.7388, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.03125, "rewards/margins": 1.6953125, "rewards/rejected": -3.71875, "step": 12660 }, { "epoch": 0.9731182795698925, "grad_norm": 23.216317697849252, "learning_rate": 1.0998163538048223e-09, "logits/chosen": -3.34375, "logits/rejected": -3.453125, "logps/chosen": -336.0, "logps/rejected": -480.0, "loss": 0.7205, "rewards/accuracies": 0.875, "rewards/chosen": -2.09375, "rewards/margins": 1.6484375, "rewards/rejected": -3.734375, "step": 12670 }, { "epoch": 0.9738863287250384, "grad_norm": 20.195196479632315, "learning_rate": 1.0379104158798334e-09, "logits/chosen": -3.6875, "logits/rejected": -3.703125, "logps/chosen": -332.0, "logps/rejected": -506.0, "loss": 0.7615, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.875, "rewards/rejected": -3.984375, "step": 12680 }, { "epoch": 0.9746543778801844, "grad_norm": 18.709480662305147, "learning_rate": 9.77793953296502e-10, "logits/chosen": -3.390625, "logits/rejected": -3.625, "logps/chosen": -380.0, "logps/rejected": -540.0, "loss": 0.6658, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.359375, "rewards/margins": 1.7578125, "rewards/rejected": -4.125, "step": 12690 }, { "epoch": 0.9754224270353302, "grad_norm": 29.221214353575448, "learning_rate": 9.194673981565426e-10, "logits/chosen": -3.53125, "logits/rejected": -3.65625, "logps/chosen": -340.0, "logps/rejected": -516.0, "loss": 0.752, "rewards/accuracies": 0.84375, "rewards/chosen": -2.15625, "rewards/margins": 1.828125, "rewards/rejected": -3.984375, "step": 12700 }, { "epoch": 0.9761904761904762, "grad_norm": 26.324851582513894, "learning_rate": 8.629311696961295e-10, "logits/chosen": -3.265625, "logits/rejected": -3.3125, "logps/chosen": -366.0, "logps/rejected": -524.0, "loss": 0.7503, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.125, "rewards/margins": 1.7109375, "rewards/rejected": -3.828125, "step": 12710 }, { "epoch": 0.9769585253456221, "grad_norm": 22.402714784038324, "learning_rate": 8.081856742831461e-10, "logits/chosen": -3.53125, "logits/rejected": -3.71875, "logps/chosen": -356.0, "logps/rejected": -540.0, "loss": 0.7566, "rewards/accuracies": 0.875, "rewards/chosen": -2.265625, "rewards/margins": 1.90625, "rewards/rejected": -4.1875, "step": 12720 }, { "epoch": 0.977726574500768, "grad_norm": 23.646629482293235, "learning_rate": 7.552313054141058e-10, "logits/chosen": -3.59375, "logits/rejected": -3.671875, "logps/chosen": -356.0, "logps/rejected": -516.0, "loss": 0.7413, "rewards/accuracies": 0.84375, "rewards/chosen": -2.109375, "rewards/margins": 1.7421875, "rewards/rejected": -3.84375, "step": 12730 }, { "epoch": 0.978494623655914, "grad_norm": 23.319756291523078, "learning_rate": 7.040684437113197e-10, "logits/chosen": -3.46875, "logits/rejected": -3.515625, "logps/chosen": -352.0, "logps/rejected": -516.0, "loss": 0.7497, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.21875, "rewards/margins": 1.734375, "rewards/rejected": -3.953125, "step": 12740 }, { "epoch": 0.9792626728110599, "grad_norm": 20.179312265549807, "learning_rate": 6.546974569203445e-10, "logits/chosen": -3.5625, "logits/rejected": -3.34375, "logps/chosen": -322.0, "logps/rejected": -532.0, "loss": 0.7478, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.046875, "rewards/margins": 2.125, "rewards/rejected": -4.1875, "step": 12750 }, { "epoch": 0.9800307219662059, "grad_norm": 20.228878262508637, "learning_rate": 6.07118699907011e-10, "logits/chosen": -3.609375, "logits/rejected": -3.640625, "logps/chosen": -344.0, "logps/rejected": -492.0, "loss": 0.7846, "rewards/accuracies": 0.84375, "rewards/chosen": -2.28125, "rewards/margins": 1.609375, "rewards/rejected": -3.890625, "step": 12760 }, { "epoch": 0.9807987711213517, "grad_norm": 23.420679952610154, "learning_rate": 5.613325146552051e-10, "logits/chosen": -3.5625, "logits/rejected": -3.453125, "logps/chosen": -332.0, "logps/rejected": -510.0, "loss": 0.7433, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.046875, "rewards/margins": 1.875, "rewards/rejected": -3.90625, "step": 12770 }, { "epoch": 0.9815668202764977, "grad_norm": 23.052262572076103, "learning_rate": 5.173392302642299e-10, "logits/chosen": -3.46875, "logits/rejected": -3.5, "logps/chosen": -360.0, "logps/rejected": -510.0, "loss": 0.7822, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.234375, "rewards/margins": 1.6640625, "rewards/rejected": -3.90625, "step": 12780 }, { "epoch": 0.9823348694316436, "grad_norm": 23.69648460056025, "learning_rate": 4.751391629464752e-10, "logits/chosen": -3.515625, "logits/rejected": -3.65625, "logps/chosen": -346.0, "logps/rejected": -506.0, "loss": 0.7429, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1875, "rewards/margins": 1.8046875, "rewards/rejected": -4.0, "step": 12790 }, { "epoch": 0.9831029185867896, "grad_norm": 26.05608885544652, "learning_rate": 4.3473261602516854e-10, "logits/chosen": -3.546875, "logits/rejected": -3.671875, "logps/chosen": -336.0, "logps/rejected": -588.0, "loss": 0.7509, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.171875, "rewards/margins": 2.359375, "rewards/rejected": -4.53125, "step": 12800 }, { "epoch": 0.9838709677419355, "grad_norm": 20.111601006439326, "learning_rate": 3.9611987993212746e-10, "logits/chosen": -3.46875, "logits/rejected": -3.390625, "logps/chosen": -304.0, "logps/rejected": -490.0, "loss": 0.735, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9140625, "rewards/margins": 1.859375, "rewards/rejected": -3.765625, "step": 12810 }, { "epoch": 0.9846390168970814, "grad_norm": 22.941480849184153, "learning_rate": 3.593012322057609e-10, "logits/chosen": -3.53125, "logits/rejected": -3.515625, "logps/chosen": -344.0, "logps/rejected": -500.0, "loss": 0.7401, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.25, "rewards/margins": 1.5859375, "rewards/rejected": -3.828125, "step": 12820 }, { "epoch": 0.9854070660522274, "grad_norm": 20.941917758203793, "learning_rate": 3.242769374890153e-10, "logits/chosen": -3.5, "logits/rejected": -3.640625, "logps/chosen": -360.0, "logps/rejected": -536.0, "loss": 0.7446, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.390625, "rewards/margins": 1.8828125, "rewards/rejected": -4.28125, "step": 12830 }, { "epoch": 0.9861751152073732, "grad_norm": 19.678990786973262, "learning_rate": 2.910472475274872e-10, "logits/chosen": -3.40625, "logits/rejected": -3.453125, "logps/chosen": -330.0, "logps/rejected": -490.0, "loss": 0.778, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.03125, "rewards/margins": 1.6953125, "rewards/rejected": -3.71875, "step": 12840 }, { "epoch": 0.9869431643625192, "grad_norm": 17.13490226098189, "learning_rate": 2.5961240116764705e-10, "logits/chosen": -3.65625, "logits/rejected": -3.421875, "logps/chosen": -320.0, "logps/rejected": -494.0, "loss": 0.7298, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.0625, "rewards/margins": 1.59375, "rewards/rejected": -3.65625, "step": 12850 }, { "epoch": 0.9877112135176651, "grad_norm": 21.441176126264352, "learning_rate": 2.2997262435503483e-10, "logits/chosen": -3.40625, "logits/rejected": -3.421875, "logps/chosen": -336.0, "logps/rejected": -520.0, "loss": 0.739, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.078125, "rewards/margins": 1.875, "rewards/rejected": -3.953125, "step": 12860 }, { "epoch": 0.988479262672811, "grad_norm": 25.061356459937617, "learning_rate": 2.0212813013276153e-10, "logits/chosen": -3.53125, "logits/rejected": -3.609375, "logps/chosen": -320.0, "logps/rejected": -468.0, "loss": 0.7708, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.078125, "rewards/margins": 1.6953125, "rewards/rejected": -3.78125, "step": 12870 }, { "epoch": 0.989247311827957, "grad_norm": 22.55750776596521, "learning_rate": 1.7607911863987135e-10, "logits/chosen": -3.453125, "logits/rejected": -3.453125, "logps/chosen": -316.0, "logps/rejected": -488.0, "loss": 0.7443, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.8828125, "rewards/margins": 1.8828125, "rewards/rejected": -3.765625, "step": 12880 }, { "epoch": 0.9900153609831029, "grad_norm": 20.383562928986755, "learning_rate": 1.5182577710992627e-10, "logits/chosen": -3.5, "logits/rejected": -3.53125, "logps/chosen": -326.0, "logps/rejected": -544.0, "loss": 0.6874, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.0, "rewards/margins": 2.125, "rewards/rejected": -4.125, "step": 12890 }, { "epoch": 0.9907834101382489, "grad_norm": 21.17727483376081, "learning_rate": 1.2936827986972932e-10, "logits/chosen": -3.296875, "logits/rejected": -3.5, "logps/chosen": -368.0, "logps/rejected": -520.0, "loss": 0.7155, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.15625, "rewards/margins": 1.8125, "rewards/rejected": -3.96875, "step": 12900 }, { "epoch": 0.9915514592933948, "grad_norm": 21.115042650021365, "learning_rate": 1.0870678833796443e-10, "logits/chosen": -3.515625, "logits/rejected": -3.703125, "logps/chosen": -322.0, "logps/rejected": -488.0, "loss": 0.727, "rewards/accuracies": 0.84375, "rewards/chosen": -2.09375, "rewards/margins": 1.828125, "rewards/rejected": -3.921875, "step": 12910 }, { "epoch": 0.9923195084485407, "grad_norm": 28.409373389206866, "learning_rate": 8.984145102408636e-11, "logits/chosen": -3.390625, "logits/rejected": -3.453125, "logps/chosen": -340.0, "logps/rejected": -506.0, "loss": 0.7685, "rewards/accuracies": 0.8125, "rewards/chosen": -2.21875, "rewards/margins": 1.6875, "rewards/rejected": -3.90625, "step": 12920 }, { "epoch": 0.9930875576036866, "grad_norm": 21.03060270712551, "learning_rate": 7.277240352729363e-11, "logits/chosen": -3.515625, "logits/rejected": -3.546875, "logps/chosen": -368.0, "logps/rejected": -536.0, "loss": 0.7122, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.234375, "rewards/margins": 1.75, "rewards/rejected": -3.984375, "step": 12930 }, { "epoch": 0.9938556067588326, "grad_norm": 19.680208951624216, "learning_rate": 5.7499768535529405e-11, "logits/chosen": -3.453125, "logits/rejected": -3.609375, "logps/chosen": -324.0, "logps/rejected": -494.0, "loss": 0.7527, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.03125, "rewards/margins": 1.875, "rewards/rejected": -3.90625, "step": 12940 }, { "epoch": 0.9946236559139785, "grad_norm": 21.101140905906053, "learning_rate": 4.4023655824509954e-11, "logits/chosen": -3.5, "logits/rejected": -3.484375, "logps/chosen": -360.0, "logps/rejected": -506.0, "loss": 0.7558, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.3125, "rewards/margins": 1.5625, "rewards/rejected": -3.875, "step": 12950 }, { "epoch": 0.9953917050691244, "grad_norm": 21.513367429481175, "learning_rate": 3.234416225708636e-11, "logits/chosen": -3.421875, "logits/rejected": -3.5, "logps/chosen": -344.0, "logps/rejected": -504.0, "loss": 0.7316, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.203125, "rewards/margins": 1.7421875, "rewards/rejected": -3.953125, "step": 12960 }, { "epoch": 0.9961597542242704, "grad_norm": 21.766597717029086, "learning_rate": 2.2461371782467318e-11, "logits/chosen": -3.453125, "logits/rejected": -3.546875, "logps/chosen": -356.0, "logps/rejected": -528.0, "loss": 0.7049, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.203125, "rewards/margins": 1.8828125, "rewards/rejected": -4.09375, "step": 12970 }, { "epoch": 0.9969278033794163, "grad_norm": 21.208112460876507, "learning_rate": 1.4375355435580772e-11, "logits/chosen": -3.53125, "logits/rejected": -3.484375, "logps/chosen": -324.0, "logps/rejected": -506.0, "loss": 0.7228, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0625, "rewards/margins": 1.75, "rewards/rejected": -3.8125, "step": 12980 }, { "epoch": 0.9976958525345622, "grad_norm": 21.424214488615988, "learning_rate": 8.086171336602055e-12, "logits/chosen": -3.484375, "logits/rejected": -3.390625, "logps/chosen": -342.0, "logps/rejected": -502.0, "loss": 0.7729, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.25, "rewards/margins": 1.7265625, "rewards/rejected": -3.96875, "step": 12990 }, { "epoch": 0.9984639016897081, "grad_norm": 22.952096818909066, "learning_rate": 3.5938646906208402e-12, "logits/chosen": -3.5625, "logits/rejected": -3.453125, "logps/chosen": -328.0, "logps/rejected": -478.0, "loss": 0.7331, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.109375, "rewards/margins": 1.578125, "rewards/rejected": -3.6875, "step": 13000 }, { "epoch": 0.9984639016897081, "eval_logits/chosen": -3.4375, "eval_logits/rejected": -3.578125, "eval_logps/chosen": -380.0, "eval_logps/rejected": -492.0, "eval_loss": 0.45752930641174316, "eval_rewards/accuracies": 0.7602163553237915, "eval_rewards/chosen": -2.375, "eval_rewards/margins": 1.3984375, "eval_rewards/rejected": -3.78125, "eval_runtime": 2264.2061, "eval_samples_per_second": 41.134, "eval_steps_per_second": 0.643, "step": 13000 }, { "epoch": 0.999231950844854, "grad_norm": 22.243867813015733, "learning_rate": 8.984677871415325e-13, "logits/chosen": -3.359375, "logits/rejected": -3.28125, "logps/chosen": -344.0, "logps/rejected": -540.0, "loss": 0.748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.171875, "rewards/margins": 1.8515625, "rewards/rejected": -4.03125, "step": 13010 }, { "epoch": 1.0, "grad_norm": 19.88813474750179, "learning_rate": 0.0, "logits/chosen": -3.53125, "logits/rejected": -3.671875, "logps/chosen": -350.0, "logps/rejected": -516.0, "loss": 0.7221, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.203125, "rewards/margins": 1.78125, "rewards/rejected": -3.984375, "step": 13020 }, { "epoch": 1.0, "step": 13020, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 17.5662, "train_samples_per_second": 94871.579, "train_steps_per_second": 741.196 } ], "logging_steps": 10, "max_steps": 13020, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }