diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 6.179981970234677, + "learning_rate": 2.6737967914438506e-08, + "logits/chosen": -0.07354718446731567, + "logits/rejected": 0.1362501084804535, + "logps/chosen": -1.7156760692596436, + "logps/rejected": -1.8900222778320312, + "loss": 1.1357, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7156760692596436, + "rewards/margins": 0.17434628307819366, + "rewards/rejected": -1.8900222778320312, + "sft_loss": 1.468214750289917, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 10.909175691763762, + "learning_rate": 5.347593582887701e-08, + "logits/chosen": -0.002189463470131159, + "logits/rejected": 0.12079276889562607, + "logps/chosen": -1.7996399402618408, + "logps/rejected": -1.8447366952896118, + "loss": 1.2279, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.7996399402618408, + "rewards/margins": 0.04509688913822174, + "rewards/rejected": -1.8447366952896118, + "sft_loss": 1.507448673248291, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 13.61904960367422, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": -0.049028005450963974, + "logits/rejected": 0.049201685935258865, + "logps/chosen": -1.6352916955947876, + "logps/rejected": -1.76531982421875, + "loss": 1.2021, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.6352916955947876, + "rewards/margins": 0.13002797961235046, + "rewards/rejected": -1.76531982421875, + "sft_loss": 1.5003212690353394, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 5.956213295300661, + "learning_rate": 1.0695187165775402e-07, + "logits/chosen": -0.05677938461303711, + "logits/rejected": 0.029643535614013672, + "logps/chosen": -1.7252578735351562, + "logps/rejected": -1.805281639099121, + "loss": 1.2265, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7252578735351562, + "rewards/margins": 0.0800238624215126, + "rewards/rejected": -1.805281639099121, + "sft_loss": 1.5005255937576294, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 17.76695461787378, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.06971491873264313, + "logits/rejected": 0.015968088060617447, + "logps/chosen": -1.866454839706421, + "logps/rejected": -1.7767322063446045, + "loss": 1.3448, + "rewards/accuracies": 0.375, + "rewards/chosen": -1.866454839706421, + "rewards/margins": -0.0897226631641388, + "rewards/rejected": -1.7767322063446045, + "sft_loss": 1.5450823307037354, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 12.909544852261007, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": -0.10329775512218475, + "logits/rejected": -0.007711836602538824, + "logps/chosen": -1.9049831628799438, + "logps/rejected": -1.8297088146209717, + "loss": 1.3272, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -1.9049831628799438, + "rewards/margins": -0.0752745047211647, + "rewards/rejected": -1.8297088146209717, + "sft_loss": 1.6447674036026, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 11.82840491928946, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": -0.04202842339873314, + "logits/rejected": 0.12342722713947296, + "logps/chosen": -1.8364969491958618, + "logps/rejected": -1.9843240976333618, + "loss": 1.2642, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.8364969491958618, + "rewards/margins": 0.14782710373401642, + "rewards/rejected": -1.9843240976333618, + "sft_loss": 1.5586276054382324, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 10.875725329247906, + "learning_rate": 2.1390374331550805e-07, + "logits/chosen": 0.02725973166525364, + "logits/rejected": 0.20334240794181824, + "logps/chosen": -1.8649237155914307, + "logps/rejected": -1.7300840616226196, + "loss": 1.3038, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8649237155914307, + "rewards/margins": -0.1348399817943573, + "rewards/rejected": -1.7300840616226196, + "sft_loss": 1.5143520832061768, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 16.550871534498324, + "learning_rate": 2.4064171122994655e-07, + "logits/chosen": 0.031885191798210144, + "logits/rejected": 0.23356428742408752, + "logps/chosen": -1.8056827783584595, + "logps/rejected": -1.8406091928482056, + "loss": 1.268, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.8056827783584595, + "rewards/margins": 0.034926436841487885, + "rewards/rejected": -1.8406091928482056, + "sft_loss": 1.5227737426757812, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 13.08952392847594, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.0523727647960186, + "logits/rejected": 0.10018514096736908, + "logps/chosen": -1.8462203741073608, + "logps/rejected": -1.7395761013031006, + "loss": 1.3203, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.8462203741073608, + "rewards/margins": -0.10664422810077667, + "rewards/rejected": -1.7395761013031006, + "sft_loss": 1.5634331703186035, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 9.035596704121557, + "learning_rate": 2.9411764705882356e-07, + "logits/chosen": -0.09299639612436295, + "logits/rejected": 0.1341559737920761, + "logps/chosen": -1.7790874242782593, + "logps/rejected": -1.8184455633163452, + "loss": 1.2866, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7790874242782593, + "rewards/margins": 0.03935818746685982, + "rewards/rejected": -1.8184455633163452, + "sft_loss": 1.5595557689666748, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 8.403697285946595, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": -0.10554766654968262, + "logits/rejected": 0.08509379625320435, + "logps/chosen": -1.7057393789291382, + "logps/rejected": -1.8046671152114868, + "loss": 1.2018, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.7057393789291382, + "rewards/margins": 0.0989275649189949, + "rewards/rejected": -1.8046671152114868, + "sft_loss": 1.5179922580718994, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 6.4387299531804745, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": -0.03802342340350151, + "logits/rejected": 0.1122186928987503, + "logps/chosen": -1.5271183252334595, + "logps/rejected": -1.637882947921753, + "loss": 1.136, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5271183252334595, + "rewards/margins": 0.11076472699642181, + "rewards/rejected": -1.637882947921753, + "sft_loss": 1.426003336906433, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 13.955625675437666, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.08897098153829575, + "logits/rejected": 0.059094082564115524, + "logps/chosen": -1.6416652202606201, + "logps/rejected": -1.677323579788208, + "loss": 1.2583, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -1.6416652202606201, + "rewards/margins": 0.03565821796655655, + "rewards/rejected": -1.677323579788208, + "sft_loss": 1.5583785772323608, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 13.363727924448506, + "learning_rate": 4.0106951871657757e-07, + "logits/chosen": -0.08906193822622299, + "logits/rejected": 0.08749326318502426, + "logps/chosen": -1.6009581089019775, + "logps/rejected": -1.8365551233291626, + "loss": 1.1338, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6009581089019775, + "rewards/margins": 0.2355968952178955, + "rewards/rejected": -1.8365551233291626, + "sft_loss": 1.497521996498108, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 8.865202503300885, + "learning_rate": 4.278074866310161e-07, + "logits/chosen": -0.0019889636896550655, + "logits/rejected": 0.10428965091705322, + "logps/chosen": -1.5218764543533325, + "logps/rejected": -1.5615147352218628, + "loss": 1.1697, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5218764543533325, + "rewards/margins": 0.03963825851678848, + "rewards/rejected": -1.5615147352218628, + "sft_loss": 1.4340198040008545, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 6.571045133474957, + "learning_rate": 4.5454545454545457e-07, + "logits/chosen": -0.16559067368507385, + "logits/rejected": 0.0771712213754654, + "logps/chosen": -1.5611470937728882, + "logps/rejected": -1.6897590160369873, + "loss": 1.1362, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.5611470937728882, + "rewards/margins": 0.12861183285713196, + "rewards/rejected": -1.6897590160369873, + "sft_loss": 1.4134607315063477, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 11.981701286510356, + "learning_rate": 4.812834224598931e-07, + "logits/chosen": 0.03413419798016548, + "logits/rejected": -0.004057231359183788, + "logps/chosen": -1.436505913734436, + "logps/rejected": -1.5269982814788818, + "loss": 1.1281, + "rewards/accuracies": 0.46875, + "rewards/chosen": -1.436505913734436, + "rewards/margins": 0.09049233794212341, + "rewards/rejected": -1.5269982814788818, + "sft_loss": 1.3672358989715576, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 5.582546128335958, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.10642153024673462, + "logits/rejected": 0.03692169114947319, + "logps/chosen": -1.3211487531661987, + "logps/rejected": -1.5329933166503906, + "loss": 1.0679, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3211487531661987, + "rewards/margins": 0.21184447407722473, + "rewards/rejected": -1.5329933166503906, + "sft_loss": 1.3490239381790161, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 5.6861974383348155, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.10629527270793915, + "logits/rejected": -0.044294875115156174, + "logps/chosen": -1.3627598285675049, + "logps/rejected": -1.4516956806182861, + "loss": 1.1149, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3627598285675049, + "rewards/margins": 0.08893603086471558, + "rewards/rejected": -1.4516956806182861, + "sft_loss": 1.3573975563049316, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 4.705345534762539, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.0295580867677927, + "logits/rejected": -0.0050818738527596, + "logps/chosen": -1.3181979656219482, + "logps/rejected": -1.4915450811386108, + "loss": 1.059, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3181979656219482, + "rewards/margins": 0.17334721982479095, + "rewards/rejected": -1.4915450811386108, + "sft_loss": 1.2995798587799072, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 7.966487192278806, + "learning_rate": 5.882352941176471e-07, + "logits/chosen": -0.08093095570802689, + "logits/rejected": 0.013903314247727394, + "logps/chosen": -1.2849345207214355, + "logps/rejected": -1.348474383354187, + "loss": 1.104, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.2849345207214355, + "rewards/margins": 0.06353993713855743, + "rewards/rejected": -1.348474383354187, + "sft_loss": 1.2881290912628174, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 7.641023591192343, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": -0.047448135912418365, + "logits/rejected": 0.14301401376724243, + "logps/chosen": -1.323791742324829, + "logps/rejected": -1.5212907791137695, + "loss": 1.0755, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.323791742324829, + "rewards/margins": 0.19749906659126282, + "rewards/rejected": -1.5212907791137695, + "sft_loss": 1.3769772052764893, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 6.708387519634524, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.14979350566864014, + "logits/rejected": 0.002688088919967413, + "logps/chosen": -1.3275854587554932, + "logps/rejected": -1.408825159072876, + "loss": 1.1135, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3275854587554932, + "rewards/margins": 0.0812397450208664, + "rewards/rejected": -1.408825159072876, + "sft_loss": 1.3396663665771484, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 4.9165529393450536, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.15790846943855286, + "logits/rejected": -0.03613553196191788, + "logps/chosen": -1.3417673110961914, + "logps/rejected": -1.3494497537612915, + "loss": 1.1568, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.3417673110961914, + "rewards/margins": 0.007682465016841888, + "rewards/rejected": -1.3494497537612915, + "sft_loss": 1.3967574834823608, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 6.427234971687256, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": -0.04104025289416313, + "logits/rejected": 0.08000461757183075, + "logps/chosen": -1.376473069190979, + "logps/rejected": -1.488599181175232, + "loss": 1.1242, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.376473069190979, + "rewards/margins": 0.11212635040283203, + "rewards/rejected": -1.488599181175232, + "sft_loss": 1.444916844367981, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 8.054691538356435, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": -0.10705189406871796, + "logits/rejected": 0.0006207168335095048, + "logps/chosen": -1.4154436588287354, + "logps/rejected": -1.5005041360855103, + "loss": 1.1338, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.4154436588287354, + "rewards/margins": 0.0850602462887764, + "rewards/rejected": -1.5005041360855103, + "sft_loss": 1.3668407201766968, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 7.859083459156864, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.10198960453271866, + "logits/rejected": 0.04557307809591293, + "logps/chosen": -1.4215986728668213, + "logps/rejected": -1.4720596075057983, + "loss": 1.1592, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.4215986728668213, + "rewards/margins": 0.05046095699071884, + "rewards/rejected": -1.4720596075057983, + "sft_loss": 1.4362990856170654, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 8.85741719438616, + "learning_rate": 7.754010695187166e-07, + "logits/chosen": -0.061334170401096344, + "logits/rejected": 0.0797557458281517, + "logps/chosen": -1.3318393230438232, + "logps/rejected": -1.427393913269043, + "loss": 1.1288, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3318393230438232, + "rewards/margins": 0.09555456787347794, + "rewards/rejected": -1.427393913269043, + "sft_loss": 1.4047821760177612, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 8.202696544058703, + "learning_rate": 8.021390374331551e-07, + "logits/chosen": -0.10961981862783432, + "logits/rejected": 0.033692121505737305, + "logps/chosen": -1.2658131122589111, + "logps/rejected": -1.2866407632827759, + "loss": 1.116, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2658131122589111, + "rewards/margins": 0.02082763984799385, + "rewards/rejected": -1.2866407632827759, + "sft_loss": 1.2677156925201416, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 5.762281777012094, + "learning_rate": 8.288770053475937e-07, + "logits/chosen": -0.11363419145345688, + "logits/rejected": -0.06518497318029404, + "logps/chosen": -1.2847747802734375, + "logps/rejected": -1.4079326391220093, + "loss": 1.0887, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2847747802734375, + "rewards/margins": 0.12315795570611954, + "rewards/rejected": -1.4079326391220093, + "sft_loss": 1.3240041732788086, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 5.517470300559481, + "learning_rate": 8.556149732620322e-07, + "logits/chosen": -0.19057968258857727, + "logits/rejected": -0.059998493641614914, + "logps/chosen": -1.3801919221878052, + "logps/rejected": -1.3602113723754883, + "loss": 1.1731, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.3801919221878052, + "rewards/margins": -0.01998048648238182, + "rewards/rejected": -1.3602113723754883, + "sft_loss": 1.3693006038665771, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 7.395292190886674, + "learning_rate": 8.823529411764706e-07, + "logits/chosen": -0.09310317784547806, + "logits/rejected": 0.06683714687824249, + "logps/chosen": -1.2894189357757568, + "logps/rejected": -1.3747179508209229, + "loss": 1.1075, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.2894189357757568, + "rewards/margins": 0.08529897779226303, + "rewards/rejected": -1.3747179508209229, + "sft_loss": 1.288217306137085, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 6.2980422888580305, + "learning_rate": 9.090909090909091e-07, + "logits/chosen": -0.1289089173078537, + "logits/rejected": -0.08186782896518707, + "logps/chosen": -1.4113397598266602, + "logps/rejected": -1.489976406097412, + "loss": 1.1464, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.4113397598266602, + "rewards/margins": 0.0786367803812027, + "rewards/rejected": -1.489976406097412, + "sft_loss": 1.4135427474975586, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 7.44909076238542, + "learning_rate": 9.358288770053477e-07, + "logits/chosen": 0.010211547836661339, + "logits/rejected": 0.011260807514190674, + "logps/chosen": -1.3008840084075928, + "logps/rejected": -1.3928817510604858, + "loss": 1.1112, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3008840084075928, + "rewards/margins": 0.09199782460927963, + "rewards/rejected": -1.3928817510604858, + "sft_loss": 1.3410255908966064, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 6.526310508545539, + "learning_rate": 9.625668449197862e-07, + "logits/chosen": -0.024570604786276817, + "logits/rejected": -0.02446991577744484, + "logps/chosen": -1.3202580213546753, + "logps/rejected": -1.527092456817627, + "loss": 1.1042, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3202580213546753, + "rewards/margins": 0.20683428645133972, + "rewards/rejected": -1.527092456817627, + "sft_loss": 1.360899567604065, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 7.077223356554608, + "learning_rate": 9.893048128342246e-07, + "logits/chosen": -0.135633185505867, + "logits/rejected": -0.04922838136553764, + "logps/chosen": -1.3211921453475952, + "logps/rejected": -1.3769458532333374, + "loss": 1.132, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.3211921453475952, + "rewards/margins": 0.05575376749038696, + "rewards/rejected": -1.3769458532333374, + "sft_loss": 1.3447620868682861, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 7.121196316462924, + "learning_rate": 1.016042780748663e-06, + "logits/chosen": -0.07600800693035126, + "logits/rejected": 0.0359211228787899, + "logps/chosen": -1.2341994047164917, + "logps/rejected": -1.3743019104003906, + "loss": 1.0644, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.2341994047164917, + "rewards/margins": 0.14010249078273773, + "rewards/rejected": -1.3743019104003906, + "sft_loss": 1.2724642753601074, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 5.035842365815689, + "learning_rate": 1.0427807486631017e-06, + "logits/chosen": 0.018078740686178207, + "logits/rejected": 0.16420726478099823, + "logps/chosen": -1.229421854019165, + "logps/rejected": -1.388828992843628, + "loss": 1.052, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.229421854019165, + "rewards/margins": 0.15940706431865692, + "rewards/rejected": -1.388828992843628, + "sft_loss": 1.2792654037475586, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 15.548912349802093, + "learning_rate": 1.0695187165775401e-06, + "logits/chosen": -0.07511644065380096, + "logits/rejected": 0.052346598356962204, + "logps/chosen": -1.3431646823883057, + "logps/rejected": -1.3926780223846436, + "loss": 1.1349, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3431646823883057, + "rewards/margins": 0.04951336607336998, + "rewards/rejected": -1.3926780223846436, + "sft_loss": 1.3756563663482666, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 8.522924881240787, + "learning_rate": 1.0962566844919785e-06, + "logits/chosen": -0.06255607306957245, + "logits/rejected": 0.07017968595027924, + "logps/chosen": -1.2415074110031128, + "logps/rejected": -1.3377946615219116, + "loss": 1.0811, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2415074110031128, + "rewards/margins": 0.09628725051879883, + "rewards/rejected": -1.3377946615219116, + "sft_loss": 1.2635818719863892, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 7.031854777872283, + "learning_rate": 1.1229946524064172e-06, + "logits/chosen": -0.12123604863882065, + "logits/rejected": 0.0530204176902771, + "logps/chosen": -1.3264672756195068, + "logps/rejected": -1.447780966758728, + "loss": 1.1011, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3264672756195068, + "rewards/margins": 0.12131373584270477, + "rewards/rejected": -1.447780966758728, + "sft_loss": 1.3216352462768555, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 6.236274148718416, + "learning_rate": 1.1497326203208556e-06, + "logits/chosen": -0.15441572666168213, + "logits/rejected": 0.07962942123413086, + "logps/chosen": -1.3619040250778198, + "logps/rejected": -1.4223322868347168, + "loss": 1.1135, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3619040250778198, + "rewards/margins": 0.06042822077870369, + "rewards/rejected": -1.4223322868347168, + "sft_loss": 1.3458638191223145, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 12.60517336729982, + "learning_rate": 1.1764705882352942e-06, + "logits/chosen": 0.07616592943668365, + "logits/rejected": 0.1707703024148941, + "logps/chosen": -1.2778079509735107, + "logps/rejected": -1.4280025959014893, + "loss": 1.0803, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2778079509735107, + "rewards/margins": 0.1501947045326233, + "rewards/rejected": -1.4280025959014893, + "sft_loss": 1.3137943744659424, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 5.318042115628666, + "learning_rate": 1.2032085561497326e-06, + "logits/chosen": -0.0796518325805664, + "logits/rejected": 0.08061401546001434, + "logps/chosen": -1.2935796976089478, + "logps/rejected": -1.4250218868255615, + "loss": 1.068, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2935796976089478, + "rewards/margins": 0.13144224882125854, + "rewards/rejected": -1.4250218868255615, + "sft_loss": 1.304241418838501, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 5.299354677635938, + "learning_rate": 1.229946524064171e-06, + "logits/chosen": 0.005102366209030151, + "logits/rejected": 0.07590552419424057, + "logps/chosen": -1.2844891548156738, + "logps/rejected": -1.4591628313064575, + "loss": 1.0508, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2844891548156738, + "rewards/margins": 0.17467369139194489, + "rewards/rejected": -1.4591628313064575, + "sft_loss": 1.2599900960922241, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 6.761097593434722, + "learning_rate": 1.2566844919786097e-06, + "logits/chosen": 0.02427624724805355, + "logits/rejected": 0.1473507583141327, + "logps/chosen": -1.2645740509033203, + "logps/rejected": -1.4370596408843994, + "loss": 1.0477, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2645740509033203, + "rewards/margins": 0.17248567938804626, + "rewards/rejected": -1.4370596408843994, + "sft_loss": 1.2697627544403076, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 4.531213132964872, + "learning_rate": 1.2834224598930481e-06, + "logits/chosen": 0.005139252170920372, + "logits/rejected": 0.1295509934425354, + "logps/chosen": -1.2738149166107178, + "logps/rejected": -1.4708013534545898, + "loss": 1.0667, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2738149166107178, + "rewards/margins": 0.19698651134967804, + "rewards/rejected": -1.4708013534545898, + "sft_loss": 1.325798511505127, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 6.486508247077267, + "learning_rate": 1.3101604278074866e-06, + "logits/chosen": 0.03270702809095383, + "logits/rejected": 0.1433630883693695, + "logps/chosen": -1.3901463747024536, + "logps/rejected": -1.435687780380249, + "loss": 1.1513, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3901463747024536, + "rewards/margins": 0.045541636645793915, + "rewards/rejected": -1.435687780380249, + "sft_loss": 1.4123432636260986, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 7.36897429327943, + "learning_rate": 1.3368983957219252e-06, + "logits/chosen": -0.04421486333012581, + "logits/rejected": 0.11287758499383926, + "logps/chosen": -1.2664507627487183, + "logps/rejected": -1.3541992902755737, + "loss": 1.1097, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.2664507627487183, + "rewards/margins": 0.08774860203266144, + "rewards/rejected": -1.3541992902755737, + "sft_loss": 1.3012326955795288, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 6.40818524595936, + "learning_rate": 1.3636363636363636e-06, + "logits/chosen": -0.004007840063422918, + "logits/rejected": 0.13231723010540009, + "logps/chosen": -1.2495529651641846, + "logps/rejected": -1.3691155910491943, + "loss": 1.0523, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.2495529651641846, + "rewards/margins": 0.11956258118152618, + "rewards/rejected": -1.3691155910491943, + "sft_loss": 1.2333502769470215, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 4.889337882490812, + "learning_rate": 1.390374331550802e-06, + "logits/chosen": -0.2317703664302826, + "logits/rejected": -0.12833379209041595, + "logps/chosen": -1.3310482501983643, + "logps/rejected": -1.5018017292022705, + "loss": 1.0713, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3310482501983643, + "rewards/margins": 0.1707535982131958, + "rewards/rejected": -1.5018017292022705, + "sft_loss": 1.3612117767333984, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 7.23659379779864, + "learning_rate": 1.4171122994652407e-06, + "logits/chosen": -0.11722488701343536, + "logits/rejected": -0.03951631858944893, + "logps/chosen": -1.3124138116836548, + "logps/rejected": -1.504041075706482, + "loss": 1.0828, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3124138116836548, + "rewards/margins": 0.1916274130344391, + "rewards/rejected": -1.504041075706482, + "sft_loss": 1.3750369548797607, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 4.501741960021518, + "learning_rate": 1.443850267379679e-06, + "logits/chosen": -0.07672002166509628, + "logits/rejected": 0.042726390063762665, + "logps/chosen": -1.294614553451538, + "logps/rejected": -1.4077153205871582, + "loss": 1.0837, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.294614553451538, + "rewards/margins": 0.11310066282749176, + "rewards/rejected": -1.4077153205871582, + "sft_loss": 1.3220535516738892, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 5.164827833039285, + "learning_rate": 1.4705882352941175e-06, + "logits/chosen": -0.037459634244441986, + "logits/rejected": 0.05704887583851814, + "logps/chosen": -1.2455511093139648, + "logps/rejected": -1.4279637336730957, + "loss": 1.0358, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2455511093139648, + "rewards/margins": 0.18241265416145325, + "rewards/rejected": -1.4279637336730957, + "sft_loss": 1.2343533039093018, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 8.367094770624977, + "learning_rate": 1.4973262032085562e-06, + "logits/chosen": -0.08834396302700043, + "logits/rejected": 0.05895150825381279, + "logps/chosen": -1.295008659362793, + "logps/rejected": -1.4184646606445312, + "loss": 1.0715, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.295008659362793, + "rewards/margins": 0.12345600128173828, + "rewards/rejected": -1.4184646606445312, + "sft_loss": 1.2937798500061035, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 6.628270520705701, + "learning_rate": 1.5240641711229948e-06, + "logits/chosen": -0.05230358988046646, + "logits/rejected": 0.08594690263271332, + "logps/chosen": -1.3371986150741577, + "logps/rejected": -1.4414142370224, + "loss": 1.1202, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3371986150741577, + "rewards/margins": 0.10421568155288696, + "rewards/rejected": -1.4414142370224, + "sft_loss": 1.3829238414764404, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 6.450117105702508, + "learning_rate": 1.5508021390374332e-06, + "logits/chosen": -0.11853925883769989, + "logits/rejected": 0.16345106065273285, + "logps/chosen": -1.3565114736557007, + "logps/rejected": -1.495757818222046, + "loss": 1.0876, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3565114736557007, + "rewards/margins": 0.13924629986286163, + "rewards/rejected": -1.495757818222046, + "sft_loss": 1.3547443151474, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 6.722988345540176, + "learning_rate": 1.5775401069518718e-06, + "logits/chosen": -0.05183395743370056, + "logits/rejected": 0.0007757678395137191, + "logps/chosen": -1.2508453130722046, + "logps/rejected": -1.4139963388442993, + "loss": 1.0458, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2508453130722046, + "rewards/margins": 0.16315098106861115, + "rewards/rejected": -1.4139963388442993, + "sft_loss": 1.265608310699463, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 6.04567526816521, + "learning_rate": 1.6042780748663103e-06, + "logits/chosen": -0.08818133920431137, + "logits/rejected": 0.06808780133724213, + "logps/chosen": -1.2863214015960693, + "logps/rejected": -1.410569429397583, + "loss": 1.0916, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2863214015960693, + "rewards/margins": 0.12424807250499725, + "rewards/rejected": -1.410569429397583, + "sft_loss": 1.3566596508026123, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 5.0046549507985745, + "learning_rate": 1.6310160427807487e-06, + "logits/chosen": -0.028544578701257706, + "logits/rejected": 0.041024815291166306, + "logps/chosen": -1.397444486618042, + "logps/rejected": -1.4231725931167603, + "loss": 1.1662, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.397444486618042, + "rewards/margins": 0.02572813630104065, + "rewards/rejected": -1.4231725931167603, + "sft_loss": 1.4036086797714233, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 6.8799911344678675, + "learning_rate": 1.6577540106951873e-06, + "logits/chosen": -0.22853362560272217, + "logits/rejected": -0.1439324915409088, + "logps/chosen": -1.362104892730713, + "logps/rejected": -1.472100019454956, + "loss": 1.1235, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.362104892730713, + "rewards/margins": 0.10999520123004913, + "rewards/rejected": -1.472100019454956, + "sft_loss": 1.3633081912994385, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 7.173718775073233, + "learning_rate": 1.6844919786096258e-06, + "logits/chosen": -0.02005874551832676, + "logits/rejected": 0.13060171902179718, + "logps/chosen": -1.3620105981826782, + "logps/rejected": -1.5362706184387207, + "loss": 1.1099, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3620105981826782, + "rewards/margins": 0.17425988614559174, + "rewards/rejected": -1.5362706184387207, + "sft_loss": 1.3757470846176147, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 6.003421500347755, + "learning_rate": 1.7112299465240644e-06, + "logits/chosen": -0.09477636963129044, + "logits/rejected": 0.03201219066977501, + "logps/chosen": -1.3208723068237305, + "logps/rejected": -1.380985975265503, + "loss": 1.1145, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3208723068237305, + "rewards/margins": 0.06011378765106201, + "rewards/rejected": -1.380985975265503, + "sft_loss": 1.3422465324401855, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 5.681080389428846, + "learning_rate": 1.7379679144385028e-06, + "logits/chosen": -0.158976748585701, + "logits/rejected": -0.050321273505687714, + "logps/chosen": -1.3078538179397583, + "logps/rejected": -1.6020857095718384, + "loss": 1.0574, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3078538179397583, + "rewards/margins": 0.29423192143440247, + "rewards/rejected": -1.6020857095718384, + "sft_loss": 1.400783896446228, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 7.299626652835666, + "learning_rate": 1.7647058823529412e-06, + "logits/chosen": -0.05776820331811905, + "logits/rejected": 0.07368157058954239, + "logps/chosen": -1.3429508209228516, + "logps/rejected": -1.5490708351135254, + "loss": 1.0678, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3429508209228516, + "rewards/margins": 0.2061198502779007, + "rewards/rejected": -1.5490708351135254, + "sft_loss": 1.3533952236175537, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 8.171051466670482, + "learning_rate": 1.7914438502673799e-06, + "logits/chosen": -0.014549818821251392, + "logits/rejected": 0.07486838847398758, + "logps/chosen": -1.3574202060699463, + "logps/rejected": -1.4231749773025513, + "loss": 1.1095, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3574202060699463, + "rewards/margins": 0.06575469672679901, + "rewards/rejected": -1.4231749773025513, + "sft_loss": 1.3472013473510742, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 15.238594270083702, + "learning_rate": 1.8181818181818183e-06, + "logits/chosen": -0.0781969353556633, + "logits/rejected": 0.05372166633605957, + "logps/chosen": -1.4278004169464111, + "logps/rejected": -1.567711591720581, + "loss": 1.131, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4278004169464111, + "rewards/margins": 0.13991113007068634, + "rewards/rejected": -1.567711591720581, + "sft_loss": 1.407225489616394, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 6.818258628989823, + "learning_rate": 1.8449197860962567e-06, + "logits/chosen": -0.009267864748835564, + "logits/rejected": 0.019993681460618973, + "logps/chosen": -1.33181893825531, + "logps/rejected": -1.5204057693481445, + "loss": 1.0749, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.33181893825531, + "rewards/margins": 0.18858689069747925, + "rewards/rejected": -1.5204057693481445, + "sft_loss": 1.3686116933822632, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 6.747081545944619, + "learning_rate": 1.8716577540106954e-06, + "logits/chosen": -0.02299405448138714, + "logits/rejected": 0.0645705908536911, + "logps/chosen": -1.2961095571517944, + "logps/rejected": -1.446307897567749, + "loss": 1.0961, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2961095571517944, + "rewards/margins": 0.15019826591014862, + "rewards/rejected": -1.446307897567749, + "sft_loss": 1.3511130809783936, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 6.665538710803582, + "learning_rate": 1.8983957219251338e-06, + "logits/chosen": -0.092487633228302, + "logits/rejected": 0.1206965297460556, + "logps/chosen": -1.3767292499542236, + "logps/rejected": -1.4593746662139893, + "loss": 1.1374, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3767292499542236, + "rewards/margins": 0.0826452448964119, + "rewards/rejected": -1.4593746662139893, + "sft_loss": 1.4032217264175415, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 6.1643171333571, + "learning_rate": 1.9251336898395724e-06, + "logits/chosen": -0.12042136490345001, + "logits/rejected": -0.049273934215307236, + "logps/chosen": -1.314653992652893, + "logps/rejected": -1.4797592163085938, + "loss": 1.0733, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.314653992652893, + "rewards/margins": 0.16510513424873352, + "rewards/rejected": -1.4797592163085938, + "sft_loss": 1.2989557981491089, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 7.255399369010597, + "learning_rate": 1.951871657754011e-06, + "logits/chosen": 0.021989356726408005, + "logits/rejected": 0.09734012186527252, + "logps/chosen": -1.2959258556365967, + "logps/rejected": -1.4293915033340454, + "loss": 1.08, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2959258556365967, + "rewards/margins": 0.13346561789512634, + "rewards/rejected": -1.4293915033340454, + "sft_loss": 1.280139446258545, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 5.332067731065435, + "learning_rate": 1.9786096256684493e-06, + "logits/chosen": -0.028383517637848854, + "logits/rejected": 0.058793745934963226, + "logps/chosen": -1.282463788986206, + "logps/rejected": -1.3635833263397217, + "loss": 1.0841, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.282463788986206, + "rewards/margins": 0.08111962676048279, + "rewards/rejected": -1.3635833263397217, + "sft_loss": 1.2678134441375732, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 6.569832791382864, + "learning_rate": 2.0053475935828877e-06, + "logits/chosen": -0.07550617307424545, + "logits/rejected": 0.07492499053478241, + "logps/chosen": -1.2536544799804688, + "logps/rejected": -1.449466586112976, + "loss": 1.0603, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2536544799804688, + "rewards/margins": 0.19581225514411926, + "rewards/rejected": -1.449466586112976, + "sft_loss": 1.3272215127944946, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 5.57444327708168, + "learning_rate": 2.032085561497326e-06, + "logits/chosen": -0.05334942415356636, + "logits/rejected": 0.025158772245049477, + "logps/chosen": -1.2964035272598267, + "logps/rejected": -1.471839189529419, + "loss": 1.0621, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2964035272598267, + "rewards/margins": 0.17543578147888184, + "rewards/rejected": -1.471839189529419, + "sft_loss": 1.308597207069397, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 4.900223930336491, + "learning_rate": 2.058823529411765e-06, + "logits/chosen": -0.019271325320005417, + "logits/rejected": 0.05648297816514969, + "logps/chosen": -1.3950426578521729, + "logps/rejected": -1.4196631908416748, + "loss": 1.172, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3950426578521729, + "rewards/margins": 0.024620627984404564, + "rewards/rejected": -1.4196631908416748, + "sft_loss": 1.4064253568649292, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 7.6217444720228045, + "learning_rate": 2.0855614973262034e-06, + "logits/chosen": 0.040967244654893875, + "logits/rejected": 0.19986467063426971, + "logps/chosen": -1.3848092555999756, + "logps/rejected": -1.4894533157348633, + "loss": 1.1273, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.3848092555999756, + "rewards/margins": 0.10464394092559814, + "rewards/rejected": -1.4894533157348633, + "sft_loss": 1.3908588886260986, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 5.889142181465144, + "learning_rate": 2.112299465240642e-06, + "logits/chosen": -0.0939127653837204, + "logits/rejected": 0.05425548553466797, + "logps/chosen": -1.337721347808838, + "logps/rejected": -1.4008605480194092, + "loss": 1.1072, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.337721347808838, + "rewards/margins": 0.06313915550708771, + "rewards/rejected": -1.4008605480194092, + "sft_loss": 1.3472613096237183, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 5.840741949185277, + "learning_rate": 2.1390374331550802e-06, + "logits/chosen": 0.047120727598667145, + "logits/rejected": 0.13390924036502838, + "logps/chosen": -1.3149826526641846, + "logps/rejected": -1.4476349353790283, + "loss": 1.0742, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3149826526641846, + "rewards/margins": 0.13265222311019897, + "rewards/rejected": -1.4476349353790283, + "sft_loss": 1.3131649494171143, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.2665702998638153, + "eval_logits/rejected": 0.3536551594734192, + "eval_logps/chosen": -1.3461052179336548, + "eval_logps/rejected": -1.5309417247772217, + "eval_loss": 1.0823242664337158, + "eval_rewards/accuracies": 0.5778931975364685, + "eval_rewards/chosen": -1.3461052179336548, + "eval_rewards/margins": 0.18483661115169525, + "eval_rewards/rejected": -1.5309417247772217, + "eval_runtime": 49.4628, + "eval_samples_per_second": 27.192, + "eval_sft_loss": 1.3695601224899292, + "eval_steps_per_second": 6.813, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 7.697700600455325, + "learning_rate": 2.1657754010695186e-06, + "logits/chosen": -0.018059352412819862, + "logits/rejected": 0.07431250065565109, + "logps/chosen": -1.3391938209533691, + "logps/rejected": -1.477910041809082, + "loss": 1.0935, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3391938209533691, + "rewards/margins": 0.13871631026268005, + "rewards/rejected": -1.477910041809082, + "sft_loss": 1.3267302513122559, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 6.022975123352896, + "learning_rate": 2.192513368983957e-06, + "logits/chosen": -0.0013650401961058378, + "logits/rejected": 0.12200506776571274, + "logps/chosen": -1.2950294017791748, + "logps/rejected": -1.4449807405471802, + "loss": 1.081, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.2950294017791748, + "rewards/margins": 0.1499512493610382, + "rewards/rejected": -1.4449807405471802, + "sft_loss": 1.3392434120178223, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 4.99124806968459, + "learning_rate": 2.219251336898396e-06, + "logits/chosen": -0.009035291150212288, + "logits/rejected": 0.028119832277297974, + "logps/chosen": -1.3074760437011719, + "logps/rejected": -1.5078462362289429, + "loss": 1.0654, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3074760437011719, + "rewards/margins": 0.200370192527771, + "rewards/rejected": -1.5078462362289429, + "sft_loss": 1.312064528465271, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 5.36991559487342, + "learning_rate": 2.2459893048128343e-06, + "logits/chosen": -0.029295751824975014, + "logits/rejected": 0.15687605738639832, + "logps/chosen": -1.273036241531372, + "logps/rejected": -1.439571738243103, + "loss": 1.0761, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.273036241531372, + "rewards/margins": 0.16653569042682648, + "rewards/rejected": -1.439571738243103, + "sft_loss": 1.3252718448638916, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 6.070579691423694, + "learning_rate": 2.2727272727272728e-06, + "logits/chosen": -0.05621107667684555, + "logits/rejected": 0.13872070610523224, + "logps/chosen": -1.3344993591308594, + "logps/rejected": -1.5649782419204712, + "loss": 1.0722, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3344993591308594, + "rewards/margins": 0.23047864437103271, + "rewards/rejected": -1.5649782419204712, + "sft_loss": 1.4042408466339111, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 7.067992114737753, + "learning_rate": 2.299465240641711e-06, + "logits/chosen": -0.0927404910326004, + "logits/rejected": 0.10216756165027618, + "logps/chosen": -1.3587287664413452, + "logps/rejected": -1.5804948806762695, + "loss": 1.08, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3587287664413452, + "rewards/margins": 0.22176587581634521, + "rewards/rejected": -1.5804948806762695, + "sft_loss": 1.3935238122940063, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 8.14751313426182, + "learning_rate": 2.3262032085561496e-06, + "logits/chosen": -0.02563117817044258, + "logits/rejected": 0.05861321836709976, + "logps/chosen": -1.2418124675750732, + "logps/rejected": -1.4311327934265137, + "loss": 1.0511, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2418124675750732, + "rewards/margins": 0.18932026624679565, + "rewards/rejected": -1.4311327934265137, + "sft_loss": 1.3078510761260986, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 6.318596335672091, + "learning_rate": 2.3529411764705885e-06, + "logits/chosen": -0.010552659630775452, + "logits/rejected": 0.08236038684844971, + "logps/chosen": -1.3421692848205566, + "logps/rejected": -1.4967700242996216, + "loss": 1.0794, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3421692848205566, + "rewards/margins": 0.15460090339183807, + "rewards/rejected": -1.4967700242996216, + "sft_loss": 1.345083236694336, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 5.876139249639594, + "learning_rate": 2.379679144385027e-06, + "logits/chosen": -0.024869054555892944, + "logits/rejected": 0.09330997616052628, + "logps/chosen": -1.3537369966506958, + "logps/rejected": -1.5938447713851929, + "loss": 1.0821, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3537369966506958, + "rewards/margins": 0.2401077002286911, + "rewards/rejected": -1.5938447713851929, + "sft_loss": 1.3780953884124756, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 8.493429935039602, + "learning_rate": 2.4064171122994653e-06, + "logits/chosen": 0.026296118274331093, + "logits/rejected": 0.15405510365962982, + "logps/chosen": -1.351786494255066, + "logps/rejected": -1.5464465618133545, + "loss": 1.0431, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.351786494255066, + "rewards/margins": 0.19466015696525574, + "rewards/rejected": -1.5464465618133545, + "sft_loss": 1.3259353637695312, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 6.197921325965198, + "learning_rate": 2.4331550802139037e-06, + "logits/chosen": 0.012350971810519695, + "logits/rejected": 0.1070106253027916, + "logps/chosen": -1.2704529762268066, + "logps/rejected": -1.5693228244781494, + "loss": 1.0265, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2704529762268066, + "rewards/margins": 0.2988698482513428, + "rewards/rejected": -1.5693228244781494, + "sft_loss": 1.2969739437103271, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 6.124793908509888, + "learning_rate": 2.459893048128342e-06, + "logits/chosen": -0.11806248128414154, + "logits/rejected": 0.005137929227203131, + "logps/chosen": -1.403852105140686, + "logps/rejected": -1.512068510055542, + "loss": 1.1314, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.403852105140686, + "rewards/margins": 0.1082165464758873, + "rewards/rejected": -1.512068510055542, + "sft_loss": 1.4219634532928467, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 6.465729425548841, + "learning_rate": 2.4866310160427806e-06, + "logits/chosen": 0.16115108132362366, + "logits/rejected": 0.18483372032642365, + "logps/chosen": -1.3084897994995117, + "logps/rejected": -1.5060487985610962, + "loss": 1.0691, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3084897994995117, + "rewards/margins": 0.19755904376506805, + "rewards/rejected": -1.5060487985610962, + "sft_loss": 1.3134046792984009, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 6.861663361054339, + "learning_rate": 2.5133689839572194e-06, + "logits/chosen": 0.16848713159561157, + "logits/rejected": 0.12264200299978256, + "logps/chosen": -1.2748721837997437, + "logps/rejected": -1.502882957458496, + "loss": 1.0426, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2748721837997437, + "rewards/margins": 0.22801072895526886, + "rewards/rejected": -1.502882957458496, + "sft_loss": 1.303835153579712, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 5.417790338120884, + "learning_rate": 2.540106951871658e-06, + "logits/chosen": -0.02455167844891548, + "logits/rejected": 0.11675968021154404, + "logps/chosen": -1.3103151321411133, + "logps/rejected": -1.6406517028808594, + "loss": 1.0362, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3103151321411133, + "rewards/margins": 0.33033671975135803, + "rewards/rejected": -1.6406517028808594, + "sft_loss": 1.3476392030715942, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 6.187708853753433, + "learning_rate": 2.5668449197860963e-06, + "logits/chosen": -0.023285821080207825, + "logits/rejected": 0.1779555380344391, + "logps/chosen": -1.3003066778182983, + "logps/rejected": -1.4369903802871704, + "loss": 1.0748, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3003066778182983, + "rewards/margins": 0.13668350875377655, + "rewards/rejected": -1.4369903802871704, + "sft_loss": 1.315993070602417, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 6.949952876484532, + "learning_rate": 2.5935828877005347e-06, + "logits/chosen": 0.018455123528838158, + "logits/rejected": 0.061692021787166595, + "logps/chosen": -1.3897532224655151, + "logps/rejected": -1.5637627840042114, + "loss": 1.0926, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3897532224655151, + "rewards/margins": 0.1740095317363739, + "rewards/rejected": -1.5637627840042114, + "sft_loss": 1.3837809562683105, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 8.424582028696578, + "learning_rate": 2.620320855614973e-06, + "logits/chosen": 0.016538361087441444, + "logits/rejected": 0.08852274715900421, + "logps/chosen": -1.3464523553848267, + "logps/rejected": -1.488745093345642, + "loss": 1.1035, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3464523553848267, + "rewards/margins": 0.14229276776313782, + "rewards/rejected": -1.488745093345642, + "sft_loss": 1.3327219486236572, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 6.887461643958575, + "learning_rate": 2.647058823529412e-06, + "logits/chosen": -0.03392522409558296, + "logits/rejected": -0.011492741294205189, + "logps/chosen": -1.3476107120513916, + "logps/rejected": -1.490720272064209, + "loss": 1.1142, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3476107120513916, + "rewards/margins": 0.1431095004081726, + "rewards/rejected": -1.490720272064209, + "sft_loss": 1.41537344455719, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 5.493118625889312, + "learning_rate": 2.6737967914438504e-06, + "logits/chosen": -0.05503328517079353, + "logits/rejected": 0.04382333159446716, + "logps/chosen": -1.243357539176941, + "logps/rejected": -1.4519065618515015, + "loss": 1.0576, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.243357539176941, + "rewards/margins": 0.2085491418838501, + "rewards/rejected": -1.4519065618515015, + "sft_loss": 1.2865723371505737, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 7.436163935133731, + "learning_rate": 2.700534759358289e-06, + "logits/chosen": -0.06544395536184311, + "logits/rejected": 0.07414183020591736, + "logps/chosen": -1.3723136186599731, + "logps/rejected": -1.4571316242218018, + "loss": 1.1255, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3723136186599731, + "rewards/margins": 0.08481813967227936, + "rewards/rejected": -1.4571316242218018, + "sft_loss": 1.387662410736084, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 5.367166647355689, + "learning_rate": 2.7272727272727272e-06, + "logits/chosen": 0.09557502716779709, + "logits/rejected": 0.1608404666185379, + "logps/chosen": -1.3068989515304565, + "logps/rejected": -1.560426115989685, + "loss": 1.0323, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3068989515304565, + "rewards/margins": 0.25352734327316284, + "rewards/rejected": -1.560426115989685, + "sft_loss": 1.2871196269989014, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 4.8720988314465155, + "learning_rate": 2.7540106951871656e-06, + "logits/chosen": 0.06531797349452972, + "logits/rejected": 0.1588929146528244, + "logps/chosen": -1.2504026889801025, + "logps/rejected": -1.4396889209747314, + "loss": 1.0612, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2504026889801025, + "rewards/margins": 0.18928605318069458, + "rewards/rejected": -1.4396889209747314, + "sft_loss": 1.297341227531433, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 5.480711455776227, + "learning_rate": 2.780748663101604e-06, + "logits/chosen": -0.04449567198753357, + "logits/rejected": 0.10000330209732056, + "logps/chosen": -1.316209316253662, + "logps/rejected": -1.4778051376342773, + "loss": 1.1114, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.316209316253662, + "rewards/margins": 0.16159582138061523, + "rewards/rejected": -1.4778051376342773, + "sft_loss": 1.4180432558059692, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 11.096748515913996, + "learning_rate": 2.807486631016043e-06, + "logits/chosen": 0.13434985280036926, + "logits/rejected": 0.20518159866333008, + "logps/chosen": -1.3031270503997803, + "logps/rejected": -1.5300318002700806, + "loss": 1.0819, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3031270503997803, + "rewards/margins": 0.22690463066101074, + "rewards/rejected": -1.5300318002700806, + "sft_loss": 1.3854058980941772, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 4.995978658701775, + "learning_rate": 2.8342245989304813e-06, + "logits/chosen": 0.08955325186252594, + "logits/rejected": 0.17318478226661682, + "logps/chosen": -1.2554371356964111, + "logps/rejected": -1.4383143186569214, + "loss": 1.043, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2554371356964111, + "rewards/margins": 0.182877317070961, + "rewards/rejected": -1.4383143186569214, + "sft_loss": 1.2290910482406616, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 4.997013561916889, + "learning_rate": 2.8609625668449198e-06, + "logits/chosen": -0.07346369326114655, + "logits/rejected": 0.19550183415412903, + "logps/chosen": -1.2598817348480225, + "logps/rejected": -1.3762149810791016, + "loss": 1.0623, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.2598817348480225, + "rewards/margins": 0.11633334308862686, + "rewards/rejected": -1.3762149810791016, + "sft_loss": 1.2499682903289795, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 5.079883060726165, + "learning_rate": 2.887700534759358e-06, + "logits/chosen": 0.020293405279517174, + "logits/rejected": 0.09537501633167267, + "logps/chosen": -1.3982828855514526, + "logps/rejected": -1.543436050415039, + "loss": 1.1171, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3982828855514526, + "rewards/margins": 0.14515307545661926, + "rewards/rejected": -1.543436050415039, + "sft_loss": 1.4294731616973877, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 5.485233483620466, + "learning_rate": 2.9144385026737966e-06, + "logits/chosen": -0.1151435375213623, + "logits/rejected": 0.08566157519817352, + "logps/chosen": -1.3151395320892334, + "logps/rejected": -1.521807312965393, + "loss": 1.0565, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3151395320892334, + "rewards/margins": 0.2066677063703537, + "rewards/rejected": -1.521807312965393, + "sft_loss": 1.3310606479644775, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 5.4571151840945085, + "learning_rate": 2.941176470588235e-06, + "logits/chosen": -0.010904309339821339, + "logits/rejected": 0.054650772362947464, + "logps/chosen": -1.3527603149414062, + "logps/rejected": -1.5563316345214844, + "loss": 1.055, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3527603149414062, + "rewards/margins": 0.20357127487659454, + "rewards/rejected": -1.5563316345214844, + "sft_loss": 1.315617561340332, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 9.253522422552011, + "learning_rate": 2.967914438502674e-06, + "logits/chosen": -0.08174613863229752, + "logits/rejected": 0.04986164718866348, + "logps/chosen": -1.4038634300231934, + "logps/rejected": -1.5366909503936768, + "loss": 1.1312, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4038634300231934, + "rewards/margins": 0.13282766938209534, + "rewards/rejected": -1.5366909503936768, + "sft_loss": 1.402178168296814, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 4.965375183552373, + "learning_rate": 2.9946524064171123e-06, + "logits/chosen": 0.052502263337373734, + "logits/rejected": 0.06982048600912094, + "logps/chosen": -1.250318169593811, + "logps/rejected": -1.4745540618896484, + "loss": 1.0707, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.250318169593811, + "rewards/margins": 0.2242358922958374, + "rewards/rejected": -1.4745540618896484, + "sft_loss": 1.3915551900863647, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 4.351113289208154, + "learning_rate": 2.999995343036539e-06, + "logits/chosen": 0.03482988476753235, + "logits/rejected": 0.09091867506504059, + "logps/chosen": -1.328087568283081, + "logps/rejected": -1.521519422531128, + "loss": 1.0806, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.328087568283081, + "rewards/margins": 0.1934318095445633, + "rewards/rejected": -1.521519422531128, + "sft_loss": 1.3730275630950928, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 6.469054086250016, + "learning_rate": 2.9999764241720397e-06, + "logits/chosen": -0.03793569654226303, + "logits/rejected": 0.19039729237556458, + "logps/chosen": -1.3180876970291138, + "logps/rejected": -1.466033697128296, + "loss": 1.1138, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3180876970291138, + "rewards/margins": 0.14794600009918213, + "rewards/rejected": -1.466033697128296, + "sft_loss": 1.4070537090301514, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 5.026428226754346, + "learning_rate": 2.9999429525296936e-06, + "logits/chosen": -0.01073513738811016, + "logits/rejected": 0.049251411110162735, + "logps/chosen": -1.2521588802337646, + "logps/rejected": -1.4337341785430908, + "loss": 1.0529, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2521588802337646, + "rewards/margins": 0.18157517910003662, + "rewards/rejected": -1.4337341785430908, + "sft_loss": 1.27958083152771, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 4.571129756220789, + "learning_rate": 2.9998949284342434e-06, + "logits/chosen": -0.06468029320240021, + "logits/rejected": 0.08895576000213623, + "logps/chosen": -1.2564995288848877, + "logps/rejected": -1.5854532718658447, + "loss": 1.0069, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2564995288848877, + "rewards/margins": 0.32895392179489136, + "rewards/rejected": -1.5854532718658447, + "sft_loss": 1.2941913604736328, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 6.512823438890916, + "learning_rate": 2.99983235235162e-06, + "logits/chosen": -0.13302397727966309, + "logits/rejected": -0.03166192024946213, + "logps/chosen": -1.4814156293869019, + "logps/rejected": -1.6018117666244507, + "loss": 1.1668, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4814156293869019, + "rewards/margins": 0.12039615958929062, + "rewards/rejected": -1.6018117666244507, + "sft_loss": 1.4760197401046753, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 7.017628351517967, + "learning_rate": 2.999755224888935e-06, + "logits/chosen": -0.0977492704987526, + "logits/rejected": 0.015863103792071342, + "logps/chosen": -1.3981412649154663, + "logps/rejected": -1.4746181964874268, + "loss": 1.1421, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3981412649154663, + "rewards/margins": 0.07647692412137985, + "rewards/rejected": -1.4746181964874268, + "sft_loss": 1.4174917936325073, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 5.609570856110609, + "learning_rate": 2.9996635467944813e-06, + "logits/chosen": -0.032395754009485245, + "logits/rejected": 0.09316650032997131, + "logps/chosen": -1.2944605350494385, + "logps/rejected": -1.4776843786239624, + "loss": 1.0656, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.2944605350494385, + "rewards/margins": 0.18322405219078064, + "rewards/rejected": -1.4776843786239624, + "sft_loss": 1.3147443532943726, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 4.871214261969739, + "learning_rate": 2.999557318957719e-06, + "logits/chosen": -0.11450288444757462, + "logits/rejected": 0.02715505287051201, + "logps/chosen": -1.3029778003692627, + "logps/rejected": -1.3981813192367554, + "loss": 1.1033, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3029778003692627, + "rewards/margins": 0.09520343691110611, + "rewards/rejected": -1.3981813192367554, + "sft_loss": 1.3368868827819824, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 7.595463039232994, + "learning_rate": 2.9994365424092717e-06, + "logits/chosen": -0.15079785883426666, + "logits/rejected": -0.06511592119932175, + "logps/chosen": -1.382546067237854, + "logps/rejected": -1.610282301902771, + "loss": 1.094, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.382546067237854, + "rewards/margins": 0.22773627936840057, + "rewards/rejected": -1.610282301902771, + "sft_loss": 1.4123454093933105, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 12.103020392515973, + "learning_rate": 2.9993012183209135e-06, + "logits/chosen": -0.005148774944245815, + "logits/rejected": 0.14879630506038666, + "logps/chosen": -1.3624019622802734, + "logps/rejected": -1.522040843963623, + "loss": 1.1099, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3624019622802734, + "rewards/margins": 0.15963879227638245, + "rewards/rejected": -1.522040843963623, + "sft_loss": 1.362701177597046, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 6.393010819793559, + "learning_rate": 2.9991513480055592e-06, + "logits/chosen": -0.11131584644317627, + "logits/rejected": -0.004262803588062525, + "logps/chosen": -1.351694107055664, + "logps/rejected": -1.6122684478759766, + "loss": 1.0688, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.351694107055664, + "rewards/margins": 0.2605743110179901, + "rewards/rejected": -1.6122684478759766, + "sft_loss": 1.374330997467041, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 5.351828483818866, + "learning_rate": 2.998986932917252e-06, + "logits/chosen": 0.040585193783044815, + "logits/rejected": 0.10566142946481705, + "logps/chosen": -1.4071991443634033, + "logps/rejected": -1.579245924949646, + "loss": 1.1081, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4071991443634033, + "rewards/margins": 0.1720467507839203, + "rewards/rejected": -1.579245924949646, + "sft_loss": 1.405289888381958, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 6.112200213434097, + "learning_rate": 2.998807974651147e-06, + "logits/chosen": 0.009924227371811867, + "logits/rejected": 0.11972503364086151, + "logps/chosen": -1.3159351348876953, + "logps/rejected": -1.59053635597229, + "loss": 1.0489, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3159351348876953, + "rewards/margins": 0.27460095286369324, + "rewards/rejected": -1.59053635597229, + "sft_loss": 1.3495194911956787, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 14.44182735717814, + "learning_rate": 2.9986144749434987e-06, + "logits/chosen": -0.06408433616161346, + "logits/rejected": 0.04238145425915718, + "logps/chosen": -1.369593858718872, + "logps/rejected": -1.6089508533477783, + "loss": 1.0433, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.369593858718872, + "rewards/margins": 0.23935675621032715, + "rewards/rejected": -1.6089508533477783, + "sft_loss": 1.3346786499023438, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 6.096249152789308, + "learning_rate": 2.9984064356716413e-06, + "logits/chosen": -0.08199284970760345, + "logits/rejected": 0.1549263298511505, + "logps/chosen": -1.434735894203186, + "logps/rejected": -1.5937846899032593, + "loss": 1.1334, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.434735894203186, + "rewards/margins": 0.15904894471168518, + "rewards/rejected": -1.5937846899032593, + "sft_loss": 1.4382418394088745, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 9.288480097861305, + "learning_rate": 2.998183858853974e-06, + "logits/chosen": -0.17539802193641663, + "logits/rejected": 0.019877593964338303, + "logps/chosen": -1.3679062128067017, + "logps/rejected": -1.5570322275161743, + "loss": 1.1068, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3679062128067017, + "rewards/margins": 0.1891259253025055, + "rewards/rejected": -1.5570322275161743, + "sft_loss": 1.4440988302230835, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 6.120276977609556, + "learning_rate": 2.997946746649937e-06, + "logits/chosen": -0.17709307372570038, + "logits/rejected": -0.09089900553226471, + "logps/chosen": -1.2848079204559326, + "logps/rejected": -1.5811681747436523, + "loss": 1.0051, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2848079204559326, + "rewards/margins": 0.29636019468307495, + "rewards/rejected": -1.5811681747436523, + "sft_loss": 1.2749412059783936, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 8.751065928216027, + "learning_rate": 2.997695101359994e-06, + "logits/chosen": -0.12560425698757172, + "logits/rejected": 0.016330739483237267, + "logps/chosen": -1.4288761615753174, + "logps/rejected": -1.6702779531478882, + "loss": 1.0873, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4288761615753174, + "rewards/margins": 0.24140167236328125, + "rewards/rejected": -1.6702779531478882, + "sft_loss": 1.4446970224380493, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 7.261081306992401, + "learning_rate": 2.997428925425609e-06, + "logits/chosen": -0.03356175869703293, + "logits/rejected": -0.024207763373851776, + "logps/chosen": -1.3508546352386475, + "logps/rejected": -1.6246044635772705, + "loss": 1.076, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3508546352386475, + "rewards/margins": 0.2737496495246887, + "rewards/rejected": -1.6246044635772705, + "sft_loss": 1.3727697134017944, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 6.460018434735591, + "learning_rate": 2.997148221429223e-06, + "logits/chosen": -0.09538416564464569, + "logits/rejected": 0.02857859805226326, + "logps/chosen": -1.3230699300765991, + "logps/rejected": -1.446380615234375, + "loss": 1.103, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3230699300765991, + "rewards/margins": 0.12331060320138931, + "rewards/rejected": -1.446380615234375, + "sft_loss": 1.3620294332504272, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 4.745852200616508, + "learning_rate": 2.996852992094225e-06, + "logits/chosen": -0.1095251813530922, + "logits/rejected": 0.02362101897597313, + "logps/chosen": -1.2727789878845215, + "logps/rejected": -1.4820854663848877, + "loss": 1.063, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2727789878845215, + "rewards/margins": 0.20930643379688263, + "rewards/rejected": -1.4820854663848877, + "sft_loss": 1.3236764669418335, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 4.751418020702247, + "learning_rate": 2.9965432402849336e-06, + "logits/chosen": -0.10184980928897858, + "logits/rejected": 0.11156318336725235, + "logps/chosen": -1.302096962928772, + "logps/rejected": -1.4565203189849854, + "loss": 1.0951, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.302096962928772, + "rewards/margins": 0.154423326253891, + "rewards/rejected": -1.4565203189849854, + "sft_loss": 1.4031466245651245, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 5.594116499670167, + "learning_rate": 2.9962189690065614e-06, + "logits/chosen": -0.15128448605537415, + "logits/rejected": -0.07776673883199692, + "logps/chosen": -1.3057724237442017, + "logps/rejected": -1.6141293048858643, + "loss": 1.0376, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3057724237442017, + "rewards/margins": 0.3083568811416626, + "rewards/rejected": -1.6141293048858643, + "sft_loss": 1.3583399057388306, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 5.211897854634603, + "learning_rate": 2.99588018140519e-06, + "logits/chosen": -0.07172641158103943, + "logits/rejected": 0.09021764993667603, + "logps/chosen": -1.3617281913757324, + "logps/rejected": -1.54132080078125, + "loss": 1.1241, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.3617281913757324, + "rewards/margins": 0.17959263920783997, + "rewards/rejected": -1.54132080078125, + "sft_loss": 1.3477673530578613, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 15.889378620354021, + "learning_rate": 2.995526880767737e-06, + "logits/chosen": -0.11468782275915146, + "logits/rejected": 0.03871222585439682, + "logps/chosen": -1.3583406209945679, + "logps/rejected": -1.5525429248809814, + "loss": 1.0944, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3583406209945679, + "rewards/margins": 0.19420206546783447, + "rewards/rejected": -1.5525429248809814, + "sft_loss": 1.3450286388397217, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 6.6170605417450705, + "learning_rate": 2.9951590705219287e-06, + "logits/chosen": -0.1409483253955841, + "logits/rejected": -0.10424485057592392, + "logps/chosen": -1.3454536199569702, + "logps/rejected": -1.5386629104614258, + "loss": 1.1092, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3454536199569702, + "rewards/margins": 0.1932092308998108, + "rewards/rejected": -1.5386629104614258, + "sft_loss": 1.4022139310836792, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 5.267932830709378, + "learning_rate": 2.99477675423626e-06, + "logits/chosen": -0.1686527580022812, + "logits/rejected": -0.08229938894510269, + "logps/chosen": -1.252075433731079, + "logps/rejected": -1.4953110218048096, + "loss": 1.0305, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.252075433731079, + "rewards/margins": 0.2432354986667633, + "rewards/rejected": -1.4953110218048096, + "sft_loss": 1.2790673971176147, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 16.544037674536877, + "learning_rate": 2.994379935619966e-06, + "logits/chosen": -0.301688551902771, + "logits/rejected": -0.17635759711265564, + "logps/chosen": -1.4327962398529053, + "logps/rejected": -1.5365126132965088, + "loss": 1.1198, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4327962398529053, + "rewards/margins": 0.1037164181470871, + "rewards/rejected": -1.5365126132965088, + "sft_loss": 1.404392123222351, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 5.581296428398838, + "learning_rate": 2.9939686185229826e-06, + "logits/chosen": -0.26242876052856445, + "logits/rejected": -0.0957266241312027, + "logps/chosen": -1.3624424934387207, + "logps/rejected": -1.6302438974380493, + "loss": 1.0601, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3624424934387207, + "rewards/margins": 0.2678012251853943, + "rewards/rejected": -1.6302438974380493, + "sft_loss": 1.373811960220337, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 6.0713875990807, + "learning_rate": 2.9935428069359103e-06, + "logits/chosen": -0.16007235646247864, + "logits/rejected": -0.07162095606327057, + "logps/chosen": -1.3129180669784546, + "logps/rejected": -1.5489208698272705, + "loss": 1.0377, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3129180669784546, + "rewards/margins": 0.23600268363952637, + "rewards/rejected": -1.5489208698272705, + "sft_loss": 1.3114233016967773, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 6.726419075457986, + "learning_rate": 2.9931025049899744e-06, + "logits/chosen": -0.22567526996135712, + "logits/rejected": -0.07417559623718262, + "logps/chosen": -1.3759498596191406, + "logps/rejected": -1.5786223411560059, + "loss": 1.0625, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3759498596191406, + "rewards/margins": 0.20267245173454285, + "rewards/rejected": -1.5786223411560059, + "sft_loss": 1.3487598896026611, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 6.154001565581968, + "learning_rate": 2.9926477169569865e-06, + "logits/chosen": -0.1562787890434265, + "logits/rejected": 0.011226480826735497, + "logps/chosen": -1.475956678390503, + "logps/rejected": -1.6560490131378174, + "loss": 1.1454, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.475956678390503, + "rewards/margins": 0.18009230494499207, + "rewards/rejected": -1.6560490131378174, + "sft_loss": 1.4573657512664795, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 6.469702586737282, + "learning_rate": 2.9921784472493023e-06, + "logits/chosen": -0.2580137848854065, + "logits/rejected": -0.13596820831298828, + "logps/chosen": -1.2518982887268066, + "logps/rejected": -1.564278244972229, + "loss": 1.0125, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2518982887268066, + "rewards/margins": 0.3123798966407776, + "rewards/rejected": -1.564278244972229, + "sft_loss": 1.3133872747421265, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 5.696707257736793, + "learning_rate": 2.9916947004197784e-06, + "logits/chosen": -0.2973279356956482, + "logits/rejected": -0.14339035749435425, + "logps/chosen": -1.3605453968048096, + "logps/rejected": -1.5471065044403076, + "loss": 1.0808, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3605453968048096, + "rewards/margins": 0.18656139075756073, + "rewards/rejected": -1.5471065044403076, + "sft_loss": 1.3702105283737183, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 5.510506460182416, + "learning_rate": 2.9911964811617288e-06, + "logits/chosen": -0.2812764644622803, + "logits/rejected": -0.19277769327163696, + "logps/chosen": -1.3888843059539795, + "logps/rejected": -1.5353819131851196, + "loss": 1.1113, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3888843059539795, + "rewards/margins": 0.14649756252765656, + "rewards/rejected": -1.5353819131851196, + "sft_loss": 1.4330034255981445, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 6.748687357461805, + "learning_rate": 2.990683794308879e-06, + "logits/chosen": -0.2419544905424118, + "logits/rejected": -0.08293026685714722, + "logps/chosen": -1.46367347240448, + "logps/rejected": -1.6116256713867188, + "loss": 1.1492, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.46367347240448, + "rewards/margins": 0.1479521244764328, + "rewards/rejected": -1.6116256713867188, + "sft_loss": 1.4724233150482178, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 4.83181808694649, + "learning_rate": 2.990156644835318e-06, + "logits/chosen": -0.14327023923397064, + "logits/rejected": -0.08006924390792847, + "logps/chosen": -1.4113633632659912, + "logps/rejected": -1.674155831336975, + "loss": 1.1005, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4113633632659912, + "rewards/margins": 0.2627924978733063, + "rewards/rejected": -1.674155831336975, + "sft_loss": 1.3983561992645264, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 4.0797099615299, + "learning_rate": 2.989615037855454e-06, + "logits/chosen": -0.19163314998149872, + "logits/rejected": -0.045805417001247406, + "logps/chosen": -1.326471209526062, + "logps/rejected": -1.604789137840271, + "loss": 1.0458, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.326471209526062, + "rewards/margins": 0.2783178389072418, + "rewards/rejected": -1.604789137840271, + "sft_loss": 1.353487253189087, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 5.719836220088525, + "learning_rate": 2.98905897862396e-06, + "logits/chosen": -0.1349993646144867, + "logits/rejected": -0.021838178858160973, + "logps/chosen": -1.3588494062423706, + "logps/rejected": -1.4981552362442017, + "loss": 1.1149, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3588494062423706, + "rewards/margins": 0.13930585980415344, + "rewards/rejected": -1.4981552362442017, + "sft_loss": 1.3990111351013184, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 4.736699163937537, + "learning_rate": 2.9884884725357237e-06, + "logits/chosen": -0.22783195972442627, + "logits/rejected": -0.16126468777656555, + "logps/chosen": -1.3684322834014893, + "logps/rejected": -1.5770494937896729, + "loss": 1.0834, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3684322834014893, + "rewards/margins": 0.20861713588237762, + "rewards/rejected": -1.5770494937896729, + "sft_loss": 1.4068214893341064, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 5.749657872809939, + "learning_rate": 2.9879035251257994e-06, + "logits/chosen": -0.19161829352378845, + "logits/rejected": -0.1089678555727005, + "logps/chosen": -1.3445179462432861, + "logps/rejected": -1.4991188049316406, + "loss": 1.075, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3445179462432861, + "rewards/margins": 0.15460094809532166, + "rewards/rejected": -1.4991188049316406, + "sft_loss": 1.337806224822998, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 6.624603107735152, + "learning_rate": 2.9873041420693485e-06, + "logits/chosen": -0.08060692250728607, + "logits/rejected": 0.04494641348719597, + "logps/chosen": -1.3140990734100342, + "logps/rejected": -1.6481949090957642, + "loss": 1.0355, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3140990734100342, + "rewards/margins": 0.33409592509269714, + "rewards/rejected": -1.6481949090957642, + "sft_loss": 1.3196513652801514, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 6.240389256604791, + "learning_rate": 2.9866903291815874e-06, + "logits/chosen": -0.24704810976982117, + "logits/rejected": -0.08406410366296768, + "logps/chosen": -1.3743051290512085, + "logps/rejected": -1.620736837387085, + "loss": 1.0634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3743051290512085, + "rewards/margins": 0.24643178284168243, + "rewards/rejected": -1.620736837387085, + "sft_loss": 1.3263603448867798, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 4.075127017126476, + "learning_rate": 2.986062092417733e-06, + "logits/chosen": -0.29847627878189087, + "logits/rejected": -0.15218086540699005, + "logps/chosen": -1.3139820098876953, + "logps/rejected": -1.5687217712402344, + "loss": 1.0493, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3139820098876953, + "rewards/margins": 0.254739910364151, + "rewards/rejected": -1.5687217712402344, + "sft_loss": 1.3553965091705322, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 5.403533929290899, + "learning_rate": 2.9854194378729402e-06, + "logits/chosen": -0.1648341715335846, + "logits/rejected": -0.03919944912195206, + "logps/chosen": -1.320778250694275, + "logps/rejected": -1.6576478481292725, + "loss": 1.0246, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.320778250694275, + "rewards/margins": 0.3368696868419647, + "rewards/rejected": -1.6576478481292725, + "sft_loss": 1.328687310218811, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 5.640372007436496, + "learning_rate": 2.984762371782246e-06, + "logits/chosen": -0.22557714581489563, + "logits/rejected": -0.09820972383022308, + "logps/chosen": -1.3596031665802002, + "logps/rejected": -1.6158663034439087, + "loss": 1.0578, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3596031665802002, + "rewards/margins": 0.2562631368637085, + "rewards/rejected": -1.6158663034439087, + "sft_loss": 1.3415796756744385, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 5.045707472029078, + "learning_rate": 2.9840909005205093e-06, + "logits/chosen": -0.21985206007957458, + "logits/rejected": -0.0283858273178339, + "logps/chosen": -1.3438150882720947, + "logps/rejected": -1.74857497215271, + "loss": 1.0471, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3438150882720947, + "rewards/margins": 0.4047599732875824, + "rewards/rejected": -1.74857497215271, + "sft_loss": 1.3679378032684326, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 6.148936941607535, + "learning_rate": 2.9834050306023467e-06, + "logits/chosen": -0.18061117827892303, + "logits/rejected": -0.09824011474847794, + "logps/chosen": -1.3768579959869385, + "logps/rejected": -1.6590843200683594, + "loss": 1.0513, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3768579959869385, + "rewards/margins": 0.28222647309303284, + "rewards/rejected": -1.6590843200683594, + "sft_loss": 1.3391609191894531, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.17743077874183655, + "eval_logits/rejected": 0.2683868110179901, + "eval_logps/chosen": -1.4223155975341797, + "eval_logps/rejected": -1.7731083631515503, + "eval_loss": 1.0583148002624512, + "eval_rewards/accuracies": 0.6038575768470764, + "eval_rewards/chosen": -1.4223155975341797, + "eval_rewards/margins": 0.35079291462898254, + "eval_rewards/rejected": -1.7731083631515503, + "eval_runtime": 48.9553, + "eval_samples_per_second": 27.474, + "eval_sft_loss": 1.4055895805358887, + "eval_steps_per_second": 6.884, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 8.529398356886302, + "learning_rate": 2.9827047686820714e-06, + "logits/chosen": -0.21581999957561493, + "logits/rejected": -0.050138603895902634, + "logps/chosen": -1.3935011625289917, + "logps/rejected": -1.8084440231323242, + "loss": 1.0447, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3935011625289917, + "rewards/margins": 0.41494283080101013, + "rewards/rejected": -1.8084440231323242, + "sft_loss": 1.4068191051483154, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 6.776422278764382, + "learning_rate": 2.981990121553627e-06, + "logits/chosen": -0.10206502676010132, + "logits/rejected": -0.02784993313252926, + "logps/chosen": -1.3431257009506226, + "logps/rejected": -1.7025333642959595, + "loss": 1.0352, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3431257009506226, + "rewards/margins": 0.35940781235694885, + "rewards/rejected": -1.7025333642959595, + "sft_loss": 1.349726676940918, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 9.001427842804857, + "learning_rate": 2.9812610961505237e-06, + "logits/chosen": -0.10953329503536224, + "logits/rejected": 0.024411043152213097, + "logps/chosen": -1.3474403619766235, + "logps/rejected": -1.7991764545440674, + "loss": 1.0292, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3474403619766235, + "rewards/margins": 0.45173612236976624, + "rewards/rejected": -1.7991764545440674, + "sft_loss": 1.387012243270874, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 7.739663573480024, + "learning_rate": 2.980517699545769e-06, + "logits/chosen": -0.08769674599170685, + "logits/rejected": -0.04560214281082153, + "logps/chosen": -1.3872299194335938, + "logps/rejected": -1.7312628030776978, + "loss": 1.0722, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3872299194335938, + "rewards/margins": 0.34403276443481445, + "rewards/rejected": -1.7312628030776978, + "sft_loss": 1.3998725414276123, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 6.01912269730338, + "learning_rate": 2.9797599389518003e-06, + "logits/chosen": -0.14980466663837433, + "logits/rejected": -0.01854553446173668, + "logps/chosen": -1.2706798315048218, + "logps/rejected": -1.5689154863357544, + "loss": 1.064, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2706798315048218, + "rewards/margins": 0.29823535680770874, + "rewards/rejected": -1.5689154863357544, + "sft_loss": 1.3684272766113281, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 9.561618737261167, + "learning_rate": 2.9789878217204138e-06, + "logits/chosen": -0.043389469385147095, + "logits/rejected": 0.12749220430850983, + "logps/chosen": -1.3664634227752686, + "logps/rejected": -1.5824370384216309, + "loss": 1.0751, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3664634227752686, + "rewards/margins": 0.21597354114055634, + "rewards/rejected": -1.5824370384216309, + "sft_loss": 1.3450241088867188, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 6.2442349386788125, + "learning_rate": 2.9782013553426944e-06, + "logits/chosen": -0.09964965283870697, + "logits/rejected": 0.03761683404445648, + "logps/chosen": -1.302367091178894, + "logps/rejected": -1.6130015850067139, + "loss": 1.0555, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.302367091178894, + "rewards/margins": 0.31063464283943176, + "rewards/rejected": -1.6130015850067139, + "sft_loss": 1.3702852725982666, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 7.289038637772866, + "learning_rate": 2.977400547448942e-06, + "logits/chosen": -0.0977490097284317, + "logits/rejected": 0.06860803067684174, + "logps/chosen": -1.3717734813690186, + "logps/rejected": -1.6933990716934204, + "loss": 1.077, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3717734813690186, + "rewards/margins": 0.3216255307197571, + "rewards/rejected": -1.6933990716934204, + "sft_loss": 1.409711480140686, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 4.052377699298299, + "learning_rate": 2.976585405808599e-06, + "logits/chosen": -0.06485694646835327, + "logits/rejected": 0.009555049240589142, + "logps/chosen": -1.3541524410247803, + "logps/rejected": -1.6053316593170166, + "loss": 1.0928, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3541524410247803, + "rewards/margins": 0.2511790692806244, + "rewards/rejected": -1.6053316593170166, + "sft_loss": 1.405263900756836, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 7.249188220946621, + "learning_rate": 2.9757559383301726e-06, + "logits/chosen": -0.10097716003656387, + "logits/rejected": -0.024070020765066147, + "logps/chosen": -1.398870587348938, + "logps/rejected": -1.6465431451797485, + "loss": 1.0542, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.398870587348938, + "rewards/margins": 0.24767239391803741, + "rewards/rejected": -1.6465431451797485, + "sft_loss": 1.3686355352401733, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 9.224325589545659, + "learning_rate": 2.9749121530611605e-06, + "logits/chosen": -0.13083770871162415, + "logits/rejected": 0.026032855734229088, + "logps/chosen": -1.4113142490386963, + "logps/rejected": -1.7621266841888428, + "loss": 1.0958, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4113142490386963, + "rewards/margins": 0.3508125841617584, + "rewards/rejected": -1.7621266841888428, + "sft_loss": 1.3893181085586548, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 5.128594397616122, + "learning_rate": 2.97405405818797e-06, + "logits/chosen": -0.19242218136787415, + "logits/rejected": -0.022298630326986313, + "logps/chosen": -1.4043214321136475, + "logps/rejected": -1.729936957359314, + "loss": 1.0579, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4043214321136475, + "rewards/margins": 0.32561546564102173, + "rewards/rejected": -1.729936957359314, + "sft_loss": 1.4137206077575684, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 21.840546326122123, + "learning_rate": 2.9731816620358426e-06, + "logits/chosen": -0.10733000189065933, + "logits/rejected": 0.005105187650769949, + "logps/chosen": -1.3446229696273804, + "logps/rejected": -1.6905028820037842, + "loss": 1.0707, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3446229696273804, + "rewards/margins": 0.3458799123764038, + "rewards/rejected": -1.6905028820037842, + "sft_loss": 1.3272907733917236, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 4.725786898369011, + "learning_rate": 2.9722949730687687e-06, + "logits/chosen": -0.22538034617900848, + "logits/rejected": 0.0403381884098053, + "logps/chosen": -1.3605945110321045, + "logps/rejected": -1.602994680404663, + "loss": 1.1037, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3605945110321045, + "rewards/margins": 0.24240007996559143, + "rewards/rejected": -1.602994680404663, + "sft_loss": 1.4284287691116333, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 6.615571050104055, + "learning_rate": 2.9713939998894087e-06, + "logits/chosen": -0.1339145302772522, + "logits/rejected": -0.061110563576221466, + "logps/chosen": -1.3974251747131348, + "logps/rejected": -1.5865617990493774, + "loss": 1.1303, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3974251747131348, + "rewards/margins": 0.18913669884204865, + "rewards/rejected": -1.5865617990493774, + "sft_loss": 1.3784010410308838, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 5.315430604545462, + "learning_rate": 2.970478751239009e-06, + "logits/chosen": -0.1227155476808548, + "logits/rejected": 0.04234758019447327, + "logps/chosen": -1.4257047176361084, + "logps/rejected": -1.658288598060608, + "loss": 1.072, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4257047176361084, + "rewards/margins": 0.23258371651172638, + "rewards/rejected": -1.658288598060608, + "sft_loss": 1.3530858755111694, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 6.141860733800608, + "learning_rate": 2.9695492359973153e-06, + "logits/chosen": -0.18545794486999512, + "logits/rejected": -0.10255476087331772, + "logps/chosen": -1.3349025249481201, + "logps/rejected": -1.6219761371612549, + "loss": 1.0275, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3349025249481201, + "rewards/margins": 0.2870733141899109, + "rewards/rejected": -1.6219761371612549, + "sft_loss": 1.3557451963424683, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 4.785365239112048, + "learning_rate": 2.9686054631824884e-06, + "logits/chosen": -0.3034934103488922, + "logits/rejected": -0.17880980670452118, + "logps/chosen": -1.3840830326080322, + "logps/rejected": -1.6017013788223267, + "loss": 1.0908, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3840830326080322, + "rewards/margins": 0.21761831641197205, + "rewards/rejected": -1.6017013788223267, + "sft_loss": 1.4323046207427979, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 4.901710099079587, + "learning_rate": 2.9676474419510175e-06, + "logits/chosen": -0.08637824654579163, + "logits/rejected": 0.02645046077668667, + "logps/chosen": -1.264495849609375, + "logps/rejected": -1.4407931566238403, + "loss": 1.058, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.264495849609375, + "rewards/margins": 0.17629732191562653, + "rewards/rejected": -1.4407931566238403, + "sft_loss": 1.3106262683868408, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 5.044502800081685, + "learning_rate": 2.966675181597627e-06, + "logits/chosen": -0.22000615298748016, + "logits/rejected": -0.15294332802295685, + "logps/chosen": -1.2635812759399414, + "logps/rejected": -1.5805613994598389, + "loss": 1.0245, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2635812759399414, + "rewards/margins": 0.3169800937175751, + "rewards/rejected": -1.5805613994598389, + "sft_loss": 1.3097639083862305, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 5.220203526780124, + "learning_rate": 2.965688691555193e-06, + "logits/chosen": -0.1628127098083496, + "logits/rejected": 0.016382919624447823, + "logps/chosen": -1.3505499362945557, + "logps/rejected": -1.6661789417266846, + "loss": 1.0762, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3505499362945557, + "rewards/margins": 0.3156289756298065, + "rewards/rejected": -1.6661789417266846, + "sft_loss": 1.4187357425689697, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 4.052014511902803, + "learning_rate": 2.964687981394644e-06, + "logits/chosen": -0.21893298625946045, + "logits/rejected": -0.11495008319616318, + "logps/chosen": -1.3846489191055298, + "logps/rejected": -1.548292875289917, + "loss": 1.1082, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3846489191055298, + "rewards/margins": 0.16364414989948273, + "rewards/rejected": -1.548292875289917, + "sft_loss": 1.3700069189071655, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 5.4281835015917155, + "learning_rate": 2.963673060824877e-06, + "logits/chosen": -0.21939484775066376, + "logits/rejected": -0.04351986199617386, + "logps/chosen": -1.3538082838058472, + "logps/rejected": -1.595873236656189, + "loss": 1.0634, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3538082838058472, + "rewards/margins": 0.24206483364105225, + "rewards/rejected": -1.595873236656189, + "sft_loss": 1.3432961702346802, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 5.4281666785371865, + "learning_rate": 2.9626439396926536e-06, + "logits/chosen": -0.10007022321224213, + "logits/rejected": 0.029562795534729958, + "logps/chosen": -1.2651135921478271, + "logps/rejected": -1.620726227760315, + "loss": 1.0511, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2651135921478271, + "rewards/margins": 0.35561278462409973, + "rewards/rejected": -1.620726227760315, + "sft_loss": 1.3307000398635864, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 6.509746165340049, + "learning_rate": 2.9616006279825125e-06, + "logits/chosen": -0.2518424689769745, + "logits/rejected": -0.09067486226558685, + "logps/chosen": -1.4045971632003784, + "logps/rejected": -1.63739013671875, + "loss": 1.0792, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4045971632003784, + "rewards/margins": 0.23279277980327606, + "rewards/rejected": -1.63739013671875, + "sft_loss": 1.3810770511627197, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 8.013188896663015, + "learning_rate": 2.9605431358166687e-06, + "logits/chosen": -0.24829821288585663, + "logits/rejected": -0.13628308475017548, + "logps/chosen": -1.3312690258026123, + "logps/rejected": -1.717877745628357, + "loss": 1.0382, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3312690258026123, + "rewards/margins": 0.38660869002342224, + "rewards/rejected": -1.717877745628357, + "sft_loss": 1.3500382900238037, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 5.294809343057805, + "learning_rate": 2.959471473454915e-06, + "logits/chosen": -0.16869133710861206, + "logits/rejected": -0.12402909994125366, + "logps/chosen": -1.326379656791687, + "logps/rejected": -1.610535979270935, + "loss": 1.0652, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.326379656791687, + "rewards/margins": 0.28415626287460327, + "rewards/rejected": -1.610535979270935, + "sft_loss": 1.3545112609863281, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 7.067492338783618, + "learning_rate": 2.9583856512945257e-06, + "logits/chosen": -0.2058066874742508, + "logits/rejected": -0.09429244697093964, + "logps/chosen": -1.3667848110198975, + "logps/rejected": -1.6059554815292358, + "loss": 1.0784, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3667848110198975, + "rewards/margins": 0.23917081952095032, + "rewards/rejected": -1.6059554815292358, + "sft_loss": 1.3848192691802979, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 7.861411871397975, + "learning_rate": 2.957285679870151e-06, + "logits/chosen": -0.23982122540473938, + "logits/rejected": -0.10797332227230072, + "logps/chosen": -1.3651232719421387, + "logps/rejected": -1.7063182592391968, + "loss": 1.017, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3651232719421387, + "rewards/margins": 0.3411949872970581, + "rewards/rejected": -1.7063182592391968, + "sft_loss": 1.3415919542312622, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 4.724553950873356, + "learning_rate": 2.9561715698537184e-06, + "logits/chosen": -0.2262788712978363, + "logits/rejected": -0.05607231333851814, + "logps/chosen": -1.4432750940322876, + "logps/rejected": -1.6429545879364014, + "loss": 1.1349, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4432750940322876, + "rewards/margins": 0.1996796727180481, + "rewards/rejected": -1.6429545879364014, + "sft_loss": 1.4282090663909912, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 5.446634366003163, + "learning_rate": 2.955043332054329e-06, + "logits/chosen": -0.15960340201854706, + "logits/rejected": 0.07019929587841034, + "logps/chosen": -1.4225099086761475, + "logps/rejected": -1.663869857788086, + "loss": 1.1221, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4225099086761475, + "rewards/margins": 0.24135980010032654, + "rewards/rejected": -1.663869857788086, + "sft_loss": 1.474645972251892, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 5.827944497702404, + "learning_rate": 2.95390097741815e-06, + "logits/chosen": -0.18014737963676453, + "logits/rejected": -0.029367715120315552, + "logps/chosen": -1.3832664489746094, + "logps/rejected": -1.5638530254364014, + "loss": 1.1006, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3832664489746094, + "rewards/margins": 0.1805865466594696, + "rewards/rejected": -1.5638530254364014, + "sft_loss": 1.3904502391815186, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 6.324392438431142, + "learning_rate": 2.952744517028312e-06, + "logits/chosen": -0.08480402082204819, + "logits/rejected": -0.08991466462612152, + "logps/chosen": -1.4066896438598633, + "logps/rejected": -1.710906982421875, + "loss": 1.0802, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4066896438598633, + "rewards/margins": 0.3042174279689789, + "rewards/rejected": -1.710906982421875, + "sft_loss": 1.4253562688827515, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 4.919009362060755, + "learning_rate": 2.951573962104798e-06, + "logits/chosen": -0.07150016725063324, + "logits/rejected": -0.06851210445165634, + "logps/chosen": -1.288236379623413, + "logps/rejected": -1.532168984413147, + "loss": 1.0465, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.288236379623413, + "rewards/margins": 0.24393276870250702, + "rewards/rejected": -1.532168984413147, + "sft_loss": 1.2897863388061523, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 5.147263046472833, + "learning_rate": 2.950389324004337e-06, + "logits/chosen": -0.2402597963809967, + "logits/rejected": -0.05485190078616142, + "logps/chosen": -1.3670083284378052, + "logps/rejected": -1.5608259439468384, + "loss": 1.0778, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3670083284378052, + "rewards/margins": 0.19381758570671082, + "rewards/rejected": -1.5608259439468384, + "sft_loss": 1.4064772129058838, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 8.649844009938525, + "learning_rate": 2.949190614220294e-06, + "logits/chosen": -0.2297898232936859, + "logits/rejected": -0.028187647461891174, + "logps/chosen": -1.4226592779159546, + "logps/rejected": -1.664825677871704, + "loss": 1.0959, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4226592779159546, + "rewards/margins": 0.2421664446592331, + "rewards/rejected": -1.664825677871704, + "sft_loss": 1.4162763357162476, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 6.148260341059483, + "learning_rate": 2.9479778443825553e-06, + "logits/chosen": -0.1363251805305481, + "logits/rejected": 0.060792457312345505, + "logps/chosen": -1.3869860172271729, + "logps/rejected": -1.6044954061508179, + "loss": 1.0984, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3869860172271729, + "rewards/margins": 0.21750938892364502, + "rewards/rejected": -1.6044954061508179, + "sft_loss": 1.4573136568069458, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 4.843430622851707, + "learning_rate": 2.9467510262574204e-06, + "logits/chosen": -0.0660191997885704, + "logits/rejected": -0.03288702294230461, + "logps/chosen": -1.2430740594863892, + "logps/rejected": -1.5976035594940186, + "loss": 0.9959, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2430740594863892, + "rewards/margins": 0.3545294404029846, + "rewards/rejected": -1.5976035594940186, + "sft_loss": 1.3150891065597534, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 7.473104680416714, + "learning_rate": 2.9455101717474834e-06, + "logits/chosen": -0.06151014566421509, + "logits/rejected": 0.01840631663799286, + "logps/chosen": -1.3655961751937866, + "logps/rejected": -1.5589427947998047, + "loss": 1.1384, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.3655961751937866, + "rewards/margins": 0.19334658980369568, + "rewards/rejected": -1.5589427947998047, + "sft_loss": 1.4315303564071655, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 5.904135631586209, + "learning_rate": 2.9442552928915203e-06, + "logits/chosen": -0.04618370905518532, + "logits/rejected": 0.0838208794593811, + "logps/chosen": -1.3669238090515137, + "logps/rejected": -1.6801611185073853, + "loss": 1.0887, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3669238090515137, + "rewards/margins": 0.3132372200489044, + "rewards/rejected": -1.6801611185073853, + "sft_loss": 1.396959900856018, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 6.878902971181851, + "learning_rate": 2.942986401864371e-06, + "logits/chosen": -0.09690725803375244, + "logits/rejected": 0.07714128494262695, + "logps/chosen": -1.4062764644622803, + "logps/rejected": -1.6796951293945312, + "loss": 1.1054, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4062764644622803, + "rewards/margins": 0.2734185457229614, + "rewards/rejected": -1.6796951293945312, + "sft_loss": 1.445420265197754, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 6.274384213051333, + "learning_rate": 2.9417035109768225e-06, + "logits/chosen": -0.1101980209350586, + "logits/rejected": 0.08307985216379166, + "logps/chosen": -1.2378486394882202, + "logps/rejected": -1.6158936023712158, + "loss": 1.0127, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.2378486394882202, + "rewards/margins": 0.3780447542667389, + "rewards/rejected": -1.6158936023712158, + "sft_loss": 1.2778068780899048, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 5.674580639618625, + "learning_rate": 2.9404066326754874e-06, + "logits/chosen": -0.12508752942085266, + "logits/rejected": 0.05659140273928642, + "logps/chosen": -1.3017909526824951, + "logps/rejected": -1.5709102153778076, + "loss": 1.0562, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3017909526824951, + "rewards/margins": 0.2691193222999573, + "rewards/rejected": -1.5709102153778076, + "sft_loss": 1.3575425148010254, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 6.827706232097836, + "learning_rate": 2.9390957795426847e-06, + "logits/chosen": -0.11372099071741104, + "logits/rejected": 0.03485158085823059, + "logps/chosen": -1.365986943244934, + "logps/rejected": -1.6966253519058228, + "loss": 1.0481, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.365986943244934, + "rewards/margins": 0.33063825964927673, + "rewards/rejected": -1.6966253519058228, + "sft_loss": 1.4129970073699951, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 6.158787026981841, + "learning_rate": 2.9377709642963177e-06, + "logits/chosen": -0.15712231397628784, + "logits/rejected": -0.03692762926220894, + "logps/chosen": -1.3038196563720703, + "logps/rejected": -1.7374274730682373, + "loss": 1.0094, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3038196563720703, + "rewards/margins": 0.43360796570777893, + "rewards/rejected": -1.7374274730682373, + "sft_loss": 1.3382656574249268, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 5.364208643132879, + "learning_rate": 2.9364321997897485e-06, + "logits/chosen": -0.14669297635555267, + "logits/rejected": -0.059150196611881256, + "logps/chosen": -1.4097700119018555, + "logps/rejected": -1.7058780193328857, + "loss": 1.0981, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4097700119018555, + "rewards/margins": 0.2961081862449646, + "rewards/rejected": -1.7058780193328857, + "sft_loss": 1.441806435585022, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 6.597472204582022, + "learning_rate": 2.935079499011677e-06, + "logits/chosen": -0.16338512301445007, + "logits/rejected": -0.04339775815606117, + "logps/chosen": -1.431691288948059, + "logps/rejected": -1.5908737182617188, + "loss": 1.1178, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.431691288948059, + "rewards/margins": 0.1591825932264328, + "rewards/rejected": -1.5908737182617188, + "sft_loss": 1.4271605014801025, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 8.155965406182842, + "learning_rate": 2.9337128750860126e-06, + "logits/chosen": -0.10452456772327423, + "logits/rejected": 0.04732733964920044, + "logps/chosen": -1.3343483209609985, + "logps/rejected": -1.635978102684021, + "loss": 1.068, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3343483209609985, + "rewards/margins": 0.3016297221183777, + "rewards/rejected": -1.635978102684021, + "sft_loss": 1.3880765438079834, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 4.07354796456379, + "learning_rate": 2.932332341271746e-06, + "logits/chosen": -0.1672249734401703, + "logits/rejected": -0.03276212140917778, + "logps/chosen": -1.306597113609314, + "logps/rejected": -1.6181869506835938, + "loss": 1.0614, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.306597113609314, + "rewards/margins": 0.3115897476673126, + "rewards/rejected": -1.6181869506835938, + "sft_loss": 1.3978981971740723, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 5.813364166003015, + "learning_rate": 2.930937910962822e-06, + "logits/chosen": -0.19471415877342224, + "logits/rejected": -0.09367385506629944, + "logps/chosen": -1.37113618850708, + "logps/rejected": -1.7180440425872803, + "loss": 1.0714, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.37113618850708, + "rewards/margins": 0.34690791368484497, + "rewards/rejected": -1.7180440425872803, + "sft_loss": 1.4200074672698975, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 7.793661568914185, + "learning_rate": 2.9295295976880107e-06, + "logits/chosen": -0.1325044184923172, + "logits/rejected": -0.058493874967098236, + "logps/chosen": -1.397251009941101, + "logps/rejected": -1.69857656955719, + "loss": 1.0641, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.397251009941101, + "rewards/margins": 0.3013255000114441, + "rewards/rejected": -1.69857656955719, + "sft_loss": 1.4198894500732422, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 7.6046364035152365, + "learning_rate": 2.9281074151107727e-06, + "logits/chosen": -0.12792737782001495, + "logits/rejected": 0.04613568261265755, + "logps/chosen": -1.4523102045059204, + "logps/rejected": -1.7039387226104736, + "loss": 1.0922, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4523102045059204, + "rewards/margins": 0.2516286075115204, + "rewards/rejected": -1.7039387226104736, + "sft_loss": 1.4452135562896729, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 4.692723677939538, + "learning_rate": 2.926671377029129e-06, + "logits/chosen": -0.12723150849342346, + "logits/rejected": -0.001778355217538774, + "logps/chosen": -1.3645659685134888, + "logps/rejected": -1.7801551818847656, + "loss": 1.0433, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3645659685134888, + "rewards/margins": 0.41558918356895447, + "rewards/rejected": -1.7801551818847656, + "sft_loss": 1.4643785953521729, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 5.786753397518219, + "learning_rate": 2.9252214973755294e-06, + "logits/chosen": -0.25807952880859375, + "logits/rejected": -0.01430868823081255, + "logps/chosen": -1.399417519569397, + "logps/rejected": -1.7631251811981201, + "loss": 1.0388, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.399417519569397, + "rewards/margins": 0.36370766162872314, + "rewards/rejected": -1.7631251811981201, + "sft_loss": 1.4122426509857178, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 4.920914855578094, + "learning_rate": 2.923757790216711e-06, + "logits/chosen": -0.1378275752067566, + "logits/rejected": 6.657242920482531e-05, + "logps/chosen": -1.3422327041625977, + "logps/rejected": -1.7522796392440796, + "loss": 1.0385, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3422327041625977, + "rewards/margins": 0.41004714369773865, + "rewards/rejected": -1.7522796392440796, + "sft_loss": 1.3954424858093262, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 6.888088507193936, + "learning_rate": 2.922280269753568e-06, + "logits/chosen": -0.19910190999507904, + "logits/rejected": -0.09001894295215607, + "logps/chosen": -1.4434906244277954, + "logps/rejected": -1.6869752407073975, + "loss": 1.1025, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4434906244277954, + "rewards/margins": 0.24348478019237518, + "rewards/rejected": -1.6869752407073975, + "sft_loss": 1.4737919569015503, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 8.187201984819211, + "learning_rate": 2.9207889503210094e-06, + "logits/chosen": -0.08224952965974808, + "logits/rejected": 0.09409169852733612, + "logps/chosen": -1.3726158142089844, + "logps/rejected": -1.4843034744262695, + "loss": 1.1351, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3726158142089844, + "rewards/margins": 0.11168781667947769, + "rewards/rejected": -1.4843034744262695, + "sft_loss": 1.3914871215820312, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 5.930900410234613, + "learning_rate": 2.9192838463878236e-06, + "logits/chosen": -0.09559588134288788, + "logits/rejected": 0.014999288134276867, + "logps/chosen": -1.3439271450042725, + "logps/rejected": -1.467355728149414, + "loss": 1.1066, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3439271450042725, + "rewards/margins": 0.12342876195907593, + "rewards/rejected": -1.467355728149414, + "sft_loss": 1.340254783630371, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 5.875408013228155, + "learning_rate": 2.917764972556535e-06, + "logits/chosen": -0.21042868494987488, + "logits/rejected": -0.06721127033233643, + "logps/chosen": -1.313696026802063, + "logps/rejected": -1.569189190864563, + "loss": 1.0512, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.313696026802063, + "rewards/margins": 0.2554931044578552, + "rewards/rejected": -1.569189190864563, + "sft_loss": 1.3528454303741455, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 6.238798929829675, + "learning_rate": 2.9162323435632657e-06, + "logits/chosen": -0.09522127360105515, + "logits/rejected": 0.020591190084815025, + "logps/chosen": -1.2226125001907349, + "logps/rejected": -1.6992833614349365, + "loss": 0.9786, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2226125001907349, + "rewards/margins": 0.4766710698604584, + "rewards/rejected": -1.6992833614349365, + "sft_loss": 1.265278935432434, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 5.288216235681103, + "learning_rate": 2.914685974277587e-06, + "logits/chosen": -0.16337811946868896, + "logits/rejected": -0.08522491157054901, + "logps/chosen": -1.3429590463638306, + "logps/rejected": -1.5685818195343018, + "loss": 1.0812, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3429590463638306, + "rewards/margins": 0.2256227433681488, + "rewards/rejected": -1.5685818195343018, + "sft_loss": 1.33394455909729, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 6.9294869208445, + "learning_rate": 2.9131258797023814e-06, + "logits/chosen": -0.1615566909313202, + "logits/rejected": -0.027435744181275368, + "logps/chosen": -1.305321455001831, + "logps/rejected": -1.5158964395523071, + "loss": 1.0534, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.305321455001831, + "rewards/margins": 0.21057498455047607, + "rewards/rejected": -1.5158964395523071, + "sft_loss": 1.3161168098449707, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 5.726396935468426, + "learning_rate": 2.9115520749736934e-06, + "logits/chosen": -0.05702148750424385, + "logits/rejected": 0.07917685061693192, + "logps/chosen": -1.2768105268478394, + "logps/rejected": -1.6988168954849243, + "loss": 0.9809, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2768105268478394, + "rewards/margins": 0.4220063090324402, + "rewards/rejected": -1.6988168954849243, + "sft_loss": 1.2530466318130493, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 5.465210594232592, + "learning_rate": 2.909964575360583e-06, + "logits/chosen": -0.25398239493370056, + "logits/rejected": -0.14137795567512512, + "logps/chosen": -1.319097876548767, + "logps/rejected": -1.7301028966903687, + "loss": 1.0282, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.319097876548767, + "rewards/margins": 0.41100484132766724, + "rewards/rejected": -1.7301028966903687, + "sft_loss": 1.355691909790039, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 10.103974202551575, + "learning_rate": 2.9083633962649783e-06, + "logits/chosen": -0.2338935136795044, + "logits/rejected": -0.03408069536089897, + "logps/chosen": -1.4205596446990967, + "logps/rejected": -1.866235375404358, + "loss": 1.0359, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4205596446990967, + "rewards/margins": 0.44567546248435974, + "rewards/rejected": -1.866235375404358, + "sft_loss": 1.405672311782837, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 7.660448870032001, + "learning_rate": 2.906748553221527e-06, + "logits/chosen": -0.007981347851455212, + "logits/rejected": 0.062486834824085236, + "logps/chosen": -1.3957369327545166, + "logps/rejected": -1.7972627878189087, + "loss": 1.025, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3957369327545166, + "rewards/margins": 0.4015257954597473, + "rewards/rejected": -1.7972627878189087, + "sft_loss": 1.319511890411377, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 7.685179844824281, + "learning_rate": 2.9051200618974418e-06, + "logits/chosen": -0.10257701575756073, + "logits/rejected": 0.08493608981370926, + "logps/chosen": -1.4827502965927124, + "logps/rejected": -1.8549566268920898, + "loss": 1.0427, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4827502965927124, + "rewards/margins": 0.37220636010169983, + "rewards/rejected": -1.8549566268920898, + "sft_loss": 1.343749761581421, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 6.3001894697285605, + "learning_rate": 2.903477938092354e-06, + "logits/chosen": -0.08838716894388199, + "logits/rejected": -0.047435659915208817, + "logps/chosen": -1.39447021484375, + "logps/rejected": -1.5364607572555542, + "loss": 1.1398, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.39447021484375, + "rewards/margins": 0.14199037849903107, + "rewards/rejected": -1.5364607572555542, + "sft_loss": 1.4440193176269531, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 5.5920734867489434, + "learning_rate": 2.901822197738155e-06, + "logits/chosen": -0.1857428401708603, + "logits/rejected": -0.049183569848537445, + "logps/chosen": -1.3664112091064453, + "logps/rejected": -1.703650712966919, + "loss": 1.0848, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3664112091064453, + "rewards/margins": 0.33723941445350647, + "rewards/rejected": -1.703650712966919, + "sft_loss": 1.4322891235351562, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 5.623834598583619, + "learning_rate": 2.9001528568988454e-06, + "logits/chosen": -0.16399559378623962, + "logits/rejected": -0.016664093360304832, + "logps/chosen": -1.2457985877990723, + "logps/rejected": -1.6100772619247437, + "loss": 0.9906, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2457985877990723, + "rewards/margins": 0.36427873373031616, + "rewards/rejected": -1.6100772619247437, + "sft_loss": 1.2594926357269287, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 8.323688037510497, + "learning_rate": 2.898469931770378e-06, + "logits/chosen": -0.023495309054851532, + "logits/rejected": 0.08119723200798035, + "logps/chosen": -1.3699777126312256, + "logps/rejected": -1.5569205284118652, + "loss": 1.0993, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3699777126312256, + "rewards/margins": 0.1869426965713501, + "rewards/rejected": -1.5569205284118652, + "sft_loss": 1.3976179361343384, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 7.410365546672956, + "learning_rate": 2.896773438680498e-06, + "logits/chosen": -0.008547200821340084, + "logits/rejected": 0.09500636160373688, + "logps/chosen": -1.342989206314087, + "logps/rejected": -1.706742525100708, + "loss": 1.0439, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.342989206314087, + "rewards/margins": 0.3637532591819763, + "rewards/rejected": -1.706742525100708, + "sft_loss": 1.3892873525619507, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 8.28756211424691, + "learning_rate": 2.8950633940885908e-06, + "logits/chosen": -0.09383663535118103, + "logits/rejected": 0.0011787057155743241, + "logps/chosen": -1.3285152912139893, + "logps/rejected": -1.649229645729065, + "loss": 1.0439, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3285152912139893, + "rewards/margins": 0.32071438431739807, + "rewards/rejected": -1.649229645729065, + "sft_loss": 1.336268663406372, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 4.7219237142016075, + "learning_rate": 2.893339814585516e-06, + "logits/chosen": -0.14430832862854004, + "logits/rejected": 0.03167320415377617, + "logps/chosen": -1.5874072313308716, + "logps/rejected": -1.8974529504776, + "loss": 1.1458, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.5874072313308716, + "rewards/margins": 0.3100458085536957, + "rewards/rejected": -1.8974529504776, + "sft_loss": 1.509259819984436, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 4.943932933930905, + "learning_rate": 2.8916027168934483e-06, + "logits/chosen": -0.0802348330616951, + "logits/rejected": 0.12030297517776489, + "logps/chosen": -1.3435288667678833, + "logps/rejected": -1.6719791889190674, + "loss": 1.0802, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3435288667678833, + "rewards/margins": 0.32845038175582886, + "rewards/rejected": -1.6719791889190674, + "sft_loss": 1.3819727897644043, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 5.123270327438244, + "learning_rate": 2.889852117865718e-06, + "logits/chosen": -0.07914379984140396, + "logits/rejected": 0.08770457655191422, + "logps/chosen": -1.4189870357513428, + "logps/rejected": -1.7545255422592163, + "loss": 1.0451, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4189870357513428, + "rewards/margins": 0.33553850650787354, + "rewards/rejected": -1.7545255422592163, + "sft_loss": 1.4011993408203125, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 6.8755673937049195, + "learning_rate": 2.888088034486645e-06, + "logits/chosen": -0.008513232693076134, + "logits/rejected": 0.15203821659088135, + "logps/chosen": -1.4755704402923584, + "logps/rejected": -1.7755672931671143, + "loss": 1.0964, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4755704402923584, + "rewards/margins": 0.29999667406082153, + "rewards/rejected": -1.7755672931671143, + "sft_loss": 1.426283836364746, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 7.966400753891832, + "learning_rate": 2.886310483871373e-06, + "logits/chosen": -0.08631936460733414, + "logits/rejected": 0.07498349249362946, + "logps/chosen": -1.4149017333984375, + "logps/rejected": -1.783022165298462, + "loss": 1.0399, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4149017333984375, + "rewards/margins": 0.36812031269073486, + "rewards/rejected": -1.783022165298462, + "sft_loss": 1.4194279909133911, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 5.166588529451397, + "learning_rate": 2.8845194832657067e-06, + "logits/chosen": -0.026654431596398354, + "logits/rejected": 0.11714208126068115, + "logps/chosen": -1.261995553970337, + "logps/rejected": -1.717275857925415, + "loss": 0.9894, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.261995553970337, + "rewards/margins": 0.45528024435043335, + "rewards/rejected": -1.717275857925415, + "sft_loss": 1.3523094654083252, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 8.259488569986726, + "learning_rate": 2.882715050045941e-06, + "logits/chosen": -0.09320361912250519, + "logits/rejected": -0.018301691859960556, + "logps/chosen": -1.3758822679519653, + "logps/rejected": -1.6776307821273804, + "loss": 1.0763, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3758822679519653, + "rewards/margins": 0.30174845457077026, + "rewards/rejected": -1.6776307821273804, + "sft_loss": 1.3632720708847046, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.33133628964424133, + "eval_logits/rejected": 0.43879061937332153, + "eval_logps/chosen": -1.3875893354415894, + "eval_logps/rejected": -1.74982488155365, + "eval_loss": 1.0494909286499023, + "eval_rewards/accuracies": 0.6045994162559509, + "eval_rewards/chosen": -1.3875893354415894, + "eval_rewards/margins": 0.3622351884841919, + "eval_rewards/rejected": -1.74982488155365, + "eval_runtime": 43.0416, + "eval_samples_per_second": 31.249, + "eval_sft_loss": 1.3954273462295532, + "eval_steps_per_second": 7.83, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 8.512204625035347, + "learning_rate": 2.8808972017186957e-06, + "logits/chosen": -0.20027823746204376, + "logits/rejected": -0.0024368553422391415, + "logps/chosen": -1.3654619455337524, + "logps/rejected": -1.6480159759521484, + "loss": 1.0671, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3654619455337524, + "rewards/margins": 0.28255385160446167, + "rewards/rejected": -1.6480159759521484, + "sft_loss": 1.3939441442489624, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 5.435589749333542, + "learning_rate": 2.8790659559207434e-06, + "logits/chosen": -0.10933800041675568, + "logits/rejected": 0.12187595665454865, + "logps/chosen": -1.339516282081604, + "logps/rejected": -1.6303303241729736, + "loss": 1.0521, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.339516282081604, + "rewards/margins": 0.2908141016960144, + "rewards/rejected": -1.6303303241729736, + "sft_loss": 1.3552604913711548, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 5.825372619643404, + "learning_rate": 2.877221330418838e-06, + "logits/chosen": -0.15005064010620117, + "logits/rejected": -0.004594183061271906, + "logps/chosen": -1.3774538040161133, + "logps/rejected": -1.5936380624771118, + "loss": 1.1074, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3774538040161133, + "rewards/margins": 0.21618422865867615, + "rewards/rejected": -1.5936380624771118, + "sft_loss": 1.3924789428710938, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 6.1466877107192355, + "learning_rate": 2.875363343109545e-06, + "logits/chosen": 0.013457834720611572, + "logits/rejected": 0.14119693636894226, + "logps/chosen": -1.28562331199646, + "logps/rejected": -1.5532904863357544, + "loss": 1.0477, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.28562331199646, + "rewards/margins": 0.26766690611839294, + "rewards/rejected": -1.5532904863357544, + "sft_loss": 1.2760810852050781, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 6.2074220518550804, + "learning_rate": 2.8734920120190645e-06, + "logits/chosen": -0.20628468692302704, + "logits/rejected": 0.02951905131340027, + "logps/chosen": -1.4024266004562378, + "logps/rejected": -1.6120758056640625, + "loss": 1.1006, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4024266004562378, + "rewards/margins": 0.20964908599853516, + "rewards/rejected": -1.6120758056640625, + "sft_loss": 1.4208050966262817, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 8.262618460377562, + "learning_rate": 2.8716073553030593e-06, + "logits/chosen": -0.10507309436798096, + "logits/rejected": 0.011842099949717522, + "logps/chosen": -1.334250569343567, + "logps/rejected": -1.6416661739349365, + "loss": 1.0449, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.334250569343567, + "rewards/margins": 0.3074159026145935, + "rewards/rejected": -1.6416661739349365, + "sft_loss": 1.308292031288147, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 5.237995416480608, + "learning_rate": 2.8697093912464782e-06, + "logits/chosen": -0.08195515722036362, + "logits/rejected": 0.07722845673561096, + "logps/chosen": -1.3526867628097534, + "logps/rejected": -1.5650540590286255, + "loss": 1.0897, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3526867628097534, + "rewards/margins": 0.21236717700958252, + "rewards/rejected": -1.5650540590286255, + "sft_loss": 1.4327796697616577, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 4.975635376617012, + "learning_rate": 2.8677981382633753e-06, + "logits/chosen": -0.2311730831861496, + "logits/rejected": -0.0849604532122612, + "logps/chosen": -1.3269524574279785, + "logps/rejected": -1.647106409072876, + "loss": 1.041, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3269524574279785, + "rewards/margins": 0.32015395164489746, + "rewards/rejected": -1.647106409072876, + "sft_loss": 1.3890091180801392, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 5.458841354429057, + "learning_rate": 2.8658736148967366e-06, + "logits/chosen": -0.1458047479391098, + "logits/rejected": 0.0449729859828949, + "logps/chosen": -1.4105819463729858, + "logps/rejected": -1.6483221054077148, + "loss": 1.1175, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4105819463729858, + "rewards/margins": 0.23774006962776184, + "rewards/rejected": -1.6483221054077148, + "sft_loss": 1.4562586545944214, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 7.300740100285893, + "learning_rate": 2.8639358398182947e-06, + "logits/chosen": -0.14232835173606873, + "logits/rejected": 0.07038958370685577, + "logps/chosen": -1.452538013458252, + "logps/rejected": -1.6742738485336304, + "loss": 1.118, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.452538013458252, + "rewards/margins": 0.22173579037189484, + "rewards/rejected": -1.6742738485336304, + "sft_loss": 1.4558159112930298, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 6.969694171415584, + "learning_rate": 2.8619848318283538e-06, + "logits/chosen": -0.1824021339416504, + "logits/rejected": -0.051718395203351974, + "logps/chosen": -1.3421350717544556, + "logps/rejected": -1.6273431777954102, + "loss": 1.0732, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3421350717544556, + "rewards/margins": 0.285208135843277, + "rewards/rejected": -1.6273431777954102, + "sft_loss": 1.430902123451233, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 6.732449093746142, + "learning_rate": 2.860020609855601e-06, + "logits/chosen": -0.2673589587211609, + "logits/rejected": -0.1350955367088318, + "logps/chosen": -1.3457567691802979, + "logps/rejected": -1.7736858129501343, + "loss": 1.0376, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3457567691802979, + "rewards/margins": 0.42792901396751404, + "rewards/rejected": -1.7736858129501343, + "sft_loss": 1.3949072360992432, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 5.744150499257258, + "learning_rate": 2.858043192956926e-06, + "logits/chosen": -0.13780884444713593, + "logits/rejected": 0.010678360238671303, + "logps/chosen": -1.3624342679977417, + "logps/rejected": -1.707216501235962, + "loss": 1.0493, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3624342679977417, + "rewards/margins": 0.34478217363357544, + "rewards/rejected": -1.707216501235962, + "sft_loss": 1.3893402814865112, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 6.619509558086586, + "learning_rate": 2.856052600317237e-06, + "logits/chosen": -0.21283817291259766, + "logits/rejected": -0.10551418364048004, + "logps/chosen": -1.3334189653396606, + "logps/rejected": -1.694124460220337, + "loss": 1.0483, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3334189653396606, + "rewards/margins": 0.36070531606674194, + "rewards/rejected": -1.694124460220337, + "sft_loss": 1.3852914571762085, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 5.5743901442295565, + "learning_rate": 2.8540488512492725e-06, + "logits/chosen": -0.16439157724380493, + "logits/rejected": -0.053611718118190765, + "logps/chosen": -1.3701202869415283, + "logps/rejected": -1.631082534790039, + "loss": 1.0734, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3701202869415283, + "rewards/margins": 0.2609623372554779, + "rewards/rejected": -1.631082534790039, + "sft_loss": 1.3603696823120117, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 7.116426977910912, + "learning_rate": 2.8520319651934147e-06, + "logits/chosen": -0.1693895161151886, + "logits/rejected": -0.007607897277921438, + "logps/chosen": -1.4261596202850342, + "logps/rejected": -1.6268680095672607, + "loss": 1.1275, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4261596202850342, + "rewards/margins": 0.2007085531949997, + "rewards/rejected": -1.6268680095672607, + "sft_loss": 1.4610538482666016, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 6.482471516998786, + "learning_rate": 2.8500019617175005e-06, + "logits/chosen": -0.1612972617149353, + "logits/rejected": -0.014109318144619465, + "logps/chosen": -1.288216233253479, + "logps/rejected": -1.5245250463485718, + "loss": 1.0442, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.288216233253479, + "rewards/margins": 0.2363087385892868, + "rewards/rejected": -1.5245250463485718, + "sft_loss": 1.318396806716919, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 7.346320744981688, + "learning_rate": 2.847958860516633e-06, + "logits/chosen": -0.31444284319877625, + "logits/rejected": -0.1650298833847046, + "logps/chosen": -1.3861689567565918, + "logps/rejected": -1.5643253326416016, + "loss": 1.1179, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3861689567565918, + "rewards/margins": 0.17815645039081573, + "rewards/rejected": -1.5643253326416016, + "sft_loss": 1.380351185798645, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 4.507333897677239, + "learning_rate": 2.8459026814129887e-06, + "logits/chosen": -0.25474271178245544, + "logits/rejected": -0.2486313134431839, + "logps/chosen": -1.4083340167999268, + "logps/rejected": -1.7456175088882446, + "loss": 1.0595, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4083340167999268, + "rewards/margins": 0.33728331327438354, + "rewards/rejected": -1.7456175088882446, + "sft_loss": 1.4073834419250488, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 5.9865959007720235, + "learning_rate": 2.8438334443556268e-06, + "logits/chosen": -0.2621445059776306, + "logits/rejected": -0.019263360649347305, + "logps/chosen": -1.3438646793365479, + "logps/rejected": -1.7832008600234985, + "loss": 1.0314, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3438646793365479, + "rewards/margins": 0.43933621048927307, + "rewards/rejected": -1.7832008600234985, + "sft_loss": 1.3932850360870361, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 5.275881261428366, + "learning_rate": 2.8417511694202938e-06, + "logits/chosen": -0.1755446493625641, + "logits/rejected": -0.1253184974193573, + "logps/chosen": -1.370633602142334, + "logps/rejected": -1.6897408962249756, + "loss": 1.0674, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.370633602142334, + "rewards/margins": 0.3191072940826416, + "rewards/rejected": -1.6897408962249756, + "sft_loss": 1.3716998100280762, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 4.190044503398931, + "learning_rate": 2.83965587680923e-06, + "logits/chosen": -0.13763971626758575, + "logits/rejected": -0.06209099292755127, + "logps/chosen": -1.3407888412475586, + "logps/rejected": -1.6444101333618164, + "loss": 1.0604, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3407888412475586, + "rewards/margins": 0.30362120270729065, + "rewards/rejected": -1.6444101333618164, + "sft_loss": 1.3559894561767578, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 8.589029882972284, + "learning_rate": 2.837547586850974e-06, + "logits/chosen": -0.2269708216190338, + "logits/rejected": -0.051694173365831375, + "logps/chosen": -1.2944409847259521, + "logps/rejected": -1.626725435256958, + "loss": 1.0151, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2944409847259521, + "rewards/margins": 0.33228427171707153, + "rewards/rejected": -1.626725435256958, + "sft_loss": 1.2930586338043213, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 5.081003427653556, + "learning_rate": 2.8354263200001645e-06, + "logits/chosen": -0.3047958016395569, + "logits/rejected": -0.10471458733081818, + "logps/chosen": -1.274651288986206, + "logps/rejected": -1.5908291339874268, + "loss": 1.0108, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.274651288986206, + "rewards/margins": 0.3161778748035431, + "rewards/rejected": -1.5908291339874268, + "sft_loss": 1.3193323612213135, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 6.115327160037618, + "learning_rate": 2.8332920968373414e-06, + "logits/chosen": -0.12503978610038757, + "logits/rejected": -0.01366239320486784, + "logps/chosen": -1.3743711709976196, + "logps/rejected": -1.6227270364761353, + "loss": 1.1023, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.3743711709976196, + "rewards/margins": 0.24835577607154846, + "rewards/rejected": -1.6227270364761353, + "sft_loss": 1.3661524057388306, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 5.621581268348691, + "learning_rate": 2.831144938068747e-06, + "logits/chosen": -0.15741266310214996, + "logits/rejected": -0.04052863270044327, + "logps/chosen": -1.3196581602096558, + "logps/rejected": -1.546939492225647, + "loss": 1.0601, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3196581602096558, + "rewards/margins": 0.22728124260902405, + "rewards/rejected": -1.546939492225647, + "sft_loss": 1.326621413230896, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 8.501981777373967, + "learning_rate": 2.8289848645261253e-06, + "logits/chosen": -0.1369256228208542, + "logits/rejected": -0.05569840595126152, + "logps/chosen": -1.416253685951233, + "logps/rejected": -1.6683292388916016, + "loss": 1.0777, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.416253685951233, + "rewards/margins": 0.25207552313804626, + "rewards/rejected": -1.6683292388916016, + "sft_loss": 1.4455569982528687, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 6.8237644835522975, + "learning_rate": 2.826811897166519e-06, + "logits/chosen": -0.16232003271579742, + "logits/rejected": -0.1322738230228424, + "logps/chosen": -1.3589154481887817, + "logps/rejected": -1.6097590923309326, + "loss": 1.0632, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3589154481887817, + "rewards/margins": 0.25084370374679565, + "rewards/rejected": -1.6097590923309326, + "sft_loss": 1.3681776523590088, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 5.273153198621067, + "learning_rate": 2.8246260570720673e-06, + "logits/chosen": -0.1638951450586319, + "logits/rejected": 0.008594045415520668, + "logps/chosen": -1.3590314388275146, + "logps/rejected": -1.7121204137802124, + "loss": 1.0402, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3590314388275146, + "rewards/margins": 0.3530888259410858, + "rewards/rejected": -1.7121204137802124, + "sft_loss": 1.3907363414764404, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 7.631563301834823, + "learning_rate": 2.8224273654498007e-06, + "logits/chosen": -0.16318397223949432, + "logits/rejected": -0.11493877321481705, + "logps/chosen": -1.3974087238311768, + "logps/rejected": -1.540160894393921, + "loss": 1.1194, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3974087238311768, + "rewards/margins": 0.14275220036506653, + "rewards/rejected": -1.540160894393921, + "sft_loss": 1.4131476879119873, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 5.247204698791591, + "learning_rate": 2.8202158436314348e-06, + "logits/chosen": -0.20993058383464813, + "logits/rejected": 0.0656299963593483, + "logps/chosen": -1.4481570720672607, + "logps/rejected": -1.7513744831085205, + "loss": 1.0944, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4481570720672607, + "rewards/margins": 0.3032172918319702, + "rewards/rejected": -1.7513744831085205, + "sft_loss": 1.4534757137298584, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 6.147247511492117, + "learning_rate": 2.817991513073163e-06, + "logits/chosen": -0.3009326756000519, + "logits/rejected": -0.1586245447397232, + "logps/chosen": -1.480035424232483, + "logps/rejected": -1.8455111980438232, + "loss": 1.0894, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.480035424232483, + "rewards/margins": 0.3654758334159851, + "rewards/rejected": -1.8455111980438232, + "sft_loss": 1.506760835647583, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 5.125439426438534, + "learning_rate": 2.8157543953554515e-06, + "logits/chosen": -0.16923581063747406, + "logits/rejected": -0.04237973317503929, + "logps/chosen": -1.3896102905273438, + "logps/rejected": -1.6864780187606812, + "loss": 1.0593, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3896102905273438, + "rewards/margins": 0.2968676686286926, + "rewards/rejected": -1.6864780187606812, + "sft_loss": 1.4087154865264893, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 7.886149599248979, + "learning_rate": 2.813504512182825e-06, + "logits/chosen": -0.14230379462242126, + "logits/rejected": -0.021787326782941818, + "logps/chosen": -1.4374529123306274, + "logps/rejected": -1.8932501077651978, + "loss": 1.0296, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.4374529123306274, + "rewards/margins": 0.45579713582992554, + "rewards/rejected": -1.8932501077651978, + "sft_loss": 1.4520795345306396, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 4.451903315499975, + "learning_rate": 2.811241885383661e-06, + "logits/chosen": -0.15278813242912292, + "logits/rejected": -0.0027175932191312313, + "logps/chosen": -1.363525152206421, + "logps/rejected": -1.8220170736312866, + "loss": 1.0206, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.363525152206421, + "rewards/margins": 0.45849180221557617, + "rewards/rejected": -1.8220170736312866, + "sft_loss": 1.4273641109466553, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 5.185954540567181, + "learning_rate": 2.8089665369099737e-06, + "logits/chosen": -0.19583071768283844, + "logits/rejected": -0.06919754296541214, + "logps/chosen": -1.4165420532226562, + "logps/rejected": -1.6947746276855469, + "loss": 1.0948, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4165420532226562, + "rewards/margins": 0.2782325744628906, + "rewards/rejected": -1.6947746276855469, + "sft_loss": 1.3944652080535889, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 11.778771528863663, + "learning_rate": 2.806678488837205e-06, + "logits/chosen": -0.1822900027036667, + "logits/rejected": -0.052754055708646774, + "logps/chosen": -1.361941933631897, + "logps/rejected": -1.6913766860961914, + "loss": 1.0664, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.361941933631897, + "rewards/margins": 0.3294347822666168, + "rewards/rejected": -1.6913766860961914, + "sft_loss": 1.3936665058135986, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 8.142489210482202, + "learning_rate": 2.804377763364006e-06, + "logits/chosen": -0.05914510414004326, + "logits/rejected": 0.06488485634326935, + "logps/chosen": -1.4206465482711792, + "logps/rejected": -1.731772780418396, + "loss": 1.0692, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.4206465482711792, + "rewards/margins": 0.31112608313560486, + "rewards/rejected": -1.731772780418396, + "sft_loss": 1.4565393924713135, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 5.424351957136408, + "learning_rate": 2.8020643828120263e-06, + "logits/chosen": -0.016803156584501266, + "logits/rejected": 0.07223924249410629, + "logps/chosen": -1.3424150943756104, + "logps/rejected": -1.546939492225647, + "loss": 1.0573, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3424150943756104, + "rewards/margins": 0.204524427652359, + "rewards/rejected": -1.546939492225647, + "sft_loss": 1.3497629165649414, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 5.95213805104973, + "learning_rate": 2.799738369625694e-06, + "logits/chosen": -0.23150594532489777, + "logits/rejected": -0.09028539806604385, + "logps/chosen": -1.4120293855667114, + "logps/rejected": -1.6809848546981812, + "loss": 1.0749, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4120293855667114, + "rewards/margins": 0.2689554989337921, + "rewards/rejected": -1.6809848546981812, + "sft_loss": 1.419755220413208, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 4.5709113857061885, + "learning_rate": 2.7973997463719993e-06, + "logits/chosen": -0.13744693994522095, + "logits/rejected": 0.05361655354499817, + "logps/chosen": -1.2519391775131226, + "logps/rejected": -1.7877897024154663, + "loss": 1.0051, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2519391775131226, + "rewards/margins": 0.5358504056930542, + "rewards/rejected": -1.7877897024154663, + "sft_loss": 1.3105920553207397, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 7.597294311838919, + "learning_rate": 2.7950485357402754e-06, + "logits/chosen": -0.154087632894516, + "logits/rejected": 0.03357213735580444, + "logps/chosen": -1.4047690629959106, + "logps/rejected": -1.7237621545791626, + "loss": 1.0731, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4047690629959106, + "rewards/margins": 0.31899309158325195, + "rewards/rejected": -1.7237621545791626, + "sft_loss": 1.451963186264038, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 7.598509225801618, + "learning_rate": 2.7926847605419776e-06, + "logits/chosen": -0.05727584287524223, + "logits/rejected": 0.09257154166698456, + "logps/chosen": -1.4217592477798462, + "logps/rejected": -1.512450933456421, + "loss": 1.1334, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4217592477798462, + "rewards/margins": 0.09069158881902695, + "rewards/rejected": -1.512450933456421, + "sft_loss": 1.388801097869873, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 4.161958163004443, + "learning_rate": 2.7903084437104633e-06, + "logits/chosen": -0.06450797617435455, + "logits/rejected": 0.07489770650863647, + "logps/chosen": -1.302138328552246, + "logps/rejected": -1.842264175415039, + "loss": 1.0035, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.302138328552246, + "rewards/margins": 0.5401259660720825, + "rewards/rejected": -1.842264175415039, + "sft_loss": 1.3500391244888306, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 8.043149328035888, + "learning_rate": 2.787919608300769e-06, + "logits/chosen": -0.03159303590655327, + "logits/rejected": 0.0678340420126915, + "logps/chosen": -1.3615281581878662, + "logps/rejected": -1.7922245264053345, + "loss": 1.0269, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3615281581878662, + "rewards/margins": 0.43069639801979065, + "rewards/rejected": -1.7922245264053345, + "sft_loss": 1.3711903095245361, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 6.851486546482081, + "learning_rate": 2.785518277489387e-06, + "logits/chosen": -0.1362898051738739, + "logits/rejected": 0.004780137445777655, + "logps/chosen": -1.4045703411102295, + "logps/rejected": -1.6434154510498047, + "loss": 1.0694, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4045703411102295, + "rewards/margins": 0.23884525895118713, + "rewards/rejected": -1.6434154510498047, + "sft_loss": 1.3844901323318481, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 4.307609136140785, + "learning_rate": 2.783104474574038e-06, + "logits/chosen": 0.05091600492596626, + "logits/rejected": 0.10818967968225479, + "logps/chosen": -1.3143080472946167, + "logps/rejected": -1.7300665378570557, + "loss": 1.0169, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3143080472946167, + "rewards/margins": 0.4157584309577942, + "rewards/rejected": -1.7300665378570557, + "sft_loss": 1.3309390544891357, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 4.361601400253105, + "learning_rate": 2.7806782229734495e-06, + "logits/chosen": -0.050930093973875046, + "logits/rejected": 0.06570479273796082, + "logps/chosen": -1.3809306621551514, + "logps/rejected": -1.5890841484069824, + "loss": 1.0979, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3809306621551514, + "rewards/margins": 0.20815351605415344, + "rewards/rejected": -1.5890841484069824, + "sft_loss": 1.41715407371521, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 6.9265210413386775, + "learning_rate": 2.7782395462271247e-06, + "logits/chosen": -0.12000956386327744, + "logits/rejected": 0.11605888605117798, + "logps/chosen": -1.4133331775665283, + "logps/rejected": -1.6733297109603882, + "loss": 1.1133, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4133331775665283, + "rewards/margins": 0.2599967122077942, + "rewards/rejected": -1.6733297109603882, + "sft_loss": 1.4759104251861572, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 8.684710551750364, + "learning_rate": 2.7757884679951167e-06, + "logits/chosen": -0.0006413728115148842, + "logits/rejected": 0.09884602576494217, + "logps/chosen": -1.339545488357544, + "logps/rejected": -1.605812430381775, + "loss": 1.07, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.339545488357544, + "rewards/margins": 0.26626691222190857, + "rewards/rejected": -1.605812430381775, + "sft_loss": 1.3702061176300049, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 6.184752771063054, + "learning_rate": 2.7733250120577967e-06, + "logits/chosen": -0.062450431287288666, + "logits/rejected": 0.118833526968956, + "logps/chosen": -1.3291722536087036, + "logps/rejected": -1.7080695629119873, + "loss": 1.0305, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3291722536087036, + "rewards/margins": 0.3788975179195404, + "rewards/rejected": -1.7080695629119873, + "sft_loss": 1.3770744800567627, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 5.404834701352218, + "learning_rate": 2.770849202315625e-06, + "logits/chosen": -0.03863369673490524, + "logits/rejected": 0.1544675976037979, + "logps/chosen": -1.306546688079834, + "logps/rejected": -1.64643132686615, + "loss": 1.0246, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.306546688079834, + "rewards/margins": 0.33988457918167114, + "rewards/rejected": -1.64643132686615, + "sft_loss": 1.3078439235687256, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 5.904208941026598, + "learning_rate": 2.768361062788919e-06, + "logits/chosen": -0.020415937528014183, + "logits/rejected": 0.10936160385608673, + "logps/chosen": -1.4067522287368774, + "logps/rejected": -1.695853590965271, + "loss": 1.0899, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4067522287368774, + "rewards/margins": 0.28910142183303833, + "rewards/rejected": -1.695853590965271, + "sft_loss": 1.4484775066375732, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 5.874991355168797, + "learning_rate": 2.7658606176176186e-06, + "logits/chosen": -0.09968818724155426, + "logits/rejected": -0.0667973980307579, + "logps/chosen": -1.365729570388794, + "logps/rejected": -1.6799007654190063, + "loss": 1.0703, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.365729570388794, + "rewards/margins": 0.31417128443717957, + "rewards/rejected": -1.6799007654190063, + "sft_loss": 1.4080615043640137, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 5.676763085762059, + "learning_rate": 2.763347891061054e-06, + "logits/chosen": -0.15920597314834595, + "logits/rejected": 0.025278815999627113, + "logps/chosen": -1.3561172485351562, + "logps/rejected": -1.7594220638275146, + "loss": 1.0372, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3561172485351562, + "rewards/margins": 0.4033048748970032, + "rewards/rejected": -1.7594220638275146, + "sft_loss": 1.4026418924331665, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 5.215987898101865, + "learning_rate": 2.7608229074977103e-06, + "logits/chosen": -0.06529693305492401, + "logits/rejected": 0.04853939637541771, + "logps/chosen": -1.3548333644866943, + "logps/rejected": -1.8335111141204834, + "loss": 1.0368, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3548333644866943, + "rewards/margins": 0.4786776602268219, + "rewards/rejected": -1.8335111141204834, + "sft_loss": 1.3940856456756592, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 9.073385343283363, + "learning_rate": 2.758285691424988e-06, + "logits/chosen": -0.07635178416967392, + "logits/rejected": 0.0887339860200882, + "logps/chosen": -1.4306265115737915, + "logps/rejected": -1.8580020666122437, + "loss": 1.0567, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4306265115737915, + "rewards/margins": 0.4273756146430969, + "rewards/rejected": -1.8580020666122437, + "sft_loss": 1.3975563049316406, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 8.815064683541053, + "learning_rate": 2.7557362674589687e-06, + "logits/chosen": -0.1560938060283661, + "logits/rejected": -0.04363911226391792, + "logps/chosen": -1.3959918022155762, + "logps/rejected": -1.690941572189331, + "loss": 1.0874, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3959918022155762, + "rewards/margins": 0.2949499785900116, + "rewards/rejected": -1.690941572189331, + "sft_loss": 1.3853015899658203, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 7.199444095750238, + "learning_rate": 2.753174660334175e-06, + "logits/chosen": -0.14701077342033386, + "logits/rejected": -0.039738696068525314, + "logps/chosen": -1.575615644454956, + "logps/rejected": -1.7450742721557617, + "loss": 1.1747, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.575615644454956, + "rewards/margins": 0.16945865750312805, + "rewards/rejected": -1.7450742721557617, + "sft_loss": 1.5748717784881592, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 9.70254139336959, + "learning_rate": 2.750600894903331e-06, + "logits/chosen": -0.1704801619052887, + "logits/rejected": -0.06618437170982361, + "logps/chosen": -1.4099934101104736, + "logps/rejected": -1.7492620944976807, + "loss": 1.0938, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4099934101104736, + "rewards/margins": 0.3392687439918518, + "rewards/rejected": -1.7492620944976807, + "sft_loss": 1.4731342792510986, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 6.201274593098627, + "learning_rate": 2.7480149961371194e-06, + "logits/chosen": -0.07009023427963257, + "logits/rejected": -0.005963495466858149, + "logps/chosen": -1.3297359943389893, + "logps/rejected": -1.8149020671844482, + "loss": 0.9912, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3297359943389893, + "rewards/margins": 0.4851660132408142, + "rewards/rejected": -1.8149020671844482, + "sft_loss": 1.332242727279663, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 4.584004786925617, + "learning_rate": 2.745416989123942e-06, + "logits/chosen": -0.16404542326927185, + "logits/rejected": 0.11613837629556656, + "logps/chosen": -1.3978588581085205, + "logps/rejected": -1.738348364830017, + "loss": 1.0535, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3978588581085205, + "rewards/margins": 0.34048959612846375, + "rewards/rejected": -1.738348364830017, + "sft_loss": 1.4199836254119873, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 4.7711233664827475, + "learning_rate": 2.7428068990696735e-06, + "logits/chosen": -0.08198316395282745, + "logits/rejected": -0.01538595836609602, + "logps/chosen": -1.3322317600250244, + "logps/rejected": -1.6438226699829102, + "loss": 1.0403, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3322317600250244, + "rewards/margins": 0.3115909695625305, + "rewards/rejected": -1.6438226699829102, + "sft_loss": 1.3519912958145142, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 7.822975676442135, + "learning_rate": 2.7401847512974194e-06, + "logits/chosen": -0.10810144990682602, + "logits/rejected": -0.025854378938674927, + "logps/chosen": -1.3840287923812866, + "logps/rejected": -1.6693832874298096, + "loss": 1.0843, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3840287923812866, + "rewards/margins": 0.2853543758392334, + "rewards/rejected": -1.6693832874298096, + "sft_loss": 1.4771511554718018, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 4.848775932175174, + "learning_rate": 2.7375505712472695e-06, + "logits/chosen": -0.10527946054935455, + "logits/rejected": 0.08367304503917694, + "logps/chosen": -1.3714563846588135, + "logps/rejected": -1.6404037475585938, + "loss": 1.0966, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3714563846588135, + "rewards/margins": 0.26894742250442505, + "rewards/rejected": -1.6404037475585938, + "sft_loss": 1.3577983379364014, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 8.808636218080713, + "learning_rate": 2.734904384476049e-06, + "logits/chosen": -0.09855546057224274, + "logits/rejected": 0.023541796952486038, + "logps/chosen": -1.404067039489746, + "logps/rejected": -1.6858993768692017, + "loss": 1.0677, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.404067039489746, + "rewards/margins": 0.28183233737945557, + "rewards/rejected": -1.6858993768692017, + "sft_loss": 1.3637378215789795, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 7.6000091929001465, + "learning_rate": 2.732246216657075e-06, + "logits/chosen": -0.0942855104804039, + "logits/rejected": 0.09655526280403137, + "logps/chosen": -1.3157068490982056, + "logps/rejected": -1.6163606643676758, + "loss": 1.0324, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3157068490982056, + "rewards/margins": 0.300653874874115, + "rewards/rejected": -1.6163606643676758, + "sft_loss": 1.3629335165023804, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 11.762348997233314, + "learning_rate": 2.729576093579902e-06, + "logits/chosen": -0.033582575619220734, + "logits/rejected": 0.1363482028245926, + "logps/chosen": -1.342462420463562, + "logps/rejected": -1.8016321659088135, + "loss": 1.0026, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.342462420463562, + "rewards/margins": 0.4591697156429291, + "rewards/rejected": -1.8016321659088135, + "sft_loss": 1.3745462894439697, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 4.7769659985032265, + "learning_rate": 2.726894041150077e-06, + "logits/chosen": -0.08443383872509003, + "logits/rejected": 0.08157948404550552, + "logps/chosen": -1.358519196510315, + "logps/rejected": -1.6386398077011108, + "loss": 1.077, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.358519196510315, + "rewards/margins": 0.2801207900047302, + "rewards/rejected": -1.6386398077011108, + "sft_loss": 1.3989120721817017, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 6.547673950226795, + "learning_rate": 2.7242000853888833e-06, + "logits/chosen": -0.2460707128047943, + "logits/rejected": 0.01976541243493557, + "logps/chosen": -1.4292163848876953, + "logps/rejected": -1.8006445169448853, + "loss": 1.0714, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4292163848876953, + "rewards/margins": 0.37142807245254517, + "rewards/rejected": -1.8006445169448853, + "sft_loss": 1.4601786136627197, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 4.736053048394042, + "learning_rate": 2.7214942524330918e-06, + "logits/chosen": -0.2725904583930969, + "logits/rejected": -0.03147698566317558, + "logps/chosen": -1.4534674882888794, + "logps/rejected": -1.9418662786483765, + "loss": 1.0458, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4534674882888794, + "rewards/margins": 0.488398939371109, + "rewards/rejected": -1.9418662786483765, + "sft_loss": 1.4068353176116943, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 6.440375726068714, + "learning_rate": 2.7187765685347063e-06, + "logits/chosen": -0.2839708924293518, + "logits/rejected": -0.22860319912433624, + "logps/chosen": -1.52508544921875, + "logps/rejected": -1.84702467918396, + "loss": 1.1148, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.52508544921875, + "rewards/margins": 0.32193922996520996, + "rewards/rejected": -1.84702467918396, + "sft_loss": 1.5309407711029053, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 6.367797012460332, + "learning_rate": 2.7160470600607076e-06, + "logits/chosen": -0.20195667445659637, + "logits/rejected": -0.12136946618556976, + "logps/chosen": -1.4493902921676636, + "logps/rejected": -1.7250474691390991, + "loss": 1.1045, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4493902921676636, + "rewards/margins": 0.27565696835517883, + "rewards/rejected": -1.7250474691390991, + "sft_loss": 1.472017526626587, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 11.041022879549015, + "learning_rate": 2.7133057534927986e-06, + "logits/chosen": -0.034810569137334824, + "logits/rejected": -0.012720304541289806, + "logps/chosen": -1.2938734292984009, + "logps/rejected": -1.5503425598144531, + "loss": 1.0604, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2938734292984009, + "rewards/margins": 0.256469190120697, + "rewards/rejected": -1.5503425598144531, + "sft_loss": 1.345172643661499, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 5.873930927000431, + "learning_rate": 2.710552675427148e-06, + "logits/chosen": 0.010924572125077248, + "logits/rejected": 0.12592843174934387, + "logps/chosen": -1.3183715343475342, + "logps/rejected": -1.4716814756393433, + "loss": 1.0897, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3183715343475342, + "rewards/margins": 0.15330982208251953, + "rewards/rejected": -1.4716814756393433, + "sft_loss": 1.35067617893219, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 5.425517400664494, + "learning_rate": 2.707787852574131e-06, + "logits/chosen": 0.09026463329792023, + "logits/rejected": 0.3710160255432129, + "logps/chosen": -1.3301565647125244, + "logps/rejected": -1.5640554428100586, + "loss": 1.0565, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3301565647125244, + "rewards/margins": 0.23389868438243866, + "rewards/rejected": -1.5640554428100586, + "sft_loss": 1.3714720010757446, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 4.013920767960187, + "learning_rate": 2.7050113117580716e-06, + "logits/chosen": 0.05647750943899155, + "logits/rejected": 0.2564930021762848, + "logps/chosen": -1.267033576965332, + "logps/rejected": -1.6264817714691162, + "loss": 0.9891, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.267033576965332, + "rewards/margins": 0.3594485819339752, + "rewards/rejected": -1.6264817714691162, + "sft_loss": 1.2766499519348145, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 5.962891588311921, + "learning_rate": 2.70222307991698e-06, + "logits/chosen": -0.01841115951538086, + "logits/rejected": 0.08693607151508331, + "logps/chosen": -1.290045976638794, + "logps/rejected": -1.4861773252487183, + "loss": 1.0702, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.290045976638794, + "rewards/margins": 0.19613142311573029, + "rewards/rejected": -1.4861773252487183, + "sft_loss": 1.3594423532485962, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 7.335803525766391, + "learning_rate": 2.6994231841022947e-06, + "logits/chosen": 0.0794888362288475, + "logits/rejected": 0.16885769367218018, + "logps/chosen": -1.4231336116790771, + "logps/rejected": -1.5418658256530762, + "loss": 1.1379, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.4231336116790771, + "rewards/margins": 0.11873219162225723, + "rewards/rejected": -1.5418658256530762, + "sft_loss": 1.4285242557525635, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 4.23742331064105, + "learning_rate": 2.6966116514786166e-06, + "logits/chosen": -0.0561373308300972, + "logits/rejected": 0.19555945694446564, + "logps/chosen": -1.3804311752319336, + "logps/rejected": -1.74152410030365, + "loss": 1.0436, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3804311752319336, + "rewards/margins": 0.36109280586242676, + "rewards/rejected": -1.74152410030365, + "sft_loss": 1.3977186679840088, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.44763803482055664, + "eval_logits/rejected": 0.5672866702079773, + "eval_logps/chosen": -1.3712090253829956, + "eval_logps/rejected": -1.6780245304107666, + "eval_loss": 1.0524135828018188, + "eval_rewards/accuracies": 0.609050452709198, + "eval_rewards/chosen": -1.3712090253829956, + "eval_rewards/margins": 0.3068154454231262, + "eval_rewards/rejected": -1.6780245304107666, + "eval_runtime": 43.1222, + "eval_samples_per_second": 31.19, + "eval_sft_loss": 1.3880013227462769, + "eval_steps_per_second": 7.815, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 5.002638375551188, + "learning_rate": 2.6937885093234477e-06, + "logits/chosen": -0.06392344832420349, + "logits/rejected": 0.22160764038562775, + "logps/chosen": -1.3333699703216553, + "logps/rejected": -1.6846908330917358, + "loss": 1.0288, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3333699703216553, + "rewards/margins": 0.35132086277008057, + "rewards/rejected": -1.6846908330917358, + "sft_loss": 1.3701868057250977, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 4.64471612658279, + "learning_rate": 2.6909537850269256e-06, + "logits/chosen": -0.09533826261758804, + "logits/rejected": 0.13083715736865997, + "logps/chosen": -1.3075568675994873, + "logps/rejected": -1.6741119623184204, + "loss": 1.0239, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3075568675994873, + "rewards/margins": 0.36655518412590027, + "rewards/rejected": -1.6741119623184204, + "sft_loss": 1.33218514919281, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 6.172356811989879, + "learning_rate": 2.688107506091558e-06, + "logits/chosen": -0.015719827264547348, + "logits/rejected": 0.12321136146783829, + "logps/chosen": -1.4253634214401245, + "logps/rejected": -1.7837772369384766, + "loss": 1.0835, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4253634214401245, + "rewards/margins": 0.3584136366844177, + "rewards/rejected": -1.7837772369384766, + "sft_loss": 1.4478522539138794, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 5.227709282984366, + "learning_rate": 2.6852497001319555e-06, + "logits/chosen": 0.04353545233607292, + "logits/rejected": 0.23053112626075745, + "logps/chosen": -1.2532846927642822, + "logps/rejected": -1.6680285930633545, + "loss": 0.9879, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2532846927642822, + "rewards/margins": 0.41474419832229614, + "rewards/rejected": -1.6680285930633545, + "sft_loss": 1.2794431447982788, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 5.939591442905057, + "learning_rate": 2.682380394874564e-06, + "logits/chosen": 0.11653991788625717, + "logits/rejected": 0.17456406354904175, + "logps/chosen": -1.4384989738464355, + "logps/rejected": -1.6639492511749268, + "loss": 1.1006, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4384989738464355, + "rewards/margins": 0.22545020282268524, + "rewards/rejected": -1.6639492511749268, + "sft_loss": 1.3895422220230103, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 5.613994005388873, + "learning_rate": 2.6794996181573953e-06, + "logits/chosen": 0.019490620121359825, + "logits/rejected": 0.20958340167999268, + "logps/chosen": -1.3698194026947021, + "logps/rejected": -1.6351855993270874, + "loss": 1.0802, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3698194026947021, + "rewards/margins": 0.26536640524864197, + "rewards/rejected": -1.6351855993270874, + "sft_loss": 1.382866621017456, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 5.922049659999224, + "learning_rate": 2.6766073979297584e-06, + "logits/chosen": -0.06902351975440979, + "logits/rejected": 0.08735234290361404, + "logps/chosen": -1.2911508083343506, + "logps/rejected": -1.6872625350952148, + "loss": 1.0277, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.2911508083343506, + "rewards/margins": 0.3961116671562195, + "rewards/rejected": -1.6872625350952148, + "sft_loss": 1.337499737739563, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 6.2625347475235635, + "learning_rate": 2.6737037622519866e-06, + "logits/chosen": -0.06433682888746262, + "logits/rejected": 0.0965544730424881, + "logps/chosen": -1.304503321647644, + "logps/rejected": -1.7035449743270874, + "loss": 1.0314, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.304503321647644, + "rewards/margins": 0.39904141426086426, + "rewards/rejected": -1.7035449743270874, + "sft_loss": 1.3322417736053467, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 6.559343546184692, + "learning_rate": 2.670788739295166e-06, + "logits/chosen": -0.011899260804057121, + "logits/rejected": 0.06090838462114334, + "logps/chosen": -1.3005620241165161, + "logps/rejected": -1.567338466644287, + "loss": 1.032, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3005620241165161, + "rewards/margins": 0.2667763829231262, + "rewards/rejected": -1.567338466644287, + "sft_loss": 1.3188759088516235, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 7.289341250127611, + "learning_rate": 2.6678623573408613e-06, + "logits/chosen": 0.03133546561002731, + "logits/rejected": 0.10692791640758514, + "logps/chosen": -1.3453984260559082, + "logps/rejected": -1.6661173105239868, + "loss": 1.0295, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3453984260559082, + "rewards/margins": 0.32071900367736816, + "rewards/rejected": -1.6661173105239868, + "sft_loss": 1.3463585376739502, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 6.203168074083552, + "learning_rate": 2.664924644780844e-06, + "logits/chosen": -0.1072305217385292, + "logits/rejected": 0.02232867106795311, + "logps/chosen": -1.4158356189727783, + "logps/rejected": -1.749995470046997, + "loss": 1.064, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4158356189727783, + "rewards/margins": 0.33415982127189636, + "rewards/rejected": -1.749995470046997, + "sft_loss": 1.4020464420318604, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 7.027198385868576, + "learning_rate": 2.661975630116813e-06, + "logits/chosen": -0.030960649251937866, + "logits/rejected": -0.007368740625679493, + "logps/chosen": -1.3272325992584229, + "logps/rejected": -1.690601110458374, + "loss": 1.0076, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3272325992584229, + "rewards/margins": 0.3633684515953064, + "rewards/rejected": -1.690601110458374, + "sft_loss": 1.2698527574539185, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 5.9083328170756655, + "learning_rate": 2.6590153419601236e-06, + "logits/chosen": -0.06882845610380173, + "logits/rejected": 0.008100676350295544, + "logps/chosen": -1.4700895547866821, + "logps/rejected": -1.7158946990966797, + "loss": 1.1199, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4700895547866821, + "rewards/margins": 0.2458050698041916, + "rewards/rejected": -1.7158946990966797, + "sft_loss": 1.4389417171478271, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 5.138637229143034, + "learning_rate": 2.656043809031503e-06, + "logits/chosen": -0.02306452952325344, + "logits/rejected": 0.1670946180820465, + "logps/chosen": -1.4814660549163818, + "logps/rejected": -1.7213853597640991, + "loss": 1.1272, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.4814660549163818, + "rewards/margins": 0.23991911113262177, + "rewards/rejected": -1.7213853597640991, + "sft_loss": 1.4016886949539185, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 4.711908559504893, + "learning_rate": 2.6530610601607764e-06, + "logits/chosen": -0.025666356086730957, + "logits/rejected": 0.20217475295066833, + "logps/chosen": -1.398291826248169, + "logps/rejected": -1.856555700302124, + "loss": 1.0498, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.398291826248169, + "rewards/margins": 0.45826393365859985, + "rewards/rejected": -1.856555700302124, + "sft_loss": 1.4265127182006836, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 5.912792494626306, + "learning_rate": 2.6500671242865877e-06, + "logits/chosen": -0.11349250376224518, + "logits/rejected": 0.018579021096229553, + "logps/chosen": -1.4116361141204834, + "logps/rejected": -1.6884177923202515, + "loss": 1.0678, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4116361141204834, + "rewards/margins": 0.27678191661834717, + "rewards/rejected": -1.6884177923202515, + "sft_loss": 1.4139634370803833, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 9.673599748893212, + "learning_rate": 2.6470620304561147e-06, + "logits/chosen": -0.09010537713766098, + "logits/rejected": 0.16394095122814178, + "logps/chosen": -1.3627606630325317, + "logps/rejected": -1.6861966848373413, + "loss": 1.0783, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3627606630325317, + "rewards/margins": 0.32343605160713196, + "rewards/rejected": -1.6861966848373413, + "sft_loss": 1.3936148881912231, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 5.433251100646687, + "learning_rate": 2.6440458078247914e-06, + "logits/chosen": -0.08464756608009338, + "logits/rejected": 0.13104847073554993, + "logps/chosen": -1.3060964345932007, + "logps/rejected": -1.687644362449646, + "loss": 1.0294, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3060964345932007, + "rewards/margins": 0.38154786825180054, + "rewards/rejected": -1.687644362449646, + "sft_loss": 1.3979648351669312, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 10.324471284438198, + "learning_rate": 2.641018485656023e-06, + "logits/chosen": -0.260097473859787, + "logits/rejected": -0.1005074754357338, + "logps/chosen": -1.4122055768966675, + "logps/rejected": -1.6907997131347656, + "loss": 1.1087, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4122055768966675, + "rewards/margins": 0.27859410643577576, + "rewards/rejected": -1.6907997131347656, + "sft_loss": 1.4807606935501099, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 6.001739367878445, + "learning_rate": 2.6379800933209028e-06, + "logits/chosen": -0.06367066502571106, + "logits/rejected": -0.12096105515956879, + "logps/chosen": -1.4194618463516235, + "logps/rejected": -1.5650646686553955, + "loss": 1.125, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.4194618463516235, + "rewards/margins": 0.14560283720493317, + "rewards/rejected": -1.5650646686553955, + "sft_loss": 1.4329822063446045, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 7.867099607944738, + "learning_rate": 2.634930660297926e-06, + "logits/chosen": -0.06066015362739563, + "logits/rejected": 0.0960855782032013, + "logps/chosen": -1.3637042045593262, + "logps/rejected": -1.5945680141448975, + "loss": 1.0748, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3637042045593262, + "rewards/margins": 0.23086369037628174, + "rewards/rejected": -1.5945680141448975, + "sft_loss": 1.394171953201294, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 5.64421578862294, + "learning_rate": 2.631870216172705e-06, + "logits/chosen": -0.11932742595672607, + "logits/rejected": -0.026896988973021507, + "logps/chosen": -1.3475847244262695, + "logps/rejected": -1.6134121417999268, + "loss": 1.0596, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3475847244262695, + "rewards/margins": 0.2658274471759796, + "rewards/rejected": -1.6134121417999268, + "sft_loss": 1.3892669677734375, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 7.7643387899472565, + "learning_rate": 2.6287987906376834e-06, + "logits/chosen": -0.11442971229553223, + "logits/rejected": 0.10210961103439331, + "logps/chosen": -1.4364932775497437, + "logps/rejected": -1.6449638605117798, + "loss": 1.1332, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4364932775497437, + "rewards/margins": 0.20847049355506897, + "rewards/rejected": -1.6449638605117798, + "sft_loss": 1.4206666946411133, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 10.238029687623929, + "learning_rate": 2.6257164134918435e-06, + "logits/chosen": -0.11684347689151764, + "logits/rejected": -0.044993676245212555, + "logps/chosen": -1.2513480186462402, + "logps/rejected": -1.7037394046783447, + "loss": 0.9972, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2513480186462402, + "rewards/margins": 0.45239129662513733, + "rewards/rejected": -1.7037394046783447, + "sft_loss": 1.2875103950500488, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 4.716017354618029, + "learning_rate": 2.622623114640423e-06, + "logits/chosen": -0.10425164550542831, + "logits/rejected": 0.011850642040371895, + "logps/chosen": -1.3868951797485352, + "logps/rejected": -1.8481849431991577, + "loss": 1.0351, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3868951797485352, + "rewards/margins": 0.4612897038459778, + "rewards/rejected": -1.8481849431991577, + "sft_loss": 1.4478013515472412, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 5.886652291268732, + "learning_rate": 2.6195189240946205e-06, + "logits/chosen": -0.04155025631189346, + "logits/rejected": 0.011087211780250072, + "logps/chosen": -1.3720247745513916, + "logps/rejected": -1.573352575302124, + "loss": 1.0922, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3720247745513916, + "rewards/margins": 0.20132780075073242, + "rewards/rejected": -1.573352575302124, + "sft_loss": 1.3772026300430298, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 5.855119768331059, + "learning_rate": 2.6164038719713065e-06, + "logits/chosen": -0.21831436455249786, + "logits/rejected": -0.02849755249917507, + "logps/chosen": -1.2959465980529785, + "logps/rejected": -1.9212411642074585, + "loss": 0.9482, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2959465980529785, + "rewards/margins": 0.6252948641777039, + "rewards/rejected": -1.9212411642074585, + "sft_loss": 1.2650933265686035, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 8.313999999494468, + "learning_rate": 2.6132779884927303e-06, + "logits/chosen": -0.1863516867160797, + "logits/rejected": -0.03137914463877678, + "logps/chosen": -1.3538908958435059, + "logps/rejected": -1.6903021335601807, + "loss": 1.0279, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3538908958435059, + "rewards/margins": 0.3364112973213196, + "rewards/rejected": -1.6903021335601807, + "sft_loss": 1.2951140403747559, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 4.376347329320102, + "learning_rate": 2.6101413039862274e-06, + "logits/chosen": -0.09724593162536621, + "logits/rejected": -0.07287286221981049, + "logps/chosen": -1.3620411157608032, + "logps/rejected": -1.6702470779418945, + "loss": 1.0637, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3620411157608032, + "rewards/margins": 0.30820587277412415, + "rewards/rejected": -1.6702470779418945, + "sft_loss": 1.403433084487915, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 10.776839930667265, + "learning_rate": 2.606993848883924e-06, + "logits/chosen": -0.14971527457237244, + "logits/rejected": -0.08246836811304092, + "logps/chosen": -1.3855582475662231, + "logps/rejected": -1.7720504999160767, + "loss": 1.0406, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3855582475662231, + "rewards/margins": 0.38649213314056396, + "rewards/rejected": -1.7720504999160767, + "sft_loss": 1.3715816736221313, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 6.5716180189865785, + "learning_rate": 2.6038356537224433e-06, + "logits/chosen": -0.1572066694498062, + "logits/rejected": -0.04727526754140854, + "logps/chosen": -1.3189653158187866, + "logps/rejected": -1.6002721786499023, + "loss": 1.0475, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3189653158187866, + "rewards/margins": 0.2813068926334381, + "rewards/rejected": -1.6002721786499023, + "sft_loss": 1.330338478088379, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 5.7150594115528435, + "learning_rate": 2.6006667491426098e-06, + "logits/chosen": -0.10597167909145355, + "logits/rejected": 0.03009122982621193, + "logps/chosen": -1.2991611957550049, + "logps/rejected": -1.6237561702728271, + "loss": 1.0552, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.2991611957550049, + "rewards/margins": 0.32459500432014465, + "rewards/rejected": -1.6237561702728271, + "sft_loss": 1.388869285583496, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 6.6110723627273025, + "learning_rate": 2.5974871658891483e-06, + "logits/chosen": -0.04896597936749458, + "logits/rejected": -0.018855730071663857, + "logps/chosen": -1.3407930135726929, + "logps/rejected": -1.709975004196167, + "loss": 1.0233, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3407930135726929, + "rewards/margins": 0.369181752204895, + "rewards/rejected": -1.709975004196167, + "sft_loss": 1.3349924087524414, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 5.332542486808135, + "learning_rate": 2.59429693481039e-06, + "logits/chosen": -0.08531305938959122, + "logits/rejected": 0.08129794895648956, + "logps/chosen": -1.3761022090911865, + "logps/rejected": -1.6028330326080322, + "loss": 1.0791, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3761022090911865, + "rewards/margins": 0.226730614900589, + "rewards/rejected": -1.6028330326080322, + "sft_loss": 1.4245785474777222, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 6.260488330364942, + "learning_rate": 2.5910960868579707e-06, + "logits/chosen": -0.17833852767944336, + "logits/rejected": -0.0832989290356636, + "logps/chosen": -1.3513391017913818, + "logps/rejected": -1.6686162948608398, + "loss": 1.0535, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3513391017913818, + "rewards/margins": 0.31727713346481323, + "rewards/rejected": -1.6686162948608398, + "sft_loss": 1.3597242832183838, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 7.700677741681279, + "learning_rate": 2.5878846530865316e-06, + "logits/chosen": -0.1629796177148819, + "logits/rejected": -0.034121911972761154, + "logps/chosen": -1.3352044820785522, + "logps/rejected": -1.6965748071670532, + "loss": 1.022, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3352044820785522, + "rewards/margins": 0.3613702952861786, + "rewards/rejected": -1.6965748071670532, + "sft_loss": 1.2992217540740967, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 6.446010485829404, + "learning_rate": 2.584662664653417e-06, + "logits/chosen": -0.07534436881542206, + "logits/rejected": 0.0013519420754164457, + "logps/chosen": -1.2989352941513062, + "logps/rejected": -1.492027997970581, + "loss": 1.0538, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.2989352941513062, + "rewards/margins": 0.19309253990650177, + "rewards/rejected": -1.492027997970581, + "sft_loss": 1.2925944328308105, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 6.389180090896336, + "learning_rate": 2.5814301528183724e-06, + "logits/chosen": -0.06336723268032074, + "logits/rejected": -0.013006513938307762, + "logps/chosen": -1.3607739210128784, + "logps/rejected": -1.599548578262329, + "loss": 1.0743, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3607739210128784, + "rewards/margins": 0.23877449333667755, + "rewards/rejected": -1.599548578262329, + "sft_loss": 1.3988316059112549, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 4.7739184001154396, + "learning_rate": 2.5781871489432425e-06, + "logits/chosen": -0.19657868146896362, + "logits/rejected": -0.05092762038111687, + "logps/chosen": -1.3206299543380737, + "logps/rejected": -1.6578598022460938, + "loss": 1.0403, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3206299543380737, + "rewards/margins": 0.33722978830337524, + "rewards/rejected": -1.6578598022460938, + "sft_loss": 1.367488980293274, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 6.31162142812504, + "learning_rate": 2.5749336844916644e-06, + "logits/chosen": -0.14416718482971191, + "logits/rejected": -0.0685230940580368, + "logps/chosen": -1.343637228012085, + "logps/rejected": -1.5947457551956177, + "loss": 1.0791, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.343637228012085, + "rewards/margins": 0.25110840797424316, + "rewards/rejected": -1.5947457551956177, + "sft_loss": 1.412238359451294, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 5.616802509695661, + "learning_rate": 2.5716697910287653e-06, + "logits/chosen": -0.2638893723487854, + "logits/rejected": -0.12773671746253967, + "logps/chosen": -1.296654462814331, + "logps/rejected": -1.7552036046981812, + "loss": 0.9963, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.296654462814331, + "rewards/margins": 0.4585490822792053, + "rewards/rejected": -1.7552036046981812, + "sft_loss": 1.3569939136505127, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 5.274540470269256, + "learning_rate": 2.5683955002208533e-06, + "logits/chosen": -0.14453324675559998, + "logits/rejected": 0.0004860505578108132, + "logps/chosen": -1.3231637477874756, + "logps/rejected": -1.6313807964324951, + "loss": 1.0423, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3231637477874756, + "rewards/margins": 0.30821719765663147, + "rewards/rejected": -1.6313807964324951, + "sft_loss": 1.3522526025772095, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 4.9948455748446134, + "learning_rate": 2.5651108438351125e-06, + "logits/chosen": -0.17525389790534973, + "logits/rejected": -0.03190991282463074, + "logps/chosen": -1.3508412837982178, + "logps/rejected": -1.6075427532196045, + "loss": 1.0749, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3508412837982178, + "rewards/margins": 0.256701797246933, + "rewards/rejected": -1.6075427532196045, + "sft_loss": 1.4097492694854736, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 6.723527857237567, + "learning_rate": 2.5618158537392933e-06, + "logits/chosen": -0.17094561457633972, + "logits/rejected": -0.0991709753870964, + "logps/chosen": -1.3676115274429321, + "logps/rejected": -1.6341607570648193, + "loss": 1.039, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3676115274429321, + "rewards/margins": 0.2665492594242096, + "rewards/rejected": -1.6341607570648193, + "sft_loss": 1.3335355520248413, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 6.955491736729167, + "learning_rate": 2.5585105619014042e-06, + "logits/chosen": -0.2364232838153839, + "logits/rejected": -0.08536256849765778, + "logps/chosen": -1.3084943294525146, + "logps/rejected": -1.698646903038025, + "loss": 1.0407, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3084943294525146, + "rewards/margins": 0.39015254378318787, + "rewards/rejected": -1.698646903038025, + "sft_loss": 1.3571867942810059, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 4.935043830160641, + "learning_rate": 2.555195000389401e-06, + "logits/chosen": -0.09421467036008835, + "logits/rejected": -0.06247818470001221, + "logps/chosen": -1.398600697517395, + "logps/rejected": -1.5775256156921387, + "loss": 1.0923, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.398600697517395, + "rewards/margins": 0.1789250671863556, + "rewards/rejected": -1.5775256156921387, + "sft_loss": 1.4048207998275757, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 5.488009318290176, + "learning_rate": 2.5518692013708764e-06, + "logits/chosen": -0.19971255958080292, + "logits/rejected": -0.13240863382816315, + "logps/chosen": -1.3496644496917725, + "logps/rejected": -1.4494019746780396, + "loss": 1.117, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3496644496917725, + "rewards/margins": 0.09973742812871933, + "rewards/rejected": -1.4494019746780396, + "sft_loss": 1.3982349634170532, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 5.861461820294152, + "learning_rate": 2.5485331971127467e-06, + "logits/chosen": -0.17495878040790558, + "logits/rejected": -0.061311207711696625, + "logps/chosen": -1.3403594493865967, + "logps/rejected": -1.7291936874389648, + "loss": 1.0203, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3403594493865967, + "rewards/margins": 0.3888341784477234, + "rewards/rejected": -1.7291936874389648, + "sft_loss": 1.3728001117706299, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 8.398297831143816, + "learning_rate": 2.5451870199809398e-06, + "logits/chosen": -0.2426266372203827, + "logits/rejected": -0.15641649067401886, + "logps/chosen": -1.3813214302062988, + "logps/rejected": -1.6519542932510376, + "loss": 1.1025, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3813214302062988, + "rewards/margins": 0.27063268423080444, + "rewards/rejected": -1.6519542932510376, + "sft_loss": 1.407288908958435, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 10.421658236786511, + "learning_rate": 2.5418307024400808e-06, + "logits/chosen": -0.3867731988430023, + "logits/rejected": -0.25579994916915894, + "logps/chosen": -1.4403059482574463, + "logps/rejected": -1.6446120738983154, + "loss": 1.1147, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4403059482574463, + "rewards/margins": 0.20430617034435272, + "rewards/rejected": -1.6446120738983154, + "sft_loss": 1.4047480821609497, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 8.945771287029995, + "learning_rate": 2.538464277053178e-06, + "logits/chosen": -0.3046836256980896, + "logits/rejected": -0.20668058097362518, + "logps/chosen": -1.3661428689956665, + "logps/rejected": -1.698788046836853, + "loss": 1.0565, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3661428689956665, + "rewards/margins": 0.332645058631897, + "rewards/rejected": -1.698788046836853, + "sft_loss": 1.3933660984039307, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 6.4600258169068105, + "learning_rate": 2.5350877764813042e-06, + "logits/chosen": -0.24842870235443115, + "logits/rejected": -0.17129719257354736, + "logps/chosen": -1.4513623714447021, + "logps/rejected": -1.7643362283706665, + "loss": 1.0665, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4513623714447021, + "rewards/margins": 0.3129737079143524, + "rewards/rejected": -1.7643362283706665, + "sft_loss": 1.4227665662765503, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 5.726169321686694, + "learning_rate": 2.531701233483284e-06, + "logits/chosen": -0.2357204407453537, + "logits/rejected": -0.1661442667245865, + "logps/chosen": -1.3102283477783203, + "logps/rejected": -1.7116100788116455, + "loss": 1.0178, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3102283477783203, + "rewards/margins": 0.40138188004493713, + "rewards/rejected": -1.7116100788116455, + "sft_loss": 1.3458659648895264, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 6.694603142400152, + "learning_rate": 2.5283046809153708e-06, + "logits/chosen": -0.28832611441612244, + "logits/rejected": -0.1387026458978653, + "logps/chosen": -1.4116193056106567, + "logps/rejected": -1.744187593460083, + "loss": 1.0715, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4116193056106567, + "rewards/margins": 0.3325682580471039, + "rewards/rejected": -1.744187593460083, + "sft_loss": 1.4343914985656738, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 8.177964909816092, + "learning_rate": 2.524898151730934e-06, + "logits/chosen": -0.3231280446052551, + "logits/rejected": -0.19551904499530792, + "logps/chosen": -1.3622983694076538, + "logps/rejected": -1.7303298711776733, + "loss": 1.0075, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3622983694076538, + "rewards/margins": 0.36803165078163147, + "rewards/rejected": -1.7303298711776733, + "sft_loss": 1.3062353134155273, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 7.594474341201221, + "learning_rate": 2.5214816789801337e-06, + "logits/chosen": -0.2040158212184906, + "logits/rejected": -0.044173695147037506, + "logps/chosen": -1.3152166604995728, + "logps/rejected": -1.9164692163467407, + "loss": 0.9674, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3152166604995728, + "rewards/margins": 0.6012526154518127, + "rewards/rejected": -1.9164692163467407, + "sft_loss": 1.3626232147216797, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 5.9740793893869935, + "learning_rate": 2.518055295809604e-06, + "logits/chosen": -0.2451355904340744, + "logits/rejected": -0.16891005635261536, + "logps/chosen": -1.2620337009429932, + "logps/rejected": -1.720070242881775, + "loss": 0.9786, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2620337009429932, + "rewards/margins": 0.4580365717411041, + "rewards/rejected": -1.720070242881775, + "sft_loss": 1.2582600116729736, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 5.63311132788852, + "learning_rate": 2.5146190354621295e-06, + "logits/chosen": -0.29360517859458923, + "logits/rejected": -0.09512095153331757, + "logps/chosen": -1.2986427545547485, + "logps/rejected": -1.7647788524627686, + "loss": 0.9846, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2986427545547485, + "rewards/margins": 0.4661361575126648, + "rewards/rejected": -1.7647788524627686, + "sft_loss": 1.3902837038040161, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 6.071013462973501, + "learning_rate": 2.511172931276323e-06, + "logits/chosen": -0.23120129108428955, + "logits/rejected": -0.1789826899766922, + "logps/chosen": -1.3082417249679565, + "logps/rejected": -1.6579334735870361, + "loss": 0.9905, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3082417249679565, + "rewards/margins": 0.34969156980514526, + "rewards/rejected": -1.6579334735870361, + "sft_loss": 1.325251817703247, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 4.573280999864407, + "learning_rate": 2.5077170166863026e-06, + "logits/chosen": -0.3544650971889496, + "logits/rejected": -0.10121510177850723, + "logps/chosen": -1.328932762145996, + "logps/rejected": -1.7810074090957642, + "loss": 0.9955, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.328932762145996, + "rewards/margins": 0.4520746171474457, + "rewards/rejected": -1.7810074090957642, + "sft_loss": 1.377091646194458, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 4.361287268957535, + "learning_rate": 2.504251325221366e-06, + "logits/chosen": -0.28322911262512207, + "logits/rejected": -0.14230282604694366, + "logps/chosen": -1.3653513193130493, + "logps/rejected": -1.7526214122772217, + "loss": 1.0219, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3653513193130493, + "rewards/margins": 0.38727012276649475, + "rewards/rejected": -1.7526214122772217, + "sft_loss": 1.3574550151824951, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 6.172709458505564, + "learning_rate": 2.500775890505668e-06, + "logits/chosen": -0.3746119737625122, + "logits/rejected": -0.2532230019569397, + "logps/chosen": -1.307799220085144, + "logps/rejected": -1.6917346715927124, + "loss": 0.9872, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.307799220085144, + "rewards/margins": 0.38393548130989075, + "rewards/rejected": -1.6917346715927124, + "sft_loss": 1.3196412324905396, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 4.0633181580152735, + "learning_rate": 2.497290746257891e-06, + "logits/chosen": -0.29822617769241333, + "logits/rejected": -0.22360272705554962, + "logps/chosen": -1.2829499244689941, + "logps/rejected": -1.6371288299560547, + "loss": 1.0297, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2829499244689941, + "rewards/margins": 0.35417887568473816, + "rewards/rejected": -1.6371288299560547, + "sft_loss": 1.359552025794983, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 7.153620462384848, + "learning_rate": 2.49379592629092e-06, + "logits/chosen": -0.34407928586006165, + "logits/rejected": -0.27675971388816833, + "logps/chosen": -1.1805469989776611, + "logps/rejected": -1.652948021888733, + "loss": 0.9315, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1805469989776611, + "rewards/margins": 0.47240084409713745, + "rewards/rejected": -1.652948021888733, + "sft_loss": 1.2449908256530762, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 6.279139416421326, + "learning_rate": 2.4902914645115135e-06, + "logits/chosen": -0.46754807233810425, + "logits/rejected": -0.27566924691200256, + "logps/chosen": -1.3483153581619263, + "logps/rejected": -1.741081953048706, + "loss": 1.0151, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3483153581619263, + "rewards/margins": 0.39276665449142456, + "rewards/rejected": -1.741081953048706, + "sft_loss": 1.4039819240570068, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 15.078125841372746, + "learning_rate": 2.4867773949199748e-06, + "logits/chosen": -0.4051434397697449, + "logits/rejected": -0.27849093079566956, + "logps/chosen": -1.2485005855560303, + "logps/rejected": -1.7951538562774658, + "loss": 0.9279, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2485005855560303, + "rewards/margins": 0.5466530919075012, + "rewards/rejected": -1.7951538562774658, + "sft_loss": 1.3159576654434204, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 7.358861910072948, + "learning_rate": 2.483253751609823e-06, + "logits/chosen": -0.3944740891456604, + "logits/rejected": -0.21546992659568787, + "logps/chosen": -1.341235637664795, + "logps/rejected": -1.9729608297348022, + "loss": 0.9468, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.341235637664795, + "rewards/margins": 0.6317251920700073, + "rewards/rejected": -1.9729608297348022, + "sft_loss": 1.3817254304885864, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 6.991473226065031, + "learning_rate": 2.4797205687674608e-06, + "logits/chosen": -0.31449756026268005, + "logits/rejected": -0.21377721428871155, + "logps/chosen": -1.345902681350708, + "logps/rejected": -1.9185478687286377, + "loss": 0.9752, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.345902681350708, + "rewards/margins": 0.5726450085639954, + "rewards/rejected": -1.9185478687286377, + "sft_loss": 1.3627817630767822, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 11.559584684546387, + "learning_rate": 2.476177880671843e-06, + "logits/chosen": -0.43540072441101074, + "logits/rejected": -0.27904751896858215, + "logps/chosen": -1.3725993633270264, + "logps/rejected": -2.1540937423706055, + "loss": 0.9587, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3725993633270264, + "rewards/margins": 0.7814942598342896, + "rewards/rejected": -2.1540937423706055, + "sft_loss": 1.4134228229522705, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 7.589617000083219, + "learning_rate": 2.4726257216941463e-06, + "logits/chosen": -0.34726226329803467, + "logits/rejected": -0.1460946649312973, + "logps/chosen": -1.3964488506317139, + "logps/rejected": -1.9331518411636353, + "loss": 1.0112, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3964488506317139, + "rewards/margins": 0.5367029905319214, + "rewards/rejected": -1.9331518411636353, + "sft_loss": 1.4542505741119385, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 5.53217072199513, + "learning_rate": 2.4690641262974317e-06, + "logits/chosen": -0.3106427490711212, + "logits/rejected": -0.24054069817066193, + "logps/chosen": -1.254062294960022, + "logps/rejected": -1.7604010105133057, + "loss": 0.9492, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.254062294960022, + "rewards/margins": 0.5063384771347046, + "rewards/rejected": -1.7604010105133057, + "sft_loss": 1.2828395366668701, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 5.789997895299221, + "learning_rate": 2.4654931290363135e-06, + "logits/chosen": -0.39274802803993225, + "logits/rejected": -0.3678857684135437, + "logps/chosen": -1.3077292442321777, + "logps/rejected": -1.8038746118545532, + "loss": 0.9884, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3077292442321777, + "rewards/margins": 0.4961455464363098, + "rewards/rejected": -1.8038746118545532, + "sft_loss": 1.3811976909637451, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 7.167359990925718, + "learning_rate": 2.461912764556623e-06, + "logits/chosen": -0.32825708389282227, + "logits/rejected": -0.2701486349105835, + "logps/chosen": -1.2460230588912964, + "logps/rejected": -1.9635534286499023, + "loss": 0.9254, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2460230588912964, + "rewards/margins": 0.717530369758606, + "rewards/rejected": -1.9635534286499023, + "sft_loss": 1.3071447610855103, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 4.728180593919103, + "learning_rate": 2.4583230675950717e-06, + "logits/chosen": -0.3871462643146515, + "logits/rejected": -0.2666874825954437, + "logps/chosen": -1.287161111831665, + "logps/rejected": -1.8006842136383057, + "loss": 0.9718, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.287161111831665, + "rewards/margins": 0.5135231614112854, + "rewards/rejected": -1.8006842136383057, + "sft_loss": 1.3299510478973389, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 6.5082169516852355, + "learning_rate": 2.4547240729789156e-06, + "logits/chosen": -0.3427007794380188, + "logits/rejected": -0.26755183935165405, + "logps/chosen": -1.2617965936660767, + "logps/rejected": -1.7926756143569946, + "loss": 0.9507, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2617965936660767, + "rewards/margins": 0.5308788418769836, + "rewards/rejected": -1.7926756143569946, + "sft_loss": 1.3087375164031982, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 6.383588115894568, + "learning_rate": 2.451115815625617e-06, + "logits/chosen": -0.2760846018791199, + "logits/rejected": -0.16218586266040802, + "logps/chosen": -1.3795157670974731, + "logps/rejected": -1.9243261814117432, + "loss": 0.9986, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3795157670974731, + "rewards/margins": 0.5448102951049805, + "rewards/rejected": -1.9243261814117432, + "sft_loss": 1.3830487728118896, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 5.9339767920468, + "learning_rate": 2.4474983305425025e-06, + "logits/chosen": -0.35163047909736633, + "logits/rejected": -0.20219776034355164, + "logps/chosen": -1.3649961948394775, + "logps/rejected": -1.8313382863998413, + "loss": 1.0088, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3649961948394775, + "rewards/margins": 0.46634215116500854, + "rewards/rejected": -1.8313382863998413, + "sft_loss": 1.356627345085144, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 9.750541232945201, + "learning_rate": 2.4438716528264307e-06, + "logits/chosen": -0.39386358857154846, + "logits/rejected": -0.3099510073661804, + "logps/chosen": -1.3966448307037354, + "logps/rejected": -1.8608152866363525, + "loss": 0.989, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3966448307037354, + "rewards/margins": 0.4641706049442291, + "rewards/rejected": -1.8608152866363525, + "sft_loss": 1.3813846111297607, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 6.0436821093239175, + "learning_rate": 2.440235817663443e-06, + "logits/chosen": -0.2763604521751404, + "logits/rejected": -0.14427319169044495, + "logps/chosen": -1.2740201950073242, + "logps/rejected": -1.9402685165405273, + "loss": 0.941, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2740201950073242, + "rewards/margins": 0.6662485003471375, + "rewards/rejected": -1.9402685165405273, + "sft_loss": 1.3213818073272705, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 13.243435815567677, + "learning_rate": 2.4365908603284285e-06, + "logits/chosen": -0.3816941976547241, + "logits/rejected": -0.23525352776050568, + "logps/chosen": -1.4328410625457764, + "logps/rejected": -2.0395724773406982, + "loss": 1.0569, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4328410625457764, + "rewards/margins": 0.6067315340042114, + "rewards/rejected": -2.0395724773406982, + "sft_loss": 1.42452871799469, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": -0.06950338929891586, + "eval_logits/rejected": 0.009176608175039291, + "eval_logps/chosen": -1.4133427143096924, + "eval_logps/rejected": -1.8531452417373657, + "eval_loss": 1.0426981449127197, + "eval_rewards/accuracies": 0.6298219561576843, + "eval_rewards/chosen": -1.4133427143096924, + "eval_rewards/margins": 0.43980276584625244, + "eval_rewards/rejected": -1.8531452417373657, + "eval_runtime": 43.1749, + "eval_samples_per_second": 31.152, + "eval_sft_loss": 1.416014313697815, + "eval_steps_per_second": 7.805, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 6.708271089434237, + "learning_rate": 2.4329368161847796e-06, + "logits/chosen": -0.34353774785995483, + "logits/rejected": -0.27555742859840393, + "logps/chosen": -1.352007269859314, + "logps/rejected": -1.7914186716079712, + "loss": 1.0402, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.352007269859314, + "rewards/margins": 0.4394114911556244, + "rewards/rejected": -1.7914186716079712, + "sft_loss": 1.4041087627410889, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 8.254107888627756, + "learning_rate": 2.4292737206840483e-06, + "logits/chosen": -0.2618446946144104, + "logits/rejected": -0.16235823929309845, + "logps/chosen": -1.2731168270111084, + "logps/rejected": -1.714038610458374, + "loss": 0.9913, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2731168270111084, + "rewards/margins": 0.4409221112728119, + "rewards/rejected": -1.714038610458374, + "sft_loss": 1.3453394174575806, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 7.6854724761378135, + "learning_rate": 2.4256016093656035e-06, + "logits/chosen": -0.3184880018234253, + "logits/rejected": -0.17739680409431458, + "logps/chosen": -1.2959625720977783, + "logps/rejected": -1.75029718875885, + "loss": 0.9699, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2959625720977783, + "rewards/margins": 0.454334557056427, + "rewards/rejected": -1.75029718875885, + "sft_loss": 1.289369821548462, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 6.3231906945046825, + "learning_rate": 2.421920517856285e-06, + "logits/chosen": -0.38904905319213867, + "logits/rejected": -0.21985527873039246, + "logps/chosen": -1.3628504276275635, + "logps/rejected": -1.8863815069198608, + "loss": 0.977, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3628504276275635, + "rewards/margins": 0.5235310196876526, + "rewards/rejected": -1.8863815069198608, + "sft_loss": 1.369199514389038, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 7.658576283536331, + "learning_rate": 2.418230481870058e-06, + "logits/chosen": -0.300046443939209, + "logits/rejected": -0.17431317269802094, + "logps/chosen": -1.3725075721740723, + "logps/rejected": -1.9859756231307983, + "loss": 0.9873, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3725075721740723, + "rewards/margins": 0.6134681701660156, + "rewards/rejected": -1.9859756231307983, + "sft_loss": 1.4489480257034302, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 5.871835625416842, + "learning_rate": 2.41453153720767e-06, + "logits/chosen": -0.3782210946083069, + "logits/rejected": -0.36119428277015686, + "logps/chosen": -1.2919793128967285, + "logps/rejected": -1.691510796546936, + "loss": 1.0062, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2919793128967285, + "rewards/margins": 0.39953145384788513, + "rewards/rejected": -1.691510796546936, + "sft_loss": 1.3468921184539795, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 6.901454702299402, + "learning_rate": 2.4108237197562963e-06, + "logits/chosen": -0.4194413721561432, + "logits/rejected": -0.2624618411064148, + "logps/chosen": -1.327225685119629, + "logps/rejected": -1.884225606918335, + "loss": 0.9807, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.327225685119629, + "rewards/margins": 0.5570000410079956, + "rewards/rejected": -1.884225606918335, + "sft_loss": 1.346602201461792, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 29.751282062885846, + "learning_rate": 2.407107065489199e-06, + "logits/chosen": -0.471548855304718, + "logits/rejected": -0.42120179533958435, + "logps/chosen": -1.386148452758789, + "logps/rejected": -1.854129433631897, + "loss": 1.0606, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.386148452758789, + "rewards/margins": 0.46798110008239746, + "rewards/rejected": -1.854129433631897, + "sft_loss": 1.4089126586914062, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 6.698173603667034, + "learning_rate": 2.403381610465374e-06, + "logits/chosen": -0.32573026418685913, + "logits/rejected": -0.289516419172287, + "logps/chosen": -1.35390305519104, + "logps/rejected": -1.8425319194793701, + "loss": 0.9591, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.35390305519104, + "rewards/margins": 0.488629013299942, + "rewards/rejected": -1.8425319194793701, + "sft_loss": 1.3057984113693237, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 6.012288384070778, + "learning_rate": 2.3996473908292017e-06, + "logits/chosen": -0.4606549143791199, + "logits/rejected": -0.3668895959854126, + "logps/chosen": -1.3278155326843262, + "logps/rejected": -1.7195066213607788, + "loss": 1.0423, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3278155326843262, + "rewards/margins": 0.39169105887413025, + "rewards/rejected": -1.7195066213607788, + "sft_loss": 1.4088014364242554, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 7.264707775672911, + "learning_rate": 2.3959044428100985e-06, + "logits/chosen": -0.3544057011604309, + "logits/rejected": -0.25213176012039185, + "logps/chosen": -1.293614149093628, + "logps/rejected": -1.742283821105957, + "loss": 1.0046, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.293614149093628, + "rewards/margins": 0.4486696124076843, + "rewards/rejected": -1.742283821105957, + "sft_loss": 1.35196852684021, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 6.904375922387609, + "learning_rate": 2.392152802722162e-06, + "logits/chosen": -0.29229849576950073, + "logits/rejected": -0.25606706738471985, + "logps/chosen": -1.3440862894058228, + "logps/rejected": -1.8628699779510498, + "loss": 1.0064, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3440862894058228, + "rewards/margins": 0.5187836289405823, + "rewards/rejected": -1.8628699779510498, + "sft_loss": 1.4032114744186401, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 6.313972841462663, + "learning_rate": 2.38839250696382e-06, + "logits/chosen": -0.3327978551387787, + "logits/rejected": -0.22336368262767792, + "logps/chosen": -1.2951395511627197, + "logps/rejected": -1.7230771780014038, + "loss": 0.9956, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2951395511627197, + "rewards/margins": 0.42793768644332886, + "rewards/rejected": -1.7230771780014038, + "sft_loss": 1.300781488418579, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 6.052818712794828, + "learning_rate": 2.3846235920174794e-06, + "logits/chosen": -0.3500472903251648, + "logits/rejected": -0.21844089031219482, + "logps/chosen": -1.2430912256240845, + "logps/rejected": -1.7730716466903687, + "loss": 0.934, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2430912256240845, + "rewards/margins": 0.529980480670929, + "rewards/rejected": -1.7730716466903687, + "sft_loss": 1.2939088344573975, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 10.532652555200189, + "learning_rate": 2.380846094449169e-06, + "logits/chosen": -0.3928828835487366, + "logits/rejected": -0.3052484393119812, + "logps/chosen": -1.3178021907806396, + "logps/rejected": -1.8242807388305664, + "loss": 0.9956, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3178021907806396, + "rewards/margins": 0.5064784288406372, + "rewards/rejected": -1.8242807388305664, + "sft_loss": 1.3954349756240845, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 5.505296514008136, + "learning_rate": 2.3770600509081872e-06, + "logits/chosen": -0.448671817779541, + "logits/rejected": -0.29919299483299255, + "logps/chosen": -1.252861738204956, + "logps/rejected": -1.7711899280548096, + "loss": 0.9439, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.252861738204956, + "rewards/margins": 0.5183283090591431, + "rewards/rejected": -1.7711899280548096, + "sft_loss": 1.30695378780365, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 9.690465905387198, + "learning_rate": 2.373265498126745e-06, + "logits/chosen": -0.4107128083705902, + "logits/rejected": -0.31409990787506104, + "logps/chosen": -1.3148083686828613, + "logps/rejected": -1.8912432193756104, + "loss": 0.974, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3148083686828613, + "rewards/margins": 0.5764346122741699, + "rewards/rejected": -1.8912432193756104, + "sft_loss": 1.3471014499664307, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 8.136623207062934, + "learning_rate": 2.36946247291961e-06, + "logits/chosen": -0.4802681803703308, + "logits/rejected": -0.4794479012489319, + "logps/chosen": -1.3315585851669312, + "logps/rejected": -1.769972801208496, + "loss": 1.0317, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3315585851669312, + "rewards/margins": 0.4384143352508545, + "rewards/rejected": -1.769972801208496, + "sft_loss": 1.4288564920425415, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 6.752891233419036, + "learning_rate": 2.3656510121837492e-06, + "logits/chosen": -0.4110310971736908, + "logits/rejected": -0.27823466062545776, + "logps/chosen": -1.4545396566390991, + "logps/rejected": -1.8475834131240845, + "loss": 1.0645, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.4545396566390991, + "rewards/margins": 0.3930436670780182, + "rewards/rejected": -1.8475834131240845, + "sft_loss": 1.485935091972351, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 8.288303344067318, + "learning_rate": 2.3618311528979717e-06, + "logits/chosen": -0.2994609475135803, + "logits/rejected": -0.2679150700569153, + "logps/chosen": -1.418294906616211, + "logps/rejected": -1.8473249673843384, + "loss": 1.0135, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.418294906616211, + "rewards/margins": 0.4290298521518707, + "rewards/rejected": -1.8473249673843384, + "sft_loss": 1.4293949604034424, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 8.36414752887087, + "learning_rate": 2.3580029321225692e-06, + "logits/chosen": -0.30061233043670654, + "logits/rejected": -0.19322152435779572, + "logps/chosen": -1.3510148525238037, + "logps/rejected": -1.9490478038787842, + "loss": 0.9667, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3510148525238037, + "rewards/margins": 0.5980329513549805, + "rewards/rejected": -1.9490478038787842, + "sft_loss": 1.3427172899246216, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 5.440835529900344, + "learning_rate": 2.354166386998956e-06, + "logits/chosen": -0.3956003189086914, + "logits/rejected": -0.24811363220214844, + "logps/chosen": -1.2949475049972534, + "logps/rejected": -2.068253993988037, + "loss": 0.9611, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2949475049972534, + "rewards/margins": 0.7733063101768494, + "rewards/rejected": -2.068253993988037, + "sft_loss": 1.3592411279678345, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 7.4646506111629565, + "learning_rate": 2.3503215547493097e-06, + "logits/chosen": -0.24873106181621552, + "logits/rejected": -0.20497290790081024, + "logps/chosen": -1.326761245727539, + "logps/rejected": -1.8516361713409424, + "loss": 1.0211, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.326761245727539, + "rewards/margins": 0.524874746799469, + "rewards/rejected": -1.8516361713409424, + "sft_loss": 1.3926951885223389, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 6.778428238441266, + "learning_rate": 2.3464684726762104e-06, + "logits/chosen": -0.370392769575119, + "logits/rejected": -0.3410020172595978, + "logps/chosen": -1.3269846439361572, + "logps/rejected": -1.7383434772491455, + "loss": 1.0277, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3269846439361572, + "rewards/margins": 0.41135889291763306, + "rewards/rejected": -1.7383434772491455, + "sft_loss": 1.4061932563781738, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 5.1636028770142826, + "learning_rate": 2.342607178162276e-06, + "logits/chosen": -0.2839960753917694, + "logits/rejected": -0.22470524907112122, + "logps/chosen": -1.260023832321167, + "logps/rejected": -1.901210069656372, + "loss": 0.9193, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.260023832321167, + "rewards/margins": 0.6411863565444946, + "rewards/rejected": -1.901210069656372, + "sft_loss": 1.2899072170257568, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 14.15197203351975, + "learning_rate": 2.338737708669804e-06, + "logits/chosen": -0.2899111211299896, + "logits/rejected": -0.061303604394197464, + "logps/chosen": -1.3471908569335938, + "logps/rejected": -1.8845421075820923, + "loss": 1.0016, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3471908569335938, + "rewards/margins": 0.5373513698577881, + "rewards/rejected": -1.8845421075820923, + "sft_loss": 1.4021618366241455, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 7.043226372929394, + "learning_rate": 2.334860101740404e-06, + "logits/chosen": -0.3275575637817383, + "logits/rejected": -0.17974238097667694, + "logps/chosen": -1.3362411260604858, + "logps/rejected": -1.9289495944976807, + "loss": 0.9747, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3362411260604858, + "rewards/margins": 0.5927082896232605, + "rewards/rejected": -1.9289495944976807, + "sft_loss": 1.362756609916687, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 15.077464220247649, + "learning_rate": 2.330974394994635e-06, + "logits/chosen": -0.371978759765625, + "logits/rejected": -0.24568262696266174, + "logps/chosen": -1.3593480587005615, + "logps/rejected": -1.8906453847885132, + "loss": 0.999, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3593480587005615, + "rewards/margins": 0.5312973856925964, + "rewards/rejected": -1.8906453847885132, + "sft_loss": 1.373910665512085, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 8.061203841557813, + "learning_rate": 2.327080626131641e-06, + "logits/chosen": -0.34131118655204773, + "logits/rejected": -0.2771221995353699, + "logps/chosen": -1.2546392679214478, + "logps/rejected": -1.965981125831604, + "loss": 0.9339, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2546392679214478, + "rewards/margins": 0.7113418579101562, + "rewards/rejected": -1.965981125831604, + "sft_loss": 1.3372882604599, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 6.198725779486274, + "learning_rate": 2.3231788329287855e-06, + "logits/chosen": -0.3761574625968933, + "logits/rejected": -0.33477336168289185, + "logps/chosen": -1.4132637977600098, + "logps/rejected": -1.9284454584121704, + "loss": 1.0295, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4132637977600098, + "rewards/margins": 0.5151815414428711, + "rewards/rejected": -1.9284454584121704, + "sft_loss": 1.4557913541793823, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 8.643689993037764, + "learning_rate": 2.3192690532412827e-06, + "logits/chosen": -0.3037932217121124, + "logits/rejected": -0.24522796273231506, + "logps/chosen": -1.3926770687103271, + "logps/rejected": -1.7917263507843018, + "loss": 1.0373, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3926770687103271, + "rewards/margins": 0.3990491032600403, + "rewards/rejected": -1.7917263507843018, + "sft_loss": 1.4622339010238647, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 8.998457327542715, + "learning_rate": 2.315351325001832e-06, + "logits/chosen": -0.3872067928314209, + "logits/rejected": -0.2904983162879944, + "logps/chosen": -1.3250735998153687, + "logps/rejected": -1.9157615900039673, + "loss": 0.9745, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3250735998153687, + "rewards/margins": 0.5906879305839539, + "rewards/rejected": -1.9157615900039673, + "sft_loss": 1.3803297281265259, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 6.358258923592004, + "learning_rate": 2.3114256862202495e-06, + "logits/chosen": -0.3770531713962555, + "logits/rejected": -0.21621887385845184, + "logps/chosen": -1.3132208585739136, + "logps/rejected": -1.9442708492279053, + "loss": 0.9482, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3132208585739136, + "rewards/margins": 0.6310499906539917, + "rewards/rejected": -1.9442708492279053, + "sft_loss": 1.3465019464492798, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 4.281762607044848, + "learning_rate": 2.3074921749831013e-06, + "logits/chosen": -0.33440592885017395, + "logits/rejected": -0.17447985708713531, + "logps/chosen": -1.3043618202209473, + "logps/rejected": -1.869027853012085, + "loss": 0.97, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3043618202209473, + "rewards/margins": 0.5646663904190063, + "rewards/rejected": -1.869027853012085, + "sft_loss": 1.3304054737091064, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 5.573774423455921, + "learning_rate": 2.30355082945333e-06, + "logits/chosen": -0.3903493881225586, + "logits/rejected": -0.23366177082061768, + "logps/chosen": -1.3278374671936035, + "logps/rejected": -1.7130565643310547, + "loss": 1.0106, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3278374671936035, + "rewards/margins": 0.3852190375328064, + "rewards/rejected": -1.7130565643310547, + "sft_loss": 1.3740935325622559, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 5.561648952860178, + "learning_rate": 2.2996016878698866e-06, + "logits/chosen": -0.41646987199783325, + "logits/rejected": -0.36212459206581116, + "logps/chosen": -1.2699323892593384, + "logps/rejected": -1.782680869102478, + "loss": 0.978, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2699323892593384, + "rewards/margins": 0.5127487182617188, + "rewards/rejected": -1.782680869102478, + "sft_loss": 1.3473727703094482, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 7.516882134975139, + "learning_rate": 2.2956447885473607e-06, + "logits/chosen": -0.32953667640686035, + "logits/rejected": -0.20225711166858673, + "logps/chosen": -1.3292700052261353, + "logps/rejected": -1.7645435333251953, + "loss": 0.9786, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3292700052261353, + "rewards/margins": 0.4352734088897705, + "rewards/rejected": -1.7645435333251953, + "sft_loss": 1.3366014957427979, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 5.618775063863522, + "learning_rate": 2.2916801698756063e-06, + "logits/chosen": -0.2886897027492523, + "logits/rejected": -0.2478192150592804, + "logps/chosen": -1.3457328081130981, + "logps/rejected": -1.8688604831695557, + "loss": 1.0074, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3457328081130981, + "rewards/margins": 0.5231277346611023, + "rewards/rejected": -1.8688604831695557, + "sft_loss": 1.4351282119750977, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 7.458063698457432, + "learning_rate": 2.287707870319372e-06, + "logits/chosen": -0.41209912300109863, + "logits/rejected": -0.32010191679000854, + "logps/chosen": -1.3309061527252197, + "logps/rejected": -1.969655990600586, + "loss": 0.98, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3309061527252197, + "rewards/margins": 0.6387497782707214, + "rewards/rejected": -1.969655990600586, + "sft_loss": 1.379540205001831, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 7.046611503518798, + "learning_rate": 2.283727928417925e-06, + "logits/chosen": -0.48315876722335815, + "logits/rejected": -0.49728766083717346, + "logps/chosen": -1.3308050632476807, + "logps/rejected": -1.9045906066894531, + "loss": 0.9829, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3308050632476807, + "rewards/margins": 0.573785662651062, + "rewards/rejected": -1.9045906066894531, + "sft_loss": 1.400823950767517, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 6.089821566872165, + "learning_rate": 2.27974038278468e-06, + "logits/chosen": -0.5165932774543762, + "logits/rejected": -0.3608459532260895, + "logps/chosen": -1.2554948329925537, + "logps/rejected": -1.802454948425293, + "loss": 0.9347, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2554948329925537, + "rewards/margins": 0.5469598770141602, + "rewards/rejected": -1.802454948425293, + "sft_loss": 1.268705129623413, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 7.429965290613201, + "learning_rate": 2.2757452721068206e-06, + "logits/chosen": -0.5314079523086548, + "logits/rejected": -0.44425535202026367, + "logps/chosen": -1.1852877140045166, + "logps/rejected": -1.7990539073944092, + "loss": 0.9296, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1852877140045166, + "rewards/margins": 0.6137663125991821, + "rewards/rejected": -1.7990539073944092, + "sft_loss": 1.2573305368423462, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 8.428947513370273, + "learning_rate": 2.2717426351449294e-06, + "logits/chosen": -0.5099958181381226, + "logits/rejected": -0.453339159488678, + "logps/chosen": -1.4342055320739746, + "logps/rejected": -2.0443735122680664, + "loss": 0.9893, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4342055320739746, + "rewards/margins": 0.6101681590080261, + "rewards/rejected": -2.0443735122680664, + "sft_loss": 1.3941960334777832, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 8.649507351765893, + "learning_rate": 2.2677325107326067e-06, + "logits/chosen": -0.5668259859085083, + "logits/rejected": -0.46763554215431213, + "logps/chosen": -1.2631756067276, + "logps/rejected": -1.745678186416626, + "loss": 0.9944, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2631756067276, + "rewards/margins": 0.4825025200843811, + "rewards/rejected": -1.745678186416626, + "sft_loss": 1.325333833694458, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 6.626725196731764, + "learning_rate": 2.2637149377760985e-06, + "logits/chosen": -0.5213819742202759, + "logits/rejected": -0.3564545512199402, + "logps/chosen": -1.239008903503418, + "logps/rejected": -1.8543846607208252, + "loss": 0.9431, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.239008903503418, + "rewards/margins": 0.6153759360313416, + "rewards/rejected": -1.8543846607208252, + "sft_loss": 1.3210550546646118, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 10.636372148532034, + "learning_rate": 2.2596899552539136e-06, + "logits/chosen": -0.5316981077194214, + "logits/rejected": -0.4064369201660156, + "logps/chosen": -1.340803861618042, + "logps/rejected": -2.076864004135132, + "loss": 0.9649, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.340803861618042, + "rewards/margins": 0.7360602021217346, + "rewards/rejected": -2.076864004135132, + "sft_loss": 1.355507731437683, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 6.704793830964508, + "learning_rate": 2.2556576022164516e-06, + "logits/chosen": -0.4898918569087982, + "logits/rejected": -0.3309337794780731, + "logps/chosen": -1.28102445602417, + "logps/rejected": -1.877788782119751, + "loss": 0.9655, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.28102445602417, + "rewards/margins": 0.5967644453048706, + "rewards/rejected": -1.877788782119751, + "sft_loss": 1.312239408493042, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 6.073604347367257, + "learning_rate": 2.2516179177856182e-06, + "logits/chosen": -0.48533496260643005, + "logits/rejected": -0.35858696699142456, + "logps/chosen": -1.2944753170013428, + "logps/rejected": -1.8752460479736328, + "loss": 0.9308, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2944753170013428, + "rewards/margins": 0.5807708501815796, + "rewards/rejected": -1.8752460479736328, + "sft_loss": 1.343420386314392, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 6.93473369461339, + "learning_rate": 2.2475709411544503e-06, + "logits/chosen": -0.4326443672180176, + "logits/rejected": -0.39417821168899536, + "logps/chosen": -1.2652297019958496, + "logps/rejected": -1.755052924156189, + "loss": 0.9683, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2652297019958496, + "rewards/margins": 0.48982328176498413, + "rewards/rejected": -1.755052924156189, + "sft_loss": 1.3312979936599731, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 7.19328412489189, + "learning_rate": 2.2435167115867325e-06, + "logits/chosen": -0.4243658185005188, + "logits/rejected": -0.41740983724594116, + "logps/chosen": -1.27482008934021, + "logps/rejected": -1.8900315761566162, + "loss": 0.9154, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.27482008934021, + "rewards/margins": 0.6152116060256958, + "rewards/rejected": -1.8900315761566162, + "sft_loss": 1.3052546977996826, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 8.271740290521542, + "learning_rate": 2.239455268416618e-06, + "logits/chosen": -0.49116426706314087, + "logits/rejected": -0.404990017414093, + "logps/chosen": -1.3746150732040405, + "logps/rejected": -1.832044005393982, + "loss": 1.0338, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3746150732040405, + "rewards/margins": 0.45742878317832947, + "rewards/rejected": -1.832044005393982, + "sft_loss": 1.3913629055023193, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 6.574147946513313, + "learning_rate": 2.2353866510482463e-06, + "logits/chosen": -0.43215814232826233, + "logits/rejected": -0.44993463158607483, + "logps/chosen": -1.3542969226837158, + "logps/rejected": -1.8027689456939697, + "loss": 0.9946, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3542969226837158, + "rewards/margins": 0.4484720230102539, + "rewards/rejected": -1.8027689456939697, + "sft_loss": 1.3709371089935303, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 5.6040914299629145, + "learning_rate": 2.231310898955361e-06, + "logits/chosen": -0.5037276744842529, + "logits/rejected": -0.4357093870639801, + "logps/chosen": -1.37501060962677, + "logps/rejected": -1.9487228393554688, + "loss": 1.0059, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.37501060962677, + "rewards/margins": 0.5737122893333435, + "rewards/rejected": -1.9487228393554688, + "sft_loss": 1.4598820209503174, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 10.060128681878934, + "learning_rate": 2.2272280516809262e-06, + "logits/chosen": -0.5884903073310852, + "logits/rejected": -0.45150431990623474, + "logps/chosen": -1.3184568881988525, + "logps/rejected": -1.9314219951629639, + "loss": 0.9558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3184568881988525, + "rewards/margins": 0.6129651069641113, + "rewards/rejected": -1.9314219951629639, + "sft_loss": 1.326110601425171, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 9.465708541220504, + "learning_rate": 2.2231381488367447e-06, + "logits/chosen": -0.46365708112716675, + "logits/rejected": -0.3804323077201843, + "logps/chosen": -1.2983791828155518, + "logps/rejected": -1.9503008127212524, + "loss": 0.9392, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2983791828155518, + "rewards/margins": 0.6519216299057007, + "rewards/rejected": -1.9503008127212524, + "sft_loss": 1.317442536354065, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 7.15563960123028, + "learning_rate": 2.2190412301030717e-06, + "logits/chosen": -0.5409069061279297, + "logits/rejected": -0.41943830251693726, + "logps/chosen": -1.2157642841339111, + "logps/rejected": -1.7413638830184937, + "loss": 0.9514, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2157642841339111, + "rewards/margins": 0.5255998373031616, + "rewards/rejected": -1.7413638830184937, + "sft_loss": 1.2732056379318237, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 9.345389829897462, + "learning_rate": 2.2149373352282307e-06, + "logits/chosen": -0.49041399359703064, + "logits/rejected": -0.34547704458236694, + "logps/chosen": -1.3812358379364014, + "logps/rejected": -2.0083115100860596, + "loss": 0.9721, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3812358379364014, + "rewards/margins": 0.6270755529403687, + "rewards/rejected": -2.0083115100860596, + "sft_loss": 1.380582571029663, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 4.908793417677732, + "learning_rate": 2.2108265040282275e-06, + "logits/chosen": -0.6118310689926147, + "logits/rejected": -0.5083206295967102, + "logps/chosen": -1.2254685163497925, + "logps/rejected": -1.7813608646392822, + "loss": 0.9601, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2254685163497925, + "rewards/margins": 0.555892288684845, + "rewards/rejected": -1.7813608646392822, + "sft_loss": 1.2873799800872803, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 12.854062830258666, + "learning_rate": 2.2067087763863644e-06, + "logits/chosen": -0.5858707427978516, + "logits/rejected": -0.5209950804710388, + "logps/chosen": -1.3518887758255005, + "logps/rejected": -1.9931846857070923, + "loss": 1.0183, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3518887758255005, + "rewards/margins": 0.6412959098815918, + "rewards/rejected": -1.9931846857070923, + "sft_loss": 1.4554845094680786, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 11.230806427426023, + "learning_rate": 2.202584192252854e-06, + "logits/chosen": -0.5063179731369019, + "logits/rejected": -0.41649264097213745, + "logps/chosen": -1.3351062536239624, + "logps/rejected": -1.8526620864868164, + "loss": 1.0234, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3351062536239624, + "rewards/margins": 0.517555832862854, + "rewards/rejected": -1.8526620864868164, + "sft_loss": 1.392976999282837, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 8.630130641238273, + "learning_rate": 2.1984527916444283e-06, + "logits/chosen": -0.5501508712768555, + "logits/rejected": -0.44187331199645996, + "logps/chosen": -1.4538100957870483, + "logps/rejected": -2.0695865154266357, + "loss": 1.0058, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4538100957870483, + "rewards/margins": 0.6157761812210083, + "rewards/rejected": -2.0695865154266357, + "sft_loss": 1.4128062725067139, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 8.889623842267012, + "learning_rate": 2.1943146146439557e-06, + "logits/chosen": -0.49043694138526917, + "logits/rejected": -0.28054124116897583, + "logps/chosen": -1.3581206798553467, + "logps/rejected": -2.006803274154663, + "loss": 0.9749, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3581206798553467, + "rewards/margins": 0.648682713508606, + "rewards/rejected": -2.006803274154663, + "sft_loss": 1.3612374067306519, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 8.460384516617, + "learning_rate": 2.190169701400046e-06, + "logits/chosen": -0.5204485654830933, + "logits/rejected": -0.38551202416419983, + "logps/chosen": -1.3627592325210571, + "logps/rejected": -2.024308681488037, + "loss": 0.9908, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3627592325210571, + "rewards/margins": 0.6615496277809143, + "rewards/rejected": -2.024308681488037, + "sft_loss": 1.4165655374526978, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 6.849685883510946, + "learning_rate": 2.186018092126666e-06, + "logits/chosen": -0.4214208722114563, + "logits/rejected": -0.4097623825073242, + "logps/chosen": -1.3190131187438965, + "logps/rejected": -1.9012644290924072, + "loss": 0.9591, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3190131187438965, + "rewards/margins": 0.582251250743866, + "rewards/rejected": -1.9012644290924072, + "sft_loss": 1.3551105260849, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 8.481789607899168, + "learning_rate": 2.181859827102748e-06, + "logits/chosen": -0.3990851044654846, + "logits/rejected": -0.3555835783481598, + "logps/chosen": -1.3547569513320923, + "logps/rejected": -2.081740140914917, + "loss": 0.9382, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3547569513320923, + "rewards/margins": 0.726983368396759, + "rewards/rejected": -2.081740140914917, + "sft_loss": 1.3477305173873901, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 9.195165027494054, + "learning_rate": 2.1776949466717967e-06, + "logits/chosen": -0.5580836534500122, + "logits/rejected": -0.4793972074985504, + "logps/chosen": -1.365241289138794, + "logps/rejected": -1.9775069952011108, + "loss": 0.9959, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.365241289138794, + "rewards/margins": 0.6122655868530273, + "rewards/rejected": -1.9775069952011108, + "sft_loss": 1.4184669256210327, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 8.394492326756822, + "learning_rate": 2.1735234912415007e-06, + "logits/chosen": -0.4259399473667145, + "logits/rejected": -0.3873990774154663, + "logps/chosen": -1.3863012790679932, + "logps/rejected": -1.9576469659805298, + "loss": 0.9876, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3863012790679932, + "rewards/margins": 0.5713458061218262, + "rewards/rejected": -1.9576469659805298, + "sft_loss": 1.404350996017456, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 7.4852815755175035, + "learning_rate": 2.1693455012833388e-06, + "logits/chosen": -0.5679572224617004, + "logits/rejected": -0.4183397889137268, + "logps/chosen": -1.3491175174713135, + "logps/rejected": -1.9551103115081787, + "loss": 0.9983, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3491175174713135, + "rewards/margins": 0.6059929132461548, + "rewards/rejected": -1.9551103115081787, + "sft_loss": 1.366546630859375, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 7.392603378684189, + "learning_rate": 2.1651610173321877e-06, + "logits/chosen": -0.4930770993232727, + "logits/rejected": -0.3636007010936737, + "logps/chosen": -1.330756425857544, + "logps/rejected": -1.9209697246551514, + "loss": 0.9733, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.330756425857544, + "rewards/margins": 0.5902132391929626, + "rewards/rejected": -1.9209697246551514, + "sft_loss": 1.3710582256317139, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 5.9058043996703145, + "learning_rate": 2.1609700799859287e-06, + "logits/chosen": -0.5141445994377136, + "logits/rejected": -0.4121854305267334, + "logps/chosen": -1.35433030128479, + "logps/rejected": -1.8730039596557617, + "loss": 1.0054, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.35433030128479, + "rewards/margins": 0.5186737179756165, + "rewards/rejected": -1.8730039596557617, + "sft_loss": 1.381361961364746, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 8.438160978332148, + "learning_rate": 2.1567727299050555e-06, + "logits/chosen": -0.497615247964859, + "logits/rejected": -0.3885241150856018, + "logps/chosen": -1.2426456212997437, + "logps/rejected": -2.0065460205078125, + "loss": 0.9398, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2426456212997437, + "rewards/margins": 0.7639003396034241, + "rewards/rejected": -2.0065460205078125, + "sft_loss": 1.3120468854904175, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 11.62262649985683, + "learning_rate": 2.152569007812276e-06, + "logits/chosen": -0.525715708732605, + "logits/rejected": -0.4441584646701813, + "logps/chosen": -1.3071136474609375, + "logps/rejected": -2.1241071224212646, + "loss": 0.9343, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3071136474609375, + "rewards/margins": 0.8169934153556824, + "rewards/rejected": -2.1241071224212646, + "sft_loss": 1.3898365497589111, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 5.788344945163352, + "learning_rate": 2.1483589544921202e-06, + "logits/chosen": -0.5040058493614197, + "logits/rejected": -0.42346611618995667, + "logps/chosen": -1.361579179763794, + "logps/rejected": -1.98703932762146, + "loss": 1.0044, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.361579179763794, + "rewards/margins": 0.6254600882530212, + "rewards/rejected": -1.98703932762146, + "sft_loss": 1.4275305271148682, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 7.765495051030483, + "learning_rate": 2.144142610790545e-06, + "logits/chosen": -0.4902319014072418, + "logits/rejected": -0.40892449021339417, + "logps/chosen": -1.3193453550338745, + "logps/rejected": -1.8786367177963257, + "loss": 0.9648, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3193453550338745, + "rewards/margins": 0.5592910647392273, + "rewards/rejected": -1.8786367177963257, + "sft_loss": 1.4025046825408936, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 9.18984071370844, + "learning_rate": 2.1399200176145344e-06, + "logits/chosen": -0.6545987129211426, + "logits/rejected": -0.5351656079292297, + "logps/chosen": -1.2366050481796265, + "logps/rejected": -1.7836157083511353, + "loss": 0.9548, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2366050481796265, + "rewards/margins": 0.5470104217529297, + "rewards/rejected": -1.7836157083511353, + "sft_loss": 1.2792375087738037, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 7.955905733638929, + "learning_rate": 2.1356912159317067e-06, + "logits/chosen": -0.6400793790817261, + "logits/rejected": -0.48976173996925354, + "logps/chosen": -1.4008651971817017, + "logps/rejected": -2.1842517852783203, + "loss": 0.9835, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4008651971817017, + "rewards/margins": 0.7833863496780396, + "rewards/rejected": -2.1842517852783203, + "sft_loss": 1.4612462520599365, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 6.902439419571325, + "learning_rate": 2.1314562467699133e-06, + "logits/chosen": -0.5332831740379333, + "logits/rejected": -0.4664790630340576, + "logps/chosen": -1.3477102518081665, + "logps/rejected": -1.8963968753814697, + "loss": 0.9786, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3477102518081665, + "rewards/margins": 0.5486865043640137, + "rewards/rejected": -1.8963968753814697, + "sft_loss": 1.3262543678283691, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 9.388764886978345, + "learning_rate": 2.1272151512168453e-06, + "logits/chosen": -0.5033223628997803, + "logits/rejected": -0.46903854608535767, + "logps/chosen": -1.2872415781021118, + "logps/rejected": -2.043466567993164, + "loss": 0.9396, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2872415781021118, + "rewards/margins": 0.7562249898910522, + "rewards/rejected": -2.043466567993164, + "sft_loss": 1.3627054691314697, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 6.369306347442532, + "learning_rate": 2.122967970419629e-06, + "logits/chosen": -0.6639467477798462, + "logits/rejected": -0.5871747732162476, + "logps/chosen": -1.262899398803711, + "logps/rejected": -1.8460830450057983, + "loss": 0.9408, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.262899398803711, + "rewards/margins": 0.5831834077835083, + "rewards/rejected": -1.8460830450057983, + "sft_loss": 1.3119322061538696, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 6.598712576180703, + "learning_rate": 2.118714745584431e-06, + "logits/chosen": -0.5417832732200623, + "logits/rejected": -0.46218061447143555, + "logps/chosen": -1.2741425037384033, + "logps/rejected": -1.827161431312561, + "loss": 0.9655, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2741425037384033, + "rewards/margins": 0.5530189275741577, + "rewards/rejected": -1.827161431312561, + "sft_loss": 1.3322670459747314, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": -0.32509803771972656, + "eval_logits/rejected": -0.2668491303920746, + "eval_logps/chosen": -1.4205245971679688, + "eval_logps/rejected": -1.9133427143096924, + "eval_loss": 1.0376254320144653, + "eval_rewards/accuracies": 0.6357566714286804, + "eval_rewards/chosen": -1.4205245971679688, + "eval_rewards/margins": 0.4928181767463684, + "eval_rewards/rejected": -1.9133427143096924, + "eval_runtime": 43.0353, + "eval_samples_per_second": 31.253, + "eval_sft_loss": 1.416913628578186, + "eval_steps_per_second": 7.831, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 5.32756149749768, + "learning_rate": 2.1144555179760582e-06, + "logits/chosen": -0.5373546481132507, + "logits/rejected": -0.42289772629737854, + "logps/chosen": -1.3479527235031128, + "logps/rejected": -2.0310306549072266, + "loss": 0.977, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3479527235031128, + "rewards/margins": 0.6830779314041138, + "rewards/rejected": -2.0310306549072266, + "sft_loss": 1.384868860244751, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 7.929697943613389, + "learning_rate": 2.110190328917555e-06, + "logits/chosen": -0.6266440153121948, + "logits/rejected": -0.46866053342819214, + "logps/chosen": -1.3129150867462158, + "logps/rejected": -1.685058832168579, + "loss": 1.0223, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3129150867462158, + "rewards/margins": 0.3721437156200409, + "rewards/rejected": -1.685058832168579, + "sft_loss": 1.3628969192504883, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 8.923537045074404, + "learning_rate": 2.1059192197898044e-06, + "logits/chosen": -0.4667263925075531, + "logits/rejected": -0.41617077589035034, + "logps/chosen": -1.2272765636444092, + "logps/rejected": -1.9868072271347046, + "loss": 0.909, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2272765636444092, + "rewards/margins": 0.7595307230949402, + "rewards/rejected": -1.9868072271347046, + "sft_loss": 1.2580385208129883, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 8.038623786392707, + "learning_rate": 2.1016422320311257e-06, + "logits/chosen": -0.563264787197113, + "logits/rejected": -0.4620184302330017, + "logps/chosen": -1.3580414056777954, + "logps/rejected": -1.9529234170913696, + "loss": 0.9653, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3580414056777954, + "rewards/margins": 0.5948818325996399, + "rewards/rejected": -1.9529234170913696, + "sft_loss": 1.433401107788086, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 6.234418417515403, + "learning_rate": 2.097359407136873e-06, + "logits/chosen": -0.44907650351524353, + "logits/rejected": -0.38264599442481995, + "logps/chosen": -1.258331537246704, + "logps/rejected": -1.6858599185943604, + "loss": 0.987, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.258331537246704, + "rewards/margins": 0.4275285601615906, + "rewards/rejected": -1.6858599185943604, + "sft_loss": 1.351828932762146, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 8.813954012728816, + "learning_rate": 2.093070786659033e-06, + "logits/chosen": -0.4727330207824707, + "logits/rejected": -0.4371541142463684, + "logps/chosen": -1.3942360877990723, + "logps/rejected": -1.9270381927490234, + "loss": 1.0123, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3942360877990723, + "rewards/margins": 0.5328022241592407, + "rewards/rejected": -1.9270381927490234, + "sft_loss": 1.434128999710083, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 5.8086980544229245, + "learning_rate": 2.0887764122058195e-06, + "logits/chosen": -0.45605263113975525, + "logits/rejected": -0.348542183637619, + "logps/chosen": -1.3246591091156006, + "logps/rejected": -1.7781784534454346, + "loss": 0.994, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3246591091156006, + "rewards/margins": 0.4535194933414459, + "rewards/rejected": -1.7781784534454346, + "sft_loss": 1.3365366458892822, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 10.34553529868521, + "learning_rate": 2.084476325441272e-06, + "logits/chosen": -0.5627564191818237, + "logits/rejected": -0.4698103368282318, + "logps/chosen": -1.291358232498169, + "logps/rejected": -1.9262027740478516, + "loss": 0.9394, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.291358232498169, + "rewards/margins": 0.6348446607589722, + "rewards/rejected": -1.9262027740478516, + "sft_loss": 1.2897425889968872, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 30.91851751488722, + "learning_rate": 2.0801705680848523e-06, + "logits/chosen": -0.5131195187568665, + "logits/rejected": -0.3856009542942047, + "logps/chosen": -1.389692783355713, + "logps/rejected": -1.9225718975067139, + "loss": 1.0114, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.389692783355713, + "rewards/margins": 0.5328791737556458, + "rewards/rejected": -1.9225718975067139, + "sft_loss": 1.35300612449646, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 9.622022723192302, + "learning_rate": 2.0758591819110364e-06, + "logits/chosen": -0.5288017988204956, + "logits/rejected": -0.40660151839256287, + "logps/chosen": -1.2950856685638428, + "logps/rejected": -1.991323709487915, + "loss": 0.949, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2950856685638428, + "rewards/margins": 0.6962381601333618, + "rewards/rejected": -1.991323709487915, + "sft_loss": 1.302475929260254, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 4.744900707874614, + "learning_rate": 2.071542208748912e-06, + "logits/chosen": -0.5513706207275391, + "logits/rejected": -0.3701619803905487, + "logps/chosen": -1.3338464498519897, + "logps/rejected": -1.896228551864624, + "loss": 0.9767, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3338464498519897, + "rewards/margins": 0.5623821020126343, + "rewards/rejected": -1.896228551864624, + "sft_loss": 1.3844144344329834, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 8.34519438573912, + "learning_rate": 2.0672196904817715e-06, + "logits/chosen": -0.5042263269424438, + "logits/rejected": -0.4224318563938141, + "logps/chosen": -1.3436133861541748, + "logps/rejected": -1.7886947393417358, + "loss": 1.0281, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3436133861541748, + "rewards/margins": 0.44508129358291626, + "rewards/rejected": -1.7886947393417358, + "sft_loss": 1.3645684719085693, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 5.267127509259156, + "learning_rate": 2.0628916690467066e-06, + "logits/chosen": -0.47091466188430786, + "logits/rejected": -0.422675222158432, + "logps/chosen": -1.286413311958313, + "logps/rejected": -1.9102262258529663, + "loss": 0.9609, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.286413311958313, + "rewards/margins": 0.6238128542900085, + "rewards/rejected": -1.9102262258529663, + "sft_loss": 1.3008480072021484, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 7.586235743737926, + "learning_rate": 2.0585581864341995e-06, + "logits/chosen": -0.6090031862258911, + "logits/rejected": -0.508503258228302, + "logps/chosen": -1.2876276969909668, + "logps/rejected": -1.6935985088348389, + "loss": 1.0107, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2876276969909668, + "rewards/margins": 0.405970960855484, + "rewards/rejected": -1.6935985088348389, + "sft_loss": 1.3544785976409912, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 6.45092653818614, + "learning_rate": 2.0542192846877177e-06, + "logits/chosen": -0.5176225304603577, + "logits/rejected": -0.47460445761680603, + "logps/chosen": -1.3008874654769897, + "logps/rejected": -1.8060070276260376, + "loss": 0.9677, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3008874654769897, + "rewards/margins": 0.5051193237304688, + "rewards/rejected": -1.8060070276260376, + "sft_loss": 1.3537688255310059, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 6.57852999670518, + "learning_rate": 2.049875005903305e-06, + "logits/chosen": -0.6487798690795898, + "logits/rejected": -0.5197803378105164, + "logps/chosen": -1.3330776691436768, + "logps/rejected": -2.0792503356933594, + "loss": 0.9503, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3330776691436768, + "rewards/margins": 0.7461727857589722, + "rewards/rejected": -2.0792503356933594, + "sft_loss": 1.449393391609192, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 10.072456903626007, + "learning_rate": 2.045525392229174e-06, + "logits/chosen": -0.48566898703575134, + "logits/rejected": -0.33911052346229553, + "logps/chosen": -1.39583158493042, + "logps/rejected": -2.1033527851104736, + "loss": 1.0366, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.39583158493042, + "rewards/margins": 0.7075213193893433, + "rewards/rejected": -2.1033527851104736, + "sft_loss": 1.4875379800796509, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 15.057610984252282, + "learning_rate": 2.0411704858652946e-06, + "logits/chosen": -0.542778491973877, + "logits/rejected": -0.5049811601638794, + "logps/chosen": -1.3699305057525635, + "logps/rejected": -2.0171689987182617, + "loss": 0.9714, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3699305057525635, + "rewards/margins": 0.6472384929656982, + "rewards/rejected": -2.0171689987182617, + "sft_loss": 1.43272864818573, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 5.7032019325446, + "learning_rate": 2.0368103290629877e-06, + "logits/chosen": -0.42945393919944763, + "logits/rejected": -0.4224371314048767, + "logps/chosen": -1.3021575212478638, + "logps/rejected": -1.8281514644622803, + "loss": 0.9871, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3021575212478638, + "rewards/margins": 0.5259938836097717, + "rewards/rejected": -1.8281514644622803, + "sft_loss": 1.344936490058899, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 7.021802073199058, + "learning_rate": 2.0324449641245145e-06, + "logits/chosen": -0.4047786295413971, + "logits/rejected": -0.2578263282775879, + "logps/chosen": -1.2514671087265015, + "logps/rejected": -1.685306191444397, + "loss": 0.9715, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2514671087265015, + "rewards/margins": 0.4338390827178955, + "rewards/rejected": -1.685306191444397, + "sft_loss": 1.324374794960022, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 6.0144852456574185, + "learning_rate": 2.028074433402664e-06, + "logits/chosen": -0.4007970690727234, + "logits/rejected": -0.2498869001865387, + "logps/chosen": -1.2454150915145874, + "logps/rejected": -1.7854740619659424, + "loss": 0.9722, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2454150915145874, + "rewards/margins": 0.5400589108467102, + "rewards/rejected": -1.7854740619659424, + "sft_loss": 1.2837181091308594, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 9.461615744398255, + "learning_rate": 2.023698779300344e-06, + "logits/chosen": -0.48871421813964844, + "logits/rejected": -0.38032227754592896, + "logps/chosen": -1.2504608631134033, + "logps/rejected": -1.7310020923614502, + "loss": 0.9611, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2504608631134033, + "rewards/margins": 0.48054131865501404, + "rewards/rejected": -1.7310020923614502, + "sft_loss": 1.3119781017303467, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 6.2563350888606815, + "learning_rate": 2.019318044270171e-06, + "logits/chosen": -0.42136192321777344, + "logits/rejected": -0.3354993462562561, + "logps/chosen": -1.3155518770217896, + "logps/rejected": -1.7504370212554932, + "loss": 1.0227, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3155518770217896, + "rewards/margins": 0.43488508462905884, + "rewards/rejected": -1.7504370212554932, + "sft_loss": 1.3969285488128662, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 6.955482075506187, + "learning_rate": 2.0149322708140545e-06, + "logits/chosen": -0.5253806710243225, + "logits/rejected": -0.4579823613166809, + "logps/chosen": -1.3426711559295654, + "logps/rejected": -1.7703368663787842, + "loss": 0.9858, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3426711559295654, + "rewards/margins": 0.42766571044921875, + "rewards/rejected": -1.7703368663787842, + "sft_loss": 1.3179153203964233, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 8.697109095245661, + "learning_rate": 2.0105415014827886e-06, + "logits/chosen": -0.5423754453659058, + "logits/rejected": -0.4885048270225525, + "logps/chosen": -1.3919349908828735, + "logps/rejected": -2.001356601715088, + "loss": 1.0147, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3919349908828735, + "rewards/margins": 0.6094216704368591, + "rewards/rejected": -2.001356601715088, + "sft_loss": 1.4702781438827515, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 6.615814137244643, + "learning_rate": 2.006145778875636e-06, + "logits/chosen": -0.5715299844741821, + "logits/rejected": -0.534439206123352, + "logps/chosen": -1.3156145811080933, + "logps/rejected": -1.8228168487548828, + "loss": 1.0074, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3156145811080933, + "rewards/margins": 0.5072023272514343, + "rewards/rejected": -1.8228168487548828, + "sft_loss": 1.3530737161636353, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 6.026078647434834, + "learning_rate": 2.0017451456399165e-06, + "logits/chosen": -0.587544322013855, + "logits/rejected": -0.4726153314113617, + "logps/chosen": -1.3580278158187866, + "logps/rejected": -1.9934993982315063, + "loss": 0.9682, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3580278158187866, + "rewards/margins": 0.6354714035987854, + "rewards/rejected": -1.9934993982315063, + "sft_loss": 1.3702924251556396, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 7.655586715147393, + "learning_rate": 1.9973396444705934e-06, + "logits/chosen": -0.5170526504516602, + "logits/rejected": -0.38636916875839233, + "logps/chosen": -1.4224025011062622, + "logps/rejected": -1.9178695678710938, + "loss": 1.0333, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.4224025011062622, + "rewards/margins": 0.4954671859741211, + "rewards/rejected": -1.9178695678710938, + "sft_loss": 1.4758238792419434, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 9.384836133788575, + "learning_rate": 1.9929293181098588e-06, + "logits/chosen": -0.48921626806259155, + "logits/rejected": -0.35572922229766846, + "logps/chosen": -1.3469831943511963, + "logps/rejected": -2.0367984771728516, + "loss": 0.9761, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3469831943511963, + "rewards/margins": 0.6898151636123657, + "rewards/rejected": -2.0367984771728516, + "sft_loss": 1.4044857025146484, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 6.119379297533443, + "learning_rate": 1.988514209346718e-06, + "logits/chosen": -0.5070446729660034, + "logits/rejected": -0.3788461983203888, + "logps/chosen": -1.3717761039733887, + "logps/rejected": -1.871145248413086, + "loss": 1.0153, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3717761039733887, + "rewards/margins": 0.4993690848350525, + "rewards/rejected": -1.871145248413086, + "sft_loss": 1.4144275188446045, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 15.852340801316688, + "learning_rate": 1.984094361016575e-06, + "logits/chosen": -0.4378221929073334, + "logits/rejected": -0.36867469549179077, + "logps/chosen": -1.2725999355316162, + "logps/rejected": -1.9760814905166626, + "loss": 0.9695, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2725999355316162, + "rewards/margins": 0.7034815549850464, + "rewards/rejected": -1.9760814905166626, + "sft_loss": 1.3369901180267334, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 10.310229691402508, + "learning_rate": 1.9796698160008187e-06, + "logits/chosen": -0.41603922843933105, + "logits/rejected": -0.30848008394241333, + "logps/chosen": -1.322511076927185, + "logps/rejected": -1.90826416015625, + "loss": 0.9483, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.322511076927185, + "rewards/margins": 0.5857528448104858, + "rewards/rejected": -1.90826416015625, + "sft_loss": 1.3534080982208252, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 7.437344286420298, + "learning_rate": 1.975240617226404e-06, + "logits/chosen": -0.3992313742637634, + "logits/rejected": -0.28970590233802795, + "logps/chosen": -1.2926113605499268, + "logps/rejected": -1.9368501901626587, + "loss": 0.958, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2926113605499268, + "rewards/margins": 0.6442388296127319, + "rewards/rejected": -1.9368501901626587, + "sft_loss": 1.3517730236053467, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 5.739484924634926, + "learning_rate": 1.9708068076654364e-06, + "logits/chosen": -0.3324227035045624, + "logits/rejected": -0.27540525794029236, + "logps/chosen": -1.2775561809539795, + "logps/rejected": -1.87930166721344, + "loss": 0.9508, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2775561809539795, + "rewards/margins": 0.6017455458641052, + "rewards/rejected": -1.87930166721344, + "sft_loss": 1.326965093612671, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 8.181676957715014, + "learning_rate": 1.966368430334756e-06, + "logits/chosen": -0.4677346348762512, + "logits/rejected": -0.3302808701992035, + "logps/chosen": -1.2933305501937866, + "logps/rejected": -1.8723223209381104, + "loss": 0.9541, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2933305501937866, + "rewards/margins": 0.5789917707443237, + "rewards/rejected": -1.8723223209381104, + "sft_loss": 1.3387445211410522, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 10.306750512008252, + "learning_rate": 1.961925528295519e-06, + "logits/chosen": -0.41305112838745117, + "logits/rejected": -0.34868156909942627, + "logps/chosen": -1.3654184341430664, + "logps/rejected": -1.779240608215332, + "loss": 1.0189, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3654184341430664, + "rewards/margins": 0.4138219356536865, + "rewards/rejected": -1.779240608215332, + "sft_loss": 1.4515666961669922, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 8.789090186645119, + "learning_rate": 1.9574781446527806e-06, + "logits/chosen": -0.3169155716896057, + "logits/rejected": -0.17200681567192078, + "logps/chosen": -1.277912974357605, + "logps/rejected": -1.8929014205932617, + "loss": 0.9165, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.277912974357605, + "rewards/margins": 0.6149882078170776, + "rewards/rejected": -1.8929014205932617, + "sft_loss": 1.3085170984268188, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 12.34770292195211, + "learning_rate": 1.9530263225550765e-06, + "logits/chosen": -0.43816858530044556, + "logits/rejected": -0.31687816977500916, + "logps/chosen": -1.2946866750717163, + "logps/rejected": -1.8282054662704468, + "loss": 0.9975, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2946866750717163, + "rewards/margins": 0.5335186719894409, + "rewards/rejected": -1.8282054662704468, + "sft_loss": 1.4057872295379639, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 8.534590340266245, + "learning_rate": 1.9485701051940037e-06, + "logits/chosen": -0.4214317202568054, + "logits/rejected": -0.3847965598106384, + "logps/chosen": -1.329071283340454, + "logps/rejected": -1.8042049407958984, + "loss": 0.9922, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.329071283340454, + "rewards/margins": 0.4751337468624115, + "rewards/rejected": -1.8042049407958984, + "sft_loss": 1.3613306283950806, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 8.168860666279095, + "learning_rate": 1.9441095358038035e-06, + "logits/chosen": -0.3514612019062042, + "logits/rejected": -0.2528729736804962, + "logps/chosen": -1.3265708684921265, + "logps/rejected": -1.7354872226715088, + "loss": 1.0077, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3265708684921265, + "rewards/margins": 0.40891632437705994, + "rewards/rejected": -1.7354872226715088, + "sft_loss": 1.3667696714401245, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 12.852456295482222, + "learning_rate": 1.9396446576609387e-06, + "logits/chosen": -0.36365336179733276, + "logits/rejected": -0.33231550455093384, + "logps/chosen": -1.2981659173965454, + "logps/rejected": -1.7442443370819092, + "loss": 0.9804, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2981659173965454, + "rewards/margins": 0.4460783898830414, + "rewards/rejected": -1.7442443370819092, + "sft_loss": 1.3654030561447144, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 9.120917157459237, + "learning_rate": 1.935175514083677e-06, + "logits/chosen": -0.3523769676685333, + "logits/rejected": -0.3050524890422821, + "logps/chosen": -1.3515031337738037, + "logps/rejected": -1.8546243906021118, + "loss": 1.0265, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.3515031337738037, + "rewards/margins": 0.503121018409729, + "rewards/rejected": -1.8546243906021118, + "sft_loss": 1.399251937866211, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 13.956277485278768, + "learning_rate": 1.9307021484316693e-06, + "logits/chosen": -0.43114447593688965, + "logits/rejected": -0.3138170838356018, + "logps/chosen": -1.2429790496826172, + "logps/rejected": -1.8580067157745361, + "loss": 0.9582, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2429790496826172, + "rewards/margins": 0.615027666091919, + "rewards/rejected": -1.8580067157745361, + "sft_loss": 1.309478998184204, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 6.04682907749214, + "learning_rate": 1.926224604105529e-06, + "logits/chosen": -0.46197718381881714, + "logits/rejected": -0.48134127259254456, + "logps/chosen": -1.3629053831100464, + "logps/rejected": -1.7455739974975586, + "loss": 1.0485, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.3629053831100464, + "rewards/margins": 0.38266849517822266, + "rewards/rejected": -1.7455739974975586, + "sft_loss": 1.4094994068145752, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 15.757296282890119, + "learning_rate": 1.92174292454641e-06, + "logits/chosen": -0.4499499201774597, + "logits/rejected": -0.3384786546230316, + "logps/chosen": -1.3081551790237427, + "logps/rejected": -1.918835997581482, + "loss": 0.9591, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3081551790237427, + "rewards/margins": 0.6106808185577393, + "rewards/rejected": -1.918835997581482, + "sft_loss": 1.3155691623687744, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 6.048744754749274, + "learning_rate": 1.917257153235587e-06, + "logits/chosen": -0.6204741597175598, + "logits/rejected": -0.45593032240867615, + "logps/chosen": -1.3463962078094482, + "logps/rejected": -1.8253018856048584, + "loss": 1.0093, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3463962078094482, + "rewards/margins": 0.4789056181907654, + "rewards/rejected": -1.8253018856048584, + "sft_loss": 1.3738082647323608, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 9.04564590360611, + "learning_rate": 1.9127673336940335e-06, + "logits/chosen": -0.48332133889198303, + "logits/rejected": -0.42709770798683167, + "logps/chosen": -1.305422067642212, + "logps/rejected": -1.817214012145996, + "loss": 0.995, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.305422067642212, + "rewards/margins": 0.5117920637130737, + "rewards/rejected": -1.817214012145996, + "sft_loss": 1.3502801656723022, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 5.615734265148532, + "learning_rate": 1.908273509481998e-06, + "logits/chosen": -0.4106292724609375, + "logits/rejected": -0.3586362898349762, + "logps/chosen": -1.3568456172943115, + "logps/rejected": -1.8621975183486938, + "loss": 0.9954, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3568456172943115, + "rewards/margins": 0.5053519010543823, + "rewards/rejected": -1.8621975183486938, + "sft_loss": 1.3644628524780273, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 7.975761092606305, + "learning_rate": 1.9037757241985832e-06, + "logits/chosen": -0.44296973943710327, + "logits/rejected": -0.38010460138320923, + "logps/chosen": -1.293700098991394, + "logps/rejected": -1.8513076305389404, + "loss": 0.9578, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.293700098991394, + "rewards/margins": 0.5576077699661255, + "rewards/rejected": -1.8513076305389404, + "sft_loss": 1.314415693283081, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 9.470303618697493, + "learning_rate": 1.899274021481321e-06, + "logits/chosen": -0.5351869463920593, + "logits/rejected": -0.39628365635871887, + "logps/chosen": -1.3325697183609009, + "logps/rejected": -2.1645429134368896, + "loss": 0.9519, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3325697183609009, + "rewards/margins": 0.8319734334945679, + "rewards/rejected": -2.1645429134368896, + "sft_loss": 1.3672149181365967, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 8.118243747697562, + "learning_rate": 1.8947684450057516e-06, + "logits/chosen": -0.4836540222167969, + "logits/rejected": -0.3647536039352417, + "logps/chosen": -1.2214539051055908, + "logps/rejected": -1.8250150680541992, + "loss": 0.9029, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2214539051055908, + "rewards/margins": 0.6035611629486084, + "rewards/rejected": -1.8250150680541992, + "sft_loss": 1.2677587270736694, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 8.409738270885674, + "learning_rate": 1.890259038484997e-06, + "logits/chosen": -0.4491191804409027, + "logits/rejected": -0.431225061416626, + "logps/chosen": -1.2447589635849, + "logps/rejected": -1.9232664108276367, + "loss": 0.927, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2447589635849, + "rewards/margins": 0.678507387638092, + "rewards/rejected": -1.9232664108276367, + "sft_loss": 1.2517122030258179, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 6.0457132942542335, + "learning_rate": 1.8857458456693398e-06, + "logits/chosen": -0.5236254334449768, + "logits/rejected": -0.4272289276123047, + "logps/chosen": -1.3596875667572021, + "logps/rejected": -1.9714336395263672, + "loss": 0.9883, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3596875667572021, + "rewards/margins": 0.6117460131645203, + "rewards/rejected": -1.9714336395263672, + "sft_loss": 1.4770348072052002, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 11.462157523408354, + "learning_rate": 1.881228910345796e-06, + "logits/chosen": -0.4460994601249695, + "logits/rejected": -0.3811623454093933, + "logps/chosen": -1.4243602752685547, + "logps/rejected": -1.946854591369629, + "loss": 1.0049, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4243602752685547, + "rewards/margins": 0.5224944949150085, + "rewards/rejected": -1.946854591369629, + "sft_loss": 1.4408633708953857, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 12.819377325527697, + "learning_rate": 1.8767082763376916e-06, + "logits/chosen": -0.49858832359313965, + "logits/rejected": -0.377076655626297, + "logps/chosen": -1.3824807405471802, + "logps/rejected": -1.892433762550354, + "loss": 0.9976, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3824807405471802, + "rewards/margins": 0.509952962398529, + "rewards/rejected": -1.892433762550354, + "sft_loss": 1.3168888092041016, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 9.435587493834031, + "learning_rate": 1.8721839875042386e-06, + "logits/chosen": -0.5917202830314636, + "logits/rejected": -0.4736374318599701, + "logps/chosen": -1.347184419631958, + "logps/rejected": -1.8652584552764893, + "loss": 1.002, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.347184419631958, + "rewards/margins": 0.5180739164352417, + "rewards/rejected": -1.8652584552764893, + "sft_loss": 1.4115911722183228, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 7.706508747987979, + "learning_rate": 1.8676560877401062e-06, + "logits/chosen": -0.5961927175521851, + "logits/rejected": -0.465129554271698, + "logps/chosen": -1.323976755142212, + "logps/rejected": -1.871092438697815, + "loss": 0.9549, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.323976755142212, + "rewards/margins": 0.5471157431602478, + "rewards/rejected": -1.871092438697815, + "sft_loss": 1.3681257963180542, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 63.92300829705635, + "learning_rate": 1.8631246209749982e-06, + "logits/chosen": -0.7190247774124146, + "logits/rejected": -0.5581813454627991, + "logps/chosen": -1.3239647150039673, + "logps/rejected": -2.063500165939331, + "loss": 0.9355, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3239647150039673, + "rewards/margins": 0.7395354509353638, + "rewards/rejected": -2.063500165939331, + "sft_loss": 1.3667714595794678, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 7.07593824509397, + "learning_rate": 1.8585896311732247e-06, + "logits/chosen": -0.5935165882110596, + "logits/rejected": -0.5888763070106506, + "logps/chosen": -1.3361377716064453, + "logps/rejected": -1.9687163829803467, + "loss": 0.9731, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3361377716064453, + "rewards/margins": 0.6325784921646118, + "rewards/rejected": -1.9687163829803467, + "sft_loss": 1.3577778339385986, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 8.352588214209106, + "learning_rate": 1.854051162333277e-06, + "logits/chosen": -0.5560085773468018, + "logits/rejected": -0.4076048731803894, + "logps/chosen": -1.3173125982284546, + "logps/rejected": -1.8631242513656616, + "loss": 0.9914, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3173125982284546, + "rewards/margins": 0.5458115935325623, + "rewards/rejected": -1.8631242513656616, + "sft_loss": 1.4011826515197754, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 6.70826330315372, + "learning_rate": 1.8495092584873992e-06, + "logits/chosen": -0.571670413017273, + "logits/rejected": -0.4206954538822174, + "logps/chosen": -1.2080904245376587, + "logps/rejected": -1.8981767892837524, + "loss": 0.8717, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2080904245376587, + "rewards/margins": 0.6900863647460938, + "rewards/rejected": -1.8981767892837524, + "sft_loss": 1.2135100364685059, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 6.540141215860256, + "learning_rate": 1.844963963701163e-06, + "logits/chosen": -0.49980098009109497, + "logits/rejected": -0.4846612811088562, + "logps/chosen": -1.3230160474777222, + "logps/rejected": -1.87786066532135, + "loss": 0.9543, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3230160474777222, + "rewards/margins": 0.5548445582389832, + "rewards/rejected": -1.87786066532135, + "sft_loss": 1.3326829671859741, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 8.881381528164832, + "learning_rate": 1.8404153220730383e-06, + "logits/chosen": -0.6166124939918518, + "logits/rejected": -0.5608782172203064, + "logps/chosen": -1.2661406993865967, + "logps/rejected": -1.8450348377227783, + "loss": 0.9832, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2661406993865967, + "rewards/margins": 0.5788939595222473, + "rewards/rejected": -1.8450348377227783, + "sft_loss": 1.3628873825073242, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 6.028525689985978, + "learning_rate": 1.8358633777339654e-06, + "logits/chosen": -0.5961281061172485, + "logits/rejected": -0.5205580592155457, + "logps/chosen": -1.3284294605255127, + "logps/rejected": -1.8279927968978882, + "loss": 0.9702, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3284294605255127, + "rewards/margins": 0.4995633065700531, + "rewards/rejected": -1.8279927968978882, + "sft_loss": 1.330628752708435, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 10.231848785799206, + "learning_rate": 1.831308174846929e-06, + "logits/chosen": -0.5107966065406799, + "logits/rejected": -0.42333516478538513, + "logps/chosen": -1.3227458000183105, + "logps/rejected": -1.9830515384674072, + "loss": 0.9385, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3227458000183105, + "rewards/margins": 0.6603055000305176, + "rewards/rejected": -1.9830515384674072, + "sft_loss": 1.3296210765838623, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 8.00344492894106, + "learning_rate": 1.826749757606527e-06, + "logits/chosen": -0.5675755739212036, + "logits/rejected": -0.4411475658416748, + "logps/chosen": -1.3341108560562134, + "logps/rejected": -2.089054822921753, + "loss": 0.9605, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3341108560562134, + "rewards/margins": 0.7549439668655396, + "rewards/rejected": -2.089054822921753, + "sft_loss": 1.3772116899490356, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 6.2724946757219175, + "learning_rate": 1.8221881702385435e-06, + "logits/chosen": -0.5259631872177124, + "logits/rejected": -0.35373836755752563, + "logps/chosen": -1.2499545812606812, + "logps/rejected": -2.0189995765686035, + "loss": 0.9046, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2499545812606812, + "rewards/margins": 0.7690447568893433, + "rewards/rejected": -2.0189995765686035, + "sft_loss": 1.3529435396194458, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 11.644048954737464, + "learning_rate": 1.8176234569995196e-06, + "logits/chosen": -0.5477747917175293, + "logits/rejected": -0.47055092453956604, + "logps/chosen": -1.346920371055603, + "logps/rejected": -2.2004103660583496, + "loss": 0.9552, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.346920371055603, + "rewards/margins": 0.8534899950027466, + "rewards/rejected": -2.2004103660583496, + "sft_loss": 1.3843533992767334, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 7.688148681172665, + "learning_rate": 1.8130556621763223e-06, + "logits/chosen": -0.525569498538971, + "logits/rejected": -0.44252434372901917, + "logps/chosen": -1.305481195449829, + "logps/rejected": -1.9038594961166382, + "loss": 0.9807, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.305481195449829, + "rewards/margins": 0.5983783006668091, + "rewards/rejected": -1.9038594961166382, + "sft_loss": 1.3511950969696045, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 7.370258227670556, + "learning_rate": 1.808484830085718e-06, + "logits/chosen": -0.514094889163971, + "logits/rejected": -0.4376614987850189, + "logps/chosen": -1.4168999195098877, + "logps/rejected": -2.2200722694396973, + "loss": 0.9447, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4168999195098877, + "rewards/margins": 0.8031722903251648, + "rewards/rejected": -2.2200722694396973, + "sft_loss": 1.4548296928405762, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 11.663556866456318, + "learning_rate": 1.8039110050739394e-06, + "logits/chosen": -0.46310463547706604, + "logits/rejected": -0.35408297181129456, + "logps/chosen": -1.3439658880233765, + "logps/rejected": -2.0157570838928223, + "loss": 0.9598, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3439658880233765, + "rewards/margins": 0.6717912554740906, + "rewards/rejected": -2.0157570838928223, + "sft_loss": 1.4041773080825806, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 7.113265265380697, + "learning_rate": 1.7993342315162563e-06, + "logits/chosen": -0.534980297088623, + "logits/rejected": -0.3727056384086609, + "logps/chosen": -1.3619086742401123, + "logps/rejected": -2.215651035308838, + "loss": 0.9169, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3619086742401123, + "rewards/margins": 0.8537423014640808, + "rewards/rejected": -2.215651035308838, + "sft_loss": 1.3776787519454956, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 8.036413528086007, + "learning_rate": 1.794754553816546e-06, + "logits/chosen": -0.43422263860702515, + "logits/rejected": -0.3120557367801666, + "logps/chosen": -1.3172380924224854, + "logps/rejected": -1.9721095561981201, + "loss": 0.9269, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3172380924224854, + "rewards/margins": 0.6548714637756348, + "rewards/rejected": -1.9721095561981201, + "sft_loss": 1.378082036972046, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 7.781275539577071, + "learning_rate": 1.7901720164068623e-06, + "logits/chosen": -0.5075265169143677, + "logits/rejected": -0.4491181969642639, + "logps/chosen": -1.2856837511062622, + "logps/rejected": -1.723960518836975, + "loss": 1.0079, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2856837511062622, + "rewards/margins": 0.43827691674232483, + "rewards/rejected": -1.723960518836975, + "sft_loss": 1.348190188407898, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 13.606802860849392, + "learning_rate": 1.7855866637470027e-06, + "logits/chosen": -0.4231862425804138, + "logits/rejected": -0.39519035816192627, + "logps/chosen": -1.2925583124160767, + "logps/rejected": -1.953325629234314, + "loss": 0.963, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2925583124160767, + "rewards/margins": 0.6607673764228821, + "rewards/rejected": -1.953325629234314, + "sft_loss": 1.3326704502105713, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 7.471840777920316, + "learning_rate": 1.780998540324079e-06, + "logits/chosen": -0.41194334626197815, + "logits/rejected": -0.31532105803489685, + "logps/chosen": -1.4386813640594482, + "logps/rejected": -1.9639618396759033, + "loss": 1.0387, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4386813640594482, + "rewards/margins": 0.5252804756164551, + "rewards/rejected": -1.9639618396759033, + "sft_loss": 1.4364547729492188, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 8.230403778332743, + "learning_rate": 1.776407690652084e-06, + "logits/chosen": -0.4215714931488037, + "logits/rejected": -0.2935768961906433, + "logps/chosen": -1.4041473865509033, + "logps/rejected": -2.048205852508545, + "loss": 1.0061, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4041473865509033, + "rewards/margins": 0.6440584063529968, + "rewards/rejected": -2.048205852508545, + "sft_loss": 1.4206860065460205, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 9.35742961252276, + "learning_rate": 1.7718141592714628e-06, + "logits/chosen": -0.33215445280075073, + "logits/rejected": -0.3602014482021332, + "logps/chosen": -1.2948368787765503, + "logps/rejected": -1.8967326879501343, + "loss": 1.0093, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2948368787765503, + "rewards/margins": 0.6018956899642944, + "rewards/rejected": -1.8967326879501343, + "sft_loss": 1.389012098312378, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 6.029347217384856, + "learning_rate": 1.7672179907486757e-06, + "logits/chosen": -0.24997854232788086, + "logits/rejected": -0.2451924830675125, + "logps/chosen": -1.2664806842803955, + "logps/rejected": -1.734628438949585, + "loss": 1.0014, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2664806842803955, + "rewards/margins": 0.4681479036808014, + "rewards/rejected": -1.734628438949585, + "sft_loss": 1.297235369682312, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 7.690103127341859, + "learning_rate": 1.7626192296757708e-06, + "logits/chosen": -0.4005914628505707, + "logits/rejected": -0.3310944139957428, + "logps/chosen": -1.3489172458648682, + "logps/rejected": -1.814736008644104, + "loss": 1.0333, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3489172458648682, + "rewards/margins": 0.46581873297691345, + "rewards/rejected": -1.814736008644104, + "sft_loss": 1.426548719406128, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": -0.08405511826276779, + "eval_logits/rejected": -0.004597527906298637, + "eval_logps/chosen": -1.3793412446975708, + "eval_logps/rejected": -1.7730538845062256, + "eval_loss": 1.04581880569458, + "eval_rewards/accuracies": 0.6127596497535706, + "eval_rewards/chosen": -1.3793412446975708, + "eval_rewards/margins": 0.3937126696109772, + "eval_rewards/rejected": -1.7730538845062256, + "eval_runtime": 42.9279, + "eval_samples_per_second": 31.332, + "eval_sft_loss": 1.3972686529159546, + "eval_steps_per_second": 7.85, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 5.2232884918186375, + "learning_rate": 1.7580179206699475e-06, + "logits/chosen": -0.4917038381099701, + "logits/rejected": -0.3383350968360901, + "logps/chosen": -1.1643739938735962, + "logps/rejected": -1.6991310119628906, + "loss": 0.932, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1643739938735962, + "rewards/margins": 0.5347572565078735, + "rewards/rejected": -1.6991310119628906, + "sft_loss": 1.247200846672058, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 9.802894598065583, + "learning_rate": 1.7534141083731262e-06, + "logits/chosen": -0.3784298896789551, + "logits/rejected": -0.3399508595466614, + "logps/chosen": -1.3304203748703003, + "logps/rejected": -1.8287245035171509, + "loss": 0.9993, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3304203748703003, + "rewards/margins": 0.49830397963523865, + "rewards/rejected": -1.8287245035171509, + "sft_loss": 1.4084551334381104, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 7.710213953120398, + "learning_rate": 1.7488078374515143e-06, + "logits/chosen": -0.36453741788864136, + "logits/rejected": -0.25610360503196716, + "logps/chosen": -1.3093892335891724, + "logps/rejected": -1.9001718759536743, + "loss": 0.9638, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3093892335891724, + "rewards/margins": 0.5907825231552124, + "rewards/rejected": -1.9001718759536743, + "sft_loss": 1.3398463726043701, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 6.664089191337728, + "learning_rate": 1.7441991525951722e-06, + "logits/chosen": -0.4268978238105774, + "logits/rejected": -0.2702500820159912, + "logps/chosen": -1.282658576965332, + "logps/rejected": -1.73626708984375, + "loss": 0.9957, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.282658576965332, + "rewards/margins": 0.45360851287841797, + "rewards/rejected": -1.73626708984375, + "sft_loss": 1.3267815113067627, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 10.434741433534874, + "learning_rate": 1.7395880985175808e-06, + "logits/chosen": -0.5320969223976135, + "logits/rejected": -0.37741774320602417, + "logps/chosen": -1.3708407878875732, + "logps/rejected": -2.0818610191345215, + "loss": 0.952, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3708407878875732, + "rewards/margins": 0.7110201716423035, + "rewards/rejected": -2.0818610191345215, + "sft_loss": 1.3801392316818237, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 7.419515705303618, + "learning_rate": 1.7349747199552063e-06, + "logits/chosen": -0.47449636459350586, + "logits/rejected": -0.35492879152297974, + "logps/chosen": -1.3637349605560303, + "logps/rejected": -1.8692903518676758, + "loss": 1.0242, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3637349605560303, + "rewards/margins": 0.5055556297302246, + "rewards/rejected": -1.8692903518676758, + "sft_loss": 1.4562511444091797, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 10.065096311367334, + "learning_rate": 1.7303590616670683e-06, + "logits/chosen": -0.4715927243232727, + "logits/rejected": -0.3220441937446594, + "logps/chosen": -1.3339498043060303, + "logps/rejected": -1.9941116571426392, + "loss": 0.9525, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3339498043060303, + "rewards/margins": 0.6601617336273193, + "rewards/rejected": -1.9941116571426392, + "sft_loss": 1.3572701215744019, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 7.483523830892962, + "learning_rate": 1.7257411684343042e-06, + "logits/chosen": -0.44677048921585083, + "logits/rejected": -0.3579447865486145, + "logps/chosen": -1.330339789390564, + "logps/rejected": -1.7873833179473877, + "loss": 1.0151, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.330339789390564, + "rewards/margins": 0.45704346895217896, + "rewards/rejected": -1.7873833179473877, + "sft_loss": 1.3790075778961182, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 10.065579953099064, + "learning_rate": 1.7211210850597333e-06, + "logits/chosen": -0.42220425605773926, + "logits/rejected": -0.34389373660087585, + "logps/chosen": -1.3688390254974365, + "logps/rejected": -2.004911422729492, + "loss": 0.9812, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3688390254974365, + "rewards/margins": 0.6360724568367004, + "rewards/rejected": -2.004911422729492, + "sft_loss": 1.3310651779174805, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 7.369649452814721, + "learning_rate": 1.7164988563674256e-06, + "logits/chosen": -0.4650971293449402, + "logits/rejected": -0.38579609990119934, + "logps/chosen": -1.3725297451019287, + "logps/rejected": -2.0686371326446533, + "loss": 1.0069, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3725297451019287, + "rewards/margins": 0.6961073279380798, + "rewards/rejected": -2.0686371326446533, + "sft_loss": 1.3935930728912354, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 6.300272482544147, + "learning_rate": 1.7118745272022635e-06, + "logits/chosen": -0.48021286725997925, + "logits/rejected": -0.32814162969589233, + "logps/chosen": -1.402618169784546, + "logps/rejected": -1.969491958618164, + "loss": 0.9914, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.402618169784546, + "rewards/margins": 0.5668739080429077, + "rewards/rejected": -1.969491958618164, + "sft_loss": 1.4442012310028076, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 8.625096368723874, + "learning_rate": 1.7072481424295097e-06, + "logits/chosen": -0.5285122990608215, + "logits/rejected": -0.36223360896110535, + "logps/chosen": -1.307751178741455, + "logps/rejected": -1.7706029415130615, + "loss": 0.9759, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.307751178741455, + "rewards/margins": 0.4628518521785736, + "rewards/rejected": -1.7706029415130615, + "sft_loss": 1.3280938863754272, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 5.997814680553211, + "learning_rate": 1.702619746934369e-06, + "logits/chosen": -0.5822547674179077, + "logits/rejected": -0.4525377154350281, + "logps/chosen": -1.3115359544754028, + "logps/rejected": -1.9264564514160156, + "loss": 0.957, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3115359544754028, + "rewards/margins": 0.6149204969406128, + "rewards/rejected": -1.9264564514160156, + "sft_loss": 1.3683974742889404, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 9.018405181315453, + "learning_rate": 1.6979893856215547e-06, + "logits/chosen": -0.5128965973854065, + "logits/rejected": -0.4116179943084717, + "logps/chosen": -1.3670125007629395, + "logps/rejected": -1.7760088443756104, + "loss": 1.0132, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3670125007629395, + "rewards/margins": 0.4089964032173157, + "rewards/rejected": -1.7760088443756104, + "sft_loss": 1.3387202024459839, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 10.277223558286527, + "learning_rate": 1.6933571034148531e-06, + "logits/chosen": -0.48255014419555664, + "logits/rejected": -0.40862828493118286, + "logps/chosen": -1.4015209674835205, + "logps/rejected": -1.928873062133789, + "loss": 0.9631, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4015209674835205, + "rewards/margins": 0.5273522138595581, + "rewards/rejected": -1.928873062133789, + "sft_loss": 1.3639947175979614, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 10.625700942112903, + "learning_rate": 1.6887229452566859e-06, + "logits/chosen": -0.42231544852256775, + "logits/rejected": -0.31882724165916443, + "logps/chosen": -1.3298161029815674, + "logps/rejected": -2.002760410308838, + "loss": 0.9609, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3298161029815674, + "rewards/margins": 0.6729440689086914, + "rewards/rejected": -2.002760410308838, + "sft_loss": 1.3508632183074951, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 9.865396438185735, + "learning_rate": 1.6840869561076761e-06, + "logits/chosen": -0.5014291405677795, + "logits/rejected": -0.3962041735649109, + "logps/chosen": -1.3711020946502686, + "logps/rejected": -1.9670244455337524, + "loss": 1.0002, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3711020946502686, + "rewards/margins": 0.5959222912788391, + "rewards/rejected": -1.9670244455337524, + "sft_loss": 1.4149680137634277, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 6.339272952448974, + "learning_rate": 1.6794491809462108e-06, + "logits/chosen": -0.5733720660209656, + "logits/rejected": -0.4118824899196625, + "logps/chosen": -1.3491941690444946, + "logps/rejected": -1.9526889324188232, + "loss": 0.9637, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3491941690444946, + "rewards/margins": 0.6034947037696838, + "rewards/rejected": -1.9526889324188232, + "sft_loss": 1.367173194885254, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 6.440190526652441, + "learning_rate": 1.674809664768005e-06, + "logits/chosen": -0.5244798064231873, + "logits/rejected": -0.3966430127620697, + "logps/chosen": -1.3039839267730713, + "logps/rejected": -1.9712779521942139, + "loss": 0.9312, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3039839267730713, + "rewards/margins": 0.6672938466072083, + "rewards/rejected": -1.9712779521942139, + "sft_loss": 1.3040238618850708, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 7.879057907006199, + "learning_rate": 1.6701684525856647e-06, + "logits/chosen": -0.44239291548728943, + "logits/rejected": -0.3611551821231842, + "logps/chosen": -1.3233540058135986, + "logps/rejected": -1.9400726556777954, + "loss": 0.9645, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3233540058135986, + "rewards/margins": 0.616718590259552, + "rewards/rejected": -1.9400726556777954, + "sft_loss": 1.385852336883545, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 11.832512783512026, + "learning_rate": 1.6655255894282515e-06, + "logits/chosen": -0.36237016320228577, + "logits/rejected": -0.35094302892684937, + "logps/chosen": -1.322729468345642, + "logps/rejected": -1.9308160543441772, + "loss": 0.9677, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.322729468345642, + "rewards/margins": 0.6080866456031799, + "rewards/rejected": -1.9308160543441772, + "sft_loss": 1.3535382747650146, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 11.201933335039847, + "learning_rate": 1.6608811203408437e-06, + "logits/chosen": -0.42994600534439087, + "logits/rejected": -0.3571633994579315, + "logps/chosen": -1.2974226474761963, + "logps/rejected": -1.7543586492538452, + "loss": 0.9976, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2974226474761963, + "rewards/margins": 0.45693597197532654, + "rewards/rejected": -1.7543586492538452, + "sft_loss": 1.3785746097564697, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 11.915398676209564, + "learning_rate": 1.6562350903841002e-06, + "logits/chosen": -0.4060365557670593, + "logits/rejected": -0.24389150738716125, + "logps/chosen": -1.388873815536499, + "logps/rejected": -2.0121002197265625, + "loss": 0.9886, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.388873815536499, + "rewards/margins": 0.6232262849807739, + "rewards/rejected": -2.0121002197265625, + "sft_loss": 1.415649175643921, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 13.960085221092982, + "learning_rate": 1.651587544633825e-06, + "logits/chosen": -0.4067690968513489, + "logits/rejected": -0.292506605386734, + "logps/chosen": -1.359520435333252, + "logps/rejected": -2.041755199432373, + "loss": 0.9575, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.359520435333252, + "rewards/margins": 0.6822346448898315, + "rewards/rejected": -2.041755199432373, + "sft_loss": 1.3790920972824097, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 7.559174143764919, + "learning_rate": 1.6469385281805267e-06, + "logits/chosen": -0.3783648908138275, + "logits/rejected": -0.3329547047615051, + "logps/chosen": -1.2987470626831055, + "logps/rejected": -1.9435436725616455, + "loss": 0.9662, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2987470626831055, + "rewards/margins": 0.6447966694831848, + "rewards/rejected": -1.9435436725616455, + "sft_loss": 1.3145548105239868, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 9.139123936589463, + "learning_rate": 1.642288086128984e-06, + "logits/chosen": -0.530595600605011, + "logits/rejected": -0.38953492045402527, + "logps/chosen": -1.3028291463851929, + "logps/rejected": -2.1624374389648438, + "loss": 0.9542, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3028291463851929, + "rewards/margins": 0.8596083521842957, + "rewards/rejected": -2.1624374389648438, + "sft_loss": 1.3950421810150146, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 8.34429297717093, + "learning_rate": 1.6376362635978055e-06, + "logits/chosen": -0.5230890512466431, + "logits/rejected": -0.40567511320114136, + "logps/chosen": -1.3589205741882324, + "logps/rejected": -1.9703487157821655, + "loss": 0.9887, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3589205741882324, + "rewards/margins": 0.6114282011985779, + "rewards/rejected": -1.9703487157821655, + "sft_loss": 1.40828537940979, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 8.114057802851098, + "learning_rate": 1.6329831057189936e-06, + "logits/chosen": -0.5228386521339417, + "logits/rejected": -0.37643635272979736, + "logps/chosen": -1.3287737369537354, + "logps/rejected": -2.1009817123413086, + "loss": 0.9644, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3287737369537354, + "rewards/margins": 0.7722080945968628, + "rewards/rejected": -2.1009817123413086, + "sft_loss": 1.4050620794296265, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 6.84292995539461, + "learning_rate": 1.6283286576375069e-06, + "logits/chosen": -0.5087316036224365, + "logits/rejected": -0.39270779490470886, + "logps/chosen": -1.321316123008728, + "logps/rejected": -1.7746942043304443, + "loss": 1.0044, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.321316123008728, + "rewards/margins": 0.453377902507782, + "rewards/rejected": -1.7746942043304443, + "sft_loss": 1.3574235439300537, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 11.548431373015145, + "learning_rate": 1.623672964510821e-06, + "logits/chosen": -0.37392085790634155, + "logits/rejected": -0.14829064905643463, + "logps/chosen": -1.2613165378570557, + "logps/rejected": -2.001952648162842, + "loss": 0.9193, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2613165378570557, + "rewards/margins": 0.7406360507011414, + "rewards/rejected": -2.001952648162842, + "sft_loss": 1.2964996099472046, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 10.972399432410526, + "learning_rate": 1.6190160715084909e-06, + "logits/chosen": -0.3920244872570038, + "logits/rejected": -0.30911877751350403, + "logps/chosen": -1.3052040338516235, + "logps/rejected": -1.8989613056182861, + "loss": 0.9648, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3052040338516235, + "rewards/margins": 0.5937572717666626, + "rewards/rejected": -1.8989613056182861, + "sft_loss": 1.3513920307159424, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 7.320339128263725, + "learning_rate": 1.6143580238117132e-06, + "logits/chosen": -0.4998478889465332, + "logits/rejected": -0.39607498049736023, + "logps/chosen": -1.288106083869934, + "logps/rejected": -1.835472822189331, + "loss": 0.9586, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.288106083869934, + "rewards/margins": 0.5473669767379761, + "rewards/rejected": -1.835472822189331, + "sft_loss": 1.330539584159851, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 5.9223720788792376, + "learning_rate": 1.6096988666128867e-06, + "logits/chosen": -0.48560982942581177, + "logits/rejected": -0.4185088276863098, + "logps/chosen": -1.2920416593551636, + "logps/rejected": -1.901266098022461, + "loss": 0.9729, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2920416593551636, + "rewards/margins": 0.6092244386672974, + "rewards/rejected": -1.901266098022461, + "sft_loss": 1.301276445388794, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 7.885940091214718, + "learning_rate": 1.6050386451151753e-06, + "logits/chosen": -0.5195499658584595, + "logits/rejected": -0.38451433181762695, + "logps/chosen": -1.3761128187179565, + "logps/rejected": -1.9619756937026978, + "loss": 1.0324, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3761128187179565, + "rewards/margins": 0.5858628749847412, + "rewards/rejected": -1.9619756937026978, + "sft_loss": 1.4462617635726929, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 12.299188719276014, + "learning_rate": 1.6003774045320686e-06, + "logits/chosen": -0.5009174942970276, + "logits/rejected": -0.3847171664237976, + "logps/chosen": -1.3710649013519287, + "logps/rejected": -2.1126813888549805, + "loss": 0.9785, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3710649013519287, + "rewards/margins": 0.7416165471076965, + "rewards/rejected": -2.1126813888549805, + "sft_loss": 1.4547905921936035, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 5.547444314574255, + "learning_rate": 1.5957151900869425e-06, + "logits/chosen": -0.612585723400116, + "logits/rejected": -0.4938820004463196, + "logps/chosen": -1.439699649810791, + "logps/rejected": -2.009437322616577, + "loss": 0.9814, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.439699649810791, + "rewards/margins": 0.5697377920150757, + "rewards/rejected": -2.009437322616577, + "sft_loss": 1.4339112043380737, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 7.988338377974674, + "learning_rate": 1.5910520470126228e-06, + "logits/chosen": -0.5794527530670166, + "logits/rejected": -0.44258028268814087, + "logps/chosen": -1.4420968294143677, + "logps/rejected": -2.096639394760132, + "loss": 0.9941, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4420968294143677, + "rewards/margins": 0.6545425057411194, + "rewards/rejected": -2.096639394760132, + "sft_loss": 1.3904750347137451, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 11.050093340691287, + "learning_rate": 1.5863880205509432e-06, + "logits/chosen": -0.5791524052619934, + "logits/rejected": -0.4395558834075928, + "logps/chosen": -1.3059974908828735, + "logps/rejected": -2.073613166809082, + "loss": 0.9213, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3059974908828735, + "rewards/margins": 0.7676156759262085, + "rewards/rejected": -2.073613166809082, + "sft_loss": 1.3330674171447754, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 8.206254056781196, + "learning_rate": 1.5817231559523097e-06, + "logits/chosen": -0.5781939029693604, + "logits/rejected": -0.5216881036758423, + "logps/chosen": -1.4066269397735596, + "logps/rejected": -2.1981959342956543, + "loss": 0.9927, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4066269397735596, + "rewards/margins": 0.7915690541267395, + "rewards/rejected": -2.1981959342956543, + "sft_loss": 1.4781930446624756, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 6.359668377120754, + "learning_rate": 1.5770574984752582e-06, + "logits/chosen": -0.6331890821456909, + "logits/rejected": -0.5329810976982117, + "logps/chosen": -1.3997949361801147, + "logps/rejected": -2.0195674896240234, + "loss": 1.0077, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3997949361801147, + "rewards/margins": 0.6197725534439087, + "rewards/rejected": -2.0195674896240234, + "sft_loss": 1.3704310655593872, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 12.987698998099232, + "learning_rate": 1.5723910933860191e-06, + "logits/chosen": -0.6868072748184204, + "logits/rejected": -0.5752248167991638, + "logps/chosen": -1.3194029331207275, + "logps/rejected": -1.893593192100525, + "loss": 0.9706, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3194029331207275, + "rewards/margins": 0.5741902589797974, + "rewards/rejected": -1.893593192100525, + "sft_loss": 1.3269562721252441, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 8.335772019808212, + "learning_rate": 1.5677239859580742e-06, + "logits/chosen": -0.6515632271766663, + "logits/rejected": -0.5666736364364624, + "logps/chosen": -1.3261549472808838, + "logps/rejected": -1.8968639373779297, + "loss": 0.999, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3261549472808838, + "rewards/margins": 0.5707091689109802, + "rewards/rejected": -1.8968639373779297, + "sft_loss": 1.349196434020996, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 12.432038308281975, + "learning_rate": 1.5630562214717205e-06, + "logits/chosen": -0.5145977139472961, + "logits/rejected": -0.4578397274017334, + "logps/chosen": -1.4018304347991943, + "logps/rejected": -1.8993148803710938, + "loss": 1.0053, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4018304347991943, + "rewards/margins": 0.4974845051765442, + "rewards/rejected": -1.8993148803710938, + "sft_loss": 1.391309380531311, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 6.843658695419179, + "learning_rate": 1.5583878452136296e-06, + "logits/chosen": -0.6365788578987122, + "logits/rejected": -0.5374706387519836, + "logps/chosen": -1.3000893592834473, + "logps/rejected": -1.7917066812515259, + "loss": 0.9793, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3000893592834473, + "rewards/margins": 0.4916171133518219, + "rewards/rejected": -1.7917066812515259, + "sft_loss": 1.3843553066253662, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 6.092318695370072, + "learning_rate": 1.5537189024764086e-06, + "logits/chosen": -0.5974392890930176, + "logits/rejected": -0.49662700295448303, + "logps/chosen": -1.2565548419952393, + "logps/rejected": -1.714015245437622, + "loss": 0.9882, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2565548419952393, + "rewards/margins": 0.45746010541915894, + "rewards/rejected": -1.714015245437622, + "sft_loss": 1.3433637619018555, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 6.9247657211195754, + "learning_rate": 1.5490494385581599e-06, + "logits/chosen": -0.5492630004882812, + "logits/rejected": -0.451678991317749, + "logps/chosen": -1.363454818725586, + "logps/rejected": -1.8830769062042236, + "loss": 0.9923, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.363454818725586, + "rewards/margins": 0.5196219682693481, + "rewards/rejected": -1.8830769062042236, + "sft_loss": 1.3941650390625, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 7.708345661189646, + "learning_rate": 1.5443794987620433e-06, + "logits/chosen": -0.4692181646823883, + "logits/rejected": -0.40992242097854614, + "logps/chosen": -1.3081228733062744, + "logps/rejected": -1.6529757976531982, + "loss": 0.997, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3081228733062744, + "rewards/margins": 0.34485286474227905, + "rewards/rejected": -1.6529757976531982, + "sft_loss": 1.3308913707733154, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 6.957139071318043, + "learning_rate": 1.539709128395835e-06, + "logits/chosen": -0.5676388144493103, + "logits/rejected": -0.5339337587356567, + "logps/chosen": -1.214019536972046, + "logps/rejected": -1.9025837182998657, + "loss": 0.9272, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.214019536972046, + "rewards/margins": 0.6885641813278198, + "rewards/rejected": -1.9025837182998657, + "sft_loss": 1.2781291007995605, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 31.39373896850739, + "learning_rate": 1.5350383727714888e-06, + "logits/chosen": -0.5760513544082642, + "logits/rejected": -0.5065566301345825, + "logps/chosen": -1.3135230541229248, + "logps/rejected": -1.7099521160125732, + "loss": 1.0353, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3135230541229248, + "rewards/margins": 0.3964292109012604, + "rewards/rejected": -1.7099521160125732, + "sft_loss": 1.3254320621490479, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 7.158147144834485, + "learning_rate": 1.5303672772046963e-06, + "logits/chosen": -0.5834034085273743, + "logits/rejected": -0.45677343010902405, + "logps/chosen": -1.3490737676620483, + "logps/rejected": -2.0836308002471924, + "loss": 0.9457, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3490737676620483, + "rewards/margins": 0.7345567345619202, + "rewards/rejected": -2.0836308002471924, + "sft_loss": 1.4182727336883545, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 7.871853838413433, + "learning_rate": 1.525695887014447e-06, + "logits/chosen": -0.561724066734314, + "logits/rejected": -0.4565046429634094, + "logps/chosen": -1.30876886844635, + "logps/rejected": -1.8969253301620483, + "loss": 0.9524, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.30876886844635, + "rewards/margins": 0.5881567001342773, + "rewards/rejected": -1.8969253301620483, + "sft_loss": 1.3320848941802979, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 7.269484826641823, + "learning_rate": 1.5210242475225896e-06, + "logits/chosen": -0.5328022837638855, + "logits/rejected": -0.3768201172351837, + "logps/chosen": -1.3358179330825806, + "logps/rejected": -1.868719458580017, + "loss": 1.0073, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3358179330825806, + "rewards/margins": 0.532901406288147, + "rewards/rejected": -1.868719458580017, + "sft_loss": 1.3920084238052368, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 14.670382231421788, + "learning_rate": 1.5163524040533903e-06, + "logits/chosen": -0.4326193928718567, + "logits/rejected": -0.4179346561431885, + "logps/chosen": -1.3334858417510986, + "logps/rejected": -1.9773250818252563, + "loss": 0.9607, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3334858417510986, + "rewards/margins": 0.6438394784927368, + "rewards/rejected": -1.9773250818252563, + "sft_loss": 1.373219609260559, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 7.986154271307429, + "learning_rate": 1.5116804019330951e-06, + "logits/chosen": -0.5422690510749817, + "logits/rejected": -0.4372781217098236, + "logps/chosen": -1.3089624643325806, + "logps/rejected": -1.870825171470642, + "loss": 0.9804, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3089624643325806, + "rewards/margins": 0.5618628263473511, + "rewards/rejected": -1.870825171470642, + "sft_loss": 1.3825502395629883, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 5.749957550258941, + "learning_rate": 1.5070082864894892e-06, + "logits/chosen": -0.5507332682609558, + "logits/rejected": -0.5055480599403381, + "logps/chosen": -1.2318403720855713, + "logps/rejected": -1.7911930084228516, + "loss": 0.9293, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2318403720855713, + "rewards/margins": 0.5593525767326355, + "rewards/rejected": -1.7911930084228516, + "sft_loss": 1.256838321685791, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 8.496437191360968, + "learning_rate": 1.5023361030514572e-06, + "logits/chosen": -0.6092488765716553, + "logits/rejected": -0.4492560029029846, + "logps/chosen": -1.192866563796997, + "logps/rejected": -1.7828502655029297, + "loss": 0.9257, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.192866563796997, + "rewards/margins": 0.5899838209152222, + "rewards/rejected": -1.7828502655029297, + "sft_loss": 1.277256965637207, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 7.625719573165574, + "learning_rate": 1.4976638969485433e-06, + "logits/chosen": -0.4337848722934723, + "logits/rejected": -0.4376908242702484, + "logps/chosen": -1.2819950580596924, + "logps/rejected": -1.8182960748672485, + "loss": 0.9591, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2819950580596924, + "rewards/margins": 0.5363009572029114, + "rewards/rejected": -1.8182960748672485, + "sft_loss": 1.3229899406433105, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 9.886210431426107, + "learning_rate": 1.492991713510511e-06, + "logits/chosen": -0.4346126914024353, + "logits/rejected": -0.3826829791069031, + "logps/chosen": -1.2966272830963135, + "logps/rejected": -1.7472527027130127, + "loss": 1.0168, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2966272830963135, + "rewards/margins": 0.45062533020973206, + "rewards/rejected": -1.7472527027130127, + "sft_loss": 1.3685801029205322, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 7.0604512187525765, + "learning_rate": 1.4883195980669052e-06, + "logits/chosen": -0.512303352355957, + "logits/rejected": -0.40312275290489197, + "logps/chosen": -1.3184245824813843, + "logps/rejected": -1.9654176235198975, + "loss": 0.9278, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3184245824813843, + "rewards/margins": 0.6469929814338684, + "rewards/rejected": -1.9654176235198975, + "sft_loss": 1.3390653133392334, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 10.563683019206419, + "learning_rate": 1.48364759594661e-06, + "logits/chosen": -0.6492779850959778, + "logits/rejected": -0.5419107675552368, + "logps/chosen": -1.344524621963501, + "logps/rejected": -1.9468377828598022, + "loss": 0.9877, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.344524621963501, + "rewards/margins": 0.6023133993148804, + "rewards/rejected": -1.9468377828598022, + "sft_loss": 1.4405728578567505, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 12.262873845911782, + "learning_rate": 1.4789757524774105e-06, + "logits/chosen": -0.5909110307693481, + "logits/rejected": -0.4422377645969391, + "logps/chosen": -1.3807677030563354, + "logps/rejected": -1.9295337200164795, + "loss": 0.9935, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3807677030563354, + "rewards/margins": 0.5487662553787231, + "rewards/rejected": -1.9295337200164795, + "sft_loss": 1.4326026439666748, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 10.11402429370191, + "learning_rate": 1.474304112985553e-06, + "logits/chosen": -0.5599151253700256, + "logits/rejected": -0.4676848351955414, + "logps/chosen": -1.3399332761764526, + "logps/rejected": -1.953974962234497, + "loss": 0.9437, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3399332761764526, + "rewards/margins": 0.614041805267334, + "rewards/rejected": -1.953974962234497, + "sft_loss": 1.312151312828064, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 8.156421754815042, + "learning_rate": 1.469632722795304e-06, + "logits/chosen": -0.4865991473197937, + "logits/rejected": -0.4286844730377197, + "logps/chosen": -1.3717714548110962, + "logps/rejected": -2.03859543800354, + "loss": 0.9726, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3717714548110962, + "rewards/margins": 0.6668239235877991, + "rewards/rejected": -2.03859543800354, + "sft_loss": 1.450972080230713, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 7.9413448011852275, + "learning_rate": 1.4649616272285115e-06, + "logits/chosen": -0.591395914554596, + "logits/rejected": -0.48542946577072144, + "logps/chosen": -1.4039747714996338, + "logps/rejected": -2.049610137939453, + "loss": 1.0139, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4039747714996338, + "rewards/margins": 0.6456353068351746, + "rewards/rejected": -2.049610137939453, + "sft_loss": 1.4193564653396606, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 6.763474728045645, + "learning_rate": 1.4602908716041651e-06, + "logits/chosen": -0.4781588912010193, + "logits/rejected": -0.39982470870018005, + "logps/chosen": -1.526206374168396, + "logps/rejected": -2.1994595527648926, + "loss": 0.9935, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.526206374168396, + "rewards/margins": 0.6732532978057861, + "rewards/rejected": -2.1994595527648926, + "sft_loss": 1.4044595956802368, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 9.44893034407677, + "learning_rate": 1.4556205012379568e-06, + "logits/chosen": -0.48927783966064453, + "logits/rejected": -0.3744940161705017, + "logps/chosen": -1.3688924312591553, + "logps/rejected": -1.95903742313385, + "loss": 0.9783, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3688924312591553, + "rewards/margins": 0.5901449918746948, + "rewards/rejected": -1.95903742313385, + "sft_loss": 1.4328769445419312, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 7.876706171928691, + "learning_rate": 1.4509505614418402e-06, + "logits/chosen": -0.4390248656272888, + "logits/rejected": -0.38640502095222473, + "logps/chosen": -1.4024837017059326, + "logps/rejected": -1.9528404474258423, + "loss": 0.9861, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4024837017059326, + "rewards/margins": 0.5503565073013306, + "rewards/rejected": -1.9528404474258423, + "sft_loss": 1.3517831563949585, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 7.842891697104294, + "learning_rate": 1.4462810975235915e-06, + "logits/chosen": -0.6836906671524048, + "logits/rejected": -0.5594662427902222, + "logps/chosen": -1.2556931972503662, + "logps/rejected": -1.6881049871444702, + "loss": 0.992, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2556931972503662, + "rewards/margins": 0.4324119985103607, + "rewards/rejected": -1.6881049871444702, + "sft_loss": 1.3282722234725952, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 9.152902396575497, + "learning_rate": 1.4416121547863703e-06, + "logits/chosen": -0.49310851097106934, + "logits/rejected": -0.3880365490913391, + "logps/chosen": -1.3027580976486206, + "logps/rejected": -1.9210455417633057, + "loss": 0.9856, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3027580976486206, + "rewards/margins": 0.6182874441146851, + "rewards/rejected": -1.9210455417633057, + "sft_loss": 1.362561583518982, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 6.911340969081792, + "learning_rate": 1.4369437785282794e-06, + "logits/chosen": -0.6421962976455688, + "logits/rejected": -0.5590324401855469, + "logps/chosen": -1.3706694841384888, + "logps/rejected": -1.921740174293518, + "loss": 0.9652, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3706694841384888, + "rewards/margins": 0.5510705709457397, + "rewards/rejected": -1.921740174293518, + "sft_loss": 1.385704755783081, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 8.799634282616347, + "learning_rate": 1.4322760140419259e-06, + "logits/chosen": -0.6151847839355469, + "logits/rejected": -0.5183078646659851, + "logps/chosen": -1.2506282329559326, + "logps/rejected": -1.8808501958847046, + "loss": 0.9542, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2506282329559326, + "rewards/margins": 0.630221962928772, + "rewards/rejected": -1.8808501958847046, + "sft_loss": 1.3045356273651123, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 13.265562216769109, + "learning_rate": 1.427608906613981e-06, + "logits/chosen": -0.561122715473175, + "logits/rejected": -0.5639852285385132, + "logps/chosen": -1.3408920764923096, + "logps/rejected": -1.9823658466339111, + "loss": 0.9855, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3408920764923096, + "rewards/margins": 0.6414738893508911, + "rewards/rejected": -1.9823658466339111, + "sft_loss": 1.4433765411376953, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 8.517806199918638, + "learning_rate": 1.4229425015247414e-06, + "logits/chosen": -0.6105222105979919, + "logits/rejected": -0.5246438980102539, + "logps/chosen": -1.3622534275054932, + "logps/rejected": -1.8676481246948242, + "loss": 1.0286, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3622534275054932, + "rewards/margins": 0.5053948760032654, + "rewards/rejected": -1.8676481246948242, + "sft_loss": 1.4393728971481323, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 6.247142456096901, + "learning_rate": 1.4182768440476904e-06, + "logits/chosen": -0.6173042058944702, + "logits/rejected": -0.5559664964675903, + "logps/chosen": -1.358097791671753, + "logps/rejected": -1.9720399379730225, + "loss": 0.9824, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.358097791671753, + "rewards/margins": 0.6139422059059143, + "rewards/rejected": -1.9720399379730225, + "sft_loss": 1.4049310684204102, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 19.69526048686937, + "learning_rate": 1.4136119794490567e-06, + "logits/chosen": -0.6866458654403687, + "logits/rejected": -0.6137515902519226, + "logps/chosen": -1.3925710916519165, + "logps/rejected": -1.8843557834625244, + "loss": 1.0459, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3925710916519165, + "rewards/margins": 0.4917844831943512, + "rewards/rejected": -1.8843557834625244, + "sft_loss": 1.431840181350708, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 6.359568180732788, + "learning_rate": 1.4089479529873773e-06, + "logits/chosen": -0.5308200716972351, + "logits/rejected": -0.4930025637149811, + "logps/chosen": -1.3771328926086426, + "logps/rejected": -2.063537359237671, + "loss": 0.9812, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3771328926086426, + "rewards/margins": 0.6864045858383179, + "rewards/rejected": -2.063537359237671, + "sft_loss": 1.3803982734680176, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 7.135729281452068, + "learning_rate": 1.4042848099130574e-06, + "logits/chosen": -0.5700703263282776, + "logits/rejected": -0.5708822011947632, + "logps/chosen": -1.2795307636260986, + "logps/rejected": -1.7099710702896118, + "loss": 1.0013, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2795307636260986, + "rewards/margins": 0.43044036626815796, + "rewards/rejected": -1.7099710702896118, + "sft_loss": 1.358174204826355, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 5.886225397742045, + "learning_rate": 1.3996225954679317e-06, + "logits/chosen": -0.5978802442550659, + "logits/rejected": -0.4967038035392761, + "logps/chosen": -1.2683467864990234, + "logps/rejected": -1.8838962316513062, + "loss": 0.9146, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2683467864990234, + "rewards/margins": 0.6155495047569275, + "rewards/rejected": -1.8838962316513062, + "sft_loss": 1.2717249393463135, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 6.2223063839513735, + "learning_rate": 1.3949613548848248e-06, + "logits/chosen": -0.6236740350723267, + "logits/rejected": -0.5328378677368164, + "logps/chosen": -1.2708802223205566, + "logps/rejected": -1.8795549869537354, + "loss": 0.9315, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2708802223205566, + "rewards/margins": 0.6086748242378235, + "rewards/rejected": -1.8795549869537354, + "sft_loss": 1.259526014328003, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 7.529280028879536, + "learning_rate": 1.3903011333871134e-06, + "logits/chosen": -0.5299532413482666, + "logits/rejected": -0.4078814387321472, + "logps/chosen": -1.3449041843414307, + "logps/rejected": -1.996220350265503, + "loss": 0.9824, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3449041843414307, + "rewards/margins": 0.6513162851333618, + "rewards/rejected": -1.996220350265503, + "sft_loss": 1.3764393329620361, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": -0.29770681262016296, + "eval_logits/rejected": -0.23772414028644562, + "eval_logps/chosen": -1.3916462659835815, + "eval_logps/rejected": -1.8345197439193726, + "eval_loss": 1.0347309112548828, + "eval_rewards/accuracies": 0.6283382773399353, + "eval_rewards/chosen": -1.3916462659835815, + "eval_rewards/margins": 0.4428735375404358, + "eval_rewards/rejected": -1.8345197439193726, + "eval_runtime": 46.4526, + "eval_samples_per_second": 28.954, + "eval_sft_loss": 1.406299114227295, + "eval_steps_per_second": 7.255, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 7.354120391704242, + "learning_rate": 1.3856419761882875e-06, + "logits/chosen": -0.6612704992294312, + "logits/rejected": -0.579429030418396, + "logps/chosen": -1.3465828895568848, + "logps/rejected": -1.9286384582519531, + "loss": 0.9623, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3465828895568848, + "rewards/margins": 0.5820555686950684, + "rewards/rejected": -1.9286384582519531, + "sft_loss": 1.3542835712432861, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 8.205677696925335, + "learning_rate": 1.3809839284915096e-06, + "logits/chosen": -0.638985276222229, + "logits/rejected": -0.5678723454475403, + "logps/chosen": -1.3050861358642578, + "logps/rejected": -1.8729593753814697, + "loss": 0.9808, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3050861358642578, + "rewards/margins": 0.5678732991218567, + "rewards/rejected": -1.8729593753814697, + "sft_loss": 1.3473398685455322, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 8.31806706498762, + "learning_rate": 1.3763270354891795e-06, + "logits/chosen": -0.6113357543945312, + "logits/rejected": -0.5205150842666626, + "logps/chosen": -1.3317005634307861, + "logps/rejected": -1.945020079612732, + "loss": 0.9746, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3317005634307861, + "rewards/margins": 0.6133192777633667, + "rewards/rejected": -1.945020079612732, + "sft_loss": 1.3502792119979858, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 6.374273487412651, + "learning_rate": 1.3716713423624936e-06, + "logits/chosen": -0.6377384662628174, + "logits/rejected": -0.4735463559627533, + "logps/chosen": -1.4612407684326172, + "logps/rejected": -2.114426374435425, + "loss": 1.0311, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4612407684326172, + "rewards/margins": 0.6531856656074524, + "rewards/rejected": -2.114426374435425, + "sft_loss": 1.4271241426467896, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 7.714682568173162, + "learning_rate": 1.367016894281007e-06, + "logits/chosen": -0.6564761400222778, + "logits/rejected": -0.5606242418289185, + "logps/chosen": -1.2402262687683105, + "logps/rejected": -1.8998502492904663, + "loss": 0.9272, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2402262687683105, + "rewards/margins": 0.6596239805221558, + "rewards/rejected": -1.8998502492904663, + "sft_loss": 1.3146278858184814, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 9.634844022060468, + "learning_rate": 1.3623637364021952e-06, + "logits/chosen": -0.6817273497581482, + "logits/rejected": -0.565589427947998, + "logps/chosen": -1.3778315782546997, + "logps/rejected": -2.260098695755005, + "loss": 0.9018, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3778315782546997, + "rewards/margins": 0.88226717710495, + "rewards/rejected": -2.260098695755005, + "sft_loss": 1.3882973194122314, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 7.761642142665585, + "learning_rate": 1.3577119138710165e-06, + "logits/chosen": -0.6394303441047668, + "logits/rejected": -0.6095726490020752, + "logps/chosen": -1.3704173564910889, + "logps/rejected": -1.9797813892364502, + "loss": 0.9781, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3704173564910889, + "rewards/margins": 0.6093640923500061, + "rewards/rejected": -1.9797813892364502, + "sft_loss": 1.4049458503723145, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 7.592780449296121, + "learning_rate": 1.3530614718194734e-06, + "logits/chosen": -0.588589072227478, + "logits/rejected": -0.5276697874069214, + "logps/chosen": -1.3299793004989624, + "logps/rejected": -2.089031219482422, + "loss": 0.9079, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3299793004989624, + "rewards/margins": 0.7590519189834595, + "rewards/rejected": -2.089031219482422, + "sft_loss": 1.279463291168213, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 11.02640467263545, + "learning_rate": 1.3484124553661754e-06, + "logits/chosen": -0.7396196126937866, + "logits/rejected": -0.6345130205154419, + "logps/chosen": -1.380319595336914, + "logps/rejected": -2.030441999435425, + "loss": 0.9741, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.380319595336914, + "rewards/margins": 0.6501225233078003, + "rewards/rejected": -2.030441999435425, + "sft_loss": 1.3692419528961182, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 8.612600618686669, + "learning_rate": 1.3437649096159e-06, + "logits/chosen": -0.5689498782157898, + "logits/rejected": -0.496391624212265, + "logps/chosen": -1.3582226037979126, + "logps/rejected": -1.966665267944336, + "loss": 0.9627, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3582226037979126, + "rewards/margins": 0.6084426641464233, + "rewards/rejected": -1.966665267944336, + "sft_loss": 1.3722515106201172, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 5.941814371197206, + "learning_rate": 1.3391188796591568e-06, + "logits/chosen": -0.6051737666130066, + "logits/rejected": -0.5670342445373535, + "logps/chosen": -1.398306131362915, + "logps/rejected": -1.9220012426376343, + "loss": 0.9966, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.398306131362915, + "rewards/margins": 0.5236951112747192, + "rewards/rejected": -1.9220012426376343, + "sft_loss": 1.4062070846557617, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 7.529679820359014, + "learning_rate": 1.3344744105717487e-06, + "logits/chosen": -0.7107158303260803, + "logits/rejected": -0.6142610907554626, + "logps/chosen": -1.3456017971038818, + "logps/rejected": -1.86309814453125, + "loss": 0.9881, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3456017971038818, + "rewards/margins": 0.5174962878227234, + "rewards/rejected": -1.86309814453125, + "sft_loss": 1.3745471239089966, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 11.00381742148276, + "learning_rate": 1.3298315474143354e-06, + "logits/chosen": -0.577115535736084, + "logits/rejected": -0.4973164200782776, + "logps/chosen": -1.2828433513641357, + "logps/rejected": -1.985243558883667, + "loss": 0.9345, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2828433513641357, + "rewards/margins": 0.7024003863334656, + "rewards/rejected": -1.985243558883667, + "sft_loss": 1.3437730073928833, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 7.235230962019375, + "learning_rate": 1.3251903352319951e-06, + "logits/chosen": -0.5999752283096313, + "logits/rejected": -0.4686052203178406, + "logps/chosen": -1.2623176574707031, + "logps/rejected": -1.961260199546814, + "loss": 0.9324, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2623176574707031, + "rewards/margins": 0.6989427208900452, + "rewards/rejected": -1.961260199546814, + "sft_loss": 1.3062334060668945, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 7.746349070805825, + "learning_rate": 1.3205508190537895e-06, + "logits/chosen": -0.6424199342727661, + "logits/rejected": -0.4257664680480957, + "logps/chosen": -1.3440327644348145, + "logps/rejected": -1.970116376876831, + "loss": 0.9803, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3440327644348145, + "rewards/margins": 0.6260837316513062, + "rewards/rejected": -1.970116376876831, + "sft_loss": 1.4007642269134521, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 11.270118234339037, + "learning_rate": 1.3159130438923242e-06, + "logits/chosen": -0.6333575248718262, + "logits/rejected": -0.5891388654708862, + "logps/chosen": -1.2919838428497314, + "logps/rejected": -1.8722622394561768, + "loss": 0.9688, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2919838428497314, + "rewards/margins": 0.5802782773971558, + "rewards/rejected": -1.8722622394561768, + "sft_loss": 1.3905534744262695, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 7.970392351680856, + "learning_rate": 1.3112770547433144e-06, + "logits/chosen": -0.6499245762825012, + "logits/rejected": -0.49632638692855835, + "logps/chosen": -1.3325730562210083, + "logps/rejected": -1.9567444324493408, + "loss": 0.9676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3325730562210083, + "rewards/margins": 0.6241713166236877, + "rewards/rejected": -1.9567444324493408, + "sft_loss": 1.3798366785049438, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 9.061602961372913, + "learning_rate": 1.3066428965851472e-06, + "logits/chosen": -0.6206027269363403, + "logits/rejected": -0.5510232448577881, + "logps/chosen": -1.4109750986099243, + "logps/rejected": -2.0197665691375732, + "loss": 1.0131, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4109750986099243, + "rewards/margins": 0.6087915897369385, + "rewards/rejected": -2.0197665691375732, + "sft_loss": 1.4571069478988647, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 8.38575291192651, + "learning_rate": 1.3020106143784454e-06, + "logits/chosen": -0.6656057238578796, + "logits/rejected": -0.6355992555618286, + "logps/chosen": -1.4497514963150024, + "logps/rejected": -2.0534777641296387, + "loss": 1.0232, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4497514963150024, + "rewards/margins": 0.6037260890007019, + "rewards/rejected": -2.0534777641296387, + "sft_loss": 1.4650858640670776, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 7.755305600869868, + "learning_rate": 1.2973802530656314e-06, + "logits/chosen": -0.7491085529327393, + "logits/rejected": -0.6685279607772827, + "logps/chosen": -1.4618529081344604, + "logps/rejected": -2.1163430213928223, + "loss": 1.0177, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.4618529081344604, + "rewards/margins": 0.6544899940490723, + "rewards/rejected": -2.1163430213928223, + "sft_loss": 1.5315229892730713, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 12.168745838634132, + "learning_rate": 1.2927518575704906e-06, + "logits/chosen": -0.713337779045105, + "logits/rejected": -0.6121014952659607, + "logps/chosen": -1.3905467987060547, + "logps/rejected": -2.015956401824951, + "loss": 0.9914, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3905467987060547, + "rewards/margins": 0.6254096627235413, + "rewards/rejected": -2.015956401824951, + "sft_loss": 1.4138438701629639, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 7.931750234484632, + "learning_rate": 1.2881254727977365e-06, + "logits/chosen": -0.5274156332015991, + "logits/rejected": -0.5057668089866638, + "logps/chosen": -1.3538296222686768, + "logps/rejected": -1.9042785167694092, + "loss": 0.9768, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3538296222686768, + "rewards/margins": 0.5504489541053772, + "rewards/rejected": -1.9042785167694092, + "sft_loss": 1.382365107536316, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 9.237317453089084, + "learning_rate": 1.2835011436325749e-06, + "logits/chosen": -0.660834014415741, + "logits/rejected": -0.5497223734855652, + "logps/chosen": -1.314284086227417, + "logps/rejected": -1.891516923904419, + "loss": 0.9609, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.314284086227417, + "rewards/margins": 0.5772326588630676, + "rewards/rejected": -1.891516923904419, + "sft_loss": 1.344481348991394, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 6.456649113595514, + "learning_rate": 1.278878914940267e-06, + "logits/chosen": -0.582197368144989, + "logits/rejected": -0.43381983041763306, + "logps/chosen": -1.3320949077606201, + "logps/rejected": -2.1774628162384033, + "loss": 0.9482, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3320949077606201, + "rewards/margins": 0.8453680872917175, + "rewards/rejected": -2.1774628162384033, + "sft_loss": 1.3886038064956665, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 12.876972828783495, + "learning_rate": 1.2742588315656963e-06, + "logits/chosen": -0.6521707773208618, + "logits/rejected": -0.500217854976654, + "logps/chosen": -1.3390861749649048, + "logps/rejected": -2.0092623233795166, + "loss": 0.9634, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3390861749649048, + "rewards/margins": 0.6701762080192566, + "rewards/rejected": -2.0092623233795166, + "sft_loss": 1.4378734827041626, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 6.924077732123589, + "learning_rate": 1.269640938332932e-06, + "logits/chosen": -0.5279954671859741, + "logits/rejected": -0.4550296366214752, + "logps/chosen": -1.2392055988311768, + "logps/rejected": -2.0392346382141113, + "loss": 0.8986, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2392055988311768, + "rewards/margins": 0.8000289797782898, + "rewards/rejected": -2.0392346382141113, + "sft_loss": 1.3009543418884277, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 13.126758218379875, + "learning_rate": 1.265025280044794e-06, + "logits/chosen": -0.6107516884803772, + "logits/rejected": -0.4902040958404541, + "logps/chosen": -1.364108681678772, + "logps/rejected": -1.9469196796417236, + "loss": 0.9617, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.364108681678772, + "rewards/margins": 0.5828110575675964, + "rewards/rejected": -1.9469196796417236, + "sft_loss": 1.3647311925888062, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 10.757237916977163, + "learning_rate": 1.2604119014824197e-06, + "logits/chosen": -0.5626201629638672, + "logits/rejected": -0.46400943398475647, + "logps/chosen": -1.3011457920074463, + "logps/rejected": -1.8755508661270142, + "loss": 0.9755, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3011457920074463, + "rewards/margins": 0.574405312538147, + "rewards/rejected": -1.8755508661270142, + "sft_loss": 1.3482170104980469, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 7.560167106069082, + "learning_rate": 1.2558008474048279e-06, + "logits/chosen": -0.5565083622932434, + "logits/rejected": -0.4244113862514496, + "logps/chosen": -1.230414628982544, + "logps/rejected": -1.8162529468536377, + "loss": 0.9346, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.230414628982544, + "rewards/margins": 0.5858383774757385, + "rewards/rejected": -1.8162529468536377, + "sft_loss": 1.293988585472107, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 10.80950410067998, + "learning_rate": 1.2511921625484857e-06, + "logits/chosen": -0.7061828374862671, + "logits/rejected": -0.6089943647384644, + "logps/chosen": -1.3991694450378418, + "logps/rejected": -1.9263055324554443, + "loss": 0.9943, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3991694450378418, + "rewards/margins": 0.5271362066268921, + "rewards/rejected": -1.9263055324554443, + "sft_loss": 1.4165757894515991, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 9.365451206885025, + "learning_rate": 1.2465858916268734e-06, + "logits/chosen": -0.4708705544471741, + "logits/rejected": -0.4554038643836975, + "logps/chosen": -1.4415693283081055, + "logps/rejected": -1.9839493036270142, + "loss": 1.0765, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4415693283081055, + "rewards/margins": 0.5423800349235535, + "rewards/rejected": -1.9839493036270142, + "sft_loss": 1.4307851791381836, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 11.867632025826655, + "learning_rate": 1.2419820793300526e-06, + "logits/chosen": -0.6369816064834595, + "logits/rejected": -0.4788491129875183, + "logps/chosen": -1.295993685722351, + "logps/rejected": -1.9095256328582764, + "loss": 0.9696, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.295993685722351, + "rewards/margins": 0.6135318875312805, + "rewards/rejected": -1.9095256328582764, + "sft_loss": 1.3290159702301025, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 8.543260387548337, + "learning_rate": 1.2373807703242293e-06, + "logits/chosen": -0.6585286259651184, + "logits/rejected": -0.5375559329986572, + "logps/chosen": -1.3580644130706787, + "logps/rejected": -1.9835560321807861, + "loss": 0.9655, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3580644130706787, + "rewards/margins": 0.625491738319397, + "rewards/rejected": -1.9835560321807861, + "sft_loss": 1.3997434377670288, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 8.393989138340482, + "learning_rate": 1.232782009251324e-06, + "logits/chosen": -0.643993079662323, + "logits/rejected": -0.5305663347244263, + "logps/chosen": -1.3146846294403076, + "logps/rejected": -1.8510560989379883, + "loss": 0.9927, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3146846294403076, + "rewards/margins": 0.536371648311615, + "rewards/rejected": -1.8510560989379883, + "sft_loss": 1.3597562313079834, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 11.44897952633478, + "learning_rate": 1.228185840728537e-06, + "logits/chosen": -0.5137637853622437, + "logits/rejected": -0.4967266023159027, + "logps/chosen": -1.4023181200027466, + "logps/rejected": -1.9296295642852783, + "loss": 1.0354, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4023181200027466, + "rewards/margins": 0.5273114442825317, + "rewards/rejected": -1.9296295642852783, + "sft_loss": 1.4055049419403076, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 8.204998932114652, + "learning_rate": 1.2235923093479156e-06, + "logits/chosen": -0.7242423295974731, + "logits/rejected": -0.6163910627365112, + "logps/chosen": -1.3340586423873901, + "logps/rejected": -2.004737377166748, + "loss": 0.9414, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3340586423873901, + "rewards/margins": 0.670678973197937, + "rewards/rejected": -2.004737377166748, + "sft_loss": 1.3409759998321533, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 7.287111485026014, + "learning_rate": 1.219001459675921e-06, + "logits/chosen": -0.6734879016876221, + "logits/rejected": -0.6657724380493164, + "logps/chosen": -1.3868675231933594, + "logps/rejected": -1.85861074924469, + "loss": 1.0186, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3868675231933594, + "rewards/margins": 0.4717431962490082, + "rewards/rejected": -1.85861074924469, + "sft_loss": 1.4022619724273682, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 10.1264692240913, + "learning_rate": 1.2144133362529974e-06, + "logits/chosen": -0.6233974695205688, + "logits/rejected": -0.5367701053619385, + "logps/chosen": -1.418578863143921, + "logps/rejected": -1.9424394369125366, + "loss": 1.029, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.418578863143921, + "rewards/margins": 0.5238603949546814, + "rewards/rejected": -1.9424394369125366, + "sft_loss": 1.4560730457305908, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 10.456040091114867, + "learning_rate": 1.2098279835931382e-06, + "logits/chosen": -0.6557838916778564, + "logits/rejected": -0.5996249914169312, + "logps/chosen": -1.2695906162261963, + "logps/rejected": -1.9603517055511475, + "loss": 0.9336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2695906162261963, + "rewards/margins": 0.690761387348175, + "rewards/rejected": -1.9603517055511475, + "sft_loss": 1.2876946926116943, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 8.826313757148988, + "learning_rate": 1.2052454461834544e-06, + "logits/chosen": -0.6324909329414368, + "logits/rejected": -0.5516208410263062, + "logps/chosen": -1.356690764427185, + "logps/rejected": -1.9758851528167725, + "loss": 0.9906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.356690764427185, + "rewards/margins": 0.6191944479942322, + "rewards/rejected": -1.9758851528167725, + "sft_loss": 1.3892412185668945, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 11.425711089424011, + "learning_rate": 1.2006657684837445e-06, + "logits/chosen": -0.6415581703186035, + "logits/rejected": -0.5432634353637695, + "logps/chosen": -1.3200055360794067, + "logps/rejected": -1.8696330785751343, + "loss": 0.9841, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3200055360794067, + "rewards/margins": 0.5496276617050171, + "rewards/rejected": -1.8696330785751343, + "sft_loss": 1.399254560470581, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 7.832674288185595, + "learning_rate": 1.1960889949260613e-06, + "logits/chosen": -0.6560367345809937, + "logits/rejected": -0.5233356952667236, + "logps/chosen": -1.4361064434051514, + "logps/rejected": -1.959800362586975, + "loss": 1.0228, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4361064434051514, + "rewards/margins": 0.5236939191818237, + "rewards/rejected": -1.959800362586975, + "sft_loss": 1.453070878982544, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 15.075128594039345, + "learning_rate": 1.1915151699142825e-06, + "logits/chosen": -0.6657556891441345, + "logits/rejected": -0.6010826230049133, + "logps/chosen": -1.365739107131958, + "logps/rejected": -2.0313916206359863, + "loss": 1.0002, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.365739107131958, + "rewards/margins": 0.665652334690094, + "rewards/rejected": -2.0313916206359863, + "sft_loss": 1.4609452486038208, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 16.784397044420558, + "learning_rate": 1.1869443378236782e-06, + "logits/chosen": -0.6641906499862671, + "logits/rejected": -0.5761057734489441, + "logps/chosen": -1.437990427017212, + "logps/rejected": -2.15848708152771, + "loss": 1.02, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.437990427017212, + "rewards/margins": 0.7204967737197876, + "rewards/rejected": -2.15848708152771, + "sft_loss": 1.4786887168884277, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 13.981297309313103, + "learning_rate": 1.1823765430004812e-06, + "logits/chosen": -0.700829029083252, + "logits/rejected": -0.6927005648612976, + "logps/chosen": -1.3422982692718506, + "logps/rejected": -2.0163462162017822, + "loss": 0.9734, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3422982692718506, + "rewards/margins": 0.6740477085113525, + "rewards/rejected": -2.0163462162017822, + "sft_loss": 1.351973056793213, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 8.964708336353103, + "learning_rate": 1.177811829761457e-06, + "logits/chosen": -0.6395770907402039, + "logits/rejected": -0.5657497048377991, + "logps/chosen": -1.3664880990982056, + "logps/rejected": -2.083770275115967, + "loss": 0.9664, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3664880990982056, + "rewards/margins": 0.7172822952270508, + "rewards/rejected": -2.083770275115967, + "sft_loss": 1.3898346424102783, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 8.986806024083583, + "learning_rate": 1.1732502423934737e-06, + "logits/chosen": -0.634931206703186, + "logits/rejected": -0.6005954742431641, + "logps/chosen": -1.3003346920013428, + "logps/rejected": -1.934922456741333, + "loss": 0.9261, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3003346920013428, + "rewards/margins": 0.6345877051353455, + "rewards/rejected": -1.934922456741333, + "sft_loss": 1.3663181066513062, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 7.9950743058468925, + "learning_rate": 1.1686918251530716e-06, + "logits/chosen": -0.7094308137893677, + "logits/rejected": -0.6693183779716492, + "logps/chosen": -1.2470004558563232, + "logps/rejected": -2.106454372406006, + "loss": 0.9191, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2470004558563232, + "rewards/margins": 0.8594539761543274, + "rewards/rejected": -2.106454372406006, + "sft_loss": 1.2810404300689697, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 6.634098157658137, + "learning_rate": 1.164136622266035e-06, + "logits/chosen": -0.6816264390945435, + "logits/rejected": -0.5064053535461426, + "logps/chosen": -1.324466347694397, + "logps/rejected": -1.8923667669296265, + "loss": 0.9682, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.324466347694397, + "rewards/margins": 0.5679003596305847, + "rewards/rejected": -1.8923667669296265, + "sft_loss": 1.3814367055892944, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 9.472889100430713, + "learning_rate": 1.1595846779269622e-06, + "logits/chosen": -0.7638979554176331, + "logits/rejected": -0.6376869082450867, + "logps/chosen": -1.3652839660644531, + "logps/rejected": -2.0850937366485596, + "loss": 0.9771, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3652839660644531, + "rewards/margins": 0.7198096513748169, + "rewards/rejected": -2.0850937366485596, + "sft_loss": 1.4393898248672485, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 8.955470407751477, + "learning_rate": 1.155036036298837e-06, + "logits/chosen": -0.6534041166305542, + "logits/rejected": -0.5347647666931152, + "logps/chosen": -1.522452712059021, + "logps/rejected": -2.2508766651153564, + "loss": 1.0283, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.522452712059021, + "rewards/margins": 0.7284238934516907, + "rewards/rejected": -2.2508766651153564, + "sft_loss": 1.5423953533172607, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 10.485052013304722, + "learning_rate": 1.1504907415126008e-06, + "logits/chosen": -0.5379031896591187, + "logits/rejected": -0.4626663327217102, + "logps/chosen": -1.3453060388565063, + "logps/rejected": -1.9888687133789062, + "loss": 0.9917, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3453060388565063, + "rewards/margins": 0.6435626149177551, + "rewards/rejected": -1.9888687133789062, + "sft_loss": 1.3984931707382202, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 6.859780274263937, + "learning_rate": 1.1459488376667235e-06, + "logits/chosen": -0.7011104822158813, + "logits/rejected": -0.6202434301376343, + "logps/chosen": -1.279159665107727, + "logps/rejected": -1.7484251260757446, + "loss": 0.969, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.279159665107727, + "rewards/margins": 0.46926528215408325, + "rewards/rejected": -1.7484251260757446, + "sft_loss": 1.3194999694824219, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 10.019955364920394, + "learning_rate": 1.1414103688267756e-06, + "logits/chosen": -0.6622621417045593, + "logits/rejected": -0.5898221135139465, + "logps/chosen": -1.4044809341430664, + "logps/rejected": -2.0266287326812744, + "loss": 0.9957, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4044809341430664, + "rewards/margins": 0.6221475601196289, + "rewards/rejected": -2.0266287326812744, + "sft_loss": 1.4509638547897339, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 11.18927324808781, + "learning_rate": 1.136875379025002e-06, + "logits/chosen": -0.6773689389228821, + "logits/rejected": -0.6406997442245483, + "logps/chosen": -1.3125277757644653, + "logps/rejected": -1.8904197216033936, + "loss": 0.9518, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3125277757644653, + "rewards/margins": 0.577892005443573, + "rewards/rejected": -1.8904197216033936, + "sft_loss": 1.3247658014297485, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 6.368596092649226, + "learning_rate": 1.132343912259894e-06, + "logits/chosen": -0.5879019498825073, + "logits/rejected": -0.5750831365585327, + "logps/chosen": -1.3526540994644165, + "logps/rejected": -1.9275623559951782, + "loss": 0.9745, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3526540994644165, + "rewards/margins": 0.5749083757400513, + "rewards/rejected": -1.9275623559951782, + "sft_loss": 1.3961443901062012, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 8.96244156761912, + "learning_rate": 1.1278160124957617e-06, + "logits/chosen": -0.6188081502914429, + "logits/rejected": -0.5407212972640991, + "logps/chosen": -1.3009461164474487, + "logps/rejected": -1.8492584228515625, + "loss": 0.9641, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3009461164474487, + "rewards/margins": 0.5483121871948242, + "rewards/rejected": -1.8492584228515625, + "sft_loss": 1.3755310773849487, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 7.912781929815597, + "learning_rate": 1.1232917236623085e-06, + "logits/chosen": -0.5891327857971191, + "logits/rejected": -0.5167615413665771, + "logps/chosen": -1.3643978834152222, + "logps/rejected": -1.8810253143310547, + "loss": 1.012, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3643978834152222, + "rewards/margins": 0.516627311706543, + "rewards/rejected": -1.8810253143310547, + "sft_loss": 1.4596291780471802, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 9.082992475981591, + "learning_rate": 1.1187710896542045e-06, + "logits/chosen": -0.7301222085952759, + "logits/rejected": -0.6070829033851624, + "logps/chosen": -1.3872734308242798, + "logps/rejected": -1.9226562976837158, + "loss": 0.9757, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3872734308242798, + "rewards/margins": 0.5353829264640808, + "rewards/rejected": -1.9226562976837158, + "sft_loss": 1.4221903085708618, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 9.516454972740851, + "learning_rate": 1.1142541543306603e-06, + "logits/chosen": -0.6769564747810364, + "logits/rejected": -0.565294623374939, + "logps/chosen": -1.4010789394378662, + "logps/rejected": -2.2788970470428467, + "loss": 0.9576, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4010789394378662, + "rewards/margins": 0.8778184056282043, + "rewards/rejected": -2.2788970470428467, + "sft_loss": 1.4577972888946533, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 11.30165208210217, + "learning_rate": 1.109740961515003e-06, + "logits/chosen": -0.6544634103775024, + "logits/rejected": -0.5810515880584717, + "logps/chosen": -1.4300733804702759, + "logps/rejected": -2.110495090484619, + "loss": 0.9866, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4300733804702759, + "rewards/margins": 0.680421769618988, + "rewards/rejected": -2.110495090484619, + "sft_loss": 1.4657166004180908, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 13.350266083433564, + "learning_rate": 1.1052315549942487e-06, + "logits/chosen": -0.668562650680542, + "logits/rejected": -0.6335813403129578, + "logps/chosen": -1.3456979990005493, + "logps/rejected": -2.0151984691619873, + "loss": 0.9477, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3456979990005493, + "rewards/margins": 0.6695006489753723, + "rewards/rejected": -2.0151984691619873, + "sft_loss": 1.3689993619918823, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 9.927984052359218, + "learning_rate": 1.100725978518679e-06, + "logits/chosen": -0.6724262833595276, + "logits/rejected": -0.5307937860488892, + "logps/chosen": -1.3404207229614258, + "logps/rejected": -1.971617341041565, + "loss": 0.9628, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3404207229614258, + "rewards/margins": 0.6311966180801392, + "rewards/rejected": -1.971617341041565, + "sft_loss": 1.348073959350586, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 9.773831218207205, + "learning_rate": 1.0962242758014169e-06, + "logits/chosen": -0.7042940855026245, + "logits/rejected": -0.6029044389724731, + "logps/chosen": -1.3506810665130615, + "logps/rejected": -2.053575038909912, + "loss": 0.9527, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3506810665130615, + "rewards/margins": 0.7028939127922058, + "rewards/rejected": -2.053575038909912, + "sft_loss": 1.4030991792678833, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 5.9940400713158, + "learning_rate": 1.091726490518002e-06, + "logits/chosen": -0.6179289817810059, + "logits/rejected": -0.4722275733947754, + "logps/chosen": -1.3348115682601929, + "logps/rejected": -1.9982001781463623, + "loss": 0.9753, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3348115682601929, + "rewards/margins": 0.6633888483047485, + "rewards/rejected": -1.9982001781463623, + "sft_loss": 1.4151699542999268, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 9.015576299378978, + "learning_rate": 1.0872326663059668e-06, + "logits/chosen": -0.6120736002922058, + "logits/rejected": -0.5604882836341858, + "logps/chosen": -1.3328831195831299, + "logps/rejected": -1.956390619277954, + "loss": 0.9804, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3328831195831299, + "rewards/margins": 0.6235072612762451, + "rewards/rejected": -1.956390619277954, + "sft_loss": 1.4265530109405518, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 7.096923003122541, + "learning_rate": 1.0827428467644132e-06, + "logits/chosen": -0.7125923037528992, + "logits/rejected": -0.6133583188056946, + "logps/chosen": -1.281874179840088, + "logps/rejected": -1.9087066650390625, + "loss": 0.9619, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.281874179840088, + "rewards/margins": 0.6268326044082642, + "rewards/rejected": -1.9087066650390625, + "sft_loss": 1.33371102809906, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 9.935078763524709, + "learning_rate": 1.0782570754535903e-06, + "logits/chosen": -0.6627184748649597, + "logits/rejected": -0.4770810008049011, + "logps/chosen": -1.3245737552642822, + "logps/rejected": -1.7778218984603882, + "loss": 0.9977, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3245737552642822, + "rewards/margins": 0.4532480239868164, + "rewards/rejected": -1.7778218984603882, + "sft_loss": 1.3664424419403076, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 8.945704039529609, + "learning_rate": 1.0737753958944712e-06, + "logits/chosen": -0.7426556348800659, + "logits/rejected": -0.5525585412979126, + "logps/chosen": -1.3033052682876587, + "logps/rejected": -1.8459545373916626, + "loss": 0.9616, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3033052682876587, + "rewards/margins": 0.5426491498947144, + "rewards/rejected": -1.8459545373916626, + "sft_loss": 1.3472154140472412, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 7.642566646415059, + "learning_rate": 1.0692978515683305e-06, + "logits/chosen": -0.603020966053009, + "logits/rejected": -0.5429368019104004, + "logps/chosen": -1.2958600521087646, + "logps/rejected": -1.7832367420196533, + "loss": 0.9581, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2958600521087646, + "rewards/margins": 0.4873766303062439, + "rewards/rejected": -1.7832367420196533, + "sft_loss": 1.272925615310669, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 7.510300680479713, + "learning_rate": 1.0648244859163227e-06, + "logits/chosen": -0.7047315239906311, + "logits/rejected": -0.5987603068351746, + "logps/chosen": -1.2614771127700806, + "logps/rejected": -1.9058072566986084, + "loss": 0.9537, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2614771127700806, + "rewards/margins": 0.6443303227424622, + "rewards/rejected": -1.9058072566986084, + "sft_loss": 1.29532790184021, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 8.516169618871453, + "learning_rate": 1.0603553423390612e-06, + "logits/chosen": -0.6130501627922058, + "logits/rejected": -0.561513364315033, + "logps/chosen": -1.3350327014923096, + "logps/rejected": -1.9226328134536743, + "loss": 0.9654, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3350327014923096, + "rewards/margins": 0.58760005235672, + "rewards/rejected": -1.9226328134536743, + "sft_loss": 1.3583624362945557, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 10.22264712287352, + "learning_rate": 1.0558904641961966e-06, + "logits/chosen": -0.6046770811080933, + "logits/rejected": -0.5800749659538269, + "logps/chosen": -1.2874317169189453, + "logps/rejected": -2.0039634704589844, + "loss": 0.9636, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2874317169189453, + "rewards/margins": 0.7165321111679077, + "rewards/rejected": -2.0039634704589844, + "sft_loss": 1.3508872985839844, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 6.834050275139067, + "learning_rate": 1.0514298948059961e-06, + "logits/chosen": -0.6144019961357117, + "logits/rejected": -0.5128809809684753, + "logps/chosen": -1.293882966041565, + "logps/rejected": -1.8975296020507812, + "loss": 0.9465, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.293882966041565, + "rewards/margins": 0.6036466956138611, + "rewards/rejected": -1.8975296020507812, + "sft_loss": 1.345824956893921, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 8.874431612379572, + "learning_rate": 1.0469736774449235e-06, + "logits/chosen": -0.5574930310249329, + "logits/rejected": -0.4778769612312317, + "logps/chosen": -1.2909395694732666, + "logps/rejected": -1.8894269466400146, + "loss": 0.9914, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2909395694732666, + "rewards/margins": 0.5984874963760376, + "rewards/rejected": -1.8894269466400146, + "sft_loss": 1.3335301876068115, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 6.988865773579564, + "learning_rate": 1.0425218553472193e-06, + "logits/chosen": -0.6077667474746704, + "logits/rejected": -0.5917859077453613, + "logps/chosen": -1.2812873125076294, + "logps/rejected": -1.9763896465301514, + "loss": 0.9109, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2812873125076294, + "rewards/margins": 0.6951022744178772, + "rewards/rejected": -1.9763896465301514, + "sft_loss": 1.332451581954956, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 7.559212482726521, + "learning_rate": 1.038074471704481e-06, + "logits/chosen": -0.5742273926734924, + "logits/rejected": -0.5489991307258606, + "logps/chosen": -1.417245864868164, + "logps/rejected": -2.0291554927825928, + "loss": 1.0054, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.417245864868164, + "rewards/margins": 0.6119096875190735, + "rewards/rejected": -2.0291554927825928, + "sft_loss": 1.5023760795593262, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 7.638506343278633, + "learning_rate": 1.033631569665244e-06, + "logits/chosen": -0.5891857743263245, + "logits/rejected": -0.535805344581604, + "logps/chosen": -1.3460886478424072, + "logps/rejected": -1.8601782321929932, + "loss": 1.012, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3460886478424072, + "rewards/margins": 0.5140894651412964, + "rewards/rejected": -1.8601782321929932, + "sft_loss": 1.3942530155181885, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 7.330430515610557, + "learning_rate": 1.0291931923345635e-06, + "logits/chosen": -0.730771541595459, + "logits/rejected": -0.5671923756599426, + "logps/chosen": -1.3746426105499268, + "logps/rejected": -2.0186867713928223, + "loss": 0.9488, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3746426105499268, + "rewards/margins": 0.6440441012382507, + "rewards/rejected": -2.0186867713928223, + "sft_loss": 1.3147354125976562, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 8.909033918227077, + "learning_rate": 1.0247593827735966e-06, + "logits/chosen": -0.6141990423202515, + "logits/rejected": -0.4768144190311432, + "logps/chosen": -1.372605800628662, + "logps/rejected": -2.222141742706299, + "loss": 0.9557, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.372605800628662, + "rewards/margins": 0.8495360612869263, + "rewards/rejected": -2.222141742706299, + "sft_loss": 1.3834865093231201, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": -0.34722036123275757, + "eval_logits/rejected": -0.29034626483917236, + "eval_logps/chosen": -1.4343228340148926, + "eval_logps/rejected": -1.9644039869308472, + "eval_loss": 1.0308538675308228, + "eval_rewards/accuracies": 0.6454005837440491, + "eval_rewards/chosen": -1.4343228340148926, + "eval_rewards/margins": 0.5300810933113098, + "eval_rewards/rejected": -1.9644039869308472, + "eval_runtime": 45.9074, + "eval_samples_per_second": 29.298, + "eval_sft_loss": 1.4319441318511963, + "eval_steps_per_second": 7.341, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 9.575565595168888, + "learning_rate": 1.0203301839991816e-06, + "logits/chosen": -0.6674664616584778, + "logits/rejected": -0.6533695459365845, + "logps/chosen": -1.3234946727752686, + "logps/rejected": -1.7638345956802368, + "loss": 1.0147, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3234946727752686, + "rewards/margins": 0.4403398633003235, + "rewards/rejected": -1.7638345956802368, + "sft_loss": 1.365957260131836, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 7.413916582912618, + "learning_rate": 1.0159056389834254e-06, + "logits/chosen": -0.6367475986480713, + "logits/rejected": -0.5504434704780579, + "logps/chosen": -1.320719838142395, + "logps/rejected": -1.9815881252288818, + "loss": 0.9514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.320719838142395, + "rewards/margins": 0.6608681678771973, + "rewards/rejected": -1.9815881252288818, + "sft_loss": 1.4096839427947998, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 6.010152257530943, + "learning_rate": 1.0114857906532827e-06, + "logits/chosen": -0.5671981573104858, + "logits/rejected": -0.4917985796928406, + "logps/chosen": -1.3791894912719727, + "logps/rejected": -1.947789192199707, + "loss": 0.9871, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3791894912719727, + "rewards/margins": 0.5685997009277344, + "rewards/rejected": -1.947789192199707, + "sft_loss": 1.3976194858551025, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 12.21225851564805, + "learning_rate": 1.0070706818901417e-06, + "logits/chosen": -0.6242167353630066, + "logits/rejected": -0.5752514600753784, + "logps/chosen": -1.3984695672988892, + "logps/rejected": -1.881219506263733, + "loss": 1.0316, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3984695672988892, + "rewards/margins": 0.4827499985694885, + "rewards/rejected": -1.881219506263733, + "sft_loss": 1.4380195140838623, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 6.902602838356776, + "learning_rate": 1.0026603555294073e-06, + "logits/chosen": -0.5850001573562622, + "logits/rejected": -0.5802913904190063, + "logps/chosen": -1.3279889822006226, + "logps/rejected": -1.8815221786499023, + "loss": 0.9881, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3279889822006226, + "rewards/margins": 0.5535333752632141, + "rewards/rejected": -1.8815221786499023, + "sft_loss": 1.4049403667449951, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 11.500033917000573, + "learning_rate": 9.982548543600843e-07, + "logits/chosen": -0.6528538465499878, + "logits/rejected": -0.6264825463294983, + "logps/chosen": -1.3921599388122559, + "logps/rejected": -1.9632352590560913, + "loss": 1.0585, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3921599388122559, + "rewards/margins": 0.5710754990577698, + "rewards/rejected": -1.9632352590560913, + "sft_loss": 1.5010713338851929, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 8.069222931489321, + "learning_rate": 9.93854221124365e-07, + "logits/chosen": -0.6453899145126343, + "logits/rejected": -0.5964998602867126, + "logps/chosen": -1.3313370943069458, + "logps/rejected": -1.9300304651260376, + "loss": 0.9682, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3313370943069458, + "rewards/margins": 0.5986935496330261, + "rewards/rejected": -1.9300304651260376, + "sft_loss": 1.3976194858551025, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 6.284563214144554, + "learning_rate": 9.894584985172121e-07, + "logits/chosen": -0.6314951777458191, + "logits/rejected": -0.5464446544647217, + "logps/chosen": -1.4279075860977173, + "logps/rejected": -1.8439185619354248, + "loss": 1.0198, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4279075860977173, + "rewards/margins": 0.41601109504699707, + "rewards/rejected": -1.8439185619354248, + "sft_loss": 1.4700605869293213, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 9.537923550198874, + "learning_rate": 9.850677291859458e-07, + "logits/chosen": -0.6765953898429871, + "logits/rejected": -0.5869105458259583, + "logps/chosen": -1.4551875591278076, + "logps/rejected": -1.8798748254776, + "loss": 1.0508, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4551875591278076, + "rewards/margins": 0.42468729615211487, + "rewards/rejected": -1.8798748254776, + "sft_loss": 1.5198633670806885, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 7.939924398451612, + "learning_rate": 9.806819557298295e-07, + "logits/chosen": -0.6868590712547302, + "logits/rejected": -0.6186624765396118, + "logps/chosen": -1.3275146484375, + "logps/rejected": -1.836299180984497, + "loss": 0.9731, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3275146484375, + "rewards/margins": 0.5087844729423523, + "rewards/rejected": -1.836299180984497, + "sft_loss": 1.386508584022522, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 8.054517035710493, + "learning_rate": 9.76301220699656e-07, + "logits/chosen": -0.6554325222969055, + "logits/rejected": -0.5593774914741516, + "logps/chosen": -1.4057365655899048, + "logps/rejected": -1.9774185419082642, + "loss": 0.9963, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4057365655899048, + "rewards/margins": 0.5716819167137146, + "rewards/rejected": -1.9774185419082642, + "sft_loss": 1.4154739379882812, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 11.357176551974492, + "learning_rate": 9.719255665973365e-07, + "logits/chosen": -0.6747775077819824, + "logits/rejected": -0.5881116390228271, + "logps/chosen": -1.3360207080841064, + "logps/rejected": -1.9081058502197266, + "loss": 0.984, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3360207080841064, + "rewards/margins": 0.5720850229263306, + "rewards/rejected": -1.9081058502197266, + "sft_loss": 1.3956830501556396, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 7.848314944555806, + "learning_rate": 9.675550358754857e-07, + "logits/chosen": -0.6074212193489075, + "logits/rejected": -0.5252271294593811, + "logps/chosen": -1.272979497909546, + "logps/rejected": -1.9367396831512451, + "loss": 0.9582, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.272979497909546, + "rewards/margins": 0.6637603640556335, + "rewards/rejected": -1.9367396831512451, + "sft_loss": 1.2792924642562866, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 8.533089389751797, + "learning_rate": 9.631896709370124e-07, + "logits/chosen": -0.6103826761245728, + "logits/rejected": -0.507732629776001, + "logps/chosen": -1.3028061389923096, + "logps/rejected": -2.1012096405029297, + "loss": 0.9137, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3028061389923096, + "rewards/margins": 0.7984035611152649, + "rewards/rejected": -2.1012096405029297, + "sft_loss": 1.4023349285125732, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 6.983088917340356, + "learning_rate": 9.588295141347055e-07, + "logits/chosen": -0.6318444013595581, + "logits/rejected": -0.529419481754303, + "logps/chosen": -1.4596143960952759, + "logps/rejected": -2.2197556495666504, + "loss": 1.0007, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4596143960952759, + "rewards/margins": 0.7601410746574402, + "rewards/rejected": -2.2197556495666504, + "sft_loss": 1.4763944149017334, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 9.347057160760839, + "learning_rate": 9.544746077708263e-07, + "logits/chosen": -0.6308041214942932, + "logits/rejected": -0.5474078059196472, + "logps/chosen": -1.2283879518508911, + "logps/rejected": -1.8275114297866821, + "loss": 0.9208, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2283879518508911, + "rewards/margins": 0.5991234183311462, + "rewards/rejected": -1.8275114297866821, + "sft_loss": 1.294682264328003, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 8.534486415963926, + "learning_rate": 9.50124994096695e-07, + "logits/chosen": -0.6402637362480164, + "logits/rejected": -0.5865155458450317, + "logps/chosen": -1.3168342113494873, + "logps/rejected": -1.9747432470321655, + "loss": 0.975, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3168342113494873, + "rewards/margins": 0.6579092144966125, + "rewards/rejected": -1.9747432470321655, + "sft_loss": 1.383578896522522, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 7.607105121263159, + "learning_rate": 9.457807153122826e-07, + "logits/chosen": -0.5896097421646118, + "logits/rejected": -0.4782601296901703, + "logps/chosen": -1.3596460819244385, + "logps/rejected": -2.096602439880371, + "loss": 0.9529, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3596460819244385, + "rewards/margins": 0.7369564771652222, + "rewards/rejected": -2.096602439880371, + "sft_loss": 1.352794885635376, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 8.683135121423115, + "learning_rate": 9.41441813565801e-07, + "logits/chosen": -0.5771075487136841, + "logits/rejected": -0.5605219006538391, + "logps/chosen": -1.4253747463226318, + "logps/rejected": -2.057281970977783, + "loss": 1.0184, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4253747463226318, + "rewards/margins": 0.6319074034690857, + "rewards/rejected": -2.057281970977783, + "sft_loss": 1.5130159854888916, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 8.479150683466232, + "learning_rate": 9.371083309532938e-07, + "logits/chosen": -0.5546612739562988, + "logits/rejected": -0.47140711545944214, + "logps/chosen": -1.3317207098007202, + "logps/rejected": -1.954064130783081, + "loss": 0.9682, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3317207098007202, + "rewards/margins": 0.6223434805870056, + "rewards/rejected": -1.954064130783081, + "sft_loss": 1.3673937320709229, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 7.173261243532672, + "learning_rate": 9.327803095182284e-07, + "logits/chosen": -0.6001571416854858, + "logits/rejected": -0.5562934875488281, + "logps/chosen": -1.3848721981048584, + "logps/rejected": -2.006413698196411, + "loss": 0.963, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3848721981048584, + "rewards/margins": 0.6215416193008423, + "rewards/rejected": -2.006413698196411, + "sft_loss": 1.3758659362792969, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 9.639346443973118, + "learning_rate": 9.28457791251088e-07, + "logits/chosen": -0.4836387634277344, + "logits/rejected": -0.45512452721595764, + "logps/chosen": -1.3742034435272217, + "logps/rejected": -1.8803613185882568, + "loss": 1.0281, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3742034435272217, + "rewards/margins": 0.506157636642456, + "rewards/rejected": -1.8803613185882568, + "sft_loss": 1.4473121166229248, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 6.039721125610725, + "learning_rate": 9.241408180889638e-07, + "logits/chosen": -0.5536881685256958, + "logits/rejected": -0.5073939561843872, + "logps/chosen": -1.3740195035934448, + "logps/rejected": -1.9659255743026733, + "loss": 0.9995, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3740195035934448, + "rewards/margins": 0.5919061899185181, + "rewards/rejected": -1.9659255743026733, + "sft_loss": 1.4292535781860352, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 6.770977619657827, + "learning_rate": 9.198294319151478e-07, + "logits/chosen": -0.5691328644752502, + "logits/rejected": -0.5202389359474182, + "logps/chosen": -1.3304475545883179, + "logps/rejected": -1.7927255630493164, + "loss": 0.9824, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3304475545883179, + "rewards/margins": 0.462277889251709, + "rewards/rejected": -1.7927255630493164, + "sft_loss": 1.3431183099746704, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 7.303791106226416, + "learning_rate": 9.155236745587279e-07, + "logits/chosen": -0.6500542759895325, + "logits/rejected": -0.5855103135108948, + "logps/chosen": -1.3462293148040771, + "logps/rejected": -1.933521032333374, + "loss": 0.9937, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3462293148040771, + "rewards/margins": 0.5872918367385864, + "rewards/rejected": -1.933521032333374, + "sft_loss": 1.4082921743392944, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 8.400362850544338, + "learning_rate": 9.112235877941808e-07, + "logits/chosen": -0.5426613688468933, + "logits/rejected": -0.44646692276000977, + "logps/chosen": -1.2635165452957153, + "logps/rejected": -1.8674116134643555, + "loss": 0.9185, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2635165452957153, + "rewards/margins": 0.6038950085639954, + "rewards/rejected": -1.8674116134643555, + "sft_loss": 1.2890541553497314, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 7.477633679990428, + "learning_rate": 9.069292133409672e-07, + "logits/chosen": -0.5348911881446838, + "logits/rejected": -0.48282891511917114, + "logps/chosen": -1.4278972148895264, + "logps/rejected": -1.913153886795044, + "loss": 1.0657, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.4278972148895264, + "rewards/margins": 0.48525673151016235, + "rewards/rejected": -1.913153886795044, + "sft_loss": 1.4490896463394165, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 9.063452365996959, + "learning_rate": 9.026405928631269e-07, + "logits/chosen": -0.5262424349784851, + "logits/rejected": -0.5206266641616821, + "logps/chosen": -1.3640168905258179, + "logps/rejected": -1.8886162042617798, + "loss": 0.9904, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3640168905258179, + "rewards/margins": 0.5245994329452515, + "rewards/rejected": -1.8886162042617798, + "sft_loss": 1.3758842945098877, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 5.506743461924594, + "learning_rate": 8.983577679688745e-07, + "logits/chosen": -0.6251953840255737, + "logits/rejected": -0.56153804063797, + "logps/chosen": -1.3155872821807861, + "logps/rejected": -2.001924991607666, + "loss": 0.9253, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3155872821807861, + "rewards/margins": 0.6863377690315247, + "rewards/rejected": -2.001924991607666, + "sft_loss": 1.3774052858352661, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 7.6386048683501775, + "learning_rate": 8.940807802101961e-07, + "logits/chosen": -0.6785083413124084, + "logits/rejected": -0.6239103674888611, + "logps/chosen": -1.1901180744171143, + "logps/rejected": -1.972402811050415, + "loss": 0.8635, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1901180744171143, + "rewards/margins": 0.7822847366333008, + "rewards/rejected": -1.972402811050415, + "sft_loss": 1.2562397718429565, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 9.068439135923542, + "learning_rate": 8.898096710824455e-07, + "logits/chosen": -0.6381465196609497, + "logits/rejected": -0.5730241537094116, + "logps/chosen": -1.2574265003204346, + "logps/rejected": -2.136043071746826, + "loss": 0.9042, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2574265003204346, + "rewards/margins": 0.8786169290542603, + "rewards/rejected": -2.136043071746826, + "sft_loss": 1.3654946088790894, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 5.629269515593526, + "learning_rate": 8.855444820239421e-07, + "logits/chosen": -0.6984506845474243, + "logits/rejected": -0.7039046883583069, + "logps/chosen": -1.2816941738128662, + "logps/rejected": -2.23903226852417, + "loss": 0.8963, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2816941738128662, + "rewards/margins": 0.9573379755020142, + "rewards/rejected": -2.23903226852417, + "sft_loss": 1.3613649606704712, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 6.517730028677668, + "learning_rate": 8.812852544155691e-07, + "logits/chosen": -0.6268309950828552, + "logits/rejected": -0.49097996950149536, + "logps/chosen": -1.2899795770645142, + "logps/rejected": -2.2022030353546143, + "loss": 0.8829, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2899795770645142, + "rewards/margins": 0.9122235178947449, + "rewards/rejected": -2.2022030353546143, + "sft_loss": 1.3458114862442017, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 10.89228362918389, + "learning_rate": 8.770320295803714e-07, + "logits/chosen": -0.713524580001831, + "logits/rejected": -0.6023358106613159, + "logps/chosen": -1.2371385097503662, + "logps/rejected": -2.269756317138672, + "loss": 0.8539, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2371385097503662, + "rewards/margins": 1.0326178073883057, + "rewards/rejected": -2.269756317138672, + "sft_loss": 1.3002526760101318, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 9.872604944685474, + "learning_rate": 8.727848487831545e-07, + "logits/chosen": -0.6570896506309509, + "logits/rejected": -0.6487444639205933, + "logps/chosen": -1.2803922891616821, + "logps/rejected": -2.118070602416992, + "loss": 0.8801, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2803922891616821, + "rewards/margins": 0.8376782536506653, + "rewards/rejected": -2.118070602416992, + "sft_loss": 1.3236936330795288, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 7.965795082559047, + "learning_rate": 8.685437532300863e-07, + "logits/chosen": -0.6047377586364746, + "logits/rejected": -0.6088879704475403, + "logps/chosen": -1.2858202457427979, + "logps/rejected": -2.0399012565612793, + "loss": 0.9426, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2858202457427979, + "rewards/margins": 0.7540808916091919, + "rewards/rejected": -2.0399012565612793, + "sft_loss": 1.3772176504135132, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 8.500820535507295, + "learning_rate": 8.64308784068293e-07, + "logits/chosen": -0.6436256170272827, + "logits/rejected": -0.5537455677986145, + "logps/chosen": -1.3402982950210571, + "logps/rejected": -2.2292492389678955, + "loss": 0.8799, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3402982950210571, + "rewards/margins": 0.8889509439468384, + "rewards/rejected": -2.2292492389678955, + "sft_loss": 1.3502360582351685, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 3.8693780171767522, + "learning_rate": 8.600799823854655e-07, + "logits/chosen": -0.6947168111801147, + "logits/rejected": -0.5738447904586792, + "logps/chosen": -1.2864720821380615, + "logps/rejected": -2.260383129119873, + "loss": 0.8736, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2864720821380615, + "rewards/margins": 0.9739111065864563, + "rewards/rejected": -2.260383129119873, + "sft_loss": 1.3739585876464844, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 8.208137567928274, + "learning_rate": 8.558573892094547e-07, + "logits/chosen": -0.6459519863128662, + "logits/rejected": -0.6449599862098694, + "logps/chosen": -1.2557079792022705, + "logps/rejected": -1.9562523365020752, + "loss": 0.9317, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2557079792022705, + "rewards/margins": 0.7005443572998047, + "rewards/rejected": -1.9562523365020752, + "sft_loss": 1.3796627521514893, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 15.685924831899001, + "learning_rate": 8.516410455078793e-07, + "logits/chosen": -0.6611881256103516, + "logits/rejected": -0.5716621279716492, + "logps/chosen": -1.3278257846832275, + "logps/rejected": -2.2204842567443848, + "loss": 0.9066, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3278257846832275, + "rewards/margins": 0.8926587104797363, + "rewards/rejected": -2.2204842567443848, + "sft_loss": 1.4301295280456543, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 9.902269917183073, + "learning_rate": 8.474309921877238e-07, + "logits/chosen": -0.6370423436164856, + "logits/rejected": -0.5673704147338867, + "logps/chosen": -1.2646716833114624, + "logps/rejected": -2.070289373397827, + "loss": 0.8929, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2646716833114624, + "rewards/margins": 0.8056178092956543, + "rewards/rejected": -2.070289373397827, + "sft_loss": 1.324689269065857, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 10.820319319966085, + "learning_rate": 8.432272700949452e-07, + "logits/chosen": -0.590074896812439, + "logits/rejected": -0.5352758765220642, + "logps/chosen": -1.3020083904266357, + "logps/rejected": -2.3446125984191895, + "loss": 0.8485, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.3020083904266357, + "rewards/margins": 1.042603850364685, + "rewards/rejected": -2.3446125984191895, + "sft_loss": 1.3151377439498901, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 8.07034734287858, + "learning_rate": 8.390299200140712e-07, + "logits/chosen": -0.7658069729804993, + "logits/rejected": -0.6765872836112976, + "logps/chosen": -1.3506486415863037, + "logps/rejected": -2.1507608890533447, + "loss": 0.8976, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3506486415863037, + "rewards/margins": 0.8001121282577515, + "rewards/rejected": -2.1507608890533447, + "sft_loss": 1.339355707168579, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 7.603117365844863, + "learning_rate": 8.348389826678129e-07, + "logits/chosen": -0.7368592619895935, + "logits/rejected": -0.582007110118866, + "logps/chosen": -1.3428410291671753, + "logps/rejected": -2.237255811691284, + "loss": 0.9049, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3428410291671753, + "rewards/margins": 0.8944147229194641, + "rewards/rejected": -2.237255811691284, + "sft_loss": 1.389864206314087, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 9.380763525185147, + "learning_rate": 8.306544987166615e-07, + "logits/chosen": -0.6831952333450317, + "logits/rejected": -0.6276777982711792, + "logps/chosen": -1.2772094011306763, + "logps/rejected": -2.1352391242980957, + "loss": 0.8712, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2772094011306763, + "rewards/margins": 0.8580294847488403, + "rewards/rejected": -2.1352391242980957, + "sft_loss": 1.344315767288208, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 9.07721790035139, + "learning_rate": 8.264765087584998e-07, + "logits/chosen": -0.7254881858825684, + "logits/rejected": -0.6233684420585632, + "logps/chosen": -1.369759202003479, + "logps/rejected": -2.263784408569336, + "loss": 0.9069, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.369759202003479, + "rewards/margins": 0.8940251469612122, + "rewards/rejected": -2.263784408569336, + "sft_loss": 1.4023691415786743, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 5.792245981917584, + "learning_rate": 8.223050533282033e-07, + "logits/chosen": -0.6422053575515747, + "logits/rejected": -0.5379607677459717, + "logps/chosen": -1.3162529468536377, + "logps/rejected": -2.225130558013916, + "loss": 0.8708, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3162529468536377, + "rewards/margins": 0.9088780283927917, + "rewards/rejected": -2.225130558013916, + "sft_loss": 1.3632615804672241, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 7.190187729992475, + "learning_rate": 8.181401728972522e-07, + "logits/chosen": -0.6327613592147827, + "logits/rejected": -0.5678998231887817, + "logps/chosen": -1.2547208070755005, + "logps/rejected": -2.2098288536071777, + "loss": 0.8403, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2547208070755005, + "rewards/margins": 0.9551082849502563, + "rewards/rejected": -2.2098288536071777, + "sft_loss": 1.2958214282989502, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 8.609904902907434, + "learning_rate": 8.139819078733338e-07, + "logits/chosen": -0.7823083996772766, + "logits/rejected": -0.6090282201766968, + "logps/chosen": -1.3866506814956665, + "logps/rejected": -2.327561616897583, + "loss": 0.8912, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3866506814956665, + "rewards/margins": 0.940910816192627, + "rewards/rejected": -2.327561616897583, + "sft_loss": 1.4194447994232178, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 11.17797594329939, + "learning_rate": 8.098302985999547e-07, + "logits/chosen": -0.6843757033348083, + "logits/rejected": -0.5614744424819946, + "logps/chosen": -1.3497182130813599, + "logps/rejected": -2.1061267852783203, + "loss": 0.9361, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3497182130813599, + "rewards/margins": 0.7564086318016052, + "rewards/rejected": -2.1061267852783203, + "sft_loss": 1.395309567451477, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 7.9813576056899125, + "learning_rate": 8.056853853560447e-07, + "logits/chosen": -0.6499954462051392, + "logits/rejected": -0.5020272731781006, + "logps/chosen": -1.2980072498321533, + "logps/rejected": -2.4210283756256104, + "loss": 0.842, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2980072498321533, + "rewards/margins": 1.123020887374878, + "rewards/rejected": -2.4210283756256104, + "sft_loss": 1.3265023231506348, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 6.769947516647681, + "learning_rate": 8.015472083555717e-07, + "logits/chosen": -0.6723402142524719, + "logits/rejected": -0.5541272163391113, + "logps/chosen": -1.279719591140747, + "logps/rejected": -2.26812744140625, + "loss": 0.8635, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.279719591140747, + "rewards/margins": 0.988408088684082, + "rewards/rejected": -2.26812744140625, + "sft_loss": 1.2970294952392578, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 8.99183976486184, + "learning_rate": 7.974158077471461e-07, + "logits/chosen": -0.788475513458252, + "logits/rejected": -0.667589008808136, + "logps/chosen": -1.3984959125518799, + "logps/rejected": -2.200364589691162, + "loss": 0.9518, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3984959125518799, + "rewards/margins": 0.8018687963485718, + "rewards/rejected": -2.200364589691162, + "sft_loss": 1.4584678411483765, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 6.984993509609364, + "learning_rate": 7.932912236136356e-07, + "logits/chosen": -0.6701870560646057, + "logits/rejected": -0.6589460968971252, + "logps/chosen": -1.1804810762405396, + "logps/rejected": -2.028503894805908, + "loss": 0.8487, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.1804810762405396, + "rewards/margins": 0.8480230569839478, + "rewards/rejected": -2.028503894805908, + "sft_loss": 1.2543269395828247, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 8.13663272463278, + "learning_rate": 7.891734959717726e-07, + "logits/chosen": -0.6010066270828247, + "logits/rejected": -0.5039754509925842, + "logps/chosen": -1.330596923828125, + "logps/rejected": -2.231889486312866, + "loss": 0.8742, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.330596923828125, + "rewards/margins": 0.9012928009033203, + "rewards/rejected": -2.231889486312866, + "sft_loss": 1.4094765186309814, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 7.834614174863811, + "learning_rate": 7.850626647717698e-07, + "logits/chosen": -0.6204323768615723, + "logits/rejected": -0.49028873443603516, + "logps/chosen": -1.2041785717010498, + "logps/rejected": -2.15822696685791, + "loss": 0.8209, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2041785717010498, + "rewards/margins": 0.9540484547615051, + "rewards/rejected": -2.15822696685791, + "sft_loss": 1.2499016523361206, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 6.98029199111734, + "learning_rate": 7.809587698969282e-07, + "logits/chosen": -0.5861740112304688, + "logits/rejected": -0.4753715395927429, + "logps/chosen": -1.2556285858154297, + "logps/rejected": -2.299781560897827, + "loss": 0.857, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2556285858154297, + "rewards/margins": 1.0441529750823975, + "rewards/rejected": -2.299781560897827, + "sft_loss": 1.3279292583465576, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 8.151958862375524, + "learning_rate": 7.768618511632555e-07, + "logits/chosen": -0.5082379579544067, + "logits/rejected": -0.42794299125671387, + "logps/chosen": -1.3039039373397827, + "logps/rejected": -2.215757131576538, + "loss": 0.9311, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3039039373397827, + "rewards/margins": 0.9118531942367554, + "rewards/rejected": -2.215757131576538, + "sft_loss": 1.3952258825302124, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 8.039789806789473, + "learning_rate": 7.727719483190737e-07, + "logits/chosen": -0.602218508720398, + "logits/rejected": -0.3982509970664978, + "logps/chosen": -1.3197612762451172, + "logps/rejected": -2.207941770553589, + "loss": 0.898, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3197612762451172, + "rewards/margins": 0.8881803750991821, + "rewards/rejected": -2.207941770553589, + "sft_loss": 1.3508869409561157, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 8.145725584579546, + "learning_rate": 7.686891010446394e-07, + "logits/chosen": -0.46284013986587524, + "logits/rejected": -0.47182974219322205, + "logps/chosen": -1.3199361562728882, + "logps/rejected": -2.0793938636779785, + "loss": 0.9104, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3199361562728882, + "rewards/margins": 0.7594578266143799, + "rewards/rejected": -2.0793938636779785, + "sft_loss": 1.3835678100585938, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 5.976896677469458, + "learning_rate": 7.646133489517535e-07, + "logits/chosen": -0.5105876922607422, + "logits/rejected": -0.42746657133102417, + "logps/chosen": -1.323020577430725, + "logps/rejected": -2.170231819152832, + "loss": 0.8989, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.323020577430725, + "rewards/margins": 0.8472112417221069, + "rewards/rejected": -2.170231819152832, + "sft_loss": 1.3276392221450806, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 6.634770698723937, + "learning_rate": 7.605447315833821e-07, + "logits/chosen": -0.4753246307373047, + "logits/rejected": -0.3948332965373993, + "logps/chosen": -1.2177543640136719, + "logps/rejected": -2.0822913646698, + "loss": 0.8623, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2177543640136719, + "rewards/margins": 0.8645371198654175, + "rewards/rejected": -2.0822913646698, + "sft_loss": 1.2676682472229004, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 9.030628197865246, + "learning_rate": 7.564832884132672e-07, + "logits/chosen": -0.5891907215118408, + "logits/rejected": -0.4459839463233948, + "logps/chosen": -1.3624029159545898, + "logps/rejected": -2.277984380722046, + "loss": 0.9139, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3624029159545898, + "rewards/margins": 0.9155814051628113, + "rewards/rejected": -2.277984380722046, + "sft_loss": 1.4179056882858276, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 10.064942364992636, + "learning_rate": 7.524290588455499e-07, + "logits/chosen": -0.5865086913108826, + "logits/rejected": -0.4551008641719818, + "logps/chosen": -1.357023000717163, + "logps/rejected": -2.522416591644287, + "loss": 0.8519, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.357023000717163, + "rewards/margins": 1.165393590927124, + "rewards/rejected": -2.522416591644287, + "sft_loss": 1.3542098999023438, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 10.286245053566672, + "learning_rate": 7.483820822143816e-07, + "logits/chosen": -0.6092128157615662, + "logits/rejected": -0.5239665508270264, + "logps/chosen": -1.2598596811294556, + "logps/rejected": -2.266526937484741, + "loss": 0.8794, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2598596811294556, + "rewards/margins": 1.0066674947738647, + "rewards/rejected": -2.266526937484741, + "sft_loss": 1.3254220485687256, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 5.140048064303052, + "learning_rate": 7.443423977835487e-07, + "logits/chosen": -0.6823610663414001, + "logits/rejected": -0.576682984828949, + "logps/chosen": -1.3115248680114746, + "logps/rejected": -2.282677412033081, + "loss": 0.8606, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3115248680114746, + "rewards/margins": 0.971152663230896, + "rewards/rejected": -2.282677412033081, + "sft_loss": 1.3290514945983887, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 9.831200068099484, + "learning_rate": 7.403100447460861e-07, + "logits/chosen": -0.5606423020362854, + "logits/rejected": -0.5006273984909058, + "logps/chosen": -1.305371642112732, + "logps/rejected": -2.273237943649292, + "loss": 0.8726, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.305371642112732, + "rewards/margins": 0.9678661227226257, + "rewards/rejected": -2.273237943649292, + "sft_loss": 1.306117296218872, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 10.618972493058113, + "learning_rate": 7.36285062223902e-07, + "logits/chosen": -0.5751175880432129, + "logits/rejected": -0.5403181910514832, + "logps/chosen": -1.2422279119491577, + "logps/rejected": -2.406665086746216, + "loss": 0.8025, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2422279119491577, + "rewards/margins": 1.1644370555877686, + "rewards/rejected": -2.406665086746216, + "sft_loss": 1.2548398971557617, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 8.628247938136461, + "learning_rate": 7.322674892673931e-07, + "logits/chosen": -0.5670984983444214, + "logits/rejected": -0.4008113741874695, + "logps/chosen": -1.3559355735778809, + "logps/rejected": -2.112020969390869, + "loss": 0.9507, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3559355735778809, + "rewards/margins": 0.7560855746269226, + "rewards/rejected": -2.112020969390869, + "sft_loss": 1.4266374111175537, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 7.079779269175077, + "learning_rate": 7.282573648550709e-07, + "logits/chosen": -0.48685067892074585, + "logits/rejected": -0.36240923404693604, + "logps/chosen": -1.3060166835784912, + "logps/rejected": -2.1700565814971924, + "loss": 0.9002, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3060166835784912, + "rewards/margins": 0.8640398979187012, + "rewards/rejected": -2.1700565814971924, + "sft_loss": 1.3409571647644043, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 7.646818085113159, + "learning_rate": 7.242547278931792e-07, + "logits/chosen": -0.6195932030677795, + "logits/rejected": -0.5592155456542969, + "logps/chosen": -1.2778809070587158, + "logps/rejected": -2.3017425537109375, + "loss": 0.8653, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2778809070587158, + "rewards/margins": 1.0238616466522217, + "rewards/rejected": -2.3017425537109375, + "sft_loss": 1.3358291387557983, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 9.883662610666288, + "learning_rate": 7.202596172153203e-07, + "logits/chosen": -0.525765597820282, + "logits/rejected": -0.4359908103942871, + "logps/chosen": -1.3174402713775635, + "logps/rejected": -2.382549524307251, + "loss": 0.8847, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3174402713775635, + "rewards/margins": 1.0651090145111084, + "rewards/rejected": -2.382549524307251, + "sft_loss": 1.4014842510223389, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 8.474083790065452, + "learning_rate": 7.162720715820742e-07, + "logits/chosen": -0.5002860426902771, + "logits/rejected": -0.4082806706428528, + "logps/chosen": -1.2493045330047607, + "logps/rejected": -2.2874772548675537, + "loss": 0.8689, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2493045330047607, + "rewards/margins": 1.0381726026535034, + "rewards/rejected": -2.2874772548675537, + "sft_loss": 1.346387267112732, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 6.205278251934963, + "learning_rate": 7.122921296806278e-07, + "logits/chosen": -0.5613098740577698, + "logits/rejected": -0.48972979187965393, + "logps/chosen": -1.2905679941177368, + "logps/rejected": -2.313955068588257, + "loss": 0.8803, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2905679941177368, + "rewards/margins": 1.0233871936798096, + "rewards/rejected": -2.313955068588257, + "sft_loss": 1.3947936296463013, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 9.86803358657914, + "learning_rate": 7.083198301243937e-07, + "logits/chosen": -0.5123628973960876, + "logits/rejected": -0.43953627347946167, + "logps/chosen": -1.180465817451477, + "logps/rejected": -1.981055498123169, + "loss": 0.847, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.180465817451477, + "rewards/margins": 0.8005896806716919, + "rewards/rejected": -1.981055498123169, + "sft_loss": 1.2407500743865967, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 9.064455669725788, + "learning_rate": 7.043552114526395e-07, + "logits/chosen": -0.5958659052848816, + "logits/rejected": -0.53276127576828, + "logps/chosen": -1.1877778768539429, + "logps/rejected": -2.1918532848358154, + "loss": 0.8463, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1877778768539429, + "rewards/margins": 1.0040756464004517, + "rewards/rejected": -2.1918532848358154, + "sft_loss": 1.3280725479125977, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 10.715834332859933, + "learning_rate": 7.003983121301139e-07, + "logits/chosen": -0.6565994620323181, + "logits/rejected": -0.5570293664932251, + "logps/chosen": -1.2925097942352295, + "logps/rejected": -2.3577346801757812, + "loss": 0.8334, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2925097942352295, + "rewards/margins": 1.0652248859405518, + "rewards/rejected": -2.3577346801757812, + "sft_loss": 1.341284990310669, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 10.098791734156606, + "learning_rate": 6.964491705466704e-07, + "logits/chosen": -0.6845608949661255, + "logits/rejected": -0.5976250767707825, + "logps/chosen": -1.2707078456878662, + "logps/rejected": -2.3325119018554688, + "loss": 0.8549, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2707078456878662, + "rewards/margins": 1.0618040561676025, + "rewards/rejected": -2.3325119018554688, + "sft_loss": 1.322950839996338, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 8.970579954518575, + "learning_rate": 6.92507825016899e-07, + "logits/chosen": -0.7098767161369324, + "logits/rejected": -0.4670625329017639, + "logps/chosen": -1.2964619398117065, + "logps/rejected": -2.4015231132507324, + "loss": 0.8596, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2964619398117065, + "rewards/margins": 1.1050611734390259, + "rewards/rejected": -2.4015231132507324, + "sft_loss": 1.360012412071228, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 11.765582068507088, + "learning_rate": 6.885743137797502e-07, + "logits/chosen": -0.5854183435440063, + "logits/rejected": -0.5112401843070984, + "logps/chosen": -1.2789857387542725, + "logps/rejected": -2.4257848262786865, + "loss": 0.8559, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2789857387542725, + "rewards/margins": 1.1467993259429932, + "rewards/rejected": -2.4257848262786865, + "sft_loss": 1.372701644897461, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": -0.24339796602725983, + "eval_logits/rejected": -0.17608876526355743, + "eval_logps/chosen": -1.5362346172332764, + "eval_logps/rejected": -2.1634719371795654, + "eval_loss": 1.0419976711273193, + "eval_rewards/accuracies": 0.6550444960594177, + "eval_rewards/chosen": -1.5362346172332764, + "eval_rewards/margins": 0.6272372603416443, + "eval_rewards/rejected": -2.1634719371795654, + "eval_runtime": 43.6443, + "eval_samples_per_second": 30.817, + "eval_sft_loss": 1.4888213872909546, + "eval_steps_per_second": 7.722, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 7.152007906365384, + "learning_rate": 6.846486749981684e-07, + "logits/chosen": -0.5965894460678101, + "logits/rejected": -0.41736525297164917, + "logps/chosen": -1.4271074533462524, + "logps/rejected": -2.2944045066833496, + "loss": 0.9183, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4271074533462524, + "rewards/margins": 0.8672970533370972, + "rewards/rejected": -2.2944045066833496, + "sft_loss": 1.4213173389434814, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 14.90474531035357, + "learning_rate": 6.807309467587173e-07, + "logits/chosen": -0.6228991150856018, + "logits/rejected": -0.5492960214614868, + "logps/chosen": -1.3263002634048462, + "logps/rejected": -2.23538875579834, + "loss": 0.8993, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3263002634048462, + "rewards/margins": 0.9090884923934937, + "rewards/rejected": -2.23538875579834, + "sft_loss": 1.3850328922271729, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 10.686015445659427, + "learning_rate": 6.768211670712146e-07, + "logits/chosen": -0.6297835111618042, + "logits/rejected": -0.38724324107170105, + "logps/chosen": -1.3609362840652466, + "logps/rejected": -2.22047758102417, + "loss": 0.9235, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3609362840652466, + "rewards/margins": 0.8595415353775024, + "rewards/rejected": -2.22047758102417, + "sft_loss": 1.393182635307312, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 8.16760442315204, + "learning_rate": 6.729193738683589e-07, + "logits/chosen": -0.7085089683532715, + "logits/rejected": -0.60541832447052, + "logps/chosen": -1.3713706731796265, + "logps/rejected": -2.3267054557800293, + "loss": 0.9264, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3713706731796265, + "rewards/margins": 0.9553349614143372, + "rewards/rejected": -2.3267054557800293, + "sft_loss": 1.4503589868545532, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 6.910080249548427, + "learning_rate": 6.690256050053652e-07, + "logits/chosen": -0.6153056025505066, + "logits/rejected": -0.5264173746109009, + "logps/chosen": -1.2847819328308105, + "logps/rejected": -2.305784225463867, + "loss": 0.8757, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2847819328308105, + "rewards/margins": 1.021002173423767, + "rewards/rejected": -2.305784225463867, + "sft_loss": 1.317858099937439, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 7.708333768885909, + "learning_rate": 6.651398982595967e-07, + "logits/chosen": -0.632286012172699, + "logits/rejected": -0.5932300090789795, + "logps/chosen": -1.2660969495773315, + "logps/rejected": -2.2909035682678223, + "loss": 0.8702, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2660969495773315, + "rewards/margins": 1.0248066186904907, + "rewards/rejected": -2.2909035682678223, + "sft_loss": 1.3490211963653564, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 11.974974879307897, + "learning_rate": 6.612622913301961e-07, + "logits/chosen": -0.6110135316848755, + "logits/rejected": -0.6037889719009399, + "logps/chosen": -1.2175084352493286, + "logps/rejected": -1.889762282371521, + "loss": 0.9114, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2175084352493286, + "rewards/margins": 0.6722537875175476, + "rewards/rejected": -1.889762282371521, + "sft_loss": 1.3205323219299316, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 8.652725511123771, + "learning_rate": 6.573928218377243e-07, + "logits/chosen": -0.6040986776351929, + "logits/rejected": -0.6009663343429565, + "logps/chosen": -1.2341662645339966, + "logps/rejected": -2.081291675567627, + "loss": 0.8842, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2341662645339966, + "rewards/margins": 0.8471253514289856, + "rewards/rejected": -2.081291675567627, + "sft_loss": 1.2724459171295166, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 6.388012839682491, + "learning_rate": 6.5353152732379e-07, + "logits/chosen": -0.5803536772727966, + "logits/rejected": -0.47784996032714844, + "logps/chosen": -1.333279013633728, + "logps/rejected": -2.183581829071045, + "loss": 0.916, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.333279013633728, + "rewards/margins": 0.8503029942512512, + "rewards/rejected": -2.183581829071045, + "sft_loss": 1.4035253524780273, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 10.295310498808199, + "learning_rate": 6.496784452506907e-07, + "logits/chosen": -0.665248692035675, + "logits/rejected": -0.5683599710464478, + "logps/chosen": -1.3648693561553955, + "logps/rejected": -2.200554609298706, + "loss": 0.9444, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3648693561553955, + "rewards/margins": 0.8356854319572449, + "rewards/rejected": -2.200554609298706, + "sft_loss": 1.4416182041168213, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 7.63395836520522, + "learning_rate": 6.458336130010442e-07, + "logits/chosen": -0.573379397392273, + "logits/rejected": -0.5254855751991272, + "logps/chosen": -1.300022840499878, + "logps/rejected": -1.9362703561782837, + "loss": 0.8988, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.300022840499878, + "rewards/margins": 0.6362478137016296, + "rewards/rejected": -1.9362703561782837, + "sft_loss": 1.3450088500976562, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 8.165944085428556, + "learning_rate": 6.419970678774312e-07, + "logits/chosen": -0.4763668477535248, + "logits/rejected": -0.381841242313385, + "logps/chosen": -1.202392578125, + "logps/rejected": -2.0925676822662354, + "loss": 0.8646, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.202392578125, + "rewards/margins": 0.8901751637458801, + "rewards/rejected": -2.0925676822662354, + "sft_loss": 1.3081729412078857, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 7.609107903515956, + "learning_rate": 6.381688471020282e-07, + "logits/chosen": -0.6313573122024536, + "logits/rejected": -0.584338366985321, + "logps/chosen": -1.2770190238952637, + "logps/rejected": -2.2510018348693848, + "loss": 0.8755, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2770190238952637, + "rewards/margins": 0.9739829897880554, + "rewards/rejected": -2.2510018348693848, + "sft_loss": 1.3386409282684326, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 10.51751349012702, + "learning_rate": 6.34348987816251e-07, + "logits/chosen": -0.5648465156555176, + "logits/rejected": -0.36703091859817505, + "logps/chosen": -1.2451423406600952, + "logps/rejected": -2.3362300395965576, + "loss": 0.8612, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2451423406600952, + "rewards/margins": 1.0910876989364624, + "rewards/rejected": -2.3362300395965576, + "sft_loss": 1.3578553199768066, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 12.273680244934537, + "learning_rate": 6.3053752708039e-07, + "logits/chosen": -0.5989198684692383, + "logits/rejected": -0.4523433744907379, + "logps/chosen": -1.350265622138977, + "logps/rejected": -2.188004732131958, + "loss": 0.9194, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.350265622138977, + "rewards/margins": 0.8377388119697571, + "rewards/rejected": -2.188004732131958, + "sft_loss": 1.4103902578353882, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 5.712507138294748, + "learning_rate": 6.267345018732552e-07, + "logits/chosen": -0.6406753659248352, + "logits/rejected": -0.5366859436035156, + "logps/chosen": -1.3789926767349243, + "logps/rejected": -2.4511501789093018, + "loss": 0.9131, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3789926767349243, + "rewards/margins": 1.0721572637557983, + "rewards/rejected": -2.4511501789093018, + "sft_loss": 1.467935562133789, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 10.413751258895298, + "learning_rate": 6.229399490918126e-07, + "logits/chosen": -0.5594549775123596, + "logits/rejected": -0.5101550817489624, + "logps/chosen": -1.3116942644119263, + "logps/rejected": -2.1040501594543457, + "loss": 0.8984, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3116942644119263, + "rewards/margins": 0.7923555374145508, + "rewards/rejected": -2.1040501594543457, + "sft_loss": 1.3226666450500488, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 7.726743703110656, + "learning_rate": 6.19153905550831e-07, + "logits/chosen": -0.6734306216239929, + "logits/rejected": -0.5152979493141174, + "logps/chosen": -1.2853702306747437, + "logps/rejected": -2.1580288410186768, + "loss": 0.897, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2853702306747437, + "rewards/margins": 0.8726586103439331, + "rewards/rejected": -2.1580288410186768, + "sft_loss": 1.3541196584701538, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 8.947962461451352, + "learning_rate": 6.153764079825211e-07, + "logits/chosen": -0.6520611047744751, + "logits/rejected": -0.5935763120651245, + "logps/chosen": -1.3681317567825317, + "logps/rejected": -2.20524525642395, + "loss": 0.925, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3681317567825317, + "rewards/margins": 0.8371133804321289, + "rewards/rejected": -2.20524525642395, + "sft_loss": 1.4108130931854248, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 10.456483262678793, + "learning_rate": 6.116074930361803e-07, + "logits/chosen": -0.5813900828361511, + "logits/rejected": -0.47301873564720154, + "logps/chosen": -1.253137469291687, + "logps/rejected": -2.296706199645996, + "loss": 0.8651, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.253137469291687, + "rewards/margins": 1.043568730354309, + "rewards/rejected": -2.296706199645996, + "sft_loss": 1.3607757091522217, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 7.759076142826036, + "learning_rate": 6.078471972778388e-07, + "logits/chosen": -0.592880368232727, + "logits/rejected": -0.4213104844093323, + "logps/chosen": -1.3892344236373901, + "logps/rejected": -2.3780176639556885, + "loss": 0.8894, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3892344236373901, + "rewards/margins": 0.9887831807136536, + "rewards/rejected": -2.3780176639556885, + "sft_loss": 1.4068810939788818, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 13.227310541931292, + "learning_rate": 6.040955571899018e-07, + "logits/chosen": -0.601898729801178, + "logits/rejected": -0.44615617394447327, + "logps/chosen": -1.33761727809906, + "logps/rejected": -2.3596558570861816, + "loss": 0.9046, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.33761727809906, + "rewards/margins": 1.0220386981964111, + "rewards/rejected": -2.3596558570861816, + "sft_loss": 1.4018828868865967, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 11.048455514959556, + "learning_rate": 6.003526091707986e-07, + "logits/chosen": -0.5543791055679321, + "logits/rejected": -0.5159690976142883, + "logps/chosen": -1.307888150215149, + "logps/rejected": -2.3227555751800537, + "loss": 0.8555, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.307888150215149, + "rewards/margins": 1.0148675441741943, + "rewards/rejected": -2.3227555751800537, + "sft_loss": 1.3637077808380127, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 7.691464355723396, + "learning_rate": 5.966183895346264e-07, + "logits/chosen": -0.58518385887146, + "logits/rejected": -0.5370736122131348, + "logps/chosen": -1.245976209640503, + "logps/rejected": -2.251274585723877, + "loss": 0.8695, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.245976209640503, + "rewards/margins": 1.005298376083374, + "rewards/rejected": -2.251274585723877, + "sft_loss": 1.34295654296875, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 7.3169015153864585, + "learning_rate": 5.928929345108015e-07, + "logits/chosen": -0.651535153388977, + "logits/rejected": -0.4840888977050781, + "logps/chosen": -1.2679508924484253, + "logps/rejected": -2.3214924335479736, + "loss": 0.8589, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2679508924484253, + "rewards/margins": 1.053541660308838, + "rewards/rejected": -2.3214924335479736, + "sft_loss": 1.3484123945236206, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 13.563104283561207, + "learning_rate": 5.891762802437039e-07, + "logits/chosen": -0.5729442834854126, + "logits/rejected": -0.49847179651260376, + "logps/chosen": -1.315578579902649, + "logps/rejected": -2.3521034717559814, + "loss": 0.8863, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.315578579902649, + "rewards/margins": 1.036525011062622, + "rewards/rejected": -2.3521034717559814, + "sft_loss": 1.3842662572860718, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 8.362458070033808, + "learning_rate": 5.854684627923306e-07, + "logits/chosen": -0.5584360361099243, + "logits/rejected": -0.5975090265274048, + "logps/chosen": -1.3722999095916748, + "logps/rejected": -2.573744297027588, + "loss": 0.9023, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3722999095916748, + "rewards/margins": 1.2014447450637817, + "rewards/rejected": -2.573744297027588, + "sft_loss": 1.4072306156158447, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 7.472610360581948, + "learning_rate": 5.817695181299418e-07, + "logits/chosen": -0.6967195272445679, + "logits/rejected": -0.6487011313438416, + "logps/chosen": -1.2705659866333008, + "logps/rejected": -2.1925406455993652, + "loss": 0.879, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2705659866333008, + "rewards/margins": 0.9219745397567749, + "rewards/rejected": -2.1925406455993652, + "sft_loss": 1.3230068683624268, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 8.964792124855025, + "learning_rate": 5.780794821437158e-07, + "logits/chosen": -0.519437313079834, + "logits/rejected": -0.3859899342060089, + "logps/chosen": -1.3835476636886597, + "logps/rejected": -2.3953795433044434, + "loss": 0.8809, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3835476636886597, + "rewards/margins": 1.011831521987915, + "rewards/rejected": -2.3953795433044434, + "sft_loss": 1.3992650508880615, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 7.8812313935035405, + "learning_rate": 5.743983906343969e-07, + "logits/chosen": -0.5854192972183228, + "logits/rejected": -0.5049089193344116, + "logps/chosen": -1.2127283811569214, + "logps/rejected": -2.2633450031280518, + "loss": 0.8527, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2127283811569214, + "rewards/margins": 1.0506165027618408, + "rewards/rejected": -2.2633450031280518, + "sft_loss": 1.3281142711639404, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 8.011591459714527, + "learning_rate": 5.707262793159521e-07, + "logits/chosen": -0.5341406464576721, + "logits/rejected": -0.5728651881217957, + "logps/chosen": -1.2559373378753662, + "logps/rejected": -2.0911660194396973, + "loss": 0.8749, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2559373378753662, + "rewards/margins": 0.8352285623550415, + "rewards/rejected": -2.0911660194396973, + "sft_loss": 1.3055473566055298, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 12.086417724962727, + "learning_rate": 5.670631838152204e-07, + "logits/chosen": -0.5920090675354004, + "logits/rejected": -0.47571271657943726, + "logps/chosen": -1.3779784440994263, + "logps/rejected": -2.217038869857788, + "loss": 0.9123, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3779784440994263, + "rewards/margins": 0.8390604853630066, + "rewards/rejected": -2.217038869857788, + "sft_loss": 1.4312714338302612, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 7.473769846664818, + "learning_rate": 5.634091396715716e-07, + "logits/chosen": -0.6135045886039734, + "logits/rejected": -0.5422523021697998, + "logps/chosen": -1.309459924697876, + "logps/rejected": -2.377963066101074, + "loss": 0.8797, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.309459924697876, + "rewards/margins": 1.0685032606124878, + "rewards/rejected": -2.377963066101074, + "sft_loss": 1.3874919414520264, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 10.69968137640894, + "learning_rate": 5.59764182336557e-07, + "logits/chosen": -0.49654465913772583, + "logits/rejected": -0.4765149652957916, + "logps/chosen": -1.354604959487915, + "logps/rejected": -2.4720609188079834, + "loss": 0.8629, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.354604959487915, + "rewards/margins": 1.1174561977386475, + "rewards/rejected": -2.4720609188079834, + "sft_loss": 1.4114564657211304, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 10.193127728348049, + "learning_rate": 5.561283471735695e-07, + "logits/chosen": -0.5936630964279175, + "logits/rejected": -0.535660982131958, + "logps/chosen": -1.2989658117294312, + "logps/rejected": -2.1412882804870605, + "loss": 0.9114, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2989658117294312, + "rewards/margins": 0.8423227071762085, + "rewards/rejected": -2.1412882804870605, + "sft_loss": 1.340101957321167, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 18.238471325259738, + "learning_rate": 5.52501669457497e-07, + "logits/chosen": -0.6202064752578735, + "logits/rejected": -0.41418686509132385, + "logps/chosen": -1.2645350694656372, + "logps/rejected": -2.3794612884521484, + "loss": 0.845, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2645350694656372, + "rewards/margins": 1.1149260997772217, + "rewards/rejected": -2.3794612884521484, + "sft_loss": 1.3265830278396606, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 13.203807096035783, + "learning_rate": 5.488841843743833e-07, + "logits/chosen": -0.6738411784172058, + "logits/rejected": -0.6753481030464172, + "logps/chosen": -1.229512095451355, + "logps/rejected": -2.188352108001709, + "loss": 0.8635, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.229512095451355, + "rewards/margins": 0.958840012550354, + "rewards/rejected": -2.188352108001709, + "sft_loss": 1.2865744829177856, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 8.750089998855545, + "learning_rate": 5.452759270210839e-07, + "logits/chosen": -0.5226529836654663, + "logits/rejected": -0.4423566460609436, + "logps/chosen": -1.278633952140808, + "logps/rejected": -2.46871280670166, + "loss": 0.8591, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.278633952140808, + "rewards/margins": 1.1900789737701416, + "rewards/rejected": -2.46871280670166, + "sft_loss": 1.3546621799468994, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 6.64950755241882, + "learning_rate": 5.416769324049282e-07, + "logits/chosen": -0.729025661945343, + "logits/rejected": -0.6177822351455688, + "logps/chosen": -1.3424952030181885, + "logps/rejected": -2.1164371967315674, + "loss": 0.9672, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3424952030181885, + "rewards/margins": 0.7739418745040894, + "rewards/rejected": -2.1164371967315674, + "sft_loss": 1.4235026836395264, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 8.79179757288585, + "learning_rate": 5.38087235443377e-07, + "logits/chosen": -0.4805554747581482, + "logits/rejected": -0.525387704372406, + "logps/chosen": -1.3676626682281494, + "logps/rejected": -2.3163561820983887, + "loss": 0.8948, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3676626682281494, + "rewards/margins": 0.9486930966377258, + "rewards/rejected": -2.3163561820983887, + "sft_loss": 1.4133599996566772, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 5.380572243442791, + "learning_rate": 5.345068709636866e-07, + "logits/chosen": -0.6558794975280762, + "logits/rejected": -0.5935714244842529, + "logps/chosen": -1.278519630432129, + "logps/rejected": -2.2183380126953125, + "loss": 0.859, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.278519630432129, + "rewards/margins": 0.9398185610771179, + "rewards/rejected": -2.2183380126953125, + "sft_loss": 1.3346195220947266, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 7.582818350173143, + "learning_rate": 5.309358737025682e-07, + "logits/chosen": -0.6234208941459656, + "logits/rejected": -0.5458418726921082, + "logps/chosen": -1.3497518301010132, + "logps/rejected": -2.6105995178222656, + "loss": 0.8876, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3497518301010132, + "rewards/margins": 1.260847806930542, + "rewards/rejected": -2.6105995178222656, + "sft_loss": 1.4059553146362305, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 15.432277361630593, + "learning_rate": 5.273742783058537e-07, + "logits/chosen": -0.6282011270523071, + "logits/rejected": -0.5331791639328003, + "logps/chosen": -1.3267197608947754, + "logps/rejected": -2.370863676071167, + "loss": 0.902, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3267197608947754, + "rewards/margins": 1.044143795967102, + "rewards/rejected": -2.370863676071167, + "sft_loss": 1.3646594285964966, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 6.940650950295138, + "learning_rate": 5.23822119328157e-07, + "logits/chosen": -0.6763989329338074, + "logits/rejected": -0.5052696466445923, + "logps/chosen": -1.2935147285461426, + "logps/rejected": -2.270808458328247, + "loss": 0.8958, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2935147285461426, + "rewards/margins": 0.9772937893867493, + "rewards/rejected": -2.270808458328247, + "sft_loss": 1.3664653301239014, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 6.544634209943382, + "learning_rate": 5.202794312325399e-07, + "logits/chosen": -0.6599363684654236, + "logits/rejected": -0.4820350706577301, + "logps/chosen": -1.3575453758239746, + "logps/rejected": -2.430337429046631, + "loss": 0.8547, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.3575453758239746, + "rewards/margins": 1.0727920532226562, + "rewards/rejected": -2.430337429046631, + "sft_loss": 1.4071263074874878, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 7.936678377225664, + "learning_rate": 5.167462483901773e-07, + "logits/chosen": -0.6433524489402771, + "logits/rejected": -0.582925021648407, + "logps/chosen": -1.3367130756378174, + "logps/rejected": -2.375605344772339, + "loss": 0.8797, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3367130756378174, + "rewards/margins": 1.038892388343811, + "rewards/rejected": -2.375605344772339, + "sft_loss": 1.3651541471481323, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 7.015436901921431, + "learning_rate": 5.132226050800256e-07, + "logits/chosen": -0.5926527380943298, + "logits/rejected": -0.5132274627685547, + "logps/chosen": -1.3561756610870361, + "logps/rejected": -2.0783355236053467, + "loss": 0.9461, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3561756610870361, + "rewards/margins": 0.7221602201461792, + "rewards/rejected": -2.0783355236053467, + "sft_loss": 1.437430739402771, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 5.5716445207379515, + "learning_rate": 5.097085354884869e-07, + "logits/chosen": -0.5913428068161011, + "logits/rejected": -0.5225772857666016, + "logps/chosen": -1.2736221551895142, + "logps/rejected": -2.1757893562316895, + "loss": 0.8966, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2736221551895142, + "rewards/margins": 0.9021672010421753, + "rewards/rejected": -2.1757893562316895, + "sft_loss": 1.3726946115493774, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 9.07817146254204, + "learning_rate": 5.062040737090806e-07, + "logits/chosen": -0.655211329460144, + "logits/rejected": -0.5655419826507568, + "logps/chosen": -1.3266026973724365, + "logps/rejected": -2.2359845638275146, + "loss": 0.907, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3266026973724365, + "rewards/margins": 0.9093819856643677, + "rewards/rejected": -2.2359845638275146, + "sft_loss": 1.3721539974212646, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 10.049455974892844, + "learning_rate": 5.027092537421091e-07, + "logits/chosen": -0.6034508943557739, + "logits/rejected": -0.4758704602718353, + "logps/chosen": -1.3402130603790283, + "logps/rejected": -2.1698737144470215, + "loss": 0.9332, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3402130603790283, + "rewards/margins": 0.8296605944633484, + "rewards/rejected": -2.1698737144470215, + "sft_loss": 1.3776081800460815, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 9.900693966782814, + "learning_rate": 4.992241094943326e-07, + "logits/chosen": -0.6390854716300964, + "logits/rejected": -0.41629552841186523, + "logps/chosen": -1.3275277614593506, + "logps/rejected": -2.4523518085479736, + "loss": 0.8547, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3275277614593506, + "rewards/margins": 1.124824047088623, + "rewards/rejected": -2.4523518085479736, + "sft_loss": 1.3700711727142334, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 12.835616709525304, + "learning_rate": 4.957486747786342e-07, + "logits/chosen": -0.6146548986434937, + "logits/rejected": -0.5280757546424866, + "logps/chosen": -1.2478665113449097, + "logps/rejected": -2.1629269123077393, + "loss": 0.8487, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2478665113449097, + "rewards/margins": 0.9150603413581848, + "rewards/rejected": -2.1629269123077393, + "sft_loss": 1.2602465152740479, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 7.320178337679792, + "learning_rate": 4.922829833136984e-07, + "logits/chosen": -0.7337331771850586, + "logits/rejected": -0.5915257930755615, + "logps/chosen": -1.2824734449386597, + "logps/rejected": -2.310690402984619, + "loss": 0.8835, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2824734449386597, + "rewards/margins": 1.0282166004180908, + "rewards/rejected": -2.310690402984619, + "sft_loss": 1.3565456867218018, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 5.297260316677994, + "learning_rate": 4.888270687236773e-07, + "logits/chosen": -0.5888367891311646, + "logits/rejected": -0.3697664737701416, + "logps/chosen": -1.3420976400375366, + "logps/rejected": -2.446397304534912, + "loss": 0.862, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3420976400375366, + "rewards/margins": 1.104299783706665, + "rewards/rejected": -2.446397304534912, + "sft_loss": 1.3266279697418213, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 11.963344029150749, + "learning_rate": 4.853809645378709e-07, + "logits/chosen": -0.6401462554931641, + "logits/rejected": -0.5787710547447205, + "logps/chosen": -1.3975201845169067, + "logps/rejected": -2.3918261528015137, + "loss": 0.9225, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3975201845169067, + "rewards/margins": 0.9943059682846069, + "rewards/rejected": -2.3918261528015137, + "sft_loss": 1.4320608377456665, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 7.788506744129682, + "learning_rate": 4.81944704190396e-07, + "logits/chosen": -0.652570366859436, + "logits/rejected": -0.6069068312644958, + "logps/chosen": -1.2777897119522095, + "logps/rejected": -2.1783740520477295, + "loss": 0.8837, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2777897119522095, + "rewards/margins": 0.9005842208862305, + "rewards/rejected": -2.1783740520477295, + "sft_loss": 1.3518401384353638, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 11.72830078153376, + "learning_rate": 4.785183210198667e-07, + "logits/chosen": -0.5845471620559692, + "logits/rejected": -0.6229828000068665, + "logps/chosen": -1.2514235973358154, + "logps/rejected": -2.2576746940612793, + "loss": 0.8695, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2514235973358154, + "rewards/margins": 1.0062510967254639, + "rewards/rejected": -2.2576746940612793, + "sft_loss": 1.3402807712554932, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 14.698312110073433, + "learning_rate": 4.7510184826906626e-07, + "logits/chosen": -0.7108185887336731, + "logits/rejected": -0.5703717470169067, + "logps/chosen": -1.3999465703964233, + "logps/rejected": -2.3794710636138916, + "loss": 0.9243, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3999465703964233, + "rewards/margins": 0.9795247912406921, + "rewards/rejected": -2.3794710636138916, + "sft_loss": 1.440507173538208, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 8.97879561445652, + "learning_rate": 4.7169531908462953e-07, + "logits/chosen": -0.6745246648788452, + "logits/rejected": -0.6440014243125916, + "logps/chosen": -1.3127162456512451, + "logps/rejected": -2.2175207138061523, + "loss": 0.8733, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3127162456512451, + "rewards/margins": 0.9048043489456177, + "rewards/rejected": -2.2175207138061523, + "sft_loss": 1.3560574054718018, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 5.615115793326001, + "learning_rate": 4.6829876651671636e-07, + "logits/chosen": -0.6029156446456909, + "logits/rejected": -0.5269027352333069, + "logps/chosen": -1.2948484420776367, + "logps/rejected": -2.2537930011749268, + "loss": 0.8801, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2948484420776367, + "rewards/margins": 0.9589444994926453, + "rewards/rejected": -2.2537930011749268, + "sft_loss": 1.3327890634536743, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 19.92989352140432, + "learning_rate": 4.64912223518696e-07, + "logits/chosen": -0.7015405297279358, + "logits/rejected": -0.6147525906562805, + "logps/chosen": -1.2988426685333252, + "logps/rejected": -2.4416849613189697, + "loss": 0.842, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2988426685333252, + "rewards/margins": 1.142842173576355, + "rewards/rejected": -2.4416849613189697, + "sft_loss": 1.3980969190597534, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 7.255081147335957, + "learning_rate": 4.615357229468221e-07, + "logits/chosen": -0.6565300822257996, + "logits/rejected": -0.4925423562526703, + "logps/chosen": -1.289421796798706, + "logps/rejected": -2.5012295246124268, + "loss": 0.83, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.289421796798706, + "rewards/margins": 1.2118077278137207, + "rewards/rejected": -2.5012295246124268, + "sft_loss": 1.3298838138580322, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 13.943097839852726, + "learning_rate": 4.581692975599192e-07, + "logits/chosen": -0.6478136777877808, + "logits/rejected": -0.5113622546195984, + "logps/chosen": -1.3406429290771484, + "logps/rejected": -2.235804319381714, + "loss": 0.8955, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3406429290771484, + "rewards/margins": 0.8951613306999207, + "rewards/rejected": -2.235804319381714, + "sft_loss": 1.4088243246078491, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 10.306307118346142, + "learning_rate": 4.548129800190603e-07, + "logits/chosen": -0.6606016159057617, + "logits/rejected": -0.5660589933395386, + "logps/chosen": -1.312565803527832, + "logps/rejected": -2.348137617111206, + "loss": 0.8717, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.312565803527832, + "rewards/margins": 1.035571813583374, + "rewards/rejected": -2.348137617111206, + "sft_loss": 1.379921793937683, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 11.13862485808852, + "learning_rate": 4.5146680288725367e-07, + "logits/chosen": -0.6532072424888611, + "logits/rejected": -0.5128600597381592, + "logps/chosen": -1.3058058023452759, + "logps/rejected": -2.372032880783081, + "loss": 0.8814, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3058058023452759, + "rewards/margins": 1.0662271976470947, + "rewards/rejected": -2.372032880783081, + "sft_loss": 1.3538084030151367, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 16.579937013749525, + "learning_rate": 4.481307986291237e-07, + "logits/chosen": -0.6807089447975159, + "logits/rejected": -0.6148806214332581, + "logps/chosen": -1.400302767753601, + "logps/rejected": -2.3534867763519287, + "loss": 0.9536, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.400302767753601, + "rewards/margins": 0.9531840085983276, + "rewards/rejected": -2.3534867763519287, + "sft_loss": 1.41280198097229, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 8.168795805694037, + "learning_rate": 4.4480499961059915e-07, + "logits/chosen": -0.6159850358963013, + "logits/rejected": -0.5662098526954651, + "logps/chosen": -1.3755680322647095, + "logps/rejected": -2.3326282501220703, + "loss": 0.8879, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3755680322647095, + "rewards/margins": 0.9570604562759399, + "rewards/rejected": -2.3326282501220703, + "sft_loss": 1.3728866577148438, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 9.224841659650155, + "learning_rate": 4.414894380985959e-07, + "logits/chosen": -0.695419192314148, + "logits/rejected": -0.5470012426376343, + "logps/chosen": -1.2756474018096924, + "logps/rejected": -2.5504016876220703, + "loss": 0.8475, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2756474018096924, + "rewards/margins": 1.2747540473937988, + "rewards/rejected": -2.5504016876220703, + "sft_loss": 1.3470367193222046, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 14.596642629245082, + "learning_rate": 4.3818414626070703e-07, + "logits/chosen": -0.716138482093811, + "logits/rejected": -0.6742798089981079, + "logps/chosen": -1.4133208990097046, + "logps/rejected": -2.321979522705078, + "loss": 0.9418, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4133208990097046, + "rewards/margins": 0.9086586236953735, + "rewards/rejected": -2.321979522705078, + "sft_loss": 1.4485996961593628, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 10.813466223871607, + "learning_rate": 4.3488915616488757e-07, + "logits/chosen": -0.6477821469306946, + "logits/rejected": -0.6120352149009705, + "logps/chosen": -1.3689117431640625, + "logps/rejected": -2.4252209663391113, + "loss": 0.8642, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3689117431640625, + "rewards/margins": 1.0563093423843384, + "rewards/rejected": -2.4252209663391113, + "sft_loss": 1.3875305652618408, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 8.596530588605493, + "learning_rate": 4.316044997791469e-07, + "logits/chosen": -0.7162259817123413, + "logits/rejected": -0.6386939287185669, + "logps/chosen": -1.3806841373443604, + "logps/rejected": -2.3228540420532227, + "loss": 0.8994, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3806841373443604, + "rewards/margins": 0.9421700239181519, + "rewards/rejected": -2.3228540420532227, + "sft_loss": 1.4305603504180908, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 8.601381742032121, + "learning_rate": 4.283302089712348e-07, + "logits/chosen": -0.746851921081543, + "logits/rejected": -0.5745272040367126, + "logps/chosen": -1.4036058187484741, + "logps/rejected": -2.382904529571533, + "loss": 0.8812, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4036058187484741, + "rewards/margins": 0.9792987108230591, + "rewards/rejected": -2.382904529571533, + "sft_loss": 1.4322397708892822, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 9.74306816379491, + "learning_rate": 4.250663155083357e-07, + "logits/chosen": -0.5852586030960083, + "logits/rejected": -0.6144552826881409, + "logps/chosen": -1.3089172840118408, + "logps/rejected": -2.3125052452087402, + "loss": 0.8857, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3089172840118408, + "rewards/margins": 1.0035879611968994, + "rewards/rejected": -2.3125052452087402, + "sft_loss": 1.3435930013656616, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 11.720966069939928, + "learning_rate": 4.218128510567578e-07, + "logits/chosen": -0.6212056875228882, + "logits/rejected": -0.5088070631027222, + "logps/chosen": -1.2342393398284912, + "logps/rejected": -2.3752951622009277, + "loss": 0.8039, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2342393398284912, + "rewards/margins": 1.1410560607910156, + "rewards/rejected": -2.3752951622009277, + "sft_loss": 1.2810344696044922, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 8.919312733582453, + "learning_rate": 4.185698471816279e-07, + "logits/chosen": -0.7572565078735352, + "logits/rejected": -0.5971711874008179, + "logps/chosen": -1.3673795461654663, + "logps/rejected": -2.542800188064575, + "loss": 0.9014, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3673795461654663, + "rewards/margins": 1.1754207611083984, + "rewards/rejected": -2.542800188064575, + "sft_loss": 1.450248122215271, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 10.14180915809746, + "learning_rate": 4.1533733534658326e-07, + "logits/chosen": -0.6978228688240051, + "logits/rejected": -0.5519328713417053, + "logps/chosen": -1.3382021188735962, + "logps/rejected": -2.3281280994415283, + "loss": 0.9024, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3382021188735962, + "rewards/margins": 0.9899260401725769, + "rewards/rejected": -2.3281280994415283, + "sft_loss": 1.3950769901275635, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 7.8186537137711865, + "learning_rate": 4.121153469134686e-07, + "logits/chosen": -0.6784673929214478, + "logits/rejected": -0.563454270362854, + "logps/chosen": -1.3425476551055908, + "logps/rejected": -2.3465404510498047, + "loss": 0.8798, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3425476551055908, + "rewards/margins": 1.003993034362793, + "rewards/rejected": -2.3465404510498047, + "sft_loss": 1.385127067565918, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 7.009913853726649, + "learning_rate": 4.089039131420292e-07, + "logits/chosen": -0.6797121167182922, + "logits/rejected": -0.5956329107284546, + "logps/chosen": -1.2960842847824097, + "logps/rejected": -2.1306862831115723, + "loss": 0.9071, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2960842847824097, + "rewards/margins": 0.8346019983291626, + "rewards/rejected": -2.1306862831115723, + "sft_loss": 1.3619669675827026, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 9.23115558482239, + "learning_rate": 4.0570306518961027e-07, + "logits/chosen": -0.6509016752243042, + "logits/rejected": -0.5293477773666382, + "logps/chosen": -1.3495789766311646, + "logps/rejected": -2.419602394104004, + "loss": 0.9034, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3495789766311646, + "rewards/margins": 1.0700232982635498, + "rewards/rejected": -2.419602394104004, + "sft_loss": 1.4062663316726685, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 7.471901000006091, + "learning_rate": 4.025128341108517e-07, + "logits/chosen": -0.7326493263244629, + "logits/rejected": -0.5960140228271484, + "logps/chosen": -1.328127145767212, + "logps/rejected": -2.343733072280884, + "loss": 0.8788, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.328127145767212, + "rewards/margins": 1.0156059265136719, + "rewards/rejected": -2.343733072280884, + "sft_loss": 1.3994157314300537, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": -0.3551722764968872, + "eval_logits/rejected": -0.29631373286247253, + "eval_logps/chosen": -1.5272942781448364, + "eval_logps/rejected": -2.1771371364593506, + "eval_loss": 1.0414245128631592, + "eval_rewards/accuracies": 0.6468842625617981, + "eval_rewards/chosen": -1.5272942781448364, + "eval_rewards/margins": 0.6498429179191589, + "eval_rewards/rejected": -2.1771371364593506, + "eval_runtime": 44.1464, + "eval_samples_per_second": 30.467, + "eval_sft_loss": 1.4793506860733032, + "eval_steps_per_second": 7.634, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 18.24389181198718, + "learning_rate": 3.9933325085739047e-07, + "logits/chosen": -0.749717116355896, + "logits/rejected": -0.7404106259346008, + "logps/chosen": -1.2358829975128174, + "logps/rejected": -2.073561191558838, + "loss": 0.8849, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2358829975128174, + "rewards/margins": 0.8376781344413757, + "rewards/rejected": -2.073561191558838, + "sft_loss": 1.2863762378692627, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 8.743575469487237, + "learning_rate": 3.9616434627755624e-07, + "logits/chosen": -0.6609512567520142, + "logits/rejected": -0.6308881044387817, + "logps/chosen": -1.3922468423843384, + "logps/rejected": -2.649442195892334, + "loss": 0.8581, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3922468423843384, + "rewards/margins": 1.257195234298706, + "rewards/rejected": -2.649442195892334, + "sft_loss": 1.431348443031311, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 8.223634442134767, + "learning_rate": 3.930061511160762e-07, + "logits/chosen": -0.6532543301582336, + "logits/rejected": -0.5277368426322937, + "logps/chosen": -1.3420674800872803, + "logps/rejected": -2.281698703765869, + "loss": 0.9015, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3420674800872803, + "rewards/margins": 0.939631462097168, + "rewards/rejected": -2.281698703765869, + "sft_loss": 1.3932065963745117, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 13.739216943538223, + "learning_rate": 3.898586960137726e-07, + "logits/chosen": -0.6745472550392151, + "logits/rejected": -0.6361994743347168, + "logps/chosen": -1.3241745233535767, + "logps/rejected": -2.131701946258545, + "loss": 0.8933, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3241745233535767, + "rewards/margins": 0.8075275421142578, + "rewards/rejected": -2.131701946258545, + "sft_loss": 1.358970046043396, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 7.72059808047864, + "learning_rate": 3.867220115072696e-07, + "logits/chosen": -0.6497647166252136, + "logits/rejected": -0.5739953517913818, + "logps/chosen": -1.2007476091384888, + "logps/rejected": -2.001007318496704, + "loss": 0.87, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2007476091384888, + "rewards/margins": 0.8002594709396362, + "rewards/rejected": -2.001007318496704, + "sft_loss": 1.3230812549591064, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 7.127098452943066, + "learning_rate": 3.8359612802869367e-07, + "logits/chosen": -0.6745830774307251, + "logits/rejected": -0.5413500070571899, + "logps/chosen": -1.3599244356155396, + "logps/rejected": -2.357084035873413, + "loss": 0.9164, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3599244356155396, + "rewards/margins": 0.9971596002578735, + "rewards/rejected": -2.357084035873413, + "sft_loss": 1.4182361364364624, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 8.195598404674085, + "learning_rate": 3.8048107590537987e-07, + "logits/chosen": -0.7300285696983337, + "logits/rejected": -0.5494762659072876, + "logps/chosen": -1.2751190662384033, + "logps/rejected": -2.339775323867798, + "loss": 0.8621, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2751190662384033, + "rewards/margins": 1.0646560192108154, + "rewards/rejected": -2.339775323867798, + "sft_loss": 1.3891193866729736, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 7.272577584258443, + "learning_rate": 3.773768853595774e-07, + "logits/chosen": -0.735929548740387, + "logits/rejected": -0.5498467683792114, + "logps/chosen": -1.3501901626586914, + "logps/rejected": -2.220968723297119, + "loss": 0.9322, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3501901626586914, + "rewards/margins": 0.8707782626152039, + "rewards/rejected": -2.220968723297119, + "sft_loss": 1.4078781604766846, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 10.306160904948694, + "learning_rate": 3.7428358650815706e-07, + "logits/chosen": -0.7110830545425415, + "logits/rejected": -0.5415419340133667, + "logps/chosen": -1.3333743810653687, + "logps/rejected": -2.156614303588867, + "loss": 0.9339, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3333743810653687, + "rewards/margins": 0.8232399225234985, + "rewards/rejected": -2.156614303588867, + "sft_loss": 1.3927953243255615, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 7.818563194854539, + "learning_rate": 3.712012093623172e-07, + "logits/chosen": -0.6752433776855469, + "logits/rejected": -0.5796962380409241, + "logps/chosen": -1.3343051671981812, + "logps/rejected": -2.4307165145874023, + "loss": 0.8901, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3343051671981812, + "rewards/margins": 1.0964112281799316, + "rewards/rejected": -2.4307165145874023, + "sft_loss": 1.394907832145691, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 15.317372572588315, + "learning_rate": 3.6812978382729524e-07, + "logits/chosen": -0.7404896020889282, + "logits/rejected": -0.6768995523452759, + "logps/chosen": -1.3519471883773804, + "logps/rejected": -2.286410093307495, + "loss": 0.921, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3519471883773804, + "rewards/margins": 0.9344632029533386, + "rewards/rejected": -2.286410093307495, + "sft_loss": 1.4139940738677979, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 8.674143010837458, + "learning_rate": 3.650693397020744e-07, + "logits/chosen": -0.7839964628219604, + "logits/rejected": -0.6039905548095703, + "logps/chosen": -1.2913477420806885, + "logps/rejected": -2.494628429412842, + "loss": 0.8862, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2913477420806885, + "rewards/margins": 1.2032805681228638, + "rewards/rejected": -2.494628429412842, + "sft_loss": 1.3796261548995972, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 14.304659649681543, + "learning_rate": 3.6201990667909774e-07, + "logits/chosen": -0.7501360177993774, + "logits/rejected": -0.6040843725204468, + "logps/chosen": -1.3918160200119019, + "logps/rejected": -2.316871166229248, + "loss": 0.9388, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3918160200119019, + "rewards/margins": 0.9250553250312805, + "rewards/rejected": -2.316871166229248, + "sft_loss": 1.4156030416488647, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 10.260220317471864, + "learning_rate": 3.589815143439772e-07, + "logits/chosen": -0.6317887902259827, + "logits/rejected": -0.5740104913711548, + "logps/chosen": -1.2195584774017334, + "logps/rejected": -2.1278574466705322, + "loss": 0.8879, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2195584774017334, + "rewards/margins": 0.9082988500595093, + "rewards/rejected": -2.1278574466705322, + "sft_loss": 1.2862308025360107, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 7.903865291358914, + "learning_rate": 3.559541921752091e-07, + "logits/chosen": -0.7242652177810669, + "logits/rejected": -0.5375711917877197, + "logps/chosen": -1.3358646631240845, + "logps/rejected": -2.285754680633545, + "loss": 0.8906, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3358646631240845, + "rewards/margins": 0.9498898386955261, + "rewards/rejected": -2.285754680633545, + "sft_loss": 1.3705825805664062, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 7.419993242744366, + "learning_rate": 3.5293796954388565e-07, + "logits/chosen": -0.6737462282180786, + "logits/rejected": -0.6052318811416626, + "logps/chosen": -1.2106980085372925, + "logps/rejected": -2.0494823455810547, + "loss": 0.8845, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2106980085372925, + "rewards/margins": 0.8387842178344727, + "rewards/rejected": -2.0494823455810547, + "sft_loss": 1.2878100872039795, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 11.853678645168188, + "learning_rate": 3.499328757134129e-07, + "logits/chosen": -0.6386197209358215, + "logits/rejected": -0.5696219801902771, + "logps/chosen": -1.338358998298645, + "logps/rejected": -2.3903615474700928, + "loss": 0.8575, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.338358998298645, + "rewards/margins": 1.0520024299621582, + "rewards/rejected": -2.3903615474700928, + "sft_loss": 1.3504332304000854, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 8.859236562170793, + "learning_rate": 3.469389398392237e-07, + "logits/chosen": -0.6983398199081421, + "logits/rejected": -0.5085052251815796, + "logps/chosen": -1.2761619091033936, + "logps/rejected": -2.413107395172119, + "loss": 0.8225, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2761619091033936, + "rewards/margins": 1.1369454860687256, + "rewards/rejected": -2.413107395172119, + "sft_loss": 1.339481234550476, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 10.898058882878603, + "learning_rate": 3.4395619096849764e-07, + "logits/chosen": -0.7793927788734436, + "logits/rejected": -0.5934855341911316, + "logps/chosen": -1.3146260976791382, + "logps/rejected": -2.3048288822174072, + "loss": 0.8978, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3146260976791382, + "rewards/margins": 0.9902030825614929, + "rewards/rejected": -2.3048288822174072, + "sft_loss": 1.4020847082138062, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 8.3769372005201, + "learning_rate": 3.409846580398766e-07, + "logits/chosen": -0.5743625164031982, + "logits/rejected": -0.6095589399337769, + "logps/chosen": -1.2932769060134888, + "logps/rejected": -2.2235419750213623, + "loss": 0.8752, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2932769060134888, + "rewards/margins": 0.9302651286125183, + "rewards/rejected": -2.2235419750213623, + "sft_loss": 1.3634895086288452, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 8.348768620392361, + "learning_rate": 3.380243698831869e-07, + "logits/chosen": -0.7401924133300781, + "logits/rejected": -0.600095272064209, + "logps/chosen": -1.3059180974960327, + "logps/rejected": -2.298156261444092, + "loss": 0.8658, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3059180974960327, + "rewards/margins": 0.9922383427619934, + "rewards/rejected": -2.298156261444092, + "sft_loss": 1.372867226600647, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 5.762119412659862, + "learning_rate": 3.350753552191563e-07, + "logits/chosen": -0.7536791563034058, + "logits/rejected": -0.6396313905715942, + "logps/chosen": -1.3253954648971558, + "logps/rejected": -2.276576519012451, + "loss": 0.8466, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.3253954648971558, + "rewards/margins": 0.9511808156967163, + "rewards/rejected": -2.276576519012451, + "sft_loss": 1.3070496320724487, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 9.495811954480246, + "learning_rate": 3.3213764265913915e-07, + "logits/chosen": -0.7169691324234009, + "logits/rejected": -0.6960464715957642, + "logps/chosen": -1.315781593322754, + "logps/rejected": -2.1747562885284424, + "loss": 0.8986, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.315781593322754, + "rewards/margins": 0.8589746356010437, + "rewards/rejected": -2.1747562885284424, + "sft_loss": 1.3821439743041992, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 9.196657234975614, + "learning_rate": 3.292112607048343e-07, + "logits/chosen": -0.6809697151184082, + "logits/rejected": -0.6138942837715149, + "logps/chosen": -1.3078068494796753, + "logps/rejected": -2.3196234703063965, + "loss": 0.8486, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3078068494796753, + "rewards/margins": 1.0118167400360107, + "rewards/rejected": -2.3196234703063965, + "sft_loss": 1.3212130069732666, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 10.931137947223332, + "learning_rate": 3.262962377480136e-07, + "logits/chosen": -0.7364736199378967, + "logits/rejected": -0.597502589225769, + "logps/chosen": -1.3951513767242432, + "logps/rejected": -2.4568734169006348, + "loss": 0.9168, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3951513767242432, + "rewards/margins": 1.0617221593856812, + "rewards/rejected": -2.4568734169006348, + "sft_loss": 1.4719269275665283, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 9.68105778607799, + "learning_rate": 3.233926020702414e-07, + "logits/chosen": -0.7190853953361511, + "logits/rejected": -0.6500726938247681, + "logps/chosen": -1.3541837930679321, + "logps/rejected": -2.039454221725464, + "loss": 0.9484, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3541837930679321, + "rewards/margins": 0.6852703094482422, + "rewards/rejected": -2.039454221725464, + "sft_loss": 1.359106183052063, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 10.439717436730895, + "learning_rate": 3.205003818426047e-07, + "logits/chosen": -0.5770900845527649, + "logits/rejected": -0.5122250318527222, + "logps/chosen": -1.265426754951477, + "logps/rejected": -2.2611968517303467, + "loss": 0.8864, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.265426754951477, + "rewards/margins": 0.9957700967788696, + "rewards/rejected": -2.2611968517303467, + "sft_loss": 1.3501912355422974, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 9.77852965592999, + "learning_rate": 3.1761960512543627e-07, + "logits/chosen": -0.6387471556663513, + "logits/rejected": -0.5983433127403259, + "logps/chosen": -1.2862733602523804, + "logps/rejected": -2.2489261627197266, + "loss": 0.8922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2862733602523804, + "rewards/margins": 0.9626529812812805, + "rewards/rejected": -2.2489261627197266, + "sft_loss": 1.3163095712661743, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 24.67312605387757, + "learning_rate": 3.147502998680447e-07, + "logits/chosen": -0.6618883609771729, + "logits/rejected": -0.5567878484725952, + "logps/chosen": -1.2788108587265015, + "logps/rejected": -2.2833950519561768, + "loss": 0.8881, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2788108587265015, + "rewards/margins": 1.0045843124389648, + "rewards/rejected": -2.2833950519561768, + "sft_loss": 1.3379775285720825, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 7.560659530220782, + "learning_rate": 3.11892493908442e-07, + "logits/chosen": -0.72434002161026, + "logits/rejected": -0.6553603410720825, + "logps/chosen": -1.2460191249847412, + "logps/rejected": -2.1280198097229004, + "loss": 0.8913, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2460191249847412, + "rewards/margins": 0.8820006251335144, + "rewards/rejected": -2.1280198097229004, + "sft_loss": 1.3037611246109009, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 11.183665158896252, + "learning_rate": 3.0904621497307437e-07, + "logits/chosen": -0.6681724786758423, + "logits/rejected": -0.61586594581604, + "logps/chosen": -1.3558168411254883, + "logps/rejected": -2.169890880584717, + "loss": 0.9579, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3558168411254883, + "rewards/margins": 0.8140741586685181, + "rewards/rejected": -2.169890880584717, + "sft_loss": 1.463220477104187, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 10.577372485814985, + "learning_rate": 3.062114906765522e-07, + "logits/chosen": -0.703151524066925, + "logits/rejected": -0.5398627519607544, + "logps/chosen": -1.2571704387664795, + "logps/rejected": -2.2171876430511475, + "loss": 0.8799, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2571704387664795, + "rewards/margins": 0.960017204284668, + "rewards/rejected": -2.2171876430511475, + "sft_loss": 1.2801587581634521, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 13.685702261492946, + "learning_rate": 3.0338834852138346e-07, + "logits/chosen": -0.6743217706680298, + "logits/rejected": -0.6253520846366882, + "logps/chosen": -1.3779981136322021, + "logps/rejected": -2.291980743408203, + "loss": 0.9037, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3779981136322021, + "rewards/margins": 0.9139825105667114, + "rewards/rejected": -2.291980743408203, + "sft_loss": 1.38899564743042, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 17.384020814403296, + "learning_rate": 3.0057681589770526e-07, + "logits/chosen": -0.6919450163841248, + "logits/rejected": -0.5843518972396851, + "logps/chosen": -1.3070205450057983, + "logps/rejected": -2.368257999420166, + "loss": 0.8608, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3070205450057983, + "rewards/margins": 1.0612374544143677, + "rewards/rejected": -2.368257999420166, + "sft_loss": 1.3402724266052246, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 13.322706624373525, + "learning_rate": 2.9777692008301993e-07, + "logits/chosen": -0.6535794138908386, + "logits/rejected": -0.6342029571533203, + "logps/chosen": -1.3068435192108154, + "logps/rejected": -2.229980230331421, + "loss": 0.887, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3068435192108154, + "rewards/margins": 0.9231365323066711, + "rewards/rejected": -2.229980230331421, + "sft_loss": 1.371297836303711, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 7.737728756955118, + "learning_rate": 2.949886882419284e-07, + "logits/chosen": -0.7128503322601318, + "logits/rejected": -0.6816755533218384, + "logps/chosen": -1.2589917182922363, + "logps/rejected": -2.2378623485565186, + "loss": 0.8626, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2589917182922363, + "rewards/margins": 0.9788707494735718, + "rewards/rejected": -2.2378623485565186, + "sft_loss": 1.3179762363433838, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 8.316412011766696, + "learning_rate": 2.92212147425869e-07, + "logits/chosen": -0.6452383995056152, + "logits/rejected": -0.5411955714225769, + "logps/chosen": -1.3594417572021484, + "logps/rejected": -2.3032777309417725, + "loss": 0.9555, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3594417572021484, + "rewards/margins": 0.9438360929489136, + "rewards/rejected": -2.3032777309417725, + "sft_loss": 1.4443765878677368, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 10.39657321444957, + "learning_rate": 2.894473245728518e-07, + "logits/chosen": -0.7684861421585083, + "logits/rejected": -0.6261590719223022, + "logps/chosen": -1.2524304389953613, + "logps/rejected": -2.1767070293426514, + "loss": 0.891, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2524304389953613, + "rewards/margins": 0.9242765307426453, + "rewards/rejected": -2.1767070293426514, + "sft_loss": 1.3225806951522827, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 17.39160581642626, + "learning_rate": 2.866942465072014e-07, + "logits/chosen": -0.7339946031570435, + "logits/rejected": -0.6413661241531372, + "logps/chosen": -1.3248698711395264, + "logps/rejected": -2.3884971141815186, + "loss": 0.8961, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3248698711395264, + "rewards/margins": 1.063627004623413, + "rewards/rejected": -2.3884971141815186, + "sft_loss": 1.3735225200653076, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 5.919966901123824, + "learning_rate": 2.839529399392924e-07, + "logits/chosen": -0.7200514078140259, + "logits/rejected": -0.5268881320953369, + "logps/chosen": -1.3770151138305664, + "logps/rejected": -2.454486131668091, + "loss": 0.9059, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3770151138305664, + "rewards/margins": 1.077471137046814, + "rewards/rejected": -2.454486131668091, + "sft_loss": 1.4495172500610352, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 9.71994547678538, + "learning_rate": 2.812234314652937e-07, + "logits/chosen": -0.6608083248138428, + "logits/rejected": -0.5510744452476501, + "logps/chosen": -1.3255417346954346, + "logps/rejected": -2.4279494285583496, + "loss": 0.8981, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3255417346954346, + "rewards/margins": 1.102407693862915, + "rewards/rejected": -2.4279494285583496, + "sft_loss": 1.3806711435317993, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 6.834992459863064, + "learning_rate": 2.785057475669084e-07, + "logits/chosen": -0.7301166653633118, + "logits/rejected": -0.6108202934265137, + "logps/chosen": -1.302876591682434, + "logps/rejected": -2.4487080574035645, + "loss": 0.8634, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.302876591682434, + "rewards/margins": 1.1458313465118408, + "rewards/rejected": -2.4487080574035645, + "sft_loss": 1.3394935131072998, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 9.15944035800927, + "learning_rate": 2.75799914611117e-07, + "logits/chosen": -0.6518206000328064, + "logits/rejected": -0.5422743558883667, + "logps/chosen": -1.3434274196624756, + "logps/rejected": -2.491009473800659, + "loss": 0.8819, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3434274196624756, + "rewards/margins": 1.1475821733474731, + "rewards/rejected": -2.491009473800659, + "sft_loss": 1.3923437595367432, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 9.054548708671863, + "learning_rate": 2.7310595884992354e-07, + "logits/chosen": -0.6681968569755554, + "logits/rejected": -0.5110117197036743, + "logps/chosen": -1.2326476573944092, + "logps/rejected": -2.231614589691162, + "loss": 0.8627, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2326476573944092, + "rewards/margins": 0.998967170715332, + "rewards/rejected": -2.231614589691162, + "sft_loss": 1.3625903129577637, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 11.42134908729122, + "learning_rate": 2.7042390642009805e-07, + "logits/chosen": -0.7752343416213989, + "logits/rejected": -0.7648538947105408, + "logps/chosen": -1.2942503690719604, + "logps/rejected": -2.251347064971924, + "loss": 0.8968, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2942503690719604, + "rewards/margins": 0.9570968747138977, + "rewards/rejected": -2.251347064971924, + "sft_loss": 1.364689588546753, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 7.018668458347168, + "learning_rate": 2.6775378334292543e-07, + "logits/chosen": -0.6319466233253479, + "logits/rejected": -0.5827672481536865, + "logps/chosen": -1.2391040325164795, + "logps/rejected": -2.199864387512207, + "loss": 0.8923, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2391040325164795, + "rewards/margins": 0.9607602953910828, + "rewards/rejected": -2.199864387512207, + "sft_loss": 1.33708918094635, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 24.407625854432688, + "learning_rate": 2.650956155239512e-07, + "logits/chosen": -0.6126594543457031, + "logits/rejected": -0.49098721146583557, + "logps/chosen": -1.276228666305542, + "logps/rejected": -2.399261236190796, + "loss": 0.846, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.276228666305542, + "rewards/margins": 1.123032569885254, + "rewards/rejected": -2.399261236190796, + "sft_loss": 1.2981501817703247, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 10.230795726796329, + "learning_rate": 2.6244942875273093e-07, + "logits/chosen": -0.6266258955001831, + "logits/rejected": -0.5506526231765747, + "logps/chosen": -1.3115276098251343, + "logps/rejected": -2.3939852714538574, + "loss": 0.8511, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3115276098251343, + "rewards/margins": 1.0824575424194336, + "rewards/rejected": -2.3939852714538574, + "sft_loss": 1.3581907749176025, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 13.846920028469155, + "learning_rate": 2.59815248702581e-07, + "logits/chosen": -0.6682706475257874, + "logits/rejected": -0.555500864982605, + "logps/chosen": -1.2825028896331787, + "logps/rejected": -2.1850879192352295, + "loss": 0.8783, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2825028896331787, + "rewards/margins": 0.9025851488113403, + "rewards/rejected": -2.1850879192352295, + "sft_loss": 1.3643155097961426, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 10.690683184069727, + "learning_rate": 2.5719310093032695e-07, + "logits/chosen": -0.7059717774391174, + "logits/rejected": -0.5149275660514832, + "logps/chosen": -1.313106894493103, + "logps/rejected": -2.339468002319336, + "loss": 0.8633, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.313106894493103, + "rewards/margins": 1.026361107826233, + "rewards/rejected": -2.339468002319336, + "sft_loss": 1.337316632270813, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 12.241505842991586, + "learning_rate": 2.5458301087605876e-07, + "logits/chosen": -0.7407052516937256, + "logits/rejected": -0.6273609399795532, + "logps/chosen": -1.315629005432129, + "logps/rejected": -2.2260148525238037, + "loss": 0.9076, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.315629005432129, + "rewards/margins": 0.9103857278823853, + "rewards/rejected": -2.2260148525238037, + "sft_loss": 1.4132753610610962, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 12.546182844033622, + "learning_rate": 2.5198500386288083e-07, + "logits/chosen": -0.7224112749099731, + "logits/rejected": -0.6466668844223022, + "logps/chosen": -1.3775488138198853, + "logps/rejected": -2.372617721557617, + "loss": 0.8919, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3775488138198853, + "rewards/margins": 0.9950687289237976, + "rewards/rejected": -2.372617721557617, + "sft_loss": 1.4110504388809204, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 9.800911861573296, + "learning_rate": 2.493991050966694e-07, + "logits/chosen": -0.6987338662147522, + "logits/rejected": -0.6448682546615601, + "logps/chosen": -1.3800842761993408, + "logps/rejected": -2.333311080932617, + "loss": 0.9007, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3800842761993408, + "rewards/margins": 0.9532268643379211, + "rewards/rejected": -2.333311080932617, + "sft_loss": 1.4070731401443481, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 25.998145847928697, + "learning_rate": 2.4682533966582494e-07, + "logits/chosen": -0.6867243647575378, + "logits/rejected": -0.5943757891654968, + "logps/chosen": -1.3054351806640625, + "logps/rejected": -2.0959885120391846, + "loss": 0.9185, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3054351806640625, + "rewards/margins": 0.7905532717704773, + "rewards/rejected": -2.0959885120391846, + "sft_loss": 1.400160789489746, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 11.02779994160791, + "learning_rate": 2.442637325410316e-07, + "logits/chosen": -0.6484078168869019, + "logits/rejected": -0.5131920576095581, + "logps/chosen": -1.267917513847351, + "logps/rejected": -2.326103687286377, + "loss": 0.8481, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.267917513847351, + "rewards/margins": 1.0581862926483154, + "rewards/rejected": -2.326103687286377, + "sft_loss": 1.2982842922210693, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 14.251626374285026, + "learning_rate": 2.417143085750122e-07, + "logits/chosen": -0.6278024911880493, + "logits/rejected": -0.5599108934402466, + "logps/chosen": -1.2820518016815186, + "logps/rejected": -2.404381513595581, + "loss": 0.8423, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2820518016815186, + "rewards/margins": 1.1223294734954834, + "rewards/rejected": -2.404381513595581, + "sft_loss": 1.327433705329895, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 11.805514240107307, + "learning_rate": 2.3917709250228994e-07, + "logits/chosen": -0.7355372309684753, + "logits/rejected": -0.590130627155304, + "logps/chosen": -1.3373448848724365, + "logps/rejected": -2.2430479526519775, + "loss": 0.8902, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3373448848724365, + "rewards/margins": 0.9057027697563171, + "rewards/rejected": -2.2430479526519775, + "sft_loss": 1.3762180805206299, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 7.8787461944719865, + "learning_rate": 2.3665210893894557e-07, + "logits/chosen": -0.6504893898963928, + "logits/rejected": -0.6293870806694031, + "logps/chosen": -1.3105475902557373, + "logps/rejected": -2.3168463706970215, + "loss": 0.8746, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3105475902557373, + "rewards/margins": 1.0062991380691528, + "rewards/rejected": -2.3168463706970215, + "sft_loss": 1.3457313776016235, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 7.173051141455469, + "learning_rate": 2.3413938238238157e-07, + "logits/chosen": -0.6418722867965698, + "logits/rejected": -0.4838961660861969, + "logps/chosen": -1.3251688480377197, + "logps/rejected": -2.3496623039245605, + "loss": 0.9154, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3251688480377197, + "rewards/margins": 1.0244933366775513, + "rewards/rejected": -2.3496623039245605, + "sft_loss": 1.3792946338653564, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 7.241597522887762, + "learning_rate": 2.316389372110812e-07, + "logits/chosen": -0.7322367429733276, + "logits/rejected": -0.6537310481071472, + "logps/chosen": -1.282598614692688, + "logps/rejected": -2.2019360065460205, + "loss": 0.9101, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.282598614692688, + "rewards/margins": 0.9193374514579773, + "rewards/rejected": -2.2019360065460205, + "sft_loss": 1.368976354598999, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 9.936247951660189, + "learning_rate": 2.2915079768437514e-07, + "logits/chosen": -0.6218239665031433, + "logits/rejected": -0.6292901039123535, + "logps/chosen": -1.3534696102142334, + "logps/rejected": -2.283411979675293, + "loss": 0.9143, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3534696102142334, + "rewards/margins": 0.9299424886703491, + "rewards/rejected": -2.283411979675293, + "sft_loss": 1.364280343055725, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 11.31977470233518, + "learning_rate": 2.2667498794220326e-07, + "logits/chosen": -0.7041364312171936, + "logits/rejected": -0.5870811343193054, + "logps/chosen": -1.3467581272125244, + "logps/rejected": -2.507812261581421, + "loss": 0.8574, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3467581272125244, + "rewards/margins": 1.1610538959503174, + "rewards/rejected": -2.507812261581421, + "sft_loss": 1.3909904956817627, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 9.736817733827035, + "learning_rate": 2.2421153200488332e-07, + "logits/chosen": -0.677000105381012, + "logits/rejected": -0.7056635022163391, + "logps/chosen": -1.414865255355835, + "logps/rejected": -2.4057819843292236, + "loss": 0.9029, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.414865255355835, + "rewards/margins": 0.9909166097640991, + "rewards/rejected": -2.4057819843292236, + "sft_loss": 1.4676659107208252, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 11.290414144270878, + "learning_rate": 2.217604537728749e-07, + "logits/chosen": -0.6715537309646606, + "logits/rejected": -0.6152840852737427, + "logps/chosen": -1.2174723148345947, + "logps/rejected": -2.1375701427459717, + "loss": 0.8517, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2174723148345947, + "rewards/margins": 0.9200976490974426, + "rewards/rejected": -2.1375701427459717, + "sft_loss": 1.3102633953094482, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 8.768687363299435, + "learning_rate": 2.1932177702655053e-07, + "logits/chosen": -0.7175818681716919, + "logits/rejected": -0.698711633682251, + "logps/chosen": -1.3792225122451782, + "logps/rejected": -2.303156614303589, + "loss": 0.9178, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3792225122451782, + "rewards/margins": 0.9239341020584106, + "rewards/rejected": -2.303156614303589, + "sft_loss": 1.4294236898422241, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 8.362980908179956, + "learning_rate": 2.1689552542596232e-07, + "logits/chosen": -0.6957578063011169, + "logits/rejected": -0.5916758179664612, + "logps/chosen": -1.2203314304351807, + "logps/rejected": -2.3191776275634766, + "loss": 0.8499, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2203314304351807, + "rewards/margins": 1.098846197128296, + "rewards/rejected": -2.3191776275634766, + "sft_loss": 1.3278615474700928, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 5.208940835506882, + "learning_rate": 2.1448172251061338e-07, + "logits/chosen": -0.6650044322013855, + "logits/rejected": -0.7591149806976318, + "logps/chosen": -1.347022294998169, + "logps/rejected": -2.176450252532959, + "loss": 0.8969, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.347022294998169, + "rewards/margins": 0.8294281959533691, + "rewards/rejected": -2.176450252532959, + "sft_loss": 1.3947023153305054, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 6.993025606054764, + "learning_rate": 2.1208039169923122e-07, + "logits/chosen": -0.7362990975379944, + "logits/rejected": -0.6220189929008484, + "logps/chosen": -1.3531649112701416, + "logps/rejected": -2.3682894706726074, + "loss": 0.875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3531649112701416, + "rewards/margins": 1.0151245594024658, + "rewards/rejected": -2.3682894706726074, + "sft_loss": 1.3880125284194946, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 8.38125688208826, + "learning_rate": 2.096915562895369e-07, + "logits/chosen": -0.6793426275253296, + "logits/rejected": -0.6831785440444946, + "logps/chosen": -1.3633360862731934, + "logps/rejected": -2.381472110748291, + "loss": 0.901, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3633360862731934, + "rewards/margins": 1.0181360244750977, + "rewards/rejected": -2.381472110748291, + "sft_loss": 1.4215288162231445, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 9.175151061347913, + "learning_rate": 2.07315239458023e-07, + "logits/chosen": -0.6697893738746643, + "logits/rejected": -0.520065426826477, + "logps/chosen": -1.3501994609832764, + "logps/rejected": -2.576988458633423, + "loss": 0.8288, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.3501994609832764, + "rewards/margins": 1.2267887592315674, + "rewards/rejected": -2.576988458633423, + "sft_loss": 1.3498144149780273, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 10.016426252351847, + "learning_rate": 2.0495146425972487e-07, + "logits/chosen": -0.7503206133842468, + "logits/rejected": -0.6195517778396606, + "logps/chosen": -1.2370802164077759, + "logps/rejected": -2.4151086807250977, + "loss": 0.8469, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2370802164077759, + "rewards/margins": 1.1780284643173218, + "rewards/rejected": -2.4151086807250977, + "sft_loss": 1.3105494976043701, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 12.292841806081597, + "learning_rate": 2.0260025362800078e-07, + "logits/chosen": -0.7724018096923828, + "logits/rejected": -0.6956478357315063, + "logps/chosen": -1.251227617263794, + "logps/rejected": -2.305633068084717, + "loss": 0.8592, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.251227617263794, + "rewards/margins": 1.0544054508209229, + "rewards/rejected": -2.305633068084717, + "sft_loss": 1.3491507768630981, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 16.015556311358075, + "learning_rate": 2.002616303743059e-07, + "logits/chosen": -0.7809301614761353, + "logits/rejected": -0.6208760142326355, + "logps/chosen": -1.3786977529525757, + "logps/rejected": -2.4913034439086914, + "loss": 0.8944, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3786977529525757, + "rewards/margins": 1.1126058101654053, + "rewards/rejected": -2.4913034439086914, + "sft_loss": 1.4251824617385864, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 8.73037919482241, + "learning_rate": 1.979356171879738e-07, + "logits/chosen": -0.7047054767608643, + "logits/rejected": -0.6488312482833862, + "logps/chosen": -1.3484723567962646, + "logps/rejected": -2.4292967319488525, + "loss": 0.8628, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3484723567962646, + "rewards/margins": 1.0808244943618774, + "rewards/rejected": -2.4292967319488525, + "sft_loss": 1.3975765705108643, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 11.96525948420899, + "learning_rate": 1.9562223663599399e-07, + "logits/chosen": -0.6062943339347839, + "logits/rejected": -0.5331624150276184, + "logps/chosen": -1.3019130229949951, + "logps/rejected": -2.3833680152893066, + "loss": 0.8578, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3019130229949951, + "rewards/margins": 1.0814554691314697, + "rewards/rejected": -2.3833680152893066, + "sft_loss": 1.309452772140503, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 19.670502841219136, + "learning_rate": 1.9332151116279557e-07, + "logits/chosen": -0.7169122695922852, + "logits/rejected": -0.6609517335891724, + "logps/chosen": -1.3092458248138428, + "logps/rejected": -2.282841920852661, + "loss": 0.8788, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3092458248138428, + "rewards/margins": 0.9735962748527527, + "rewards/rejected": -2.282841920852661, + "sft_loss": 1.3545231819152832, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 8.73783859329783, + "learning_rate": 1.9103346309002623e-07, + "logits/chosen": -0.6947210431098938, + "logits/rejected": -0.6709440350532532, + "logps/chosen": -1.3139759302139282, + "logps/rejected": -2.1969356536865234, + "loss": 0.9067, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3139759302139282, + "rewards/margins": 0.8829599618911743, + "rewards/rejected": -2.1969356536865234, + "sft_loss": 1.3645445108413696, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 8.28415692660026, + "learning_rate": 1.887581146163394e-07, + "logits/chosen": -0.7579419016838074, + "logits/rejected": -0.6561330556869507, + "logps/chosen": -1.3163559436798096, + "logps/rejected": -2.490429401397705, + "loss": 0.8568, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3163559436798096, + "rewards/margins": 1.174073576927185, + "rewards/rejected": -2.490429401397705, + "sft_loss": 1.350839376449585, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 9.467127267461985, + "learning_rate": 1.8649548781717506e-07, + "logits/chosen": -0.6651458740234375, + "logits/rejected": -0.6147671937942505, + "logps/chosen": -1.3289358615875244, + "logps/rejected": -2.270233392715454, + "loss": 0.8755, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3289358615875244, + "rewards/margins": 0.9412969350814819, + "rewards/rejected": -2.270233392715454, + "sft_loss": 1.3590539693832397, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 6.597920954146079, + "learning_rate": 1.8424560464454891e-07, + "logits/chosen": -0.7337859869003296, + "logits/rejected": -0.6585230827331543, + "logps/chosen": -1.2692844867706299, + "logps/rejected": -2.1357951164245605, + "loss": 0.8747, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2692844867706299, + "rewards/margins": 0.8665107488632202, + "rewards/rejected": -2.1357951164245605, + "sft_loss": 1.3429874181747437, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": -0.44644811749458313, + "eval_logits/rejected": -0.39516207575798035, + "eval_logps/chosen": -1.5252902507781982, + "eval_logps/rejected": -2.175719976425171, + "eval_loss": 1.0419161319732666, + "eval_rewards/accuracies": 0.6454005837440491, + "eval_rewards/chosen": -1.5252902507781982, + "eval_rewards/margins": 0.6504298448562622, + "eval_rewards/rejected": -2.175719976425171, + "eval_runtime": 44.1168, + "eval_samples_per_second": 30.487, + "eval_sft_loss": 1.4755855798721313, + "eval_steps_per_second": 7.639, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 15.103542790559718, + "learning_rate": 1.820084869268369e-07, + "logits/chosen": -0.7690774202346802, + "logits/rejected": -0.7059490084648132, + "logps/chosen": -1.3684237003326416, + "logps/rejected": -2.373002052307129, + "loss": 0.8923, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3684237003326416, + "rewards/margins": 1.0045783519744873, + "rewards/rejected": -2.373002052307129, + "sft_loss": 1.4043967723846436, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 10.299547408906609, + "learning_rate": 1.7978415636856571e-07, + "logits/chosen": -0.6917232871055603, + "logits/rejected": -0.6187536716461182, + "logps/chosen": -1.3753135204315186, + "logps/rejected": -2.3357646465301514, + "loss": 0.9299, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3753135204315186, + "rewards/margins": 0.9604509472846985, + "rewards/rejected": -2.3357646465301514, + "sft_loss": 1.4127343893051147, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 11.14236285917446, + "learning_rate": 1.7757263455019906e-07, + "logits/chosen": -0.6763869524002075, + "logits/rejected": -0.5861636400222778, + "logps/chosen": -1.204840064048767, + "logps/rejected": -2.362485885620117, + "loss": 0.8418, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.204840064048767, + "rewards/margins": 1.1576459407806396, + "rewards/rejected": -2.362485885620117, + "sft_loss": 1.283821702003479, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 7.259647243572955, + "learning_rate": 1.7537394292793245e-07, + "logits/chosen": -0.6820626258850098, + "logits/rejected": -0.611841082572937, + "logps/chosen": -1.3615658283233643, + "logps/rejected": -2.250566005706787, + "loss": 0.9006, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3615658283233643, + "rewards/margins": 0.8890002965927124, + "rewards/rejected": -2.250566005706787, + "sft_loss": 1.3739036321640015, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 9.838289890502702, + "learning_rate": 1.731881028334808e-07, + "logits/chosen": -0.6777626872062683, + "logits/rejected": -0.5994135141372681, + "logps/chosen": -1.3140347003936768, + "logps/rejected": -2.183943748474121, + "loss": 0.8925, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3140347003936768, + "rewards/margins": 0.8699088096618652, + "rewards/rejected": -2.183943748474121, + "sft_loss": 1.322758436203003, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 9.648523193237793, + "learning_rate": 1.7101513547387487e-07, + "logits/chosen": -0.7211915850639343, + "logits/rejected": -0.620406448841095, + "logps/chosen": -1.3341495990753174, + "logps/rejected": -2.2978355884552, + "loss": 0.8812, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3341495990753174, + "rewards/margins": 0.9636863470077515, + "rewards/rejected": -2.2978355884552, + "sft_loss": 1.3428423404693604, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 12.784087079857935, + "learning_rate": 1.6885506193125306e-07, + "logits/chosen": -0.8385518193244934, + "logits/rejected": -0.6945291757583618, + "logps/chosen": -1.3321683406829834, + "logps/rejected": -2.4906907081604004, + "loss": 0.8596, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3321683406829834, + "rewards/margins": 1.158522367477417, + "rewards/rejected": -2.4906907081604004, + "sft_loss": 1.3666893243789673, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 11.568765499261328, + "learning_rate": 1.667079031626591e-07, + "logits/chosen": -0.7675267457962036, + "logits/rejected": -0.6532658338546753, + "logps/chosen": -1.3479506969451904, + "logps/rejected": -2.358936309814453, + "loss": 0.8862, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3479506969451904, + "rewards/margins": 1.0109856128692627, + "rewards/rejected": -2.358936309814453, + "sft_loss": 1.4017937183380127, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 9.203820574490626, + "learning_rate": 1.6457367999983568e-07, + "logits/chosen": -0.7395308613777161, + "logits/rejected": -0.6636630892753601, + "logps/chosen": -1.3089097738265991, + "logps/rejected": -2.2758500576019287, + "loss": 0.8811, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3089097738265991, + "rewards/margins": 0.9669402241706848, + "rewards/rejected": -2.2758500576019287, + "sft_loss": 1.4001891613006592, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 12.680296874297454, + "learning_rate": 1.6245241314902604e-07, + "logits/chosen": -0.8743961453437805, + "logits/rejected": -0.7420130968093872, + "logps/chosen": -1.295114278793335, + "logps/rejected": -2.3928744792938232, + "loss": 0.8639, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.295114278793335, + "rewards/margins": 1.097760558128357, + "rewards/rejected": -2.3928744792938232, + "sft_loss": 1.3053842782974243, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 7.991829421367602, + "learning_rate": 1.6034412319077008e-07, + "logits/chosen": -0.6992126703262329, + "logits/rejected": -0.5952213406562805, + "logps/chosen": -1.2627414464950562, + "logps/rejected": -2.328612804412842, + "loss": 0.8834, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2627414464950562, + "rewards/margins": 1.065871238708496, + "rewards/rejected": -2.328612804412842, + "sft_loss": 1.3418805599212646, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 8.77204071833793, + "learning_rate": 1.582488305797068e-07, + "logits/chosen": -0.7220746278762817, + "logits/rejected": -0.6973734498023987, + "logps/chosen": -1.2161717414855957, + "logps/rejected": -2.3466389179229736, + "loss": 0.7956, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2161717414855957, + "rewards/margins": 1.1304669380187988, + "rewards/rejected": -2.3466389179229736, + "sft_loss": 1.2758996486663818, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 8.753318218697338, + "learning_rate": 1.5616655564437354e-07, + "logits/chosen": -0.8045107126235962, + "logits/rejected": -0.7597325444221497, + "logps/chosen": -1.3217512369155884, + "logps/rejected": -2.395482301712036, + "loss": 0.8733, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3217512369155884, + "rewards/margins": 1.0737309455871582, + "rewards/rejected": -2.395482301712036, + "sft_loss": 1.347623586654663, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 7.614181712513337, + "learning_rate": 1.5409731858701154e-07, + "logits/chosen": -0.6682974696159363, + "logits/rejected": -0.5954986214637756, + "logps/chosen": -1.2902181148529053, + "logps/rejected": -2.3482024669647217, + "loss": 0.853, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2902181148529053, + "rewards/margins": 1.0579843521118164, + "rewards/rejected": -2.3482024669647217, + "sft_loss": 1.3140208721160889, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 9.475477603706777, + "learning_rate": 1.5204113948336717e-07, + "logits/chosen": -0.5529733896255493, + "logits/rejected": -0.4986020028591156, + "logps/chosen": -1.2315846681594849, + "logps/rejected": -2.5735256671905518, + "loss": 0.8204, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2315846681594849, + "rewards/margins": 1.341940999031067, + "rewards/rejected": -2.5735256671905518, + "sft_loss": 1.3268613815307617, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 7.4222452619037576, + "learning_rate": 1.499980382824997e-07, + "logits/chosen": -0.6061524748802185, + "logits/rejected": -0.5214860439300537, + "logps/chosen": -1.258568525314331, + "logps/rejected": -2.432037830352783, + "loss": 0.8569, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.258568525314331, + "rewards/margins": 1.1734689474105835, + "rewards/rejected": -2.432037830352783, + "sft_loss": 1.3374583721160889, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 11.640054133774896, + "learning_rate": 1.479680348065855e-07, + "logits/chosen": -0.6220036149024963, + "logits/rejected": -0.5831924676895142, + "logps/chosen": -1.4027297496795654, + "logps/rejected": -2.553523540496826, + "loss": 0.9172, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4027297496795654, + "rewards/margins": 1.1507937908172607, + "rewards/rejected": -2.553523540496826, + "sft_loss": 1.530013084411621, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 15.846563432651616, + "learning_rate": 1.4595114875072762e-07, + "logits/chosen": -0.7990840673446655, + "logits/rejected": -0.6822538375854492, + "logps/chosen": -1.3178495168685913, + "logps/rejected": -2.345160722732544, + "loss": 0.8832, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3178495168685913, + "rewards/margins": 1.027311086654663, + "rewards/rejected": -2.345160722732544, + "sft_loss": 1.3916982412338257, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 7.443163924141347, + "learning_rate": 1.4394739968276293e-07, + "logits/chosen": -0.7228940725326538, + "logits/rejected": -0.6642226576805115, + "logps/chosen": -1.3509809970855713, + "logps/rejected": -2.127976179122925, + "loss": 0.9536, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3509809970855713, + "rewards/margins": 0.7769953012466431, + "rewards/rejected": -2.127976179122925, + "sft_loss": 1.4385178089141846, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 7.294244692854026, + "learning_rate": 1.4195680704307405e-07, + "logits/chosen": -0.6522566676139832, + "logits/rejected": -0.5300472974777222, + "logps/chosen": -1.2895146608352661, + "logps/rejected": -2.4499125480651855, + "loss": 0.8536, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2895146608352661, + "rewards/margins": 1.1603978872299194, + "rewards/rejected": -2.4499125480651855, + "sft_loss": 1.351528525352478, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 17.234232738029313, + "learning_rate": 1.3997939014439926e-07, + "logits/chosen": -0.7080479860305786, + "logits/rejected": -0.5974324941635132, + "logps/chosen": -1.382468819618225, + "logps/rejected": -2.398974895477295, + "loss": 0.9083, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.382468819618225, + "rewards/margins": 1.0165059566497803, + "rewards/rejected": -2.398974895477295, + "sft_loss": 1.4537564516067505, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 9.92835117922593, + "learning_rate": 1.380151681716465e-07, + "logits/chosen": -0.7300471067428589, + "logits/rejected": -0.7779293060302734, + "logps/chosen": -1.3747262954711914, + "logps/rejected": -2.7321619987487793, + "loss": 0.871, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3747262954711914, + "rewards/margins": 1.3574353456497192, + "rewards/rejected": -2.7321619987487793, + "sft_loss": 1.411215901374817, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 10.59200185807618, + "learning_rate": 1.3606416018170502e-07, + "logits/chosen": -0.6524641513824463, + "logits/rejected": -0.5611596703529358, + "logps/chosen": -1.2252461910247803, + "logps/rejected": -2.2592012882232666, + "loss": 0.848, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2252461910247803, + "rewards/margins": 1.0339548587799072, + "rewards/rejected": -2.2592012882232666, + "sft_loss": 1.3374149799346924, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 6.598038307883381, + "learning_rate": 1.3412638510326397e-07, + "logits/chosen": -0.6867167949676514, + "logits/rejected": -0.6597640514373779, + "logps/chosen": -1.3008304834365845, + "logps/rejected": -2.2904257774353027, + "loss": 0.8999, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3008304834365845, + "rewards/margins": 0.9895952939987183, + "rewards/rejected": -2.2904257774353027, + "sft_loss": 1.379055380821228, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 8.580472421138872, + "learning_rate": 1.3220186173662462e-07, + "logits/chosen": -0.8828998804092407, + "logits/rejected": -0.6815120577812195, + "logps/chosen": -1.2868249416351318, + "logps/rejected": -2.482501268386841, + "loss": 0.8522, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2868249416351318, + "rewards/margins": 1.1956764459609985, + "rewards/rejected": -2.482501268386841, + "sft_loss": 1.3695834875106812, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 8.937724503158526, + "learning_rate": 1.30290608753522e-07, + "logits/chosen": -0.6469255685806274, + "logits/rejected": -0.5224730968475342, + "logps/chosen": -1.3711665868759155, + "logps/rejected": -2.592311382293701, + "loss": 0.8565, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3711665868759155, + "rewards/margins": 1.2211449146270752, + "rewards/rejected": -2.592311382293701, + "sft_loss": 1.385009527206421, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 12.928969840134284, + "learning_rate": 1.2839264469694039e-07, + "logits/chosen": -0.7605575323104858, + "logits/rejected": -0.6379026174545288, + "logps/chosen": -1.3274918794631958, + "logps/rejected": -2.2872328758239746, + "loss": 0.9115, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3274918794631958, + "rewards/margins": 0.9597407579421997, + "rewards/rejected": -2.2872328758239746, + "sft_loss": 1.38313627243042, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 8.512784932615155, + "learning_rate": 1.2650798798093577e-07, + "logits/chosen": -0.7268552780151367, + "logits/rejected": -0.6888422966003418, + "logps/chosen": -1.306331992149353, + "logps/rejected": -2.0488409996032715, + "loss": 0.9097, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.306331992149353, + "rewards/margins": 0.742509126663208, + "rewards/rejected": -2.0488409996032715, + "sft_loss": 1.3654850721359253, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 6.309196448048183, + "learning_rate": 1.2463665689045533e-07, + "logits/chosen": -0.7216583490371704, + "logits/rejected": -0.5565370321273804, + "logps/chosen": -1.2761034965515137, + "logps/rejected": -2.5478756427764893, + "loss": 0.8398, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2761034965515137, + "rewards/margins": 1.271771788597107, + "rewards/rejected": -2.5478756427764893, + "sft_loss": 1.3512345552444458, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 9.598407648987838, + "learning_rate": 1.2277866958116207e-07, + "logits/chosen": -0.7417315244674683, + "logits/rejected": -0.6495709419250488, + "logps/chosen": -1.3259822130203247, + "logps/rejected": -2.192164421081543, + "loss": 0.8991, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3259822130203247, + "rewards/margins": 0.8661823272705078, + "rewards/rejected": -2.192164421081543, + "sft_loss": 1.3365287780761719, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 7.078745272401252, + "learning_rate": 1.2093404407925668e-07, + "logits/chosen": -0.7276274561882019, + "logits/rejected": -0.7417802214622498, + "logps/chosen": -1.3177086114883423, + "logps/rejected": -2.2994585037231445, + "loss": 0.8947, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3177086114883423, + "rewards/margins": 0.9817501306533813, + "rewards/rejected": -2.2994585037231445, + "sft_loss": 1.3993942737579346, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 8.760101354046029, + "learning_rate": 1.1910279828130405e-07, + "logits/chosen": -0.623112678527832, + "logits/rejected": -0.5573943853378296, + "logps/chosen": -1.2637913227081299, + "logps/rejected": -2.2288098335266113, + "loss": 0.8648, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2637913227081299, + "rewards/margins": 0.9650182723999023, + "rewards/rejected": -2.2288098335266113, + "sft_loss": 1.303546667098999, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 9.425834805164445, + "learning_rate": 1.1728494995405876e-07, + "logits/chosen": -0.7425374984741211, + "logits/rejected": -0.6649678945541382, + "logps/chosen": -1.2058379650115967, + "logps/rejected": -2.2518324851989746, + "loss": 0.8527, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2058379650115967, + "rewards/margins": 1.0459946393966675, + "rewards/rejected": -2.2518324851989746, + "sft_loss": 1.3115302324295044, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 8.040153775855304, + "learning_rate": 1.1548051673429366e-07, + "logits/chosen": -0.6462007164955139, + "logits/rejected": -0.6145657896995544, + "logps/chosen": -1.228861927986145, + "logps/rejected": -2.3314716815948486, + "loss": 0.8266, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.228861927986145, + "rewards/margins": 1.1026098728179932, + "rewards/rejected": -2.3314716815948486, + "sft_loss": 1.2801616191864014, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 10.12979124482581, + "learning_rate": 1.136895161286271e-07, + "logits/chosen": -0.704356849193573, + "logits/rejected": -0.6970897912979126, + "logps/chosen": -1.3229091167449951, + "logps/rejected": -2.3172924518585205, + "loss": 0.87, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3229091167449951, + "rewards/margins": 0.9943831562995911, + "rewards/rejected": -2.3172924518585205, + "sft_loss": 1.3527791500091553, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 9.107056984707933, + "learning_rate": 1.1191196551335547e-07, + "logits/chosen": -0.6133028864860535, + "logits/rejected": -0.5777004957199097, + "logps/chosen": -1.442797303199768, + "logps/rejected": -2.3889315128326416, + "loss": 0.9202, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.442797303199768, + "rewards/margins": 0.9461342096328735, + "rewards/rejected": -2.3889315128326416, + "sft_loss": 1.3789355754852295, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 10.841673758953423, + "learning_rate": 1.1014788213428206e-07, + "logits/chosen": -0.681205153465271, + "logits/rejected": -0.5534544587135315, + "logps/chosen": -1.2655446529388428, + "logps/rejected": -2.481712818145752, + "loss": 0.8421, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2655446529388428, + "rewards/margins": 1.2161678075790405, + "rewards/rejected": -2.481712818145752, + "sft_loss": 1.3065024614334106, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 8.353892759307273, + "learning_rate": 1.08397283106552e-07, + "logits/chosen": -0.8035761713981628, + "logits/rejected": -0.6475404500961304, + "logps/chosen": -1.2617781162261963, + "logps/rejected": -2.3890726566314697, + "loss": 0.8401, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2617781162261963, + "rewards/margins": 1.1272943019866943, + "rewards/rejected": -2.3890726566314697, + "sft_loss": 1.3372514247894287, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 11.899115678259255, + "learning_rate": 1.0666018541448442e-07, + "logits/chosen": -0.7440560460090637, + "logits/rejected": -0.7424860000610352, + "logps/chosen": -1.3631649017333984, + "logps/rejected": -2.1925854682922363, + "loss": 0.9325, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3631649017333984, + "rewards/margins": 0.8294209241867065, + "rewards/rejected": -2.1925854682922363, + "sft_loss": 1.4264438152313232, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 8.637571793111615, + "learning_rate": 1.0493660591140919e-07, + "logits/chosen": -0.7476707100868225, + "logits/rejected": -0.7413309216499329, + "logps/chosen": -1.3109577894210815, + "logps/rejected": -2.4442172050476074, + "loss": 0.8531, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3109577894210815, + "rewards/margins": 1.133259654045105, + "rewards/rejected": -2.4442172050476074, + "sft_loss": 1.3718430995941162, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 6.938299568282176, + "learning_rate": 1.0322656131950165e-07, + "logits/chosen": -0.6714180707931519, + "logits/rejected": -0.6517351865768433, + "logps/chosen": -1.2863237857818604, + "logps/rejected": -2.285841703414917, + "loss": 0.8472, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -1.2863237857818604, + "rewards/margins": 0.999518096446991, + "rewards/rejected": -2.285841703414917, + "sft_loss": 1.3182358741760254, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 9.804769515443496, + "learning_rate": 1.0153006822962246e-07, + "logits/chosen": -0.6418807506561279, + "logits/rejected": -0.6069843173027039, + "logps/chosen": -1.4389764070510864, + "logps/rejected": -2.4578685760498047, + "loss": 0.9649, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4389764070510864, + "rewards/margins": 1.0188924074172974, + "rewards/rejected": -2.4578685760498047, + "sft_loss": 1.4984257221221924, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 10.572158439743552, + "learning_rate": 9.984714310115434e-08, + "logits/chosen": -0.7636522650718689, + "logits/rejected": -0.72252357006073, + "logps/chosen": -1.4187383651733398, + "logps/rejected": -2.4914615154266357, + "loss": 0.8708, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4187383651733398, + "rewards/margins": 1.072723150253296, + "rewards/rejected": -2.4914615154266357, + "sft_loss": 1.3292744159698486, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 23.55508226743853, + "learning_rate": 9.817780226184509e-08, + "logits/chosen": -0.7631696462631226, + "logits/rejected": -0.6109831929206848, + "logps/chosen": -1.2823731899261475, + "logps/rejected": -2.268428087234497, + "loss": 0.8635, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2823731899261475, + "rewards/margins": 0.9860547780990601, + "rewards/rejected": -2.268428087234497, + "sft_loss": 1.319327712059021, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 12.154954085554968, + "learning_rate": 9.652206190764611e-08, + "logits/chosen": -0.7588266134262085, + "logits/rejected": -0.6622456312179565, + "logps/chosen": -1.321897268295288, + "logps/rejected": -2.219815254211426, + "loss": 0.8903, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.321897268295288, + "rewards/margins": 0.8979180455207825, + "rewards/rejected": -2.219815254211426, + "sft_loss": 1.3334851264953613, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 6.100600505123231, + "learning_rate": 9.487993810255823e-08, + "logits/chosen": -0.7056714296340942, + "logits/rejected": -0.6703065633773804, + "logps/chosen": -1.3172643184661865, + "logps/rejected": -2.510986566543579, + "loss": 0.8796, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3172643184661865, + "rewards/margins": 1.1937217712402344, + "rewards/rejected": -2.510986566543579, + "sft_loss": 1.3384928703308105, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 9.24875569951955, + "learning_rate": 9.325144677847325e-08, + "logits/chosen": -0.7563332915306091, + "logits/rejected": -0.6742985248565674, + "logps/chosen": -1.351980447769165, + "logps/rejected": -2.3332836627960205, + "loss": 0.8956, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.351980447769165, + "rewards/margins": 0.9813033938407898, + "rewards/rejected": -2.3332836627960205, + "sft_loss": 1.4207156896591187, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 8.06269783556905, + "learning_rate": 9.163660373502158e-08, + "logits/chosen": -0.5251580476760864, + "logits/rejected": -0.5942437648773193, + "logps/chosen": -1.333310604095459, + "logps/rejected": -2.270529270172119, + "loss": 0.8927, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.333310604095459, + "rewards/margins": 0.9372186660766602, + "rewards/rejected": -2.270529270172119, + "sft_loss": 1.3476473093032837, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 15.59237219629854, + "learning_rate": 9.003542463941711e-08, + "logits/chosen": -0.6538597941398621, + "logits/rejected": -0.6683381199836731, + "logps/chosen": -1.2633765935897827, + "logps/rejected": -2.2634286880493164, + "loss": 0.8548, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2633765935897827, + "rewards/margins": 1.0000520944595337, + "rewards/rejected": -2.2634286880493164, + "sft_loss": 1.3013274669647217, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 8.31848775007685, + "learning_rate": 8.844792502630705e-08, + "logits/chosen": -0.7369416356086731, + "logits/rejected": -0.682004988193512, + "logps/chosen": -1.1934149265289307, + "logps/rejected": -2.214322328567505, + "loss": 0.8138, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.1934149265289307, + "rewards/margins": 1.0209074020385742, + "rewards/rejected": -2.214322328567505, + "sft_loss": 1.252170205116272, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 7.300347450169716, + "learning_rate": 8.687412029761866e-08, + "logits/chosen": -0.8128924369812012, + "logits/rejected": -0.7599584460258484, + "logps/chosen": -1.2087875604629517, + "logps/rejected": -2.244777202606201, + "loss": 0.843, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2087875604629517, + "rewards/margins": 1.03598952293396, + "rewards/rejected": -2.244777202606201, + "sft_loss": 1.281432867050171, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 8.97940117033596, + "learning_rate": 8.531402572241325e-08, + "logits/chosen": -0.7046575546264648, + "logits/rejected": -0.669869065284729, + "logps/chosen": -1.2762138843536377, + "logps/rejected": -2.200396776199341, + "loss": 0.9019, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2762138843536377, + "rewards/margins": 0.9241830110549927, + "rewards/rejected": -2.200396776199341, + "sft_loss": 1.3880784511566162, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 9.349225873443023, + "learning_rate": 8.376765643673462e-08, + "logits/chosen": -0.737822413444519, + "logits/rejected": -0.5608315467834473, + "logps/chosen": -1.3172581195831299, + "logps/rejected": -2.219006061553955, + "loss": 0.8674, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3172581195831299, + "rewards/margins": 0.90174800157547, + "rewards/rejected": -2.219006061553955, + "sft_loss": 1.3546133041381836, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 13.166356373380424, + "learning_rate": 8.223502744346484e-08, + "logits/chosen": -0.6462982892990112, + "logits/rejected": -0.5416436791419983, + "logps/chosen": -1.232508897781372, + "logps/rejected": -2.0593743324279785, + "loss": 0.871, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.232508897781372, + "rewards/margins": 0.8268651962280273, + "rewards/rejected": -2.0593743324279785, + "sft_loss": 1.2882734537124634, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 11.578676537207762, + "learning_rate": 8.071615361217648e-08, + "logits/chosen": -0.6779338121414185, + "logits/rejected": -0.6354584097862244, + "logps/chosen": -1.2588183879852295, + "logps/rejected": -1.9801623821258545, + "loss": 0.9133, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2588183879852295, + "rewards/margins": 0.7213441133499146, + "rewards/rejected": -1.9801623821258545, + "sft_loss": 1.3248854875564575, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 11.088766453636934, + "learning_rate": 7.92110496789909e-08, + "logits/chosen": -0.7551140189170837, + "logits/rejected": -0.6507441997528076, + "logps/chosen": -1.3093791007995605, + "logps/rejected": -2.214526414871216, + "loss": 0.8765, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3093791007995605, + "rewards/margins": 0.9051472544670105, + "rewards/rejected": -2.214526414871216, + "sft_loss": 1.3492542505264282, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 32.57712911465191, + "learning_rate": 7.771973024643241e-08, + "logits/chosen": -0.7945700287818909, + "logits/rejected": -0.7233412861824036, + "logps/chosen": -1.2418968677520752, + "logps/rejected": -2.426748037338257, + "loss": 0.8114, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2418968677520752, + "rewards/margins": 1.184851050376892, + "rewards/rejected": -2.426748037338257, + "sft_loss": 1.2658183574676514, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 11.21305824615193, + "learning_rate": 7.624220978328905e-08, + "logits/chosen": -0.8018733859062195, + "logits/rejected": -0.6985114216804504, + "logps/chosen": -1.3311643600463867, + "logps/rejected": -2.403780221939087, + "loss": 0.8759, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3311643600463867, + "rewards/margins": 1.0726158618927002, + "rewards/rejected": -2.403780221939087, + "sft_loss": 1.3794212341308594, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 13.044135461749839, + "learning_rate": 7.477850262447056e-08, + "logits/chosen": -0.8470147252082825, + "logits/rejected": -0.6896313428878784, + "logps/chosen": -1.2608357667922974, + "logps/rejected": -2.455876111984253, + "loss": 0.8377, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2608357667922974, + "rewards/margins": 1.1950404644012451, + "rewards/rejected": -2.455876111984253, + "sft_loss": 1.3385980129241943, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 8.305080535633154, + "learning_rate": 7.332862297087073e-08, + "logits/chosen": -0.6654651761054993, + "logits/rejected": -0.5675476789474487, + "logps/chosen": -1.2314693927764893, + "logps/rejected": -2.658346652984619, + "loss": 0.8194, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2314693927764893, + "rewards/margins": 1.4268771409988403, + "rewards/rejected": -2.658346652984619, + "sft_loss": 1.2853989601135254, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 10.450685745088888, + "learning_rate": 7.189258488922768e-08, + "logits/chosen": -0.6975101232528687, + "logits/rejected": -0.5831706523895264, + "logps/chosen": -1.3284635543823242, + "logps/rejected": -2.352112293243408, + "loss": 0.8555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3284635543823242, + "rewards/margins": 1.023648738861084, + "rewards/rejected": -2.352112293243408, + "sft_loss": 1.3477023839950562, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 12.362442246584587, + "learning_rate": 7.047040231198959e-08, + "logits/chosen": -0.7626334428787231, + "logits/rejected": -0.6651133298873901, + "logps/chosen": -1.2891103029251099, + "logps/rejected": -2.2566936016082764, + "loss": 0.8845, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2891103029251099, + "rewards/margins": 0.9675832986831665, + "rewards/rejected": -2.2566936016082764, + "sft_loss": 1.3317053318023682, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 7.641707637535059, + "learning_rate": 6.906208903717787e-08, + "logits/chosen": -0.8081218004226685, + "logits/rejected": -0.6535941362380981, + "logps/chosen": -1.2907568216323853, + "logps/rejected": -2.467261552810669, + "loss": 0.8587, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2907568216323853, + "rewards/margins": 1.1765047311782837, + "rewards/rejected": -2.467261552810669, + "sft_loss": 1.351701021194458, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 9.531204664336729, + "learning_rate": 6.76676587282542e-08, + "logits/chosen": -0.7317078709602356, + "logits/rejected": -0.7078748345375061, + "logps/chosen": -1.4110424518585205, + "logps/rejected": -2.46642804145813, + "loss": 0.8744, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4110424518585205, + "rewards/margins": 1.0553853511810303, + "rewards/rejected": -2.46642804145813, + "sft_loss": 1.419112205505371, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 13.469773428881165, + "learning_rate": 6.628712491398736e-08, + "logits/chosen": -0.8352164030075073, + "logits/rejected": -0.6624903678894043, + "logps/chosen": -1.2661771774291992, + "logps/rejected": -2.2277333736419678, + "loss": 0.8784, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2661771774291992, + "rewards/margins": 0.9615561366081238, + "rewards/rejected": -2.2277333736419678, + "sft_loss": 1.3674657344818115, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 9.16478362847294, + "learning_rate": 6.492050098832281e-08, + "logits/chosen": -0.8760141134262085, + "logits/rejected": -0.7233066558837891, + "logps/chosen": -1.2933028936386108, + "logps/rejected": -2.4618935585021973, + "loss": 0.8477, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2933028936386108, + "rewards/margins": 1.1685909032821655, + "rewards/rejected": -2.4618935585021973, + "sft_loss": 1.3852190971374512, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 13.989699607246886, + "learning_rate": 6.356780021025161e-08, + "logits/chosen": -0.6575424075126648, + "logits/rejected": -0.6311792135238647, + "logps/chosen": -1.3274948596954346, + "logps/rejected": -2.2456724643707275, + "loss": 0.8865, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3274948596954346, + "rewards/margins": 0.9181777238845825, + "rewards/rejected": -2.2456724643707275, + "sft_loss": 1.379867434501648, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 7.570676290771131, + "learning_rate": 6.222903570368288e-08, + "logits/chosen": -0.6666676998138428, + "logits/rejected": -0.5848277807235718, + "logps/chosen": -1.3754925727844238, + "logps/rejected": -2.2089810371398926, + "loss": 0.9377, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3754925727844238, + "rewards/margins": 0.8334886431694031, + "rewards/rejected": -2.2089810371398926, + "sft_loss": 1.4269959926605225, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 10.834118294330352, + "learning_rate": 6.090422045731525e-08, + "logits/chosen": -0.7254561185836792, + "logits/rejected": -0.5795365571975708, + "logps/chosen": -1.2621824741363525, + "logps/rejected": -2.258971691131592, + "loss": 0.8837, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2621824741363525, + "rewards/margins": 0.9967893362045288, + "rewards/rejected": -2.258971691131592, + "sft_loss": 1.325939416885376, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 10.56441198792611, + "learning_rate": 5.9593367324512593e-08, + "logits/chosen": -0.7619807720184326, + "logits/rejected": -0.6812753081321716, + "logps/chosen": -1.2736539840698242, + "logps/rejected": -2.1827409267425537, + "loss": 0.853, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2736539840698242, + "rewards/margins": 0.9090871810913086, + "rewards/rejected": -2.1827409267425537, + "sft_loss": 1.314415454864502, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 7.411699622579818, + "learning_rate": 5.8296489023177305e-08, + "logits/chosen": -0.7957712411880493, + "logits/rejected": -0.7316339015960693, + "logps/chosen": -1.3173013925552368, + "logps/rejected": -2.193631172180176, + "loss": 0.8966, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3173013925552368, + "rewards/margins": 0.8763298988342285, + "rewards/rejected": -2.193631172180176, + "sft_loss": 1.4033677577972412, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 5.6330080840511245, + "learning_rate": 5.7013598135628895e-08, + "logits/chosen": -0.73341304063797, + "logits/rejected": -0.7175203561782837, + "logps/chosen": -1.2750922441482544, + "logps/rejected": -2.3670904636383057, + "loss": 0.8476, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2750922441482544, + "rewards/margins": 1.0919979810714722, + "rewards/rejected": -2.3670904636383057, + "sft_loss": 1.3750401735305786, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 11.765520522447899, + "learning_rate": 5.5744707108479784e-08, + "logits/chosen": -0.7418981790542603, + "logits/rejected": -0.6119092106819153, + "logps/chosen": -1.2623355388641357, + "logps/rejected": -2.2879061698913574, + "loss": 0.8443, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2623355388641357, + "rewards/margins": 1.0255706310272217, + "rewards/rejected": -2.2879061698913574, + "sft_loss": 1.3115851879119873, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 8.656161954299588, + "learning_rate": 5.448982825251686e-08, + "logits/chosen": -0.7418988943099976, + "logits/rejected": -0.6640129089355469, + "logps/chosen": -1.3426659107208252, + "logps/rejected": -2.3372931480407715, + "loss": 0.9129, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3426659107208252, + "rewards/margins": 0.9946274757385254, + "rewards/rejected": -2.3372931480407715, + "sft_loss": 1.417859673500061, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 15.682599029347866, + "learning_rate": 5.324897374257959e-08, + "logits/chosen": -0.7691652774810791, + "logits/rejected": -0.7162500619888306, + "logps/chosen": -1.3604718446731567, + "logps/rejected": -2.3567006587982178, + "loss": 0.9071, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3604718446731567, + "rewards/margins": 0.9962291717529297, + "rewards/rejected": -2.3567006587982178, + "sft_loss": 1.3918625116348267, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 6.815161618210579, + "learning_rate": 5.202215561744461e-08, + "logits/chosen": -0.6741605997085571, + "logits/rejected": -0.6667729616165161, + "logps/chosen": -1.3192849159240723, + "logps/rejected": -2.3011367321014404, + "loss": 0.9035, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3192849159240723, + "rewards/margins": 0.9818517565727234, + "rewards/rejected": -2.3011367321014404, + "sft_loss": 1.397717833518982, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 8.747190731241487, + "learning_rate": 5.080938577970617e-08, + "logits/chosen": -0.7463937997817993, + "logits/rejected": -0.6645978093147278, + "logps/chosen": -1.219977617263794, + "logps/rejected": -2.464294195175171, + "loss": 0.8587, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.219977617263794, + "rewards/margins": 1.2443166971206665, + "rewards/rejected": -2.464294195175171, + "sft_loss": 1.3175201416015625, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 9.688695160055701, + "learning_rate": 4.961067599566305e-08, + "logits/chosen": -0.8435994386672974, + "logits/rejected": -0.7417990565299988, + "logps/chosen": -1.281051754951477, + "logps/rejected": -2.3396692276000977, + "loss": 0.8574, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.281051754951477, + "rewards/margins": 1.0586177110671997, + "rewards/rejected": -2.3396692276000977, + "sft_loss": 1.3897156715393066, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 7.8819553620316025, + "learning_rate": 4.8426037895202277e-08, + "logits/chosen": -0.7271918654441833, + "logits/rejected": -0.6405032873153687, + "logps/chosen": -1.3014874458312988, + "logps/rejected": -2.2847611904144287, + "loss": 0.8617, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3014874458312988, + "rewards/margins": 0.9832738041877747, + "rewards/rejected": -2.2847611904144287, + "sft_loss": 1.3530741930007935, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 11.952336263629748, + "learning_rate": 4.725548297168847e-08, + "logits/chosen": -0.7949420809745789, + "logits/rejected": -0.7019721269607544, + "logps/chosen": -1.2483434677124023, + "logps/rejected": -2.2791337966918945, + "loss": 0.8717, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2483434677124023, + "rewards/margins": 1.0307904481887817, + "rewards/rejected": -2.2791337966918945, + "sft_loss": 1.3429853916168213, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": -0.49635621905326843, + "eval_logits/rejected": -0.4497099220752716, + "eval_logps/chosen": -1.5370229482650757, + "eval_logps/rejected": -2.2062909603118896, + "eval_loss": 1.0438063144683838, + "eval_rewards/accuracies": 0.6468842625617981, + "eval_rewards/chosen": -1.5370229482650757, + "eval_rewards/margins": 0.6692681312561035, + "eval_rewards/rejected": -2.2062909603118896, + "eval_runtime": 42.9859, + "eval_samples_per_second": 31.289, + "eval_sft_loss": 1.4854966402053833, + "eval_steps_per_second": 7.84, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 8.386269839505383, + "learning_rate": 4.609902258185017e-08, + "logits/chosen": -0.6943266987800598, + "logits/rejected": -0.7278541326522827, + "logps/chosen": -1.2799112796783447, + "logps/rejected": -2.147169589996338, + "loss": 0.919, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2799112796783447, + "rewards/margins": 0.8672581911087036, + "rewards/rejected": -2.147169589996338, + "sft_loss": 1.3275152444839478, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 8.29830813055329, + "learning_rate": 4.4956667945671496e-08, + "logits/chosen": -0.7525221109390259, + "logits/rejected": -0.7058395147323608, + "logps/chosen": -1.2669576406478882, + "logps/rejected": -2.4628257751464844, + "loss": 0.8309, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2669576406478882, + "rewards/margins": 1.1958680152893066, + "rewards/rejected": -2.4628257751464844, + "sft_loss": 1.3189142942428589, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 12.635756214928179, + "learning_rate": 4.382843014628168e-08, + "logits/chosen": -0.7203118205070496, + "logits/rejected": -0.6712735891342163, + "logps/chosen": -1.270727515220642, + "logps/rejected": -2.275433301925659, + "loss": 0.8641, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.270727515220642, + "rewards/margins": 1.004705548286438, + "rewards/rejected": -2.275433301925659, + "sft_loss": 1.3368369340896606, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 10.654987854599767, + "learning_rate": 4.271432012984938e-08, + "logits/chosen": -0.7634333372116089, + "logits/rejected": -0.7373124361038208, + "logps/chosen": -1.282454252243042, + "logps/rejected": -2.5014779567718506, + "loss": 0.8665, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.282454252243042, + "rewards/margins": 1.2190238237380981, + "rewards/rejected": -2.5014779567718506, + "sft_loss": 1.385162591934204, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 21.230618820765667, + "learning_rate": 4.1614348705474534e-08, + "logits/chosen": -0.689985454082489, + "logits/rejected": -0.5928815603256226, + "logps/chosen": -1.3926451206207275, + "logps/rejected": -2.5388782024383545, + "loss": 0.8896, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3926451206207275, + "rewards/margins": 1.146233320236206, + "rewards/rejected": -2.5388782024383545, + "sft_loss": 1.416735291481018, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 9.824058265140998, + "learning_rate": 4.052852654508482e-08, + "logits/chosen": -0.8481475710868835, + "logits/rejected": -0.7754641771316528, + "logps/chosen": -1.3186979293823242, + "logps/rejected": -2.2451705932617188, + "loss": 0.8758, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3186979293823242, + "rewards/margins": 0.9264729619026184, + "rewards/rejected": -2.2451705932617188, + "sft_loss": 1.3459546566009521, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 12.835873812310274, + "learning_rate": 3.9456864183331557e-08, + "logits/chosen": -0.7798875570297241, + "logits/rejected": -0.7219721078872681, + "logps/chosen": -1.3479766845703125, + "logps/rejected": -2.30627703666687, + "loss": 0.873, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3479766845703125, + "rewards/margins": 0.9583007097244263, + "rewards/rejected": -2.30627703666687, + "sft_loss": 1.3579721450805664, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 8.458792436519417, + "learning_rate": 3.839937201748744e-08, + "logits/chosen": -0.7576145529747009, + "logits/rejected": -0.5977579951286316, + "logps/chosen": -1.3713600635528564, + "logps/rejected": -2.4814581871032715, + "loss": 0.9009, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3713600635528564, + "rewards/margins": 1.110097885131836, + "rewards/rejected": -2.4814581871032715, + "sft_loss": 1.3729455471038818, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 7.784021685248867, + "learning_rate": 3.735606030734651e-08, + "logits/chosen": -0.7065932154655457, + "logits/rejected": -0.6710800528526306, + "logps/chosen": -1.2697302103042603, + "logps/rejected": -2.2421200275421143, + "loss": 0.897, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2697302103042603, + "rewards/margins": 0.9723899960517883, + "rewards/rejected": -2.2421200275421143, + "sft_loss": 1.3095524311065674, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 15.526328851790549, + "learning_rate": 3.632693917512331e-08, + "logits/chosen": -0.7905303239822388, + "logits/rejected": -0.6953557729721069, + "logps/chosen": -1.36759352684021, + "logps/rejected": -2.4535601139068604, + "loss": 0.932, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.36759352684021, + "rewards/margins": 1.0859668254852295, + "rewards/rejected": -2.4535601139068604, + "sft_loss": 1.415594458580017, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 11.082167378180044, + "learning_rate": 3.531201860535588e-08, + "logits/chosen": -0.7412352561950684, + "logits/rejected": -0.5843526721000671, + "logps/chosen": -1.3602845668792725, + "logps/rejected": -2.3672142028808594, + "loss": 0.8927, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3602845668792725, + "rewards/margins": 1.006929636001587, + "rewards/rejected": -2.3672142028808594, + "sft_loss": 1.3555505275726318, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 12.359836615146241, + "learning_rate": 3.431130844480762e-08, + "logits/chosen": -0.7371433973312378, + "logits/rejected": -0.7097469568252563, + "logps/chosen": -1.3078466653823853, + "logps/rejected": -2.363670825958252, + "loss": 0.8994, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3078466653823853, + "rewards/margins": 1.0558240413665771, + "rewards/rejected": -2.363670825958252, + "sft_loss": 1.4063899517059326, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 8.790703213167431, + "learning_rate": 3.332481840237306e-08, + "logits/chosen": -0.831184983253479, + "logits/rejected": -0.7051724195480347, + "logps/chosen": -1.4858452081680298, + "logps/rejected": -2.5289244651794434, + "loss": 0.935, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4858452081680298, + "rewards/margins": 1.043079137802124, + "rewards/rejected": -2.5289244651794434, + "sft_loss": 1.5154434442520142, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 10.28811582940433, + "learning_rate": 3.235255804898307e-08, + "logits/chosen": -0.7143429517745972, + "logits/rejected": -0.6302151083946228, + "logps/chosen": -1.255330204963684, + "logps/rejected": -2.2524116039276123, + "loss": 0.8299, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.255330204963684, + "rewards/margins": 0.9970816373825073, + "rewards/rejected": -2.2524116039276123, + "sft_loss": 1.288132905960083, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 5.87670585733461, + "learning_rate": 3.1394536817511475e-08, + "logits/chosen": -0.7384647727012634, + "logits/rejected": -0.6223892569541931, + "logps/chosen": -1.3244822025299072, + "logps/rejected": -2.414888381958008, + "loss": 0.845, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3244822025299072, + "rewards/margins": 1.090406060218811, + "rewards/rejected": -2.414888381958008, + "sft_loss": 1.3457294702529907, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 7.378821074283863, + "learning_rate": 3.0450764002684926e-08, + "logits/chosen": -0.6905801296234131, + "logits/rejected": -0.5800179839134216, + "logps/chosen": -1.4378401041030884, + "logps/rejected": -2.594846248626709, + "loss": 0.8967, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4378401041030884, + "rewards/margins": 1.1570061445236206, + "rewards/rejected": -2.594846248626709, + "sft_loss": 1.4353234767913818, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 9.299567439189415, + "learning_rate": 2.9521248760991158e-08, + "logits/chosen": -0.7614275813102722, + "logits/rejected": -0.7028220891952515, + "logps/chosen": -1.2835723161697388, + "logps/rejected": -2.5371744632720947, + "loss": 0.8219, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2835723161697388, + "rewards/margins": 1.253602147102356, + "rewards/rejected": -2.5371744632720947, + "sft_loss": 1.306063175201416, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 8.787277207068174, + "learning_rate": 2.8606000110591224e-08, + "logits/chosen": -0.7085649371147156, + "logits/rejected": -0.61260586977005, + "logps/chosen": -1.3210874795913696, + "logps/rejected": -2.2309603691101074, + "loss": 0.9032, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3210874795913696, + "rewards/margins": 0.9098728895187378, + "rewards/rejected": -2.2309603691101074, + "sft_loss": 1.3892929553985596, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 8.989265420899805, + "learning_rate": 2.770502693123139e-08, + "logits/chosen": -0.7841325998306274, + "logits/rejected": -0.6407720446586609, + "logps/chosen": -1.4084599018096924, + "logps/rejected": -2.4518253803253174, + "loss": 0.9106, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4084599018096924, + "rewards/margins": 1.043365716934204, + "rewards/rejected": -2.4518253803253174, + "sft_loss": 1.4504055976867676, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 7.355372041676527, + "learning_rate": 2.6818337964157726e-08, + "logits/chosen": -0.749289870262146, + "logits/rejected": -0.7474431991577148, + "logps/chosen": -1.3349918127059937, + "logps/rejected": -2.496898889541626, + "loss": 0.8718, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3349918127059937, + "rewards/margins": 1.1619070768356323, + "rewards/rejected": -2.496898889541626, + "sft_loss": 1.3594180345535278, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 10.195419419106704, + "learning_rate": 2.5945941812029973e-08, + "logits/chosen": -0.7281011343002319, + "logits/rejected": -0.6409409642219543, + "logps/chosen": -1.372420310974121, + "logps/rejected": -2.2830145359039307, + "loss": 0.934, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.372420310974121, + "rewards/margins": 0.9105939865112305, + "rewards/rejected": -2.2830145359039307, + "sft_loss": 1.4592607021331787, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 10.851374419235961, + "learning_rate": 2.5087846938839976e-08, + "logits/chosen": -0.8489446640014648, + "logits/rejected": -0.6587695479393005, + "logps/chosen": -1.2906196117401123, + "logps/rejected": -2.5040271282196045, + "loss": 0.863, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2906196117401123, + "rewards/margins": 1.2134075164794922, + "rewards/rejected": -2.5040271282196045, + "sft_loss": 1.3395346403121948, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 10.678429689134765, + "learning_rate": 2.42440616698274e-08, + "logits/chosen": -0.6294640302658081, + "logits/rejected": -0.5675816535949707, + "logps/chosen": -1.2999883890151978, + "logps/rejected": -2.193481922149658, + "loss": 0.8862, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2999883890151978, + "rewards/margins": 0.8934933543205261, + "rewards/rejected": -2.193481922149658, + "sft_loss": 1.374890685081482, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 7.048939436252608, + "learning_rate": 2.3414594191401128e-08, + "logits/chosen": -0.657579779624939, + "logits/rejected": -0.6235214471817017, + "logps/chosen": -1.2714077234268188, + "logps/rejected": -2.1557183265686035, + "loss": 0.8777, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2714077234268188, + "rewards/margins": 0.8843106031417847, + "rewards/rejected": -2.1557183265686035, + "sft_loss": 1.2815296649932861, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 10.24828223806133, + "learning_rate": 2.2599452551057998e-08, + "logits/chosen": -0.6916013956069946, + "logits/rejected": -0.5996135473251343, + "logps/chosen": -1.3933923244476318, + "logps/rejected": -2.5210020542144775, + "loss": 0.8655, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3933923244476318, + "rewards/margins": 1.1276098489761353, + "rewards/rejected": -2.5210020542144775, + "sft_loss": 1.4164988994598389, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 14.831723599703547, + "learning_rate": 2.1798644657305857e-08, + "logits/chosen": -0.6822465062141418, + "logits/rejected": -0.6394556164741516, + "logps/chosen": -1.2614357471466064, + "logps/rejected": -2.4309158325195312, + "loss": 0.8739, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2614357471466064, + "rewards/margins": 1.1694800853729248, + "rewards/rejected": -2.4309158325195312, + "sft_loss": 1.3675239086151123, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 9.26451309003692, + "learning_rate": 2.1012178279586293e-08, + "logits/chosen": -0.6579137444496155, + "logits/rejected": -0.6982561349868774, + "logps/chosen": -1.275101900100708, + "logps/rejected": -2.031686782836914, + "loss": 0.9113, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.275101900100708, + "rewards/margins": 0.7565848231315613, + "rewards/rejected": -2.031686782836914, + "sft_loss": 1.2985403537750244, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 14.334556180897604, + "learning_rate": 2.02400610481997e-08, + "logits/chosen": -0.684406578540802, + "logits/rejected": -0.6869625449180603, + "logps/chosen": -1.3207378387451172, + "logps/rejected": -2.1499147415161133, + "loss": 0.891, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3207378387451172, + "rewards/margins": 0.8291767239570618, + "rewards/rejected": -2.1499147415161133, + "sft_loss": 1.3247309923171997, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 7.025935672166714, + "learning_rate": 1.948230045423083e-08, + "logits/chosen": -0.8215829730033875, + "logits/rejected": -0.6989858150482178, + "logps/chosen": -1.2560803890228271, + "logps/rejected": -2.2751643657684326, + "loss": 0.823, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2560803890228271, + "rewards/margins": 1.0190842151641846, + "rewards/rejected": -2.2751643657684326, + "sft_loss": 1.3089632987976074, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 10.410442099194597, + "learning_rate": 1.8738903849476186e-08, + "logits/chosen": -0.6630287170410156, + "logits/rejected": -0.7192557454109192, + "logps/chosen": -1.3754017353057861, + "logps/rejected": -2.25844669342041, + "loss": 0.9245, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3754017353057861, + "rewards/margins": 0.8830450177192688, + "rewards/rejected": -2.25844669342041, + "sft_loss": 1.3733446598052979, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 21.160878628046465, + "learning_rate": 1.8009878446373083e-08, + "logits/chosen": -0.7304435968399048, + "logits/rejected": -0.6887432336807251, + "logps/chosen": -1.3410518169403076, + "logps/rejected": -2.280390977859497, + "loss": 0.9104, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3410518169403076, + "rewards/margins": 0.9393390417098999, + "rewards/rejected": -2.280390977859497, + "sft_loss": 1.3831353187561035, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 12.445287099306254, + "learning_rate": 1.729523131792887e-08, + "logits/chosen": -0.7037655115127563, + "logits/rejected": -0.5557045936584473, + "logps/chosen": -1.3285108804702759, + "logps/rejected": -2.325253963470459, + "loss": 0.9054, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3285108804702759, + "rewards/margins": 0.996743381023407, + "rewards/rejected": -2.325253963470459, + "sft_loss": 1.4038374423980713, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 8.398774874498331, + "learning_rate": 1.6594969397653316e-08, + "logits/chosen": -0.775101363658905, + "logits/rejected": -0.6764003038406372, + "logps/chosen": -1.3454972505569458, + "logps/rejected": -2.4815573692321777, + "loss": 0.8661, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3454972505569458, + "rewards/margins": 1.136060357093811, + "rewards/rejected": -2.4815573692321777, + "sft_loss": 1.4039502143859863, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 9.084274857994668, + "learning_rate": 1.5909099479490653e-08, + "logits/chosen": -0.6702944040298462, + "logits/rejected": -0.6756909489631653, + "logps/chosen": -1.3259707689285278, + "logps/rejected": -2.08377742767334, + "loss": 0.9366, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3259707689285278, + "rewards/margins": 0.7578068375587463, + "rewards/rejected": -2.08377742767334, + "sft_loss": 1.3615634441375732, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 10.559950123474135, + "learning_rate": 1.5237628217753818e-08, + "logits/chosen": -0.7217191457748413, + "logits/rejected": -0.679119884967804, + "logps/chosen": -1.2826389074325562, + "logps/rejected": -2.4561946392059326, + "loss": 0.8817, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2826389074325562, + "rewards/margins": 1.1735559701919556, + "rewards/rejected": -2.4561946392059326, + "sft_loss": 1.3704030513763428, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 11.42448679848264, + "learning_rate": 1.4580562127059994e-08, + "logits/chosen": -0.7507520914077759, + "logits/rejected": -0.5890139937400818, + "logps/chosen": -1.43178129196167, + "logps/rejected": -2.6365647315979004, + "loss": 0.8829, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.43178129196167, + "rewards/margins": 1.2047832012176514, + "rewards/rejected": -2.6365647315979004, + "sft_loss": 1.4908645153045654, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 7.472431729509608, + "learning_rate": 1.3937907582267151e-08, + "logits/chosen": -0.6748173236846924, + "logits/rejected": -0.6350966095924377, + "logps/chosen": -1.2746866941452026, + "logps/rejected": -2.2813777923583984, + "loss": 0.8562, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2746866941452026, + "rewards/margins": 1.0066910982131958, + "rewards/rejected": -2.2813777923583984, + "sft_loss": 1.3331294059753418, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 11.134637185940472, + "learning_rate": 1.3309670818412446e-08, + "logits/chosen": -0.7103601694107056, + "logits/rejected": -0.6358092427253723, + "logps/chosen": -1.3732722997665405, + "logps/rejected": -2.3026444911956787, + "loss": 0.8945, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3732722997665405, + "rewards/margins": 0.9293721914291382, + "rewards/rejected": -2.3026444911956787, + "sft_loss": 1.4193050861358643, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 9.166135528851378, + "learning_rate": 1.2695857930651921e-08, + "logits/chosen": -0.8693428039550781, + "logits/rejected": -0.7057567834854126, + "logps/chosen": -1.2538936138153076, + "logps/rejected": -2.3252408504486084, + "loss": 0.8164, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.2538936138153076, + "rewards/margins": 1.0713472366333008, + "rewards/rejected": -2.3252408504486084, + "sft_loss": 1.3077678680419922, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 6.076040614686364, + "learning_rate": 1.2096474874200735e-08, + "logits/chosen": -0.7578507661819458, + "logits/rejected": -0.5721181631088257, + "logps/chosen": -1.3199083805084229, + "logps/rejected": -2.6234889030456543, + "loss": 0.8339, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3199083805084229, + "rewards/margins": 1.3035802841186523, + "rewards/rejected": -2.6234889030456543, + "sft_loss": 1.34604811668396, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 8.014889456039318, + "learning_rate": 1.1511527464276194e-08, + "logits/chosen": -0.6812966465950012, + "logits/rejected": -0.6842519640922546, + "logps/chosen": -1.414838194847107, + "logps/rejected": -2.4990859031677246, + "loss": 0.8919, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.414838194847107, + "rewards/margins": 1.0842478275299072, + "rewards/rejected": -2.4990859031677246, + "sft_loss": 1.4632160663604736, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 5.700276972210124, + "learning_rate": 1.0941021376040305e-08, + "logits/chosen": -0.6988664865493774, + "logits/rejected": -0.6308866739273071, + "logps/chosen": -1.3098427057266235, + "logps/rejected": -2.4279820919036865, + "loss": 0.8976, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3098427057266235, + "rewards/margins": 1.1181391477584839, + "rewards/rejected": -2.4279820919036865, + "sft_loss": 1.3720207214355469, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 8.648526708006992, + "learning_rate": 1.0384962144545818e-08, + "logits/chosen": -0.7574768662452698, + "logits/rejected": -0.6414147615432739, + "logps/chosen": -1.3453338146209717, + "logps/rejected": -2.267055034637451, + "loss": 0.9057, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3453338146209717, + "rewards/margins": 0.9217211008071899, + "rewards/rejected": -2.267055034637451, + "sft_loss": 1.4229142665863037, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 9.044442234302466, + "learning_rate": 9.843355164681767e-09, + "logits/chosen": -0.7178055047988892, + "logits/rejected": -0.6718038320541382, + "logps/chosen": -1.2801578044891357, + "logps/rejected": -2.220691680908203, + "loss": 0.8944, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2801578044891357, + "rewards/margins": 0.9405338168144226, + "rewards/rejected": -2.220691680908203, + "sft_loss": 1.3160831928253174, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 9.813349046424374, + "learning_rate": 9.316205691121515e-09, + "logits/chosen": -0.7028988599777222, + "logits/rejected": -0.6428855657577515, + "logps/chosen": -1.3650051355361938, + "logps/rejected": -2.4545657634735107, + "loss": 0.8675, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3650051355361938, + "rewards/margins": 1.0895609855651855, + "rewards/rejected": -2.4545657634735107, + "sft_loss": 1.3854036331176758, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 10.624268559701752, + "learning_rate": 8.803518838271463e-09, + "logits/chosen": -0.7422298192977905, + "logits/rejected": -0.6166383028030396, + "logps/chosen": -1.3253414630889893, + "logps/rejected": -2.4383633136749268, + "loss": 0.84, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.3253414630889893, + "rewards/margins": 1.1130220890045166, + "rewards/rejected": -2.4383633136749268, + "sft_loss": 1.3755017518997192, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 8.741716309198573, + "learning_rate": 8.305299580221748e-09, + "logits/chosen": -0.8222736120223999, + "logits/rejected": -0.7537822723388672, + "logps/chosen": -1.273291826248169, + "logps/rejected": -2.3553757667541504, + "loss": 0.8519, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.273291826248169, + "rewards/margins": 1.082083821296692, + "rewards/rejected": -2.3553757667541504, + "sft_loss": 1.3592718839645386, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 43.86752680722663, + "learning_rate": 7.821552750697958e-09, + "logits/chosen": -0.7530517578125, + "logits/rejected": -0.6569010019302368, + "logps/chosen": -1.3031551837921143, + "logps/rejected": -2.2156496047973633, + "loss": 0.9016, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3031551837921143, + "rewards/margins": 0.9124943017959595, + "rewards/rejected": -2.2156496047973633, + "sft_loss": 1.390745997428894, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 7.838655429793774, + "learning_rate": 7.3522830430136635e-09, + "logits/chosen": -0.5364211797714233, + "logits/rejected": -0.5294418931007385, + "logps/chosen": -1.3198676109313965, + "logps/rejected": -2.8044791221618652, + "loss": 0.8214, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3198676109313965, + "rewards/margins": 1.4846115112304688, + "rewards/rejected": -2.8044791221618652, + "sft_loss": 1.3252191543579102, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 9.352079097559933, + "learning_rate": 6.897495010025956e-09, + "logits/chosen": -0.6472350358963013, + "logits/rejected": -0.581468939781189, + "logps/chosen": -1.3410637378692627, + "logps/rejected": -2.4063053131103516, + "loss": 0.8721, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3410637378692627, + "rewards/margins": 1.0652415752410889, + "rewards/rejected": -2.4063053131103516, + "sft_loss": 1.3743155002593994, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 10.857747514119612, + "learning_rate": 6.4571930640899835e-09, + "logits/chosen": -0.7794079780578613, + "logits/rejected": -0.6243848204612732, + "logps/chosen": -1.3348716497421265, + "logps/rejected": -2.204594135284424, + "loss": 0.9159, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3348716497421265, + "rewards/margins": 0.8697225451469421, + "rewards/rejected": -2.204594135284424, + "sft_loss": 1.3703720569610596, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 9.561445401671, + "learning_rate": 6.0313814770174836e-09, + "logits/chosen": -0.7363497614860535, + "logits/rejected": -0.675298810005188, + "logps/chosen": -1.3376637697219849, + "logps/rejected": -2.3302359580993652, + "loss": 0.9026, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3376637697219849, + "rewards/margins": 0.9925724267959595, + "rewards/rejected": -2.3302359580993652, + "sft_loss": 1.426440715789795, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 13.182686708577272, + "learning_rate": 5.620064380033985e-09, + "logits/chosen": -0.7910588979721069, + "logits/rejected": -0.6436706185340881, + "logps/chosen": -1.403331995010376, + "logps/rejected": -2.356701374053955, + "loss": 0.8904, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.403331995010376, + "rewards/margins": 0.9533694386482239, + "rewards/rejected": -2.356701374053955, + "sft_loss": 1.4023396968841553, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 30.005707028226283, + "learning_rate": 5.22324576374017e-09, + "logits/chosen": -0.7054346203804016, + "logits/rejected": -0.6411979794502258, + "logps/chosen": -1.2877601385116577, + "logps/rejected": -2.226062059402466, + "loss": 0.8736, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2877601385116577, + "rewards/margins": 0.9383017420768738, + "rewards/rejected": -2.226062059402466, + "sft_loss": 1.3081693649291992, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 9.514563994159747, + "learning_rate": 4.840929478071576e-09, + "logits/chosen": -0.6584609746932983, + "logits/rejected": -0.7336791753768921, + "logps/chosen": -1.247133493423462, + "logps/rejected": -2.2455291748046875, + "loss": 0.8666, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.247133493423462, + "rewards/margins": 0.9983956217765808, + "rewards/rejected": -2.2455291748046875, + "sft_loss": 1.3202401399612427, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 16.65588273040812, + "learning_rate": 4.47311923226279e-09, + "logits/chosen": -0.7217914462089539, + "logits/rejected": -0.6552537679672241, + "logps/chosen": -1.321942687034607, + "logps/rejected": -2.1904098987579346, + "loss": 0.9158, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.321942687034607, + "rewards/margins": 0.868466854095459, + "rewards/rejected": -2.1904098987579346, + "sft_loss": 1.4025959968566895, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 8.627080547408749, + "learning_rate": 4.119818594810476e-09, + "logits/chosen": -0.6494681239128113, + "logits/rejected": -0.5289129018783569, + "logps/chosen": -1.2953909635543823, + "logps/rejected": -2.254089832305908, + "loss": 0.8834, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.2953909635543823, + "rewards/margins": 0.9586986303329468, + "rewards/rejected": -2.254089832305908, + "sft_loss": 1.3672096729278564, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 11.394554940053027, + "learning_rate": 3.781030993438573e-09, + "logits/chosen": -0.726294994354248, + "logits/rejected": -0.7348896265029907, + "logps/chosen": -1.2835429906845093, + "logps/rejected": -2.3471598625183105, + "loss": 0.8917, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2835429906845093, + "rewards/margins": 1.0636169910430908, + "rewards/rejected": -2.3471598625183105, + "sft_loss": 1.3875757455825806, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 8.284869142815294, + "learning_rate": 3.4567597150663155e-09, + "logits/chosen": -0.811735987663269, + "logits/rejected": -0.6471236944198608, + "logps/chosen": -1.2578198909759521, + "logps/rejected": -2.3566677570343018, + "loss": 0.834, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.2578198909759521, + "rewards/margins": 1.0988481044769287, + "rewards/rejected": -2.3566677570343018, + "sft_loss": 1.3185293674468994, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 7.885958963716733, + "learning_rate": 3.147007905774768e-09, + "logits/chosen": -0.667231023311615, + "logits/rejected": -0.601595401763916, + "logps/chosen": -1.3667452335357666, + "logps/rejected": -2.311988592147827, + "loss": 0.9335, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3667452335357666, + "rewards/margins": 0.9452434778213501, + "rewards/rejected": -2.311988592147827, + "sft_loss": 1.391640305519104, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 17.05972389177579, + "learning_rate": 2.851778570777508e-09, + "logits/chosen": -0.6465167999267578, + "logits/rejected": -0.6802867650985718, + "logps/chosen": -1.351560354232788, + "logps/rejected": -2.2371137142181396, + "loss": 0.8927, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.351560354232788, + "rewards/margins": 0.8855533599853516, + "rewards/rejected": -2.2371137142181396, + "sft_loss": 1.376105546951294, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 11.071659410964866, + "learning_rate": 2.5710745743908192e-09, + "logits/chosen": -0.7512981295585632, + "logits/rejected": -0.6664119958877563, + "logps/chosen": -1.331237554550171, + "logps/rejected": -2.683913469314575, + "loss": 0.8378, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.331237554550171, + "rewards/margins": 1.3526757955551147, + "rewards/rejected": -2.683913469314575, + "sft_loss": 1.3605666160583496, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 10.604952132522218, + "learning_rate": 2.304898640006048e-09, + "logits/chosen": -0.7933308482170105, + "logits/rejected": -0.6962018013000488, + "logps/chosen": -1.2568343877792358, + "logps/rejected": -2.4431662559509277, + "loss": 0.8777, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2568343877792358, + "rewards/margins": 1.186332106590271, + "rewards/rejected": -2.4431662559509277, + "sft_loss": 1.3736810684204102, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 10.915979227431771, + "learning_rate": 2.0532533500631225e-09, + "logits/chosen": -0.6986032128334045, + "logits/rejected": -0.6873008012771606, + "logps/chosen": -1.2808072566986084, + "logps/rejected": -2.178983449935913, + "loss": 0.8949, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2808072566986084, + "rewards/margins": 0.8981763124465942, + "rewards/rejected": -2.178983449935913, + "sft_loss": 1.3327544927597046, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 15.710577343617024, + "learning_rate": 1.8161411460262401e-09, + "logits/chosen": -0.7469355463981628, + "logits/rejected": -0.6454081535339355, + "logps/chosen": -1.3567637205123901, + "logps/rejected": -2.6256422996520996, + "loss": 0.8532, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3567637205123901, + "rewards/margins": 1.2688785791397095, + "rewards/rejected": -2.6256422996520996, + "sft_loss": 1.385542631149292, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 7.801556195681117, + "learning_rate": 1.5935643283585545e-09, + "logits/chosen": -0.7634005546569824, + "logits/rejected": -0.5943415760993958, + "logps/chosen": -1.4132963418960571, + "logps/rejected": -2.3316051959991455, + "loss": 0.9049, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.4132963418960571, + "rewards/margins": 0.9183089137077332, + "rewards/rejected": -2.3316051959991455, + "sft_loss": 1.4461170434951782, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 11.690526948729639, + "learning_rate": 1.3855250565015244e-09, + "logits/chosen": -0.732628583908081, + "logits/rejected": -0.73069167137146, + "logps/chosen": -1.2804659605026245, + "logps/rejected": -2.1868271827697754, + "loss": 0.9143, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2804659605026245, + "rewards/margins": 0.906360924243927, + "rewards/rejected": -2.1868271827697754, + "sft_loss": 1.358859658241272, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 6.098410094413616, + "learning_rate": 1.1920253488530986e-09, + "logits/chosen": -0.8392502069473267, + "logits/rejected": -0.7265399098396301, + "logps/chosen": -1.337065577507019, + "logps/rejected": -2.251086950302124, + "loss": 0.8909, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.337065577507019, + "rewards/margins": 0.9140211939811707, + "rewards/rejected": -2.251086950302124, + "sft_loss": 1.3136518001556396, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 17.435180135775486, + "learning_rate": 1.0130670827482314e-09, + "logits/chosen": -0.724540114402771, + "logits/rejected": -0.6843401789665222, + "logps/chosen": -1.2680495977401733, + "logps/rejected": -2.138017416000366, + "loss": 0.8746, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2680495977401733, + "rewards/margins": 0.8699675798416138, + "rewards/rejected": -2.138017416000366, + "sft_loss": 1.3009154796600342, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 7.694728883809673, + "learning_rate": 8.4865199444073e-10, + "logits/chosen": -0.6581524610519409, + "logits/rejected": -0.5862508416175842, + "logps/chosen": -1.3748828172683716, + "logps/rejected": -2.388700485229492, + "loss": 0.8962, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3748828172683716, + "rewards/margins": 1.0138174295425415, + "rewards/rejected": -2.388700485229492, + "sft_loss": 1.4283702373504639, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 13.217299665435, + "learning_rate": 6.987816790866019e-10, + "logits/chosen": -0.7316558361053467, + "logits/rejected": -0.5861397981643677, + "logps/chosen": -1.4063125848770142, + "logps/rejected": -2.501913547515869, + "loss": 0.9166, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.4063125848770142, + "rewards/margins": 1.0956008434295654, + "rewards/rejected": -2.501913547515869, + "sft_loss": 1.4392328262329102, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 11.587722175371816, + "learning_rate": 5.634575907284001e-10, + "logits/chosen": -0.6962507963180542, + "logits/rejected": -0.718319296836853, + "logps/chosen": -1.3200267553329468, + "logps/rejected": -2.1147451400756836, + "loss": 0.9395, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3200267553329468, + "rewards/margins": 0.7947185635566711, + "rewards/rejected": -2.1147451400756836, + "sft_loss": 1.4130859375, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 9.125117910711788, + "learning_rate": 4.426810422809013e-10, + "logits/chosen": -0.7751784324645996, + "logits/rejected": -0.7629925608634949, + "logps/chosen": -1.2659528255462646, + "logps/rejected": -2.2525923252105713, + "loss": 0.878, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2659528255462646, + "rewards/margins": 0.9866394996643066, + "rewards/rejected": -2.2525923252105713, + "sft_loss": 1.3184149265289307, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 9.994838583436563, + "learning_rate": 3.36453205518783e-10, + "logits/chosen": -0.7178460359573364, + "logits/rejected": -0.6534903645515442, + "logps/chosen": -1.2815896272659302, + "logps/rejected": -2.5624938011169434, + "loss": 0.8331, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2815896272659302, + "rewards/margins": 1.2809040546417236, + "rewards/rejected": -2.5624938011169434, + "sft_loss": 1.349057912826538, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 10.932926394809643, + "learning_rate": 2.447751110647989e-10, + "logits/chosen": -0.7367149591445923, + "logits/rejected": -0.6385723352432251, + "logps/chosen": -1.2736537456512451, + "logps/rejected": -2.4375550746917725, + "loss": 0.8645, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.2736537456512451, + "rewards/margins": 1.1639012098312378, + "rewards/rejected": -2.4375550746917725, + "sft_loss": 1.3699533939361572, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 7.8711121475014325, + "learning_rate": 1.6764764838045342e-10, + "logits/chosen": -0.8285869359970093, + "logits/rejected": -0.63426673412323, + "logps/chosen": -1.2631531953811646, + "logps/rejected": -2.3292622566223145, + "loss": 0.8533, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2631531953811646, + "rewards/margins": 1.0661091804504395, + "rewards/rejected": -2.3292622566223145, + "sft_loss": 1.32146418094635, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 6.485888323981134, + "learning_rate": 1.0507156575650934e-10, + "logits/chosen": -0.8164284825325012, + "logits/rejected": -0.6994372010231018, + "logps/chosen": -1.3235969543457031, + "logps/rejected": -2.549525260925293, + "loss": 0.8687, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.3235969543457031, + "rewards/margins": 1.225928544998169, + "rewards/rejected": -2.549525260925293, + "sft_loss": 1.4341703653335571, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 8.46512671115117, + "learning_rate": 5.7047470306659246e-11, + "logits/chosen": -0.7358273267745972, + "logits/rejected": -0.7163251638412476, + "logps/chosen": -1.3835117816925049, + "logps/rejected": -2.445448398590088, + "loss": 0.9048, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3835117816925049, + "rewards/margins": 1.061936616897583, + "rewards/rejected": -2.445448398590088, + "sft_loss": 1.3738961219787598, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 7.695148971648861, + "learning_rate": 2.3575827960697906e-11, + "logits/chosen": -0.7481427192687988, + "logits/rejected": -0.6636000871658325, + "logps/chosen": -1.2466938495635986, + "logps/rejected": -2.349560499191284, + "loss": 0.826, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2466938495635986, + "rewards/margins": 1.1028664112091064, + "rewards/rejected": -2.349560499191284, + "sft_loss": 1.316910743713379, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 8.539557104214142, + "learning_rate": 4.656963460691888e-12, + "logits/chosen": -0.753442645072937, + "logits/rejected": -0.7010436058044434, + "logps/chosen": -1.3138887882232666, + "logps/rejected": -2.5131661891937256, + "loss": 0.8816, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3138887882232666, + "rewards/margins": 1.199277639389038, + "rewards/rejected": -2.5131661891937256, + "sft_loss": 1.4080010652542114, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": -0.47351476550102234, + "eval_logits/rejected": -0.42483875155448914, + "eval_logps/chosen": -1.5353264808654785, + "eval_logps/rejected": -2.2017006874084473, + "eval_loss": 1.0436354875564575, + "eval_rewards/accuracies": 0.6476261019706726, + "eval_rewards/chosen": -1.5353264808654785, + "eval_rewards/margins": 0.6663743257522583, + "eval_rewards/rejected": -2.2017006874084473, + "eval_runtime": 43.0659, + "eval_samples_per_second": 31.231, + "eval_sft_loss": 1.4855337142944336, + "eval_steps_per_second": 7.825, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.9803723535479859, + "train_runtime": 32947.1599, + "train_samples_per_second": 5.444, + "train_steps_per_second": 0.17 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}