{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9994666666666667, "eval_steps": 500, "global_step": 8436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0035555555555555557, "grad_norm": 172.0995169680502, "learning_rate": 1.1848341232227489e-07, "loss": 2.2225, "step": 10 }, { "epoch": 0.0071111111111111115, "grad_norm": 104.28584294834684, "learning_rate": 2.3696682464454978e-07, "loss": 2.0266, "step": 20 }, { "epoch": 0.010666666666666666, "grad_norm": 23.951684052753993, "learning_rate": 3.5545023696682467e-07, "loss": 1.7378, "step": 30 }, { "epoch": 0.014222222222222223, "grad_norm": 28.63768523364588, "learning_rate": 4.7393364928909956e-07, "loss": 1.4898, "step": 40 }, { "epoch": 0.017777777777777778, "grad_norm": 13.504890360680221, "learning_rate": 5.924170616113745e-07, "loss": 1.1851, "step": 50 }, { "epoch": 0.021333333333333333, "grad_norm": 6.1972205960321585, "learning_rate": 7.109004739336493e-07, "loss": 0.988, "step": 60 }, { "epoch": 0.024888888888888887, "grad_norm": 5.065573416801065, "learning_rate": 8.293838862559242e-07, "loss": 0.862, "step": 70 }, { "epoch": 0.028444444444444446, "grad_norm": 4.1523834288926516, "learning_rate": 9.478672985781991e-07, "loss": 0.7813, "step": 80 }, { "epoch": 0.032, "grad_norm": 4.2769222488911405, "learning_rate": 1.0663507109004742e-06, "loss": 0.6638, "step": 90 }, { "epoch": 0.035555555555555556, "grad_norm": 3.7926520111038613, "learning_rate": 1.184834123222749e-06, "loss": 0.6073, "step": 100 }, { "epoch": 0.03911111111111111, "grad_norm": 4.5354775101779605, "learning_rate": 1.303317535545024e-06, "loss": 0.5392, "step": 110 }, { "epoch": 0.042666666666666665, "grad_norm": 9.972389478302686, "learning_rate": 1.4218009478672987e-06, "loss": 0.5798, "step": 120 }, { "epoch": 0.04622222222222222, "grad_norm": 3.3720288764197903, "learning_rate": 1.5402843601895737e-06, "loss": 0.5119, "step": 130 }, { "epoch": 0.049777777777777775, "grad_norm": 2.9830238004238674, "learning_rate": 1.6587677725118483e-06, "loss": 0.4432, "step": 140 }, { "epoch": 0.05333333333333334, "grad_norm": 3.0823358079048395, "learning_rate": 1.7772511848341234e-06, "loss": 0.4637, "step": 150 }, { "epoch": 0.05688888888888889, "grad_norm": 2.7399052383817493, "learning_rate": 1.8957345971563982e-06, "loss": 0.4623, "step": 160 }, { "epoch": 0.060444444444444446, "grad_norm": 2.5949470941499886, "learning_rate": 2.0142180094786733e-06, "loss": 0.4909, "step": 170 }, { "epoch": 0.064, "grad_norm": 2.860434581778304, "learning_rate": 2.1327014218009483e-06, "loss": 0.4522, "step": 180 }, { "epoch": 0.06755555555555555, "grad_norm": 2.4794062920348514, "learning_rate": 2.251184834123223e-06, "loss": 0.4683, "step": 190 }, { "epoch": 0.07111111111111111, "grad_norm": 3.5898381841290385, "learning_rate": 2.369668246445498e-06, "loss": 0.4544, "step": 200 }, { "epoch": 0.07466666666666667, "grad_norm": 2.8271249937433893, "learning_rate": 2.4881516587677726e-06, "loss": 0.452, "step": 210 }, { "epoch": 0.07822222222222222, "grad_norm": 2.820485688519842, "learning_rate": 2.606635071090048e-06, "loss": 0.4594, "step": 220 }, { "epoch": 0.08177777777777778, "grad_norm": 2.879680482909577, "learning_rate": 2.7251184834123223e-06, "loss": 0.4079, "step": 230 }, { "epoch": 0.08533333333333333, "grad_norm": 2.2760447333960547, "learning_rate": 2.8436018957345973e-06, "loss": 0.4586, "step": 240 }, { "epoch": 0.08888888888888889, "grad_norm": 2.831009166917502, "learning_rate": 2.9620853080568724e-06, "loss": 0.4143, "step": 250 }, { "epoch": 0.09244444444444444, "grad_norm": 3.2359232461275895, "learning_rate": 3.0805687203791474e-06, "loss": 0.454, "step": 260 }, { "epoch": 0.096, "grad_norm": 2.7067735723833932, "learning_rate": 3.1990521327014216e-06, "loss": 0.3998, "step": 270 }, { "epoch": 0.09955555555555555, "grad_norm": 2.550645136169034, "learning_rate": 3.3175355450236967e-06, "loss": 0.397, "step": 280 }, { "epoch": 0.10311111111111111, "grad_norm": 2.6477271129566162, "learning_rate": 3.4360189573459717e-06, "loss": 0.416, "step": 290 }, { "epoch": 0.10666666666666667, "grad_norm": 2.744124645461815, "learning_rate": 3.5545023696682468e-06, "loss": 0.4521, "step": 300 }, { "epoch": 0.11022222222222222, "grad_norm": 2.499585309198425, "learning_rate": 3.672985781990522e-06, "loss": 0.4023, "step": 310 }, { "epoch": 0.11377777777777778, "grad_norm": 2.6278096303414467, "learning_rate": 3.7914691943127964e-06, "loss": 0.4191, "step": 320 }, { "epoch": 0.11733333333333333, "grad_norm": 2.4188835712940326, "learning_rate": 3.9099526066350715e-06, "loss": 0.4122, "step": 330 }, { "epoch": 0.12088888888888889, "grad_norm": 2.553975268194503, "learning_rate": 4.0284360189573465e-06, "loss": 0.3498, "step": 340 }, { "epoch": 0.12444444444444444, "grad_norm": 2.834535859400579, "learning_rate": 4.146919431279622e-06, "loss": 0.4094, "step": 350 }, { "epoch": 0.128, "grad_norm": 2.533973817990368, "learning_rate": 4.265402843601897e-06, "loss": 0.4298, "step": 360 }, { "epoch": 0.13155555555555556, "grad_norm": 2.813906241826433, "learning_rate": 4.383886255924171e-06, "loss": 0.4216, "step": 370 }, { "epoch": 0.1351111111111111, "grad_norm": 2.102931563969342, "learning_rate": 4.502369668246446e-06, "loss": 0.3808, "step": 380 }, { "epoch": 0.13866666666666666, "grad_norm": 2.4379289560773896, "learning_rate": 4.620853080568721e-06, "loss": 0.3618, "step": 390 }, { "epoch": 0.14222222222222222, "grad_norm": 2.3557567609798777, "learning_rate": 4.739336492890996e-06, "loss": 0.4044, "step": 400 }, { "epoch": 0.14577777777777778, "grad_norm": 2.2820973068522514, "learning_rate": 4.857819905213271e-06, "loss": 0.4071, "step": 410 }, { "epoch": 0.14933333333333335, "grad_norm": 2.6709678530509993, "learning_rate": 4.976303317535545e-06, "loss": 0.4272, "step": 420 }, { "epoch": 0.15288888888888888, "grad_norm": 2.332134363712532, "learning_rate": 5.09478672985782e-06, "loss": 0.434, "step": 430 }, { "epoch": 0.15644444444444444, "grad_norm": 2.9162979668749047, "learning_rate": 5.213270142180096e-06, "loss": 0.3695, "step": 440 }, { "epoch": 0.16, "grad_norm": 2.2427213677361655, "learning_rate": 5.33175355450237e-06, "loss": 0.3723, "step": 450 }, { "epoch": 0.16355555555555557, "grad_norm": 2.5901865124993, "learning_rate": 5.4502369668246446e-06, "loss": 0.4147, "step": 460 }, { "epoch": 0.1671111111111111, "grad_norm": 2.56419802107506, "learning_rate": 5.5687203791469205e-06, "loss": 0.4083, "step": 470 }, { "epoch": 0.17066666666666666, "grad_norm": 2.145912482611642, "learning_rate": 5.687203791469195e-06, "loss": 0.3631, "step": 480 }, { "epoch": 0.17422222222222222, "grad_norm": 2.1572804538302983, "learning_rate": 5.8056872037914706e-06, "loss": 0.3838, "step": 490 }, { "epoch": 0.17777777777777778, "grad_norm": 2.7221208940412955, "learning_rate": 5.924170616113745e-06, "loss": 0.3703, "step": 500 }, { "epoch": 0.17777777777777778, "eval_loss": 0.27164188027381897, "eval_runtime": 561.686, "eval_samples_per_second": 17.804, "eval_steps_per_second": 4.451, "step": 500 }, { "epoch": 0.18133333333333335, "grad_norm": 2.128760220417613, "learning_rate": 6.042654028436019e-06, "loss": 0.3936, "step": 510 }, { "epoch": 0.18488888888888888, "grad_norm": 2.37349559892131, "learning_rate": 6.161137440758295e-06, "loss": 0.4097, "step": 520 }, { "epoch": 0.18844444444444444, "grad_norm": 2.1546814583393954, "learning_rate": 6.279620853080569e-06, "loss": 0.3487, "step": 530 }, { "epoch": 0.192, "grad_norm": 2.5691866709112174, "learning_rate": 6.398104265402843e-06, "loss": 0.3795, "step": 540 }, { "epoch": 0.19555555555555557, "grad_norm": 2.511088780500042, "learning_rate": 6.516587677725119e-06, "loss": 0.3592, "step": 550 }, { "epoch": 0.1991111111111111, "grad_norm": 2.1980105108863306, "learning_rate": 6.635071090047393e-06, "loss": 0.3759, "step": 560 }, { "epoch": 0.20266666666666666, "grad_norm": 2.0372925079256508, "learning_rate": 6.753554502369669e-06, "loss": 0.3372, "step": 570 }, { "epoch": 0.20622222222222222, "grad_norm": 2.4474157501007188, "learning_rate": 6.8720379146919435e-06, "loss": 0.3821, "step": 580 }, { "epoch": 0.20977777777777779, "grad_norm": 2.6150488990545813, "learning_rate": 6.990521327014218e-06, "loss": 0.4033, "step": 590 }, { "epoch": 0.21333333333333335, "grad_norm": 2.218675478875157, "learning_rate": 7.1090047393364935e-06, "loss": 0.3498, "step": 600 }, { "epoch": 0.21688888888888888, "grad_norm": 2.60194848198847, "learning_rate": 7.227488151658768e-06, "loss": 0.3974, "step": 610 }, { "epoch": 0.22044444444444444, "grad_norm": 2.4008012422084883, "learning_rate": 7.345971563981044e-06, "loss": 0.3522, "step": 620 }, { "epoch": 0.224, "grad_norm": 2.370019125222766, "learning_rate": 7.464454976303318e-06, "loss": 0.3843, "step": 630 }, { "epoch": 0.22755555555555557, "grad_norm": 2.319127909040294, "learning_rate": 7.582938388625593e-06, "loss": 0.3852, "step": 640 }, { "epoch": 0.2311111111111111, "grad_norm": 2.0344327356388963, "learning_rate": 7.701421800947868e-06, "loss": 0.3753, "step": 650 }, { "epoch": 0.23466666666666666, "grad_norm": 2.0974945886124274, "learning_rate": 7.819905213270143e-06, "loss": 0.3622, "step": 660 }, { "epoch": 0.23822222222222222, "grad_norm": 2.3710225236326656, "learning_rate": 7.938388625592418e-06, "loss": 0.3776, "step": 670 }, { "epoch": 0.24177777777777779, "grad_norm": 2.1972590118602353, "learning_rate": 8.056872037914693e-06, "loss": 0.4131, "step": 680 }, { "epoch": 0.24533333333333332, "grad_norm": 2.124563531807995, "learning_rate": 8.175355450236966e-06, "loss": 0.4041, "step": 690 }, { "epoch": 0.24888888888888888, "grad_norm": 2.186519973081525, "learning_rate": 8.293838862559243e-06, "loss": 0.4342, "step": 700 }, { "epoch": 0.25244444444444447, "grad_norm": 2.2098045409685785, "learning_rate": 8.412322274881517e-06, "loss": 0.3753, "step": 710 }, { "epoch": 0.256, "grad_norm": 2.364680759422569, "learning_rate": 8.530805687203793e-06, "loss": 0.3499, "step": 720 }, { "epoch": 0.25955555555555554, "grad_norm": 2.0592638534598975, "learning_rate": 8.649289099526067e-06, "loss": 0.3676, "step": 730 }, { "epoch": 0.26311111111111113, "grad_norm": 2.076874300192435, "learning_rate": 8.767772511848342e-06, "loss": 0.3882, "step": 740 }, { "epoch": 0.26666666666666666, "grad_norm": 2.256989717343507, "learning_rate": 8.886255924170617e-06, "loss": 0.3906, "step": 750 }, { "epoch": 0.2702222222222222, "grad_norm": 2.2777259263170753, "learning_rate": 9.004739336492892e-06, "loss": 0.3881, "step": 760 }, { "epoch": 0.2737777777777778, "grad_norm": 2.0191108991103452, "learning_rate": 9.123222748815167e-06, "loss": 0.3598, "step": 770 }, { "epoch": 0.2773333333333333, "grad_norm": 2.1955719220241114, "learning_rate": 9.241706161137442e-06, "loss": 0.3411, "step": 780 }, { "epoch": 0.2808888888888889, "grad_norm": 1.8450512554264078, "learning_rate": 9.360189573459715e-06, "loss": 0.3989, "step": 790 }, { "epoch": 0.28444444444444444, "grad_norm": 2.011115441632504, "learning_rate": 9.478672985781992e-06, "loss": 0.3982, "step": 800 }, { "epoch": 0.288, "grad_norm": 1.8704472001913133, "learning_rate": 9.597156398104265e-06, "loss": 0.414, "step": 810 }, { "epoch": 0.29155555555555557, "grad_norm": 1.9254101904021153, "learning_rate": 9.715639810426542e-06, "loss": 0.3767, "step": 820 }, { "epoch": 0.2951111111111111, "grad_norm": 1.9015728855115495, "learning_rate": 9.834123222748815e-06, "loss": 0.3775, "step": 830 }, { "epoch": 0.2986666666666667, "grad_norm": 1.928562219171237, "learning_rate": 9.95260663507109e-06, "loss": 0.3955, "step": 840 }, { "epoch": 0.3022222222222222, "grad_norm": 1.5585912642130104, "learning_rate": 9.999984589042141e-06, "loss": 0.3897, "step": 850 }, { "epoch": 0.30577777777777776, "grad_norm": 2.088285655295682, "learning_rate": 9.999890411310363e-06, "loss": 0.3657, "step": 860 }, { "epoch": 0.30933333333333335, "grad_norm": 1.7831321620409892, "learning_rate": 9.999710619100732e-06, "loss": 0.3699, "step": 870 }, { "epoch": 0.3128888888888889, "grad_norm": 1.8859386777237288, "learning_rate": 9.999445215491888e-06, "loss": 0.3675, "step": 880 }, { "epoch": 0.3164444444444444, "grad_norm": 1.793847189739239, "learning_rate": 9.999094205028403e-06, "loss": 0.3804, "step": 890 }, { "epoch": 0.32, "grad_norm": 1.8588345423039347, "learning_rate": 9.998657593720726e-06, "loss": 0.3628, "step": 900 }, { "epoch": 0.32355555555555554, "grad_norm": 1.904522383364726, "learning_rate": 9.998135389045071e-06, "loss": 0.3832, "step": 910 }, { "epoch": 0.32711111111111113, "grad_norm": 1.7658830671737389, "learning_rate": 9.997527599943288e-06, "loss": 0.3931, "step": 920 }, { "epoch": 0.33066666666666666, "grad_norm": 1.8645179401650172, "learning_rate": 9.996834236822718e-06, "loss": 0.3587, "step": 930 }, { "epoch": 0.3342222222222222, "grad_norm": 1.8432627384605438, "learning_rate": 9.996055311556002e-06, "loss": 0.4065, "step": 940 }, { "epoch": 0.3377777777777778, "grad_norm": 1.8436289309250031, "learning_rate": 9.99519083748089e-06, "loss": 0.3861, "step": 950 }, { "epoch": 0.3413333333333333, "grad_norm": 1.9136369003670703, "learning_rate": 9.994240829400006e-06, "loss": 0.3794, "step": 960 }, { "epoch": 0.3448888888888889, "grad_norm": 1.559884705708754, "learning_rate": 9.993205303580596e-06, "loss": 0.3675, "step": 970 }, { "epoch": 0.34844444444444445, "grad_norm": 2.0844042937670317, "learning_rate": 9.992084277754246e-06, "loss": 0.3725, "step": 980 }, { "epoch": 0.352, "grad_norm": 1.4001469351101974, "learning_rate": 9.990877771116588e-06, "loss": 0.3526, "step": 990 }, { "epoch": 0.35555555555555557, "grad_norm": 1.7302242639985734, "learning_rate": 9.989585804326963e-06, "loss": 0.3451, "step": 1000 }, { "epoch": 0.35555555555555557, "eval_loss": 0.2586575448513031, "eval_runtime": 561.7755, "eval_samples_per_second": 17.801, "eval_steps_per_second": 4.45, "step": 1000 }, { "epoch": 0.3591111111111111, "grad_norm": 2.1060593962865832, "learning_rate": 9.988208399508064e-06, "loss": 0.3923, "step": 1010 }, { "epoch": 0.3626666666666667, "grad_norm": 1.6475744003826194, "learning_rate": 9.986745580245569e-06, "loss": 0.3077, "step": 1020 }, { "epoch": 0.3662222222222222, "grad_norm": 1.9521091866638012, "learning_rate": 9.985197371587732e-06, "loss": 0.389, "step": 1030 }, { "epoch": 0.36977777777777776, "grad_norm": 1.7609515675334448, "learning_rate": 9.983563800044942e-06, "loss": 0.3424, "step": 1040 }, { "epoch": 0.37333333333333335, "grad_norm": 1.7210920690658038, "learning_rate": 9.981844893589294e-06, "loss": 0.3558, "step": 1050 }, { "epoch": 0.3768888888888889, "grad_norm": 1.823734659697161, "learning_rate": 9.980040681654085e-06, "loss": 0.3693, "step": 1060 }, { "epoch": 0.3804444444444444, "grad_norm": 2.102269417162816, "learning_rate": 9.978151195133326e-06, "loss": 0.3638, "step": 1070 }, { "epoch": 0.384, "grad_norm": 1.8033749091845297, "learning_rate": 9.976176466381205e-06, "loss": 0.3484, "step": 1080 }, { "epoch": 0.38755555555555554, "grad_norm": 1.8854677696591007, "learning_rate": 9.974116529211539e-06, "loss": 0.3967, "step": 1090 }, { "epoch": 0.39111111111111113, "grad_norm": 2.0272157520267218, "learning_rate": 9.971971418897189e-06, "loss": 0.3741, "step": 1100 }, { "epoch": 0.39466666666666667, "grad_norm": 2.0179018140684555, "learning_rate": 9.969741172169461e-06, "loss": 0.3904, "step": 1110 }, { "epoch": 0.3982222222222222, "grad_norm": 1.6226992565101939, "learning_rate": 9.967425827217473e-06, "loss": 0.3485, "step": 1120 }, { "epoch": 0.4017777777777778, "grad_norm": 1.9028497690136488, "learning_rate": 9.965025423687505e-06, "loss": 0.346, "step": 1130 }, { "epoch": 0.4053333333333333, "grad_norm": 1.694320712579824, "learning_rate": 9.962540002682314e-06, "loss": 0.3635, "step": 1140 }, { "epoch": 0.4088888888888889, "grad_norm": 1.6440393469215313, "learning_rate": 9.95996960676044e-06, "loss": 0.3794, "step": 1150 }, { "epoch": 0.41244444444444445, "grad_norm": 1.9859711063744807, "learning_rate": 9.957314279935467e-06, "loss": 0.3727, "step": 1160 }, { "epoch": 0.416, "grad_norm": 1.5764827911729749, "learning_rate": 9.954574067675276e-06, "loss": 0.3472, "step": 1170 }, { "epoch": 0.41955555555555557, "grad_norm": 2.0270228575955938, "learning_rate": 9.951749016901266e-06, "loss": 0.3651, "step": 1180 }, { "epoch": 0.4231111111111111, "grad_norm": 1.4711992564971241, "learning_rate": 9.948839175987543e-06, "loss": 0.4007, "step": 1190 }, { "epoch": 0.4266666666666667, "grad_norm": 1.6555299578050973, "learning_rate": 9.945844594760104e-06, "loss": 0.3662, "step": 1200 }, { "epoch": 0.43022222222222223, "grad_norm": 1.6087449112246428, "learning_rate": 9.94276532449597e-06, "loss": 0.3266, "step": 1210 }, { "epoch": 0.43377777777777776, "grad_norm": 1.7938918985177508, "learning_rate": 9.939601417922326e-06, "loss": 0.367, "step": 1220 }, { "epoch": 0.43733333333333335, "grad_norm": 1.9419042479062267, "learning_rate": 9.936352929215598e-06, "loss": 0.3479, "step": 1230 }, { "epoch": 0.4408888888888889, "grad_norm": 1.7389871732986788, "learning_rate": 9.933019914000537e-06, "loss": 0.3991, "step": 1240 }, { "epoch": 0.4444444444444444, "grad_norm": 1.954697966163684, "learning_rate": 9.929602429349267e-06, "loss": 0.387, "step": 1250 }, { "epoch": 0.448, "grad_norm": 1.9390505686602657, "learning_rate": 9.926100533780304e-06, "loss": 0.3623, "step": 1260 }, { "epoch": 0.45155555555555554, "grad_norm": 1.6639481540933314, "learning_rate": 9.922514287257553e-06, "loss": 0.3758, "step": 1270 }, { "epoch": 0.45511111111111113, "grad_norm": 1.722757928957694, "learning_rate": 9.918843751189285e-06, "loss": 0.3355, "step": 1280 }, { "epoch": 0.45866666666666667, "grad_norm": 1.845850757530145, "learning_rate": 9.915088988427085e-06, "loss": 0.3698, "step": 1290 }, { "epoch": 0.4622222222222222, "grad_norm": 1.44128404254532, "learning_rate": 9.911250063264768e-06, "loss": 0.4047, "step": 1300 }, { "epoch": 0.4657777777777778, "grad_norm": 1.7671518160334596, "learning_rate": 9.907327041437295e-06, "loss": 0.3692, "step": 1310 }, { "epoch": 0.4693333333333333, "grad_norm": 1.8380352484481248, "learning_rate": 9.903319990119629e-06, "loss": 0.36, "step": 1320 }, { "epoch": 0.4728888888888889, "grad_norm": 1.76427459962676, "learning_rate": 9.899228977925594e-06, "loss": 0.3741, "step": 1330 }, { "epoch": 0.47644444444444445, "grad_norm": 1.4897822709650264, "learning_rate": 9.895054074906703e-06, "loss": 0.3407, "step": 1340 }, { "epoch": 0.48, "grad_norm": 1.8107592753421746, "learning_rate": 9.890795352550949e-06, "loss": 0.3737, "step": 1350 }, { "epoch": 0.48355555555555557, "grad_norm": 1.7814141617442254, "learning_rate": 9.886452883781588e-06, "loss": 0.3706, "step": 1360 }, { "epoch": 0.4871111111111111, "grad_norm": 1.6423771491979522, "learning_rate": 9.882026742955892e-06, "loss": 0.3593, "step": 1370 }, { "epoch": 0.49066666666666664, "grad_norm": 1.9926182163486512, "learning_rate": 9.877517005863865e-06, "loss": 0.388, "step": 1380 }, { "epoch": 0.49422222222222223, "grad_norm": 1.6527200649892368, "learning_rate": 9.872923749726959e-06, "loss": 0.3825, "step": 1390 }, { "epoch": 0.49777777777777776, "grad_norm": 1.800321612826116, "learning_rate": 9.868247053196744e-06, "loss": 0.3406, "step": 1400 }, { "epoch": 0.5013333333333333, "grad_norm": 1.8998896812539383, "learning_rate": 9.86348699635356e-06, "loss": 0.3718, "step": 1410 }, { "epoch": 0.5048888888888889, "grad_norm": 1.8642598101048677, "learning_rate": 9.85864366070515e-06, "loss": 0.3728, "step": 1420 }, { "epoch": 0.5084444444444445, "grad_norm": 2.04147924521036, "learning_rate": 9.853717129185262e-06, "loss": 0.3371, "step": 1430 }, { "epoch": 0.512, "grad_norm": 1.765175754873959, "learning_rate": 9.848707486152231e-06, "loss": 0.3468, "step": 1440 }, { "epoch": 0.5155555555555555, "grad_norm": 1.7955950262413882, "learning_rate": 9.843614817387531e-06, "loss": 0.3456, "step": 1450 }, { "epoch": 0.5191111111111111, "grad_norm": 1.4037783734962412, "learning_rate": 9.838439210094309e-06, "loss": 0.3244, "step": 1460 }, { "epoch": 0.5226666666666666, "grad_norm": 1.8006249556531597, "learning_rate": 9.833180752895887e-06, "loss": 0.3391, "step": 1470 }, { "epoch": 0.5262222222222223, "grad_norm": 1.7020622735675546, "learning_rate": 9.827839535834258e-06, "loss": 0.3922, "step": 1480 }, { "epoch": 0.5297777777777778, "grad_norm": 1.6034083398484584, "learning_rate": 9.822415650368525e-06, "loss": 0.304, "step": 1490 }, { "epoch": 0.5333333333333333, "grad_norm": 1.7309514997235147, "learning_rate": 9.816909189373347e-06, "loss": 0.3531, "step": 1500 }, { "epoch": 0.5333333333333333, "eval_loss": 0.24488620460033417, "eval_runtime": 562.1833, "eval_samples_per_second": 17.788, "eval_steps_per_second": 4.447, "step": 1500 }, { "epoch": 0.5368888888888889, "grad_norm": 1.4581125274966544, "learning_rate": 9.81132024713735e-06, "loss": 0.3771, "step": 1510 }, { "epoch": 0.5404444444444444, "grad_norm": 1.6490332212552936, "learning_rate": 9.805648919361505e-06, "loss": 0.3848, "step": 1520 }, { "epoch": 0.544, "grad_norm": 1.7512970600212527, "learning_rate": 9.799895303157492e-06, "loss": 0.3694, "step": 1530 }, { "epoch": 0.5475555555555556, "grad_norm": 1.7421405313188358, "learning_rate": 9.794059497046043e-06, "loss": 0.3553, "step": 1540 }, { "epoch": 0.5511111111111111, "grad_norm": 1.7340918047507783, "learning_rate": 9.788141600955244e-06, "loss": 0.3357, "step": 1550 }, { "epoch": 0.5546666666666666, "grad_norm": 1.657973523226739, "learning_rate": 9.782141716218832e-06, "loss": 0.3448, "step": 1560 }, { "epoch": 0.5582222222222222, "grad_norm": 1.7266109549753084, "learning_rate": 9.77605994557446e-06, "loss": 0.3336, "step": 1570 }, { "epoch": 0.5617777777777778, "grad_norm": 1.7634795513841868, "learning_rate": 9.769896393161937e-06, "loss": 0.336, "step": 1580 }, { "epoch": 0.5653333333333334, "grad_norm": 1.7328448062964845, "learning_rate": 9.763651164521436e-06, "loss": 0.3505, "step": 1590 }, { "epoch": 0.5688888888888889, "grad_norm": 1.7601349288429824, "learning_rate": 9.7573243665917e-06, "loss": 0.3816, "step": 1600 }, { "epoch": 0.5724444444444444, "grad_norm": 1.887857912509665, "learning_rate": 9.750916107708205e-06, "loss": 0.358, "step": 1610 }, { "epoch": 0.576, "grad_norm": 1.8940080571652895, "learning_rate": 9.744426497601305e-06, "loss": 0.363, "step": 1620 }, { "epoch": 0.5795555555555556, "grad_norm": 1.5744873206102685, "learning_rate": 9.737855647394346e-06, "loss": 0.3544, "step": 1630 }, { "epoch": 0.5831111111111111, "grad_norm": 1.5744080074196256, "learning_rate": 9.73120366960178e-06, "loss": 0.375, "step": 1640 }, { "epoch": 0.5866666666666667, "grad_norm": 1.6398095171132219, "learning_rate": 9.724470678127226e-06, "loss": 0.3649, "step": 1650 }, { "epoch": 0.5902222222222222, "grad_norm": 1.4310246627875627, "learning_rate": 9.717656788261519e-06, "loss": 0.3716, "step": 1660 }, { "epoch": 0.5937777777777777, "grad_norm": 1.490999227794774, "learning_rate": 9.71076211668074e-06, "loss": 0.352, "step": 1670 }, { "epoch": 0.5973333333333334, "grad_norm": 1.6484132205325386, "learning_rate": 9.703786781444218e-06, "loss": 0.3555, "step": 1680 }, { "epoch": 0.6008888888888889, "grad_norm": 1.3854857319423775, "learning_rate": 9.69673090199251e-06, "loss": 0.3348, "step": 1690 }, { "epoch": 0.6044444444444445, "grad_norm": 1.6107410705301848, "learning_rate": 9.689594599145348e-06, "loss": 0.3499, "step": 1700 }, { "epoch": 0.608, "grad_norm": 1.520886748403311, "learning_rate": 9.682377995099581e-06, "loss": 0.3389, "step": 1710 }, { "epoch": 0.6115555555555555, "grad_norm": 1.4556730210725268, "learning_rate": 9.675081213427076e-06, "loss": 0.3412, "step": 1720 }, { "epoch": 0.6151111111111112, "grad_norm": 1.476388303700134, "learning_rate": 9.667704379072597e-06, "loss": 0.3363, "step": 1730 }, { "epoch": 0.6186666666666667, "grad_norm": 1.2168509424846436, "learning_rate": 9.660247618351683e-06, "loss": 0.3328, "step": 1740 }, { "epoch": 0.6222222222222222, "grad_norm": 1.395468629739029, "learning_rate": 9.652711058948463e-06, "loss": 0.3509, "step": 1750 }, { "epoch": 0.6257777777777778, "grad_norm": 1.586845461880222, "learning_rate": 9.645094829913487e-06, "loss": 0.3471, "step": 1760 }, { "epoch": 0.6293333333333333, "grad_norm": 1.5411518795473231, "learning_rate": 9.637399061661507e-06, "loss": 0.3246, "step": 1770 }, { "epoch": 0.6328888888888888, "grad_norm": 1.658660033117339, "learning_rate": 9.62962388596925e-06, "loss": 0.3399, "step": 1780 }, { "epoch": 0.6364444444444445, "grad_norm": 1.313159566501215, "learning_rate": 9.621769435973152e-06, "loss": 0.3478, "step": 1790 }, { "epoch": 0.64, "grad_norm": 1.8380402091451324, "learning_rate": 9.61383584616709e-06, "loss": 0.3251, "step": 1800 }, { "epoch": 0.6435555555555555, "grad_norm": 1.6180991422896933, "learning_rate": 9.60582325240007e-06, "loss": 0.3553, "step": 1810 }, { "epoch": 0.6471111111111111, "grad_norm": 1.8283857342608776, "learning_rate": 9.597731791873907e-06, "loss": 0.3594, "step": 1820 }, { "epoch": 0.6506666666666666, "grad_norm": 1.4175489521300049, "learning_rate": 9.58956160314087e-06, "loss": 0.3549, "step": 1830 }, { "epoch": 0.6542222222222223, "grad_norm": 1.6783488504498176, "learning_rate": 9.581312826101315e-06, "loss": 0.3813, "step": 1840 }, { "epoch": 0.6577777777777778, "grad_norm": 1.6351873747299641, "learning_rate": 9.572985602001283e-06, "loss": 0.3518, "step": 1850 }, { "epoch": 0.6613333333333333, "grad_norm": 1.3790848679324303, "learning_rate": 9.56458007343009e-06, "loss": 0.3303, "step": 1860 }, { "epoch": 0.6648888888888889, "grad_norm": 1.6322052333334587, "learning_rate": 9.556096384317878e-06, "loss": 0.3403, "step": 1870 }, { "epoch": 0.6684444444444444, "grad_norm": 1.788030342136729, "learning_rate": 9.547534679933155e-06, "loss": 0.3717, "step": 1880 }, { "epoch": 0.672, "grad_norm": 1.4934586402235337, "learning_rate": 9.538895106880302e-06, "loss": 0.3468, "step": 1890 }, { "epoch": 0.6755555555555556, "grad_norm": 1.9556398213487334, "learning_rate": 9.53017781309707e-06, "loss": 0.3495, "step": 1900 }, { "epoch": 0.6791111111111111, "grad_norm": 1.4201698189636593, "learning_rate": 9.521382947852042e-06, "loss": 0.3631, "step": 1910 }, { "epoch": 0.6826666666666666, "grad_norm": 1.8176078337580701, "learning_rate": 9.512510661742078e-06, "loss": 0.366, "step": 1920 }, { "epoch": 0.6862222222222222, "grad_norm": 1.5895629439283847, "learning_rate": 9.503561106689736e-06, "loss": 0.3165, "step": 1930 }, { "epoch": 0.6897777777777778, "grad_norm": 1.7257922798447645, "learning_rate": 9.494534435940668e-06, "loss": 0.3199, "step": 1940 }, { "epoch": 0.6933333333333334, "grad_norm": 1.3859470273389864, "learning_rate": 9.485430804061009e-06, "loss": 0.3244, "step": 1950 }, { "epoch": 0.6968888888888889, "grad_norm": 1.3389192102707597, "learning_rate": 9.476250366934708e-06, "loss": 0.3557, "step": 1960 }, { "epoch": 0.7004444444444444, "grad_norm": 1.761133913330945, "learning_rate": 9.466993281760879e-06, "loss": 0.3367, "step": 1970 }, { "epoch": 0.704, "grad_norm": 1.5576575807000288, "learning_rate": 9.457659707051099e-06, "loss": 0.335, "step": 1980 }, { "epoch": 0.7075555555555556, "grad_norm": 1.5125566207561287, "learning_rate": 9.448249802626696e-06, "loss": 0.3286, "step": 1990 }, { "epoch": 0.7111111111111111, "grad_norm": 1.7236714219097393, "learning_rate": 9.43876372961601e-06, "loss": 0.3544, "step": 2000 }, { "epoch": 0.7111111111111111, "eval_loss": 0.23682241141796112, "eval_runtime": 560.8939, "eval_samples_per_second": 17.829, "eval_steps_per_second": 4.457, "step": 2000 }, { "epoch": 0.7146666666666667, "grad_norm": 1.7803508157706263, "learning_rate": 9.429201650451642e-06, "loss": 0.3218, "step": 2010 }, { "epoch": 0.7182222222222222, "grad_norm": 1.6971031315045289, "learning_rate": 9.419563728867663e-06, "loss": 0.3417, "step": 2020 }, { "epoch": 0.7217777777777777, "grad_norm": 1.9366329088516083, "learning_rate": 9.409850129896812e-06, "loss": 0.3104, "step": 2030 }, { "epoch": 0.7253333333333334, "grad_norm": 1.85452483851228, "learning_rate": 9.40006101986768e-06, "loss": 0.3371, "step": 2040 }, { "epoch": 0.7288888888888889, "grad_norm": 1.4768370143060883, "learning_rate": 9.390196566401844e-06, "loss": 0.3324, "step": 2050 }, { "epoch": 0.7324444444444445, "grad_norm": 1.3195137184227357, "learning_rate": 9.38025693841102e-06, "loss": 0.3384, "step": 2060 }, { "epoch": 0.736, "grad_norm": 1.7121308917693614, "learning_rate": 9.370242306094141e-06, "loss": 0.3339, "step": 2070 }, { "epoch": 0.7395555555555555, "grad_norm": 1.3801023810052373, "learning_rate": 9.360152840934477e-06, "loss": 0.3449, "step": 2080 }, { "epoch": 0.7431111111111111, "grad_norm": 1.4391167681264767, "learning_rate": 9.349988715696671e-06, "loss": 0.3444, "step": 2090 }, { "epoch": 0.7466666666666667, "grad_norm": 1.840759552395967, "learning_rate": 9.33975010442379e-06, "loss": 0.3496, "step": 2100 }, { "epoch": 0.7502222222222222, "grad_norm": 1.348141880287597, "learning_rate": 9.329437182434351e-06, "loss": 0.3202, "step": 2110 }, { "epoch": 0.7537777777777778, "grad_norm": 1.528620379748828, "learning_rate": 9.31905012631931e-06, "loss": 0.3545, "step": 2120 }, { "epoch": 0.7573333333333333, "grad_norm": 1.502678851982848, "learning_rate": 9.30858911393904e-06, "loss": 0.3457, "step": 2130 }, { "epoch": 0.7608888888888888, "grad_norm": 1.591416150002211, "learning_rate": 9.298054324420294e-06, "loss": 0.3125, "step": 2140 }, { "epoch": 0.7644444444444445, "grad_norm": 1.5254470204546493, "learning_rate": 9.287445938153121e-06, "loss": 0.3596, "step": 2150 }, { "epoch": 0.768, "grad_norm": 1.230432920766134, "learning_rate": 9.276764136787798e-06, "loss": 0.3352, "step": 2160 }, { "epoch": 0.7715555555555556, "grad_norm": 1.8112353212418606, "learning_rate": 9.266009103231702e-06, "loss": 0.3504, "step": 2170 }, { "epoch": 0.7751111111111111, "grad_norm": 1.6435932354458154, "learning_rate": 9.255181021646182e-06, "loss": 0.3289, "step": 2180 }, { "epoch": 0.7786666666666666, "grad_norm": 1.3388409038180085, "learning_rate": 9.244280077443417e-06, "loss": 0.3542, "step": 2190 }, { "epoch": 0.7822222222222223, "grad_norm": 1.5875341933538416, "learning_rate": 9.233306457283223e-06, "loss": 0.3516, "step": 2200 }, { "epoch": 0.7857777777777778, "grad_norm": 1.5094881761609635, "learning_rate": 9.222260349069874e-06, "loss": 0.3489, "step": 2210 }, { "epoch": 0.7893333333333333, "grad_norm": 1.477094884348464, "learning_rate": 9.211141941948872e-06, "loss": 0.3581, "step": 2220 }, { "epoch": 0.7928888888888889, "grad_norm": 1.4717030162478277, "learning_rate": 9.199951426303711e-06, "loss": 0.3415, "step": 2230 }, { "epoch": 0.7964444444444444, "grad_norm": 1.5752422305129774, "learning_rate": 9.188688993752626e-06, "loss": 0.3355, "step": 2240 }, { "epoch": 0.8, "grad_norm": 1.5354049474859641, "learning_rate": 9.177354837145298e-06, "loss": 0.3394, "step": 2250 }, { "epoch": 0.8035555555555556, "grad_norm": 1.8308300488763203, "learning_rate": 9.165949150559561e-06, "loss": 0.3545, "step": 2260 }, { "epoch": 0.8071111111111111, "grad_norm": 1.7274391712847685, "learning_rate": 9.154472129298075e-06, "loss": 0.363, "step": 2270 }, { "epoch": 0.8106666666666666, "grad_norm": 1.663966013940676, "learning_rate": 9.142923969884984e-06, "loss": 0.3395, "step": 2280 }, { "epoch": 0.8142222222222222, "grad_norm": 1.631283026660004, "learning_rate": 9.131304870062554e-06, "loss": 0.3486, "step": 2290 }, { "epoch": 0.8177777777777778, "grad_norm": 1.6552982308578106, "learning_rate": 9.119615028787771e-06, "loss": 0.3509, "step": 2300 }, { "epoch": 0.8213333333333334, "grad_norm": 1.7276297897533288, "learning_rate": 9.107854646228961e-06, "loss": 0.325, "step": 2310 }, { "epoch": 0.8248888888888889, "grad_norm": 1.445647497408194, "learning_rate": 9.096023923762333e-06, "loss": 0.3149, "step": 2320 }, { "epoch": 0.8284444444444444, "grad_norm": 1.531947731156783, "learning_rate": 9.08412306396856e-06, "loss": 0.348, "step": 2330 }, { "epoch": 0.832, "grad_norm": 1.3576987022774867, "learning_rate": 9.072152270629281e-06, "loss": 0.3096, "step": 2340 }, { "epoch": 0.8355555555555556, "grad_norm": 1.4298680216684836, "learning_rate": 9.060111748723639e-06, "loss": 0.3609, "step": 2350 }, { "epoch": 0.8391111111111111, "grad_norm": 1.5782942370819155, "learning_rate": 9.048001704424747e-06, "loss": 0.3307, "step": 2360 }, { "epoch": 0.8426666666666667, "grad_norm": 1.6461644102732529, "learning_rate": 9.035822345096177e-06, "loss": 0.3327, "step": 2370 }, { "epoch": 0.8462222222222222, "grad_norm": 1.5843145785651733, "learning_rate": 9.023573879288394e-06, "loss": 0.3312, "step": 2380 }, { "epoch": 0.8497777777777777, "grad_norm": 1.5152546857205669, "learning_rate": 9.0112565167352e-06, "loss": 0.3298, "step": 2390 }, { "epoch": 0.8533333333333334, "grad_norm": 1.7304070586423994, "learning_rate": 8.99887046835013e-06, "loss": 0.3404, "step": 2400 }, { "epoch": 0.8568888888888889, "grad_norm": 1.461299493248939, "learning_rate": 8.986415946222843e-06, "loss": 0.3351, "step": 2410 }, { "epoch": 0.8604444444444445, "grad_norm": 1.6967152528749099, "learning_rate": 8.973893163615498e-06, "loss": 0.3257, "step": 2420 }, { "epoch": 0.864, "grad_norm": 1.4154067723973784, "learning_rate": 8.96130233495909e-06, "loss": 0.3199, "step": 2430 }, { "epoch": 0.8675555555555555, "grad_norm": 1.3361597312618834, "learning_rate": 8.948643675849793e-06, "loss": 0.3442, "step": 2440 }, { "epoch": 0.8711111111111111, "grad_norm": 1.4032866224408458, "learning_rate": 8.935917403045251e-06, "loss": 0.2947, "step": 2450 }, { "epoch": 0.8746666666666667, "grad_norm": 1.234939739680067, "learning_rate": 8.923123734460885e-06, "loss": 0.3577, "step": 2460 }, { "epoch": 0.8782222222222222, "grad_norm": 1.5765934665163166, "learning_rate": 8.910262889166144e-06, "loss": 0.3326, "step": 2470 }, { "epoch": 0.8817777777777778, "grad_norm": 1.5046341548865376, "learning_rate": 8.897335087380769e-06, "loss": 0.3212, "step": 2480 }, { "epoch": 0.8853333333333333, "grad_norm": 1.3276870900100486, "learning_rate": 8.884340550471008e-06, "loss": 0.3143, "step": 2490 }, { "epoch": 0.8888888888888888, "grad_norm": 1.719735619655969, "learning_rate": 8.87127950094584e-06, "loss": 0.3747, "step": 2500 }, { "epoch": 0.8888888888888888, "eval_loss": 0.23135392367839813, "eval_runtime": 562.1868, "eval_samples_per_second": 17.788, "eval_steps_per_second": 4.447, "step": 2500 }, { "epoch": 0.8924444444444445, "grad_norm": 1.584313301872745, "learning_rate": 8.85815216245315e-06, "loss": 0.3251, "step": 2510 }, { "epoch": 0.896, "grad_norm": 1.2854406639721594, "learning_rate": 8.844958759775917e-06, "loss": 0.3242, "step": 2520 }, { "epoch": 0.8995555555555556, "grad_norm": 1.3421636352208044, "learning_rate": 8.83169951882834e-06, "loss": 0.3069, "step": 2530 }, { "epoch": 0.9031111111111111, "grad_norm": 1.6982202912735271, "learning_rate": 8.818374666652001e-06, "loss": 0.3303, "step": 2540 }, { "epoch": 0.9066666666666666, "grad_norm": 1.3802398833209684, "learning_rate": 8.804984431411951e-06, "loss": 0.3558, "step": 2550 }, { "epoch": 0.9102222222222223, "grad_norm": 1.8913239549685246, "learning_rate": 8.791529042392813e-06, "loss": 0.3947, "step": 2560 }, { "epoch": 0.9137777777777778, "grad_norm": 1.4494060942613418, "learning_rate": 8.77800872999486e-06, "loss": 0.3362, "step": 2570 }, { "epoch": 0.9173333333333333, "grad_norm": 1.7204036116920214, "learning_rate": 8.764423725730062e-06, "loss": 0.3298, "step": 2580 }, { "epoch": 0.9208888888888889, "grad_norm": 1.6130463149964605, "learning_rate": 8.750774262218129e-06, "loss": 0.3218, "step": 2590 }, { "epoch": 0.9244444444444444, "grad_norm": 1.4272505738840544, "learning_rate": 8.737060573182518e-06, "loss": 0.3325, "step": 2600 }, { "epoch": 0.928, "grad_norm": 1.5909460584884059, "learning_rate": 8.723282893446447e-06, "loss": 0.3496, "step": 2610 }, { "epoch": 0.9315555555555556, "grad_norm": 2.0360938733984963, "learning_rate": 8.709441458928853e-06, "loss": 0.3197, "step": 2620 }, { "epoch": 0.9351111111111111, "grad_norm": 1.6918095124182533, "learning_rate": 8.695536506640369e-06, "loss": 0.3349, "step": 2630 }, { "epoch": 0.9386666666666666, "grad_norm": 1.561883507817091, "learning_rate": 8.681568274679264e-06, "loss": 0.3357, "step": 2640 }, { "epoch": 0.9422222222222222, "grad_norm": 1.635386123467993, "learning_rate": 8.66753700222735e-06, "loss": 0.3023, "step": 2650 }, { "epoch": 0.9457777777777778, "grad_norm": 1.6460980849436542, "learning_rate": 8.653442929545914e-06, "loss": 0.3482, "step": 2660 }, { "epoch": 0.9493333333333334, "grad_norm": 1.8476260091970051, "learning_rate": 8.639286297971575e-06, "loss": 0.3111, "step": 2670 }, { "epoch": 0.9528888888888889, "grad_norm": 1.5625524365842092, "learning_rate": 8.625067349912171e-06, "loss": 0.3333, "step": 2680 }, { "epoch": 0.9564444444444444, "grad_norm": 1.679549783886682, "learning_rate": 8.610786328842602e-06, "loss": 0.3012, "step": 2690 }, { "epoch": 0.96, "grad_norm": 1.7334271987057313, "learning_rate": 8.59644347930066e-06, "loss": 0.3158, "step": 2700 }, { "epoch": 0.9635555555555556, "grad_norm": 1.7183702234532738, "learning_rate": 8.582039046882842e-06, "loss": 0.3045, "step": 2710 }, { "epoch": 0.9671111111111111, "grad_norm": 1.677327314139312, "learning_rate": 8.567573278240147e-06, "loss": 0.3379, "step": 2720 }, { "epoch": 0.9706666666666667, "grad_norm": 1.4197759922345252, "learning_rate": 8.55304642107385e-06, "loss": 0.3376, "step": 2730 }, { "epoch": 0.9742222222222222, "grad_norm": 1.7365860935410007, "learning_rate": 8.538458724131258e-06, "loss": 0.3395, "step": 2740 }, { "epoch": 0.9777777777777777, "grad_norm": 1.5642529718868006, "learning_rate": 8.523810437201463e-06, "loss": 0.3105, "step": 2750 }, { "epoch": 0.9813333333333333, "grad_norm": 1.6285786801359268, "learning_rate": 8.509101811111045e-06, "loss": 0.314, "step": 2760 }, { "epoch": 0.9848888888888889, "grad_norm": 1.7932095997349375, "learning_rate": 8.494333097719795e-06, "loss": 0.3183, "step": 2770 }, { "epoch": 0.9884444444444445, "grad_norm": 1.7636055661476138, "learning_rate": 8.479504549916393e-06, "loss": 0.3459, "step": 2780 }, { "epoch": 0.992, "grad_norm": 1.7893218283734698, "learning_rate": 8.464616421614077e-06, "loss": 0.3655, "step": 2790 }, { "epoch": 0.9955555555555555, "grad_norm": 1.56040627840869, "learning_rate": 8.449668967746303e-06, "loss": 0.3145, "step": 2800 }, { "epoch": 0.9991111111111111, "grad_norm": 1.7372692555117912, "learning_rate": 8.434662444262374e-06, "loss": 0.3152, "step": 2810 }, { "epoch": 1.0026666666666666, "grad_norm": 1.3178611516659062, "learning_rate": 8.419597108123054e-06, "loss": 0.256, "step": 2820 }, { "epoch": 1.0062222222222221, "grad_norm": 1.7641513434209246, "learning_rate": 8.404473217296174e-06, "loss": 0.2304, "step": 2830 }, { "epoch": 1.0097777777777779, "grad_norm": 1.702777106397184, "learning_rate": 8.389291030752215e-06, "loss": 0.2451, "step": 2840 }, { "epoch": 1.0133333333333334, "grad_norm": 1.516656565976496, "learning_rate": 8.37405080845987e-06, "loss": 0.2463, "step": 2850 }, { "epoch": 1.016888888888889, "grad_norm": 1.2615996283177406, "learning_rate": 8.358752811381592e-06, "loss": 0.2439, "step": 2860 }, { "epoch": 1.0204444444444445, "grad_norm": 1.2426761993789008, "learning_rate": 8.343397301469127e-06, "loss": 0.2301, "step": 2870 }, { "epoch": 1.024, "grad_norm": 1.7414567869166766, "learning_rate": 8.327984541659035e-06, "loss": 0.26, "step": 2880 }, { "epoch": 1.0275555555555556, "grad_norm": 1.778546754169589, "learning_rate": 8.312514795868177e-06, "loss": 0.2537, "step": 2890 }, { "epoch": 1.031111111111111, "grad_norm": 1.693194016869835, "learning_rate": 8.296988328989195e-06, "loss": 0.2474, "step": 2900 }, { "epoch": 1.0346666666666666, "grad_norm": 1.4905129718116352, "learning_rate": 8.281405406885992e-06, "loss": 0.2259, "step": 2910 }, { "epoch": 1.0382222222222222, "grad_norm": 1.6844431624217413, "learning_rate": 8.265766296389164e-06, "loss": 0.2206, "step": 2920 }, { "epoch": 1.0417777777777777, "grad_norm": 1.4064579919162583, "learning_rate": 8.250071265291432e-06, "loss": 0.2498, "step": 2930 }, { "epoch": 1.0453333333333332, "grad_norm": 1.4383166925160618, "learning_rate": 8.23432058234307e-06, "loss": 0.2316, "step": 2940 }, { "epoch": 1.048888888888889, "grad_norm": 1.7880359369165812, "learning_rate": 8.218514517247287e-06, "loss": 0.2421, "step": 2950 }, { "epoch": 1.0524444444444445, "grad_norm": 1.49095155848045, "learning_rate": 8.202653340655614e-06, "loss": 0.2547, "step": 2960 }, { "epoch": 1.056, "grad_norm": 1.802867297616481, "learning_rate": 8.18673732416328e-06, "loss": 0.2609, "step": 2970 }, { "epoch": 1.0595555555555556, "grad_norm": 1.799375023246126, "learning_rate": 8.170766740304541e-06, "loss": 0.2369, "step": 2980 }, { "epoch": 1.0631111111111111, "grad_norm": 1.645090115101595, "learning_rate": 8.154741862548035e-06, "loss": 0.2519, "step": 2990 }, { "epoch": 1.0666666666666667, "grad_norm": 1.8315765038402207, "learning_rate": 8.13866296529208e-06, "loss": 0.2248, "step": 3000 }, { "epoch": 1.0666666666666667, "eval_loss": 0.23144060373306274, "eval_runtime": 562.045, "eval_samples_per_second": 17.792, "eval_steps_per_second": 4.448, "step": 3000 }, { "epoch": 1.0702222222222222, "grad_norm": 1.3604786834079945, "learning_rate": 8.122530323859992e-06, "loss": 0.2494, "step": 3010 }, { "epoch": 1.0737777777777777, "grad_norm": 1.472974815302568, "learning_rate": 8.106344214495359e-06, "loss": 0.2168, "step": 3020 }, { "epoch": 1.0773333333333333, "grad_norm": 1.9232740710019078, "learning_rate": 8.090104914357316e-06, "loss": 0.2544, "step": 3030 }, { "epoch": 1.0808888888888888, "grad_norm": 1.6517745707358162, "learning_rate": 8.073812701515799e-06, "loss": 0.2362, "step": 3040 }, { "epoch": 1.0844444444444445, "grad_norm": 1.5375717590050721, "learning_rate": 8.057467854946783e-06, "loss": 0.238, "step": 3050 }, { "epoch": 1.088, "grad_norm": 1.736104134714019, "learning_rate": 8.041070654527498e-06, "loss": 0.2329, "step": 3060 }, { "epoch": 1.0915555555555556, "grad_norm": 1.578126670290498, "learning_rate": 8.024621381031654e-06, "loss": 0.2525, "step": 3070 }, { "epoch": 1.0951111111111111, "grad_norm": 1.2995445031583646, "learning_rate": 8.008120316124612e-06, "loss": 0.2378, "step": 3080 }, { "epoch": 1.0986666666666667, "grad_norm": 1.9084352174123695, "learning_rate": 7.991567742358582e-06, "loss": 0.2469, "step": 3090 }, { "epoch": 1.1022222222222222, "grad_norm": 1.6004292294784017, "learning_rate": 7.974963943167761e-06, "loss": 0.2721, "step": 3100 }, { "epoch": 1.1057777777777777, "grad_norm": 1.4738079995177567, "learning_rate": 7.958309202863506e-06, "loss": 0.2457, "step": 3110 }, { "epoch": 1.1093333333333333, "grad_norm": 1.5493675656690653, "learning_rate": 7.941603806629444e-06, "loss": 0.2274, "step": 3120 }, { "epoch": 1.1128888888888888, "grad_norm": 1.6554292154622638, "learning_rate": 7.9248480405166e-06, "loss": 0.2595, "step": 3130 }, { "epoch": 1.1164444444444444, "grad_norm": 1.6112904935857704, "learning_rate": 7.908042191438497e-06, "loss": 0.2374, "step": 3140 }, { "epoch": 1.12, "grad_norm": 1.4663251499352947, "learning_rate": 7.891186547166238e-06, "loss": 0.2128, "step": 3150 }, { "epoch": 1.1235555555555556, "grad_norm": 1.8636139047215206, "learning_rate": 7.874281396323589e-06, "loss": 0.2263, "step": 3160 }, { "epoch": 1.1271111111111112, "grad_norm": 1.6257921444204015, "learning_rate": 7.857327028382025e-06, "loss": 0.2392, "step": 3170 }, { "epoch": 1.1306666666666667, "grad_norm": 1.4066061759358834, "learning_rate": 7.84032373365578e-06, "loss": 0.2342, "step": 3180 }, { "epoch": 1.1342222222222222, "grad_norm": 1.5852680151393, "learning_rate": 7.823271803296876e-06, "loss": 0.2271, "step": 3190 }, { "epoch": 1.1377777777777778, "grad_norm": 1.7721860252109063, "learning_rate": 7.80617152929014e-06, "loss": 0.2376, "step": 3200 }, { "epoch": 1.1413333333333333, "grad_norm": 1.8867413038702499, "learning_rate": 7.789023204448189e-06, "loss": 0.2516, "step": 3210 }, { "epoch": 1.1448888888888888, "grad_norm": 1.4279840133381525, "learning_rate": 7.771827122406437e-06, "loss": 0.2265, "step": 3220 }, { "epoch": 1.1484444444444444, "grad_norm": 1.676800279171029, "learning_rate": 7.754583577618057e-06, "loss": 0.2554, "step": 3230 }, { "epoch": 1.152, "grad_norm": 1.6723494127405627, "learning_rate": 7.737292865348933e-06, "loss": 0.2408, "step": 3240 }, { "epoch": 1.1555555555555554, "grad_norm": 1.6148606083372026, "learning_rate": 7.719955281672618e-06, "loss": 0.2287, "step": 3250 }, { "epoch": 1.1591111111111112, "grad_norm": 1.6092526546730486, "learning_rate": 7.702571123465252e-06, "loss": 0.237, "step": 3260 }, { "epoch": 1.1626666666666667, "grad_norm": 1.3380193435685535, "learning_rate": 7.685140688400484e-06, "loss": 0.2393, "step": 3270 }, { "epoch": 1.1662222222222223, "grad_norm": 1.3406231671146336, "learning_rate": 7.66766427494438e-06, "loss": 0.2158, "step": 3280 }, { "epoch": 1.1697777777777778, "grad_norm": 1.5365708586926026, "learning_rate": 7.650142182350294e-06, "loss": 0.201, "step": 3290 }, { "epoch": 1.1733333333333333, "grad_norm": 1.7847958889549216, "learning_rate": 7.632574710653773e-06, "loss": 0.2627, "step": 3300 }, { "epoch": 1.1768888888888889, "grad_norm": 1.4770511975662048, "learning_rate": 7.614962160667384e-06, "loss": 0.221, "step": 3310 }, { "epoch": 1.1804444444444444, "grad_norm": 1.8043230337610534, "learning_rate": 7.597304833975596e-06, "loss": 0.2419, "step": 3320 }, { "epoch": 1.184, "grad_norm": 1.9363141324764201, "learning_rate": 7.579603032929597e-06, "loss": 0.2572, "step": 3330 }, { "epoch": 1.1875555555555555, "grad_norm": 1.600071864532325, "learning_rate": 7.56185706064212e-06, "loss": 0.2462, "step": 3340 }, { "epoch": 1.1911111111111112, "grad_norm": 1.5785414115422856, "learning_rate": 7.544067220982254e-06, "loss": 0.2312, "step": 3350 }, { "epoch": 1.1946666666666665, "grad_norm": 1.5789285671514135, "learning_rate": 7.526233818570245e-06, "loss": 0.2067, "step": 3360 }, { "epoch": 1.1982222222222223, "grad_norm": 1.7448328186975814, "learning_rate": 7.508357158772273e-06, "loss": 0.2448, "step": 3370 }, { "epoch": 1.2017777777777778, "grad_norm": 1.4619128557517416, "learning_rate": 7.490437547695224e-06, "loss": 0.2194, "step": 3380 }, { "epoch": 1.2053333333333334, "grad_norm": 1.6063307731749306, "learning_rate": 7.472475292181454e-06, "loss": 0.2501, "step": 3390 }, { "epoch": 1.208888888888889, "grad_norm": 1.9510115721688825, "learning_rate": 7.45447069980353e-06, "loss": 0.2515, "step": 3400 }, { "epoch": 1.2124444444444444, "grad_norm": 1.5856572080139135, "learning_rate": 7.4364240788589625e-06, "loss": 0.2461, "step": 3410 }, { "epoch": 1.216, "grad_norm": 1.846941973796494, "learning_rate": 7.418335738364931e-06, "loss": 0.2241, "step": 3420 }, { "epoch": 1.2195555555555555, "grad_norm": 1.8886992728965029, "learning_rate": 7.400205988052991e-06, "loss": 0.2298, "step": 3430 }, { "epoch": 1.223111111111111, "grad_norm": 1.6140767527032074, "learning_rate": 7.382035138363764e-06, "loss": 0.2516, "step": 3440 }, { "epoch": 1.2266666666666666, "grad_norm": 1.637777869962237, "learning_rate": 7.363823500441636e-06, "loss": 0.2422, "step": 3450 }, { "epoch": 1.2302222222222223, "grad_norm": 1.3783132940885547, "learning_rate": 7.345571386129413e-06, "loss": 0.2368, "step": 3460 }, { "epoch": 1.2337777777777779, "grad_norm": 1.750318456803832, "learning_rate": 7.327279107962995e-06, "loss": 0.2488, "step": 3470 }, { "epoch": 1.2373333333333334, "grad_norm": 1.7745176716418858, "learning_rate": 7.308946979166012e-06, "loss": 0.2277, "step": 3480 }, { "epoch": 1.240888888888889, "grad_norm": 1.7469697925399752, "learning_rate": 7.290575313644476e-06, "loss": 0.2329, "step": 3490 }, { "epoch": 1.2444444444444445, "grad_norm": 1.4439208816879574, "learning_rate": 7.272164425981387e-06, "loss": 0.2575, "step": 3500 }, { "epoch": 1.2444444444444445, "eval_loss": 0.22694812715053558, "eval_runtime": 564.2235, "eval_samples_per_second": 17.723, "eval_steps_per_second": 4.431, "step": 3500 }, { "epoch": 1.248, "grad_norm": 1.5767155030054063, "learning_rate": 7.253714631431366e-06, "loss": 0.2492, "step": 3510 }, { "epoch": 1.2515555555555555, "grad_norm": 1.5655624730827595, "learning_rate": 7.235226245915239e-06, "loss": 0.2259, "step": 3520 }, { "epoch": 1.255111111111111, "grad_norm": 1.8883245133962092, "learning_rate": 7.216699586014642e-06, "loss": 0.2487, "step": 3530 }, { "epoch": 1.2586666666666666, "grad_norm": 1.2903228684726653, "learning_rate": 7.198134968966588e-06, "loss": 0.2341, "step": 3540 }, { "epoch": 1.2622222222222224, "grad_norm": 1.6585013961180077, "learning_rate": 7.179532712658047e-06, "loss": 0.2625, "step": 3550 }, { "epoch": 1.2657777777777777, "grad_norm": 1.4955952405740183, "learning_rate": 7.160893135620488e-06, "loss": 0.2602, "step": 3560 }, { "epoch": 1.2693333333333334, "grad_norm": 1.8286387441617464, "learning_rate": 7.142216557024443e-06, "loss": 0.2221, "step": 3570 }, { "epoch": 1.272888888888889, "grad_norm": 1.6146123865735058, "learning_rate": 7.123503296674021e-06, "loss": 0.247, "step": 3580 }, { "epoch": 1.2764444444444445, "grad_norm": 1.4700165794501387, "learning_rate": 7.104753675001453e-06, "loss": 0.2405, "step": 3590 }, { "epoch": 1.28, "grad_norm": 1.4475828320209072, "learning_rate": 7.085968013061585e-06, "loss": 0.2452, "step": 3600 }, { "epoch": 1.2835555555555556, "grad_norm": 1.9854917772925798, "learning_rate": 7.067146632526398e-06, "loss": 0.2813, "step": 3610 }, { "epoch": 1.287111111111111, "grad_norm": 1.863775670718366, "learning_rate": 7.048289855679487e-06, "loss": 0.2272, "step": 3620 }, { "epoch": 1.2906666666666666, "grad_norm": 2.0238745081645693, "learning_rate": 7.029398005410551e-06, "loss": 0.2588, "step": 3630 }, { "epoch": 1.2942222222222222, "grad_norm": 1.8729516419448864, "learning_rate": 7.01047140520986e-06, "loss": 0.2403, "step": 3640 }, { "epoch": 1.2977777777777777, "grad_norm": 1.721501900738319, "learning_rate": 6.9915103791627146e-06, "loss": 0.2477, "step": 3650 }, { "epoch": 1.3013333333333335, "grad_norm": 1.6626021007269847, "learning_rate": 6.972515251943901e-06, "loss": 0.2279, "step": 3660 }, { "epoch": 1.3048888888888888, "grad_norm": 1.6716430135185554, "learning_rate": 6.953486348812127e-06, "loss": 0.2414, "step": 3670 }, { "epoch": 1.3084444444444445, "grad_norm": 1.4291636119458788, "learning_rate": 6.934423995604455e-06, "loss": 0.248, "step": 3680 }, { "epoch": 1.312, "grad_norm": 1.4674689793023254, "learning_rate": 6.915328518730724e-06, "loss": 0.2459, "step": 3690 }, { "epoch": 1.3155555555555556, "grad_norm": 1.5215618690023482, "learning_rate": 6.896200245167956e-06, "loss": 0.2546, "step": 3700 }, { "epoch": 1.3191111111111111, "grad_norm": 1.67624683709797, "learning_rate": 6.877039502454758e-06, "loss": 0.2006, "step": 3710 }, { "epoch": 1.3226666666666667, "grad_norm": 1.552246698817707, "learning_rate": 6.857846618685724e-06, "loss": 0.2213, "step": 3720 }, { "epoch": 1.3262222222222222, "grad_norm": 2.021180154460745, "learning_rate": 6.8386219225057945e-06, "loss": 0.2315, "step": 3730 }, { "epoch": 1.3297777777777777, "grad_norm": 1.8378386656471875, "learning_rate": 6.819365743104655e-06, "loss": 0.2235, "step": 3740 }, { "epoch": 1.3333333333333333, "grad_norm": 1.8383503621089257, "learning_rate": 6.8000784102110795e-06, "loss": 0.2348, "step": 3750 }, { "epoch": 1.3368888888888888, "grad_norm": 1.476660408503267, "learning_rate": 6.780760254087293e-06, "loss": 0.2433, "step": 3760 }, { "epoch": 1.3404444444444445, "grad_norm": 1.6056267413924534, "learning_rate": 6.7614116055233146e-06, "loss": 0.2511, "step": 3770 }, { "epoch": 1.3439999999999999, "grad_norm": 1.5433968607865032, "learning_rate": 6.742032795831298e-06, "loss": 0.2218, "step": 3780 }, { "epoch": 1.3475555555555556, "grad_norm": 1.8752695620093498, "learning_rate": 6.722624156839847e-06, "loss": 0.2607, "step": 3790 }, { "epoch": 1.3511111111111112, "grad_norm": 1.7018274048947808, "learning_rate": 6.703186020888347e-06, "loss": 0.2434, "step": 3800 }, { "epoch": 1.3546666666666667, "grad_norm": 1.7419410223233012, "learning_rate": 6.683718720821264e-06, "loss": 0.2494, "step": 3810 }, { "epoch": 1.3582222222222222, "grad_norm": 1.5145074056393906, "learning_rate": 6.664222589982451e-06, "loss": 0.2215, "step": 3820 }, { "epoch": 1.3617777777777778, "grad_norm": 1.2846516741089247, "learning_rate": 6.644697962209434e-06, "loss": 0.2346, "step": 3830 }, { "epoch": 1.3653333333333333, "grad_norm": 1.4951097829345636, "learning_rate": 6.6251451718277095e-06, "loss": 0.2122, "step": 3840 }, { "epoch": 1.3688888888888888, "grad_norm": 1.837176746272441, "learning_rate": 6.605564553644998e-06, "loss": 0.2289, "step": 3850 }, { "epoch": 1.3724444444444446, "grad_norm": 1.7541861945923773, "learning_rate": 6.585956442945531e-06, "loss": 0.2304, "step": 3860 }, { "epoch": 1.376, "grad_norm": 1.456084798251464, "learning_rate": 6.566321175484298e-06, "loss": 0.2524, "step": 3870 }, { "epoch": 1.3795555555555556, "grad_norm": 1.4021880078388174, "learning_rate": 6.546659087481304e-06, "loss": 0.2344, "step": 3880 }, { "epoch": 1.3831111111111112, "grad_norm": 1.386759603833687, "learning_rate": 6.526970515615807e-06, "loss": 0.2278, "step": 3890 }, { "epoch": 1.3866666666666667, "grad_norm": 1.9340717544487618, "learning_rate": 6.507255797020555e-06, "loss": 0.2299, "step": 3900 }, { "epoch": 1.3902222222222222, "grad_norm": 1.4309730673942778, "learning_rate": 6.487515269276015e-06, "loss": 0.2518, "step": 3910 }, { "epoch": 1.3937777777777778, "grad_norm": 1.5432073955843775, "learning_rate": 6.467749270404593e-06, "loss": 0.2196, "step": 3920 }, { "epoch": 1.3973333333333333, "grad_norm": 1.5255820019311863, "learning_rate": 6.4479581388648404e-06, "loss": 0.2527, "step": 3930 }, { "epoch": 1.4008888888888889, "grad_norm": 1.9387048217346732, "learning_rate": 6.428142213545662e-06, "loss": 0.2663, "step": 3940 }, { "epoch": 1.4044444444444444, "grad_norm": 1.4687424654762213, "learning_rate": 6.408301833760517e-06, "loss": 0.2141, "step": 3950 }, { "epoch": 1.408, "grad_norm": 1.6790491256350315, "learning_rate": 6.388437339241601e-06, "loss": 0.2419, "step": 3960 }, { "epoch": 1.4115555555555557, "grad_norm": 1.4986463255132796, "learning_rate": 6.368549070134036e-06, "loss": 0.2205, "step": 3970 }, { "epoch": 1.415111111111111, "grad_norm": 1.8639041315873657, "learning_rate": 6.348637366990038e-06, "loss": 0.2403, "step": 3980 }, { "epoch": 1.4186666666666667, "grad_norm": 1.8313804556837663, "learning_rate": 6.328702570763098e-06, "loss": 0.243, "step": 3990 }, { "epoch": 1.4222222222222223, "grad_norm": 1.6288666479905434, "learning_rate": 6.308745022802128e-06, "loss": 0.2376, "step": 4000 }, { "epoch": 1.4222222222222223, "eval_loss": 0.22332721948623657, "eval_runtime": 562.4439, "eval_samples_per_second": 17.78, "eval_steps_per_second": 4.445, "step": 4000 }, { "epoch": 1.4257777777777778, "grad_norm": 1.28363469470016, "learning_rate": 6.288765064845629e-06, "loss": 0.2119, "step": 4010 }, { "epoch": 1.4293333333333333, "grad_norm": 1.5685400141436767, "learning_rate": 6.268763039015833e-06, "loss": 0.2372, "step": 4020 }, { "epoch": 1.4328888888888889, "grad_norm": 1.2419732210599121, "learning_rate": 6.248739287812846e-06, "loss": 0.2378, "step": 4030 }, { "epoch": 1.4364444444444444, "grad_norm": 1.450791049105233, "learning_rate": 6.228694154108783e-06, "loss": 0.236, "step": 4040 }, { "epoch": 1.44, "grad_norm": 1.3478041984965912, "learning_rate": 6.208627981141902e-06, "loss": 0.2165, "step": 4050 }, { "epoch": 1.4435555555555555, "grad_norm": 1.6880548918845273, "learning_rate": 6.188541112510713e-06, "loss": 0.2405, "step": 4060 }, { "epoch": 1.447111111111111, "grad_norm": 1.489941080547117, "learning_rate": 6.168433892168113e-06, "loss": 0.2288, "step": 4070 }, { "epoch": 1.4506666666666668, "grad_norm": 2.036909885440752, "learning_rate": 6.148306664415476e-06, "loss": 0.235, "step": 4080 }, { "epoch": 1.4542222222222223, "grad_norm": 1.60733518117776, "learning_rate": 6.128159773896783e-06, "loss": 0.2143, "step": 4090 }, { "epoch": 1.4577777777777778, "grad_norm": 1.6002205563066152, "learning_rate": 6.107993565592693e-06, "loss": 0.239, "step": 4100 }, { "epoch": 1.4613333333333334, "grad_norm": 1.59924513215813, "learning_rate": 6.087808384814652e-06, "loss": 0.2185, "step": 4110 }, { "epoch": 1.464888888888889, "grad_norm": 1.6651512334739322, "learning_rate": 6.067604577198981e-06, "loss": 0.238, "step": 4120 }, { "epoch": 1.4684444444444444, "grad_norm": 1.6551324049801701, "learning_rate": 6.04738248870095e-06, "loss": 0.2238, "step": 4130 }, { "epoch": 1.472, "grad_norm": 1.5301258421668906, "learning_rate": 6.027142465588855e-06, "loss": 0.2453, "step": 4140 }, { "epoch": 1.4755555555555555, "grad_norm": 1.8144546212524773, "learning_rate": 6.006884854438099e-06, "loss": 0.2375, "step": 4150 }, { "epoch": 1.479111111111111, "grad_norm": 1.5099593511650293, "learning_rate": 5.9866100021252415e-06, "loss": 0.2331, "step": 4160 }, { "epoch": 1.4826666666666668, "grad_norm": 1.502590510458408, "learning_rate": 5.966318255822072e-06, "loss": 0.2131, "step": 4170 }, { "epoch": 1.4862222222222221, "grad_norm": 1.7399671557461471, "learning_rate": 5.946009962989659e-06, "loss": 0.243, "step": 4180 }, { "epoch": 1.4897777777777779, "grad_norm": 1.959843593418678, "learning_rate": 5.9256854713724e-06, "loss": 0.2344, "step": 4190 }, { "epoch": 1.4933333333333334, "grad_norm": 1.5187384802338688, "learning_rate": 5.905345128992072e-06, "loss": 0.2372, "step": 4200 }, { "epoch": 1.496888888888889, "grad_norm": 1.713913961820143, "learning_rate": 5.884989284141866e-06, "loss": 0.2137, "step": 4210 }, { "epoch": 1.5004444444444445, "grad_norm": 1.5301932679943313, "learning_rate": 5.86461828538043e-06, "loss": 0.2264, "step": 4220 }, { "epoch": 1.504, "grad_norm": 1.6650108469792486, "learning_rate": 5.84423248152589e-06, "loss": 0.2167, "step": 4230 }, { "epoch": 1.5075555555555555, "grad_norm": 1.7377610919859674, "learning_rate": 5.82383222164989e-06, "loss": 0.2223, "step": 4240 }, { "epoch": 1.511111111111111, "grad_norm": 1.8280200619954592, "learning_rate": 5.803417855071603e-06, "loss": 0.2361, "step": 4250 }, { "epoch": 1.5146666666666668, "grad_norm": 1.7315368181217787, "learning_rate": 5.782989731351762e-06, "loss": 0.2665, "step": 4260 }, { "epoch": 1.5182222222222221, "grad_norm": 1.6917154736502973, "learning_rate": 5.762548200286659e-06, "loss": 0.212, "step": 4270 }, { "epoch": 1.521777777777778, "grad_norm": 1.5262051452408105, "learning_rate": 5.742093611902168e-06, "loss": 0.2142, "step": 4280 }, { "epoch": 1.5253333333333332, "grad_norm": 1.4955231464253305, "learning_rate": 5.721626316447748e-06, "loss": 0.2302, "step": 4290 }, { "epoch": 1.528888888888889, "grad_norm": 1.729596636954076, "learning_rate": 5.7011466643904434e-06, "loss": 0.2209, "step": 4300 }, { "epoch": 1.5324444444444445, "grad_norm": 1.470928828267314, "learning_rate": 5.680655006408882e-06, "loss": 0.2398, "step": 4310 }, { "epoch": 1.536, "grad_norm": 1.4046672488847465, "learning_rate": 5.660151693387273e-06, "loss": 0.2335, "step": 4320 }, { "epoch": 1.5395555555555556, "grad_norm": 1.6687999325358385, "learning_rate": 5.639637076409404e-06, "loss": 0.2207, "step": 4330 }, { "epoch": 1.543111111111111, "grad_norm": 1.60564618911301, "learning_rate": 5.6191115067526135e-06, "loss": 0.2411, "step": 4340 }, { "epoch": 1.5466666666666666, "grad_norm": 1.6047937970455775, "learning_rate": 5.598575335881792e-06, "loss": 0.2161, "step": 4350 }, { "epoch": 1.5502222222222222, "grad_norm": 1.3451412373708476, "learning_rate": 5.578028915443356e-06, "loss": 0.2104, "step": 4360 }, { "epoch": 1.553777777777778, "grad_norm": 1.827680836587444, "learning_rate": 5.55747259725923e-06, "loss": 0.2333, "step": 4370 }, { "epoch": 1.5573333333333332, "grad_norm": 1.8474659285597943, "learning_rate": 5.536906733320816e-06, "loss": 0.2447, "step": 4380 }, { "epoch": 1.560888888888889, "grad_norm": 1.5571932949328393, "learning_rate": 5.516331675782973e-06, "loss": 0.2445, "step": 4390 }, { "epoch": 1.5644444444444443, "grad_norm": 1.9294806844289611, "learning_rate": 5.495747776957987e-06, "loss": 0.2382, "step": 4400 }, { "epoch": 1.568, "grad_norm": 1.3637347529801744, "learning_rate": 5.475155389309531e-06, "loss": 0.2162, "step": 4410 }, { "epoch": 1.5715555555555556, "grad_norm": 1.552594376889073, "learning_rate": 5.4545548654466366e-06, "loss": 0.2351, "step": 4420 }, { "epoch": 1.5751111111111111, "grad_norm": 1.563596866564994, "learning_rate": 5.433946558117654e-06, "loss": 0.2259, "step": 4430 }, { "epoch": 1.5786666666666667, "grad_norm": 1.9424477147575314, "learning_rate": 5.413330820204214e-06, "loss": 0.2269, "step": 4440 }, { "epoch": 1.5822222222222222, "grad_norm": 1.7161442287459214, "learning_rate": 5.392708004715178e-06, "loss": 0.233, "step": 4450 }, { "epoch": 1.5857777777777777, "grad_norm": 1.4458518805717744, "learning_rate": 5.372078464780603e-06, "loss": 0.2428, "step": 4460 }, { "epoch": 1.5893333333333333, "grad_norm": 1.7197914268509118, "learning_rate": 5.351442553645691e-06, "loss": 0.2095, "step": 4470 }, { "epoch": 1.592888888888889, "grad_norm": 1.7871712697682276, "learning_rate": 5.330800624664736e-06, "loss": 0.2375, "step": 4480 }, { "epoch": 1.5964444444444443, "grad_norm": 1.6154295338481346, "learning_rate": 5.310153031295079e-06, "loss": 0.2365, "step": 4490 }, { "epoch": 1.6, "grad_norm": 1.8622833358204558, "learning_rate": 5.289500127091056e-06, "loss": 0.2521, "step": 4500 }, { "epoch": 1.6, "eval_loss": 0.22019484639167786, "eval_runtime": 562.6101, "eval_samples_per_second": 17.774, "eval_steps_per_second": 4.444, "step": 4500 }, { "epoch": 1.6035555555555554, "grad_norm": 1.4160865462023664, "learning_rate": 5.26884226569794e-06, "loss": 0.2445, "step": 4510 }, { "epoch": 1.6071111111111112, "grad_norm": 1.6982387533503471, "learning_rate": 5.248179800845884e-06, "loss": 0.2586, "step": 4520 }, { "epoch": 1.6106666666666667, "grad_norm": 1.8063057152671183, "learning_rate": 5.227513086343875e-06, "loss": 0.2342, "step": 4530 }, { "epoch": 1.6142222222222222, "grad_norm": 1.8369946808465265, "learning_rate": 5.20684247607366e-06, "loss": 0.2149, "step": 4540 }, { "epoch": 1.6177777777777778, "grad_norm": 1.4919743522204885, "learning_rate": 5.186168323983702e-06, "loss": 0.2361, "step": 4550 }, { "epoch": 1.6213333333333333, "grad_norm": 1.908909797085476, "learning_rate": 5.1654909840831e-06, "loss": 0.2422, "step": 4560 }, { "epoch": 1.624888888888889, "grad_norm": 1.6970594817568836, "learning_rate": 5.144810810435553e-06, "loss": 0.2702, "step": 4570 }, { "epoch": 1.6284444444444444, "grad_norm": 1.914631182858778, "learning_rate": 5.124128157153273e-06, "loss": 0.211, "step": 4580 }, { "epoch": 1.6320000000000001, "grad_norm": 1.8308898752074714, "learning_rate": 5.103443378390935e-06, "loss": 0.213, "step": 4590 }, { "epoch": 1.6355555555555554, "grad_norm": 1.4716155031307734, "learning_rate": 5.08275682833961e-06, "loss": 0.2348, "step": 4600 }, { "epoch": 1.6391111111111112, "grad_norm": 1.3846959035420932, "learning_rate": 5.062068861220697e-06, "loss": 0.2323, "step": 4610 }, { "epoch": 1.6426666666666667, "grad_norm": 1.310528332429156, "learning_rate": 5.041379831279859e-06, "loss": 0.2274, "step": 4620 }, { "epoch": 1.6462222222222223, "grad_norm": 1.56294035415104, "learning_rate": 5.020690092780961e-06, "loss": 0.2382, "step": 4630 }, { "epoch": 1.6497777777777778, "grad_norm": 1.797053581769004, "learning_rate": 5e-06, "loss": 0.2263, "step": 4640 }, { "epoch": 1.6533333333333333, "grad_norm": 1.57684485333151, "learning_rate": 4.9793099072190406e-06, "loss": 0.2225, "step": 4650 }, { "epoch": 1.6568888888888889, "grad_norm": 2.0411280702141883, "learning_rate": 4.958620168720144e-06, "loss": 0.2225, "step": 4660 }, { "epoch": 1.6604444444444444, "grad_norm": 1.476641016823167, "learning_rate": 4.937931138779305e-06, "loss": 0.2438, "step": 4670 }, { "epoch": 1.6640000000000001, "grad_norm": 1.4259185034698016, "learning_rate": 4.917243171660391e-06, "loss": 0.2127, "step": 4680 }, { "epoch": 1.6675555555555555, "grad_norm": 1.9925037267732388, "learning_rate": 4.896556621609066e-06, "loss": 0.223, "step": 4690 }, { "epoch": 1.6711111111111112, "grad_norm": 1.3845653896887404, "learning_rate": 4.8758718428467275e-06, "loss": 0.2332, "step": 4700 }, { "epoch": 1.6746666666666665, "grad_norm": 1.5936847174408162, "learning_rate": 4.8551891895644485e-06, "loss": 0.2381, "step": 4710 }, { "epoch": 1.6782222222222223, "grad_norm": 1.8741655887113169, "learning_rate": 4.8345090159169015e-06, "loss": 0.2182, "step": 4720 }, { "epoch": 1.6817777777777778, "grad_norm": 2.0577120951961057, "learning_rate": 4.813831676016301e-06, "loss": 0.2323, "step": 4730 }, { "epoch": 1.6853333333333333, "grad_norm": 1.6887655358314864, "learning_rate": 4.793157523926343e-06, "loss": 0.2236, "step": 4740 }, { "epoch": 1.6888888888888889, "grad_norm": 1.669624887759933, "learning_rate": 4.772486913656126e-06, "loss": 0.216, "step": 4750 }, { "epoch": 1.6924444444444444, "grad_norm": 1.3957590014036165, "learning_rate": 4.751820199154116e-06, "loss": 0.2104, "step": 4760 }, { "epoch": 1.696, "grad_norm": 1.7601085948001791, "learning_rate": 4.731157734302063e-06, "loss": 0.2255, "step": 4770 }, { "epoch": 1.6995555555555555, "grad_norm": 1.4141936030167341, "learning_rate": 4.7104998729089456e-06, "loss": 0.2216, "step": 4780 }, { "epoch": 1.7031111111111112, "grad_norm": 1.5375991664201998, "learning_rate": 4.689846968704921e-06, "loss": 0.2316, "step": 4790 }, { "epoch": 1.7066666666666666, "grad_norm": 1.835379245628528, "learning_rate": 4.669199375335267e-06, "loss": 0.2211, "step": 4800 }, { "epoch": 1.7102222222222223, "grad_norm": 1.8813507703109071, "learning_rate": 4.64855744635431e-06, "loss": 0.2279, "step": 4810 }, { "epoch": 1.7137777777777776, "grad_norm": 1.6192801344534893, "learning_rate": 4.627921535219398e-06, "loss": 0.2076, "step": 4820 }, { "epoch": 1.7173333333333334, "grad_norm": 1.5047363033780152, "learning_rate": 4.607291995284824e-06, "loss": 0.2272, "step": 4830 }, { "epoch": 1.720888888888889, "grad_norm": 1.7489501841705488, "learning_rate": 4.586669179795789e-06, "loss": 0.2269, "step": 4840 }, { "epoch": 1.7244444444444444, "grad_norm": 1.5125229649844467, "learning_rate": 4.566053441882346e-06, "loss": 0.2187, "step": 4850 }, { "epoch": 1.728, "grad_norm": 1.456492370626904, "learning_rate": 4.545445134553365e-06, "loss": 0.2179, "step": 4860 }, { "epoch": 1.7315555555555555, "grad_norm": 1.620452560710039, "learning_rate": 4.52484461069047e-06, "loss": 0.2262, "step": 4870 }, { "epoch": 1.7351111111111113, "grad_norm": 2.0083784630353887, "learning_rate": 4.504252223042015e-06, "loss": 0.2363, "step": 4880 }, { "epoch": 1.7386666666666666, "grad_norm": 1.4284347298197593, "learning_rate": 4.4836683242170274e-06, "loss": 0.2297, "step": 4890 }, { "epoch": 1.7422222222222223, "grad_norm": 1.4968259463132965, "learning_rate": 4.463093266679185e-06, "loss": 0.2223, "step": 4900 }, { "epoch": 1.7457777777777777, "grad_norm": 1.625381108991568, "learning_rate": 4.442527402740773e-06, "loss": 0.2177, "step": 4910 }, { "epoch": 1.7493333333333334, "grad_norm": 1.7761034776967624, "learning_rate": 4.4219710845566445e-06, "loss": 0.2266, "step": 4920 }, { "epoch": 1.752888888888889, "grad_norm": 1.513194923019174, "learning_rate": 4.401424664118209e-06, "loss": 0.2385, "step": 4930 }, { "epoch": 1.7564444444444445, "grad_norm": 1.6662188116169265, "learning_rate": 4.380888493247389e-06, "loss": 0.2209, "step": 4940 }, { "epoch": 1.76, "grad_norm": 1.7192566216460916, "learning_rate": 4.360362923590599e-06, "loss": 0.2273, "step": 4950 }, { "epoch": 1.7635555555555555, "grad_norm": 1.6376141309754375, "learning_rate": 4.339848306612726e-06, "loss": 0.2263, "step": 4960 }, { "epoch": 1.767111111111111, "grad_norm": 1.5441961811580323, "learning_rate": 4.319344993591122e-06, "loss": 0.2317, "step": 4970 }, { "epoch": 1.7706666666666666, "grad_norm": 1.8214320335618939, "learning_rate": 4.298853335609558e-06, "loss": 0.2352, "step": 4980 }, { "epoch": 1.7742222222222224, "grad_norm": 1.56553607416482, "learning_rate": 4.278373683552252e-06, "loss": 0.2451, "step": 4990 }, { "epoch": 1.7777777777777777, "grad_norm": 1.3995626238477137, "learning_rate": 4.257906388097833e-06, "loss": 0.2119, "step": 5000 }, { "epoch": 1.7777777777777777, "eval_loss": 0.2164340764284134, "eval_runtime": 560.6747, "eval_samples_per_second": 17.836, "eval_steps_per_second": 4.459, "step": 5000 }, { "epoch": 1.7813333333333334, "grad_norm": 2.040538040793932, "learning_rate": 4.237451799713343e-06, "loss": 0.2311, "step": 5010 }, { "epoch": 1.7848888888888887, "grad_norm": 1.718359867250397, "learning_rate": 4.2170102686482386e-06, "loss": 0.2308, "step": 5020 }, { "epoch": 1.7884444444444445, "grad_norm": 1.647498620915099, "learning_rate": 4.196582144928398e-06, "loss": 0.2343, "step": 5030 }, { "epoch": 1.792, "grad_norm": 1.529219174043635, "learning_rate": 4.176167778350111e-06, "loss": 0.2471, "step": 5040 }, { "epoch": 1.7955555555555556, "grad_norm": 1.8299602144032394, "learning_rate": 4.155767518474112e-06, "loss": 0.2334, "step": 5050 }, { "epoch": 1.799111111111111, "grad_norm": 1.6343462536475093, "learning_rate": 4.135381714619572e-06, "loss": 0.2352, "step": 5060 }, { "epoch": 1.8026666666666666, "grad_norm": 1.9294723624845498, "learning_rate": 4.115010715858135e-06, "loss": 0.2295, "step": 5070 }, { "epoch": 1.8062222222222222, "grad_norm": 1.8402038191366281, "learning_rate": 4.09465487100793e-06, "loss": 0.2227, "step": 5080 }, { "epoch": 1.8097777777777777, "grad_norm": 1.8931304584295443, "learning_rate": 4.074314528627602e-06, "loss": 0.2355, "step": 5090 }, { "epoch": 1.8133333333333335, "grad_norm": 1.8206151546804537, "learning_rate": 4.053990037010342e-06, "loss": 0.2323, "step": 5100 }, { "epoch": 1.8168888888888888, "grad_norm": 1.5473952396079231, "learning_rate": 4.033681744177929e-06, "loss": 0.2069, "step": 5110 }, { "epoch": 1.8204444444444445, "grad_norm": 1.2199743932660083, "learning_rate": 4.013389997874759e-06, "loss": 0.2076, "step": 5120 }, { "epoch": 1.8239999999999998, "grad_norm": 1.7825722106285342, "learning_rate": 3.993115145561902e-06, "loss": 0.2425, "step": 5130 }, { "epoch": 1.8275555555555556, "grad_norm": 1.8303008392916014, "learning_rate": 3.9728575344111456e-06, "loss": 0.234, "step": 5140 }, { "epoch": 1.8311111111111111, "grad_norm": 1.2964915164879398, "learning_rate": 3.9526175112990515e-06, "loss": 0.1987, "step": 5150 }, { "epoch": 1.8346666666666667, "grad_norm": 1.5700753166440498, "learning_rate": 3.93239542280102e-06, "loss": 0.2137, "step": 5160 }, { "epoch": 1.8382222222222222, "grad_norm": 1.6406760092620998, "learning_rate": 3.912191615185349e-06, "loss": 0.2235, "step": 5170 }, { "epoch": 1.8417777777777777, "grad_norm": 1.5447905159493263, "learning_rate": 3.892006434407309e-06, "loss": 0.2218, "step": 5180 }, { "epoch": 1.8453333333333335, "grad_norm": 1.7383544264235498, "learning_rate": 3.871840226103219e-06, "loss": 0.2287, "step": 5190 }, { "epoch": 1.8488888888888888, "grad_norm": 1.9317016214891507, "learning_rate": 3.851693335584525e-06, "loss": 0.2228, "step": 5200 }, { "epoch": 1.8524444444444446, "grad_norm": 1.5692018080933492, "learning_rate": 3.831566107831889e-06, "loss": 0.2331, "step": 5210 }, { "epoch": 1.8559999999999999, "grad_norm": 2.050378660719503, "learning_rate": 3.8114588874892893e-06, "loss": 0.2137, "step": 5220 }, { "epoch": 1.8595555555555556, "grad_norm": 1.5271617708228957, "learning_rate": 3.791372018858099e-06, "loss": 0.2135, "step": 5230 }, { "epoch": 1.8631111111111112, "grad_norm": 1.31763541419423, "learning_rate": 3.7713058458912164e-06, "loss": 0.2217, "step": 5240 }, { "epoch": 1.8666666666666667, "grad_norm": 1.6488724873659462, "learning_rate": 3.751260712187156e-06, "loss": 0.2539, "step": 5250 }, { "epoch": 1.8702222222222222, "grad_norm": 1.392136229173735, "learning_rate": 3.731236960984169e-06, "loss": 0.2179, "step": 5260 }, { "epoch": 1.8737777777777778, "grad_norm": 1.6189512718112575, "learning_rate": 3.711234935154372e-06, "loss": 0.2183, "step": 5270 }, { "epoch": 1.8773333333333333, "grad_norm": 1.5548818693905742, "learning_rate": 3.6912549771978747e-06, "loss": 0.2354, "step": 5280 }, { "epoch": 1.8808888888888888, "grad_norm": 1.4728328055912387, "learning_rate": 3.6712974292369035e-06, "loss": 0.2268, "step": 5290 }, { "epoch": 1.8844444444444446, "grad_norm": 1.5435161738551857, "learning_rate": 3.651362633009962e-06, "loss": 0.204, "step": 5300 }, { "epoch": 1.888, "grad_norm": 1.5873129086509827, "learning_rate": 3.6314509298659663e-06, "loss": 0.208, "step": 5310 }, { "epoch": 1.8915555555555557, "grad_norm": 1.3391876728975607, "learning_rate": 3.6115626607584e-06, "loss": 0.2372, "step": 5320 }, { "epoch": 1.895111111111111, "grad_norm": 1.88178920211116, "learning_rate": 3.5916981662394856e-06, "loss": 0.2257, "step": 5330 }, { "epoch": 1.8986666666666667, "grad_norm": 1.764120901512499, "learning_rate": 3.5718577864543396e-06, "loss": 0.2103, "step": 5340 }, { "epoch": 1.9022222222222223, "grad_norm": 1.6698875487111986, "learning_rate": 3.552041861135161e-06, "loss": 0.211, "step": 5350 }, { "epoch": 1.9057777777777778, "grad_norm": 1.6957349016200651, "learning_rate": 3.532250729595408e-06, "loss": 0.2164, "step": 5360 }, { "epoch": 1.9093333333333333, "grad_norm": 1.5603565111247202, "learning_rate": 3.5124847307239863e-06, "loss": 0.2265, "step": 5370 }, { "epoch": 1.9128888888888889, "grad_norm": 1.5529468285695374, "learning_rate": 3.4927442029794467e-06, "loss": 0.2316, "step": 5380 }, { "epoch": 1.9164444444444444, "grad_norm": 1.7677530671686799, "learning_rate": 3.473029484384196e-06, "loss": 0.219, "step": 5390 }, { "epoch": 1.92, "grad_norm": 1.9782571884316444, "learning_rate": 3.4533409125186974e-06, "loss": 0.2252, "step": 5400 }, { "epoch": 1.9235555555555557, "grad_norm": 1.7371605678560165, "learning_rate": 3.4336788245157026e-06, "loss": 0.2222, "step": 5410 }, { "epoch": 1.927111111111111, "grad_norm": 1.7241089696999294, "learning_rate": 3.4140435570544708e-06, "loss": 0.2345, "step": 5420 }, { "epoch": 1.9306666666666668, "grad_norm": 1.7019802310043695, "learning_rate": 3.3944354463550035e-06, "loss": 0.214, "step": 5430 }, { "epoch": 1.934222222222222, "grad_norm": 1.8394276850187319, "learning_rate": 3.374854828172292e-06, "loss": 0.234, "step": 5440 }, { "epoch": 1.9377777777777778, "grad_norm": 1.7264682966489493, "learning_rate": 3.3553020377905663e-06, "loss": 0.2242, "step": 5450 }, { "epoch": 1.9413333333333334, "grad_norm": 1.6744044298365783, "learning_rate": 3.3357774100175513e-06, "loss": 0.2245, "step": 5460 }, { "epoch": 1.944888888888889, "grad_norm": 1.4991747809315612, "learning_rate": 3.316281279178737e-06, "loss": 0.2114, "step": 5470 }, { "epoch": 1.9484444444444444, "grad_norm": 1.5141154002091217, "learning_rate": 3.296813979111655e-06, "loss": 0.2182, "step": 5480 }, { "epoch": 1.952, "grad_norm": 1.7580533484108005, "learning_rate": 3.2773758431601543e-06, "loss": 0.2234, "step": 5490 }, { "epoch": 1.9555555555555557, "grad_norm": 1.6014365241780455, "learning_rate": 3.257967204168705e-06, "loss": 0.238, "step": 5500 }, { "epoch": 1.9555555555555557, "eval_loss": 0.21176277101039886, "eval_runtime": 560.9255, "eval_samples_per_second": 17.828, "eval_steps_per_second": 4.457, "step": 5500 }, { "epoch": 1.959111111111111, "grad_norm": 1.566927102750067, "learning_rate": 3.2385883944766867e-06, "loss": 0.1932, "step": 5510 }, { "epoch": 1.9626666666666668, "grad_norm": 1.7041733469332605, "learning_rate": 3.2192397459127077e-06, "loss": 0.2194, "step": 5520 }, { "epoch": 1.966222222222222, "grad_norm": 1.7846179835205314, "learning_rate": 3.199921589788923e-06, "loss": 0.2092, "step": 5530 }, { "epoch": 1.9697777777777778, "grad_norm": 1.482707355318634, "learning_rate": 3.180634256895345e-06, "loss": 0.2328, "step": 5540 }, { "epoch": 1.9733333333333334, "grad_norm": 1.6559180099205715, "learning_rate": 3.161378077494205e-06, "loss": 0.234, "step": 5550 }, { "epoch": 1.976888888888889, "grad_norm": 1.4931797613124567, "learning_rate": 3.142153381314278e-06, "loss": 0.2285, "step": 5560 }, { "epoch": 1.9804444444444445, "grad_norm": 1.6899228150340497, "learning_rate": 3.122960497545242e-06, "loss": 0.2347, "step": 5570 }, { "epoch": 1.984, "grad_norm": 1.6112817535514066, "learning_rate": 3.103799754832045e-06, "loss": 0.2017, "step": 5580 }, { "epoch": 1.9875555555555555, "grad_norm": 1.4492842053913877, "learning_rate": 3.0846714812692774e-06, "loss": 0.2282, "step": 5590 }, { "epoch": 1.991111111111111, "grad_norm": 1.6227303784789882, "learning_rate": 3.065576004395546e-06, "loss": 0.2193, "step": 5600 }, { "epoch": 1.9946666666666668, "grad_norm": 1.6532339878737676, "learning_rate": 3.046513651187874e-06, "loss": 0.205, "step": 5610 }, { "epoch": 1.9982222222222221, "grad_norm": 1.726150455488493, "learning_rate": 3.027484748056101e-06, "loss": 0.2052, "step": 5620 }, { "epoch": 2.001777777777778, "grad_norm": 1.2491575364238943, "learning_rate": 3.008489620837287e-06, "loss": 0.1793, "step": 5630 }, { "epoch": 2.005333333333333, "grad_norm": 1.539466703681713, "learning_rate": 2.989528594790142e-06, "loss": 0.133, "step": 5640 }, { "epoch": 2.008888888888889, "grad_norm": 1.5201921987042595, "learning_rate": 2.97060199458945e-06, "loss": 0.1364, "step": 5650 }, { "epoch": 2.0124444444444443, "grad_norm": 1.8387836805686166, "learning_rate": 2.9517101443205143e-06, "loss": 0.138, "step": 5660 }, { "epoch": 2.016, "grad_norm": 1.6624452979538558, "learning_rate": 2.9328533674736043e-06, "loss": 0.1372, "step": 5670 }, { "epoch": 2.0195555555555558, "grad_norm": 2.0375067274701464, "learning_rate": 2.914031986938417e-06, "loss": 0.1376, "step": 5680 }, { "epoch": 2.023111111111111, "grad_norm": 1.5020388133691598, "learning_rate": 2.895246324998549e-06, "loss": 0.132, "step": 5690 }, { "epoch": 2.026666666666667, "grad_norm": 1.5200304354769367, "learning_rate": 2.8764967033259793e-06, "loss": 0.1332, "step": 5700 }, { "epoch": 2.030222222222222, "grad_norm": 1.615938242121572, "learning_rate": 2.8577834429755586e-06, "loss": 0.137, "step": 5710 }, { "epoch": 2.033777777777778, "grad_norm": 1.7244206202588588, "learning_rate": 2.839106864379512e-06, "loss": 0.1311, "step": 5720 }, { "epoch": 2.037333333333333, "grad_norm": 1.4204204890159835, "learning_rate": 2.8204672873419565e-06, "loss": 0.1359, "step": 5730 }, { "epoch": 2.040888888888889, "grad_norm": 1.641810724006462, "learning_rate": 2.8018650310334118e-06, "loss": 0.1524, "step": 5740 }, { "epoch": 2.0444444444444443, "grad_norm": 1.6197231294728873, "learning_rate": 2.783300413985359e-06, "loss": 0.1216, "step": 5750 }, { "epoch": 2.048, "grad_norm": 1.7166152973793496, "learning_rate": 2.764773754084763e-06, "loss": 0.1393, "step": 5760 }, { "epoch": 2.0515555555555554, "grad_norm": 1.7305108784705923, "learning_rate": 2.7462853685686362e-06, "loss": 0.1429, "step": 5770 }, { "epoch": 2.055111111111111, "grad_norm": 1.2910967057789844, "learning_rate": 2.7278355740186123e-06, "loss": 0.1336, "step": 5780 }, { "epoch": 2.058666666666667, "grad_norm": 1.5080611405633613, "learning_rate": 2.7094246863555262e-06, "loss": 0.1359, "step": 5790 }, { "epoch": 2.062222222222222, "grad_norm": 1.8733744454525603, "learning_rate": 2.691053020833988e-06, "loss": 0.1388, "step": 5800 }, { "epoch": 2.065777777777778, "grad_norm": 1.7085324740063759, "learning_rate": 2.6727208920370063e-06, "loss": 0.1355, "step": 5810 }, { "epoch": 2.0693333333333332, "grad_norm": 1.5576784710780245, "learning_rate": 2.6544286138705867e-06, "loss": 0.1328, "step": 5820 }, { "epoch": 2.072888888888889, "grad_norm": 1.9703710936721526, "learning_rate": 2.636176499558364e-06, "loss": 0.1354, "step": 5830 }, { "epoch": 2.0764444444444443, "grad_norm": 1.5952203119705437, "learning_rate": 2.6179648616362374e-06, "loss": 0.1493, "step": 5840 }, { "epoch": 2.08, "grad_norm": 1.9073156525645674, "learning_rate": 2.599794011947012e-06, "loss": 0.1579, "step": 5850 }, { "epoch": 2.0835555555555554, "grad_norm": 1.7695748236621889, "learning_rate": 2.581664261635069e-06, "loss": 0.1446, "step": 5860 }, { "epoch": 2.087111111111111, "grad_norm": 1.8880183020861152, "learning_rate": 2.5635759211410396e-06, "loss": 0.1406, "step": 5870 }, { "epoch": 2.0906666666666665, "grad_norm": 1.5198269240530051, "learning_rate": 2.545529300196472e-06, "loss": 0.1244, "step": 5880 }, { "epoch": 2.094222222222222, "grad_norm": 1.9355343365767825, "learning_rate": 2.527524707818547e-06, "loss": 0.1289, "step": 5890 }, { "epoch": 2.097777777777778, "grad_norm": 1.546102626213903, "learning_rate": 2.5095624523047775e-06, "loss": 0.1151, "step": 5900 }, { "epoch": 2.1013333333333333, "grad_norm": 1.3237810299249595, "learning_rate": 2.491642841227729e-06, "loss": 0.1386, "step": 5910 }, { "epoch": 2.104888888888889, "grad_norm": 1.6354432410587478, "learning_rate": 2.4737661814297557e-06, "loss": 0.1152, "step": 5920 }, { "epoch": 2.1084444444444443, "grad_norm": 1.7641939157921844, "learning_rate": 2.455932779017747e-06, "loss": 0.1267, "step": 5930 }, { "epoch": 2.112, "grad_norm": 1.7717956617877848, "learning_rate": 2.438142939357882e-06, "loss": 0.1468, "step": 5940 }, { "epoch": 2.1155555555555554, "grad_norm": 1.9248857260031529, "learning_rate": 2.4203969670704065e-06, "loss": 0.1426, "step": 5950 }, { "epoch": 2.119111111111111, "grad_norm": 1.6693083011986807, "learning_rate": 2.4026951660244063e-06, "loss": 0.1519, "step": 5960 }, { "epoch": 2.1226666666666665, "grad_norm": 1.4577868069815147, "learning_rate": 2.385037839332616e-06, "loss": 0.1449, "step": 5970 }, { "epoch": 2.1262222222222222, "grad_norm": 1.5757247401728414, "learning_rate": 2.3674252893462304e-06, "loss": 0.1508, "step": 5980 }, { "epoch": 2.129777777777778, "grad_norm": 1.798414953668795, "learning_rate": 2.3498578176497055e-06, "loss": 0.1336, "step": 5990 }, { "epoch": 2.1333333333333333, "grad_norm": 1.3502333712237125, "learning_rate": 2.3323357250556213e-06, "loss": 0.1289, "step": 6000 }, { "epoch": 2.1333333333333333, "eval_loss": 0.24109843373298645, "eval_runtime": 561.0318, "eval_samples_per_second": 17.824, "eval_steps_per_second": 4.456, "step": 6000 }, { "epoch": 2.136888888888889, "grad_norm": 1.6807098639484461, "learning_rate": 2.3148593115995155e-06, "loss": 0.1232, "step": 6010 }, { "epoch": 2.1404444444444444, "grad_norm": 1.3750693562838343, "learning_rate": 2.2974288765347484e-06, "loss": 0.1406, "step": 6020 }, { "epoch": 2.144, "grad_norm": 1.7740210796916787, "learning_rate": 2.280044718327383e-06, "loss": 0.1366, "step": 6030 }, { "epoch": 2.1475555555555554, "grad_norm": 1.3613431283259703, "learning_rate": 2.262707134651069e-06, "loss": 0.1347, "step": 6040 }, { "epoch": 2.151111111111111, "grad_norm": 1.5001232721911446, "learning_rate": 2.2454164223819443e-06, "loss": 0.1435, "step": 6050 }, { "epoch": 2.1546666666666665, "grad_norm": 1.6096086307058128, "learning_rate": 2.228172877593563e-06, "loss": 0.1248, "step": 6060 }, { "epoch": 2.1582222222222223, "grad_norm": 1.4625689431665512, "learning_rate": 2.2109767955518135e-06, "loss": 0.129, "step": 6070 }, { "epoch": 2.1617777777777776, "grad_norm": 1.7396993983427422, "learning_rate": 2.193828470709863e-06, "loss": 0.1259, "step": 6080 }, { "epoch": 2.1653333333333333, "grad_norm": 1.4423513554123952, "learning_rate": 2.176728196703122e-06, "loss": 0.1308, "step": 6090 }, { "epoch": 2.168888888888889, "grad_norm": 1.9920936118384482, "learning_rate": 2.159676266344222e-06, "loss": 0.1496, "step": 6100 }, { "epoch": 2.1724444444444444, "grad_norm": 2.13727569719491, "learning_rate": 2.142672971617978e-06, "loss": 0.1359, "step": 6110 }, { "epoch": 2.176, "grad_norm": 1.5724700258419562, "learning_rate": 2.125718603676413e-06, "loss": 0.1412, "step": 6120 }, { "epoch": 2.1795555555555555, "grad_norm": 1.3817720285663424, "learning_rate": 2.1088134528337635e-06, "loss": 0.1357, "step": 6130 }, { "epoch": 2.1831111111111112, "grad_norm": 1.6852270201894561, "learning_rate": 2.091957808561505e-06, "loss": 0.1388, "step": 6140 }, { "epoch": 2.1866666666666665, "grad_norm": 1.5752301082061768, "learning_rate": 2.0751519594834025e-06, "loss": 0.1359, "step": 6150 }, { "epoch": 2.1902222222222223, "grad_norm": 1.9588237176858065, "learning_rate": 2.058396193370556e-06, "loss": 0.1364, "step": 6160 }, { "epoch": 2.1937777777777776, "grad_norm": 1.5906028620881005, "learning_rate": 2.0416907971364937e-06, "loss": 0.1286, "step": 6170 }, { "epoch": 2.1973333333333334, "grad_norm": 1.6040127033831966, "learning_rate": 2.0250360568322395e-06, "loss": 0.132, "step": 6180 }, { "epoch": 2.2008888888888887, "grad_norm": 1.903945940065679, "learning_rate": 2.0084322576414205e-06, "loss": 0.1311, "step": 6190 }, { "epoch": 2.2044444444444444, "grad_norm": 1.7327408494603853, "learning_rate": 1.991879683875386e-06, "loss": 0.1412, "step": 6200 }, { "epoch": 2.208, "grad_norm": 1.6938104353348038, "learning_rate": 1.975378618968348e-06, "loss": 0.1358, "step": 6210 }, { "epoch": 2.2115555555555555, "grad_norm": 1.498102728760879, "learning_rate": 1.958929345472503e-06, "loss": 0.1272, "step": 6220 }, { "epoch": 2.2151111111111113, "grad_norm": 1.5061713395545921, "learning_rate": 1.942532145053219e-06, "loss": 0.1335, "step": 6230 }, { "epoch": 2.2186666666666666, "grad_norm": 1.8881968807558394, "learning_rate": 1.926187298484201e-06, "loss": 0.13, "step": 6240 }, { "epoch": 2.2222222222222223, "grad_norm": 1.7409457044279315, "learning_rate": 1.9098950856426845e-06, "loss": 0.1197, "step": 6250 }, { "epoch": 2.2257777777777776, "grad_norm": 1.7410736866607524, "learning_rate": 1.893655785504644e-06, "loss": 0.136, "step": 6260 }, { "epoch": 2.2293333333333334, "grad_norm": 1.4673795329307866, "learning_rate": 1.8774696761400107e-06, "loss": 0.1351, "step": 6270 }, { "epoch": 2.2328888888888887, "grad_norm": 1.4286935284704283, "learning_rate": 1.8613370347079207e-06, "loss": 0.1316, "step": 6280 }, { "epoch": 2.2364444444444445, "grad_norm": 1.6752679462634348, "learning_rate": 1.845258137451968e-06, "loss": 0.1343, "step": 6290 }, { "epoch": 2.24, "grad_norm": 1.5334658674891999, "learning_rate": 1.8292332596954605e-06, "loss": 0.1252, "step": 6300 }, { "epoch": 2.2435555555555555, "grad_norm": 1.7816021858972186, "learning_rate": 1.8132626758367217e-06, "loss": 0.1373, "step": 6310 }, { "epoch": 2.2471111111111113, "grad_norm": 1.4751058571451898, "learning_rate": 1.7973466593443861e-06, "loss": 0.1238, "step": 6320 }, { "epoch": 2.2506666666666666, "grad_norm": 1.5737118263350949, "learning_rate": 1.7814854827527144e-06, "loss": 0.1331, "step": 6330 }, { "epoch": 2.2542222222222223, "grad_norm": 1.6723085510766795, "learning_rate": 1.7656794176569302e-06, "loss": 0.1392, "step": 6340 }, { "epoch": 2.2577777777777777, "grad_norm": 1.6074614963797307, "learning_rate": 1.749928734708568e-06, "loss": 0.1482, "step": 6350 }, { "epoch": 2.2613333333333334, "grad_norm": 1.514935517928495, "learning_rate": 1.734233703610838e-06, "loss": 0.1318, "step": 6360 }, { "epoch": 2.2648888888888887, "grad_norm": 2.1990045539686767, "learning_rate": 1.7185945931140086e-06, "loss": 0.1389, "step": 6370 }, { "epoch": 2.2684444444444445, "grad_norm": 1.7900402567821287, "learning_rate": 1.7030116710108068e-06, "loss": 0.1402, "step": 6380 }, { "epoch": 2.2720000000000002, "grad_norm": 1.5936415333953513, "learning_rate": 1.6874852041318246e-06, "loss": 0.1383, "step": 6390 }, { "epoch": 2.2755555555555556, "grad_norm": 1.6874167667097502, "learning_rate": 1.6720154583409642e-06, "loss": 0.1297, "step": 6400 }, { "epoch": 2.279111111111111, "grad_norm": 1.7461565673164665, "learning_rate": 1.6566026985308737e-06, "loss": 0.1265, "step": 6410 }, { "epoch": 2.2826666666666666, "grad_norm": 1.9943666083505533, "learning_rate": 1.6412471886184106e-06, "loss": 0.1433, "step": 6420 }, { "epoch": 2.2862222222222224, "grad_norm": 1.889269033390485, "learning_rate": 1.6259491915401322e-06, "loss": 0.1295, "step": 6430 }, { "epoch": 2.2897777777777777, "grad_norm": 1.9954192603921324, "learning_rate": 1.6107089692477856e-06, "loss": 0.1506, "step": 6440 }, { "epoch": 2.2933333333333334, "grad_norm": 1.73943513110269, "learning_rate": 1.5955267827038267e-06, "loss": 0.1309, "step": 6450 }, { "epoch": 2.2968888888888888, "grad_norm": 1.5696215992092173, "learning_rate": 1.5804028918769488e-06, "loss": 0.1245, "step": 6460 }, { "epoch": 2.3004444444444445, "grad_norm": 1.4480211516999386, "learning_rate": 1.5653375557376266e-06, "loss": 0.1419, "step": 6470 }, { "epoch": 2.304, "grad_norm": 1.7769598112511977, "learning_rate": 1.5503310322536962e-06, "loss": 0.1357, "step": 6480 }, { "epoch": 2.3075555555555556, "grad_norm": 1.6914490635403432, "learning_rate": 1.5353835783859244e-06, "loss": 0.1344, "step": 6490 }, { "epoch": 2.311111111111111, "grad_norm": 1.2896364219654397, "learning_rate": 1.5204954500836095e-06, "loss": 0.1336, "step": 6500 }, { "epoch": 2.311111111111111, "eval_loss": 0.2400493621826172, "eval_runtime": 562.3512, "eval_samples_per_second": 17.782, "eval_steps_per_second": 4.446, "step": 6500 }, { "epoch": 2.3146666666666667, "grad_norm": 1.6249516275302234, "learning_rate": 1.5056669022802051e-06, "loss": 0.1578, "step": 6510 }, { "epoch": 2.3182222222222224, "grad_norm": 1.5534728727358678, "learning_rate": 1.4908981888889562e-06, "loss": 0.1236, "step": 6520 }, { "epoch": 2.3217777777777777, "grad_norm": 2.305594450780404, "learning_rate": 1.4761895627985384e-06, "loss": 0.1437, "step": 6530 }, { "epoch": 2.3253333333333335, "grad_norm": 1.7525804358624415, "learning_rate": 1.461541275868742e-06, "loss": 0.1244, "step": 6540 }, { "epoch": 2.328888888888889, "grad_norm": 1.5857723879215653, "learning_rate": 1.4469535789261518e-06, "loss": 0.138, "step": 6550 }, { "epoch": 2.3324444444444445, "grad_norm": 1.4470785666281207, "learning_rate": 1.4324267217598543e-06, "loss": 0.1311, "step": 6560 }, { "epoch": 2.336, "grad_norm": 1.5783013529079604, "learning_rate": 1.41796095311716e-06, "loss": 0.1476, "step": 6570 }, { "epoch": 2.3395555555555556, "grad_norm": 1.792387189040966, "learning_rate": 1.4035565206993407e-06, "loss": 0.1313, "step": 6580 }, { "epoch": 2.343111111111111, "grad_norm": 2.0097219507066986, "learning_rate": 1.3892136711573983e-06, "loss": 0.1481, "step": 6590 }, { "epoch": 2.3466666666666667, "grad_norm": 1.6038575587094324, "learning_rate": 1.3749326500878308e-06, "loss": 0.1329, "step": 6600 }, { "epoch": 2.3502222222222224, "grad_norm": 1.8038941533229218, "learning_rate": 1.3607137020284267e-06, "loss": 0.1296, "step": 6610 }, { "epoch": 2.3537777777777777, "grad_norm": 1.5967517903597408, "learning_rate": 1.3465570704540877e-06, "loss": 0.1323, "step": 6620 }, { "epoch": 2.3573333333333335, "grad_norm": 1.6630671725280828, "learning_rate": 1.33246299777265e-06, "loss": 0.1353, "step": 6630 }, { "epoch": 2.360888888888889, "grad_norm": 1.6910996186336409, "learning_rate": 1.3184317253207379e-06, "loss": 0.1198, "step": 6640 }, { "epoch": 2.3644444444444446, "grad_norm": 1.667550829249205, "learning_rate": 1.3044634933596311e-06, "loss": 0.1398, "step": 6650 }, { "epoch": 2.368, "grad_norm": 1.3604264834299673, "learning_rate": 1.290558541071148e-06, "loss": 0.123, "step": 6660 }, { "epoch": 2.3715555555555556, "grad_norm": 1.4966865021721736, "learning_rate": 1.2767171065535538e-06, "loss": 0.1221, "step": 6670 }, { "epoch": 2.375111111111111, "grad_norm": 1.3751769981745194, "learning_rate": 1.2629394268174811e-06, "loss": 0.1398, "step": 6680 }, { "epoch": 2.3786666666666667, "grad_norm": 1.7552964254373993, "learning_rate": 1.2492257377818734e-06, "loss": 0.122, "step": 6690 }, { "epoch": 2.3822222222222225, "grad_norm": 1.984424873865648, "learning_rate": 1.235576274269938e-06, "loss": 0.1366, "step": 6700 }, { "epoch": 2.3857777777777778, "grad_norm": 1.8024296643627178, "learning_rate": 1.2219912700051417e-06, "loss": 0.1304, "step": 6710 }, { "epoch": 2.389333333333333, "grad_norm": 1.6704237658027163, "learning_rate": 1.2084709576071885e-06, "loss": 0.1339, "step": 6720 }, { "epoch": 2.392888888888889, "grad_norm": 1.8905223292433262, "learning_rate": 1.1950155685880504e-06, "loss": 0.138, "step": 6730 }, { "epoch": 2.3964444444444446, "grad_norm": 1.8585326052998994, "learning_rate": 1.1816253333479994e-06, "loss": 0.1402, "step": 6740 }, { "epoch": 2.4, "grad_norm": 1.4117751565900303, "learning_rate": 1.1683004811716597e-06, "loss": 0.1219, "step": 6750 }, { "epoch": 2.4035555555555557, "grad_norm": 2.177441304004068, "learning_rate": 1.1550412402240852e-06, "loss": 0.1472, "step": 6760 }, { "epoch": 2.407111111111111, "grad_norm": 1.7312870442889088, "learning_rate": 1.1418478375468496e-06, "loss": 0.14, "step": 6770 }, { "epoch": 2.4106666666666667, "grad_norm": 1.4691171208612808, "learning_rate": 1.1287204990541612e-06, "loss": 0.1382, "step": 6780 }, { "epoch": 2.414222222222222, "grad_norm": 1.9102821919207582, "learning_rate": 1.1156594495289923e-06, "loss": 0.1508, "step": 6790 }, { "epoch": 2.417777777777778, "grad_norm": 1.5765296328104144, "learning_rate": 1.1026649126192334e-06, "loss": 0.1244, "step": 6800 }, { "epoch": 2.421333333333333, "grad_norm": 1.485558878346715, "learning_rate": 1.0897371108338572e-06, "loss": 0.1262, "step": 6810 }, { "epoch": 2.424888888888889, "grad_norm": 1.6805947418795415, "learning_rate": 1.076876265539115e-06, "loss": 0.1397, "step": 6820 }, { "epoch": 2.4284444444444446, "grad_norm": 1.8439671145791727, "learning_rate": 1.0640825969547498e-06, "loss": 0.1298, "step": 6830 }, { "epoch": 2.432, "grad_norm": 1.8675356289498493, "learning_rate": 1.051356324150209e-06, "loss": 0.1334, "step": 6840 }, { "epoch": 2.4355555555555557, "grad_norm": 2.097329265797065, "learning_rate": 1.0386976650409102e-06, "loss": 0.1342, "step": 6850 }, { "epoch": 2.439111111111111, "grad_norm": 1.7733262424549074, "learning_rate": 1.0261068363845034e-06, "loss": 0.1297, "step": 6860 }, { "epoch": 2.4426666666666668, "grad_norm": 1.7698885455909084, "learning_rate": 1.0135840537771574e-06, "loss": 0.1355, "step": 6870 }, { "epoch": 2.446222222222222, "grad_norm": 1.699595680180769, "learning_rate": 1.001129531649872e-06, "loss": 0.1255, "step": 6880 }, { "epoch": 2.449777777777778, "grad_norm": 1.8061641909036275, "learning_rate": 9.887434832647997e-07, "loss": 0.1355, "step": 6890 }, { "epoch": 2.453333333333333, "grad_norm": 1.8282679409791762, "learning_rate": 9.764261207116061e-07, "loss": 0.1437, "step": 6900 }, { "epoch": 2.456888888888889, "grad_norm": 1.8691781223789907, "learning_rate": 9.641776549038257e-07, "loss": 0.1274, "step": 6910 }, { "epoch": 2.4604444444444447, "grad_norm": 1.8720204975109627, "learning_rate": 9.519982955752549e-07, "loss": 0.1321, "step": 6920 }, { "epoch": 2.464, "grad_norm": 1.714725769185188, "learning_rate": 9.398882512763618e-07, "loss": 0.1299, "step": 6930 }, { "epoch": 2.4675555555555557, "grad_norm": 1.5736356325676821, "learning_rate": 9.278477293707189e-07, "loss": 0.1454, "step": 6940 }, { "epoch": 2.471111111111111, "grad_norm": 1.7235279739808778, "learning_rate": 9.158769360314412e-07, "loss": 0.1301, "step": 6950 }, { "epoch": 2.474666666666667, "grad_norm": 1.7964601353844663, "learning_rate": 9.039760762376665e-07, "loss": 0.1329, "step": 6960 }, { "epoch": 2.478222222222222, "grad_norm": 1.7113961505997257, "learning_rate": 8.921453537710406e-07, "loss": 0.1301, "step": 6970 }, { "epoch": 2.481777777777778, "grad_norm": 3.7247151362742708, "learning_rate": 8.803849712122292e-07, "loss": 0.1366, "step": 6980 }, { "epoch": 2.485333333333333, "grad_norm": 1.6042128553101094, "learning_rate": 8.686951299374474e-07, "loss": 0.1248, "step": 6990 }, { "epoch": 2.488888888888889, "grad_norm": 1.7566315817690532, "learning_rate": 8.570760301150166e-07, "loss": 0.1397, "step": 7000 }, { "epoch": 2.488888888888889, "eval_loss": 0.239632710814476, "eval_runtime": 563.0915, "eval_samples_per_second": 17.759, "eval_steps_per_second": 4.44, "step": 7000 }, { "epoch": 2.4924444444444447, "grad_norm": 1.915869222287072, "learning_rate": 8.455278707019255e-07, "loss": 0.133, "step": 7010 }, { "epoch": 2.496, "grad_norm": 1.4611242467498158, "learning_rate": 8.340508494404415e-07, "loss": 0.128, "step": 7020 }, { "epoch": 2.4995555555555553, "grad_norm": 1.8274207116893812, "learning_rate": 8.226451628547039e-07, "loss": 0.1304, "step": 7030 }, { "epoch": 2.503111111111111, "grad_norm": 1.5195837090357422, "learning_rate": 8.113110062473756e-07, "loss": 0.1337, "step": 7040 }, { "epoch": 2.506666666666667, "grad_norm": 1.534284195780538, "learning_rate": 8.000485736962899e-07, "loss": 0.1365, "step": 7050 }, { "epoch": 2.510222222222222, "grad_norm": 1.3874360730778557, "learning_rate": 7.888580580511307e-07, "loss": 0.1157, "step": 7060 }, { "epoch": 2.513777777777778, "grad_norm": 1.347897014568791, "learning_rate": 7.777396509301278e-07, "loss": 0.1258, "step": 7070 }, { "epoch": 2.517333333333333, "grad_norm": 1.5444960857241712, "learning_rate": 7.666935427167777e-07, "loss": 0.1261, "step": 7080 }, { "epoch": 2.520888888888889, "grad_norm": 1.5787802499569878, "learning_rate": 7.557199225565848e-07, "loss": 0.1353, "step": 7090 }, { "epoch": 2.5244444444444447, "grad_norm": 1.6575537900928325, "learning_rate": 7.448189783538184e-07, "loss": 0.1223, "step": 7100 }, { "epoch": 2.528, "grad_norm": 1.58456318992188, "learning_rate": 7.339908967683007e-07, "loss": 0.1227, "step": 7110 }, { "epoch": 2.5315555555555553, "grad_norm": 1.916341417565209, "learning_rate": 7.232358632122022e-07, "loss": 0.1365, "step": 7120 }, { "epoch": 2.535111111111111, "grad_norm": 2.009648842498942, "learning_rate": 7.125540618468784e-07, "loss": 0.1435, "step": 7130 }, { "epoch": 2.538666666666667, "grad_norm": 1.2589650678388224, "learning_rate": 7.019456755797083e-07, "loss": 0.1333, "step": 7140 }, { "epoch": 2.542222222222222, "grad_norm": 1.534526581817288, "learning_rate": 6.914108860609608e-07, "loss": 0.1372, "step": 7150 }, { "epoch": 2.545777777777778, "grad_norm": 1.5742622053962463, "learning_rate": 6.809498736806919e-07, "loss": 0.135, "step": 7160 }, { "epoch": 2.5493333333333332, "grad_norm": 1.876907152948741, "learning_rate": 6.705628175656498e-07, "loss": 0.1304, "step": 7170 }, { "epoch": 2.552888888888889, "grad_norm": 1.7507039554831174, "learning_rate": 6.602498955762105e-07, "loss": 0.1361, "step": 7180 }, { "epoch": 2.5564444444444443, "grad_norm": 1.5168112309443524, "learning_rate": 6.500112843033313e-07, "loss": 0.1235, "step": 7190 }, { "epoch": 2.56, "grad_norm": 1.366857399391539, "learning_rate": 6.39847159065523e-07, "loss": 0.1268, "step": 7200 }, { "epoch": 2.5635555555555554, "grad_norm": 1.7472209117726187, "learning_rate": 6.297576939058586e-07, "loss": 0.1338, "step": 7210 }, { "epoch": 2.567111111111111, "grad_norm": 1.5771285823832333, "learning_rate": 6.197430615889838e-07, "loss": 0.1304, "step": 7220 }, { "epoch": 2.570666666666667, "grad_norm": 1.5122386895026887, "learning_rate": 6.098034335981573e-07, "loss": 0.1255, "step": 7230 }, { "epoch": 2.574222222222222, "grad_norm": 1.5101320862852827, "learning_rate": 5.999389801323219e-07, "loss": 0.128, "step": 7240 }, { "epoch": 2.5777777777777775, "grad_norm": 1.751375058176443, "learning_rate": 5.901498701031894e-07, "loss": 0.131, "step": 7250 }, { "epoch": 2.5813333333333333, "grad_norm": 1.5370110538793642, "learning_rate": 5.804362711323391e-07, "loss": 0.1273, "step": 7260 }, { "epoch": 2.584888888888889, "grad_norm": 1.5422190674222276, "learning_rate": 5.707983495483593e-07, "loss": 0.122, "step": 7270 }, { "epoch": 2.5884444444444443, "grad_norm": 1.8111593254497258, "learning_rate": 5.612362703839907e-07, "loss": 0.1308, "step": 7280 }, { "epoch": 2.592, "grad_norm": 1.7898287718649462, "learning_rate": 5.517501973733059e-07, "loss": 0.1239, "step": 7290 }, { "epoch": 2.5955555555555554, "grad_norm": 1.5741550714022359, "learning_rate": 5.423402929489019e-07, "loss": 0.1242, "step": 7300 }, { "epoch": 2.599111111111111, "grad_norm": 1.7431025808198797, "learning_rate": 5.330067182391219e-07, "loss": 0.1258, "step": 7310 }, { "epoch": 2.602666666666667, "grad_norm": 1.669472703725672, "learning_rate": 5.237496330652925e-07, "loss": 0.1318, "step": 7320 }, { "epoch": 2.606222222222222, "grad_norm": 1.7086096850592123, "learning_rate": 5.145691959389932e-07, "loss": 0.1292, "step": 7330 }, { "epoch": 2.6097777777777775, "grad_norm": 1.79780883791639, "learning_rate": 5.054655640593325e-07, "loss": 0.1446, "step": 7340 }, { "epoch": 2.6133333333333333, "grad_norm": 1.760230682240199, "learning_rate": 4.964388933102666e-07, "loss": 0.1418, "step": 7350 }, { "epoch": 2.616888888888889, "grad_norm": 1.540197801989686, "learning_rate": 4.874893382579232e-07, "loss": 0.1269, "step": 7360 }, { "epoch": 2.6204444444444444, "grad_norm": 1.7177370855999565, "learning_rate": 4.786170521479588e-07, "loss": 0.1223, "step": 7370 }, { "epoch": 2.624, "grad_norm": 1.881294576905093, "learning_rate": 4.698221869029307e-07, "loss": 0.1443, "step": 7380 }, { "epoch": 2.6275555555555554, "grad_norm": 1.74196972034532, "learning_rate": 4.6110489311969876e-07, "loss": 0.1429, "step": 7390 }, { "epoch": 2.631111111111111, "grad_norm": 1.5651241374342044, "learning_rate": 4.524653200668461e-07, "loss": 0.1264, "step": 7400 }, { "epoch": 2.634666666666667, "grad_norm": 1.8251309622054404, "learning_rate": 4.439036156821225e-07, "loss": 0.1213, "step": 7410 }, { "epoch": 2.6382222222222222, "grad_norm": 1.4351427368380598, "learning_rate": 4.3541992656991163e-07, "loss": 0.1182, "step": 7420 }, { "epoch": 2.6417777777777776, "grad_norm": 1.9769377027322241, "learning_rate": 4.2701439799871847e-07, "loss": 0.1453, "step": 7430 }, { "epoch": 2.6453333333333333, "grad_norm": 1.6755217149463195, "learning_rate": 4.1868717389868694e-07, "loss": 0.1284, "step": 7440 }, { "epoch": 2.648888888888889, "grad_norm": 1.4882784431490907, "learning_rate": 4.1043839685913135e-07, "loss": 0.1289, "step": 7450 }, { "epoch": 2.6524444444444444, "grad_norm": 1.2678152146637376, "learning_rate": 4.022682081260942e-07, "loss": 0.122, "step": 7460 }, { "epoch": 2.656, "grad_norm": 1.7036091433400906, "learning_rate": 3.941767475999297e-07, "loss": 0.1292, "step": 7470 }, { "epoch": 2.6595555555555555, "grad_norm": 2.0073020304210485, "learning_rate": 3.8616415383291083e-07, "loss": 0.1281, "step": 7480 }, { "epoch": 2.663111111111111, "grad_norm": 1.7003882572239488, "learning_rate": 3.7823056402684856e-07, "loss": 0.1205, "step": 7490 }, { "epoch": 2.6666666666666665, "grad_norm": 1.8649824143158358, "learning_rate": 3.70376114030751e-07, "loss": 0.1405, "step": 7500 }, { "epoch": 2.6666666666666665, "eval_loss": 0.2399507761001587, "eval_runtime": 561.5965, "eval_samples_per_second": 17.806, "eval_steps_per_second": 4.452, "step": 7500 }, { "epoch": 2.6702222222222223, "grad_norm": 1.778861851144716, "learning_rate": 3.626009383384926e-07, "loss": 0.1424, "step": 7510 }, { "epoch": 2.6737777777777776, "grad_norm": 1.7506343466298935, "learning_rate": 3.549051700865136e-07, "loss": 0.1242, "step": 7520 }, { "epoch": 2.6773333333333333, "grad_norm": 1.5579333925843626, "learning_rate": 3.47288941051539e-07, "loss": 0.125, "step": 7530 }, { "epoch": 2.680888888888889, "grad_norm": 2.030096385748008, "learning_rate": 3.3975238164831893e-07, "loss": 0.1253, "step": 7540 }, { "epoch": 2.6844444444444444, "grad_norm": 1.635535994621638, "learning_rate": 3.322956209274031e-07, "loss": 0.1322, "step": 7550 }, { "epoch": 2.6879999999999997, "grad_norm": 1.7329277515156414, "learning_rate": 3.2491878657292643e-07, "loss": 0.1355, "step": 7560 }, { "epoch": 2.6915555555555555, "grad_norm": 1.7444157426686764, "learning_rate": 3.176220049004197e-07, "loss": 0.1179, "step": 7570 }, { "epoch": 2.6951111111111112, "grad_norm": 1.3483728954452034, "learning_rate": 3.104054008546525e-07, "loss": 0.1338, "step": 7580 }, { "epoch": 2.6986666666666665, "grad_norm": 1.3906620863471058, "learning_rate": 3.032690980074915e-07, "loss": 0.131, "step": 7590 }, { "epoch": 2.7022222222222223, "grad_norm": 1.8327466893042572, "learning_rate": 2.962132185557826e-07, "loss": 0.1223, "step": 7600 }, { "epoch": 2.7057777777777776, "grad_norm": 1.5547638545825841, "learning_rate": 2.892378833192611e-07, "loss": 0.1282, "step": 7610 }, { "epoch": 2.7093333333333334, "grad_norm": 1.804096897597165, "learning_rate": 2.823432117384822e-07, "loss": 0.1321, "step": 7620 }, { "epoch": 2.712888888888889, "grad_norm": 1.5920189474841397, "learning_rate": 2.755293218727739e-07, "loss": 0.1266, "step": 7630 }, { "epoch": 2.7164444444444444, "grad_norm": 1.95119518386987, "learning_rate": 2.6879633039821994e-07, "loss": 0.1356, "step": 7640 }, { "epoch": 2.7199999999999998, "grad_norm": 1.8385230420520196, "learning_rate": 2.62144352605655e-07, "loss": 0.1262, "step": 7650 }, { "epoch": 2.7235555555555555, "grad_norm": 1.7885799872230752, "learning_rate": 2.555735023986966e-07, "loss": 0.1315, "step": 7660 }, { "epoch": 2.7271111111111113, "grad_norm": 1.8941729319880476, "learning_rate": 2.4908389229179484e-07, "loss": 0.1179, "step": 7670 }, { "epoch": 2.7306666666666666, "grad_norm": 1.5725333890356554, "learning_rate": 2.4267563340830026e-07, "loss": 0.1122, "step": 7680 }, { "epoch": 2.7342222222222223, "grad_norm": 1.9949059298619423, "learning_rate": 2.363488354785648e-07, "loss": 0.1372, "step": 7690 }, { "epoch": 2.7377777777777776, "grad_norm": 1.706241835042834, "learning_rate": 2.301036068380641e-07, "loss": 0.1303, "step": 7700 }, { "epoch": 2.7413333333333334, "grad_norm": 1.5015166048586166, "learning_rate": 2.239400544255399e-07, "loss": 0.121, "step": 7710 }, { "epoch": 2.744888888888889, "grad_norm": 1.69358016809196, "learning_rate": 2.178582837811688e-07, "loss": 0.1249, "step": 7720 }, { "epoch": 2.7484444444444445, "grad_norm": 1.9732967017351475, "learning_rate": 2.1185839904475869e-07, "loss": 0.133, "step": 7730 }, { "epoch": 2.752, "grad_norm": 1.5594363807881604, "learning_rate": 2.0594050295395852e-07, "loss": 0.1304, "step": 7740 }, { "epoch": 2.7555555555555555, "grad_norm": 2.026099043557669, "learning_rate": 2.0010469684250856e-07, "loss": 0.1385, "step": 7750 }, { "epoch": 2.7591111111111113, "grad_norm": 1.5917173969753626, "learning_rate": 1.9435108063849684e-07, "loss": 0.1365, "step": 7760 }, { "epoch": 2.7626666666666666, "grad_norm": 1.7387563784538043, "learning_rate": 1.8867975286265106e-07, "loss": 0.1278, "step": 7770 }, { "epoch": 2.7662222222222224, "grad_norm": 1.491992475001642, "learning_rate": 1.830908106266538e-07, "loss": 0.1169, "step": 7780 }, { "epoch": 2.7697777777777777, "grad_norm": 1.8209635910179756, "learning_rate": 1.7758434963147665e-07, "loss": 0.143, "step": 7790 }, { "epoch": 2.7733333333333334, "grad_norm": 1.6054626426110197, "learning_rate": 1.7216046416574316e-07, "loss": 0.1335, "step": 7800 }, { "epoch": 2.7768888888888887, "grad_norm": 1.6151516199907796, "learning_rate": 1.66819247104113e-07, "loss": 0.1338, "step": 7810 }, { "epoch": 2.7804444444444445, "grad_norm": 1.9698941742198866, "learning_rate": 1.6156078990569313e-07, "loss": 0.1203, "step": 7820 }, { "epoch": 2.784, "grad_norm": 1.6305672042666572, "learning_rate": 1.563851826124696e-07, "loss": 0.1216, "step": 7830 }, { "epoch": 2.7875555555555556, "grad_norm": 1.0194788026355706, "learning_rate": 1.5129251384776998e-07, "loss": 0.1181, "step": 7840 }, { "epoch": 2.7911111111111113, "grad_norm": 1.7073067625712353, "learning_rate": 1.462828708147379e-07, "loss": 0.139, "step": 7850 }, { "epoch": 2.7946666666666666, "grad_norm": 1.4957713592543374, "learning_rate": 1.4135633929485026e-07, "loss": 0.1373, "step": 7860 }, { "epoch": 2.7982222222222224, "grad_norm": 1.6268976958462047, "learning_rate": 1.3651300364644126e-07, "loss": 0.1294, "step": 7870 }, { "epoch": 2.8017777777777777, "grad_norm": 1.3636030825381604, "learning_rate": 1.317529468032569e-07, "loss": 0.1158, "step": 7880 }, { "epoch": 2.8053333333333335, "grad_norm": 1.5147346477252843, "learning_rate": 1.2707625027304104e-07, "loss": 0.124, "step": 7890 }, { "epoch": 2.8088888888888888, "grad_norm": 1.7193516342629052, "learning_rate": 1.2248299413613607e-07, "loss": 0.1332, "step": 7900 }, { "epoch": 2.8124444444444445, "grad_norm": 1.6484553937509365, "learning_rate": 1.1797325704411e-07, "loss": 0.1214, "step": 7910 }, { "epoch": 2.816, "grad_norm": 1.6919284405549642, "learning_rate": 1.1354711621841208e-07, "loss": 0.133, "step": 7920 }, { "epoch": 2.8195555555555556, "grad_norm": 1.223501357852658, "learning_rate": 1.0920464744905157e-07, "loss": 0.1205, "step": 7930 }, { "epoch": 2.8231111111111113, "grad_norm": 1.5481520280664143, "learning_rate": 1.0494592509329716e-07, "loss": 0.1469, "step": 7940 }, { "epoch": 2.8266666666666667, "grad_norm": 1.7879544199201751, "learning_rate": 1.007710220744057e-07, "loss": 0.1269, "step": 7950 }, { "epoch": 2.830222222222222, "grad_norm": 1.513993378655108, "learning_rate": 9.668000988037163e-08, "loss": 0.1322, "step": 7960 }, { "epoch": 2.8337777777777777, "grad_norm": 1.7964467427017516, "learning_rate": 9.267295856270509e-08, "loss": 0.1354, "step": 7970 }, { "epoch": 2.8373333333333335, "grad_norm": 1.787987364521523, "learning_rate": 8.874993673523236e-08, "loss": 0.1319, "step": 7980 }, { "epoch": 2.840888888888889, "grad_norm": 1.6897870372176325, "learning_rate": 8.491101157291737e-08, "loss": 0.1274, "step": 7990 }, { "epoch": 2.8444444444444446, "grad_norm": 1.6105609971746402, "learning_rate": 8.115624881071594e-08, "loss": 0.1318, "step": 8000 }, { "epoch": 2.8444444444444446, "eval_loss": 0.23905394971370697, "eval_runtime": 559.7682, "eval_samples_per_second": 17.865, "eval_steps_per_second": 4.466, "step": 8000 }, { "epoch": 2.848, "grad_norm": 1.3881391902801445, "learning_rate": 7.748571274244776e-08, "loss": 0.1199, "step": 8010 }, { "epoch": 2.8515555555555556, "grad_norm": 1.8275543306577795, "learning_rate": 7.389946621969679e-08, "loss": 0.1494, "step": 8020 }, { "epoch": 2.8551111111111114, "grad_norm": 1.8960525825598256, "learning_rate": 7.039757065073316e-08, "loss": 0.1354, "step": 8030 }, { "epoch": 2.8586666666666667, "grad_norm": 1.6485916403071794, "learning_rate": 6.698008599946404e-08, "loss": 0.1246, "step": 8040 }, { "epoch": 2.862222222222222, "grad_norm": 1.2435705558011503, "learning_rate": 6.364707078440335e-08, "loss": 0.1266, "step": 8050 }, { "epoch": 2.8657777777777778, "grad_norm": 1.5746164801301799, "learning_rate": 6.039858207767479e-08, "loss": 0.134, "step": 8060 }, { "epoch": 2.8693333333333335, "grad_norm": 1.5169697571883205, "learning_rate": 5.723467550403039e-08, "loss": 0.1326, "step": 8070 }, { "epoch": 2.872888888888889, "grad_norm": 1.5881237505008923, "learning_rate": 5.4155405239897926e-08, "loss": 0.1488, "step": 8080 }, { "epoch": 2.8764444444444446, "grad_norm": 1.690061086159581, "learning_rate": 5.1160824012458367e-08, "loss": 0.1232, "step": 8090 }, { "epoch": 2.88, "grad_norm": 1.6253293072576216, "learning_rate": 4.825098309873544e-08, "loss": 0.1264, "step": 8100 }, { "epoch": 2.8835555555555556, "grad_norm": 1.8528993602738453, "learning_rate": 4.542593232472414e-08, "loss": 0.1328, "step": 8110 }, { "epoch": 2.887111111111111, "grad_norm": 1.949296952991108, "learning_rate": 4.268572006453364e-08, "loss": 0.1264, "step": 8120 }, { "epoch": 2.8906666666666667, "grad_norm": 1.5505902041733666, "learning_rate": 4.003039323956126e-08, "loss": 0.1308, "step": 8130 }, { "epoch": 2.894222222222222, "grad_norm": 0.9023008663346067, "learning_rate": 3.7459997317687014e-08, "loss": 0.1101, "step": 8140 }, { "epoch": 2.897777777777778, "grad_norm": 1.8468547733058307, "learning_rate": 3.4974576312497564e-08, "loss": 0.1249, "step": 8150 }, { "epoch": 2.9013333333333335, "grad_norm": 1.7056102650658924, "learning_rate": 3.25741727825285e-08, "loss": 0.1193, "step": 8160 }, { "epoch": 2.904888888888889, "grad_norm": 1.3690587953613977, "learning_rate": 3.025882783054046e-08, "loss": 0.1199, "step": 8170 }, { "epoch": 2.9084444444444446, "grad_norm": 1.3946208158917515, "learning_rate": 2.8028581102811924e-08, "loss": 0.1365, "step": 8180 }, { "epoch": 2.912, "grad_norm": 1.9644328667604294, "learning_rate": 2.588347078846254e-08, "loss": 0.1323, "step": 8190 }, { "epoch": 2.9155555555555557, "grad_norm": 1.7619431494028974, "learning_rate": 2.382353361879586e-08, "loss": 0.1244, "step": 8200 }, { "epoch": 2.919111111111111, "grad_norm": 1.6739735252712569, "learning_rate": 2.18488048666754e-08, "loss": 0.1241, "step": 8210 }, { "epoch": 2.9226666666666667, "grad_norm": 1.7618267751958017, "learning_rate": 1.995931834591569e-08, "loss": 0.132, "step": 8220 }, { "epoch": 2.926222222222222, "grad_norm": 1.5149144065240054, "learning_rate": 1.8155106410706613e-08, "loss": 0.1359, "step": 8230 }, { "epoch": 2.929777777777778, "grad_norm": 1.7464428231188038, "learning_rate": 1.6436199955057742e-08, "loss": 0.1477, "step": 8240 }, { "epoch": 2.9333333333333336, "grad_norm": 1.7961519057796862, "learning_rate": 1.480262841226987e-08, "loss": 0.1482, "step": 8250 }, { "epoch": 2.936888888888889, "grad_norm": 1.668237688338044, "learning_rate": 1.3254419754430981e-08, "loss": 0.1369, "step": 8260 }, { "epoch": 2.940444444444444, "grad_norm": 1.5710565780518715, "learning_rate": 1.1791600491937172e-08, "loss": 0.1265, "step": 8270 }, { "epoch": 2.944, "grad_norm": 1.6190463651101816, "learning_rate": 1.041419567303914e-08, "loss": 0.1233, "step": 8280 }, { "epoch": 2.9475555555555557, "grad_norm": 1.3359272700606026, "learning_rate": 9.12222888341252e-09, "loss": 0.1308, "step": 8290 }, { "epoch": 2.951111111111111, "grad_norm": 1.7965214936961842, "learning_rate": 7.915722245754876e-09, "loss": 0.141, "step": 8300 }, { "epoch": 2.9546666666666668, "grad_norm": 1.7433994283889143, "learning_rate": 6.7946964194059994e-09, "loss": 0.1493, "step": 8310 }, { "epoch": 2.958222222222222, "grad_norm": 1.6666804006077884, "learning_rate": 5.759170599994868e-09, "loss": 0.1284, "step": 8320 }, { "epoch": 2.961777777777778, "grad_norm": 1.4232443691197452, "learning_rate": 4.809162519110455e-09, "loss": 0.1231, "step": 8330 }, { "epoch": 2.9653333333333336, "grad_norm": 1.8464380977109713, "learning_rate": 3.944688443998646e-09, "loss": 0.1466, "step": 8340 }, { "epoch": 2.968888888888889, "grad_norm": 1.8474020149086245, "learning_rate": 3.16576317728301e-09, "loss": 0.126, "step": 8350 }, { "epoch": 2.9724444444444442, "grad_norm": 1.731427281949659, "learning_rate": 2.4724000567116768e-09, "loss": 0.1361, "step": 8360 }, { "epoch": 2.976, "grad_norm": 1.8993388895043506, "learning_rate": 1.86461095492918e-09, "loss": 0.1258, "step": 8370 }, { "epoch": 2.9795555555555557, "grad_norm": 1.676714063923629, "learning_rate": 1.3424062792738445e-09, "loss": 0.1311, "step": 8380 }, { "epoch": 2.983111111111111, "grad_norm": 1.714542756833673, "learning_rate": 9.057949715968183e-10, "loss": 0.1236, "step": 8390 }, { "epoch": 2.986666666666667, "grad_norm": 1.6829258625832335, "learning_rate": 5.547845081121939e-10, "loss": 0.1171, "step": 8400 }, { "epoch": 2.990222222222222, "grad_norm": 1.5917279579386703, "learning_rate": 2.89380899267111e-10, "loss": 0.1309, "step": 8410 }, { "epoch": 2.993777777777778, "grad_norm": 1.72982950263424, "learning_rate": 1.0958868963906188e-10, "loss": 0.1314, "step": 8420 }, { "epoch": 2.997333333333333, "grad_norm": 1.6121134095652765, "learning_rate": 1.541095785984048e-11, "loss": 0.1267, "step": 8430 }, { "epoch": 2.9994666666666667, "step": 8436, "total_flos": 621656373067776.0, "train_loss": 0.25353994178455436, "train_runtime": 39823.1973, "train_samples_per_second": 6.78, "train_steps_per_second": 0.212 } ], "logging_steps": 10, "max_steps": 8436, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 621656373067776.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }