RAG-Critic-3B / trainer_state.json
dongguanting's picture
upload weight
6b3bccb verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9994666666666667,
"eval_steps": 500,
"global_step": 8436,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0035555555555555557,
"grad_norm": 172.0995169680502,
"learning_rate": 1.1848341232227489e-07,
"loss": 2.2225,
"step": 10
},
{
"epoch": 0.0071111111111111115,
"grad_norm": 104.28584294834684,
"learning_rate": 2.3696682464454978e-07,
"loss": 2.0266,
"step": 20
},
{
"epoch": 0.010666666666666666,
"grad_norm": 23.951684052753993,
"learning_rate": 3.5545023696682467e-07,
"loss": 1.7378,
"step": 30
},
{
"epoch": 0.014222222222222223,
"grad_norm": 28.63768523364588,
"learning_rate": 4.7393364928909956e-07,
"loss": 1.4898,
"step": 40
},
{
"epoch": 0.017777777777777778,
"grad_norm": 13.504890360680221,
"learning_rate": 5.924170616113745e-07,
"loss": 1.1851,
"step": 50
},
{
"epoch": 0.021333333333333333,
"grad_norm": 6.1972205960321585,
"learning_rate": 7.109004739336493e-07,
"loss": 0.988,
"step": 60
},
{
"epoch": 0.024888888888888887,
"grad_norm": 5.065573416801065,
"learning_rate": 8.293838862559242e-07,
"loss": 0.862,
"step": 70
},
{
"epoch": 0.028444444444444446,
"grad_norm": 4.1523834288926516,
"learning_rate": 9.478672985781991e-07,
"loss": 0.7813,
"step": 80
},
{
"epoch": 0.032,
"grad_norm": 4.2769222488911405,
"learning_rate": 1.0663507109004742e-06,
"loss": 0.6638,
"step": 90
},
{
"epoch": 0.035555555555555556,
"grad_norm": 3.7926520111038613,
"learning_rate": 1.184834123222749e-06,
"loss": 0.6073,
"step": 100
},
{
"epoch": 0.03911111111111111,
"grad_norm": 4.5354775101779605,
"learning_rate": 1.303317535545024e-06,
"loss": 0.5392,
"step": 110
},
{
"epoch": 0.042666666666666665,
"grad_norm": 9.972389478302686,
"learning_rate": 1.4218009478672987e-06,
"loss": 0.5798,
"step": 120
},
{
"epoch": 0.04622222222222222,
"grad_norm": 3.3720288764197903,
"learning_rate": 1.5402843601895737e-06,
"loss": 0.5119,
"step": 130
},
{
"epoch": 0.049777777777777775,
"grad_norm": 2.9830238004238674,
"learning_rate": 1.6587677725118483e-06,
"loss": 0.4432,
"step": 140
},
{
"epoch": 0.05333333333333334,
"grad_norm": 3.0823358079048395,
"learning_rate": 1.7772511848341234e-06,
"loss": 0.4637,
"step": 150
},
{
"epoch": 0.05688888888888889,
"grad_norm": 2.7399052383817493,
"learning_rate": 1.8957345971563982e-06,
"loss": 0.4623,
"step": 160
},
{
"epoch": 0.060444444444444446,
"grad_norm": 2.5949470941499886,
"learning_rate": 2.0142180094786733e-06,
"loss": 0.4909,
"step": 170
},
{
"epoch": 0.064,
"grad_norm": 2.860434581778304,
"learning_rate": 2.1327014218009483e-06,
"loss": 0.4522,
"step": 180
},
{
"epoch": 0.06755555555555555,
"grad_norm": 2.4794062920348514,
"learning_rate": 2.251184834123223e-06,
"loss": 0.4683,
"step": 190
},
{
"epoch": 0.07111111111111111,
"grad_norm": 3.5898381841290385,
"learning_rate": 2.369668246445498e-06,
"loss": 0.4544,
"step": 200
},
{
"epoch": 0.07466666666666667,
"grad_norm": 2.8271249937433893,
"learning_rate": 2.4881516587677726e-06,
"loss": 0.452,
"step": 210
},
{
"epoch": 0.07822222222222222,
"grad_norm": 2.820485688519842,
"learning_rate": 2.606635071090048e-06,
"loss": 0.4594,
"step": 220
},
{
"epoch": 0.08177777777777778,
"grad_norm": 2.879680482909577,
"learning_rate": 2.7251184834123223e-06,
"loss": 0.4079,
"step": 230
},
{
"epoch": 0.08533333333333333,
"grad_norm": 2.2760447333960547,
"learning_rate": 2.8436018957345973e-06,
"loss": 0.4586,
"step": 240
},
{
"epoch": 0.08888888888888889,
"grad_norm": 2.831009166917502,
"learning_rate": 2.9620853080568724e-06,
"loss": 0.4143,
"step": 250
},
{
"epoch": 0.09244444444444444,
"grad_norm": 3.2359232461275895,
"learning_rate": 3.0805687203791474e-06,
"loss": 0.454,
"step": 260
},
{
"epoch": 0.096,
"grad_norm": 2.7067735723833932,
"learning_rate": 3.1990521327014216e-06,
"loss": 0.3998,
"step": 270
},
{
"epoch": 0.09955555555555555,
"grad_norm": 2.550645136169034,
"learning_rate": 3.3175355450236967e-06,
"loss": 0.397,
"step": 280
},
{
"epoch": 0.10311111111111111,
"grad_norm": 2.6477271129566162,
"learning_rate": 3.4360189573459717e-06,
"loss": 0.416,
"step": 290
},
{
"epoch": 0.10666666666666667,
"grad_norm": 2.744124645461815,
"learning_rate": 3.5545023696682468e-06,
"loss": 0.4521,
"step": 300
},
{
"epoch": 0.11022222222222222,
"grad_norm": 2.499585309198425,
"learning_rate": 3.672985781990522e-06,
"loss": 0.4023,
"step": 310
},
{
"epoch": 0.11377777777777778,
"grad_norm": 2.6278096303414467,
"learning_rate": 3.7914691943127964e-06,
"loss": 0.4191,
"step": 320
},
{
"epoch": 0.11733333333333333,
"grad_norm": 2.4188835712940326,
"learning_rate": 3.9099526066350715e-06,
"loss": 0.4122,
"step": 330
},
{
"epoch": 0.12088888888888889,
"grad_norm": 2.553975268194503,
"learning_rate": 4.0284360189573465e-06,
"loss": 0.3498,
"step": 340
},
{
"epoch": 0.12444444444444444,
"grad_norm": 2.834535859400579,
"learning_rate": 4.146919431279622e-06,
"loss": 0.4094,
"step": 350
},
{
"epoch": 0.128,
"grad_norm": 2.533973817990368,
"learning_rate": 4.265402843601897e-06,
"loss": 0.4298,
"step": 360
},
{
"epoch": 0.13155555555555556,
"grad_norm": 2.813906241826433,
"learning_rate": 4.383886255924171e-06,
"loss": 0.4216,
"step": 370
},
{
"epoch": 0.1351111111111111,
"grad_norm": 2.102931563969342,
"learning_rate": 4.502369668246446e-06,
"loss": 0.3808,
"step": 380
},
{
"epoch": 0.13866666666666666,
"grad_norm": 2.4379289560773896,
"learning_rate": 4.620853080568721e-06,
"loss": 0.3618,
"step": 390
},
{
"epoch": 0.14222222222222222,
"grad_norm": 2.3557567609798777,
"learning_rate": 4.739336492890996e-06,
"loss": 0.4044,
"step": 400
},
{
"epoch": 0.14577777777777778,
"grad_norm": 2.2820973068522514,
"learning_rate": 4.857819905213271e-06,
"loss": 0.4071,
"step": 410
},
{
"epoch": 0.14933333333333335,
"grad_norm": 2.6709678530509993,
"learning_rate": 4.976303317535545e-06,
"loss": 0.4272,
"step": 420
},
{
"epoch": 0.15288888888888888,
"grad_norm": 2.332134363712532,
"learning_rate": 5.09478672985782e-06,
"loss": 0.434,
"step": 430
},
{
"epoch": 0.15644444444444444,
"grad_norm": 2.9162979668749047,
"learning_rate": 5.213270142180096e-06,
"loss": 0.3695,
"step": 440
},
{
"epoch": 0.16,
"grad_norm": 2.2427213677361655,
"learning_rate": 5.33175355450237e-06,
"loss": 0.3723,
"step": 450
},
{
"epoch": 0.16355555555555557,
"grad_norm": 2.5901865124993,
"learning_rate": 5.4502369668246446e-06,
"loss": 0.4147,
"step": 460
},
{
"epoch": 0.1671111111111111,
"grad_norm": 2.56419802107506,
"learning_rate": 5.5687203791469205e-06,
"loss": 0.4083,
"step": 470
},
{
"epoch": 0.17066666666666666,
"grad_norm": 2.145912482611642,
"learning_rate": 5.687203791469195e-06,
"loss": 0.3631,
"step": 480
},
{
"epoch": 0.17422222222222222,
"grad_norm": 2.1572804538302983,
"learning_rate": 5.8056872037914706e-06,
"loss": 0.3838,
"step": 490
},
{
"epoch": 0.17777777777777778,
"grad_norm": 2.7221208940412955,
"learning_rate": 5.924170616113745e-06,
"loss": 0.3703,
"step": 500
},
{
"epoch": 0.17777777777777778,
"eval_loss": 0.27164188027381897,
"eval_runtime": 561.686,
"eval_samples_per_second": 17.804,
"eval_steps_per_second": 4.451,
"step": 500
},
{
"epoch": 0.18133333333333335,
"grad_norm": 2.128760220417613,
"learning_rate": 6.042654028436019e-06,
"loss": 0.3936,
"step": 510
},
{
"epoch": 0.18488888888888888,
"grad_norm": 2.37349559892131,
"learning_rate": 6.161137440758295e-06,
"loss": 0.4097,
"step": 520
},
{
"epoch": 0.18844444444444444,
"grad_norm": 2.1546814583393954,
"learning_rate": 6.279620853080569e-06,
"loss": 0.3487,
"step": 530
},
{
"epoch": 0.192,
"grad_norm": 2.5691866709112174,
"learning_rate": 6.398104265402843e-06,
"loss": 0.3795,
"step": 540
},
{
"epoch": 0.19555555555555557,
"grad_norm": 2.511088780500042,
"learning_rate": 6.516587677725119e-06,
"loss": 0.3592,
"step": 550
},
{
"epoch": 0.1991111111111111,
"grad_norm": 2.1980105108863306,
"learning_rate": 6.635071090047393e-06,
"loss": 0.3759,
"step": 560
},
{
"epoch": 0.20266666666666666,
"grad_norm": 2.0372925079256508,
"learning_rate": 6.753554502369669e-06,
"loss": 0.3372,
"step": 570
},
{
"epoch": 0.20622222222222222,
"grad_norm": 2.4474157501007188,
"learning_rate": 6.8720379146919435e-06,
"loss": 0.3821,
"step": 580
},
{
"epoch": 0.20977777777777779,
"grad_norm": 2.6150488990545813,
"learning_rate": 6.990521327014218e-06,
"loss": 0.4033,
"step": 590
},
{
"epoch": 0.21333333333333335,
"grad_norm": 2.218675478875157,
"learning_rate": 7.1090047393364935e-06,
"loss": 0.3498,
"step": 600
},
{
"epoch": 0.21688888888888888,
"grad_norm": 2.60194848198847,
"learning_rate": 7.227488151658768e-06,
"loss": 0.3974,
"step": 610
},
{
"epoch": 0.22044444444444444,
"grad_norm": 2.4008012422084883,
"learning_rate": 7.345971563981044e-06,
"loss": 0.3522,
"step": 620
},
{
"epoch": 0.224,
"grad_norm": 2.370019125222766,
"learning_rate": 7.464454976303318e-06,
"loss": 0.3843,
"step": 630
},
{
"epoch": 0.22755555555555557,
"grad_norm": 2.319127909040294,
"learning_rate": 7.582938388625593e-06,
"loss": 0.3852,
"step": 640
},
{
"epoch": 0.2311111111111111,
"grad_norm": 2.0344327356388963,
"learning_rate": 7.701421800947868e-06,
"loss": 0.3753,
"step": 650
},
{
"epoch": 0.23466666666666666,
"grad_norm": 2.0974945886124274,
"learning_rate": 7.819905213270143e-06,
"loss": 0.3622,
"step": 660
},
{
"epoch": 0.23822222222222222,
"grad_norm": 2.3710225236326656,
"learning_rate": 7.938388625592418e-06,
"loss": 0.3776,
"step": 670
},
{
"epoch": 0.24177777777777779,
"grad_norm": 2.1972590118602353,
"learning_rate": 8.056872037914693e-06,
"loss": 0.4131,
"step": 680
},
{
"epoch": 0.24533333333333332,
"grad_norm": 2.124563531807995,
"learning_rate": 8.175355450236966e-06,
"loss": 0.4041,
"step": 690
},
{
"epoch": 0.24888888888888888,
"grad_norm": 2.186519973081525,
"learning_rate": 8.293838862559243e-06,
"loss": 0.4342,
"step": 700
},
{
"epoch": 0.25244444444444447,
"grad_norm": 2.2098045409685785,
"learning_rate": 8.412322274881517e-06,
"loss": 0.3753,
"step": 710
},
{
"epoch": 0.256,
"grad_norm": 2.364680759422569,
"learning_rate": 8.530805687203793e-06,
"loss": 0.3499,
"step": 720
},
{
"epoch": 0.25955555555555554,
"grad_norm": 2.0592638534598975,
"learning_rate": 8.649289099526067e-06,
"loss": 0.3676,
"step": 730
},
{
"epoch": 0.26311111111111113,
"grad_norm": 2.076874300192435,
"learning_rate": 8.767772511848342e-06,
"loss": 0.3882,
"step": 740
},
{
"epoch": 0.26666666666666666,
"grad_norm": 2.256989717343507,
"learning_rate": 8.886255924170617e-06,
"loss": 0.3906,
"step": 750
},
{
"epoch": 0.2702222222222222,
"grad_norm": 2.2777259263170753,
"learning_rate": 9.004739336492892e-06,
"loss": 0.3881,
"step": 760
},
{
"epoch": 0.2737777777777778,
"grad_norm": 2.0191108991103452,
"learning_rate": 9.123222748815167e-06,
"loss": 0.3598,
"step": 770
},
{
"epoch": 0.2773333333333333,
"grad_norm": 2.1955719220241114,
"learning_rate": 9.241706161137442e-06,
"loss": 0.3411,
"step": 780
},
{
"epoch": 0.2808888888888889,
"grad_norm": 1.8450512554264078,
"learning_rate": 9.360189573459715e-06,
"loss": 0.3989,
"step": 790
},
{
"epoch": 0.28444444444444444,
"grad_norm": 2.011115441632504,
"learning_rate": 9.478672985781992e-06,
"loss": 0.3982,
"step": 800
},
{
"epoch": 0.288,
"grad_norm": 1.8704472001913133,
"learning_rate": 9.597156398104265e-06,
"loss": 0.414,
"step": 810
},
{
"epoch": 0.29155555555555557,
"grad_norm": 1.9254101904021153,
"learning_rate": 9.715639810426542e-06,
"loss": 0.3767,
"step": 820
},
{
"epoch": 0.2951111111111111,
"grad_norm": 1.9015728855115495,
"learning_rate": 9.834123222748815e-06,
"loss": 0.3775,
"step": 830
},
{
"epoch": 0.2986666666666667,
"grad_norm": 1.928562219171237,
"learning_rate": 9.95260663507109e-06,
"loss": 0.3955,
"step": 840
},
{
"epoch": 0.3022222222222222,
"grad_norm": 1.5585912642130104,
"learning_rate": 9.999984589042141e-06,
"loss": 0.3897,
"step": 850
},
{
"epoch": 0.30577777777777776,
"grad_norm": 2.088285655295682,
"learning_rate": 9.999890411310363e-06,
"loss": 0.3657,
"step": 860
},
{
"epoch": 0.30933333333333335,
"grad_norm": 1.7831321620409892,
"learning_rate": 9.999710619100732e-06,
"loss": 0.3699,
"step": 870
},
{
"epoch": 0.3128888888888889,
"grad_norm": 1.8859386777237288,
"learning_rate": 9.999445215491888e-06,
"loss": 0.3675,
"step": 880
},
{
"epoch": 0.3164444444444444,
"grad_norm": 1.793847189739239,
"learning_rate": 9.999094205028403e-06,
"loss": 0.3804,
"step": 890
},
{
"epoch": 0.32,
"grad_norm": 1.8588345423039347,
"learning_rate": 9.998657593720726e-06,
"loss": 0.3628,
"step": 900
},
{
"epoch": 0.32355555555555554,
"grad_norm": 1.904522383364726,
"learning_rate": 9.998135389045071e-06,
"loss": 0.3832,
"step": 910
},
{
"epoch": 0.32711111111111113,
"grad_norm": 1.7658830671737389,
"learning_rate": 9.997527599943288e-06,
"loss": 0.3931,
"step": 920
},
{
"epoch": 0.33066666666666666,
"grad_norm": 1.8645179401650172,
"learning_rate": 9.996834236822718e-06,
"loss": 0.3587,
"step": 930
},
{
"epoch": 0.3342222222222222,
"grad_norm": 1.8432627384605438,
"learning_rate": 9.996055311556002e-06,
"loss": 0.4065,
"step": 940
},
{
"epoch": 0.3377777777777778,
"grad_norm": 1.8436289309250031,
"learning_rate": 9.99519083748089e-06,
"loss": 0.3861,
"step": 950
},
{
"epoch": 0.3413333333333333,
"grad_norm": 1.9136369003670703,
"learning_rate": 9.994240829400006e-06,
"loss": 0.3794,
"step": 960
},
{
"epoch": 0.3448888888888889,
"grad_norm": 1.559884705708754,
"learning_rate": 9.993205303580596e-06,
"loss": 0.3675,
"step": 970
},
{
"epoch": 0.34844444444444445,
"grad_norm": 2.0844042937670317,
"learning_rate": 9.992084277754246e-06,
"loss": 0.3725,
"step": 980
},
{
"epoch": 0.352,
"grad_norm": 1.4001469351101974,
"learning_rate": 9.990877771116588e-06,
"loss": 0.3526,
"step": 990
},
{
"epoch": 0.35555555555555557,
"grad_norm": 1.7302242639985734,
"learning_rate": 9.989585804326963e-06,
"loss": 0.3451,
"step": 1000
},
{
"epoch": 0.35555555555555557,
"eval_loss": 0.2586575448513031,
"eval_runtime": 561.7755,
"eval_samples_per_second": 17.801,
"eval_steps_per_second": 4.45,
"step": 1000
},
{
"epoch": 0.3591111111111111,
"grad_norm": 2.1060593962865832,
"learning_rate": 9.988208399508064e-06,
"loss": 0.3923,
"step": 1010
},
{
"epoch": 0.3626666666666667,
"grad_norm": 1.6475744003826194,
"learning_rate": 9.986745580245569e-06,
"loss": 0.3077,
"step": 1020
},
{
"epoch": 0.3662222222222222,
"grad_norm": 1.9521091866638012,
"learning_rate": 9.985197371587732e-06,
"loss": 0.389,
"step": 1030
},
{
"epoch": 0.36977777777777776,
"grad_norm": 1.7609515675334448,
"learning_rate": 9.983563800044942e-06,
"loss": 0.3424,
"step": 1040
},
{
"epoch": 0.37333333333333335,
"grad_norm": 1.7210920690658038,
"learning_rate": 9.981844893589294e-06,
"loss": 0.3558,
"step": 1050
},
{
"epoch": 0.3768888888888889,
"grad_norm": 1.823734659697161,
"learning_rate": 9.980040681654085e-06,
"loss": 0.3693,
"step": 1060
},
{
"epoch": 0.3804444444444444,
"grad_norm": 2.102269417162816,
"learning_rate": 9.978151195133326e-06,
"loss": 0.3638,
"step": 1070
},
{
"epoch": 0.384,
"grad_norm": 1.8033749091845297,
"learning_rate": 9.976176466381205e-06,
"loss": 0.3484,
"step": 1080
},
{
"epoch": 0.38755555555555554,
"grad_norm": 1.8854677696591007,
"learning_rate": 9.974116529211539e-06,
"loss": 0.3967,
"step": 1090
},
{
"epoch": 0.39111111111111113,
"grad_norm": 2.0272157520267218,
"learning_rate": 9.971971418897189e-06,
"loss": 0.3741,
"step": 1100
},
{
"epoch": 0.39466666666666667,
"grad_norm": 2.0179018140684555,
"learning_rate": 9.969741172169461e-06,
"loss": 0.3904,
"step": 1110
},
{
"epoch": 0.3982222222222222,
"grad_norm": 1.6226992565101939,
"learning_rate": 9.967425827217473e-06,
"loss": 0.3485,
"step": 1120
},
{
"epoch": 0.4017777777777778,
"grad_norm": 1.9028497690136488,
"learning_rate": 9.965025423687505e-06,
"loss": 0.346,
"step": 1130
},
{
"epoch": 0.4053333333333333,
"grad_norm": 1.694320712579824,
"learning_rate": 9.962540002682314e-06,
"loss": 0.3635,
"step": 1140
},
{
"epoch": 0.4088888888888889,
"grad_norm": 1.6440393469215313,
"learning_rate": 9.95996960676044e-06,
"loss": 0.3794,
"step": 1150
},
{
"epoch": 0.41244444444444445,
"grad_norm": 1.9859711063744807,
"learning_rate": 9.957314279935467e-06,
"loss": 0.3727,
"step": 1160
},
{
"epoch": 0.416,
"grad_norm": 1.5764827911729749,
"learning_rate": 9.954574067675276e-06,
"loss": 0.3472,
"step": 1170
},
{
"epoch": 0.41955555555555557,
"grad_norm": 2.0270228575955938,
"learning_rate": 9.951749016901266e-06,
"loss": 0.3651,
"step": 1180
},
{
"epoch": 0.4231111111111111,
"grad_norm": 1.4711992564971241,
"learning_rate": 9.948839175987543e-06,
"loss": 0.4007,
"step": 1190
},
{
"epoch": 0.4266666666666667,
"grad_norm": 1.6555299578050973,
"learning_rate": 9.945844594760104e-06,
"loss": 0.3662,
"step": 1200
},
{
"epoch": 0.43022222222222223,
"grad_norm": 1.6087449112246428,
"learning_rate": 9.94276532449597e-06,
"loss": 0.3266,
"step": 1210
},
{
"epoch": 0.43377777777777776,
"grad_norm": 1.7938918985177508,
"learning_rate": 9.939601417922326e-06,
"loss": 0.367,
"step": 1220
},
{
"epoch": 0.43733333333333335,
"grad_norm": 1.9419042479062267,
"learning_rate": 9.936352929215598e-06,
"loss": 0.3479,
"step": 1230
},
{
"epoch": 0.4408888888888889,
"grad_norm": 1.7389871732986788,
"learning_rate": 9.933019914000537e-06,
"loss": 0.3991,
"step": 1240
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.954697966163684,
"learning_rate": 9.929602429349267e-06,
"loss": 0.387,
"step": 1250
},
{
"epoch": 0.448,
"grad_norm": 1.9390505686602657,
"learning_rate": 9.926100533780304e-06,
"loss": 0.3623,
"step": 1260
},
{
"epoch": 0.45155555555555554,
"grad_norm": 1.6639481540933314,
"learning_rate": 9.922514287257553e-06,
"loss": 0.3758,
"step": 1270
},
{
"epoch": 0.45511111111111113,
"grad_norm": 1.722757928957694,
"learning_rate": 9.918843751189285e-06,
"loss": 0.3355,
"step": 1280
},
{
"epoch": 0.45866666666666667,
"grad_norm": 1.845850757530145,
"learning_rate": 9.915088988427085e-06,
"loss": 0.3698,
"step": 1290
},
{
"epoch": 0.4622222222222222,
"grad_norm": 1.44128404254532,
"learning_rate": 9.911250063264768e-06,
"loss": 0.4047,
"step": 1300
},
{
"epoch": 0.4657777777777778,
"grad_norm": 1.7671518160334596,
"learning_rate": 9.907327041437295e-06,
"loss": 0.3692,
"step": 1310
},
{
"epoch": 0.4693333333333333,
"grad_norm": 1.8380352484481248,
"learning_rate": 9.903319990119629e-06,
"loss": 0.36,
"step": 1320
},
{
"epoch": 0.4728888888888889,
"grad_norm": 1.76427459962676,
"learning_rate": 9.899228977925594e-06,
"loss": 0.3741,
"step": 1330
},
{
"epoch": 0.47644444444444445,
"grad_norm": 1.4897822709650264,
"learning_rate": 9.895054074906703e-06,
"loss": 0.3407,
"step": 1340
},
{
"epoch": 0.48,
"grad_norm": 1.8107592753421746,
"learning_rate": 9.890795352550949e-06,
"loss": 0.3737,
"step": 1350
},
{
"epoch": 0.48355555555555557,
"grad_norm": 1.7814141617442254,
"learning_rate": 9.886452883781588e-06,
"loss": 0.3706,
"step": 1360
},
{
"epoch": 0.4871111111111111,
"grad_norm": 1.6423771491979522,
"learning_rate": 9.882026742955892e-06,
"loss": 0.3593,
"step": 1370
},
{
"epoch": 0.49066666666666664,
"grad_norm": 1.9926182163486512,
"learning_rate": 9.877517005863865e-06,
"loss": 0.388,
"step": 1380
},
{
"epoch": 0.49422222222222223,
"grad_norm": 1.6527200649892368,
"learning_rate": 9.872923749726959e-06,
"loss": 0.3825,
"step": 1390
},
{
"epoch": 0.49777777777777776,
"grad_norm": 1.800321612826116,
"learning_rate": 9.868247053196744e-06,
"loss": 0.3406,
"step": 1400
},
{
"epoch": 0.5013333333333333,
"grad_norm": 1.8998896812539383,
"learning_rate": 9.86348699635356e-06,
"loss": 0.3718,
"step": 1410
},
{
"epoch": 0.5048888888888889,
"grad_norm": 1.8642598101048677,
"learning_rate": 9.85864366070515e-06,
"loss": 0.3728,
"step": 1420
},
{
"epoch": 0.5084444444444445,
"grad_norm": 2.04147924521036,
"learning_rate": 9.853717129185262e-06,
"loss": 0.3371,
"step": 1430
},
{
"epoch": 0.512,
"grad_norm": 1.765175754873959,
"learning_rate": 9.848707486152231e-06,
"loss": 0.3468,
"step": 1440
},
{
"epoch": 0.5155555555555555,
"grad_norm": 1.7955950262413882,
"learning_rate": 9.843614817387531e-06,
"loss": 0.3456,
"step": 1450
},
{
"epoch": 0.5191111111111111,
"grad_norm": 1.4037783734962412,
"learning_rate": 9.838439210094309e-06,
"loss": 0.3244,
"step": 1460
},
{
"epoch": 0.5226666666666666,
"grad_norm": 1.8006249556531597,
"learning_rate": 9.833180752895887e-06,
"loss": 0.3391,
"step": 1470
},
{
"epoch": 0.5262222222222223,
"grad_norm": 1.7020622735675546,
"learning_rate": 9.827839535834258e-06,
"loss": 0.3922,
"step": 1480
},
{
"epoch": 0.5297777777777778,
"grad_norm": 1.6034083398484584,
"learning_rate": 9.822415650368525e-06,
"loss": 0.304,
"step": 1490
},
{
"epoch": 0.5333333333333333,
"grad_norm": 1.7309514997235147,
"learning_rate": 9.816909189373347e-06,
"loss": 0.3531,
"step": 1500
},
{
"epoch": 0.5333333333333333,
"eval_loss": 0.24488620460033417,
"eval_runtime": 562.1833,
"eval_samples_per_second": 17.788,
"eval_steps_per_second": 4.447,
"step": 1500
},
{
"epoch": 0.5368888888888889,
"grad_norm": 1.4581125274966544,
"learning_rate": 9.81132024713735e-06,
"loss": 0.3771,
"step": 1510
},
{
"epoch": 0.5404444444444444,
"grad_norm": 1.6490332212552936,
"learning_rate": 9.805648919361505e-06,
"loss": 0.3848,
"step": 1520
},
{
"epoch": 0.544,
"grad_norm": 1.7512970600212527,
"learning_rate": 9.799895303157492e-06,
"loss": 0.3694,
"step": 1530
},
{
"epoch": 0.5475555555555556,
"grad_norm": 1.7421405313188358,
"learning_rate": 9.794059497046043e-06,
"loss": 0.3553,
"step": 1540
},
{
"epoch": 0.5511111111111111,
"grad_norm": 1.7340918047507783,
"learning_rate": 9.788141600955244e-06,
"loss": 0.3357,
"step": 1550
},
{
"epoch": 0.5546666666666666,
"grad_norm": 1.657973523226739,
"learning_rate": 9.782141716218832e-06,
"loss": 0.3448,
"step": 1560
},
{
"epoch": 0.5582222222222222,
"grad_norm": 1.7266109549753084,
"learning_rate": 9.77605994557446e-06,
"loss": 0.3336,
"step": 1570
},
{
"epoch": 0.5617777777777778,
"grad_norm": 1.7634795513841868,
"learning_rate": 9.769896393161937e-06,
"loss": 0.336,
"step": 1580
},
{
"epoch": 0.5653333333333334,
"grad_norm": 1.7328448062964845,
"learning_rate": 9.763651164521436e-06,
"loss": 0.3505,
"step": 1590
},
{
"epoch": 0.5688888888888889,
"grad_norm": 1.7601349288429824,
"learning_rate": 9.7573243665917e-06,
"loss": 0.3816,
"step": 1600
},
{
"epoch": 0.5724444444444444,
"grad_norm": 1.887857912509665,
"learning_rate": 9.750916107708205e-06,
"loss": 0.358,
"step": 1610
},
{
"epoch": 0.576,
"grad_norm": 1.8940080571652895,
"learning_rate": 9.744426497601305e-06,
"loss": 0.363,
"step": 1620
},
{
"epoch": 0.5795555555555556,
"grad_norm": 1.5744873206102685,
"learning_rate": 9.737855647394346e-06,
"loss": 0.3544,
"step": 1630
},
{
"epoch": 0.5831111111111111,
"grad_norm": 1.5744080074196256,
"learning_rate": 9.73120366960178e-06,
"loss": 0.375,
"step": 1640
},
{
"epoch": 0.5866666666666667,
"grad_norm": 1.6398095171132219,
"learning_rate": 9.724470678127226e-06,
"loss": 0.3649,
"step": 1650
},
{
"epoch": 0.5902222222222222,
"grad_norm": 1.4310246627875627,
"learning_rate": 9.717656788261519e-06,
"loss": 0.3716,
"step": 1660
},
{
"epoch": 0.5937777777777777,
"grad_norm": 1.490999227794774,
"learning_rate": 9.71076211668074e-06,
"loss": 0.352,
"step": 1670
},
{
"epoch": 0.5973333333333334,
"grad_norm": 1.6484132205325386,
"learning_rate": 9.703786781444218e-06,
"loss": 0.3555,
"step": 1680
},
{
"epoch": 0.6008888888888889,
"grad_norm": 1.3854857319423775,
"learning_rate": 9.69673090199251e-06,
"loss": 0.3348,
"step": 1690
},
{
"epoch": 0.6044444444444445,
"grad_norm": 1.6107410705301848,
"learning_rate": 9.689594599145348e-06,
"loss": 0.3499,
"step": 1700
},
{
"epoch": 0.608,
"grad_norm": 1.520886748403311,
"learning_rate": 9.682377995099581e-06,
"loss": 0.3389,
"step": 1710
},
{
"epoch": 0.6115555555555555,
"grad_norm": 1.4556730210725268,
"learning_rate": 9.675081213427076e-06,
"loss": 0.3412,
"step": 1720
},
{
"epoch": 0.6151111111111112,
"grad_norm": 1.476388303700134,
"learning_rate": 9.667704379072597e-06,
"loss": 0.3363,
"step": 1730
},
{
"epoch": 0.6186666666666667,
"grad_norm": 1.2168509424846436,
"learning_rate": 9.660247618351683e-06,
"loss": 0.3328,
"step": 1740
},
{
"epoch": 0.6222222222222222,
"grad_norm": 1.395468629739029,
"learning_rate": 9.652711058948463e-06,
"loss": 0.3509,
"step": 1750
},
{
"epoch": 0.6257777777777778,
"grad_norm": 1.586845461880222,
"learning_rate": 9.645094829913487e-06,
"loss": 0.3471,
"step": 1760
},
{
"epoch": 0.6293333333333333,
"grad_norm": 1.5411518795473231,
"learning_rate": 9.637399061661507e-06,
"loss": 0.3246,
"step": 1770
},
{
"epoch": 0.6328888888888888,
"grad_norm": 1.658660033117339,
"learning_rate": 9.62962388596925e-06,
"loss": 0.3399,
"step": 1780
},
{
"epoch": 0.6364444444444445,
"grad_norm": 1.313159566501215,
"learning_rate": 9.621769435973152e-06,
"loss": 0.3478,
"step": 1790
},
{
"epoch": 0.64,
"grad_norm": 1.8380402091451324,
"learning_rate": 9.61383584616709e-06,
"loss": 0.3251,
"step": 1800
},
{
"epoch": 0.6435555555555555,
"grad_norm": 1.6180991422896933,
"learning_rate": 9.60582325240007e-06,
"loss": 0.3553,
"step": 1810
},
{
"epoch": 0.6471111111111111,
"grad_norm": 1.8283857342608776,
"learning_rate": 9.597731791873907e-06,
"loss": 0.3594,
"step": 1820
},
{
"epoch": 0.6506666666666666,
"grad_norm": 1.4175489521300049,
"learning_rate": 9.58956160314087e-06,
"loss": 0.3549,
"step": 1830
},
{
"epoch": 0.6542222222222223,
"grad_norm": 1.6783488504498176,
"learning_rate": 9.581312826101315e-06,
"loss": 0.3813,
"step": 1840
},
{
"epoch": 0.6577777777777778,
"grad_norm": 1.6351873747299641,
"learning_rate": 9.572985602001283e-06,
"loss": 0.3518,
"step": 1850
},
{
"epoch": 0.6613333333333333,
"grad_norm": 1.3790848679324303,
"learning_rate": 9.56458007343009e-06,
"loss": 0.3303,
"step": 1860
},
{
"epoch": 0.6648888888888889,
"grad_norm": 1.6322052333334587,
"learning_rate": 9.556096384317878e-06,
"loss": 0.3403,
"step": 1870
},
{
"epoch": 0.6684444444444444,
"grad_norm": 1.788030342136729,
"learning_rate": 9.547534679933155e-06,
"loss": 0.3717,
"step": 1880
},
{
"epoch": 0.672,
"grad_norm": 1.4934586402235337,
"learning_rate": 9.538895106880302e-06,
"loss": 0.3468,
"step": 1890
},
{
"epoch": 0.6755555555555556,
"grad_norm": 1.9556398213487334,
"learning_rate": 9.53017781309707e-06,
"loss": 0.3495,
"step": 1900
},
{
"epoch": 0.6791111111111111,
"grad_norm": 1.4201698189636593,
"learning_rate": 9.521382947852042e-06,
"loss": 0.3631,
"step": 1910
},
{
"epoch": 0.6826666666666666,
"grad_norm": 1.8176078337580701,
"learning_rate": 9.512510661742078e-06,
"loss": 0.366,
"step": 1920
},
{
"epoch": 0.6862222222222222,
"grad_norm": 1.5895629439283847,
"learning_rate": 9.503561106689736e-06,
"loss": 0.3165,
"step": 1930
},
{
"epoch": 0.6897777777777778,
"grad_norm": 1.7257922798447645,
"learning_rate": 9.494534435940668e-06,
"loss": 0.3199,
"step": 1940
},
{
"epoch": 0.6933333333333334,
"grad_norm": 1.3859470273389864,
"learning_rate": 9.485430804061009e-06,
"loss": 0.3244,
"step": 1950
},
{
"epoch": 0.6968888888888889,
"grad_norm": 1.3389192102707597,
"learning_rate": 9.476250366934708e-06,
"loss": 0.3557,
"step": 1960
},
{
"epoch": 0.7004444444444444,
"grad_norm": 1.761133913330945,
"learning_rate": 9.466993281760879e-06,
"loss": 0.3367,
"step": 1970
},
{
"epoch": 0.704,
"grad_norm": 1.5576575807000288,
"learning_rate": 9.457659707051099e-06,
"loss": 0.335,
"step": 1980
},
{
"epoch": 0.7075555555555556,
"grad_norm": 1.5125566207561287,
"learning_rate": 9.448249802626696e-06,
"loss": 0.3286,
"step": 1990
},
{
"epoch": 0.7111111111111111,
"grad_norm": 1.7236714219097393,
"learning_rate": 9.43876372961601e-06,
"loss": 0.3544,
"step": 2000
},
{
"epoch": 0.7111111111111111,
"eval_loss": 0.23682241141796112,
"eval_runtime": 560.8939,
"eval_samples_per_second": 17.829,
"eval_steps_per_second": 4.457,
"step": 2000
},
{
"epoch": 0.7146666666666667,
"grad_norm": 1.7803508157706263,
"learning_rate": 9.429201650451642e-06,
"loss": 0.3218,
"step": 2010
},
{
"epoch": 0.7182222222222222,
"grad_norm": 1.6971031315045289,
"learning_rate": 9.419563728867663e-06,
"loss": 0.3417,
"step": 2020
},
{
"epoch": 0.7217777777777777,
"grad_norm": 1.9366329088516083,
"learning_rate": 9.409850129896812e-06,
"loss": 0.3104,
"step": 2030
},
{
"epoch": 0.7253333333333334,
"grad_norm": 1.85452483851228,
"learning_rate": 9.40006101986768e-06,
"loss": 0.3371,
"step": 2040
},
{
"epoch": 0.7288888888888889,
"grad_norm": 1.4768370143060883,
"learning_rate": 9.390196566401844e-06,
"loss": 0.3324,
"step": 2050
},
{
"epoch": 0.7324444444444445,
"grad_norm": 1.3195137184227357,
"learning_rate": 9.38025693841102e-06,
"loss": 0.3384,
"step": 2060
},
{
"epoch": 0.736,
"grad_norm": 1.7121308917693614,
"learning_rate": 9.370242306094141e-06,
"loss": 0.3339,
"step": 2070
},
{
"epoch": 0.7395555555555555,
"grad_norm": 1.3801023810052373,
"learning_rate": 9.360152840934477e-06,
"loss": 0.3449,
"step": 2080
},
{
"epoch": 0.7431111111111111,
"grad_norm": 1.4391167681264767,
"learning_rate": 9.349988715696671e-06,
"loss": 0.3444,
"step": 2090
},
{
"epoch": 0.7466666666666667,
"grad_norm": 1.840759552395967,
"learning_rate": 9.33975010442379e-06,
"loss": 0.3496,
"step": 2100
},
{
"epoch": 0.7502222222222222,
"grad_norm": 1.348141880287597,
"learning_rate": 9.329437182434351e-06,
"loss": 0.3202,
"step": 2110
},
{
"epoch": 0.7537777777777778,
"grad_norm": 1.528620379748828,
"learning_rate": 9.31905012631931e-06,
"loss": 0.3545,
"step": 2120
},
{
"epoch": 0.7573333333333333,
"grad_norm": 1.502678851982848,
"learning_rate": 9.30858911393904e-06,
"loss": 0.3457,
"step": 2130
},
{
"epoch": 0.7608888888888888,
"grad_norm": 1.591416150002211,
"learning_rate": 9.298054324420294e-06,
"loss": 0.3125,
"step": 2140
},
{
"epoch": 0.7644444444444445,
"grad_norm": 1.5254470204546493,
"learning_rate": 9.287445938153121e-06,
"loss": 0.3596,
"step": 2150
},
{
"epoch": 0.768,
"grad_norm": 1.230432920766134,
"learning_rate": 9.276764136787798e-06,
"loss": 0.3352,
"step": 2160
},
{
"epoch": 0.7715555555555556,
"grad_norm": 1.8112353212418606,
"learning_rate": 9.266009103231702e-06,
"loss": 0.3504,
"step": 2170
},
{
"epoch": 0.7751111111111111,
"grad_norm": 1.6435932354458154,
"learning_rate": 9.255181021646182e-06,
"loss": 0.3289,
"step": 2180
},
{
"epoch": 0.7786666666666666,
"grad_norm": 1.3388409038180085,
"learning_rate": 9.244280077443417e-06,
"loss": 0.3542,
"step": 2190
},
{
"epoch": 0.7822222222222223,
"grad_norm": 1.5875341933538416,
"learning_rate": 9.233306457283223e-06,
"loss": 0.3516,
"step": 2200
},
{
"epoch": 0.7857777777777778,
"grad_norm": 1.5094881761609635,
"learning_rate": 9.222260349069874e-06,
"loss": 0.3489,
"step": 2210
},
{
"epoch": 0.7893333333333333,
"grad_norm": 1.477094884348464,
"learning_rate": 9.211141941948872e-06,
"loss": 0.3581,
"step": 2220
},
{
"epoch": 0.7928888888888889,
"grad_norm": 1.4717030162478277,
"learning_rate": 9.199951426303711e-06,
"loss": 0.3415,
"step": 2230
},
{
"epoch": 0.7964444444444444,
"grad_norm": 1.5752422305129774,
"learning_rate": 9.188688993752626e-06,
"loss": 0.3355,
"step": 2240
},
{
"epoch": 0.8,
"grad_norm": 1.5354049474859641,
"learning_rate": 9.177354837145298e-06,
"loss": 0.3394,
"step": 2250
},
{
"epoch": 0.8035555555555556,
"grad_norm": 1.8308300488763203,
"learning_rate": 9.165949150559561e-06,
"loss": 0.3545,
"step": 2260
},
{
"epoch": 0.8071111111111111,
"grad_norm": 1.7274391712847685,
"learning_rate": 9.154472129298075e-06,
"loss": 0.363,
"step": 2270
},
{
"epoch": 0.8106666666666666,
"grad_norm": 1.663966013940676,
"learning_rate": 9.142923969884984e-06,
"loss": 0.3395,
"step": 2280
},
{
"epoch": 0.8142222222222222,
"grad_norm": 1.631283026660004,
"learning_rate": 9.131304870062554e-06,
"loss": 0.3486,
"step": 2290
},
{
"epoch": 0.8177777777777778,
"grad_norm": 1.6552982308578106,
"learning_rate": 9.119615028787771e-06,
"loss": 0.3509,
"step": 2300
},
{
"epoch": 0.8213333333333334,
"grad_norm": 1.7276297897533288,
"learning_rate": 9.107854646228961e-06,
"loss": 0.325,
"step": 2310
},
{
"epoch": 0.8248888888888889,
"grad_norm": 1.445647497408194,
"learning_rate": 9.096023923762333e-06,
"loss": 0.3149,
"step": 2320
},
{
"epoch": 0.8284444444444444,
"grad_norm": 1.531947731156783,
"learning_rate": 9.08412306396856e-06,
"loss": 0.348,
"step": 2330
},
{
"epoch": 0.832,
"grad_norm": 1.3576987022774867,
"learning_rate": 9.072152270629281e-06,
"loss": 0.3096,
"step": 2340
},
{
"epoch": 0.8355555555555556,
"grad_norm": 1.4298680216684836,
"learning_rate": 9.060111748723639e-06,
"loss": 0.3609,
"step": 2350
},
{
"epoch": 0.8391111111111111,
"grad_norm": 1.5782942370819155,
"learning_rate": 9.048001704424747e-06,
"loss": 0.3307,
"step": 2360
},
{
"epoch": 0.8426666666666667,
"grad_norm": 1.6461644102732529,
"learning_rate": 9.035822345096177e-06,
"loss": 0.3327,
"step": 2370
},
{
"epoch": 0.8462222222222222,
"grad_norm": 1.5843145785651733,
"learning_rate": 9.023573879288394e-06,
"loss": 0.3312,
"step": 2380
},
{
"epoch": 0.8497777777777777,
"grad_norm": 1.5152546857205669,
"learning_rate": 9.0112565167352e-06,
"loss": 0.3298,
"step": 2390
},
{
"epoch": 0.8533333333333334,
"grad_norm": 1.7304070586423994,
"learning_rate": 8.99887046835013e-06,
"loss": 0.3404,
"step": 2400
},
{
"epoch": 0.8568888888888889,
"grad_norm": 1.461299493248939,
"learning_rate": 8.986415946222843e-06,
"loss": 0.3351,
"step": 2410
},
{
"epoch": 0.8604444444444445,
"grad_norm": 1.6967152528749099,
"learning_rate": 8.973893163615498e-06,
"loss": 0.3257,
"step": 2420
},
{
"epoch": 0.864,
"grad_norm": 1.4154067723973784,
"learning_rate": 8.96130233495909e-06,
"loss": 0.3199,
"step": 2430
},
{
"epoch": 0.8675555555555555,
"grad_norm": 1.3361597312618834,
"learning_rate": 8.948643675849793e-06,
"loss": 0.3442,
"step": 2440
},
{
"epoch": 0.8711111111111111,
"grad_norm": 1.4032866224408458,
"learning_rate": 8.935917403045251e-06,
"loss": 0.2947,
"step": 2450
},
{
"epoch": 0.8746666666666667,
"grad_norm": 1.234939739680067,
"learning_rate": 8.923123734460885e-06,
"loss": 0.3577,
"step": 2460
},
{
"epoch": 0.8782222222222222,
"grad_norm": 1.5765934665163166,
"learning_rate": 8.910262889166144e-06,
"loss": 0.3326,
"step": 2470
},
{
"epoch": 0.8817777777777778,
"grad_norm": 1.5046341548865376,
"learning_rate": 8.897335087380769e-06,
"loss": 0.3212,
"step": 2480
},
{
"epoch": 0.8853333333333333,
"grad_norm": 1.3276870900100486,
"learning_rate": 8.884340550471008e-06,
"loss": 0.3143,
"step": 2490
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.719735619655969,
"learning_rate": 8.87127950094584e-06,
"loss": 0.3747,
"step": 2500
},
{
"epoch": 0.8888888888888888,
"eval_loss": 0.23135392367839813,
"eval_runtime": 562.1868,
"eval_samples_per_second": 17.788,
"eval_steps_per_second": 4.447,
"step": 2500
},
{
"epoch": 0.8924444444444445,
"grad_norm": 1.584313301872745,
"learning_rate": 8.85815216245315e-06,
"loss": 0.3251,
"step": 2510
},
{
"epoch": 0.896,
"grad_norm": 1.2854406639721594,
"learning_rate": 8.844958759775917e-06,
"loss": 0.3242,
"step": 2520
},
{
"epoch": 0.8995555555555556,
"grad_norm": 1.3421636352208044,
"learning_rate": 8.83169951882834e-06,
"loss": 0.3069,
"step": 2530
},
{
"epoch": 0.9031111111111111,
"grad_norm": 1.6982202912735271,
"learning_rate": 8.818374666652001e-06,
"loss": 0.3303,
"step": 2540
},
{
"epoch": 0.9066666666666666,
"grad_norm": 1.3802398833209684,
"learning_rate": 8.804984431411951e-06,
"loss": 0.3558,
"step": 2550
},
{
"epoch": 0.9102222222222223,
"grad_norm": 1.8913239549685246,
"learning_rate": 8.791529042392813e-06,
"loss": 0.3947,
"step": 2560
},
{
"epoch": 0.9137777777777778,
"grad_norm": 1.4494060942613418,
"learning_rate": 8.77800872999486e-06,
"loss": 0.3362,
"step": 2570
},
{
"epoch": 0.9173333333333333,
"grad_norm": 1.7204036116920214,
"learning_rate": 8.764423725730062e-06,
"loss": 0.3298,
"step": 2580
},
{
"epoch": 0.9208888888888889,
"grad_norm": 1.6130463149964605,
"learning_rate": 8.750774262218129e-06,
"loss": 0.3218,
"step": 2590
},
{
"epoch": 0.9244444444444444,
"grad_norm": 1.4272505738840544,
"learning_rate": 8.737060573182518e-06,
"loss": 0.3325,
"step": 2600
},
{
"epoch": 0.928,
"grad_norm": 1.5909460584884059,
"learning_rate": 8.723282893446447e-06,
"loss": 0.3496,
"step": 2610
},
{
"epoch": 0.9315555555555556,
"grad_norm": 2.0360938733984963,
"learning_rate": 8.709441458928853e-06,
"loss": 0.3197,
"step": 2620
},
{
"epoch": 0.9351111111111111,
"grad_norm": 1.6918095124182533,
"learning_rate": 8.695536506640369e-06,
"loss": 0.3349,
"step": 2630
},
{
"epoch": 0.9386666666666666,
"grad_norm": 1.561883507817091,
"learning_rate": 8.681568274679264e-06,
"loss": 0.3357,
"step": 2640
},
{
"epoch": 0.9422222222222222,
"grad_norm": 1.635386123467993,
"learning_rate": 8.66753700222735e-06,
"loss": 0.3023,
"step": 2650
},
{
"epoch": 0.9457777777777778,
"grad_norm": 1.6460980849436542,
"learning_rate": 8.653442929545914e-06,
"loss": 0.3482,
"step": 2660
},
{
"epoch": 0.9493333333333334,
"grad_norm": 1.8476260091970051,
"learning_rate": 8.639286297971575e-06,
"loss": 0.3111,
"step": 2670
},
{
"epoch": 0.9528888888888889,
"grad_norm": 1.5625524365842092,
"learning_rate": 8.625067349912171e-06,
"loss": 0.3333,
"step": 2680
},
{
"epoch": 0.9564444444444444,
"grad_norm": 1.679549783886682,
"learning_rate": 8.610786328842602e-06,
"loss": 0.3012,
"step": 2690
},
{
"epoch": 0.96,
"grad_norm": 1.7334271987057313,
"learning_rate": 8.59644347930066e-06,
"loss": 0.3158,
"step": 2700
},
{
"epoch": 0.9635555555555556,
"grad_norm": 1.7183702234532738,
"learning_rate": 8.582039046882842e-06,
"loss": 0.3045,
"step": 2710
},
{
"epoch": 0.9671111111111111,
"grad_norm": 1.677327314139312,
"learning_rate": 8.567573278240147e-06,
"loss": 0.3379,
"step": 2720
},
{
"epoch": 0.9706666666666667,
"grad_norm": 1.4197759922345252,
"learning_rate": 8.55304642107385e-06,
"loss": 0.3376,
"step": 2730
},
{
"epoch": 0.9742222222222222,
"grad_norm": 1.7365860935410007,
"learning_rate": 8.538458724131258e-06,
"loss": 0.3395,
"step": 2740
},
{
"epoch": 0.9777777777777777,
"grad_norm": 1.5642529718868006,
"learning_rate": 8.523810437201463e-06,
"loss": 0.3105,
"step": 2750
},
{
"epoch": 0.9813333333333333,
"grad_norm": 1.6285786801359268,
"learning_rate": 8.509101811111045e-06,
"loss": 0.314,
"step": 2760
},
{
"epoch": 0.9848888888888889,
"grad_norm": 1.7932095997349375,
"learning_rate": 8.494333097719795e-06,
"loss": 0.3183,
"step": 2770
},
{
"epoch": 0.9884444444444445,
"grad_norm": 1.7636055661476138,
"learning_rate": 8.479504549916393e-06,
"loss": 0.3459,
"step": 2780
},
{
"epoch": 0.992,
"grad_norm": 1.7893218283734698,
"learning_rate": 8.464616421614077e-06,
"loss": 0.3655,
"step": 2790
},
{
"epoch": 0.9955555555555555,
"grad_norm": 1.56040627840869,
"learning_rate": 8.449668967746303e-06,
"loss": 0.3145,
"step": 2800
},
{
"epoch": 0.9991111111111111,
"grad_norm": 1.7372692555117912,
"learning_rate": 8.434662444262374e-06,
"loss": 0.3152,
"step": 2810
},
{
"epoch": 1.0026666666666666,
"grad_norm": 1.3178611516659062,
"learning_rate": 8.419597108123054e-06,
"loss": 0.256,
"step": 2820
},
{
"epoch": 1.0062222222222221,
"grad_norm": 1.7641513434209246,
"learning_rate": 8.404473217296174e-06,
"loss": 0.2304,
"step": 2830
},
{
"epoch": 1.0097777777777779,
"grad_norm": 1.702777106397184,
"learning_rate": 8.389291030752215e-06,
"loss": 0.2451,
"step": 2840
},
{
"epoch": 1.0133333333333334,
"grad_norm": 1.516656565976496,
"learning_rate": 8.37405080845987e-06,
"loss": 0.2463,
"step": 2850
},
{
"epoch": 1.016888888888889,
"grad_norm": 1.2615996283177406,
"learning_rate": 8.358752811381592e-06,
"loss": 0.2439,
"step": 2860
},
{
"epoch": 1.0204444444444445,
"grad_norm": 1.2426761993789008,
"learning_rate": 8.343397301469127e-06,
"loss": 0.2301,
"step": 2870
},
{
"epoch": 1.024,
"grad_norm": 1.7414567869166766,
"learning_rate": 8.327984541659035e-06,
"loss": 0.26,
"step": 2880
},
{
"epoch": 1.0275555555555556,
"grad_norm": 1.778546754169589,
"learning_rate": 8.312514795868177e-06,
"loss": 0.2537,
"step": 2890
},
{
"epoch": 1.031111111111111,
"grad_norm": 1.693194016869835,
"learning_rate": 8.296988328989195e-06,
"loss": 0.2474,
"step": 2900
},
{
"epoch": 1.0346666666666666,
"grad_norm": 1.4905129718116352,
"learning_rate": 8.281405406885992e-06,
"loss": 0.2259,
"step": 2910
},
{
"epoch": 1.0382222222222222,
"grad_norm": 1.6844431624217413,
"learning_rate": 8.265766296389164e-06,
"loss": 0.2206,
"step": 2920
},
{
"epoch": 1.0417777777777777,
"grad_norm": 1.4064579919162583,
"learning_rate": 8.250071265291432e-06,
"loss": 0.2498,
"step": 2930
},
{
"epoch": 1.0453333333333332,
"grad_norm": 1.4383166925160618,
"learning_rate": 8.23432058234307e-06,
"loss": 0.2316,
"step": 2940
},
{
"epoch": 1.048888888888889,
"grad_norm": 1.7880359369165812,
"learning_rate": 8.218514517247287e-06,
"loss": 0.2421,
"step": 2950
},
{
"epoch": 1.0524444444444445,
"grad_norm": 1.49095155848045,
"learning_rate": 8.202653340655614e-06,
"loss": 0.2547,
"step": 2960
},
{
"epoch": 1.056,
"grad_norm": 1.802867297616481,
"learning_rate": 8.18673732416328e-06,
"loss": 0.2609,
"step": 2970
},
{
"epoch": 1.0595555555555556,
"grad_norm": 1.799375023246126,
"learning_rate": 8.170766740304541e-06,
"loss": 0.2369,
"step": 2980
},
{
"epoch": 1.0631111111111111,
"grad_norm": 1.645090115101595,
"learning_rate": 8.154741862548035e-06,
"loss": 0.2519,
"step": 2990
},
{
"epoch": 1.0666666666666667,
"grad_norm": 1.8315765038402207,
"learning_rate": 8.13866296529208e-06,
"loss": 0.2248,
"step": 3000
},
{
"epoch": 1.0666666666666667,
"eval_loss": 0.23144060373306274,
"eval_runtime": 562.045,
"eval_samples_per_second": 17.792,
"eval_steps_per_second": 4.448,
"step": 3000
},
{
"epoch": 1.0702222222222222,
"grad_norm": 1.3604786834079945,
"learning_rate": 8.122530323859992e-06,
"loss": 0.2494,
"step": 3010
},
{
"epoch": 1.0737777777777777,
"grad_norm": 1.472974815302568,
"learning_rate": 8.106344214495359e-06,
"loss": 0.2168,
"step": 3020
},
{
"epoch": 1.0773333333333333,
"grad_norm": 1.9232740710019078,
"learning_rate": 8.090104914357316e-06,
"loss": 0.2544,
"step": 3030
},
{
"epoch": 1.0808888888888888,
"grad_norm": 1.6517745707358162,
"learning_rate": 8.073812701515799e-06,
"loss": 0.2362,
"step": 3040
},
{
"epoch": 1.0844444444444445,
"grad_norm": 1.5375717590050721,
"learning_rate": 8.057467854946783e-06,
"loss": 0.238,
"step": 3050
},
{
"epoch": 1.088,
"grad_norm": 1.736104134714019,
"learning_rate": 8.041070654527498e-06,
"loss": 0.2329,
"step": 3060
},
{
"epoch": 1.0915555555555556,
"grad_norm": 1.578126670290498,
"learning_rate": 8.024621381031654e-06,
"loss": 0.2525,
"step": 3070
},
{
"epoch": 1.0951111111111111,
"grad_norm": 1.2995445031583646,
"learning_rate": 8.008120316124612e-06,
"loss": 0.2378,
"step": 3080
},
{
"epoch": 1.0986666666666667,
"grad_norm": 1.9084352174123695,
"learning_rate": 7.991567742358582e-06,
"loss": 0.2469,
"step": 3090
},
{
"epoch": 1.1022222222222222,
"grad_norm": 1.6004292294784017,
"learning_rate": 7.974963943167761e-06,
"loss": 0.2721,
"step": 3100
},
{
"epoch": 1.1057777777777777,
"grad_norm": 1.4738079995177567,
"learning_rate": 7.958309202863506e-06,
"loss": 0.2457,
"step": 3110
},
{
"epoch": 1.1093333333333333,
"grad_norm": 1.5493675656690653,
"learning_rate": 7.941603806629444e-06,
"loss": 0.2274,
"step": 3120
},
{
"epoch": 1.1128888888888888,
"grad_norm": 1.6554292154622638,
"learning_rate": 7.9248480405166e-06,
"loss": 0.2595,
"step": 3130
},
{
"epoch": 1.1164444444444444,
"grad_norm": 1.6112904935857704,
"learning_rate": 7.908042191438497e-06,
"loss": 0.2374,
"step": 3140
},
{
"epoch": 1.12,
"grad_norm": 1.4663251499352947,
"learning_rate": 7.891186547166238e-06,
"loss": 0.2128,
"step": 3150
},
{
"epoch": 1.1235555555555556,
"grad_norm": 1.8636139047215206,
"learning_rate": 7.874281396323589e-06,
"loss": 0.2263,
"step": 3160
},
{
"epoch": 1.1271111111111112,
"grad_norm": 1.6257921444204015,
"learning_rate": 7.857327028382025e-06,
"loss": 0.2392,
"step": 3170
},
{
"epoch": 1.1306666666666667,
"grad_norm": 1.4066061759358834,
"learning_rate": 7.84032373365578e-06,
"loss": 0.2342,
"step": 3180
},
{
"epoch": 1.1342222222222222,
"grad_norm": 1.5852680151393,
"learning_rate": 7.823271803296876e-06,
"loss": 0.2271,
"step": 3190
},
{
"epoch": 1.1377777777777778,
"grad_norm": 1.7721860252109063,
"learning_rate": 7.80617152929014e-06,
"loss": 0.2376,
"step": 3200
},
{
"epoch": 1.1413333333333333,
"grad_norm": 1.8867413038702499,
"learning_rate": 7.789023204448189e-06,
"loss": 0.2516,
"step": 3210
},
{
"epoch": 1.1448888888888888,
"grad_norm": 1.4279840133381525,
"learning_rate": 7.771827122406437e-06,
"loss": 0.2265,
"step": 3220
},
{
"epoch": 1.1484444444444444,
"grad_norm": 1.676800279171029,
"learning_rate": 7.754583577618057e-06,
"loss": 0.2554,
"step": 3230
},
{
"epoch": 1.152,
"grad_norm": 1.6723494127405627,
"learning_rate": 7.737292865348933e-06,
"loss": 0.2408,
"step": 3240
},
{
"epoch": 1.1555555555555554,
"grad_norm": 1.6148606083372026,
"learning_rate": 7.719955281672618e-06,
"loss": 0.2287,
"step": 3250
},
{
"epoch": 1.1591111111111112,
"grad_norm": 1.6092526546730486,
"learning_rate": 7.702571123465252e-06,
"loss": 0.237,
"step": 3260
},
{
"epoch": 1.1626666666666667,
"grad_norm": 1.3380193435685535,
"learning_rate": 7.685140688400484e-06,
"loss": 0.2393,
"step": 3270
},
{
"epoch": 1.1662222222222223,
"grad_norm": 1.3406231671146336,
"learning_rate": 7.66766427494438e-06,
"loss": 0.2158,
"step": 3280
},
{
"epoch": 1.1697777777777778,
"grad_norm": 1.5365708586926026,
"learning_rate": 7.650142182350294e-06,
"loss": 0.201,
"step": 3290
},
{
"epoch": 1.1733333333333333,
"grad_norm": 1.7847958889549216,
"learning_rate": 7.632574710653773e-06,
"loss": 0.2627,
"step": 3300
},
{
"epoch": 1.1768888888888889,
"grad_norm": 1.4770511975662048,
"learning_rate": 7.614962160667384e-06,
"loss": 0.221,
"step": 3310
},
{
"epoch": 1.1804444444444444,
"grad_norm": 1.8043230337610534,
"learning_rate": 7.597304833975596e-06,
"loss": 0.2419,
"step": 3320
},
{
"epoch": 1.184,
"grad_norm": 1.9363141324764201,
"learning_rate": 7.579603032929597e-06,
"loss": 0.2572,
"step": 3330
},
{
"epoch": 1.1875555555555555,
"grad_norm": 1.600071864532325,
"learning_rate": 7.56185706064212e-06,
"loss": 0.2462,
"step": 3340
},
{
"epoch": 1.1911111111111112,
"grad_norm": 1.5785414115422856,
"learning_rate": 7.544067220982254e-06,
"loss": 0.2312,
"step": 3350
},
{
"epoch": 1.1946666666666665,
"grad_norm": 1.5789285671514135,
"learning_rate": 7.526233818570245e-06,
"loss": 0.2067,
"step": 3360
},
{
"epoch": 1.1982222222222223,
"grad_norm": 1.7448328186975814,
"learning_rate": 7.508357158772273e-06,
"loss": 0.2448,
"step": 3370
},
{
"epoch": 1.2017777777777778,
"grad_norm": 1.4619128557517416,
"learning_rate": 7.490437547695224e-06,
"loss": 0.2194,
"step": 3380
},
{
"epoch": 1.2053333333333334,
"grad_norm": 1.6063307731749306,
"learning_rate": 7.472475292181454e-06,
"loss": 0.2501,
"step": 3390
},
{
"epoch": 1.208888888888889,
"grad_norm": 1.9510115721688825,
"learning_rate": 7.45447069980353e-06,
"loss": 0.2515,
"step": 3400
},
{
"epoch": 1.2124444444444444,
"grad_norm": 1.5856572080139135,
"learning_rate": 7.4364240788589625e-06,
"loss": 0.2461,
"step": 3410
},
{
"epoch": 1.216,
"grad_norm": 1.846941973796494,
"learning_rate": 7.418335738364931e-06,
"loss": 0.2241,
"step": 3420
},
{
"epoch": 1.2195555555555555,
"grad_norm": 1.8886992728965029,
"learning_rate": 7.400205988052991e-06,
"loss": 0.2298,
"step": 3430
},
{
"epoch": 1.223111111111111,
"grad_norm": 1.6140767527032074,
"learning_rate": 7.382035138363764e-06,
"loss": 0.2516,
"step": 3440
},
{
"epoch": 1.2266666666666666,
"grad_norm": 1.637777869962237,
"learning_rate": 7.363823500441636e-06,
"loss": 0.2422,
"step": 3450
},
{
"epoch": 1.2302222222222223,
"grad_norm": 1.3783132940885547,
"learning_rate": 7.345571386129413e-06,
"loss": 0.2368,
"step": 3460
},
{
"epoch": 1.2337777777777779,
"grad_norm": 1.750318456803832,
"learning_rate": 7.327279107962995e-06,
"loss": 0.2488,
"step": 3470
},
{
"epoch": 1.2373333333333334,
"grad_norm": 1.7745176716418858,
"learning_rate": 7.308946979166012e-06,
"loss": 0.2277,
"step": 3480
},
{
"epoch": 1.240888888888889,
"grad_norm": 1.7469697925399752,
"learning_rate": 7.290575313644476e-06,
"loss": 0.2329,
"step": 3490
},
{
"epoch": 1.2444444444444445,
"grad_norm": 1.4439208816879574,
"learning_rate": 7.272164425981387e-06,
"loss": 0.2575,
"step": 3500
},
{
"epoch": 1.2444444444444445,
"eval_loss": 0.22694812715053558,
"eval_runtime": 564.2235,
"eval_samples_per_second": 17.723,
"eval_steps_per_second": 4.431,
"step": 3500
},
{
"epoch": 1.248,
"grad_norm": 1.5767155030054063,
"learning_rate": 7.253714631431366e-06,
"loss": 0.2492,
"step": 3510
},
{
"epoch": 1.2515555555555555,
"grad_norm": 1.5655624730827595,
"learning_rate": 7.235226245915239e-06,
"loss": 0.2259,
"step": 3520
},
{
"epoch": 1.255111111111111,
"grad_norm": 1.8883245133962092,
"learning_rate": 7.216699586014642e-06,
"loss": 0.2487,
"step": 3530
},
{
"epoch": 1.2586666666666666,
"grad_norm": 1.2903228684726653,
"learning_rate": 7.198134968966588e-06,
"loss": 0.2341,
"step": 3540
},
{
"epoch": 1.2622222222222224,
"grad_norm": 1.6585013961180077,
"learning_rate": 7.179532712658047e-06,
"loss": 0.2625,
"step": 3550
},
{
"epoch": 1.2657777777777777,
"grad_norm": 1.4955952405740183,
"learning_rate": 7.160893135620488e-06,
"loss": 0.2602,
"step": 3560
},
{
"epoch": 1.2693333333333334,
"grad_norm": 1.8286387441617464,
"learning_rate": 7.142216557024443e-06,
"loss": 0.2221,
"step": 3570
},
{
"epoch": 1.272888888888889,
"grad_norm": 1.6146123865735058,
"learning_rate": 7.123503296674021e-06,
"loss": 0.247,
"step": 3580
},
{
"epoch": 1.2764444444444445,
"grad_norm": 1.4700165794501387,
"learning_rate": 7.104753675001453e-06,
"loss": 0.2405,
"step": 3590
},
{
"epoch": 1.28,
"grad_norm": 1.4475828320209072,
"learning_rate": 7.085968013061585e-06,
"loss": 0.2452,
"step": 3600
},
{
"epoch": 1.2835555555555556,
"grad_norm": 1.9854917772925798,
"learning_rate": 7.067146632526398e-06,
"loss": 0.2813,
"step": 3610
},
{
"epoch": 1.287111111111111,
"grad_norm": 1.863775670718366,
"learning_rate": 7.048289855679487e-06,
"loss": 0.2272,
"step": 3620
},
{
"epoch": 1.2906666666666666,
"grad_norm": 2.0238745081645693,
"learning_rate": 7.029398005410551e-06,
"loss": 0.2588,
"step": 3630
},
{
"epoch": 1.2942222222222222,
"grad_norm": 1.8729516419448864,
"learning_rate": 7.01047140520986e-06,
"loss": 0.2403,
"step": 3640
},
{
"epoch": 1.2977777777777777,
"grad_norm": 1.721501900738319,
"learning_rate": 6.9915103791627146e-06,
"loss": 0.2477,
"step": 3650
},
{
"epoch": 1.3013333333333335,
"grad_norm": 1.6626021007269847,
"learning_rate": 6.972515251943901e-06,
"loss": 0.2279,
"step": 3660
},
{
"epoch": 1.3048888888888888,
"grad_norm": 1.6716430135185554,
"learning_rate": 6.953486348812127e-06,
"loss": 0.2414,
"step": 3670
},
{
"epoch": 1.3084444444444445,
"grad_norm": 1.4291636119458788,
"learning_rate": 6.934423995604455e-06,
"loss": 0.248,
"step": 3680
},
{
"epoch": 1.312,
"grad_norm": 1.4674689793023254,
"learning_rate": 6.915328518730724e-06,
"loss": 0.2459,
"step": 3690
},
{
"epoch": 1.3155555555555556,
"grad_norm": 1.5215618690023482,
"learning_rate": 6.896200245167956e-06,
"loss": 0.2546,
"step": 3700
},
{
"epoch": 1.3191111111111111,
"grad_norm": 1.67624683709797,
"learning_rate": 6.877039502454758e-06,
"loss": 0.2006,
"step": 3710
},
{
"epoch": 1.3226666666666667,
"grad_norm": 1.552246698817707,
"learning_rate": 6.857846618685724e-06,
"loss": 0.2213,
"step": 3720
},
{
"epoch": 1.3262222222222222,
"grad_norm": 2.021180154460745,
"learning_rate": 6.8386219225057945e-06,
"loss": 0.2315,
"step": 3730
},
{
"epoch": 1.3297777777777777,
"grad_norm": 1.8378386656471875,
"learning_rate": 6.819365743104655e-06,
"loss": 0.2235,
"step": 3740
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.8383503621089257,
"learning_rate": 6.8000784102110795e-06,
"loss": 0.2348,
"step": 3750
},
{
"epoch": 1.3368888888888888,
"grad_norm": 1.476660408503267,
"learning_rate": 6.780760254087293e-06,
"loss": 0.2433,
"step": 3760
},
{
"epoch": 1.3404444444444445,
"grad_norm": 1.6056267413924534,
"learning_rate": 6.7614116055233146e-06,
"loss": 0.2511,
"step": 3770
},
{
"epoch": 1.3439999999999999,
"grad_norm": 1.5433968607865032,
"learning_rate": 6.742032795831298e-06,
"loss": 0.2218,
"step": 3780
},
{
"epoch": 1.3475555555555556,
"grad_norm": 1.8752695620093498,
"learning_rate": 6.722624156839847e-06,
"loss": 0.2607,
"step": 3790
},
{
"epoch": 1.3511111111111112,
"grad_norm": 1.7018274048947808,
"learning_rate": 6.703186020888347e-06,
"loss": 0.2434,
"step": 3800
},
{
"epoch": 1.3546666666666667,
"grad_norm": 1.7419410223233012,
"learning_rate": 6.683718720821264e-06,
"loss": 0.2494,
"step": 3810
},
{
"epoch": 1.3582222222222222,
"grad_norm": 1.5145074056393906,
"learning_rate": 6.664222589982451e-06,
"loss": 0.2215,
"step": 3820
},
{
"epoch": 1.3617777777777778,
"grad_norm": 1.2846516741089247,
"learning_rate": 6.644697962209434e-06,
"loss": 0.2346,
"step": 3830
},
{
"epoch": 1.3653333333333333,
"grad_norm": 1.4951097829345636,
"learning_rate": 6.6251451718277095e-06,
"loss": 0.2122,
"step": 3840
},
{
"epoch": 1.3688888888888888,
"grad_norm": 1.837176746272441,
"learning_rate": 6.605564553644998e-06,
"loss": 0.2289,
"step": 3850
},
{
"epoch": 1.3724444444444446,
"grad_norm": 1.7541861945923773,
"learning_rate": 6.585956442945531e-06,
"loss": 0.2304,
"step": 3860
},
{
"epoch": 1.376,
"grad_norm": 1.456084798251464,
"learning_rate": 6.566321175484298e-06,
"loss": 0.2524,
"step": 3870
},
{
"epoch": 1.3795555555555556,
"grad_norm": 1.4021880078388174,
"learning_rate": 6.546659087481304e-06,
"loss": 0.2344,
"step": 3880
},
{
"epoch": 1.3831111111111112,
"grad_norm": 1.386759603833687,
"learning_rate": 6.526970515615807e-06,
"loss": 0.2278,
"step": 3890
},
{
"epoch": 1.3866666666666667,
"grad_norm": 1.9340717544487618,
"learning_rate": 6.507255797020555e-06,
"loss": 0.2299,
"step": 3900
},
{
"epoch": 1.3902222222222222,
"grad_norm": 1.4309730673942778,
"learning_rate": 6.487515269276015e-06,
"loss": 0.2518,
"step": 3910
},
{
"epoch": 1.3937777777777778,
"grad_norm": 1.5432073955843775,
"learning_rate": 6.467749270404593e-06,
"loss": 0.2196,
"step": 3920
},
{
"epoch": 1.3973333333333333,
"grad_norm": 1.5255820019311863,
"learning_rate": 6.4479581388648404e-06,
"loss": 0.2527,
"step": 3930
},
{
"epoch": 1.4008888888888889,
"grad_norm": 1.9387048217346732,
"learning_rate": 6.428142213545662e-06,
"loss": 0.2663,
"step": 3940
},
{
"epoch": 1.4044444444444444,
"grad_norm": 1.4687424654762213,
"learning_rate": 6.408301833760517e-06,
"loss": 0.2141,
"step": 3950
},
{
"epoch": 1.408,
"grad_norm": 1.6790491256350315,
"learning_rate": 6.388437339241601e-06,
"loss": 0.2419,
"step": 3960
},
{
"epoch": 1.4115555555555557,
"grad_norm": 1.4986463255132796,
"learning_rate": 6.368549070134036e-06,
"loss": 0.2205,
"step": 3970
},
{
"epoch": 1.415111111111111,
"grad_norm": 1.8639041315873657,
"learning_rate": 6.348637366990038e-06,
"loss": 0.2403,
"step": 3980
},
{
"epoch": 1.4186666666666667,
"grad_norm": 1.8313804556837663,
"learning_rate": 6.328702570763098e-06,
"loss": 0.243,
"step": 3990
},
{
"epoch": 1.4222222222222223,
"grad_norm": 1.6288666479905434,
"learning_rate": 6.308745022802128e-06,
"loss": 0.2376,
"step": 4000
},
{
"epoch": 1.4222222222222223,
"eval_loss": 0.22332721948623657,
"eval_runtime": 562.4439,
"eval_samples_per_second": 17.78,
"eval_steps_per_second": 4.445,
"step": 4000
},
{
"epoch": 1.4257777777777778,
"grad_norm": 1.28363469470016,
"learning_rate": 6.288765064845629e-06,
"loss": 0.2119,
"step": 4010
},
{
"epoch": 1.4293333333333333,
"grad_norm": 1.5685400141436767,
"learning_rate": 6.268763039015833e-06,
"loss": 0.2372,
"step": 4020
},
{
"epoch": 1.4328888888888889,
"grad_norm": 1.2419732210599121,
"learning_rate": 6.248739287812846e-06,
"loss": 0.2378,
"step": 4030
},
{
"epoch": 1.4364444444444444,
"grad_norm": 1.450791049105233,
"learning_rate": 6.228694154108783e-06,
"loss": 0.236,
"step": 4040
},
{
"epoch": 1.44,
"grad_norm": 1.3478041984965912,
"learning_rate": 6.208627981141902e-06,
"loss": 0.2165,
"step": 4050
},
{
"epoch": 1.4435555555555555,
"grad_norm": 1.6880548918845273,
"learning_rate": 6.188541112510713e-06,
"loss": 0.2405,
"step": 4060
},
{
"epoch": 1.447111111111111,
"grad_norm": 1.489941080547117,
"learning_rate": 6.168433892168113e-06,
"loss": 0.2288,
"step": 4070
},
{
"epoch": 1.4506666666666668,
"grad_norm": 2.036909885440752,
"learning_rate": 6.148306664415476e-06,
"loss": 0.235,
"step": 4080
},
{
"epoch": 1.4542222222222223,
"grad_norm": 1.60733518117776,
"learning_rate": 6.128159773896783e-06,
"loss": 0.2143,
"step": 4090
},
{
"epoch": 1.4577777777777778,
"grad_norm": 1.6002205563066152,
"learning_rate": 6.107993565592693e-06,
"loss": 0.239,
"step": 4100
},
{
"epoch": 1.4613333333333334,
"grad_norm": 1.59924513215813,
"learning_rate": 6.087808384814652e-06,
"loss": 0.2185,
"step": 4110
},
{
"epoch": 1.464888888888889,
"grad_norm": 1.6651512334739322,
"learning_rate": 6.067604577198981e-06,
"loss": 0.238,
"step": 4120
},
{
"epoch": 1.4684444444444444,
"grad_norm": 1.6551324049801701,
"learning_rate": 6.04738248870095e-06,
"loss": 0.2238,
"step": 4130
},
{
"epoch": 1.472,
"grad_norm": 1.5301258421668906,
"learning_rate": 6.027142465588855e-06,
"loss": 0.2453,
"step": 4140
},
{
"epoch": 1.4755555555555555,
"grad_norm": 1.8144546212524773,
"learning_rate": 6.006884854438099e-06,
"loss": 0.2375,
"step": 4150
},
{
"epoch": 1.479111111111111,
"grad_norm": 1.5099593511650293,
"learning_rate": 5.9866100021252415e-06,
"loss": 0.2331,
"step": 4160
},
{
"epoch": 1.4826666666666668,
"grad_norm": 1.502590510458408,
"learning_rate": 5.966318255822072e-06,
"loss": 0.2131,
"step": 4170
},
{
"epoch": 1.4862222222222221,
"grad_norm": 1.7399671557461471,
"learning_rate": 5.946009962989659e-06,
"loss": 0.243,
"step": 4180
},
{
"epoch": 1.4897777777777779,
"grad_norm": 1.959843593418678,
"learning_rate": 5.9256854713724e-06,
"loss": 0.2344,
"step": 4190
},
{
"epoch": 1.4933333333333334,
"grad_norm": 1.5187384802338688,
"learning_rate": 5.905345128992072e-06,
"loss": 0.2372,
"step": 4200
},
{
"epoch": 1.496888888888889,
"grad_norm": 1.713913961820143,
"learning_rate": 5.884989284141866e-06,
"loss": 0.2137,
"step": 4210
},
{
"epoch": 1.5004444444444445,
"grad_norm": 1.5301932679943313,
"learning_rate": 5.86461828538043e-06,
"loss": 0.2264,
"step": 4220
},
{
"epoch": 1.504,
"grad_norm": 1.6650108469792486,
"learning_rate": 5.84423248152589e-06,
"loss": 0.2167,
"step": 4230
},
{
"epoch": 1.5075555555555555,
"grad_norm": 1.7377610919859674,
"learning_rate": 5.82383222164989e-06,
"loss": 0.2223,
"step": 4240
},
{
"epoch": 1.511111111111111,
"grad_norm": 1.8280200619954592,
"learning_rate": 5.803417855071603e-06,
"loss": 0.2361,
"step": 4250
},
{
"epoch": 1.5146666666666668,
"grad_norm": 1.7315368181217787,
"learning_rate": 5.782989731351762e-06,
"loss": 0.2665,
"step": 4260
},
{
"epoch": 1.5182222222222221,
"grad_norm": 1.6917154736502973,
"learning_rate": 5.762548200286659e-06,
"loss": 0.212,
"step": 4270
},
{
"epoch": 1.521777777777778,
"grad_norm": 1.5262051452408105,
"learning_rate": 5.742093611902168e-06,
"loss": 0.2142,
"step": 4280
},
{
"epoch": 1.5253333333333332,
"grad_norm": 1.4955231464253305,
"learning_rate": 5.721626316447748e-06,
"loss": 0.2302,
"step": 4290
},
{
"epoch": 1.528888888888889,
"grad_norm": 1.729596636954076,
"learning_rate": 5.7011466643904434e-06,
"loss": 0.2209,
"step": 4300
},
{
"epoch": 1.5324444444444445,
"grad_norm": 1.470928828267314,
"learning_rate": 5.680655006408882e-06,
"loss": 0.2398,
"step": 4310
},
{
"epoch": 1.536,
"grad_norm": 1.4046672488847465,
"learning_rate": 5.660151693387273e-06,
"loss": 0.2335,
"step": 4320
},
{
"epoch": 1.5395555555555556,
"grad_norm": 1.6687999325358385,
"learning_rate": 5.639637076409404e-06,
"loss": 0.2207,
"step": 4330
},
{
"epoch": 1.543111111111111,
"grad_norm": 1.60564618911301,
"learning_rate": 5.6191115067526135e-06,
"loss": 0.2411,
"step": 4340
},
{
"epoch": 1.5466666666666666,
"grad_norm": 1.6047937970455775,
"learning_rate": 5.598575335881792e-06,
"loss": 0.2161,
"step": 4350
},
{
"epoch": 1.5502222222222222,
"grad_norm": 1.3451412373708476,
"learning_rate": 5.578028915443356e-06,
"loss": 0.2104,
"step": 4360
},
{
"epoch": 1.553777777777778,
"grad_norm": 1.827680836587444,
"learning_rate": 5.55747259725923e-06,
"loss": 0.2333,
"step": 4370
},
{
"epoch": 1.5573333333333332,
"grad_norm": 1.8474659285597943,
"learning_rate": 5.536906733320816e-06,
"loss": 0.2447,
"step": 4380
},
{
"epoch": 1.560888888888889,
"grad_norm": 1.5571932949328393,
"learning_rate": 5.516331675782973e-06,
"loss": 0.2445,
"step": 4390
},
{
"epoch": 1.5644444444444443,
"grad_norm": 1.9294806844289611,
"learning_rate": 5.495747776957987e-06,
"loss": 0.2382,
"step": 4400
},
{
"epoch": 1.568,
"grad_norm": 1.3637347529801744,
"learning_rate": 5.475155389309531e-06,
"loss": 0.2162,
"step": 4410
},
{
"epoch": 1.5715555555555556,
"grad_norm": 1.552594376889073,
"learning_rate": 5.4545548654466366e-06,
"loss": 0.2351,
"step": 4420
},
{
"epoch": 1.5751111111111111,
"grad_norm": 1.563596866564994,
"learning_rate": 5.433946558117654e-06,
"loss": 0.2259,
"step": 4430
},
{
"epoch": 1.5786666666666667,
"grad_norm": 1.9424477147575314,
"learning_rate": 5.413330820204214e-06,
"loss": 0.2269,
"step": 4440
},
{
"epoch": 1.5822222222222222,
"grad_norm": 1.7161442287459214,
"learning_rate": 5.392708004715178e-06,
"loss": 0.233,
"step": 4450
},
{
"epoch": 1.5857777777777777,
"grad_norm": 1.4458518805717744,
"learning_rate": 5.372078464780603e-06,
"loss": 0.2428,
"step": 4460
},
{
"epoch": 1.5893333333333333,
"grad_norm": 1.7197914268509118,
"learning_rate": 5.351442553645691e-06,
"loss": 0.2095,
"step": 4470
},
{
"epoch": 1.592888888888889,
"grad_norm": 1.7871712697682276,
"learning_rate": 5.330800624664736e-06,
"loss": 0.2375,
"step": 4480
},
{
"epoch": 1.5964444444444443,
"grad_norm": 1.6154295338481346,
"learning_rate": 5.310153031295079e-06,
"loss": 0.2365,
"step": 4490
},
{
"epoch": 1.6,
"grad_norm": 1.8622833358204558,
"learning_rate": 5.289500127091056e-06,
"loss": 0.2521,
"step": 4500
},
{
"epoch": 1.6,
"eval_loss": 0.22019484639167786,
"eval_runtime": 562.6101,
"eval_samples_per_second": 17.774,
"eval_steps_per_second": 4.444,
"step": 4500
},
{
"epoch": 1.6035555555555554,
"grad_norm": 1.4160865462023664,
"learning_rate": 5.26884226569794e-06,
"loss": 0.2445,
"step": 4510
},
{
"epoch": 1.6071111111111112,
"grad_norm": 1.6982387533503471,
"learning_rate": 5.248179800845884e-06,
"loss": 0.2586,
"step": 4520
},
{
"epoch": 1.6106666666666667,
"grad_norm": 1.8063057152671183,
"learning_rate": 5.227513086343875e-06,
"loss": 0.2342,
"step": 4530
},
{
"epoch": 1.6142222222222222,
"grad_norm": 1.8369946808465265,
"learning_rate": 5.20684247607366e-06,
"loss": 0.2149,
"step": 4540
},
{
"epoch": 1.6177777777777778,
"grad_norm": 1.4919743522204885,
"learning_rate": 5.186168323983702e-06,
"loss": 0.2361,
"step": 4550
},
{
"epoch": 1.6213333333333333,
"grad_norm": 1.908909797085476,
"learning_rate": 5.1654909840831e-06,
"loss": 0.2422,
"step": 4560
},
{
"epoch": 1.624888888888889,
"grad_norm": 1.6970594817568836,
"learning_rate": 5.144810810435553e-06,
"loss": 0.2702,
"step": 4570
},
{
"epoch": 1.6284444444444444,
"grad_norm": 1.914631182858778,
"learning_rate": 5.124128157153273e-06,
"loss": 0.211,
"step": 4580
},
{
"epoch": 1.6320000000000001,
"grad_norm": 1.8308898752074714,
"learning_rate": 5.103443378390935e-06,
"loss": 0.213,
"step": 4590
},
{
"epoch": 1.6355555555555554,
"grad_norm": 1.4716155031307734,
"learning_rate": 5.08275682833961e-06,
"loss": 0.2348,
"step": 4600
},
{
"epoch": 1.6391111111111112,
"grad_norm": 1.3846959035420932,
"learning_rate": 5.062068861220697e-06,
"loss": 0.2323,
"step": 4610
},
{
"epoch": 1.6426666666666667,
"grad_norm": 1.310528332429156,
"learning_rate": 5.041379831279859e-06,
"loss": 0.2274,
"step": 4620
},
{
"epoch": 1.6462222222222223,
"grad_norm": 1.56294035415104,
"learning_rate": 5.020690092780961e-06,
"loss": 0.2382,
"step": 4630
},
{
"epoch": 1.6497777777777778,
"grad_norm": 1.797053581769004,
"learning_rate": 5e-06,
"loss": 0.2263,
"step": 4640
},
{
"epoch": 1.6533333333333333,
"grad_norm": 1.57684485333151,
"learning_rate": 4.9793099072190406e-06,
"loss": 0.2225,
"step": 4650
},
{
"epoch": 1.6568888888888889,
"grad_norm": 2.0411280702141883,
"learning_rate": 4.958620168720144e-06,
"loss": 0.2225,
"step": 4660
},
{
"epoch": 1.6604444444444444,
"grad_norm": 1.476641016823167,
"learning_rate": 4.937931138779305e-06,
"loss": 0.2438,
"step": 4670
},
{
"epoch": 1.6640000000000001,
"grad_norm": 1.4259185034698016,
"learning_rate": 4.917243171660391e-06,
"loss": 0.2127,
"step": 4680
},
{
"epoch": 1.6675555555555555,
"grad_norm": 1.9925037267732388,
"learning_rate": 4.896556621609066e-06,
"loss": 0.223,
"step": 4690
},
{
"epoch": 1.6711111111111112,
"grad_norm": 1.3845653896887404,
"learning_rate": 4.8758718428467275e-06,
"loss": 0.2332,
"step": 4700
},
{
"epoch": 1.6746666666666665,
"grad_norm": 1.5936847174408162,
"learning_rate": 4.8551891895644485e-06,
"loss": 0.2381,
"step": 4710
},
{
"epoch": 1.6782222222222223,
"grad_norm": 1.8741655887113169,
"learning_rate": 4.8345090159169015e-06,
"loss": 0.2182,
"step": 4720
},
{
"epoch": 1.6817777777777778,
"grad_norm": 2.0577120951961057,
"learning_rate": 4.813831676016301e-06,
"loss": 0.2323,
"step": 4730
},
{
"epoch": 1.6853333333333333,
"grad_norm": 1.6887655358314864,
"learning_rate": 4.793157523926343e-06,
"loss": 0.2236,
"step": 4740
},
{
"epoch": 1.6888888888888889,
"grad_norm": 1.669624887759933,
"learning_rate": 4.772486913656126e-06,
"loss": 0.216,
"step": 4750
},
{
"epoch": 1.6924444444444444,
"grad_norm": 1.3957590014036165,
"learning_rate": 4.751820199154116e-06,
"loss": 0.2104,
"step": 4760
},
{
"epoch": 1.696,
"grad_norm": 1.7601085948001791,
"learning_rate": 4.731157734302063e-06,
"loss": 0.2255,
"step": 4770
},
{
"epoch": 1.6995555555555555,
"grad_norm": 1.4141936030167341,
"learning_rate": 4.7104998729089456e-06,
"loss": 0.2216,
"step": 4780
},
{
"epoch": 1.7031111111111112,
"grad_norm": 1.5375991664201998,
"learning_rate": 4.689846968704921e-06,
"loss": 0.2316,
"step": 4790
},
{
"epoch": 1.7066666666666666,
"grad_norm": 1.835379245628528,
"learning_rate": 4.669199375335267e-06,
"loss": 0.2211,
"step": 4800
},
{
"epoch": 1.7102222222222223,
"grad_norm": 1.8813507703109071,
"learning_rate": 4.64855744635431e-06,
"loss": 0.2279,
"step": 4810
},
{
"epoch": 1.7137777777777776,
"grad_norm": 1.6192801344534893,
"learning_rate": 4.627921535219398e-06,
"loss": 0.2076,
"step": 4820
},
{
"epoch": 1.7173333333333334,
"grad_norm": 1.5047363033780152,
"learning_rate": 4.607291995284824e-06,
"loss": 0.2272,
"step": 4830
},
{
"epoch": 1.720888888888889,
"grad_norm": 1.7489501841705488,
"learning_rate": 4.586669179795789e-06,
"loss": 0.2269,
"step": 4840
},
{
"epoch": 1.7244444444444444,
"grad_norm": 1.5125229649844467,
"learning_rate": 4.566053441882346e-06,
"loss": 0.2187,
"step": 4850
},
{
"epoch": 1.728,
"grad_norm": 1.456492370626904,
"learning_rate": 4.545445134553365e-06,
"loss": 0.2179,
"step": 4860
},
{
"epoch": 1.7315555555555555,
"grad_norm": 1.620452560710039,
"learning_rate": 4.52484461069047e-06,
"loss": 0.2262,
"step": 4870
},
{
"epoch": 1.7351111111111113,
"grad_norm": 2.0083784630353887,
"learning_rate": 4.504252223042015e-06,
"loss": 0.2363,
"step": 4880
},
{
"epoch": 1.7386666666666666,
"grad_norm": 1.4284347298197593,
"learning_rate": 4.4836683242170274e-06,
"loss": 0.2297,
"step": 4890
},
{
"epoch": 1.7422222222222223,
"grad_norm": 1.4968259463132965,
"learning_rate": 4.463093266679185e-06,
"loss": 0.2223,
"step": 4900
},
{
"epoch": 1.7457777777777777,
"grad_norm": 1.625381108991568,
"learning_rate": 4.442527402740773e-06,
"loss": 0.2177,
"step": 4910
},
{
"epoch": 1.7493333333333334,
"grad_norm": 1.7761034776967624,
"learning_rate": 4.4219710845566445e-06,
"loss": 0.2266,
"step": 4920
},
{
"epoch": 1.752888888888889,
"grad_norm": 1.513194923019174,
"learning_rate": 4.401424664118209e-06,
"loss": 0.2385,
"step": 4930
},
{
"epoch": 1.7564444444444445,
"grad_norm": 1.6662188116169265,
"learning_rate": 4.380888493247389e-06,
"loss": 0.2209,
"step": 4940
},
{
"epoch": 1.76,
"grad_norm": 1.7192566216460916,
"learning_rate": 4.360362923590599e-06,
"loss": 0.2273,
"step": 4950
},
{
"epoch": 1.7635555555555555,
"grad_norm": 1.6376141309754375,
"learning_rate": 4.339848306612726e-06,
"loss": 0.2263,
"step": 4960
},
{
"epoch": 1.767111111111111,
"grad_norm": 1.5441961811580323,
"learning_rate": 4.319344993591122e-06,
"loss": 0.2317,
"step": 4970
},
{
"epoch": 1.7706666666666666,
"grad_norm": 1.8214320335618939,
"learning_rate": 4.298853335609558e-06,
"loss": 0.2352,
"step": 4980
},
{
"epoch": 1.7742222222222224,
"grad_norm": 1.56553607416482,
"learning_rate": 4.278373683552252e-06,
"loss": 0.2451,
"step": 4990
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.3995626238477137,
"learning_rate": 4.257906388097833e-06,
"loss": 0.2119,
"step": 5000
},
{
"epoch": 1.7777777777777777,
"eval_loss": 0.2164340764284134,
"eval_runtime": 560.6747,
"eval_samples_per_second": 17.836,
"eval_steps_per_second": 4.459,
"step": 5000
},
{
"epoch": 1.7813333333333334,
"grad_norm": 2.040538040793932,
"learning_rate": 4.237451799713343e-06,
"loss": 0.2311,
"step": 5010
},
{
"epoch": 1.7848888888888887,
"grad_norm": 1.718359867250397,
"learning_rate": 4.2170102686482386e-06,
"loss": 0.2308,
"step": 5020
},
{
"epoch": 1.7884444444444445,
"grad_norm": 1.647498620915099,
"learning_rate": 4.196582144928398e-06,
"loss": 0.2343,
"step": 5030
},
{
"epoch": 1.792,
"grad_norm": 1.529219174043635,
"learning_rate": 4.176167778350111e-06,
"loss": 0.2471,
"step": 5040
},
{
"epoch": 1.7955555555555556,
"grad_norm": 1.8299602144032394,
"learning_rate": 4.155767518474112e-06,
"loss": 0.2334,
"step": 5050
},
{
"epoch": 1.799111111111111,
"grad_norm": 1.6343462536475093,
"learning_rate": 4.135381714619572e-06,
"loss": 0.2352,
"step": 5060
},
{
"epoch": 1.8026666666666666,
"grad_norm": 1.9294723624845498,
"learning_rate": 4.115010715858135e-06,
"loss": 0.2295,
"step": 5070
},
{
"epoch": 1.8062222222222222,
"grad_norm": 1.8402038191366281,
"learning_rate": 4.09465487100793e-06,
"loss": 0.2227,
"step": 5080
},
{
"epoch": 1.8097777777777777,
"grad_norm": 1.8931304584295443,
"learning_rate": 4.074314528627602e-06,
"loss": 0.2355,
"step": 5090
},
{
"epoch": 1.8133333333333335,
"grad_norm": 1.8206151546804537,
"learning_rate": 4.053990037010342e-06,
"loss": 0.2323,
"step": 5100
},
{
"epoch": 1.8168888888888888,
"grad_norm": 1.5473952396079231,
"learning_rate": 4.033681744177929e-06,
"loss": 0.2069,
"step": 5110
},
{
"epoch": 1.8204444444444445,
"grad_norm": 1.2199743932660083,
"learning_rate": 4.013389997874759e-06,
"loss": 0.2076,
"step": 5120
},
{
"epoch": 1.8239999999999998,
"grad_norm": 1.7825722106285342,
"learning_rate": 3.993115145561902e-06,
"loss": 0.2425,
"step": 5130
},
{
"epoch": 1.8275555555555556,
"grad_norm": 1.8303008392916014,
"learning_rate": 3.9728575344111456e-06,
"loss": 0.234,
"step": 5140
},
{
"epoch": 1.8311111111111111,
"grad_norm": 1.2964915164879398,
"learning_rate": 3.9526175112990515e-06,
"loss": 0.1987,
"step": 5150
},
{
"epoch": 1.8346666666666667,
"grad_norm": 1.5700753166440498,
"learning_rate": 3.93239542280102e-06,
"loss": 0.2137,
"step": 5160
},
{
"epoch": 1.8382222222222222,
"grad_norm": 1.6406760092620998,
"learning_rate": 3.912191615185349e-06,
"loss": 0.2235,
"step": 5170
},
{
"epoch": 1.8417777777777777,
"grad_norm": 1.5447905159493263,
"learning_rate": 3.892006434407309e-06,
"loss": 0.2218,
"step": 5180
},
{
"epoch": 1.8453333333333335,
"grad_norm": 1.7383544264235498,
"learning_rate": 3.871840226103219e-06,
"loss": 0.2287,
"step": 5190
},
{
"epoch": 1.8488888888888888,
"grad_norm": 1.9317016214891507,
"learning_rate": 3.851693335584525e-06,
"loss": 0.2228,
"step": 5200
},
{
"epoch": 1.8524444444444446,
"grad_norm": 1.5692018080933492,
"learning_rate": 3.831566107831889e-06,
"loss": 0.2331,
"step": 5210
},
{
"epoch": 1.8559999999999999,
"grad_norm": 2.050378660719503,
"learning_rate": 3.8114588874892893e-06,
"loss": 0.2137,
"step": 5220
},
{
"epoch": 1.8595555555555556,
"grad_norm": 1.5271617708228957,
"learning_rate": 3.791372018858099e-06,
"loss": 0.2135,
"step": 5230
},
{
"epoch": 1.8631111111111112,
"grad_norm": 1.31763541419423,
"learning_rate": 3.7713058458912164e-06,
"loss": 0.2217,
"step": 5240
},
{
"epoch": 1.8666666666666667,
"grad_norm": 1.6488724873659462,
"learning_rate": 3.751260712187156e-06,
"loss": 0.2539,
"step": 5250
},
{
"epoch": 1.8702222222222222,
"grad_norm": 1.392136229173735,
"learning_rate": 3.731236960984169e-06,
"loss": 0.2179,
"step": 5260
},
{
"epoch": 1.8737777777777778,
"grad_norm": 1.6189512718112575,
"learning_rate": 3.711234935154372e-06,
"loss": 0.2183,
"step": 5270
},
{
"epoch": 1.8773333333333333,
"grad_norm": 1.5548818693905742,
"learning_rate": 3.6912549771978747e-06,
"loss": 0.2354,
"step": 5280
},
{
"epoch": 1.8808888888888888,
"grad_norm": 1.4728328055912387,
"learning_rate": 3.6712974292369035e-06,
"loss": 0.2268,
"step": 5290
},
{
"epoch": 1.8844444444444446,
"grad_norm": 1.5435161738551857,
"learning_rate": 3.651362633009962e-06,
"loss": 0.204,
"step": 5300
},
{
"epoch": 1.888,
"grad_norm": 1.5873129086509827,
"learning_rate": 3.6314509298659663e-06,
"loss": 0.208,
"step": 5310
},
{
"epoch": 1.8915555555555557,
"grad_norm": 1.3391876728975607,
"learning_rate": 3.6115626607584e-06,
"loss": 0.2372,
"step": 5320
},
{
"epoch": 1.895111111111111,
"grad_norm": 1.88178920211116,
"learning_rate": 3.5916981662394856e-06,
"loss": 0.2257,
"step": 5330
},
{
"epoch": 1.8986666666666667,
"grad_norm": 1.764120901512499,
"learning_rate": 3.5718577864543396e-06,
"loss": 0.2103,
"step": 5340
},
{
"epoch": 1.9022222222222223,
"grad_norm": 1.6698875487111986,
"learning_rate": 3.552041861135161e-06,
"loss": 0.211,
"step": 5350
},
{
"epoch": 1.9057777777777778,
"grad_norm": 1.6957349016200651,
"learning_rate": 3.532250729595408e-06,
"loss": 0.2164,
"step": 5360
},
{
"epoch": 1.9093333333333333,
"grad_norm": 1.5603565111247202,
"learning_rate": 3.5124847307239863e-06,
"loss": 0.2265,
"step": 5370
},
{
"epoch": 1.9128888888888889,
"grad_norm": 1.5529468285695374,
"learning_rate": 3.4927442029794467e-06,
"loss": 0.2316,
"step": 5380
},
{
"epoch": 1.9164444444444444,
"grad_norm": 1.7677530671686799,
"learning_rate": 3.473029484384196e-06,
"loss": 0.219,
"step": 5390
},
{
"epoch": 1.92,
"grad_norm": 1.9782571884316444,
"learning_rate": 3.4533409125186974e-06,
"loss": 0.2252,
"step": 5400
},
{
"epoch": 1.9235555555555557,
"grad_norm": 1.7371605678560165,
"learning_rate": 3.4336788245157026e-06,
"loss": 0.2222,
"step": 5410
},
{
"epoch": 1.927111111111111,
"grad_norm": 1.7241089696999294,
"learning_rate": 3.4140435570544708e-06,
"loss": 0.2345,
"step": 5420
},
{
"epoch": 1.9306666666666668,
"grad_norm": 1.7019802310043695,
"learning_rate": 3.3944354463550035e-06,
"loss": 0.214,
"step": 5430
},
{
"epoch": 1.934222222222222,
"grad_norm": 1.8394276850187319,
"learning_rate": 3.374854828172292e-06,
"loss": 0.234,
"step": 5440
},
{
"epoch": 1.9377777777777778,
"grad_norm": 1.7264682966489493,
"learning_rate": 3.3553020377905663e-06,
"loss": 0.2242,
"step": 5450
},
{
"epoch": 1.9413333333333334,
"grad_norm": 1.6744044298365783,
"learning_rate": 3.3357774100175513e-06,
"loss": 0.2245,
"step": 5460
},
{
"epoch": 1.944888888888889,
"grad_norm": 1.4991747809315612,
"learning_rate": 3.316281279178737e-06,
"loss": 0.2114,
"step": 5470
},
{
"epoch": 1.9484444444444444,
"grad_norm": 1.5141154002091217,
"learning_rate": 3.296813979111655e-06,
"loss": 0.2182,
"step": 5480
},
{
"epoch": 1.952,
"grad_norm": 1.7580533484108005,
"learning_rate": 3.2773758431601543e-06,
"loss": 0.2234,
"step": 5490
},
{
"epoch": 1.9555555555555557,
"grad_norm": 1.6014365241780455,
"learning_rate": 3.257967204168705e-06,
"loss": 0.238,
"step": 5500
},
{
"epoch": 1.9555555555555557,
"eval_loss": 0.21176277101039886,
"eval_runtime": 560.9255,
"eval_samples_per_second": 17.828,
"eval_steps_per_second": 4.457,
"step": 5500
},
{
"epoch": 1.959111111111111,
"grad_norm": 1.566927102750067,
"learning_rate": 3.2385883944766867e-06,
"loss": 0.1932,
"step": 5510
},
{
"epoch": 1.9626666666666668,
"grad_norm": 1.7041733469332605,
"learning_rate": 3.2192397459127077e-06,
"loss": 0.2194,
"step": 5520
},
{
"epoch": 1.966222222222222,
"grad_norm": 1.7846179835205314,
"learning_rate": 3.199921589788923e-06,
"loss": 0.2092,
"step": 5530
},
{
"epoch": 1.9697777777777778,
"grad_norm": 1.482707355318634,
"learning_rate": 3.180634256895345e-06,
"loss": 0.2328,
"step": 5540
},
{
"epoch": 1.9733333333333334,
"grad_norm": 1.6559180099205715,
"learning_rate": 3.161378077494205e-06,
"loss": 0.234,
"step": 5550
},
{
"epoch": 1.976888888888889,
"grad_norm": 1.4931797613124567,
"learning_rate": 3.142153381314278e-06,
"loss": 0.2285,
"step": 5560
},
{
"epoch": 1.9804444444444445,
"grad_norm": 1.6899228150340497,
"learning_rate": 3.122960497545242e-06,
"loss": 0.2347,
"step": 5570
},
{
"epoch": 1.984,
"grad_norm": 1.6112817535514066,
"learning_rate": 3.103799754832045e-06,
"loss": 0.2017,
"step": 5580
},
{
"epoch": 1.9875555555555555,
"grad_norm": 1.4492842053913877,
"learning_rate": 3.0846714812692774e-06,
"loss": 0.2282,
"step": 5590
},
{
"epoch": 1.991111111111111,
"grad_norm": 1.6227303784789882,
"learning_rate": 3.065576004395546e-06,
"loss": 0.2193,
"step": 5600
},
{
"epoch": 1.9946666666666668,
"grad_norm": 1.6532339878737676,
"learning_rate": 3.046513651187874e-06,
"loss": 0.205,
"step": 5610
},
{
"epoch": 1.9982222222222221,
"grad_norm": 1.726150455488493,
"learning_rate": 3.027484748056101e-06,
"loss": 0.2052,
"step": 5620
},
{
"epoch": 2.001777777777778,
"grad_norm": 1.2491575364238943,
"learning_rate": 3.008489620837287e-06,
"loss": 0.1793,
"step": 5630
},
{
"epoch": 2.005333333333333,
"grad_norm": 1.539466703681713,
"learning_rate": 2.989528594790142e-06,
"loss": 0.133,
"step": 5640
},
{
"epoch": 2.008888888888889,
"grad_norm": 1.5201921987042595,
"learning_rate": 2.97060199458945e-06,
"loss": 0.1364,
"step": 5650
},
{
"epoch": 2.0124444444444443,
"grad_norm": 1.8387836805686166,
"learning_rate": 2.9517101443205143e-06,
"loss": 0.138,
"step": 5660
},
{
"epoch": 2.016,
"grad_norm": 1.6624452979538558,
"learning_rate": 2.9328533674736043e-06,
"loss": 0.1372,
"step": 5670
},
{
"epoch": 2.0195555555555558,
"grad_norm": 2.0375067274701464,
"learning_rate": 2.914031986938417e-06,
"loss": 0.1376,
"step": 5680
},
{
"epoch": 2.023111111111111,
"grad_norm": 1.5020388133691598,
"learning_rate": 2.895246324998549e-06,
"loss": 0.132,
"step": 5690
},
{
"epoch": 2.026666666666667,
"grad_norm": 1.5200304354769367,
"learning_rate": 2.8764967033259793e-06,
"loss": 0.1332,
"step": 5700
},
{
"epoch": 2.030222222222222,
"grad_norm": 1.615938242121572,
"learning_rate": 2.8577834429755586e-06,
"loss": 0.137,
"step": 5710
},
{
"epoch": 2.033777777777778,
"grad_norm": 1.7244206202588588,
"learning_rate": 2.839106864379512e-06,
"loss": 0.1311,
"step": 5720
},
{
"epoch": 2.037333333333333,
"grad_norm": 1.4204204890159835,
"learning_rate": 2.8204672873419565e-06,
"loss": 0.1359,
"step": 5730
},
{
"epoch": 2.040888888888889,
"grad_norm": 1.641810724006462,
"learning_rate": 2.8018650310334118e-06,
"loss": 0.1524,
"step": 5740
},
{
"epoch": 2.0444444444444443,
"grad_norm": 1.6197231294728873,
"learning_rate": 2.783300413985359e-06,
"loss": 0.1216,
"step": 5750
},
{
"epoch": 2.048,
"grad_norm": 1.7166152973793496,
"learning_rate": 2.764773754084763e-06,
"loss": 0.1393,
"step": 5760
},
{
"epoch": 2.0515555555555554,
"grad_norm": 1.7305108784705923,
"learning_rate": 2.7462853685686362e-06,
"loss": 0.1429,
"step": 5770
},
{
"epoch": 2.055111111111111,
"grad_norm": 1.2910967057789844,
"learning_rate": 2.7278355740186123e-06,
"loss": 0.1336,
"step": 5780
},
{
"epoch": 2.058666666666667,
"grad_norm": 1.5080611405633613,
"learning_rate": 2.7094246863555262e-06,
"loss": 0.1359,
"step": 5790
},
{
"epoch": 2.062222222222222,
"grad_norm": 1.8733744454525603,
"learning_rate": 2.691053020833988e-06,
"loss": 0.1388,
"step": 5800
},
{
"epoch": 2.065777777777778,
"grad_norm": 1.7085324740063759,
"learning_rate": 2.6727208920370063e-06,
"loss": 0.1355,
"step": 5810
},
{
"epoch": 2.0693333333333332,
"grad_norm": 1.5576784710780245,
"learning_rate": 2.6544286138705867e-06,
"loss": 0.1328,
"step": 5820
},
{
"epoch": 2.072888888888889,
"grad_norm": 1.9703710936721526,
"learning_rate": 2.636176499558364e-06,
"loss": 0.1354,
"step": 5830
},
{
"epoch": 2.0764444444444443,
"grad_norm": 1.5952203119705437,
"learning_rate": 2.6179648616362374e-06,
"loss": 0.1493,
"step": 5840
},
{
"epoch": 2.08,
"grad_norm": 1.9073156525645674,
"learning_rate": 2.599794011947012e-06,
"loss": 0.1579,
"step": 5850
},
{
"epoch": 2.0835555555555554,
"grad_norm": 1.7695748236621889,
"learning_rate": 2.581664261635069e-06,
"loss": 0.1446,
"step": 5860
},
{
"epoch": 2.087111111111111,
"grad_norm": 1.8880183020861152,
"learning_rate": 2.5635759211410396e-06,
"loss": 0.1406,
"step": 5870
},
{
"epoch": 2.0906666666666665,
"grad_norm": 1.5198269240530051,
"learning_rate": 2.545529300196472e-06,
"loss": 0.1244,
"step": 5880
},
{
"epoch": 2.094222222222222,
"grad_norm": 1.9355343365767825,
"learning_rate": 2.527524707818547e-06,
"loss": 0.1289,
"step": 5890
},
{
"epoch": 2.097777777777778,
"grad_norm": 1.546102626213903,
"learning_rate": 2.5095624523047775e-06,
"loss": 0.1151,
"step": 5900
},
{
"epoch": 2.1013333333333333,
"grad_norm": 1.3237810299249595,
"learning_rate": 2.491642841227729e-06,
"loss": 0.1386,
"step": 5910
},
{
"epoch": 2.104888888888889,
"grad_norm": 1.6354432410587478,
"learning_rate": 2.4737661814297557e-06,
"loss": 0.1152,
"step": 5920
},
{
"epoch": 2.1084444444444443,
"grad_norm": 1.7641939157921844,
"learning_rate": 2.455932779017747e-06,
"loss": 0.1267,
"step": 5930
},
{
"epoch": 2.112,
"grad_norm": 1.7717956617877848,
"learning_rate": 2.438142939357882e-06,
"loss": 0.1468,
"step": 5940
},
{
"epoch": 2.1155555555555554,
"grad_norm": 1.9248857260031529,
"learning_rate": 2.4203969670704065e-06,
"loss": 0.1426,
"step": 5950
},
{
"epoch": 2.119111111111111,
"grad_norm": 1.6693083011986807,
"learning_rate": 2.4026951660244063e-06,
"loss": 0.1519,
"step": 5960
},
{
"epoch": 2.1226666666666665,
"grad_norm": 1.4577868069815147,
"learning_rate": 2.385037839332616e-06,
"loss": 0.1449,
"step": 5970
},
{
"epoch": 2.1262222222222222,
"grad_norm": 1.5757247401728414,
"learning_rate": 2.3674252893462304e-06,
"loss": 0.1508,
"step": 5980
},
{
"epoch": 2.129777777777778,
"grad_norm": 1.798414953668795,
"learning_rate": 2.3498578176497055e-06,
"loss": 0.1336,
"step": 5990
},
{
"epoch": 2.1333333333333333,
"grad_norm": 1.3502333712237125,
"learning_rate": 2.3323357250556213e-06,
"loss": 0.1289,
"step": 6000
},
{
"epoch": 2.1333333333333333,
"eval_loss": 0.24109843373298645,
"eval_runtime": 561.0318,
"eval_samples_per_second": 17.824,
"eval_steps_per_second": 4.456,
"step": 6000
},
{
"epoch": 2.136888888888889,
"grad_norm": 1.6807098639484461,
"learning_rate": 2.3148593115995155e-06,
"loss": 0.1232,
"step": 6010
},
{
"epoch": 2.1404444444444444,
"grad_norm": 1.3750693562838343,
"learning_rate": 2.2974288765347484e-06,
"loss": 0.1406,
"step": 6020
},
{
"epoch": 2.144,
"grad_norm": 1.7740210796916787,
"learning_rate": 2.280044718327383e-06,
"loss": 0.1366,
"step": 6030
},
{
"epoch": 2.1475555555555554,
"grad_norm": 1.3613431283259703,
"learning_rate": 2.262707134651069e-06,
"loss": 0.1347,
"step": 6040
},
{
"epoch": 2.151111111111111,
"grad_norm": 1.5001232721911446,
"learning_rate": 2.2454164223819443e-06,
"loss": 0.1435,
"step": 6050
},
{
"epoch": 2.1546666666666665,
"grad_norm": 1.6096086307058128,
"learning_rate": 2.228172877593563e-06,
"loss": 0.1248,
"step": 6060
},
{
"epoch": 2.1582222222222223,
"grad_norm": 1.4625689431665512,
"learning_rate": 2.2109767955518135e-06,
"loss": 0.129,
"step": 6070
},
{
"epoch": 2.1617777777777776,
"grad_norm": 1.7396993983427422,
"learning_rate": 2.193828470709863e-06,
"loss": 0.1259,
"step": 6080
},
{
"epoch": 2.1653333333333333,
"grad_norm": 1.4423513554123952,
"learning_rate": 2.176728196703122e-06,
"loss": 0.1308,
"step": 6090
},
{
"epoch": 2.168888888888889,
"grad_norm": 1.9920936118384482,
"learning_rate": 2.159676266344222e-06,
"loss": 0.1496,
"step": 6100
},
{
"epoch": 2.1724444444444444,
"grad_norm": 2.13727569719491,
"learning_rate": 2.142672971617978e-06,
"loss": 0.1359,
"step": 6110
},
{
"epoch": 2.176,
"grad_norm": 1.5724700258419562,
"learning_rate": 2.125718603676413e-06,
"loss": 0.1412,
"step": 6120
},
{
"epoch": 2.1795555555555555,
"grad_norm": 1.3817720285663424,
"learning_rate": 2.1088134528337635e-06,
"loss": 0.1357,
"step": 6130
},
{
"epoch": 2.1831111111111112,
"grad_norm": 1.6852270201894561,
"learning_rate": 2.091957808561505e-06,
"loss": 0.1388,
"step": 6140
},
{
"epoch": 2.1866666666666665,
"grad_norm": 1.5752301082061768,
"learning_rate": 2.0751519594834025e-06,
"loss": 0.1359,
"step": 6150
},
{
"epoch": 2.1902222222222223,
"grad_norm": 1.9588237176858065,
"learning_rate": 2.058396193370556e-06,
"loss": 0.1364,
"step": 6160
},
{
"epoch": 2.1937777777777776,
"grad_norm": 1.5906028620881005,
"learning_rate": 2.0416907971364937e-06,
"loss": 0.1286,
"step": 6170
},
{
"epoch": 2.1973333333333334,
"grad_norm": 1.6040127033831966,
"learning_rate": 2.0250360568322395e-06,
"loss": 0.132,
"step": 6180
},
{
"epoch": 2.2008888888888887,
"grad_norm": 1.903945940065679,
"learning_rate": 2.0084322576414205e-06,
"loss": 0.1311,
"step": 6190
},
{
"epoch": 2.2044444444444444,
"grad_norm": 1.7327408494603853,
"learning_rate": 1.991879683875386e-06,
"loss": 0.1412,
"step": 6200
},
{
"epoch": 2.208,
"grad_norm": 1.6938104353348038,
"learning_rate": 1.975378618968348e-06,
"loss": 0.1358,
"step": 6210
},
{
"epoch": 2.2115555555555555,
"grad_norm": 1.498102728760879,
"learning_rate": 1.958929345472503e-06,
"loss": 0.1272,
"step": 6220
},
{
"epoch": 2.2151111111111113,
"grad_norm": 1.5061713395545921,
"learning_rate": 1.942532145053219e-06,
"loss": 0.1335,
"step": 6230
},
{
"epoch": 2.2186666666666666,
"grad_norm": 1.8881968807558394,
"learning_rate": 1.926187298484201e-06,
"loss": 0.13,
"step": 6240
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.7409457044279315,
"learning_rate": 1.9098950856426845e-06,
"loss": 0.1197,
"step": 6250
},
{
"epoch": 2.2257777777777776,
"grad_norm": 1.7410736866607524,
"learning_rate": 1.893655785504644e-06,
"loss": 0.136,
"step": 6260
},
{
"epoch": 2.2293333333333334,
"grad_norm": 1.4673795329307866,
"learning_rate": 1.8774696761400107e-06,
"loss": 0.1351,
"step": 6270
},
{
"epoch": 2.2328888888888887,
"grad_norm": 1.4286935284704283,
"learning_rate": 1.8613370347079207e-06,
"loss": 0.1316,
"step": 6280
},
{
"epoch": 2.2364444444444445,
"grad_norm": 1.6752679462634348,
"learning_rate": 1.845258137451968e-06,
"loss": 0.1343,
"step": 6290
},
{
"epoch": 2.24,
"grad_norm": 1.5334658674891999,
"learning_rate": 1.8292332596954605e-06,
"loss": 0.1252,
"step": 6300
},
{
"epoch": 2.2435555555555555,
"grad_norm": 1.7816021858972186,
"learning_rate": 1.8132626758367217e-06,
"loss": 0.1373,
"step": 6310
},
{
"epoch": 2.2471111111111113,
"grad_norm": 1.4751058571451898,
"learning_rate": 1.7973466593443861e-06,
"loss": 0.1238,
"step": 6320
},
{
"epoch": 2.2506666666666666,
"grad_norm": 1.5737118263350949,
"learning_rate": 1.7814854827527144e-06,
"loss": 0.1331,
"step": 6330
},
{
"epoch": 2.2542222222222223,
"grad_norm": 1.6723085510766795,
"learning_rate": 1.7656794176569302e-06,
"loss": 0.1392,
"step": 6340
},
{
"epoch": 2.2577777777777777,
"grad_norm": 1.6074614963797307,
"learning_rate": 1.749928734708568e-06,
"loss": 0.1482,
"step": 6350
},
{
"epoch": 2.2613333333333334,
"grad_norm": 1.514935517928495,
"learning_rate": 1.734233703610838e-06,
"loss": 0.1318,
"step": 6360
},
{
"epoch": 2.2648888888888887,
"grad_norm": 2.1990045539686767,
"learning_rate": 1.7185945931140086e-06,
"loss": 0.1389,
"step": 6370
},
{
"epoch": 2.2684444444444445,
"grad_norm": 1.7900402567821287,
"learning_rate": 1.7030116710108068e-06,
"loss": 0.1402,
"step": 6380
},
{
"epoch": 2.2720000000000002,
"grad_norm": 1.5936415333953513,
"learning_rate": 1.6874852041318246e-06,
"loss": 0.1383,
"step": 6390
},
{
"epoch": 2.2755555555555556,
"grad_norm": 1.6874167667097502,
"learning_rate": 1.6720154583409642e-06,
"loss": 0.1297,
"step": 6400
},
{
"epoch": 2.279111111111111,
"grad_norm": 1.7461565673164665,
"learning_rate": 1.6566026985308737e-06,
"loss": 0.1265,
"step": 6410
},
{
"epoch": 2.2826666666666666,
"grad_norm": 1.9943666083505533,
"learning_rate": 1.6412471886184106e-06,
"loss": 0.1433,
"step": 6420
},
{
"epoch": 2.2862222222222224,
"grad_norm": 1.889269033390485,
"learning_rate": 1.6259491915401322e-06,
"loss": 0.1295,
"step": 6430
},
{
"epoch": 2.2897777777777777,
"grad_norm": 1.9954192603921324,
"learning_rate": 1.6107089692477856e-06,
"loss": 0.1506,
"step": 6440
},
{
"epoch": 2.2933333333333334,
"grad_norm": 1.73943513110269,
"learning_rate": 1.5955267827038267e-06,
"loss": 0.1309,
"step": 6450
},
{
"epoch": 2.2968888888888888,
"grad_norm": 1.5696215992092173,
"learning_rate": 1.5804028918769488e-06,
"loss": 0.1245,
"step": 6460
},
{
"epoch": 2.3004444444444445,
"grad_norm": 1.4480211516999386,
"learning_rate": 1.5653375557376266e-06,
"loss": 0.1419,
"step": 6470
},
{
"epoch": 2.304,
"grad_norm": 1.7769598112511977,
"learning_rate": 1.5503310322536962e-06,
"loss": 0.1357,
"step": 6480
},
{
"epoch": 2.3075555555555556,
"grad_norm": 1.6914490635403432,
"learning_rate": 1.5353835783859244e-06,
"loss": 0.1344,
"step": 6490
},
{
"epoch": 2.311111111111111,
"grad_norm": 1.2896364219654397,
"learning_rate": 1.5204954500836095e-06,
"loss": 0.1336,
"step": 6500
},
{
"epoch": 2.311111111111111,
"eval_loss": 0.2400493621826172,
"eval_runtime": 562.3512,
"eval_samples_per_second": 17.782,
"eval_steps_per_second": 4.446,
"step": 6500
},
{
"epoch": 2.3146666666666667,
"grad_norm": 1.6249516275302234,
"learning_rate": 1.5056669022802051e-06,
"loss": 0.1578,
"step": 6510
},
{
"epoch": 2.3182222222222224,
"grad_norm": 1.5534728727358678,
"learning_rate": 1.4908981888889562e-06,
"loss": 0.1236,
"step": 6520
},
{
"epoch": 2.3217777777777777,
"grad_norm": 2.305594450780404,
"learning_rate": 1.4761895627985384e-06,
"loss": 0.1437,
"step": 6530
},
{
"epoch": 2.3253333333333335,
"grad_norm": 1.7525804358624415,
"learning_rate": 1.461541275868742e-06,
"loss": 0.1244,
"step": 6540
},
{
"epoch": 2.328888888888889,
"grad_norm": 1.5857723879215653,
"learning_rate": 1.4469535789261518e-06,
"loss": 0.138,
"step": 6550
},
{
"epoch": 2.3324444444444445,
"grad_norm": 1.4470785666281207,
"learning_rate": 1.4324267217598543e-06,
"loss": 0.1311,
"step": 6560
},
{
"epoch": 2.336,
"grad_norm": 1.5783013529079604,
"learning_rate": 1.41796095311716e-06,
"loss": 0.1476,
"step": 6570
},
{
"epoch": 2.3395555555555556,
"grad_norm": 1.792387189040966,
"learning_rate": 1.4035565206993407e-06,
"loss": 0.1313,
"step": 6580
},
{
"epoch": 2.343111111111111,
"grad_norm": 2.0097219507066986,
"learning_rate": 1.3892136711573983e-06,
"loss": 0.1481,
"step": 6590
},
{
"epoch": 2.3466666666666667,
"grad_norm": 1.6038575587094324,
"learning_rate": 1.3749326500878308e-06,
"loss": 0.1329,
"step": 6600
},
{
"epoch": 2.3502222222222224,
"grad_norm": 1.8038941533229218,
"learning_rate": 1.3607137020284267e-06,
"loss": 0.1296,
"step": 6610
},
{
"epoch": 2.3537777777777777,
"grad_norm": 1.5967517903597408,
"learning_rate": 1.3465570704540877e-06,
"loss": 0.1323,
"step": 6620
},
{
"epoch": 2.3573333333333335,
"grad_norm": 1.6630671725280828,
"learning_rate": 1.33246299777265e-06,
"loss": 0.1353,
"step": 6630
},
{
"epoch": 2.360888888888889,
"grad_norm": 1.6910996186336409,
"learning_rate": 1.3184317253207379e-06,
"loss": 0.1198,
"step": 6640
},
{
"epoch": 2.3644444444444446,
"grad_norm": 1.667550829249205,
"learning_rate": 1.3044634933596311e-06,
"loss": 0.1398,
"step": 6650
},
{
"epoch": 2.368,
"grad_norm": 1.3604264834299673,
"learning_rate": 1.290558541071148e-06,
"loss": 0.123,
"step": 6660
},
{
"epoch": 2.3715555555555556,
"grad_norm": 1.4966865021721736,
"learning_rate": 1.2767171065535538e-06,
"loss": 0.1221,
"step": 6670
},
{
"epoch": 2.375111111111111,
"grad_norm": 1.3751769981745194,
"learning_rate": 1.2629394268174811e-06,
"loss": 0.1398,
"step": 6680
},
{
"epoch": 2.3786666666666667,
"grad_norm": 1.7552964254373993,
"learning_rate": 1.2492257377818734e-06,
"loss": 0.122,
"step": 6690
},
{
"epoch": 2.3822222222222225,
"grad_norm": 1.984424873865648,
"learning_rate": 1.235576274269938e-06,
"loss": 0.1366,
"step": 6700
},
{
"epoch": 2.3857777777777778,
"grad_norm": 1.8024296643627178,
"learning_rate": 1.2219912700051417e-06,
"loss": 0.1304,
"step": 6710
},
{
"epoch": 2.389333333333333,
"grad_norm": 1.6704237658027163,
"learning_rate": 1.2084709576071885e-06,
"loss": 0.1339,
"step": 6720
},
{
"epoch": 2.392888888888889,
"grad_norm": 1.8905223292433262,
"learning_rate": 1.1950155685880504e-06,
"loss": 0.138,
"step": 6730
},
{
"epoch": 2.3964444444444446,
"grad_norm": 1.8585326052998994,
"learning_rate": 1.1816253333479994e-06,
"loss": 0.1402,
"step": 6740
},
{
"epoch": 2.4,
"grad_norm": 1.4117751565900303,
"learning_rate": 1.1683004811716597e-06,
"loss": 0.1219,
"step": 6750
},
{
"epoch": 2.4035555555555557,
"grad_norm": 2.177441304004068,
"learning_rate": 1.1550412402240852e-06,
"loss": 0.1472,
"step": 6760
},
{
"epoch": 2.407111111111111,
"grad_norm": 1.7312870442889088,
"learning_rate": 1.1418478375468496e-06,
"loss": 0.14,
"step": 6770
},
{
"epoch": 2.4106666666666667,
"grad_norm": 1.4691171208612808,
"learning_rate": 1.1287204990541612e-06,
"loss": 0.1382,
"step": 6780
},
{
"epoch": 2.414222222222222,
"grad_norm": 1.9102821919207582,
"learning_rate": 1.1156594495289923e-06,
"loss": 0.1508,
"step": 6790
},
{
"epoch": 2.417777777777778,
"grad_norm": 1.5765296328104144,
"learning_rate": 1.1026649126192334e-06,
"loss": 0.1244,
"step": 6800
},
{
"epoch": 2.421333333333333,
"grad_norm": 1.485558878346715,
"learning_rate": 1.0897371108338572e-06,
"loss": 0.1262,
"step": 6810
},
{
"epoch": 2.424888888888889,
"grad_norm": 1.6805947418795415,
"learning_rate": 1.076876265539115e-06,
"loss": 0.1397,
"step": 6820
},
{
"epoch": 2.4284444444444446,
"grad_norm": 1.8439671145791727,
"learning_rate": 1.0640825969547498e-06,
"loss": 0.1298,
"step": 6830
},
{
"epoch": 2.432,
"grad_norm": 1.8675356289498493,
"learning_rate": 1.051356324150209e-06,
"loss": 0.1334,
"step": 6840
},
{
"epoch": 2.4355555555555557,
"grad_norm": 2.097329265797065,
"learning_rate": 1.0386976650409102e-06,
"loss": 0.1342,
"step": 6850
},
{
"epoch": 2.439111111111111,
"grad_norm": 1.7733262424549074,
"learning_rate": 1.0261068363845034e-06,
"loss": 0.1297,
"step": 6860
},
{
"epoch": 2.4426666666666668,
"grad_norm": 1.7698885455909084,
"learning_rate": 1.0135840537771574e-06,
"loss": 0.1355,
"step": 6870
},
{
"epoch": 2.446222222222222,
"grad_norm": 1.699595680180769,
"learning_rate": 1.001129531649872e-06,
"loss": 0.1255,
"step": 6880
},
{
"epoch": 2.449777777777778,
"grad_norm": 1.8061641909036275,
"learning_rate": 9.887434832647997e-07,
"loss": 0.1355,
"step": 6890
},
{
"epoch": 2.453333333333333,
"grad_norm": 1.8282679409791762,
"learning_rate": 9.764261207116061e-07,
"loss": 0.1437,
"step": 6900
},
{
"epoch": 2.456888888888889,
"grad_norm": 1.8691781223789907,
"learning_rate": 9.641776549038257e-07,
"loss": 0.1274,
"step": 6910
},
{
"epoch": 2.4604444444444447,
"grad_norm": 1.8720204975109627,
"learning_rate": 9.519982955752549e-07,
"loss": 0.1321,
"step": 6920
},
{
"epoch": 2.464,
"grad_norm": 1.714725769185188,
"learning_rate": 9.398882512763618e-07,
"loss": 0.1299,
"step": 6930
},
{
"epoch": 2.4675555555555557,
"grad_norm": 1.5736356325676821,
"learning_rate": 9.278477293707189e-07,
"loss": 0.1454,
"step": 6940
},
{
"epoch": 2.471111111111111,
"grad_norm": 1.7235279739808778,
"learning_rate": 9.158769360314412e-07,
"loss": 0.1301,
"step": 6950
},
{
"epoch": 2.474666666666667,
"grad_norm": 1.7964601353844663,
"learning_rate": 9.039760762376665e-07,
"loss": 0.1329,
"step": 6960
},
{
"epoch": 2.478222222222222,
"grad_norm": 1.7113961505997257,
"learning_rate": 8.921453537710406e-07,
"loss": 0.1301,
"step": 6970
},
{
"epoch": 2.481777777777778,
"grad_norm": 3.7247151362742708,
"learning_rate": 8.803849712122292e-07,
"loss": 0.1366,
"step": 6980
},
{
"epoch": 2.485333333333333,
"grad_norm": 1.6042128553101094,
"learning_rate": 8.686951299374474e-07,
"loss": 0.1248,
"step": 6990
},
{
"epoch": 2.488888888888889,
"grad_norm": 1.7566315817690532,
"learning_rate": 8.570760301150166e-07,
"loss": 0.1397,
"step": 7000
},
{
"epoch": 2.488888888888889,
"eval_loss": 0.239632710814476,
"eval_runtime": 563.0915,
"eval_samples_per_second": 17.759,
"eval_steps_per_second": 4.44,
"step": 7000
},
{
"epoch": 2.4924444444444447,
"grad_norm": 1.915869222287072,
"learning_rate": 8.455278707019255e-07,
"loss": 0.133,
"step": 7010
},
{
"epoch": 2.496,
"grad_norm": 1.4611242467498158,
"learning_rate": 8.340508494404415e-07,
"loss": 0.128,
"step": 7020
},
{
"epoch": 2.4995555555555553,
"grad_norm": 1.8274207116893812,
"learning_rate": 8.226451628547039e-07,
"loss": 0.1304,
"step": 7030
},
{
"epoch": 2.503111111111111,
"grad_norm": 1.5195837090357422,
"learning_rate": 8.113110062473756e-07,
"loss": 0.1337,
"step": 7040
},
{
"epoch": 2.506666666666667,
"grad_norm": 1.534284195780538,
"learning_rate": 8.000485736962899e-07,
"loss": 0.1365,
"step": 7050
},
{
"epoch": 2.510222222222222,
"grad_norm": 1.3874360730778557,
"learning_rate": 7.888580580511307e-07,
"loss": 0.1157,
"step": 7060
},
{
"epoch": 2.513777777777778,
"grad_norm": 1.347897014568791,
"learning_rate": 7.777396509301278e-07,
"loss": 0.1258,
"step": 7070
},
{
"epoch": 2.517333333333333,
"grad_norm": 1.5444960857241712,
"learning_rate": 7.666935427167777e-07,
"loss": 0.1261,
"step": 7080
},
{
"epoch": 2.520888888888889,
"grad_norm": 1.5787802499569878,
"learning_rate": 7.557199225565848e-07,
"loss": 0.1353,
"step": 7090
},
{
"epoch": 2.5244444444444447,
"grad_norm": 1.6575537900928325,
"learning_rate": 7.448189783538184e-07,
"loss": 0.1223,
"step": 7100
},
{
"epoch": 2.528,
"grad_norm": 1.58456318992188,
"learning_rate": 7.339908967683007e-07,
"loss": 0.1227,
"step": 7110
},
{
"epoch": 2.5315555555555553,
"grad_norm": 1.916341417565209,
"learning_rate": 7.232358632122022e-07,
"loss": 0.1365,
"step": 7120
},
{
"epoch": 2.535111111111111,
"grad_norm": 2.009648842498942,
"learning_rate": 7.125540618468784e-07,
"loss": 0.1435,
"step": 7130
},
{
"epoch": 2.538666666666667,
"grad_norm": 1.2589650678388224,
"learning_rate": 7.019456755797083e-07,
"loss": 0.1333,
"step": 7140
},
{
"epoch": 2.542222222222222,
"grad_norm": 1.534526581817288,
"learning_rate": 6.914108860609608e-07,
"loss": 0.1372,
"step": 7150
},
{
"epoch": 2.545777777777778,
"grad_norm": 1.5742622053962463,
"learning_rate": 6.809498736806919e-07,
"loss": 0.135,
"step": 7160
},
{
"epoch": 2.5493333333333332,
"grad_norm": 1.876907152948741,
"learning_rate": 6.705628175656498e-07,
"loss": 0.1304,
"step": 7170
},
{
"epoch": 2.552888888888889,
"grad_norm": 1.7507039554831174,
"learning_rate": 6.602498955762105e-07,
"loss": 0.1361,
"step": 7180
},
{
"epoch": 2.5564444444444443,
"grad_norm": 1.5168112309443524,
"learning_rate": 6.500112843033313e-07,
"loss": 0.1235,
"step": 7190
},
{
"epoch": 2.56,
"grad_norm": 1.366857399391539,
"learning_rate": 6.39847159065523e-07,
"loss": 0.1268,
"step": 7200
},
{
"epoch": 2.5635555555555554,
"grad_norm": 1.7472209117726187,
"learning_rate": 6.297576939058586e-07,
"loss": 0.1338,
"step": 7210
},
{
"epoch": 2.567111111111111,
"grad_norm": 1.5771285823832333,
"learning_rate": 6.197430615889838e-07,
"loss": 0.1304,
"step": 7220
},
{
"epoch": 2.570666666666667,
"grad_norm": 1.5122386895026887,
"learning_rate": 6.098034335981573e-07,
"loss": 0.1255,
"step": 7230
},
{
"epoch": 2.574222222222222,
"grad_norm": 1.5101320862852827,
"learning_rate": 5.999389801323219e-07,
"loss": 0.128,
"step": 7240
},
{
"epoch": 2.5777777777777775,
"grad_norm": 1.751375058176443,
"learning_rate": 5.901498701031894e-07,
"loss": 0.131,
"step": 7250
},
{
"epoch": 2.5813333333333333,
"grad_norm": 1.5370110538793642,
"learning_rate": 5.804362711323391e-07,
"loss": 0.1273,
"step": 7260
},
{
"epoch": 2.584888888888889,
"grad_norm": 1.5422190674222276,
"learning_rate": 5.707983495483593e-07,
"loss": 0.122,
"step": 7270
},
{
"epoch": 2.5884444444444443,
"grad_norm": 1.8111593254497258,
"learning_rate": 5.612362703839907e-07,
"loss": 0.1308,
"step": 7280
},
{
"epoch": 2.592,
"grad_norm": 1.7898287718649462,
"learning_rate": 5.517501973733059e-07,
"loss": 0.1239,
"step": 7290
},
{
"epoch": 2.5955555555555554,
"grad_norm": 1.5741550714022359,
"learning_rate": 5.423402929489019e-07,
"loss": 0.1242,
"step": 7300
},
{
"epoch": 2.599111111111111,
"grad_norm": 1.7431025808198797,
"learning_rate": 5.330067182391219e-07,
"loss": 0.1258,
"step": 7310
},
{
"epoch": 2.602666666666667,
"grad_norm": 1.669472703725672,
"learning_rate": 5.237496330652925e-07,
"loss": 0.1318,
"step": 7320
},
{
"epoch": 2.606222222222222,
"grad_norm": 1.7086096850592123,
"learning_rate": 5.145691959389932e-07,
"loss": 0.1292,
"step": 7330
},
{
"epoch": 2.6097777777777775,
"grad_norm": 1.79780883791639,
"learning_rate": 5.054655640593325e-07,
"loss": 0.1446,
"step": 7340
},
{
"epoch": 2.6133333333333333,
"grad_norm": 1.760230682240199,
"learning_rate": 4.964388933102666e-07,
"loss": 0.1418,
"step": 7350
},
{
"epoch": 2.616888888888889,
"grad_norm": 1.540197801989686,
"learning_rate": 4.874893382579232e-07,
"loss": 0.1269,
"step": 7360
},
{
"epoch": 2.6204444444444444,
"grad_norm": 1.7177370855999565,
"learning_rate": 4.786170521479588e-07,
"loss": 0.1223,
"step": 7370
},
{
"epoch": 2.624,
"grad_norm": 1.881294576905093,
"learning_rate": 4.698221869029307e-07,
"loss": 0.1443,
"step": 7380
},
{
"epoch": 2.6275555555555554,
"grad_norm": 1.74196972034532,
"learning_rate": 4.6110489311969876e-07,
"loss": 0.1429,
"step": 7390
},
{
"epoch": 2.631111111111111,
"grad_norm": 1.5651241374342044,
"learning_rate": 4.524653200668461e-07,
"loss": 0.1264,
"step": 7400
},
{
"epoch": 2.634666666666667,
"grad_norm": 1.8251309622054404,
"learning_rate": 4.439036156821225e-07,
"loss": 0.1213,
"step": 7410
},
{
"epoch": 2.6382222222222222,
"grad_norm": 1.4351427368380598,
"learning_rate": 4.3541992656991163e-07,
"loss": 0.1182,
"step": 7420
},
{
"epoch": 2.6417777777777776,
"grad_norm": 1.9769377027322241,
"learning_rate": 4.2701439799871847e-07,
"loss": 0.1453,
"step": 7430
},
{
"epoch": 2.6453333333333333,
"grad_norm": 1.6755217149463195,
"learning_rate": 4.1868717389868694e-07,
"loss": 0.1284,
"step": 7440
},
{
"epoch": 2.648888888888889,
"grad_norm": 1.4882784431490907,
"learning_rate": 4.1043839685913135e-07,
"loss": 0.1289,
"step": 7450
},
{
"epoch": 2.6524444444444444,
"grad_norm": 1.2678152146637376,
"learning_rate": 4.022682081260942e-07,
"loss": 0.122,
"step": 7460
},
{
"epoch": 2.656,
"grad_norm": 1.7036091433400906,
"learning_rate": 3.941767475999297e-07,
"loss": 0.1292,
"step": 7470
},
{
"epoch": 2.6595555555555555,
"grad_norm": 2.0073020304210485,
"learning_rate": 3.8616415383291083e-07,
"loss": 0.1281,
"step": 7480
},
{
"epoch": 2.663111111111111,
"grad_norm": 1.7003882572239488,
"learning_rate": 3.7823056402684856e-07,
"loss": 0.1205,
"step": 7490
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.8649824143158358,
"learning_rate": 3.70376114030751e-07,
"loss": 0.1405,
"step": 7500
},
{
"epoch": 2.6666666666666665,
"eval_loss": 0.2399507761001587,
"eval_runtime": 561.5965,
"eval_samples_per_second": 17.806,
"eval_steps_per_second": 4.452,
"step": 7500
},
{
"epoch": 2.6702222222222223,
"grad_norm": 1.778861851144716,
"learning_rate": 3.626009383384926e-07,
"loss": 0.1424,
"step": 7510
},
{
"epoch": 2.6737777777777776,
"grad_norm": 1.7506343466298935,
"learning_rate": 3.549051700865136e-07,
"loss": 0.1242,
"step": 7520
},
{
"epoch": 2.6773333333333333,
"grad_norm": 1.5579333925843626,
"learning_rate": 3.47288941051539e-07,
"loss": 0.125,
"step": 7530
},
{
"epoch": 2.680888888888889,
"grad_norm": 2.030096385748008,
"learning_rate": 3.3975238164831893e-07,
"loss": 0.1253,
"step": 7540
},
{
"epoch": 2.6844444444444444,
"grad_norm": 1.635535994621638,
"learning_rate": 3.322956209274031e-07,
"loss": 0.1322,
"step": 7550
},
{
"epoch": 2.6879999999999997,
"grad_norm": 1.7329277515156414,
"learning_rate": 3.2491878657292643e-07,
"loss": 0.1355,
"step": 7560
},
{
"epoch": 2.6915555555555555,
"grad_norm": 1.7444157426686764,
"learning_rate": 3.176220049004197e-07,
"loss": 0.1179,
"step": 7570
},
{
"epoch": 2.6951111111111112,
"grad_norm": 1.3483728954452034,
"learning_rate": 3.104054008546525e-07,
"loss": 0.1338,
"step": 7580
},
{
"epoch": 2.6986666666666665,
"grad_norm": 1.3906620863471058,
"learning_rate": 3.032690980074915e-07,
"loss": 0.131,
"step": 7590
},
{
"epoch": 2.7022222222222223,
"grad_norm": 1.8327466893042572,
"learning_rate": 2.962132185557826e-07,
"loss": 0.1223,
"step": 7600
},
{
"epoch": 2.7057777777777776,
"grad_norm": 1.5547638545825841,
"learning_rate": 2.892378833192611e-07,
"loss": 0.1282,
"step": 7610
},
{
"epoch": 2.7093333333333334,
"grad_norm": 1.804096897597165,
"learning_rate": 2.823432117384822e-07,
"loss": 0.1321,
"step": 7620
},
{
"epoch": 2.712888888888889,
"grad_norm": 1.5920189474841397,
"learning_rate": 2.755293218727739e-07,
"loss": 0.1266,
"step": 7630
},
{
"epoch": 2.7164444444444444,
"grad_norm": 1.95119518386987,
"learning_rate": 2.6879633039821994e-07,
"loss": 0.1356,
"step": 7640
},
{
"epoch": 2.7199999999999998,
"grad_norm": 1.8385230420520196,
"learning_rate": 2.62144352605655e-07,
"loss": 0.1262,
"step": 7650
},
{
"epoch": 2.7235555555555555,
"grad_norm": 1.7885799872230752,
"learning_rate": 2.555735023986966e-07,
"loss": 0.1315,
"step": 7660
},
{
"epoch": 2.7271111111111113,
"grad_norm": 1.8941729319880476,
"learning_rate": 2.4908389229179484e-07,
"loss": 0.1179,
"step": 7670
},
{
"epoch": 2.7306666666666666,
"grad_norm": 1.5725333890356554,
"learning_rate": 2.4267563340830026e-07,
"loss": 0.1122,
"step": 7680
},
{
"epoch": 2.7342222222222223,
"grad_norm": 1.9949059298619423,
"learning_rate": 2.363488354785648e-07,
"loss": 0.1372,
"step": 7690
},
{
"epoch": 2.7377777777777776,
"grad_norm": 1.706241835042834,
"learning_rate": 2.301036068380641e-07,
"loss": 0.1303,
"step": 7700
},
{
"epoch": 2.7413333333333334,
"grad_norm": 1.5015166048586166,
"learning_rate": 2.239400544255399e-07,
"loss": 0.121,
"step": 7710
},
{
"epoch": 2.744888888888889,
"grad_norm": 1.69358016809196,
"learning_rate": 2.178582837811688e-07,
"loss": 0.1249,
"step": 7720
},
{
"epoch": 2.7484444444444445,
"grad_norm": 1.9732967017351475,
"learning_rate": 2.1185839904475869e-07,
"loss": 0.133,
"step": 7730
},
{
"epoch": 2.752,
"grad_norm": 1.5594363807881604,
"learning_rate": 2.0594050295395852e-07,
"loss": 0.1304,
"step": 7740
},
{
"epoch": 2.7555555555555555,
"grad_norm": 2.026099043557669,
"learning_rate": 2.0010469684250856e-07,
"loss": 0.1385,
"step": 7750
},
{
"epoch": 2.7591111111111113,
"grad_norm": 1.5917173969753626,
"learning_rate": 1.9435108063849684e-07,
"loss": 0.1365,
"step": 7760
},
{
"epoch": 2.7626666666666666,
"grad_norm": 1.7387563784538043,
"learning_rate": 1.8867975286265106e-07,
"loss": 0.1278,
"step": 7770
},
{
"epoch": 2.7662222222222224,
"grad_norm": 1.491992475001642,
"learning_rate": 1.830908106266538e-07,
"loss": 0.1169,
"step": 7780
},
{
"epoch": 2.7697777777777777,
"grad_norm": 1.8209635910179756,
"learning_rate": 1.7758434963147665e-07,
"loss": 0.143,
"step": 7790
},
{
"epoch": 2.7733333333333334,
"grad_norm": 1.6054626426110197,
"learning_rate": 1.7216046416574316e-07,
"loss": 0.1335,
"step": 7800
},
{
"epoch": 2.7768888888888887,
"grad_norm": 1.6151516199907796,
"learning_rate": 1.66819247104113e-07,
"loss": 0.1338,
"step": 7810
},
{
"epoch": 2.7804444444444445,
"grad_norm": 1.9698941742198866,
"learning_rate": 1.6156078990569313e-07,
"loss": 0.1203,
"step": 7820
},
{
"epoch": 2.784,
"grad_norm": 1.6305672042666572,
"learning_rate": 1.563851826124696e-07,
"loss": 0.1216,
"step": 7830
},
{
"epoch": 2.7875555555555556,
"grad_norm": 1.0194788026355706,
"learning_rate": 1.5129251384776998e-07,
"loss": 0.1181,
"step": 7840
},
{
"epoch": 2.7911111111111113,
"grad_norm": 1.7073067625712353,
"learning_rate": 1.462828708147379e-07,
"loss": 0.139,
"step": 7850
},
{
"epoch": 2.7946666666666666,
"grad_norm": 1.4957713592543374,
"learning_rate": 1.4135633929485026e-07,
"loss": 0.1373,
"step": 7860
},
{
"epoch": 2.7982222222222224,
"grad_norm": 1.6268976958462047,
"learning_rate": 1.3651300364644126e-07,
"loss": 0.1294,
"step": 7870
},
{
"epoch": 2.8017777777777777,
"grad_norm": 1.3636030825381604,
"learning_rate": 1.317529468032569e-07,
"loss": 0.1158,
"step": 7880
},
{
"epoch": 2.8053333333333335,
"grad_norm": 1.5147346477252843,
"learning_rate": 1.2707625027304104e-07,
"loss": 0.124,
"step": 7890
},
{
"epoch": 2.8088888888888888,
"grad_norm": 1.7193516342629052,
"learning_rate": 1.2248299413613607e-07,
"loss": 0.1332,
"step": 7900
},
{
"epoch": 2.8124444444444445,
"grad_norm": 1.6484553937509365,
"learning_rate": 1.1797325704411e-07,
"loss": 0.1214,
"step": 7910
},
{
"epoch": 2.816,
"grad_norm": 1.6919284405549642,
"learning_rate": 1.1354711621841208e-07,
"loss": 0.133,
"step": 7920
},
{
"epoch": 2.8195555555555556,
"grad_norm": 1.223501357852658,
"learning_rate": 1.0920464744905157e-07,
"loss": 0.1205,
"step": 7930
},
{
"epoch": 2.8231111111111113,
"grad_norm": 1.5481520280664143,
"learning_rate": 1.0494592509329716e-07,
"loss": 0.1469,
"step": 7940
},
{
"epoch": 2.8266666666666667,
"grad_norm": 1.7879544199201751,
"learning_rate": 1.007710220744057e-07,
"loss": 0.1269,
"step": 7950
},
{
"epoch": 2.830222222222222,
"grad_norm": 1.513993378655108,
"learning_rate": 9.668000988037163e-08,
"loss": 0.1322,
"step": 7960
},
{
"epoch": 2.8337777777777777,
"grad_norm": 1.7964467427017516,
"learning_rate": 9.267295856270509e-08,
"loss": 0.1354,
"step": 7970
},
{
"epoch": 2.8373333333333335,
"grad_norm": 1.787987364521523,
"learning_rate": 8.874993673523236e-08,
"loss": 0.1319,
"step": 7980
},
{
"epoch": 2.840888888888889,
"grad_norm": 1.6897870372176325,
"learning_rate": 8.491101157291737e-08,
"loss": 0.1274,
"step": 7990
},
{
"epoch": 2.8444444444444446,
"grad_norm": 1.6105609971746402,
"learning_rate": 8.115624881071594e-08,
"loss": 0.1318,
"step": 8000
},
{
"epoch": 2.8444444444444446,
"eval_loss": 0.23905394971370697,
"eval_runtime": 559.7682,
"eval_samples_per_second": 17.865,
"eval_steps_per_second": 4.466,
"step": 8000
},
{
"epoch": 2.848,
"grad_norm": 1.3881391902801445,
"learning_rate": 7.748571274244776e-08,
"loss": 0.1199,
"step": 8010
},
{
"epoch": 2.8515555555555556,
"grad_norm": 1.8275543306577795,
"learning_rate": 7.389946621969679e-08,
"loss": 0.1494,
"step": 8020
},
{
"epoch": 2.8551111111111114,
"grad_norm": 1.8960525825598256,
"learning_rate": 7.039757065073316e-08,
"loss": 0.1354,
"step": 8030
},
{
"epoch": 2.8586666666666667,
"grad_norm": 1.6485916403071794,
"learning_rate": 6.698008599946404e-08,
"loss": 0.1246,
"step": 8040
},
{
"epoch": 2.862222222222222,
"grad_norm": 1.2435705558011503,
"learning_rate": 6.364707078440335e-08,
"loss": 0.1266,
"step": 8050
},
{
"epoch": 2.8657777777777778,
"grad_norm": 1.5746164801301799,
"learning_rate": 6.039858207767479e-08,
"loss": 0.134,
"step": 8060
},
{
"epoch": 2.8693333333333335,
"grad_norm": 1.5169697571883205,
"learning_rate": 5.723467550403039e-08,
"loss": 0.1326,
"step": 8070
},
{
"epoch": 2.872888888888889,
"grad_norm": 1.5881237505008923,
"learning_rate": 5.4155405239897926e-08,
"loss": 0.1488,
"step": 8080
},
{
"epoch": 2.8764444444444446,
"grad_norm": 1.690061086159581,
"learning_rate": 5.1160824012458367e-08,
"loss": 0.1232,
"step": 8090
},
{
"epoch": 2.88,
"grad_norm": 1.6253293072576216,
"learning_rate": 4.825098309873544e-08,
"loss": 0.1264,
"step": 8100
},
{
"epoch": 2.8835555555555556,
"grad_norm": 1.8528993602738453,
"learning_rate": 4.542593232472414e-08,
"loss": 0.1328,
"step": 8110
},
{
"epoch": 2.887111111111111,
"grad_norm": 1.949296952991108,
"learning_rate": 4.268572006453364e-08,
"loss": 0.1264,
"step": 8120
},
{
"epoch": 2.8906666666666667,
"grad_norm": 1.5505902041733666,
"learning_rate": 4.003039323956126e-08,
"loss": 0.1308,
"step": 8130
},
{
"epoch": 2.894222222222222,
"grad_norm": 0.9023008663346067,
"learning_rate": 3.7459997317687014e-08,
"loss": 0.1101,
"step": 8140
},
{
"epoch": 2.897777777777778,
"grad_norm": 1.8468547733058307,
"learning_rate": 3.4974576312497564e-08,
"loss": 0.1249,
"step": 8150
},
{
"epoch": 2.9013333333333335,
"grad_norm": 1.7056102650658924,
"learning_rate": 3.25741727825285e-08,
"loss": 0.1193,
"step": 8160
},
{
"epoch": 2.904888888888889,
"grad_norm": 1.3690587953613977,
"learning_rate": 3.025882783054046e-08,
"loss": 0.1199,
"step": 8170
},
{
"epoch": 2.9084444444444446,
"grad_norm": 1.3946208158917515,
"learning_rate": 2.8028581102811924e-08,
"loss": 0.1365,
"step": 8180
},
{
"epoch": 2.912,
"grad_norm": 1.9644328667604294,
"learning_rate": 2.588347078846254e-08,
"loss": 0.1323,
"step": 8190
},
{
"epoch": 2.9155555555555557,
"grad_norm": 1.7619431494028974,
"learning_rate": 2.382353361879586e-08,
"loss": 0.1244,
"step": 8200
},
{
"epoch": 2.919111111111111,
"grad_norm": 1.6739735252712569,
"learning_rate": 2.18488048666754e-08,
"loss": 0.1241,
"step": 8210
},
{
"epoch": 2.9226666666666667,
"grad_norm": 1.7618267751958017,
"learning_rate": 1.995931834591569e-08,
"loss": 0.132,
"step": 8220
},
{
"epoch": 2.926222222222222,
"grad_norm": 1.5149144065240054,
"learning_rate": 1.8155106410706613e-08,
"loss": 0.1359,
"step": 8230
},
{
"epoch": 2.929777777777778,
"grad_norm": 1.7464428231188038,
"learning_rate": 1.6436199955057742e-08,
"loss": 0.1477,
"step": 8240
},
{
"epoch": 2.9333333333333336,
"grad_norm": 1.7961519057796862,
"learning_rate": 1.480262841226987e-08,
"loss": 0.1482,
"step": 8250
},
{
"epoch": 2.936888888888889,
"grad_norm": 1.668237688338044,
"learning_rate": 1.3254419754430981e-08,
"loss": 0.1369,
"step": 8260
},
{
"epoch": 2.940444444444444,
"grad_norm": 1.5710565780518715,
"learning_rate": 1.1791600491937172e-08,
"loss": 0.1265,
"step": 8270
},
{
"epoch": 2.944,
"grad_norm": 1.6190463651101816,
"learning_rate": 1.041419567303914e-08,
"loss": 0.1233,
"step": 8280
},
{
"epoch": 2.9475555555555557,
"grad_norm": 1.3359272700606026,
"learning_rate": 9.12222888341252e-09,
"loss": 0.1308,
"step": 8290
},
{
"epoch": 2.951111111111111,
"grad_norm": 1.7965214936961842,
"learning_rate": 7.915722245754876e-09,
"loss": 0.141,
"step": 8300
},
{
"epoch": 2.9546666666666668,
"grad_norm": 1.7433994283889143,
"learning_rate": 6.7946964194059994e-09,
"loss": 0.1493,
"step": 8310
},
{
"epoch": 2.958222222222222,
"grad_norm": 1.6666804006077884,
"learning_rate": 5.759170599994868e-09,
"loss": 0.1284,
"step": 8320
},
{
"epoch": 2.961777777777778,
"grad_norm": 1.4232443691197452,
"learning_rate": 4.809162519110455e-09,
"loss": 0.1231,
"step": 8330
},
{
"epoch": 2.9653333333333336,
"grad_norm": 1.8464380977109713,
"learning_rate": 3.944688443998646e-09,
"loss": 0.1466,
"step": 8340
},
{
"epoch": 2.968888888888889,
"grad_norm": 1.8474020149086245,
"learning_rate": 3.16576317728301e-09,
"loss": 0.126,
"step": 8350
},
{
"epoch": 2.9724444444444442,
"grad_norm": 1.731427281949659,
"learning_rate": 2.4724000567116768e-09,
"loss": 0.1361,
"step": 8360
},
{
"epoch": 2.976,
"grad_norm": 1.8993388895043506,
"learning_rate": 1.86461095492918e-09,
"loss": 0.1258,
"step": 8370
},
{
"epoch": 2.9795555555555557,
"grad_norm": 1.676714063923629,
"learning_rate": 1.3424062792738445e-09,
"loss": 0.1311,
"step": 8380
},
{
"epoch": 2.983111111111111,
"grad_norm": 1.714542756833673,
"learning_rate": 9.057949715968183e-10,
"loss": 0.1236,
"step": 8390
},
{
"epoch": 2.986666666666667,
"grad_norm": 1.6829258625832335,
"learning_rate": 5.547845081121939e-10,
"loss": 0.1171,
"step": 8400
},
{
"epoch": 2.990222222222222,
"grad_norm": 1.5917279579386703,
"learning_rate": 2.89380899267111e-10,
"loss": 0.1309,
"step": 8410
},
{
"epoch": 2.993777777777778,
"grad_norm": 1.72982950263424,
"learning_rate": 1.0958868963906188e-10,
"loss": 0.1314,
"step": 8420
},
{
"epoch": 2.997333333333333,
"grad_norm": 1.6121134095652765,
"learning_rate": 1.541095785984048e-11,
"loss": 0.1267,
"step": 8430
},
{
"epoch": 2.9994666666666667,
"step": 8436,
"total_flos": 621656373067776.0,
"train_loss": 0.25353994178455436,
"train_runtime": 39823.1973,
"train_samples_per_second": 6.78,
"train_steps_per_second": 0.212
}
],
"logging_steps": 10,
"max_steps": 8436,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 621656373067776.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}