{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.66688,
"eval_steps": 500,
"global_step": 1042,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00064,
"grad_norm": 2.353584373604404,
"learning_rate": 0.0,
"loss": 0.5764,
"step": 1
},
{
"epoch": 0.00128,
"grad_norm": 2.5677274470706184,
"learning_rate": 3.79746835443038e-07,
"loss": 0.6154,
"step": 2
},
{
"epoch": 0.00192,
"grad_norm": 2.434815038353528,
"learning_rate": 7.59493670886076e-07,
"loss": 0.5762,
"step": 3
},
{
"epoch": 0.00256,
"grad_norm": 2.4265789349847133,
"learning_rate": 1.139240506329114e-06,
"loss": 0.5898,
"step": 4
},
{
"epoch": 0.0032,
"grad_norm": 2.4968640172692007,
"learning_rate": 1.518987341772152e-06,
"loss": 0.5854,
"step": 5
},
{
"epoch": 0.00384,
"grad_norm": 2.6270464110646,
"learning_rate": 1.8987341772151901e-06,
"loss": 0.6063,
"step": 6
},
{
"epoch": 0.00448,
"grad_norm": 2.3240861061539664,
"learning_rate": 2.278481012658228e-06,
"loss": 0.5713,
"step": 7
},
{
"epoch": 0.00512,
"grad_norm": 2.277096457178879,
"learning_rate": 2.6582278481012658e-06,
"loss": 0.5981,
"step": 8
},
{
"epoch": 0.00576,
"grad_norm": 2.0556904877285276,
"learning_rate": 3.037974683544304e-06,
"loss": 0.5783,
"step": 9
},
{
"epoch": 0.0064,
"grad_norm": 1.8883532047436715,
"learning_rate": 3.4177215189873417e-06,
"loss": 0.5626,
"step": 10
},
{
"epoch": 0.00704,
"grad_norm": 1.4054834462317158,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.5762,
"step": 11
},
{
"epoch": 0.00768,
"grad_norm": 1.2672259034804543,
"learning_rate": 4.1772151898734175e-06,
"loss": 0.5447,
"step": 12
},
{
"epoch": 0.00832,
"grad_norm": 1.1315187874621562,
"learning_rate": 4.556962025316456e-06,
"loss": 0.5003,
"step": 13
},
{
"epoch": 0.00896,
"grad_norm": 1.2442008617105749,
"learning_rate": 4.936708860759494e-06,
"loss": 0.5235,
"step": 14
},
{
"epoch": 0.0096,
"grad_norm": 1.1708354130739331,
"learning_rate": 5.3164556962025316e-06,
"loss": 0.4659,
"step": 15
},
{
"epoch": 0.01024,
"grad_norm": 1.3068048804077694,
"learning_rate": 5.69620253164557e-06,
"loss": 0.4601,
"step": 16
},
{
"epoch": 0.01088,
"grad_norm": 1.487196594526998,
"learning_rate": 6.075949367088608e-06,
"loss": 0.487,
"step": 17
},
{
"epoch": 0.01152,
"grad_norm": 1.3702547576006188,
"learning_rate": 6.455696202531646e-06,
"loss": 0.4829,
"step": 18
},
{
"epoch": 0.01216,
"grad_norm": 0.985500593752308,
"learning_rate": 6.835443037974683e-06,
"loss": 0.4463,
"step": 19
},
{
"epoch": 0.0128,
"grad_norm": 0.9909916347896555,
"learning_rate": 7.215189873417722e-06,
"loss": 0.4554,
"step": 20
},
{
"epoch": 0.01344,
"grad_norm": 1.1838910835370198,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.4622,
"step": 21
},
{
"epoch": 0.01408,
"grad_norm": 1.6876677020755413,
"learning_rate": 7.974683544303797e-06,
"loss": 0.3952,
"step": 22
},
{
"epoch": 0.01472,
"grad_norm": 1.0417104358020024,
"learning_rate": 8.354430379746835e-06,
"loss": 0.4105,
"step": 23
},
{
"epoch": 0.01536,
"grad_norm": 0.703419627437881,
"learning_rate": 8.734177215189873e-06,
"loss": 0.3797,
"step": 24
},
{
"epoch": 0.016,
"grad_norm": 0.7189072691985104,
"learning_rate": 9.113924050632912e-06,
"loss": 0.4206,
"step": 25
},
{
"epoch": 0.01664,
"grad_norm": 0.7280445280679197,
"learning_rate": 9.49367088607595e-06,
"loss": 0.3963,
"step": 26
},
{
"epoch": 0.01728,
"grad_norm": 0.5914861117650524,
"learning_rate": 9.873417721518988e-06,
"loss": 0.3928,
"step": 27
},
{
"epoch": 0.01792,
"grad_norm": 0.6824922709052414,
"learning_rate": 1.0253164556962025e-05,
"loss": 0.3909,
"step": 28
},
{
"epoch": 0.01856,
"grad_norm": 0.6157178643639211,
"learning_rate": 1.0632911392405063e-05,
"loss": 0.3778,
"step": 29
},
{
"epoch": 0.0192,
"grad_norm": 0.7040224694990468,
"learning_rate": 1.1012658227848103e-05,
"loss": 0.4034,
"step": 30
},
{
"epoch": 0.01984,
"grad_norm": 0.6042406871448358,
"learning_rate": 1.139240506329114e-05,
"loss": 0.3991,
"step": 31
},
{
"epoch": 0.02048,
"grad_norm": 0.5581052443719693,
"learning_rate": 1.1772151898734176e-05,
"loss": 0.4011,
"step": 32
},
{
"epoch": 0.02112,
"grad_norm": 0.4266761178757486,
"learning_rate": 1.2151898734177216e-05,
"loss": 0.3639,
"step": 33
},
{
"epoch": 0.02176,
"grad_norm": 0.49600505321594285,
"learning_rate": 1.2531645569620253e-05,
"loss": 0.3784,
"step": 34
},
{
"epoch": 0.0224,
"grad_norm": 0.45932753795907266,
"learning_rate": 1.2911392405063291e-05,
"loss": 0.3751,
"step": 35
},
{
"epoch": 0.02304,
"grad_norm": 0.5363867843128812,
"learning_rate": 1.3291139240506329e-05,
"loss": 0.3865,
"step": 36
},
{
"epoch": 0.02368,
"grad_norm": 0.5488419069165299,
"learning_rate": 1.3670886075949367e-05,
"loss": 0.3787,
"step": 37
},
{
"epoch": 0.02432,
"grad_norm": 0.48587043713463085,
"learning_rate": 1.4050632911392406e-05,
"loss": 0.3712,
"step": 38
},
{
"epoch": 0.02496,
"grad_norm": 0.4440990518273777,
"learning_rate": 1.4430379746835444e-05,
"loss": 0.3652,
"step": 39
},
{
"epoch": 0.0256,
"grad_norm": 0.3751498370487217,
"learning_rate": 1.4810126582278482e-05,
"loss": 0.3569,
"step": 40
},
{
"epoch": 0.02624,
"grad_norm": 0.43725343772159886,
"learning_rate": 1.5189873417721521e-05,
"loss": 0.3158,
"step": 41
},
{
"epoch": 0.02688,
"grad_norm": 0.4888026864502293,
"learning_rate": 1.5569620253164557e-05,
"loss": 0.3442,
"step": 42
},
{
"epoch": 0.02752,
"grad_norm": 0.44586123200596317,
"learning_rate": 1.5949367088607595e-05,
"loss": 0.3328,
"step": 43
},
{
"epoch": 0.02816,
"grad_norm": 0.49767798630217713,
"learning_rate": 1.6329113924050632e-05,
"loss": 0.351,
"step": 44
},
{
"epoch": 0.0288,
"grad_norm": 0.4336104979866255,
"learning_rate": 1.670886075949367e-05,
"loss": 0.3612,
"step": 45
},
{
"epoch": 0.02944,
"grad_norm": 0.4981279303284916,
"learning_rate": 1.708860759493671e-05,
"loss": 0.3543,
"step": 46
},
{
"epoch": 0.03008,
"grad_norm": 0.3991975649542343,
"learning_rate": 1.7468354430379746e-05,
"loss": 0.346,
"step": 47
},
{
"epoch": 0.03072,
"grad_norm": 0.4340035974965301,
"learning_rate": 1.7848101265822783e-05,
"loss": 0.3563,
"step": 48
},
{
"epoch": 0.03136,
"grad_norm": 0.40441725662186223,
"learning_rate": 1.8227848101265824e-05,
"loss": 0.3378,
"step": 49
},
{
"epoch": 0.032,
"grad_norm": 0.4015157764097668,
"learning_rate": 1.860759493670886e-05,
"loss": 0.3477,
"step": 50
},
{
"epoch": 0.03264,
"grad_norm": 0.3897529365297187,
"learning_rate": 1.89873417721519e-05,
"loss": 0.3341,
"step": 51
},
{
"epoch": 0.03328,
"grad_norm": 0.39864724195969276,
"learning_rate": 1.9367088607594938e-05,
"loss": 0.3322,
"step": 52
},
{
"epoch": 0.03392,
"grad_norm": 0.365402956200434,
"learning_rate": 1.9746835443037975e-05,
"loss": 0.297,
"step": 53
},
{
"epoch": 0.03456,
"grad_norm": 0.4449097185367681,
"learning_rate": 2.0126582278481013e-05,
"loss": 0.3424,
"step": 54
},
{
"epoch": 0.0352,
"grad_norm": 0.43181805843495463,
"learning_rate": 2.050632911392405e-05,
"loss": 0.3351,
"step": 55
},
{
"epoch": 0.03584,
"grad_norm": 0.392546267080926,
"learning_rate": 2.0886075949367092e-05,
"loss": 0.334,
"step": 56
},
{
"epoch": 0.03648,
"grad_norm": 0.38060689632546174,
"learning_rate": 2.1265822784810126e-05,
"loss": 0.3328,
"step": 57
},
{
"epoch": 0.03712,
"grad_norm": 0.389736897725578,
"learning_rate": 2.1645569620253164e-05,
"loss": 0.351,
"step": 58
},
{
"epoch": 0.03776,
"grad_norm": 0.42262406549661324,
"learning_rate": 2.2025316455696205e-05,
"loss": 0.3378,
"step": 59
},
{
"epoch": 0.0384,
"grad_norm": 0.43235384034762964,
"learning_rate": 2.240506329113924e-05,
"loss": 0.3308,
"step": 60
},
{
"epoch": 0.03904,
"grad_norm": 0.3945507271925106,
"learning_rate": 2.278481012658228e-05,
"loss": 0.3388,
"step": 61
},
{
"epoch": 0.03968,
"grad_norm": 0.40849904164663575,
"learning_rate": 2.3164556962025318e-05,
"loss": 0.3191,
"step": 62
},
{
"epoch": 0.04032,
"grad_norm": 0.4559537771952015,
"learning_rate": 2.3544303797468353e-05,
"loss": 0.3245,
"step": 63
},
{
"epoch": 0.04096,
"grad_norm": 0.40069809350325963,
"learning_rate": 2.3924050632911394e-05,
"loss": 0.3566,
"step": 64
},
{
"epoch": 0.0416,
"grad_norm": 0.37133926498283504,
"learning_rate": 2.430379746835443e-05,
"loss": 0.3147,
"step": 65
},
{
"epoch": 0.04224,
"grad_norm": 0.4628360444788038,
"learning_rate": 2.468354430379747e-05,
"loss": 0.3498,
"step": 66
},
{
"epoch": 0.04288,
"grad_norm": 0.38282967415212005,
"learning_rate": 2.5063291139240507e-05,
"loss": 0.3171,
"step": 67
},
{
"epoch": 0.04352,
"grad_norm": 0.4221420372731314,
"learning_rate": 2.5443037974683545e-05,
"loss": 0.3288,
"step": 68
},
{
"epoch": 0.04416,
"grad_norm": 0.4014634593404327,
"learning_rate": 2.5822784810126582e-05,
"loss": 0.2967,
"step": 69
},
{
"epoch": 0.0448,
"grad_norm": 0.4226874979052922,
"learning_rate": 2.620253164556962e-05,
"loss": 0.3311,
"step": 70
},
{
"epoch": 0.04544,
"grad_norm": 0.3691551141136815,
"learning_rate": 2.6582278481012658e-05,
"loss": 0.3154,
"step": 71
},
{
"epoch": 0.04608,
"grad_norm": 0.4930935215298506,
"learning_rate": 2.69620253164557e-05,
"loss": 0.3248,
"step": 72
},
{
"epoch": 0.04672,
"grad_norm": 0.40344283789369767,
"learning_rate": 2.7341772151898733e-05,
"loss": 0.3146,
"step": 73
},
{
"epoch": 0.04736,
"grad_norm": 0.40136750473831273,
"learning_rate": 2.7721518987341774e-05,
"loss": 0.3084,
"step": 74
},
{
"epoch": 0.048,
"grad_norm": 0.433002156207923,
"learning_rate": 2.8101265822784812e-05,
"loss": 0.2992,
"step": 75
},
{
"epoch": 0.04864,
"grad_norm": 0.45976406035512724,
"learning_rate": 2.8481012658227846e-05,
"loss": 0.3203,
"step": 76
},
{
"epoch": 0.04928,
"grad_norm": 0.46555202230017373,
"learning_rate": 2.8860759493670888e-05,
"loss": 0.326,
"step": 77
},
{
"epoch": 0.04992,
"grad_norm": 0.40061810400853354,
"learning_rate": 2.9240506329113925e-05,
"loss": 0.2902,
"step": 78
},
{
"epoch": 0.05056,
"grad_norm": 0.44228162263369347,
"learning_rate": 2.9620253164556963e-05,
"loss": 0.3235,
"step": 79
},
{
"epoch": 0.0512,
"grad_norm": 0.4587098467220033,
"learning_rate": 3e-05,
"loss": 0.3135,
"step": 80
},
{
"epoch": 0.05184,
"grad_norm": 0.42643590142876026,
"learning_rate": 2.9999966342756535e-05,
"loss": 0.3222,
"step": 81
},
{
"epoch": 0.05248,
"grad_norm": 0.4312879132975502,
"learning_rate": 2.9999865371177178e-05,
"loss": 0.3161,
"step": 82
},
{
"epoch": 0.05312,
"grad_norm": 0.44554806900527943,
"learning_rate": 2.9999697085715054e-05,
"loss": 0.3156,
"step": 83
},
{
"epoch": 0.05376,
"grad_norm": 0.4388949943183408,
"learning_rate": 2.9999461487125358e-05,
"loss": 0.3278,
"step": 84
},
{
"epoch": 0.0544,
"grad_norm": 0.4830420761548017,
"learning_rate": 2.999915857646538e-05,
"loss": 0.3242,
"step": 85
},
{
"epoch": 0.05504,
"grad_norm": 0.42290888303690005,
"learning_rate": 2.9998788355094472e-05,
"loss": 0.3051,
"step": 86
},
{
"epoch": 0.05568,
"grad_norm": 0.39931894559101605,
"learning_rate": 2.9998350824674046e-05,
"loss": 0.3174,
"step": 87
},
{
"epoch": 0.05632,
"grad_norm": 0.43885002522435895,
"learning_rate": 2.999784598716758e-05,
"loss": 0.3167,
"step": 88
},
{
"epoch": 0.05696,
"grad_norm": 0.4385666955310095,
"learning_rate": 2.9997273844840597e-05,
"loss": 0.32,
"step": 89
},
{
"epoch": 0.0576,
"grad_norm": 0.4241668561559285,
"learning_rate": 2.9996634400260665e-05,
"loss": 0.339,
"step": 90
},
{
"epoch": 0.05824,
"grad_norm": 0.49455156274720913,
"learning_rate": 2.9995927656297376e-05,
"loss": 0.3135,
"step": 91
},
{
"epoch": 0.05888,
"grad_norm": 0.3990302088629719,
"learning_rate": 2.9995153616122335e-05,
"loss": 0.3026,
"step": 92
},
{
"epoch": 0.05952,
"grad_norm": 0.38959449381416456,
"learning_rate": 2.9994312283209147e-05,
"loss": 0.2898,
"step": 93
},
{
"epoch": 0.06016,
"grad_norm": 0.42585557119259293,
"learning_rate": 2.9993403661333413e-05,
"loss": 0.3021,
"step": 94
},
{
"epoch": 0.0608,
"grad_norm": 0.37467889420763917,
"learning_rate": 2.9992427754572692e-05,
"loss": 0.273,
"step": 95
},
{
"epoch": 0.06144,
"grad_norm": 0.4466178048330342,
"learning_rate": 2.9991384567306485e-05,
"loss": 0.3116,
"step": 96
},
{
"epoch": 0.06208,
"grad_norm": 0.4133406435053229,
"learning_rate": 2.9990274104216246e-05,
"loss": 0.3066,
"step": 97
},
{
"epoch": 0.06272,
"grad_norm": 0.42703847116501487,
"learning_rate": 2.9989096370285314e-05,
"loss": 0.3029,
"step": 98
},
{
"epoch": 0.06336,
"grad_norm": 0.42408773456463106,
"learning_rate": 2.9987851370798936e-05,
"loss": 0.3302,
"step": 99
},
{
"epoch": 0.064,
"grad_norm": 0.36334101776279903,
"learning_rate": 2.998653911134421e-05,
"loss": 0.2749,
"step": 100
},
{
"epoch": 0.06464,
"grad_norm": 0.39914472077814156,
"learning_rate": 2.9985159597810067e-05,
"loss": 0.28,
"step": 101
},
{
"epoch": 0.06528,
"grad_norm": 0.4421338433902218,
"learning_rate": 2.9983712836387263e-05,
"loss": 0.3284,
"step": 102
},
{
"epoch": 0.06592,
"grad_norm": 0.4681114190701383,
"learning_rate": 2.9982198833568332e-05,
"loss": 0.3106,
"step": 103
},
{
"epoch": 0.06656,
"grad_norm": 0.43495699820223616,
"learning_rate": 2.998061759614756e-05,
"loss": 0.3034,
"step": 104
},
{
"epoch": 0.0672,
"grad_norm": 0.48523982047419434,
"learning_rate": 2.9978969131220956e-05,
"loss": 0.3199,
"step": 105
},
{
"epoch": 0.06784,
"grad_norm": 0.46970443054055944,
"learning_rate": 2.9977253446186236e-05,
"loss": 0.3145,
"step": 106
},
{
"epoch": 0.06848,
"grad_norm": 0.3762833085351204,
"learning_rate": 2.997547054874275e-05,
"loss": 0.2784,
"step": 107
},
{
"epoch": 0.06912,
"grad_norm": 0.4018479556059187,
"learning_rate": 2.9973620446891495e-05,
"loss": 0.29,
"step": 108
},
{
"epoch": 0.06976,
"grad_norm": 0.38557801552857196,
"learning_rate": 2.997170314893504e-05,
"loss": 0.2973,
"step": 109
},
{
"epoch": 0.0704,
"grad_norm": 0.42760504312755043,
"learning_rate": 2.9969718663477524e-05,
"loss": 0.3087,
"step": 110
},
{
"epoch": 0.07104,
"grad_norm": 0.4336880764497262,
"learning_rate": 2.9967666999424588e-05,
"loss": 0.3202,
"step": 111
},
{
"epoch": 0.07168,
"grad_norm": 0.4361380772072425,
"learning_rate": 2.9965548165983337e-05,
"loss": 0.2858,
"step": 112
},
{
"epoch": 0.07232,
"grad_norm": 0.36094868753422277,
"learning_rate": 2.9963362172662324e-05,
"loss": 0.2965,
"step": 113
},
{
"epoch": 0.07296,
"grad_norm": 0.5213407516270206,
"learning_rate": 2.9961109029271478e-05,
"loss": 0.2877,
"step": 114
},
{
"epoch": 0.0736,
"grad_norm": 0.39899966857979996,
"learning_rate": 2.9958788745922088e-05,
"loss": 0.2967,
"step": 115
},
{
"epoch": 0.07424,
"grad_norm": 0.4922285856357075,
"learning_rate": 2.995640133302672e-05,
"loss": 0.3135,
"step": 116
},
{
"epoch": 0.07488,
"grad_norm": 0.3840683744534582,
"learning_rate": 2.9953946801299213e-05,
"loss": 0.3021,
"step": 117
},
{
"epoch": 0.07552,
"grad_norm": 0.4183493778424685,
"learning_rate": 2.995142516175461e-05,
"loss": 0.307,
"step": 118
},
{
"epoch": 0.07616,
"grad_norm": 0.35220612158537656,
"learning_rate": 2.994883642570909e-05,
"loss": 0.2702,
"step": 119
},
{
"epoch": 0.0768,
"grad_norm": 0.3936594224655844,
"learning_rate": 2.9946180604779952e-05,
"loss": 0.2978,
"step": 120
},
{
"epoch": 0.07744,
"grad_norm": 0.4020931932900933,
"learning_rate": 2.9943457710885548e-05,
"loss": 0.3115,
"step": 121
},
{
"epoch": 0.07808,
"grad_norm": 0.413807873620709,
"learning_rate": 2.9940667756245218e-05,
"loss": 0.3084,
"step": 122
},
{
"epoch": 0.07872,
"grad_norm": 0.3861233473719098,
"learning_rate": 2.9937810753379266e-05,
"loss": 0.2844,
"step": 123
},
{
"epoch": 0.07936,
"grad_norm": 0.3386752443740303,
"learning_rate": 2.993488671510886e-05,
"loss": 0.3037,
"step": 124
},
{
"epoch": 0.08,
"grad_norm": 0.3972212865125969,
"learning_rate": 2.993189565455601e-05,
"loss": 0.3137,
"step": 125
},
{
"epoch": 0.08064,
"grad_norm": 0.38002895692953814,
"learning_rate": 2.9928837585143497e-05,
"loss": 0.2873,
"step": 126
},
{
"epoch": 0.08128,
"grad_norm": 0.44072168693518854,
"learning_rate": 2.992571252059482e-05,
"loss": 0.2782,
"step": 127
},
{
"epoch": 0.08192,
"grad_norm": 0.3479281011658149,
"learning_rate": 2.992252047493411e-05,
"loss": 0.2799,
"step": 128
},
{
"epoch": 0.08256,
"grad_norm": 0.4142266632731221,
"learning_rate": 2.9919261462486098e-05,
"loss": 0.3176,
"step": 129
},
{
"epoch": 0.0832,
"grad_norm": 0.35944919396457825,
"learning_rate": 2.991593549787604e-05,
"loss": 0.2963,
"step": 130
},
{
"epoch": 0.08384,
"grad_norm": 0.3837171188201694,
"learning_rate": 2.9912542596029635e-05,
"loss": 0.3057,
"step": 131
},
{
"epoch": 0.08448,
"grad_norm": 0.4763114305033498,
"learning_rate": 2.990908277217298e-05,
"loss": 0.3224,
"step": 132
},
{
"epoch": 0.08512,
"grad_norm": 0.3834181145690126,
"learning_rate": 2.9905556041832494e-05,
"loss": 0.2841,
"step": 133
},
{
"epoch": 0.08576,
"grad_norm": 0.40814627377992635,
"learning_rate": 2.990196242083485e-05,
"loss": 0.2871,
"step": 134
},
{
"epoch": 0.0864,
"grad_norm": 0.444873139895432,
"learning_rate": 2.98983019253069e-05,
"loss": 0.29,
"step": 135
},
{
"epoch": 0.08704,
"grad_norm": 0.4015570039172884,
"learning_rate": 2.9894574571675593e-05,
"loss": 0.29,
"step": 136
},
{
"epoch": 0.08768,
"grad_norm": 0.4192576277736737,
"learning_rate": 2.989078037666793e-05,
"loss": 0.3187,
"step": 137
},
{
"epoch": 0.08832,
"grad_norm": 0.4701185999968217,
"learning_rate": 2.988691935731086e-05,
"loss": 0.3039,
"step": 138
},
{
"epoch": 0.08896,
"grad_norm": 0.331862366916375,
"learning_rate": 2.988299153093122e-05,
"loss": 0.2814,
"step": 139
},
{
"epoch": 0.0896,
"grad_norm": 0.4503832184907915,
"learning_rate": 2.987899691515565e-05,
"loss": 0.3084,
"step": 140
},
{
"epoch": 0.09024,
"grad_norm": 0.3734705387771823,
"learning_rate": 2.987493552791052e-05,
"loss": 0.2837,
"step": 141
},
{
"epoch": 0.09088,
"grad_norm": 0.3978970762252604,
"learning_rate": 2.9870807387421843e-05,
"loss": 0.3032,
"step": 142
},
{
"epoch": 0.09152,
"grad_norm": 0.39348398869038875,
"learning_rate": 2.986661251221519e-05,
"loss": 0.3062,
"step": 143
},
{
"epoch": 0.09216,
"grad_norm": 0.4493064184937299,
"learning_rate": 2.986235092111563e-05,
"loss": 0.3071,
"step": 144
},
{
"epoch": 0.0928,
"grad_norm": 0.44946841788948644,
"learning_rate": 2.985802263324761e-05,
"loss": 0.3025,
"step": 145
},
{
"epoch": 0.09344,
"grad_norm": 0.5016226955727763,
"learning_rate": 2.9853627668034898e-05,
"loss": 0.3246,
"step": 146
},
{
"epoch": 0.09408,
"grad_norm": 0.399973023082225,
"learning_rate": 2.9849166045200476e-05,
"loss": 0.2811,
"step": 147
},
{
"epoch": 0.09472,
"grad_norm": 0.3739130134478663,
"learning_rate": 2.9844637784766478e-05,
"loss": 0.2855,
"step": 148
},
{
"epoch": 0.09536,
"grad_norm": 0.43805848638708655,
"learning_rate": 2.9840042907054068e-05,
"loss": 0.2798,
"step": 149
},
{
"epoch": 0.096,
"grad_norm": 0.4402314818129774,
"learning_rate": 2.9835381432683363e-05,
"loss": 0.2847,
"step": 150
},
{
"epoch": 0.09664,
"grad_norm": 0.4047049034331425,
"learning_rate": 2.9830653382573358e-05,
"loss": 0.2861,
"step": 151
},
{
"epoch": 0.09728,
"grad_norm": 0.43978737589580585,
"learning_rate": 2.982585877794179e-05,
"loss": 0.3054,
"step": 152
},
{
"epoch": 0.09792,
"grad_norm": 0.4730334615845102,
"learning_rate": 2.9820997640305097e-05,
"loss": 0.2817,
"step": 153
},
{
"epoch": 0.09856,
"grad_norm": 0.3740570479644525,
"learning_rate": 2.981606999147827e-05,
"loss": 0.2608,
"step": 154
},
{
"epoch": 0.0992,
"grad_norm": 0.481760965756092,
"learning_rate": 2.9811075853574788e-05,
"loss": 0.3113,
"step": 155
},
{
"epoch": 0.09984,
"grad_norm": 0.3961997084965111,
"learning_rate": 2.98060152490065e-05,
"loss": 0.3088,
"step": 156
},
{
"epoch": 0.10048,
"grad_norm": 0.428730401496227,
"learning_rate": 2.9800888200483552e-05,
"loss": 0.3129,
"step": 157
},
{
"epoch": 0.10112,
"grad_norm": 0.4178705790625643,
"learning_rate": 2.979569473101424e-05,
"loss": 0.3284,
"step": 158
},
{
"epoch": 0.10176,
"grad_norm": 0.4463567693300283,
"learning_rate": 2.9790434863904957e-05,
"loss": 0.3044,
"step": 159
},
{
"epoch": 0.1024,
"grad_norm": 0.3939775657775284,
"learning_rate": 2.9785108622760045e-05,
"loss": 0.2894,
"step": 160
},
{
"epoch": 0.10304,
"grad_norm": 0.38174144831499374,
"learning_rate": 2.9779716031481717e-05,
"loss": 0.2779,
"step": 161
},
{
"epoch": 0.10368,
"grad_norm": 0.38589717775088483,
"learning_rate": 2.9774257114269955e-05,
"loss": 0.3123,
"step": 162
},
{
"epoch": 0.10432,
"grad_norm": 0.3673496875585954,
"learning_rate": 2.9768731895622355e-05,
"loss": 0.2771,
"step": 163
},
{
"epoch": 0.10496,
"grad_norm": 0.3571608186420396,
"learning_rate": 2.9763140400334072e-05,
"loss": 0.2897,
"step": 164
},
{
"epoch": 0.1056,
"grad_norm": 0.3939651393603113,
"learning_rate": 2.975748265349769e-05,
"loss": 0.2966,
"step": 165
},
{
"epoch": 0.10624,
"grad_norm": 0.3805663760796052,
"learning_rate": 2.975175868050309e-05,
"loss": 0.3156,
"step": 166
},
{
"epoch": 0.10688,
"grad_norm": 0.3972270203399388,
"learning_rate": 2.9745968507037356e-05,
"loss": 0.2963,
"step": 167
},
{
"epoch": 0.10752,
"grad_norm": 0.39226032551563944,
"learning_rate": 2.974011215908467e-05,
"loss": 0.2933,
"step": 168
},
{
"epoch": 0.10816,
"grad_norm": 0.35875066299615677,
"learning_rate": 2.9734189662926163e-05,
"loss": 0.2758,
"step": 169
},
{
"epoch": 0.1088,
"grad_norm": 0.3709380144753209,
"learning_rate": 2.9728201045139813e-05,
"loss": 0.2927,
"step": 170
},
{
"epoch": 0.10944,
"grad_norm": 0.370201710217779,
"learning_rate": 2.972214633260035e-05,
"loss": 0.2866,
"step": 171
},
{
"epoch": 0.11008,
"grad_norm": 0.3758060771260273,
"learning_rate": 2.9716025552479093e-05,
"loss": 0.2994,
"step": 172
},
{
"epoch": 0.11072,
"grad_norm": 0.32580329325564505,
"learning_rate": 2.9709838732243844e-05,
"loss": 0.2802,
"step": 173
},
{
"epoch": 0.11136,
"grad_norm": 0.38248606018972797,
"learning_rate": 2.970358589965879e-05,
"loss": 0.2739,
"step": 174
},
{
"epoch": 0.112,
"grad_norm": 0.36493445057406637,
"learning_rate": 2.9697267082784342e-05,
"loss": 0.2874,
"step": 175
},
{
"epoch": 0.11264,
"grad_norm": 0.6384082314576618,
"learning_rate": 2.969088230997703e-05,
"loss": 0.2835,
"step": 176
},
{
"epoch": 0.11328,
"grad_norm": 0.4046365021981396,
"learning_rate": 2.9684431609889365e-05,
"loss": 0.2964,
"step": 177
},
{
"epoch": 0.11392,
"grad_norm": 0.3777231337256365,
"learning_rate": 2.9677915011469717e-05,
"loss": 0.302,
"step": 178
},
{
"epoch": 0.11456,
"grad_norm": 0.3550577304426545,
"learning_rate": 2.9671332543962183e-05,
"loss": 0.2809,
"step": 179
},
{
"epoch": 0.1152,
"grad_norm": 0.34027303690053873,
"learning_rate": 2.9664684236906466e-05,
"loss": 0.2998,
"step": 180
},
{
"epoch": 0.11584,
"grad_norm": 0.3705269161905217,
"learning_rate": 2.965797012013772e-05,
"loss": 0.3187,
"step": 181
},
{
"epoch": 0.11648,
"grad_norm": 0.36000193066901365,
"learning_rate": 2.9651190223786427e-05,
"loss": 0.2838,
"step": 182
},
{
"epoch": 0.11712,
"grad_norm": 0.32994641779773276,
"learning_rate": 2.9644344578278284e-05,
"loss": 0.2791,
"step": 183
},
{
"epoch": 0.11776,
"grad_norm": 0.35570577433544603,
"learning_rate": 2.963743321433402e-05,
"loss": 0.2773,
"step": 184
},
{
"epoch": 0.1184,
"grad_norm": 0.3596282933620285,
"learning_rate": 2.9630456162969298e-05,
"loss": 0.2797,
"step": 185
},
{
"epoch": 0.11904,
"grad_norm": 0.3262807239848876,
"learning_rate": 2.9623413455494563e-05,
"loss": 0.288,
"step": 186
},
{
"epoch": 0.11968,
"grad_norm": 0.3565865179949324,
"learning_rate": 2.9616305123514897e-05,
"loss": 0.2887,
"step": 187
},
{
"epoch": 0.12032,
"grad_norm": 0.39532927846862465,
"learning_rate": 2.9609131198929884e-05,
"loss": 0.2782,
"step": 188
},
{
"epoch": 0.12096,
"grad_norm": 0.40888574656849996,
"learning_rate": 2.9601891713933457e-05,
"loss": 0.2718,
"step": 189
},
{
"epoch": 0.1216,
"grad_norm": 0.3494105157933286,
"learning_rate": 2.9594586701013765e-05,
"loss": 0.2925,
"step": 190
},
{
"epoch": 0.12224,
"grad_norm": 0.38169153601979233,
"learning_rate": 2.958721619295302e-05,
"loss": 0.2783,
"step": 191
},
{
"epoch": 0.12288,
"grad_norm": 0.3650159986813621,
"learning_rate": 2.9579780222827354e-05,
"loss": 0.2839,
"step": 192
},
{
"epoch": 0.12352,
"grad_norm": 0.32047234144052256,
"learning_rate": 2.957227882400667e-05,
"loss": 0.2781,
"step": 193
},
{
"epoch": 0.12416,
"grad_norm": 0.39912184054626987,
"learning_rate": 2.9564712030154486e-05,
"loss": 0.2992,
"step": 194
},
{
"epoch": 0.1248,
"grad_norm": 0.3589964401769852,
"learning_rate": 2.9557079875227795e-05,
"loss": 0.2991,
"step": 195
},
{
"epoch": 0.12544,
"grad_norm": 0.3095249869798657,
"learning_rate": 2.95493823934769e-05,
"loss": 0.2886,
"step": 196
},
{
"epoch": 0.12608,
"grad_norm": 0.33996550944422993,
"learning_rate": 2.954161961944527e-05,
"loss": 0.2705,
"step": 197
},
{
"epoch": 0.12672,
"grad_norm": 0.3423071479543653,
"learning_rate": 2.953379158796938e-05,
"loss": 0.2795,
"step": 198
},
{
"epoch": 0.12736,
"grad_norm": 0.36082529918938056,
"learning_rate": 2.9525898334178566e-05,
"loss": 0.2887,
"step": 199
},
{
"epoch": 0.128,
"grad_norm": 0.37640661693173727,
"learning_rate": 2.951793989349484e-05,
"loss": 0.282,
"step": 200
},
{
"epoch": 0.12864,
"grad_norm": 0.3023070350545028,
"learning_rate": 2.950991630163277e-05,
"loss": 0.254,
"step": 201
},
{
"epoch": 0.12928,
"grad_norm": 0.3746495017193779,
"learning_rate": 2.950182759459928e-05,
"loss": 0.2913,
"step": 202
},
{
"epoch": 0.12992,
"grad_norm": 0.38483323478343573,
"learning_rate": 2.949367380869351e-05,
"loss": 0.2913,
"step": 203
},
{
"epoch": 0.13056,
"grad_norm": 0.4213879707012684,
"learning_rate": 2.9485454980506663e-05,
"loss": 0.2841,
"step": 204
},
{
"epoch": 0.1312,
"grad_norm": 0.4131757338780886,
"learning_rate": 2.9477171146921816e-05,
"loss": 0.2981,
"step": 205
},
{
"epoch": 0.13184,
"grad_norm": 0.4415959043983817,
"learning_rate": 2.946882234511377e-05,
"loss": 0.2975,
"step": 206
},
{
"epoch": 0.13248,
"grad_norm": 0.37499387107669097,
"learning_rate": 2.9460408612548876e-05,
"loss": 0.2844,
"step": 207
},
{
"epoch": 0.13312,
"grad_norm": 0.41827519988058354,
"learning_rate": 2.9451929986984875e-05,
"loss": 0.2958,
"step": 208
},
{
"epoch": 0.13376,
"grad_norm": 0.33683714204034787,
"learning_rate": 2.9443386506470725e-05,
"loss": 0.2686,
"step": 209
},
{
"epoch": 0.1344,
"grad_norm": 0.38867413035222104,
"learning_rate": 2.9434778209346427e-05,
"loss": 0.2876,
"step": 210
},
{
"epoch": 0.13504,
"grad_norm": 0.39089889654238963,
"learning_rate": 2.942610513424285e-05,
"loss": 0.2832,
"step": 211
},
{
"epoch": 0.13568,
"grad_norm": 0.3384476658736123,
"learning_rate": 2.9417367320081567e-05,
"loss": 0.2604,
"step": 212
},
{
"epoch": 0.13632,
"grad_norm": 0.38210814875894833,
"learning_rate": 2.940856480607468e-05,
"loss": 0.2986,
"step": 213
},
{
"epoch": 0.13696,
"grad_norm": 0.38075475653821916,
"learning_rate": 2.9399697631724637e-05,
"loss": 0.2761,
"step": 214
},
{
"epoch": 0.1376,
"grad_norm": 0.36451096593797544,
"learning_rate": 2.9390765836824053e-05,
"loss": 0.3053,
"step": 215
},
{
"epoch": 0.13824,
"grad_norm": 0.36903700040818893,
"learning_rate": 2.938176946145555e-05,
"loss": 0.2776,
"step": 216
},
{
"epoch": 0.13888,
"grad_norm": 0.3858847022411217,
"learning_rate": 2.9372708545991542e-05,
"loss": 0.2682,
"step": 217
},
{
"epoch": 0.13952,
"grad_norm": 0.3664388884748071,
"learning_rate": 2.936358313109409e-05,
"loss": 0.263,
"step": 218
},
{
"epoch": 0.14016,
"grad_norm": 0.3833106914044559,
"learning_rate": 2.935439325771471e-05,
"loss": 0.284,
"step": 219
},
{
"epoch": 0.1408,
"grad_norm": 0.4366332113795893,
"learning_rate": 2.9345138967094174e-05,
"loss": 0.2827,
"step": 220
},
{
"epoch": 0.14144,
"grad_norm": 0.35644258927978634,
"learning_rate": 2.9335820300762334e-05,
"loss": 0.2763,
"step": 221
},
{
"epoch": 0.14208,
"grad_norm": 0.4086383954714273,
"learning_rate": 2.9326437300537937e-05,
"loss": 0.2731,
"step": 222
},
{
"epoch": 0.14272,
"grad_norm": 0.37566830211795793,
"learning_rate": 2.9316990008528446e-05,
"loss": 0.3035,
"step": 223
},
{
"epoch": 0.14336,
"grad_norm": 0.37673883341537595,
"learning_rate": 2.9307478467129827e-05,
"loss": 0.2807,
"step": 224
},
{
"epoch": 0.144,
"grad_norm": 0.3658815413836672,
"learning_rate": 2.9297902719026392e-05,
"loss": 0.2872,
"step": 225
},
{
"epoch": 0.14464,
"grad_norm": 0.3723900174142704,
"learning_rate": 2.928826280719057e-05,
"loss": 0.2628,
"step": 226
},
{
"epoch": 0.14528,
"grad_norm": 0.3583483332892453,
"learning_rate": 2.9278558774882748e-05,
"loss": 0.2757,
"step": 227
},
{
"epoch": 0.14592,
"grad_norm": 0.36256098670512743,
"learning_rate": 2.9268790665651053e-05,
"loss": 0.2795,
"step": 228
},
{
"epoch": 0.14656,
"grad_norm": 0.4447205295517405,
"learning_rate": 2.925895852333117e-05,
"loss": 0.2917,
"step": 229
},
{
"epoch": 0.1472,
"grad_norm": 0.33653314086590325,
"learning_rate": 2.924906239204614e-05,
"loss": 0.2856,
"step": 230
},
{
"epoch": 0.14784,
"grad_norm": 0.3875121699387206,
"learning_rate": 2.9239102316206166e-05,
"loss": 0.2756,
"step": 231
},
{
"epoch": 0.14848,
"grad_norm": 0.4191113393584937,
"learning_rate": 2.9229078340508404e-05,
"loss": 0.2947,
"step": 232
},
{
"epoch": 0.14912,
"grad_norm": 0.3866679704383513,
"learning_rate": 2.9218990509936774e-05,
"loss": 0.2948,
"step": 233
},
{
"epoch": 0.14976,
"grad_norm": 0.44210860261964957,
"learning_rate": 2.9208838869761756e-05,
"loss": 0.2823,
"step": 234
},
{
"epoch": 0.1504,
"grad_norm": 0.3569394913477398,
"learning_rate": 2.9198623465540172e-05,
"loss": 0.2896,
"step": 235
},
{
"epoch": 0.15104,
"grad_norm": 0.3582653155321637,
"learning_rate": 2.9188344343115005e-05,
"loss": 0.2639,
"step": 236
},
{
"epoch": 0.15168,
"grad_norm": 0.3632299874629792,
"learning_rate": 2.9178001548615176e-05,
"loss": 0.2752,
"step": 237
},
{
"epoch": 0.15232,
"grad_norm": 0.3627690799272646,
"learning_rate": 2.9167595128455357e-05,
"loss": 0.2697,
"step": 238
},
{
"epoch": 0.15296,
"grad_norm": 0.31681761773997114,
"learning_rate": 2.915712512933572e-05,
"loss": 0.2886,
"step": 239
},
{
"epoch": 0.1536,
"grad_norm": 0.3652202557075009,
"learning_rate": 2.914659159824178e-05,
"loss": 0.291,
"step": 240
},
{
"epoch": 0.15424,
"grad_norm": 0.32355380076781154,
"learning_rate": 2.913599458244416e-05,
"loss": 0.288,
"step": 241
},
{
"epoch": 0.15488,
"grad_norm": 0.3263439505506079,
"learning_rate": 2.9125334129498368e-05,
"loss": 0.2669,
"step": 242
},
{
"epoch": 0.15552,
"grad_norm": 0.34698579986117173,
"learning_rate": 2.911461028724459e-05,
"loss": 0.2623,
"step": 243
},
{
"epoch": 0.15616,
"grad_norm": 0.3560347825174995,
"learning_rate": 2.9103823103807503e-05,
"loss": 0.2779,
"step": 244
},
{
"epoch": 0.1568,
"grad_norm": 0.3829349365942763,
"learning_rate": 2.9092972627596012e-05,
"loss": 0.285,
"step": 245
},
{
"epoch": 0.15744,
"grad_norm": 0.35634808472353574,
"learning_rate": 2.9082058907303064e-05,
"loss": 0.2787,
"step": 246
},
{
"epoch": 0.15808,
"grad_norm": 0.3973522847788536,
"learning_rate": 2.9071081991905436e-05,
"loss": 0.2686,
"step": 247
},
{
"epoch": 0.15872,
"grad_norm": 0.3503037505655835,
"learning_rate": 2.9060041930663477e-05,
"loss": 0.2601,
"step": 248
},
{
"epoch": 0.15936,
"grad_norm": 0.42641830259468066,
"learning_rate": 2.9048938773120933e-05,
"loss": 0.2571,
"step": 249
},
{
"epoch": 0.16,
"grad_norm": 0.33022262318082957,
"learning_rate": 2.903777256910469e-05,
"loss": 0.2539,
"step": 250
},
{
"epoch": 0.16064,
"grad_norm": 0.34806931009286257,
"learning_rate": 2.902654336872457e-05,
"loss": 0.2855,
"step": 251
},
{
"epoch": 0.16128,
"grad_norm": 0.3802291207724926,
"learning_rate": 2.9015251222373094e-05,
"loss": 0.2505,
"step": 252
},
{
"epoch": 0.16192,
"grad_norm": 0.4182370813509436,
"learning_rate": 2.9003896180725268e-05,
"loss": 0.2821,
"step": 253
},
{
"epoch": 0.16256,
"grad_norm": 0.45914515409569023,
"learning_rate": 2.8992478294738345e-05,
"loss": 0.2934,
"step": 254
},
{
"epoch": 0.1632,
"grad_norm": 0.3490846199737997,
"learning_rate": 2.8980997615651597e-05,
"loss": 0.2853,
"step": 255
},
{
"epoch": 0.16384,
"grad_norm": 0.3945607808525512,
"learning_rate": 2.8969454194986095e-05,
"loss": 0.2704,
"step": 256
},
{
"epoch": 0.16448,
"grad_norm": 0.4134287311468698,
"learning_rate": 2.8957848084544473e-05,
"loss": 0.2856,
"step": 257
},
{
"epoch": 0.16512,
"grad_norm": 0.388727953362913,
"learning_rate": 2.8946179336410682e-05,
"loss": 0.2707,
"step": 258
},
{
"epoch": 0.16576,
"grad_norm": 0.3707496935300037,
"learning_rate": 2.8934448002949775e-05,
"loss": 0.2736,
"step": 259
},
{
"epoch": 0.1664,
"grad_norm": 0.37509134347160566,
"learning_rate": 2.892265413680767e-05,
"loss": 0.2656,
"step": 260
},
{
"epoch": 0.16704,
"grad_norm": 0.37839278243532787,
"learning_rate": 2.8910797790910902e-05,
"loss": 0.2954,
"step": 261
},
{
"epoch": 0.16768,
"grad_norm": 0.3709934691909708,
"learning_rate": 2.889887901846639e-05,
"loss": 0.2771,
"step": 262
},
{
"epoch": 0.16832,
"grad_norm": 0.38441885239694634,
"learning_rate": 2.8886897872961203e-05,
"loss": 0.2872,
"step": 263
},
{
"epoch": 0.16896,
"grad_norm": 0.36199416231449505,
"learning_rate": 2.887485440816233e-05,
"loss": 0.2577,
"step": 264
},
{
"epoch": 0.1696,
"grad_norm": 0.35207892498047666,
"learning_rate": 2.8862748678116402e-05,
"loss": 0.2692,
"step": 265
},
{
"epoch": 0.17024,
"grad_norm": 0.3167681279515036,
"learning_rate": 2.885058073714949e-05,
"loss": 0.2517,
"step": 266
},
{
"epoch": 0.17088,
"grad_norm": 0.3836456596956298,
"learning_rate": 2.8838350639866843e-05,
"loss": 0.3038,
"step": 267
},
{
"epoch": 0.17152,
"grad_norm": 0.3604280243223731,
"learning_rate": 2.882605844115264e-05,
"loss": 0.2748,
"step": 268
},
{
"epoch": 0.17216,
"grad_norm": 0.33239517181664746,
"learning_rate": 2.8813704196169753e-05,
"loss": 0.2705,
"step": 269
},
{
"epoch": 0.1728,
"grad_norm": 0.4453367026441279,
"learning_rate": 2.8801287960359494e-05,
"loss": 0.2752,
"step": 270
},
{
"epoch": 0.17344,
"grad_norm": 0.3672385598637218,
"learning_rate": 2.8788809789441364e-05,
"loss": 0.2813,
"step": 271
},
{
"epoch": 0.17408,
"grad_norm": 0.42500271676682233,
"learning_rate": 2.8776269739412803e-05,
"loss": 0.2581,
"step": 272
},
{
"epoch": 0.17472,
"grad_norm": 0.3812194331681742,
"learning_rate": 2.8763667866548956e-05,
"loss": 0.2797,
"step": 273
},
{
"epoch": 0.17536,
"grad_norm": 0.398047671024292,
"learning_rate": 2.875100422740239e-05,
"loss": 0.2822,
"step": 274
},
{
"epoch": 0.176,
"grad_norm": 0.4382339198136674,
"learning_rate": 2.8738278878802865e-05,
"loss": 0.2951,
"step": 275
},
{
"epoch": 0.17664,
"grad_norm": 0.32914734501477555,
"learning_rate": 2.8725491877857073e-05,
"loss": 0.2505,
"step": 276
},
{
"epoch": 0.17728,
"grad_norm": 0.359221594407232,
"learning_rate": 2.8712643281948365e-05,
"loss": 0.2682,
"step": 277
},
{
"epoch": 0.17792,
"grad_norm": 0.3610201048162127,
"learning_rate": 2.8699733148736525e-05,
"loss": 0.2706,
"step": 278
},
{
"epoch": 0.17856,
"grad_norm": 0.3576575465005173,
"learning_rate": 2.868676153615748e-05,
"loss": 0.2725,
"step": 279
},
{
"epoch": 0.1792,
"grad_norm": 0.39648269977135403,
"learning_rate": 2.8673728502423067e-05,
"loss": 0.2638,
"step": 280
},
{
"epoch": 0.17984,
"grad_norm": 0.35937870544419476,
"learning_rate": 2.8660634106020747e-05,
"loss": 0.2753,
"step": 281
},
{
"epoch": 0.18048,
"grad_norm": 0.4065407489775043,
"learning_rate": 2.8647478405713355e-05,
"loss": 0.2814,
"step": 282
},
{
"epoch": 0.18112,
"grad_norm": 0.3401826548234812,
"learning_rate": 2.8634261460538845e-05,
"loss": 0.2734,
"step": 283
},
{
"epoch": 0.18176,
"grad_norm": 0.3871247999638719,
"learning_rate": 2.8620983329810004e-05,
"loss": 0.266,
"step": 284
},
{
"epoch": 0.1824,
"grad_norm": 0.40719492038320787,
"learning_rate": 2.860764407311421e-05,
"loss": 0.2727,
"step": 285
},
{
"epoch": 0.18304,
"grad_norm": 0.3399520864358679,
"learning_rate": 2.8594243750313137e-05,
"loss": 0.2689,
"step": 286
},
{
"epoch": 0.18368,
"grad_norm": 0.3953164307154916,
"learning_rate": 2.8580782421542514e-05,
"loss": 0.273,
"step": 287
},
{
"epoch": 0.18432,
"grad_norm": 0.4447124642066109,
"learning_rate": 2.8567260147211826e-05,
"loss": 0.282,
"step": 288
},
{
"epoch": 0.18496,
"grad_norm": 0.3479285885985358,
"learning_rate": 2.8553676988004083e-05,
"loss": 0.2846,
"step": 289
},
{
"epoch": 0.1856,
"grad_norm": 0.4427716710403484,
"learning_rate": 2.8540033004875506e-05,
"loss": 0.2731,
"step": 290
},
{
"epoch": 0.18624,
"grad_norm": 0.3389993255265989,
"learning_rate": 2.852632825905528e-05,
"loss": 0.2728,
"step": 291
},
{
"epoch": 0.18688,
"grad_norm": 0.36214801666992735,
"learning_rate": 2.851256281204526e-05,
"loss": 0.2963,
"step": 292
},
{
"epoch": 0.18752,
"grad_norm": 0.3225822277564517,
"learning_rate": 2.849873672561972e-05,
"loss": 0.256,
"step": 293
},
{
"epoch": 0.18816,
"grad_norm": 0.36090735754771,
"learning_rate": 2.8484850061825052e-05,
"loss": 0.274,
"step": 294
},
{
"epoch": 0.1888,
"grad_norm": 0.3619026443591473,
"learning_rate": 2.84709028829795e-05,
"loss": 0.2904,
"step": 295
},
{
"epoch": 0.18944,
"grad_norm": 0.3252751498399478,
"learning_rate": 2.8456895251672867e-05,
"loss": 0.2892,
"step": 296
},
{
"epoch": 0.19008,
"grad_norm": 0.3420708202146713,
"learning_rate": 2.8442827230766265e-05,
"loss": 0.2509,
"step": 297
},
{
"epoch": 0.19072,
"grad_norm": 0.3927376280930936,
"learning_rate": 2.8428698883391805e-05,
"loss": 0.2902,
"step": 298
},
{
"epoch": 0.19136,
"grad_norm": 0.33206871786295983,
"learning_rate": 2.8414510272952306e-05,
"loss": 0.2725,
"step": 299
},
{
"epoch": 0.192,
"grad_norm": 0.31541664358451077,
"learning_rate": 2.840026146312104e-05,
"loss": 0.2379,
"step": 300
},
{
"epoch": 0.19264,
"grad_norm": 0.3610931095304509,
"learning_rate": 2.8385952517841433e-05,
"loss": 0.2778,
"step": 301
},
{
"epoch": 0.19328,
"grad_norm": 0.32547764886975145,
"learning_rate": 2.837158350132677e-05,
"loss": 0.2786,
"step": 302
},
{
"epoch": 0.19392,
"grad_norm": 0.3428550126773416,
"learning_rate": 2.835715447805991e-05,
"loss": 0.2896,
"step": 303
},
{
"epoch": 0.19456,
"grad_norm": 0.34537169429710707,
"learning_rate": 2.8342665512793018e-05,
"loss": 0.2888,
"step": 304
},
{
"epoch": 0.1952,
"grad_norm": 0.3526668903699371,
"learning_rate": 2.8328116670547237e-05,
"loss": 0.2711,
"step": 305
},
{
"epoch": 0.19584,
"grad_norm": 0.3298129440685655,
"learning_rate": 2.8313508016612428e-05,
"loss": 0.2723,
"step": 306
},
{
"epoch": 0.19648,
"grad_norm": 0.3252403541009243,
"learning_rate": 2.8298839616546854e-05,
"loss": 0.2783,
"step": 307
},
{
"epoch": 0.19712,
"grad_norm": 0.31942049727047156,
"learning_rate": 2.8284111536176907e-05,
"loss": 0.3047,
"step": 308
},
{
"epoch": 0.19776,
"grad_norm": 0.28119673519843524,
"learning_rate": 2.8269323841596802e-05,
"loss": 0.2685,
"step": 309
},
{
"epoch": 0.1984,
"grad_norm": 0.3574212794376199,
"learning_rate": 2.825447659916827e-05,
"loss": 0.271,
"step": 310
},
{
"epoch": 0.19904,
"grad_norm": 0.30153919945084046,
"learning_rate": 2.823956987552028e-05,
"loss": 0.2529,
"step": 311
},
{
"epoch": 0.19968,
"grad_norm": 0.3494979054883366,
"learning_rate": 2.8224603737548737e-05,
"loss": 0.2886,
"step": 312
},
{
"epoch": 0.20032,
"grad_norm": 0.30740471706963013,
"learning_rate": 2.8209578252416162e-05,
"loss": 0.2876,
"step": 313
},
{
"epoch": 0.20096,
"grad_norm": 0.3279624895202358,
"learning_rate": 2.8194493487551402e-05,
"loss": 0.2946,
"step": 314
},
{
"epoch": 0.2016,
"grad_norm": 0.3430080164088275,
"learning_rate": 2.8179349510649354e-05,
"loss": 0.2891,
"step": 315
},
{
"epoch": 0.20224,
"grad_norm": 0.30496631367689814,
"learning_rate": 2.8164146389670605e-05,
"loss": 0.2842,
"step": 316
},
{
"epoch": 0.20288,
"grad_norm": 0.379409454373658,
"learning_rate": 2.8148884192841183e-05,
"loss": 0.294,
"step": 317
},
{
"epoch": 0.20352,
"grad_norm": 0.2948622980345085,
"learning_rate": 2.8133562988652218e-05,
"loss": 0.2584,
"step": 318
},
{
"epoch": 0.20416,
"grad_norm": 0.3443019691027711,
"learning_rate": 2.8118182845859636e-05,
"loss": 0.251,
"step": 319
},
{
"epoch": 0.2048,
"grad_norm": 0.36699154745018375,
"learning_rate": 2.810274383348387e-05,
"loss": 0.2952,
"step": 320
},
{
"epoch": 0.20544,
"grad_norm": 0.33967893078585726,
"learning_rate": 2.8087246020809536e-05,
"loss": 0.2796,
"step": 321
},
{
"epoch": 0.20608,
"grad_norm": 0.31170315997663467,
"learning_rate": 2.8071689477385117e-05,
"loss": 0.2721,
"step": 322
},
{
"epoch": 0.20672,
"grad_norm": 0.3242087780438831,
"learning_rate": 2.8056074273022666e-05,
"loss": 0.2774,
"step": 323
},
{
"epoch": 0.20736,
"grad_norm": 0.3555981636400892,
"learning_rate": 2.8040400477797476e-05,
"loss": 0.2797,
"step": 324
},
{
"epoch": 0.208,
"grad_norm": 0.309319481935765,
"learning_rate": 2.8024668162047783e-05,
"loss": 0.292,
"step": 325
},
{
"epoch": 0.20864,
"grad_norm": 0.38567954428634077,
"learning_rate": 2.8008877396374434e-05,
"loss": 0.2622,
"step": 326
},
{
"epoch": 0.20928,
"grad_norm": 0.3558299126479417,
"learning_rate": 2.799302825164058e-05,
"loss": 0.2741,
"step": 327
},
{
"epoch": 0.20992,
"grad_norm": 0.32961441916292467,
"learning_rate": 2.7977120798971374e-05,
"loss": 0.2669,
"step": 328
},
{
"epoch": 0.21056,
"grad_norm": 0.30783578854216875,
"learning_rate": 2.7961155109753596e-05,
"loss": 0.2645,
"step": 329
},
{
"epoch": 0.2112,
"grad_norm": 0.32966575581899893,
"learning_rate": 2.79451312556354e-05,
"loss": 0.2654,
"step": 330
},
{
"epoch": 0.21184,
"grad_norm": 0.31714659245735044,
"learning_rate": 2.7929049308525958e-05,
"loss": 0.2542,
"step": 331
},
{
"epoch": 0.21248,
"grad_norm": 0.3196236044042015,
"learning_rate": 2.7912909340595133e-05,
"loss": 0.2624,
"step": 332
},
{
"epoch": 0.21312,
"grad_norm": 0.3579856676787265,
"learning_rate": 2.7896711424273166e-05,
"loss": 0.272,
"step": 333
},
{
"epoch": 0.21376,
"grad_norm": 0.3327728411912822,
"learning_rate": 2.7880455632250358e-05,
"loss": 0.2588,
"step": 334
},
{
"epoch": 0.2144,
"grad_norm": 0.31027790499808855,
"learning_rate": 2.7864142037476727e-05,
"loss": 0.2728,
"step": 335
},
{
"epoch": 0.21504,
"grad_norm": 0.3202262124080976,
"learning_rate": 2.784777071316169e-05,
"loss": 0.2818,
"step": 336
},
{
"epoch": 0.21568,
"grad_norm": 0.29821110258387634,
"learning_rate": 2.7831341732773733e-05,
"loss": 0.2561,
"step": 337
},
{
"epoch": 0.21632,
"grad_norm": 0.33344963895852425,
"learning_rate": 2.7814855170040083e-05,
"loss": 0.2859,
"step": 338
},
{
"epoch": 0.21696,
"grad_norm": 0.3260713930863079,
"learning_rate": 2.7798311098946375e-05,
"loss": 0.2529,
"step": 339
},
{
"epoch": 0.2176,
"grad_norm": 0.3749841150106177,
"learning_rate": 2.7781709593736316e-05,
"loss": 0.2807,
"step": 340
},
{
"epoch": 0.21824,
"grad_norm": 0.34099250856014796,
"learning_rate": 2.7765050728911363e-05,
"loss": 0.2886,
"step": 341
},
{
"epoch": 0.21888,
"grad_norm": 0.33765459803209535,
"learning_rate": 2.7748334579230375e-05,
"loss": 0.2497,
"step": 342
},
{
"epoch": 0.21952,
"grad_norm": 0.2851983974846401,
"learning_rate": 2.773156121970929e-05,
"loss": 0.2401,
"step": 343
},
{
"epoch": 0.22016,
"grad_norm": 3.444718072231663,
"learning_rate": 2.7714730725620786e-05,
"loss": 0.2706,
"step": 344
},
{
"epoch": 0.2208,
"grad_norm": 0.41628559582587626,
"learning_rate": 2.7697843172493925e-05,
"loss": 0.2732,
"step": 345
},
{
"epoch": 0.22144,
"grad_norm": 0.33063598183417514,
"learning_rate": 2.7680898636113845e-05,
"loss": 0.2775,
"step": 346
},
{
"epoch": 0.22208,
"grad_norm": 0.3317549477105874,
"learning_rate": 2.7663897192521393e-05,
"loss": 0.2673,
"step": 347
},
{
"epoch": 0.22272,
"grad_norm": 0.39451049358358514,
"learning_rate": 2.764683891801281e-05,
"loss": 0.2853,
"step": 348
},
{
"epoch": 0.22336,
"grad_norm": 0.33825604933869896,
"learning_rate": 2.7629723889139345e-05,
"loss": 0.2836,
"step": 349
},
{
"epoch": 0.224,
"grad_norm": 0.3938596661138968,
"learning_rate": 2.7612552182706968e-05,
"loss": 0.2679,
"step": 350
},
{
"epoch": 0.22464,
"grad_norm": 0.3709101970004551,
"learning_rate": 2.759532387577599e-05,
"loss": 0.2819,
"step": 351
},
{
"epoch": 0.22528,
"grad_norm": 0.35417172495939264,
"learning_rate": 2.7578039045660713e-05,
"loss": 0.2639,
"step": 352
},
{
"epoch": 0.22592,
"grad_norm": 0.3327176351302671,
"learning_rate": 2.75606977699291e-05,
"loss": 0.2639,
"step": 353
},
{
"epoch": 0.22656,
"grad_norm": 0.3564326153038442,
"learning_rate": 2.754330012640243e-05,
"loss": 0.2556,
"step": 354
},
{
"epoch": 0.2272,
"grad_norm": 0.32512504523834834,
"learning_rate": 2.752584619315493e-05,
"loss": 0.2643,
"step": 355
},
{
"epoch": 0.22784,
"grad_norm": 0.3882343021011366,
"learning_rate": 2.7508336048513437e-05,
"loss": 0.2786,
"step": 356
},
{
"epoch": 0.22848,
"grad_norm": 0.3645455650133275,
"learning_rate": 2.7490769771057043e-05,
"loss": 0.2642,
"step": 357
},
{
"epoch": 0.22912,
"grad_norm": 0.36026260628073103,
"learning_rate": 2.747314743961675e-05,
"loss": 0.2639,
"step": 358
},
{
"epoch": 0.22976,
"grad_norm": 0.3515677194617887,
"learning_rate": 2.7455469133275095e-05,
"loss": 0.2814,
"step": 359
},
{
"epoch": 0.2304,
"grad_norm": 0.34391091885619035,
"learning_rate": 2.743773493136583e-05,
"loss": 0.2673,
"step": 360
},
{
"epoch": 0.23104,
"grad_norm": 0.3449879915659821,
"learning_rate": 2.7419944913473533e-05,
"loss": 0.2333,
"step": 361
},
{
"epoch": 0.23168,
"grad_norm": 0.31990363584739007,
"learning_rate": 2.7402099159433258e-05,
"loss": 0.2532,
"step": 362
},
{
"epoch": 0.23232,
"grad_norm": 0.3875818304841908,
"learning_rate": 2.73841977493302e-05,
"loss": 0.2811,
"step": 363
},
{
"epoch": 0.23296,
"grad_norm": 0.3566549850839257,
"learning_rate": 2.7366240763499302e-05,
"loss": 0.2507,
"step": 364
},
{
"epoch": 0.2336,
"grad_norm": 0.3080698897167787,
"learning_rate": 2.7348228282524916e-05,
"loss": 0.2603,
"step": 365
},
{
"epoch": 0.23424,
"grad_norm": 0.3357967412245159,
"learning_rate": 2.7330160387240443e-05,
"loss": 0.2451,
"step": 366
},
{
"epoch": 0.23488,
"grad_norm": 0.3527005868734707,
"learning_rate": 2.731203715872795e-05,
"loss": 0.275,
"step": 367
},
{
"epoch": 0.23552,
"grad_norm": 0.316985061769891,
"learning_rate": 2.729385867831783e-05,
"loss": 0.269,
"step": 368
},
{
"epoch": 0.23616,
"grad_norm": 0.33344202384537386,
"learning_rate": 2.7275625027588414e-05,
"loss": 0.2691,
"step": 369
},
{
"epoch": 0.2368,
"grad_norm": 0.34625417830177385,
"learning_rate": 2.7257336288365634e-05,
"loss": 0.2653,
"step": 370
},
{
"epoch": 0.23744,
"grad_norm": 0.3267922712231095,
"learning_rate": 2.7238992542722625e-05,
"loss": 0.2617,
"step": 371
},
{
"epoch": 0.23808,
"grad_norm": 0.330793248709517,
"learning_rate": 2.722059387297938e-05,
"loss": 0.2574,
"step": 372
},
{
"epoch": 0.23872,
"grad_norm": 0.3463369707262466,
"learning_rate": 2.720214036170236e-05,
"loss": 0.2627,
"step": 373
},
{
"epoch": 0.23936,
"grad_norm": 0.33043396116356993,
"learning_rate": 2.7183632091704143e-05,
"loss": 0.2708,
"step": 374
},
{
"epoch": 0.24,
"grad_norm": 0.3485722629331789,
"learning_rate": 2.716506914604305e-05,
"loss": 0.2612,
"step": 375
},
{
"epoch": 0.24064,
"grad_norm": 0.3412635617995565,
"learning_rate": 2.7146451608022748e-05,
"loss": 0.2607,
"step": 376
},
{
"epoch": 0.24128,
"grad_norm": 0.35685994147745786,
"learning_rate": 2.7127779561191905e-05,
"loss": 0.2577,
"step": 377
},
{
"epoch": 0.24192,
"grad_norm": 0.3039380817502753,
"learning_rate": 2.7109053089343815e-05,
"loss": 0.2611,
"step": 378
},
{
"epoch": 0.24256,
"grad_norm": 0.43279102042200523,
"learning_rate": 2.7090272276515993e-05,
"loss": 0.2756,
"step": 379
},
{
"epoch": 0.2432,
"grad_norm": 0.31054351672072755,
"learning_rate": 2.707143720698983e-05,
"loss": 0.2708,
"step": 380
},
{
"epoch": 0.24384,
"grad_norm": 0.36851176929762747,
"learning_rate": 2.7052547965290186e-05,
"loss": 0.276,
"step": 381
},
{
"epoch": 0.24448,
"grad_norm": 0.3627663453356185,
"learning_rate": 2.703360463618504e-05,
"loss": 0.2732,
"step": 382
},
{
"epoch": 0.24512,
"grad_norm": 0.3576702828639981,
"learning_rate": 2.7014607304685096e-05,
"loss": 0.2625,
"step": 383
},
{
"epoch": 0.24576,
"grad_norm": 0.3437585046911205,
"learning_rate": 2.699555605604339e-05,
"loss": 0.2717,
"step": 384
},
{
"epoch": 0.2464,
"grad_norm": 0.3563941457915153,
"learning_rate": 2.6976450975754923e-05,
"loss": 0.2599,
"step": 385
},
{
"epoch": 0.24704,
"grad_norm": 0.32499802120658194,
"learning_rate": 2.6957292149556276e-05,
"loss": 0.2677,
"step": 386
},
{
"epoch": 0.24768,
"grad_norm": 0.33344191861008443,
"learning_rate": 2.6938079663425218e-05,
"loss": 0.2882,
"step": 387
},
{
"epoch": 0.24832,
"grad_norm": 0.3517570106726394,
"learning_rate": 2.691881360358033e-05,
"loss": 0.2754,
"step": 388
},
{
"epoch": 0.24896,
"grad_norm": 0.3237247905400656,
"learning_rate": 2.6899494056480596e-05,
"loss": 0.2545,
"step": 389
},
{
"epoch": 0.2496,
"grad_norm": 0.3605247711441668,
"learning_rate": 2.6880121108825056e-05,
"loss": 0.2858,
"step": 390
},
{
"epoch": 0.25024,
"grad_norm": 0.3227764776694619,
"learning_rate": 2.6860694847552374e-05,
"loss": 0.2588,
"step": 391
},
{
"epoch": 0.25088,
"grad_norm": 0.30914534952022565,
"learning_rate": 2.6841215359840468e-05,
"loss": 0.265,
"step": 392
},
{
"epoch": 0.25152,
"grad_norm": 0.3040468595020032,
"learning_rate": 2.682168273310612e-05,
"loss": 0.2572,
"step": 393
},
{
"epoch": 0.25216,
"grad_norm": 0.3582628170978083,
"learning_rate": 2.680209705500458e-05,
"loss": 0.2692,
"step": 394
},
{
"epoch": 0.2528,
"grad_norm": 0.3288877765942096,
"learning_rate": 2.6782458413429177e-05,
"loss": 0.2556,
"step": 395
},
{
"epoch": 0.25344,
"grad_norm": 0.3483940416687742,
"learning_rate": 2.6762766896510914e-05,
"loss": 0.2509,
"step": 396
},
{
"epoch": 0.25408,
"grad_norm": 0.4148522716152593,
"learning_rate": 2.6743022592618075e-05,
"loss": 0.2855,
"step": 397
},
{
"epoch": 0.25472,
"grad_norm": 0.3153140469461542,
"learning_rate": 2.6723225590355852e-05,
"loss": 0.2422,
"step": 398
},
{
"epoch": 0.25536,
"grad_norm": 0.3414987747573693,
"learning_rate": 2.6703375978565906e-05,
"loss": 0.2566,
"step": 399
},
{
"epoch": 0.256,
"grad_norm": 0.3082723258062477,
"learning_rate": 2.6683473846326002e-05,
"loss": 0.2692,
"step": 400
},
{
"epoch": 0.25664,
"grad_norm": 0.3120921043255892,
"learning_rate": 2.666351928294959e-05,
"loss": 0.2484,
"step": 401
},
{
"epoch": 0.25728,
"grad_norm": 0.34491023806967935,
"learning_rate": 2.6643512377985423e-05,
"loss": 0.2634,
"step": 402
},
{
"epoch": 0.25792,
"grad_norm": 1.227578859205086,
"learning_rate": 2.6623453221217137e-05,
"loss": 0.2681,
"step": 403
},
{
"epoch": 0.25856,
"grad_norm": 0.36139208492186886,
"learning_rate": 2.660334190266285e-05,
"loss": 0.2781,
"step": 404
},
{
"epoch": 0.2592,
"grad_norm": 0.3146578283750008,
"learning_rate": 2.658317851257477e-05,
"loss": 0.2661,
"step": 405
},
{
"epoch": 0.25984,
"grad_norm": 0.3241167195918667,
"learning_rate": 2.6562963141438783e-05,
"loss": 0.2747,
"step": 406
},
{
"epoch": 0.26048,
"grad_norm": 0.32983966170598866,
"learning_rate": 2.6542695879974044e-05,
"loss": 0.2821,
"step": 407
},
{
"epoch": 0.26112,
"grad_norm": 0.3302761405872256,
"learning_rate": 2.652237681913257e-05,
"loss": 0.2677,
"step": 408
},
{
"epoch": 0.26176,
"grad_norm": 0.33848152131881326,
"learning_rate": 2.6502006050098842e-05,
"loss": 0.2542,
"step": 409
},
{
"epoch": 0.2624,
"grad_norm": 0.3413235227280745,
"learning_rate": 2.648158366428938e-05,
"loss": 0.2664,
"step": 410
},
{
"epoch": 0.26304,
"grad_norm": 0.32916338812223606,
"learning_rate": 2.6461109753352355e-05,
"loss": 0.2616,
"step": 411
},
{
"epoch": 0.26368,
"grad_norm": 0.3053892800899407,
"learning_rate": 2.6440584409167144e-05,
"loss": 0.2583,
"step": 412
},
{
"epoch": 0.26432,
"grad_norm": 0.3385379902087113,
"learning_rate": 2.6420007723843952e-05,
"loss": 0.256,
"step": 413
},
{
"epoch": 0.26496,
"grad_norm": 0.37571936271303413,
"learning_rate": 2.639937978972338e-05,
"loss": 0.2503,
"step": 414
},
{
"epoch": 0.2656,
"grad_norm": 0.2854100162554339,
"learning_rate": 2.6378700699376015e-05,
"loss": 0.2415,
"step": 415
},
{
"epoch": 0.26624,
"grad_norm": 0.31271921873205044,
"learning_rate": 2.6357970545602014e-05,
"loss": 0.2475,
"step": 416
},
{
"epoch": 0.26688,
"grad_norm": 0.34030180044647645,
"learning_rate": 2.6337189421430688e-05,
"loss": 0.2379,
"step": 417
},
{
"epoch": 0.26752,
"grad_norm": 0.4075617302343357,
"learning_rate": 2.631635742012009e-05,
"loss": 0.2742,
"step": 418
},
{
"epoch": 0.26816,
"grad_norm": 0.3607203044257088,
"learning_rate": 2.629547463515657e-05,
"loss": 0.2694,
"step": 419
},
{
"epoch": 0.2688,
"grad_norm": 0.3244858888191209,
"learning_rate": 2.6274541160254405e-05,
"loss": 0.2633,
"step": 420
},
{
"epoch": 0.26944,
"grad_norm": 0.3940598188912196,
"learning_rate": 2.6253557089355333e-05,
"loss": 0.2466,
"step": 421
},
{
"epoch": 0.27008,
"grad_norm": 0.3011262776053056,
"learning_rate": 2.6232522516628153e-05,
"loss": 0.2727,
"step": 422
},
{
"epoch": 0.27072,
"grad_norm": 0.3739061295982007,
"learning_rate": 2.6211437536468292e-05,
"loss": 0.2671,
"step": 423
},
{
"epoch": 0.27136,
"grad_norm": 0.32945664324477797,
"learning_rate": 2.6190302243497396e-05,
"loss": 0.2708,
"step": 424
},
{
"epoch": 0.272,
"grad_norm": 0.36534937038877574,
"learning_rate": 2.6169116732562892e-05,
"loss": 0.2298,
"step": 425
},
{
"epoch": 0.27264,
"grad_norm": 0.33670531669484066,
"learning_rate": 2.6147881098737562e-05,
"loss": 0.2681,
"step": 426
},
{
"epoch": 0.27328,
"grad_norm": 0.35965800175602797,
"learning_rate": 2.612659543731913e-05,
"loss": 0.2675,
"step": 427
},
{
"epoch": 0.27392,
"grad_norm": 0.3509912783523724,
"learning_rate": 2.610525984382982e-05,
"loss": 0.2752,
"step": 428
},
{
"epoch": 0.27456,
"grad_norm": 0.3341800947139574,
"learning_rate": 2.6083874414015934e-05,
"loss": 0.2651,
"step": 429
},
{
"epoch": 0.2752,
"grad_norm": 0.3512265177853723,
"learning_rate": 2.606243924384742e-05,
"loss": 0.2746,
"step": 430
},
{
"epoch": 0.27584,
"grad_norm": 0.3314007433299324,
"learning_rate": 2.6040954429517442e-05,
"loss": 0.2406,
"step": 431
},
{
"epoch": 0.27648,
"grad_norm": 0.31623472637208033,
"learning_rate": 2.6019420067441958e-05,
"loss": 0.2432,
"step": 432
},
{
"epoch": 0.27712,
"grad_norm": 0.33629626691329856,
"learning_rate": 2.599783625425926e-05,
"loss": 0.2642,
"step": 433
},
{
"epoch": 0.27776,
"grad_norm": 0.32932652640189397,
"learning_rate": 2.597620308682957e-05,
"loss": 0.2722,
"step": 434
},
{
"epoch": 0.2784,
"grad_norm": 0.3162553853731888,
"learning_rate": 2.5954520662234602e-05,
"loss": 0.253,
"step": 435
},
{
"epoch": 0.27904,
"grad_norm": 0.32294208690580367,
"learning_rate": 2.5932789077777103e-05,
"loss": 0.2453,
"step": 436
},
{
"epoch": 0.27968,
"grad_norm": 0.2860688977348506,
"learning_rate": 2.591100843098043e-05,
"loss": 0.2583,
"step": 437
},
{
"epoch": 0.28032,
"grad_norm": 0.3382649825825897,
"learning_rate": 2.5889178819588134e-05,
"loss": 0.243,
"step": 438
},
{
"epoch": 0.28096,
"grad_norm": 0.3426287563355019,
"learning_rate": 2.5867300341563477e-05,
"loss": 0.2832,
"step": 439
},
{
"epoch": 0.2816,
"grad_norm": 0.3616023005679438,
"learning_rate": 2.5845373095089028e-05,
"loss": 0.2586,
"step": 440
},
{
"epoch": 0.28224,
"grad_norm": 0.3680942120537055,
"learning_rate": 2.5823397178566217e-05,
"loss": 0.2449,
"step": 441
},
{
"epoch": 0.28288,
"grad_norm": 0.3266840109358284,
"learning_rate": 2.580137269061488e-05,
"loss": 0.2806,
"step": 442
},
{
"epoch": 0.28352,
"grad_norm": 0.3268151383316364,
"learning_rate": 2.5779299730072815e-05,
"loss": 0.264,
"step": 443
},
{
"epoch": 0.28416,
"grad_norm": 0.32651398276400556,
"learning_rate": 2.5757178395995358e-05,
"loss": 0.2672,
"step": 444
},
{
"epoch": 0.2848,
"grad_norm": 0.35504343797167337,
"learning_rate": 2.5735008787654933e-05,
"loss": 0.2563,
"step": 445
},
{
"epoch": 0.28544,
"grad_norm": 0.3332900931306675,
"learning_rate": 2.5712791004540592e-05,
"loss": 0.2679,
"step": 446
},
{
"epoch": 0.28608,
"grad_norm": 0.32153395285613084,
"learning_rate": 2.5690525146357575e-05,
"loss": 0.2572,
"step": 447
},
{
"epoch": 0.28672,
"grad_norm": 0.33090800287138983,
"learning_rate": 2.566821131302688e-05,
"loss": 0.2834,
"step": 448
},
{
"epoch": 0.28736,
"grad_norm": 0.336200932174992,
"learning_rate": 2.5645849604684775e-05,
"loss": 0.248,
"step": 449
},
{
"epoch": 0.288,
"grad_norm": 0.32237186462753925,
"learning_rate": 2.56234401216824e-05,
"loss": 0.2665,
"step": 450
},
{
"epoch": 0.28864,
"grad_norm": 0.3195380119597909,
"learning_rate": 2.5600982964585272e-05,
"loss": 0.2508,
"step": 451
},
{
"epoch": 0.28928,
"grad_norm": 0.3475060876563337,
"learning_rate": 2.5578478234172865e-05,
"loss": 0.2594,
"step": 452
},
{
"epoch": 0.28992,
"grad_norm": 0.627047630421656,
"learning_rate": 2.5555926031438134e-05,
"loss": 0.2485,
"step": 453
},
{
"epoch": 0.29056,
"grad_norm": 0.3005632921400129,
"learning_rate": 2.5533326457587072e-05,
"loss": 0.2363,
"step": 454
},
{
"epoch": 0.2912,
"grad_norm": 0.3907484843122131,
"learning_rate": 2.551067961403827e-05,
"loss": 0.2635,
"step": 455
},
{
"epoch": 0.29184,
"grad_norm": 0.30910241081220835,
"learning_rate": 2.5487985602422425e-05,
"loss": 0.2554,
"step": 456
},
{
"epoch": 0.29248,
"grad_norm": 0.3316657790671481,
"learning_rate": 2.546524452458193e-05,
"loss": 0.2465,
"step": 457
},
{
"epoch": 0.29312,
"grad_norm": 0.33514519152514977,
"learning_rate": 2.5442456482570383e-05,
"loss": 0.2509,
"step": 458
},
{
"epoch": 0.29376,
"grad_norm": 0.3135722919118478,
"learning_rate": 2.5419621578652137e-05,
"loss": 0.2584,
"step": 459
},
{
"epoch": 0.2944,
"grad_norm": 0.8767929120277721,
"learning_rate": 2.539673991530185e-05,
"loss": 0.2649,
"step": 460
},
{
"epoch": 0.29504,
"grad_norm": 0.32933106915081395,
"learning_rate": 2.5373811595204022e-05,
"loss": 0.2727,
"step": 461
},
{
"epoch": 0.29568,
"grad_norm": 0.4082581717544081,
"learning_rate": 2.5350836721252525e-05,
"loss": 0.2569,
"step": 462
},
{
"epoch": 0.29632,
"grad_norm": 0.3412102305943323,
"learning_rate": 2.5327815396550142e-05,
"loss": 0.2639,
"step": 463
},
{
"epoch": 0.29696,
"grad_norm": 0.31780550108168804,
"learning_rate": 2.530474772440812e-05,
"loss": 0.2418,
"step": 464
},
{
"epoch": 0.2976,
"grad_norm": 0.29529324755348607,
"learning_rate": 2.5281633808345702e-05,
"loss": 0.2471,
"step": 465
},
{
"epoch": 0.29824,
"grad_norm": 0.33728330149501135,
"learning_rate": 2.5258473752089636e-05,
"loss": 0.2801,
"step": 466
},
{
"epoch": 0.29888,
"grad_norm": 0.3267488434365671,
"learning_rate": 2.5235267659573746e-05,
"loss": 0.2414,
"step": 467
},
{
"epoch": 0.29952,
"grad_norm": 0.29150985108582134,
"learning_rate": 2.521201563493845e-05,
"loss": 0.265,
"step": 468
},
{
"epoch": 0.30016,
"grad_norm": 0.30336323784858,
"learning_rate": 2.5188717782530292e-05,
"loss": 0.2593,
"step": 469
},
{
"epoch": 0.3008,
"grad_norm": 0.29926798408682637,
"learning_rate": 2.516537420690146e-05,
"loss": 0.2604,
"step": 470
},
{
"epoch": 0.30144,
"grad_norm": 0.3065859162009593,
"learning_rate": 2.514198501280934e-05,
"loss": 0.2629,
"step": 471
},
{
"epoch": 0.30208,
"grad_norm": 0.3049740162502653,
"learning_rate": 2.5118550305216054e-05,
"loss": 0.2754,
"step": 472
},
{
"epoch": 0.30272,
"grad_norm": 0.30876372307121774,
"learning_rate": 2.5095070189287944e-05,
"loss": 0.2668,
"step": 473
},
{
"epoch": 0.30336,
"grad_norm": 0.29273269101927585,
"learning_rate": 2.5071544770395143e-05,
"loss": 0.2558,
"step": 474
},
{
"epoch": 0.304,
"grad_norm": 0.3076653994547958,
"learning_rate": 2.5047974154111092e-05,
"loss": 0.2489,
"step": 475
},
{
"epoch": 0.30464,
"grad_norm": 0.3333719011093153,
"learning_rate": 2.5024358446212046e-05,
"loss": 0.2435,
"step": 476
},
{
"epoch": 0.30528,
"grad_norm": 0.38541240875825966,
"learning_rate": 2.5000697752676622e-05,
"loss": 0.2791,
"step": 477
},
{
"epoch": 0.30592,
"grad_norm": 0.31076311872010254,
"learning_rate": 2.4976992179685317e-05,
"loss": 0.2693,
"step": 478
},
{
"epoch": 0.30656,
"grad_norm": 0.3140598892806967,
"learning_rate": 2.4953241833620034e-05,
"loss": 0.2453,
"step": 479
},
{
"epoch": 0.3072,
"grad_norm": 0.30866549699497225,
"learning_rate": 2.4929446821063596e-05,
"loss": 0.2494,
"step": 480
},
{
"epoch": 0.30784,
"grad_norm": 0.3057358078354188,
"learning_rate": 2.4905607248799265e-05,
"loss": 0.2583,
"step": 481
},
{
"epoch": 0.30848,
"grad_norm": 0.2832731794280749,
"learning_rate": 2.4881723223810295e-05,
"loss": 0.2687,
"step": 482
},
{
"epoch": 0.30912,
"grad_norm": 0.3051553661664977,
"learning_rate": 2.4857794853279396e-05,
"loss": 0.2599,
"step": 483
},
{
"epoch": 0.30976,
"grad_norm": 0.2998378381048481,
"learning_rate": 2.4833822244588312e-05,
"loss": 0.2451,
"step": 484
},
{
"epoch": 0.3104,
"grad_norm": 0.3174813787668605,
"learning_rate": 2.4809805505317296e-05,
"loss": 0.264,
"step": 485
},
{
"epoch": 0.31104,
"grad_norm": 0.28982862061791764,
"learning_rate": 2.4785744743244644e-05,
"loss": 0.2478,
"step": 486
},
{
"epoch": 0.31168,
"grad_norm": 0.3147239609211115,
"learning_rate": 2.4761640066346217e-05,
"loss": 0.2372,
"step": 487
},
{
"epoch": 0.31232,
"grad_norm": 0.2898871152482304,
"learning_rate": 2.4737491582794945e-05,
"loss": 0.2638,
"step": 488
},
{
"epoch": 0.31296,
"grad_norm": 0.33890483931828097,
"learning_rate": 2.4713299400960342e-05,
"loss": 0.2444,
"step": 489
},
{
"epoch": 0.3136,
"grad_norm": 0.28327565279880107,
"learning_rate": 2.4689063629408034e-05,
"loss": 0.272,
"step": 490
},
{
"epoch": 0.31424,
"grad_norm": 0.3212172431491676,
"learning_rate": 2.4664784376899257e-05,
"loss": 0.2545,
"step": 491
},
{
"epoch": 0.31488,
"grad_norm": 0.3367903918403918,
"learning_rate": 2.4640461752390367e-05,
"loss": 0.2632,
"step": 492
},
{
"epoch": 0.31552,
"grad_norm": 0.3230413061531243,
"learning_rate": 2.4616095865032366e-05,
"loss": 0.2825,
"step": 493
},
{
"epoch": 0.31616,
"grad_norm": 0.34034207513600895,
"learning_rate": 2.459168682417041e-05,
"loss": 0.271,
"step": 494
},
{
"epoch": 0.3168,
"grad_norm": 0.32141809222820283,
"learning_rate": 2.4567234739343283e-05,
"loss": 0.2396,
"step": 495
},
{
"epoch": 0.31744,
"grad_norm": 0.2753406394259948,
"learning_rate": 2.454273972028297e-05,
"loss": 0.2404,
"step": 496
},
{
"epoch": 0.31808,
"grad_norm": 0.3351114179983899,
"learning_rate": 2.451820187691411e-05,
"loss": 0.2763,
"step": 497
},
{
"epoch": 0.31872,
"grad_norm": 0.3513219675385064,
"learning_rate": 2.4493621319353525e-05,
"loss": 0.2609,
"step": 498
},
{
"epoch": 0.31936,
"grad_norm": 0.3110180234096309,
"learning_rate": 2.4468998157909723e-05,
"loss": 0.2402,
"step": 499
},
{
"epoch": 0.32,
"grad_norm": 0.2964890570904897,
"learning_rate": 2.444433250308241e-05,
"loss": 0.2659,
"step": 500
},
{
"epoch": 0.32064,
"grad_norm": 0.3579764939490037,
"learning_rate": 2.4419624465561964e-05,
"loss": 0.2514,
"step": 501
},
{
"epoch": 0.32128,
"grad_norm": 0.3144597113409246,
"learning_rate": 2.4394874156228988e-05,
"loss": 0.2608,
"step": 502
},
{
"epoch": 0.32192,
"grad_norm": 0.31609390625354067,
"learning_rate": 2.4370081686153767e-05,
"loss": 0.2443,
"step": 503
},
{
"epoch": 0.32256,
"grad_norm": 0.29746982935843747,
"learning_rate": 2.4345247166595803e-05,
"loss": 0.2377,
"step": 504
},
{
"epoch": 0.3232,
"grad_norm": 0.34126278199611504,
"learning_rate": 2.4320370709003284e-05,
"loss": 0.2554,
"step": 505
},
{
"epoch": 0.32384,
"grad_norm": 0.2990237684037871,
"learning_rate": 2.429545242501261e-05,
"loss": 0.2848,
"step": 506
},
{
"epoch": 0.32448,
"grad_norm": 0.32322206092320094,
"learning_rate": 2.4270492426447884e-05,
"loss": 0.2561,
"step": 507
},
{
"epoch": 0.32512,
"grad_norm": 0.3260943452277802,
"learning_rate": 2.424549082532041e-05,
"loss": 0.26,
"step": 508
},
{
"epoch": 0.32576,
"grad_norm": 0.4295758739365506,
"learning_rate": 2.422044773382817e-05,
"loss": 0.2513,
"step": 509
},
{
"epoch": 0.3264,
"grad_norm": 0.31835152340497586,
"learning_rate": 2.4195363264355365e-05,
"loss": 0.2505,
"step": 510
},
{
"epoch": 0.32704,
"grad_norm": 0.3531418180586244,
"learning_rate": 2.417023752947188e-05,
"loss": 0.2609,
"step": 511
},
{
"epoch": 0.32768,
"grad_norm": 0.28968278915332163,
"learning_rate": 2.4145070641932767e-05,
"loss": 0.2472,
"step": 512
},
{
"epoch": 0.32832,
"grad_norm": 0.3176579318606065,
"learning_rate": 2.4119862714677773e-05,
"loss": 0.2563,
"step": 513
},
{
"epoch": 0.32896,
"grad_norm": 0.3447755475779212,
"learning_rate": 2.4094613860830813e-05,
"loss": 0.299,
"step": 514
},
{
"epoch": 0.3296,
"grad_norm": 0.30246755944222914,
"learning_rate": 2.4069324193699453e-05,
"loss": 0.2285,
"step": 515
},
{
"epoch": 0.33024,
"grad_norm": 0.36003409699329864,
"learning_rate": 2.4043993826774433e-05,
"loss": 0.2473,
"step": 516
},
{
"epoch": 0.33088,
"grad_norm": 0.35159179010717023,
"learning_rate": 2.4018622873729136e-05,
"loss": 0.2654,
"step": 517
},
{
"epoch": 0.33152,
"grad_norm": 0.30073111767960253,
"learning_rate": 2.3993211448419055e-05,
"loss": 0.249,
"step": 518
},
{
"epoch": 0.33216,
"grad_norm": 0.3319898092071986,
"learning_rate": 2.3967759664881347e-05,
"loss": 0.2737,
"step": 519
},
{
"epoch": 0.3328,
"grad_norm": 0.4467760115064727,
"learning_rate": 2.394226763733425e-05,
"loss": 0.2585,
"step": 520
},
{
"epoch": 0.33344,
"grad_norm": 0.3041939438386619,
"learning_rate": 2.3916735480176618e-05,
"loss": 0.2266,
"step": 521
},
{
"epoch": 0.33408,
"grad_norm": 0.3026737354404247,
"learning_rate": 2.389116330798739e-05,
"loss": 0.2627,
"step": 522
},
{
"epoch": 0.33472,
"grad_norm": 0.36457095372526566,
"learning_rate": 2.3865551235525073e-05,
"loss": 0.2535,
"step": 523
},
{
"epoch": 0.33536,
"grad_norm": 0.3111554932975257,
"learning_rate": 2.3839899377727237e-05,
"loss": 0.249,
"step": 524
},
{
"epoch": 0.336,
"grad_norm": 0.2873311087186447,
"learning_rate": 2.381420784970999e-05,
"loss": 0.2808,
"step": 525
},
{
"epoch": 0.33664,
"grad_norm": 0.31448944336119605,
"learning_rate": 2.378847676676747e-05,
"loss": 0.2599,
"step": 526
},
{
"epoch": 0.33728,
"grad_norm": 0.3105232703780657,
"learning_rate": 2.3762706244371314e-05,
"loss": 0.2475,
"step": 527
},
{
"epoch": 0.33792,
"grad_norm": 0.2918281600818534,
"learning_rate": 2.3736896398170163e-05,
"loss": 0.2481,
"step": 528
},
{
"epoch": 0.33856,
"grad_norm": 0.3139402887967583,
"learning_rate": 2.3711047343989115e-05,
"loss": 0.2751,
"step": 529
},
{
"epoch": 0.3392,
"grad_norm": 0.2999435271292121,
"learning_rate": 2.368515919782923e-05,
"loss": 0.2414,
"step": 530
},
{
"epoch": 0.33984,
"grad_norm": 0.31432103076673396,
"learning_rate": 2.3659232075866982e-05,
"loss": 0.2571,
"step": 531
},
{
"epoch": 0.34048,
"grad_norm": 0.3208208970926117,
"learning_rate": 2.363326609445378e-05,
"loss": 0.2523,
"step": 532
},
{
"epoch": 0.34112,
"grad_norm": 0.3366034671455189,
"learning_rate": 2.3607261370115397e-05,
"loss": 0.2643,
"step": 533
},
{
"epoch": 0.34176,
"grad_norm": 0.30511352998629143,
"learning_rate": 2.3581218019551475e-05,
"loss": 0.2528,
"step": 534
},
{
"epoch": 0.3424,
"grad_norm": 0.2985896860812654,
"learning_rate": 2.3555136159635e-05,
"loss": 0.2657,
"step": 535
},
{
"epoch": 0.34304,
"grad_norm": 0.2900503900856783,
"learning_rate": 2.3529015907411782e-05,
"loss": 0.2503,
"step": 536
},
{
"epoch": 0.34368,
"grad_norm": 0.28173595548277097,
"learning_rate": 2.3502857380099907e-05,
"loss": 0.2681,
"step": 537
},
{
"epoch": 0.34432,
"grad_norm": 0.2998778456549225,
"learning_rate": 2.3476660695089232e-05,
"loss": 0.251,
"step": 538
},
{
"epoch": 0.34496,
"grad_norm": 0.30133046241449024,
"learning_rate": 2.345042596994085e-05,
"loss": 0.2683,
"step": 539
},
{
"epoch": 0.3456,
"grad_norm": 0.339364481773199,
"learning_rate": 2.3424153322386567e-05,
"loss": 0.2386,
"step": 540
},
{
"epoch": 0.34624,
"grad_norm": 0.30211757676321005,
"learning_rate": 2.3397842870328366e-05,
"loss": 0.2395,
"step": 541
},
{
"epoch": 0.34688,
"grad_norm": 0.28986596928919606,
"learning_rate": 2.3371494731837888e-05,
"loss": 0.2388,
"step": 542
},
{
"epoch": 0.34752,
"grad_norm": 0.2995992340705576,
"learning_rate": 2.334510902515589e-05,
"loss": 0.2653,
"step": 543
},
{
"epoch": 0.34816,
"grad_norm": 0.3101278774802601,
"learning_rate": 2.331868586869174e-05,
"loss": 0.2432,
"step": 544
},
{
"epoch": 0.3488,
"grad_norm": 0.3244592292118408,
"learning_rate": 2.329222538102284e-05,
"loss": 0.2542,
"step": 545
},
{
"epoch": 0.34944,
"grad_norm": 0.3283217812360253,
"learning_rate": 2.326572768089413e-05,
"loss": 0.2633,
"step": 546
},
{
"epoch": 0.35008,
"grad_norm": 0.29818778249713357,
"learning_rate": 2.3239192887217557e-05,
"loss": 0.2322,
"step": 547
},
{
"epoch": 0.35072,
"grad_norm": 0.30402576136874054,
"learning_rate": 2.3212621119071524e-05,
"loss": 0.2595,
"step": 548
},
{
"epoch": 0.35136,
"grad_norm": 0.3102857808050957,
"learning_rate": 2.318601249570035e-05,
"loss": 0.2395,
"step": 549
},
{
"epoch": 0.352,
"grad_norm": 0.3212647408602525,
"learning_rate": 2.315936713651378e-05,
"loss": 0.2465,
"step": 550
},
{
"epoch": 0.35264,
"grad_norm": 0.31569595485115426,
"learning_rate": 2.3132685161086363e-05,
"loss": 0.2534,
"step": 551
},
{
"epoch": 0.35328,
"grad_norm": 0.28971716623099575,
"learning_rate": 2.3105966689157016e-05,
"loss": 0.2561,
"step": 552
},
{
"epoch": 0.35392,
"grad_norm": 0.2801164703529546,
"learning_rate": 2.3079211840628424e-05,
"loss": 0.2464,
"step": 553
},
{
"epoch": 0.35456,
"grad_norm": 0.3186204827863453,
"learning_rate": 2.30524207355665e-05,
"loss": 0.2618,
"step": 554
},
{
"epoch": 0.3552,
"grad_norm": 0.32778556243620743,
"learning_rate": 2.3025593494199885e-05,
"loss": 0.2427,
"step": 555
},
{
"epoch": 0.35584,
"grad_norm": 0.2913558771027596,
"learning_rate": 2.299873023691938e-05,
"loss": 0.2353,
"step": 556
},
{
"epoch": 0.35648,
"grad_norm": 0.3031256464779152,
"learning_rate": 2.29718310842774e-05,
"loss": 0.2436,
"step": 557
},
{
"epoch": 0.35712,
"grad_norm": 0.33268039473458894,
"learning_rate": 2.294489615698747e-05,
"loss": 0.2592,
"step": 558
},
{
"epoch": 0.35776,
"grad_norm": 0.29522664892298234,
"learning_rate": 2.2917925575923635e-05,
"loss": 0.26,
"step": 559
},
{
"epoch": 0.3584,
"grad_norm": 0.3286262825442643,
"learning_rate": 2.289091946211995e-05,
"loss": 0.2605,
"step": 560
},
{
"epoch": 0.35904,
"grad_norm": 0.3246361703020228,
"learning_rate": 2.286387793676993e-05,
"loss": 0.261,
"step": 561
},
{
"epoch": 0.35968,
"grad_norm": 0.28944082962457157,
"learning_rate": 2.2836801121226008e-05,
"loss": 0.2328,
"step": 562
},
{
"epoch": 0.36032,
"grad_norm": 0.31103631358812533,
"learning_rate": 2.2809689136998966e-05,
"loss": 0.241,
"step": 563
},
{
"epoch": 0.36096,
"grad_norm": 0.3449549892583192,
"learning_rate": 2.278254210575744e-05,
"loss": 0.2445,
"step": 564
},
{
"epoch": 0.3616,
"grad_norm": 0.3157228500256909,
"learning_rate": 2.275536014932732e-05,
"loss": 0.2509,
"step": 565
},
{
"epoch": 0.36224,
"grad_norm": 0.30095023154019934,
"learning_rate": 2.272814338969124e-05,
"loss": 0.2487,
"step": 566
},
{
"epoch": 0.36288,
"grad_norm": 0.337477979450585,
"learning_rate": 2.2700891948988006e-05,
"loss": 0.2542,
"step": 567
},
{
"epoch": 0.36352,
"grad_norm": 0.2928546697145262,
"learning_rate": 2.2673605949512082e-05,
"loss": 0.2554,
"step": 568
},
{
"epoch": 0.36416,
"grad_norm": 0.32020331341086156,
"learning_rate": 2.264628551371299e-05,
"loss": 0.2709,
"step": 569
},
{
"epoch": 0.3648,
"grad_norm": 0.3289347732911309,
"learning_rate": 2.261893076419482e-05,
"loss": 0.2348,
"step": 570
},
{
"epoch": 0.36544,
"grad_norm": 0.3102608521013213,
"learning_rate": 2.2591541823715615e-05,
"loss": 0.2642,
"step": 571
},
{
"epoch": 0.36608,
"grad_norm": 0.34028430297253665,
"learning_rate": 2.256411881518689e-05,
"loss": 0.2463,
"step": 572
},
{
"epoch": 0.36672,
"grad_norm": 0.31129961600145073,
"learning_rate": 2.253666186167301e-05,
"loss": 0.2453,
"step": 573
},
{
"epoch": 0.36736,
"grad_norm": 0.2658006077381884,
"learning_rate": 2.2509171086390715e-05,
"loss": 0.2389,
"step": 574
},
{
"epoch": 0.368,
"grad_norm": 0.3792894727601205,
"learning_rate": 2.248164661270849e-05,
"loss": 0.2482,
"step": 575
},
{
"epoch": 0.36864,
"grad_norm": 0.27490046335554513,
"learning_rate": 2.245408856414605e-05,
"loss": 0.2694,
"step": 576
},
{
"epoch": 0.36928,
"grad_norm": 0.37579350044221205,
"learning_rate": 2.2426497064373797e-05,
"loss": 0.2243,
"step": 577
},
{
"epoch": 0.36992,
"grad_norm": 0.3261363751456689,
"learning_rate": 2.239887223721223e-05,
"loss": 0.2497,
"step": 578
},
{
"epoch": 0.37056,
"grad_norm": 0.34380133531131896,
"learning_rate": 2.2371214206631433e-05,
"loss": 0.2366,
"step": 579
},
{
"epoch": 0.3712,
"grad_norm": 0.3663930396216925,
"learning_rate": 2.234352309675048e-05,
"loss": 0.2659,
"step": 580
},
{
"epoch": 0.37184,
"grad_norm": 0.3144693208874203,
"learning_rate": 2.2315799031836887e-05,
"loss": 0.2289,
"step": 581
},
{
"epoch": 0.37248,
"grad_norm": 0.3393482246858389,
"learning_rate": 2.2288042136306076e-05,
"loss": 0.2695,
"step": 582
},
{
"epoch": 0.37312,
"grad_norm": 0.3749005999238712,
"learning_rate": 2.2260252534720783e-05,
"loss": 0.2457,
"step": 583
},
{
"epoch": 0.37376,
"grad_norm": 0.3150815215156016,
"learning_rate": 2.2232430351790533e-05,
"loss": 0.2593,
"step": 584
},
{
"epoch": 0.3744,
"grad_norm": 0.31855318577485014,
"learning_rate": 2.220457571237105e-05,
"loss": 0.254,
"step": 585
},
{
"epoch": 0.37504,
"grad_norm": 0.3492735691294561,
"learning_rate": 2.2176688741463732e-05,
"loss": 0.2602,
"step": 586
},
{
"epoch": 0.37568,
"grad_norm": 0.29164067568396596,
"learning_rate": 2.2148769564215045e-05,
"loss": 0.2761,
"step": 587
},
{
"epoch": 0.37632,
"grad_norm": 0.30406557246274263,
"learning_rate": 2.2120818305915997e-05,
"loss": 0.2447,
"step": 588
},
{
"epoch": 0.37696,
"grad_norm": 0.35536363697863144,
"learning_rate": 2.209283509200156e-05,
"loss": 0.2394,
"step": 589
},
{
"epoch": 0.3776,
"grad_norm": 0.2847948409258987,
"learning_rate": 2.2064820048050113e-05,
"loss": 0.2421,
"step": 590
},
{
"epoch": 0.37824,
"grad_norm": 0.3020189567244142,
"learning_rate": 2.203677329978288e-05,
"loss": 0.242,
"step": 591
},
{
"epoch": 0.37888,
"grad_norm": 0.31145773031624513,
"learning_rate": 2.200869497306336e-05,
"loss": 0.247,
"step": 592
},
{
"epoch": 0.37952,
"grad_norm": 0.3092351679740992,
"learning_rate": 2.198058519389676e-05,
"loss": 0.2481,
"step": 593
},
{
"epoch": 0.38016,
"grad_norm": 0.28783865305151457,
"learning_rate": 2.1952444088429444e-05,
"loss": 0.237,
"step": 594
},
{
"epoch": 0.3808,
"grad_norm": 0.3053618118372085,
"learning_rate": 2.1924271782948342e-05,
"loss": 0.2548,
"step": 595
},
{
"epoch": 0.38144,
"grad_norm": 0.3441276743502453,
"learning_rate": 2.1896068403880408e-05,
"loss": 0.2522,
"step": 596
},
{
"epoch": 0.38208,
"grad_norm": 0.28759508000199346,
"learning_rate": 2.1867834077792045e-05,
"loss": 0.2472,
"step": 597
},
{
"epoch": 0.38272,
"grad_norm": 0.32884139441178306,
"learning_rate": 2.183956893138852e-05,
"loss": 0.2377,
"step": 598
},
{
"epoch": 0.38336,
"grad_norm": 0.3184809395657912,
"learning_rate": 2.181127309151344e-05,
"loss": 0.2395,
"step": 599
},
{
"epoch": 0.384,
"grad_norm": 0.3579225369104567,
"learning_rate": 2.1782946685148126e-05,
"loss": 0.2362,
"step": 600
},
{
"epoch": 0.38464,
"grad_norm": 0.3668180890999166,
"learning_rate": 2.175458983941108e-05,
"loss": 0.275,
"step": 601
},
{
"epoch": 0.38528,
"grad_norm": 0.284016254548238,
"learning_rate": 2.1726202681557398e-05,
"loss": 0.2353,
"step": 602
},
{
"epoch": 0.38592,
"grad_norm": 0.3572757014239311,
"learning_rate": 2.1697785338978215e-05,
"loss": 0.2354,
"step": 603
},
{
"epoch": 0.38656,
"grad_norm": 0.2977040020384556,
"learning_rate": 2.166933793920012e-05,
"loss": 0.232,
"step": 604
},
{
"epoch": 0.3872,
"grad_norm": 0.30172127232859347,
"learning_rate": 2.1640860609884588e-05,
"loss": 0.2341,
"step": 605
},
{
"epoch": 0.38784,
"grad_norm": 0.34442418696488436,
"learning_rate": 2.161235347882741e-05,
"loss": 0.2507,
"step": 606
},
{
"epoch": 0.38848,
"grad_norm": 0.2948413370383127,
"learning_rate": 2.1583816673958104e-05,
"loss": 0.2499,
"step": 607
},
{
"epoch": 0.38912,
"grad_norm": 0.30004465284599646,
"learning_rate": 2.1555250323339368e-05,
"loss": 0.2437,
"step": 608
},
{
"epoch": 0.38976,
"grad_norm": 0.3099575774566211,
"learning_rate": 2.1526654555166484e-05,
"loss": 0.2511,
"step": 609
},
{
"epoch": 0.3904,
"grad_norm": 0.29571761737867036,
"learning_rate": 2.1498029497766744e-05,
"loss": 0.2552,
"step": 610
},
{
"epoch": 0.39104,
"grad_norm": 0.3029984741223696,
"learning_rate": 2.1469375279598898e-05,
"loss": 0.2479,
"step": 611
},
{
"epoch": 0.39168,
"grad_norm": 0.3182893728592652,
"learning_rate": 2.144069202925253e-05,
"loss": 0.2456,
"step": 612
},
{
"epoch": 0.39232,
"grad_norm": 0.3256336345330972,
"learning_rate": 2.1411979875447535e-05,
"loss": 0.2326,
"step": 613
},
{
"epoch": 0.39296,
"grad_norm": 0.3213948031280962,
"learning_rate": 2.1383238947033504e-05,
"loss": 0.255,
"step": 614
},
{
"epoch": 0.3936,
"grad_norm": 0.29013566998871226,
"learning_rate": 2.1354469372989158e-05,
"loss": 0.2663,
"step": 615
},
{
"epoch": 0.39424,
"grad_norm": 0.34420320849332475,
"learning_rate": 2.132567128242178e-05,
"loss": 0.2553,
"step": 616
},
{
"epoch": 0.39488,
"grad_norm": 0.3132567476522736,
"learning_rate": 2.12968448045666e-05,
"loss": 0.2573,
"step": 617
},
{
"epoch": 0.39552,
"grad_norm": 0.3090743818474423,
"learning_rate": 2.1267990068786278e-05,
"loss": 0.2382,
"step": 618
},
{
"epoch": 0.39616,
"grad_norm": 0.2985193819257022,
"learning_rate": 2.1239107204570245e-05,
"loss": 0.278,
"step": 619
},
{
"epoch": 0.3968,
"grad_norm": 0.3202824851156904,
"learning_rate": 2.1210196341534182e-05,
"loss": 0.2349,
"step": 620
},
{
"epoch": 0.39744,
"grad_norm": 0.28647763445799634,
"learning_rate": 2.1181257609419422e-05,
"loss": 0.2499,
"step": 621
},
{
"epoch": 0.39808,
"grad_norm": 0.3013665694615083,
"learning_rate": 2.115229113809236e-05,
"loss": 0.247,
"step": 622
},
{
"epoch": 0.39872,
"grad_norm": 0.2914815601166263,
"learning_rate": 2.1123297057543864e-05,
"loss": 0.2482,
"step": 623
},
{
"epoch": 0.39936,
"grad_norm": 0.27858576448210315,
"learning_rate": 2.109427549788872e-05,
"loss": 0.2403,
"step": 624
},
{
"epoch": 0.4,
"grad_norm": 0.2811765312335858,
"learning_rate": 2.106522658936502e-05,
"loss": 0.2276,
"step": 625
},
{
"epoch": 0.40064,
"grad_norm": 0.2975257782496434,
"learning_rate": 2.1036150462333585e-05,
"loss": 0.2868,
"step": 626
},
{
"epoch": 0.40128,
"grad_norm": 0.3101473385119221,
"learning_rate": 2.100704724727739e-05,
"loss": 0.2394,
"step": 627
},
{
"epoch": 0.40192,
"grad_norm": 0.2753240305984353,
"learning_rate": 2.0977917074800966e-05,
"loss": 0.253,
"step": 628
},
{
"epoch": 0.40256,
"grad_norm": 0.33128143290767126,
"learning_rate": 2.0948760075629816e-05,
"loss": 0.2431,
"step": 629
},
{
"epoch": 0.4032,
"grad_norm": 0.29785787572122974,
"learning_rate": 2.0919576380609847e-05,
"loss": 0.2403,
"step": 630
},
{
"epoch": 0.40384,
"grad_norm": 0.31021479700099863,
"learning_rate": 2.0890366120706748e-05,
"loss": 0.2412,
"step": 631
},
{
"epoch": 0.40448,
"grad_norm": 0.32003709399269986,
"learning_rate": 2.086112942700543e-05,
"loss": 0.2428,
"step": 632
},
{
"epoch": 0.40512,
"grad_norm": 0.28802313751198955,
"learning_rate": 2.083186643070943e-05,
"loss": 0.2509,
"step": 633
},
{
"epoch": 0.40576,
"grad_norm": 0.30877621913191516,
"learning_rate": 2.0802577263140323e-05,
"loss": 0.2503,
"step": 634
},
{
"epoch": 0.4064,
"grad_norm": 0.2798340868739933,
"learning_rate": 2.0773262055737122e-05,
"loss": 0.2586,
"step": 635
},
{
"epoch": 0.40704,
"grad_norm": 0.29456576669510637,
"learning_rate": 2.074392094005571e-05,
"loss": 0.2425,
"step": 636
},
{
"epoch": 0.40768,
"grad_norm": 0.2954614927035877,
"learning_rate": 2.0714554047768224e-05,
"loss": 0.2516,
"step": 637
},
{
"epoch": 0.40832,
"grad_norm": 0.29091464782031107,
"learning_rate": 2.068516151066249e-05,
"loss": 0.2463,
"step": 638
},
{
"epoch": 0.40896,
"grad_norm": 0.2971818958989882,
"learning_rate": 2.0655743460641403e-05,
"loss": 0.2279,
"step": 639
},
{
"epoch": 0.4096,
"grad_norm": 0.26433254320966026,
"learning_rate": 2.062630002972237e-05,
"loss": 0.2363,
"step": 640
},
{
"epoch": 0.41024,
"grad_norm": 0.3354545785932358,
"learning_rate": 2.0596831350036674e-05,
"loss": 0.2418,
"step": 641
},
{
"epoch": 0.41088,
"grad_norm": 0.3109013845802427,
"learning_rate": 2.0567337553828935e-05,
"loss": 0.2524,
"step": 642
},
{
"epoch": 0.41152,
"grad_norm": 0.3402728181158717,
"learning_rate": 2.0537818773456458e-05,
"loss": 0.2274,
"step": 643
},
{
"epoch": 0.41216,
"grad_norm": 0.31342814288430576,
"learning_rate": 2.0508275141388684e-05,
"loss": 0.2761,
"step": 644
},
{
"epoch": 0.4128,
"grad_norm": 0.2942734303684883,
"learning_rate": 2.047870679020657e-05,
"loss": 0.2235,
"step": 645
},
{
"epoch": 0.41344,
"grad_norm": 0.33039525789141777,
"learning_rate": 2.044911385260202e-05,
"loss": 0.2335,
"step": 646
},
{
"epoch": 0.41408,
"grad_norm": 0.2767673737540941,
"learning_rate": 2.0419496461377253e-05,
"loss": 0.2438,
"step": 647
},
{
"epoch": 0.41472,
"grad_norm": 0.36279599663063367,
"learning_rate": 2.0389854749444232e-05,
"loss": 0.2528,
"step": 648
},
{
"epoch": 0.41536,
"grad_norm": 0.29068552338360093,
"learning_rate": 2.0360188849824076e-05,
"loss": 0.2687,
"step": 649
},
{
"epoch": 0.416,
"grad_norm": 0.300468767033639,
"learning_rate": 2.033049889564643e-05,
"loss": 0.2415,
"step": 650
},
{
"epoch": 0.41664,
"grad_norm": 0.31567351162652707,
"learning_rate": 2.03007850201489e-05,
"loss": 0.2412,
"step": 651
},
{
"epoch": 0.41728,
"grad_norm": 0.2960445310895459,
"learning_rate": 2.0271047356676448e-05,
"loss": 0.2672,
"step": 652
},
{
"epoch": 0.41792,
"grad_norm": 0.3137476648522256,
"learning_rate": 2.0241286038680756e-05,
"loss": 0.2505,
"step": 653
},
{
"epoch": 0.41856,
"grad_norm": 0.2903028208499828,
"learning_rate": 2.0211501199719704e-05,
"loss": 0.2328,
"step": 654
},
{
"epoch": 0.4192,
"grad_norm": 0.3042506515512182,
"learning_rate": 2.0181692973456686e-05,
"loss": 0.2537,
"step": 655
},
{
"epoch": 0.41984,
"grad_norm": 0.29549374179951227,
"learning_rate": 2.015186149366007e-05,
"loss": 0.2538,
"step": 656
},
{
"epoch": 0.42048,
"grad_norm": 0.3194068056008517,
"learning_rate": 2.0122006894202577e-05,
"loss": 0.2331,
"step": 657
},
{
"epoch": 0.42112,
"grad_norm": 0.2719676743833399,
"learning_rate": 2.0092129309060672e-05,
"loss": 0.2514,
"step": 658
},
{
"epoch": 0.42176,
"grad_norm": 0.3289205044581153,
"learning_rate": 2.0062228872313976e-05,
"loss": 0.2584,
"step": 659
},
{
"epoch": 0.4224,
"grad_norm": 0.2923593537514368,
"learning_rate": 2.0032305718144665e-05,
"loss": 0.2578,
"step": 660
},
{
"epoch": 0.42304,
"grad_norm": 0.29063317542597716,
"learning_rate": 2.0002359980836853e-05,
"loss": 0.2384,
"step": 661
},
{
"epoch": 0.42368,
"grad_norm": 0.28894947654548037,
"learning_rate": 1.9972391794776003e-05,
"loss": 0.2409,
"step": 662
},
{
"epoch": 0.42432,
"grad_norm": 0.3098536165538198,
"learning_rate": 1.994240129444832e-05,
"loss": 0.2311,
"step": 663
},
{
"epoch": 0.42496,
"grad_norm": 0.3060220488640442,
"learning_rate": 1.9912388614440146e-05,
"loss": 0.2317,
"step": 664
},
{
"epoch": 0.4256,
"grad_norm": 0.28385024763607003,
"learning_rate": 1.988235388943736e-05,
"loss": 0.2625,
"step": 665
},
{
"epoch": 0.42624,
"grad_norm": 0.3250332428858291,
"learning_rate": 1.985229725422477e-05,
"loss": 0.2509,
"step": 666
},
{
"epoch": 0.42688,
"grad_norm": 0.27609914066722663,
"learning_rate": 1.98222188436855e-05,
"loss": 0.2515,
"step": 667
},
{
"epoch": 0.42752,
"grad_norm": 0.3764195638951758,
"learning_rate": 1.979211879280041e-05,
"loss": 0.2384,
"step": 668
},
{
"epoch": 0.42816,
"grad_norm": 0.2882112064916445,
"learning_rate": 1.9761997236647466e-05,
"loss": 0.2486,
"step": 669
},
{
"epoch": 0.4288,
"grad_norm": 0.3031253323342361,
"learning_rate": 1.9731854310401133e-05,
"loss": 0.2465,
"step": 670
},
{
"epoch": 0.42944,
"grad_norm": 0.31461387253551104,
"learning_rate": 1.970169014933179e-05,
"loss": 0.2368,
"step": 671
},
{
"epoch": 0.43008,
"grad_norm": 0.321768024600862,
"learning_rate": 1.967150488880511e-05,
"loss": 0.2651,
"step": 672
},
{
"epoch": 0.43072,
"grad_norm": 0.31937360402288967,
"learning_rate": 1.964129866428144e-05,
"loss": 0.2658,
"step": 673
},
{
"epoch": 0.43136,
"grad_norm": 0.28909986387996445,
"learning_rate": 1.9611071611315225e-05,
"loss": 0.2382,
"step": 674
},
{
"epoch": 0.432,
"grad_norm": 0.29848411611660336,
"learning_rate": 1.9580823865554353e-05,
"loss": 0.2193,
"step": 675
},
{
"epoch": 0.43264,
"grad_norm": 0.3251801996349692,
"learning_rate": 1.95505555627396e-05,
"loss": 0.2633,
"step": 676
},
{
"epoch": 0.43328,
"grad_norm": 0.31621533846449545,
"learning_rate": 1.952026683870398e-05,
"loss": 0.2465,
"step": 677
},
{
"epoch": 0.43392,
"grad_norm": 0.2690978345819651,
"learning_rate": 1.9489957829372165e-05,
"loss": 0.2234,
"step": 678
},
{
"epoch": 0.43456,
"grad_norm": 0.2834458494566235,
"learning_rate": 1.9459628670759837e-05,
"loss": 0.2576,
"step": 679
},
{
"epoch": 0.4352,
"grad_norm": 0.29167897883221355,
"learning_rate": 1.9429279498973116e-05,
"loss": 0.2557,
"step": 680
},
{
"epoch": 0.43584,
"grad_norm": 0.30646443869067846,
"learning_rate": 1.939891045020793e-05,
"loss": 0.248,
"step": 681
},
{
"epoch": 0.43648,
"grad_norm": 0.27473410735634074,
"learning_rate": 1.936852166074941e-05,
"loss": 0.2543,
"step": 682
},
{
"epoch": 0.43712,
"grad_norm": 0.2639389373623012,
"learning_rate": 1.9338113266971276e-05,
"loss": 0.2197,
"step": 683
},
{
"epoch": 0.43776,
"grad_norm": 0.3075802635764548,
"learning_rate": 1.9307685405335217e-05,
"loss": 0.2433,
"step": 684
},
{
"epoch": 0.4384,
"grad_norm": 0.270502938103472,
"learning_rate": 1.9277238212390296e-05,
"loss": 0.2474,
"step": 685
},
{
"epoch": 0.43904,
"grad_norm": 0.2838293202736154,
"learning_rate": 1.9246771824772326e-05,
"loss": 0.2266,
"step": 686
},
{
"epoch": 0.43968,
"grad_norm": 0.2853706779734916,
"learning_rate": 1.9216286379203257e-05,
"loss": 0.2461,
"step": 687
},
{
"epoch": 0.44032,
"grad_norm": 0.29829572135331384,
"learning_rate": 1.9185782012490564e-05,
"loss": 0.22,
"step": 688
},
{
"epoch": 0.44096,
"grad_norm": 0.2766232059234513,
"learning_rate": 1.9155258861526627e-05,
"loss": 0.2315,
"step": 689
},
{
"epoch": 0.4416,
"grad_norm": 0.27212184486467605,
"learning_rate": 1.912471706328814e-05,
"loss": 0.2457,
"step": 690
},
{
"epoch": 0.44224,
"grad_norm": 0.2746235664697286,
"learning_rate": 1.9094156754835458e-05,
"loss": 0.2239,
"step": 691
},
{
"epoch": 0.44288,
"grad_norm": 0.30054140099920756,
"learning_rate": 1.9063578073312018e-05,
"loss": 0.2473,
"step": 692
},
{
"epoch": 0.44352,
"grad_norm": 0.30126883324495013,
"learning_rate": 1.9032981155943702e-05,
"loss": 0.2623,
"step": 693
},
{
"epoch": 0.44416,
"grad_norm": 0.2628697573720338,
"learning_rate": 1.9002366140038236e-05,
"loss": 0.248,
"step": 694
},
{
"epoch": 0.4448,
"grad_norm": 0.2606114154909525,
"learning_rate": 1.8971733162984552e-05,
"loss": 0.2494,
"step": 695
},
{
"epoch": 0.44544,
"grad_norm": 0.2564796649553452,
"learning_rate": 1.8941082362252198e-05,
"loss": 0.2332,
"step": 696
},
{
"epoch": 0.44608,
"grad_norm": 0.30845561489676177,
"learning_rate": 1.891041387539069e-05,
"loss": 0.2462,
"step": 697
},
{
"epoch": 0.44672,
"grad_norm": 0.27617590444390094,
"learning_rate": 1.887972784002894e-05,
"loss": 0.2197,
"step": 698
},
{
"epoch": 0.44736,
"grad_norm": 0.30470085842871797,
"learning_rate": 1.884902439387459e-05,
"loss": 0.2575,
"step": 699
},
{
"epoch": 0.448,
"grad_norm": 0.250257667186702,
"learning_rate": 1.8818303674713422e-05,
"loss": 0.2449,
"step": 700
},
{
"epoch": 0.44864,
"grad_norm": 0.2848134111806949,
"learning_rate": 1.8787565820408734e-05,
"loss": 0.2446,
"step": 701
},
{
"epoch": 0.44928,
"grad_norm": 0.2960785043913214,
"learning_rate": 1.875681096890072e-05,
"loss": 0.2354,
"step": 702
},
{
"epoch": 0.44992,
"grad_norm": 0.24645274331403022,
"learning_rate": 1.8726039258205844e-05,
"loss": 0.2587,
"step": 703
},
{
"epoch": 0.45056,
"grad_norm": 0.2887458899807905,
"learning_rate": 1.8695250826416237e-05,
"loss": 0.2198,
"step": 704
},
{
"epoch": 0.4512,
"grad_norm": 0.2781754228552816,
"learning_rate": 1.8664445811699063e-05,
"loss": 0.2476,
"step": 705
},
{
"epoch": 0.45184,
"grad_norm": 0.27395246203285356,
"learning_rate": 1.8633624352295914e-05,
"loss": 0.2252,
"step": 706
},
{
"epoch": 0.45248,
"grad_norm": 0.2725538240915513,
"learning_rate": 1.8602786586522163e-05,
"loss": 0.2324,
"step": 707
},
{
"epoch": 0.45312,
"grad_norm": 0.28094440947721694,
"learning_rate": 1.8571932652766376e-05,
"loss": 0.2325,
"step": 708
},
{
"epoch": 0.45376,
"grad_norm": 0.2923745301438137,
"learning_rate": 1.854106268948966e-05,
"loss": 0.2254,
"step": 709
},
{
"epoch": 0.4544,
"grad_norm": 0.2801508861752559,
"learning_rate": 1.8510176835225063e-05,
"loss": 0.2433,
"step": 710
},
{
"epoch": 0.45504,
"grad_norm": 0.27180662682502943,
"learning_rate": 1.847927522857696e-05,
"loss": 0.2311,
"step": 711
},
{
"epoch": 0.45568,
"grad_norm": 0.28702262741491485,
"learning_rate": 1.8448358008220393e-05,
"loss": 0.236,
"step": 712
},
{
"epoch": 0.45632,
"grad_norm": 0.2773135841625751,
"learning_rate": 1.8417425312900485e-05,
"loss": 0.2484,
"step": 713
},
{
"epoch": 0.45696,
"grad_norm": 0.2863331904071235,
"learning_rate": 1.8386477281431814e-05,
"loss": 0.2476,
"step": 714
},
{
"epoch": 0.4576,
"grad_norm": 0.2956391633466897,
"learning_rate": 1.8355514052697757e-05,
"loss": 0.2452,
"step": 715
},
{
"epoch": 0.45824,
"grad_norm": 0.2887659913133557,
"learning_rate": 1.832453576564991e-05,
"loss": 0.2448,
"step": 716
},
{
"epoch": 0.45888,
"grad_norm": 0.29739285828614076,
"learning_rate": 1.8293542559307438e-05,
"loss": 0.2334,
"step": 717
},
{
"epoch": 0.45952,
"grad_norm": 0.3094313748335245,
"learning_rate": 1.8262534572756462e-05,
"loss": 0.2586,
"step": 718
},
{
"epoch": 0.46016,
"grad_norm": 0.2793044911825085,
"learning_rate": 1.823151194514943e-05,
"loss": 0.2503,
"step": 719
},
{
"epoch": 0.4608,
"grad_norm": 0.2781664739633705,
"learning_rate": 1.820047481570449e-05,
"loss": 0.2539,
"step": 720
},
{
"epoch": 0.46144,
"grad_norm": 0.3041251930429675,
"learning_rate": 1.816942332370487e-05,
"loss": 0.249,
"step": 721
},
{
"epoch": 0.46208,
"grad_norm": 0.2997931472177727,
"learning_rate": 1.8138357608498255e-05,
"loss": 0.2467,
"step": 722
},
{
"epoch": 0.46272,
"grad_norm": 0.2795161631070679,
"learning_rate": 1.8107277809496163e-05,
"loss": 0.2387,
"step": 723
},
{
"epoch": 0.46336,
"grad_norm": 0.2796349459167825,
"learning_rate": 1.80761840661733e-05,
"loss": 0.2332,
"step": 724
},
{
"epoch": 0.464,
"grad_norm": 0.3141385412008042,
"learning_rate": 1.8045076518066966e-05,
"loss": 0.223,
"step": 725
},
{
"epoch": 0.46464,
"grad_norm": 0.3332215661467418,
"learning_rate": 1.80139553047764e-05,
"loss": 0.241,
"step": 726
},
{
"epoch": 0.46528,
"grad_norm": 0.3014159403594374,
"learning_rate": 1.798282056596217e-05,
"loss": 0.2509,
"step": 727
},
{
"epoch": 0.46592,
"grad_norm": 0.3008006533466029,
"learning_rate": 1.795167244134553e-05,
"loss": 0.223,
"step": 728
},
{
"epoch": 0.46656,
"grad_norm": 0.42627687451650975,
"learning_rate": 1.7920511070707833e-05,
"loss": 0.2536,
"step": 729
},
{
"epoch": 0.4672,
"grad_norm": 0.32225997058646555,
"learning_rate": 1.7889336593889846e-05,
"loss": 0.2792,
"step": 730
},
{
"epoch": 0.46784,
"grad_norm": 0.3214254990679866,
"learning_rate": 1.7858149150791162e-05,
"loss": 0.2621,
"step": 731
},
{
"epoch": 0.46848,
"grad_norm": 0.29296285930759747,
"learning_rate": 1.7826948881369578e-05,
"loss": 0.2557,
"step": 732
},
{
"epoch": 0.46912,
"grad_norm": 0.32495773394451755,
"learning_rate": 1.7795735925640416e-05,
"loss": 0.2695,
"step": 733
},
{
"epoch": 0.46976,
"grad_norm": 0.31407279664152,
"learning_rate": 1.7764510423675963e-05,
"loss": 0.248,
"step": 734
},
{
"epoch": 0.4704,
"grad_norm": 0.3109001212194977,
"learning_rate": 1.7733272515604783e-05,
"loss": 0.2483,
"step": 735
},
{
"epoch": 0.47104,
"grad_norm": 0.3131228629395079,
"learning_rate": 1.7702022341611137e-05,
"loss": 0.2597,
"step": 736
},
{
"epoch": 0.47168,
"grad_norm": 0.2960954497025586,
"learning_rate": 1.7670760041934322e-05,
"loss": 0.2562,
"step": 737
},
{
"epoch": 0.47232,
"grad_norm": 0.2781329801515648,
"learning_rate": 1.7639485756868043e-05,
"loss": 0.2169,
"step": 738
},
{
"epoch": 0.47296,
"grad_norm": 0.28424930879878024,
"learning_rate": 1.7608199626759796e-05,
"loss": 0.2614,
"step": 739
},
{
"epoch": 0.4736,
"grad_norm": 0.29420062292031857,
"learning_rate": 1.7576901792010236e-05,
"loss": 0.241,
"step": 740
},
{
"epoch": 0.47424,
"grad_norm": 0.30298761962774723,
"learning_rate": 1.7545592393072542e-05,
"loss": 0.2423,
"step": 741
},
{
"epoch": 0.47488,
"grad_norm": 0.3003875946079927,
"learning_rate": 1.751427157045179e-05,
"loss": 0.2364,
"step": 742
},
{
"epoch": 0.47552,
"grad_norm": 0.28771325149749016,
"learning_rate": 1.748293946470432e-05,
"loss": 0.2372,
"step": 743
},
{
"epoch": 0.47616,
"grad_norm": 0.31694239516805306,
"learning_rate": 1.745159621643711e-05,
"loss": 0.2414,
"step": 744
},
{
"epoch": 0.4768,
"grad_norm": 0.3313169785473572,
"learning_rate": 1.742024196630713e-05,
"loss": 0.2605,
"step": 745
},
{
"epoch": 0.47744,
"grad_norm": 0.32517473503518,
"learning_rate": 1.7388876855020735e-05,
"loss": 0.2618,
"step": 746
},
{
"epoch": 0.47808,
"grad_norm": 0.2854775316164675,
"learning_rate": 1.7357501023333017e-05,
"loss": 0.2487,
"step": 747
},
{
"epoch": 0.47872,
"grad_norm": 0.3480918768403915,
"learning_rate": 1.732611461204718e-05,
"loss": 0.2551,
"step": 748
},
{
"epoch": 0.47936,
"grad_norm": 0.27798931707993946,
"learning_rate": 1.7294717762013898e-05,
"loss": 0.2208,
"step": 749
},
{
"epoch": 0.48,
"grad_norm": 0.30941965572051944,
"learning_rate": 1.7263310614130696e-05,
"loss": 0.2574,
"step": 750
},
{
"epoch": 0.48064,
"grad_norm": 0.274215437510584,
"learning_rate": 1.723189330934131e-05,
"loss": 0.2537,
"step": 751
},
{
"epoch": 0.48128,
"grad_norm": 0.27338644568214365,
"learning_rate": 1.7200465988635057e-05,
"loss": 0.2276,
"step": 752
},
{
"epoch": 0.48192,
"grad_norm": 0.30295123698091525,
"learning_rate": 1.7169028793046202e-05,
"loss": 0.241,
"step": 753
},
{
"epoch": 0.48256,
"grad_norm": 0.3023883382178243,
"learning_rate": 1.713758186365332e-05,
"loss": 0.2415,
"step": 754
},
{
"epoch": 0.4832,
"grad_norm": 0.29982309786504135,
"learning_rate": 1.710612534157868e-05,
"loss": 0.2502,
"step": 755
},
{
"epoch": 0.48384,
"grad_norm": 0.29405059504029574,
"learning_rate": 1.7074659367987588e-05,
"loss": 0.2504,
"step": 756
},
{
"epoch": 0.48448,
"grad_norm": 0.29717042313602116,
"learning_rate": 1.704318408408777e-05,
"loss": 0.2221,
"step": 757
},
{
"epoch": 0.48512,
"grad_norm": 0.28950952560619164,
"learning_rate": 1.7011699631128727e-05,
"loss": 0.2601,
"step": 758
},
{
"epoch": 0.48576,
"grad_norm": 0.27616868083933593,
"learning_rate": 1.6980206150401112e-05,
"loss": 0.2435,
"step": 759
},
{
"epoch": 0.4864,
"grad_norm": 0.3192248657906988,
"learning_rate": 1.6948703783236093e-05,
"loss": 0.2577,
"step": 760
},
{
"epoch": 0.48704,
"grad_norm": 0.261596366161892,
"learning_rate": 1.6917192671004725e-05,
"loss": 0.2385,
"step": 761
},
{
"epoch": 0.48768,
"grad_norm": 0.2908784165700428,
"learning_rate": 1.688567295511729e-05,
"loss": 0.2563,
"step": 762
},
{
"epoch": 0.48832,
"grad_norm": 0.30518493124643414,
"learning_rate": 1.6854144777022685e-05,
"loss": 0.2518,
"step": 763
},
{
"epoch": 0.48896,
"grad_norm": 0.3945838451684876,
"learning_rate": 1.682260827820779e-05,
"loss": 0.2453,
"step": 764
},
{
"epoch": 0.4896,
"grad_norm": 0.2971094396719173,
"learning_rate": 1.6791063600196818e-05,
"loss": 0.2395,
"step": 765
},
{
"epoch": 0.49024,
"grad_norm": 0.29418187047199595,
"learning_rate": 1.675951088455069e-05,
"loss": 0.2339,
"step": 766
},
{
"epoch": 0.49088,
"grad_norm": 0.31939862410372316,
"learning_rate": 1.6727950272866405e-05,
"loss": 0.2579,
"step": 767
},
{
"epoch": 0.49152,
"grad_norm": 0.2910294262479968,
"learning_rate": 1.6696381906776383e-05,
"loss": 0.2638,
"step": 768
},
{
"epoch": 0.49216,
"grad_norm": 0.2848802319438362,
"learning_rate": 1.666480592794784e-05,
"loss": 0.2495,
"step": 769
},
{
"epoch": 0.4928,
"grad_norm": 0.31358274735097835,
"learning_rate": 1.663322247808218e-05,
"loss": 0.2304,
"step": 770
},
{
"epoch": 0.49344,
"grad_norm": 0.27825339734876414,
"learning_rate": 1.6601631698914303e-05,
"loss": 0.2157,
"step": 771
},
{
"epoch": 0.49408,
"grad_norm": 0.29204963150572166,
"learning_rate": 1.657003373221202e-05,
"loss": 0.2289,
"step": 772
},
{
"epoch": 0.49472,
"grad_norm": 0.31293590502176905,
"learning_rate": 1.6538428719775402e-05,
"loss": 0.2345,
"step": 773
},
{
"epoch": 0.49536,
"grad_norm": 0.3006040607999008,
"learning_rate": 1.6506816803436117e-05,
"loss": 0.2571,
"step": 774
},
{
"epoch": 0.496,
"grad_norm": 0.26799679474754523,
"learning_rate": 1.6475198125056824e-05,
"loss": 0.252,
"step": 775
},
{
"epoch": 0.49664,
"grad_norm": 0.2821301194781917,
"learning_rate": 1.6443572826530545e-05,
"loss": 0.2331,
"step": 776
},
{
"epoch": 0.49728,
"grad_norm": 0.2967133869603498,
"learning_rate": 1.6411941049779984e-05,
"loss": 0.2526,
"step": 777
},
{
"epoch": 0.49792,
"grad_norm": 0.29653102428208794,
"learning_rate": 1.6380302936756933e-05,
"loss": 0.2407,
"step": 778
},
{
"epoch": 0.49856,
"grad_norm": 0.2712138400371119,
"learning_rate": 1.6348658629441618e-05,
"loss": 0.2687,
"step": 779
},
{
"epoch": 0.4992,
"grad_norm": 0.29223153618349473,
"learning_rate": 1.6317008269842056e-05,
"loss": 0.2603,
"step": 780
},
{
"epoch": 0.49984,
"grad_norm": 0.2895681083759542,
"learning_rate": 1.628535199999343e-05,
"loss": 0.2472,
"step": 781
},
{
"epoch": 0.50048,
"grad_norm": 0.2753546178449307,
"learning_rate": 1.6253689961957442e-05,
"loss": 0.2501,
"step": 782
},
{
"epoch": 0.50112,
"grad_norm": 0.2874283254253285,
"learning_rate": 1.6222022297821685e-05,
"loss": 0.2341,
"step": 783
},
{
"epoch": 0.50176,
"grad_norm": 0.25670956021391794,
"learning_rate": 1.6190349149698993e-05,
"loss": 0.2389,
"step": 784
},
{
"epoch": 0.5024,
"grad_norm": 0.3721981372004176,
"learning_rate": 1.6158670659726816e-05,
"loss": 0.2398,
"step": 785
},
{
"epoch": 0.50304,
"grad_norm": 0.26265623036956204,
"learning_rate": 1.612698697006657e-05,
"loss": 0.2449,
"step": 786
},
{
"epoch": 0.50368,
"grad_norm": 0.32648868955386057,
"learning_rate": 1.6095298222903018e-05,
"loss": 0.2425,
"step": 787
},
{
"epoch": 0.50432,
"grad_norm": 0.27240791695651867,
"learning_rate": 1.6063604560443602e-05,
"loss": 0.2608,
"step": 788
},
{
"epoch": 0.50496,
"grad_norm": 0.2752436970583074,
"learning_rate": 1.6031906124917835e-05,
"loss": 0.2533,
"step": 789
},
{
"epoch": 0.5056,
"grad_norm": 0.3240780608668842,
"learning_rate": 1.6000203058576643e-05,
"loss": 0.2328,
"step": 790
},
{
"epoch": 0.50624,
"grad_norm": 0.3125211914154205,
"learning_rate": 1.5968495503691738e-05,
"loss": 0.2499,
"step": 791
},
{
"epoch": 0.50688,
"grad_norm": 0.2814512806616593,
"learning_rate": 1.593678360255497e-05,
"loss": 0.2518,
"step": 792
},
{
"epoch": 0.50752,
"grad_norm": 0.26953227957530657,
"learning_rate": 1.5905067497477697e-05,
"loss": 0.2549,
"step": 793
},
{
"epoch": 0.50816,
"grad_norm": 0.27737474201549767,
"learning_rate": 1.587334733079014e-05,
"loss": 0.2262,
"step": 794
},
{
"epoch": 0.5088,
"grad_norm": 0.2999832225978434,
"learning_rate": 1.5841623244840756e-05,
"loss": 0.2303,
"step": 795
},
{
"epoch": 0.50944,
"grad_norm": 0.29629202724937315,
"learning_rate": 1.5809895381995575e-05,
"loss": 0.2728,
"step": 796
},
{
"epoch": 0.51008,
"grad_norm": 0.29824141391200387,
"learning_rate": 1.5778163884637584e-05,
"loss": 0.2376,
"step": 797
},
{
"epoch": 0.51072,
"grad_norm": 0.3117999804048647,
"learning_rate": 1.5746428895166088e-05,
"loss": 0.2509,
"step": 798
},
{
"epoch": 0.51136,
"grad_norm": 0.2868360103448722,
"learning_rate": 1.5714690555996048e-05,
"loss": 0.23,
"step": 799
},
{
"epoch": 0.512,
"grad_norm": 0.3102954368583538,
"learning_rate": 1.568294900955747e-05,
"loss": 0.2451,
"step": 800
},
{
"epoch": 0.51264,
"grad_norm": 0.32979715453781355,
"learning_rate": 1.565120439829474e-05,
"loss": 0.2576,
"step": 801
},
{
"epoch": 0.51328,
"grad_norm": 0.2570931740010072,
"learning_rate": 1.561945686466602e-05,
"loss": 0.2285,
"step": 802
},
{
"epoch": 0.51392,
"grad_norm": 0.29296845098959595,
"learning_rate": 1.5587706551142566e-05,
"loss": 0.2269,
"step": 803
},
{
"epoch": 0.51456,
"grad_norm": 0.34949753344114726,
"learning_rate": 1.5555953600208115e-05,
"loss": 0.2382,
"step": 804
},
{
"epoch": 0.5152,
"grad_norm": 0.31167228945070136,
"learning_rate": 1.5524198154358237e-05,
"loss": 0.2211,
"step": 805
},
{
"epoch": 0.51584,
"grad_norm": 0.26584493436752854,
"learning_rate": 1.5492440356099705e-05,
"loss": 0.2332,
"step": 806
},
{
"epoch": 0.51648,
"grad_norm": 0.3370974950051766,
"learning_rate": 1.5460680347949847e-05,
"loss": 0.2309,
"step": 807
},
{
"epoch": 0.51712,
"grad_norm": 0.3112492758911654,
"learning_rate": 1.5428918272435903e-05,
"loss": 0.2403,
"step": 808
},
{
"epoch": 0.51776,
"grad_norm": 0.29616924145650064,
"learning_rate": 1.5397154272094396e-05,
"loss": 0.2162,
"step": 809
},
{
"epoch": 0.5184,
"grad_norm": 0.32277865645191245,
"learning_rate": 1.5365388489470488e-05,
"loss": 0.2474,
"step": 810
},
{
"epoch": 0.51904,
"grad_norm": 0.2694955547783349,
"learning_rate": 1.5333621067117328e-05,
"loss": 0.243,
"step": 811
},
{
"epoch": 0.51968,
"grad_norm": 0.2805877678663554,
"learning_rate": 1.5301852147595442e-05,
"loss": 0.2545,
"step": 812
},
{
"epoch": 0.52032,
"grad_norm": 0.3359964195281161,
"learning_rate": 1.5270081873472057e-05,
"loss": 0.257,
"step": 813
},
{
"epoch": 0.52096,
"grad_norm": 0.29024170855725706,
"learning_rate": 1.523831038732049e-05,
"loss": 0.2381,
"step": 814
},
{
"epoch": 0.5216,
"grad_norm": 0.28870485110686106,
"learning_rate": 1.5206537831719492e-05,
"loss": 0.24,
"step": 815
},
{
"epoch": 0.52224,
"grad_norm": 0.3025657209936217,
"learning_rate": 1.5174764349252618e-05,
"loss": 0.224,
"step": 816
},
{
"epoch": 0.52288,
"grad_norm": 0.3138551823765259,
"learning_rate": 1.5142990082507577e-05,
"loss": 0.2442,
"step": 817
},
{
"epoch": 0.52352,
"grad_norm": 0.33587005223397937,
"learning_rate": 1.5111215174075599e-05,
"loss": 0.2339,
"step": 818
},
{
"epoch": 0.52416,
"grad_norm": 0.2870581549015684,
"learning_rate": 1.5079439766550794e-05,
"loss": 0.2164,
"step": 819
},
{
"epoch": 0.5248,
"grad_norm": 0.30441790714779565,
"learning_rate": 1.5047664002529515e-05,
"loss": 0.2253,
"step": 820
},
{
"epoch": 0.52544,
"grad_norm": 0.26117389090769383,
"learning_rate": 1.5015888024609712e-05,
"loss": 0.2329,
"step": 821
},
{
"epoch": 0.52608,
"grad_norm": 0.32251717028933996,
"learning_rate": 1.4984111975390292e-05,
"loss": 0.2393,
"step": 822
},
{
"epoch": 0.52672,
"grad_norm": 0.2882726784004254,
"learning_rate": 1.4952335997470491e-05,
"loss": 0.2453,
"step": 823
},
{
"epoch": 0.52736,
"grad_norm": 0.26989640657141045,
"learning_rate": 1.4920560233449207e-05,
"loss": 0.2346,
"step": 824
},
{
"epoch": 0.528,
"grad_norm": 0.2751823160442032,
"learning_rate": 1.4888784825924407e-05,
"loss": 0.2569,
"step": 825
},
{
"epoch": 0.52864,
"grad_norm": 0.29490569815536594,
"learning_rate": 1.4857009917492426e-05,
"loss": 0.2264,
"step": 826
},
{
"epoch": 0.52928,
"grad_norm": 0.27688984626798974,
"learning_rate": 1.4825235650747387e-05,
"loss": 0.2345,
"step": 827
},
{
"epoch": 0.52992,
"grad_norm": 0.28219958318446914,
"learning_rate": 1.4793462168280511e-05,
"loss": 0.2414,
"step": 828
},
{
"epoch": 0.53056,
"grad_norm": 0.2956396188477475,
"learning_rate": 1.476168961267951e-05,
"loss": 0.2285,
"step": 829
},
{
"epoch": 0.5312,
"grad_norm": 0.28322337068731906,
"learning_rate": 1.4729918126527947e-05,
"loss": 0.2407,
"step": 830
},
{
"epoch": 0.53184,
"grad_norm": 0.29936618373781043,
"learning_rate": 1.4698147852404558e-05,
"loss": 0.2702,
"step": 831
},
{
"epoch": 0.53248,
"grad_norm": 0.2885632951752394,
"learning_rate": 1.4666378932882675e-05,
"loss": 0.2453,
"step": 832
},
{
"epoch": 0.53312,
"grad_norm": 0.2824069984311681,
"learning_rate": 1.4634611510529518e-05,
"loss": 0.2872,
"step": 833
},
{
"epoch": 0.53376,
"grad_norm": 0.26481375162906423,
"learning_rate": 1.4602845727905607e-05,
"loss": 0.2368,
"step": 834
},
{
"epoch": 0.5344,
"grad_norm": 0.2982641248402793,
"learning_rate": 1.45710817275641e-05,
"loss": 0.2233,
"step": 835
},
{
"epoch": 0.53504,
"grad_norm": 0.2754410887201362,
"learning_rate": 1.4539319652050161e-05,
"loss": 0.2297,
"step": 836
},
{
"epoch": 0.53568,
"grad_norm": 0.30179074980368453,
"learning_rate": 1.4507559643900297e-05,
"loss": 0.245,
"step": 837
},
{
"epoch": 0.53632,
"grad_norm": 0.28782475725189016,
"learning_rate": 1.4475801845641769e-05,
"loss": 0.223,
"step": 838
},
{
"epoch": 0.53696,
"grad_norm": 0.27921358522056533,
"learning_rate": 1.4444046399791889e-05,
"loss": 0.2486,
"step": 839
},
{
"epoch": 0.5376,
"grad_norm": 0.25429071712312123,
"learning_rate": 1.4412293448857436e-05,
"loss": 0.2556,
"step": 840
},
{
"epoch": 0.53824,
"grad_norm": 0.33436275819647254,
"learning_rate": 1.4380543135333981e-05,
"loss": 0.2227,
"step": 841
},
{
"epoch": 0.53888,
"grad_norm": 0.25980186753049656,
"learning_rate": 1.4348795601705255e-05,
"loss": 0.2284,
"step": 842
},
{
"epoch": 0.53952,
"grad_norm": 0.2931541805752396,
"learning_rate": 1.4317050990442536e-05,
"loss": 0.2299,
"step": 843
},
{
"epoch": 0.54016,
"grad_norm": 0.27197020619538004,
"learning_rate": 1.4285309444003954e-05,
"loss": 0.2364,
"step": 844
},
{
"epoch": 0.5408,
"grad_norm": 0.32369548656685315,
"learning_rate": 1.4253571104833916e-05,
"loss": 0.2404,
"step": 845
},
{
"epoch": 0.54144,
"grad_norm": 0.24237700919520694,
"learning_rate": 1.4221836115362416e-05,
"loss": 0.2313,
"step": 846
},
{
"epoch": 0.54208,
"grad_norm": 0.24761816529676725,
"learning_rate": 1.4190104618004433e-05,
"loss": 0.2303,
"step": 847
},
{
"epoch": 0.54272,
"grad_norm": 0.2784023956156151,
"learning_rate": 1.4158376755159247e-05,
"loss": 0.2314,
"step": 848
},
{
"epoch": 0.54336,
"grad_norm": 0.31803120979153876,
"learning_rate": 1.4126652669209863e-05,
"loss": 0.2303,
"step": 849
},
{
"epoch": 0.544,
"grad_norm": 0.2929767473430786,
"learning_rate": 1.4094932502522303e-05,
"loss": 0.2122,
"step": 850
},
{
"epoch": 0.54464,
"grad_norm": 0.2953841454200276,
"learning_rate": 1.4063216397445033e-05,
"loss": 0.2329,
"step": 851
},
{
"epoch": 0.54528,
"grad_norm": 0.30878745217593195,
"learning_rate": 1.4031504496308266e-05,
"loss": 0.2399,
"step": 852
},
{
"epoch": 0.54592,
"grad_norm": 0.31428954382492774,
"learning_rate": 1.3999796941423358e-05,
"loss": 0.2499,
"step": 853
},
{
"epoch": 0.54656,
"grad_norm": 0.3060385149750599,
"learning_rate": 1.3968093875082169e-05,
"loss": 0.2389,
"step": 854
},
{
"epoch": 0.5472,
"grad_norm": 0.2781567367160064,
"learning_rate": 1.3936395439556398e-05,
"loss": 0.2346,
"step": 855
},
{
"epoch": 0.54784,
"grad_norm": 0.26401029057360725,
"learning_rate": 1.3904701777096986e-05,
"loss": 0.2527,
"step": 856
},
{
"epoch": 0.54848,
"grad_norm": 0.30906251660580814,
"learning_rate": 1.3873013029933431e-05,
"loss": 0.2272,
"step": 857
},
{
"epoch": 0.54912,
"grad_norm": 0.29563053269349876,
"learning_rate": 1.3841329340273188e-05,
"loss": 0.245,
"step": 858
},
{
"epoch": 0.54976,
"grad_norm": 0.28373950112106466,
"learning_rate": 1.3809650850301011e-05,
"loss": 0.2502,
"step": 859
},
{
"epoch": 0.5504,
"grad_norm": 0.27758705472251655,
"learning_rate": 1.3777977702178322e-05,
"loss": 0.2423,
"step": 860
},
{
"epoch": 0.55104,
"grad_norm": 0.27818143971961046,
"learning_rate": 1.3746310038042559e-05,
"loss": 0.2514,
"step": 861
},
{
"epoch": 0.55168,
"grad_norm": 0.2938599930393852,
"learning_rate": 1.3714648000006575e-05,
"loss": 0.2118,
"step": 862
},
{
"epoch": 0.55232,
"grad_norm": 0.28122086138442776,
"learning_rate": 1.3682991730157945e-05,
"loss": 0.2343,
"step": 863
},
{
"epoch": 0.55296,
"grad_norm": 0.2651049751908861,
"learning_rate": 1.3651341370558384e-05,
"loss": 0.2467,
"step": 864
},
{
"epoch": 0.5536,
"grad_norm": 0.5461359931247247,
"learning_rate": 1.3619697063243069e-05,
"loss": 0.2412,
"step": 865
},
{
"epoch": 0.55424,
"grad_norm": 0.2825582797542218,
"learning_rate": 1.3588058950220015e-05,
"loss": 0.228,
"step": 866
},
{
"epoch": 0.55488,
"grad_norm": 0.2892085558287696,
"learning_rate": 1.3556427173469458e-05,
"loss": 0.2249,
"step": 867
},
{
"epoch": 0.55552,
"grad_norm": 0.33273851653903624,
"learning_rate": 1.3524801874943172e-05,
"loss": 0.2323,
"step": 868
},
{
"epoch": 0.55616,
"grad_norm": 0.2862657843529436,
"learning_rate": 1.3493183196563889e-05,
"loss": 0.2505,
"step": 869
},
{
"epoch": 0.5568,
"grad_norm": 0.2808714505427905,
"learning_rate": 1.3461571280224602e-05,
"loss": 0.2354,
"step": 870
},
{
"epoch": 0.55744,
"grad_norm": 0.2893966418475647,
"learning_rate": 1.3429966267787981e-05,
"loss": 0.2466,
"step": 871
},
{
"epoch": 0.55808,
"grad_norm": 0.28389562050876976,
"learning_rate": 1.3398368301085699e-05,
"loss": 0.2407,
"step": 872
},
{
"epoch": 0.55872,
"grad_norm": 0.3099947400233259,
"learning_rate": 1.3366777521917827e-05,
"loss": 0.2149,
"step": 873
},
{
"epoch": 0.55936,
"grad_norm": 0.29887382629492687,
"learning_rate": 1.3335194072052159e-05,
"loss": 0.2202,
"step": 874
},
{
"epoch": 0.56,
"grad_norm": 0.26967826249749416,
"learning_rate": 1.3303618093223625e-05,
"loss": 0.2391,
"step": 875
},
{
"epoch": 0.56064,
"grad_norm": 0.2465764334550036,
"learning_rate": 1.3272049727133599e-05,
"loss": 0.2203,
"step": 876
},
{
"epoch": 0.56128,
"grad_norm": 0.305343526383751,
"learning_rate": 1.3240489115449313e-05,
"loss": 0.2182,
"step": 877
},
{
"epoch": 0.56192,
"grad_norm": 0.28091661109697963,
"learning_rate": 1.3208936399803185e-05,
"loss": 0.2535,
"step": 878
},
{
"epoch": 0.56256,
"grad_norm": 0.31220903386699683,
"learning_rate": 1.3177391721792211e-05,
"loss": 0.2446,
"step": 879
},
{
"epoch": 0.5632,
"grad_norm": 0.2768392451822474,
"learning_rate": 1.314585522297732e-05,
"loss": 0.2161,
"step": 880
},
{
"epoch": 0.56384,
"grad_norm": 0.28782642159262545,
"learning_rate": 1.3114327044882714e-05,
"loss": 0.2282,
"step": 881
},
{
"epoch": 0.56448,
"grad_norm": 0.29064566337542225,
"learning_rate": 1.308280732899528e-05,
"loss": 0.2309,
"step": 882
},
{
"epoch": 0.56512,
"grad_norm": 0.2715775172810025,
"learning_rate": 1.3051296216763904e-05,
"loss": 0.2295,
"step": 883
},
{
"epoch": 0.56576,
"grad_norm": 0.29121770887438814,
"learning_rate": 1.3019793849598892e-05,
"loss": 0.2262,
"step": 884
},
{
"epoch": 0.5664,
"grad_norm": 0.297890794161894,
"learning_rate": 1.2988300368871277e-05,
"loss": 0.2561,
"step": 885
},
{
"epoch": 0.56704,
"grad_norm": 0.271128766397459,
"learning_rate": 1.2956815915912237e-05,
"loss": 0.2344,
"step": 886
},
{
"epoch": 0.56768,
"grad_norm": 0.2781072834762682,
"learning_rate": 1.2925340632012413e-05,
"loss": 0.2444,
"step": 887
},
{
"epoch": 0.56832,
"grad_norm": 0.2859546652650676,
"learning_rate": 1.2893874658421323e-05,
"loss": 0.2551,
"step": 888
},
{
"epoch": 0.56896,
"grad_norm": 0.2748465688510692,
"learning_rate": 1.2862418136346682e-05,
"loss": 0.2385,
"step": 889
},
{
"epoch": 0.5696,
"grad_norm": 0.2673167724605502,
"learning_rate": 1.2830971206953805e-05,
"loss": 0.2528,
"step": 890
},
{
"epoch": 0.57024,
"grad_norm": 0.26724844154208877,
"learning_rate": 1.2799534011364946e-05,
"loss": 0.2516,
"step": 891
},
{
"epoch": 0.57088,
"grad_norm": 0.2579234824641567,
"learning_rate": 1.2768106690658687e-05,
"loss": 0.2315,
"step": 892
},
{
"epoch": 0.57152,
"grad_norm": 0.2527831824097707,
"learning_rate": 1.2736689385869304e-05,
"loss": 0.2296,
"step": 893
},
{
"epoch": 0.57216,
"grad_norm": 0.2664685763787419,
"learning_rate": 1.2705282237986103e-05,
"loss": 0.236,
"step": 894
},
{
"epoch": 0.5728,
"grad_norm": 0.2887492093513382,
"learning_rate": 1.2673885387952824e-05,
"loss": 0.2407,
"step": 895
},
{
"epoch": 0.57344,
"grad_norm": 0.27142741542549464,
"learning_rate": 1.2642498976666982e-05,
"loss": 0.2513,
"step": 896
},
{
"epoch": 0.57408,
"grad_norm": 0.29898124747847593,
"learning_rate": 1.2611123144979269e-05,
"loss": 0.2434,
"step": 897
},
{
"epoch": 0.57472,
"grad_norm": 0.28771180626146736,
"learning_rate": 1.2579758033692873e-05,
"loss": 0.2339,
"step": 898
},
{
"epoch": 0.57536,
"grad_norm": 0.28998889567540337,
"learning_rate": 1.2548403783562896e-05,
"loss": 0.2118,
"step": 899
},
{
"epoch": 0.576,
"grad_norm": 0.2951782292199617,
"learning_rate": 1.2517060535295681e-05,
"loss": 0.2053,
"step": 900
},
{
"epoch": 0.57664,
"grad_norm": 0.3468538532182766,
"learning_rate": 1.2485728429548214e-05,
"loss": 0.2489,
"step": 901
},
{
"epoch": 0.57728,
"grad_norm": 0.2706265741912179,
"learning_rate": 1.245440760692746e-05,
"loss": 0.2301,
"step": 902
},
{
"epoch": 0.57792,
"grad_norm": 0.2798718644343022,
"learning_rate": 1.2423098207989763e-05,
"loss": 0.2214,
"step": 903
},
{
"epoch": 0.57856,
"grad_norm": 0.2976915427803377,
"learning_rate": 1.2391800373240205e-05,
"loss": 0.2403,
"step": 904
},
{
"epoch": 0.5792,
"grad_norm": 0.27049596723000147,
"learning_rate": 1.2360514243131959e-05,
"loss": 0.228,
"step": 905
},
{
"epoch": 0.57984,
"grad_norm": 0.2703696976439254,
"learning_rate": 1.2329239958065682e-05,
"loss": 0.2327,
"step": 906
},
{
"epoch": 0.58048,
"grad_norm": 0.2570600372926204,
"learning_rate": 1.2297977658388861e-05,
"loss": 0.2428,
"step": 907
},
{
"epoch": 0.58112,
"grad_norm": 0.24470082431115914,
"learning_rate": 1.226672748439522e-05,
"loss": 0.2388,
"step": 908
},
{
"epoch": 0.58176,
"grad_norm": 0.292744513244813,
"learning_rate": 1.2235489576324041e-05,
"loss": 0.2473,
"step": 909
},
{
"epoch": 0.5824,
"grad_norm": 0.2536473659907088,
"learning_rate": 1.2204264074359588e-05,
"loss": 0.2275,
"step": 910
},
{
"epoch": 0.58304,
"grad_norm": 0.263296518582845,
"learning_rate": 1.2173051118630425e-05,
"loss": 0.2175,
"step": 911
},
{
"epoch": 0.58368,
"grad_norm": 0.24857681795290157,
"learning_rate": 1.2141850849208837e-05,
"loss": 0.2322,
"step": 912
},
{
"epoch": 0.58432,
"grad_norm": 0.3260665654581631,
"learning_rate": 1.2110663406110158e-05,
"loss": 0.2434,
"step": 913
},
{
"epoch": 0.58496,
"grad_norm": 0.27474043074530696,
"learning_rate": 1.2079488929292173e-05,
"loss": 0.2377,
"step": 914
},
{
"epoch": 0.5856,
"grad_norm": 0.28245927837321005,
"learning_rate": 1.2048327558654472e-05,
"loss": 0.2517,
"step": 915
},
{
"epoch": 0.58624,
"grad_norm": 0.29411641561968144,
"learning_rate": 1.2017179434037834e-05,
"loss": 0.2581,
"step": 916
},
{
"epoch": 0.58688,
"grad_norm": 0.26848031902662794,
"learning_rate": 1.1986044695223602e-05,
"loss": 0.2244,
"step": 917
},
{
"epoch": 0.58752,
"grad_norm": 0.29005436513316146,
"learning_rate": 1.1954923481933034e-05,
"loss": 0.2437,
"step": 918
},
{
"epoch": 0.58816,
"grad_norm": 0.2895482712254696,
"learning_rate": 1.1923815933826701e-05,
"loss": 0.255,
"step": 919
},
{
"epoch": 0.5888,
"grad_norm": 0.28193500617004763,
"learning_rate": 1.1892722190503836e-05,
"loss": 0.2523,
"step": 920
},
{
"epoch": 0.58944,
"grad_norm": 0.2726047082654661,
"learning_rate": 1.1861642391501745e-05,
"loss": 0.2195,
"step": 921
},
{
"epoch": 0.59008,
"grad_norm": 0.24683767749203545,
"learning_rate": 1.1830576676295131e-05,
"loss": 0.2443,
"step": 922
},
{
"epoch": 0.59072,
"grad_norm": 0.27491734969798565,
"learning_rate": 1.1799525184295516e-05,
"loss": 0.238,
"step": 923
},
{
"epoch": 0.59136,
"grad_norm": 0.2989310502987566,
"learning_rate": 1.1768488054850575e-05,
"loss": 0.2168,
"step": 924
},
{
"epoch": 0.592,
"grad_norm": 0.2628836187369257,
"learning_rate": 1.1737465427243544e-05,
"loss": 0.233,
"step": 925
},
{
"epoch": 0.59264,
"grad_norm": 0.26752822061811826,
"learning_rate": 1.1706457440692565e-05,
"loss": 0.2263,
"step": 926
},
{
"epoch": 0.59328,
"grad_norm": 0.2786500979998503,
"learning_rate": 1.1675464234350096e-05,
"loss": 0.2104,
"step": 927
},
{
"epoch": 0.59392,
"grad_norm": 0.2616073295413085,
"learning_rate": 1.1644485947302247e-05,
"loss": 0.2094,
"step": 928
},
{
"epoch": 0.59456,
"grad_norm": 0.26094785406848103,
"learning_rate": 1.1613522718568188e-05,
"loss": 0.2277,
"step": 929
},
{
"epoch": 0.5952,
"grad_norm": 0.26956180157071696,
"learning_rate": 1.1582574687099515e-05,
"loss": 0.2421,
"step": 930
},
{
"epoch": 0.59584,
"grad_norm": 0.2534923867260137,
"learning_rate": 1.1551641991779607e-05,
"loss": 0.2095,
"step": 931
},
{
"epoch": 0.59648,
"grad_norm": 0.2613077590529168,
"learning_rate": 1.1520724771423045e-05,
"loss": 0.2442,
"step": 932
},
{
"epoch": 0.59712,
"grad_norm": 0.28565509842439857,
"learning_rate": 1.1489823164774935e-05,
"loss": 0.2191,
"step": 933
},
{
"epoch": 0.59776,
"grad_norm": 0.24145916856078634,
"learning_rate": 1.1458937310510346e-05,
"loss": 0.2404,
"step": 934
},
{
"epoch": 0.5984,
"grad_norm": 0.2675973415183905,
"learning_rate": 1.1428067347233628e-05,
"loss": 0.2161,
"step": 935
},
{
"epoch": 0.59904,
"grad_norm": 0.29913176403299363,
"learning_rate": 1.1397213413477841e-05,
"loss": 0.2412,
"step": 936
},
{
"epoch": 0.59968,
"grad_norm": 0.253172980422077,
"learning_rate": 1.136637564770409e-05,
"loss": 0.2407,
"step": 937
},
{
"epoch": 0.60032,
"grad_norm": 0.2933110102178001,
"learning_rate": 1.1335554188300941e-05,
"loss": 0.227,
"step": 938
},
{
"epoch": 0.60096,
"grad_norm": 0.2668181844305263,
"learning_rate": 1.1304749173583767e-05,
"loss": 0.2456,
"step": 939
},
{
"epoch": 0.6016,
"grad_norm": 0.30285003168122526,
"learning_rate": 1.1273960741794158e-05,
"loss": 0.2366,
"step": 940
},
{
"epoch": 0.60224,
"grad_norm": 0.2861300051951289,
"learning_rate": 1.1243189031099285e-05,
"loss": 0.2519,
"step": 941
},
{
"epoch": 0.60288,
"grad_norm": 0.2700232363591384,
"learning_rate": 1.1212434179591267e-05,
"loss": 0.2276,
"step": 942
},
{
"epoch": 0.60352,
"grad_norm": 0.28146618766930903,
"learning_rate": 1.1181696325286582e-05,
"loss": 0.2473,
"step": 943
},
{
"epoch": 0.60416,
"grad_norm": 0.2687027470794158,
"learning_rate": 1.115097560612541e-05,
"loss": 0.2516,
"step": 944
},
{
"epoch": 0.6048,
"grad_norm": 0.2675754902708525,
"learning_rate": 1.1120272159971065e-05,
"loss": 0.2241,
"step": 945
},
{
"epoch": 0.60544,
"grad_norm": 0.290279648929516,
"learning_rate": 1.1089586124609309e-05,
"loss": 0.239,
"step": 946
},
{
"epoch": 0.60608,
"grad_norm": 0.294057403686239,
"learning_rate": 1.105891763774781e-05,
"loss": 0.2318,
"step": 947
},
{
"epoch": 0.60672,
"grad_norm": 0.25256828227195943,
"learning_rate": 1.1028266837015452e-05,
"loss": 0.2284,
"step": 948
},
{
"epoch": 0.60736,
"grad_norm": 0.2758672356678394,
"learning_rate": 1.099763385996177e-05,
"loss": 0.2269,
"step": 949
},
{
"epoch": 0.608,
"grad_norm": 0.267987007000224,
"learning_rate": 1.0967018844056298e-05,
"loss": 0.2244,
"step": 950
},
{
"epoch": 0.60864,
"grad_norm": 0.2554232644912386,
"learning_rate": 1.0936421926687988e-05,
"loss": 0.2393,
"step": 951
},
{
"epoch": 0.60928,
"grad_norm": 0.2707325937678355,
"learning_rate": 1.0905843245164545e-05,
"loss": 0.2275,
"step": 952
},
{
"epoch": 0.60992,
"grad_norm": 0.27164108322281655,
"learning_rate": 1.0875282936711864e-05,
"loss": 0.2415,
"step": 953
},
{
"epoch": 0.61056,
"grad_norm": 0.26648401964107277,
"learning_rate": 1.0844741138473375e-05,
"loss": 0.2293,
"step": 954
},
{
"epoch": 0.6112,
"grad_norm": 0.2521340523387244,
"learning_rate": 1.0814217987509439e-05,
"loss": 0.2075,
"step": 955
},
{
"epoch": 0.61184,
"grad_norm": 0.2684643123842368,
"learning_rate": 1.0783713620796746e-05,
"loss": 0.2375,
"step": 956
},
{
"epoch": 0.61248,
"grad_norm": 0.27116694710494305,
"learning_rate": 1.0753228175227671e-05,
"loss": 0.2305,
"step": 957
},
{
"epoch": 0.61312,
"grad_norm": 0.2590575742273794,
"learning_rate": 1.0722761787609705e-05,
"loss": 0.2343,
"step": 958
},
{
"epoch": 0.61376,
"grad_norm": 0.26450915604012226,
"learning_rate": 1.0692314594664786e-05,
"loss": 0.231,
"step": 959
},
{
"epoch": 0.6144,
"grad_norm": 0.2772686588210157,
"learning_rate": 1.0661886733028727e-05,
"loss": 0.2195,
"step": 960
},
{
"epoch": 0.61504,
"grad_norm": 0.2526658352973938,
"learning_rate": 1.0631478339250591e-05,
"loss": 0.2493,
"step": 961
},
{
"epoch": 0.61568,
"grad_norm": 0.25957998768193064,
"learning_rate": 1.0601089549792075e-05,
"loss": 0.2225,
"step": 962
},
{
"epoch": 0.61632,
"grad_norm": 0.25769736790438313,
"learning_rate": 1.0570720501026887e-05,
"loss": 0.2136,
"step": 963
},
{
"epoch": 0.61696,
"grad_norm": 0.25528326261877526,
"learning_rate": 1.0540371329240169e-05,
"loss": 0.2372,
"step": 964
},
{
"epoch": 0.6176,
"grad_norm": 0.249689252596263,
"learning_rate": 1.0510042170627837e-05,
"loss": 0.2134,
"step": 965
},
{
"epoch": 0.61824,
"grad_norm": 0.2758362589138092,
"learning_rate": 1.0479733161296016e-05,
"loss": 0.2357,
"step": 966
},
{
"epoch": 0.61888,
"grad_norm": 0.28258470196272717,
"learning_rate": 1.0449444437260402e-05,
"loss": 0.2206,
"step": 967
},
{
"epoch": 0.61952,
"grad_norm": 0.27809718765019137,
"learning_rate": 1.0419176134445648e-05,
"loss": 0.246,
"step": 968
},
{
"epoch": 0.62016,
"grad_norm": 0.2381941918663544,
"learning_rate": 1.0388928388684781e-05,
"loss": 0.2284,
"step": 969
},
{
"epoch": 0.6208,
"grad_norm": 0.2841270455338671,
"learning_rate": 1.0358701335718555e-05,
"loss": 0.2138,
"step": 970
},
{
"epoch": 0.62144,
"grad_norm": 0.3024383331911469,
"learning_rate": 1.0328495111194891e-05,
"loss": 0.2184,
"step": 971
},
{
"epoch": 0.62208,
"grad_norm": 0.2663851401696372,
"learning_rate": 1.0298309850668208e-05,
"loss": 0.2254,
"step": 972
},
{
"epoch": 0.62272,
"grad_norm": 0.2854229308183058,
"learning_rate": 1.0268145689598871e-05,
"loss": 0.2012,
"step": 973
},
{
"epoch": 0.62336,
"grad_norm": 0.2782131277120352,
"learning_rate": 1.0238002763352538e-05,
"loss": 0.2275,
"step": 974
},
{
"epoch": 0.624,
"grad_norm": 0.2707133951459095,
"learning_rate": 1.0207881207199593e-05,
"loss": 0.2252,
"step": 975
},
{
"epoch": 0.62464,
"grad_norm": 0.26175738176839053,
"learning_rate": 1.01777811563145e-05,
"loss": 0.2355,
"step": 976
},
{
"epoch": 0.62528,
"grad_norm": 0.2850575026663318,
"learning_rate": 1.0147702745775237e-05,
"loss": 0.2531,
"step": 977
},
{
"epoch": 0.62592,
"grad_norm": 0.25643392894513545,
"learning_rate": 1.0117646110562643e-05,
"loss": 0.247,
"step": 978
},
{
"epoch": 0.62656,
"grad_norm": 0.2809794639097617,
"learning_rate": 1.0087611385559855e-05,
"loss": 0.2364,
"step": 979
},
{
"epoch": 0.6272,
"grad_norm": 0.273925223907152,
"learning_rate": 1.0057598705551683e-05,
"loss": 0.2457,
"step": 980
},
{
"epoch": 0.62784,
"grad_norm": 0.2611585139690243,
"learning_rate": 1.0027608205223997e-05,
"loss": 0.2135,
"step": 981
},
{
"epoch": 0.62848,
"grad_norm": 0.2757636790658412,
"learning_rate": 9.99764001916315e-06,
"loss": 0.234,
"step": 982
},
{
"epoch": 0.62912,
"grad_norm": 0.2621085325200055,
"learning_rate": 9.967694281855337e-06,
"loss": 0.2416,
"step": 983
},
{
"epoch": 0.62976,
"grad_norm": 0.2536738273794677,
"learning_rate": 9.937771127686025e-06,
"loss": 0.252,
"step": 984
},
{
"epoch": 0.6304,
"grad_norm": 0.274093486797772,
"learning_rate": 9.90787069093933e-06,
"loss": 0.2229,
"step": 985
},
{
"epoch": 0.63104,
"grad_norm": 0.301875864424414,
"learning_rate": 9.877993105797429e-06,
"loss": 0.2061,
"step": 986
},
{
"epoch": 0.63168,
"grad_norm": 0.26810505359168685,
"learning_rate": 9.848138506339932e-06,
"loss": 0.2208,
"step": 987
},
{
"epoch": 0.63232,
"grad_norm": 0.2673327996540591,
"learning_rate": 9.81830702654332e-06,
"loss": 0.2308,
"step": 988
},
{
"epoch": 0.63296,
"grad_norm": 0.2706612405650287,
"learning_rate": 9.788498800280297e-06,
"loss": 0.2359,
"step": 989
},
{
"epoch": 0.6336,
"grad_norm": 0.2811331772020439,
"learning_rate": 9.75871396131924e-06,
"loss": 0.2138,
"step": 990
},
{
"epoch": 0.63424,
"grad_norm": 0.27083688175583204,
"learning_rate": 9.728952643323558e-06,
"loss": 0.2266,
"step": 991
},
{
"epoch": 0.63488,
"grad_norm": 0.27787455753996565,
"learning_rate": 9.699214979851095e-06,
"loss": 0.2418,
"step": 992
},
{
"epoch": 0.63552,
"grad_norm": 0.277957464786416,
"learning_rate": 9.669501104353572e-06,
"loss": 0.2301,
"step": 993
},
{
"epoch": 0.63616,
"grad_norm": 0.2698851027191522,
"learning_rate": 9.639811150175925e-06,
"loss": 0.2403,
"step": 994
},
{
"epoch": 0.6368,
"grad_norm": 0.2685878765882483,
"learning_rate": 9.61014525055577e-06,
"loss": 0.2294,
"step": 995
},
{
"epoch": 0.63744,
"grad_norm": 0.2680664569299894,
"learning_rate": 9.580503538622753e-06,
"loss": 0.2528,
"step": 996
},
{
"epoch": 0.63808,
"grad_norm": 0.24690195463824632,
"learning_rate": 9.550886147397989e-06,
"loss": 0.2299,
"step": 997
},
{
"epoch": 0.63872,
"grad_norm": 0.2624707704490792,
"learning_rate": 9.521293209793431e-06,
"loss": 0.2316,
"step": 998
},
{
"epoch": 0.63936,
"grad_norm": 0.2928803573448213,
"learning_rate": 9.491724858611323e-06,
"loss": 0.248,
"step": 999
},
{
"epoch": 0.64,
"grad_norm": 0.28532627144669237,
"learning_rate": 9.462181226543545e-06,
"loss": 0.2337,
"step": 1000
},
{
"epoch": 0.64064,
"grad_norm": 0.2874639246394616,
"learning_rate": 9.43266244617107e-06,
"loss": 0.2198,
"step": 1001
},
{
"epoch": 0.64128,
"grad_norm": 0.3005802763115371,
"learning_rate": 9.403168649963325e-06,
"loss": 0.2425,
"step": 1002
},
{
"epoch": 0.64192,
"grad_norm": 0.27021178999951345,
"learning_rate": 9.37369997027763e-06,
"loss": 0.2357,
"step": 1003
},
{
"epoch": 0.64256,
"grad_norm": 0.295266429086591,
"learning_rate": 9.344256539358598e-06,
"loss": 0.2179,
"step": 1004
},
{
"epoch": 0.6432,
"grad_norm": 0.268243598549565,
"learning_rate": 9.314838489337512e-06,
"loss": 0.2323,
"step": 1005
},
{
"epoch": 0.64384,
"grad_norm": 0.28382119167552344,
"learning_rate": 9.285445952231777e-06,
"loss": 0.2191,
"step": 1006
},
{
"epoch": 0.64448,
"grad_norm": 0.28583856240407873,
"learning_rate": 9.256079059944295e-06,
"loss": 0.236,
"step": 1007
},
{
"epoch": 0.64512,
"grad_norm": 0.2665670998720098,
"learning_rate": 9.226737944262879e-06,
"loss": 0.2404,
"step": 1008
},
{
"epoch": 0.64576,
"grad_norm": 0.2618331652239897,
"learning_rate": 9.197422736859681e-06,
"loss": 0.2515,
"step": 1009
},
{
"epoch": 0.6464,
"grad_norm": 0.2743575127281784,
"learning_rate": 9.168133569290574e-06,
"loss": 0.2366,
"step": 1010
},
{
"epoch": 0.64704,
"grad_norm": 0.27112251220328204,
"learning_rate": 9.138870572994573e-06,
"loss": 0.2441,
"step": 1011
},
{
"epoch": 0.64768,
"grad_norm": 0.259583608714577,
"learning_rate": 9.109633879293258e-06,
"loss": 0.2345,
"step": 1012
},
{
"epoch": 0.64832,
"grad_norm": 0.26936694813868906,
"learning_rate": 9.080423619390156e-06,
"loss": 0.2245,
"step": 1013
},
{
"epoch": 0.64896,
"grad_norm": 0.23709705994485894,
"learning_rate": 9.051239924370187e-06,
"loss": 0.2147,
"step": 1014
},
{
"epoch": 0.6496,
"grad_norm": 0.320565619860577,
"learning_rate": 9.02208292519904e-06,
"loss": 0.2686,
"step": 1015
},
{
"epoch": 0.65024,
"grad_norm": 0.23594177473213718,
"learning_rate": 8.992952752722612e-06,
"loss": 0.2373,
"step": 1016
},
{
"epoch": 0.65088,
"grad_norm": 0.2610455828430524,
"learning_rate": 8.963849537666416e-06,
"loss": 0.2298,
"step": 1017
},
{
"epoch": 0.65152,
"grad_norm": 0.27883308538071555,
"learning_rate": 8.934773410634979e-06,
"loss": 0.2284,
"step": 1018
},
{
"epoch": 0.65216,
"grad_norm": 0.243708170639781,
"learning_rate": 8.90572450211128e-06,
"loss": 0.2381,
"step": 1019
},
{
"epoch": 0.6528,
"grad_norm": 0.25214049350809076,
"learning_rate": 8.876702942456135e-06,
"loss": 0.2236,
"step": 1020
},
{
"epoch": 0.65344,
"grad_norm": 0.2695393266182502,
"learning_rate": 8.847708861907647e-06,
"loss": 0.2518,
"step": 1021
},
{
"epoch": 0.65408,
"grad_norm": 0.2894111851022704,
"learning_rate": 8.818742390580579e-06,
"loss": 0.2102,
"step": 1022
},
{
"epoch": 0.65472,
"grad_norm": 0.28025286701489427,
"learning_rate": 8.789803658465822e-06,
"loss": 0.2374,
"step": 1023
},
{
"epoch": 0.65536,
"grad_norm": 0.28195313787899245,
"learning_rate": 8.760892795429758e-06,
"loss": 0.2298,
"step": 1024
},
{
"epoch": 0.656,
"grad_norm": 0.26147501573586185,
"learning_rate": 8.732009931213728e-06,
"loss": 0.2374,
"step": 1025
},
{
"epoch": 0.65664,
"grad_norm": 0.26624563474494223,
"learning_rate": 8.703155195433398e-06,
"loss": 0.2136,
"step": 1026
},
{
"epoch": 0.65728,
"grad_norm": 0.29498065678389157,
"learning_rate": 8.674328717578224e-06,
"loss": 0.2542,
"step": 1027
},
{
"epoch": 0.65792,
"grad_norm": 0.2592368682687856,
"learning_rate": 8.645530627010841e-06,
"loss": 0.2289,
"step": 1028
},
{
"epoch": 0.65856,
"grad_norm": 0.5187956052357068,
"learning_rate": 8.616761052966497e-06,
"loss": 0.2092,
"step": 1029
},
{
"epoch": 0.6592,
"grad_norm": 0.2727221527101844,
"learning_rate": 8.588020124552466e-06,
"loss": 0.2427,
"step": 1030
},
{
"epoch": 0.65984,
"grad_norm": 0.25445411990157907,
"learning_rate": 8.559307970747471e-06,
"loss": 0.2211,
"step": 1031
},
{
"epoch": 0.66048,
"grad_norm": 0.24623299916346522,
"learning_rate": 8.530624720401107e-06,
"loss": 0.2155,
"step": 1032
},
{
"epoch": 0.66112,
"grad_norm": 0.24184315685116256,
"learning_rate": 8.501970502233253e-06,
"loss": 0.2348,
"step": 1033
},
{
"epoch": 0.66176,
"grad_norm": 0.2721264275062898,
"learning_rate": 8.473345444833518e-06,
"loss": 0.2321,
"step": 1034
},
{
"epoch": 0.6624,
"grad_norm": 0.2730712748831622,
"learning_rate": 8.444749676660633e-06,
"loss": 0.2418,
"step": 1035
},
{
"epoch": 0.66304,
"grad_norm": 0.2721775229972965,
"learning_rate": 8.416183326041903e-06,
"loss": 0.2245,
"step": 1036
},
{
"epoch": 0.66368,
"grad_norm": 0.2462298760618664,
"learning_rate": 8.387646521172593e-06,
"loss": 0.2356,
"step": 1037
},
{
"epoch": 0.66432,
"grad_norm": 0.2601849010444262,
"learning_rate": 8.359139390115416e-06,
"loss": 0.2481,
"step": 1038
},
{
"epoch": 0.66496,
"grad_norm": 0.26359632890879,
"learning_rate": 8.330662060799878e-06,
"loss": 0.2459,
"step": 1039
},
{
"epoch": 0.6656,
"grad_norm": 0.2394782595448413,
"learning_rate": 8.302214661021784e-06,
"loss": 0.2119,
"step": 1040
},
{
"epoch": 0.66624,
"grad_norm": 0.26670478343946696,
"learning_rate": 8.273797318442608e-06,
"loss": 0.216,
"step": 1041
},
{
"epoch": 0.66688,
"grad_norm": 0.2918048313261061,
"learning_rate": 8.24541016058892e-06,
"loss": 0.2311,
"step": 1042
}
],
"logging_steps": 1,
"max_steps": 1562,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 521,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 477370166870016.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}