{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.33344,
"eval_steps": 500,
"global_step": 521,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00064,
"grad_norm": 2.353584373604404,
"learning_rate": 0.0,
"loss": 0.5764,
"step": 1
},
{
"epoch": 0.00128,
"grad_norm": 2.5677274470706184,
"learning_rate": 3.79746835443038e-07,
"loss": 0.6154,
"step": 2
},
{
"epoch": 0.00192,
"grad_norm": 2.434815038353528,
"learning_rate": 7.59493670886076e-07,
"loss": 0.5762,
"step": 3
},
{
"epoch": 0.00256,
"grad_norm": 2.4265789349847133,
"learning_rate": 1.139240506329114e-06,
"loss": 0.5898,
"step": 4
},
{
"epoch": 0.0032,
"grad_norm": 2.4968640172692007,
"learning_rate": 1.518987341772152e-06,
"loss": 0.5854,
"step": 5
},
{
"epoch": 0.00384,
"grad_norm": 2.6270464110646,
"learning_rate": 1.8987341772151901e-06,
"loss": 0.6063,
"step": 6
},
{
"epoch": 0.00448,
"grad_norm": 2.3240861061539664,
"learning_rate": 2.278481012658228e-06,
"loss": 0.5713,
"step": 7
},
{
"epoch": 0.00512,
"grad_norm": 2.277096457178879,
"learning_rate": 2.6582278481012658e-06,
"loss": 0.5981,
"step": 8
},
{
"epoch": 0.00576,
"grad_norm": 2.0556904877285276,
"learning_rate": 3.037974683544304e-06,
"loss": 0.5783,
"step": 9
},
{
"epoch": 0.0064,
"grad_norm": 1.8883532047436715,
"learning_rate": 3.4177215189873417e-06,
"loss": 0.5626,
"step": 10
},
{
"epoch": 0.00704,
"grad_norm": 1.4054834462317158,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.5762,
"step": 11
},
{
"epoch": 0.00768,
"grad_norm": 1.2672259034804543,
"learning_rate": 4.1772151898734175e-06,
"loss": 0.5447,
"step": 12
},
{
"epoch": 0.00832,
"grad_norm": 1.1315187874621562,
"learning_rate": 4.556962025316456e-06,
"loss": 0.5003,
"step": 13
},
{
"epoch": 0.00896,
"grad_norm": 1.2442008617105749,
"learning_rate": 4.936708860759494e-06,
"loss": 0.5235,
"step": 14
},
{
"epoch": 0.0096,
"grad_norm": 1.1708354130739331,
"learning_rate": 5.3164556962025316e-06,
"loss": 0.4659,
"step": 15
},
{
"epoch": 0.01024,
"grad_norm": 1.3068048804077694,
"learning_rate": 5.69620253164557e-06,
"loss": 0.4601,
"step": 16
},
{
"epoch": 0.01088,
"grad_norm": 1.487196594526998,
"learning_rate": 6.075949367088608e-06,
"loss": 0.487,
"step": 17
},
{
"epoch": 0.01152,
"grad_norm": 1.3702547576006188,
"learning_rate": 6.455696202531646e-06,
"loss": 0.4829,
"step": 18
},
{
"epoch": 0.01216,
"grad_norm": 0.985500593752308,
"learning_rate": 6.835443037974683e-06,
"loss": 0.4463,
"step": 19
},
{
"epoch": 0.0128,
"grad_norm": 0.9909916347896555,
"learning_rate": 7.215189873417722e-06,
"loss": 0.4554,
"step": 20
},
{
"epoch": 0.01344,
"grad_norm": 1.1838910835370198,
"learning_rate": 7.5949367088607605e-06,
"loss": 0.4622,
"step": 21
},
{
"epoch": 0.01408,
"grad_norm": 1.6876677020755413,
"learning_rate": 7.974683544303797e-06,
"loss": 0.3952,
"step": 22
},
{
"epoch": 0.01472,
"grad_norm": 1.0417104358020024,
"learning_rate": 8.354430379746835e-06,
"loss": 0.4105,
"step": 23
},
{
"epoch": 0.01536,
"grad_norm": 0.703419627437881,
"learning_rate": 8.734177215189873e-06,
"loss": 0.3797,
"step": 24
},
{
"epoch": 0.016,
"grad_norm": 0.7189072691985104,
"learning_rate": 9.113924050632912e-06,
"loss": 0.4206,
"step": 25
},
{
"epoch": 0.01664,
"grad_norm": 0.7280445280679197,
"learning_rate": 9.49367088607595e-06,
"loss": 0.3963,
"step": 26
},
{
"epoch": 0.01728,
"grad_norm": 0.5914861117650524,
"learning_rate": 9.873417721518988e-06,
"loss": 0.3928,
"step": 27
},
{
"epoch": 0.01792,
"grad_norm": 0.6824922709052414,
"learning_rate": 1.0253164556962025e-05,
"loss": 0.3909,
"step": 28
},
{
"epoch": 0.01856,
"grad_norm": 0.6157178643639211,
"learning_rate": 1.0632911392405063e-05,
"loss": 0.3778,
"step": 29
},
{
"epoch": 0.0192,
"grad_norm": 0.7040224694990468,
"learning_rate": 1.1012658227848103e-05,
"loss": 0.4034,
"step": 30
},
{
"epoch": 0.01984,
"grad_norm": 0.6042406871448358,
"learning_rate": 1.139240506329114e-05,
"loss": 0.3991,
"step": 31
},
{
"epoch": 0.02048,
"grad_norm": 0.5581052443719693,
"learning_rate": 1.1772151898734176e-05,
"loss": 0.4011,
"step": 32
},
{
"epoch": 0.02112,
"grad_norm": 0.4266761178757486,
"learning_rate": 1.2151898734177216e-05,
"loss": 0.3639,
"step": 33
},
{
"epoch": 0.02176,
"grad_norm": 0.49600505321594285,
"learning_rate": 1.2531645569620253e-05,
"loss": 0.3784,
"step": 34
},
{
"epoch": 0.0224,
"grad_norm": 0.45932753795907266,
"learning_rate": 1.2911392405063291e-05,
"loss": 0.3751,
"step": 35
},
{
"epoch": 0.02304,
"grad_norm": 0.5363867843128812,
"learning_rate": 1.3291139240506329e-05,
"loss": 0.3865,
"step": 36
},
{
"epoch": 0.02368,
"grad_norm": 0.5488419069165299,
"learning_rate": 1.3670886075949367e-05,
"loss": 0.3787,
"step": 37
},
{
"epoch": 0.02432,
"grad_norm": 0.48587043713463085,
"learning_rate": 1.4050632911392406e-05,
"loss": 0.3712,
"step": 38
},
{
"epoch": 0.02496,
"grad_norm": 0.4440990518273777,
"learning_rate": 1.4430379746835444e-05,
"loss": 0.3652,
"step": 39
},
{
"epoch": 0.0256,
"grad_norm": 0.3751498370487217,
"learning_rate": 1.4810126582278482e-05,
"loss": 0.3569,
"step": 40
},
{
"epoch": 0.02624,
"grad_norm": 0.43725343772159886,
"learning_rate": 1.5189873417721521e-05,
"loss": 0.3158,
"step": 41
},
{
"epoch": 0.02688,
"grad_norm": 0.4888026864502293,
"learning_rate": 1.5569620253164557e-05,
"loss": 0.3442,
"step": 42
},
{
"epoch": 0.02752,
"grad_norm": 0.44586123200596317,
"learning_rate": 1.5949367088607595e-05,
"loss": 0.3328,
"step": 43
},
{
"epoch": 0.02816,
"grad_norm": 0.49767798630217713,
"learning_rate": 1.6329113924050632e-05,
"loss": 0.351,
"step": 44
},
{
"epoch": 0.0288,
"grad_norm": 0.4336104979866255,
"learning_rate": 1.670886075949367e-05,
"loss": 0.3612,
"step": 45
},
{
"epoch": 0.02944,
"grad_norm": 0.4981279303284916,
"learning_rate": 1.708860759493671e-05,
"loss": 0.3543,
"step": 46
},
{
"epoch": 0.03008,
"grad_norm": 0.3991975649542343,
"learning_rate": 1.7468354430379746e-05,
"loss": 0.346,
"step": 47
},
{
"epoch": 0.03072,
"grad_norm": 0.4340035974965301,
"learning_rate": 1.7848101265822783e-05,
"loss": 0.3563,
"step": 48
},
{
"epoch": 0.03136,
"grad_norm": 0.40441725662186223,
"learning_rate": 1.8227848101265824e-05,
"loss": 0.3378,
"step": 49
},
{
"epoch": 0.032,
"grad_norm": 0.4015157764097668,
"learning_rate": 1.860759493670886e-05,
"loss": 0.3477,
"step": 50
},
{
"epoch": 0.03264,
"grad_norm": 0.3897529365297187,
"learning_rate": 1.89873417721519e-05,
"loss": 0.3341,
"step": 51
},
{
"epoch": 0.03328,
"grad_norm": 0.39864724195969276,
"learning_rate": 1.9367088607594938e-05,
"loss": 0.3322,
"step": 52
},
{
"epoch": 0.03392,
"grad_norm": 0.365402956200434,
"learning_rate": 1.9746835443037975e-05,
"loss": 0.297,
"step": 53
},
{
"epoch": 0.03456,
"grad_norm": 0.4449097185367681,
"learning_rate": 2.0126582278481013e-05,
"loss": 0.3424,
"step": 54
},
{
"epoch": 0.0352,
"grad_norm": 0.43181805843495463,
"learning_rate": 2.050632911392405e-05,
"loss": 0.3351,
"step": 55
},
{
"epoch": 0.03584,
"grad_norm": 0.392546267080926,
"learning_rate": 2.0886075949367092e-05,
"loss": 0.334,
"step": 56
},
{
"epoch": 0.03648,
"grad_norm": 0.38060689632546174,
"learning_rate": 2.1265822784810126e-05,
"loss": 0.3328,
"step": 57
},
{
"epoch": 0.03712,
"grad_norm": 0.389736897725578,
"learning_rate": 2.1645569620253164e-05,
"loss": 0.351,
"step": 58
},
{
"epoch": 0.03776,
"grad_norm": 0.42262406549661324,
"learning_rate": 2.2025316455696205e-05,
"loss": 0.3378,
"step": 59
},
{
"epoch": 0.0384,
"grad_norm": 0.43235384034762964,
"learning_rate": 2.240506329113924e-05,
"loss": 0.3308,
"step": 60
},
{
"epoch": 0.03904,
"grad_norm": 0.3945507271925106,
"learning_rate": 2.278481012658228e-05,
"loss": 0.3388,
"step": 61
},
{
"epoch": 0.03968,
"grad_norm": 0.40849904164663575,
"learning_rate": 2.3164556962025318e-05,
"loss": 0.3191,
"step": 62
},
{
"epoch": 0.04032,
"grad_norm": 0.4559537771952015,
"learning_rate": 2.3544303797468353e-05,
"loss": 0.3245,
"step": 63
},
{
"epoch": 0.04096,
"grad_norm": 0.40069809350325963,
"learning_rate": 2.3924050632911394e-05,
"loss": 0.3566,
"step": 64
},
{
"epoch": 0.0416,
"grad_norm": 0.37133926498283504,
"learning_rate": 2.430379746835443e-05,
"loss": 0.3147,
"step": 65
},
{
"epoch": 0.04224,
"grad_norm": 0.4628360444788038,
"learning_rate": 2.468354430379747e-05,
"loss": 0.3498,
"step": 66
},
{
"epoch": 0.04288,
"grad_norm": 0.38282967415212005,
"learning_rate": 2.5063291139240507e-05,
"loss": 0.3171,
"step": 67
},
{
"epoch": 0.04352,
"grad_norm": 0.4221420372731314,
"learning_rate": 2.5443037974683545e-05,
"loss": 0.3288,
"step": 68
},
{
"epoch": 0.04416,
"grad_norm": 0.4014634593404327,
"learning_rate": 2.5822784810126582e-05,
"loss": 0.2967,
"step": 69
},
{
"epoch": 0.0448,
"grad_norm": 0.4226874979052922,
"learning_rate": 2.620253164556962e-05,
"loss": 0.3311,
"step": 70
},
{
"epoch": 0.04544,
"grad_norm": 0.3691551141136815,
"learning_rate": 2.6582278481012658e-05,
"loss": 0.3154,
"step": 71
},
{
"epoch": 0.04608,
"grad_norm": 0.4930935215298506,
"learning_rate": 2.69620253164557e-05,
"loss": 0.3248,
"step": 72
},
{
"epoch": 0.04672,
"grad_norm": 0.40344283789369767,
"learning_rate": 2.7341772151898733e-05,
"loss": 0.3146,
"step": 73
},
{
"epoch": 0.04736,
"grad_norm": 0.40136750473831273,
"learning_rate": 2.7721518987341774e-05,
"loss": 0.3084,
"step": 74
},
{
"epoch": 0.048,
"grad_norm": 0.433002156207923,
"learning_rate": 2.8101265822784812e-05,
"loss": 0.2992,
"step": 75
},
{
"epoch": 0.04864,
"grad_norm": 0.45976406035512724,
"learning_rate": 2.8481012658227846e-05,
"loss": 0.3203,
"step": 76
},
{
"epoch": 0.04928,
"grad_norm": 0.46555202230017373,
"learning_rate": 2.8860759493670888e-05,
"loss": 0.326,
"step": 77
},
{
"epoch": 0.04992,
"grad_norm": 0.40061810400853354,
"learning_rate": 2.9240506329113925e-05,
"loss": 0.2902,
"step": 78
},
{
"epoch": 0.05056,
"grad_norm": 0.44228162263369347,
"learning_rate": 2.9620253164556963e-05,
"loss": 0.3235,
"step": 79
},
{
"epoch": 0.0512,
"grad_norm": 0.4587098467220033,
"learning_rate": 3e-05,
"loss": 0.3135,
"step": 80
},
{
"epoch": 0.05184,
"grad_norm": 0.42643590142876026,
"learning_rate": 2.9999966342756535e-05,
"loss": 0.3222,
"step": 81
},
{
"epoch": 0.05248,
"grad_norm": 0.4312879132975502,
"learning_rate": 2.9999865371177178e-05,
"loss": 0.3161,
"step": 82
},
{
"epoch": 0.05312,
"grad_norm": 0.44554806900527943,
"learning_rate": 2.9999697085715054e-05,
"loss": 0.3156,
"step": 83
},
{
"epoch": 0.05376,
"grad_norm": 0.4388949943183408,
"learning_rate": 2.9999461487125358e-05,
"loss": 0.3278,
"step": 84
},
{
"epoch": 0.0544,
"grad_norm": 0.4830420761548017,
"learning_rate": 2.999915857646538e-05,
"loss": 0.3242,
"step": 85
},
{
"epoch": 0.05504,
"grad_norm": 0.42290888303690005,
"learning_rate": 2.9998788355094472e-05,
"loss": 0.3051,
"step": 86
},
{
"epoch": 0.05568,
"grad_norm": 0.39931894559101605,
"learning_rate": 2.9998350824674046e-05,
"loss": 0.3174,
"step": 87
},
{
"epoch": 0.05632,
"grad_norm": 0.43885002522435895,
"learning_rate": 2.999784598716758e-05,
"loss": 0.3167,
"step": 88
},
{
"epoch": 0.05696,
"grad_norm": 0.4385666955310095,
"learning_rate": 2.9997273844840597e-05,
"loss": 0.32,
"step": 89
},
{
"epoch": 0.0576,
"grad_norm": 0.4241668561559285,
"learning_rate": 2.9996634400260665e-05,
"loss": 0.339,
"step": 90
},
{
"epoch": 0.05824,
"grad_norm": 0.49455156274720913,
"learning_rate": 2.9995927656297376e-05,
"loss": 0.3135,
"step": 91
},
{
"epoch": 0.05888,
"grad_norm": 0.3990302088629719,
"learning_rate": 2.9995153616122335e-05,
"loss": 0.3026,
"step": 92
},
{
"epoch": 0.05952,
"grad_norm": 0.38959449381416456,
"learning_rate": 2.9994312283209147e-05,
"loss": 0.2898,
"step": 93
},
{
"epoch": 0.06016,
"grad_norm": 0.42585557119259293,
"learning_rate": 2.9993403661333413e-05,
"loss": 0.3021,
"step": 94
},
{
"epoch": 0.0608,
"grad_norm": 0.37467889420763917,
"learning_rate": 2.9992427754572692e-05,
"loss": 0.273,
"step": 95
},
{
"epoch": 0.06144,
"grad_norm": 0.4466178048330342,
"learning_rate": 2.9991384567306485e-05,
"loss": 0.3116,
"step": 96
},
{
"epoch": 0.06208,
"grad_norm": 0.4133406435053229,
"learning_rate": 2.9990274104216246e-05,
"loss": 0.3066,
"step": 97
},
{
"epoch": 0.06272,
"grad_norm": 0.42703847116501487,
"learning_rate": 2.9989096370285314e-05,
"loss": 0.3029,
"step": 98
},
{
"epoch": 0.06336,
"grad_norm": 0.42408773456463106,
"learning_rate": 2.9987851370798936e-05,
"loss": 0.3302,
"step": 99
},
{
"epoch": 0.064,
"grad_norm": 0.36334101776279903,
"learning_rate": 2.998653911134421e-05,
"loss": 0.2749,
"step": 100
},
{
"epoch": 0.06464,
"grad_norm": 0.39914472077814156,
"learning_rate": 2.9985159597810067e-05,
"loss": 0.28,
"step": 101
},
{
"epoch": 0.06528,
"grad_norm": 0.4421338433902218,
"learning_rate": 2.9983712836387263e-05,
"loss": 0.3284,
"step": 102
},
{
"epoch": 0.06592,
"grad_norm": 0.4681114190701383,
"learning_rate": 2.9982198833568332e-05,
"loss": 0.3106,
"step": 103
},
{
"epoch": 0.06656,
"grad_norm": 0.43495699820223616,
"learning_rate": 2.998061759614756e-05,
"loss": 0.3034,
"step": 104
},
{
"epoch": 0.0672,
"grad_norm": 0.48523982047419434,
"learning_rate": 2.9978969131220956e-05,
"loss": 0.3199,
"step": 105
},
{
"epoch": 0.06784,
"grad_norm": 0.46970443054055944,
"learning_rate": 2.9977253446186236e-05,
"loss": 0.3145,
"step": 106
},
{
"epoch": 0.06848,
"grad_norm": 0.3762833085351204,
"learning_rate": 2.997547054874275e-05,
"loss": 0.2784,
"step": 107
},
{
"epoch": 0.06912,
"grad_norm": 0.4018479556059187,
"learning_rate": 2.9973620446891495e-05,
"loss": 0.29,
"step": 108
},
{
"epoch": 0.06976,
"grad_norm": 0.38557801552857196,
"learning_rate": 2.997170314893504e-05,
"loss": 0.2973,
"step": 109
},
{
"epoch": 0.0704,
"grad_norm": 0.42760504312755043,
"learning_rate": 2.9969718663477524e-05,
"loss": 0.3087,
"step": 110
},
{
"epoch": 0.07104,
"grad_norm": 0.4336880764497262,
"learning_rate": 2.9967666999424588e-05,
"loss": 0.3202,
"step": 111
},
{
"epoch": 0.07168,
"grad_norm": 0.4361380772072425,
"learning_rate": 2.9965548165983337e-05,
"loss": 0.2858,
"step": 112
},
{
"epoch": 0.07232,
"grad_norm": 0.36094868753422277,
"learning_rate": 2.9963362172662324e-05,
"loss": 0.2965,
"step": 113
},
{
"epoch": 0.07296,
"grad_norm": 0.5213407516270206,
"learning_rate": 2.9961109029271478e-05,
"loss": 0.2877,
"step": 114
},
{
"epoch": 0.0736,
"grad_norm": 0.39899966857979996,
"learning_rate": 2.9958788745922088e-05,
"loss": 0.2967,
"step": 115
},
{
"epoch": 0.07424,
"grad_norm": 0.4922285856357075,
"learning_rate": 2.995640133302672e-05,
"loss": 0.3135,
"step": 116
},
{
"epoch": 0.07488,
"grad_norm": 0.3840683744534582,
"learning_rate": 2.9953946801299213e-05,
"loss": 0.3021,
"step": 117
},
{
"epoch": 0.07552,
"grad_norm": 0.4183493778424685,
"learning_rate": 2.995142516175461e-05,
"loss": 0.307,
"step": 118
},
{
"epoch": 0.07616,
"grad_norm": 0.35220612158537656,
"learning_rate": 2.994883642570909e-05,
"loss": 0.2702,
"step": 119
},
{
"epoch": 0.0768,
"grad_norm": 0.3936594224655844,
"learning_rate": 2.9946180604779952e-05,
"loss": 0.2978,
"step": 120
},
{
"epoch": 0.07744,
"grad_norm": 0.4020931932900933,
"learning_rate": 2.9943457710885548e-05,
"loss": 0.3115,
"step": 121
},
{
"epoch": 0.07808,
"grad_norm": 0.413807873620709,
"learning_rate": 2.9940667756245218e-05,
"loss": 0.3084,
"step": 122
},
{
"epoch": 0.07872,
"grad_norm": 0.3861233473719098,
"learning_rate": 2.9937810753379266e-05,
"loss": 0.2844,
"step": 123
},
{
"epoch": 0.07936,
"grad_norm": 0.3386752443740303,
"learning_rate": 2.993488671510886e-05,
"loss": 0.3037,
"step": 124
},
{
"epoch": 0.08,
"grad_norm": 0.3972212865125969,
"learning_rate": 2.993189565455601e-05,
"loss": 0.3137,
"step": 125
},
{
"epoch": 0.08064,
"grad_norm": 0.38002895692953814,
"learning_rate": 2.9928837585143497e-05,
"loss": 0.2873,
"step": 126
},
{
"epoch": 0.08128,
"grad_norm": 0.44072168693518854,
"learning_rate": 2.992571252059482e-05,
"loss": 0.2782,
"step": 127
},
{
"epoch": 0.08192,
"grad_norm": 0.3479281011658149,
"learning_rate": 2.992252047493411e-05,
"loss": 0.2799,
"step": 128
},
{
"epoch": 0.08256,
"grad_norm": 0.4142266632731221,
"learning_rate": 2.9919261462486098e-05,
"loss": 0.3176,
"step": 129
},
{
"epoch": 0.0832,
"grad_norm": 0.35944919396457825,
"learning_rate": 2.991593549787604e-05,
"loss": 0.2963,
"step": 130
},
{
"epoch": 0.08384,
"grad_norm": 0.3837171188201694,
"learning_rate": 2.9912542596029635e-05,
"loss": 0.3057,
"step": 131
},
{
"epoch": 0.08448,
"grad_norm": 0.4763114305033498,
"learning_rate": 2.990908277217298e-05,
"loss": 0.3224,
"step": 132
},
{
"epoch": 0.08512,
"grad_norm": 0.3834181145690126,
"learning_rate": 2.9905556041832494e-05,
"loss": 0.2841,
"step": 133
},
{
"epoch": 0.08576,
"grad_norm": 0.40814627377992635,
"learning_rate": 2.990196242083485e-05,
"loss": 0.2871,
"step": 134
},
{
"epoch": 0.0864,
"grad_norm": 0.444873139895432,
"learning_rate": 2.98983019253069e-05,
"loss": 0.29,
"step": 135
},
{
"epoch": 0.08704,
"grad_norm": 0.4015570039172884,
"learning_rate": 2.9894574571675593e-05,
"loss": 0.29,
"step": 136
},
{
"epoch": 0.08768,
"grad_norm": 0.4192576277736737,
"learning_rate": 2.989078037666793e-05,
"loss": 0.3187,
"step": 137
},
{
"epoch": 0.08832,
"grad_norm": 0.4701185999968217,
"learning_rate": 2.988691935731086e-05,
"loss": 0.3039,
"step": 138
},
{
"epoch": 0.08896,
"grad_norm": 0.331862366916375,
"learning_rate": 2.988299153093122e-05,
"loss": 0.2814,
"step": 139
},
{
"epoch": 0.0896,
"grad_norm": 0.4503832184907915,
"learning_rate": 2.987899691515565e-05,
"loss": 0.3084,
"step": 140
},
{
"epoch": 0.09024,
"grad_norm": 0.3734705387771823,
"learning_rate": 2.987493552791052e-05,
"loss": 0.2837,
"step": 141
},
{
"epoch": 0.09088,
"grad_norm": 0.3978970762252604,
"learning_rate": 2.9870807387421843e-05,
"loss": 0.3032,
"step": 142
},
{
"epoch": 0.09152,
"grad_norm": 0.39348398869038875,
"learning_rate": 2.986661251221519e-05,
"loss": 0.3062,
"step": 143
},
{
"epoch": 0.09216,
"grad_norm": 0.4493064184937299,
"learning_rate": 2.986235092111563e-05,
"loss": 0.3071,
"step": 144
},
{
"epoch": 0.0928,
"grad_norm": 0.44946841788948644,
"learning_rate": 2.985802263324761e-05,
"loss": 0.3025,
"step": 145
},
{
"epoch": 0.09344,
"grad_norm": 0.5016226955727763,
"learning_rate": 2.9853627668034898e-05,
"loss": 0.3246,
"step": 146
},
{
"epoch": 0.09408,
"grad_norm": 0.399973023082225,
"learning_rate": 2.9849166045200476e-05,
"loss": 0.2811,
"step": 147
},
{
"epoch": 0.09472,
"grad_norm": 0.3739130134478663,
"learning_rate": 2.9844637784766478e-05,
"loss": 0.2855,
"step": 148
},
{
"epoch": 0.09536,
"grad_norm": 0.43805848638708655,
"learning_rate": 2.9840042907054068e-05,
"loss": 0.2798,
"step": 149
},
{
"epoch": 0.096,
"grad_norm": 0.4402314818129774,
"learning_rate": 2.9835381432683363e-05,
"loss": 0.2847,
"step": 150
},
{
"epoch": 0.09664,
"grad_norm": 0.4047049034331425,
"learning_rate": 2.9830653382573358e-05,
"loss": 0.2861,
"step": 151
},
{
"epoch": 0.09728,
"grad_norm": 0.43978737589580585,
"learning_rate": 2.982585877794179e-05,
"loss": 0.3054,
"step": 152
},
{
"epoch": 0.09792,
"grad_norm": 0.4730334615845102,
"learning_rate": 2.9820997640305097e-05,
"loss": 0.2817,
"step": 153
},
{
"epoch": 0.09856,
"grad_norm": 0.3740570479644525,
"learning_rate": 2.981606999147827e-05,
"loss": 0.2608,
"step": 154
},
{
"epoch": 0.0992,
"grad_norm": 0.481760965756092,
"learning_rate": 2.9811075853574788e-05,
"loss": 0.3113,
"step": 155
},
{
"epoch": 0.09984,
"grad_norm": 0.3961997084965111,
"learning_rate": 2.98060152490065e-05,
"loss": 0.3088,
"step": 156
},
{
"epoch": 0.10048,
"grad_norm": 0.428730401496227,
"learning_rate": 2.9800888200483552e-05,
"loss": 0.3129,
"step": 157
},
{
"epoch": 0.10112,
"grad_norm": 0.4178705790625643,
"learning_rate": 2.979569473101424e-05,
"loss": 0.3284,
"step": 158
},
{
"epoch": 0.10176,
"grad_norm": 0.4463567693300283,
"learning_rate": 2.9790434863904957e-05,
"loss": 0.3044,
"step": 159
},
{
"epoch": 0.1024,
"grad_norm": 0.3939775657775284,
"learning_rate": 2.9785108622760045e-05,
"loss": 0.2894,
"step": 160
},
{
"epoch": 0.10304,
"grad_norm": 0.38174144831499374,
"learning_rate": 2.9779716031481717e-05,
"loss": 0.2779,
"step": 161
},
{
"epoch": 0.10368,
"grad_norm": 0.38589717775088483,
"learning_rate": 2.9774257114269955e-05,
"loss": 0.3123,
"step": 162
},
{
"epoch": 0.10432,
"grad_norm": 0.3673496875585954,
"learning_rate": 2.9768731895622355e-05,
"loss": 0.2771,
"step": 163
},
{
"epoch": 0.10496,
"grad_norm": 0.3571608186420396,
"learning_rate": 2.9763140400334072e-05,
"loss": 0.2897,
"step": 164
},
{
"epoch": 0.1056,
"grad_norm": 0.3939651393603113,
"learning_rate": 2.975748265349769e-05,
"loss": 0.2966,
"step": 165
},
{
"epoch": 0.10624,
"grad_norm": 0.3805663760796052,
"learning_rate": 2.975175868050309e-05,
"loss": 0.3156,
"step": 166
},
{
"epoch": 0.10688,
"grad_norm": 0.3972270203399388,
"learning_rate": 2.9745968507037356e-05,
"loss": 0.2963,
"step": 167
},
{
"epoch": 0.10752,
"grad_norm": 0.39226032551563944,
"learning_rate": 2.974011215908467e-05,
"loss": 0.2933,
"step": 168
},
{
"epoch": 0.10816,
"grad_norm": 0.35875066299615677,
"learning_rate": 2.9734189662926163e-05,
"loss": 0.2758,
"step": 169
},
{
"epoch": 0.1088,
"grad_norm": 0.3709380144753209,
"learning_rate": 2.9728201045139813e-05,
"loss": 0.2927,
"step": 170
},
{
"epoch": 0.10944,
"grad_norm": 0.370201710217779,
"learning_rate": 2.972214633260035e-05,
"loss": 0.2866,
"step": 171
},
{
"epoch": 0.11008,
"grad_norm": 0.3758060771260273,
"learning_rate": 2.9716025552479093e-05,
"loss": 0.2994,
"step": 172
},
{
"epoch": 0.11072,
"grad_norm": 0.32580329325564505,
"learning_rate": 2.9709838732243844e-05,
"loss": 0.2802,
"step": 173
},
{
"epoch": 0.11136,
"grad_norm": 0.38248606018972797,
"learning_rate": 2.970358589965879e-05,
"loss": 0.2739,
"step": 174
},
{
"epoch": 0.112,
"grad_norm": 0.36493445057406637,
"learning_rate": 2.9697267082784342e-05,
"loss": 0.2874,
"step": 175
},
{
"epoch": 0.11264,
"grad_norm": 0.6384082314576618,
"learning_rate": 2.969088230997703e-05,
"loss": 0.2835,
"step": 176
},
{
"epoch": 0.11328,
"grad_norm": 0.4046365021981396,
"learning_rate": 2.9684431609889365e-05,
"loss": 0.2964,
"step": 177
},
{
"epoch": 0.11392,
"grad_norm": 0.3777231337256365,
"learning_rate": 2.9677915011469717e-05,
"loss": 0.302,
"step": 178
},
{
"epoch": 0.11456,
"grad_norm": 0.3550577304426545,
"learning_rate": 2.9671332543962183e-05,
"loss": 0.2809,
"step": 179
},
{
"epoch": 0.1152,
"grad_norm": 0.34027303690053873,
"learning_rate": 2.9664684236906466e-05,
"loss": 0.2998,
"step": 180
},
{
"epoch": 0.11584,
"grad_norm": 0.3705269161905217,
"learning_rate": 2.965797012013772e-05,
"loss": 0.3187,
"step": 181
},
{
"epoch": 0.11648,
"grad_norm": 0.36000193066901365,
"learning_rate": 2.9651190223786427e-05,
"loss": 0.2838,
"step": 182
},
{
"epoch": 0.11712,
"grad_norm": 0.32994641779773276,
"learning_rate": 2.9644344578278284e-05,
"loss": 0.2791,
"step": 183
},
{
"epoch": 0.11776,
"grad_norm": 0.35570577433544603,
"learning_rate": 2.963743321433402e-05,
"loss": 0.2773,
"step": 184
},
{
"epoch": 0.1184,
"grad_norm": 0.3596282933620285,
"learning_rate": 2.9630456162969298e-05,
"loss": 0.2797,
"step": 185
},
{
"epoch": 0.11904,
"grad_norm": 0.3262807239848876,
"learning_rate": 2.9623413455494563e-05,
"loss": 0.288,
"step": 186
},
{
"epoch": 0.11968,
"grad_norm": 0.3565865179949324,
"learning_rate": 2.9616305123514897e-05,
"loss": 0.2887,
"step": 187
},
{
"epoch": 0.12032,
"grad_norm": 0.39532927846862465,
"learning_rate": 2.9609131198929884e-05,
"loss": 0.2782,
"step": 188
},
{
"epoch": 0.12096,
"grad_norm": 0.40888574656849996,
"learning_rate": 2.9601891713933457e-05,
"loss": 0.2718,
"step": 189
},
{
"epoch": 0.1216,
"grad_norm": 0.3494105157933286,
"learning_rate": 2.9594586701013765e-05,
"loss": 0.2925,
"step": 190
},
{
"epoch": 0.12224,
"grad_norm": 0.38169153601979233,
"learning_rate": 2.958721619295302e-05,
"loss": 0.2783,
"step": 191
},
{
"epoch": 0.12288,
"grad_norm": 0.3650159986813621,
"learning_rate": 2.9579780222827354e-05,
"loss": 0.2839,
"step": 192
},
{
"epoch": 0.12352,
"grad_norm": 0.32047234144052256,
"learning_rate": 2.957227882400667e-05,
"loss": 0.2781,
"step": 193
},
{
"epoch": 0.12416,
"grad_norm": 0.39912184054626987,
"learning_rate": 2.9564712030154486e-05,
"loss": 0.2992,
"step": 194
},
{
"epoch": 0.1248,
"grad_norm": 0.3589964401769852,
"learning_rate": 2.9557079875227795e-05,
"loss": 0.2991,
"step": 195
},
{
"epoch": 0.12544,
"grad_norm": 0.3095249869798657,
"learning_rate": 2.95493823934769e-05,
"loss": 0.2886,
"step": 196
},
{
"epoch": 0.12608,
"grad_norm": 0.33996550944422993,
"learning_rate": 2.954161961944527e-05,
"loss": 0.2705,
"step": 197
},
{
"epoch": 0.12672,
"grad_norm": 0.3423071479543653,
"learning_rate": 2.953379158796938e-05,
"loss": 0.2795,
"step": 198
},
{
"epoch": 0.12736,
"grad_norm": 0.36082529918938056,
"learning_rate": 2.9525898334178566e-05,
"loss": 0.2887,
"step": 199
},
{
"epoch": 0.128,
"grad_norm": 0.37640661693173727,
"learning_rate": 2.951793989349484e-05,
"loss": 0.282,
"step": 200
},
{
"epoch": 0.12864,
"grad_norm": 0.3023070350545028,
"learning_rate": 2.950991630163277e-05,
"loss": 0.254,
"step": 201
},
{
"epoch": 0.12928,
"grad_norm": 0.3746495017193779,
"learning_rate": 2.950182759459928e-05,
"loss": 0.2913,
"step": 202
},
{
"epoch": 0.12992,
"grad_norm": 0.38483323478343573,
"learning_rate": 2.949367380869351e-05,
"loss": 0.2913,
"step": 203
},
{
"epoch": 0.13056,
"grad_norm": 0.4213879707012684,
"learning_rate": 2.9485454980506663e-05,
"loss": 0.2841,
"step": 204
},
{
"epoch": 0.1312,
"grad_norm": 0.4131757338780886,
"learning_rate": 2.9477171146921816e-05,
"loss": 0.2981,
"step": 205
},
{
"epoch": 0.13184,
"grad_norm": 0.4415959043983817,
"learning_rate": 2.946882234511377e-05,
"loss": 0.2975,
"step": 206
},
{
"epoch": 0.13248,
"grad_norm": 0.37499387107669097,
"learning_rate": 2.9460408612548876e-05,
"loss": 0.2844,
"step": 207
},
{
"epoch": 0.13312,
"grad_norm": 0.41827519988058354,
"learning_rate": 2.9451929986984875e-05,
"loss": 0.2958,
"step": 208
},
{
"epoch": 0.13376,
"grad_norm": 0.33683714204034787,
"learning_rate": 2.9443386506470725e-05,
"loss": 0.2686,
"step": 209
},
{
"epoch": 0.1344,
"grad_norm": 0.38867413035222104,
"learning_rate": 2.9434778209346427e-05,
"loss": 0.2876,
"step": 210
},
{
"epoch": 0.13504,
"grad_norm": 0.39089889654238963,
"learning_rate": 2.942610513424285e-05,
"loss": 0.2832,
"step": 211
},
{
"epoch": 0.13568,
"grad_norm": 0.3384476658736123,
"learning_rate": 2.9417367320081567e-05,
"loss": 0.2604,
"step": 212
},
{
"epoch": 0.13632,
"grad_norm": 0.38210814875894833,
"learning_rate": 2.940856480607468e-05,
"loss": 0.2986,
"step": 213
},
{
"epoch": 0.13696,
"grad_norm": 0.38075475653821916,
"learning_rate": 2.9399697631724637e-05,
"loss": 0.2761,
"step": 214
},
{
"epoch": 0.1376,
"grad_norm": 0.36451096593797544,
"learning_rate": 2.9390765836824053e-05,
"loss": 0.3053,
"step": 215
},
{
"epoch": 0.13824,
"grad_norm": 0.36903700040818893,
"learning_rate": 2.938176946145555e-05,
"loss": 0.2776,
"step": 216
},
{
"epoch": 0.13888,
"grad_norm": 0.3858847022411217,
"learning_rate": 2.9372708545991542e-05,
"loss": 0.2682,
"step": 217
},
{
"epoch": 0.13952,
"grad_norm": 0.3664388884748071,
"learning_rate": 2.936358313109409e-05,
"loss": 0.263,
"step": 218
},
{
"epoch": 0.14016,
"grad_norm": 0.3833106914044559,
"learning_rate": 2.935439325771471e-05,
"loss": 0.284,
"step": 219
},
{
"epoch": 0.1408,
"grad_norm": 0.4366332113795893,
"learning_rate": 2.9345138967094174e-05,
"loss": 0.2827,
"step": 220
},
{
"epoch": 0.14144,
"grad_norm": 0.35644258927978634,
"learning_rate": 2.9335820300762334e-05,
"loss": 0.2763,
"step": 221
},
{
"epoch": 0.14208,
"grad_norm": 0.4086383954714273,
"learning_rate": 2.9326437300537937e-05,
"loss": 0.2731,
"step": 222
},
{
"epoch": 0.14272,
"grad_norm": 0.37566830211795793,
"learning_rate": 2.9316990008528446e-05,
"loss": 0.3035,
"step": 223
},
{
"epoch": 0.14336,
"grad_norm": 0.37673883341537595,
"learning_rate": 2.9307478467129827e-05,
"loss": 0.2807,
"step": 224
},
{
"epoch": 0.144,
"grad_norm": 0.3658815413836672,
"learning_rate": 2.9297902719026392e-05,
"loss": 0.2872,
"step": 225
},
{
"epoch": 0.14464,
"grad_norm": 0.3723900174142704,
"learning_rate": 2.928826280719057e-05,
"loss": 0.2628,
"step": 226
},
{
"epoch": 0.14528,
"grad_norm": 0.3583483332892453,
"learning_rate": 2.9278558774882748e-05,
"loss": 0.2757,
"step": 227
},
{
"epoch": 0.14592,
"grad_norm": 0.36256098670512743,
"learning_rate": 2.9268790665651053e-05,
"loss": 0.2795,
"step": 228
},
{
"epoch": 0.14656,
"grad_norm": 0.4447205295517405,
"learning_rate": 2.925895852333117e-05,
"loss": 0.2917,
"step": 229
},
{
"epoch": 0.1472,
"grad_norm": 0.33653314086590325,
"learning_rate": 2.924906239204614e-05,
"loss": 0.2856,
"step": 230
},
{
"epoch": 0.14784,
"grad_norm": 0.3875121699387206,
"learning_rate": 2.9239102316206166e-05,
"loss": 0.2756,
"step": 231
},
{
"epoch": 0.14848,
"grad_norm": 0.4191113393584937,
"learning_rate": 2.9229078340508404e-05,
"loss": 0.2947,
"step": 232
},
{
"epoch": 0.14912,
"grad_norm": 0.3866679704383513,
"learning_rate": 2.9218990509936774e-05,
"loss": 0.2948,
"step": 233
},
{
"epoch": 0.14976,
"grad_norm": 0.44210860261964957,
"learning_rate": 2.9208838869761756e-05,
"loss": 0.2823,
"step": 234
},
{
"epoch": 0.1504,
"grad_norm": 0.3569394913477398,
"learning_rate": 2.9198623465540172e-05,
"loss": 0.2896,
"step": 235
},
{
"epoch": 0.15104,
"grad_norm": 0.3582653155321637,
"learning_rate": 2.9188344343115005e-05,
"loss": 0.2639,
"step": 236
},
{
"epoch": 0.15168,
"grad_norm": 0.3632299874629792,
"learning_rate": 2.9178001548615176e-05,
"loss": 0.2752,
"step": 237
},
{
"epoch": 0.15232,
"grad_norm": 0.3627690799272646,
"learning_rate": 2.9167595128455357e-05,
"loss": 0.2697,
"step": 238
},
{
"epoch": 0.15296,
"grad_norm": 0.31681761773997114,
"learning_rate": 2.915712512933572e-05,
"loss": 0.2886,
"step": 239
},
{
"epoch": 0.1536,
"grad_norm": 0.3652202557075009,
"learning_rate": 2.914659159824178e-05,
"loss": 0.291,
"step": 240
},
{
"epoch": 0.15424,
"grad_norm": 0.32355380076781154,
"learning_rate": 2.913599458244416e-05,
"loss": 0.288,
"step": 241
},
{
"epoch": 0.15488,
"grad_norm": 0.3263439505506079,
"learning_rate": 2.9125334129498368e-05,
"loss": 0.2669,
"step": 242
},
{
"epoch": 0.15552,
"grad_norm": 0.34698579986117173,
"learning_rate": 2.911461028724459e-05,
"loss": 0.2623,
"step": 243
},
{
"epoch": 0.15616,
"grad_norm": 0.3560347825174995,
"learning_rate": 2.9103823103807503e-05,
"loss": 0.2779,
"step": 244
},
{
"epoch": 0.1568,
"grad_norm": 0.3829349365942763,
"learning_rate": 2.9092972627596012e-05,
"loss": 0.285,
"step": 245
},
{
"epoch": 0.15744,
"grad_norm": 0.35634808472353574,
"learning_rate": 2.9082058907303064e-05,
"loss": 0.2787,
"step": 246
},
{
"epoch": 0.15808,
"grad_norm": 0.3973522847788536,
"learning_rate": 2.9071081991905436e-05,
"loss": 0.2686,
"step": 247
},
{
"epoch": 0.15872,
"grad_norm": 0.3503037505655835,
"learning_rate": 2.9060041930663477e-05,
"loss": 0.2601,
"step": 248
},
{
"epoch": 0.15936,
"grad_norm": 0.42641830259468066,
"learning_rate": 2.9048938773120933e-05,
"loss": 0.2571,
"step": 249
},
{
"epoch": 0.16,
"grad_norm": 0.33022262318082957,
"learning_rate": 2.903777256910469e-05,
"loss": 0.2539,
"step": 250
},
{
"epoch": 0.16064,
"grad_norm": 0.34806931009286257,
"learning_rate": 2.902654336872457e-05,
"loss": 0.2855,
"step": 251
},
{
"epoch": 0.16128,
"grad_norm": 0.3802291207724926,
"learning_rate": 2.9015251222373094e-05,
"loss": 0.2505,
"step": 252
},
{
"epoch": 0.16192,
"grad_norm": 0.4182370813509436,
"learning_rate": 2.9003896180725268e-05,
"loss": 0.2821,
"step": 253
},
{
"epoch": 0.16256,
"grad_norm": 0.45914515409569023,
"learning_rate": 2.8992478294738345e-05,
"loss": 0.2934,
"step": 254
},
{
"epoch": 0.1632,
"grad_norm": 0.3490846199737997,
"learning_rate": 2.8980997615651597e-05,
"loss": 0.2853,
"step": 255
},
{
"epoch": 0.16384,
"grad_norm": 0.3945607808525512,
"learning_rate": 2.8969454194986095e-05,
"loss": 0.2704,
"step": 256
},
{
"epoch": 0.16448,
"grad_norm": 0.4134287311468698,
"learning_rate": 2.8957848084544473e-05,
"loss": 0.2856,
"step": 257
},
{
"epoch": 0.16512,
"grad_norm": 0.388727953362913,
"learning_rate": 2.8946179336410682e-05,
"loss": 0.2707,
"step": 258
},
{
"epoch": 0.16576,
"grad_norm": 0.3707496935300037,
"learning_rate": 2.8934448002949775e-05,
"loss": 0.2736,
"step": 259
},
{
"epoch": 0.1664,
"grad_norm": 0.37509134347160566,
"learning_rate": 2.892265413680767e-05,
"loss": 0.2656,
"step": 260
},
{
"epoch": 0.16704,
"grad_norm": 0.37839278243532787,
"learning_rate": 2.8910797790910902e-05,
"loss": 0.2954,
"step": 261
},
{
"epoch": 0.16768,
"grad_norm": 0.3709934691909708,
"learning_rate": 2.889887901846639e-05,
"loss": 0.2771,
"step": 262
},
{
"epoch": 0.16832,
"grad_norm": 0.38441885239694634,
"learning_rate": 2.8886897872961203e-05,
"loss": 0.2872,
"step": 263
},
{
"epoch": 0.16896,
"grad_norm": 0.36199416231449505,
"learning_rate": 2.887485440816233e-05,
"loss": 0.2577,
"step": 264
},
{
"epoch": 0.1696,
"grad_norm": 0.35207892498047666,
"learning_rate": 2.8862748678116402e-05,
"loss": 0.2692,
"step": 265
},
{
"epoch": 0.17024,
"grad_norm": 0.3167681279515036,
"learning_rate": 2.885058073714949e-05,
"loss": 0.2517,
"step": 266
},
{
"epoch": 0.17088,
"grad_norm": 0.3836456596956298,
"learning_rate": 2.8838350639866843e-05,
"loss": 0.3038,
"step": 267
},
{
"epoch": 0.17152,
"grad_norm": 0.3604280243223731,
"learning_rate": 2.882605844115264e-05,
"loss": 0.2748,
"step": 268
},
{
"epoch": 0.17216,
"grad_norm": 0.33239517181664746,
"learning_rate": 2.8813704196169753e-05,
"loss": 0.2705,
"step": 269
},
{
"epoch": 0.1728,
"grad_norm": 0.4453367026441279,
"learning_rate": 2.8801287960359494e-05,
"loss": 0.2752,
"step": 270
},
{
"epoch": 0.17344,
"grad_norm": 0.3672385598637218,
"learning_rate": 2.8788809789441364e-05,
"loss": 0.2813,
"step": 271
},
{
"epoch": 0.17408,
"grad_norm": 0.42500271676682233,
"learning_rate": 2.8776269739412803e-05,
"loss": 0.2581,
"step": 272
},
{
"epoch": 0.17472,
"grad_norm": 0.3812194331681742,
"learning_rate": 2.8763667866548956e-05,
"loss": 0.2797,
"step": 273
},
{
"epoch": 0.17536,
"grad_norm": 0.398047671024292,
"learning_rate": 2.875100422740239e-05,
"loss": 0.2822,
"step": 274
},
{
"epoch": 0.176,
"grad_norm": 0.4382339198136674,
"learning_rate": 2.8738278878802865e-05,
"loss": 0.2951,
"step": 275
},
{
"epoch": 0.17664,
"grad_norm": 0.32914734501477555,
"learning_rate": 2.8725491877857073e-05,
"loss": 0.2505,
"step": 276
},
{
"epoch": 0.17728,
"grad_norm": 0.359221594407232,
"learning_rate": 2.8712643281948365e-05,
"loss": 0.2682,
"step": 277
},
{
"epoch": 0.17792,
"grad_norm": 0.3610201048162127,
"learning_rate": 2.8699733148736525e-05,
"loss": 0.2706,
"step": 278
},
{
"epoch": 0.17856,
"grad_norm": 0.3576575465005173,
"learning_rate": 2.868676153615748e-05,
"loss": 0.2725,
"step": 279
},
{
"epoch": 0.1792,
"grad_norm": 0.39648269977135403,
"learning_rate": 2.8673728502423067e-05,
"loss": 0.2638,
"step": 280
},
{
"epoch": 0.17984,
"grad_norm": 0.35937870544419476,
"learning_rate": 2.8660634106020747e-05,
"loss": 0.2753,
"step": 281
},
{
"epoch": 0.18048,
"grad_norm": 0.4065407489775043,
"learning_rate": 2.8647478405713355e-05,
"loss": 0.2814,
"step": 282
},
{
"epoch": 0.18112,
"grad_norm": 0.3401826548234812,
"learning_rate": 2.8634261460538845e-05,
"loss": 0.2734,
"step": 283
},
{
"epoch": 0.18176,
"grad_norm": 0.3871247999638719,
"learning_rate": 2.8620983329810004e-05,
"loss": 0.266,
"step": 284
},
{
"epoch": 0.1824,
"grad_norm": 0.40719492038320787,
"learning_rate": 2.860764407311421e-05,
"loss": 0.2727,
"step": 285
},
{
"epoch": 0.18304,
"grad_norm": 0.3399520864358679,
"learning_rate": 2.8594243750313137e-05,
"loss": 0.2689,
"step": 286
},
{
"epoch": 0.18368,
"grad_norm": 0.3953164307154916,
"learning_rate": 2.8580782421542514e-05,
"loss": 0.273,
"step": 287
},
{
"epoch": 0.18432,
"grad_norm": 0.4447124642066109,
"learning_rate": 2.8567260147211826e-05,
"loss": 0.282,
"step": 288
},
{
"epoch": 0.18496,
"grad_norm": 0.3479285885985358,
"learning_rate": 2.8553676988004083e-05,
"loss": 0.2846,
"step": 289
},
{
"epoch": 0.1856,
"grad_norm": 0.4427716710403484,
"learning_rate": 2.8540033004875506e-05,
"loss": 0.2731,
"step": 290
},
{
"epoch": 0.18624,
"grad_norm": 0.3389993255265989,
"learning_rate": 2.852632825905528e-05,
"loss": 0.2728,
"step": 291
},
{
"epoch": 0.18688,
"grad_norm": 0.36214801666992735,
"learning_rate": 2.851256281204526e-05,
"loss": 0.2963,
"step": 292
},
{
"epoch": 0.18752,
"grad_norm": 0.3225822277564517,
"learning_rate": 2.849873672561972e-05,
"loss": 0.256,
"step": 293
},
{
"epoch": 0.18816,
"grad_norm": 0.36090735754771,
"learning_rate": 2.8484850061825052e-05,
"loss": 0.274,
"step": 294
},
{
"epoch": 0.1888,
"grad_norm": 0.3619026443591473,
"learning_rate": 2.84709028829795e-05,
"loss": 0.2904,
"step": 295
},
{
"epoch": 0.18944,
"grad_norm": 0.3252751498399478,
"learning_rate": 2.8456895251672867e-05,
"loss": 0.2892,
"step": 296
},
{
"epoch": 0.19008,
"grad_norm": 0.3420708202146713,
"learning_rate": 2.8442827230766265e-05,
"loss": 0.2509,
"step": 297
},
{
"epoch": 0.19072,
"grad_norm": 0.3927376280930936,
"learning_rate": 2.8428698883391805e-05,
"loss": 0.2902,
"step": 298
},
{
"epoch": 0.19136,
"grad_norm": 0.33206871786295983,
"learning_rate": 2.8414510272952306e-05,
"loss": 0.2725,
"step": 299
},
{
"epoch": 0.192,
"grad_norm": 0.31541664358451077,
"learning_rate": 2.840026146312104e-05,
"loss": 0.2379,
"step": 300
},
{
"epoch": 0.19264,
"grad_norm": 0.3610931095304509,
"learning_rate": 2.8385952517841433e-05,
"loss": 0.2778,
"step": 301
},
{
"epoch": 0.19328,
"grad_norm": 0.32547764886975145,
"learning_rate": 2.837158350132677e-05,
"loss": 0.2786,
"step": 302
},
{
"epoch": 0.19392,
"grad_norm": 0.3428550126773416,
"learning_rate": 2.835715447805991e-05,
"loss": 0.2896,
"step": 303
},
{
"epoch": 0.19456,
"grad_norm": 0.34537169429710707,
"learning_rate": 2.8342665512793018e-05,
"loss": 0.2888,
"step": 304
},
{
"epoch": 0.1952,
"grad_norm": 0.3526668903699371,
"learning_rate": 2.8328116670547237e-05,
"loss": 0.2711,
"step": 305
},
{
"epoch": 0.19584,
"grad_norm": 0.3298129440685655,
"learning_rate": 2.8313508016612428e-05,
"loss": 0.2723,
"step": 306
},
{
"epoch": 0.19648,
"grad_norm": 0.3252403541009243,
"learning_rate": 2.8298839616546854e-05,
"loss": 0.2783,
"step": 307
},
{
"epoch": 0.19712,
"grad_norm": 0.31942049727047156,
"learning_rate": 2.8284111536176907e-05,
"loss": 0.3047,
"step": 308
},
{
"epoch": 0.19776,
"grad_norm": 0.28119673519843524,
"learning_rate": 2.8269323841596802e-05,
"loss": 0.2685,
"step": 309
},
{
"epoch": 0.1984,
"grad_norm": 0.3574212794376199,
"learning_rate": 2.825447659916827e-05,
"loss": 0.271,
"step": 310
},
{
"epoch": 0.19904,
"grad_norm": 0.30153919945084046,
"learning_rate": 2.823956987552028e-05,
"loss": 0.2529,
"step": 311
},
{
"epoch": 0.19968,
"grad_norm": 0.3494979054883366,
"learning_rate": 2.8224603737548737e-05,
"loss": 0.2886,
"step": 312
},
{
"epoch": 0.20032,
"grad_norm": 0.30740471706963013,
"learning_rate": 2.8209578252416162e-05,
"loss": 0.2876,
"step": 313
},
{
"epoch": 0.20096,
"grad_norm": 0.3279624895202358,
"learning_rate": 2.8194493487551402e-05,
"loss": 0.2946,
"step": 314
},
{
"epoch": 0.2016,
"grad_norm": 0.3430080164088275,
"learning_rate": 2.8179349510649354e-05,
"loss": 0.2891,
"step": 315
},
{
"epoch": 0.20224,
"grad_norm": 0.30496631367689814,
"learning_rate": 2.8164146389670605e-05,
"loss": 0.2842,
"step": 316
},
{
"epoch": 0.20288,
"grad_norm": 0.379409454373658,
"learning_rate": 2.8148884192841183e-05,
"loss": 0.294,
"step": 317
},
{
"epoch": 0.20352,
"grad_norm": 0.2948622980345085,
"learning_rate": 2.8133562988652218e-05,
"loss": 0.2584,
"step": 318
},
{
"epoch": 0.20416,
"grad_norm": 0.3443019691027711,
"learning_rate": 2.8118182845859636e-05,
"loss": 0.251,
"step": 319
},
{
"epoch": 0.2048,
"grad_norm": 0.36699154745018375,
"learning_rate": 2.810274383348387e-05,
"loss": 0.2952,
"step": 320
},
{
"epoch": 0.20544,
"grad_norm": 0.33967893078585726,
"learning_rate": 2.8087246020809536e-05,
"loss": 0.2796,
"step": 321
},
{
"epoch": 0.20608,
"grad_norm": 0.31170315997663467,
"learning_rate": 2.8071689477385117e-05,
"loss": 0.2721,
"step": 322
},
{
"epoch": 0.20672,
"grad_norm": 0.3242087780438831,
"learning_rate": 2.8056074273022666e-05,
"loss": 0.2774,
"step": 323
},
{
"epoch": 0.20736,
"grad_norm": 0.3555981636400892,
"learning_rate": 2.8040400477797476e-05,
"loss": 0.2797,
"step": 324
},
{
"epoch": 0.208,
"grad_norm": 0.309319481935765,
"learning_rate": 2.8024668162047783e-05,
"loss": 0.292,
"step": 325
},
{
"epoch": 0.20864,
"grad_norm": 0.38567954428634077,
"learning_rate": 2.8008877396374434e-05,
"loss": 0.2622,
"step": 326
},
{
"epoch": 0.20928,
"grad_norm": 0.3558299126479417,
"learning_rate": 2.799302825164058e-05,
"loss": 0.2741,
"step": 327
},
{
"epoch": 0.20992,
"grad_norm": 0.32961441916292467,
"learning_rate": 2.7977120798971374e-05,
"loss": 0.2669,
"step": 328
},
{
"epoch": 0.21056,
"grad_norm": 0.30783578854216875,
"learning_rate": 2.7961155109753596e-05,
"loss": 0.2645,
"step": 329
},
{
"epoch": 0.2112,
"grad_norm": 0.32966575581899893,
"learning_rate": 2.79451312556354e-05,
"loss": 0.2654,
"step": 330
},
{
"epoch": 0.21184,
"grad_norm": 0.31714659245735044,
"learning_rate": 2.7929049308525958e-05,
"loss": 0.2542,
"step": 331
},
{
"epoch": 0.21248,
"grad_norm": 0.3196236044042015,
"learning_rate": 2.7912909340595133e-05,
"loss": 0.2624,
"step": 332
},
{
"epoch": 0.21312,
"grad_norm": 0.3579856676787265,
"learning_rate": 2.7896711424273166e-05,
"loss": 0.272,
"step": 333
},
{
"epoch": 0.21376,
"grad_norm": 0.3327728411912822,
"learning_rate": 2.7880455632250358e-05,
"loss": 0.2588,
"step": 334
},
{
"epoch": 0.2144,
"grad_norm": 0.31027790499808855,
"learning_rate": 2.7864142037476727e-05,
"loss": 0.2728,
"step": 335
},
{
"epoch": 0.21504,
"grad_norm": 0.3202262124080976,
"learning_rate": 2.784777071316169e-05,
"loss": 0.2818,
"step": 336
},
{
"epoch": 0.21568,
"grad_norm": 0.29821110258387634,
"learning_rate": 2.7831341732773733e-05,
"loss": 0.2561,
"step": 337
},
{
"epoch": 0.21632,
"grad_norm": 0.33344963895852425,
"learning_rate": 2.7814855170040083e-05,
"loss": 0.2859,
"step": 338
},
{
"epoch": 0.21696,
"grad_norm": 0.3260713930863079,
"learning_rate": 2.7798311098946375e-05,
"loss": 0.2529,
"step": 339
},
{
"epoch": 0.2176,
"grad_norm": 0.3749841150106177,
"learning_rate": 2.7781709593736316e-05,
"loss": 0.2807,
"step": 340
},
{
"epoch": 0.21824,
"grad_norm": 0.34099250856014796,
"learning_rate": 2.7765050728911363e-05,
"loss": 0.2886,
"step": 341
},
{
"epoch": 0.21888,
"grad_norm": 0.33765459803209535,
"learning_rate": 2.7748334579230375e-05,
"loss": 0.2497,
"step": 342
},
{
"epoch": 0.21952,
"grad_norm": 0.2851983974846401,
"learning_rate": 2.773156121970929e-05,
"loss": 0.2401,
"step": 343
},
{
"epoch": 0.22016,
"grad_norm": 3.444718072231663,
"learning_rate": 2.7714730725620786e-05,
"loss": 0.2706,
"step": 344
},
{
"epoch": 0.2208,
"grad_norm": 0.41628559582587626,
"learning_rate": 2.7697843172493925e-05,
"loss": 0.2732,
"step": 345
},
{
"epoch": 0.22144,
"grad_norm": 0.33063598183417514,
"learning_rate": 2.7680898636113845e-05,
"loss": 0.2775,
"step": 346
},
{
"epoch": 0.22208,
"grad_norm": 0.3317549477105874,
"learning_rate": 2.7663897192521393e-05,
"loss": 0.2673,
"step": 347
},
{
"epoch": 0.22272,
"grad_norm": 0.39451049358358514,
"learning_rate": 2.764683891801281e-05,
"loss": 0.2853,
"step": 348
},
{
"epoch": 0.22336,
"grad_norm": 0.33825604933869896,
"learning_rate": 2.7629723889139345e-05,
"loss": 0.2836,
"step": 349
},
{
"epoch": 0.224,
"grad_norm": 0.3938596661138968,
"learning_rate": 2.7612552182706968e-05,
"loss": 0.2679,
"step": 350
},
{
"epoch": 0.22464,
"grad_norm": 0.3709101970004551,
"learning_rate": 2.759532387577599e-05,
"loss": 0.2819,
"step": 351
},
{
"epoch": 0.22528,
"grad_norm": 0.35417172495939264,
"learning_rate": 2.7578039045660713e-05,
"loss": 0.2639,
"step": 352
},
{
"epoch": 0.22592,
"grad_norm": 0.3327176351302671,
"learning_rate": 2.75606977699291e-05,
"loss": 0.2639,
"step": 353
},
{
"epoch": 0.22656,
"grad_norm": 0.3564326153038442,
"learning_rate": 2.754330012640243e-05,
"loss": 0.2556,
"step": 354
},
{
"epoch": 0.2272,
"grad_norm": 0.32512504523834834,
"learning_rate": 2.752584619315493e-05,
"loss": 0.2643,
"step": 355
},
{
"epoch": 0.22784,
"grad_norm": 0.3882343021011366,
"learning_rate": 2.7508336048513437e-05,
"loss": 0.2786,
"step": 356
},
{
"epoch": 0.22848,
"grad_norm": 0.3645455650133275,
"learning_rate": 2.7490769771057043e-05,
"loss": 0.2642,
"step": 357
},
{
"epoch": 0.22912,
"grad_norm": 0.36026260628073103,
"learning_rate": 2.747314743961675e-05,
"loss": 0.2639,
"step": 358
},
{
"epoch": 0.22976,
"grad_norm": 0.3515677194617887,
"learning_rate": 2.7455469133275095e-05,
"loss": 0.2814,
"step": 359
},
{
"epoch": 0.2304,
"grad_norm": 0.34391091885619035,
"learning_rate": 2.743773493136583e-05,
"loss": 0.2673,
"step": 360
},
{
"epoch": 0.23104,
"grad_norm": 0.3449879915659821,
"learning_rate": 2.7419944913473533e-05,
"loss": 0.2333,
"step": 361
},
{
"epoch": 0.23168,
"grad_norm": 0.31990363584739007,
"learning_rate": 2.7402099159433258e-05,
"loss": 0.2532,
"step": 362
},
{
"epoch": 0.23232,
"grad_norm": 0.3875818304841908,
"learning_rate": 2.73841977493302e-05,
"loss": 0.2811,
"step": 363
},
{
"epoch": 0.23296,
"grad_norm": 0.3566549850839257,
"learning_rate": 2.7366240763499302e-05,
"loss": 0.2507,
"step": 364
},
{
"epoch": 0.2336,
"grad_norm": 0.3080698897167787,
"learning_rate": 2.7348228282524916e-05,
"loss": 0.2603,
"step": 365
},
{
"epoch": 0.23424,
"grad_norm": 0.3357967412245159,
"learning_rate": 2.7330160387240443e-05,
"loss": 0.2451,
"step": 366
},
{
"epoch": 0.23488,
"grad_norm": 0.3527005868734707,
"learning_rate": 2.731203715872795e-05,
"loss": 0.275,
"step": 367
},
{
"epoch": 0.23552,
"grad_norm": 0.316985061769891,
"learning_rate": 2.729385867831783e-05,
"loss": 0.269,
"step": 368
},
{
"epoch": 0.23616,
"grad_norm": 0.33344202384537386,
"learning_rate": 2.7275625027588414e-05,
"loss": 0.2691,
"step": 369
},
{
"epoch": 0.2368,
"grad_norm": 0.34625417830177385,
"learning_rate": 2.7257336288365634e-05,
"loss": 0.2653,
"step": 370
},
{
"epoch": 0.23744,
"grad_norm": 0.3267922712231095,
"learning_rate": 2.7238992542722625e-05,
"loss": 0.2617,
"step": 371
},
{
"epoch": 0.23808,
"grad_norm": 0.330793248709517,
"learning_rate": 2.722059387297938e-05,
"loss": 0.2574,
"step": 372
},
{
"epoch": 0.23872,
"grad_norm": 0.3463369707262466,
"learning_rate": 2.720214036170236e-05,
"loss": 0.2627,
"step": 373
},
{
"epoch": 0.23936,
"grad_norm": 0.33043396116356993,
"learning_rate": 2.7183632091704143e-05,
"loss": 0.2708,
"step": 374
},
{
"epoch": 0.24,
"grad_norm": 0.3485722629331789,
"learning_rate": 2.716506914604305e-05,
"loss": 0.2612,
"step": 375
},
{
"epoch": 0.24064,
"grad_norm": 0.3412635617995565,
"learning_rate": 2.7146451608022748e-05,
"loss": 0.2607,
"step": 376
},
{
"epoch": 0.24128,
"grad_norm": 0.35685994147745786,
"learning_rate": 2.7127779561191905e-05,
"loss": 0.2577,
"step": 377
},
{
"epoch": 0.24192,
"grad_norm": 0.3039380817502753,
"learning_rate": 2.7109053089343815e-05,
"loss": 0.2611,
"step": 378
},
{
"epoch": 0.24256,
"grad_norm": 0.43279102042200523,
"learning_rate": 2.7090272276515993e-05,
"loss": 0.2756,
"step": 379
},
{
"epoch": 0.2432,
"grad_norm": 0.31054351672072755,
"learning_rate": 2.707143720698983e-05,
"loss": 0.2708,
"step": 380
},
{
"epoch": 0.24384,
"grad_norm": 0.36851176929762747,
"learning_rate": 2.7052547965290186e-05,
"loss": 0.276,
"step": 381
},
{
"epoch": 0.24448,
"grad_norm": 0.3627663453356185,
"learning_rate": 2.703360463618504e-05,
"loss": 0.2732,
"step": 382
},
{
"epoch": 0.24512,
"grad_norm": 0.3576702828639981,
"learning_rate": 2.7014607304685096e-05,
"loss": 0.2625,
"step": 383
},
{
"epoch": 0.24576,
"grad_norm": 0.3437585046911205,
"learning_rate": 2.699555605604339e-05,
"loss": 0.2717,
"step": 384
},
{
"epoch": 0.2464,
"grad_norm": 0.3563941457915153,
"learning_rate": 2.6976450975754923e-05,
"loss": 0.2599,
"step": 385
},
{
"epoch": 0.24704,
"grad_norm": 0.32499802120658194,
"learning_rate": 2.6957292149556276e-05,
"loss": 0.2677,
"step": 386
},
{
"epoch": 0.24768,
"grad_norm": 0.33344191861008443,
"learning_rate": 2.6938079663425218e-05,
"loss": 0.2882,
"step": 387
},
{
"epoch": 0.24832,
"grad_norm": 0.3517570106726394,
"learning_rate": 2.691881360358033e-05,
"loss": 0.2754,
"step": 388
},
{
"epoch": 0.24896,
"grad_norm": 0.3237247905400656,
"learning_rate": 2.6899494056480596e-05,
"loss": 0.2545,
"step": 389
},
{
"epoch": 0.2496,
"grad_norm": 0.3605247711441668,
"learning_rate": 2.6880121108825056e-05,
"loss": 0.2858,
"step": 390
},
{
"epoch": 0.25024,
"grad_norm": 0.3227764776694619,
"learning_rate": 2.6860694847552374e-05,
"loss": 0.2588,
"step": 391
},
{
"epoch": 0.25088,
"grad_norm": 0.30914534952022565,
"learning_rate": 2.6841215359840468e-05,
"loss": 0.265,
"step": 392
},
{
"epoch": 0.25152,
"grad_norm": 0.3040468595020032,
"learning_rate": 2.682168273310612e-05,
"loss": 0.2572,
"step": 393
},
{
"epoch": 0.25216,
"grad_norm": 0.3582628170978083,
"learning_rate": 2.680209705500458e-05,
"loss": 0.2692,
"step": 394
},
{
"epoch": 0.2528,
"grad_norm": 0.3288877765942096,
"learning_rate": 2.6782458413429177e-05,
"loss": 0.2556,
"step": 395
},
{
"epoch": 0.25344,
"grad_norm": 0.3483940416687742,
"learning_rate": 2.6762766896510914e-05,
"loss": 0.2509,
"step": 396
},
{
"epoch": 0.25408,
"grad_norm": 0.4148522716152593,
"learning_rate": 2.6743022592618075e-05,
"loss": 0.2855,
"step": 397
},
{
"epoch": 0.25472,
"grad_norm": 0.3153140469461542,
"learning_rate": 2.6723225590355852e-05,
"loss": 0.2422,
"step": 398
},
{
"epoch": 0.25536,
"grad_norm": 0.3414987747573693,
"learning_rate": 2.6703375978565906e-05,
"loss": 0.2566,
"step": 399
},
{
"epoch": 0.256,
"grad_norm": 0.3082723258062477,
"learning_rate": 2.6683473846326002e-05,
"loss": 0.2692,
"step": 400
},
{
"epoch": 0.25664,
"grad_norm": 0.3120921043255892,
"learning_rate": 2.666351928294959e-05,
"loss": 0.2484,
"step": 401
},
{
"epoch": 0.25728,
"grad_norm": 0.34491023806967935,
"learning_rate": 2.6643512377985423e-05,
"loss": 0.2634,
"step": 402
},
{
"epoch": 0.25792,
"grad_norm": 1.227578859205086,
"learning_rate": 2.6623453221217137e-05,
"loss": 0.2681,
"step": 403
},
{
"epoch": 0.25856,
"grad_norm": 0.36139208492186886,
"learning_rate": 2.660334190266285e-05,
"loss": 0.2781,
"step": 404
},
{
"epoch": 0.2592,
"grad_norm": 0.3146578283750008,
"learning_rate": 2.658317851257477e-05,
"loss": 0.2661,
"step": 405
},
{
"epoch": 0.25984,
"grad_norm": 0.3241167195918667,
"learning_rate": 2.6562963141438783e-05,
"loss": 0.2747,
"step": 406
},
{
"epoch": 0.26048,
"grad_norm": 0.32983966170598866,
"learning_rate": 2.6542695879974044e-05,
"loss": 0.2821,
"step": 407
},
{
"epoch": 0.26112,
"grad_norm": 0.3302761405872256,
"learning_rate": 2.652237681913257e-05,
"loss": 0.2677,
"step": 408
},
{
"epoch": 0.26176,
"grad_norm": 0.33848152131881326,
"learning_rate": 2.6502006050098842e-05,
"loss": 0.2542,
"step": 409
},
{
"epoch": 0.2624,
"grad_norm": 0.3413235227280745,
"learning_rate": 2.648158366428938e-05,
"loss": 0.2664,
"step": 410
},
{
"epoch": 0.26304,
"grad_norm": 0.32916338812223606,
"learning_rate": 2.6461109753352355e-05,
"loss": 0.2616,
"step": 411
},
{
"epoch": 0.26368,
"grad_norm": 0.3053892800899407,
"learning_rate": 2.6440584409167144e-05,
"loss": 0.2583,
"step": 412
},
{
"epoch": 0.26432,
"grad_norm": 0.3385379902087113,
"learning_rate": 2.6420007723843952e-05,
"loss": 0.256,
"step": 413
},
{
"epoch": 0.26496,
"grad_norm": 0.37571936271303413,
"learning_rate": 2.639937978972338e-05,
"loss": 0.2503,
"step": 414
},
{
"epoch": 0.2656,
"grad_norm": 0.2854100162554339,
"learning_rate": 2.6378700699376015e-05,
"loss": 0.2415,
"step": 415
},
{
"epoch": 0.26624,
"grad_norm": 0.31271921873205044,
"learning_rate": 2.6357970545602014e-05,
"loss": 0.2475,
"step": 416
},
{
"epoch": 0.26688,
"grad_norm": 0.34030180044647645,
"learning_rate": 2.6337189421430688e-05,
"loss": 0.2379,
"step": 417
},
{
"epoch": 0.26752,
"grad_norm": 0.4075617302343357,
"learning_rate": 2.631635742012009e-05,
"loss": 0.2742,
"step": 418
},
{
"epoch": 0.26816,
"grad_norm": 0.3607203044257088,
"learning_rate": 2.629547463515657e-05,
"loss": 0.2694,
"step": 419
},
{
"epoch": 0.2688,
"grad_norm": 0.3244858888191209,
"learning_rate": 2.6274541160254405e-05,
"loss": 0.2633,
"step": 420
},
{
"epoch": 0.26944,
"grad_norm": 0.3940598188912196,
"learning_rate": 2.6253557089355333e-05,
"loss": 0.2466,
"step": 421
},
{
"epoch": 0.27008,
"grad_norm": 0.3011262776053056,
"learning_rate": 2.6232522516628153e-05,
"loss": 0.2727,
"step": 422
},
{
"epoch": 0.27072,
"grad_norm": 0.3739061295982007,
"learning_rate": 2.6211437536468292e-05,
"loss": 0.2671,
"step": 423
},
{
"epoch": 0.27136,
"grad_norm": 0.32945664324477797,
"learning_rate": 2.6190302243497396e-05,
"loss": 0.2708,
"step": 424
},
{
"epoch": 0.272,
"grad_norm": 0.36534937038877574,
"learning_rate": 2.6169116732562892e-05,
"loss": 0.2298,
"step": 425
},
{
"epoch": 0.27264,
"grad_norm": 0.33670531669484066,
"learning_rate": 2.6147881098737562e-05,
"loss": 0.2681,
"step": 426
},
{
"epoch": 0.27328,
"grad_norm": 0.35965800175602797,
"learning_rate": 2.612659543731913e-05,
"loss": 0.2675,
"step": 427
},
{
"epoch": 0.27392,
"grad_norm": 0.3509912783523724,
"learning_rate": 2.610525984382982e-05,
"loss": 0.2752,
"step": 428
},
{
"epoch": 0.27456,
"grad_norm": 0.3341800947139574,
"learning_rate": 2.6083874414015934e-05,
"loss": 0.2651,
"step": 429
},
{
"epoch": 0.2752,
"grad_norm": 0.3512265177853723,
"learning_rate": 2.606243924384742e-05,
"loss": 0.2746,
"step": 430
},
{
"epoch": 0.27584,
"grad_norm": 0.3314007433299324,
"learning_rate": 2.6040954429517442e-05,
"loss": 0.2406,
"step": 431
},
{
"epoch": 0.27648,
"grad_norm": 0.31623472637208033,
"learning_rate": 2.6019420067441958e-05,
"loss": 0.2432,
"step": 432
},
{
"epoch": 0.27712,
"grad_norm": 0.33629626691329856,
"learning_rate": 2.599783625425926e-05,
"loss": 0.2642,
"step": 433
},
{
"epoch": 0.27776,
"grad_norm": 0.32932652640189397,
"learning_rate": 2.597620308682957e-05,
"loss": 0.2722,
"step": 434
},
{
"epoch": 0.2784,
"grad_norm": 0.3162553853731888,
"learning_rate": 2.5954520662234602e-05,
"loss": 0.253,
"step": 435
},
{
"epoch": 0.27904,
"grad_norm": 0.32294208690580367,
"learning_rate": 2.5932789077777103e-05,
"loss": 0.2453,
"step": 436
},
{
"epoch": 0.27968,
"grad_norm": 0.2860688977348506,
"learning_rate": 2.591100843098043e-05,
"loss": 0.2583,
"step": 437
},
{
"epoch": 0.28032,
"grad_norm": 0.3382649825825897,
"learning_rate": 2.5889178819588134e-05,
"loss": 0.243,
"step": 438
},
{
"epoch": 0.28096,
"grad_norm": 0.3426287563355019,
"learning_rate": 2.5867300341563477e-05,
"loss": 0.2832,
"step": 439
},
{
"epoch": 0.2816,
"grad_norm": 0.3616023005679438,
"learning_rate": 2.5845373095089028e-05,
"loss": 0.2586,
"step": 440
},
{
"epoch": 0.28224,
"grad_norm": 0.3680942120537055,
"learning_rate": 2.5823397178566217e-05,
"loss": 0.2449,
"step": 441
},
{
"epoch": 0.28288,
"grad_norm": 0.3266840109358284,
"learning_rate": 2.580137269061488e-05,
"loss": 0.2806,
"step": 442
},
{
"epoch": 0.28352,
"grad_norm": 0.3268151383316364,
"learning_rate": 2.5779299730072815e-05,
"loss": 0.264,
"step": 443
},
{
"epoch": 0.28416,
"grad_norm": 0.32651398276400556,
"learning_rate": 2.5757178395995358e-05,
"loss": 0.2672,
"step": 444
},
{
"epoch": 0.2848,
"grad_norm": 0.35504343797167337,
"learning_rate": 2.5735008787654933e-05,
"loss": 0.2563,
"step": 445
},
{
"epoch": 0.28544,
"grad_norm": 0.3332900931306675,
"learning_rate": 2.5712791004540592e-05,
"loss": 0.2679,
"step": 446
},
{
"epoch": 0.28608,
"grad_norm": 0.32153395285613084,
"learning_rate": 2.5690525146357575e-05,
"loss": 0.2572,
"step": 447
},
{
"epoch": 0.28672,
"grad_norm": 0.33090800287138983,
"learning_rate": 2.566821131302688e-05,
"loss": 0.2834,
"step": 448
},
{
"epoch": 0.28736,
"grad_norm": 0.336200932174992,
"learning_rate": 2.5645849604684775e-05,
"loss": 0.248,
"step": 449
},
{
"epoch": 0.288,
"grad_norm": 0.32237186462753925,
"learning_rate": 2.56234401216824e-05,
"loss": 0.2665,
"step": 450
},
{
"epoch": 0.28864,
"grad_norm": 0.3195380119597909,
"learning_rate": 2.5600982964585272e-05,
"loss": 0.2508,
"step": 451
},
{
"epoch": 0.28928,
"grad_norm": 0.3475060876563337,
"learning_rate": 2.5578478234172865e-05,
"loss": 0.2594,
"step": 452
},
{
"epoch": 0.28992,
"grad_norm": 0.627047630421656,
"learning_rate": 2.5555926031438134e-05,
"loss": 0.2485,
"step": 453
},
{
"epoch": 0.29056,
"grad_norm": 0.3005632921400129,
"learning_rate": 2.5533326457587072e-05,
"loss": 0.2363,
"step": 454
},
{
"epoch": 0.2912,
"grad_norm": 0.3907484843122131,
"learning_rate": 2.551067961403827e-05,
"loss": 0.2635,
"step": 455
},
{
"epoch": 0.29184,
"grad_norm": 0.30910241081220835,
"learning_rate": 2.5487985602422425e-05,
"loss": 0.2554,
"step": 456
},
{
"epoch": 0.29248,
"grad_norm": 0.3316657790671481,
"learning_rate": 2.546524452458193e-05,
"loss": 0.2465,
"step": 457
},
{
"epoch": 0.29312,
"grad_norm": 0.33514519152514977,
"learning_rate": 2.5442456482570383e-05,
"loss": 0.2509,
"step": 458
},
{
"epoch": 0.29376,
"grad_norm": 0.3135722919118478,
"learning_rate": 2.5419621578652137e-05,
"loss": 0.2584,
"step": 459
},
{
"epoch": 0.2944,
"grad_norm": 0.8767929120277721,
"learning_rate": 2.539673991530185e-05,
"loss": 0.2649,
"step": 460
},
{
"epoch": 0.29504,
"grad_norm": 0.32933106915081395,
"learning_rate": 2.5373811595204022e-05,
"loss": 0.2727,
"step": 461
},
{
"epoch": 0.29568,
"grad_norm": 0.4082581717544081,
"learning_rate": 2.5350836721252525e-05,
"loss": 0.2569,
"step": 462
},
{
"epoch": 0.29632,
"grad_norm": 0.3412102305943323,
"learning_rate": 2.5327815396550142e-05,
"loss": 0.2639,
"step": 463
},
{
"epoch": 0.29696,
"grad_norm": 0.31780550108168804,
"learning_rate": 2.530474772440812e-05,
"loss": 0.2418,
"step": 464
},
{
"epoch": 0.2976,
"grad_norm": 0.29529324755348607,
"learning_rate": 2.5281633808345702e-05,
"loss": 0.2471,
"step": 465
},
{
"epoch": 0.29824,
"grad_norm": 0.33728330149501135,
"learning_rate": 2.5258473752089636e-05,
"loss": 0.2801,
"step": 466
},
{
"epoch": 0.29888,
"grad_norm": 0.3267488434365671,
"learning_rate": 2.5235267659573746e-05,
"loss": 0.2414,
"step": 467
},
{
"epoch": 0.29952,
"grad_norm": 0.29150985108582134,
"learning_rate": 2.521201563493845e-05,
"loss": 0.265,
"step": 468
},
{
"epoch": 0.30016,
"grad_norm": 0.30336323784858,
"learning_rate": 2.5188717782530292e-05,
"loss": 0.2593,
"step": 469
},
{
"epoch": 0.3008,
"grad_norm": 0.29926798408682637,
"learning_rate": 2.516537420690146e-05,
"loss": 0.2604,
"step": 470
},
{
"epoch": 0.30144,
"grad_norm": 0.3065859162009593,
"learning_rate": 2.514198501280934e-05,
"loss": 0.2629,
"step": 471
},
{
"epoch": 0.30208,
"grad_norm": 0.3049740162502653,
"learning_rate": 2.5118550305216054e-05,
"loss": 0.2754,
"step": 472
},
{
"epoch": 0.30272,
"grad_norm": 0.30876372307121774,
"learning_rate": 2.5095070189287944e-05,
"loss": 0.2668,
"step": 473
},
{
"epoch": 0.30336,
"grad_norm": 0.29273269101927585,
"learning_rate": 2.5071544770395143e-05,
"loss": 0.2558,
"step": 474
},
{
"epoch": 0.304,
"grad_norm": 0.3076653994547958,
"learning_rate": 2.5047974154111092e-05,
"loss": 0.2489,
"step": 475
},
{
"epoch": 0.30464,
"grad_norm": 0.3333719011093153,
"learning_rate": 2.5024358446212046e-05,
"loss": 0.2435,
"step": 476
},
{
"epoch": 0.30528,
"grad_norm": 0.38541240875825966,
"learning_rate": 2.5000697752676622e-05,
"loss": 0.2791,
"step": 477
},
{
"epoch": 0.30592,
"grad_norm": 0.31076311872010254,
"learning_rate": 2.4976992179685317e-05,
"loss": 0.2693,
"step": 478
},
{
"epoch": 0.30656,
"grad_norm": 0.3140598892806967,
"learning_rate": 2.4953241833620034e-05,
"loss": 0.2453,
"step": 479
},
{
"epoch": 0.3072,
"grad_norm": 0.30866549699497225,
"learning_rate": 2.4929446821063596e-05,
"loss": 0.2494,
"step": 480
},
{
"epoch": 0.30784,
"grad_norm": 0.3057358078354188,
"learning_rate": 2.4905607248799265e-05,
"loss": 0.2583,
"step": 481
},
{
"epoch": 0.30848,
"grad_norm": 0.2832731794280749,
"learning_rate": 2.4881723223810295e-05,
"loss": 0.2687,
"step": 482
},
{
"epoch": 0.30912,
"grad_norm": 0.3051553661664977,
"learning_rate": 2.4857794853279396e-05,
"loss": 0.2599,
"step": 483
},
{
"epoch": 0.30976,
"grad_norm": 0.2998378381048481,
"learning_rate": 2.4833822244588312e-05,
"loss": 0.2451,
"step": 484
},
{
"epoch": 0.3104,
"grad_norm": 0.3174813787668605,
"learning_rate": 2.4809805505317296e-05,
"loss": 0.264,
"step": 485
},
{
"epoch": 0.31104,
"grad_norm": 0.28982862061791764,
"learning_rate": 2.4785744743244644e-05,
"loss": 0.2478,
"step": 486
},
{
"epoch": 0.31168,
"grad_norm": 0.3147239609211115,
"learning_rate": 2.4761640066346217e-05,
"loss": 0.2372,
"step": 487
},
{
"epoch": 0.31232,
"grad_norm": 0.2898871152482304,
"learning_rate": 2.4737491582794945e-05,
"loss": 0.2638,
"step": 488
},
{
"epoch": 0.31296,
"grad_norm": 0.33890483931828097,
"learning_rate": 2.4713299400960342e-05,
"loss": 0.2444,
"step": 489
},
{
"epoch": 0.3136,
"grad_norm": 0.28327565279880107,
"learning_rate": 2.4689063629408034e-05,
"loss": 0.272,
"step": 490
},
{
"epoch": 0.31424,
"grad_norm": 0.3212172431491676,
"learning_rate": 2.4664784376899257e-05,
"loss": 0.2545,
"step": 491
},
{
"epoch": 0.31488,
"grad_norm": 0.3367903918403918,
"learning_rate": 2.4640461752390367e-05,
"loss": 0.2632,
"step": 492
},
{
"epoch": 0.31552,
"grad_norm": 0.3230413061531243,
"learning_rate": 2.4616095865032366e-05,
"loss": 0.2825,
"step": 493
},
{
"epoch": 0.31616,
"grad_norm": 0.34034207513600895,
"learning_rate": 2.459168682417041e-05,
"loss": 0.271,
"step": 494
},
{
"epoch": 0.3168,
"grad_norm": 0.32141809222820283,
"learning_rate": 2.4567234739343283e-05,
"loss": 0.2396,
"step": 495
},
{
"epoch": 0.31744,
"grad_norm": 0.2753406394259948,
"learning_rate": 2.454273972028297e-05,
"loss": 0.2404,
"step": 496
},
{
"epoch": 0.31808,
"grad_norm": 0.3351114179983899,
"learning_rate": 2.451820187691411e-05,
"loss": 0.2763,
"step": 497
},
{
"epoch": 0.31872,
"grad_norm": 0.3513219675385064,
"learning_rate": 2.4493621319353525e-05,
"loss": 0.2609,
"step": 498
},
{
"epoch": 0.31936,
"grad_norm": 0.3110180234096309,
"learning_rate": 2.4468998157909723e-05,
"loss": 0.2402,
"step": 499
},
{
"epoch": 0.32,
"grad_norm": 0.2964890570904897,
"learning_rate": 2.444433250308241e-05,
"loss": 0.2659,
"step": 500
},
{
"epoch": 0.32064,
"grad_norm": 0.3579764939490037,
"learning_rate": 2.4419624465561964e-05,
"loss": 0.2514,
"step": 501
},
{
"epoch": 0.32128,
"grad_norm": 0.3144597113409246,
"learning_rate": 2.4394874156228988e-05,
"loss": 0.2608,
"step": 502
},
{
"epoch": 0.32192,
"grad_norm": 0.31609390625354067,
"learning_rate": 2.4370081686153767e-05,
"loss": 0.2443,
"step": 503
},
{
"epoch": 0.32256,
"grad_norm": 0.29746982935843747,
"learning_rate": 2.4345247166595803e-05,
"loss": 0.2377,
"step": 504
},
{
"epoch": 0.3232,
"grad_norm": 0.34126278199611504,
"learning_rate": 2.4320370709003284e-05,
"loss": 0.2554,
"step": 505
},
{
"epoch": 0.32384,
"grad_norm": 0.2990237684037871,
"learning_rate": 2.429545242501261e-05,
"loss": 0.2848,
"step": 506
},
{
"epoch": 0.32448,
"grad_norm": 0.32322206092320094,
"learning_rate": 2.4270492426447884e-05,
"loss": 0.2561,
"step": 507
},
{
"epoch": 0.32512,
"grad_norm": 0.3260943452277802,
"learning_rate": 2.424549082532041e-05,
"loss": 0.26,
"step": 508
},
{
"epoch": 0.32576,
"grad_norm": 0.4295758739365506,
"learning_rate": 2.422044773382817e-05,
"loss": 0.2513,
"step": 509
},
{
"epoch": 0.3264,
"grad_norm": 0.31835152340497586,
"learning_rate": 2.4195363264355365e-05,
"loss": 0.2505,
"step": 510
},
{
"epoch": 0.32704,
"grad_norm": 0.3531418180586244,
"learning_rate": 2.417023752947188e-05,
"loss": 0.2609,
"step": 511
},
{
"epoch": 0.32768,
"grad_norm": 0.28968278915332163,
"learning_rate": 2.4145070641932767e-05,
"loss": 0.2472,
"step": 512
},
{
"epoch": 0.32832,
"grad_norm": 0.3176579318606065,
"learning_rate": 2.4119862714677773e-05,
"loss": 0.2563,
"step": 513
},
{
"epoch": 0.32896,
"grad_norm": 0.3447755475779212,
"learning_rate": 2.4094613860830813e-05,
"loss": 0.299,
"step": 514
},
{
"epoch": 0.3296,
"grad_norm": 0.30246755944222914,
"learning_rate": 2.4069324193699453e-05,
"loss": 0.2285,
"step": 515
},
{
"epoch": 0.33024,
"grad_norm": 0.36003409699329864,
"learning_rate": 2.4043993826774433e-05,
"loss": 0.2473,
"step": 516
},
{
"epoch": 0.33088,
"grad_norm": 0.35159179010717023,
"learning_rate": 2.4018622873729136e-05,
"loss": 0.2654,
"step": 517
},
{
"epoch": 0.33152,
"grad_norm": 0.30073111767960253,
"learning_rate": 2.3993211448419055e-05,
"loss": 0.249,
"step": 518
},
{
"epoch": 0.33216,
"grad_norm": 0.3319898092071986,
"learning_rate": 2.3967759664881347e-05,
"loss": 0.2737,
"step": 519
},
{
"epoch": 0.3328,
"grad_norm": 0.4467760115064727,
"learning_rate": 2.394226763733425e-05,
"loss": 0.2585,
"step": 520
},
{
"epoch": 0.33344,
"grad_norm": 0.3041939438386619,
"learning_rate": 2.3916735480176618e-05,
"loss": 0.2266,
"step": 521
}
],
"logging_steps": 1,
"max_steps": 1562,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 521,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 237774868512768.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}