{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99328,
"eval_steps": 400,
"global_step": 1170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00512,
"grad_norm": 0.033956460654735565,
"learning_rate": 0.001998289136013687,
"loss": 1.5175,
"step": 2
},
{
"epoch": 0.01024,
"grad_norm": 0.11115246266126633,
"learning_rate": 0.0019948674080410606,
"loss": 1.5456,
"step": 4
},
{
"epoch": 0.01536,
"grad_norm": 0.31134533882141113,
"learning_rate": 0.001991445680068435,
"loss": 1.4942,
"step": 6
},
{
"epoch": 0.02048,
"grad_norm": 0.314473956823349,
"learning_rate": 0.0019880239520958082,
"loss": 1.4622,
"step": 8
},
{
"epoch": 0.0256,
"grad_norm": 0.3232758641242981,
"learning_rate": 0.001984602224123182,
"loss": 1.4351,
"step": 10
},
{
"epoch": 0.03072,
"grad_norm": 0.30670273303985596,
"learning_rate": 0.0019811804961505563,
"loss": 1.3924,
"step": 12
},
{
"epoch": 0.03584,
"grad_norm": 0.2764911651611328,
"learning_rate": 0.0019777587681779297,
"loss": 1.3622,
"step": 14
},
{
"epoch": 0.04096,
"grad_norm": 0.24471326172351837,
"learning_rate": 0.001974337040205304,
"loss": 1.386,
"step": 16
},
{
"epoch": 0.04608,
"grad_norm": 0.21196012198925018,
"learning_rate": 0.0019709153122326774,
"loss": 1.3682,
"step": 18
},
{
"epoch": 0.0512,
"grad_norm": 0.23733310401439667,
"learning_rate": 0.0019674935842600516,
"loss": 1.3512,
"step": 20
},
{
"epoch": 0.05632,
"grad_norm": 0.16206419467926025,
"learning_rate": 0.001964071856287425,
"loss": 1.3428,
"step": 22
},
{
"epoch": 0.06144,
"grad_norm": 0.2223929464817047,
"learning_rate": 0.001960650128314799,
"loss": 1.3706,
"step": 24
},
{
"epoch": 0.06656,
"grad_norm": 0.1612645536661148,
"learning_rate": 0.0019572284003421727,
"loss": 1.3775,
"step": 26
},
{
"epoch": 0.07168,
"grad_norm": 0.16616274416446686,
"learning_rate": 0.0019538066723695465,
"loss": 1.3572,
"step": 28
},
{
"epoch": 0.0768,
"grad_norm": 0.15922699868679047,
"learning_rate": 0.0019503849443969204,
"loss": 1.3482,
"step": 30
},
{
"epoch": 0.08192,
"grad_norm": 0.14939665794372559,
"learning_rate": 0.0019469632164242944,
"loss": 1.3312,
"step": 32
},
{
"epoch": 0.08704,
"grad_norm": 0.16849732398986816,
"learning_rate": 0.0019435414884516682,
"loss": 1.3394,
"step": 34
},
{
"epoch": 0.09216,
"grad_norm": 0.1453033685684204,
"learning_rate": 0.0019401197604790419,
"loss": 1.2957,
"step": 36
},
{
"epoch": 0.09728,
"grad_norm": 0.14633750915527344,
"learning_rate": 0.001936698032506416,
"loss": 1.334,
"step": 38
},
{
"epoch": 0.1024,
"grad_norm": 0.12038926780223846,
"learning_rate": 0.0019332763045337895,
"loss": 1.3257,
"step": 40
},
{
"epoch": 0.10752,
"grad_norm": 0.1144009605050087,
"learning_rate": 0.0019298545765611636,
"loss": 1.337,
"step": 42
},
{
"epoch": 0.11264,
"grad_norm": 0.09070790559053421,
"learning_rate": 0.0019264328485885372,
"loss": 1.2969,
"step": 44
},
{
"epoch": 0.11776,
"grad_norm": 0.09154277294874191,
"learning_rate": 0.001923011120615911,
"loss": 1.3233,
"step": 46
},
{
"epoch": 0.12288,
"grad_norm": 0.09742377698421478,
"learning_rate": 0.0019195893926432848,
"loss": 1.3272,
"step": 48
},
{
"epoch": 0.128,
"grad_norm": 0.10923709720373154,
"learning_rate": 0.0019161676646706587,
"loss": 1.3353,
"step": 50
},
{
"epoch": 0.13312,
"grad_norm": 0.10012141615152359,
"learning_rate": 0.0019127459366980327,
"loss": 1.3011,
"step": 52
},
{
"epoch": 0.13824,
"grad_norm": 0.10850805044174194,
"learning_rate": 0.0019093242087254063,
"loss": 1.3372,
"step": 54
},
{
"epoch": 0.14336,
"grad_norm": 0.11083640158176422,
"learning_rate": 0.0019059024807527804,
"loss": 1.3098,
"step": 56
},
{
"epoch": 0.14848,
"grad_norm": 0.11255177110433578,
"learning_rate": 0.001902480752780154,
"loss": 1.3223,
"step": 58
},
{
"epoch": 0.1536,
"grad_norm": 0.09206261485815048,
"learning_rate": 0.0018990590248075278,
"loss": 1.2899,
"step": 60
},
{
"epoch": 0.15872,
"grad_norm": 0.09589900076389313,
"learning_rate": 0.0018956372968349016,
"loss": 1.3026,
"step": 62
},
{
"epoch": 0.16384,
"grad_norm": 0.0895056203007698,
"learning_rate": 0.0018922155688622755,
"loss": 1.2996,
"step": 64
},
{
"epoch": 0.16896,
"grad_norm": 0.11926258355379105,
"learning_rate": 0.0018887938408896493,
"loss": 1.2952,
"step": 66
},
{
"epoch": 0.17408,
"grad_norm": 0.11748767644166946,
"learning_rate": 0.0018853721129170231,
"loss": 1.3085,
"step": 68
},
{
"epoch": 0.1792,
"grad_norm": 0.09052393585443497,
"learning_rate": 0.001881950384944397,
"loss": 1.3111,
"step": 70
},
{
"epoch": 0.18432,
"grad_norm": 0.11488136649131775,
"learning_rate": 0.0018785286569717708,
"loss": 1.2884,
"step": 72
},
{
"epoch": 0.18944,
"grad_norm": 0.10231968015432358,
"learning_rate": 0.0018751069289991446,
"loss": 1.2991,
"step": 74
},
{
"epoch": 0.19456,
"grad_norm": 0.17033444344997406,
"learning_rate": 0.0018716852010265185,
"loss": 1.3188,
"step": 76
},
{
"epoch": 0.19968,
"grad_norm": 0.1066645011305809,
"learning_rate": 0.0018682634730538923,
"loss": 1.2946,
"step": 78
},
{
"epoch": 0.2048,
"grad_norm": 0.1261938363313675,
"learning_rate": 0.0018648417450812661,
"loss": 1.316,
"step": 80
},
{
"epoch": 0.20992,
"grad_norm": 0.09089711308479309,
"learning_rate": 0.00186142001710864,
"loss": 1.2927,
"step": 82
},
{
"epoch": 0.21504,
"grad_norm": 0.16600407660007477,
"learning_rate": 0.0018579982891360136,
"loss": 1.2806,
"step": 84
},
{
"epoch": 0.22016,
"grad_norm": 0.10440811514854431,
"learning_rate": 0.0018545765611633876,
"loss": 1.2928,
"step": 86
},
{
"epoch": 0.22528,
"grad_norm": 0.11303785443305969,
"learning_rate": 0.0018511548331907612,
"loss": 1.2999,
"step": 88
},
{
"epoch": 0.2304,
"grad_norm": 0.1060706302523613,
"learning_rate": 0.0018477331052181353,
"loss": 1.2887,
"step": 90
},
{
"epoch": 0.23552,
"grad_norm": 0.11111288517713547,
"learning_rate": 0.001844311377245509,
"loss": 1.2935,
"step": 92
},
{
"epoch": 0.24064,
"grad_norm": 0.10950679332017899,
"learning_rate": 0.001840889649272883,
"loss": 1.2865,
"step": 94
},
{
"epoch": 0.24576,
"grad_norm": 0.09789486229419708,
"learning_rate": 0.0018374679213002568,
"loss": 1.3088,
"step": 96
},
{
"epoch": 0.25088,
"grad_norm": 0.10912415385246277,
"learning_rate": 0.0018340461933276304,
"loss": 1.2811,
"step": 98
},
{
"epoch": 0.256,
"grad_norm": 0.11448093503713608,
"learning_rate": 0.0018306244653550044,
"loss": 1.2707,
"step": 100
},
{
"epoch": 0.26112,
"grad_norm": 0.10610745847225189,
"learning_rate": 0.001827202737382378,
"loss": 1.2994,
"step": 102
},
{
"epoch": 0.26624,
"grad_norm": 0.10192134976387024,
"learning_rate": 0.001823781009409752,
"loss": 1.2807,
"step": 104
},
{
"epoch": 0.27136,
"grad_norm": 0.10114825516939163,
"learning_rate": 0.0018203592814371257,
"loss": 1.2582,
"step": 106
},
{
"epoch": 0.27648,
"grad_norm": 0.09611225128173828,
"learning_rate": 0.0018169375534644997,
"loss": 1.2951,
"step": 108
},
{
"epoch": 0.2816,
"grad_norm": 0.08788544684648514,
"learning_rate": 0.0018135158254918733,
"loss": 1.2869,
"step": 110
},
{
"epoch": 0.28672,
"grad_norm": 0.10186023265123367,
"learning_rate": 0.0018100940975192472,
"loss": 1.2916,
"step": 112
},
{
"epoch": 0.29184,
"grad_norm": 0.11426091939210892,
"learning_rate": 0.0018066723695466212,
"loss": 1.2705,
"step": 114
},
{
"epoch": 0.29696,
"grad_norm": 0.0911969244480133,
"learning_rate": 0.0018032506415739948,
"loss": 1.2826,
"step": 116
},
{
"epoch": 0.30208,
"grad_norm": 0.08570262044668198,
"learning_rate": 0.0017998289136013689,
"loss": 1.2783,
"step": 118
},
{
"epoch": 0.3072,
"grad_norm": 0.10600815713405609,
"learning_rate": 0.0017964071856287425,
"loss": 1.2633,
"step": 120
},
{
"epoch": 0.31232,
"grad_norm": 0.10680408775806427,
"learning_rate": 0.0017929854576561163,
"loss": 1.2751,
"step": 122
},
{
"epoch": 0.31744,
"grad_norm": 0.10642975568771362,
"learning_rate": 0.0017895637296834902,
"loss": 1.2776,
"step": 124
},
{
"epoch": 0.32256,
"grad_norm": 0.09877141565084457,
"learning_rate": 0.001786142001710864,
"loss": 1.2868,
"step": 126
},
{
"epoch": 0.32768,
"grad_norm": 0.12077789753675461,
"learning_rate": 0.0017827202737382378,
"loss": 1.2825,
"step": 128
},
{
"epoch": 0.3328,
"grad_norm": 0.1019749641418457,
"learning_rate": 0.0017792985457656116,
"loss": 1.2689,
"step": 130
},
{
"epoch": 0.33792,
"grad_norm": 0.08969077467918396,
"learning_rate": 0.0017758768177929857,
"loss": 1.2795,
"step": 132
},
{
"epoch": 0.34304,
"grad_norm": 0.10050085186958313,
"learning_rate": 0.0017724550898203593,
"loss": 1.2748,
"step": 134
},
{
"epoch": 0.34816,
"grad_norm": 0.08829426020383835,
"learning_rate": 0.0017690333618477331,
"loss": 1.2577,
"step": 136
},
{
"epoch": 0.35328,
"grad_norm": 0.09129626303911209,
"learning_rate": 0.001765611633875107,
"loss": 1.2703,
"step": 138
},
{
"epoch": 0.3584,
"grad_norm": 0.10055152326822281,
"learning_rate": 0.0017621899059024808,
"loss": 1.2618,
"step": 140
},
{
"epoch": 0.36352,
"grad_norm": 0.09721378982067108,
"learning_rate": 0.0017587681779298546,
"loss": 1.2809,
"step": 142
},
{
"epoch": 0.36864,
"grad_norm": 0.09917334467172623,
"learning_rate": 0.0017553464499572285,
"loss": 1.2628,
"step": 144
},
{
"epoch": 0.37376,
"grad_norm": 0.1129026785492897,
"learning_rate": 0.0017519247219846023,
"loss": 1.2684,
"step": 146
},
{
"epoch": 0.37888,
"grad_norm": 0.08873754739761353,
"learning_rate": 0.0017485029940119761,
"loss": 1.2726,
"step": 148
},
{
"epoch": 0.384,
"grad_norm": 0.10734206438064575,
"learning_rate": 0.0017450812660393497,
"loss": 1.2545,
"step": 150
},
{
"epoch": 0.38912,
"grad_norm": 0.09691416472196579,
"learning_rate": 0.0017416595380667238,
"loss": 1.2511,
"step": 152
},
{
"epoch": 0.39424,
"grad_norm": 0.10880187153816223,
"learning_rate": 0.0017382378100940976,
"loss": 1.2656,
"step": 154
},
{
"epoch": 0.39936,
"grad_norm": 0.10392981767654419,
"learning_rate": 0.0017348160821214714,
"loss": 1.2755,
"step": 156
},
{
"epoch": 0.40448,
"grad_norm": 0.08561566472053528,
"learning_rate": 0.0017313943541488453,
"loss": 1.2956,
"step": 158
},
{
"epoch": 0.4096,
"grad_norm": 0.09822013229131699,
"learning_rate": 0.0017279726261762189,
"loss": 1.2887,
"step": 160
},
{
"epoch": 0.41472,
"grad_norm": 0.09553670883178711,
"learning_rate": 0.001724550898203593,
"loss": 1.2743,
"step": 162
},
{
"epoch": 0.41984,
"grad_norm": 0.10794595628976822,
"learning_rate": 0.0017211291702309665,
"loss": 1.2603,
"step": 164
},
{
"epoch": 0.42496,
"grad_norm": 0.09373841434717178,
"learning_rate": 0.0017177074422583406,
"loss": 1.2441,
"step": 166
},
{
"epoch": 0.43008,
"grad_norm": 0.10550329089164734,
"learning_rate": 0.0017142857142857142,
"loss": 1.2714,
"step": 168
},
{
"epoch": 0.4352,
"grad_norm": 0.08271865546703339,
"learning_rate": 0.0017108639863130882,
"loss": 1.2671,
"step": 170
},
{
"epoch": 0.44032,
"grad_norm": 0.09635920822620392,
"learning_rate": 0.001707442258340462,
"loss": 1.2861,
"step": 172
},
{
"epoch": 0.44544,
"grad_norm": 0.0995635837316513,
"learning_rate": 0.0017040205303678357,
"loss": 1.2803,
"step": 174
},
{
"epoch": 0.45056,
"grad_norm": 0.10621396452188492,
"learning_rate": 0.0017005988023952097,
"loss": 1.2685,
"step": 176
},
{
"epoch": 0.45568,
"grad_norm": 0.10926424711942673,
"learning_rate": 0.0016971770744225833,
"loss": 1.2551,
"step": 178
},
{
"epoch": 0.4608,
"grad_norm": 0.09723444283008575,
"learning_rate": 0.0016937553464499574,
"loss": 1.2547,
"step": 180
},
{
"epoch": 0.46592,
"grad_norm": 0.09107507020235062,
"learning_rate": 0.001690333618477331,
"loss": 1.2664,
"step": 182
},
{
"epoch": 0.47104,
"grad_norm": 0.08052769303321838,
"learning_rate": 0.001686911890504705,
"loss": 1.2645,
"step": 184
},
{
"epoch": 0.47616,
"grad_norm": 0.0871344730257988,
"learning_rate": 0.0016834901625320787,
"loss": 1.2694,
"step": 186
},
{
"epoch": 0.48128,
"grad_norm": 0.11148510873317719,
"learning_rate": 0.0016800684345594525,
"loss": 1.2697,
"step": 188
},
{
"epoch": 0.4864,
"grad_norm": 0.08355887234210968,
"learning_rate": 0.0016766467065868263,
"loss": 1.2458,
"step": 190
},
{
"epoch": 0.49152,
"grad_norm": 0.08468321710824966,
"learning_rate": 0.0016732249786142002,
"loss": 1.252,
"step": 192
},
{
"epoch": 0.49664,
"grad_norm": 0.08812184631824493,
"learning_rate": 0.0016698032506415742,
"loss": 1.2746,
"step": 194
},
{
"epoch": 0.50176,
"grad_norm": 0.09015596657991409,
"learning_rate": 0.0016663815226689478,
"loss": 1.2659,
"step": 196
},
{
"epoch": 0.50688,
"grad_norm": 0.08708484470844269,
"learning_rate": 0.0016629597946963216,
"loss": 1.2727,
"step": 198
},
{
"epoch": 0.512,
"grad_norm": 0.09585189074277878,
"learning_rate": 0.0016595380667236955,
"loss": 1.2702,
"step": 200
},
{
"epoch": 0.51712,
"grad_norm": 0.08958299458026886,
"learning_rate": 0.0016561163387510693,
"loss": 1.2605,
"step": 202
},
{
"epoch": 0.52224,
"grad_norm": 0.0867680162191391,
"learning_rate": 0.0016526946107784431,
"loss": 1.2561,
"step": 204
},
{
"epoch": 0.52736,
"grad_norm": 0.08700387924909592,
"learning_rate": 0.001649272882805817,
"loss": 1.2481,
"step": 206
},
{
"epoch": 0.53248,
"grad_norm": 0.08566949516534805,
"learning_rate": 0.0016458511548331908,
"loss": 1.255,
"step": 208
},
{
"epoch": 0.5376,
"grad_norm": 0.08990107476711273,
"learning_rate": 0.0016424294268605646,
"loss": 1.2558,
"step": 210
},
{
"epoch": 0.54272,
"grad_norm": 0.08288553357124329,
"learning_rate": 0.0016390076988879385,
"loss": 1.2507,
"step": 212
},
{
"epoch": 0.54784,
"grad_norm": 0.08027470856904984,
"learning_rate": 0.0016355859709153123,
"loss": 1.278,
"step": 214
},
{
"epoch": 0.55296,
"grad_norm": 0.09287162125110626,
"learning_rate": 0.0016321642429426861,
"loss": 1.2626,
"step": 216
},
{
"epoch": 0.55808,
"grad_norm": 0.09153173863887787,
"learning_rate": 0.00162874251497006,
"loss": 1.2643,
"step": 218
},
{
"epoch": 0.5632,
"grad_norm": 0.10922811180353165,
"learning_rate": 0.0016253207869974338,
"loss": 1.2755,
"step": 220
},
{
"epoch": 0.56832,
"grad_norm": 0.1041250005364418,
"learning_rate": 0.0016218990590248074,
"loss": 1.2493,
"step": 222
},
{
"epoch": 0.57344,
"grad_norm": 0.08747130632400513,
"learning_rate": 0.0016184773310521814,
"loss": 1.2491,
"step": 224
},
{
"epoch": 0.57856,
"grad_norm": 0.10124468803405762,
"learning_rate": 0.001615055603079555,
"loss": 1.2387,
"step": 226
},
{
"epoch": 0.58368,
"grad_norm": 0.09022431075572968,
"learning_rate": 0.001611633875106929,
"loss": 1.2521,
"step": 228
},
{
"epoch": 0.5888,
"grad_norm": 0.09623134136199951,
"learning_rate": 0.0016082121471343027,
"loss": 1.2701,
"step": 230
},
{
"epoch": 0.59392,
"grad_norm": 0.09208202362060547,
"learning_rate": 0.0016047904191616768,
"loss": 1.2413,
"step": 232
},
{
"epoch": 0.59904,
"grad_norm": 0.10192185640335083,
"learning_rate": 0.0016013686911890506,
"loss": 1.278,
"step": 234
},
{
"epoch": 0.60416,
"grad_norm": 0.11536680907011032,
"learning_rate": 0.0015979469632164242,
"loss": 1.2729,
"step": 236
},
{
"epoch": 0.60928,
"grad_norm": 0.11593331396579742,
"learning_rate": 0.0015945252352437982,
"loss": 1.2772,
"step": 238
},
{
"epoch": 0.6144,
"grad_norm": 0.10352569818496704,
"learning_rate": 0.0015911035072711719,
"loss": 1.2587,
"step": 240
},
{
"epoch": 0.61952,
"grad_norm": 0.11213277280330658,
"learning_rate": 0.001587681779298546,
"loss": 1.2626,
"step": 242
},
{
"epoch": 0.62464,
"grad_norm": 0.11924043297767639,
"learning_rate": 0.0015842600513259195,
"loss": 1.2826,
"step": 244
},
{
"epoch": 0.62976,
"grad_norm": 0.09039822220802307,
"learning_rate": 0.0015808383233532936,
"loss": 1.2777,
"step": 246
},
{
"epoch": 0.63488,
"grad_norm": 0.09418819099664688,
"learning_rate": 0.0015774165953806672,
"loss": 1.2765,
"step": 248
},
{
"epoch": 0.64,
"grad_norm": 0.09826017916202545,
"learning_rate": 0.001573994867408041,
"loss": 1.2474,
"step": 250
},
{
"epoch": 0.64512,
"grad_norm": 0.09216541796922684,
"learning_rate": 0.001570573139435415,
"loss": 1.243,
"step": 252
},
{
"epoch": 0.65024,
"grad_norm": 0.09861636161804199,
"learning_rate": 0.0015671514114627887,
"loss": 1.239,
"step": 254
},
{
"epoch": 0.65536,
"grad_norm": 0.1033303365111351,
"learning_rate": 0.0015637296834901627,
"loss": 1.2571,
"step": 256
},
{
"epoch": 0.66048,
"grad_norm": 0.10304012894630432,
"learning_rate": 0.0015603079555175363,
"loss": 1.2556,
"step": 258
},
{
"epoch": 0.6656,
"grad_norm": 0.0865844339132309,
"learning_rate": 0.0015568862275449104,
"loss": 1.2475,
"step": 260
},
{
"epoch": 0.67072,
"grad_norm": 0.10008803755044937,
"learning_rate": 0.001553464499572284,
"loss": 1.2795,
"step": 262
},
{
"epoch": 0.67584,
"grad_norm": 0.09249156713485718,
"learning_rate": 0.0015500427715996578,
"loss": 1.263,
"step": 264
},
{
"epoch": 0.68096,
"grad_norm": 0.09253022074699402,
"learning_rate": 0.0015466210436270317,
"loss": 1.2901,
"step": 266
},
{
"epoch": 0.68608,
"grad_norm": 0.09615321457386017,
"learning_rate": 0.0015431993156544055,
"loss": 1.2643,
"step": 268
},
{
"epoch": 0.6912,
"grad_norm": 0.13639943301677704,
"learning_rate": 0.0015397775876817793,
"loss": 1.2586,
"step": 270
},
{
"epoch": 0.69632,
"grad_norm": 0.10662351548671722,
"learning_rate": 0.0015363558597091531,
"loss": 1.2414,
"step": 272
},
{
"epoch": 0.70144,
"grad_norm": 0.08936125040054321,
"learning_rate": 0.001532934131736527,
"loss": 1.2645,
"step": 274
},
{
"epoch": 0.70656,
"grad_norm": 0.09400724619626999,
"learning_rate": 0.0015295124037639008,
"loss": 1.2274,
"step": 276
},
{
"epoch": 0.71168,
"grad_norm": 0.10700514912605286,
"learning_rate": 0.0015260906757912746,
"loss": 1.2564,
"step": 278
},
{
"epoch": 0.7168,
"grad_norm": 0.09849894791841507,
"learning_rate": 0.0015226689478186485,
"loss": 1.2658,
"step": 280
},
{
"epoch": 0.72192,
"grad_norm": 0.10522880405187607,
"learning_rate": 0.0015192472198460223,
"loss": 1.2775,
"step": 282
},
{
"epoch": 0.72704,
"grad_norm": 0.09173361957073212,
"learning_rate": 0.0015158254918733961,
"loss": 1.2738,
"step": 284
},
{
"epoch": 0.73216,
"grad_norm": 0.09410373121500015,
"learning_rate": 0.00151240376390077,
"loss": 1.2483,
"step": 286
},
{
"epoch": 0.73728,
"grad_norm": 0.08925613760948181,
"learning_rate": 0.0015089820359281436,
"loss": 1.2554,
"step": 288
},
{
"epoch": 0.7424,
"grad_norm": 0.09131377190351486,
"learning_rate": 0.0015055603079555176,
"loss": 1.2496,
"step": 290
},
{
"epoch": 0.74752,
"grad_norm": 0.10802093148231506,
"learning_rate": 0.0015021385799828914,
"loss": 1.2675,
"step": 292
},
{
"epoch": 0.75264,
"grad_norm": 0.09466376155614853,
"learning_rate": 0.0014987168520102653,
"loss": 1.2482,
"step": 294
},
{
"epoch": 0.75776,
"grad_norm": 0.10151738673448563,
"learning_rate": 0.001495295124037639,
"loss": 1.2676,
"step": 296
},
{
"epoch": 0.76288,
"grad_norm": 0.08664025366306305,
"learning_rate": 0.0014918733960650127,
"loss": 1.2496,
"step": 298
},
{
"epoch": 0.768,
"grad_norm": 0.08441973477602005,
"learning_rate": 0.0014884516680923868,
"loss": 1.254,
"step": 300
},
{
"epoch": 0.77312,
"grad_norm": 0.08028802275657654,
"learning_rate": 0.0014850299401197604,
"loss": 1.2718,
"step": 302
},
{
"epoch": 0.77824,
"grad_norm": 0.08772825449705124,
"learning_rate": 0.0014816082121471344,
"loss": 1.2624,
"step": 304
},
{
"epoch": 0.78336,
"grad_norm": 0.08271320164203644,
"learning_rate": 0.001478186484174508,
"loss": 1.2467,
"step": 306
},
{
"epoch": 0.78848,
"grad_norm": 0.08346061408519745,
"learning_rate": 0.001474764756201882,
"loss": 1.2782,
"step": 308
},
{
"epoch": 0.7936,
"grad_norm": 0.09925299137830734,
"learning_rate": 0.0014713430282292557,
"loss": 1.2489,
"step": 310
},
{
"epoch": 0.79872,
"grad_norm": 0.08743379265069962,
"learning_rate": 0.0014679213002566295,
"loss": 1.257,
"step": 312
},
{
"epoch": 0.80384,
"grad_norm": 0.08556243032217026,
"learning_rate": 0.0014644995722840036,
"loss": 1.2507,
"step": 314
},
{
"epoch": 0.80896,
"grad_norm": 0.0923091247677803,
"learning_rate": 0.0014610778443113772,
"loss": 1.2551,
"step": 316
},
{
"epoch": 0.81408,
"grad_norm": 0.0928301066160202,
"learning_rate": 0.0014576561163387512,
"loss": 1.2309,
"step": 318
},
{
"epoch": 0.8192,
"grad_norm": 0.08794920146465302,
"learning_rate": 0.0014542343883661248,
"loss": 1.2457,
"step": 320
},
{
"epoch": 0.82432,
"grad_norm": 0.10327792912721634,
"learning_rate": 0.001450812660393499,
"loss": 1.2786,
"step": 322
},
{
"epoch": 0.82944,
"grad_norm": 0.12187407165765762,
"learning_rate": 0.0014473909324208725,
"loss": 1.2814,
"step": 324
},
{
"epoch": 0.83456,
"grad_norm": 0.11819777637720108,
"learning_rate": 0.0014439692044482463,
"loss": 1.2499,
"step": 326
},
{
"epoch": 0.83968,
"grad_norm": 0.09041640162467957,
"learning_rate": 0.0014405474764756202,
"loss": 1.2693,
"step": 328
},
{
"epoch": 0.8448,
"grad_norm": 0.09638890624046326,
"learning_rate": 0.001437125748502994,
"loss": 1.2514,
"step": 330
},
{
"epoch": 0.84992,
"grad_norm": 0.12534624338150024,
"learning_rate": 0.001433704020530368,
"loss": 1.2407,
"step": 332
},
{
"epoch": 0.85504,
"grad_norm": 0.09927276521921158,
"learning_rate": 0.0014302822925577417,
"loss": 1.2749,
"step": 334
},
{
"epoch": 0.86016,
"grad_norm": 0.0886382907629013,
"learning_rate": 0.0014268605645851157,
"loss": 1.2478,
"step": 336
},
{
"epoch": 0.86528,
"grad_norm": 0.09105540066957474,
"learning_rate": 0.0014234388366124893,
"loss": 1.2538,
"step": 338
},
{
"epoch": 0.8704,
"grad_norm": 0.08825322240591049,
"learning_rate": 0.0014200171086398631,
"loss": 1.2679,
"step": 340
},
{
"epoch": 0.87552,
"grad_norm": 0.08396722376346588,
"learning_rate": 0.001416595380667237,
"loss": 1.2091,
"step": 342
},
{
"epoch": 0.88064,
"grad_norm": 0.08241663128137589,
"learning_rate": 0.0014131736526946108,
"loss": 1.2401,
"step": 344
},
{
"epoch": 0.88576,
"grad_norm": 0.11285565793514252,
"learning_rate": 0.0014097519247219846,
"loss": 1.2461,
"step": 346
},
{
"epoch": 0.89088,
"grad_norm": 0.09898606687784195,
"learning_rate": 0.0014063301967493585,
"loss": 1.2587,
"step": 348
},
{
"epoch": 0.896,
"grad_norm": 0.09175729751586914,
"learning_rate": 0.001402908468776732,
"loss": 1.2318,
"step": 350
},
{
"epoch": 0.90112,
"grad_norm": 0.08974505960941315,
"learning_rate": 0.0013994867408041061,
"loss": 1.265,
"step": 352
},
{
"epoch": 0.90624,
"grad_norm": 0.10069482773542404,
"learning_rate": 0.00139606501283148,
"loss": 1.2276,
"step": 354
},
{
"epoch": 0.91136,
"grad_norm": 0.09355876594781876,
"learning_rate": 0.0013926432848588538,
"loss": 1.2474,
"step": 356
},
{
"epoch": 0.91648,
"grad_norm": 0.09747931361198425,
"learning_rate": 0.0013892215568862276,
"loss": 1.2677,
"step": 358
},
{
"epoch": 0.9216,
"grad_norm": 0.09114759415388107,
"learning_rate": 0.0013857998289136014,
"loss": 1.2442,
"step": 360
},
{
"epoch": 0.92672,
"grad_norm": 0.09683123230934143,
"learning_rate": 0.0013823781009409753,
"loss": 1.2328,
"step": 362
},
{
"epoch": 0.93184,
"grad_norm": 0.08525967597961426,
"learning_rate": 0.0013789563729683489,
"loss": 1.2301,
"step": 364
},
{
"epoch": 0.93696,
"grad_norm": 0.09861680120229721,
"learning_rate": 0.001375534644995723,
"loss": 1.2604,
"step": 366
},
{
"epoch": 0.94208,
"grad_norm": 0.08435367792844772,
"learning_rate": 0.0013721129170230965,
"loss": 1.2605,
"step": 368
},
{
"epoch": 0.9472,
"grad_norm": 0.08719425648450851,
"learning_rate": 0.0013686911890504706,
"loss": 1.2412,
"step": 370
},
{
"epoch": 0.95232,
"grad_norm": 0.10853152722120285,
"learning_rate": 0.0013652694610778444,
"loss": 1.2503,
"step": 372
},
{
"epoch": 0.95744,
"grad_norm": 0.08797234296798706,
"learning_rate": 0.001361847733105218,
"loss": 1.2323,
"step": 374
},
{
"epoch": 0.96256,
"grad_norm": 0.09422844648361206,
"learning_rate": 0.001358426005132592,
"loss": 1.2444,
"step": 376
},
{
"epoch": 0.96768,
"grad_norm": 0.09000077098608017,
"learning_rate": 0.0013550042771599657,
"loss": 1.2573,
"step": 378
},
{
"epoch": 0.9728,
"grad_norm": 0.09097360074520111,
"learning_rate": 0.0013515825491873397,
"loss": 1.2463,
"step": 380
},
{
"epoch": 0.97792,
"grad_norm": 0.08720215409994125,
"learning_rate": 0.0013481608212147134,
"loss": 1.2719,
"step": 382
},
{
"epoch": 0.98304,
"grad_norm": 0.09287154674530029,
"learning_rate": 0.0013447390932420874,
"loss": 1.2387,
"step": 384
},
{
"epoch": 0.98816,
"grad_norm": 0.08979474008083344,
"learning_rate": 0.001341317365269461,
"loss": 1.2353,
"step": 386
},
{
"epoch": 0.99328,
"grad_norm": 0.0891214981675148,
"learning_rate": 0.0013378956372968348,
"loss": 1.2441,
"step": 388
},
{
"epoch": 0.9984,
"grad_norm": 0.09961092472076416,
"learning_rate": 0.0013344739093242087,
"loss": 1.2407,
"step": 390
},
{
"epoch": 1.00256,
"grad_norm": 0.09316655993461609,
"learning_rate": 0.0013310521813515825,
"loss": 1.2589,
"step": 392
},
{
"epoch": 1.00768,
"grad_norm": 0.6676607131958008,
"learning_rate": 0.0013276304533789566,
"loss": 1.3337,
"step": 394
},
{
"epoch": 1.0128,
"grad_norm": 0.24017909169197083,
"learning_rate": 0.0013242087254063302,
"loss": 1.2628,
"step": 396
},
{
"epoch": 1.01792,
"grad_norm": 0.17012369632720947,
"learning_rate": 0.0013207869974337042,
"loss": 1.2607,
"step": 398
},
{
"epoch": 1.02304,
"grad_norm": 0.18585637211799622,
"learning_rate": 0.0013173652694610778,
"loss": 1.2502,
"step": 400
},
{
"epoch": 1.02304,
"eval_loss": 1.2686372995376587,
"eval_runtime": 279.6676,
"eval_samples_per_second": 8.939,
"eval_steps_per_second": 1.119,
"step": 400
},
{
"epoch": 1.02816,
"grad_norm": 0.16060733795166016,
"learning_rate": 0.0013139435414884517,
"loss": 1.2418,
"step": 402
},
{
"epoch": 1.03328,
"grad_norm": 0.13407620787620544,
"learning_rate": 0.0013105218135158255,
"loss": 1.2749,
"step": 404
},
{
"epoch": 1.0384,
"grad_norm": 0.12042540311813354,
"learning_rate": 0.0013071000855431993,
"loss": 1.2465,
"step": 406
},
{
"epoch": 1.04352,
"grad_norm": 0.13832047581672668,
"learning_rate": 0.0013036783575705731,
"loss": 1.2535,
"step": 408
},
{
"epoch": 1.04864,
"grad_norm": 0.11310556530952454,
"learning_rate": 0.001300256629597947,
"loss": 1.2352,
"step": 410
},
{
"epoch": 1.05376,
"grad_norm": 0.13382209837436676,
"learning_rate": 0.001296834901625321,
"loss": 1.2507,
"step": 412
},
{
"epoch": 1.05888,
"grad_norm": 0.13580721616744995,
"learning_rate": 0.0012934131736526946,
"loss": 1.2401,
"step": 414
},
{
"epoch": 1.064,
"grad_norm": 0.11162128299474716,
"learning_rate": 0.0012899914456800685,
"loss": 1.2534,
"step": 416
},
{
"epoch": 1.06912,
"grad_norm": 0.14186108112335205,
"learning_rate": 0.0012865697177074423,
"loss": 1.248,
"step": 418
},
{
"epoch": 1.07424,
"grad_norm": 0.13840018212795258,
"learning_rate": 0.0012831479897348161,
"loss": 1.2462,
"step": 420
},
{
"epoch": 1.07936,
"grad_norm": 0.11488790065050125,
"learning_rate": 0.00127972626176219,
"loss": 1.241,
"step": 422
},
{
"epoch": 1.08448,
"grad_norm": 0.1290818750858307,
"learning_rate": 0.0012763045337895638,
"loss": 1.2275,
"step": 424
},
{
"epoch": 1.0896,
"grad_norm": 0.10360855609178543,
"learning_rate": 0.0012728828058169374,
"loss": 1.2378,
"step": 426
},
{
"epoch": 1.09472,
"grad_norm": 0.12253882735967636,
"learning_rate": 0.0012694610778443114,
"loss": 1.2562,
"step": 428
},
{
"epoch": 1.09984,
"grad_norm": 0.12016449123620987,
"learning_rate": 0.001266039349871685,
"loss": 1.2742,
"step": 430
},
{
"epoch": 1.10496,
"grad_norm": 0.12187926471233368,
"learning_rate": 0.0012626176218990591,
"loss": 1.2202,
"step": 432
},
{
"epoch": 1.11008,
"grad_norm": 0.11081688851118088,
"learning_rate": 0.001259195893926433,
"loss": 1.2474,
"step": 434
},
{
"epoch": 1.1152,
"grad_norm": 0.09387561678886414,
"learning_rate": 0.0012557741659538068,
"loss": 1.2566,
"step": 436
},
{
"epoch": 1.12032,
"grad_norm": 0.11739682406187057,
"learning_rate": 0.0012523524379811806,
"loss": 1.2421,
"step": 438
},
{
"epoch": 1.12544,
"grad_norm": 0.11595962196588516,
"learning_rate": 0.0012489307100085542,
"loss": 1.2525,
"step": 440
},
{
"epoch": 1.13056,
"grad_norm": 0.10769950598478317,
"learning_rate": 0.0012455089820359283,
"loss": 1.2501,
"step": 442
},
{
"epoch": 1.13568,
"grad_norm": 0.11145374178886414,
"learning_rate": 0.0012420872540633019,
"loss": 1.2649,
"step": 444
},
{
"epoch": 1.1408,
"grad_norm": 0.09793559461832047,
"learning_rate": 0.001238665526090676,
"loss": 1.244,
"step": 446
},
{
"epoch": 1.14592,
"grad_norm": 0.16483676433563232,
"learning_rate": 0.0012352437981180495,
"loss": 1.2298,
"step": 448
},
{
"epoch": 1.15104,
"grad_norm": 0.13711702823638916,
"learning_rate": 0.0012318220701454234,
"loss": 1.2279,
"step": 450
},
{
"epoch": 1.15616,
"grad_norm": 0.10064009577035904,
"learning_rate": 0.0012284003421727974,
"loss": 1.2381,
"step": 452
},
{
"epoch": 1.16128,
"grad_norm": 0.09517936408519745,
"learning_rate": 0.001224978614200171,
"loss": 1.231,
"step": 454
},
{
"epoch": 1.1663999999999999,
"grad_norm": 0.111509308218956,
"learning_rate": 0.001221556886227545,
"loss": 1.25,
"step": 456
},
{
"epoch": 1.1715200000000001,
"grad_norm": 0.09270152449607849,
"learning_rate": 0.0012181351582549187,
"loss": 1.2478,
"step": 458
},
{
"epoch": 1.17664,
"grad_norm": 0.10461369901895523,
"learning_rate": 0.0012147134302822927,
"loss": 1.2536,
"step": 460
},
{
"epoch": 1.18176,
"grad_norm": 0.10189452022314072,
"learning_rate": 0.0012112917023096663,
"loss": 1.2177,
"step": 462
},
{
"epoch": 1.18688,
"grad_norm": 0.08983030170202255,
"learning_rate": 0.0012078699743370402,
"loss": 1.2117,
"step": 464
},
{
"epoch": 1.192,
"grad_norm": 0.09105629473924637,
"learning_rate": 0.001204448246364414,
"loss": 1.2302,
"step": 466
},
{
"epoch": 1.19712,
"grad_norm": 0.11691851168870926,
"learning_rate": 0.0012010265183917878,
"loss": 1.2101,
"step": 468
},
{
"epoch": 1.20224,
"grad_norm": 0.08926935493946075,
"learning_rate": 0.0011976047904191617,
"loss": 1.2479,
"step": 470
},
{
"epoch": 1.20736,
"grad_norm": 0.11435071378946304,
"learning_rate": 0.0011941830624465355,
"loss": 1.2593,
"step": 472
},
{
"epoch": 1.21248,
"grad_norm": 0.10086748749017715,
"learning_rate": 0.0011907613344739095,
"loss": 1.2413,
"step": 474
},
{
"epoch": 1.2176,
"grad_norm": 0.098397396504879,
"learning_rate": 0.0011873396065012832,
"loss": 1.228,
"step": 476
},
{
"epoch": 1.22272,
"grad_norm": 0.08532971143722534,
"learning_rate": 0.001183917878528657,
"loss": 1.2309,
"step": 478
},
{
"epoch": 1.22784,
"grad_norm": 0.10852818191051483,
"learning_rate": 0.0011804961505560308,
"loss": 1.232,
"step": 480
},
{
"epoch": 1.23296,
"grad_norm": 0.09808767586946487,
"learning_rate": 0.0011770744225834046,
"loss": 1.2415,
"step": 482
},
{
"epoch": 1.23808,
"grad_norm": 0.11177875101566315,
"learning_rate": 0.0011736526946107785,
"loss": 1.2592,
"step": 484
},
{
"epoch": 1.2432,
"grad_norm": 0.1047763004899025,
"learning_rate": 0.0011702309666381523,
"loss": 1.2487,
"step": 486
},
{
"epoch": 1.24832,
"grad_norm": 0.12227226048707962,
"learning_rate": 0.001166809238665526,
"loss": 1.2481,
"step": 488
},
{
"epoch": 1.2534399999999999,
"grad_norm": 0.11121272295713425,
"learning_rate": 0.0011633875106929,
"loss": 1.2369,
"step": 490
},
{
"epoch": 1.2585600000000001,
"grad_norm": 0.1087367981672287,
"learning_rate": 0.0011599657827202738,
"loss": 1.2378,
"step": 492
},
{
"epoch": 1.26368,
"grad_norm": 0.09499981254339218,
"learning_rate": 0.0011565440547476476,
"loss": 1.2327,
"step": 494
},
{
"epoch": 1.2688,
"grad_norm": 0.12346815317869186,
"learning_rate": 0.0011531223267750215,
"loss": 1.2538,
"step": 496
},
{
"epoch": 1.27392,
"grad_norm": 0.10443610697984695,
"learning_rate": 0.0011497005988023953,
"loss": 1.2343,
"step": 498
},
{
"epoch": 1.27904,
"grad_norm": 0.1339293271303177,
"learning_rate": 0.0011462788708297691,
"loss": 1.2198,
"step": 500
},
{
"epoch": 1.28416,
"grad_norm": 0.08476725220680237,
"learning_rate": 0.0011428571428571427,
"loss": 1.236,
"step": 502
},
{
"epoch": 1.28928,
"grad_norm": 0.12042795866727829,
"learning_rate": 0.0011394354148845168,
"loss": 1.2357,
"step": 504
},
{
"epoch": 1.2944,
"grad_norm": 0.08857988566160202,
"learning_rate": 0.0011360136869118904,
"loss": 1.2476,
"step": 506
},
{
"epoch": 1.29952,
"grad_norm": 0.1092582419514656,
"learning_rate": 0.0011325919589392644,
"loss": 1.2445,
"step": 508
},
{
"epoch": 1.30464,
"grad_norm": 0.0912066176533699,
"learning_rate": 0.001129170230966638,
"loss": 1.2556,
"step": 510
},
{
"epoch": 1.30976,
"grad_norm": 0.12163588404655457,
"learning_rate": 0.001125748502994012,
"loss": 1.2427,
"step": 512
},
{
"epoch": 1.31488,
"grad_norm": 0.0835074707865715,
"learning_rate": 0.001122326775021386,
"loss": 1.2546,
"step": 514
},
{
"epoch": 1.32,
"grad_norm": 0.1106950119137764,
"learning_rate": 0.0011189050470487595,
"loss": 1.2168,
"step": 516
},
{
"epoch": 1.32512,
"grad_norm": 0.09452968090772629,
"learning_rate": 0.0011154833190761336,
"loss": 1.2314,
"step": 518
},
{
"epoch": 1.3302399999999999,
"grad_norm": 0.11672431975603104,
"learning_rate": 0.0011120615911035072,
"loss": 1.2455,
"step": 520
},
{
"epoch": 1.33536,
"grad_norm": 0.10683607310056686,
"learning_rate": 0.0011086398631308812,
"loss": 1.2474,
"step": 522
},
{
"epoch": 1.34048,
"grad_norm": 0.08974877744913101,
"learning_rate": 0.0011052181351582549,
"loss": 1.2226,
"step": 524
},
{
"epoch": 1.3456000000000001,
"grad_norm": 0.10645844787359238,
"learning_rate": 0.0011017964071856287,
"loss": 1.2545,
"step": 526
},
{
"epoch": 1.35072,
"grad_norm": 0.08754228055477142,
"learning_rate": 0.0010983746792130025,
"loss": 1.217,
"step": 528
},
{
"epoch": 1.35584,
"grad_norm": 0.10209974646568298,
"learning_rate": 0.0010949529512403763,
"loss": 1.2399,
"step": 530
},
{
"epoch": 1.36096,
"grad_norm": 0.09669913351535797,
"learning_rate": 0.0010915312232677504,
"loss": 1.2081,
"step": 532
},
{
"epoch": 1.36608,
"grad_norm": 0.10272342711687088,
"learning_rate": 0.001088109495295124,
"loss": 1.2635,
"step": 534
},
{
"epoch": 1.3712,
"grad_norm": 0.0976710096001625,
"learning_rate": 0.001084687767322498,
"loss": 1.2347,
"step": 536
},
{
"epoch": 1.37632,
"grad_norm": 0.09784968197345734,
"learning_rate": 0.0010812660393498717,
"loss": 1.2522,
"step": 538
},
{
"epoch": 1.38144,
"grad_norm": 0.09353113174438477,
"learning_rate": 0.0010778443113772455,
"loss": 1.2326,
"step": 540
},
{
"epoch": 1.38656,
"grad_norm": 0.10906370729207993,
"learning_rate": 0.0010744225834046193,
"loss": 1.2375,
"step": 542
},
{
"epoch": 1.39168,
"grad_norm": 0.11907199025154114,
"learning_rate": 0.0010710008554319932,
"loss": 1.2185,
"step": 544
},
{
"epoch": 1.3968,
"grad_norm": 0.10644809901714325,
"learning_rate": 0.001067579127459367,
"loss": 1.2216,
"step": 546
},
{
"epoch": 1.40192,
"grad_norm": 0.10000847280025482,
"learning_rate": 0.0010641573994867408,
"loss": 1.2471,
"step": 548
},
{
"epoch": 1.40704,
"grad_norm": 0.11204187572002411,
"learning_rate": 0.0010607356715141146,
"loss": 1.2693,
"step": 550
},
{
"epoch": 1.41216,
"grad_norm": 0.0837775245308876,
"learning_rate": 0.0010573139435414885,
"loss": 1.2444,
"step": 552
},
{
"epoch": 1.4172799999999999,
"grad_norm": 0.09714753180742264,
"learning_rate": 0.0010538922155688623,
"loss": 1.2525,
"step": 554
},
{
"epoch": 1.4224,
"grad_norm": 0.08763246238231659,
"learning_rate": 0.0010504704875962361,
"loss": 1.2678,
"step": 556
},
{
"epoch": 1.42752,
"grad_norm": 0.12144036591053009,
"learning_rate": 0.00104704875962361,
"loss": 1.2285,
"step": 558
},
{
"epoch": 1.4326400000000001,
"grad_norm": 0.11393667757511139,
"learning_rate": 0.0010436270316509838,
"loss": 1.253,
"step": 560
},
{
"epoch": 1.43776,
"grad_norm": 0.0934453159570694,
"learning_rate": 0.0010402053036783576,
"loss": 1.2388,
"step": 562
},
{
"epoch": 1.44288,
"grad_norm": 0.1040380522608757,
"learning_rate": 0.0010367835757057312,
"loss": 1.2446,
"step": 564
},
{
"epoch": 1.448,
"grad_norm": 0.09509964287281036,
"learning_rate": 0.0010333618477331053,
"loss": 1.2328,
"step": 566
},
{
"epoch": 1.45312,
"grad_norm": 0.09978800266981125,
"learning_rate": 0.001029940119760479,
"loss": 1.2385,
"step": 568
},
{
"epoch": 1.45824,
"grad_norm": 0.08431090414524078,
"learning_rate": 0.001026518391787853,
"loss": 1.2389,
"step": 570
},
{
"epoch": 1.46336,
"grad_norm": 0.09447863698005676,
"learning_rate": 0.0010230966638152268,
"loss": 1.2301,
"step": 572
},
{
"epoch": 1.46848,
"grad_norm": 0.09212321043014526,
"learning_rate": 0.0010196749358426006,
"loss": 1.2457,
"step": 574
},
{
"epoch": 1.4736,
"grad_norm": 0.09489751607179642,
"learning_rate": 0.0010162532078699744,
"loss": 1.2229,
"step": 576
},
{
"epoch": 1.47872,
"grad_norm": 0.08897145092487335,
"learning_rate": 0.001012831479897348,
"loss": 1.2242,
"step": 578
},
{
"epoch": 1.48384,
"grad_norm": 0.09199076145887375,
"learning_rate": 0.001009409751924722,
"loss": 1.251,
"step": 580
},
{
"epoch": 1.48896,
"grad_norm": 0.1008097380399704,
"learning_rate": 0.0010059880239520957,
"loss": 1.2633,
"step": 582
},
{
"epoch": 1.49408,
"grad_norm": 0.0928090438246727,
"learning_rate": 0.0010025662959794698,
"loss": 1.238,
"step": 584
},
{
"epoch": 1.4992,
"grad_norm": 0.09936055541038513,
"learning_rate": 0.0009991445680068436,
"loss": 1.2502,
"step": 586
},
{
"epoch": 1.5043199999999999,
"grad_norm": 0.12435046583414078,
"learning_rate": 0.0009957228400342174,
"loss": 1.238,
"step": 588
},
{
"epoch": 1.5094400000000001,
"grad_norm": 0.09954190254211426,
"learning_rate": 0.000992301112061591,
"loss": 1.2403,
"step": 590
},
{
"epoch": 1.51456,
"grad_norm": 0.0886056125164032,
"learning_rate": 0.0009888793840889649,
"loss": 1.2413,
"step": 592
},
{
"epoch": 1.5196800000000001,
"grad_norm": 0.10016464442014694,
"learning_rate": 0.0009854576561163387,
"loss": 1.2272,
"step": 594
},
{
"epoch": 1.5248,
"grad_norm": 0.08891763538122177,
"learning_rate": 0.0009820359281437125,
"loss": 1.2198,
"step": 596
},
{
"epoch": 1.52992,
"grad_norm": 0.08042890578508377,
"learning_rate": 0.0009786142001710863,
"loss": 1.2386,
"step": 598
},
{
"epoch": 1.53504,
"grad_norm": 0.09712400287389755,
"learning_rate": 0.0009751924721984602,
"loss": 1.2418,
"step": 600
},
{
"epoch": 1.54016,
"grad_norm": 0.09671667218208313,
"learning_rate": 0.0009717707442258341,
"loss": 1.2493,
"step": 602
},
{
"epoch": 1.54528,
"grad_norm": 0.1355689913034439,
"learning_rate": 0.000968349016253208,
"loss": 1.2529,
"step": 604
},
{
"epoch": 1.5504,
"grad_norm": 0.08577972650527954,
"learning_rate": 0.0009649272882805818,
"loss": 1.2444,
"step": 606
},
{
"epoch": 1.55552,
"grad_norm": 0.08881525695323944,
"learning_rate": 0.0009615055603079555,
"loss": 1.2561,
"step": 608
},
{
"epoch": 1.56064,
"grad_norm": 0.08444136381149292,
"learning_rate": 0.0009580838323353293,
"loss": 1.2378,
"step": 610
},
{
"epoch": 1.56576,
"grad_norm": 0.08552881330251694,
"learning_rate": 0.0009546621043627032,
"loss": 1.2186,
"step": 612
},
{
"epoch": 1.57088,
"grad_norm": 0.08705168962478638,
"learning_rate": 0.000951240376390077,
"loss": 1.2451,
"step": 614
},
{
"epoch": 1.576,
"grad_norm": 0.08805005997419357,
"learning_rate": 0.0009478186484174508,
"loss": 1.225,
"step": 616
},
{
"epoch": 1.5811199999999999,
"grad_norm": 0.10901911556720734,
"learning_rate": 0.0009443969204448247,
"loss": 1.2351,
"step": 618
},
{
"epoch": 1.58624,
"grad_norm": 0.08406229317188263,
"learning_rate": 0.0009409751924721985,
"loss": 1.2535,
"step": 620
},
{
"epoch": 1.5913599999999999,
"grad_norm": 0.10775440186262131,
"learning_rate": 0.0009375534644995723,
"loss": 1.2474,
"step": 622
},
{
"epoch": 1.5964800000000001,
"grad_norm": 0.09037076681852341,
"learning_rate": 0.0009341317365269461,
"loss": 1.2346,
"step": 624
},
{
"epoch": 1.6016,
"grad_norm": 0.10725659132003784,
"learning_rate": 0.00093071000855432,
"loss": 1.2156,
"step": 626
},
{
"epoch": 1.6067200000000001,
"grad_norm": 0.08220596611499786,
"learning_rate": 0.0009272882805816938,
"loss": 1.2335,
"step": 628
},
{
"epoch": 1.61184,
"grad_norm": 0.08765338361263275,
"learning_rate": 0.0009238665526090676,
"loss": 1.2311,
"step": 630
},
{
"epoch": 1.61696,
"grad_norm": 0.0885564312338829,
"learning_rate": 0.0009204448246364415,
"loss": 1.2381,
"step": 632
},
{
"epoch": 1.62208,
"grad_norm": 0.09842797368764877,
"learning_rate": 0.0009170230966638152,
"loss": 1.2362,
"step": 634
},
{
"epoch": 1.6272,
"grad_norm": 0.08304440975189209,
"learning_rate": 0.000913601368691189,
"loss": 1.2635,
"step": 636
},
{
"epoch": 1.63232,
"grad_norm": 0.0953385978937149,
"learning_rate": 0.0009101796407185628,
"loss": 1.2221,
"step": 638
},
{
"epoch": 1.63744,
"grad_norm": 0.09365107119083405,
"learning_rate": 0.0009067579127459367,
"loss": 1.2449,
"step": 640
},
{
"epoch": 1.64256,
"grad_norm": 0.09064996242523193,
"learning_rate": 0.0009033361847733106,
"loss": 1.2212,
"step": 642
},
{
"epoch": 1.64768,
"grad_norm": 0.08681759238243103,
"learning_rate": 0.0008999144568006844,
"loss": 1.2581,
"step": 644
},
{
"epoch": 1.6528,
"grad_norm": 0.08267663419246674,
"learning_rate": 0.0008964927288280582,
"loss": 1.2347,
"step": 646
},
{
"epoch": 1.6579199999999998,
"grad_norm": 0.08745119720697403,
"learning_rate": 0.000893071000855432,
"loss": 1.2289,
"step": 648
},
{
"epoch": 1.66304,
"grad_norm": 0.08867384493350983,
"learning_rate": 0.0008896492728828058,
"loss": 1.2598,
"step": 650
},
{
"epoch": 1.6681599999999999,
"grad_norm": 0.08820460736751556,
"learning_rate": 0.0008862275449101797,
"loss": 1.2458,
"step": 652
},
{
"epoch": 1.67328,
"grad_norm": 0.1035899817943573,
"learning_rate": 0.0008828058169375535,
"loss": 1.2346,
"step": 654
},
{
"epoch": 1.6784,
"grad_norm": 0.08878592401742935,
"learning_rate": 0.0008793840889649273,
"loss": 1.2332,
"step": 656
},
{
"epoch": 1.6835200000000001,
"grad_norm": 0.08506552875041962,
"learning_rate": 0.0008759623609923011,
"loss": 1.2392,
"step": 658
},
{
"epoch": 1.68864,
"grad_norm": 0.09227900952100754,
"learning_rate": 0.0008725406330196749,
"loss": 1.2552,
"step": 660
},
{
"epoch": 1.6937600000000002,
"grad_norm": 0.08019377291202545,
"learning_rate": 0.0008691189050470488,
"loss": 1.2208,
"step": 662
},
{
"epoch": 1.69888,
"grad_norm": 0.09494514763355255,
"learning_rate": 0.0008656971770744226,
"loss": 1.2367,
"step": 664
},
{
"epoch": 1.704,
"grad_norm": 0.09500183910131454,
"learning_rate": 0.0008622754491017965,
"loss": 1.1998,
"step": 666
},
{
"epoch": 1.70912,
"grad_norm": 0.08492112904787064,
"learning_rate": 0.0008588537211291703,
"loss": 1.2489,
"step": 668
},
{
"epoch": 1.71424,
"grad_norm": 0.0840214341878891,
"learning_rate": 0.0008554319931565441,
"loss": 1.2358,
"step": 670
},
{
"epoch": 1.71936,
"grad_norm": 0.0879180058836937,
"learning_rate": 0.0008520102651839178,
"loss": 1.2448,
"step": 672
},
{
"epoch": 1.72448,
"grad_norm": 0.08872208744287491,
"learning_rate": 0.0008485885372112917,
"loss": 1.2432,
"step": 674
},
{
"epoch": 1.7296,
"grad_norm": 0.09941036254167557,
"learning_rate": 0.0008451668092386655,
"loss": 1.2416,
"step": 676
},
{
"epoch": 1.73472,
"grad_norm": 0.08928696811199188,
"learning_rate": 0.0008417450812660393,
"loss": 1.2545,
"step": 678
},
{
"epoch": 1.73984,
"grad_norm": 0.08976240456104279,
"learning_rate": 0.0008383233532934132,
"loss": 1.2596,
"step": 680
},
{
"epoch": 1.7449599999999998,
"grad_norm": 0.08667703717947006,
"learning_rate": 0.0008349016253207871,
"loss": 1.2499,
"step": 682
},
{
"epoch": 1.75008,
"grad_norm": 0.08816345036029816,
"learning_rate": 0.0008314798973481608,
"loss": 1.2339,
"step": 684
},
{
"epoch": 1.7551999999999999,
"grad_norm": 0.08712169528007507,
"learning_rate": 0.0008280581693755347,
"loss": 1.2206,
"step": 686
},
{
"epoch": 1.76032,
"grad_norm": 0.08284337818622589,
"learning_rate": 0.0008246364414029085,
"loss": 1.2481,
"step": 688
},
{
"epoch": 1.76544,
"grad_norm": 0.08372201770544052,
"learning_rate": 0.0008212147134302823,
"loss": 1.2258,
"step": 690
},
{
"epoch": 1.7705600000000001,
"grad_norm": 0.08462055772542953,
"learning_rate": 0.0008177929854576561,
"loss": 1.255,
"step": 692
},
{
"epoch": 1.77568,
"grad_norm": 0.08806449919939041,
"learning_rate": 0.00081437125748503,
"loss": 1.2395,
"step": 694
},
{
"epoch": 1.7808000000000002,
"grad_norm": 0.08771070092916489,
"learning_rate": 0.0008109495295124037,
"loss": 1.2165,
"step": 696
},
{
"epoch": 1.78592,
"grad_norm": 0.10757436603307724,
"learning_rate": 0.0008075278015397775,
"loss": 1.2463,
"step": 698
},
{
"epoch": 1.79104,
"grad_norm": 0.10293210297822952,
"learning_rate": 0.0008041060735671514,
"loss": 1.2633,
"step": 700
},
{
"epoch": 1.79616,
"grad_norm": 0.08851849287748337,
"learning_rate": 0.0008006843455945253,
"loss": 1.2306,
"step": 702
},
{
"epoch": 1.80128,
"grad_norm": 0.08972053974866867,
"learning_rate": 0.0007972626176218991,
"loss": 1.2173,
"step": 704
},
{
"epoch": 1.8064,
"grad_norm": 0.12318170815706253,
"learning_rate": 0.000793840889649273,
"loss": 1.2528,
"step": 706
},
{
"epoch": 1.81152,
"grad_norm": 0.08965172618627548,
"learning_rate": 0.0007904191616766468,
"loss": 1.2102,
"step": 708
},
{
"epoch": 1.81664,
"grad_norm": 0.10421866178512573,
"learning_rate": 0.0007869974337040205,
"loss": 1.2465,
"step": 710
},
{
"epoch": 1.82176,
"grad_norm": 0.09522471576929092,
"learning_rate": 0.0007835757057313943,
"loss": 1.2386,
"step": 712
},
{
"epoch": 1.82688,
"grad_norm": 0.10268909484148026,
"learning_rate": 0.0007801539777587682,
"loss": 1.2592,
"step": 714
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.10645327717065811,
"learning_rate": 0.000776732249786142,
"loss": 1.2109,
"step": 716
},
{
"epoch": 1.83712,
"grad_norm": 0.11298143118619919,
"learning_rate": 0.0007733105218135158,
"loss": 1.2527,
"step": 718
},
{
"epoch": 1.8422399999999999,
"grad_norm": 0.09252069890499115,
"learning_rate": 0.0007698887938408897,
"loss": 1.2461,
"step": 720
},
{
"epoch": 1.8473600000000001,
"grad_norm": 0.1118890643119812,
"learning_rate": 0.0007664670658682635,
"loss": 1.2334,
"step": 722
},
{
"epoch": 1.85248,
"grad_norm": 0.09874032437801361,
"learning_rate": 0.0007630453378956373,
"loss": 1.2281,
"step": 724
},
{
"epoch": 1.8576000000000001,
"grad_norm": 0.09806526452302933,
"learning_rate": 0.0007596236099230111,
"loss": 1.2403,
"step": 726
},
{
"epoch": 1.86272,
"grad_norm": 0.11208830773830414,
"learning_rate": 0.000756201881950385,
"loss": 1.2433,
"step": 728
},
{
"epoch": 1.86784,
"grad_norm": 0.09039215743541718,
"learning_rate": 0.0007527801539777588,
"loss": 1.2366,
"step": 730
},
{
"epoch": 1.87296,
"grad_norm": 0.08755876123905182,
"learning_rate": 0.0007493584260051326,
"loss": 1.2419,
"step": 732
},
{
"epoch": 1.87808,
"grad_norm": 0.0822838544845581,
"learning_rate": 0.0007459366980325064,
"loss": 1.2155,
"step": 734
},
{
"epoch": 1.8832,
"grad_norm": 0.08834468573331833,
"learning_rate": 0.0007425149700598802,
"loss": 1.2575,
"step": 736
},
{
"epoch": 1.88832,
"grad_norm": 0.09723412245512009,
"learning_rate": 0.000739093242087254,
"loss": 1.2137,
"step": 738
},
{
"epoch": 1.89344,
"grad_norm": 0.11283569037914276,
"learning_rate": 0.0007356715141146278,
"loss": 1.2589,
"step": 740
},
{
"epoch": 1.89856,
"grad_norm": 0.08855767548084259,
"learning_rate": 0.0007322497861420018,
"loss": 1.2095,
"step": 742
},
{
"epoch": 1.90368,
"grad_norm": 0.09914392232894897,
"learning_rate": 0.0007288280581693756,
"loss": 1.2272,
"step": 744
},
{
"epoch": 1.9088,
"grad_norm": 0.10668183118104935,
"learning_rate": 0.0007254063301967494,
"loss": 1.2555,
"step": 746
},
{
"epoch": 1.91392,
"grad_norm": 0.09255476295948029,
"learning_rate": 0.0007219846022241232,
"loss": 1.2248,
"step": 748
},
{
"epoch": 1.9190399999999999,
"grad_norm": 0.10384318232536316,
"learning_rate": 0.000718562874251497,
"loss": 1.2128,
"step": 750
},
{
"epoch": 1.92416,
"grad_norm": 0.09946981072425842,
"learning_rate": 0.0007151411462788708,
"loss": 1.2323,
"step": 752
},
{
"epoch": 1.9292799999999999,
"grad_norm": 0.10720199346542358,
"learning_rate": 0.0007117194183062447,
"loss": 1.2361,
"step": 754
},
{
"epoch": 1.9344000000000001,
"grad_norm": 0.10026301443576813,
"learning_rate": 0.0007082976903336185,
"loss": 1.2443,
"step": 756
},
{
"epoch": 1.93952,
"grad_norm": 0.08168992400169373,
"learning_rate": 0.0007048759623609923,
"loss": 1.2297,
"step": 758
},
{
"epoch": 1.9446400000000001,
"grad_norm": 0.11108248680830002,
"learning_rate": 0.000701454234388366,
"loss": 1.2359,
"step": 760
},
{
"epoch": 1.94976,
"grad_norm": 0.1133013665676117,
"learning_rate": 0.00069803250641574,
"loss": 1.2569,
"step": 762
},
{
"epoch": 1.95488,
"grad_norm": 0.0839882493019104,
"learning_rate": 0.0006946107784431138,
"loss": 1.231,
"step": 764
},
{
"epoch": 1.96,
"grad_norm": 0.09552154690027237,
"learning_rate": 0.0006911890504704876,
"loss": 1.2531,
"step": 766
},
{
"epoch": 1.96512,
"grad_norm": 0.09877890348434448,
"learning_rate": 0.0006877673224978615,
"loss": 1.2795,
"step": 768
},
{
"epoch": 1.97024,
"grad_norm": 0.08949697017669678,
"learning_rate": 0.0006843455945252353,
"loss": 1.2104,
"step": 770
},
{
"epoch": 1.97536,
"grad_norm": 0.09640631079673767,
"learning_rate": 0.000680923866552609,
"loss": 1.2552,
"step": 772
},
{
"epoch": 1.98048,
"grad_norm": 0.0900396853685379,
"learning_rate": 0.0006775021385799828,
"loss": 1.2282,
"step": 774
},
{
"epoch": 1.9856,
"grad_norm": 0.0989600196480751,
"learning_rate": 0.0006740804106073567,
"loss": 1.2366,
"step": 776
},
{
"epoch": 1.99072,
"grad_norm": 0.08420181274414062,
"learning_rate": 0.0006706586826347305,
"loss": 1.2309,
"step": 778
},
{
"epoch": 1.9958399999999998,
"grad_norm": 0.0978875532746315,
"learning_rate": 0.0006672369546621043,
"loss": 1.2241,
"step": 780
},
{
"epoch": 2.0,
"grad_norm": 0.10489444434642792,
"learning_rate": 0.0006638152266894783,
"loss": 1.2438,
"step": 782
},
{
"epoch": 2.00512,
"grad_norm": 0.08600255101919174,
"learning_rate": 0.0006603934987168521,
"loss": 1.2313,
"step": 784
},
{
"epoch": 2.01024,
"grad_norm": 0.08720952272415161,
"learning_rate": 0.0006569717707442258,
"loss": 1.2322,
"step": 786
},
{
"epoch": 2.01536,
"grad_norm": 0.08914855122566223,
"learning_rate": 0.0006535500427715997,
"loss": 1.2284,
"step": 788
},
{
"epoch": 2.02048,
"grad_norm": 0.08840183168649673,
"learning_rate": 0.0006501283147989735,
"loss": 1.2535,
"step": 790
},
{
"epoch": 2.0256,
"grad_norm": 0.08714258670806885,
"learning_rate": 0.0006467065868263473,
"loss": 1.2358,
"step": 792
},
{
"epoch": 2.03072,
"grad_norm": 0.08174372464418411,
"learning_rate": 0.0006432848588537211,
"loss": 1.2419,
"step": 794
},
{
"epoch": 2.03584,
"grad_norm": 0.08205553144216537,
"learning_rate": 0.000639863130881095,
"loss": 1.2222,
"step": 796
},
{
"epoch": 2.04096,
"grad_norm": 0.0794735699892044,
"learning_rate": 0.0006364414029084687,
"loss": 1.2092,
"step": 798
},
{
"epoch": 2.04608,
"grad_norm": 0.07929161936044693,
"learning_rate": 0.0006330196749358425,
"loss": 1.2066,
"step": 800
},
{
"epoch": 2.04608,
"eval_loss": 1.2471669912338257,
"eval_runtime": 280.5686,
"eval_samples_per_second": 8.91,
"eval_steps_per_second": 1.116,
"step": 800
},
{
"epoch": 2.0512,
"grad_norm": 0.08398760110139847,
"learning_rate": 0.0006295979469632165,
"loss": 1.2302,
"step": 802
},
{
"epoch": 2.05632,
"grad_norm": 0.08723915368318558,
"learning_rate": 0.0006261762189905903,
"loss": 1.2264,
"step": 804
},
{
"epoch": 2.06144,
"grad_norm": 0.0817415714263916,
"learning_rate": 0.0006227544910179641,
"loss": 1.2301,
"step": 806
},
{
"epoch": 2.06656,
"grad_norm": 0.08779250085353851,
"learning_rate": 0.000619332763045338,
"loss": 1.2228,
"step": 808
},
{
"epoch": 2.07168,
"grad_norm": 0.09133391082286835,
"learning_rate": 0.0006159110350727117,
"loss": 1.2415,
"step": 810
},
{
"epoch": 2.0768,
"grad_norm": 0.08566722273826599,
"learning_rate": 0.0006124893071000855,
"loss": 1.2121,
"step": 812
},
{
"epoch": 2.08192,
"grad_norm": 0.08748096972703934,
"learning_rate": 0.0006090675791274593,
"loss": 1.2413,
"step": 814
},
{
"epoch": 2.08704,
"grad_norm": 0.0854722335934639,
"learning_rate": 0.0006056458511548332,
"loss": 1.2326,
"step": 816
},
{
"epoch": 2.09216,
"grad_norm": 0.08618238568305969,
"learning_rate": 0.000602224123182207,
"loss": 1.2216,
"step": 818
},
{
"epoch": 2.09728,
"grad_norm": 0.08122539520263672,
"learning_rate": 0.0005988023952095808,
"loss": 1.2048,
"step": 820
},
{
"epoch": 2.1024,
"grad_norm": 0.08748337626457214,
"learning_rate": 0.0005953806672369548,
"loss": 1.2377,
"step": 822
},
{
"epoch": 2.10752,
"grad_norm": 0.08610134571790695,
"learning_rate": 0.0005919589392643285,
"loss": 1.2312,
"step": 824
},
{
"epoch": 2.11264,
"grad_norm": 0.09176385402679443,
"learning_rate": 0.0005885372112917023,
"loss": 1.234,
"step": 826
},
{
"epoch": 2.11776,
"grad_norm": 0.08381321281194687,
"learning_rate": 0.0005851154833190762,
"loss": 1.2369,
"step": 828
},
{
"epoch": 2.12288,
"grad_norm": 0.08841554820537567,
"learning_rate": 0.00058169375534645,
"loss": 1.2283,
"step": 830
},
{
"epoch": 2.128,
"grad_norm": 0.09985855966806412,
"learning_rate": 0.0005782720273738238,
"loss": 1.2144,
"step": 832
},
{
"epoch": 2.13312,
"grad_norm": 0.09444481134414673,
"learning_rate": 0.0005748502994011976,
"loss": 1.2503,
"step": 834
},
{
"epoch": 2.13824,
"grad_norm": 0.08858395367860794,
"learning_rate": 0.0005714285714285714,
"loss": 1.2497,
"step": 836
},
{
"epoch": 2.14336,
"grad_norm": 0.08610956370830536,
"learning_rate": 0.0005680068434559452,
"loss": 1.2444,
"step": 838
},
{
"epoch": 2.14848,
"grad_norm": 0.08294233679771423,
"learning_rate": 0.000564585115483319,
"loss": 1.2388,
"step": 840
},
{
"epoch": 2.1536,
"grad_norm": 0.08838624507188797,
"learning_rate": 0.000561163387510693,
"loss": 1.2353,
"step": 842
},
{
"epoch": 2.15872,
"grad_norm": 0.07888966798782349,
"learning_rate": 0.0005577416595380668,
"loss": 1.2085,
"step": 844
},
{
"epoch": 2.16384,
"grad_norm": 0.08408137410879135,
"learning_rate": 0.0005543199315654406,
"loss": 1.1922,
"step": 846
},
{
"epoch": 2.16896,
"grad_norm": 0.08245803415775299,
"learning_rate": 0.0005508982035928143,
"loss": 1.2478,
"step": 848
},
{
"epoch": 2.17408,
"grad_norm": 0.07858633249998093,
"learning_rate": 0.0005474764756201882,
"loss": 1.2203,
"step": 850
},
{
"epoch": 2.1792,
"grad_norm": 0.1002994254231453,
"learning_rate": 0.000544054747647562,
"loss": 1.2295,
"step": 852
},
{
"epoch": 2.18432,
"grad_norm": 0.08837361633777618,
"learning_rate": 0.0005406330196749358,
"loss": 1.238,
"step": 854
},
{
"epoch": 2.18944,
"grad_norm": 0.09690374881029129,
"learning_rate": 0.0005372112917023097,
"loss": 1.2301,
"step": 856
},
{
"epoch": 2.19456,
"grad_norm": 0.1030053198337555,
"learning_rate": 0.0005337895637296835,
"loss": 1.2365,
"step": 858
},
{
"epoch": 2.19968,
"grad_norm": 0.08212369680404663,
"learning_rate": 0.0005303678357570573,
"loss": 1.2325,
"step": 860
},
{
"epoch": 2.2048,
"grad_norm": 0.10352316498756409,
"learning_rate": 0.0005269461077844312,
"loss": 1.2341,
"step": 862
},
{
"epoch": 2.20992,
"grad_norm": 0.08346536755561829,
"learning_rate": 0.000523524379811805,
"loss": 1.2237,
"step": 864
},
{
"epoch": 2.21504,
"grad_norm": 0.11156366020441055,
"learning_rate": 0.0005201026518391788,
"loss": 1.2325,
"step": 866
},
{
"epoch": 2.22016,
"grad_norm": 0.10335463285446167,
"learning_rate": 0.0005166809238665526,
"loss": 1.2258,
"step": 868
},
{
"epoch": 2.22528,
"grad_norm": 0.09020327776670456,
"learning_rate": 0.0005132591958939265,
"loss": 1.2305,
"step": 870
},
{
"epoch": 2.2304,
"grad_norm": 0.09839983284473419,
"learning_rate": 0.0005098374679213003,
"loss": 1.2529,
"step": 872
},
{
"epoch": 2.23552,
"grad_norm": 0.1528375744819641,
"learning_rate": 0.000506415739948674,
"loss": 1.2365,
"step": 874
},
{
"epoch": 2.24064,
"grad_norm": 0.08589835464954376,
"learning_rate": 0.0005029940119760479,
"loss": 1.2407,
"step": 876
},
{
"epoch": 2.24576,
"grad_norm": 0.09676992893218994,
"learning_rate": 0.0004995722840034218,
"loss": 1.2244,
"step": 878
},
{
"epoch": 2.25088,
"grad_norm": 0.09816568344831467,
"learning_rate": 0.0004961505560307955,
"loss": 1.2439,
"step": 880
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.09415734559297562,
"learning_rate": 0.0004927288280581693,
"loss": 1.2329,
"step": 882
},
{
"epoch": 2.26112,
"grad_norm": 0.0947481021285057,
"learning_rate": 0.0004893071000855432,
"loss": 1.2161,
"step": 884
},
{
"epoch": 2.26624,
"grad_norm": 0.09034735709428787,
"learning_rate": 0.00048588537211291706,
"loss": 1.25,
"step": 886
},
{
"epoch": 2.27136,
"grad_norm": 0.09715861082077026,
"learning_rate": 0.0004824636441402909,
"loss": 1.2536,
"step": 888
},
{
"epoch": 2.27648,
"grad_norm": 0.08793841302394867,
"learning_rate": 0.00047904191616766467,
"loss": 1.2425,
"step": 890
},
{
"epoch": 2.2816,
"grad_norm": 0.08474820852279663,
"learning_rate": 0.0004756201881950385,
"loss": 1.2409,
"step": 892
},
{
"epoch": 2.28672,
"grad_norm": 0.0954166129231453,
"learning_rate": 0.0004721984602224123,
"loss": 1.2292,
"step": 894
},
{
"epoch": 2.29184,
"grad_norm": 0.08861096948385239,
"learning_rate": 0.00046877673224978616,
"loss": 1.2509,
"step": 896
},
{
"epoch": 2.29696,
"grad_norm": 0.08726037293672562,
"learning_rate": 0.00046535500427716,
"loss": 1.2471,
"step": 898
},
{
"epoch": 2.30208,
"grad_norm": 0.10455521196126938,
"learning_rate": 0.0004619332763045338,
"loss": 1.2025,
"step": 900
},
{
"epoch": 2.3072,
"grad_norm": 0.09069357812404633,
"learning_rate": 0.0004585115483319076,
"loss": 1.2267,
"step": 902
},
{
"epoch": 2.31232,
"grad_norm": 0.09878482669591904,
"learning_rate": 0.0004550898203592814,
"loss": 1.2276,
"step": 904
},
{
"epoch": 2.31744,
"grad_norm": 0.1100214272737503,
"learning_rate": 0.0004516680923866553,
"loss": 1.2264,
"step": 906
},
{
"epoch": 2.32256,
"grad_norm": 0.09539143741130829,
"learning_rate": 0.0004482463644140291,
"loss": 1.2298,
"step": 908
},
{
"epoch": 2.32768,
"grad_norm": 0.0885070264339447,
"learning_rate": 0.0004448246364414029,
"loss": 1.2108,
"step": 910
},
{
"epoch": 2.3327999999999998,
"grad_norm": 0.08761118352413177,
"learning_rate": 0.00044140290846877674,
"loss": 1.2393,
"step": 912
},
{
"epoch": 2.33792,
"grad_norm": 0.09102723747491837,
"learning_rate": 0.00043798118049615057,
"loss": 1.2372,
"step": 914
},
{
"epoch": 2.3430400000000002,
"grad_norm": 0.08345375210046768,
"learning_rate": 0.0004345594525235244,
"loss": 1.2152,
"step": 916
},
{
"epoch": 2.34816,
"grad_norm": 0.0871756300330162,
"learning_rate": 0.00043113772455089823,
"loss": 1.2539,
"step": 918
},
{
"epoch": 2.35328,
"grad_norm": 0.08344704657793045,
"learning_rate": 0.00042771599657827206,
"loss": 1.2553,
"step": 920
},
{
"epoch": 2.3584,
"grad_norm": 0.09925805032253265,
"learning_rate": 0.00042429426860564584,
"loss": 1.2407,
"step": 922
},
{
"epoch": 2.36352,
"grad_norm": 0.08815500140190125,
"learning_rate": 0.00042087254063301967,
"loss": 1.2165,
"step": 924
},
{
"epoch": 2.36864,
"grad_norm": 0.08601918071508408,
"learning_rate": 0.00041745081266039355,
"loss": 1.2335,
"step": 926
},
{
"epoch": 2.37376,
"grad_norm": 0.08933749049901962,
"learning_rate": 0.0004140290846877673,
"loss": 1.2226,
"step": 928
},
{
"epoch": 2.37888,
"grad_norm": 0.09617882966995239,
"learning_rate": 0.00041060735671514116,
"loss": 1.2232,
"step": 930
},
{
"epoch": 2.384,
"grad_norm": 0.08219394832849503,
"learning_rate": 0.000407185628742515,
"loss": 1.2143,
"step": 932
},
{
"epoch": 2.38912,
"grad_norm": 0.08431462943553925,
"learning_rate": 0.00040376390076988876,
"loss": 1.2416,
"step": 934
},
{
"epoch": 2.39424,
"grad_norm": 0.09297817200422287,
"learning_rate": 0.00040034217279726265,
"loss": 1.2151,
"step": 936
},
{
"epoch": 2.39936,
"grad_norm": 0.08247403055429459,
"learning_rate": 0.0003969204448246365,
"loss": 1.2291,
"step": 938
},
{
"epoch": 2.40448,
"grad_norm": 0.09852425754070282,
"learning_rate": 0.00039349871685201025,
"loss": 1.2351,
"step": 940
},
{
"epoch": 2.4096,
"grad_norm": 0.08745532482862473,
"learning_rate": 0.0003900769888793841,
"loss": 1.2343,
"step": 942
},
{
"epoch": 2.41472,
"grad_norm": 0.0770939365029335,
"learning_rate": 0.0003866552609067579,
"loss": 1.229,
"step": 944
},
{
"epoch": 2.4198399999999998,
"grad_norm": 0.08143922686576843,
"learning_rate": 0.00038323353293413174,
"loss": 1.2168,
"step": 946
},
{
"epoch": 2.42496,
"grad_norm": 0.0852786973118782,
"learning_rate": 0.0003798118049615056,
"loss": 1.237,
"step": 948
},
{
"epoch": 2.4300800000000002,
"grad_norm": 0.09687593579292297,
"learning_rate": 0.0003763900769888794,
"loss": 1.2508,
"step": 950
},
{
"epoch": 2.4352,
"grad_norm": 0.09721924364566803,
"learning_rate": 0.0003729683490162532,
"loss": 1.2524,
"step": 952
},
{
"epoch": 2.44032,
"grad_norm": 0.09008084982633591,
"learning_rate": 0.000369546621043627,
"loss": 1.244,
"step": 954
},
{
"epoch": 2.44544,
"grad_norm": 0.0971183255314827,
"learning_rate": 0.0003661248930710009,
"loss": 1.2339,
"step": 956
},
{
"epoch": 2.45056,
"grad_norm": 0.09916377067565918,
"learning_rate": 0.0003627031650983747,
"loss": 1.2361,
"step": 958
},
{
"epoch": 2.45568,
"grad_norm": 0.08370574563741684,
"learning_rate": 0.0003592814371257485,
"loss": 1.2155,
"step": 960
},
{
"epoch": 2.4608,
"grad_norm": 0.08623719960451126,
"learning_rate": 0.00035585970915312233,
"loss": 1.2165,
"step": 962
},
{
"epoch": 2.46592,
"grad_norm": 0.07589856535196304,
"learning_rate": 0.00035243798118049616,
"loss": 1.2243,
"step": 964
},
{
"epoch": 2.47104,
"grad_norm": 0.07783632725477219,
"learning_rate": 0.00034901625320787,
"loss": 1.2296,
"step": 966
},
{
"epoch": 2.47616,
"grad_norm": 0.08424780517816544,
"learning_rate": 0.0003455945252352438,
"loss": 1.2312,
"step": 968
},
{
"epoch": 2.48128,
"grad_norm": 0.07945290952920914,
"learning_rate": 0.00034217279726261765,
"loss": 1.217,
"step": 970
},
{
"epoch": 2.4864,
"grad_norm": 0.0799039751291275,
"learning_rate": 0.0003387510692899914,
"loss": 1.2201,
"step": 972
},
{
"epoch": 2.49152,
"grad_norm": 0.09106255322694778,
"learning_rate": 0.00033532934131736525,
"loss": 1.2159,
"step": 974
},
{
"epoch": 2.49664,
"grad_norm": 0.08736411482095718,
"learning_rate": 0.00033190761334473914,
"loss": 1.2298,
"step": 976
},
{
"epoch": 2.50176,
"grad_norm": 0.08567455410957336,
"learning_rate": 0.0003284858853721129,
"loss": 1.221,
"step": 978
},
{
"epoch": 2.5068799999999998,
"grad_norm": 0.10240741819143295,
"learning_rate": 0.00032506415739948674,
"loss": 1.2296,
"step": 980
},
{
"epoch": 2.512,
"grad_norm": 0.0851578339934349,
"learning_rate": 0.0003216424294268606,
"loss": 1.233,
"step": 982
},
{
"epoch": 2.5171200000000002,
"grad_norm": 0.08598732203245163,
"learning_rate": 0.00031822070145423435,
"loss": 1.1937,
"step": 984
},
{
"epoch": 2.52224,
"grad_norm": 0.08547403663396835,
"learning_rate": 0.00031479897348160823,
"loss": 1.2158,
"step": 986
},
{
"epoch": 2.52736,
"grad_norm": 0.08678396046161652,
"learning_rate": 0.00031137724550898206,
"loss": 1.2113,
"step": 988
},
{
"epoch": 2.53248,
"grad_norm": 0.08527970314025879,
"learning_rate": 0.00030795551753635584,
"loss": 1.2508,
"step": 990
},
{
"epoch": 2.5376,
"grad_norm": 0.08052323758602142,
"learning_rate": 0.00030453378956372967,
"loss": 1.2324,
"step": 992
},
{
"epoch": 2.54272,
"grad_norm": 0.09333521127700806,
"learning_rate": 0.0003011120615911035,
"loss": 1.2381,
"step": 994
},
{
"epoch": 2.54784,
"grad_norm": 0.08944050967693329,
"learning_rate": 0.0002976903336184774,
"loss": 1.2417,
"step": 996
},
{
"epoch": 2.55296,
"grad_norm": 0.0959937795996666,
"learning_rate": 0.00029426860564585116,
"loss": 1.2346,
"step": 998
},
{
"epoch": 2.55808,
"grad_norm": 0.08061899244785309,
"learning_rate": 0.000290846877673225,
"loss": 1.2377,
"step": 1000
},
{
"epoch": 2.5632,
"grad_norm": 0.08358662575483322,
"learning_rate": 0.0002874251497005988,
"loss": 1.2432,
"step": 1002
},
{
"epoch": 2.56832,
"grad_norm": 0.08278947323560715,
"learning_rate": 0.0002840034217279726,
"loss": 1.222,
"step": 1004
},
{
"epoch": 2.5734399999999997,
"grad_norm": 0.08546797931194305,
"learning_rate": 0.0002805816937553465,
"loss": 1.2379,
"step": 1006
},
{
"epoch": 2.57856,
"grad_norm": 0.08106214553117752,
"learning_rate": 0.0002771599657827203,
"loss": 1.217,
"step": 1008
},
{
"epoch": 2.58368,
"grad_norm": 0.08738037943840027,
"learning_rate": 0.0002737382378100941,
"loss": 1.2202,
"step": 1010
},
{
"epoch": 2.5888,
"grad_norm": 0.07763461023569107,
"learning_rate": 0.0002703165098374679,
"loss": 1.2335,
"step": 1012
},
{
"epoch": 2.59392,
"grad_norm": 0.09023375809192657,
"learning_rate": 0.00026689478186484175,
"loss": 1.2557,
"step": 1014
},
{
"epoch": 2.59904,
"grad_norm": 0.08477311581373215,
"learning_rate": 0.0002634730538922156,
"loss": 1.23,
"step": 1016
},
{
"epoch": 2.6041600000000003,
"grad_norm": 0.0802718847990036,
"learning_rate": 0.0002600513259195894,
"loss": 1.2502,
"step": 1018
},
{
"epoch": 2.60928,
"grad_norm": 0.08197642862796783,
"learning_rate": 0.00025662959794696324,
"loss": 1.2117,
"step": 1020
},
{
"epoch": 2.6144,
"grad_norm": 0.07977724820375443,
"learning_rate": 0.000253207869974337,
"loss": 1.2223,
"step": 1022
},
{
"epoch": 2.61952,
"grad_norm": 0.084455206990242,
"learning_rate": 0.0002497861420017109,
"loss": 1.2296,
"step": 1024
},
{
"epoch": 2.62464,
"grad_norm": 0.08407705277204514,
"learning_rate": 0.00024636441402908467,
"loss": 1.2466,
"step": 1026
},
{
"epoch": 2.62976,
"grad_norm": 0.09497237205505371,
"learning_rate": 0.00024294268605645853,
"loss": 1.2204,
"step": 1028
},
{
"epoch": 2.63488,
"grad_norm": 0.07946959137916565,
"learning_rate": 0.00023952095808383233,
"loss": 1.2293,
"step": 1030
},
{
"epoch": 2.64,
"grad_norm": 0.08926168084144592,
"learning_rate": 0.00023609923011120616,
"loss": 1.2303,
"step": 1032
},
{
"epoch": 2.64512,
"grad_norm": 0.08781218528747559,
"learning_rate": 0.00023267750213858,
"loss": 1.2215,
"step": 1034
},
{
"epoch": 2.65024,
"grad_norm": 0.10436686873435974,
"learning_rate": 0.0002292557741659538,
"loss": 1.2446,
"step": 1036
},
{
"epoch": 2.65536,
"grad_norm": 0.08655078709125519,
"learning_rate": 0.00022583404619332765,
"loss": 1.1889,
"step": 1038
},
{
"epoch": 2.6604799999999997,
"grad_norm": 0.08525776863098145,
"learning_rate": 0.00022241231822070146,
"loss": 1.2246,
"step": 1040
},
{
"epoch": 2.6656,
"grad_norm": 0.08515007048845291,
"learning_rate": 0.00021899059024807529,
"loss": 1.2083,
"step": 1042
},
{
"epoch": 2.67072,
"grad_norm": 0.08326171338558197,
"learning_rate": 0.00021556886227544912,
"loss": 1.2463,
"step": 1044
},
{
"epoch": 2.67584,
"grad_norm": 0.07543444633483887,
"learning_rate": 0.00021214713430282292,
"loss": 1.2229,
"step": 1046
},
{
"epoch": 2.68096,
"grad_norm": 0.08731929957866669,
"learning_rate": 0.00020872540633019678,
"loss": 1.2431,
"step": 1048
},
{
"epoch": 2.68608,
"grad_norm": 0.0846877247095108,
"learning_rate": 0.00020530367835757058,
"loss": 1.2276,
"step": 1050
},
{
"epoch": 2.6912000000000003,
"grad_norm": 0.08184527605772018,
"learning_rate": 0.00020188195038494438,
"loss": 1.2151,
"step": 1052
},
{
"epoch": 2.69632,
"grad_norm": 0.0830821543931961,
"learning_rate": 0.00019846022241231824,
"loss": 1.23,
"step": 1054
},
{
"epoch": 2.70144,
"grad_norm": 0.07801831513643265,
"learning_rate": 0.00019503849443969204,
"loss": 1.2237,
"step": 1056
},
{
"epoch": 2.70656,
"grad_norm": 0.07970487326383591,
"learning_rate": 0.00019161676646706587,
"loss": 1.2296,
"step": 1058
},
{
"epoch": 2.71168,
"grad_norm": 0.08674521744251251,
"learning_rate": 0.0001881950384944397,
"loss": 1.2416,
"step": 1060
},
{
"epoch": 2.7168,
"grad_norm": 0.0784011259675026,
"learning_rate": 0.0001847733105218135,
"loss": 1.2174,
"step": 1062
},
{
"epoch": 2.72192,
"grad_norm": 0.08390358090400696,
"learning_rate": 0.00018135158254918736,
"loss": 1.2221,
"step": 1064
},
{
"epoch": 2.72704,
"grad_norm": 0.08034460991621017,
"learning_rate": 0.00017792985457656116,
"loss": 1.2367,
"step": 1066
},
{
"epoch": 2.73216,
"grad_norm": 0.0805404931306839,
"learning_rate": 0.000174508126603935,
"loss": 1.2228,
"step": 1068
},
{
"epoch": 2.73728,
"grad_norm": 0.10428917407989502,
"learning_rate": 0.00017108639863130882,
"loss": 1.2331,
"step": 1070
},
{
"epoch": 2.7424,
"grad_norm": 0.07880023866891861,
"learning_rate": 0.00016766467065868263,
"loss": 1.2151,
"step": 1072
},
{
"epoch": 2.7475199999999997,
"grad_norm": 0.08330074697732925,
"learning_rate": 0.00016424294268605646,
"loss": 1.201,
"step": 1074
},
{
"epoch": 2.75264,
"grad_norm": 0.08316068351268768,
"learning_rate": 0.0001608212147134303,
"loss": 1.2206,
"step": 1076
},
{
"epoch": 2.75776,
"grad_norm": 0.08193733543157578,
"learning_rate": 0.00015739948674080412,
"loss": 1.2133,
"step": 1078
},
{
"epoch": 2.76288,
"grad_norm": 0.0799107700586319,
"learning_rate": 0.00015397775876817792,
"loss": 1.2202,
"step": 1080
},
{
"epoch": 2.768,
"grad_norm": 0.07523773610591888,
"learning_rate": 0.00015055603079555175,
"loss": 1.2088,
"step": 1082
},
{
"epoch": 2.77312,
"grad_norm": 0.07782167941331863,
"learning_rate": 0.00014713430282292558,
"loss": 1.2191,
"step": 1084
},
{
"epoch": 2.7782400000000003,
"grad_norm": 0.0814930647611618,
"learning_rate": 0.0001437125748502994,
"loss": 1.2096,
"step": 1086
},
{
"epoch": 2.78336,
"grad_norm": 0.07619259506464005,
"learning_rate": 0.00014029084687767324,
"loss": 1.2297,
"step": 1088
},
{
"epoch": 2.78848,
"grad_norm": 0.08152459561824799,
"learning_rate": 0.00013686911890504704,
"loss": 1.2281,
"step": 1090
},
{
"epoch": 2.7936,
"grad_norm": 0.08513466268777847,
"learning_rate": 0.00013344739093242087,
"loss": 1.2183,
"step": 1092
},
{
"epoch": 2.79872,
"grad_norm": 0.07648808509111404,
"learning_rate": 0.0001300256629597947,
"loss": 1.2171,
"step": 1094
},
{
"epoch": 2.80384,
"grad_norm": 0.08608166873455048,
"learning_rate": 0.0001266039349871685,
"loss": 1.233,
"step": 1096
},
{
"epoch": 2.80896,
"grad_norm": 0.08903096616268158,
"learning_rate": 0.00012318220701454234,
"loss": 1.2329,
"step": 1098
},
{
"epoch": 2.81408,
"grad_norm": 0.07557443529367447,
"learning_rate": 0.00011976047904191617,
"loss": 1.2333,
"step": 1100
},
{
"epoch": 2.8192,
"grad_norm": 0.0815119668841362,
"learning_rate": 0.00011633875106929,
"loss": 1.2353,
"step": 1102
},
{
"epoch": 2.82432,
"grad_norm": 0.08905310928821564,
"learning_rate": 0.00011291702309666383,
"loss": 1.2255,
"step": 1104
},
{
"epoch": 2.82944,
"grad_norm": 0.08789879828691483,
"learning_rate": 0.00010949529512403764,
"loss": 1.2139,
"step": 1106
},
{
"epoch": 2.8345599999999997,
"grad_norm": 0.08430198580026627,
"learning_rate": 0.00010607356715141146,
"loss": 1.229,
"step": 1108
},
{
"epoch": 2.83968,
"grad_norm": 0.07988926768302917,
"learning_rate": 0.00010265183917878529,
"loss": 1.2316,
"step": 1110
},
{
"epoch": 2.8448,
"grad_norm": 0.08727908134460449,
"learning_rate": 9.923011120615912e-05,
"loss": 1.2237,
"step": 1112
},
{
"epoch": 2.84992,
"grad_norm": 0.10279367119073868,
"learning_rate": 9.580838323353294e-05,
"loss": 1.2565,
"step": 1114
},
{
"epoch": 2.85504,
"grad_norm": 0.08122528344392776,
"learning_rate": 9.238665526090675e-05,
"loss": 1.2353,
"step": 1116
},
{
"epoch": 2.86016,
"grad_norm": 0.09036324173212051,
"learning_rate": 8.896492728828058e-05,
"loss": 1.2337,
"step": 1118
},
{
"epoch": 2.8652800000000003,
"grad_norm": 0.07413888722658157,
"learning_rate": 8.554319931565441e-05,
"loss": 1.2367,
"step": 1120
},
{
"epoch": 2.8704,
"grad_norm": 0.08744188398122787,
"learning_rate": 8.212147134302823e-05,
"loss": 1.234,
"step": 1122
},
{
"epoch": 2.87552,
"grad_norm": 0.07657689601182938,
"learning_rate": 7.869974337040206e-05,
"loss": 1.2094,
"step": 1124
},
{
"epoch": 2.88064,
"grad_norm": 0.09120559692382812,
"learning_rate": 7.527801539777588e-05,
"loss": 1.2473,
"step": 1126
},
{
"epoch": 2.88576,
"grad_norm": 0.08199866861104965,
"learning_rate": 7.18562874251497e-05,
"loss": 1.2384,
"step": 1128
},
{
"epoch": 2.89088,
"grad_norm": 0.07703917473554611,
"learning_rate": 6.843455945252352e-05,
"loss": 1.2197,
"step": 1130
},
{
"epoch": 2.896,
"grad_norm": 0.08120223879814148,
"learning_rate": 6.501283147989735e-05,
"loss": 1.2343,
"step": 1132
},
{
"epoch": 2.90112,
"grad_norm": 0.08173457533121109,
"learning_rate": 6.159110350727117e-05,
"loss": 1.2422,
"step": 1134
},
{
"epoch": 2.90624,
"grad_norm": 0.08017323166131973,
"learning_rate": 5.8169375534645e-05,
"loss": 1.1985,
"step": 1136
},
{
"epoch": 2.91136,
"grad_norm": 0.09028081595897675,
"learning_rate": 5.474764756201882e-05,
"loss": 1.1902,
"step": 1138
},
{
"epoch": 2.91648,
"grad_norm": 0.07673865556716919,
"learning_rate": 5.1325919589392645e-05,
"loss": 1.2292,
"step": 1140
},
{
"epoch": 2.9215999999999998,
"grad_norm": 0.08590974658727646,
"learning_rate": 4.790419161676647e-05,
"loss": 1.2227,
"step": 1142
},
{
"epoch": 2.92672,
"grad_norm": 0.07928025722503662,
"learning_rate": 4.448246364414029e-05,
"loss": 1.2426,
"step": 1144
},
{
"epoch": 2.9318400000000002,
"grad_norm": 0.07864856719970703,
"learning_rate": 4.1060735671514114e-05,
"loss": 1.2476,
"step": 1146
},
{
"epoch": 2.93696,
"grad_norm": 0.08154473453760147,
"learning_rate": 3.763900769888794e-05,
"loss": 1.2081,
"step": 1148
},
{
"epoch": 2.94208,
"grad_norm": 0.07814584672451019,
"learning_rate": 3.421727972626176e-05,
"loss": 1.2463,
"step": 1150
},
{
"epoch": 2.9472,
"grad_norm": 0.07772421091794968,
"learning_rate": 3.0795551753635584e-05,
"loss": 1.2447,
"step": 1152
},
{
"epoch": 2.9523200000000003,
"grad_norm": 0.08610265702009201,
"learning_rate": 2.737382378100941e-05,
"loss": 1.226,
"step": 1154
},
{
"epoch": 2.95744,
"grad_norm": 0.086619071662426,
"learning_rate": 2.3952095808383234e-05,
"loss": 1.2616,
"step": 1156
},
{
"epoch": 2.96256,
"grad_norm": 0.07644681632518768,
"learning_rate": 2.0530367835757057e-05,
"loss": 1.2244,
"step": 1158
},
{
"epoch": 2.96768,
"grad_norm": 0.08778993040323257,
"learning_rate": 1.710863986313088e-05,
"loss": 1.2259,
"step": 1160
},
{
"epoch": 2.9728,
"grad_norm": 0.0812673419713974,
"learning_rate": 1.3686911890504705e-05,
"loss": 1.2081,
"step": 1162
},
{
"epoch": 2.97792,
"grad_norm": 0.07923007756471634,
"learning_rate": 1.0265183917878529e-05,
"loss": 1.2297,
"step": 1164
},
{
"epoch": 2.98304,
"grad_norm": 0.07516805827617645,
"learning_rate": 6.843455945252353e-06,
"loss": 1.2277,
"step": 1166
},
{
"epoch": 2.98816,
"grad_norm": 0.08043860644102097,
"learning_rate": 3.4217279726261763e-06,
"loss": 1.2087,
"step": 1168
},
{
"epoch": 2.99328,
"grad_norm": 0.07992921024560928,
"learning_rate": 0.0,
"loss": 1.2479,
"step": 1170
},
{
"epoch": 2.99328,
"step": 1170,
"total_flos": 1.3204661001922806e+19,
"train_loss": 1.2500826900840825,
"train_runtime": 40705.2136,
"train_samples_per_second": 7.37,
"train_steps_per_second": 0.029
},
{
"epoch": 2.99328,
"eval_loss": 1.2431308031082153,
"eval_runtime": 280.041,
"eval_samples_per_second": 8.927,
"eval_steps_per_second": 1.118,
"step": 1170
},
{
"epoch": 2.99328,
"eval_loss": 1.2251627445220947,
"eval_runtime": 300.2276,
"eval_samples_per_second": 8.327,
"eval_steps_per_second": 1.043,
"step": 1170
}
],
"logging_steps": 2,
"max_steps": 1170,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3204661001922806e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}