{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 0,
"global_step": 1479,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002028397565922921,
"grad_norm": 0.56640625,
"learning_rate": 9.993238674780258e-06,
"loss": 2.0083,
"step": 1
},
{
"epoch": 0.004056795131845842,
"grad_norm": 0.515625,
"learning_rate": 9.986477349560515e-06,
"loss": 1.8898,
"step": 2
},
{
"epoch": 0.006085192697768763,
"grad_norm": 0.50390625,
"learning_rate": 9.979716024340772e-06,
"loss": 1.9586,
"step": 3
},
{
"epoch": 0.008113590263691683,
"grad_norm": 0.46484375,
"learning_rate": 9.972954699121028e-06,
"loss": 1.8423,
"step": 4
},
{
"epoch": 0.010141987829614604,
"grad_norm": 0.4609375,
"learning_rate": 9.966193373901285e-06,
"loss": 1.9374,
"step": 5
},
{
"epoch": 0.012170385395537525,
"grad_norm": 0.42578125,
"learning_rate": 9.959432048681542e-06,
"loss": 1.856,
"step": 6
},
{
"epoch": 0.014198782961460446,
"grad_norm": 0.400390625,
"learning_rate": 9.9526707234618e-06,
"loss": 1.827,
"step": 7
},
{
"epoch": 0.016227180527383367,
"grad_norm": 0.40625,
"learning_rate": 9.945909398242056e-06,
"loss": 1.9399,
"step": 8
},
{
"epoch": 0.018255578093306288,
"grad_norm": 0.396484375,
"learning_rate": 9.939148073022313e-06,
"loss": 1.8973,
"step": 9
},
{
"epoch": 0.02028397565922921,
"grad_norm": 0.349609375,
"learning_rate": 9.93238674780257e-06,
"loss": 1.7955,
"step": 10
},
{
"epoch": 0.02231237322515213,
"grad_norm": 0.34765625,
"learning_rate": 9.925625422582827e-06,
"loss": 1.7947,
"step": 11
},
{
"epoch": 0.02434077079107505,
"grad_norm": 0.33203125,
"learning_rate": 9.918864097363084e-06,
"loss": 1.7894,
"step": 12
},
{
"epoch": 0.02636916835699797,
"grad_norm": 0.33984375,
"learning_rate": 9.91210277214334e-06,
"loss": 1.7834,
"step": 13
},
{
"epoch": 0.028397565922920892,
"grad_norm": 0.3125,
"learning_rate": 9.905341446923598e-06,
"loss": 1.6634,
"step": 14
},
{
"epoch": 0.030425963488843813,
"grad_norm": 0.326171875,
"learning_rate": 9.898580121703854e-06,
"loss": 1.7655,
"step": 15
},
{
"epoch": 0.032454361054766734,
"grad_norm": 0.318359375,
"learning_rate": 9.891818796484111e-06,
"loss": 1.6973,
"step": 16
},
{
"epoch": 0.034482758620689655,
"grad_norm": 0.328125,
"learning_rate": 9.885057471264368e-06,
"loss": 1.7866,
"step": 17
},
{
"epoch": 0.036511156186612576,
"grad_norm": 0.283203125,
"learning_rate": 9.878296146044625e-06,
"loss": 1.6943,
"step": 18
},
{
"epoch": 0.038539553752535496,
"grad_norm": 0.3125,
"learning_rate": 9.871534820824882e-06,
"loss": 1.7,
"step": 19
},
{
"epoch": 0.04056795131845842,
"grad_norm": 0.37109375,
"learning_rate": 9.864773495605139e-06,
"loss": 1.6642,
"step": 20
},
{
"epoch": 0.04259634888438134,
"grad_norm": 0.28125,
"learning_rate": 9.858012170385396e-06,
"loss": 1.6852,
"step": 21
},
{
"epoch": 0.04462474645030426,
"grad_norm": 0.291015625,
"learning_rate": 9.851250845165653e-06,
"loss": 1.6855,
"step": 22
},
{
"epoch": 0.04665314401622718,
"grad_norm": 0.255859375,
"learning_rate": 9.84448951994591e-06,
"loss": 1.6641,
"step": 23
},
{
"epoch": 0.0486815415821501,
"grad_norm": 0.259765625,
"learning_rate": 9.837728194726167e-06,
"loss": 1.6916,
"step": 24
},
{
"epoch": 0.05070993914807302,
"grad_norm": 0.275390625,
"learning_rate": 9.830966869506424e-06,
"loss": 1.7211,
"step": 25
},
{
"epoch": 0.05273833671399594,
"grad_norm": 0.240234375,
"learning_rate": 9.82420554428668e-06,
"loss": 1.5421,
"step": 26
},
{
"epoch": 0.05476673427991886,
"grad_norm": 0.236328125,
"learning_rate": 9.817444219066939e-06,
"loss": 1.5551,
"step": 27
},
{
"epoch": 0.056795131845841784,
"grad_norm": 0.2265625,
"learning_rate": 9.810682893847194e-06,
"loss": 1.5284,
"step": 28
},
{
"epoch": 0.058823529411764705,
"grad_norm": 0.263671875,
"learning_rate": 9.803921568627451e-06,
"loss": 1.5221,
"step": 29
},
{
"epoch": 0.060851926977687626,
"grad_norm": 0.240234375,
"learning_rate": 9.797160243407708e-06,
"loss": 1.6265,
"step": 30
},
{
"epoch": 0.06288032454361055,
"grad_norm": 0.224609375,
"learning_rate": 9.790398918187965e-06,
"loss": 1.582,
"step": 31
},
{
"epoch": 0.06490872210953347,
"grad_norm": 0.212890625,
"learning_rate": 9.783637592968222e-06,
"loss": 1.5373,
"step": 32
},
{
"epoch": 0.06693711967545639,
"grad_norm": 0.2109375,
"learning_rate": 9.776876267748479e-06,
"loss": 1.4842,
"step": 33
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.203125,
"learning_rate": 9.770114942528738e-06,
"loss": 1.4742,
"step": 34
},
{
"epoch": 0.07099391480730223,
"grad_norm": 0.2158203125,
"learning_rate": 9.763353617308994e-06,
"loss": 1.5238,
"step": 35
},
{
"epoch": 0.07302231237322515,
"grad_norm": 0.21484375,
"learning_rate": 9.75659229208925e-06,
"loss": 1.4861,
"step": 36
},
{
"epoch": 0.07505070993914807,
"grad_norm": 0.369140625,
"learning_rate": 9.749830966869507e-06,
"loss": 1.475,
"step": 37
},
{
"epoch": 0.07707910750507099,
"grad_norm": 0.2021484375,
"learning_rate": 9.743069641649763e-06,
"loss": 1.492,
"step": 38
},
{
"epoch": 0.07910750507099391,
"grad_norm": 0.2890625,
"learning_rate": 9.73630831643002e-06,
"loss": 1.4861,
"step": 39
},
{
"epoch": 0.08113590263691683,
"grad_norm": 0.19921875,
"learning_rate": 9.729546991210277e-06,
"loss": 1.4854,
"step": 40
},
{
"epoch": 0.08316430020283976,
"grad_norm": 0.201171875,
"learning_rate": 9.722785665990536e-06,
"loss": 1.4827,
"step": 41
},
{
"epoch": 0.08519269776876268,
"grad_norm": 0.1982421875,
"learning_rate": 9.716024340770793e-06,
"loss": 1.5104,
"step": 42
},
{
"epoch": 0.0872210953346856,
"grad_norm": 0.1923828125,
"learning_rate": 9.70926301555105e-06,
"loss": 1.4619,
"step": 43
},
{
"epoch": 0.08924949290060852,
"grad_norm": 0.1826171875,
"learning_rate": 9.702501690331305e-06,
"loss": 1.4507,
"step": 44
},
{
"epoch": 0.09127789046653144,
"grad_norm": 0.1875,
"learning_rate": 9.695740365111562e-06,
"loss": 1.4821,
"step": 45
},
{
"epoch": 0.09330628803245436,
"grad_norm": 0.1884765625,
"learning_rate": 9.688979039891819e-06,
"loss": 1.4255,
"step": 46
},
{
"epoch": 0.09533468559837728,
"grad_norm": 0.1806640625,
"learning_rate": 9.682217714672076e-06,
"loss": 1.3824,
"step": 47
},
{
"epoch": 0.0973630831643002,
"grad_norm": 0.19140625,
"learning_rate": 9.675456389452334e-06,
"loss": 1.3772,
"step": 48
},
{
"epoch": 0.09939148073022312,
"grad_norm": 0.1953125,
"learning_rate": 9.668695064232591e-06,
"loss": 1.4889,
"step": 49
},
{
"epoch": 0.10141987829614604,
"grad_norm": 0.1796875,
"learning_rate": 9.661933739012848e-06,
"loss": 1.4423,
"step": 50
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.1728515625,
"learning_rate": 9.655172413793105e-06,
"loss": 1.415,
"step": 51
},
{
"epoch": 0.10547667342799188,
"grad_norm": 0.169921875,
"learning_rate": 9.64841108857336e-06,
"loss": 1.3956,
"step": 52
},
{
"epoch": 0.1075050709939148,
"grad_norm": 0.197265625,
"learning_rate": 9.641649763353617e-06,
"loss": 1.469,
"step": 53
},
{
"epoch": 0.10953346855983773,
"grad_norm": 0.171875,
"learning_rate": 9.634888438133874e-06,
"loss": 1.4105,
"step": 54
},
{
"epoch": 0.11156186612576065,
"grad_norm": 0.1767578125,
"learning_rate": 9.628127112914133e-06,
"loss": 1.4032,
"step": 55
},
{
"epoch": 0.11359026369168357,
"grad_norm": 0.1650390625,
"learning_rate": 9.62136578769439e-06,
"loss": 1.2954,
"step": 56
},
{
"epoch": 0.11561866125760649,
"grad_norm": 0.162109375,
"learning_rate": 9.614604462474646e-06,
"loss": 1.3745,
"step": 57
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.578125,
"learning_rate": 9.607843137254903e-06,
"loss": 1.3565,
"step": 58
},
{
"epoch": 0.11967545638945233,
"grad_norm": 0.1640625,
"learning_rate": 9.60108181203516e-06,
"loss": 1.3677,
"step": 59
},
{
"epoch": 0.12170385395537525,
"grad_norm": 0.17578125,
"learning_rate": 9.594320486815416e-06,
"loss": 1.3665,
"step": 60
},
{
"epoch": 0.12373225152129817,
"grad_norm": 0.197265625,
"learning_rate": 9.587559161595672e-06,
"loss": 1.3636,
"step": 61
},
{
"epoch": 0.1257606490872211,
"grad_norm": 0.1640625,
"learning_rate": 9.580797836375931e-06,
"loss": 1.3645,
"step": 62
},
{
"epoch": 0.12778904665314403,
"grad_norm": 0.1630859375,
"learning_rate": 9.574036511156188e-06,
"loss": 1.3666,
"step": 63
},
{
"epoch": 0.12981744421906694,
"grad_norm": 0.1533203125,
"learning_rate": 9.567275185936445e-06,
"loss": 1.3332,
"step": 64
},
{
"epoch": 0.13184584178498987,
"grad_norm": 0.1611328125,
"learning_rate": 9.560513860716702e-06,
"loss": 1.373,
"step": 65
},
{
"epoch": 0.13387423935091278,
"grad_norm": 0.1748046875,
"learning_rate": 9.553752535496959e-06,
"loss": 1.3931,
"step": 66
},
{
"epoch": 0.1359026369168357,
"grad_norm": 0.1787109375,
"learning_rate": 9.546991210277216e-06,
"loss": 1.2847,
"step": 67
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.16796875,
"learning_rate": 9.54022988505747e-06,
"loss": 1.3494,
"step": 68
},
{
"epoch": 0.13995943204868155,
"grad_norm": 0.1591796875,
"learning_rate": 9.53346855983773e-06,
"loss": 1.3461,
"step": 69
},
{
"epoch": 0.14198782961460446,
"grad_norm": 0.2109375,
"learning_rate": 9.526707234617986e-06,
"loss": 1.3208,
"step": 70
},
{
"epoch": 0.1440162271805274,
"grad_norm": 0.259765625,
"learning_rate": 9.519945909398243e-06,
"loss": 1.3241,
"step": 71
},
{
"epoch": 0.1460446247464503,
"grad_norm": 0.1591796875,
"learning_rate": 9.5131845841785e-06,
"loss": 1.3235,
"step": 72
},
{
"epoch": 0.14807302231237324,
"grad_norm": 0.1923828125,
"learning_rate": 9.506423258958757e-06,
"loss": 1.3221,
"step": 73
},
{
"epoch": 0.15010141987829614,
"grad_norm": 0.197265625,
"learning_rate": 9.499661933739014e-06,
"loss": 1.3067,
"step": 74
},
{
"epoch": 0.15212981744421908,
"grad_norm": 0.1669921875,
"learning_rate": 9.492900608519271e-06,
"loss": 1.3077,
"step": 75
},
{
"epoch": 0.15415821501014199,
"grad_norm": 0.220703125,
"learning_rate": 9.486139283299526e-06,
"loss": 1.3585,
"step": 76
},
{
"epoch": 0.15618661257606492,
"grad_norm": 0.1650390625,
"learning_rate": 9.479377958079785e-06,
"loss": 1.3229,
"step": 77
},
{
"epoch": 0.15821501014198783,
"grad_norm": 0.173828125,
"learning_rate": 9.472616632860042e-06,
"loss": 1.3157,
"step": 78
},
{
"epoch": 0.16024340770791076,
"grad_norm": 0.1904296875,
"learning_rate": 9.465855307640299e-06,
"loss": 1.2941,
"step": 79
},
{
"epoch": 0.16227180527383367,
"grad_norm": 0.1591796875,
"learning_rate": 9.459093982420555e-06,
"loss": 1.3267,
"step": 80
},
{
"epoch": 0.1643002028397566,
"grad_norm": 0.158203125,
"learning_rate": 9.452332657200812e-06,
"loss": 1.3108,
"step": 81
},
{
"epoch": 0.1663286004056795,
"grad_norm": 0.1630859375,
"learning_rate": 9.44557133198107e-06,
"loss": 1.2806,
"step": 82
},
{
"epoch": 0.16835699797160245,
"grad_norm": 0.15625,
"learning_rate": 9.438810006761326e-06,
"loss": 1.2928,
"step": 83
},
{
"epoch": 0.17038539553752535,
"grad_norm": 0.169921875,
"learning_rate": 9.432048681541583e-06,
"loss": 1.3039,
"step": 84
},
{
"epoch": 0.1724137931034483,
"grad_norm": 0.1826171875,
"learning_rate": 9.42528735632184e-06,
"loss": 1.3077,
"step": 85
},
{
"epoch": 0.1744421906693712,
"grad_norm": 0.1513671875,
"learning_rate": 9.418526031102097e-06,
"loss": 1.3454,
"step": 86
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.1513671875,
"learning_rate": 9.411764705882354e-06,
"loss": 1.3095,
"step": 87
},
{
"epoch": 0.17849898580121704,
"grad_norm": 0.166015625,
"learning_rate": 9.40500338066261e-06,
"loss": 1.3264,
"step": 88
},
{
"epoch": 0.18052738336713997,
"grad_norm": 0.1591796875,
"learning_rate": 9.398242055442868e-06,
"loss": 1.3138,
"step": 89
},
{
"epoch": 0.18255578093306288,
"grad_norm": 0.1572265625,
"learning_rate": 9.391480730223125e-06,
"loss": 1.2476,
"step": 90
},
{
"epoch": 0.1845841784989858,
"grad_norm": 0.1669921875,
"learning_rate": 9.384719405003381e-06,
"loss": 1.2699,
"step": 91
},
{
"epoch": 0.18661257606490872,
"grad_norm": 0.1640625,
"learning_rate": 9.377958079783638e-06,
"loss": 1.3391,
"step": 92
},
{
"epoch": 0.18864097363083165,
"grad_norm": 0.1591796875,
"learning_rate": 9.371196754563895e-06,
"loss": 1.3236,
"step": 93
},
{
"epoch": 0.19066937119675456,
"grad_norm": 0.39453125,
"learning_rate": 9.364435429344152e-06,
"loss": 1.3209,
"step": 94
},
{
"epoch": 0.1926977687626775,
"grad_norm": 0.326171875,
"learning_rate": 9.357674104124409e-06,
"loss": 1.3001,
"step": 95
},
{
"epoch": 0.1947261663286004,
"grad_norm": 0.16796875,
"learning_rate": 9.350912778904666e-06,
"loss": 1.2758,
"step": 96
},
{
"epoch": 0.19675456389452334,
"grad_norm": 0.166015625,
"learning_rate": 9.344151453684923e-06,
"loss": 1.2668,
"step": 97
},
{
"epoch": 0.19878296146044624,
"grad_norm": 0.1748046875,
"learning_rate": 9.33739012846518e-06,
"loss": 1.2948,
"step": 98
},
{
"epoch": 0.20081135902636918,
"grad_norm": 0.2099609375,
"learning_rate": 9.330628803245437e-06,
"loss": 1.3023,
"step": 99
},
{
"epoch": 0.2028397565922921,
"grad_norm": 0.1650390625,
"learning_rate": 9.323867478025694e-06,
"loss": 1.2459,
"step": 100
},
{
"epoch": 0.20486815415821502,
"grad_norm": 0.1669921875,
"learning_rate": 9.31710615280595e-06,
"loss": 1.2824,
"step": 101
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.1845703125,
"learning_rate": 9.310344827586207e-06,
"loss": 1.2607,
"step": 102
},
{
"epoch": 0.20892494929006086,
"grad_norm": 0.1708984375,
"learning_rate": 9.303583502366464e-06,
"loss": 1.2807,
"step": 103
},
{
"epoch": 0.21095334685598377,
"grad_norm": 0.283203125,
"learning_rate": 9.296822177146721e-06,
"loss": 1.2656,
"step": 104
},
{
"epoch": 0.2129817444219067,
"grad_norm": 0.236328125,
"learning_rate": 9.290060851926978e-06,
"loss": 1.2689,
"step": 105
},
{
"epoch": 0.2150101419878296,
"grad_norm": 0.197265625,
"learning_rate": 9.283299526707235e-06,
"loss": 1.2439,
"step": 106
},
{
"epoch": 0.21703853955375255,
"grad_norm": 0.24609375,
"learning_rate": 9.276538201487492e-06,
"loss": 1.2258,
"step": 107
},
{
"epoch": 0.21906693711967545,
"grad_norm": 0.16796875,
"learning_rate": 9.269776876267749e-06,
"loss": 1.2496,
"step": 108
},
{
"epoch": 0.2210953346855984,
"grad_norm": 0.162109375,
"learning_rate": 9.263015551048006e-06,
"loss": 1.2327,
"step": 109
},
{
"epoch": 0.2231237322515213,
"grad_norm": 0.1748046875,
"learning_rate": 9.256254225828263e-06,
"loss": 1.2223,
"step": 110
},
{
"epoch": 0.22515212981744423,
"grad_norm": 0.1943359375,
"learning_rate": 9.24949290060852e-06,
"loss": 1.2318,
"step": 111
},
{
"epoch": 0.22718052738336714,
"grad_norm": 0.1796875,
"learning_rate": 9.242731575388777e-06,
"loss": 1.2651,
"step": 112
},
{
"epoch": 0.22920892494929007,
"grad_norm": 0.2470703125,
"learning_rate": 9.235970250169034e-06,
"loss": 1.1937,
"step": 113
},
{
"epoch": 0.23123732251521298,
"grad_norm": 0.1845703125,
"learning_rate": 9.22920892494929e-06,
"loss": 1.239,
"step": 114
},
{
"epoch": 0.2332657200811359,
"grad_norm": 0.1796875,
"learning_rate": 9.222447599729547e-06,
"loss": 1.2483,
"step": 115
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.1787109375,
"learning_rate": 9.215686274509804e-06,
"loss": 1.2162,
"step": 116
},
{
"epoch": 0.23732251521298176,
"grad_norm": 0.2080078125,
"learning_rate": 9.208924949290061e-06,
"loss": 1.2844,
"step": 117
},
{
"epoch": 0.23935091277890466,
"grad_norm": 0.1728515625,
"learning_rate": 9.202163624070318e-06,
"loss": 1.2785,
"step": 118
},
{
"epoch": 0.2413793103448276,
"grad_norm": 0.1728515625,
"learning_rate": 9.195402298850575e-06,
"loss": 1.284,
"step": 119
},
{
"epoch": 0.2434077079107505,
"grad_norm": 0.15625,
"learning_rate": 9.188640973630832e-06,
"loss": 1.2332,
"step": 120
},
{
"epoch": 0.24543610547667344,
"grad_norm": 0.1611328125,
"learning_rate": 9.181879648411089e-06,
"loss": 1.2216,
"step": 121
},
{
"epoch": 0.24746450304259635,
"grad_norm": 0.1572265625,
"learning_rate": 9.175118323191346e-06,
"loss": 1.2629,
"step": 122
},
{
"epoch": 0.24949290060851928,
"grad_norm": 0.162109375,
"learning_rate": 9.168356997971604e-06,
"loss": 1.2498,
"step": 123
},
{
"epoch": 0.2515212981744422,
"grad_norm": 0.2041015625,
"learning_rate": 9.16159567275186e-06,
"loss": 1.249,
"step": 124
},
{
"epoch": 0.2535496957403651,
"grad_norm": 0.23046875,
"learning_rate": 9.154834347532116e-06,
"loss": 1.2158,
"step": 125
},
{
"epoch": 0.25557809330628806,
"grad_norm": 0.1708984375,
"learning_rate": 9.148073022312373e-06,
"loss": 1.227,
"step": 126
},
{
"epoch": 0.25760649087221094,
"grad_norm": 0.240234375,
"learning_rate": 9.14131169709263e-06,
"loss": 1.2769,
"step": 127
},
{
"epoch": 0.25963488843813387,
"grad_norm": 0.201171875,
"learning_rate": 9.134550371872887e-06,
"loss": 1.2222,
"step": 128
},
{
"epoch": 0.2616632860040568,
"grad_norm": 0.234375,
"learning_rate": 9.127789046653144e-06,
"loss": 1.2199,
"step": 129
},
{
"epoch": 0.26369168356997974,
"grad_norm": 0.1748046875,
"learning_rate": 9.121027721433403e-06,
"loss": 1.2321,
"step": 130
},
{
"epoch": 0.2657200811359026,
"grad_norm": 0.23046875,
"learning_rate": 9.11426639621366e-06,
"loss": 1.2217,
"step": 131
},
{
"epoch": 0.26774847870182555,
"grad_norm": 0.181640625,
"learning_rate": 9.107505070993915e-06,
"loss": 1.2269,
"step": 132
},
{
"epoch": 0.2697768762677485,
"grad_norm": 0.1708984375,
"learning_rate": 9.100743745774172e-06,
"loss": 1.2449,
"step": 133
},
{
"epoch": 0.2718052738336714,
"grad_norm": 0.1728515625,
"learning_rate": 9.093982420554429e-06,
"loss": 1.2616,
"step": 134
},
{
"epoch": 0.2738336713995943,
"grad_norm": 0.189453125,
"learning_rate": 9.087221095334686e-06,
"loss": 1.2719,
"step": 135
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.1953125,
"learning_rate": 9.080459770114942e-06,
"loss": 1.217,
"step": 136
},
{
"epoch": 0.2778904665314402,
"grad_norm": 0.1708984375,
"learning_rate": 9.073698444895201e-06,
"loss": 1.21,
"step": 137
},
{
"epoch": 0.2799188640973631,
"grad_norm": 0.1806640625,
"learning_rate": 9.066937119675458e-06,
"loss": 1.1984,
"step": 138
},
{
"epoch": 0.281947261663286,
"grad_norm": 0.189453125,
"learning_rate": 9.060175794455715e-06,
"loss": 1.2262,
"step": 139
},
{
"epoch": 0.2839756592292089,
"grad_norm": 0.1708984375,
"learning_rate": 9.05341446923597e-06,
"loss": 1.2322,
"step": 140
},
{
"epoch": 0.28600405679513186,
"grad_norm": 0.2451171875,
"learning_rate": 9.046653144016227e-06,
"loss": 1.2377,
"step": 141
},
{
"epoch": 0.2880324543610548,
"grad_norm": 0.169921875,
"learning_rate": 9.039891818796484e-06,
"loss": 1.2473,
"step": 142
},
{
"epoch": 0.29006085192697767,
"grad_norm": 0.177734375,
"learning_rate": 9.033130493576741e-06,
"loss": 1.2727,
"step": 143
},
{
"epoch": 0.2920892494929006,
"grad_norm": 0.33203125,
"learning_rate": 9.026369168357e-06,
"loss": 1.1902,
"step": 144
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.185546875,
"learning_rate": 9.019607843137256e-06,
"loss": 1.2436,
"step": 145
},
{
"epoch": 0.2961460446247465,
"grad_norm": 0.185546875,
"learning_rate": 9.012846517917513e-06,
"loss": 1.1909,
"step": 146
},
{
"epoch": 0.29817444219066935,
"grad_norm": 0.17578125,
"learning_rate": 9.00608519269777e-06,
"loss": 1.2215,
"step": 147
},
{
"epoch": 0.3002028397565923,
"grad_norm": 0.2001953125,
"learning_rate": 8.999323867478025e-06,
"loss": 1.2217,
"step": 148
},
{
"epoch": 0.3022312373225152,
"grad_norm": 0.2138671875,
"learning_rate": 8.992562542258282e-06,
"loss": 1.2267,
"step": 149
},
{
"epoch": 0.30425963488843816,
"grad_norm": 0.16796875,
"learning_rate": 8.98580121703854e-06,
"loss": 1.2343,
"step": 150
},
{
"epoch": 0.30628803245436104,
"grad_norm": 0.197265625,
"learning_rate": 8.979039891818798e-06,
"loss": 1.2193,
"step": 151
},
{
"epoch": 0.30831643002028397,
"grad_norm": 0.1806640625,
"learning_rate": 8.972278566599055e-06,
"loss": 1.2008,
"step": 152
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.171875,
"learning_rate": 8.965517241379312e-06,
"loss": 1.217,
"step": 153
},
{
"epoch": 0.31237322515212984,
"grad_norm": 0.2158203125,
"learning_rate": 8.958755916159569e-06,
"loss": 1.2081,
"step": 154
},
{
"epoch": 0.3144016227180527,
"grad_norm": 0.201171875,
"learning_rate": 8.951994590939825e-06,
"loss": 1.1466,
"step": 155
},
{
"epoch": 0.31643002028397565,
"grad_norm": 0.173828125,
"learning_rate": 8.94523326572008e-06,
"loss": 1.1587,
"step": 156
},
{
"epoch": 0.3184584178498986,
"grad_norm": 0.1728515625,
"learning_rate": 8.938471940500338e-06,
"loss": 1.1839,
"step": 157
},
{
"epoch": 0.3204868154158215,
"grad_norm": 0.177734375,
"learning_rate": 8.931710615280596e-06,
"loss": 1.208,
"step": 158
},
{
"epoch": 0.3225152129817444,
"grad_norm": 0.1728515625,
"learning_rate": 8.924949290060853e-06,
"loss": 1.1899,
"step": 159
},
{
"epoch": 0.32454361054766734,
"grad_norm": 0.181640625,
"learning_rate": 8.91818796484111e-06,
"loss": 1.2019,
"step": 160
},
{
"epoch": 0.3265720081135903,
"grad_norm": 0.1875,
"learning_rate": 8.911426639621367e-06,
"loss": 1.1336,
"step": 161
},
{
"epoch": 0.3286004056795132,
"grad_norm": 0.177734375,
"learning_rate": 8.904665314401624e-06,
"loss": 1.1841,
"step": 162
},
{
"epoch": 0.3306288032454361,
"grad_norm": 0.1953125,
"learning_rate": 8.89790398918188e-06,
"loss": 1.2169,
"step": 163
},
{
"epoch": 0.332657200811359,
"grad_norm": 0.2041015625,
"learning_rate": 8.891142663962136e-06,
"loss": 1.1857,
"step": 164
},
{
"epoch": 0.33468559837728196,
"grad_norm": 0.1865234375,
"learning_rate": 8.884381338742395e-06,
"loss": 1.1672,
"step": 165
},
{
"epoch": 0.3367139959432049,
"grad_norm": 0.1884765625,
"learning_rate": 8.877620013522652e-06,
"loss": 1.2451,
"step": 166
},
{
"epoch": 0.33874239350912777,
"grad_norm": 0.1953125,
"learning_rate": 8.870858688302908e-06,
"loss": 1.2504,
"step": 167
},
{
"epoch": 0.3407707910750507,
"grad_norm": 0.23046875,
"learning_rate": 8.864097363083165e-06,
"loss": 1.194,
"step": 168
},
{
"epoch": 0.34279918864097364,
"grad_norm": 0.2158203125,
"learning_rate": 8.857336037863422e-06,
"loss": 1.1883,
"step": 169
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.1982421875,
"learning_rate": 8.85057471264368e-06,
"loss": 1.205,
"step": 170
},
{
"epoch": 0.34685598377281945,
"grad_norm": 0.193359375,
"learning_rate": 8.843813387423936e-06,
"loss": 1.2313,
"step": 171
},
{
"epoch": 0.3488843813387424,
"grad_norm": 0.22265625,
"learning_rate": 8.837052062204193e-06,
"loss": 1.163,
"step": 172
},
{
"epoch": 0.3509127789046653,
"grad_norm": 0.2109375,
"learning_rate": 8.83029073698445e-06,
"loss": 1.1458,
"step": 173
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.185546875,
"learning_rate": 8.823529411764707e-06,
"loss": 1.1783,
"step": 174
},
{
"epoch": 0.35496957403651114,
"grad_norm": 0.181640625,
"learning_rate": 8.816768086544964e-06,
"loss": 1.1763,
"step": 175
},
{
"epoch": 0.35699797160243407,
"grad_norm": 0.171875,
"learning_rate": 8.81000676132522e-06,
"loss": 1.1719,
"step": 176
},
{
"epoch": 0.359026369168357,
"grad_norm": 0.1865234375,
"learning_rate": 8.803245436105478e-06,
"loss": 1.1703,
"step": 177
},
{
"epoch": 0.36105476673427994,
"grad_norm": 0.1982421875,
"learning_rate": 8.796484110885734e-06,
"loss": 1.213,
"step": 178
},
{
"epoch": 0.3630831643002028,
"grad_norm": 0.20703125,
"learning_rate": 8.789722785665991e-06,
"loss": 1.1656,
"step": 179
},
{
"epoch": 0.36511156186612576,
"grad_norm": 0.17578125,
"learning_rate": 8.782961460446248e-06,
"loss": 1.1381,
"step": 180
},
{
"epoch": 0.3671399594320487,
"grad_norm": 0.1904296875,
"learning_rate": 8.776200135226505e-06,
"loss": 1.213,
"step": 181
},
{
"epoch": 0.3691683569979716,
"grad_norm": 0.1845703125,
"learning_rate": 8.769438810006762e-06,
"loss": 1.1966,
"step": 182
},
{
"epoch": 0.3711967545638945,
"grad_norm": 0.259765625,
"learning_rate": 8.762677484787019e-06,
"loss": 1.2488,
"step": 183
},
{
"epoch": 0.37322515212981744,
"grad_norm": 0.25390625,
"learning_rate": 8.755916159567276e-06,
"loss": 1.1837,
"step": 184
},
{
"epoch": 0.3752535496957404,
"grad_norm": 0.1865234375,
"learning_rate": 8.749154834347533e-06,
"loss": 1.1915,
"step": 185
},
{
"epoch": 0.3772819472616633,
"grad_norm": 0.19140625,
"learning_rate": 8.74239350912779e-06,
"loss": 1.187,
"step": 186
},
{
"epoch": 0.3793103448275862,
"grad_norm": 0.18359375,
"learning_rate": 8.735632183908047e-06,
"loss": 1.189,
"step": 187
},
{
"epoch": 0.3813387423935091,
"grad_norm": 0.1787109375,
"learning_rate": 8.728870858688304e-06,
"loss": 1.1898,
"step": 188
},
{
"epoch": 0.38336713995943206,
"grad_norm": 0.1982421875,
"learning_rate": 8.72210953346856e-06,
"loss": 1.1981,
"step": 189
},
{
"epoch": 0.385395537525355,
"grad_norm": 0.1865234375,
"learning_rate": 8.715348208248817e-06,
"loss": 1.1717,
"step": 190
},
{
"epoch": 0.38742393509127787,
"grad_norm": 0.1943359375,
"learning_rate": 8.708586883029074e-06,
"loss": 1.1787,
"step": 191
},
{
"epoch": 0.3894523326572008,
"grad_norm": 0.1943359375,
"learning_rate": 8.701825557809331e-06,
"loss": 1.1728,
"step": 192
},
{
"epoch": 0.39148073022312374,
"grad_norm": 0.1845703125,
"learning_rate": 8.695064232589588e-06,
"loss": 1.1653,
"step": 193
},
{
"epoch": 0.3935091277890467,
"grad_norm": 0.2734375,
"learning_rate": 8.688302907369845e-06,
"loss": 1.117,
"step": 194
},
{
"epoch": 0.39553752535496955,
"grad_norm": 0.1806640625,
"learning_rate": 8.681541582150102e-06,
"loss": 1.1384,
"step": 195
},
{
"epoch": 0.3975659229208925,
"grad_norm": 0.302734375,
"learning_rate": 8.674780256930359e-06,
"loss": 1.1527,
"step": 196
},
{
"epoch": 0.3995943204868154,
"grad_norm": 0.1953125,
"learning_rate": 8.668018931710616e-06,
"loss": 1.1487,
"step": 197
},
{
"epoch": 0.40162271805273836,
"grad_norm": 0.1982421875,
"learning_rate": 8.661257606490873e-06,
"loss": 1.2106,
"step": 198
},
{
"epoch": 0.40365111561866124,
"grad_norm": 0.197265625,
"learning_rate": 8.65449628127113e-06,
"loss": 1.1738,
"step": 199
},
{
"epoch": 0.4056795131845842,
"grad_norm": 1.890625,
"learning_rate": 8.647734956051387e-06,
"loss": 1.1729,
"step": 200
},
{
"epoch": 0.4077079107505071,
"grad_norm": 0.2099609375,
"learning_rate": 8.640973630831643e-06,
"loss": 1.1877,
"step": 201
},
{
"epoch": 0.40973630831643004,
"grad_norm": 0.2138671875,
"learning_rate": 8.6342123056119e-06,
"loss": 1.1108,
"step": 202
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.185546875,
"learning_rate": 8.627450980392157e-06,
"loss": 1.1975,
"step": 203
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.2021484375,
"learning_rate": 8.620689655172414e-06,
"loss": 1.1852,
"step": 204
},
{
"epoch": 0.4158215010141988,
"grad_norm": 0.19140625,
"learning_rate": 8.613928329952671e-06,
"loss": 1.1958,
"step": 205
},
{
"epoch": 0.4178498985801217,
"grad_norm": 0.265625,
"learning_rate": 8.607167004732928e-06,
"loss": 1.1645,
"step": 206
},
{
"epoch": 0.4198782961460446,
"grad_norm": 0.1943359375,
"learning_rate": 8.600405679513185e-06,
"loss": 1.126,
"step": 207
},
{
"epoch": 0.42190669371196754,
"grad_norm": 0.1796875,
"learning_rate": 8.593644354293442e-06,
"loss": 1.159,
"step": 208
},
{
"epoch": 0.4239350912778905,
"grad_norm": 0.181640625,
"learning_rate": 8.586883029073699e-06,
"loss": 1.1864,
"step": 209
},
{
"epoch": 0.4259634888438134,
"grad_norm": 0.224609375,
"learning_rate": 8.580121703853956e-06,
"loss": 1.1173,
"step": 210
},
{
"epoch": 0.4279918864097363,
"grad_norm": 0.1875,
"learning_rate": 8.573360378634214e-06,
"loss": 1.1146,
"step": 211
},
{
"epoch": 0.4300202839756592,
"grad_norm": 0.2275390625,
"learning_rate": 8.56659905341447e-06,
"loss": 1.2435,
"step": 212
},
{
"epoch": 0.43204868154158216,
"grad_norm": 0.2021484375,
"learning_rate": 8.559837728194726e-06,
"loss": 1.1341,
"step": 213
},
{
"epoch": 0.4340770791075051,
"grad_norm": 0.19140625,
"learning_rate": 8.553076402974983e-06,
"loss": 1.1846,
"step": 214
},
{
"epoch": 0.43610547667342797,
"grad_norm": 0.2138671875,
"learning_rate": 8.54631507775524e-06,
"loss": 1.156,
"step": 215
},
{
"epoch": 0.4381338742393509,
"grad_norm": 0.203125,
"learning_rate": 8.539553752535497e-06,
"loss": 1.1899,
"step": 216
},
{
"epoch": 0.44016227180527384,
"grad_norm": 0.17578125,
"learning_rate": 8.532792427315754e-06,
"loss": 1.1491,
"step": 217
},
{
"epoch": 0.4421906693711968,
"grad_norm": 0.185546875,
"learning_rate": 8.526031102096013e-06,
"loss": 1.116,
"step": 218
},
{
"epoch": 0.44421906693711966,
"grad_norm": 0.2041015625,
"learning_rate": 8.51926977687627e-06,
"loss": 1.1333,
"step": 219
},
{
"epoch": 0.4462474645030426,
"grad_norm": 0.2255859375,
"learning_rate": 8.512508451656525e-06,
"loss": 1.1806,
"step": 220
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.357421875,
"learning_rate": 8.505747126436782e-06,
"loss": 1.1594,
"step": 221
},
{
"epoch": 0.45030425963488846,
"grad_norm": 0.1982421875,
"learning_rate": 8.498985801217039e-06,
"loss": 1.1281,
"step": 222
},
{
"epoch": 0.45233265720081134,
"grad_norm": 0.1845703125,
"learning_rate": 8.492224475997295e-06,
"loss": 1.1715,
"step": 223
},
{
"epoch": 0.4543610547667343,
"grad_norm": 0.193359375,
"learning_rate": 8.485463150777552e-06,
"loss": 1.179,
"step": 224
},
{
"epoch": 0.4563894523326572,
"grad_norm": 0.18359375,
"learning_rate": 8.478701825557811e-06,
"loss": 1.1349,
"step": 225
},
{
"epoch": 0.45841784989858014,
"grad_norm": 0.201171875,
"learning_rate": 8.471940500338068e-06,
"loss": 1.165,
"step": 226
},
{
"epoch": 0.460446247464503,
"grad_norm": 0.1875,
"learning_rate": 8.465179175118325e-06,
"loss": 1.1169,
"step": 227
},
{
"epoch": 0.46247464503042596,
"grad_norm": 0.2021484375,
"learning_rate": 8.45841784989858e-06,
"loss": 1.17,
"step": 228
},
{
"epoch": 0.4645030425963489,
"grad_norm": 0.212890625,
"learning_rate": 8.451656524678837e-06,
"loss": 1.0952,
"step": 229
},
{
"epoch": 0.4665314401622718,
"grad_norm": 0.193359375,
"learning_rate": 8.444895199459094e-06,
"loss": 1.1216,
"step": 230
},
{
"epoch": 0.4685598377281947,
"grad_norm": 0.203125,
"learning_rate": 8.43813387423935e-06,
"loss": 1.1362,
"step": 231
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.203125,
"learning_rate": 8.43137254901961e-06,
"loss": 1.1633,
"step": 232
},
{
"epoch": 0.4726166328600406,
"grad_norm": 0.197265625,
"learning_rate": 8.424611223799866e-06,
"loss": 1.1214,
"step": 233
},
{
"epoch": 0.4746450304259635,
"grad_norm": 0.220703125,
"learning_rate": 8.417849898580123e-06,
"loss": 1.1316,
"step": 234
},
{
"epoch": 0.4766734279918864,
"grad_norm": 0.1875,
"learning_rate": 8.41108857336038e-06,
"loss": 1.1439,
"step": 235
},
{
"epoch": 0.4787018255578093,
"grad_norm": 0.205078125,
"learning_rate": 8.404327248140635e-06,
"loss": 1.1409,
"step": 236
},
{
"epoch": 0.48073022312373226,
"grad_norm": 0.2041015625,
"learning_rate": 8.397565922920892e-06,
"loss": 1.1773,
"step": 237
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.2265625,
"learning_rate": 8.390804597701149e-06,
"loss": 1.1738,
"step": 238
},
{
"epoch": 0.4847870182555781,
"grad_norm": 0.1865234375,
"learning_rate": 8.384043272481408e-06,
"loss": 1.1175,
"step": 239
},
{
"epoch": 0.486815415821501,
"grad_norm": 0.2001953125,
"learning_rate": 8.377281947261665e-06,
"loss": 1.1377,
"step": 240
},
{
"epoch": 0.48884381338742394,
"grad_norm": 0.296875,
"learning_rate": 8.370520622041922e-06,
"loss": 1.19,
"step": 241
},
{
"epoch": 0.4908722109533469,
"grad_norm": 0.2197265625,
"learning_rate": 8.363759296822178e-06,
"loss": 1.159,
"step": 242
},
{
"epoch": 0.49290060851926976,
"grad_norm": 0.21875,
"learning_rate": 8.356997971602435e-06,
"loss": 1.1575,
"step": 243
},
{
"epoch": 0.4949290060851927,
"grad_norm": 0.2080078125,
"learning_rate": 8.35023664638269e-06,
"loss": 1.1038,
"step": 244
},
{
"epoch": 0.4969574036511156,
"grad_norm": 0.19140625,
"learning_rate": 8.343475321162948e-06,
"loss": 1.1418,
"step": 245
},
{
"epoch": 0.49898580121703856,
"grad_norm": 0.2392578125,
"learning_rate": 8.336713995943206e-06,
"loss": 1.1345,
"step": 246
},
{
"epoch": 0.5010141987829615,
"grad_norm": 0.2177734375,
"learning_rate": 8.329952670723463e-06,
"loss": 1.14,
"step": 247
},
{
"epoch": 0.5030425963488844,
"grad_norm": 0.201171875,
"learning_rate": 8.32319134550372e-06,
"loss": 1.1234,
"step": 248
},
{
"epoch": 0.5050709939148073,
"grad_norm": 0.265625,
"learning_rate": 8.316430020283977e-06,
"loss": 1.1479,
"step": 249
},
{
"epoch": 0.5070993914807302,
"grad_norm": 0.205078125,
"learning_rate": 8.309668695064234e-06,
"loss": 1.1019,
"step": 250
},
{
"epoch": 0.5091277890466531,
"grad_norm": 0.1865234375,
"learning_rate": 8.30290736984449e-06,
"loss": 1.1496,
"step": 251
},
{
"epoch": 0.5111561866125761,
"grad_norm": 0.23046875,
"learning_rate": 8.296146044624746e-06,
"loss": 1.1473,
"step": 252
},
{
"epoch": 0.513184584178499,
"grad_norm": 0.1962890625,
"learning_rate": 8.289384719405005e-06,
"loss": 1.153,
"step": 253
},
{
"epoch": 0.5152129817444219,
"grad_norm": 0.20703125,
"learning_rate": 8.282623394185261e-06,
"loss": 1.1607,
"step": 254
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.205078125,
"learning_rate": 8.275862068965518e-06,
"loss": 1.1626,
"step": 255
},
{
"epoch": 0.5192697768762677,
"grad_norm": 0.1962890625,
"learning_rate": 8.269100743745775e-06,
"loss": 1.1553,
"step": 256
},
{
"epoch": 0.5212981744421906,
"grad_norm": 0.2041015625,
"learning_rate": 8.262339418526032e-06,
"loss": 1.1893,
"step": 257
},
{
"epoch": 0.5233265720081136,
"grad_norm": 0.1943359375,
"learning_rate": 8.255578093306289e-06,
"loss": 1.0982,
"step": 258
},
{
"epoch": 0.5253549695740365,
"grad_norm": 0.2080078125,
"learning_rate": 8.248816768086546e-06,
"loss": 1.1448,
"step": 259
},
{
"epoch": 0.5273833671399595,
"grad_norm": 0.22265625,
"learning_rate": 8.242055442866801e-06,
"loss": 1.1399,
"step": 260
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.1845703125,
"learning_rate": 8.23529411764706e-06,
"loss": 1.0997,
"step": 261
},
{
"epoch": 0.5314401622718052,
"grad_norm": 0.2060546875,
"learning_rate": 8.228532792427317e-06,
"loss": 1.128,
"step": 262
},
{
"epoch": 0.5334685598377282,
"grad_norm": 0.1982421875,
"learning_rate": 8.221771467207574e-06,
"loss": 1.1279,
"step": 263
},
{
"epoch": 0.5354969574036511,
"grad_norm": 0.212890625,
"learning_rate": 8.21501014198783e-06,
"loss": 1.1596,
"step": 264
},
{
"epoch": 0.537525354969574,
"grad_norm": 0.203125,
"learning_rate": 8.208248816768087e-06,
"loss": 1.1058,
"step": 265
},
{
"epoch": 0.539553752535497,
"grad_norm": 0.1953125,
"learning_rate": 8.201487491548344e-06,
"loss": 1.125,
"step": 266
},
{
"epoch": 0.5415821501014199,
"grad_norm": 0.19921875,
"learning_rate": 8.194726166328601e-06,
"loss": 1.1207,
"step": 267
},
{
"epoch": 0.5436105476673428,
"grad_norm": 0.193359375,
"learning_rate": 8.187964841108858e-06,
"loss": 1.1092,
"step": 268
},
{
"epoch": 0.5456389452332657,
"grad_norm": 0.220703125,
"learning_rate": 8.181203515889115e-06,
"loss": 1.1074,
"step": 269
},
{
"epoch": 0.5476673427991886,
"grad_norm": 0.2001953125,
"learning_rate": 8.174442190669372e-06,
"loss": 1.0965,
"step": 270
},
{
"epoch": 0.5496957403651116,
"grad_norm": 0.21875,
"learning_rate": 8.167680865449629e-06,
"loss": 1.1247,
"step": 271
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.203125,
"learning_rate": 8.160919540229886e-06,
"loss": 1.1494,
"step": 272
},
{
"epoch": 0.5537525354969574,
"grad_norm": 0.1943359375,
"learning_rate": 8.154158215010143e-06,
"loss": 1.1079,
"step": 273
},
{
"epoch": 0.5557809330628803,
"grad_norm": 0.255859375,
"learning_rate": 8.1473968897904e-06,
"loss": 1.1407,
"step": 274
},
{
"epoch": 0.5578093306288032,
"grad_norm": 0.2001953125,
"learning_rate": 8.140635564570657e-06,
"loss": 1.1192,
"step": 275
},
{
"epoch": 0.5598377281947262,
"grad_norm": 0.1923828125,
"learning_rate": 8.133874239350913e-06,
"loss": 1.1077,
"step": 276
},
{
"epoch": 0.5618661257606491,
"grad_norm": 0.20703125,
"learning_rate": 8.12711291413117e-06,
"loss": 1.1626,
"step": 277
},
{
"epoch": 0.563894523326572,
"grad_norm": 0.203125,
"learning_rate": 8.120351588911427e-06,
"loss": 1.0602,
"step": 278
},
{
"epoch": 0.565922920892495,
"grad_norm": 0.23828125,
"learning_rate": 8.113590263691684e-06,
"loss": 1.0946,
"step": 279
},
{
"epoch": 0.5679513184584178,
"grad_norm": 0.2119140625,
"learning_rate": 8.106828938471941e-06,
"loss": 1.1575,
"step": 280
},
{
"epoch": 0.5699797160243407,
"grad_norm": 0.236328125,
"learning_rate": 8.100067613252198e-06,
"loss": 1.1317,
"step": 281
},
{
"epoch": 0.5720081135902637,
"grad_norm": 0.1962890625,
"learning_rate": 8.093306288032455e-06,
"loss": 1.1557,
"step": 282
},
{
"epoch": 0.5740365111561866,
"grad_norm": 0.205078125,
"learning_rate": 8.086544962812712e-06,
"loss": 1.1137,
"step": 283
},
{
"epoch": 0.5760649087221096,
"grad_norm": 0.22265625,
"learning_rate": 8.079783637592969e-06,
"loss": 1.1279,
"step": 284
},
{
"epoch": 0.5780933062880325,
"grad_norm": 0.2041015625,
"learning_rate": 8.073022312373226e-06,
"loss": 1.0888,
"step": 285
},
{
"epoch": 0.5801217038539553,
"grad_norm": 0.203125,
"learning_rate": 8.066260987153483e-06,
"loss": 1.1286,
"step": 286
},
{
"epoch": 0.5821501014198783,
"grad_norm": 0.201171875,
"learning_rate": 8.05949966193374e-06,
"loss": 1.1542,
"step": 287
},
{
"epoch": 0.5841784989858012,
"grad_norm": 0.2060546875,
"learning_rate": 8.052738336713996e-06,
"loss": 1.1188,
"step": 288
},
{
"epoch": 0.5862068965517241,
"grad_norm": 0.2080078125,
"learning_rate": 8.045977011494253e-06,
"loss": 1.1265,
"step": 289
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.2177734375,
"learning_rate": 8.03921568627451e-06,
"loss": 1.1099,
"step": 290
},
{
"epoch": 0.59026369168357,
"grad_norm": 0.279296875,
"learning_rate": 8.032454361054767e-06,
"loss": 1.088,
"step": 291
},
{
"epoch": 0.592292089249493,
"grad_norm": 0.1953125,
"learning_rate": 8.025693035835024e-06,
"loss": 1.1426,
"step": 292
},
{
"epoch": 0.5943204868154158,
"grad_norm": 0.236328125,
"learning_rate": 8.018931710615281e-06,
"loss": 1.099,
"step": 293
},
{
"epoch": 0.5963488843813387,
"grad_norm": 0.259765625,
"learning_rate": 8.012170385395538e-06,
"loss": 1.1593,
"step": 294
},
{
"epoch": 0.5983772819472617,
"grad_norm": 0.208984375,
"learning_rate": 8.005409060175795e-06,
"loss": 1.1478,
"step": 295
},
{
"epoch": 0.6004056795131846,
"grad_norm": 0.26171875,
"learning_rate": 7.998647734956052e-06,
"loss": 1.1165,
"step": 296
},
{
"epoch": 0.6024340770791075,
"grad_norm": 0.2109375,
"learning_rate": 7.991886409736309e-06,
"loss": 1.0772,
"step": 297
},
{
"epoch": 0.6044624746450304,
"grad_norm": 0.2099609375,
"learning_rate": 7.985125084516566e-06,
"loss": 1.0881,
"step": 298
},
{
"epoch": 0.6064908722109533,
"grad_norm": 0.2021484375,
"learning_rate": 7.978363759296822e-06,
"loss": 1.0898,
"step": 299
},
{
"epoch": 0.6085192697768763,
"grad_norm": 0.314453125,
"learning_rate": 7.97160243407708e-06,
"loss": 1.095,
"step": 300
},
{
"epoch": 0.6105476673427992,
"grad_norm": 0.20703125,
"learning_rate": 7.964841108857336e-06,
"loss": 1.1129,
"step": 301
},
{
"epoch": 0.6125760649087221,
"grad_norm": 0.220703125,
"learning_rate": 7.958079783637593e-06,
"loss": 1.1106,
"step": 302
},
{
"epoch": 0.6146044624746451,
"grad_norm": 0.2001953125,
"learning_rate": 7.95131845841785e-06,
"loss": 1.0957,
"step": 303
},
{
"epoch": 0.6166328600405679,
"grad_norm": 0.296875,
"learning_rate": 7.944557133198107e-06,
"loss": 1.1653,
"step": 304
},
{
"epoch": 0.6186612576064908,
"grad_norm": 0.2119140625,
"learning_rate": 7.937795807978364e-06,
"loss": 1.13,
"step": 305
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.205078125,
"learning_rate": 7.93103448275862e-06,
"loss": 1.1194,
"step": 306
},
{
"epoch": 0.6227180527383367,
"grad_norm": 0.232421875,
"learning_rate": 7.92427315753888e-06,
"loss": 1.1271,
"step": 307
},
{
"epoch": 0.6247464503042597,
"grad_norm": 0.20703125,
"learning_rate": 7.917511832319135e-06,
"loss": 1.1328,
"step": 308
},
{
"epoch": 0.6267748478701826,
"grad_norm": 0.26953125,
"learning_rate": 7.910750507099392e-06,
"loss": 1.1269,
"step": 309
},
{
"epoch": 0.6288032454361054,
"grad_norm": 0.2041015625,
"learning_rate": 7.903989181879648e-06,
"loss": 1.094,
"step": 310
},
{
"epoch": 0.6308316430020284,
"grad_norm": 0.232421875,
"learning_rate": 7.897227856659905e-06,
"loss": 1.0956,
"step": 311
},
{
"epoch": 0.6328600405679513,
"grad_norm": 0.20703125,
"learning_rate": 7.890466531440162e-06,
"loss": 1.0989,
"step": 312
},
{
"epoch": 0.6348884381338742,
"grad_norm": 0.216796875,
"learning_rate": 7.88370520622042e-06,
"loss": 1.1115,
"step": 313
},
{
"epoch": 0.6369168356997972,
"grad_norm": 0.208984375,
"learning_rate": 7.876943881000678e-06,
"loss": 1.1377,
"step": 314
},
{
"epoch": 0.6389452332657201,
"grad_norm": 0.21875,
"learning_rate": 7.870182555780935e-06,
"loss": 1.0829,
"step": 315
},
{
"epoch": 0.640973630831643,
"grad_norm": 0.19921875,
"learning_rate": 7.86342123056119e-06,
"loss": 1.0775,
"step": 316
},
{
"epoch": 0.6430020283975659,
"grad_norm": 0.21875,
"learning_rate": 7.856659905341447e-06,
"loss": 1.058,
"step": 317
},
{
"epoch": 0.6450304259634888,
"grad_norm": 0.265625,
"learning_rate": 7.849898580121704e-06,
"loss": 1.0845,
"step": 318
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.220703125,
"learning_rate": 7.84313725490196e-06,
"loss": 1.1295,
"step": 319
},
{
"epoch": 0.6490872210953347,
"grad_norm": 0.2216796875,
"learning_rate": 7.836375929682218e-06,
"loss": 1.1128,
"step": 320
},
{
"epoch": 0.6511156186612576,
"grad_norm": 0.2275390625,
"learning_rate": 7.829614604462476e-06,
"loss": 1.0901,
"step": 321
},
{
"epoch": 0.6531440162271805,
"grad_norm": 0.197265625,
"learning_rate": 7.822853279242733e-06,
"loss": 1.0869,
"step": 322
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.2099609375,
"learning_rate": 7.81609195402299e-06,
"loss": 1.1382,
"step": 323
},
{
"epoch": 0.6572008113590264,
"grad_norm": 0.26953125,
"learning_rate": 7.809330628803245e-06,
"loss": 1.0594,
"step": 324
},
{
"epoch": 0.6592292089249493,
"grad_norm": 0.205078125,
"learning_rate": 7.802569303583502e-06,
"loss": 1.1196,
"step": 325
},
{
"epoch": 0.6612576064908722,
"grad_norm": 0.212890625,
"learning_rate": 7.795807978363759e-06,
"loss": 1.0247,
"step": 326
},
{
"epoch": 0.6632860040567952,
"grad_norm": 0.2119140625,
"learning_rate": 7.789046653144016e-06,
"loss": 1.1146,
"step": 327
},
{
"epoch": 0.665314401622718,
"grad_norm": 0.2109375,
"learning_rate": 7.782285327924275e-06,
"loss": 1.0828,
"step": 328
},
{
"epoch": 0.6673427991886409,
"grad_norm": 0.2080078125,
"learning_rate": 7.775524002704531e-06,
"loss": 1.0726,
"step": 329
},
{
"epoch": 0.6693711967545639,
"grad_norm": 0.3984375,
"learning_rate": 7.768762677484788e-06,
"loss": 1.1187,
"step": 330
},
{
"epoch": 0.6713995943204868,
"grad_norm": 0.212890625,
"learning_rate": 7.762001352265045e-06,
"loss": 1.118,
"step": 331
},
{
"epoch": 0.6734279918864098,
"grad_norm": 0.23046875,
"learning_rate": 7.7552400270453e-06,
"loss": 1.1304,
"step": 332
},
{
"epoch": 0.6754563894523327,
"grad_norm": 0.2041015625,
"learning_rate": 7.748478701825557e-06,
"loss": 1.0665,
"step": 333
},
{
"epoch": 0.6774847870182555,
"grad_norm": 0.2314453125,
"learning_rate": 7.741717376605814e-06,
"loss": 1.1892,
"step": 334
},
{
"epoch": 0.6795131845841785,
"grad_norm": 0.3046875,
"learning_rate": 7.734956051386073e-06,
"loss": 1.1124,
"step": 335
},
{
"epoch": 0.6815415821501014,
"grad_norm": 0.2216796875,
"learning_rate": 7.72819472616633e-06,
"loss": 1.1264,
"step": 336
},
{
"epoch": 0.6835699797160243,
"grad_norm": 0.2255859375,
"learning_rate": 7.721433400946587e-06,
"loss": 1.1204,
"step": 337
},
{
"epoch": 0.6855983772819473,
"grad_norm": 0.298828125,
"learning_rate": 7.714672075726844e-06,
"loss": 1.0857,
"step": 338
},
{
"epoch": 0.6876267748478702,
"grad_norm": 0.220703125,
"learning_rate": 7.7079107505071e-06,
"loss": 1.0994,
"step": 339
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.2216796875,
"learning_rate": 7.701149425287356e-06,
"loss": 1.1221,
"step": 340
},
{
"epoch": 0.691683569979716,
"grad_norm": 0.216796875,
"learning_rate": 7.694388100067613e-06,
"loss": 1.1262,
"step": 341
},
{
"epoch": 0.6937119675456389,
"grad_norm": 0.20703125,
"learning_rate": 7.687626774847871e-06,
"loss": 1.1352,
"step": 342
},
{
"epoch": 0.6957403651115619,
"grad_norm": 0.2021484375,
"learning_rate": 7.680865449628128e-06,
"loss": 1.097,
"step": 343
},
{
"epoch": 0.6977687626774848,
"grad_norm": 0.21484375,
"learning_rate": 7.674104124408385e-06,
"loss": 1.1079,
"step": 344
},
{
"epoch": 0.6997971602434077,
"grad_norm": 0.220703125,
"learning_rate": 7.667342799188642e-06,
"loss": 1.0598,
"step": 345
},
{
"epoch": 0.7018255578093306,
"grad_norm": 0.25,
"learning_rate": 7.660581473968899e-06,
"loss": 1.1034,
"step": 346
},
{
"epoch": 0.7038539553752535,
"grad_norm": 0.22265625,
"learning_rate": 7.653820148749156e-06,
"loss": 1.0847,
"step": 347
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.24609375,
"learning_rate": 7.647058823529411e-06,
"loss": 1.0882,
"step": 348
},
{
"epoch": 0.7079107505070994,
"grad_norm": 0.80078125,
"learning_rate": 7.64029749830967e-06,
"loss": 1.1341,
"step": 349
},
{
"epoch": 0.7099391480730223,
"grad_norm": 0.212890625,
"learning_rate": 7.633536173089927e-06,
"loss": 1.0863,
"step": 350
},
{
"epoch": 0.7119675456389453,
"grad_norm": 0.2109375,
"learning_rate": 7.626774847870183e-06,
"loss": 1.0326,
"step": 351
},
{
"epoch": 0.7139959432048681,
"grad_norm": 0.2333984375,
"learning_rate": 7.6200135226504404e-06,
"loss": 1.0598,
"step": 352
},
{
"epoch": 0.716024340770791,
"grad_norm": 0.2119140625,
"learning_rate": 7.613252197430697e-06,
"loss": 1.0759,
"step": 353
},
{
"epoch": 0.718052738336714,
"grad_norm": 0.2294921875,
"learning_rate": 7.606490872210954e-06,
"loss": 1.116,
"step": 354
},
{
"epoch": 0.7200811359026369,
"grad_norm": 0.21875,
"learning_rate": 7.599729546991211e-06,
"loss": 1.1235,
"step": 355
},
{
"epoch": 0.7221095334685599,
"grad_norm": 0.2578125,
"learning_rate": 7.592968221771467e-06,
"loss": 1.1006,
"step": 356
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.216796875,
"learning_rate": 7.586206896551724e-06,
"loss": 1.1092,
"step": 357
},
{
"epoch": 0.7261663286004056,
"grad_norm": 0.248046875,
"learning_rate": 7.579445571331981e-06,
"loss": 1.0582,
"step": 358
},
{
"epoch": 0.7281947261663286,
"grad_norm": 0.2275390625,
"learning_rate": 7.572684246112239e-06,
"loss": 1.1501,
"step": 359
},
{
"epoch": 0.7302231237322515,
"grad_norm": 0.2158203125,
"learning_rate": 7.565922920892496e-06,
"loss": 1.1079,
"step": 360
},
{
"epoch": 0.7322515212981744,
"grad_norm": 0.251953125,
"learning_rate": 7.559161595672753e-06,
"loss": 1.0929,
"step": 361
},
{
"epoch": 0.7342799188640974,
"grad_norm": 0.279296875,
"learning_rate": 7.5524002704530095e-06,
"loss": 1.0586,
"step": 362
},
{
"epoch": 0.7363083164300203,
"grad_norm": 0.2119140625,
"learning_rate": 7.5456389452332665e-06,
"loss": 1.1174,
"step": 363
},
{
"epoch": 0.7383367139959433,
"grad_norm": 0.2255859375,
"learning_rate": 7.5388776200135225e-06,
"loss": 1.1049,
"step": 364
},
{
"epoch": 0.7403651115618661,
"grad_norm": 0.271484375,
"learning_rate": 7.5321162947937794e-06,
"loss": 1.1212,
"step": 365
},
{
"epoch": 0.742393509127789,
"grad_norm": 0.2177734375,
"learning_rate": 7.525354969574037e-06,
"loss": 1.1117,
"step": 366
},
{
"epoch": 0.744421906693712,
"grad_norm": 0.2490234375,
"learning_rate": 7.518593644354294e-06,
"loss": 1.0885,
"step": 367
},
{
"epoch": 0.7464503042596349,
"grad_norm": 0.224609375,
"learning_rate": 7.511832319134551e-06,
"loss": 1.1178,
"step": 368
},
{
"epoch": 0.7484787018255578,
"grad_norm": 0.21484375,
"learning_rate": 7.505070993914808e-06,
"loss": 1.0965,
"step": 369
},
{
"epoch": 0.7505070993914807,
"grad_norm": 0.21875,
"learning_rate": 7.498309668695065e-06,
"loss": 1.0961,
"step": 370
},
{
"epoch": 0.7525354969574036,
"grad_norm": 0.369140625,
"learning_rate": 7.491548343475323e-06,
"loss": 1.0683,
"step": 371
},
{
"epoch": 0.7545638945233266,
"grad_norm": 0.2109375,
"learning_rate": 7.484787018255578e-06,
"loss": 1.0914,
"step": 372
},
{
"epoch": 0.7565922920892495,
"grad_norm": 0.216796875,
"learning_rate": 7.4780256930358356e-06,
"loss": 1.0738,
"step": 373
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.1982421875,
"learning_rate": 7.4712643678160925e-06,
"loss": 1.0488,
"step": 374
},
{
"epoch": 0.7606490872210954,
"grad_norm": 0.2216796875,
"learning_rate": 7.464503042596349e-06,
"loss": 1.1491,
"step": 375
},
{
"epoch": 0.7626774847870182,
"grad_norm": 0.22265625,
"learning_rate": 7.457741717376606e-06,
"loss": 1.095,
"step": 376
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.2177734375,
"learning_rate": 7.450980392156863e-06,
"loss": 1.112,
"step": 377
},
{
"epoch": 0.7667342799188641,
"grad_norm": 0.216796875,
"learning_rate": 7.444219066937121e-06,
"loss": 1.0972,
"step": 378
},
{
"epoch": 0.768762677484787,
"grad_norm": 0.26953125,
"learning_rate": 7.437457741717378e-06,
"loss": 1.1215,
"step": 379
},
{
"epoch": 0.77079107505071,
"grad_norm": 0.216796875,
"learning_rate": 7.430696416497634e-06,
"loss": 1.1147,
"step": 380
},
{
"epoch": 0.7728194726166329,
"grad_norm": 0.345703125,
"learning_rate": 7.423935091277891e-06,
"loss": 1.0614,
"step": 381
},
{
"epoch": 0.7748478701825557,
"grad_norm": 0.2197265625,
"learning_rate": 7.417173766058148e-06,
"loss": 1.1024,
"step": 382
},
{
"epoch": 0.7768762677484787,
"grad_norm": 0.255859375,
"learning_rate": 7.410412440838405e-06,
"loss": 1.0603,
"step": 383
},
{
"epoch": 0.7789046653144016,
"grad_norm": 0.2255859375,
"learning_rate": 7.403651115618662e-06,
"loss": 1.1042,
"step": 384
},
{
"epoch": 0.7809330628803245,
"grad_norm": 0.2314453125,
"learning_rate": 7.396889790398919e-06,
"loss": 1.1058,
"step": 385
},
{
"epoch": 0.7829614604462475,
"grad_norm": 0.2158203125,
"learning_rate": 7.390128465179176e-06,
"loss": 1.1207,
"step": 386
},
{
"epoch": 0.7849898580121704,
"grad_norm": 0.359375,
"learning_rate": 7.383367139959433e-06,
"loss": 1.0499,
"step": 387
},
{
"epoch": 0.7870182555780934,
"grad_norm": 0.2119140625,
"learning_rate": 7.376605814739689e-06,
"loss": 1.1111,
"step": 388
},
{
"epoch": 0.7890466531440162,
"grad_norm": 0.2216796875,
"learning_rate": 7.369844489519946e-06,
"loss": 1.1469,
"step": 389
},
{
"epoch": 0.7910750507099391,
"grad_norm": 0.25390625,
"learning_rate": 7.363083164300203e-06,
"loss": 1.0579,
"step": 390
},
{
"epoch": 0.7931034482758621,
"grad_norm": 0.2431640625,
"learning_rate": 7.35632183908046e-06,
"loss": 1.0904,
"step": 391
},
{
"epoch": 0.795131845841785,
"grad_norm": 0.208984375,
"learning_rate": 7.349560513860718e-06,
"loss": 1.1116,
"step": 392
},
{
"epoch": 0.7971602434077079,
"grad_norm": 0.236328125,
"learning_rate": 7.342799188640975e-06,
"loss": 1.0659,
"step": 393
},
{
"epoch": 0.7991886409736308,
"grad_norm": 0.2275390625,
"learning_rate": 7.3360378634212316e-06,
"loss": 1.0893,
"step": 394
},
{
"epoch": 0.8012170385395537,
"grad_norm": 0.23828125,
"learning_rate": 7.3292765382014885e-06,
"loss": 1.0927,
"step": 395
},
{
"epoch": 0.8032454361054767,
"grad_norm": 0.2021484375,
"learning_rate": 7.3225152129817445e-06,
"loss": 1.0579,
"step": 396
},
{
"epoch": 0.8052738336713996,
"grad_norm": 0.2080078125,
"learning_rate": 7.3157538877620015e-06,
"loss": 1.0357,
"step": 397
},
{
"epoch": 0.8073022312373225,
"grad_norm": 0.2158203125,
"learning_rate": 7.308992562542258e-06,
"loss": 1.0872,
"step": 398
},
{
"epoch": 0.8093306288032455,
"grad_norm": 0.2119140625,
"learning_rate": 7.302231237322516e-06,
"loss": 1.0981,
"step": 399
},
{
"epoch": 0.8113590263691683,
"grad_norm": 0.330078125,
"learning_rate": 7.295469912102773e-06,
"loss": 1.1117,
"step": 400
},
{
"epoch": 0.8133874239350912,
"grad_norm": 0.2197265625,
"learning_rate": 7.28870858688303e-06,
"loss": 1.0784,
"step": 401
},
{
"epoch": 0.8154158215010142,
"grad_norm": 0.265625,
"learning_rate": 7.281947261663287e-06,
"loss": 1.1133,
"step": 402
},
{
"epoch": 0.8174442190669371,
"grad_norm": 0.2470703125,
"learning_rate": 7.275185936443544e-06,
"loss": 1.0285,
"step": 403
},
{
"epoch": 0.8194726166328601,
"grad_norm": 0.23828125,
"learning_rate": 7.2684246112238e-06,
"loss": 1.0703,
"step": 404
},
{
"epoch": 0.821501014198783,
"grad_norm": 0.2353515625,
"learning_rate": 7.261663286004057e-06,
"loss": 1.091,
"step": 405
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.2216796875,
"learning_rate": 7.2549019607843145e-06,
"loss": 1.0706,
"step": 406
},
{
"epoch": 0.8255578093306288,
"grad_norm": 0.25,
"learning_rate": 7.248140635564571e-06,
"loss": 1.0761,
"step": 407
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.23828125,
"learning_rate": 7.241379310344828e-06,
"loss": 1.1163,
"step": 408
},
{
"epoch": 0.8296146044624746,
"grad_norm": 0.2158203125,
"learning_rate": 7.234617985125085e-06,
"loss": 1.1036,
"step": 409
},
{
"epoch": 0.8316430020283976,
"grad_norm": 0.22265625,
"learning_rate": 7.227856659905342e-06,
"loss": 1.096,
"step": 410
},
{
"epoch": 0.8336713995943205,
"grad_norm": 0.220703125,
"learning_rate": 7.221095334685599e-06,
"loss": 1.0663,
"step": 411
},
{
"epoch": 0.8356997971602435,
"grad_norm": 0.228515625,
"learning_rate": 7.214334009465855e-06,
"loss": 1.119,
"step": 412
},
{
"epoch": 0.8377281947261663,
"grad_norm": 0.22265625,
"learning_rate": 7.207572684246112e-06,
"loss": 1.1057,
"step": 413
},
{
"epoch": 0.8397565922920892,
"grad_norm": 0.23046875,
"learning_rate": 7.20081135902637e-06,
"loss": 1.0777,
"step": 414
},
{
"epoch": 0.8417849898580122,
"grad_norm": 0.2275390625,
"learning_rate": 7.194050033806627e-06,
"loss": 1.1091,
"step": 415
},
{
"epoch": 0.8438133874239351,
"grad_norm": 0.2353515625,
"learning_rate": 7.187288708586884e-06,
"loss": 1.055,
"step": 416
},
{
"epoch": 0.845841784989858,
"grad_norm": 0.2109375,
"learning_rate": 7.1805273833671405e-06,
"loss": 1.1213,
"step": 417
},
{
"epoch": 0.847870182555781,
"grad_norm": 0.2314453125,
"learning_rate": 7.1737660581473974e-06,
"loss": 1.0516,
"step": 418
},
{
"epoch": 0.8498985801217038,
"grad_norm": 0.2236328125,
"learning_rate": 7.167004732927655e-06,
"loss": 1.1068,
"step": 419
},
{
"epoch": 0.8519269776876268,
"grad_norm": 0.232421875,
"learning_rate": 7.16024340770791e-06,
"loss": 1.1144,
"step": 420
},
{
"epoch": 0.8539553752535497,
"grad_norm": 0.2216796875,
"learning_rate": 7.153482082488168e-06,
"loss": 1.0661,
"step": 421
},
{
"epoch": 0.8559837728194726,
"grad_norm": 0.27734375,
"learning_rate": 7.146720757268425e-06,
"loss": 1.1194,
"step": 422
},
{
"epoch": 0.8580121703853956,
"grad_norm": 0.2255859375,
"learning_rate": 7.139959432048682e-06,
"loss": 1.0732,
"step": 423
},
{
"epoch": 0.8600405679513184,
"grad_norm": 0.2216796875,
"learning_rate": 7.133198106828939e-06,
"loss": 1.0919,
"step": 424
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.2265625,
"learning_rate": 7.126436781609196e-06,
"loss": 1.0936,
"step": 425
},
{
"epoch": 0.8640973630831643,
"grad_norm": 0.349609375,
"learning_rate": 7.119675456389454e-06,
"loss": 1.1049,
"step": 426
},
{
"epoch": 0.8661257606490872,
"grad_norm": 0.228515625,
"learning_rate": 7.1129141311697105e-06,
"loss": 1.1039,
"step": 427
},
{
"epoch": 0.8681541582150102,
"grad_norm": 0.296875,
"learning_rate": 7.1061528059499666e-06,
"loss": 1.034,
"step": 428
},
{
"epoch": 0.8701825557809331,
"grad_norm": 0.2314453125,
"learning_rate": 7.0993914807302235e-06,
"loss": 1.1177,
"step": 429
},
{
"epoch": 0.8722109533468559,
"grad_norm": 0.33984375,
"learning_rate": 7.09263015551048e-06,
"loss": 1.0703,
"step": 430
},
{
"epoch": 0.8742393509127789,
"grad_norm": 0.21875,
"learning_rate": 7.085868830290737e-06,
"loss": 1.0331,
"step": 431
},
{
"epoch": 0.8762677484787018,
"grad_norm": 0.220703125,
"learning_rate": 7.079107505070994e-06,
"loss": 1.0849,
"step": 432
},
{
"epoch": 0.8782961460446247,
"grad_norm": 0.2265625,
"learning_rate": 7.072346179851252e-06,
"loss": 1.0612,
"step": 433
},
{
"epoch": 0.8803245436105477,
"grad_norm": 0.2177734375,
"learning_rate": 7.065584854631509e-06,
"loss": 1.0692,
"step": 434
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.2197265625,
"learning_rate": 7.058823529411766e-06,
"loss": 1.0678,
"step": 435
},
{
"epoch": 0.8843813387423936,
"grad_norm": 0.2216796875,
"learning_rate": 7.052062204192022e-06,
"loss": 1.0662,
"step": 436
},
{
"epoch": 0.8864097363083164,
"grad_norm": 0.2158203125,
"learning_rate": 7.045300878972279e-06,
"loss": 1.08,
"step": 437
},
{
"epoch": 0.8884381338742393,
"grad_norm": 0.3046875,
"learning_rate": 7.038539553752536e-06,
"loss": 1.1165,
"step": 438
},
{
"epoch": 0.8904665314401623,
"grad_norm": 0.2216796875,
"learning_rate": 7.031778228532793e-06,
"loss": 1.0933,
"step": 439
},
{
"epoch": 0.8924949290060852,
"grad_norm": 0.228515625,
"learning_rate": 7.02501690331305e-06,
"loss": 1.1124,
"step": 440
},
{
"epoch": 0.8945233265720081,
"grad_norm": 0.2314453125,
"learning_rate": 7.018255578093307e-06,
"loss": 1.1189,
"step": 441
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.228515625,
"learning_rate": 7.011494252873564e-06,
"loss": 1.0761,
"step": 442
},
{
"epoch": 0.8985801217038539,
"grad_norm": 0.2294921875,
"learning_rate": 7.004732927653821e-06,
"loss": 1.0929,
"step": 443
},
{
"epoch": 0.9006085192697769,
"grad_norm": 0.2392578125,
"learning_rate": 6.997971602434077e-06,
"loss": 1.0499,
"step": 444
},
{
"epoch": 0.9026369168356998,
"grad_norm": 0.3515625,
"learning_rate": 6.991210277214334e-06,
"loss": 1.0665,
"step": 445
},
{
"epoch": 0.9046653144016227,
"grad_norm": 0.2216796875,
"learning_rate": 6.984448951994591e-06,
"loss": 1.0624,
"step": 446
},
{
"epoch": 0.9066937119675457,
"grad_norm": 0.240234375,
"learning_rate": 6.977687626774849e-06,
"loss": 1.1199,
"step": 447
},
{
"epoch": 0.9087221095334685,
"grad_norm": 0.28125,
"learning_rate": 6.970926301555106e-06,
"loss": 1.0736,
"step": 448
},
{
"epoch": 0.9107505070993914,
"grad_norm": 0.228515625,
"learning_rate": 6.9641649763353625e-06,
"loss": 1.1005,
"step": 449
},
{
"epoch": 0.9127789046653144,
"grad_norm": 0.248046875,
"learning_rate": 6.9574036511156195e-06,
"loss": 1.1016,
"step": 450
},
{
"epoch": 0.9148073022312373,
"grad_norm": 0.337890625,
"learning_rate": 6.950642325895876e-06,
"loss": 1.0351,
"step": 451
},
{
"epoch": 0.9168356997971603,
"grad_norm": 0.30078125,
"learning_rate": 6.9438810006761324e-06,
"loss": 1.055,
"step": 452
},
{
"epoch": 0.9188640973630832,
"grad_norm": 0.2177734375,
"learning_rate": 6.937119675456389e-06,
"loss": 1.0669,
"step": 453
},
{
"epoch": 0.920892494929006,
"grad_norm": 0.220703125,
"learning_rate": 6.930358350236647e-06,
"loss": 1.1177,
"step": 454
},
{
"epoch": 0.922920892494929,
"grad_norm": 0.2294921875,
"learning_rate": 6.923597025016904e-06,
"loss": 1.1167,
"step": 455
},
{
"epoch": 0.9249492900608519,
"grad_norm": 0.2578125,
"learning_rate": 6.916835699797161e-06,
"loss": 1.0982,
"step": 456
},
{
"epoch": 0.9269776876267748,
"grad_norm": 0.2421875,
"learning_rate": 6.910074374577418e-06,
"loss": 1.0828,
"step": 457
},
{
"epoch": 0.9290060851926978,
"grad_norm": 0.228515625,
"learning_rate": 6.903313049357675e-06,
"loss": 1.0629,
"step": 458
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.625,
"learning_rate": 6.896551724137932e-06,
"loss": 1.107,
"step": 459
},
{
"epoch": 0.9330628803245437,
"grad_norm": 0.23828125,
"learning_rate": 6.889790398918188e-06,
"loss": 1.0501,
"step": 460
},
{
"epoch": 0.9350912778904665,
"grad_norm": 0.2333984375,
"learning_rate": 6.8830290736984455e-06,
"loss": 1.0976,
"step": 461
},
{
"epoch": 0.9371196754563894,
"grad_norm": 0.259765625,
"learning_rate": 6.876267748478702e-06,
"loss": 1.1372,
"step": 462
},
{
"epoch": 0.9391480730223124,
"grad_norm": 0.2197265625,
"learning_rate": 6.869506423258959e-06,
"loss": 1.0403,
"step": 463
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.2216796875,
"learning_rate": 6.862745098039216e-06,
"loss": 1.0414,
"step": 464
},
{
"epoch": 0.9432048681541582,
"grad_norm": 0.22265625,
"learning_rate": 6.855983772819473e-06,
"loss": 1.104,
"step": 465
},
{
"epoch": 0.9452332657200812,
"grad_norm": 0.234375,
"learning_rate": 6.84922244759973e-06,
"loss": 1.1283,
"step": 466
},
{
"epoch": 0.947261663286004,
"grad_norm": 0.234375,
"learning_rate": 6.842461122379988e-06,
"loss": 1.0765,
"step": 467
},
{
"epoch": 0.949290060851927,
"grad_norm": 0.279296875,
"learning_rate": 6.835699797160244e-06,
"loss": 1.0378,
"step": 468
},
{
"epoch": 0.9513184584178499,
"grad_norm": 0.234375,
"learning_rate": 6.828938471940501e-06,
"loss": 1.1038,
"step": 469
},
{
"epoch": 0.9533468559837728,
"grad_norm": 0.234375,
"learning_rate": 6.822177146720758e-06,
"loss": 1.0895,
"step": 470
},
{
"epoch": 0.9553752535496958,
"grad_norm": 0.2451171875,
"learning_rate": 6.815415821501015e-06,
"loss": 1.0341,
"step": 471
},
{
"epoch": 0.9574036511156186,
"grad_norm": 0.322265625,
"learning_rate": 6.8086544962812715e-06,
"loss": 1.0862,
"step": 472
},
{
"epoch": 0.9594320486815415,
"grad_norm": 0.2421875,
"learning_rate": 6.801893171061528e-06,
"loss": 1.1023,
"step": 473
},
{
"epoch": 0.9614604462474645,
"grad_norm": 0.23046875,
"learning_rate": 6.795131845841786e-06,
"loss": 1.0123,
"step": 474
},
{
"epoch": 0.9634888438133874,
"grad_norm": 0.228515625,
"learning_rate": 6.788370520622043e-06,
"loss": 1.1256,
"step": 475
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.2353515625,
"learning_rate": 6.781609195402299e-06,
"loss": 1.0688,
"step": 476
},
{
"epoch": 0.9675456389452333,
"grad_norm": 0.2373046875,
"learning_rate": 6.774847870182556e-06,
"loss": 1.1208,
"step": 477
},
{
"epoch": 0.9695740365111561,
"grad_norm": 0.23828125,
"learning_rate": 6.768086544962813e-06,
"loss": 1.087,
"step": 478
},
{
"epoch": 0.9716024340770791,
"grad_norm": 0.2490234375,
"learning_rate": 6.76132521974307e-06,
"loss": 1.099,
"step": 479
},
{
"epoch": 0.973630831643002,
"grad_norm": 0.2490234375,
"learning_rate": 6.754563894523327e-06,
"loss": 1.1076,
"step": 480
},
{
"epoch": 0.9756592292089249,
"grad_norm": 0.2255859375,
"learning_rate": 6.7478025693035846e-06,
"loss": 1.0634,
"step": 481
},
{
"epoch": 0.9776876267748479,
"grad_norm": 0.22265625,
"learning_rate": 6.7410412440838415e-06,
"loss": 1.0835,
"step": 482
},
{
"epoch": 0.9797160243407708,
"grad_norm": 0.2353515625,
"learning_rate": 6.734279918864098e-06,
"loss": 1.0837,
"step": 483
},
{
"epoch": 0.9817444219066938,
"grad_norm": 0.228515625,
"learning_rate": 6.7275185936443544e-06,
"loss": 1.0663,
"step": 484
},
{
"epoch": 0.9837728194726166,
"grad_norm": 0.2255859375,
"learning_rate": 6.720757268424611e-06,
"loss": 1.0664,
"step": 485
},
{
"epoch": 0.9858012170385395,
"grad_norm": 0.2490234375,
"learning_rate": 6.713995943204868e-06,
"loss": 1.0677,
"step": 486
},
{
"epoch": 0.9878296146044625,
"grad_norm": 0.2216796875,
"learning_rate": 6.707234617985125e-06,
"loss": 1.0646,
"step": 487
},
{
"epoch": 0.9898580121703854,
"grad_norm": 0.2255859375,
"learning_rate": 6.700473292765383e-06,
"loss": 1.0106,
"step": 488
},
{
"epoch": 0.9918864097363083,
"grad_norm": 0.2265625,
"learning_rate": 6.69371196754564e-06,
"loss": 1.1186,
"step": 489
},
{
"epoch": 0.9939148073022313,
"grad_norm": 0.287109375,
"learning_rate": 6.686950642325897e-06,
"loss": 1.0706,
"step": 490
},
{
"epoch": 0.9959432048681541,
"grad_norm": 0.2451171875,
"learning_rate": 6.680189317106154e-06,
"loss": 1.065,
"step": 491
},
{
"epoch": 0.9979716024340771,
"grad_norm": 0.2421875,
"learning_rate": 6.67342799188641e-06,
"loss": 1.0931,
"step": 492
},
{
"epoch": 1.0,
"grad_norm": 0.244140625,
"learning_rate": 6.666666666666667e-06,
"loss": 1.0794,
"step": 493
},
{
"epoch": 1.002028397565923,
"grad_norm": 0.28125,
"learning_rate": 6.6599053414469236e-06,
"loss": 1.087,
"step": 494
},
{
"epoch": 1.0040567951318458,
"grad_norm": 0.23828125,
"learning_rate": 6.653144016227181e-06,
"loss": 1.0326,
"step": 495
},
{
"epoch": 1.0060851926977687,
"grad_norm": 0.248046875,
"learning_rate": 6.646382691007438e-06,
"loss": 1.056,
"step": 496
},
{
"epoch": 1.0081135902636917,
"grad_norm": 0.2255859375,
"learning_rate": 6.639621365787695e-06,
"loss": 1.0488,
"step": 497
},
{
"epoch": 1.0101419878296145,
"grad_norm": 0.2275390625,
"learning_rate": 6.632860040567952e-06,
"loss": 1.0683,
"step": 498
},
{
"epoch": 1.0121703853955375,
"grad_norm": 0.294921875,
"learning_rate": 6.626098715348209e-06,
"loss": 1.0659,
"step": 499
},
{
"epoch": 1.0141987829614605,
"grad_norm": 0.2333984375,
"learning_rate": 6.619337390128465e-06,
"loss": 1.0606,
"step": 500
},
{
"epoch": 1.0162271805273835,
"grad_norm": 0.30859375,
"learning_rate": 6.612576064908722e-06,
"loss": 1.0074,
"step": 501
},
{
"epoch": 1.0182555780933062,
"grad_norm": 0.283203125,
"learning_rate": 6.60581473968898e-06,
"loss": 1.0893,
"step": 502
},
{
"epoch": 1.0202839756592292,
"grad_norm": 0.22265625,
"learning_rate": 6.599053414469237e-06,
"loss": 1.079,
"step": 503
},
{
"epoch": 1.0223123732251522,
"grad_norm": 0.322265625,
"learning_rate": 6.5922920892494935e-06,
"loss": 1.0616,
"step": 504
},
{
"epoch": 1.024340770791075,
"grad_norm": 0.2333984375,
"learning_rate": 6.5855307640297504e-06,
"loss": 1.0783,
"step": 505
},
{
"epoch": 1.026369168356998,
"grad_norm": 0.2314453125,
"learning_rate": 6.578769438810007e-06,
"loss": 1.0645,
"step": 506
},
{
"epoch": 1.028397565922921,
"grad_norm": 0.2490234375,
"learning_rate": 6.572008113590265e-06,
"loss": 1.0546,
"step": 507
},
{
"epoch": 1.0304259634888437,
"grad_norm": 0.2333984375,
"learning_rate": 6.56524678837052e-06,
"loss": 1.088,
"step": 508
},
{
"epoch": 1.0324543610547667,
"grad_norm": 0.2412109375,
"learning_rate": 6.558485463150778e-06,
"loss": 1.1028,
"step": 509
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.2314453125,
"learning_rate": 6.551724137931035e-06,
"loss": 1.0477,
"step": 510
},
{
"epoch": 1.0365111561866125,
"grad_norm": 0.2294921875,
"learning_rate": 6.544962812711292e-06,
"loss": 1.0453,
"step": 511
},
{
"epoch": 1.0385395537525355,
"grad_norm": 0.2275390625,
"learning_rate": 6.538201487491549e-06,
"loss": 1.0321,
"step": 512
},
{
"epoch": 1.0405679513184585,
"grad_norm": 0.2314453125,
"learning_rate": 6.531440162271806e-06,
"loss": 1.0569,
"step": 513
},
{
"epoch": 1.0425963488843812,
"grad_norm": 0.234375,
"learning_rate": 6.5246788370520635e-06,
"loss": 1.0619,
"step": 514
},
{
"epoch": 1.0446247464503042,
"grad_norm": 0.251953125,
"learning_rate": 6.51791751183232e-06,
"loss": 1.0559,
"step": 515
},
{
"epoch": 1.0466531440162272,
"grad_norm": 0.2431640625,
"learning_rate": 6.5111561866125765e-06,
"loss": 1.0531,
"step": 516
},
{
"epoch": 1.04868154158215,
"grad_norm": 0.22265625,
"learning_rate": 6.504394861392833e-06,
"loss": 1.0724,
"step": 517
},
{
"epoch": 1.050709939148073,
"grad_norm": 0.2451171875,
"learning_rate": 6.49763353617309e-06,
"loss": 1.0478,
"step": 518
},
{
"epoch": 1.052738336713996,
"grad_norm": 0.21875,
"learning_rate": 6.490872210953347e-06,
"loss": 1.0659,
"step": 519
},
{
"epoch": 1.054766734279919,
"grad_norm": 0.2216796875,
"learning_rate": 6.484110885733604e-06,
"loss": 1.0672,
"step": 520
},
{
"epoch": 1.0567951318458417,
"grad_norm": 0.2734375,
"learning_rate": 6.477349560513861e-06,
"loss": 1.0845,
"step": 521
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.310546875,
"learning_rate": 6.470588235294119e-06,
"loss": 1.0633,
"step": 522
},
{
"epoch": 1.0608519269776877,
"grad_norm": 0.26171875,
"learning_rate": 6.463826910074376e-06,
"loss": 1.1189,
"step": 523
},
{
"epoch": 1.0628803245436105,
"grad_norm": 0.296875,
"learning_rate": 6.457065584854632e-06,
"loss": 1.0884,
"step": 524
},
{
"epoch": 1.0649087221095335,
"grad_norm": 0.2255859375,
"learning_rate": 6.450304259634889e-06,
"loss": 1.0337,
"step": 525
},
{
"epoch": 1.0669371196754565,
"grad_norm": 0.2255859375,
"learning_rate": 6.4435429344151456e-06,
"loss": 1.0333,
"step": 526
},
{
"epoch": 1.0689655172413792,
"grad_norm": 0.2333984375,
"learning_rate": 6.4367816091954025e-06,
"loss": 1.0785,
"step": 527
},
{
"epoch": 1.0709939148073022,
"grad_norm": 0.2255859375,
"learning_rate": 6.430020283975659e-06,
"loss": 1.0,
"step": 528
},
{
"epoch": 1.0730223123732252,
"grad_norm": 0.2392578125,
"learning_rate": 6.423258958755917e-06,
"loss": 1.0664,
"step": 529
},
{
"epoch": 1.075050709939148,
"grad_norm": 0.29296875,
"learning_rate": 6.416497633536174e-06,
"loss": 1.1181,
"step": 530
},
{
"epoch": 1.077079107505071,
"grad_norm": 0.255859375,
"learning_rate": 6.409736308316431e-06,
"loss": 1.0471,
"step": 531
},
{
"epoch": 1.079107505070994,
"grad_norm": 0.2421875,
"learning_rate": 6.402974983096687e-06,
"loss": 1.0518,
"step": 532
},
{
"epoch": 1.081135902636917,
"grad_norm": 0.2470703125,
"learning_rate": 6.396213657876944e-06,
"loss": 1.0784,
"step": 533
},
{
"epoch": 1.0831643002028397,
"grad_norm": 0.32421875,
"learning_rate": 6.389452332657201e-06,
"loss": 1.0695,
"step": 534
},
{
"epoch": 1.0851926977687627,
"grad_norm": 0.234375,
"learning_rate": 6.382691007437458e-06,
"loss": 1.044,
"step": 535
},
{
"epoch": 1.0872210953346857,
"grad_norm": 0.337890625,
"learning_rate": 6.3759296822177155e-06,
"loss": 1.049,
"step": 536
},
{
"epoch": 1.0892494929006085,
"grad_norm": 0.287109375,
"learning_rate": 6.3691683569979724e-06,
"loss": 1.0108,
"step": 537
},
{
"epoch": 1.0912778904665315,
"grad_norm": 0.234375,
"learning_rate": 6.362407031778229e-06,
"loss": 1.02,
"step": 538
},
{
"epoch": 1.0933062880324544,
"grad_norm": 0.26953125,
"learning_rate": 6.355645706558486e-06,
"loss": 1.0442,
"step": 539
},
{
"epoch": 1.0953346855983772,
"grad_norm": 0.302734375,
"learning_rate": 6.348884381338742e-06,
"loss": 1.0711,
"step": 540
},
{
"epoch": 1.0973630831643002,
"grad_norm": 0.35546875,
"learning_rate": 6.342123056118999e-06,
"loss": 1.0747,
"step": 541
},
{
"epoch": 1.0993914807302232,
"grad_norm": 0.234375,
"learning_rate": 6.335361730899256e-06,
"loss": 1.0477,
"step": 542
},
{
"epoch": 1.101419878296146,
"grad_norm": 0.2314453125,
"learning_rate": 6.328600405679514e-06,
"loss": 1.023,
"step": 543
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.234375,
"learning_rate": 6.321839080459771e-06,
"loss": 1.0482,
"step": 544
},
{
"epoch": 1.105476673427992,
"grad_norm": 0.2470703125,
"learning_rate": 6.315077755240028e-06,
"loss": 1.1157,
"step": 545
},
{
"epoch": 1.1075050709939147,
"grad_norm": 0.2451171875,
"learning_rate": 6.308316430020285e-06,
"loss": 1.0663,
"step": 546
},
{
"epoch": 1.1095334685598377,
"grad_norm": 0.2294921875,
"learning_rate": 6.3015551048005416e-06,
"loss": 1.0461,
"step": 547
},
{
"epoch": 1.1115618661257607,
"grad_norm": 0.2412109375,
"learning_rate": 6.294793779580798e-06,
"loss": 1.0951,
"step": 548
},
{
"epoch": 1.1135902636916835,
"grad_norm": 0.2431640625,
"learning_rate": 6.2880324543610545e-06,
"loss": 1.0817,
"step": 549
},
{
"epoch": 1.1156186612576064,
"grad_norm": 0.2578125,
"learning_rate": 6.281271129141312e-06,
"loss": 1.0373,
"step": 550
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.26953125,
"learning_rate": 6.274509803921569e-06,
"loss": 1.0159,
"step": 551
},
{
"epoch": 1.1196754563894524,
"grad_norm": 0.263671875,
"learning_rate": 6.267748478701826e-06,
"loss": 1.0672,
"step": 552
},
{
"epoch": 1.1217038539553752,
"grad_norm": 0.23046875,
"learning_rate": 6.260987153482083e-06,
"loss": 1.0657,
"step": 553
},
{
"epoch": 1.1237322515212982,
"grad_norm": 0.2275390625,
"learning_rate": 6.25422582826234e-06,
"loss": 1.0105,
"step": 554
},
{
"epoch": 1.1257606490872212,
"grad_norm": 0.2412109375,
"learning_rate": 6.247464503042598e-06,
"loss": 1.0405,
"step": 555
},
{
"epoch": 1.127789046653144,
"grad_norm": 0.2490234375,
"learning_rate": 6.240703177822853e-06,
"loss": 1.0467,
"step": 556
},
{
"epoch": 1.129817444219067,
"grad_norm": 0.2255859375,
"learning_rate": 6.233941852603111e-06,
"loss": 1.0506,
"step": 557
},
{
"epoch": 1.13184584178499,
"grad_norm": 0.2392578125,
"learning_rate": 6.227180527383368e-06,
"loss": 1.0681,
"step": 558
},
{
"epoch": 1.1338742393509127,
"grad_norm": 0.2421875,
"learning_rate": 6.2204192021636245e-06,
"loss": 1.126,
"step": 559
},
{
"epoch": 1.1359026369168357,
"grad_norm": 0.2451171875,
"learning_rate": 6.213657876943881e-06,
"loss": 1.0295,
"step": 560
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.28515625,
"learning_rate": 6.206896551724138e-06,
"loss": 1.076,
"step": 561
},
{
"epoch": 1.1399594320486814,
"grad_norm": 0.240234375,
"learning_rate": 6.200135226504396e-06,
"loss": 1.0124,
"step": 562
},
{
"epoch": 1.1419878296146044,
"grad_norm": 0.2490234375,
"learning_rate": 6.193373901284653e-06,
"loss": 1.1096,
"step": 563
},
{
"epoch": 1.1440162271805274,
"grad_norm": 0.296875,
"learning_rate": 6.186612576064909e-06,
"loss": 1.0413,
"step": 564
},
{
"epoch": 1.1460446247464504,
"grad_norm": 0.2314453125,
"learning_rate": 6.179851250845166e-06,
"loss": 1.0461,
"step": 565
},
{
"epoch": 1.1480730223123732,
"grad_norm": 0.330078125,
"learning_rate": 6.173089925625423e-06,
"loss": 1.0517,
"step": 566
},
{
"epoch": 1.1501014198782962,
"grad_norm": 0.2451171875,
"learning_rate": 6.16632860040568e-06,
"loss": 1.0945,
"step": 567
},
{
"epoch": 1.1521298174442192,
"grad_norm": 0.251953125,
"learning_rate": 6.159567275185937e-06,
"loss": 1.1018,
"step": 568
},
{
"epoch": 1.154158215010142,
"grad_norm": 0.275390625,
"learning_rate": 6.1528059499661945e-06,
"loss": 0.9737,
"step": 569
},
{
"epoch": 1.156186612576065,
"grad_norm": 0.2431640625,
"learning_rate": 6.146044624746451e-06,
"loss": 1.0363,
"step": 570
},
{
"epoch": 1.158215010141988,
"grad_norm": 0.61328125,
"learning_rate": 6.139283299526708e-06,
"loss": 1.071,
"step": 571
},
{
"epoch": 1.1602434077079107,
"grad_norm": 0.2392578125,
"learning_rate": 6.132521974306964e-06,
"loss": 1.0491,
"step": 572
},
{
"epoch": 1.1622718052738337,
"grad_norm": 0.2451171875,
"learning_rate": 6.125760649087221e-06,
"loss": 1.0269,
"step": 573
},
{
"epoch": 1.1643002028397567,
"grad_norm": 0.2490234375,
"learning_rate": 6.118999323867478e-06,
"loss": 1.0467,
"step": 574
},
{
"epoch": 1.1663286004056794,
"grad_norm": 0.25390625,
"learning_rate": 6.112237998647735e-06,
"loss": 1.0575,
"step": 575
},
{
"epoch": 1.1683569979716024,
"grad_norm": 0.2392578125,
"learning_rate": 6.105476673427993e-06,
"loss": 1.0519,
"step": 576
},
{
"epoch": 1.1703853955375254,
"grad_norm": 0.259765625,
"learning_rate": 6.09871534820825e-06,
"loss": 1.0848,
"step": 577
},
{
"epoch": 1.1724137931034484,
"grad_norm": 0.2333984375,
"learning_rate": 6.091954022988507e-06,
"loss": 1.0639,
"step": 578
},
{
"epoch": 1.1744421906693712,
"grad_norm": 0.23828125,
"learning_rate": 6.0851926977687636e-06,
"loss": 1.0619,
"step": 579
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.2392578125,
"learning_rate": 6.07843137254902e-06,
"loss": 1.0422,
"step": 580
},
{
"epoch": 1.178498985801217,
"grad_norm": 0.2294921875,
"learning_rate": 6.0716700473292766e-06,
"loss": 1.0245,
"step": 581
},
{
"epoch": 1.18052738336714,
"grad_norm": 0.2255859375,
"learning_rate": 6.0649087221095335e-06,
"loss": 1.051,
"step": 582
},
{
"epoch": 1.182555780933063,
"grad_norm": 0.298828125,
"learning_rate": 6.058147396889791e-06,
"loss": 1.0893,
"step": 583
},
{
"epoch": 1.184584178498986,
"grad_norm": 0.2412109375,
"learning_rate": 6.051386071670048e-06,
"loss": 1.0841,
"step": 584
},
{
"epoch": 1.1866125760649087,
"grad_norm": 0.2451171875,
"learning_rate": 6.044624746450305e-06,
"loss": 1.0444,
"step": 585
},
{
"epoch": 1.1886409736308317,
"grad_norm": 0.2490234375,
"learning_rate": 6.037863421230562e-06,
"loss": 1.0222,
"step": 586
},
{
"epoch": 1.1906693711967546,
"grad_norm": 0.26171875,
"learning_rate": 6.031102096010819e-06,
"loss": 1.0554,
"step": 587
},
{
"epoch": 1.1926977687626774,
"grad_norm": 0.25,
"learning_rate": 6.024340770791075e-06,
"loss": 1.0436,
"step": 588
},
{
"epoch": 1.1947261663286004,
"grad_norm": 0.255859375,
"learning_rate": 6.017579445571332e-06,
"loss": 1.0239,
"step": 589
},
{
"epoch": 1.1967545638945234,
"grad_norm": 0.2373046875,
"learning_rate": 6.01081812035159e-06,
"loss": 1.0412,
"step": 590
},
{
"epoch": 1.1987829614604462,
"grad_norm": 0.2431640625,
"learning_rate": 6.0040567951318465e-06,
"loss": 1.0504,
"step": 591
},
{
"epoch": 1.2008113590263692,
"grad_norm": 0.236328125,
"learning_rate": 5.9972954699121034e-06,
"loss": 1.0352,
"step": 592
},
{
"epoch": 1.2028397565922921,
"grad_norm": 0.2353515625,
"learning_rate": 5.99053414469236e-06,
"loss": 1.0506,
"step": 593
},
{
"epoch": 1.204868154158215,
"grad_norm": 0.2412109375,
"learning_rate": 5.983772819472617e-06,
"loss": 1.0723,
"step": 594
},
{
"epoch": 1.206896551724138,
"grad_norm": 0.251953125,
"learning_rate": 5.977011494252874e-06,
"loss": 1.0563,
"step": 595
},
{
"epoch": 1.208924949290061,
"grad_norm": 0.244140625,
"learning_rate": 5.97025016903313e-06,
"loss": 1.0346,
"step": 596
},
{
"epoch": 1.2109533468559839,
"grad_norm": 0.2451171875,
"learning_rate": 5.963488843813387e-06,
"loss": 1.0636,
"step": 597
},
{
"epoch": 1.2129817444219066,
"grad_norm": 0.236328125,
"learning_rate": 5.956727518593645e-06,
"loss": 1.0694,
"step": 598
},
{
"epoch": 1.2150101419878296,
"grad_norm": 0.251953125,
"learning_rate": 5.949966193373902e-06,
"loss": 1.0332,
"step": 599
},
{
"epoch": 1.2170385395537526,
"grad_norm": 0.234375,
"learning_rate": 5.943204868154159e-06,
"loss": 1.0194,
"step": 600
},
{
"epoch": 1.2190669371196754,
"grad_norm": 0.37890625,
"learning_rate": 5.936443542934416e-06,
"loss": 1.0426,
"step": 601
},
{
"epoch": 1.2210953346855984,
"grad_norm": 0.2412109375,
"learning_rate": 5.9296822177146725e-06,
"loss": 1.0372,
"step": 602
},
{
"epoch": 1.2231237322515214,
"grad_norm": 0.50390625,
"learning_rate": 5.92292089249493e-06,
"loss": 0.9994,
"step": 603
},
{
"epoch": 1.2251521298174441,
"grad_norm": 0.263671875,
"learning_rate": 5.9161595672751855e-06,
"loss": 1.0621,
"step": 604
},
{
"epoch": 1.2271805273833671,
"grad_norm": 0.236328125,
"learning_rate": 5.909398242055443e-06,
"loss": 1.0593,
"step": 605
},
{
"epoch": 1.2292089249492901,
"grad_norm": 0.25,
"learning_rate": 5.9026369168357e-06,
"loss": 1.0912,
"step": 606
},
{
"epoch": 1.231237322515213,
"grad_norm": 0.23828125,
"learning_rate": 5.895875591615957e-06,
"loss": 1.0958,
"step": 607
},
{
"epoch": 1.2332657200811359,
"grad_norm": 0.2392578125,
"learning_rate": 5.889114266396214e-06,
"loss": 0.997,
"step": 608
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.248046875,
"learning_rate": 5.882352941176471e-06,
"loss": 1.1018,
"step": 609
},
{
"epoch": 1.2373225152129819,
"grad_norm": 0.244140625,
"learning_rate": 5.875591615956729e-06,
"loss": 1.0352,
"step": 610
},
{
"epoch": 1.2393509127789046,
"grad_norm": 0.265625,
"learning_rate": 5.868830290736986e-06,
"loss": 1.0638,
"step": 611
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.24609375,
"learning_rate": 5.862068965517242e-06,
"loss": 1.0008,
"step": 612
},
{
"epoch": 1.2434077079107504,
"grad_norm": 0.251953125,
"learning_rate": 5.8553076402974986e-06,
"loss": 1.0329,
"step": 613
},
{
"epoch": 1.2454361054766734,
"grad_norm": 0.2412109375,
"learning_rate": 5.8485463150777555e-06,
"loss": 1.0629,
"step": 614
},
{
"epoch": 1.2474645030425964,
"grad_norm": 0.240234375,
"learning_rate": 5.841784989858012e-06,
"loss": 1.0493,
"step": 615
},
{
"epoch": 1.2494929006085194,
"grad_norm": 0.25,
"learning_rate": 5.835023664638269e-06,
"loss": 1.0285,
"step": 616
},
{
"epoch": 1.2515212981744421,
"grad_norm": 0.2392578125,
"learning_rate": 5.828262339418527e-06,
"loss": 1.0356,
"step": 617
},
{
"epoch": 1.2535496957403651,
"grad_norm": 0.31640625,
"learning_rate": 5.821501014198784e-06,
"loss": 1.0757,
"step": 618
},
{
"epoch": 1.2555780933062881,
"grad_norm": 0.267578125,
"learning_rate": 5.814739688979041e-06,
"loss": 1.0564,
"step": 619
},
{
"epoch": 1.2576064908722109,
"grad_norm": 0.25,
"learning_rate": 5.807978363759297e-06,
"loss": 1.0453,
"step": 620
},
{
"epoch": 1.2596348884381339,
"grad_norm": 0.24609375,
"learning_rate": 5.801217038539554e-06,
"loss": 1.0489,
"step": 621
},
{
"epoch": 1.2616632860040569,
"grad_norm": 0.25390625,
"learning_rate": 5.794455713319811e-06,
"loss": 1.0171,
"step": 622
},
{
"epoch": 1.2636916835699799,
"grad_norm": 0.263671875,
"learning_rate": 5.787694388100068e-06,
"loss": 1.0695,
"step": 623
},
{
"epoch": 1.2657200811359026,
"grad_norm": 0.2431640625,
"learning_rate": 5.7809330628803254e-06,
"loss": 1.0492,
"step": 624
},
{
"epoch": 1.2677484787018256,
"grad_norm": 0.3984375,
"learning_rate": 5.774171737660582e-06,
"loss": 1.0291,
"step": 625
},
{
"epoch": 1.2697768762677484,
"grad_norm": 0.318359375,
"learning_rate": 5.767410412440839e-06,
"loss": 1.0349,
"step": 626
},
{
"epoch": 1.2718052738336714,
"grad_norm": 0.23046875,
"learning_rate": 5.760649087221096e-06,
"loss": 1.0465,
"step": 627
},
{
"epoch": 1.2738336713995944,
"grad_norm": 0.26171875,
"learning_rate": 5.753887762001352e-06,
"loss": 1.0746,
"step": 628
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.2578125,
"learning_rate": 5.747126436781609e-06,
"loss": 1.058,
"step": 629
},
{
"epoch": 1.2778904665314401,
"grad_norm": 0.255859375,
"learning_rate": 5.740365111561866e-06,
"loss": 1.0603,
"step": 630
},
{
"epoch": 1.279918864097363,
"grad_norm": 0.251953125,
"learning_rate": 5.733603786342124e-06,
"loss": 1.0767,
"step": 631
},
{
"epoch": 1.2819472616632859,
"grad_norm": 0.248046875,
"learning_rate": 5.726842461122381e-06,
"loss": 1.0272,
"step": 632
},
{
"epoch": 1.2839756592292089,
"grad_norm": 0.2373046875,
"learning_rate": 5.720081135902638e-06,
"loss": 1.0608,
"step": 633
},
{
"epoch": 1.2860040567951319,
"grad_norm": 0.2421875,
"learning_rate": 5.7133198106828946e-06,
"loss": 1.0151,
"step": 634
},
{
"epoch": 1.2880324543610548,
"grad_norm": 0.287109375,
"learning_rate": 5.7065584854631515e-06,
"loss": 1.0519,
"step": 635
},
{
"epoch": 1.2900608519269776,
"grad_norm": 0.23828125,
"learning_rate": 5.6997971602434075e-06,
"loss": 1.0587,
"step": 636
},
{
"epoch": 1.2920892494929006,
"grad_norm": 0.2353515625,
"learning_rate": 5.6930358350236644e-06,
"loss": 1.0706,
"step": 637
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.2314453125,
"learning_rate": 5.686274509803922e-06,
"loss": 1.0084,
"step": 638
},
{
"epoch": 1.2961460446247464,
"grad_norm": 0.25,
"learning_rate": 5.679513184584179e-06,
"loss": 1.0433,
"step": 639
},
{
"epoch": 1.2981744421906694,
"grad_norm": 0.2353515625,
"learning_rate": 5.672751859364436e-06,
"loss": 1.0044,
"step": 640
},
{
"epoch": 1.3002028397565923,
"grad_norm": 0.248046875,
"learning_rate": 5.665990534144693e-06,
"loss": 0.9463,
"step": 641
},
{
"epoch": 1.3022312373225153,
"grad_norm": 0.23828125,
"learning_rate": 5.65922920892495e-06,
"loss": 1.0042,
"step": 642
},
{
"epoch": 1.304259634888438,
"grad_norm": 0.322265625,
"learning_rate": 5.652467883705207e-06,
"loss": 1.0487,
"step": 643
},
{
"epoch": 1.306288032454361,
"grad_norm": 0.2412109375,
"learning_rate": 5.645706558485463e-06,
"loss": 1.0599,
"step": 644
},
{
"epoch": 1.3083164300202839,
"grad_norm": 0.240234375,
"learning_rate": 5.638945233265721e-06,
"loss": 1.0294,
"step": 645
},
{
"epoch": 1.3103448275862069,
"grad_norm": 0.24609375,
"learning_rate": 5.6321839080459775e-06,
"loss": 1.0401,
"step": 646
},
{
"epoch": 1.3123732251521298,
"grad_norm": 0.248046875,
"learning_rate": 5.625422582826234e-06,
"loss": 1.0143,
"step": 647
},
{
"epoch": 1.3144016227180528,
"grad_norm": 0.2314453125,
"learning_rate": 5.618661257606491e-06,
"loss": 1.0506,
"step": 648
},
{
"epoch": 1.3164300202839756,
"grad_norm": 0.23828125,
"learning_rate": 5.611899932386748e-06,
"loss": 1.0314,
"step": 649
},
{
"epoch": 1.3184584178498986,
"grad_norm": 0.2333984375,
"learning_rate": 5.605138607167005e-06,
"loss": 1.0434,
"step": 650
},
{
"epoch": 1.3204868154158216,
"grad_norm": 0.2890625,
"learning_rate": 5.598377281947263e-06,
"loss": 1.0598,
"step": 651
},
{
"epoch": 1.3225152129817443,
"grad_norm": 0.28515625,
"learning_rate": 5.591615956727519e-06,
"loss": 1.011,
"step": 652
},
{
"epoch": 1.3245436105476673,
"grad_norm": 0.25390625,
"learning_rate": 5.584854631507776e-06,
"loss": 1.0506,
"step": 653
},
{
"epoch": 1.3265720081135903,
"grad_norm": 0.24609375,
"learning_rate": 5.578093306288033e-06,
"loss": 1.0387,
"step": 654
},
{
"epoch": 1.3286004056795133,
"grad_norm": 0.2490234375,
"learning_rate": 5.57133198106829e-06,
"loss": 1.0571,
"step": 655
},
{
"epoch": 1.330628803245436,
"grad_norm": 0.26953125,
"learning_rate": 5.564570655848547e-06,
"loss": 1.0351,
"step": 656
},
{
"epoch": 1.332657200811359,
"grad_norm": 0.2431640625,
"learning_rate": 5.5578093306288035e-06,
"loss": 1.0454,
"step": 657
},
{
"epoch": 1.3346855983772818,
"grad_norm": 0.248046875,
"learning_rate": 5.551048005409061e-06,
"loss": 1.0374,
"step": 658
},
{
"epoch": 1.3367139959432048,
"grad_norm": 0.279296875,
"learning_rate": 5.544286680189318e-06,
"loss": 1.0395,
"step": 659
},
{
"epoch": 1.3387423935091278,
"grad_norm": 0.2490234375,
"learning_rate": 5.537525354969574e-06,
"loss": 0.989,
"step": 660
},
{
"epoch": 1.3407707910750508,
"grad_norm": 0.267578125,
"learning_rate": 5.530764029749831e-06,
"loss": 1.0449,
"step": 661
},
{
"epoch": 1.3427991886409736,
"grad_norm": 0.23828125,
"learning_rate": 5.524002704530088e-06,
"loss": 0.9985,
"step": 662
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.275390625,
"learning_rate": 5.517241379310345e-06,
"loss": 1.0594,
"step": 663
},
{
"epoch": 1.3468559837728193,
"grad_norm": 0.2353515625,
"learning_rate": 5.510480054090602e-06,
"loss": 1.0711,
"step": 664
},
{
"epoch": 1.3488843813387423,
"grad_norm": 0.251953125,
"learning_rate": 5.50371872887086e-06,
"loss": 1.0372,
"step": 665
},
{
"epoch": 1.3509127789046653,
"grad_norm": 0.25390625,
"learning_rate": 5.4969574036511166e-06,
"loss": 1.0782,
"step": 666
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.26171875,
"learning_rate": 5.4901960784313735e-06,
"loss": 1.0556,
"step": 667
},
{
"epoch": 1.354969574036511,
"grad_norm": 0.30078125,
"learning_rate": 5.4834347532116295e-06,
"loss": 1.0984,
"step": 668
},
{
"epoch": 1.356997971602434,
"grad_norm": 0.2431640625,
"learning_rate": 5.4766734279918865e-06,
"loss": 0.9965,
"step": 669
},
{
"epoch": 1.359026369168357,
"grad_norm": 0.279296875,
"learning_rate": 5.469912102772143e-06,
"loss": 1.0513,
"step": 670
},
{
"epoch": 1.3610547667342798,
"grad_norm": 0.259765625,
"learning_rate": 5.4631507775524e-06,
"loss": 1.0406,
"step": 671
},
{
"epoch": 1.3630831643002028,
"grad_norm": 0.2431640625,
"learning_rate": 5.456389452332658e-06,
"loss": 1.0864,
"step": 672
},
{
"epoch": 1.3651115618661258,
"grad_norm": 0.275390625,
"learning_rate": 5.449628127112915e-06,
"loss": 1.0379,
"step": 673
},
{
"epoch": 1.3671399594320488,
"grad_norm": 0.51171875,
"learning_rate": 5.442866801893172e-06,
"loss": 1.0279,
"step": 674
},
{
"epoch": 1.3691683569979716,
"grad_norm": 0.796875,
"learning_rate": 5.436105476673429e-06,
"loss": 1.0537,
"step": 675
},
{
"epoch": 1.3711967545638946,
"grad_norm": 0.248046875,
"learning_rate": 5.429344151453685e-06,
"loss": 1.022,
"step": 676
},
{
"epoch": 1.3732251521298173,
"grad_norm": 0.271484375,
"learning_rate": 5.422582826233942e-06,
"loss": 1.0154,
"step": 677
},
{
"epoch": 1.3752535496957403,
"grad_norm": 0.25390625,
"learning_rate": 5.415821501014199e-06,
"loss": 1.0449,
"step": 678
},
{
"epoch": 1.3772819472616633,
"grad_norm": 0.24609375,
"learning_rate": 5.409060175794456e-06,
"loss": 1.0143,
"step": 679
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.2314453125,
"learning_rate": 5.402298850574713e-06,
"loss": 1.047,
"step": 680
},
{
"epoch": 1.381338742393509,
"grad_norm": 0.2431640625,
"learning_rate": 5.39553752535497e-06,
"loss": 1.036,
"step": 681
},
{
"epoch": 1.383367139959432,
"grad_norm": 0.236328125,
"learning_rate": 5.388776200135227e-06,
"loss": 1.076,
"step": 682
},
{
"epoch": 1.385395537525355,
"grad_norm": 0.2392578125,
"learning_rate": 5.382014874915484e-06,
"loss": 1.0543,
"step": 683
},
{
"epoch": 1.3874239350912778,
"grad_norm": 0.28125,
"learning_rate": 5.37525354969574e-06,
"loss": 1.0474,
"step": 684
},
{
"epoch": 1.3894523326572008,
"grad_norm": 0.283203125,
"learning_rate": 5.368492224475997e-06,
"loss": 1.0287,
"step": 685
},
{
"epoch": 1.3914807302231238,
"grad_norm": 0.36328125,
"learning_rate": 5.361730899256255e-06,
"loss": 1.0412,
"step": 686
},
{
"epoch": 1.3935091277890468,
"grad_norm": 0.26171875,
"learning_rate": 5.354969574036512e-06,
"loss": 1.1062,
"step": 687
},
{
"epoch": 1.3955375253549696,
"grad_norm": 0.23046875,
"learning_rate": 5.348208248816769e-06,
"loss": 1.0338,
"step": 688
},
{
"epoch": 1.3975659229208925,
"grad_norm": 0.251953125,
"learning_rate": 5.3414469235970255e-06,
"loss": 1.0185,
"step": 689
},
{
"epoch": 1.3995943204868153,
"grad_norm": 0.251953125,
"learning_rate": 5.3346855983772824e-06,
"loss": 1.0041,
"step": 690
},
{
"epoch": 1.4016227180527383,
"grad_norm": 0.263671875,
"learning_rate": 5.32792427315754e-06,
"loss": 1.0337,
"step": 691
},
{
"epoch": 1.4036511156186613,
"grad_norm": 0.2470703125,
"learning_rate": 5.321162947937795e-06,
"loss": 1.0103,
"step": 692
},
{
"epoch": 1.4056795131845843,
"grad_norm": 0.302734375,
"learning_rate": 5.314401622718053e-06,
"loss": 1.0466,
"step": 693
},
{
"epoch": 1.407707910750507,
"grad_norm": 0.27734375,
"learning_rate": 5.30764029749831e-06,
"loss": 1.0092,
"step": 694
},
{
"epoch": 1.40973630831643,
"grad_norm": 0.25,
"learning_rate": 5.300878972278567e-06,
"loss": 1.0347,
"step": 695
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.24609375,
"learning_rate": 5.294117647058824e-06,
"loss": 1.0402,
"step": 696
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.255859375,
"learning_rate": 5.287356321839081e-06,
"loss": 1.0706,
"step": 697
},
{
"epoch": 1.4158215010141988,
"grad_norm": 0.248046875,
"learning_rate": 5.280594996619339e-06,
"loss": 1.0361,
"step": 698
},
{
"epoch": 1.4178498985801218,
"grad_norm": 0.251953125,
"learning_rate": 5.2738336713995955e-06,
"loss": 1.0189,
"step": 699
},
{
"epoch": 1.4198782961460445,
"grad_norm": 0.30859375,
"learning_rate": 5.2670723461798516e-06,
"loss": 1.0324,
"step": 700
},
{
"epoch": 1.4219066937119675,
"grad_norm": 0.248046875,
"learning_rate": 5.2603110209601085e-06,
"loss": 1.0408,
"step": 701
},
{
"epoch": 1.4239350912778905,
"grad_norm": 0.271484375,
"learning_rate": 5.253549695740365e-06,
"loss": 1.0557,
"step": 702
},
{
"epoch": 1.4259634888438133,
"grad_norm": 0.2578125,
"learning_rate": 5.246788370520622e-06,
"loss": 1.0519,
"step": 703
},
{
"epoch": 1.4279918864097363,
"grad_norm": 0.365234375,
"learning_rate": 5.240027045300879e-06,
"loss": 0.9978,
"step": 704
},
{
"epoch": 1.4300202839756593,
"grad_norm": 0.2578125,
"learning_rate": 5.233265720081136e-06,
"loss": 0.9964,
"step": 705
},
{
"epoch": 1.4320486815415823,
"grad_norm": 0.25390625,
"learning_rate": 5.226504394861394e-06,
"loss": 1.0495,
"step": 706
},
{
"epoch": 1.434077079107505,
"grad_norm": 0.259765625,
"learning_rate": 5.219743069641651e-06,
"loss": 1.0334,
"step": 707
},
{
"epoch": 1.436105476673428,
"grad_norm": 0.310546875,
"learning_rate": 5.212981744421907e-06,
"loss": 1.0414,
"step": 708
},
{
"epoch": 1.4381338742393508,
"grad_norm": 0.275390625,
"learning_rate": 5.206220419202164e-06,
"loss": 1.0865,
"step": 709
},
{
"epoch": 1.4401622718052738,
"grad_norm": 0.255859375,
"learning_rate": 5.199459093982421e-06,
"loss": 1.0293,
"step": 710
},
{
"epoch": 1.4421906693711968,
"grad_norm": 0.25390625,
"learning_rate": 5.192697768762678e-06,
"loss": 1.0371,
"step": 711
},
{
"epoch": 1.4442190669371198,
"grad_norm": 0.267578125,
"learning_rate": 5.1859364435429345e-06,
"loss": 1.0549,
"step": 712
},
{
"epoch": 1.4462474645030425,
"grad_norm": 0.2490234375,
"learning_rate": 5.179175118323192e-06,
"loss": 1.0262,
"step": 713
},
{
"epoch": 1.4482758620689655,
"grad_norm": 0.240234375,
"learning_rate": 5.172413793103449e-06,
"loss": 1.0362,
"step": 714
},
{
"epoch": 1.4503042596348885,
"grad_norm": 0.259765625,
"learning_rate": 5.165652467883706e-06,
"loss": 1.0823,
"step": 715
},
{
"epoch": 1.4523326572008113,
"grad_norm": 0.2333984375,
"learning_rate": 5.158891142663962e-06,
"loss": 1.0068,
"step": 716
},
{
"epoch": 1.4543610547667343,
"grad_norm": 0.263671875,
"learning_rate": 5.152129817444219e-06,
"loss": 1.0176,
"step": 717
},
{
"epoch": 1.4563894523326573,
"grad_norm": 0.27734375,
"learning_rate": 5.145368492224476e-06,
"loss": 1.0531,
"step": 718
},
{
"epoch": 1.4584178498985803,
"grad_norm": 0.2470703125,
"learning_rate": 5.138607167004733e-06,
"loss": 1.003,
"step": 719
},
{
"epoch": 1.460446247464503,
"grad_norm": 0.37109375,
"learning_rate": 5.131845841784991e-06,
"loss": 1.097,
"step": 720
},
{
"epoch": 1.462474645030426,
"grad_norm": 0.255859375,
"learning_rate": 5.1250845165652475e-06,
"loss": 1.0326,
"step": 721
},
{
"epoch": 1.4645030425963488,
"grad_norm": 0.259765625,
"learning_rate": 5.1183231913455045e-06,
"loss": 1.0651,
"step": 722
},
{
"epoch": 1.4665314401622718,
"grad_norm": 0.234375,
"learning_rate": 5.111561866125761e-06,
"loss": 1.0116,
"step": 723
},
{
"epoch": 1.4685598377281948,
"grad_norm": 0.2734375,
"learning_rate": 5.1048005409060174e-06,
"loss": 1.0215,
"step": 724
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.2451171875,
"learning_rate": 5.098039215686274e-06,
"loss": 1.0207,
"step": 725
},
{
"epoch": 1.4726166328600405,
"grad_norm": 0.26171875,
"learning_rate": 5.091277890466531e-06,
"loss": 1.0264,
"step": 726
},
{
"epoch": 1.4746450304259635,
"grad_norm": 0.251953125,
"learning_rate": 5.084516565246789e-06,
"loss": 1.0648,
"step": 727
},
{
"epoch": 1.4766734279918863,
"grad_norm": 0.275390625,
"learning_rate": 5.077755240027046e-06,
"loss": 1.0485,
"step": 728
},
{
"epoch": 1.4787018255578093,
"grad_norm": 0.2490234375,
"learning_rate": 5.070993914807303e-06,
"loss": 1.0618,
"step": 729
},
{
"epoch": 1.4807302231237323,
"grad_norm": 0.28125,
"learning_rate": 5.06423258958756e-06,
"loss": 1.0572,
"step": 730
},
{
"epoch": 1.4827586206896552,
"grad_norm": 0.259765625,
"learning_rate": 5.057471264367817e-06,
"loss": 1.0321,
"step": 731
},
{
"epoch": 1.484787018255578,
"grad_norm": 0.259765625,
"learning_rate": 5.050709939148073e-06,
"loss": 1.0498,
"step": 732
},
{
"epoch": 1.486815415821501,
"grad_norm": 0.5,
"learning_rate": 5.04394861392833e-06,
"loss": 1.0393,
"step": 733
},
{
"epoch": 1.488843813387424,
"grad_norm": 0.24609375,
"learning_rate": 5.037187288708587e-06,
"loss": 1.047,
"step": 734
},
{
"epoch": 1.4908722109533468,
"grad_norm": 0.267578125,
"learning_rate": 5.030425963488844e-06,
"loss": 1.128,
"step": 735
},
{
"epoch": 1.4929006085192698,
"grad_norm": 0.24609375,
"learning_rate": 5.023664638269101e-06,
"loss": 1.0581,
"step": 736
},
{
"epoch": 1.4949290060851927,
"grad_norm": 0.2294921875,
"learning_rate": 5.016903313049358e-06,
"loss": 0.9735,
"step": 737
},
{
"epoch": 1.4969574036511157,
"grad_norm": 0.333984375,
"learning_rate": 5.010141987829615e-06,
"loss": 1.0535,
"step": 738
},
{
"epoch": 1.4989858012170385,
"grad_norm": 0.2392578125,
"learning_rate": 5.003380662609873e-06,
"loss": 1.0524,
"step": 739
},
{
"epoch": 1.5010141987829615,
"grad_norm": 0.2392578125,
"learning_rate": 4.996619337390129e-06,
"loss": 1.0575,
"step": 740
},
{
"epoch": 1.5030425963488843,
"grad_norm": 0.255859375,
"learning_rate": 4.989858012170386e-06,
"loss": 1.008,
"step": 741
},
{
"epoch": 1.5050709939148073,
"grad_norm": 0.244140625,
"learning_rate": 4.983096686950643e-06,
"loss": 1.0194,
"step": 742
},
{
"epoch": 1.5070993914807302,
"grad_norm": 0.2451171875,
"learning_rate": 4.9763353617309e-06,
"loss": 1.0579,
"step": 743
},
{
"epoch": 1.5091277890466532,
"grad_norm": 0.3359375,
"learning_rate": 4.9695740365111565e-06,
"loss": 1.0465,
"step": 744
},
{
"epoch": 1.5111561866125762,
"grad_norm": 0.2431640625,
"learning_rate": 4.962812711291413e-06,
"loss": 1.044,
"step": 745
},
{
"epoch": 1.513184584178499,
"grad_norm": 0.248046875,
"learning_rate": 4.95605138607167e-06,
"loss": 1.048,
"step": 746
},
{
"epoch": 1.5152129817444218,
"grad_norm": 0.2412109375,
"learning_rate": 4.949290060851927e-06,
"loss": 1.0386,
"step": 747
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.267578125,
"learning_rate": 4.942528735632184e-06,
"loss": 1.0929,
"step": 748
},
{
"epoch": 1.5192697768762677,
"grad_norm": 0.337890625,
"learning_rate": 4.935767410412441e-06,
"loss": 1.0158,
"step": 749
},
{
"epoch": 1.5212981744421907,
"grad_norm": 0.248046875,
"learning_rate": 4.929006085192698e-06,
"loss": 1.0559,
"step": 750
},
{
"epoch": 1.5233265720081137,
"grad_norm": 0.263671875,
"learning_rate": 4.922244759972955e-06,
"loss": 1.0256,
"step": 751
},
{
"epoch": 1.5253549695740365,
"grad_norm": 0.294921875,
"learning_rate": 4.915483434753212e-06,
"loss": 1.0062,
"step": 752
},
{
"epoch": 1.5273833671399595,
"grad_norm": 0.251953125,
"learning_rate": 4.9087221095334696e-06,
"loss": 1.0676,
"step": 753
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.2392578125,
"learning_rate": 4.901960784313726e-06,
"loss": 1.0193,
"step": 754
},
{
"epoch": 1.5314401622718052,
"grad_norm": 0.2421875,
"learning_rate": 4.8951994590939825e-06,
"loss": 1.0317,
"step": 755
},
{
"epoch": 1.5334685598377282,
"grad_norm": 0.2353515625,
"learning_rate": 4.8884381338742394e-06,
"loss": 1.0093,
"step": 756
},
{
"epoch": 1.5354969574036512,
"grad_norm": 0.283203125,
"learning_rate": 4.881676808654497e-06,
"loss": 1.0976,
"step": 757
},
{
"epoch": 1.537525354969574,
"grad_norm": 0.236328125,
"learning_rate": 4.874915483434753e-06,
"loss": 1.0135,
"step": 758
},
{
"epoch": 1.539553752535497,
"grad_norm": 0.2421875,
"learning_rate": 4.86815415821501e-06,
"loss": 1.02,
"step": 759
},
{
"epoch": 1.5415821501014197,
"grad_norm": 0.259765625,
"learning_rate": 4.861392832995268e-06,
"loss": 1.0506,
"step": 760
},
{
"epoch": 1.5436105476673427,
"grad_norm": 0.28515625,
"learning_rate": 4.854631507775525e-06,
"loss": 1.0188,
"step": 761
},
{
"epoch": 1.5456389452332657,
"grad_norm": 0.26171875,
"learning_rate": 4.847870182555781e-06,
"loss": 1.0333,
"step": 762
},
{
"epoch": 1.5476673427991887,
"grad_norm": 0.248046875,
"learning_rate": 4.841108857336038e-06,
"loss": 1.0798,
"step": 763
},
{
"epoch": 1.5496957403651117,
"grad_norm": 0.240234375,
"learning_rate": 4.834347532116296e-06,
"loss": 1.01,
"step": 764
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.333984375,
"learning_rate": 4.8275862068965525e-06,
"loss": 1.081,
"step": 765
},
{
"epoch": 1.5537525354969572,
"grad_norm": 0.248046875,
"learning_rate": 4.8208248816768086e-06,
"loss": 1.0455,
"step": 766
},
{
"epoch": 1.5557809330628802,
"grad_norm": 0.240234375,
"learning_rate": 4.814063556457066e-06,
"loss": 1.0435,
"step": 767
},
{
"epoch": 1.5578093306288032,
"grad_norm": 0.369140625,
"learning_rate": 4.807302231237323e-06,
"loss": 1.0402,
"step": 768
},
{
"epoch": 1.5598377281947262,
"grad_norm": 0.3046875,
"learning_rate": 4.80054090601758e-06,
"loss": 1.0597,
"step": 769
},
{
"epoch": 1.5618661257606492,
"grad_norm": 0.248046875,
"learning_rate": 4.793779580797836e-06,
"loss": 1.0579,
"step": 770
},
{
"epoch": 1.563894523326572,
"grad_norm": 0.251953125,
"learning_rate": 4.787018255578094e-06,
"loss": 1.0255,
"step": 771
},
{
"epoch": 1.565922920892495,
"grad_norm": 0.2470703125,
"learning_rate": 4.780256930358351e-06,
"loss": 1.0427,
"step": 772
},
{
"epoch": 1.5679513184584177,
"grad_norm": 0.24609375,
"learning_rate": 4.773495605138608e-06,
"loss": 1.0013,
"step": 773
},
{
"epoch": 1.5699797160243407,
"grad_norm": 0.33984375,
"learning_rate": 4.766734279918865e-06,
"loss": 1.0138,
"step": 774
},
{
"epoch": 1.5720081135902637,
"grad_norm": 0.275390625,
"learning_rate": 4.759972954699122e-06,
"loss": 1.0689,
"step": 775
},
{
"epoch": 1.5740365111561867,
"grad_norm": 0.2392578125,
"learning_rate": 4.7532116294793785e-06,
"loss": 1.0476,
"step": 776
},
{
"epoch": 1.5760649087221097,
"grad_norm": 0.2578125,
"learning_rate": 4.7464503042596354e-06,
"loss": 1.0279,
"step": 777
},
{
"epoch": 1.5780933062880325,
"grad_norm": 0.255859375,
"learning_rate": 4.739688979039892e-06,
"loss": 1.0612,
"step": 778
},
{
"epoch": 1.5801217038539552,
"grad_norm": 0.255859375,
"learning_rate": 4.732927653820149e-06,
"loss": 1.0301,
"step": 779
},
{
"epoch": 1.5821501014198782,
"grad_norm": 0.302734375,
"learning_rate": 4.726166328600406e-06,
"loss": 1.0758,
"step": 780
},
{
"epoch": 1.5841784989858012,
"grad_norm": 0.306640625,
"learning_rate": 4.719405003380663e-06,
"loss": 1.0069,
"step": 781
},
{
"epoch": 1.5862068965517242,
"grad_norm": 0.271484375,
"learning_rate": 4.71264367816092e-06,
"loss": 1.0242,
"step": 782
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.265625,
"learning_rate": 4.705882352941177e-06,
"loss": 1.109,
"step": 783
},
{
"epoch": 1.59026369168357,
"grad_norm": 0.259765625,
"learning_rate": 4.699121027721434e-06,
"loss": 0.9922,
"step": 784
},
{
"epoch": 1.592292089249493,
"grad_norm": 0.267578125,
"learning_rate": 4.692359702501691e-06,
"loss": 1.0403,
"step": 785
},
{
"epoch": 1.5943204868154157,
"grad_norm": 0.26953125,
"learning_rate": 4.685598377281948e-06,
"loss": 0.9909,
"step": 786
},
{
"epoch": 1.5963488843813387,
"grad_norm": 0.267578125,
"learning_rate": 4.6788370520622046e-06,
"loss": 1.0451,
"step": 787
},
{
"epoch": 1.5983772819472617,
"grad_norm": 0.2431640625,
"learning_rate": 4.6720757268424615e-06,
"loss": 1.0213,
"step": 788
},
{
"epoch": 1.6004056795131847,
"grad_norm": 0.236328125,
"learning_rate": 4.665314401622718e-06,
"loss": 0.9739,
"step": 789
},
{
"epoch": 1.6024340770791075,
"grad_norm": 0.271484375,
"learning_rate": 4.658553076402975e-06,
"loss": 1.0517,
"step": 790
},
{
"epoch": 1.6044624746450304,
"grad_norm": 0.26171875,
"learning_rate": 4.651791751183232e-06,
"loss": 1.0491,
"step": 791
},
{
"epoch": 1.6064908722109532,
"grad_norm": 0.2490234375,
"learning_rate": 4.645030425963489e-06,
"loss": 1.0499,
"step": 792
},
{
"epoch": 1.6085192697768762,
"grad_norm": 0.255859375,
"learning_rate": 4.638269100743746e-06,
"loss": 1.0303,
"step": 793
},
{
"epoch": 1.6105476673427992,
"grad_norm": 0.2734375,
"learning_rate": 4.631507775524003e-06,
"loss": 1.0469,
"step": 794
},
{
"epoch": 1.6125760649087222,
"grad_norm": 0.2734375,
"learning_rate": 4.62474645030426e-06,
"loss": 1.0545,
"step": 795
},
{
"epoch": 1.6146044624746452,
"grad_norm": 0.255859375,
"learning_rate": 4.617985125084517e-06,
"loss": 1.0506,
"step": 796
},
{
"epoch": 1.616632860040568,
"grad_norm": 0.3984375,
"learning_rate": 4.611223799864774e-06,
"loss": 1.0621,
"step": 797
},
{
"epoch": 1.6186612576064907,
"grad_norm": 0.373046875,
"learning_rate": 4.604462474645031e-06,
"loss": 1.0528,
"step": 798
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.265625,
"learning_rate": 4.5977011494252875e-06,
"loss": 1.0781,
"step": 799
},
{
"epoch": 1.6227180527383367,
"grad_norm": 0.27734375,
"learning_rate": 4.590939824205544e-06,
"loss": 1.053,
"step": 800
},
{
"epoch": 1.6247464503042597,
"grad_norm": 0.25390625,
"learning_rate": 4.584178498985802e-06,
"loss": 1.0392,
"step": 801
},
{
"epoch": 1.6267748478701827,
"grad_norm": 0.25390625,
"learning_rate": 4.577417173766058e-06,
"loss": 1.0657,
"step": 802
},
{
"epoch": 1.6288032454361054,
"grad_norm": 0.25390625,
"learning_rate": 4.570655848546315e-06,
"loss": 1.0228,
"step": 803
},
{
"epoch": 1.6308316430020284,
"grad_norm": 0.265625,
"learning_rate": 4.563894523326572e-06,
"loss": 1.0226,
"step": 804
},
{
"epoch": 1.6328600405679512,
"grad_norm": 0.259765625,
"learning_rate": 4.55713319810683e-06,
"loss": 1.0523,
"step": 805
},
{
"epoch": 1.6348884381338742,
"grad_norm": 0.255859375,
"learning_rate": 4.550371872887086e-06,
"loss": 1.011,
"step": 806
},
{
"epoch": 1.6369168356997972,
"grad_norm": 0.2578125,
"learning_rate": 4.543610547667343e-06,
"loss": 1.0638,
"step": 807
},
{
"epoch": 1.6389452332657202,
"grad_norm": 0.26953125,
"learning_rate": 4.5368492224476005e-06,
"loss": 1.0666,
"step": 808
},
{
"epoch": 1.6409736308316432,
"grad_norm": 0.2578125,
"learning_rate": 4.5300878972278575e-06,
"loss": 1.0697,
"step": 809
},
{
"epoch": 1.643002028397566,
"grad_norm": 0.25,
"learning_rate": 4.5233265720081135e-06,
"loss": 1.0536,
"step": 810
},
{
"epoch": 1.6450304259634887,
"grad_norm": 0.328125,
"learning_rate": 4.5165652467883704e-06,
"loss": 1.0281,
"step": 811
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.3984375,
"learning_rate": 4.509803921568628e-06,
"loss": 1.0882,
"step": 812
},
{
"epoch": 1.6490872210953347,
"grad_norm": 0.255859375,
"learning_rate": 4.503042596348885e-06,
"loss": 1.0889,
"step": 813
},
{
"epoch": 1.6511156186612577,
"grad_norm": 0.357421875,
"learning_rate": 4.496281271129141e-06,
"loss": 1.0441,
"step": 814
},
{
"epoch": 1.6531440162271807,
"grad_norm": 0.25,
"learning_rate": 4.489519945909399e-06,
"loss": 1.075,
"step": 815
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.255859375,
"learning_rate": 4.482758620689656e-06,
"loss": 1.0429,
"step": 816
},
{
"epoch": 1.6572008113590264,
"grad_norm": 0.2490234375,
"learning_rate": 4.475997295469913e-06,
"loss": 1.0555,
"step": 817
},
{
"epoch": 1.6592292089249492,
"grad_norm": 0.279296875,
"learning_rate": 4.469235970250169e-06,
"loss": 1.0478,
"step": 818
},
{
"epoch": 1.6612576064908722,
"grad_norm": 0.265625,
"learning_rate": 4.4624746450304266e-06,
"loss": 1.0178,
"step": 819
},
{
"epoch": 1.6632860040567952,
"grad_norm": 0.28125,
"learning_rate": 4.4557133198106835e-06,
"loss": 1.009,
"step": 820
},
{
"epoch": 1.6653144016227182,
"grad_norm": 0.25390625,
"learning_rate": 4.44895199459094e-06,
"loss": 1.008,
"step": 821
},
{
"epoch": 1.667342799188641,
"grad_norm": 0.25390625,
"learning_rate": 4.442190669371197e-06,
"loss": 1.0553,
"step": 822
},
{
"epoch": 1.669371196754564,
"grad_norm": 0.263671875,
"learning_rate": 4.435429344151454e-06,
"loss": 1.0913,
"step": 823
},
{
"epoch": 1.6713995943204867,
"grad_norm": 0.26953125,
"learning_rate": 4.428668018931711e-06,
"loss": 1.0635,
"step": 824
},
{
"epoch": 1.6734279918864097,
"grad_norm": 0.2451171875,
"learning_rate": 4.421906693711968e-06,
"loss": 1.0251,
"step": 825
},
{
"epoch": 1.6754563894523327,
"grad_norm": 0.2373046875,
"learning_rate": 4.415145368492225e-06,
"loss": 1.0472,
"step": 826
},
{
"epoch": 1.6774847870182557,
"grad_norm": 0.25390625,
"learning_rate": 4.408384043272482e-06,
"loss": 1.0504,
"step": 827
},
{
"epoch": 1.6795131845841786,
"grad_norm": 0.244140625,
"learning_rate": 4.401622718052739e-06,
"loss": 1.0581,
"step": 828
},
{
"epoch": 1.6815415821501014,
"grad_norm": 0.244140625,
"learning_rate": 4.394861392832996e-06,
"loss": 1.0142,
"step": 829
},
{
"epoch": 1.6835699797160242,
"grad_norm": 0.2451171875,
"learning_rate": 4.388100067613253e-06,
"loss": 1.0157,
"step": 830
},
{
"epoch": 1.6855983772819472,
"grad_norm": 0.25390625,
"learning_rate": 4.3813387423935095e-06,
"loss": 1.0668,
"step": 831
},
{
"epoch": 1.6876267748478702,
"grad_norm": 0.296875,
"learning_rate": 4.374577417173766e-06,
"loss": 1.009,
"step": 832
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.2412109375,
"learning_rate": 4.367816091954023e-06,
"loss": 1.0159,
"step": 833
},
{
"epoch": 1.6916835699797161,
"grad_norm": 0.2578125,
"learning_rate": 4.36105476673428e-06,
"loss": 1.0275,
"step": 834
},
{
"epoch": 1.693711967545639,
"grad_norm": 0.240234375,
"learning_rate": 4.354293441514537e-06,
"loss": 0.9794,
"step": 835
},
{
"epoch": 1.695740365111562,
"grad_norm": 0.2578125,
"learning_rate": 4.347532116294794e-06,
"loss": 1.0311,
"step": 836
},
{
"epoch": 1.6977687626774847,
"grad_norm": 0.2431640625,
"learning_rate": 4.340770791075051e-06,
"loss": 1.0232,
"step": 837
},
{
"epoch": 1.6997971602434077,
"grad_norm": 0.265625,
"learning_rate": 4.334009465855308e-06,
"loss": 1.0875,
"step": 838
},
{
"epoch": 1.7018255578093306,
"grad_norm": 0.2451171875,
"learning_rate": 4.327248140635565e-06,
"loss": 1.0545,
"step": 839
},
{
"epoch": 1.7038539553752536,
"grad_norm": 0.2578125,
"learning_rate": 4.320486815415822e-06,
"loss": 1.1009,
"step": 840
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.240234375,
"learning_rate": 4.313725490196079e-06,
"loss": 1.0431,
"step": 841
},
{
"epoch": 1.7079107505070994,
"grad_norm": 0.30078125,
"learning_rate": 4.3069641649763355e-06,
"loss": 1.0407,
"step": 842
},
{
"epoch": 1.7099391480730222,
"grad_norm": 0.265625,
"learning_rate": 4.3002028397565924e-06,
"loss": 1.0143,
"step": 843
},
{
"epoch": 1.7119675456389452,
"grad_norm": 0.287109375,
"learning_rate": 4.293441514536849e-06,
"loss": 1.0071,
"step": 844
},
{
"epoch": 1.7139959432048681,
"grad_norm": 0.25390625,
"learning_rate": 4.286680189317107e-06,
"loss": 1.0152,
"step": 845
},
{
"epoch": 1.7160243407707911,
"grad_norm": 0.25,
"learning_rate": 4.279918864097363e-06,
"loss": 1.0049,
"step": 846
},
{
"epoch": 1.7180527383367141,
"grad_norm": 0.251953125,
"learning_rate": 4.27315753887762e-06,
"loss": 1.048,
"step": 847
},
{
"epoch": 1.720081135902637,
"grad_norm": 0.265625,
"learning_rate": 4.266396213657877e-06,
"loss": 1.0403,
"step": 848
},
{
"epoch": 1.7221095334685599,
"grad_norm": 0.2431640625,
"learning_rate": 4.259634888438135e-06,
"loss": 1.0488,
"step": 849
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.2451171875,
"learning_rate": 4.252873563218391e-06,
"loss": 1.063,
"step": 850
},
{
"epoch": 1.7261663286004056,
"grad_norm": 0.265625,
"learning_rate": 4.246112237998648e-06,
"loss": 1.0591,
"step": 851
},
{
"epoch": 1.7281947261663286,
"grad_norm": 0.234375,
"learning_rate": 4.2393509127789055e-06,
"loss": 1.0191,
"step": 852
},
{
"epoch": 1.7302231237322516,
"grad_norm": 0.275390625,
"learning_rate": 4.232589587559162e-06,
"loss": 1.0538,
"step": 853
},
{
"epoch": 1.7322515212981744,
"grad_norm": 0.349609375,
"learning_rate": 4.2258282623394185e-06,
"loss": 1.07,
"step": 854
},
{
"epoch": 1.7342799188640974,
"grad_norm": 0.24609375,
"learning_rate": 4.219066937119675e-06,
"loss": 1.0387,
"step": 855
},
{
"epoch": 1.7363083164300201,
"grad_norm": 0.24609375,
"learning_rate": 4.212305611899933e-06,
"loss": 1.0549,
"step": 856
},
{
"epoch": 1.7383367139959431,
"grad_norm": 0.3671875,
"learning_rate": 4.20554428668019e-06,
"loss": 1.0035,
"step": 857
},
{
"epoch": 1.7403651115618661,
"grad_norm": 0.2490234375,
"learning_rate": 4.198782961460446e-06,
"loss": 1.0304,
"step": 858
},
{
"epoch": 1.7423935091277891,
"grad_norm": 0.251953125,
"learning_rate": 4.192021636240704e-06,
"loss": 1.0472,
"step": 859
},
{
"epoch": 1.744421906693712,
"grad_norm": 0.291015625,
"learning_rate": 4.185260311020961e-06,
"loss": 1.0034,
"step": 860
},
{
"epoch": 1.7464503042596349,
"grad_norm": 0.271484375,
"learning_rate": 4.178498985801218e-06,
"loss": 1.0649,
"step": 861
},
{
"epoch": 1.7484787018255576,
"grad_norm": 0.2431640625,
"learning_rate": 4.171737660581474e-06,
"loss": 1.0257,
"step": 862
},
{
"epoch": 1.7505070993914806,
"grad_norm": 0.37109375,
"learning_rate": 4.1649763353617315e-06,
"loss": 1.0053,
"step": 863
},
{
"epoch": 1.7525354969574036,
"grad_norm": 0.2470703125,
"learning_rate": 4.1582150101419884e-06,
"loss": 1.0393,
"step": 864
},
{
"epoch": 1.7545638945233266,
"grad_norm": 0.26171875,
"learning_rate": 4.151453684922245e-06,
"loss": 1.0857,
"step": 865
},
{
"epoch": 1.7565922920892496,
"grad_norm": 0.263671875,
"learning_rate": 4.144692359702502e-06,
"loss": 1.0793,
"step": 866
},
{
"epoch": 1.7586206896551724,
"grad_norm": 0.30859375,
"learning_rate": 4.137931034482759e-06,
"loss": 1.0013,
"step": 867
},
{
"epoch": 1.7606490872210954,
"grad_norm": 0.25390625,
"learning_rate": 4.131169709263016e-06,
"loss": 1.0325,
"step": 868
},
{
"epoch": 1.7626774847870181,
"grad_norm": 0.2470703125,
"learning_rate": 4.124408384043273e-06,
"loss": 1.0407,
"step": 869
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.2421875,
"learning_rate": 4.11764705882353e-06,
"loss": 1.0443,
"step": 870
},
{
"epoch": 1.7667342799188641,
"grad_norm": 0.298828125,
"learning_rate": 4.110885733603787e-06,
"loss": 1.0305,
"step": 871
},
{
"epoch": 1.768762677484787,
"grad_norm": 0.25390625,
"learning_rate": 4.104124408384044e-06,
"loss": 1.0855,
"step": 872
},
{
"epoch": 1.77079107505071,
"grad_norm": 0.24609375,
"learning_rate": 4.097363083164301e-06,
"loss": 1.0202,
"step": 873
},
{
"epoch": 1.7728194726166329,
"grad_norm": 0.2431640625,
"learning_rate": 4.0906017579445575e-06,
"loss": 1.0145,
"step": 874
},
{
"epoch": 1.7748478701825556,
"grad_norm": 0.2578125,
"learning_rate": 4.0838404327248145e-06,
"loss": 1.0533,
"step": 875
},
{
"epoch": 1.7768762677484786,
"grad_norm": 0.40234375,
"learning_rate": 4.077079107505071e-06,
"loss": 1.0158,
"step": 876
},
{
"epoch": 1.7789046653144016,
"grad_norm": 0.263671875,
"learning_rate": 4.070317782285328e-06,
"loss": 0.9991,
"step": 877
},
{
"epoch": 1.7809330628803246,
"grad_norm": 0.255859375,
"learning_rate": 4.063556457065585e-06,
"loss": 1.0217,
"step": 878
},
{
"epoch": 1.7829614604462476,
"grad_norm": 0.24609375,
"learning_rate": 4.056795131845842e-06,
"loss": 1.0227,
"step": 879
},
{
"epoch": 1.7849898580121704,
"grad_norm": 0.29296875,
"learning_rate": 4.050033806626099e-06,
"loss": 1.0078,
"step": 880
},
{
"epoch": 1.7870182555780934,
"grad_norm": 0.271484375,
"learning_rate": 4.043272481406356e-06,
"loss": 1.0662,
"step": 881
},
{
"epoch": 1.7890466531440161,
"grad_norm": 0.263671875,
"learning_rate": 4.036511156186613e-06,
"loss": 1.0436,
"step": 882
},
{
"epoch": 1.791075050709939,
"grad_norm": 0.326171875,
"learning_rate": 4.02974983096687e-06,
"loss": 1.0535,
"step": 883
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.265625,
"learning_rate": 4.022988505747127e-06,
"loss": 1.0301,
"step": 884
},
{
"epoch": 1.795131845841785,
"grad_norm": 0.25,
"learning_rate": 4.0162271805273836e-06,
"loss": 1.0391,
"step": 885
},
{
"epoch": 1.7971602434077079,
"grad_norm": 0.3671875,
"learning_rate": 4.0094658553076405e-06,
"loss": 1.0251,
"step": 886
},
{
"epoch": 1.7991886409736308,
"grad_norm": 0.251953125,
"learning_rate": 4.002704530087897e-06,
"loss": 1.0301,
"step": 887
},
{
"epoch": 1.8012170385395536,
"grad_norm": 0.267578125,
"learning_rate": 3.995943204868154e-06,
"loss": 1.021,
"step": 888
},
{
"epoch": 1.8032454361054766,
"grad_norm": 0.361328125,
"learning_rate": 3.989181879648411e-06,
"loss": 1.051,
"step": 889
},
{
"epoch": 1.8052738336713996,
"grad_norm": 0.25390625,
"learning_rate": 3.982420554428668e-06,
"loss": 0.9883,
"step": 890
},
{
"epoch": 1.8073022312373226,
"grad_norm": 0.93359375,
"learning_rate": 3.975659229208925e-06,
"loss": 1.055,
"step": 891
},
{
"epoch": 1.8093306288032456,
"grad_norm": 0.2392578125,
"learning_rate": 3.968897903989182e-06,
"loss": 1.0091,
"step": 892
},
{
"epoch": 1.8113590263691683,
"grad_norm": 0.25,
"learning_rate": 3.96213657876944e-06,
"loss": 1.0557,
"step": 893
},
{
"epoch": 1.8133874239350911,
"grad_norm": 0.2578125,
"learning_rate": 3.955375253549696e-06,
"loss": 1.0326,
"step": 894
},
{
"epoch": 1.815415821501014,
"grad_norm": 0.275390625,
"learning_rate": 3.948613928329953e-06,
"loss": 1.0162,
"step": 895
},
{
"epoch": 1.817444219066937,
"grad_norm": 0.26171875,
"learning_rate": 3.94185260311021e-06,
"loss": 1.0341,
"step": 896
},
{
"epoch": 1.81947261663286,
"grad_norm": 0.25390625,
"learning_rate": 3.935091277890467e-06,
"loss": 1.0521,
"step": 897
},
{
"epoch": 1.821501014198783,
"grad_norm": 0.25390625,
"learning_rate": 3.928329952670723e-06,
"loss": 1.0544,
"step": 898
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.287109375,
"learning_rate": 3.92156862745098e-06,
"loss": 0.9953,
"step": 899
},
{
"epoch": 1.8255578093306288,
"grad_norm": 0.244140625,
"learning_rate": 3.914807302231238e-06,
"loss": 1.019,
"step": 900
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.26171875,
"learning_rate": 3.908045977011495e-06,
"loss": 0.9932,
"step": 901
},
{
"epoch": 1.8296146044624746,
"grad_norm": 0.251953125,
"learning_rate": 3.901284651791751e-06,
"loss": 1.0182,
"step": 902
},
{
"epoch": 1.8316430020283976,
"grad_norm": 0.27734375,
"learning_rate": 3.894523326572008e-06,
"loss": 1.0809,
"step": 903
},
{
"epoch": 1.8336713995943206,
"grad_norm": 0.275390625,
"learning_rate": 3.887762001352266e-06,
"loss": 1.0352,
"step": 904
},
{
"epoch": 1.8356997971602436,
"grad_norm": 0.337890625,
"learning_rate": 3.881000676132523e-06,
"loss": 1.0161,
"step": 905
},
{
"epoch": 1.8377281947261663,
"grad_norm": 0.2578125,
"learning_rate": 3.874239350912779e-06,
"loss": 1.0188,
"step": 906
},
{
"epoch": 1.839756592292089,
"grad_norm": 0.2412109375,
"learning_rate": 3.8674780256930365e-06,
"loss": 1.0352,
"step": 907
},
{
"epoch": 1.841784989858012,
"grad_norm": 0.27734375,
"learning_rate": 3.860716700473293e-06,
"loss": 1.0515,
"step": 908
},
{
"epoch": 1.843813387423935,
"grad_norm": 0.265625,
"learning_rate": 3.85395537525355e-06,
"loss": 1.074,
"step": 909
},
{
"epoch": 1.845841784989858,
"grad_norm": 0.310546875,
"learning_rate": 3.847194050033806e-06,
"loss": 1.0543,
"step": 910
},
{
"epoch": 1.847870182555781,
"grad_norm": 0.27734375,
"learning_rate": 3.840432724814064e-06,
"loss": 1.0243,
"step": 911
},
{
"epoch": 1.8498985801217038,
"grad_norm": 0.255859375,
"learning_rate": 3.833671399594321e-06,
"loss": 1.0708,
"step": 912
},
{
"epoch": 1.8519269776876268,
"grad_norm": 0.263671875,
"learning_rate": 3.826910074374578e-06,
"loss": 1.0486,
"step": 913
},
{
"epoch": 1.8539553752535496,
"grad_norm": 0.24609375,
"learning_rate": 3.820148749154835e-06,
"loss": 1.0197,
"step": 914
},
{
"epoch": 1.8559837728194726,
"grad_norm": 0.25390625,
"learning_rate": 3.8133874239350913e-06,
"loss": 1.0298,
"step": 915
},
{
"epoch": 1.8580121703853956,
"grad_norm": 0.2578125,
"learning_rate": 3.8066260987153487e-06,
"loss": 1.0123,
"step": 916
},
{
"epoch": 1.8600405679513186,
"grad_norm": 0.2578125,
"learning_rate": 3.7998647734956056e-06,
"loss": 0.9994,
"step": 917
},
{
"epoch": 1.8620689655172413,
"grad_norm": 0.24609375,
"learning_rate": 3.793103448275862e-06,
"loss": 1.0284,
"step": 918
},
{
"epoch": 1.8640973630831643,
"grad_norm": 0.287109375,
"learning_rate": 3.7863421230561194e-06,
"loss": 1.0786,
"step": 919
},
{
"epoch": 1.866125760649087,
"grad_norm": 0.318359375,
"learning_rate": 3.7795807978363763e-06,
"loss": 1.0081,
"step": 920
},
{
"epoch": 1.86815415821501,
"grad_norm": 0.251953125,
"learning_rate": 3.7728194726166332e-06,
"loss": 0.9898,
"step": 921
},
{
"epoch": 1.870182555780933,
"grad_norm": 0.2470703125,
"learning_rate": 3.7660581473968897e-06,
"loss": 1.0628,
"step": 922
},
{
"epoch": 1.872210953346856,
"grad_norm": 0.32421875,
"learning_rate": 3.759296822177147e-06,
"loss": 1.0391,
"step": 923
},
{
"epoch": 1.874239350912779,
"grad_norm": 0.240234375,
"learning_rate": 3.752535496957404e-06,
"loss": 1.0142,
"step": 924
},
{
"epoch": 1.8762677484787018,
"grad_norm": 0.265625,
"learning_rate": 3.7457741717376613e-06,
"loss": 1.046,
"step": 925
},
{
"epoch": 1.8782961460446246,
"grad_norm": 0.25,
"learning_rate": 3.7390128465179178e-06,
"loss": 1.0367,
"step": 926
},
{
"epoch": 1.8803245436105476,
"grad_norm": 0.26953125,
"learning_rate": 3.7322515212981747e-06,
"loss": 0.9889,
"step": 927
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.259765625,
"learning_rate": 3.7254901960784316e-06,
"loss": 1.0335,
"step": 928
},
{
"epoch": 1.8843813387423936,
"grad_norm": 0.27734375,
"learning_rate": 3.718728870858689e-06,
"loss": 1.0742,
"step": 929
},
{
"epoch": 1.8864097363083165,
"grad_norm": 0.248046875,
"learning_rate": 3.7119675456389454e-06,
"loss": 1.0103,
"step": 930
},
{
"epoch": 1.8884381338742393,
"grad_norm": 0.2470703125,
"learning_rate": 3.7052062204192023e-06,
"loss": 1.029,
"step": 931
},
{
"epoch": 1.8904665314401623,
"grad_norm": 0.25390625,
"learning_rate": 3.6984448951994597e-06,
"loss": 1.0185,
"step": 932
},
{
"epoch": 1.892494929006085,
"grad_norm": 0.349609375,
"learning_rate": 3.6916835699797166e-06,
"loss": 1.0273,
"step": 933
},
{
"epoch": 1.894523326572008,
"grad_norm": 0.279296875,
"learning_rate": 3.684922244759973e-06,
"loss": 1.0552,
"step": 934
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.251953125,
"learning_rate": 3.67816091954023e-06,
"loss": 0.9835,
"step": 935
},
{
"epoch": 1.898580121703854,
"grad_norm": 0.251953125,
"learning_rate": 3.6713995943204873e-06,
"loss": 1.0271,
"step": 936
},
{
"epoch": 1.900608519269777,
"grad_norm": 0.2431640625,
"learning_rate": 3.6646382691007442e-06,
"loss": 1.0514,
"step": 937
},
{
"epoch": 1.9026369168356998,
"grad_norm": 0.263671875,
"learning_rate": 3.6578769438810007e-06,
"loss": 1.0763,
"step": 938
},
{
"epoch": 1.9046653144016226,
"grad_norm": 0.255859375,
"learning_rate": 3.651115618661258e-06,
"loss": 1.0356,
"step": 939
},
{
"epoch": 1.9066937119675456,
"grad_norm": 0.314453125,
"learning_rate": 3.644354293441515e-06,
"loss": 1.0892,
"step": 940
},
{
"epoch": 1.9087221095334685,
"grad_norm": 0.25,
"learning_rate": 3.637592968221772e-06,
"loss": 1.0789,
"step": 941
},
{
"epoch": 1.9107505070993915,
"grad_norm": 0.255859375,
"learning_rate": 3.6308316430020284e-06,
"loss": 1.0647,
"step": 942
},
{
"epoch": 1.9127789046653145,
"grad_norm": 0.26953125,
"learning_rate": 3.6240703177822857e-06,
"loss": 1.0863,
"step": 943
},
{
"epoch": 1.9148073022312373,
"grad_norm": 0.2470703125,
"learning_rate": 3.6173089925625426e-06,
"loss": 0.9983,
"step": 944
},
{
"epoch": 1.9168356997971603,
"grad_norm": 0.2578125,
"learning_rate": 3.6105476673427995e-06,
"loss": 0.9991,
"step": 945
},
{
"epoch": 1.918864097363083,
"grad_norm": 0.263671875,
"learning_rate": 3.603786342123056e-06,
"loss": 1.1201,
"step": 946
},
{
"epoch": 1.920892494929006,
"grad_norm": 0.265625,
"learning_rate": 3.5970250169033134e-06,
"loss": 1.0471,
"step": 947
},
{
"epoch": 1.922920892494929,
"grad_norm": 0.375,
"learning_rate": 3.5902636916835703e-06,
"loss": 0.9676,
"step": 948
},
{
"epoch": 1.924949290060852,
"grad_norm": 0.251953125,
"learning_rate": 3.5835023664638276e-06,
"loss": 1.0699,
"step": 949
},
{
"epoch": 1.9269776876267748,
"grad_norm": 0.263671875,
"learning_rate": 3.576741041244084e-06,
"loss": 1.0475,
"step": 950
},
{
"epoch": 1.9290060851926978,
"grad_norm": 0.255859375,
"learning_rate": 3.569979716024341e-06,
"loss": 1.0266,
"step": 951
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.2451171875,
"learning_rate": 3.563218390804598e-06,
"loss": 0.9755,
"step": 952
},
{
"epoch": 1.9330628803245435,
"grad_norm": 0.251953125,
"learning_rate": 3.5564570655848552e-06,
"loss": 1.0245,
"step": 953
},
{
"epoch": 1.9350912778904665,
"grad_norm": 0.25,
"learning_rate": 3.5496957403651117e-06,
"loss": 1.052,
"step": 954
},
{
"epoch": 1.9371196754563895,
"grad_norm": 0.341796875,
"learning_rate": 3.5429344151453686e-06,
"loss": 1.0402,
"step": 955
},
{
"epoch": 1.9391480730223125,
"grad_norm": 0.291015625,
"learning_rate": 3.536173089925626e-06,
"loss": 1.0447,
"step": 956
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.26171875,
"learning_rate": 3.529411764705883e-06,
"loss": 1.0743,
"step": 957
},
{
"epoch": 1.943204868154158,
"grad_norm": 0.26171875,
"learning_rate": 3.5226504394861394e-06,
"loss": 1.066,
"step": 958
},
{
"epoch": 1.945233265720081,
"grad_norm": 0.25390625,
"learning_rate": 3.5158891142663963e-06,
"loss": 1.0455,
"step": 959
},
{
"epoch": 1.947261663286004,
"grad_norm": 0.267578125,
"learning_rate": 3.5091277890466536e-06,
"loss": 1.0448,
"step": 960
},
{
"epoch": 1.949290060851927,
"grad_norm": 0.30859375,
"learning_rate": 3.5023664638269105e-06,
"loss": 1.012,
"step": 961
},
{
"epoch": 1.95131845841785,
"grad_norm": 0.376953125,
"learning_rate": 3.495605138607167e-06,
"loss": 0.9866,
"step": 962
},
{
"epoch": 1.9533468559837728,
"grad_norm": 0.267578125,
"learning_rate": 3.4888438133874244e-06,
"loss": 1.0504,
"step": 963
},
{
"epoch": 1.9553752535496958,
"grad_norm": 0.263671875,
"learning_rate": 3.4820824881676813e-06,
"loss": 1.0103,
"step": 964
},
{
"epoch": 1.9574036511156185,
"grad_norm": 0.265625,
"learning_rate": 3.475321162947938e-06,
"loss": 1.0262,
"step": 965
},
{
"epoch": 1.9594320486815415,
"grad_norm": 0.259765625,
"learning_rate": 3.4685598377281947e-06,
"loss": 1.0183,
"step": 966
},
{
"epoch": 1.9614604462474645,
"grad_norm": 0.2451171875,
"learning_rate": 3.461798512508452e-06,
"loss": 1.0283,
"step": 967
},
{
"epoch": 1.9634888438133875,
"grad_norm": 0.25390625,
"learning_rate": 3.455037187288709e-06,
"loss": 1.0364,
"step": 968
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.251953125,
"learning_rate": 3.448275862068966e-06,
"loss": 1.0066,
"step": 969
},
{
"epoch": 1.9675456389452333,
"grad_norm": 0.24609375,
"learning_rate": 3.4415145368492227e-06,
"loss": 1.083,
"step": 970
},
{
"epoch": 1.969574036511156,
"grad_norm": 0.28515625,
"learning_rate": 3.4347532116294797e-06,
"loss": 1.0155,
"step": 971
},
{
"epoch": 1.971602434077079,
"grad_norm": 0.263671875,
"learning_rate": 3.4279918864097366e-06,
"loss": 1.0288,
"step": 972
},
{
"epoch": 1.973630831643002,
"grad_norm": 0.2578125,
"learning_rate": 3.421230561189994e-06,
"loss": 1.0395,
"step": 973
},
{
"epoch": 1.975659229208925,
"grad_norm": 0.25,
"learning_rate": 3.4144692359702504e-06,
"loss": 1.0201,
"step": 974
},
{
"epoch": 1.977687626774848,
"grad_norm": 0.248046875,
"learning_rate": 3.4077079107505073e-06,
"loss": 1.0418,
"step": 975
},
{
"epoch": 1.9797160243407708,
"grad_norm": 0.25390625,
"learning_rate": 3.400946585530764e-06,
"loss": 1.0185,
"step": 976
},
{
"epoch": 1.9817444219066938,
"grad_norm": 0.25,
"learning_rate": 3.3941852603110215e-06,
"loss": 1.0386,
"step": 977
},
{
"epoch": 1.9837728194726165,
"grad_norm": 0.255859375,
"learning_rate": 3.387423935091278e-06,
"loss": 1.0045,
"step": 978
},
{
"epoch": 1.9858012170385395,
"grad_norm": 0.2470703125,
"learning_rate": 3.380662609871535e-06,
"loss": 1.0222,
"step": 979
},
{
"epoch": 1.9878296146044625,
"grad_norm": 0.251953125,
"learning_rate": 3.3739012846517923e-06,
"loss": 1.0219,
"step": 980
},
{
"epoch": 1.9898580121703855,
"grad_norm": 0.365234375,
"learning_rate": 3.367139959432049e-06,
"loss": 1.0264,
"step": 981
},
{
"epoch": 1.9918864097363083,
"grad_norm": 0.259765625,
"learning_rate": 3.3603786342123057e-06,
"loss": 1.0746,
"step": 982
},
{
"epoch": 1.9939148073022313,
"grad_norm": 0.2490234375,
"learning_rate": 3.3536173089925626e-06,
"loss": 1.0116,
"step": 983
},
{
"epoch": 1.995943204868154,
"grad_norm": 0.255859375,
"learning_rate": 3.34685598377282e-06,
"loss": 1.02,
"step": 984
},
{
"epoch": 1.997971602434077,
"grad_norm": 0.291015625,
"learning_rate": 3.340094658553077e-06,
"loss": 1.0196,
"step": 985
},
{
"epoch": 2.0,
"grad_norm": 0.248046875,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.0513,
"step": 986
},
{
"epoch": 2.002028397565923,
"grad_norm": 0.279296875,
"learning_rate": 3.3265720081135907e-06,
"loss": 0.9924,
"step": 987
},
{
"epoch": 2.004056795131846,
"grad_norm": 0.25,
"learning_rate": 3.3198106828938476e-06,
"loss": 1.0519,
"step": 988
},
{
"epoch": 2.006085192697769,
"grad_norm": 0.275390625,
"learning_rate": 3.3130493576741045e-06,
"loss": 1.0348,
"step": 989
},
{
"epoch": 2.0081135902636915,
"grad_norm": 0.314453125,
"learning_rate": 3.306288032454361e-06,
"loss": 0.9656,
"step": 990
},
{
"epoch": 2.0101419878296145,
"grad_norm": 0.27734375,
"learning_rate": 3.2995267072346183e-06,
"loss": 1.0121,
"step": 991
},
{
"epoch": 2.0121703853955375,
"grad_norm": 0.28125,
"learning_rate": 3.2927653820148752e-06,
"loss": 1.0166,
"step": 992
},
{
"epoch": 2.0141987829614605,
"grad_norm": 0.5625,
"learning_rate": 3.2860040567951326e-06,
"loss": 1.0339,
"step": 993
},
{
"epoch": 2.0162271805273835,
"grad_norm": 0.2734375,
"learning_rate": 3.279242731575389e-06,
"loss": 1.0445,
"step": 994
},
{
"epoch": 2.0182555780933065,
"grad_norm": 0.2578125,
"learning_rate": 3.272481406355646e-06,
"loss": 1.0673,
"step": 995
},
{
"epoch": 2.020283975659229,
"grad_norm": 0.255859375,
"learning_rate": 3.265720081135903e-06,
"loss": 0.9861,
"step": 996
},
{
"epoch": 2.022312373225152,
"grad_norm": 0.267578125,
"learning_rate": 3.25895875591616e-06,
"loss": 1.0428,
"step": 997
},
{
"epoch": 2.024340770791075,
"grad_norm": 0.259765625,
"learning_rate": 3.2521974306964167e-06,
"loss": 1.0235,
"step": 998
},
{
"epoch": 2.026369168356998,
"grad_norm": 0.259765625,
"learning_rate": 3.2454361054766736e-06,
"loss": 1.0356,
"step": 999
},
{
"epoch": 2.028397565922921,
"grad_norm": 0.25,
"learning_rate": 3.2386747802569305e-06,
"loss": 1.0242,
"step": 1000
},
{
"epoch": 2.030425963488844,
"grad_norm": 0.255859375,
"learning_rate": 3.231913455037188e-06,
"loss": 1.0575,
"step": 1001
},
{
"epoch": 2.032454361054767,
"grad_norm": 0.25390625,
"learning_rate": 3.2251521298174443e-06,
"loss": 1.0393,
"step": 1002
},
{
"epoch": 2.0344827586206895,
"grad_norm": 0.30078125,
"learning_rate": 3.2183908045977012e-06,
"loss": 0.9703,
"step": 1003
},
{
"epoch": 2.0365111561866125,
"grad_norm": 0.251953125,
"learning_rate": 3.2116294793779586e-06,
"loss": 1.0081,
"step": 1004
},
{
"epoch": 2.0385395537525355,
"grad_norm": 0.271484375,
"learning_rate": 3.2048681541582155e-06,
"loss": 1.0917,
"step": 1005
},
{
"epoch": 2.0405679513184585,
"grad_norm": 0.25390625,
"learning_rate": 3.198106828938472e-06,
"loss": 1.0668,
"step": 1006
},
{
"epoch": 2.0425963488843815,
"grad_norm": 0.2578125,
"learning_rate": 3.191345503718729e-06,
"loss": 1.036,
"step": 1007
},
{
"epoch": 2.0446247464503045,
"grad_norm": 0.267578125,
"learning_rate": 3.1845841784989862e-06,
"loss": 1.0184,
"step": 1008
},
{
"epoch": 2.046653144016227,
"grad_norm": 0.248046875,
"learning_rate": 3.177822853279243e-06,
"loss": 1.0415,
"step": 1009
},
{
"epoch": 2.04868154158215,
"grad_norm": 0.2490234375,
"learning_rate": 3.1710615280594996e-06,
"loss": 1.0184,
"step": 1010
},
{
"epoch": 2.050709939148073,
"grad_norm": 0.255859375,
"learning_rate": 3.164300202839757e-06,
"loss": 1.0386,
"step": 1011
},
{
"epoch": 2.052738336713996,
"grad_norm": 0.2470703125,
"learning_rate": 3.157538877620014e-06,
"loss": 1.0234,
"step": 1012
},
{
"epoch": 2.054766734279919,
"grad_norm": 0.25,
"learning_rate": 3.1507775524002708e-06,
"loss": 1.0344,
"step": 1013
},
{
"epoch": 2.056795131845842,
"grad_norm": 0.3203125,
"learning_rate": 3.1440162271805273e-06,
"loss": 1.0343,
"step": 1014
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.25390625,
"learning_rate": 3.1372549019607846e-06,
"loss": 1.034,
"step": 1015
},
{
"epoch": 2.0608519269776875,
"grad_norm": 0.259765625,
"learning_rate": 3.1304935767410415e-06,
"loss": 1.0121,
"step": 1016
},
{
"epoch": 2.0628803245436105,
"grad_norm": 0.31640625,
"learning_rate": 3.123732251521299e-06,
"loss": 1.022,
"step": 1017
},
{
"epoch": 2.0649087221095335,
"grad_norm": 0.388671875,
"learning_rate": 3.1169709263015553e-06,
"loss": 1.1013,
"step": 1018
},
{
"epoch": 2.0669371196754565,
"grad_norm": 0.2451171875,
"learning_rate": 3.1102096010818122e-06,
"loss": 0.994,
"step": 1019
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.275390625,
"learning_rate": 3.103448275862069e-06,
"loss": 1.0659,
"step": 1020
},
{
"epoch": 2.0709939148073024,
"grad_norm": 0.2470703125,
"learning_rate": 3.0966869506423265e-06,
"loss": 1.0322,
"step": 1021
},
{
"epoch": 2.073022312373225,
"grad_norm": 0.3203125,
"learning_rate": 3.089925625422583e-06,
"loss": 1.0494,
"step": 1022
},
{
"epoch": 2.075050709939148,
"grad_norm": 0.263671875,
"learning_rate": 3.08316430020284e-06,
"loss": 1.0761,
"step": 1023
},
{
"epoch": 2.077079107505071,
"grad_norm": 0.296875,
"learning_rate": 3.0764029749830972e-06,
"loss": 1.0021,
"step": 1024
},
{
"epoch": 2.079107505070994,
"grad_norm": 0.255859375,
"learning_rate": 3.069641649763354e-06,
"loss": 0.9927,
"step": 1025
},
{
"epoch": 2.081135902636917,
"grad_norm": 0.2578125,
"learning_rate": 3.0628803245436106e-06,
"loss": 0.999,
"step": 1026
},
{
"epoch": 2.08316430020284,
"grad_norm": 0.25,
"learning_rate": 3.0561189993238675e-06,
"loss": 1.0619,
"step": 1027
},
{
"epoch": 2.0851926977687625,
"grad_norm": 0.2470703125,
"learning_rate": 3.049357674104125e-06,
"loss": 1.0605,
"step": 1028
},
{
"epoch": 2.0872210953346855,
"grad_norm": 0.3046875,
"learning_rate": 3.0425963488843818e-06,
"loss": 0.971,
"step": 1029
},
{
"epoch": 2.0892494929006085,
"grad_norm": 0.26171875,
"learning_rate": 3.0358350236646383e-06,
"loss": 1.0451,
"step": 1030
},
{
"epoch": 2.0912778904665315,
"grad_norm": 0.26171875,
"learning_rate": 3.0290736984448956e-06,
"loss": 1.0517,
"step": 1031
},
{
"epoch": 2.0933062880324544,
"grad_norm": 0.271484375,
"learning_rate": 3.0223123732251525e-06,
"loss": 1.0314,
"step": 1032
},
{
"epoch": 2.0953346855983774,
"grad_norm": 0.26171875,
"learning_rate": 3.0155510480054094e-06,
"loss": 1.0495,
"step": 1033
},
{
"epoch": 2.0973630831643,
"grad_norm": 0.251953125,
"learning_rate": 3.008789722785666e-06,
"loss": 1.0287,
"step": 1034
},
{
"epoch": 2.099391480730223,
"grad_norm": 0.26953125,
"learning_rate": 3.0020283975659233e-06,
"loss": 1.0541,
"step": 1035
},
{
"epoch": 2.101419878296146,
"grad_norm": 0.259765625,
"learning_rate": 2.99526707234618e-06,
"loss": 1.045,
"step": 1036
},
{
"epoch": 2.103448275862069,
"grad_norm": 0.2578125,
"learning_rate": 2.988505747126437e-06,
"loss": 1.0201,
"step": 1037
},
{
"epoch": 2.105476673427992,
"grad_norm": 0.283203125,
"learning_rate": 2.9817444219066936e-06,
"loss": 1.0175,
"step": 1038
},
{
"epoch": 2.107505070993915,
"grad_norm": 0.25,
"learning_rate": 2.974983096686951e-06,
"loss": 1.0352,
"step": 1039
},
{
"epoch": 2.109533468559838,
"grad_norm": 0.306640625,
"learning_rate": 2.968221771467208e-06,
"loss": 1.0876,
"step": 1040
},
{
"epoch": 2.1115618661257605,
"grad_norm": 0.359375,
"learning_rate": 2.961460446247465e-06,
"loss": 0.9568,
"step": 1041
},
{
"epoch": 2.1135902636916835,
"grad_norm": 0.3828125,
"learning_rate": 2.9546991210277216e-06,
"loss": 0.9813,
"step": 1042
},
{
"epoch": 2.1156186612576064,
"grad_norm": 0.263671875,
"learning_rate": 2.9479377958079785e-06,
"loss": 1.0693,
"step": 1043
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.251953125,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.9901,
"step": 1044
},
{
"epoch": 2.1196754563894524,
"grad_norm": 0.26171875,
"learning_rate": 2.934415145368493e-06,
"loss": 1.0182,
"step": 1045
},
{
"epoch": 2.1217038539553754,
"grad_norm": 0.2578125,
"learning_rate": 2.9276538201487493e-06,
"loss": 1.0229,
"step": 1046
},
{
"epoch": 2.123732251521298,
"grad_norm": 0.25390625,
"learning_rate": 2.920892494929006e-06,
"loss": 1.06,
"step": 1047
},
{
"epoch": 2.125760649087221,
"grad_norm": 0.2421875,
"learning_rate": 2.9141311697092635e-06,
"loss": 1.0253,
"step": 1048
},
{
"epoch": 2.127789046653144,
"grad_norm": 0.306640625,
"learning_rate": 2.9073698444895204e-06,
"loss": 1.0695,
"step": 1049
},
{
"epoch": 2.129817444219067,
"grad_norm": 0.296875,
"learning_rate": 2.900608519269777e-06,
"loss": 1.0031,
"step": 1050
},
{
"epoch": 2.13184584178499,
"grad_norm": 0.296875,
"learning_rate": 2.893847194050034e-06,
"loss": 1.0045,
"step": 1051
},
{
"epoch": 2.133874239350913,
"grad_norm": 0.2490234375,
"learning_rate": 2.887085868830291e-06,
"loss": 1.0059,
"step": 1052
},
{
"epoch": 2.135902636916836,
"grad_norm": 0.248046875,
"learning_rate": 2.880324543610548e-06,
"loss": 1.004,
"step": 1053
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.259765625,
"learning_rate": 2.8735632183908046e-06,
"loss": 1.0363,
"step": 1054
},
{
"epoch": 2.1399594320486814,
"grad_norm": 0.279296875,
"learning_rate": 2.866801893171062e-06,
"loss": 1.0313,
"step": 1055
},
{
"epoch": 2.1419878296146044,
"grad_norm": 0.25390625,
"learning_rate": 2.860040567951319e-06,
"loss": 1.0128,
"step": 1056
},
{
"epoch": 2.1440162271805274,
"grad_norm": 0.28125,
"learning_rate": 2.8532792427315757e-06,
"loss": 1.0291,
"step": 1057
},
{
"epoch": 2.1460446247464504,
"grad_norm": 0.25390625,
"learning_rate": 2.8465179175118322e-06,
"loss": 1.0473,
"step": 1058
},
{
"epoch": 2.1480730223123734,
"grad_norm": 0.26171875,
"learning_rate": 2.8397565922920896e-06,
"loss": 1.018,
"step": 1059
},
{
"epoch": 2.150101419878296,
"grad_norm": 0.29296875,
"learning_rate": 2.8329952670723465e-06,
"loss": 1.0078,
"step": 1060
},
{
"epoch": 2.152129817444219,
"grad_norm": 0.25390625,
"learning_rate": 2.8262339418526034e-06,
"loss": 1.0386,
"step": 1061
},
{
"epoch": 2.154158215010142,
"grad_norm": 0.283203125,
"learning_rate": 2.8194726166328603e-06,
"loss": 1.0723,
"step": 1062
},
{
"epoch": 2.156186612576065,
"grad_norm": 0.2490234375,
"learning_rate": 2.812711291413117e-06,
"loss": 1.0031,
"step": 1063
},
{
"epoch": 2.158215010141988,
"grad_norm": 0.251953125,
"learning_rate": 2.805949966193374e-06,
"loss": 1.0709,
"step": 1064
},
{
"epoch": 2.160243407707911,
"grad_norm": 0.25,
"learning_rate": 2.7991886409736314e-06,
"loss": 1.0254,
"step": 1065
},
{
"epoch": 2.162271805273834,
"grad_norm": 0.25,
"learning_rate": 2.792427315753888e-06,
"loss": 0.9745,
"step": 1066
},
{
"epoch": 2.1643002028397564,
"grad_norm": 0.306640625,
"learning_rate": 2.785665990534145e-06,
"loss": 0.9904,
"step": 1067
},
{
"epoch": 2.1663286004056794,
"grad_norm": 0.27734375,
"learning_rate": 2.7789046653144018e-06,
"loss": 1.0366,
"step": 1068
},
{
"epoch": 2.1683569979716024,
"grad_norm": 0.24609375,
"learning_rate": 2.772143340094659e-06,
"loss": 0.9916,
"step": 1069
},
{
"epoch": 2.1703853955375254,
"grad_norm": 0.27734375,
"learning_rate": 2.7653820148749156e-06,
"loss": 1.0308,
"step": 1070
},
{
"epoch": 2.1724137931034484,
"grad_norm": 0.2470703125,
"learning_rate": 2.7586206896551725e-06,
"loss": 0.987,
"step": 1071
},
{
"epoch": 2.1744421906693714,
"grad_norm": 0.25390625,
"learning_rate": 2.75185936443543e-06,
"loss": 1.036,
"step": 1072
},
{
"epoch": 2.176470588235294,
"grad_norm": 0.255859375,
"learning_rate": 2.7450980392156867e-06,
"loss": 1.0158,
"step": 1073
},
{
"epoch": 2.178498985801217,
"grad_norm": 0.275390625,
"learning_rate": 2.7383367139959432e-06,
"loss": 1.0426,
"step": 1074
},
{
"epoch": 2.18052738336714,
"grad_norm": 0.267578125,
"learning_rate": 2.7315753887762e-06,
"loss": 1.0285,
"step": 1075
},
{
"epoch": 2.182555780933063,
"grad_norm": 0.2578125,
"learning_rate": 2.7248140635564575e-06,
"loss": 0.9971,
"step": 1076
},
{
"epoch": 2.184584178498986,
"grad_norm": 0.248046875,
"learning_rate": 2.7180527383367144e-06,
"loss": 1.0294,
"step": 1077
},
{
"epoch": 2.186612576064909,
"grad_norm": 0.349609375,
"learning_rate": 2.711291413116971e-06,
"loss": 1.017,
"step": 1078
},
{
"epoch": 2.1886409736308314,
"grad_norm": 0.265625,
"learning_rate": 2.704530087897228e-06,
"loss": 1.0512,
"step": 1079
},
{
"epoch": 2.1906693711967544,
"grad_norm": 0.314453125,
"learning_rate": 2.697768762677485e-06,
"loss": 1.0469,
"step": 1080
},
{
"epoch": 2.1926977687626774,
"grad_norm": 0.2734375,
"learning_rate": 2.691007437457742e-06,
"loss": 1.0847,
"step": 1081
},
{
"epoch": 2.1947261663286004,
"grad_norm": 0.251953125,
"learning_rate": 2.6842461122379985e-06,
"loss": 1.0098,
"step": 1082
},
{
"epoch": 2.1967545638945234,
"grad_norm": 0.25390625,
"learning_rate": 2.677484787018256e-06,
"loss": 1.0116,
"step": 1083
},
{
"epoch": 2.1987829614604464,
"grad_norm": 0.396484375,
"learning_rate": 2.6707234617985128e-06,
"loss": 1.0333,
"step": 1084
},
{
"epoch": 2.2008113590263694,
"grad_norm": 0.322265625,
"learning_rate": 2.66396213657877e-06,
"loss": 0.9811,
"step": 1085
},
{
"epoch": 2.202839756592292,
"grad_norm": 0.2578125,
"learning_rate": 2.6572008113590266e-06,
"loss": 1.0221,
"step": 1086
},
{
"epoch": 2.204868154158215,
"grad_norm": 0.32421875,
"learning_rate": 2.6504394861392835e-06,
"loss": 1.0477,
"step": 1087
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.2470703125,
"learning_rate": 2.6436781609195404e-06,
"loss": 1.0179,
"step": 1088
},
{
"epoch": 2.208924949290061,
"grad_norm": 0.427734375,
"learning_rate": 2.6369168356997977e-06,
"loss": 0.9656,
"step": 1089
},
{
"epoch": 2.210953346855984,
"grad_norm": 0.271484375,
"learning_rate": 2.6301555104800542e-06,
"loss": 0.9941,
"step": 1090
},
{
"epoch": 2.212981744421907,
"grad_norm": 0.2470703125,
"learning_rate": 2.623394185260311e-06,
"loss": 0.9892,
"step": 1091
},
{
"epoch": 2.2150101419878294,
"grad_norm": 0.25390625,
"learning_rate": 2.616632860040568e-06,
"loss": 1.0165,
"step": 1092
},
{
"epoch": 2.2170385395537524,
"grad_norm": 0.2490234375,
"learning_rate": 2.6098715348208254e-06,
"loss": 1.0287,
"step": 1093
},
{
"epoch": 2.2190669371196754,
"grad_norm": 0.25,
"learning_rate": 2.603110209601082e-06,
"loss": 1.0269,
"step": 1094
},
{
"epoch": 2.2210953346855984,
"grad_norm": 0.25,
"learning_rate": 2.596348884381339e-06,
"loss": 1.0034,
"step": 1095
},
{
"epoch": 2.2231237322515214,
"grad_norm": 0.251953125,
"learning_rate": 2.589587559161596e-06,
"loss": 1.024,
"step": 1096
},
{
"epoch": 2.2251521298174444,
"grad_norm": 0.26171875,
"learning_rate": 2.582826233941853e-06,
"loss": 1.0846,
"step": 1097
},
{
"epoch": 2.227180527383367,
"grad_norm": 0.279296875,
"learning_rate": 2.5760649087221095e-06,
"loss": 1.0558,
"step": 1098
},
{
"epoch": 2.22920892494929,
"grad_norm": 0.251953125,
"learning_rate": 2.5693035835023664e-06,
"loss": 0.9899,
"step": 1099
},
{
"epoch": 2.231237322515213,
"grad_norm": 0.26171875,
"learning_rate": 2.5625422582826238e-06,
"loss": 1.0611,
"step": 1100
},
{
"epoch": 2.233265720081136,
"grad_norm": 0.263671875,
"learning_rate": 2.5557809330628807e-06,
"loss": 1.0497,
"step": 1101
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.28515625,
"learning_rate": 2.549019607843137e-06,
"loss": 1.037,
"step": 1102
},
{
"epoch": 2.237322515212982,
"grad_norm": 0.25,
"learning_rate": 2.5422582826233945e-06,
"loss": 1.0295,
"step": 1103
},
{
"epoch": 2.239350912778905,
"grad_norm": 0.2490234375,
"learning_rate": 2.5354969574036514e-06,
"loss": 1.0433,
"step": 1104
},
{
"epoch": 2.2413793103448274,
"grad_norm": 0.267578125,
"learning_rate": 2.5287356321839083e-06,
"loss": 1.002,
"step": 1105
},
{
"epoch": 2.2434077079107504,
"grad_norm": 0.251953125,
"learning_rate": 2.521974306964165e-06,
"loss": 1.0088,
"step": 1106
},
{
"epoch": 2.2454361054766734,
"grad_norm": 0.353515625,
"learning_rate": 2.515212981744422e-06,
"loss": 0.9883,
"step": 1107
},
{
"epoch": 2.2474645030425964,
"grad_norm": 0.2578125,
"learning_rate": 2.508451656524679e-06,
"loss": 1.0244,
"step": 1108
},
{
"epoch": 2.2494929006085194,
"grad_norm": 0.28515625,
"learning_rate": 2.5016903313049364e-06,
"loss": 1.0374,
"step": 1109
},
{
"epoch": 2.2515212981744424,
"grad_norm": 0.26953125,
"learning_rate": 2.494929006085193e-06,
"loss": 1.0797,
"step": 1110
},
{
"epoch": 2.2535496957403653,
"grad_norm": 0.263671875,
"learning_rate": 2.48816768086545e-06,
"loss": 1.0207,
"step": 1111
},
{
"epoch": 2.255578093306288,
"grad_norm": 0.2578125,
"learning_rate": 2.4814063556457067e-06,
"loss": 1.0208,
"step": 1112
},
{
"epoch": 2.257606490872211,
"grad_norm": 0.248046875,
"learning_rate": 2.4746450304259636e-06,
"loss": 1.0544,
"step": 1113
},
{
"epoch": 2.259634888438134,
"grad_norm": 0.353515625,
"learning_rate": 2.4678837052062205e-06,
"loss": 0.9771,
"step": 1114
},
{
"epoch": 2.261663286004057,
"grad_norm": 0.255859375,
"learning_rate": 2.4611223799864774e-06,
"loss": 1.0019,
"step": 1115
},
{
"epoch": 2.26369168356998,
"grad_norm": 0.25390625,
"learning_rate": 2.4543610547667348e-06,
"loss": 1.0274,
"step": 1116
},
{
"epoch": 2.2657200811359024,
"grad_norm": 0.291015625,
"learning_rate": 2.4475997295469913e-06,
"loss": 1.0097,
"step": 1117
},
{
"epoch": 2.2677484787018254,
"grad_norm": 0.279296875,
"learning_rate": 2.4408384043272486e-06,
"loss": 1.0111,
"step": 1118
},
{
"epoch": 2.2697768762677484,
"grad_norm": 0.25390625,
"learning_rate": 2.434077079107505e-06,
"loss": 1.0339,
"step": 1119
},
{
"epoch": 2.2718052738336714,
"grad_norm": 0.26953125,
"learning_rate": 2.4273157538877624e-06,
"loss": 1.0308,
"step": 1120
},
{
"epoch": 2.2738336713995944,
"grad_norm": 0.337890625,
"learning_rate": 2.420554428668019e-06,
"loss": 1.0253,
"step": 1121
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.259765625,
"learning_rate": 2.4137931034482762e-06,
"loss": 1.0244,
"step": 1122
},
{
"epoch": 2.2778904665314403,
"grad_norm": 0.375,
"learning_rate": 2.407031778228533e-06,
"loss": 1.014,
"step": 1123
},
{
"epoch": 2.279918864097363,
"grad_norm": 0.25390625,
"learning_rate": 2.40027045300879e-06,
"loss": 1.037,
"step": 1124
},
{
"epoch": 2.281947261663286,
"grad_norm": 0.259765625,
"learning_rate": 2.393509127789047e-06,
"loss": 1.0582,
"step": 1125
},
{
"epoch": 2.283975659229209,
"grad_norm": 0.369140625,
"learning_rate": 2.386747802569304e-06,
"loss": 1.0285,
"step": 1126
},
{
"epoch": 2.286004056795132,
"grad_norm": 0.2490234375,
"learning_rate": 2.379986477349561e-06,
"loss": 1.0411,
"step": 1127
},
{
"epoch": 2.288032454361055,
"grad_norm": 0.267578125,
"learning_rate": 2.3732251521298177e-06,
"loss": 1.0125,
"step": 1128
},
{
"epoch": 2.290060851926978,
"grad_norm": 0.291015625,
"learning_rate": 2.3664638269100746e-06,
"loss": 1.0455,
"step": 1129
},
{
"epoch": 2.292089249492901,
"grad_norm": 0.271484375,
"learning_rate": 2.3597025016903315e-06,
"loss": 1.0079,
"step": 1130
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.2470703125,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.0286,
"step": 1131
},
{
"epoch": 2.2961460446247464,
"grad_norm": 0.24609375,
"learning_rate": 2.3461798512508454e-06,
"loss": 1.0324,
"step": 1132
},
{
"epoch": 2.2981744421906694,
"grad_norm": 0.271484375,
"learning_rate": 2.3394185260311023e-06,
"loss": 1.0273,
"step": 1133
},
{
"epoch": 2.3002028397565923,
"grad_norm": 0.25,
"learning_rate": 2.332657200811359e-06,
"loss": 1.0111,
"step": 1134
},
{
"epoch": 2.3022312373225153,
"grad_norm": 0.4609375,
"learning_rate": 2.325895875591616e-06,
"loss": 0.9797,
"step": 1135
},
{
"epoch": 2.3042596348884383,
"grad_norm": 0.271484375,
"learning_rate": 2.319134550371873e-06,
"loss": 0.9904,
"step": 1136
},
{
"epoch": 2.306288032454361,
"grad_norm": 0.251953125,
"learning_rate": 2.31237322515213e-06,
"loss": 1.0559,
"step": 1137
},
{
"epoch": 2.308316430020284,
"grad_norm": 0.25390625,
"learning_rate": 2.305611899932387e-06,
"loss": 1.0311,
"step": 1138
},
{
"epoch": 2.310344827586207,
"grad_norm": 0.26953125,
"learning_rate": 2.2988505747126437e-06,
"loss": 1.0926,
"step": 1139
},
{
"epoch": 2.31237322515213,
"grad_norm": 0.26171875,
"learning_rate": 2.292089249492901e-06,
"loss": 1.0408,
"step": 1140
},
{
"epoch": 2.314401622718053,
"grad_norm": 0.265625,
"learning_rate": 2.2853279242731576e-06,
"loss": 1.0569,
"step": 1141
},
{
"epoch": 2.316430020283976,
"grad_norm": 0.26953125,
"learning_rate": 2.278566599053415e-06,
"loss": 1.0491,
"step": 1142
},
{
"epoch": 2.3184584178498984,
"grad_norm": 0.255859375,
"learning_rate": 2.2718052738336714e-06,
"loss": 1.0171,
"step": 1143
},
{
"epoch": 2.3204868154158214,
"grad_norm": 0.26953125,
"learning_rate": 2.2650439486139287e-06,
"loss": 1.0477,
"step": 1144
},
{
"epoch": 2.3225152129817443,
"grad_norm": 0.26953125,
"learning_rate": 2.2582826233941852e-06,
"loss": 1.0472,
"step": 1145
},
{
"epoch": 2.3245436105476673,
"grad_norm": 0.267578125,
"learning_rate": 2.2515212981744425e-06,
"loss": 1.0426,
"step": 1146
},
{
"epoch": 2.3265720081135903,
"grad_norm": 0.263671875,
"learning_rate": 2.2447599729546995e-06,
"loss": 1.0696,
"step": 1147
},
{
"epoch": 2.3286004056795133,
"grad_norm": 0.267578125,
"learning_rate": 2.2379986477349564e-06,
"loss": 1.0435,
"step": 1148
},
{
"epoch": 2.3306288032454363,
"grad_norm": 0.267578125,
"learning_rate": 2.2312373225152133e-06,
"loss": 1.0468,
"step": 1149
},
{
"epoch": 2.332657200811359,
"grad_norm": 0.2490234375,
"learning_rate": 2.22447599729547e-06,
"loss": 1.013,
"step": 1150
},
{
"epoch": 2.334685598377282,
"grad_norm": 0.265625,
"learning_rate": 2.217714672075727e-06,
"loss": 1.0125,
"step": 1151
},
{
"epoch": 2.336713995943205,
"grad_norm": 0.26171875,
"learning_rate": 2.210953346855984e-06,
"loss": 1.0292,
"step": 1152
},
{
"epoch": 2.338742393509128,
"grad_norm": 0.52734375,
"learning_rate": 2.204192021636241e-06,
"loss": 0.9943,
"step": 1153
},
{
"epoch": 2.340770791075051,
"grad_norm": 0.341796875,
"learning_rate": 2.197430696416498e-06,
"loss": 1.0632,
"step": 1154
},
{
"epoch": 2.342799188640974,
"grad_norm": 0.2490234375,
"learning_rate": 2.1906693711967548e-06,
"loss": 0.9995,
"step": 1155
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.2451171875,
"learning_rate": 2.1839080459770117e-06,
"loss": 0.983,
"step": 1156
},
{
"epoch": 2.3468559837728193,
"grad_norm": 0.255859375,
"learning_rate": 2.1771467207572686e-06,
"loss": 1.003,
"step": 1157
},
{
"epoch": 2.3488843813387423,
"grad_norm": 0.26171875,
"learning_rate": 2.1703853955375255e-06,
"loss": 1.0498,
"step": 1158
},
{
"epoch": 2.3509127789046653,
"grad_norm": 0.28125,
"learning_rate": 2.1636240703177824e-06,
"loss": 1.0288,
"step": 1159
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.2578125,
"learning_rate": 2.1568627450980393e-06,
"loss": 1.0166,
"step": 1160
},
{
"epoch": 2.3549695740365113,
"grad_norm": 0.259765625,
"learning_rate": 2.1501014198782962e-06,
"loss": 1.0634,
"step": 1161
},
{
"epoch": 2.356997971602434,
"grad_norm": 0.291015625,
"learning_rate": 2.1433400946585536e-06,
"loss": 1.0077,
"step": 1162
},
{
"epoch": 2.359026369168357,
"grad_norm": 0.263671875,
"learning_rate": 2.13657876943881e-06,
"loss": 1.0331,
"step": 1163
},
{
"epoch": 2.36105476673428,
"grad_norm": 0.259765625,
"learning_rate": 2.1298174442190674e-06,
"loss": 0.987,
"step": 1164
},
{
"epoch": 2.363083164300203,
"grad_norm": 0.25,
"learning_rate": 2.123056118999324e-06,
"loss": 1.0156,
"step": 1165
},
{
"epoch": 2.365111561866126,
"grad_norm": 0.25390625,
"learning_rate": 2.116294793779581e-06,
"loss": 0.9946,
"step": 1166
},
{
"epoch": 2.367139959432049,
"grad_norm": 0.25390625,
"learning_rate": 2.1095334685598377e-06,
"loss": 0.9646,
"step": 1167
},
{
"epoch": 2.369168356997972,
"grad_norm": 0.267578125,
"learning_rate": 2.102772143340095e-06,
"loss": 1.0295,
"step": 1168
},
{
"epoch": 2.3711967545638943,
"grad_norm": 0.259765625,
"learning_rate": 2.096010818120352e-06,
"loss": 1.0657,
"step": 1169
},
{
"epoch": 2.3732251521298173,
"grad_norm": 0.275390625,
"learning_rate": 2.089249492900609e-06,
"loss": 1.0251,
"step": 1170
},
{
"epoch": 2.3752535496957403,
"grad_norm": 0.26171875,
"learning_rate": 2.0824881676808658e-06,
"loss": 1.0299,
"step": 1171
},
{
"epoch": 2.3772819472616633,
"grad_norm": 0.265625,
"learning_rate": 2.0757268424611227e-06,
"loss": 1.0147,
"step": 1172
},
{
"epoch": 2.3793103448275863,
"grad_norm": 0.2734375,
"learning_rate": 2.0689655172413796e-06,
"loss": 0.9979,
"step": 1173
},
{
"epoch": 2.3813387423935093,
"grad_norm": 0.265625,
"learning_rate": 2.0622041920216365e-06,
"loss": 1.0701,
"step": 1174
},
{
"epoch": 2.3833671399594323,
"grad_norm": 0.25390625,
"learning_rate": 2.0554428668018934e-06,
"loss": 1.0585,
"step": 1175
},
{
"epoch": 2.385395537525355,
"grad_norm": 0.287109375,
"learning_rate": 2.0486815415821503e-06,
"loss": 1.0384,
"step": 1176
},
{
"epoch": 2.387423935091278,
"grad_norm": 0.326171875,
"learning_rate": 2.0419202163624072e-06,
"loss": 1.0231,
"step": 1177
},
{
"epoch": 2.389452332657201,
"grad_norm": 0.2470703125,
"learning_rate": 2.035158891142664e-06,
"loss": 0.9854,
"step": 1178
},
{
"epoch": 2.391480730223124,
"grad_norm": 0.26953125,
"learning_rate": 2.028397565922921e-06,
"loss": 0.9973,
"step": 1179
},
{
"epoch": 2.393509127789047,
"grad_norm": 0.37109375,
"learning_rate": 2.021636240703178e-06,
"loss": 1.0059,
"step": 1180
},
{
"epoch": 2.3955375253549693,
"grad_norm": 0.259765625,
"learning_rate": 2.014874915483435e-06,
"loss": 1.0267,
"step": 1181
},
{
"epoch": 2.3975659229208923,
"grad_norm": 0.275390625,
"learning_rate": 2.0081135902636918e-06,
"loss": 1.015,
"step": 1182
},
{
"epoch": 2.3995943204868153,
"grad_norm": 0.267578125,
"learning_rate": 2.0013522650439487e-06,
"loss": 1.0269,
"step": 1183
},
{
"epoch": 2.4016227180527383,
"grad_norm": 0.28515625,
"learning_rate": 1.9945909398242056e-06,
"loss": 1.0155,
"step": 1184
},
{
"epoch": 2.4036511156186613,
"grad_norm": 0.267578125,
"learning_rate": 1.9878296146044625e-06,
"loss": 1.0457,
"step": 1185
},
{
"epoch": 2.4056795131845843,
"grad_norm": 0.251953125,
"learning_rate": 1.98106828938472e-06,
"loss": 0.9996,
"step": 1186
},
{
"epoch": 2.4077079107505073,
"grad_norm": 0.2578125,
"learning_rate": 1.9743069641649763e-06,
"loss": 1.0397,
"step": 1187
},
{
"epoch": 2.40973630831643,
"grad_norm": 0.255859375,
"learning_rate": 1.9675456389452337e-06,
"loss": 1.0423,
"step": 1188
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.259765625,
"learning_rate": 1.96078431372549e-06,
"loss": 1.0214,
"step": 1189
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.25390625,
"learning_rate": 1.9540229885057475e-06,
"loss": 0.9866,
"step": 1190
},
{
"epoch": 2.415821501014199,
"grad_norm": 0.267578125,
"learning_rate": 1.947261663286004e-06,
"loss": 1.045,
"step": 1191
},
{
"epoch": 2.417849898580122,
"grad_norm": 0.2490234375,
"learning_rate": 1.9405003380662613e-06,
"loss": 1.0609,
"step": 1192
},
{
"epoch": 2.4198782961460448,
"grad_norm": 0.2578125,
"learning_rate": 1.9337390128465182e-06,
"loss": 1.0639,
"step": 1193
},
{
"epoch": 2.4219066937119678,
"grad_norm": 0.283203125,
"learning_rate": 1.926977687626775e-06,
"loss": 1.0107,
"step": 1194
},
{
"epoch": 2.4239350912778903,
"grad_norm": 0.283203125,
"learning_rate": 1.920216362407032e-06,
"loss": 1.041,
"step": 1195
},
{
"epoch": 2.4259634888438133,
"grad_norm": 0.25390625,
"learning_rate": 1.913455037187289e-06,
"loss": 1.0544,
"step": 1196
},
{
"epoch": 2.4279918864097363,
"grad_norm": 0.255859375,
"learning_rate": 1.9066937119675457e-06,
"loss": 1.0184,
"step": 1197
},
{
"epoch": 2.4300202839756593,
"grad_norm": 0.28515625,
"learning_rate": 1.8999323867478028e-06,
"loss": 1.0371,
"step": 1198
},
{
"epoch": 2.4320486815415823,
"grad_norm": 0.265625,
"learning_rate": 1.8931710615280597e-06,
"loss": 1.0313,
"step": 1199
},
{
"epoch": 2.4340770791075053,
"grad_norm": 0.27734375,
"learning_rate": 1.8864097363083166e-06,
"loss": 1.0275,
"step": 1200
},
{
"epoch": 2.436105476673428,
"grad_norm": 0.255859375,
"learning_rate": 1.8796484110885735e-06,
"loss": 1.0283,
"step": 1201
},
{
"epoch": 2.438133874239351,
"grad_norm": 0.25,
"learning_rate": 1.8728870858688306e-06,
"loss": 1.0195,
"step": 1202
},
{
"epoch": 2.440162271805274,
"grad_norm": 0.25,
"learning_rate": 1.8661257606490873e-06,
"loss": 1.0568,
"step": 1203
},
{
"epoch": 2.4421906693711968,
"grad_norm": 0.306640625,
"learning_rate": 1.8593644354293445e-06,
"loss": 1.0432,
"step": 1204
},
{
"epoch": 2.4442190669371198,
"grad_norm": 0.3671875,
"learning_rate": 1.8526031102096012e-06,
"loss": 1.0448,
"step": 1205
},
{
"epoch": 2.4462474645030428,
"grad_norm": 0.267578125,
"learning_rate": 1.8458417849898583e-06,
"loss": 1.0647,
"step": 1206
},
{
"epoch": 2.4482758620689653,
"grad_norm": 0.263671875,
"learning_rate": 1.839080459770115e-06,
"loss": 1.0329,
"step": 1207
},
{
"epoch": 2.4503042596348883,
"grad_norm": 0.2578125,
"learning_rate": 1.8323191345503721e-06,
"loss": 1.0383,
"step": 1208
},
{
"epoch": 2.4523326572008113,
"grad_norm": 0.3203125,
"learning_rate": 1.825557809330629e-06,
"loss": 1.0725,
"step": 1209
},
{
"epoch": 2.4543610547667343,
"grad_norm": 0.25390625,
"learning_rate": 1.818796484110886e-06,
"loss": 1.0147,
"step": 1210
},
{
"epoch": 2.4563894523326573,
"grad_norm": 0.2578125,
"learning_rate": 1.8120351588911429e-06,
"loss": 1.0861,
"step": 1211
},
{
"epoch": 2.4584178498985803,
"grad_norm": 0.26171875,
"learning_rate": 1.8052738336713998e-06,
"loss": 1.0131,
"step": 1212
},
{
"epoch": 2.4604462474645032,
"grad_norm": 0.25390625,
"learning_rate": 1.7985125084516567e-06,
"loss": 1.0415,
"step": 1213
},
{
"epoch": 2.462474645030426,
"grad_norm": 0.259765625,
"learning_rate": 1.7917511832319138e-06,
"loss": 0.9922,
"step": 1214
},
{
"epoch": 2.464503042596349,
"grad_norm": 0.265625,
"learning_rate": 1.7849898580121705e-06,
"loss": 1.0788,
"step": 1215
},
{
"epoch": 2.4665314401622718,
"grad_norm": 0.2578125,
"learning_rate": 1.7782285327924276e-06,
"loss": 1.0135,
"step": 1216
},
{
"epoch": 2.4685598377281948,
"grad_norm": 0.25390625,
"learning_rate": 1.7714672075726843e-06,
"loss": 1.0231,
"step": 1217
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.2890625,
"learning_rate": 1.7647058823529414e-06,
"loss": 1.0446,
"step": 1218
},
{
"epoch": 2.4726166328600407,
"grad_norm": 0.248046875,
"learning_rate": 1.7579445571331981e-06,
"loss": 1.0019,
"step": 1219
},
{
"epoch": 2.4746450304259637,
"grad_norm": 0.302734375,
"learning_rate": 1.7511832319134553e-06,
"loss": 0.9993,
"step": 1220
},
{
"epoch": 2.4766734279918863,
"grad_norm": 0.2734375,
"learning_rate": 1.7444219066937122e-06,
"loss": 0.9909,
"step": 1221
},
{
"epoch": 2.4787018255578093,
"grad_norm": 0.26953125,
"learning_rate": 1.737660581473969e-06,
"loss": 1.0211,
"step": 1222
},
{
"epoch": 2.4807302231237323,
"grad_norm": 0.275390625,
"learning_rate": 1.730899256254226e-06,
"loss": 1.0205,
"step": 1223
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.37109375,
"learning_rate": 1.724137931034483e-06,
"loss": 1.0238,
"step": 1224
},
{
"epoch": 2.4847870182555782,
"grad_norm": 0.25390625,
"learning_rate": 1.7173766058147398e-06,
"loss": 0.9695,
"step": 1225
},
{
"epoch": 2.486815415821501,
"grad_norm": 0.3359375,
"learning_rate": 1.710615280594997e-06,
"loss": 0.9944,
"step": 1226
},
{
"epoch": 2.4888438133874238,
"grad_norm": 0.251953125,
"learning_rate": 1.7038539553752536e-06,
"loss": 1.0094,
"step": 1227
},
{
"epoch": 2.4908722109533468,
"grad_norm": 0.265625,
"learning_rate": 1.6970926301555108e-06,
"loss": 1.0162,
"step": 1228
},
{
"epoch": 2.4929006085192698,
"grad_norm": 0.263671875,
"learning_rate": 1.6903313049357675e-06,
"loss": 1.0123,
"step": 1229
},
{
"epoch": 2.4949290060851927,
"grad_norm": 0.2490234375,
"learning_rate": 1.6835699797160246e-06,
"loss": 1.0224,
"step": 1230
},
{
"epoch": 2.4969574036511157,
"grad_norm": 0.255859375,
"learning_rate": 1.6768086544962813e-06,
"loss": 0.9803,
"step": 1231
},
{
"epoch": 2.4989858012170387,
"grad_norm": 0.259765625,
"learning_rate": 1.6700473292765384e-06,
"loss": 1.0051,
"step": 1232
},
{
"epoch": 2.5010141987829613,
"grad_norm": 0.2578125,
"learning_rate": 1.6632860040567953e-06,
"loss": 1.0288,
"step": 1233
},
{
"epoch": 2.5030425963488843,
"grad_norm": 0.25,
"learning_rate": 1.6565246788370522e-06,
"loss": 1.0262,
"step": 1234
},
{
"epoch": 2.5050709939148073,
"grad_norm": 0.294921875,
"learning_rate": 1.6497633536173092e-06,
"loss": 1.0119,
"step": 1235
},
{
"epoch": 2.5070993914807302,
"grad_norm": 0.30859375,
"learning_rate": 1.6430020283975663e-06,
"loss": 1.0361,
"step": 1236
},
{
"epoch": 2.5091277890466532,
"grad_norm": 0.30078125,
"learning_rate": 1.636240703177823e-06,
"loss": 0.9858,
"step": 1237
},
{
"epoch": 2.5111561866125762,
"grad_norm": 0.259765625,
"learning_rate": 1.62947937795808e-06,
"loss": 1.0318,
"step": 1238
},
{
"epoch": 2.513184584178499,
"grad_norm": 0.244140625,
"learning_rate": 1.6227180527383368e-06,
"loss": 1.0007,
"step": 1239
},
{
"epoch": 2.5152129817444218,
"grad_norm": 0.248046875,
"learning_rate": 1.615956727518594e-06,
"loss": 1.0325,
"step": 1240
},
{
"epoch": 2.5172413793103448,
"grad_norm": 0.302734375,
"learning_rate": 1.6091954022988506e-06,
"loss": 1.0019,
"step": 1241
},
{
"epoch": 2.5192697768762677,
"grad_norm": 0.337890625,
"learning_rate": 1.6024340770791077e-06,
"loss": 1.0205,
"step": 1242
},
{
"epoch": 2.5212981744421907,
"grad_norm": 0.283203125,
"learning_rate": 1.5956727518593644e-06,
"loss": 1.0035,
"step": 1243
},
{
"epoch": 2.5233265720081137,
"grad_norm": 0.314453125,
"learning_rate": 1.5889114266396216e-06,
"loss": 1.0094,
"step": 1244
},
{
"epoch": 2.5253549695740363,
"grad_norm": 0.294921875,
"learning_rate": 1.5821501014198785e-06,
"loss": 1.0204,
"step": 1245
},
{
"epoch": 2.5273833671399597,
"grad_norm": 0.298828125,
"learning_rate": 1.5753887762001354e-06,
"loss": 0.9897,
"step": 1246
},
{
"epoch": 2.5294117647058822,
"grad_norm": 0.2578125,
"learning_rate": 1.5686274509803923e-06,
"loss": 1.051,
"step": 1247
},
{
"epoch": 2.5314401622718052,
"grad_norm": 0.2734375,
"learning_rate": 1.5618661257606494e-06,
"loss": 0.9939,
"step": 1248
},
{
"epoch": 2.5334685598377282,
"grad_norm": 0.2578125,
"learning_rate": 1.5551048005409061e-06,
"loss": 1.0292,
"step": 1249
},
{
"epoch": 2.535496957403651,
"grad_norm": 0.388671875,
"learning_rate": 1.5483434753211632e-06,
"loss": 1.1074,
"step": 1250
},
{
"epoch": 2.537525354969574,
"grad_norm": 0.412109375,
"learning_rate": 1.54158215010142e-06,
"loss": 0.9867,
"step": 1251
},
{
"epoch": 2.5395537525354968,
"grad_norm": 0.255859375,
"learning_rate": 1.534820824881677e-06,
"loss": 1.0396,
"step": 1252
},
{
"epoch": 2.5415821501014197,
"grad_norm": 0.28125,
"learning_rate": 1.5280594996619338e-06,
"loss": 0.9953,
"step": 1253
},
{
"epoch": 2.5436105476673427,
"grad_norm": 0.279296875,
"learning_rate": 1.5212981744421909e-06,
"loss": 1.0594,
"step": 1254
},
{
"epoch": 2.5456389452332657,
"grad_norm": 0.296875,
"learning_rate": 1.5145368492224478e-06,
"loss": 1.0398,
"step": 1255
},
{
"epoch": 2.5476673427991887,
"grad_norm": 0.25,
"learning_rate": 1.5077755240027047e-06,
"loss": 1.0247,
"step": 1256
},
{
"epoch": 2.5496957403651117,
"grad_norm": 0.263671875,
"learning_rate": 1.5010141987829616e-06,
"loss": 1.003,
"step": 1257
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.28515625,
"learning_rate": 1.4942528735632185e-06,
"loss": 1.0286,
"step": 1258
},
{
"epoch": 2.5537525354969572,
"grad_norm": 0.25,
"learning_rate": 1.4874915483434755e-06,
"loss": 1.0062,
"step": 1259
},
{
"epoch": 2.5557809330628802,
"grad_norm": 0.2490234375,
"learning_rate": 1.4807302231237326e-06,
"loss": 1.0012,
"step": 1260
},
{
"epoch": 2.5578093306288032,
"grad_norm": 0.251953125,
"learning_rate": 1.4739688979039893e-06,
"loss": 1.0322,
"step": 1261
},
{
"epoch": 2.559837728194726,
"grad_norm": 0.255859375,
"learning_rate": 1.4672075726842464e-06,
"loss": 1.0477,
"step": 1262
},
{
"epoch": 2.561866125760649,
"grad_norm": 0.255859375,
"learning_rate": 1.460446247464503e-06,
"loss": 1.0406,
"step": 1263
},
{
"epoch": 2.5638945233265718,
"grad_norm": 0.25390625,
"learning_rate": 1.4536849222447602e-06,
"loss": 1.0073,
"step": 1264
},
{
"epoch": 2.565922920892495,
"grad_norm": 0.25390625,
"learning_rate": 1.446923597025017e-06,
"loss": 1.0306,
"step": 1265
},
{
"epoch": 2.5679513184584177,
"grad_norm": 0.28125,
"learning_rate": 1.440162271805274e-06,
"loss": 0.9963,
"step": 1266
},
{
"epoch": 2.5699797160243407,
"grad_norm": 0.26953125,
"learning_rate": 1.433400946585531e-06,
"loss": 0.9997,
"step": 1267
},
{
"epoch": 2.5720081135902637,
"grad_norm": 0.384765625,
"learning_rate": 1.4266396213657879e-06,
"loss": 1.0072,
"step": 1268
},
{
"epoch": 2.5740365111561867,
"grad_norm": 0.2470703125,
"learning_rate": 1.4198782961460448e-06,
"loss": 1.0053,
"step": 1269
},
{
"epoch": 2.5760649087221097,
"grad_norm": 0.25390625,
"learning_rate": 1.4131169709263017e-06,
"loss": 1.029,
"step": 1270
},
{
"epoch": 2.5780933062880322,
"grad_norm": 0.275390625,
"learning_rate": 1.4063556457065586e-06,
"loss": 1.0295,
"step": 1271
},
{
"epoch": 2.5801217038539552,
"grad_norm": 0.30078125,
"learning_rate": 1.3995943204868157e-06,
"loss": 1.0263,
"step": 1272
},
{
"epoch": 2.582150101419878,
"grad_norm": 0.255859375,
"learning_rate": 1.3928329952670724e-06,
"loss": 0.997,
"step": 1273
},
{
"epoch": 2.584178498985801,
"grad_norm": 0.2578125,
"learning_rate": 1.3860716700473295e-06,
"loss": 1.0262,
"step": 1274
},
{
"epoch": 2.586206896551724,
"grad_norm": 0.25390625,
"learning_rate": 1.3793103448275862e-06,
"loss": 1.0284,
"step": 1275
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.263671875,
"learning_rate": 1.3725490196078434e-06,
"loss": 0.9981,
"step": 1276
},
{
"epoch": 2.59026369168357,
"grad_norm": 0.259765625,
"learning_rate": 1.3657876943881e-06,
"loss": 1.0424,
"step": 1277
},
{
"epoch": 2.5922920892494927,
"grad_norm": 0.28515625,
"learning_rate": 1.3590263691683572e-06,
"loss": 1.0289,
"step": 1278
},
{
"epoch": 2.5943204868154157,
"grad_norm": 0.25390625,
"learning_rate": 1.352265043948614e-06,
"loss": 1.0007,
"step": 1279
},
{
"epoch": 2.5963488843813387,
"grad_norm": 0.25390625,
"learning_rate": 1.345503718728871e-06,
"loss": 1.0017,
"step": 1280
},
{
"epoch": 2.5983772819472617,
"grad_norm": 0.263671875,
"learning_rate": 1.338742393509128e-06,
"loss": 1.0152,
"step": 1281
},
{
"epoch": 2.6004056795131847,
"grad_norm": 0.37109375,
"learning_rate": 1.331981068289385e-06,
"loss": 1.0119,
"step": 1282
},
{
"epoch": 2.6024340770791072,
"grad_norm": 0.296875,
"learning_rate": 1.3252197430696418e-06,
"loss": 1.0538,
"step": 1283
},
{
"epoch": 2.6044624746450307,
"grad_norm": 0.267578125,
"learning_rate": 1.3184584178498989e-06,
"loss": 1.0105,
"step": 1284
},
{
"epoch": 2.606490872210953,
"grad_norm": 0.25390625,
"learning_rate": 1.3116970926301556e-06,
"loss": 0.9894,
"step": 1285
},
{
"epoch": 2.608519269776876,
"grad_norm": 0.3515625,
"learning_rate": 1.3049357674104127e-06,
"loss": 0.9832,
"step": 1286
},
{
"epoch": 2.610547667342799,
"grad_norm": 0.26171875,
"learning_rate": 1.2981744421906694e-06,
"loss": 1.0766,
"step": 1287
},
{
"epoch": 2.612576064908722,
"grad_norm": 0.2578125,
"learning_rate": 1.2914131169709265e-06,
"loss": 1.0027,
"step": 1288
},
{
"epoch": 2.614604462474645,
"grad_norm": 0.255859375,
"learning_rate": 1.2846517917511832e-06,
"loss": 1.0245,
"step": 1289
},
{
"epoch": 2.6166328600405677,
"grad_norm": 0.25390625,
"learning_rate": 1.2778904665314403e-06,
"loss": 1.065,
"step": 1290
},
{
"epoch": 2.6186612576064907,
"grad_norm": 0.33203125,
"learning_rate": 1.2711291413116973e-06,
"loss": 1.0342,
"step": 1291
},
{
"epoch": 2.6206896551724137,
"grad_norm": 0.275390625,
"learning_rate": 1.2643678160919542e-06,
"loss": 1.045,
"step": 1292
},
{
"epoch": 2.6227180527383367,
"grad_norm": 0.31640625,
"learning_rate": 1.257606490872211e-06,
"loss": 1.0383,
"step": 1293
},
{
"epoch": 2.6247464503042597,
"grad_norm": 0.255859375,
"learning_rate": 1.2508451656524682e-06,
"loss": 0.9908,
"step": 1294
},
{
"epoch": 2.6267748478701827,
"grad_norm": 0.255859375,
"learning_rate": 1.244083840432725e-06,
"loss": 1.048,
"step": 1295
},
{
"epoch": 2.6288032454361057,
"grad_norm": 0.271484375,
"learning_rate": 1.2373225152129818e-06,
"loss": 1.0501,
"step": 1296
},
{
"epoch": 2.630831643002028,
"grad_norm": 0.25,
"learning_rate": 1.2305611899932387e-06,
"loss": 0.9794,
"step": 1297
},
{
"epoch": 2.632860040567951,
"grad_norm": 0.28125,
"learning_rate": 1.2237998647734956e-06,
"loss": 0.9768,
"step": 1298
},
{
"epoch": 2.634888438133874,
"grad_norm": 0.248046875,
"learning_rate": 1.2170385395537525e-06,
"loss": 1.0291,
"step": 1299
},
{
"epoch": 2.636916835699797,
"grad_norm": 0.248046875,
"learning_rate": 1.2102772143340095e-06,
"loss": 1.0042,
"step": 1300
},
{
"epoch": 2.63894523326572,
"grad_norm": 0.267578125,
"learning_rate": 1.2035158891142666e-06,
"loss": 1.049,
"step": 1301
},
{
"epoch": 2.640973630831643,
"grad_norm": 0.2451171875,
"learning_rate": 1.1967545638945235e-06,
"loss": 1.0256,
"step": 1302
},
{
"epoch": 2.643002028397566,
"grad_norm": 0.318359375,
"learning_rate": 1.1899932386747804e-06,
"loss": 0.9939,
"step": 1303
},
{
"epoch": 2.6450304259634887,
"grad_norm": 0.259765625,
"learning_rate": 1.1832319134550373e-06,
"loss": 1.0379,
"step": 1304
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.31640625,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.0315,
"step": 1305
},
{
"epoch": 2.6490872210953347,
"grad_norm": 0.25390625,
"learning_rate": 1.1697092630155511e-06,
"loss": 1.0205,
"step": 1306
},
{
"epoch": 2.6511156186612577,
"grad_norm": 0.251953125,
"learning_rate": 1.162947937795808e-06,
"loss": 1.0316,
"step": 1307
},
{
"epoch": 2.6531440162271807,
"grad_norm": 0.255859375,
"learning_rate": 1.156186612576065e-06,
"loss": 1.05,
"step": 1308
},
{
"epoch": 2.655172413793103,
"grad_norm": 0.248046875,
"learning_rate": 1.1494252873563219e-06,
"loss": 1.0154,
"step": 1309
},
{
"epoch": 2.6572008113590266,
"grad_norm": 0.255859375,
"learning_rate": 1.1426639621365788e-06,
"loss": 1.0074,
"step": 1310
},
{
"epoch": 2.659229208924949,
"grad_norm": 0.275390625,
"learning_rate": 1.1359026369168357e-06,
"loss": 1.0492,
"step": 1311
},
{
"epoch": 2.661257606490872,
"grad_norm": 0.396484375,
"learning_rate": 1.1291413116970926e-06,
"loss": 1.0047,
"step": 1312
},
{
"epoch": 2.663286004056795,
"grad_norm": 0.2734375,
"learning_rate": 1.1223799864773497e-06,
"loss": 1.0508,
"step": 1313
},
{
"epoch": 2.665314401622718,
"grad_norm": 0.25390625,
"learning_rate": 1.1156186612576066e-06,
"loss": 1.0369,
"step": 1314
},
{
"epoch": 2.667342799188641,
"grad_norm": 0.25390625,
"learning_rate": 1.1088573360378636e-06,
"loss": 1.0213,
"step": 1315
},
{
"epoch": 2.6693711967545637,
"grad_norm": 0.2578125,
"learning_rate": 1.1020960108181205e-06,
"loss": 1.0402,
"step": 1316
},
{
"epoch": 2.6713995943204867,
"grad_norm": 0.494140625,
"learning_rate": 1.0953346855983774e-06,
"loss": 0.9848,
"step": 1317
},
{
"epoch": 2.6734279918864097,
"grad_norm": 0.296875,
"learning_rate": 1.0885733603786343e-06,
"loss": 1.0522,
"step": 1318
},
{
"epoch": 2.6754563894523327,
"grad_norm": 0.26171875,
"learning_rate": 1.0818120351588912e-06,
"loss": 1.0088,
"step": 1319
},
{
"epoch": 2.6774847870182557,
"grad_norm": 0.283203125,
"learning_rate": 1.0750507099391481e-06,
"loss": 1.0378,
"step": 1320
},
{
"epoch": 2.6795131845841786,
"grad_norm": 0.341796875,
"learning_rate": 1.068289384719405e-06,
"loss": 1.0448,
"step": 1321
},
{
"epoch": 2.6815415821501016,
"grad_norm": 0.291015625,
"learning_rate": 1.061528059499662e-06,
"loss": 1.0244,
"step": 1322
},
{
"epoch": 2.683569979716024,
"grad_norm": 0.259765625,
"learning_rate": 1.0547667342799188e-06,
"loss": 1.0069,
"step": 1323
},
{
"epoch": 2.685598377281947,
"grad_norm": 0.267578125,
"learning_rate": 1.048005409060176e-06,
"loss": 1.042,
"step": 1324
},
{
"epoch": 2.68762677484787,
"grad_norm": 0.248046875,
"learning_rate": 1.0412440838404329e-06,
"loss": 1.0167,
"step": 1325
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.294921875,
"learning_rate": 1.0344827586206898e-06,
"loss": 1.0087,
"step": 1326
},
{
"epoch": 2.691683569979716,
"grad_norm": 0.255859375,
"learning_rate": 1.0277214334009467e-06,
"loss": 1.0285,
"step": 1327
},
{
"epoch": 2.6937119675456387,
"grad_norm": 0.26171875,
"learning_rate": 1.0209601081812036e-06,
"loss": 1.0014,
"step": 1328
},
{
"epoch": 2.695740365111562,
"grad_norm": 0.2451171875,
"learning_rate": 1.0141987829614605e-06,
"loss": 1.0103,
"step": 1329
},
{
"epoch": 2.6977687626774847,
"grad_norm": 0.25,
"learning_rate": 1.0074374577417174e-06,
"loss": 1.0248,
"step": 1330
},
{
"epoch": 2.6997971602434077,
"grad_norm": 0.248046875,
"learning_rate": 1.0006761325219743e-06,
"loss": 1.0008,
"step": 1331
},
{
"epoch": 2.7018255578093306,
"grad_norm": 0.2490234375,
"learning_rate": 9.939148073022313e-07,
"loss": 0.9954,
"step": 1332
},
{
"epoch": 2.7038539553752536,
"grad_norm": 0.25390625,
"learning_rate": 9.871534820824882e-07,
"loss": 1.0192,
"step": 1333
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.2470703125,
"learning_rate": 9.80392156862745e-07,
"loss": 1.0073,
"step": 1334
},
{
"epoch": 2.707910750507099,
"grad_norm": 0.328125,
"learning_rate": 9.73630831643002e-07,
"loss": 1.0504,
"step": 1335
},
{
"epoch": 2.709939148073022,
"grad_norm": 0.37890625,
"learning_rate": 9.668695064232591e-07,
"loss": 1.0183,
"step": 1336
},
{
"epoch": 2.711967545638945,
"grad_norm": 0.255859375,
"learning_rate": 9.60108181203516e-07,
"loss": 1.0052,
"step": 1337
},
{
"epoch": 2.713995943204868,
"grad_norm": 0.25,
"learning_rate": 9.533468559837728e-07,
"loss": 1.0111,
"step": 1338
},
{
"epoch": 2.716024340770791,
"grad_norm": 0.244140625,
"learning_rate": 9.465855307640299e-07,
"loss": 1.0238,
"step": 1339
},
{
"epoch": 2.718052738336714,
"grad_norm": 0.265625,
"learning_rate": 9.398242055442868e-07,
"loss": 1.0333,
"step": 1340
},
{
"epoch": 2.720081135902637,
"grad_norm": 0.2490234375,
"learning_rate": 9.330628803245437e-07,
"loss": 1.0292,
"step": 1341
},
{
"epoch": 2.7221095334685597,
"grad_norm": 0.244140625,
"learning_rate": 9.263015551048006e-07,
"loss": 0.9643,
"step": 1342
},
{
"epoch": 2.7241379310344827,
"grad_norm": 0.251953125,
"learning_rate": 9.195402298850575e-07,
"loss": 1.0272,
"step": 1343
},
{
"epoch": 2.7261663286004056,
"grad_norm": 0.263671875,
"learning_rate": 9.127789046653145e-07,
"loss": 1.0355,
"step": 1344
},
{
"epoch": 2.7281947261663286,
"grad_norm": 0.255859375,
"learning_rate": 9.060175794455714e-07,
"loss": 1.0653,
"step": 1345
},
{
"epoch": 2.7302231237322516,
"grad_norm": 0.26171875,
"learning_rate": 8.992562542258283e-07,
"loss": 1.0541,
"step": 1346
},
{
"epoch": 2.732251521298174,
"grad_norm": 0.279296875,
"learning_rate": 8.924949290060852e-07,
"loss": 1.0,
"step": 1347
},
{
"epoch": 2.7342799188640976,
"grad_norm": 0.291015625,
"learning_rate": 8.857336037863422e-07,
"loss": 1.0285,
"step": 1348
},
{
"epoch": 2.73630831643002,
"grad_norm": 0.25390625,
"learning_rate": 8.789722785665991e-07,
"loss": 1.0156,
"step": 1349
},
{
"epoch": 2.738336713995943,
"grad_norm": 0.291015625,
"learning_rate": 8.722109533468561e-07,
"loss": 1.0228,
"step": 1350
},
{
"epoch": 2.740365111561866,
"grad_norm": 0.255859375,
"learning_rate": 8.65449628127113e-07,
"loss": 1.0123,
"step": 1351
},
{
"epoch": 2.742393509127789,
"grad_norm": 0.25390625,
"learning_rate": 8.586883029073699e-07,
"loss": 1.0359,
"step": 1352
},
{
"epoch": 2.744421906693712,
"grad_norm": 0.291015625,
"learning_rate": 8.519269776876268e-07,
"loss": 1.0283,
"step": 1353
},
{
"epoch": 2.7464503042596347,
"grad_norm": 0.251953125,
"learning_rate": 8.451656524678837e-07,
"loss": 1.0165,
"step": 1354
},
{
"epoch": 2.7484787018255576,
"grad_norm": 0.25390625,
"learning_rate": 8.384043272481406e-07,
"loss": 1.0471,
"step": 1355
},
{
"epoch": 2.7505070993914806,
"grad_norm": 0.25,
"learning_rate": 8.316430020283977e-07,
"loss": 1.037,
"step": 1356
},
{
"epoch": 2.7525354969574036,
"grad_norm": 0.357421875,
"learning_rate": 8.248816768086546e-07,
"loss": 1.0044,
"step": 1357
},
{
"epoch": 2.7545638945233266,
"grad_norm": 0.259765625,
"learning_rate": 8.181203515889115e-07,
"loss": 1.0651,
"step": 1358
},
{
"epoch": 2.7565922920892496,
"grad_norm": 0.255859375,
"learning_rate": 8.113590263691684e-07,
"loss": 1.0226,
"step": 1359
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.35546875,
"learning_rate": 8.045977011494253e-07,
"loss": 1.0059,
"step": 1360
},
{
"epoch": 2.760649087221095,
"grad_norm": 0.251953125,
"learning_rate": 7.978363759296822e-07,
"loss": 0.9958,
"step": 1361
},
{
"epoch": 2.762677484787018,
"grad_norm": 0.251953125,
"learning_rate": 7.910750507099392e-07,
"loss": 1.0321,
"step": 1362
},
{
"epoch": 2.764705882352941,
"grad_norm": 0.255859375,
"learning_rate": 7.843137254901962e-07,
"loss": 1.0443,
"step": 1363
},
{
"epoch": 2.766734279918864,
"grad_norm": 0.2578125,
"learning_rate": 7.775524002704531e-07,
"loss": 1.0088,
"step": 1364
},
{
"epoch": 2.768762677484787,
"grad_norm": 0.251953125,
"learning_rate": 7.7079107505071e-07,
"loss": 1.0268,
"step": 1365
},
{
"epoch": 2.77079107505071,
"grad_norm": 0.26171875,
"learning_rate": 7.640297498309669e-07,
"loss": 1.0209,
"step": 1366
},
{
"epoch": 2.772819472616633,
"grad_norm": 0.287109375,
"learning_rate": 7.572684246112239e-07,
"loss": 0.9831,
"step": 1367
},
{
"epoch": 2.7748478701825556,
"grad_norm": 0.275390625,
"learning_rate": 7.505070993914808e-07,
"loss": 0.9727,
"step": 1368
},
{
"epoch": 2.7768762677484786,
"grad_norm": 0.25390625,
"learning_rate": 7.437457741717377e-07,
"loss": 1.012,
"step": 1369
},
{
"epoch": 2.7789046653144016,
"grad_norm": 0.27734375,
"learning_rate": 7.369844489519946e-07,
"loss": 1.0417,
"step": 1370
},
{
"epoch": 2.7809330628803246,
"grad_norm": 0.25,
"learning_rate": 7.302231237322515e-07,
"loss": 0.9739,
"step": 1371
},
{
"epoch": 2.7829614604462476,
"grad_norm": 0.2490234375,
"learning_rate": 7.234617985125085e-07,
"loss": 1.0208,
"step": 1372
},
{
"epoch": 2.78498985801217,
"grad_norm": 0.26953125,
"learning_rate": 7.167004732927655e-07,
"loss": 1.0333,
"step": 1373
},
{
"epoch": 2.7870182555780936,
"grad_norm": 0.2578125,
"learning_rate": 7.099391480730224e-07,
"loss": 0.9805,
"step": 1374
},
{
"epoch": 2.789046653144016,
"grad_norm": 0.26171875,
"learning_rate": 7.031778228532793e-07,
"loss": 1.0046,
"step": 1375
},
{
"epoch": 2.791075050709939,
"grad_norm": 0.3046875,
"learning_rate": 6.964164976335362e-07,
"loss": 1.0241,
"step": 1376
},
{
"epoch": 2.793103448275862,
"grad_norm": 0.25,
"learning_rate": 6.896551724137931e-07,
"loss": 1.0465,
"step": 1377
},
{
"epoch": 2.795131845841785,
"grad_norm": 0.25,
"learning_rate": 6.8289384719405e-07,
"loss": 0.9962,
"step": 1378
},
{
"epoch": 2.797160243407708,
"grad_norm": 0.25,
"learning_rate": 6.76132521974307e-07,
"loss": 1.0428,
"step": 1379
},
{
"epoch": 2.7991886409736306,
"grad_norm": 0.2451171875,
"learning_rate": 6.69371196754564e-07,
"loss": 0.9963,
"step": 1380
},
{
"epoch": 2.8012170385395536,
"grad_norm": 0.25390625,
"learning_rate": 6.626098715348209e-07,
"loss": 0.9982,
"step": 1381
},
{
"epoch": 2.8032454361054766,
"grad_norm": 0.28125,
"learning_rate": 6.558485463150778e-07,
"loss": 1.038,
"step": 1382
},
{
"epoch": 2.8052738336713996,
"grad_norm": 0.255859375,
"learning_rate": 6.490872210953347e-07,
"loss": 1.0555,
"step": 1383
},
{
"epoch": 2.8073022312373226,
"grad_norm": 0.330078125,
"learning_rate": 6.423258958755916e-07,
"loss": 1.0506,
"step": 1384
},
{
"epoch": 2.8093306288032456,
"grad_norm": 0.2578125,
"learning_rate": 6.355645706558486e-07,
"loss": 0.986,
"step": 1385
},
{
"epoch": 2.8113590263691686,
"grad_norm": 0.251953125,
"learning_rate": 6.288032454361055e-07,
"loss": 1.0352,
"step": 1386
},
{
"epoch": 2.813387423935091,
"grad_norm": 0.25390625,
"learning_rate": 6.220419202163624e-07,
"loss": 1.0198,
"step": 1387
},
{
"epoch": 2.815415821501014,
"grad_norm": 0.263671875,
"learning_rate": 6.152805949966194e-07,
"loss": 1.0464,
"step": 1388
},
{
"epoch": 2.817444219066937,
"grad_norm": 0.2470703125,
"learning_rate": 6.085192697768763e-07,
"loss": 1.029,
"step": 1389
},
{
"epoch": 2.81947261663286,
"grad_norm": 0.26171875,
"learning_rate": 6.017579445571333e-07,
"loss": 1.0276,
"step": 1390
},
{
"epoch": 2.821501014198783,
"grad_norm": 0.255859375,
"learning_rate": 5.949966193373902e-07,
"loss": 1.0087,
"step": 1391
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.265625,
"learning_rate": 5.882352941176471e-07,
"loss": 0.9787,
"step": 1392
},
{
"epoch": 2.825557809330629,
"grad_norm": 0.283203125,
"learning_rate": 5.81473968897904e-07,
"loss": 0.9883,
"step": 1393
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.3359375,
"learning_rate": 5.747126436781609e-07,
"loss": 1.0263,
"step": 1394
},
{
"epoch": 2.8296146044624746,
"grad_norm": 0.30859375,
"learning_rate": 5.679513184584178e-07,
"loss": 1.0101,
"step": 1395
},
{
"epoch": 2.8316430020283976,
"grad_norm": 0.255859375,
"learning_rate": 5.611899932386749e-07,
"loss": 1.0315,
"step": 1396
},
{
"epoch": 2.8336713995943206,
"grad_norm": 0.2578125,
"learning_rate": 5.544286680189318e-07,
"loss": 1.0142,
"step": 1397
},
{
"epoch": 2.8356997971602436,
"grad_norm": 0.251953125,
"learning_rate": 5.476673427991887e-07,
"loss": 1.007,
"step": 1398
},
{
"epoch": 2.837728194726166,
"grad_norm": 0.26953125,
"learning_rate": 5.409060175794456e-07,
"loss": 1.0391,
"step": 1399
},
{
"epoch": 2.839756592292089,
"grad_norm": 0.2578125,
"learning_rate": 5.341446923597025e-07,
"loss": 1.0453,
"step": 1400
},
{
"epoch": 2.841784989858012,
"grad_norm": 0.255859375,
"learning_rate": 5.273833671399594e-07,
"loss": 1.0113,
"step": 1401
},
{
"epoch": 2.843813387423935,
"grad_norm": 0.251953125,
"learning_rate": 5.206220419202164e-07,
"loss": 1.026,
"step": 1402
},
{
"epoch": 2.845841784989858,
"grad_norm": 0.251953125,
"learning_rate": 5.138607167004734e-07,
"loss": 1.0115,
"step": 1403
},
{
"epoch": 2.847870182555781,
"grad_norm": 0.27734375,
"learning_rate": 5.070993914807303e-07,
"loss": 1.0217,
"step": 1404
},
{
"epoch": 2.849898580121704,
"grad_norm": 0.2578125,
"learning_rate": 5.003380662609872e-07,
"loss": 1.0084,
"step": 1405
},
{
"epoch": 2.8519269776876266,
"grad_norm": 0.314453125,
"learning_rate": 4.935767410412441e-07,
"loss": 0.988,
"step": 1406
},
{
"epoch": 2.8539553752535496,
"grad_norm": 0.291015625,
"learning_rate": 4.86815415821501e-07,
"loss": 1.0294,
"step": 1407
},
{
"epoch": 2.8559837728194726,
"grad_norm": 0.2451171875,
"learning_rate": 4.80054090601758e-07,
"loss": 1.0212,
"step": 1408
},
{
"epoch": 2.8580121703853956,
"grad_norm": 0.259765625,
"learning_rate": 4.732927653820149e-07,
"loss": 1.043,
"step": 1409
},
{
"epoch": 2.8600405679513186,
"grad_norm": 0.3125,
"learning_rate": 4.6653144016227184e-07,
"loss": 0.9945,
"step": 1410
},
{
"epoch": 2.862068965517241,
"grad_norm": 0.291015625,
"learning_rate": 4.5977011494252875e-07,
"loss": 0.9841,
"step": 1411
},
{
"epoch": 2.8640973630831645,
"grad_norm": 0.2490234375,
"learning_rate": 4.530087897227857e-07,
"loss": 1.0136,
"step": 1412
},
{
"epoch": 2.866125760649087,
"grad_norm": 0.3515625,
"learning_rate": 4.462474645030426e-07,
"loss": 1.0142,
"step": 1413
},
{
"epoch": 2.86815415821501,
"grad_norm": 0.271484375,
"learning_rate": 4.3948613928329954e-07,
"loss": 1.022,
"step": 1414
},
{
"epoch": 2.870182555780933,
"grad_norm": 0.25390625,
"learning_rate": 4.327248140635565e-07,
"loss": 1.0315,
"step": 1415
},
{
"epoch": 2.872210953346856,
"grad_norm": 0.2451171875,
"learning_rate": 4.259634888438134e-07,
"loss": 1.0136,
"step": 1416
},
{
"epoch": 2.874239350912779,
"grad_norm": 0.2451171875,
"learning_rate": 4.192021636240703e-07,
"loss": 1.006,
"step": 1417
},
{
"epoch": 2.8762677484787016,
"grad_norm": 0.2470703125,
"learning_rate": 4.124408384043273e-07,
"loss": 0.9906,
"step": 1418
},
{
"epoch": 2.8782961460446246,
"grad_norm": 0.2734375,
"learning_rate": 4.056795131845842e-07,
"loss": 1.0487,
"step": 1419
},
{
"epoch": 2.8803245436105476,
"grad_norm": 0.26171875,
"learning_rate": 3.989181879648411e-07,
"loss": 1.0244,
"step": 1420
},
{
"epoch": 2.8823529411764706,
"grad_norm": 0.296875,
"learning_rate": 3.921568627450981e-07,
"loss": 1.0045,
"step": 1421
},
{
"epoch": 2.8843813387423936,
"grad_norm": 0.259765625,
"learning_rate": 3.85395537525355e-07,
"loss": 1.0387,
"step": 1422
},
{
"epoch": 2.8864097363083165,
"grad_norm": 0.265625,
"learning_rate": 3.7863421230561195e-07,
"loss": 1.0187,
"step": 1423
},
{
"epoch": 2.8884381338742395,
"grad_norm": 0.287109375,
"learning_rate": 3.7187288708586886e-07,
"loss": 0.9929,
"step": 1424
},
{
"epoch": 2.890466531440162,
"grad_norm": 0.25390625,
"learning_rate": 3.651115618661258e-07,
"loss": 1.013,
"step": 1425
},
{
"epoch": 2.892494929006085,
"grad_norm": 0.314453125,
"learning_rate": 3.5835023664638274e-07,
"loss": 0.9687,
"step": 1426
},
{
"epoch": 2.894523326572008,
"grad_norm": 0.267578125,
"learning_rate": 3.5158891142663965e-07,
"loss": 1.0375,
"step": 1427
},
{
"epoch": 2.896551724137931,
"grad_norm": 0.26171875,
"learning_rate": 3.4482758620689656e-07,
"loss": 1.0068,
"step": 1428
},
{
"epoch": 2.898580121703854,
"grad_norm": 0.248046875,
"learning_rate": 3.380662609871535e-07,
"loss": 1.0095,
"step": 1429
},
{
"epoch": 2.900608519269777,
"grad_norm": 0.25,
"learning_rate": 3.3130493576741044e-07,
"loss": 1.0516,
"step": 1430
},
{
"epoch": 2.9026369168357,
"grad_norm": 0.345703125,
"learning_rate": 3.2454361054766735e-07,
"loss": 0.9735,
"step": 1431
},
{
"epoch": 2.9046653144016226,
"grad_norm": 0.24609375,
"learning_rate": 3.177822853279243e-07,
"loss": 0.9895,
"step": 1432
},
{
"epoch": 2.9066937119675456,
"grad_norm": 0.25390625,
"learning_rate": 3.110209601081812e-07,
"loss": 1.0294,
"step": 1433
},
{
"epoch": 2.9087221095334685,
"grad_norm": 0.2470703125,
"learning_rate": 3.0425963488843814e-07,
"loss": 0.9815,
"step": 1434
},
{
"epoch": 2.9107505070993915,
"grad_norm": 0.25390625,
"learning_rate": 2.974983096686951e-07,
"loss": 1.0464,
"step": 1435
},
{
"epoch": 2.9127789046653145,
"grad_norm": 0.296875,
"learning_rate": 2.90736984448952e-07,
"loss": 0.9971,
"step": 1436
},
{
"epoch": 2.914807302231237,
"grad_norm": 0.265625,
"learning_rate": 2.839756592292089e-07,
"loss": 1.0677,
"step": 1437
},
{
"epoch": 2.9168356997971605,
"grad_norm": 0.310546875,
"learning_rate": 2.772143340094659e-07,
"loss": 1.0104,
"step": 1438
},
{
"epoch": 2.918864097363083,
"grad_norm": 0.283203125,
"learning_rate": 2.704530087897228e-07,
"loss": 1.011,
"step": 1439
},
{
"epoch": 2.920892494929006,
"grad_norm": 0.2890625,
"learning_rate": 2.636916835699797e-07,
"loss": 0.9893,
"step": 1440
},
{
"epoch": 2.922920892494929,
"grad_norm": 0.2490234375,
"learning_rate": 2.569303583502367e-07,
"loss": 1.0341,
"step": 1441
},
{
"epoch": 2.924949290060852,
"grad_norm": 0.25390625,
"learning_rate": 2.501690331304936e-07,
"loss": 1.0429,
"step": 1442
},
{
"epoch": 2.926977687626775,
"grad_norm": 0.25,
"learning_rate": 2.434077079107505e-07,
"loss": 0.9993,
"step": 1443
},
{
"epoch": 2.9290060851926976,
"grad_norm": 0.279296875,
"learning_rate": 2.3664638269100746e-07,
"loss": 1.0904,
"step": 1444
},
{
"epoch": 2.9310344827586206,
"grad_norm": 0.2578125,
"learning_rate": 2.2988505747126437e-07,
"loss": 1.0504,
"step": 1445
},
{
"epoch": 2.9330628803245435,
"grad_norm": 0.25,
"learning_rate": 2.231237322515213e-07,
"loss": 0.9868,
"step": 1446
},
{
"epoch": 2.9350912778904665,
"grad_norm": 0.2470703125,
"learning_rate": 2.1636240703177825e-07,
"loss": 1.0201,
"step": 1447
},
{
"epoch": 2.9371196754563895,
"grad_norm": 0.25,
"learning_rate": 2.0960108181203516e-07,
"loss": 1.068,
"step": 1448
},
{
"epoch": 2.9391480730223125,
"grad_norm": 0.2734375,
"learning_rate": 2.028397565922921e-07,
"loss": 1.0115,
"step": 1449
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.2451171875,
"learning_rate": 1.9607843137254904e-07,
"loss": 0.9892,
"step": 1450
},
{
"epoch": 2.943204868154158,
"grad_norm": 0.68359375,
"learning_rate": 1.8931710615280598e-07,
"loss": 1.0336,
"step": 1451
},
{
"epoch": 2.945233265720081,
"grad_norm": 0.26953125,
"learning_rate": 1.825557809330629e-07,
"loss": 1.0369,
"step": 1452
},
{
"epoch": 2.947261663286004,
"grad_norm": 0.251953125,
"learning_rate": 1.7579445571331983e-07,
"loss": 0.9911,
"step": 1453
},
{
"epoch": 2.949290060851927,
"grad_norm": 0.25390625,
"learning_rate": 1.6903313049357676e-07,
"loss": 1.0183,
"step": 1454
},
{
"epoch": 2.95131845841785,
"grad_norm": 0.259765625,
"learning_rate": 1.6227180527383367e-07,
"loss": 1.0408,
"step": 1455
},
{
"epoch": 2.9533468559837726,
"grad_norm": 0.369140625,
"learning_rate": 1.555104800540906e-07,
"loss": 1.0125,
"step": 1456
},
{
"epoch": 2.955375253549696,
"grad_norm": 0.255859375,
"learning_rate": 1.4874915483434755e-07,
"loss": 1.0095,
"step": 1457
},
{
"epoch": 2.9574036511156185,
"grad_norm": 0.255859375,
"learning_rate": 1.4198782961460446e-07,
"loss": 1.0189,
"step": 1458
},
{
"epoch": 2.9594320486815415,
"grad_norm": 0.3203125,
"learning_rate": 1.352265043948614e-07,
"loss": 1.0923,
"step": 1459
},
{
"epoch": 2.9614604462474645,
"grad_norm": 0.2490234375,
"learning_rate": 1.2846517917511834e-07,
"loss": 1.011,
"step": 1460
},
{
"epoch": 2.9634888438133875,
"grad_norm": 0.37109375,
"learning_rate": 1.2170385395537525e-07,
"loss": 0.9992,
"step": 1461
},
{
"epoch": 2.9655172413793105,
"grad_norm": 0.32421875,
"learning_rate": 1.1494252873563219e-07,
"loss": 1.0249,
"step": 1462
},
{
"epoch": 2.967545638945233,
"grad_norm": 0.2578125,
"learning_rate": 1.0818120351588913e-07,
"loss": 1.0594,
"step": 1463
},
{
"epoch": 2.969574036511156,
"grad_norm": 0.2734375,
"learning_rate": 1.0141987829614605e-07,
"loss": 1.0559,
"step": 1464
},
{
"epoch": 2.971602434077079,
"grad_norm": 0.271484375,
"learning_rate": 9.465855307640299e-08,
"loss": 1.0337,
"step": 1465
},
{
"epoch": 2.973630831643002,
"grad_norm": 0.263671875,
"learning_rate": 8.789722785665991e-08,
"loss": 1.0693,
"step": 1466
},
{
"epoch": 2.975659229208925,
"grad_norm": 0.255859375,
"learning_rate": 8.113590263691684e-08,
"loss": 1.0364,
"step": 1467
},
{
"epoch": 2.977687626774848,
"grad_norm": 0.251953125,
"learning_rate": 7.437457741717378e-08,
"loss": 1.0163,
"step": 1468
},
{
"epoch": 2.979716024340771,
"grad_norm": 0.265625,
"learning_rate": 6.76132521974307e-08,
"loss": 0.9708,
"step": 1469
},
{
"epoch": 2.9817444219066935,
"grad_norm": 0.251953125,
"learning_rate": 6.085192697768762e-08,
"loss": 1.016,
"step": 1470
},
{
"epoch": 2.9837728194726165,
"grad_norm": 0.251953125,
"learning_rate": 5.409060175794456e-08,
"loss": 1.0521,
"step": 1471
},
{
"epoch": 2.9858012170385395,
"grad_norm": 0.298828125,
"learning_rate": 4.7329276538201494e-08,
"loss": 1.0276,
"step": 1472
},
{
"epoch": 2.9878296146044625,
"grad_norm": 0.26171875,
"learning_rate": 4.056795131845842e-08,
"loss": 1.0053,
"step": 1473
},
{
"epoch": 2.9898580121703855,
"grad_norm": 0.27734375,
"learning_rate": 3.380662609871535e-08,
"loss": 0.9943,
"step": 1474
},
{
"epoch": 2.991886409736308,
"grad_norm": 0.265625,
"learning_rate": 2.704530087897228e-08,
"loss": 1.0071,
"step": 1475
},
{
"epoch": 2.9939148073022315,
"grad_norm": 0.255859375,
"learning_rate": 2.028397565922921e-08,
"loss": 1.0272,
"step": 1476
},
{
"epoch": 2.995943204868154,
"grad_norm": 0.25390625,
"learning_rate": 1.352265043948614e-08,
"loss": 1.0101,
"step": 1477
},
{
"epoch": 2.997971602434077,
"grad_norm": 0.251953125,
"learning_rate": 6.76132521974307e-09,
"loss": 1.0342,
"step": 1478
},
{
"epoch": 3.0,
"grad_norm": 0.259765625,
"learning_rate": 0.0,
"loss": 1.0294,
"step": 1479
}
],
"logging_steps": 1.0,
"max_steps": 1479,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.72557905413931e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}