Training in progress, step 1000, checkpoint
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6865774116031582,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006865774116031583,
"grad_norm": 0.48046875,
"learning_rate": 1.1363636363636364e-07,
"loss": 0.3846,
"step": 1
},
{
"epoch": 0.0013731548232063166,
"grad_norm": 0.458984375,
"learning_rate": 2.2727272727272729e-07,
"loss": 0.3841,
"step": 2
},
{
"epoch": 0.0020597322348094747,
"grad_norm": 0.4921875,
"learning_rate": 3.409090909090909e-07,
"loss": 0.3765,
"step": 3
},
{
"epoch": 0.0027463096464126332,
"grad_norm": 0.484375,
"learning_rate": 4.5454545454545457e-07,
"loss": 0.3732,
"step": 4
},
{
"epoch": 0.0034328870580157913,
"grad_norm": 0.55859375,
"learning_rate": 5.681818181818182e-07,
"loss": 0.3724,
"step": 5
},
{
"epoch": 0.004119464469618949,
"grad_norm": 0.5625,
"learning_rate": 6.818181818181818e-07,
"loss": 0.3947,
"step": 6
},
{
"epoch": 0.004806041881222108,
"grad_norm": 0.5625,
"learning_rate": 7.954545454545455e-07,
"loss": 0.3758,
"step": 7
},
{
"epoch": 0.0054926192928252664,
"grad_norm": 0.5703125,
"learning_rate": 9.090909090909091e-07,
"loss": 0.3984,
"step": 8
},
{
"epoch": 0.006179196704428424,
"grad_norm": 0.57421875,
"learning_rate": 1.0227272727272729e-06,
"loss": 0.3905,
"step": 9
},
{
"epoch": 0.006865774116031583,
"grad_norm": 0.51953125,
"learning_rate": 1.1363636363636364e-06,
"loss": 0.3914,
"step": 10
},
{
"epoch": 0.007552351527634741,
"grad_norm": 0.59765625,
"learning_rate": 1.25e-06,
"loss": 0.3939,
"step": 11
},
{
"epoch": 0.008238928939237899,
"grad_norm": 0.515625,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.3767,
"step": 12
},
{
"epoch": 0.008925506350841057,
"grad_norm": 0.54296875,
"learning_rate": 1.4772727272727275e-06,
"loss": 0.3718,
"step": 13
},
{
"epoch": 0.009612083762444216,
"grad_norm": 0.59375,
"learning_rate": 1.590909090909091e-06,
"loss": 0.3907,
"step": 14
},
{
"epoch": 0.010298661174047374,
"grad_norm": 0.478515625,
"learning_rate": 1.7045454545454546e-06,
"loss": 0.3783,
"step": 15
},
{
"epoch": 0.010985238585650533,
"grad_norm": 0.5234375,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.372,
"step": 16
},
{
"epoch": 0.01167181599725369,
"grad_norm": 0.5078125,
"learning_rate": 1.931818181818182e-06,
"loss": 0.3888,
"step": 17
},
{
"epoch": 0.012358393408856848,
"grad_norm": 0.48046875,
"learning_rate": 2.0454545454545457e-06,
"loss": 0.3647,
"step": 18
},
{
"epoch": 0.013044970820460007,
"grad_norm": 0.470703125,
"learning_rate": 2.1590909090909092e-06,
"loss": 0.3687,
"step": 19
},
{
"epoch": 0.013731548232063165,
"grad_norm": 0.431640625,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.3797,
"step": 20
},
{
"epoch": 0.014418125643666324,
"grad_norm": 0.470703125,
"learning_rate": 2.3863636363636367e-06,
"loss": 0.3753,
"step": 21
},
{
"epoch": 0.015104703055269482,
"grad_norm": 0.3984375,
"learning_rate": 2.5e-06,
"loss": 0.366,
"step": 22
},
{
"epoch": 0.01579128046687264,
"grad_norm": 0.55078125,
"learning_rate": 2.6136363636363637e-06,
"loss": 0.3711,
"step": 23
},
{
"epoch": 0.016477857878475798,
"grad_norm": 0.46484375,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.3709,
"step": 24
},
{
"epoch": 0.017164435290078956,
"grad_norm": 0.396484375,
"learning_rate": 2.8409090909090916e-06,
"loss": 0.3615,
"step": 25
},
{
"epoch": 0.017851012701682115,
"grad_norm": 0.375,
"learning_rate": 2.954545454545455e-06,
"loss": 0.3467,
"step": 26
},
{
"epoch": 0.018537590113285273,
"grad_norm": 0.484375,
"learning_rate": 3.0681818181818186e-06,
"loss": 0.3494,
"step": 27
},
{
"epoch": 0.01922416752488843,
"grad_norm": 0.3828125,
"learning_rate": 3.181818181818182e-06,
"loss": 0.3552,
"step": 28
},
{
"epoch": 0.01991074493649159,
"grad_norm": 0.39453125,
"learning_rate": 3.2954545454545456e-06,
"loss": 0.3394,
"step": 29
},
{
"epoch": 0.02059732234809475,
"grad_norm": 0.34375,
"learning_rate": 3.409090909090909e-06,
"loss": 0.335,
"step": 30
},
{
"epoch": 0.021283899759697907,
"grad_norm": 0.32421875,
"learning_rate": 3.522727272727273e-06,
"loss": 0.347,
"step": 31
},
{
"epoch": 0.021970477171301066,
"grad_norm": 0.30859375,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.3467,
"step": 32
},
{
"epoch": 0.02265705458290422,
"grad_norm": 0.29296875,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.3411,
"step": 33
},
{
"epoch": 0.02334363199450738,
"grad_norm": 0.345703125,
"learning_rate": 3.863636363636364e-06,
"loss": 0.3373,
"step": 34
},
{
"epoch": 0.024030209406110538,
"grad_norm": 0.2734375,
"learning_rate": 3.9772727272727275e-06,
"loss": 0.3408,
"step": 35
},
{
"epoch": 0.024716786817713696,
"grad_norm": 0.265625,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.3404,
"step": 36
},
{
"epoch": 0.025403364229316855,
"grad_norm": 0.25390625,
"learning_rate": 4.204545454545455e-06,
"loss": 0.346,
"step": 37
},
{
"epoch": 0.026089941640920013,
"grad_norm": 0.255859375,
"learning_rate": 4.3181818181818185e-06,
"loss": 0.3428,
"step": 38
},
{
"epoch": 0.026776519052523172,
"grad_norm": 0.2373046875,
"learning_rate": 4.4318181818181824e-06,
"loss": 0.3418,
"step": 39
},
{
"epoch": 0.02746309646412633,
"grad_norm": 0.259765625,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.339,
"step": 40
},
{
"epoch": 0.02814967387572949,
"grad_norm": 0.2373046875,
"learning_rate": 4.6590909090909095e-06,
"loss": 0.3137,
"step": 41
},
{
"epoch": 0.028836251287332648,
"grad_norm": 0.2265625,
"learning_rate": 4.772727272727273e-06,
"loss": 0.3213,
"step": 42
},
{
"epoch": 0.029522828698935806,
"grad_norm": 0.220703125,
"learning_rate": 4.8863636363636365e-06,
"loss": 0.3121,
"step": 43
},
{
"epoch": 0.030209406110538965,
"grad_norm": 0.24609375,
"learning_rate": 5e-06,
"loss": 0.3153,
"step": 44
},
{
"epoch": 0.030895983522142123,
"grad_norm": 0.2412109375,
"learning_rate": 5.113636363636364e-06,
"loss": 0.3107,
"step": 45
},
{
"epoch": 0.03158256093374528,
"grad_norm": 0.2021484375,
"learning_rate": 5.2272727272727274e-06,
"loss": 0.2963,
"step": 46
},
{
"epoch": 0.03226913834534844,
"grad_norm": 0.2236328125,
"learning_rate": 5.340909090909091e-06,
"loss": 0.2936,
"step": 47
},
{
"epoch": 0.032955715756951595,
"grad_norm": 0.2119140625,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.2909,
"step": 48
},
{
"epoch": 0.03364229316855476,
"grad_norm": 0.193359375,
"learning_rate": 5.568181818181818e-06,
"loss": 0.2891,
"step": 49
},
{
"epoch": 0.03432887058015791,
"grad_norm": 0.1875,
"learning_rate": 5.681818181818183e-06,
"loss": 0.3015,
"step": 50
},
{
"epoch": 0.035015447991761074,
"grad_norm": 0.1796875,
"learning_rate": 5.795454545454546e-06,
"loss": 0.2934,
"step": 51
},
{
"epoch": 0.03570202540336423,
"grad_norm": 0.1689453125,
"learning_rate": 5.90909090909091e-06,
"loss": 0.2977,
"step": 52
},
{
"epoch": 0.036388602814967384,
"grad_norm": 0.1669921875,
"learning_rate": 6.022727272727273e-06,
"loss": 0.2956,
"step": 53
},
{
"epoch": 0.037075180226570546,
"grad_norm": 0.1708984375,
"learning_rate": 6.136363636363637e-06,
"loss": 0.2857,
"step": 54
},
{
"epoch": 0.0377617576381737,
"grad_norm": 0.16015625,
"learning_rate": 6.25e-06,
"loss": 0.2854,
"step": 55
},
{
"epoch": 0.03844833504977686,
"grad_norm": 0.16015625,
"learning_rate": 6.363636363636364e-06,
"loss": 0.2727,
"step": 56
},
{
"epoch": 0.03913491246138002,
"grad_norm": 0.15625,
"learning_rate": 6.477272727272727e-06,
"loss": 0.2857,
"step": 57
},
{
"epoch": 0.03982148987298318,
"grad_norm": 0.146484375,
"learning_rate": 6.590909090909091e-06,
"loss": 0.2739,
"step": 58
},
{
"epoch": 0.040508067284586335,
"grad_norm": 0.1484375,
"learning_rate": 6.704545454545454e-06,
"loss": 0.276,
"step": 59
},
{
"epoch": 0.0411946446961895,
"grad_norm": 0.14453125,
"learning_rate": 6.818181818181818e-06,
"loss": 0.2705,
"step": 60
},
{
"epoch": 0.04188122210779265,
"grad_norm": 0.1396484375,
"learning_rate": 6.931818181818183e-06,
"loss": 0.2786,
"step": 61
},
{
"epoch": 0.042567799519395814,
"grad_norm": 0.1435546875,
"learning_rate": 7.045454545454546e-06,
"loss": 0.2739,
"step": 62
},
{
"epoch": 0.04325437693099897,
"grad_norm": 0.1416015625,
"learning_rate": 7.15909090909091e-06,
"loss": 0.2608,
"step": 63
},
{
"epoch": 0.04394095434260213,
"grad_norm": 0.154296875,
"learning_rate": 7.272727272727273e-06,
"loss": 0.2702,
"step": 64
},
{
"epoch": 0.04462753175420529,
"grad_norm": 0.1435546875,
"learning_rate": 7.386363636363637e-06,
"loss": 0.2533,
"step": 65
},
{
"epoch": 0.04531410916580844,
"grad_norm": 0.1396484375,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2546,
"step": 66
},
{
"epoch": 0.046000686577411604,
"grad_norm": 0.1455078125,
"learning_rate": 7.613636363636364e-06,
"loss": 0.2556,
"step": 67
},
{
"epoch": 0.04668726398901476,
"grad_norm": 0.14453125,
"learning_rate": 7.727272727272727e-06,
"loss": 0.2565,
"step": 68
},
{
"epoch": 0.04737384140061792,
"grad_norm": 0.1396484375,
"learning_rate": 7.840909090909091e-06,
"loss": 0.2564,
"step": 69
},
{
"epoch": 0.048060418812221076,
"grad_norm": 0.1376953125,
"learning_rate": 7.954545454545455e-06,
"loss": 0.251,
"step": 70
},
{
"epoch": 0.04874699622382424,
"grad_norm": 0.15625,
"learning_rate": 8.068181818181819e-06,
"loss": 0.2331,
"step": 71
},
{
"epoch": 0.04943357363542739,
"grad_norm": 0.154296875,
"learning_rate": 8.181818181818183e-06,
"loss": 0.2446,
"step": 72
},
{
"epoch": 0.050120151047030555,
"grad_norm": 0.130859375,
"learning_rate": 8.295454545454547e-06,
"loss": 0.2319,
"step": 73
},
{
"epoch": 0.05080672845863371,
"grad_norm": 0.14453125,
"learning_rate": 8.40909090909091e-06,
"loss": 0.2242,
"step": 74
},
{
"epoch": 0.05149330587023687,
"grad_norm": 0.15234375,
"learning_rate": 8.522727272727273e-06,
"loss": 0.2326,
"step": 75
},
{
"epoch": 0.05217988328184003,
"grad_norm": 0.138671875,
"learning_rate": 8.636363636363637e-06,
"loss": 0.2288,
"step": 76
},
{
"epoch": 0.05286646069344319,
"grad_norm": 0.1337890625,
"learning_rate": 8.750000000000001e-06,
"loss": 0.2166,
"step": 77
},
{
"epoch": 0.053553038105046344,
"grad_norm": 0.1455078125,
"learning_rate": 8.863636363636365e-06,
"loss": 0.2296,
"step": 78
},
{
"epoch": 0.0542396155166495,
"grad_norm": 0.1474609375,
"learning_rate": 8.977272727272727e-06,
"loss": 0.214,
"step": 79
},
{
"epoch": 0.05492619292825266,
"grad_norm": 0.1455078125,
"learning_rate": 9.090909090909091e-06,
"loss": 0.2157,
"step": 80
},
{
"epoch": 0.055612770339855816,
"grad_norm": 0.142578125,
"learning_rate": 9.204545454545455e-06,
"loss": 0.2127,
"step": 81
},
{
"epoch": 0.05629934775145898,
"grad_norm": 0.138671875,
"learning_rate": 9.318181818181819e-06,
"loss": 0.2045,
"step": 82
},
{
"epoch": 0.05698592516306213,
"grad_norm": 0.138671875,
"learning_rate": 9.431818181818183e-06,
"loss": 0.1994,
"step": 83
},
{
"epoch": 0.057672502574665295,
"grad_norm": 0.1416015625,
"learning_rate": 9.545454545454547e-06,
"loss": 0.2053,
"step": 84
},
{
"epoch": 0.05835907998626845,
"grad_norm": 0.1357421875,
"learning_rate": 9.65909090909091e-06,
"loss": 0.1845,
"step": 85
},
{
"epoch": 0.05904565739787161,
"grad_norm": 0.130859375,
"learning_rate": 9.772727272727273e-06,
"loss": 0.1869,
"step": 86
},
{
"epoch": 0.05973223480947477,
"grad_norm": 0.1435546875,
"learning_rate": 9.886363636363637e-06,
"loss": 0.1928,
"step": 87
},
{
"epoch": 0.06041881222107793,
"grad_norm": 0.1376953125,
"learning_rate": 1e-05,
"loss": 0.1915,
"step": 88
},
{
"epoch": 0.061105389632681084,
"grad_norm": 0.1328125,
"learning_rate": 9.999998787742986e-06,
"loss": 0.1911,
"step": 89
},
{
"epoch": 0.061791967044284246,
"grad_norm": 0.12890625,
"learning_rate": 9.999995150972593e-06,
"loss": 0.1823,
"step": 90
},
{
"epoch": 0.0624785444558874,
"grad_norm": 0.12890625,
"learning_rate": 9.999989089690783e-06,
"loss": 0.1729,
"step": 91
},
{
"epoch": 0.06316512186749056,
"grad_norm": 0.12060546875,
"learning_rate": 9.999980603900821e-06,
"loss": 0.1627,
"step": 92
},
{
"epoch": 0.06385169927909372,
"grad_norm": 0.125,
"learning_rate": 9.99996969360728e-06,
"loss": 0.1691,
"step": 93
},
{
"epoch": 0.06453827669069688,
"grad_norm": 0.123046875,
"learning_rate": 9.999956358816037e-06,
"loss": 0.175,
"step": 94
},
{
"epoch": 0.06522485410230003,
"grad_norm": 0.12451171875,
"learning_rate": 9.999940599534277e-06,
"loss": 0.1685,
"step": 95
},
{
"epoch": 0.06591143151390319,
"grad_norm": 0.11572265625,
"learning_rate": 9.99992241577049e-06,
"loss": 0.1599,
"step": 96
},
{
"epoch": 0.06659800892550635,
"grad_norm": 0.11328125,
"learning_rate": 9.999901807534473e-06,
"loss": 0.1658,
"step": 97
},
{
"epoch": 0.06728458633710951,
"grad_norm": 0.11279296875,
"learning_rate": 9.999878774837331e-06,
"loss": 0.1652,
"step": 98
},
{
"epoch": 0.06797116374871266,
"grad_norm": 0.1083984375,
"learning_rate": 9.999853317691473e-06,
"loss": 0.1499,
"step": 99
},
{
"epoch": 0.06865774116031582,
"grad_norm": 0.1015625,
"learning_rate": 9.999825436110612e-06,
"loss": 0.1522,
"step": 100
},
{
"epoch": 0.06934431857191899,
"grad_norm": 0.109375,
"learning_rate": 9.999795130109777e-06,
"loss": 0.1652,
"step": 101
},
{
"epoch": 0.07003089598352215,
"grad_norm": 0.10205078125,
"learning_rate": 9.999762399705288e-06,
"loss": 0.1515,
"step": 102
},
{
"epoch": 0.0707174733951253,
"grad_norm": 0.0947265625,
"learning_rate": 9.999727244914785e-06,
"loss": 0.1451,
"step": 103
},
{
"epoch": 0.07140405080672846,
"grad_norm": 0.095703125,
"learning_rate": 9.999689665757205e-06,
"loss": 0.1461,
"step": 104
},
{
"epoch": 0.07209062821833162,
"grad_norm": 0.08740234375,
"learning_rate": 9.9996496622528e-06,
"loss": 0.1393,
"step": 105
},
{
"epoch": 0.07277720562993477,
"grad_norm": 0.09033203125,
"learning_rate": 9.999607234423118e-06,
"loss": 0.1367,
"step": 106
},
{
"epoch": 0.07346378304153793,
"grad_norm": 0.0859375,
"learning_rate": 9.99956238229102e-06,
"loss": 0.1269,
"step": 107
},
{
"epoch": 0.07415036045314109,
"grad_norm": 0.08251953125,
"learning_rate": 9.999515105880674e-06,
"loss": 0.1372,
"step": 108
},
{
"epoch": 0.07483693786474425,
"grad_norm": 0.0859375,
"learning_rate": 9.999465405217547e-06,
"loss": 0.1274,
"step": 109
},
{
"epoch": 0.0755235152763474,
"grad_norm": 0.0830078125,
"learning_rate": 9.99941328032842e-06,
"loss": 0.1425,
"step": 110
},
{
"epoch": 0.07621009268795056,
"grad_norm": 0.08251953125,
"learning_rate": 9.999358731241378e-06,
"loss": 0.126,
"step": 111
},
{
"epoch": 0.07689667009955373,
"grad_norm": 0.0849609375,
"learning_rate": 9.999301757985807e-06,
"loss": 0.1378,
"step": 112
},
{
"epoch": 0.07758324751115689,
"grad_norm": 0.0830078125,
"learning_rate": 9.999242360592406e-06,
"loss": 0.1446,
"step": 113
},
{
"epoch": 0.07826982492276004,
"grad_norm": 0.08447265625,
"learning_rate": 9.999180539093177e-06,
"loss": 0.1272,
"step": 114
},
{
"epoch": 0.0789564023343632,
"grad_norm": 0.0830078125,
"learning_rate": 9.999116293521427e-06,
"loss": 0.1317,
"step": 115
},
{
"epoch": 0.07964297974596636,
"grad_norm": 0.076171875,
"learning_rate": 9.999049623911771e-06,
"loss": 0.127,
"step": 116
},
{
"epoch": 0.08032955715756952,
"grad_norm": 0.07275390625,
"learning_rate": 9.998980530300129e-06,
"loss": 0.1318,
"step": 117
},
{
"epoch": 0.08101613456917267,
"grad_norm": 0.07177734375,
"learning_rate": 9.998909012723729e-06,
"loss": 0.1315,
"step": 118
},
{
"epoch": 0.08170271198077583,
"grad_norm": 0.0712890625,
"learning_rate": 9.9988350712211e-06,
"loss": 0.1246,
"step": 119
},
{
"epoch": 0.082389289392379,
"grad_norm": 0.07470703125,
"learning_rate": 9.998758705832084e-06,
"loss": 0.1244,
"step": 120
},
{
"epoch": 0.08307586680398214,
"grad_norm": 0.0732421875,
"learning_rate": 9.998679916597822e-06,
"loss": 0.1238,
"step": 121
},
{
"epoch": 0.0837624442155853,
"grad_norm": 0.0654296875,
"learning_rate": 9.998598703560766e-06,
"loss": 0.126,
"step": 122
},
{
"epoch": 0.08444902162718847,
"grad_norm": 0.0673828125,
"learning_rate": 9.998515066764672e-06,
"loss": 0.1258,
"step": 123
},
{
"epoch": 0.08513559903879163,
"grad_norm": 0.0673828125,
"learning_rate": 9.998429006254601e-06,
"loss": 0.1252,
"step": 124
},
{
"epoch": 0.08582217645039478,
"grad_norm": 0.06396484375,
"learning_rate": 9.99834052207692e-06,
"loss": 0.1163,
"step": 125
},
{
"epoch": 0.08650875386199794,
"grad_norm": 0.07275390625,
"learning_rate": 9.998249614279306e-06,
"loss": 0.1269,
"step": 126
},
{
"epoch": 0.0871953312736011,
"grad_norm": 0.06640625,
"learning_rate": 9.998156282910736e-06,
"loss": 0.1267,
"step": 127
},
{
"epoch": 0.08788190868520426,
"grad_norm": 0.06787109375,
"learning_rate": 9.998060528021493e-06,
"loss": 0.1224,
"step": 128
},
{
"epoch": 0.08856848609680741,
"grad_norm": 0.06298828125,
"learning_rate": 9.997962349663172e-06,
"loss": 0.123,
"step": 129
},
{
"epoch": 0.08925506350841057,
"grad_norm": 0.061279296875,
"learning_rate": 9.997861747888669e-06,
"loss": 0.1259,
"step": 130
},
{
"epoch": 0.08994164092001374,
"grad_norm": 0.06787109375,
"learning_rate": 9.997758722752182e-06,
"loss": 0.1234,
"step": 131
},
{
"epoch": 0.09062821833161688,
"grad_norm": 0.06591796875,
"learning_rate": 9.997653274309225e-06,
"loss": 0.1189,
"step": 132
},
{
"epoch": 0.09131479574322005,
"grad_norm": 0.060791015625,
"learning_rate": 9.997545402616607e-06,
"loss": 0.1156,
"step": 133
},
{
"epoch": 0.09200137315482321,
"grad_norm": 0.062255859375,
"learning_rate": 9.997435107732451e-06,
"loss": 0.1192,
"step": 134
},
{
"epoch": 0.09268795056642637,
"grad_norm": 0.061767578125,
"learning_rate": 9.997322389716178e-06,
"loss": 0.1104,
"step": 135
},
{
"epoch": 0.09337452797802952,
"grad_norm": 0.0634765625,
"learning_rate": 9.99720724862852e-06,
"loss": 0.1199,
"step": 136
},
{
"epoch": 0.09406110538963268,
"grad_norm": 0.0625,
"learning_rate": 9.997089684531514e-06,
"loss": 0.1147,
"step": 137
},
{
"epoch": 0.09474768280123584,
"grad_norm": 0.056884765625,
"learning_rate": 9.996969697488499e-06,
"loss": 0.1186,
"step": 138
},
{
"epoch": 0.095434260212839,
"grad_norm": 0.056396484375,
"learning_rate": 9.996847287564124e-06,
"loss": 0.1054,
"step": 139
},
{
"epoch": 0.09612083762444215,
"grad_norm": 0.064453125,
"learning_rate": 9.99672245482434e-06,
"loss": 0.119,
"step": 140
},
{
"epoch": 0.09680741503604531,
"grad_norm": 0.055908203125,
"learning_rate": 9.996595199336404e-06,
"loss": 0.1132,
"step": 141
},
{
"epoch": 0.09749399244764848,
"grad_norm": 0.08251953125,
"learning_rate": 9.99646552116888e-06,
"loss": 0.119,
"step": 142
},
{
"epoch": 0.09818056985925164,
"grad_norm": 0.059326171875,
"learning_rate": 9.996333420391635e-06,
"loss": 0.1144,
"step": 143
},
{
"epoch": 0.09886714727085479,
"grad_norm": 0.07080078125,
"learning_rate": 9.996198897075842e-06,
"loss": 0.1195,
"step": 144
},
{
"epoch": 0.09955372468245795,
"grad_norm": 0.056884765625,
"learning_rate": 9.996061951293981e-06,
"loss": 0.1136,
"step": 145
},
{
"epoch": 0.10024030209406111,
"grad_norm": 0.06884765625,
"learning_rate": 9.995922583119836e-06,
"loss": 0.1078,
"step": 146
},
{
"epoch": 0.10092687950566426,
"grad_norm": 0.060791015625,
"learning_rate": 9.995780792628494e-06,
"loss": 0.1192,
"step": 147
},
{
"epoch": 0.10161345691726742,
"grad_norm": 0.0537109375,
"learning_rate": 9.99563657989635e-06,
"loss": 0.103,
"step": 148
},
{
"epoch": 0.10230003432887058,
"grad_norm": 0.064453125,
"learning_rate": 9.995489945001104e-06,
"loss": 0.1227,
"step": 149
},
{
"epoch": 0.10298661174047374,
"grad_norm": 0.057861328125,
"learning_rate": 9.995340888021761e-06,
"loss": 0.1059,
"step": 150
},
{
"epoch": 0.10367318915207689,
"grad_norm": 0.054443359375,
"learning_rate": 9.995189409038626e-06,
"loss": 0.1101,
"step": 151
},
{
"epoch": 0.10435976656368005,
"grad_norm": 0.06787109375,
"learning_rate": 9.995035508133316e-06,
"loss": 0.1257,
"step": 152
},
{
"epoch": 0.10504634397528322,
"grad_norm": 0.05712890625,
"learning_rate": 9.994879185388747e-06,
"loss": 0.1123,
"step": 153
},
{
"epoch": 0.10573292138688638,
"grad_norm": 0.055908203125,
"learning_rate": 9.994720440889147e-06,
"loss": 0.1118,
"step": 154
},
{
"epoch": 0.10641949879848953,
"grad_norm": 0.057373046875,
"learning_rate": 9.994559274720043e-06,
"loss": 0.1161,
"step": 155
},
{
"epoch": 0.10710607621009269,
"grad_norm": 0.05419921875,
"learning_rate": 9.994395686968267e-06,
"loss": 0.1159,
"step": 156
},
{
"epoch": 0.10779265362169585,
"grad_norm": 0.0654296875,
"learning_rate": 9.994229677721957e-06,
"loss": 0.1072,
"step": 157
},
{
"epoch": 0.108479231033299,
"grad_norm": 0.06640625,
"learning_rate": 9.994061247070557e-06,
"loss": 0.1131,
"step": 158
},
{
"epoch": 0.10916580844490216,
"grad_norm": 0.057861328125,
"learning_rate": 9.993890395104812e-06,
"loss": 0.1125,
"step": 159
},
{
"epoch": 0.10985238585650532,
"grad_norm": 0.050537109375,
"learning_rate": 9.993717121916778e-06,
"loss": 0.108,
"step": 160
},
{
"epoch": 0.11053896326810848,
"grad_norm": 0.060546875,
"learning_rate": 9.993541427599805e-06,
"loss": 0.1077,
"step": 161
},
{
"epoch": 0.11122554067971163,
"grad_norm": 0.058837890625,
"learning_rate": 9.993363312248559e-06,
"loss": 0.1006,
"step": 162
},
{
"epoch": 0.1119121180913148,
"grad_norm": 0.056396484375,
"learning_rate": 9.993182775959004e-06,
"loss": 0.1138,
"step": 163
},
{
"epoch": 0.11259869550291796,
"grad_norm": 0.056640625,
"learning_rate": 9.992999818828408e-06,
"loss": 0.1093,
"step": 164
},
{
"epoch": 0.11328527291452112,
"grad_norm": 0.059814453125,
"learning_rate": 9.992814440955346e-06,
"loss": 0.1042,
"step": 165
},
{
"epoch": 0.11397185032612427,
"grad_norm": 0.05419921875,
"learning_rate": 9.992626642439695e-06,
"loss": 0.1077,
"step": 166
},
{
"epoch": 0.11465842773772743,
"grad_norm": 0.052490234375,
"learning_rate": 9.992436423382639e-06,
"loss": 0.1057,
"step": 167
},
{
"epoch": 0.11534500514933059,
"grad_norm": 0.055419921875,
"learning_rate": 9.992243783886663e-06,
"loss": 0.1147,
"step": 168
},
{
"epoch": 0.11603158256093375,
"grad_norm": 0.052001953125,
"learning_rate": 9.992048724055559e-06,
"loss": 0.1167,
"step": 169
},
{
"epoch": 0.1167181599725369,
"grad_norm": 0.053466796875,
"learning_rate": 9.99185124399442e-06,
"loss": 0.1223,
"step": 170
},
{
"epoch": 0.11740473738414006,
"grad_norm": 0.0546875,
"learning_rate": 9.991651343809642e-06,
"loss": 0.1134,
"step": 171
},
{
"epoch": 0.11809131479574322,
"grad_norm": 0.0546875,
"learning_rate": 9.991449023608932e-06,
"loss": 0.1033,
"step": 172
},
{
"epoch": 0.11877789220734637,
"grad_norm": 0.05126953125,
"learning_rate": 9.991244283501294e-06,
"loss": 0.1045,
"step": 173
},
{
"epoch": 0.11946446961894953,
"grad_norm": 0.049072265625,
"learning_rate": 9.991037123597041e-06,
"loss": 0.1018,
"step": 174
},
{
"epoch": 0.1201510470305527,
"grad_norm": 0.055419921875,
"learning_rate": 9.990827544007783e-06,
"loss": 0.1089,
"step": 175
},
{
"epoch": 0.12083762444215586,
"grad_norm": 0.05322265625,
"learning_rate": 9.990615544846439e-06,
"loss": 0.1114,
"step": 176
},
{
"epoch": 0.121524201853759,
"grad_norm": 0.051513671875,
"learning_rate": 9.99040112622723e-06,
"loss": 0.1115,
"step": 177
},
{
"epoch": 0.12221077926536217,
"grad_norm": 0.056396484375,
"learning_rate": 9.990184288265679e-06,
"loss": 0.1022,
"step": 178
},
{
"epoch": 0.12289735667696533,
"grad_norm": 0.056884765625,
"learning_rate": 9.989965031078616e-06,
"loss": 0.0975,
"step": 179
},
{
"epoch": 0.12358393408856849,
"grad_norm": 0.046630859375,
"learning_rate": 9.989743354784174e-06,
"loss": 0.0964,
"step": 180
},
{
"epoch": 0.12427051150017164,
"grad_norm": 0.05859375,
"learning_rate": 9.989519259501786e-06,
"loss": 0.1064,
"step": 181
},
{
"epoch": 0.1249570889117748,
"grad_norm": 0.0537109375,
"learning_rate": 9.989292745352191e-06,
"loss": 0.0997,
"step": 182
},
{
"epoch": 0.12564366632337795,
"grad_norm": 0.054443359375,
"learning_rate": 9.98906381245743e-06,
"loss": 0.1122,
"step": 183
},
{
"epoch": 0.1263302437349811,
"grad_norm": 0.053955078125,
"learning_rate": 9.988832460940846e-06,
"loss": 0.0986,
"step": 184
},
{
"epoch": 0.12701682114658427,
"grad_norm": 0.05322265625,
"learning_rate": 9.98859869092709e-06,
"loss": 0.1093,
"step": 185
},
{
"epoch": 0.12770339855818744,
"grad_norm": 0.0498046875,
"learning_rate": 9.98836250254211e-06,
"loss": 0.0981,
"step": 186
},
{
"epoch": 0.1283899759697906,
"grad_norm": 0.051025390625,
"learning_rate": 9.988123895913162e-06,
"loss": 0.1165,
"step": 187
},
{
"epoch": 0.12907655338139376,
"grad_norm": 0.052978515625,
"learning_rate": 9.987882871168801e-06,
"loss": 0.1141,
"step": 188
},
{
"epoch": 0.12976313079299692,
"grad_norm": 0.0458984375,
"learning_rate": 9.987639428438888e-06,
"loss": 0.1002,
"step": 189
},
{
"epoch": 0.13044970820460006,
"grad_norm": 0.05029296875,
"learning_rate": 9.987393567854585e-06,
"loss": 0.0978,
"step": 190
},
{
"epoch": 0.13113628561620322,
"grad_norm": 0.05419921875,
"learning_rate": 9.987145289548356e-06,
"loss": 0.0973,
"step": 191
},
{
"epoch": 0.13182286302780638,
"grad_norm": 0.053466796875,
"learning_rate": 9.986894593653969e-06,
"loss": 0.1088,
"step": 192
},
{
"epoch": 0.13250944043940954,
"grad_norm": 0.0478515625,
"learning_rate": 9.986641480306495e-06,
"loss": 0.1175,
"step": 193
},
{
"epoch": 0.1331960178510127,
"grad_norm": 0.0498046875,
"learning_rate": 9.986385949642307e-06,
"loss": 0.1183,
"step": 194
},
{
"epoch": 0.13388259526261587,
"grad_norm": 0.058837890625,
"learning_rate": 9.986128001799077e-06,
"loss": 0.1023,
"step": 195
},
{
"epoch": 0.13456917267421903,
"grad_norm": 0.05029296875,
"learning_rate": 9.985867636915784e-06,
"loss": 0.1152,
"step": 196
},
{
"epoch": 0.13525575008582216,
"grad_norm": 0.05029296875,
"learning_rate": 9.98560485513271e-06,
"loss": 0.1187,
"step": 197
},
{
"epoch": 0.13594232749742532,
"grad_norm": 0.05029296875,
"learning_rate": 9.985339656591434e-06,
"loss": 0.1111,
"step": 198
},
{
"epoch": 0.1366289049090285,
"grad_norm": 0.049560546875,
"learning_rate": 9.985072041434841e-06,
"loss": 0.1104,
"step": 199
},
{
"epoch": 0.13731548232063165,
"grad_norm": 0.045166015625,
"learning_rate": 9.984802009807117e-06,
"loss": 0.1009,
"step": 200
},
{
"epoch": 0.1380020597322348,
"grad_norm": 0.048583984375,
"learning_rate": 9.984529561853749e-06,
"loss": 0.0999,
"step": 201
},
{
"epoch": 0.13868863714383797,
"grad_norm": 0.049560546875,
"learning_rate": 9.984254697721528e-06,
"loss": 0.1025,
"step": 202
},
{
"epoch": 0.13937521455544113,
"grad_norm": 0.052490234375,
"learning_rate": 9.983977417558544e-06,
"loss": 0.1009,
"step": 203
},
{
"epoch": 0.1400617919670443,
"grad_norm": 0.0498046875,
"learning_rate": 9.98369772151419e-06,
"loss": 0.1008,
"step": 204
},
{
"epoch": 0.14074836937864743,
"grad_norm": 0.059326171875,
"learning_rate": 9.983415609739165e-06,
"loss": 0.107,
"step": 205
},
{
"epoch": 0.1414349467902506,
"grad_norm": 0.0576171875,
"learning_rate": 9.98313108238546e-06,
"loss": 0.1196,
"step": 206
},
{
"epoch": 0.14212152420185376,
"grad_norm": 0.053955078125,
"learning_rate": 9.982844139606375e-06,
"loss": 0.1,
"step": 207
},
{
"epoch": 0.14280810161345692,
"grad_norm": 0.059814453125,
"learning_rate": 9.982554781556512e-06,
"loss": 0.1047,
"step": 208
},
{
"epoch": 0.14349467902506008,
"grad_norm": 0.04638671875,
"learning_rate": 9.982263008391769e-06,
"loss": 0.1052,
"step": 209
},
{
"epoch": 0.14418125643666324,
"grad_norm": 0.051025390625,
"learning_rate": 9.981968820269347e-06,
"loss": 0.0929,
"step": 210
},
{
"epoch": 0.1448678338482664,
"grad_norm": 0.056640625,
"learning_rate": 9.98167221734775e-06,
"loss": 0.1063,
"step": 211
},
{
"epoch": 0.14555441125986954,
"grad_norm": 0.0556640625,
"learning_rate": 9.981373199786782e-06,
"loss": 0.0981,
"step": 212
},
{
"epoch": 0.1462409886714727,
"grad_norm": 0.056396484375,
"learning_rate": 9.981071767747547e-06,
"loss": 0.0982,
"step": 213
},
{
"epoch": 0.14692756608307586,
"grad_norm": 0.052490234375,
"learning_rate": 9.980767921392453e-06,
"loss": 0.1059,
"step": 214
},
{
"epoch": 0.14761414349467902,
"grad_norm": 0.0546875,
"learning_rate": 9.980461660885204e-06,
"loss": 0.1098,
"step": 215
},
{
"epoch": 0.14830072090628219,
"grad_norm": 0.048583984375,
"learning_rate": 9.98015298639081e-06,
"loss": 0.1041,
"step": 216
},
{
"epoch": 0.14898729831788535,
"grad_norm": 0.053955078125,
"learning_rate": 9.979841898075577e-06,
"loss": 0.097,
"step": 217
},
{
"epoch": 0.1496738757294885,
"grad_norm": 0.052001953125,
"learning_rate": 9.979528396107114e-06,
"loss": 0.1045,
"step": 218
},
{
"epoch": 0.15036045314109167,
"grad_norm": 0.057373046875,
"learning_rate": 9.97921248065433e-06,
"loss": 0.1028,
"step": 219
},
{
"epoch": 0.1510470305526948,
"grad_norm": 0.052490234375,
"learning_rate": 9.978894151887435e-06,
"loss": 0.1006,
"step": 220
},
{
"epoch": 0.15173360796429797,
"grad_norm": 0.051025390625,
"learning_rate": 9.978573409977937e-06,
"loss": 0.1098,
"step": 221
},
{
"epoch": 0.15242018537590113,
"grad_norm": 0.052734375,
"learning_rate": 9.978250255098645e-06,
"loss": 0.1036,
"step": 222
},
{
"epoch": 0.1531067627875043,
"grad_norm": 0.048095703125,
"learning_rate": 9.977924687423672e-06,
"loss": 0.0981,
"step": 223
},
{
"epoch": 0.15379334019910745,
"grad_norm": 0.05029296875,
"learning_rate": 9.977596707128424e-06,
"loss": 0.0971,
"step": 224
},
{
"epoch": 0.15447991761071062,
"grad_norm": 0.054443359375,
"learning_rate": 9.977266314389611e-06,
"loss": 0.0996,
"step": 225
},
{
"epoch": 0.15516649502231378,
"grad_norm": 0.060546875,
"learning_rate": 9.976933509385245e-06,
"loss": 0.1187,
"step": 226
},
{
"epoch": 0.1558530724339169,
"grad_norm": 0.052978515625,
"learning_rate": 9.976598292294632e-06,
"loss": 0.1016,
"step": 227
},
{
"epoch": 0.15653964984552007,
"grad_norm": 0.049560546875,
"learning_rate": 9.976260663298384e-06,
"loss": 0.0957,
"step": 228
},
{
"epoch": 0.15722622725712324,
"grad_norm": 0.060791015625,
"learning_rate": 9.975920622578403e-06,
"loss": 0.0985,
"step": 229
},
{
"epoch": 0.1579128046687264,
"grad_norm": 0.050048828125,
"learning_rate": 9.975578170317905e-06,
"loss": 0.1004,
"step": 230
},
{
"epoch": 0.15859938208032956,
"grad_norm": 0.047607421875,
"learning_rate": 9.97523330670139e-06,
"loss": 0.1095,
"step": 231
},
{
"epoch": 0.15928595949193272,
"grad_norm": 0.053955078125,
"learning_rate": 9.974886031914665e-06,
"loss": 0.1072,
"step": 232
},
{
"epoch": 0.15997253690353588,
"grad_norm": 0.07275390625,
"learning_rate": 9.974536346144838e-06,
"loss": 0.1003,
"step": 233
},
{
"epoch": 0.16065911431513905,
"grad_norm": 0.05126953125,
"learning_rate": 9.974184249580309e-06,
"loss": 0.1118,
"step": 234
},
{
"epoch": 0.16134569172674218,
"grad_norm": 0.0517578125,
"learning_rate": 9.973829742410784e-06,
"loss": 0.1,
"step": 235
},
{
"epoch": 0.16203226913834534,
"grad_norm": 0.056396484375,
"learning_rate": 9.973472824827262e-06,
"loss": 0.103,
"step": 236
},
{
"epoch": 0.1627188465499485,
"grad_norm": 0.045166015625,
"learning_rate": 9.973113497022047e-06,
"loss": 0.098,
"step": 237
},
{
"epoch": 0.16340542396155167,
"grad_norm": 0.045166015625,
"learning_rate": 9.972751759188736e-06,
"loss": 0.1063,
"step": 238
},
{
"epoch": 0.16409200137315483,
"grad_norm": 0.04931640625,
"learning_rate": 9.972387611522227e-06,
"loss": 0.1141,
"step": 239
},
{
"epoch": 0.164778578784758,
"grad_norm": 0.04833984375,
"learning_rate": 9.972021054218712e-06,
"loss": 0.1058,
"step": 240
},
{
"epoch": 0.16546515619636115,
"grad_norm": 0.052734375,
"learning_rate": 9.971652087475691e-06,
"loss": 0.1034,
"step": 241
},
{
"epoch": 0.16615173360796429,
"grad_norm": 0.056884765625,
"learning_rate": 9.971280711491952e-06,
"loss": 0.1071,
"step": 242
},
{
"epoch": 0.16683831101956745,
"grad_norm": 0.05419921875,
"learning_rate": 9.970906926467588e-06,
"loss": 0.1097,
"step": 243
},
{
"epoch": 0.1675248884311706,
"grad_norm": 0.0498046875,
"learning_rate": 9.970530732603984e-06,
"loss": 0.1005,
"step": 244
},
{
"epoch": 0.16821146584277377,
"grad_norm": 0.04833984375,
"learning_rate": 9.97015213010383e-06,
"loss": 0.1067,
"step": 245
},
{
"epoch": 0.16889804325437693,
"grad_norm": 0.051025390625,
"learning_rate": 9.969771119171108e-06,
"loss": 0.0961,
"step": 246
},
{
"epoch": 0.1695846206659801,
"grad_norm": 0.053466796875,
"learning_rate": 9.969387700011098e-06,
"loss": 0.0966,
"step": 247
},
{
"epoch": 0.17027119807758326,
"grad_norm": 0.059814453125,
"learning_rate": 9.969001872830383e-06,
"loss": 0.1192,
"step": 248
},
{
"epoch": 0.1709577754891864,
"grad_norm": 0.06298828125,
"learning_rate": 9.968613637836833e-06,
"loss": 0.1069,
"step": 249
},
{
"epoch": 0.17164435290078955,
"grad_norm": 0.05517578125,
"learning_rate": 9.968222995239628e-06,
"loss": 0.1083,
"step": 250
},
{
"epoch": 0.17233093031239272,
"grad_norm": 0.052001953125,
"learning_rate": 9.967829945249234e-06,
"loss": 0.0978,
"step": 251
},
{
"epoch": 0.17301750772399588,
"grad_norm": 0.048828125,
"learning_rate": 9.967434488077422e-06,
"loss": 0.1058,
"step": 252
},
{
"epoch": 0.17370408513559904,
"grad_norm": 0.05029296875,
"learning_rate": 9.967036623937252e-06,
"loss": 0.0946,
"step": 253
},
{
"epoch": 0.1743906625472022,
"grad_norm": 0.0537109375,
"learning_rate": 9.966636353043092e-06,
"loss": 0.095,
"step": 254
},
{
"epoch": 0.17507723995880536,
"grad_norm": 0.047119140625,
"learning_rate": 9.966233675610599e-06,
"loss": 0.1071,
"step": 255
},
{
"epoch": 0.17576381737040853,
"grad_norm": 0.055908203125,
"learning_rate": 9.965828591856725e-06,
"loss": 0.0944,
"step": 256
},
{
"epoch": 0.17645039478201166,
"grad_norm": 0.048828125,
"learning_rate": 9.965421101999721e-06,
"loss": 0.1117,
"step": 257
},
{
"epoch": 0.17713697219361482,
"grad_norm": 0.0478515625,
"learning_rate": 9.965011206259138e-06,
"loss": 0.1015,
"step": 258
},
{
"epoch": 0.17782354960521798,
"grad_norm": 0.052490234375,
"learning_rate": 9.964598904855818e-06,
"loss": 0.1103,
"step": 259
},
{
"epoch": 0.17851012701682115,
"grad_norm": 0.04931640625,
"learning_rate": 9.964184198011903e-06,
"loss": 0.11,
"step": 260
},
{
"epoch": 0.1791967044284243,
"grad_norm": 0.052734375,
"learning_rate": 9.963767085950824e-06,
"loss": 0.1093,
"step": 261
},
{
"epoch": 0.17988328184002747,
"grad_norm": 0.043701171875,
"learning_rate": 9.96334756889732e-06,
"loss": 0.1038,
"step": 262
},
{
"epoch": 0.18056985925163063,
"grad_norm": 0.050537109375,
"learning_rate": 9.962925647077414e-06,
"loss": 0.1021,
"step": 263
},
{
"epoch": 0.18125643666323377,
"grad_norm": 0.050537109375,
"learning_rate": 9.962501320718432e-06,
"loss": 0.0979,
"step": 264
},
{
"epoch": 0.18194301407483693,
"grad_norm": 0.050048828125,
"learning_rate": 9.96207459004899e-06,
"loss": 0.1059,
"step": 265
},
{
"epoch": 0.1826295914864401,
"grad_norm": 0.051513671875,
"learning_rate": 9.961645455299006e-06,
"loss": 0.107,
"step": 266
},
{
"epoch": 0.18331616889804325,
"grad_norm": 0.044189453125,
"learning_rate": 9.961213916699685e-06,
"loss": 0.1066,
"step": 267
},
{
"epoch": 0.18400274630964641,
"grad_norm": 0.04736328125,
"learning_rate": 9.960779974483537e-06,
"loss": 0.1127,
"step": 268
},
{
"epoch": 0.18468932372124958,
"grad_norm": 0.04833984375,
"learning_rate": 9.96034362888436e-06,
"loss": 0.1034,
"step": 269
},
{
"epoch": 0.18537590113285274,
"grad_norm": 0.05517578125,
"learning_rate": 9.959904880137246e-06,
"loss": 0.121,
"step": 270
},
{
"epoch": 0.1860624785444559,
"grad_norm": 0.0478515625,
"learning_rate": 9.959463728478586e-06,
"loss": 0.1028,
"step": 271
},
{
"epoch": 0.18674905595605903,
"grad_norm": 0.049560546875,
"learning_rate": 9.959020174146066e-06,
"loss": 0.1042,
"step": 272
},
{
"epoch": 0.1874356333676622,
"grad_norm": 0.06201171875,
"learning_rate": 9.958574217378663e-06,
"loss": 0.1,
"step": 273
},
{
"epoch": 0.18812221077926536,
"grad_norm": 0.0517578125,
"learning_rate": 9.95812585841665e-06,
"loss": 0.095,
"step": 274
},
{
"epoch": 0.18880878819086852,
"grad_norm": 0.0517578125,
"learning_rate": 9.957675097501594e-06,
"loss": 0.1086,
"step": 275
},
{
"epoch": 0.18949536560247168,
"grad_norm": 0.053955078125,
"learning_rate": 9.957221934876355e-06,
"loss": 0.1024,
"step": 276
},
{
"epoch": 0.19018194301407484,
"grad_norm": 0.049560546875,
"learning_rate": 9.956766370785093e-06,
"loss": 0.0951,
"step": 277
},
{
"epoch": 0.190868520425678,
"grad_norm": 0.048583984375,
"learning_rate": 9.956308405473252e-06,
"loss": 0.1097,
"step": 278
},
{
"epoch": 0.19155509783728114,
"grad_norm": 0.0478515625,
"learning_rate": 9.95584803918758e-06,
"loss": 0.1037,
"step": 279
},
{
"epoch": 0.1922416752488843,
"grad_norm": 0.04638671875,
"learning_rate": 9.955385272176108e-06,
"loss": 0.1065,
"step": 280
},
{
"epoch": 0.19292825266048746,
"grad_norm": 0.052490234375,
"learning_rate": 9.95492010468817e-06,
"loss": 0.0983,
"step": 281
},
{
"epoch": 0.19361483007209063,
"grad_norm": 0.05029296875,
"learning_rate": 9.954452536974387e-06,
"loss": 0.1056,
"step": 282
},
{
"epoch": 0.1943014074836938,
"grad_norm": 0.047607421875,
"learning_rate": 9.953982569286679e-06,
"loss": 0.0873,
"step": 283
},
{
"epoch": 0.19498798489529695,
"grad_norm": 0.05322265625,
"learning_rate": 9.953510201878251e-06,
"loss": 0.1029,
"step": 284
},
{
"epoch": 0.1956745623069001,
"grad_norm": 0.04541015625,
"learning_rate": 9.953035435003608e-06,
"loss": 0.0978,
"step": 285
},
{
"epoch": 0.19636113971850327,
"grad_norm": 0.0556640625,
"learning_rate": 9.952558268918546e-06,
"loss": 0.1034,
"step": 286
},
{
"epoch": 0.1970477171301064,
"grad_norm": 0.050048828125,
"learning_rate": 9.952078703880153e-06,
"loss": 0.0946,
"step": 287
},
{
"epoch": 0.19773429454170957,
"grad_norm": 0.05224609375,
"learning_rate": 9.951596740146809e-06,
"loss": 0.1014,
"step": 288
},
{
"epoch": 0.19842087195331273,
"grad_norm": 0.055419921875,
"learning_rate": 9.951112377978185e-06,
"loss": 0.0935,
"step": 289
},
{
"epoch": 0.1991074493649159,
"grad_norm": 0.0498046875,
"learning_rate": 9.950625617635247e-06,
"loss": 0.1053,
"step": 290
},
{
"epoch": 0.19979402677651906,
"grad_norm": 0.056396484375,
"learning_rate": 9.950136459380253e-06,
"loss": 0.1005,
"step": 291
},
{
"epoch": 0.20048060418812222,
"grad_norm": 0.05419921875,
"learning_rate": 9.949644903476752e-06,
"loss": 0.1031,
"step": 292
},
{
"epoch": 0.20116718159972538,
"grad_norm": 0.061279296875,
"learning_rate": 9.949150950189586e-06,
"loss": 0.0987,
"step": 293
},
{
"epoch": 0.20185375901132852,
"grad_norm": 0.06005859375,
"learning_rate": 9.948654599784886e-06,
"loss": 0.1033,
"step": 294
},
{
"epoch": 0.20254033642293168,
"grad_norm": 0.05419921875,
"learning_rate": 9.948155852530075e-06,
"loss": 0.0913,
"step": 295
},
{
"epoch": 0.20322691383453484,
"grad_norm": 0.052734375,
"learning_rate": 9.947654708693872e-06,
"loss": 0.1012,
"step": 296
},
{
"epoch": 0.203913491246138,
"grad_norm": 0.050048828125,
"learning_rate": 9.947151168546281e-06,
"loss": 0.1038,
"step": 297
},
{
"epoch": 0.20460006865774116,
"grad_norm": 0.0478515625,
"learning_rate": 9.946645232358602e-06,
"loss": 0.0978,
"step": 298
},
{
"epoch": 0.20528664606934433,
"grad_norm": 0.056396484375,
"learning_rate": 9.94613690040342e-06,
"loss": 0.1099,
"step": 299
},
{
"epoch": 0.2059732234809475,
"grad_norm": 0.04931640625,
"learning_rate": 9.945626172954617e-06,
"loss": 0.1112,
"step": 300
},
{
"epoch": 0.20665980089255062,
"grad_norm": 0.05029296875,
"learning_rate": 9.945113050287363e-06,
"loss": 0.0974,
"step": 301
},
{
"epoch": 0.20734637830415378,
"grad_norm": 0.060546875,
"learning_rate": 9.94459753267812e-06,
"loss": 0.0924,
"step": 302
},
{
"epoch": 0.20803295571575695,
"grad_norm": 0.057373046875,
"learning_rate": 9.944079620404638e-06,
"loss": 0.0959,
"step": 303
},
{
"epoch": 0.2087195331273601,
"grad_norm": 0.045654296875,
"learning_rate": 9.943559313745957e-06,
"loss": 0.1024,
"step": 304
},
{
"epoch": 0.20940611053896327,
"grad_norm": 0.046142578125,
"learning_rate": 9.943036612982409e-06,
"loss": 0.1018,
"step": 305
},
{
"epoch": 0.21009268795056643,
"grad_norm": 0.046875,
"learning_rate": 9.942511518395616e-06,
"loss": 0.1018,
"step": 306
},
{
"epoch": 0.2107792653621696,
"grad_norm": 0.05517578125,
"learning_rate": 9.941984030268487e-06,
"loss": 0.0995,
"step": 307
},
{
"epoch": 0.21146584277377276,
"grad_norm": 0.05712890625,
"learning_rate": 9.941454148885226e-06,
"loss": 0.1024,
"step": 308
},
{
"epoch": 0.2121524201853759,
"grad_norm": 0.052001953125,
"learning_rate": 9.940921874531322e-06,
"loss": 0.0983,
"step": 309
},
{
"epoch": 0.21283899759697905,
"grad_norm": 0.053955078125,
"learning_rate": 9.94038720749355e-06,
"loss": 0.1033,
"step": 310
},
{
"epoch": 0.2135255750085822,
"grad_norm": 0.0576171875,
"learning_rate": 9.939850148059983e-06,
"loss": 0.0983,
"step": 311
},
{
"epoch": 0.21421215242018538,
"grad_norm": 0.046875,
"learning_rate": 9.939310696519977e-06,
"loss": 0.098,
"step": 312
},
{
"epoch": 0.21489872983178854,
"grad_norm": 0.060546875,
"learning_rate": 9.938768853164176e-06,
"loss": 0.1087,
"step": 313
},
{
"epoch": 0.2155853072433917,
"grad_norm": 0.05908203125,
"learning_rate": 9.93822461828452e-06,
"loss": 0.1025,
"step": 314
},
{
"epoch": 0.21627188465499486,
"grad_norm": 0.044677734375,
"learning_rate": 9.937677992174228e-06,
"loss": 0.097,
"step": 315
},
{
"epoch": 0.216958462066598,
"grad_norm": 0.051513671875,
"learning_rate": 9.937128975127814e-06,
"loss": 0.1113,
"step": 316
},
{
"epoch": 0.21764503947820116,
"grad_norm": 0.05126953125,
"learning_rate": 9.936577567441074e-06,
"loss": 0.1005,
"step": 317
},
{
"epoch": 0.21833161688980432,
"grad_norm": 0.05029296875,
"learning_rate": 9.936023769411103e-06,
"loss": 0.0923,
"step": 318
},
{
"epoch": 0.21901819430140748,
"grad_norm": 0.04736328125,
"learning_rate": 9.935467581336269e-06,
"loss": 0.1031,
"step": 319
},
{
"epoch": 0.21970477171301064,
"grad_norm": 0.052978515625,
"learning_rate": 9.93490900351624e-06,
"loss": 0.09,
"step": 320
},
{
"epoch": 0.2203913491246138,
"grad_norm": 0.052490234375,
"learning_rate": 9.934348036251969e-06,
"loss": 0.0996,
"step": 321
},
{
"epoch": 0.22107792653621697,
"grad_norm": 0.052734375,
"learning_rate": 9.933784679845687e-06,
"loss": 0.1056,
"step": 322
},
{
"epoch": 0.22176450394782013,
"grad_norm": 0.05224609375,
"learning_rate": 9.933218934600927e-06,
"loss": 0.1074,
"step": 323
},
{
"epoch": 0.22245108135942326,
"grad_norm": 0.05322265625,
"learning_rate": 9.9326508008225e-06,
"loss": 0.109,
"step": 324
},
{
"epoch": 0.22313765877102643,
"grad_norm": 0.0498046875,
"learning_rate": 9.932080278816503e-06,
"loss": 0.0926,
"step": 325
},
{
"epoch": 0.2238242361826296,
"grad_norm": 0.053466796875,
"learning_rate": 9.931507368890323e-06,
"loss": 0.0987,
"step": 326
},
{
"epoch": 0.22451081359423275,
"grad_norm": 0.051025390625,
"learning_rate": 9.930932071352635e-06,
"loss": 0.104,
"step": 327
},
{
"epoch": 0.2251973910058359,
"grad_norm": 0.059814453125,
"learning_rate": 9.930354386513399e-06,
"loss": 0.1022,
"step": 328
},
{
"epoch": 0.22588396841743907,
"grad_norm": 0.047119140625,
"learning_rate": 9.929774314683856e-06,
"loss": 0.099,
"step": 329
},
{
"epoch": 0.22657054582904224,
"grad_norm": 0.0517578125,
"learning_rate": 9.929191856176543e-06,
"loss": 0.0948,
"step": 330
},
{
"epoch": 0.22725712324064537,
"grad_norm": 0.0498046875,
"learning_rate": 9.928607011305273e-06,
"loss": 0.1033,
"step": 331
},
{
"epoch": 0.22794370065224853,
"grad_norm": 0.049072265625,
"learning_rate": 9.928019780385152e-06,
"loss": 0.1096,
"step": 332
},
{
"epoch": 0.2286302780638517,
"grad_norm": 0.05224609375,
"learning_rate": 9.927430163732566e-06,
"loss": 0.0993,
"step": 333
},
{
"epoch": 0.22931685547545486,
"grad_norm": 0.050048828125,
"learning_rate": 9.926838161665195e-06,
"loss": 0.0969,
"step": 334
},
{
"epoch": 0.23000343288705802,
"grad_norm": 0.05322265625,
"learning_rate": 9.926243774501993e-06,
"loss": 0.1008,
"step": 335
},
{
"epoch": 0.23069001029866118,
"grad_norm": 0.0478515625,
"learning_rate": 9.925647002563205e-06,
"loss": 0.0915,
"step": 336
},
{
"epoch": 0.23137658771026434,
"grad_norm": 0.0537109375,
"learning_rate": 9.92504784617036e-06,
"loss": 0.1082,
"step": 337
},
{
"epoch": 0.2320631651218675,
"grad_norm": 0.051025390625,
"learning_rate": 9.924446305646278e-06,
"loss": 0.101,
"step": 338
},
{
"epoch": 0.23274974253347064,
"grad_norm": 0.083984375,
"learning_rate": 9.923842381315049e-06,
"loss": 0.1018,
"step": 339
},
{
"epoch": 0.2334363199450738,
"grad_norm": 0.049072265625,
"learning_rate": 9.92323607350206e-06,
"loss": 0.0961,
"step": 340
},
{
"epoch": 0.23412289735667696,
"grad_norm": 0.057373046875,
"learning_rate": 9.92262738253398e-06,
"loss": 0.1025,
"step": 341
},
{
"epoch": 0.23480947476828012,
"grad_norm": 0.048828125,
"learning_rate": 9.922016308738757e-06,
"loss": 0.0994,
"step": 342
},
{
"epoch": 0.2354960521798833,
"grad_norm": 0.049072265625,
"learning_rate": 9.921402852445627e-06,
"loss": 0.0979,
"step": 343
},
{
"epoch": 0.23618262959148645,
"grad_norm": 0.04833984375,
"learning_rate": 9.920787013985106e-06,
"loss": 0.0862,
"step": 344
},
{
"epoch": 0.2368692070030896,
"grad_norm": 0.049560546875,
"learning_rate": 9.920168793689e-06,
"loss": 0.1006,
"step": 345
},
{
"epoch": 0.23755578441469274,
"grad_norm": 0.050537109375,
"learning_rate": 9.919548191890395e-06,
"loss": 0.0911,
"step": 346
},
{
"epoch": 0.2382423618262959,
"grad_norm": 0.045166015625,
"learning_rate": 9.918925208923654e-06,
"loss": 0.1028,
"step": 347
},
{
"epoch": 0.23892893923789907,
"grad_norm": 0.04931640625,
"learning_rate": 9.918299845124433e-06,
"loss": 0.1066,
"step": 348
},
{
"epoch": 0.23961551664950223,
"grad_norm": 0.052734375,
"learning_rate": 9.917672100829664e-06,
"loss": 0.0895,
"step": 349
},
{
"epoch": 0.2403020940611054,
"grad_norm": 0.048095703125,
"learning_rate": 9.917041976377564e-06,
"loss": 0.1003,
"step": 350
},
{
"epoch": 0.24098867147270855,
"grad_norm": 0.0537109375,
"learning_rate": 9.916409472107632e-06,
"loss": 0.1059,
"step": 351
},
{
"epoch": 0.24167524888431172,
"grad_norm": 0.050537109375,
"learning_rate": 9.915774588360649e-06,
"loss": 0.0993,
"step": 352
},
{
"epoch": 0.24236182629591485,
"grad_norm": 0.055908203125,
"learning_rate": 9.915137325478677e-06,
"loss": 0.1147,
"step": 353
},
{
"epoch": 0.243048403707518,
"grad_norm": 0.048095703125,
"learning_rate": 9.914497683805065e-06,
"loss": 0.1039,
"step": 354
},
{
"epoch": 0.24373498111912117,
"grad_norm": 0.048583984375,
"learning_rate": 9.913855663684438e-06,
"loss": 0.1015,
"step": 355
},
{
"epoch": 0.24442155853072434,
"grad_norm": 0.05029296875,
"learning_rate": 9.9132112654627e-06,
"loss": 0.096,
"step": 356
},
{
"epoch": 0.2451081359423275,
"grad_norm": 0.050048828125,
"learning_rate": 9.912564489487047e-06,
"loss": 0.1075,
"step": 357
},
{
"epoch": 0.24579471335393066,
"grad_norm": 0.05029296875,
"learning_rate": 9.911915336105943e-06,
"loss": 0.1002,
"step": 358
},
{
"epoch": 0.24648129076553382,
"grad_norm": 0.0546875,
"learning_rate": 9.911263805669147e-06,
"loss": 0.1126,
"step": 359
},
{
"epoch": 0.24716786817713698,
"grad_norm": 0.05419921875,
"learning_rate": 9.910609898527686e-06,
"loss": 0.0963,
"step": 360
},
{
"epoch": 0.24785444558874012,
"grad_norm": 0.05908203125,
"learning_rate": 9.909953615033872e-06,
"loss": 0.1024,
"step": 361
},
{
"epoch": 0.24854102300034328,
"grad_norm": 0.048095703125,
"learning_rate": 9.9092949555413e-06,
"loss": 0.1034,
"step": 362
},
{
"epoch": 0.24922760041194644,
"grad_norm": 0.0546875,
"learning_rate": 9.908633920404844e-06,
"loss": 0.0964,
"step": 363
},
{
"epoch": 0.2499141778235496,
"grad_norm": 0.049560546875,
"learning_rate": 9.907970509980657e-06,
"loss": 0.0991,
"step": 364
},
{
"epoch": 0.25060075523515274,
"grad_norm": 0.04931640625,
"learning_rate": 9.90730472462617e-06,
"loss": 0.0935,
"step": 365
},
{
"epoch": 0.2512873326467559,
"grad_norm": 0.05224609375,
"learning_rate": 9.906636564700096e-06,
"loss": 0.0963,
"step": 366
},
{
"epoch": 0.25197391005835906,
"grad_norm": 0.059326171875,
"learning_rate": 9.905966030562426e-06,
"loss": 0.1065,
"step": 367
},
{
"epoch": 0.2526604874699622,
"grad_norm": 0.05078125,
"learning_rate": 9.905293122574433e-06,
"loss": 0.0968,
"step": 368
},
{
"epoch": 0.2533470648815654,
"grad_norm": 0.057373046875,
"learning_rate": 9.904617841098666e-06,
"loss": 0.0966,
"step": 369
},
{
"epoch": 0.25403364229316855,
"grad_norm": 0.06396484375,
"learning_rate": 9.903940186498953e-06,
"loss": 0.1047,
"step": 370
},
{
"epoch": 0.2547202197047717,
"grad_norm": 0.05078125,
"learning_rate": 9.903260159140404e-06,
"loss": 0.0971,
"step": 371
},
{
"epoch": 0.2554067971163749,
"grad_norm": 0.06689453125,
"learning_rate": 9.902577759389402e-06,
"loss": 0.1025,
"step": 372
},
{
"epoch": 0.25609337452797803,
"grad_norm": 0.052001953125,
"learning_rate": 9.901892987613612e-06,
"loss": 0.1088,
"step": 373
},
{
"epoch": 0.2567799519395812,
"grad_norm": 0.047119140625,
"learning_rate": 9.901205844181976e-06,
"loss": 0.091,
"step": 374
},
{
"epoch": 0.25746652935118436,
"grad_norm": 0.055419921875,
"learning_rate": 9.900516329464713e-06,
"loss": 0.1,
"step": 375
},
{
"epoch": 0.2581531067627875,
"grad_norm": 0.052490234375,
"learning_rate": 9.89982444383332e-06,
"loss": 0.1043,
"step": 376
},
{
"epoch": 0.2588396841743907,
"grad_norm": 0.04833984375,
"learning_rate": 9.899130187660573e-06,
"loss": 0.1011,
"step": 377
},
{
"epoch": 0.25952626158599384,
"grad_norm": 0.052490234375,
"learning_rate": 9.898433561320525e-06,
"loss": 0.0956,
"step": 378
},
{
"epoch": 0.26021283899759695,
"grad_norm": 0.056884765625,
"learning_rate": 9.897734565188504e-06,
"loss": 0.1147,
"step": 379
},
{
"epoch": 0.2608994164092001,
"grad_norm": 0.051025390625,
"learning_rate": 9.897033199641114e-06,
"loss": 0.0911,
"step": 380
},
{
"epoch": 0.2615859938208033,
"grad_norm": 0.053955078125,
"learning_rate": 9.896329465056238e-06,
"loss": 0.1079,
"step": 381
},
{
"epoch": 0.26227257123240644,
"grad_norm": 0.05029296875,
"learning_rate": 9.895623361813036e-06,
"loss": 0.0945,
"step": 382
},
{
"epoch": 0.2629591486440096,
"grad_norm": 0.05224609375,
"learning_rate": 9.894914890291944e-06,
"loss": 0.0953,
"step": 383
},
{
"epoch": 0.26364572605561276,
"grad_norm": 0.047119140625,
"learning_rate": 9.89420405087467e-06,
"loss": 0.1101,
"step": 384
},
{
"epoch": 0.2643323034672159,
"grad_norm": 0.05126953125,
"learning_rate": 9.893490843944201e-06,
"loss": 0.0926,
"step": 385
},
{
"epoch": 0.2650188808788191,
"grad_norm": 0.0546875,
"learning_rate": 9.892775269884802e-06,
"loss": 0.0907,
"step": 386
},
{
"epoch": 0.26570545829042225,
"grad_norm": 0.05078125,
"learning_rate": 9.892057329082009e-06,
"loss": 0.0903,
"step": 387
},
{
"epoch": 0.2663920357020254,
"grad_norm": 0.05517578125,
"learning_rate": 9.891337021922633e-06,
"loss": 0.0953,
"step": 388
},
{
"epoch": 0.26707861311362857,
"grad_norm": 0.056396484375,
"learning_rate": 9.890614348794764e-06,
"loss": 0.0968,
"step": 389
},
{
"epoch": 0.26776519052523173,
"grad_norm": 0.0458984375,
"learning_rate": 9.889889310087766e-06,
"loss": 0.1017,
"step": 390
},
{
"epoch": 0.2684517679368349,
"grad_norm": 0.0478515625,
"learning_rate": 9.889161906192271e-06,
"loss": 0.0925,
"step": 391
},
{
"epoch": 0.26913834534843806,
"grad_norm": 0.052734375,
"learning_rate": 9.888432137500194e-06,
"loss": 0.1004,
"step": 392
},
{
"epoch": 0.2698249227600412,
"grad_norm": 0.05126953125,
"learning_rate": 9.88770000440472e-06,
"loss": 0.0939,
"step": 393
},
{
"epoch": 0.2705115001716443,
"grad_norm": 0.05322265625,
"learning_rate": 9.886965507300309e-06,
"loss": 0.093,
"step": 394
},
{
"epoch": 0.2711980775832475,
"grad_norm": 0.0556640625,
"learning_rate": 9.886228646582694e-06,
"loss": 0.102,
"step": 395
},
{
"epoch": 0.27188465499485065,
"grad_norm": 0.0546875,
"learning_rate": 9.885489422648878e-06,
"loss": 0.1058,
"step": 396
},
{
"epoch": 0.2725712324064538,
"grad_norm": 0.0546875,
"learning_rate": 9.884747835897145e-06,
"loss": 0.1039,
"step": 397
},
{
"epoch": 0.273257809818057,
"grad_norm": 0.04931640625,
"learning_rate": 9.884003886727044e-06,
"loss": 0.0908,
"step": 398
},
{
"epoch": 0.27394438722966014,
"grad_norm": 0.046630859375,
"learning_rate": 9.883257575539404e-06,
"loss": 0.0924,
"step": 399
},
{
"epoch": 0.2746309646412633,
"grad_norm": 0.052490234375,
"learning_rate": 9.88250890273632e-06,
"loss": 0.1073,
"step": 400
},
{
"epoch": 0.27531754205286646,
"grad_norm": 0.045654296875,
"learning_rate": 9.881757868721166e-06,
"loss": 0.0973,
"step": 401
},
{
"epoch": 0.2760041194644696,
"grad_norm": 0.050537109375,
"learning_rate": 9.881004473898585e-06,
"loss": 0.0924,
"step": 402
},
{
"epoch": 0.2766906968760728,
"grad_norm": 0.06298828125,
"learning_rate": 9.880248718674486e-06,
"loss": 0.11,
"step": 403
},
{
"epoch": 0.27737727428767595,
"grad_norm": 0.052978515625,
"learning_rate": 9.879490603456062e-06,
"loss": 0.093,
"step": 404
},
{
"epoch": 0.2780638516992791,
"grad_norm": 0.048095703125,
"learning_rate": 9.878730128651768e-06,
"loss": 0.1046,
"step": 405
},
{
"epoch": 0.27875042911088227,
"grad_norm": 0.050537109375,
"learning_rate": 9.877967294671333e-06,
"loss": 0.1064,
"step": 406
},
{
"epoch": 0.27943700652248543,
"grad_norm": 0.050537109375,
"learning_rate": 9.87720210192576e-06,
"loss": 0.0954,
"step": 407
},
{
"epoch": 0.2801235839340886,
"grad_norm": 0.047607421875,
"learning_rate": 9.876434550827315e-06,
"loss": 0.0944,
"step": 408
},
{
"epoch": 0.2808101613456917,
"grad_norm": 0.05419921875,
"learning_rate": 9.875664641789545e-06,
"loss": 0.1119,
"step": 409
},
{
"epoch": 0.28149673875729486,
"grad_norm": 0.053955078125,
"learning_rate": 9.874892375227262e-06,
"loss": 0.1031,
"step": 410
},
{
"epoch": 0.282183316168898,
"grad_norm": 0.0498046875,
"learning_rate": 9.874117751556544e-06,
"loss": 0.0984,
"step": 411
},
{
"epoch": 0.2828698935805012,
"grad_norm": 0.047607421875,
"learning_rate": 9.873340771194749e-06,
"loss": 0.0978,
"step": 412
},
{
"epoch": 0.28355647099210435,
"grad_norm": 0.052001953125,
"learning_rate": 9.872561434560493e-06,
"loss": 0.0999,
"step": 413
},
{
"epoch": 0.2842430484037075,
"grad_norm": 0.04638671875,
"learning_rate": 9.871779742073675e-06,
"loss": 0.0871,
"step": 414
},
{
"epoch": 0.28492962581531067,
"grad_norm": 0.050048828125,
"learning_rate": 9.870995694155449e-06,
"loss": 0.0974,
"step": 415
},
{
"epoch": 0.28561620322691383,
"grad_norm": 0.05224609375,
"learning_rate": 9.87020929122825e-06,
"loss": 0.0981,
"step": 416
},
{
"epoch": 0.286302780638517,
"grad_norm": 0.05712890625,
"learning_rate": 9.869420533715777e-06,
"loss": 0.0807,
"step": 417
},
{
"epoch": 0.28698935805012016,
"grad_norm": 0.0556640625,
"learning_rate": 9.868629422042994e-06,
"loss": 0.1069,
"step": 418
},
{
"epoch": 0.2876759354617233,
"grad_norm": 0.06201171875,
"learning_rate": 9.867835956636137e-06,
"loss": 0.101,
"step": 419
},
{
"epoch": 0.2883625128733265,
"grad_norm": 0.051025390625,
"learning_rate": 9.867040137922712e-06,
"loss": 0.1105,
"step": 420
},
{
"epoch": 0.28904909028492964,
"grad_norm": 0.049072265625,
"learning_rate": 9.866241966331491e-06,
"loss": 0.1075,
"step": 421
},
{
"epoch": 0.2897356676965328,
"grad_norm": 0.047607421875,
"learning_rate": 9.865441442292513e-06,
"loss": 0.0941,
"step": 422
},
{
"epoch": 0.29042224510813597,
"grad_norm": 0.048583984375,
"learning_rate": 9.864638566237084e-06,
"loss": 0.0968,
"step": 423
},
{
"epoch": 0.2911088225197391,
"grad_norm": 0.050048828125,
"learning_rate": 9.86383333859778e-06,
"loss": 0.0945,
"step": 424
},
{
"epoch": 0.29179539993134224,
"grad_norm": 0.059326171875,
"learning_rate": 9.863025759808444e-06,
"loss": 0.0993,
"step": 425
},
{
"epoch": 0.2924819773429454,
"grad_norm": 0.0498046875,
"learning_rate": 9.862215830304178e-06,
"loss": 0.0939,
"step": 426
},
{
"epoch": 0.29316855475454856,
"grad_norm": 0.053466796875,
"learning_rate": 9.861403550521361e-06,
"loss": 0.1086,
"step": 427
},
{
"epoch": 0.2938551321661517,
"grad_norm": 0.052978515625,
"learning_rate": 9.860588920897633e-06,
"loss": 0.0952,
"step": 428
},
{
"epoch": 0.2945417095777549,
"grad_norm": 0.05322265625,
"learning_rate": 9.859771941871903e-06,
"loss": 0.0867,
"step": 429
},
{
"epoch": 0.29522828698935805,
"grad_norm": 0.056640625,
"learning_rate": 9.858952613884339e-06,
"loss": 0.1108,
"step": 430
},
{
"epoch": 0.2959148644009612,
"grad_norm": 0.045654296875,
"learning_rate": 9.858130937376384e-06,
"loss": 0.102,
"step": 431
},
{
"epoch": 0.29660144181256437,
"grad_norm": 0.0517578125,
"learning_rate": 9.857306912790737e-06,
"loss": 0.1038,
"step": 432
},
{
"epoch": 0.29728801922416753,
"grad_norm": 0.059814453125,
"learning_rate": 9.85648054057137e-06,
"loss": 0.0986,
"step": 433
},
{
"epoch": 0.2979745966357707,
"grad_norm": 0.05078125,
"learning_rate": 9.855651821163516e-06,
"loss": 0.0851,
"step": 434
},
{
"epoch": 0.29866117404737386,
"grad_norm": 0.0478515625,
"learning_rate": 9.854820755013672e-06,
"loss": 0.0922,
"step": 435
},
{
"epoch": 0.299347751458977,
"grad_norm": 0.046875,
"learning_rate": 9.853987342569604e-06,
"loss": 0.0868,
"step": 436
},
{
"epoch": 0.3000343288705802,
"grad_norm": 0.048583984375,
"learning_rate": 9.853151584280332e-06,
"loss": 0.0863,
"step": 437
},
{
"epoch": 0.30072090628218334,
"grad_norm": 0.052490234375,
"learning_rate": 9.852313480596155e-06,
"loss": 0.1026,
"step": 438
},
{
"epoch": 0.30140748369378645,
"grad_norm": 0.044921875,
"learning_rate": 9.851473031968621e-06,
"loss": 0.099,
"step": 439
},
{
"epoch": 0.3020940611053896,
"grad_norm": 0.0625,
"learning_rate": 9.850630238850549e-06,
"loss": 0.1165,
"step": 440
},
{
"epoch": 0.3027806385169928,
"grad_norm": 0.05078125,
"learning_rate": 9.849785101696022e-06,
"loss": 0.1013,
"step": 441
},
{
"epoch": 0.30346721592859593,
"grad_norm": 0.05322265625,
"learning_rate": 9.848937620960382e-06,
"loss": 0.0974,
"step": 442
},
{
"epoch": 0.3041537933401991,
"grad_norm": 0.059326171875,
"learning_rate": 9.848087797100234e-06,
"loss": 0.0901,
"step": 443
},
{
"epoch": 0.30484037075180226,
"grad_norm": 0.0556640625,
"learning_rate": 9.84723563057345e-06,
"loss": 0.0938,
"step": 444
},
{
"epoch": 0.3055269481634054,
"grad_norm": 0.05029296875,
"learning_rate": 9.84638112183916e-06,
"loss": 0.0967,
"step": 445
},
{
"epoch": 0.3062135255750086,
"grad_norm": 0.055908203125,
"learning_rate": 9.845524271357757e-06,
"loss": 0.0848,
"step": 446
},
{
"epoch": 0.30690010298661174,
"grad_norm": 0.0478515625,
"learning_rate": 9.844665079590892e-06,
"loss": 0.0881,
"step": 447
},
{
"epoch": 0.3075866803982149,
"grad_norm": 0.05078125,
"learning_rate": 9.843803547001487e-06,
"loss": 0.0942,
"step": 448
},
{
"epoch": 0.30827325780981807,
"grad_norm": 0.05419921875,
"learning_rate": 9.842939674053715e-06,
"loss": 0.0922,
"step": 449
},
{
"epoch": 0.30895983522142123,
"grad_norm": 0.05712890625,
"learning_rate": 9.842073461213017e-06,
"loss": 0.0988,
"step": 450
},
{
"epoch": 0.3096464126330244,
"grad_norm": 0.05224609375,
"learning_rate": 9.841204908946091e-06,
"loss": 0.0916,
"step": 451
},
{
"epoch": 0.31033299004462755,
"grad_norm": 0.050537109375,
"learning_rate": 9.840334017720896e-06,
"loss": 0.0971,
"step": 452
},
{
"epoch": 0.3110195674562307,
"grad_norm": 0.05224609375,
"learning_rate": 9.839460788006652e-06,
"loss": 0.0919,
"step": 453
},
{
"epoch": 0.3117061448678338,
"grad_norm": 0.053955078125,
"learning_rate": 9.838585220273837e-06,
"loss": 0.0892,
"step": 454
},
{
"epoch": 0.312392722279437,
"grad_norm": 0.051025390625,
"learning_rate": 9.837707314994192e-06,
"loss": 0.096,
"step": 455
},
{
"epoch": 0.31307929969104015,
"grad_norm": 0.050537109375,
"learning_rate": 9.836827072640716e-06,
"loss": 0.1013,
"step": 456
},
{
"epoch": 0.3137658771026433,
"grad_norm": 0.053466796875,
"learning_rate": 9.835944493687665e-06,
"loss": 0.0991,
"step": 457
},
{
"epoch": 0.31445245451424647,
"grad_norm": 0.049072265625,
"learning_rate": 9.835059578610556e-06,
"loss": 0.106,
"step": 458
},
{
"epoch": 0.31513903192584963,
"grad_norm": 0.057373046875,
"learning_rate": 9.834172327886166e-06,
"loss": 0.0946,
"step": 459
},
{
"epoch": 0.3158256093374528,
"grad_norm": 0.0556640625,
"learning_rate": 9.833282741992526e-06,
"loss": 0.1118,
"step": 460
},
{
"epoch": 0.31651218674905596,
"grad_norm": 0.05712890625,
"learning_rate": 9.832390821408929e-06,
"loss": 0.0994,
"step": 461
},
{
"epoch": 0.3171987641606591,
"grad_norm": 0.05126953125,
"learning_rate": 9.831496566615927e-06,
"loss": 0.1095,
"step": 462
},
{
"epoch": 0.3178853415722623,
"grad_norm": 0.054443359375,
"learning_rate": 9.830599978095323e-06,
"loss": 0.098,
"step": 463
},
{
"epoch": 0.31857191898386544,
"grad_norm": 0.046630859375,
"learning_rate": 9.829701056330188e-06,
"loss": 0.1083,
"step": 464
},
{
"epoch": 0.3192584963954686,
"grad_norm": 0.051513671875,
"learning_rate": 9.828799801804837e-06,
"loss": 0.0942,
"step": 465
},
{
"epoch": 0.31994507380707177,
"grad_norm": 0.045654296875,
"learning_rate": 9.827896215004853e-06,
"loss": 0.1035,
"step": 466
},
{
"epoch": 0.32063165121867493,
"grad_norm": 0.04638671875,
"learning_rate": 9.826990296417071e-06,
"loss": 0.1024,
"step": 467
},
{
"epoch": 0.3213182286302781,
"grad_norm": 0.0517578125,
"learning_rate": 9.826082046529581e-06,
"loss": 0.103,
"step": 468
},
{
"epoch": 0.3220048060418812,
"grad_norm": 0.05078125,
"learning_rate": 9.825171465831732e-06,
"loss": 0.0943,
"step": 469
},
{
"epoch": 0.32269138345348436,
"grad_norm": 0.053955078125,
"learning_rate": 9.824258554814126e-06,
"loss": 0.0973,
"step": 470
},
{
"epoch": 0.3233779608650875,
"grad_norm": 0.050537109375,
"learning_rate": 9.823343313968624e-06,
"loss": 0.0958,
"step": 471
},
{
"epoch": 0.3240645382766907,
"grad_norm": 0.053955078125,
"learning_rate": 9.82242574378834e-06,
"loss": 0.0972,
"step": 472
},
{
"epoch": 0.32475111568829385,
"grad_norm": 0.046142578125,
"learning_rate": 9.821505844767642e-06,
"loss": 0.0943,
"step": 473
},
{
"epoch": 0.325437693099897,
"grad_norm": 0.056396484375,
"learning_rate": 9.820583617402153e-06,
"loss": 0.0925,
"step": 474
},
{
"epoch": 0.32612427051150017,
"grad_norm": 0.05322265625,
"learning_rate": 9.819659062188754e-06,
"loss": 0.0903,
"step": 475
},
{
"epoch": 0.32681084792310333,
"grad_norm": 0.05615234375,
"learning_rate": 9.818732179625578e-06,
"loss": 0.0951,
"step": 476
},
{
"epoch": 0.3274974253347065,
"grad_norm": 0.041748046875,
"learning_rate": 9.817802970212009e-06,
"loss": 0.0964,
"step": 477
},
{
"epoch": 0.32818400274630966,
"grad_norm": 0.0625,
"learning_rate": 9.81687143444869e-06,
"loss": 0.1017,
"step": 478
},
{
"epoch": 0.3288705801579128,
"grad_norm": 0.04931640625,
"learning_rate": 9.815937572837511e-06,
"loss": 0.0909,
"step": 479
},
{
"epoch": 0.329557157569516,
"grad_norm": 0.052978515625,
"learning_rate": 9.815001385881624e-06,
"loss": 0.0889,
"step": 480
},
{
"epoch": 0.33024373498111914,
"grad_norm": 0.056884765625,
"learning_rate": 9.814062874085424e-06,
"loss": 0.0918,
"step": 481
},
{
"epoch": 0.3309303123927223,
"grad_norm": 0.05712890625,
"learning_rate": 9.813122037954567e-06,
"loss": 0.1072,
"step": 482
},
{
"epoch": 0.33161688980432547,
"grad_norm": 0.052734375,
"learning_rate": 9.812178877995954e-06,
"loss": 0.0905,
"step": 483
},
{
"epoch": 0.33230346721592857,
"grad_norm": 0.04638671875,
"learning_rate": 9.811233394717742e-06,
"loss": 0.0895,
"step": 484
},
{
"epoch": 0.33299004462753173,
"grad_norm": 0.050048828125,
"learning_rate": 9.810285588629342e-06,
"loss": 0.1091,
"step": 485
},
{
"epoch": 0.3336766220391349,
"grad_norm": 0.0517578125,
"learning_rate": 9.809335460241412e-06,
"loss": 0.0977,
"step": 486
},
{
"epoch": 0.33436319945073806,
"grad_norm": 0.046875,
"learning_rate": 9.808383010065863e-06,
"loss": 0.0955,
"step": 487
},
{
"epoch": 0.3350497768623412,
"grad_norm": 0.048095703125,
"learning_rate": 9.807428238615858e-06,
"loss": 0.0835,
"step": 488
},
{
"epoch": 0.3357363542739444,
"grad_norm": 0.050537109375,
"learning_rate": 9.806471146405809e-06,
"loss": 0.0982,
"step": 489
},
{
"epoch": 0.33642293168554754,
"grad_norm": 0.045166015625,
"learning_rate": 9.805511733951379e-06,
"loss": 0.0884,
"step": 490
},
{
"epoch": 0.3371095090971507,
"grad_norm": 0.053955078125,
"learning_rate": 9.804550001769478e-06,
"loss": 0.0991,
"step": 491
},
{
"epoch": 0.33779608650875387,
"grad_norm": 0.052978515625,
"learning_rate": 9.803585950378274e-06,
"loss": 0.0955,
"step": 492
},
{
"epoch": 0.33848266392035703,
"grad_norm": 0.06494140625,
"learning_rate": 9.802619580297178e-06,
"loss": 0.0919,
"step": 493
},
{
"epoch": 0.3391692413319602,
"grad_norm": 0.051025390625,
"learning_rate": 9.801650892046851e-06,
"loss": 0.1032,
"step": 494
},
{
"epoch": 0.33985581874356335,
"grad_norm": 0.048828125,
"learning_rate": 9.800679886149203e-06,
"loss": 0.103,
"step": 495
},
{
"epoch": 0.3405423961551665,
"grad_norm": 0.0537109375,
"learning_rate": 9.799706563127395e-06,
"loss": 0.098,
"step": 496
},
{
"epoch": 0.3412289735667697,
"grad_norm": 0.049072265625,
"learning_rate": 9.798730923505833e-06,
"loss": 0.0885,
"step": 497
},
{
"epoch": 0.3419155509783728,
"grad_norm": 0.051513671875,
"learning_rate": 9.797752967810176e-06,
"loss": 0.1006,
"step": 498
},
{
"epoch": 0.34260212838997595,
"grad_norm": 0.05419921875,
"learning_rate": 9.796772696567323e-06,
"loss": 0.0914,
"step": 499
},
{
"epoch": 0.3432887058015791,
"grad_norm": 0.060791015625,
"learning_rate": 9.795790110305431e-06,
"loss": 0.0924,
"step": 500
},
{
"epoch": 0.34397528321318227,
"grad_norm": 0.049072265625,
"learning_rate": 9.794805209553896e-06,
"loss": 0.0916,
"step": 501
},
{
"epoch": 0.34466186062478543,
"grad_norm": 0.0498046875,
"learning_rate": 9.793817994843362e-06,
"loss": 0.0906,
"step": 502
},
{
"epoch": 0.3453484380363886,
"grad_norm": 0.053955078125,
"learning_rate": 9.792828466705725e-06,
"loss": 0.1107,
"step": 503
},
{
"epoch": 0.34603501544799176,
"grad_norm": 0.052490234375,
"learning_rate": 9.79183662567412e-06,
"loss": 0.091,
"step": 504
},
{
"epoch": 0.3467215928595949,
"grad_norm": 0.054443359375,
"learning_rate": 9.790842472282935e-06,
"loss": 0.1009,
"step": 505
},
{
"epoch": 0.3474081702711981,
"grad_norm": 0.047119140625,
"learning_rate": 9.789846007067802e-06,
"loss": 0.0908,
"step": 506
},
{
"epoch": 0.34809474768280124,
"grad_norm": 0.050048828125,
"learning_rate": 9.788847230565592e-06,
"loss": 0.0912,
"step": 507
},
{
"epoch": 0.3487813250944044,
"grad_norm": 0.046630859375,
"learning_rate": 9.787846143314433e-06,
"loss": 0.099,
"step": 508
},
{
"epoch": 0.34946790250600757,
"grad_norm": 0.05322265625,
"learning_rate": 9.786842745853685e-06,
"loss": 0.0913,
"step": 509
},
{
"epoch": 0.35015447991761073,
"grad_norm": 0.046875,
"learning_rate": 9.785837038723966e-06,
"loss": 0.0976,
"step": 510
},
{
"epoch": 0.3508410573292139,
"grad_norm": 0.050048828125,
"learning_rate": 9.784829022467128e-06,
"loss": 0.1042,
"step": 511
},
{
"epoch": 0.35152763474081705,
"grad_norm": 0.05712890625,
"learning_rate": 9.783818697626273e-06,
"loss": 0.0993,
"step": 512
},
{
"epoch": 0.35221421215242016,
"grad_norm": 0.0439453125,
"learning_rate": 9.782806064745742e-06,
"loss": 0.0862,
"step": 513
},
{
"epoch": 0.3529007895640233,
"grad_norm": 0.046875,
"learning_rate": 9.781791124371124e-06,
"loss": 0.098,
"step": 514
},
{
"epoch": 0.3535873669756265,
"grad_norm": 0.05078125,
"learning_rate": 9.78077387704925e-06,
"loss": 0.0924,
"step": 515
},
{
"epoch": 0.35427394438722964,
"grad_norm": 0.049072265625,
"learning_rate": 9.779754323328192e-06,
"loss": 0.0944,
"step": 516
},
{
"epoch": 0.3549605217988328,
"grad_norm": 0.051025390625,
"learning_rate": 9.778732463757267e-06,
"loss": 0.0977,
"step": 517
},
{
"epoch": 0.35564709921043597,
"grad_norm": 0.059326171875,
"learning_rate": 9.777708298887034e-06,
"loss": 0.0957,
"step": 518
},
{
"epoch": 0.35633367662203913,
"grad_norm": 0.07275390625,
"learning_rate": 9.776681829269291e-06,
"loss": 0.0824,
"step": 519
},
{
"epoch": 0.3570202540336423,
"grad_norm": 0.050537109375,
"learning_rate": 9.775653055457082e-06,
"loss": 0.0913,
"step": 520
},
{
"epoch": 0.35770683144524545,
"grad_norm": 0.05078125,
"learning_rate": 9.774621978004692e-06,
"loss": 0.0943,
"step": 521
},
{
"epoch": 0.3583934088568486,
"grad_norm": 0.055419921875,
"learning_rate": 9.773588597467642e-06,
"loss": 0.0938,
"step": 522
},
{
"epoch": 0.3590799862684518,
"grad_norm": 0.04833984375,
"learning_rate": 9.772552914402701e-06,
"loss": 0.0934,
"step": 523
},
{
"epoch": 0.35976656368005494,
"grad_norm": 0.057373046875,
"learning_rate": 9.771514929367875e-06,
"loss": 0.1038,
"step": 524
},
{
"epoch": 0.3604531410916581,
"grad_norm": 0.05615234375,
"learning_rate": 9.77047464292241e-06,
"loss": 0.0952,
"step": 525
},
{
"epoch": 0.36113971850326126,
"grad_norm": 0.048828125,
"learning_rate": 9.76943205562679e-06,
"loss": 0.0932,
"step": 526
},
{
"epoch": 0.3618262959148644,
"grad_norm": 0.045166015625,
"learning_rate": 9.768387168042745e-06,
"loss": 0.0898,
"step": 527
},
{
"epoch": 0.36251287332646753,
"grad_norm": 0.055908203125,
"learning_rate": 9.76733998073324e-06,
"loss": 0.1001,
"step": 528
},
{
"epoch": 0.3631994507380707,
"grad_norm": 0.045654296875,
"learning_rate": 9.766290494262477e-06,
"loss": 0.0942,
"step": 529
},
{
"epoch": 0.36388602814967386,
"grad_norm": 0.0634765625,
"learning_rate": 9.7652387091959e-06,
"loss": 0.1166,
"step": 530
},
{
"epoch": 0.364572605561277,
"grad_norm": 0.0546875,
"learning_rate": 9.764184626100193e-06,
"loss": 0.0933,
"step": 531
},
{
"epoch": 0.3652591829728802,
"grad_norm": 0.06201171875,
"learning_rate": 9.763128245543272e-06,
"loss": 0.0961,
"step": 532
},
{
"epoch": 0.36594576038448334,
"grad_norm": 0.04638671875,
"learning_rate": 9.7620695680943e-06,
"loss": 0.0964,
"step": 533
},
{
"epoch": 0.3666323377960865,
"grad_norm": 0.04833984375,
"learning_rate": 9.761008594323666e-06,
"loss": 0.0975,
"step": 534
},
{
"epoch": 0.36731891520768967,
"grad_norm": 0.050537109375,
"learning_rate": 9.759945324803006e-06,
"loss": 0.0951,
"step": 535
},
{
"epoch": 0.36800549261929283,
"grad_norm": 0.05029296875,
"learning_rate": 9.75887976010519e-06,
"loss": 0.0882,
"step": 536
},
{
"epoch": 0.368692070030896,
"grad_norm": 0.0634765625,
"learning_rate": 9.75781190080432e-06,
"loss": 0.1036,
"step": 537
},
{
"epoch": 0.36937864744249915,
"grad_norm": 0.04638671875,
"learning_rate": 9.756741747475744e-06,
"loss": 0.0964,
"step": 538
},
{
"epoch": 0.3700652248541023,
"grad_norm": 0.048095703125,
"learning_rate": 9.755669300696035e-06,
"loss": 0.1018,
"step": 539
},
{
"epoch": 0.3707518022657055,
"grad_norm": 0.04833984375,
"learning_rate": 9.75459456104301e-06,
"loss": 0.0919,
"step": 540
},
{
"epoch": 0.37143837967730864,
"grad_norm": 0.043701171875,
"learning_rate": 9.753517529095716e-06,
"loss": 0.0999,
"step": 541
},
{
"epoch": 0.3721249570889118,
"grad_norm": 0.046875,
"learning_rate": 9.752438205434439e-06,
"loss": 0.0877,
"step": 542
},
{
"epoch": 0.3728115345005149,
"grad_norm": 0.048828125,
"learning_rate": 9.751356590640696e-06,
"loss": 0.0926,
"step": 543
},
{
"epoch": 0.37349811191211807,
"grad_norm": 0.04931640625,
"learning_rate": 9.750272685297241e-06,
"loss": 0.0983,
"step": 544
},
{
"epoch": 0.37418468932372123,
"grad_norm": 0.052734375,
"learning_rate": 9.749186489988065e-06,
"loss": 0.0966,
"step": 545
},
{
"epoch": 0.3748712667353244,
"grad_norm": 0.04736328125,
"learning_rate": 9.748098005298384e-06,
"loss": 0.1003,
"step": 546
},
{
"epoch": 0.37555784414692756,
"grad_norm": 0.05029296875,
"learning_rate": 9.747007231814656e-06,
"loss": 0.0979,
"step": 547
},
{
"epoch": 0.3762444215585307,
"grad_norm": 0.061767578125,
"learning_rate": 9.745914170124568e-06,
"loss": 0.1027,
"step": 548
},
{
"epoch": 0.3769309989701339,
"grad_norm": 0.045166015625,
"learning_rate": 9.74481882081704e-06,
"loss": 0.0885,
"step": 549
},
{
"epoch": 0.37761757638173704,
"grad_norm": 0.06494140625,
"learning_rate": 9.743721184482226e-06,
"loss": 0.0991,
"step": 550
},
{
"epoch": 0.3783041537933402,
"grad_norm": 0.047607421875,
"learning_rate": 9.742621261711512e-06,
"loss": 0.092,
"step": 551
},
{
"epoch": 0.37899073120494337,
"grad_norm": 0.048828125,
"learning_rate": 9.741519053097516e-06,
"loss": 0.1007,
"step": 552
},
{
"epoch": 0.3796773086165465,
"grad_norm": 0.049560546875,
"learning_rate": 9.740414559234085e-06,
"loss": 0.0954,
"step": 553
},
{
"epoch": 0.3803638860281497,
"grad_norm": 0.05517578125,
"learning_rate": 9.739307780716301e-06,
"loss": 0.1059,
"step": 554
},
{
"epoch": 0.38105046343975285,
"grad_norm": 0.049072265625,
"learning_rate": 9.738198718140471e-06,
"loss": 0.0927,
"step": 555
},
{
"epoch": 0.381737040851356,
"grad_norm": 0.057373046875,
"learning_rate": 9.737087372104143e-06,
"loss": 0.0929,
"step": 556
},
{
"epoch": 0.3824236182629592,
"grad_norm": 0.046875,
"learning_rate": 9.735973743206085e-06,
"loss": 0.1015,
"step": 557
},
{
"epoch": 0.3831101956745623,
"grad_norm": 0.05517578125,
"learning_rate": 9.7348578320463e-06,
"loss": 0.0959,
"step": 558
},
{
"epoch": 0.38379677308616544,
"grad_norm": 0.056640625,
"learning_rate": 9.73373963922602e-06,
"loss": 0.1032,
"step": 559
},
{
"epoch": 0.3844833504977686,
"grad_norm": 0.0615234375,
"learning_rate": 9.732619165347705e-06,
"loss": 0.0872,
"step": 560
},
{
"epoch": 0.38516992790937177,
"grad_norm": 0.057373046875,
"learning_rate": 9.731496411015046e-06,
"loss": 0.0942,
"step": 561
},
{
"epoch": 0.38585650532097493,
"grad_norm": 0.0654296875,
"learning_rate": 9.73037137683296e-06,
"loss": 0.1003,
"step": 562
},
{
"epoch": 0.3865430827325781,
"grad_norm": 0.04931640625,
"learning_rate": 9.729244063407594e-06,
"loss": 0.094,
"step": 563
},
{
"epoch": 0.38722966014418125,
"grad_norm": 0.05029296875,
"learning_rate": 9.728114471346324e-06,
"loss": 0.0935,
"step": 564
},
{
"epoch": 0.3879162375557844,
"grad_norm": 0.0546875,
"learning_rate": 9.726982601257755e-06,
"loss": 0.1077,
"step": 565
},
{
"epoch": 0.3886028149673876,
"grad_norm": 0.05517578125,
"learning_rate": 9.725848453751712e-06,
"loss": 0.1007,
"step": 566
},
{
"epoch": 0.38928939237899074,
"grad_norm": 0.052978515625,
"learning_rate": 9.724712029439255e-06,
"loss": 0.0951,
"step": 567
},
{
"epoch": 0.3899759697905939,
"grad_norm": 0.055419921875,
"learning_rate": 9.723573328932669e-06,
"loss": 0.0993,
"step": 568
},
{
"epoch": 0.39066254720219706,
"grad_norm": 0.051025390625,
"learning_rate": 9.722432352845458e-06,
"loss": 0.096,
"step": 569
},
{
"epoch": 0.3913491246138002,
"grad_norm": 0.04638671875,
"learning_rate": 9.721289101792367e-06,
"loss": 0.0948,
"step": 570
},
{
"epoch": 0.3920357020254034,
"grad_norm": 0.050537109375,
"learning_rate": 9.72014357638935e-06,
"loss": 0.0967,
"step": 571
},
{
"epoch": 0.39272227943700655,
"grad_norm": 0.052978515625,
"learning_rate": 9.718995777253598e-06,
"loss": 0.0923,
"step": 572
},
{
"epoch": 0.39340885684860966,
"grad_norm": 0.044189453125,
"learning_rate": 9.717845705003523e-06,
"loss": 0.0938,
"step": 573
},
{
"epoch": 0.3940954342602128,
"grad_norm": 0.049072265625,
"learning_rate": 9.716693360258761e-06,
"loss": 0.0976,
"step": 574
},
{
"epoch": 0.394782011671816,
"grad_norm": 0.05078125,
"learning_rate": 9.715538743640177e-06,
"loss": 0.0969,
"step": 575
},
{
"epoch": 0.39546858908341914,
"grad_norm": 0.05322265625,
"learning_rate": 9.71438185576985e-06,
"loss": 0.0894,
"step": 576
},
{
"epoch": 0.3961551664950223,
"grad_norm": 0.04736328125,
"learning_rate": 9.71322269727109e-06,
"loss": 0.1066,
"step": 577
},
{
"epoch": 0.39684174390662547,
"grad_norm": 0.0556640625,
"learning_rate": 9.712061268768436e-06,
"loss": 0.0931,
"step": 578
},
{
"epoch": 0.39752832131822863,
"grad_norm": 0.06298828125,
"learning_rate": 9.710897570887639e-06,
"loss": 0.1004,
"step": 579
},
{
"epoch": 0.3982148987298318,
"grad_norm": 0.0517578125,
"learning_rate": 9.709731604255675e-06,
"loss": 0.0894,
"step": 580
},
{
"epoch": 0.39890147614143495,
"grad_norm": 0.052978515625,
"learning_rate": 9.70856336950075e-06,
"loss": 0.0988,
"step": 581
},
{
"epoch": 0.3995880535530381,
"grad_norm": 0.052490234375,
"learning_rate": 9.707392867252282e-06,
"loss": 0.1024,
"step": 582
},
{
"epoch": 0.4002746309646413,
"grad_norm": 0.05029296875,
"learning_rate": 9.706220098140917e-06,
"loss": 0.0846,
"step": 583
},
{
"epoch": 0.40096120837624444,
"grad_norm": 0.047607421875,
"learning_rate": 9.705045062798519e-06,
"loss": 0.1155,
"step": 584
},
{
"epoch": 0.4016477857878476,
"grad_norm": 0.05859375,
"learning_rate": 9.703867761858177e-06,
"loss": 0.1043,
"step": 585
},
{
"epoch": 0.40233436319945076,
"grad_norm": 0.0537109375,
"learning_rate": 9.702688195954198e-06,
"loss": 0.0926,
"step": 586
},
{
"epoch": 0.4030209406110539,
"grad_norm": 0.06005859375,
"learning_rate": 9.70150636572211e-06,
"loss": 0.0909,
"step": 587
},
{
"epoch": 0.40370751802265703,
"grad_norm": 0.05078125,
"learning_rate": 9.700322271798657e-06,
"loss": 0.1098,
"step": 588
},
{
"epoch": 0.4043940954342602,
"grad_norm": 0.055419921875,
"learning_rate": 9.69913591482181e-06,
"loss": 0.0972,
"step": 589
},
{
"epoch": 0.40508067284586335,
"grad_norm": 0.048583984375,
"learning_rate": 9.697947295430754e-06,
"loss": 0.0873,
"step": 590
},
{
"epoch": 0.4057672502574665,
"grad_norm": 0.051025390625,
"learning_rate": 9.696756414265894e-06,
"loss": 0.0976,
"step": 591
},
{
"epoch": 0.4064538276690697,
"grad_norm": 0.057373046875,
"learning_rate": 9.695563271968853e-06,
"loss": 0.1141,
"step": 592
},
{
"epoch": 0.40714040508067284,
"grad_norm": 0.04736328125,
"learning_rate": 9.694367869182479e-06,
"loss": 0.0915,
"step": 593
},
{
"epoch": 0.407826982492276,
"grad_norm": 0.06005859375,
"learning_rate": 9.693170206550824e-06,
"loss": 0.0912,
"step": 594
},
{
"epoch": 0.40851355990387916,
"grad_norm": 0.046875,
"learning_rate": 9.69197028471917e-06,
"loss": 0.0946,
"step": 595
},
{
"epoch": 0.4092001373154823,
"grad_norm": 0.05615234375,
"learning_rate": 9.690768104334015e-06,
"loss": 0.1031,
"step": 596
},
{
"epoch": 0.4098867147270855,
"grad_norm": 0.053955078125,
"learning_rate": 9.689563666043065e-06,
"loss": 0.1008,
"step": 597
},
{
"epoch": 0.41057329213868865,
"grad_norm": 0.046875,
"learning_rate": 9.688356970495252e-06,
"loss": 0.1021,
"step": 598
},
{
"epoch": 0.4112598695502918,
"grad_norm": 0.054931640625,
"learning_rate": 9.687148018340717e-06,
"loss": 0.0992,
"step": 599
},
{
"epoch": 0.411946446961895,
"grad_norm": 0.04931640625,
"learning_rate": 9.685936810230824e-06,
"loss": 0.096,
"step": 600
},
{
"epoch": 0.41263302437349814,
"grad_norm": 0.05126953125,
"learning_rate": 9.684723346818149e-06,
"loss": 0.1063,
"step": 601
},
{
"epoch": 0.41331960178510124,
"grad_norm": 0.050537109375,
"learning_rate": 9.683507628756477e-06,
"loss": 0.0882,
"step": 602
},
{
"epoch": 0.4140061791967044,
"grad_norm": 0.05078125,
"learning_rate": 9.682289656700823e-06,
"loss": 0.0973,
"step": 603
},
{
"epoch": 0.41469275660830757,
"grad_norm": 0.04833984375,
"learning_rate": 9.6810694313074e-06,
"loss": 0.0842,
"step": 604
},
{
"epoch": 0.41537933401991073,
"grad_norm": 0.051025390625,
"learning_rate": 9.679846953233644e-06,
"loss": 0.0895,
"step": 605
},
{
"epoch": 0.4160659114315139,
"grad_norm": 0.052734375,
"learning_rate": 9.678622223138203e-06,
"loss": 0.1089,
"step": 606
},
{
"epoch": 0.41675248884311705,
"grad_norm": 0.048583984375,
"learning_rate": 9.677395241680939e-06,
"loss": 0.0863,
"step": 607
},
{
"epoch": 0.4174390662547202,
"grad_norm": 0.058349609375,
"learning_rate": 9.676166009522925e-06,
"loss": 0.0873,
"step": 608
},
{
"epoch": 0.4181256436663234,
"grad_norm": 0.048583984375,
"learning_rate": 9.674934527326447e-06,
"loss": 0.0956,
"step": 609
},
{
"epoch": 0.41881222107792654,
"grad_norm": 0.0537109375,
"learning_rate": 9.673700795755008e-06,
"loss": 0.1002,
"step": 610
},
{
"epoch": 0.4194987984895297,
"grad_norm": 0.054443359375,
"learning_rate": 9.672464815473315e-06,
"loss": 0.1013,
"step": 611
},
{
"epoch": 0.42018537590113286,
"grad_norm": 0.054931640625,
"learning_rate": 9.67122658714729e-06,
"loss": 0.105,
"step": 612
},
{
"epoch": 0.420871953312736,
"grad_norm": 0.054443359375,
"learning_rate": 9.669986111444071e-06,
"loss": 0.0911,
"step": 613
},
{
"epoch": 0.4215585307243392,
"grad_norm": 0.056640625,
"learning_rate": 9.668743389032001e-06,
"loss": 0.1001,
"step": 614
},
{
"epoch": 0.42224510813594235,
"grad_norm": 0.0517578125,
"learning_rate": 9.667498420580632e-06,
"loss": 0.0878,
"step": 615
},
{
"epoch": 0.4229316855475455,
"grad_norm": 0.046142578125,
"learning_rate": 9.666251206760732e-06,
"loss": 0.0914,
"step": 616
},
{
"epoch": 0.4236182629591486,
"grad_norm": 0.0517578125,
"learning_rate": 9.665001748244276e-06,
"loss": 0.1006,
"step": 617
},
{
"epoch": 0.4243048403707518,
"grad_norm": 0.052734375,
"learning_rate": 9.663750045704448e-06,
"loss": 0.1053,
"step": 618
},
{
"epoch": 0.42499141778235494,
"grad_norm": 0.05224609375,
"learning_rate": 9.66249609981564e-06,
"loss": 0.1015,
"step": 619
},
{
"epoch": 0.4256779951939581,
"grad_norm": 0.056640625,
"learning_rate": 9.661239911253457e-06,
"loss": 0.0913,
"step": 620
},
{
"epoch": 0.42636457260556127,
"grad_norm": 0.04443359375,
"learning_rate": 9.659981480694708e-06,
"loss": 0.0923,
"step": 621
},
{
"epoch": 0.4270511500171644,
"grad_norm": 0.050048828125,
"learning_rate": 9.65872080881741e-06,
"loss": 0.1104,
"step": 622
},
{
"epoch": 0.4277377274287676,
"grad_norm": 0.0556640625,
"learning_rate": 9.65745789630079e-06,
"loss": 0.0925,
"step": 623
},
{
"epoch": 0.42842430484037075,
"grad_norm": 0.056640625,
"learning_rate": 9.656192743825283e-06,
"loss": 0.084,
"step": 624
},
{
"epoch": 0.4291108822519739,
"grad_norm": 0.064453125,
"learning_rate": 9.654925352072526e-06,
"loss": 0.106,
"step": 625
},
{
"epoch": 0.4297974596635771,
"grad_norm": 0.05322265625,
"learning_rate": 9.653655721725367e-06,
"loss": 0.0945,
"step": 626
},
{
"epoch": 0.43048403707518024,
"grad_norm": 0.05224609375,
"learning_rate": 9.652383853467858e-06,
"loss": 0.1017,
"step": 627
},
{
"epoch": 0.4311706144867834,
"grad_norm": 0.05712890625,
"learning_rate": 9.651109747985257e-06,
"loss": 0.0931,
"step": 628
},
{
"epoch": 0.43185719189838656,
"grad_norm": 0.053955078125,
"learning_rate": 9.649833405964029e-06,
"loss": 0.0952,
"step": 629
},
{
"epoch": 0.4325437693099897,
"grad_norm": 0.0556640625,
"learning_rate": 9.64855482809184e-06,
"loss": 0.0954,
"step": 630
},
{
"epoch": 0.4332303467215929,
"grad_norm": 0.048828125,
"learning_rate": 9.647274015057568e-06,
"loss": 0.0936,
"step": 631
},
{
"epoch": 0.433916924133196,
"grad_norm": 0.048828125,
"learning_rate": 9.645990967551287e-06,
"loss": 0.0965,
"step": 632
},
{
"epoch": 0.43460350154479915,
"grad_norm": 0.051513671875,
"learning_rate": 9.64470568626428e-06,
"loss": 0.1029,
"step": 633
},
{
"epoch": 0.4352900789564023,
"grad_norm": 0.050048828125,
"learning_rate": 9.64341817188903e-06,
"loss": 0.103,
"step": 634
},
{
"epoch": 0.4359766563680055,
"grad_norm": 0.05615234375,
"learning_rate": 9.642128425119226e-06,
"loss": 0.0952,
"step": 635
},
{
"epoch": 0.43666323377960864,
"grad_norm": 0.049560546875,
"learning_rate": 9.640836446649761e-06,
"loss": 0.09,
"step": 636
},
{
"epoch": 0.4373498111912118,
"grad_norm": 0.05322265625,
"learning_rate": 9.639542237176726e-06,
"loss": 0.0992,
"step": 637
},
{
"epoch": 0.43803638860281496,
"grad_norm": 0.057861328125,
"learning_rate": 9.638245797397418e-06,
"loss": 0.1053,
"step": 638
},
{
"epoch": 0.4387229660144181,
"grad_norm": 0.05615234375,
"learning_rate": 9.636947128010332e-06,
"loss": 0.0963,
"step": 639
},
{
"epoch": 0.4394095434260213,
"grad_norm": 0.0615234375,
"learning_rate": 9.635646229715168e-06,
"loss": 0.0979,
"step": 640
},
{
"epoch": 0.44009612083762445,
"grad_norm": 0.05224609375,
"learning_rate": 9.634343103212824e-06,
"loss": 0.0891,
"step": 641
},
{
"epoch": 0.4407826982492276,
"grad_norm": 0.05712890625,
"learning_rate": 9.6330377492054e-06,
"loss": 0.0994,
"step": 642
},
{
"epoch": 0.4414692756608308,
"grad_norm": 0.0517578125,
"learning_rate": 9.631730168396196e-06,
"loss": 0.0962,
"step": 643
},
{
"epoch": 0.44215585307243394,
"grad_norm": 0.060302734375,
"learning_rate": 9.630420361489711e-06,
"loss": 0.0996,
"step": 644
},
{
"epoch": 0.4428424304840371,
"grad_norm": 0.053955078125,
"learning_rate": 9.629108329191646e-06,
"loss": 0.0907,
"step": 645
},
{
"epoch": 0.44352900789564026,
"grad_norm": 0.05029296875,
"learning_rate": 9.627794072208896e-06,
"loss": 0.0972,
"step": 646
},
{
"epoch": 0.44421558530724337,
"grad_norm": 0.057373046875,
"learning_rate": 9.62647759124956e-06,
"loss": 0.0972,
"step": 647
},
{
"epoch": 0.44490216271884653,
"grad_norm": 0.05224609375,
"learning_rate": 9.62515888702293e-06,
"loss": 0.0909,
"step": 648
},
{
"epoch": 0.4455887401304497,
"grad_norm": 0.05615234375,
"learning_rate": 9.6238379602395e-06,
"loss": 0.0956,
"step": 649
},
{
"epoch": 0.44627531754205285,
"grad_norm": 0.064453125,
"learning_rate": 9.622514811610964e-06,
"loss": 0.103,
"step": 650
},
{
"epoch": 0.446961894953656,
"grad_norm": 0.0556640625,
"learning_rate": 9.621189441850206e-06,
"loss": 0.0925,
"step": 651
},
{
"epoch": 0.4476484723652592,
"grad_norm": 0.046142578125,
"learning_rate": 9.619861851671306e-06,
"loss": 0.0864,
"step": 652
},
{
"epoch": 0.44833504977686234,
"grad_norm": 0.052490234375,
"learning_rate": 9.618532041789548e-06,
"loss": 0.1086,
"step": 653
},
{
"epoch": 0.4490216271884655,
"grad_norm": 0.04931640625,
"learning_rate": 9.61720001292141e-06,
"loss": 0.0875,
"step": 654
},
{
"epoch": 0.44970820460006866,
"grad_norm": 0.0595703125,
"learning_rate": 9.61586576578456e-06,
"loss": 0.108,
"step": 655
},
{
"epoch": 0.4503947820116718,
"grad_norm": 0.052978515625,
"learning_rate": 9.614529301097867e-06,
"loss": 0.097,
"step": 656
},
{
"epoch": 0.451081359423275,
"grad_norm": 0.050537109375,
"learning_rate": 9.613190619581393e-06,
"loss": 0.091,
"step": 657
},
{
"epoch": 0.45176793683487815,
"grad_norm": 0.055419921875,
"learning_rate": 9.611849721956392e-06,
"loss": 0.092,
"step": 658
},
{
"epoch": 0.4524545142464813,
"grad_norm": 0.05908203125,
"learning_rate": 9.610506608945315e-06,
"loss": 0.093,
"step": 659
},
{
"epoch": 0.45314109165808447,
"grad_norm": 0.050048828125,
"learning_rate": 9.609161281271808e-06,
"loss": 0.1124,
"step": 660
},
{
"epoch": 0.45382766906968763,
"grad_norm": 0.052490234375,
"learning_rate": 9.607813739660705e-06,
"loss": 0.0958,
"step": 661
},
{
"epoch": 0.45451424648129074,
"grad_norm": 0.05126953125,
"learning_rate": 9.606463984838034e-06,
"loss": 0.0844,
"step": 662
},
{
"epoch": 0.4552008238928939,
"grad_norm": 0.050048828125,
"learning_rate": 9.605112017531022e-06,
"loss": 0.0888,
"step": 663
},
{
"epoch": 0.45588740130449706,
"grad_norm": 0.056396484375,
"learning_rate": 9.603757838468079e-06,
"loss": 0.0944,
"step": 664
},
{
"epoch": 0.4565739787161002,
"grad_norm": 0.05078125,
"learning_rate": 9.602401448378816e-06,
"loss": 0.1006,
"step": 665
},
{
"epoch": 0.4572605561277034,
"grad_norm": 0.05712890625,
"learning_rate": 9.601042847994021e-06,
"loss": 0.0997,
"step": 666
},
{
"epoch": 0.45794713353930655,
"grad_norm": 0.058837890625,
"learning_rate": 9.59968203804569e-06,
"loss": 0.0903,
"step": 667
},
{
"epoch": 0.4586337109509097,
"grad_norm": 0.056640625,
"learning_rate": 9.598319019267e-06,
"loss": 0.0885,
"step": 668
},
{
"epoch": 0.4593202883625129,
"grad_norm": 0.051513671875,
"learning_rate": 9.596953792392318e-06,
"loss": 0.1093,
"step": 669
},
{
"epoch": 0.46000686577411604,
"grad_norm": 0.05908203125,
"learning_rate": 9.595586358157202e-06,
"loss": 0.1034,
"step": 670
},
{
"epoch": 0.4606934431857192,
"grad_norm": 0.059326171875,
"learning_rate": 9.5942167172984e-06,
"loss": 0.0948,
"step": 671
},
{
"epoch": 0.46138002059732236,
"grad_norm": 0.055419921875,
"learning_rate": 9.592844870553849e-06,
"loss": 0.0891,
"step": 672
},
{
"epoch": 0.4620665980089255,
"grad_norm": 0.0546875,
"learning_rate": 9.591470818662672e-06,
"loss": 0.0923,
"step": 673
},
{
"epoch": 0.4627531754205287,
"grad_norm": 0.0556640625,
"learning_rate": 9.590094562365184e-06,
"loss": 0.0913,
"step": 674
},
{
"epoch": 0.46343975283213185,
"grad_norm": 0.04833984375,
"learning_rate": 9.588716102402882e-06,
"loss": 0.0977,
"step": 675
},
{
"epoch": 0.464126330243735,
"grad_norm": 0.057861328125,
"learning_rate": 9.58733543951846e-06,
"loss": 0.0921,
"step": 676
},
{
"epoch": 0.4648129076553381,
"grad_norm": 0.052734375,
"learning_rate": 9.58595257445579e-06,
"loss": 0.0969,
"step": 677
},
{
"epoch": 0.4654994850669413,
"grad_norm": 0.0537109375,
"learning_rate": 9.584567507959929e-06,
"loss": 0.0944,
"step": 678
},
{
"epoch": 0.46618606247854444,
"grad_norm": 0.053955078125,
"learning_rate": 9.583180240777128e-06,
"loss": 0.0925,
"step": 679
},
{
"epoch": 0.4668726398901476,
"grad_norm": 0.05517578125,
"learning_rate": 9.581790773654821e-06,
"loss": 0.0953,
"step": 680
},
{
"epoch": 0.46755921730175076,
"grad_norm": 0.0458984375,
"learning_rate": 9.580399107341627e-06,
"loss": 0.0962,
"step": 681
},
{
"epoch": 0.4682457947133539,
"grad_norm": 0.0458984375,
"learning_rate": 9.579005242587344e-06,
"loss": 0.0924,
"step": 682
},
{
"epoch": 0.4689323721249571,
"grad_norm": 0.056396484375,
"learning_rate": 9.577609180142967e-06,
"loss": 0.091,
"step": 683
},
{
"epoch": 0.46961894953656025,
"grad_norm": 0.053955078125,
"learning_rate": 9.576210920760662e-06,
"loss": 0.0871,
"step": 684
},
{
"epoch": 0.4703055269481634,
"grad_norm": 0.055419921875,
"learning_rate": 9.574810465193787e-06,
"loss": 0.0861,
"step": 685
},
{
"epoch": 0.4709921043597666,
"grad_norm": 0.055908203125,
"learning_rate": 9.57340781419688e-06,
"loss": 0.0872,
"step": 686
},
{
"epoch": 0.47167868177136973,
"grad_norm": 0.05078125,
"learning_rate": 9.572002968525662e-06,
"loss": 0.0869,
"step": 687
},
{
"epoch": 0.4723652591829729,
"grad_norm": 0.052978515625,
"learning_rate": 9.57059592893704e-06,
"loss": 0.0853,
"step": 688
},
{
"epoch": 0.47305183659457606,
"grad_norm": 0.053955078125,
"learning_rate": 9.569186696189095e-06,
"loss": 0.1048,
"step": 689
},
{
"epoch": 0.4737384140061792,
"grad_norm": 0.0546875,
"learning_rate": 9.567775271041099e-06,
"loss": 0.101,
"step": 690
},
{
"epoch": 0.4744249914177824,
"grad_norm": 0.053955078125,
"learning_rate": 9.566361654253499e-06,
"loss": 0.0919,
"step": 691
},
{
"epoch": 0.4751115688293855,
"grad_norm": 0.05615234375,
"learning_rate": 9.564945846587925e-06,
"loss": 0.0973,
"step": 692
},
{
"epoch": 0.47579814624098865,
"grad_norm": 0.050048828125,
"learning_rate": 9.563527848807186e-06,
"loss": 0.0926,
"step": 693
},
{
"epoch": 0.4764847236525918,
"grad_norm": 0.05615234375,
"learning_rate": 9.562107661675276e-06,
"loss": 0.0939,
"step": 694
},
{
"epoch": 0.477171301064195,
"grad_norm": 0.046875,
"learning_rate": 9.560685285957361e-06,
"loss": 0.0936,
"step": 695
},
{
"epoch": 0.47785787847579814,
"grad_norm": 0.05224609375,
"learning_rate": 9.55926072241979e-06,
"loss": 0.0903,
"step": 696
},
{
"epoch": 0.4785444558874013,
"grad_norm": 0.052001953125,
"learning_rate": 9.557833971830093e-06,
"loss": 0.0903,
"step": 697
},
{
"epoch": 0.47923103329900446,
"grad_norm": 0.052978515625,
"learning_rate": 9.556405034956974e-06,
"loss": 0.089,
"step": 698
},
{
"epoch": 0.4799176107106076,
"grad_norm": 0.053955078125,
"learning_rate": 9.554973912570316e-06,
"loss": 0.098,
"step": 699
},
{
"epoch": 0.4806041881222108,
"grad_norm": 0.054443359375,
"learning_rate": 9.553540605441182e-06,
"loss": 0.0954,
"step": 700
},
{
"epoch": 0.48129076553381395,
"grad_norm": 0.053466796875,
"learning_rate": 9.552105114341811e-06,
"loss": 0.0783,
"step": 701
},
{
"epoch": 0.4819773429454171,
"grad_norm": 0.0634765625,
"learning_rate": 9.550667440045618e-06,
"loss": 0.0872,
"step": 702
},
{
"epoch": 0.48266392035702027,
"grad_norm": 0.0498046875,
"learning_rate": 9.549227583327193e-06,
"loss": 0.0912,
"step": 703
},
{
"epoch": 0.48335049776862343,
"grad_norm": 0.05859375,
"learning_rate": 9.547785544962303e-06,
"loss": 0.0978,
"step": 704
},
{
"epoch": 0.4840370751802266,
"grad_norm": 0.04736328125,
"learning_rate": 9.546341325727893e-06,
"loss": 0.0913,
"step": 705
},
{
"epoch": 0.4847236525918297,
"grad_norm": 0.05419921875,
"learning_rate": 9.54489492640208e-06,
"loss": 0.0833,
"step": 706
},
{
"epoch": 0.48541023000343286,
"grad_norm": 0.046630859375,
"learning_rate": 9.543446347764159e-06,
"loss": 0.086,
"step": 707
},
{
"epoch": 0.486096807415036,
"grad_norm": 0.05126953125,
"learning_rate": 9.541995590594589e-06,
"loss": 0.0908,
"step": 708
},
{
"epoch": 0.4867833848266392,
"grad_norm": 0.061279296875,
"learning_rate": 9.540542655675014e-06,
"loss": 0.1035,
"step": 709
},
{
"epoch": 0.48746996223824235,
"grad_norm": 0.056884765625,
"learning_rate": 9.539087543788251e-06,
"loss": 0.1008,
"step": 710
},
{
"epoch": 0.4881565396498455,
"grad_norm": 0.05078125,
"learning_rate": 9.537630255718285e-06,
"loss": 0.0905,
"step": 711
},
{
"epoch": 0.4888431170614487,
"grad_norm": 0.057861328125,
"learning_rate": 9.53617079225027e-06,
"loss": 0.1008,
"step": 712
},
{
"epoch": 0.48952969447305184,
"grad_norm": 0.047607421875,
"learning_rate": 9.534709154170542e-06,
"loss": 0.0945,
"step": 713
},
{
"epoch": 0.490216271884655,
"grad_norm": 0.051025390625,
"learning_rate": 9.533245342266604e-06,
"loss": 0.0926,
"step": 714
},
{
"epoch": 0.49090284929625816,
"grad_norm": 0.0478515625,
"learning_rate": 9.531779357327125e-06,
"loss": 0.0938,
"step": 715
},
{
"epoch": 0.4915894267078613,
"grad_norm": 0.0595703125,
"learning_rate": 9.530311200141957e-06,
"loss": 0.0921,
"step": 716
},
{
"epoch": 0.4922760041194645,
"grad_norm": 0.052001953125,
"learning_rate": 9.528840871502108e-06,
"loss": 0.0971,
"step": 717
},
{
"epoch": 0.49296258153106765,
"grad_norm": 0.052978515625,
"learning_rate": 9.527368372199767e-06,
"loss": 0.0921,
"step": 718
},
{
"epoch": 0.4936491589426708,
"grad_norm": 0.056884765625,
"learning_rate": 9.525893703028289e-06,
"loss": 0.0892,
"step": 719
},
{
"epoch": 0.49433573635427397,
"grad_norm": 0.055908203125,
"learning_rate": 9.524416864782196e-06,
"loss": 0.0887,
"step": 720
},
{
"epoch": 0.4950223137658771,
"grad_norm": 0.048583984375,
"learning_rate": 9.522937858257177e-06,
"loss": 0.0823,
"step": 721
},
{
"epoch": 0.49570889117748024,
"grad_norm": 0.051513671875,
"learning_rate": 9.5214566842501e-06,
"loss": 0.0999,
"step": 722
},
{
"epoch": 0.4963954685890834,
"grad_norm": 0.052001953125,
"learning_rate": 9.519973343558984e-06,
"loss": 0.0871,
"step": 723
},
{
"epoch": 0.49708204600068656,
"grad_norm": 0.0537109375,
"learning_rate": 9.518487836983035e-06,
"loss": 0.1034,
"step": 724
},
{
"epoch": 0.4977686234122897,
"grad_norm": 0.060791015625,
"learning_rate": 9.517000165322607e-06,
"loss": 0.0851,
"step": 725
},
{
"epoch": 0.4984552008238929,
"grad_norm": 0.062255859375,
"learning_rate": 9.515510329379234e-06,
"loss": 0.0945,
"step": 726
},
{
"epoch": 0.49914177823549605,
"grad_norm": 0.05224609375,
"learning_rate": 9.514018329955608e-06,
"loss": 0.0937,
"step": 727
},
{
"epoch": 0.4998283556470992,
"grad_norm": 0.053466796875,
"learning_rate": 9.51252416785559e-06,
"loss": 0.0926,
"step": 728
},
{
"epoch": 0.5005149330587023,
"grad_norm": 0.045654296875,
"learning_rate": 9.51102784388421e-06,
"loss": 0.1019,
"step": 729
},
{
"epoch": 0.5012015104703055,
"grad_norm": 0.052734375,
"learning_rate": 9.509529358847655e-06,
"loss": 0.1007,
"step": 730
},
{
"epoch": 0.5018880878819086,
"grad_norm": 0.053466796875,
"learning_rate": 9.508028713553282e-06,
"loss": 0.1017,
"step": 731
},
{
"epoch": 0.5025746652935118,
"grad_norm": 0.051025390625,
"learning_rate": 9.50652590880961e-06,
"loss": 0.0902,
"step": 732
},
{
"epoch": 0.503261242705115,
"grad_norm": 0.048828125,
"learning_rate": 9.505020945426318e-06,
"loss": 0.0859,
"step": 733
},
{
"epoch": 0.5039478201167181,
"grad_norm": 0.0439453125,
"learning_rate": 9.503513824214254e-06,
"loss": 0.0862,
"step": 734
},
{
"epoch": 0.5046343975283213,
"grad_norm": 0.048095703125,
"learning_rate": 9.502004545985428e-06,
"loss": 0.096,
"step": 735
},
{
"epoch": 0.5053209749399244,
"grad_norm": 0.06005859375,
"learning_rate": 9.500493111553007e-06,
"loss": 0.1036,
"step": 736
},
{
"epoch": 0.5060075523515276,
"grad_norm": 0.049560546875,
"learning_rate": 9.498979521731327e-06,
"loss": 0.093,
"step": 737
},
{
"epoch": 0.5066941297631308,
"grad_norm": 0.0615234375,
"learning_rate": 9.497463777335875e-06,
"loss": 0.1004,
"step": 738
},
{
"epoch": 0.5073807071747339,
"grad_norm": 0.07666015625,
"learning_rate": 9.495945879183312e-06,
"loss": 0.0959,
"step": 739
},
{
"epoch": 0.5080672845863371,
"grad_norm": 0.049072265625,
"learning_rate": 9.49442582809145e-06,
"loss": 0.0913,
"step": 740
},
{
"epoch": 0.5087538619979403,
"grad_norm": 0.06494140625,
"learning_rate": 9.49290362487926e-06,
"loss": 0.1015,
"step": 741
},
{
"epoch": 0.5094404394095434,
"grad_norm": 0.05322265625,
"learning_rate": 9.49137927036688e-06,
"loss": 0.103,
"step": 742
},
{
"epoch": 0.5101270168211466,
"grad_norm": 0.0625,
"learning_rate": 9.489852765375602e-06,
"loss": 0.0989,
"step": 743
},
{
"epoch": 0.5108135942327497,
"grad_norm": 0.059326171875,
"learning_rate": 9.488324110727878e-06,
"loss": 0.0826,
"step": 744
},
{
"epoch": 0.5115001716443529,
"grad_norm": 0.0732421875,
"learning_rate": 9.486793307247318e-06,
"loss": 0.0959,
"step": 745
},
{
"epoch": 0.5121867490559561,
"grad_norm": 0.0546875,
"learning_rate": 9.48526035575869e-06,
"loss": 0.0856,
"step": 746
},
{
"epoch": 0.5128733264675592,
"grad_norm": 0.05908203125,
"learning_rate": 9.483725257087919e-06,
"loss": 0.0915,
"step": 747
},
{
"epoch": 0.5135599038791624,
"grad_norm": 0.051513671875,
"learning_rate": 9.482188012062084e-06,
"loss": 0.0828,
"step": 748
},
{
"epoch": 0.5142464812907656,
"grad_norm": 0.0634765625,
"learning_rate": 9.480648621509426e-06,
"loss": 0.0934,
"step": 749
},
{
"epoch": 0.5149330587023687,
"grad_norm": 0.05322265625,
"learning_rate": 9.47910708625934e-06,
"loss": 0.0966,
"step": 750
},
{
"epoch": 0.5156196361139719,
"grad_norm": 0.05615234375,
"learning_rate": 9.477563407142372e-06,
"loss": 0.0973,
"step": 751
},
{
"epoch": 0.516306213525575,
"grad_norm": 0.049072265625,
"learning_rate": 9.476017584990229e-06,
"loss": 0.0875,
"step": 752
},
{
"epoch": 0.5169927909371782,
"grad_norm": 0.05322265625,
"learning_rate": 9.474469620635773e-06,
"loss": 0.0918,
"step": 753
},
{
"epoch": 0.5176793683487814,
"grad_norm": 0.052734375,
"learning_rate": 9.472919514913013e-06,
"loss": 0.0888,
"step": 754
},
{
"epoch": 0.5183659457603845,
"grad_norm": 0.05322265625,
"learning_rate": 9.471367268657121e-06,
"loss": 0.097,
"step": 755
},
{
"epoch": 0.5190525231719877,
"grad_norm": 0.055908203125,
"learning_rate": 9.469812882704413e-06,
"loss": 0.0899,
"step": 756
},
{
"epoch": 0.5197391005835909,
"grad_norm": 0.0595703125,
"learning_rate": 9.468256357892367e-06,
"loss": 0.1025,
"step": 757
},
{
"epoch": 0.5204256779951939,
"grad_norm": 0.056640625,
"learning_rate": 9.466697695059604e-06,
"loss": 0.0955,
"step": 758
},
{
"epoch": 0.5211122554067971,
"grad_norm": 0.047119140625,
"learning_rate": 9.465136895045907e-06,
"loss": 0.0967,
"step": 759
},
{
"epoch": 0.5217988328184002,
"grad_norm": 0.0556640625,
"learning_rate": 9.4635739586922e-06,
"loss": 0.0967,
"step": 760
},
{
"epoch": 0.5224854102300034,
"grad_norm": 0.058837890625,
"learning_rate": 9.462008886840567e-06,
"loss": 0.0854,
"step": 761
},
{
"epoch": 0.5231719876416066,
"grad_norm": 0.052001953125,
"learning_rate": 9.460441680334236e-06,
"loss": 0.0929,
"step": 762
},
{
"epoch": 0.5238585650532097,
"grad_norm": 0.0546875,
"learning_rate": 9.458872340017592e-06,
"loss": 0.0887,
"step": 763
},
{
"epoch": 0.5245451424648129,
"grad_norm": 0.051513671875,
"learning_rate": 9.45730086673616e-06,
"loss": 0.1038,
"step": 764
},
{
"epoch": 0.525231719876416,
"grad_norm": 0.057373046875,
"learning_rate": 9.455727261336626e-06,
"loss": 0.0929,
"step": 765
},
{
"epoch": 0.5259182972880192,
"grad_norm": 0.060302734375,
"learning_rate": 9.454151524666815e-06,
"loss": 0.0988,
"step": 766
},
{
"epoch": 0.5266048746996224,
"grad_norm": 0.056396484375,
"learning_rate": 9.452573657575705e-06,
"loss": 0.0942,
"step": 767
},
{
"epoch": 0.5272914521112255,
"grad_norm": 0.05224609375,
"learning_rate": 9.450993660913418e-06,
"loss": 0.0837,
"step": 768
},
{
"epoch": 0.5279780295228287,
"grad_norm": 0.05712890625,
"learning_rate": 9.449411535531227e-06,
"loss": 0.0894,
"step": 769
},
{
"epoch": 0.5286646069344318,
"grad_norm": 0.056884765625,
"learning_rate": 9.447827282281551e-06,
"loss": 0.1023,
"step": 770
},
{
"epoch": 0.529351184346035,
"grad_norm": 0.04736328125,
"learning_rate": 9.44624090201796e-06,
"loss": 0.0952,
"step": 771
},
{
"epoch": 0.5300377617576382,
"grad_norm": 0.06396484375,
"learning_rate": 9.444652395595159e-06,
"loss": 0.1045,
"step": 772
},
{
"epoch": 0.5307243391692413,
"grad_norm": 0.05224609375,
"learning_rate": 9.443061763869007e-06,
"loss": 0.0942,
"step": 773
},
{
"epoch": 0.5314109165808445,
"grad_norm": 0.05322265625,
"learning_rate": 9.44146900769651e-06,
"loss": 0.0997,
"step": 774
},
{
"epoch": 0.5320974939924477,
"grad_norm": 0.054931640625,
"learning_rate": 9.439874127935807e-06,
"loss": 0.0953,
"step": 775
},
{
"epoch": 0.5327840714040508,
"grad_norm": 0.0498046875,
"learning_rate": 9.438277125446194e-06,
"loss": 0.0955,
"step": 776
},
{
"epoch": 0.533470648815654,
"grad_norm": 0.04736328125,
"learning_rate": 9.436678001088106e-06,
"loss": 0.0969,
"step": 777
},
{
"epoch": 0.5341572262272571,
"grad_norm": 0.053955078125,
"learning_rate": 9.435076755723119e-06,
"loss": 0.0874,
"step": 778
},
{
"epoch": 0.5348438036388603,
"grad_norm": 0.050537109375,
"learning_rate": 9.43347339021395e-06,
"loss": 0.0976,
"step": 779
},
{
"epoch": 0.5355303810504635,
"grad_norm": 0.052734375,
"learning_rate": 9.431867905424466e-06,
"loss": 0.0917,
"step": 780
},
{
"epoch": 0.5362169584620666,
"grad_norm": 0.05419921875,
"learning_rate": 9.430260302219672e-06,
"loss": 0.0985,
"step": 781
},
{
"epoch": 0.5369035358736698,
"grad_norm": 0.054931640625,
"learning_rate": 9.428650581465713e-06,
"loss": 0.0945,
"step": 782
},
{
"epoch": 0.537590113285273,
"grad_norm": 0.0546875,
"learning_rate": 9.427038744029872e-06,
"loss": 0.1026,
"step": 783
},
{
"epoch": 0.5382766906968761,
"grad_norm": 0.04931640625,
"learning_rate": 9.425424790780581e-06,
"loss": 0.0837,
"step": 784
},
{
"epoch": 0.5389632681084793,
"grad_norm": 0.04541015625,
"learning_rate": 9.423808722587407e-06,
"loss": 0.0862,
"step": 785
},
{
"epoch": 0.5396498455200824,
"grad_norm": 0.05224609375,
"learning_rate": 9.422190540321055e-06,
"loss": 0.0933,
"step": 786
},
{
"epoch": 0.5403364229316856,
"grad_norm": 0.06396484375,
"learning_rate": 9.42057024485337e-06,
"loss": 0.0911,
"step": 787
},
{
"epoch": 0.5410230003432887,
"grad_norm": 0.057373046875,
"learning_rate": 9.418947837057338e-06,
"loss": 0.0949,
"step": 788
},
{
"epoch": 0.5417095777548918,
"grad_norm": 0.056640625,
"learning_rate": 9.41732331780708e-06,
"loss": 0.0948,
"step": 789
},
{
"epoch": 0.542396155166495,
"grad_norm": 0.064453125,
"learning_rate": 9.415696687977857e-06,
"loss": 0.0874,
"step": 790
},
{
"epoch": 0.5430827325780981,
"grad_norm": 0.048583984375,
"learning_rate": 9.414067948446064e-06,
"loss": 0.0907,
"step": 791
},
{
"epoch": 0.5437693099897013,
"grad_norm": 0.05810546875,
"learning_rate": 9.412437100089236e-06,
"loss": 0.0937,
"step": 792
},
{
"epoch": 0.5444558874013045,
"grad_norm": 0.060546875,
"learning_rate": 9.410804143786046e-06,
"loss": 0.0928,
"step": 793
},
{
"epoch": 0.5451424648129076,
"grad_norm": 0.0537109375,
"learning_rate": 9.409169080416296e-06,
"loss": 0.0875,
"step": 794
},
{
"epoch": 0.5458290422245108,
"grad_norm": 0.05712890625,
"learning_rate": 9.407531910860928e-06,
"loss": 0.0894,
"step": 795
},
{
"epoch": 0.546515619636114,
"grad_norm": 0.053955078125,
"learning_rate": 9.405892636002016e-06,
"loss": 0.0935,
"step": 796
},
{
"epoch": 0.5472021970477171,
"grad_norm": 0.0537109375,
"learning_rate": 9.404251256722772e-06,
"loss": 0.0936,
"step": 797
},
{
"epoch": 0.5478887744593203,
"grad_norm": 0.052490234375,
"learning_rate": 9.402607773907539e-06,
"loss": 0.0898,
"step": 798
},
{
"epoch": 0.5485753518709234,
"grad_norm": 0.0498046875,
"learning_rate": 9.400962188441795e-06,
"loss": 0.098,
"step": 799
},
{
"epoch": 0.5492619292825266,
"grad_norm": 0.05859375,
"learning_rate": 9.39931450121215e-06,
"loss": 0.0848,
"step": 800
},
{
"epoch": 0.5499485066941298,
"grad_norm": 0.058349609375,
"learning_rate": 9.397664713106345e-06,
"loss": 0.1012,
"step": 801
},
{
"epoch": 0.5506350841057329,
"grad_norm": 0.052490234375,
"learning_rate": 9.396012825013256e-06,
"loss": 0.101,
"step": 802
},
{
"epoch": 0.5513216615173361,
"grad_norm": 0.049560546875,
"learning_rate": 9.394358837822886e-06,
"loss": 0.1025,
"step": 803
},
{
"epoch": 0.5520082389289392,
"grad_norm": 0.052490234375,
"learning_rate": 9.392702752426377e-06,
"loss": 0.0948,
"step": 804
},
{
"epoch": 0.5526948163405424,
"grad_norm": 0.053955078125,
"learning_rate": 9.391044569715987e-06,
"loss": 0.0985,
"step": 805
},
{
"epoch": 0.5533813937521456,
"grad_norm": 0.056884765625,
"learning_rate": 9.389384290585123e-06,
"loss": 0.1005,
"step": 806
},
{
"epoch": 0.5540679711637487,
"grad_norm": 0.052734375,
"learning_rate": 9.387721915928309e-06,
"loss": 0.087,
"step": 807
},
{
"epoch": 0.5547545485753519,
"grad_norm": 0.0576171875,
"learning_rate": 9.386057446641195e-06,
"loss": 0.0993,
"step": 808
},
{
"epoch": 0.555441125986955,
"grad_norm": 0.056396484375,
"learning_rate": 9.384390883620573e-06,
"loss": 0.0926,
"step": 809
},
{
"epoch": 0.5561277033985582,
"grad_norm": 0.057373046875,
"learning_rate": 9.38272222776435e-06,
"loss": 0.1032,
"step": 810
},
{
"epoch": 0.5568142808101614,
"grad_norm": 0.0625,
"learning_rate": 9.381051479971569e-06,
"loss": 0.0889,
"step": 811
},
{
"epoch": 0.5575008582217645,
"grad_norm": 0.052490234375,
"learning_rate": 9.379378641142394e-06,
"loss": 0.0935,
"step": 812
},
{
"epoch": 0.5581874356333677,
"grad_norm": 0.05224609375,
"learning_rate": 9.377703712178122e-06,
"loss": 0.1001,
"step": 813
},
{
"epoch": 0.5588740130449709,
"grad_norm": 0.056884765625,
"learning_rate": 9.37602669398117e-06,
"loss": 0.0928,
"step": 814
},
{
"epoch": 0.559560590456574,
"grad_norm": 0.060791015625,
"learning_rate": 9.374347587455087e-06,
"loss": 0.0922,
"step": 815
},
{
"epoch": 0.5602471678681772,
"grad_norm": 0.048828125,
"learning_rate": 9.372666393504537e-06,
"loss": 0.0906,
"step": 816
},
{
"epoch": 0.5609337452797803,
"grad_norm": 0.059814453125,
"learning_rate": 9.370983113035323e-06,
"loss": 0.1046,
"step": 817
},
{
"epoch": 0.5616203226913834,
"grad_norm": 0.05810546875,
"learning_rate": 9.369297746954358e-06,
"loss": 0.0881,
"step": 818
},
{
"epoch": 0.5623069001029866,
"grad_norm": 0.048095703125,
"learning_rate": 9.367610296169689e-06,
"loss": 0.086,
"step": 819
},
{
"epoch": 0.5629934775145897,
"grad_norm": 0.049072265625,
"learning_rate": 9.365920761590478e-06,
"loss": 0.105,
"step": 820
},
{
"epoch": 0.5636800549261929,
"grad_norm": 0.0556640625,
"learning_rate": 9.36422914412702e-06,
"loss": 0.0976,
"step": 821
},
{
"epoch": 0.564366632337796,
"grad_norm": 0.06298828125,
"learning_rate": 9.362535444690721e-06,
"loss": 0.0834,
"step": 822
},
{
"epoch": 0.5650532097493992,
"grad_norm": 0.050048828125,
"learning_rate": 9.360839664194116e-06,
"loss": 0.098,
"step": 823
},
{
"epoch": 0.5657397871610024,
"grad_norm": 0.050537109375,
"learning_rate": 9.35914180355086e-06,
"loss": 0.0978,
"step": 824
},
{
"epoch": 0.5664263645726055,
"grad_norm": 0.052734375,
"learning_rate": 9.357441863675727e-06,
"loss": 0.1036,
"step": 825
},
{
"epoch": 0.5671129419842087,
"grad_norm": 0.05810546875,
"learning_rate": 9.355739845484611e-06,
"loss": 0.0887,
"step": 826
},
{
"epoch": 0.5677995193958119,
"grad_norm": 0.054931640625,
"learning_rate": 9.354035749894527e-06,
"loss": 0.0994,
"step": 827
},
{
"epoch": 0.568486096807415,
"grad_norm": 0.049560546875,
"learning_rate": 9.352329577823613e-06,
"loss": 0.0922,
"step": 828
},
{
"epoch": 0.5691726742190182,
"grad_norm": 0.052978515625,
"learning_rate": 9.350621330191116e-06,
"loss": 0.0962,
"step": 829
},
{
"epoch": 0.5698592516306213,
"grad_norm": 0.051025390625,
"learning_rate": 9.348911007917411e-06,
"loss": 0.0928,
"step": 830
},
{
"epoch": 0.5705458290422245,
"grad_norm": 0.051025390625,
"learning_rate": 9.347198611923986e-06,
"loss": 0.0984,
"step": 831
},
{
"epoch": 0.5712324064538277,
"grad_norm": 0.0576171875,
"learning_rate": 9.345484143133447e-06,
"loss": 0.0835,
"step": 832
},
{
"epoch": 0.5719189838654308,
"grad_norm": 0.052001953125,
"learning_rate": 9.343767602469519e-06,
"loss": 0.0918,
"step": 833
},
{
"epoch": 0.572605561277034,
"grad_norm": 0.053466796875,
"learning_rate": 9.342048990857037e-06,
"loss": 0.0963,
"step": 834
},
{
"epoch": 0.5732921386886372,
"grad_norm": 0.050048828125,
"learning_rate": 9.340328309221962e-06,
"loss": 0.0853,
"step": 835
},
{
"epoch": 0.5739787161002403,
"grad_norm": 0.047119140625,
"learning_rate": 9.33860555849136e-06,
"loss": 0.0893,
"step": 836
},
{
"epoch": 0.5746652935118435,
"grad_norm": 0.05419921875,
"learning_rate": 9.336880739593415e-06,
"loss": 0.1013,
"step": 837
},
{
"epoch": 0.5753518709234466,
"grad_norm": 0.06103515625,
"learning_rate": 9.335153853457431e-06,
"loss": 0.0981,
"step": 838
},
{
"epoch": 0.5760384483350498,
"grad_norm": 0.046875,
"learning_rate": 9.333424901013818e-06,
"loss": 0.0936,
"step": 839
},
{
"epoch": 0.576725025746653,
"grad_norm": 0.0498046875,
"learning_rate": 9.331693883194105e-06,
"loss": 0.0915,
"step": 840
},
{
"epoch": 0.5774116031582561,
"grad_norm": 0.046142578125,
"learning_rate": 9.329960800930929e-06,
"loss": 0.0912,
"step": 841
},
{
"epoch": 0.5780981805698593,
"grad_norm": 0.05419921875,
"learning_rate": 9.328225655158045e-06,
"loss": 0.094,
"step": 842
},
{
"epoch": 0.5787847579814624,
"grad_norm": 0.055419921875,
"learning_rate": 9.32648844681031e-06,
"loss": 0.0873,
"step": 843
},
{
"epoch": 0.5794713353930656,
"grad_norm": 0.047119140625,
"learning_rate": 9.324749176823704e-06,
"loss": 0.0872,
"step": 844
},
{
"epoch": 0.5801579128046688,
"grad_norm": 0.05810546875,
"learning_rate": 9.323007846135312e-06,
"loss": 0.1131,
"step": 845
},
{
"epoch": 0.5808444902162719,
"grad_norm": 0.05615234375,
"learning_rate": 9.321264455683327e-06,
"loss": 0.1014,
"step": 846
},
{
"epoch": 0.5815310676278751,
"grad_norm": 0.052734375,
"learning_rate": 9.31951900640706e-06,
"loss": 0.0864,
"step": 847
},
{
"epoch": 0.5822176450394781,
"grad_norm": 0.055419921875,
"learning_rate": 9.317771499246918e-06,
"loss": 0.0848,
"step": 848
},
{
"epoch": 0.5829042224510813,
"grad_norm": 0.05322265625,
"learning_rate": 9.316021935144431e-06,
"loss": 0.091,
"step": 849
},
{
"epoch": 0.5835907998626845,
"grad_norm": 0.05126953125,
"learning_rate": 9.314270315042225e-06,
"loss": 0.1072,
"step": 850
},
{
"epoch": 0.5842773772742876,
"grad_norm": 0.050048828125,
"learning_rate": 9.312516639884047e-06,
"loss": 0.0895,
"step": 851
},
{
"epoch": 0.5849639546858908,
"grad_norm": 0.0634765625,
"learning_rate": 9.310760910614736e-06,
"loss": 0.0877,
"step": 852
},
{
"epoch": 0.585650532097494,
"grad_norm": 0.058837890625,
"learning_rate": 9.309003128180249e-06,
"loss": 0.092,
"step": 853
},
{
"epoch": 0.5863371095090971,
"grad_norm": 0.060302734375,
"learning_rate": 9.307243293527645e-06,
"loss": 0.1001,
"step": 854
},
{
"epoch": 0.5870236869207003,
"grad_norm": 0.056884765625,
"learning_rate": 9.30548140760509e-06,
"loss": 0.091,
"step": 855
},
{
"epoch": 0.5877102643323034,
"grad_norm": 0.057373046875,
"learning_rate": 9.303717471361855e-06,
"loss": 0.0973,
"step": 856
},
{
"epoch": 0.5883968417439066,
"grad_norm": 0.0625,
"learning_rate": 9.301951485748314e-06,
"loss": 0.0982,
"step": 857
},
{
"epoch": 0.5890834191555098,
"grad_norm": 0.052734375,
"learning_rate": 9.300183451715945e-06,
"loss": 0.0876,
"step": 858
},
{
"epoch": 0.5897699965671129,
"grad_norm": 0.05322265625,
"learning_rate": 9.298413370217333e-06,
"loss": 0.0829,
"step": 859
},
{
"epoch": 0.5904565739787161,
"grad_norm": 0.0537109375,
"learning_rate": 9.296641242206165e-06,
"loss": 0.0866,
"step": 860
},
{
"epoch": 0.5911431513903193,
"grad_norm": 0.048095703125,
"learning_rate": 9.294867068637227e-06,
"loss": 0.0947,
"step": 861
},
{
"epoch": 0.5918297288019224,
"grad_norm": 0.049560546875,
"learning_rate": 9.29309085046641e-06,
"loss": 0.102,
"step": 862
},
{
"epoch": 0.5925163062135256,
"grad_norm": 0.052490234375,
"learning_rate": 9.29131258865071e-06,
"loss": 0.0918,
"step": 863
},
{
"epoch": 0.5932028836251287,
"grad_norm": 0.053466796875,
"learning_rate": 9.289532284148218e-06,
"loss": 0.0869,
"step": 864
},
{
"epoch": 0.5938894610367319,
"grad_norm": 0.06396484375,
"learning_rate": 9.287749937918125e-06,
"loss": 0.0993,
"step": 865
},
{
"epoch": 0.5945760384483351,
"grad_norm": 0.05908203125,
"learning_rate": 9.285965550920732e-06,
"loss": 0.1037,
"step": 866
},
{
"epoch": 0.5952626158599382,
"grad_norm": 0.052978515625,
"learning_rate": 9.284179124117426e-06,
"loss": 0.0958,
"step": 867
},
{
"epoch": 0.5959491932715414,
"grad_norm": 0.0595703125,
"learning_rate": 9.282390658470703e-06,
"loss": 0.1054,
"step": 868
},
{
"epoch": 0.5966357706831446,
"grad_norm": 0.056884765625,
"learning_rate": 9.280600154944153e-06,
"loss": 0.0934,
"step": 869
},
{
"epoch": 0.5973223480947477,
"grad_norm": 0.05810546875,
"learning_rate": 9.278807614502467e-06,
"loss": 0.0926,
"step": 870
},
{
"epoch": 0.5980089255063509,
"grad_norm": 0.05517578125,
"learning_rate": 9.27701303811143e-06,
"loss": 0.0886,
"step": 871
},
{
"epoch": 0.598695502917954,
"grad_norm": 0.0537109375,
"learning_rate": 9.275216426737924e-06,
"loss": 0.0868,
"step": 872
},
{
"epoch": 0.5993820803295572,
"grad_norm": 0.05419921875,
"learning_rate": 9.273417781349933e-06,
"loss": 0.0865,
"step": 873
},
{
"epoch": 0.6000686577411604,
"grad_norm": 0.055908203125,
"learning_rate": 9.271617102916528e-06,
"loss": 0.0964,
"step": 874
},
{
"epoch": 0.6007552351527635,
"grad_norm": 0.056640625,
"learning_rate": 9.269814392407883e-06,
"loss": 0.0864,
"step": 875
},
{
"epoch": 0.6014418125643667,
"grad_norm": 0.061279296875,
"learning_rate": 9.268009650795264e-06,
"loss": 0.1008,
"step": 876
},
{
"epoch": 0.6021283899759697,
"grad_norm": 0.05029296875,
"learning_rate": 9.26620287905103e-06,
"loss": 0.0897,
"step": 877
},
{
"epoch": 0.6028149673875729,
"grad_norm": 0.050537109375,
"learning_rate": 9.264394078148636e-06,
"loss": 0.0867,
"step": 878
},
{
"epoch": 0.6035015447991761,
"grad_norm": 0.05712890625,
"learning_rate": 9.26258324906263e-06,
"loss": 0.086,
"step": 879
},
{
"epoch": 0.6041881222107792,
"grad_norm": 0.0478515625,
"learning_rate": 9.260770392768652e-06,
"loss": 0.0777,
"step": 880
},
{
"epoch": 0.6048746996223824,
"grad_norm": 0.056884765625,
"learning_rate": 9.258955510243431e-06,
"loss": 0.0914,
"step": 881
},
{
"epoch": 0.6055612770339855,
"grad_norm": 0.05322265625,
"learning_rate": 9.257138602464795e-06,
"loss": 0.0919,
"step": 882
},
{
"epoch": 0.6062478544455887,
"grad_norm": 0.05322265625,
"learning_rate": 9.255319670411658e-06,
"loss": 0.0873,
"step": 883
},
{
"epoch": 0.6069344318571919,
"grad_norm": 0.0576171875,
"learning_rate": 9.253498715064025e-06,
"loss": 0.0908,
"step": 884
},
{
"epoch": 0.607621009268795,
"grad_norm": 0.05322265625,
"learning_rate": 9.251675737402992e-06,
"loss": 0.0957,
"step": 885
},
{
"epoch": 0.6083075866803982,
"grad_norm": 0.055908203125,
"learning_rate": 9.249850738410749e-06,
"loss": 0.0968,
"step": 886
},
{
"epoch": 0.6089941640920014,
"grad_norm": 0.051025390625,
"learning_rate": 9.248023719070563e-06,
"loss": 0.091,
"step": 887
},
{
"epoch": 0.6096807415036045,
"grad_norm": 0.06591796875,
"learning_rate": 9.246194680366802e-06,
"loss": 0.0961,
"step": 888
},
{
"epoch": 0.6103673189152077,
"grad_norm": 0.04638671875,
"learning_rate": 9.244363623284916e-06,
"loss": 0.0899,
"step": 889
},
{
"epoch": 0.6110538963268108,
"grad_norm": 0.0576171875,
"learning_rate": 9.242530548811444e-06,
"loss": 0.0935,
"step": 890
},
{
"epoch": 0.611740473738414,
"grad_norm": 0.06298828125,
"learning_rate": 9.240695457934012e-06,
"loss": 0.1028,
"step": 891
},
{
"epoch": 0.6124270511500172,
"grad_norm": 0.055908203125,
"learning_rate": 9.23885835164133e-06,
"loss": 0.0843,
"step": 892
},
{
"epoch": 0.6131136285616203,
"grad_norm": 0.0537109375,
"learning_rate": 9.237019230923196e-06,
"loss": 0.0933,
"step": 893
},
{
"epoch": 0.6138002059732235,
"grad_norm": 0.052734375,
"learning_rate": 9.235178096770494e-06,
"loss": 0.101,
"step": 894
},
{
"epoch": 0.6144867833848267,
"grad_norm": 0.051025390625,
"learning_rate": 9.233334950175194e-06,
"loss": 0.0922,
"step": 895
},
{
"epoch": 0.6151733607964298,
"grad_norm": 0.05517578125,
"learning_rate": 9.231489792130343e-06,
"loss": 0.0838,
"step": 896
},
{
"epoch": 0.615859938208033,
"grad_norm": 0.06494140625,
"learning_rate": 9.229642623630081e-06,
"loss": 0.0853,
"step": 897
},
{
"epoch": 0.6165465156196361,
"grad_norm": 0.0595703125,
"learning_rate": 9.227793445669627e-06,
"loss": 0.0868,
"step": 898
},
{
"epoch": 0.6172330930312393,
"grad_norm": 0.056884765625,
"learning_rate": 9.225942259245281e-06,
"loss": 0.0944,
"step": 899
},
{
"epoch": 0.6179196704428425,
"grad_norm": 0.068359375,
"learning_rate": 9.224089065354428e-06,
"loss": 0.1033,
"step": 900
},
{
"epoch": 0.6186062478544456,
"grad_norm": 0.05615234375,
"learning_rate": 9.222233864995533e-06,
"loss": 0.0969,
"step": 901
},
{
"epoch": 0.6192928252660488,
"grad_norm": 0.06103515625,
"learning_rate": 9.220376659168141e-06,
"loss": 0.105,
"step": 902
},
{
"epoch": 0.619979402677652,
"grad_norm": 0.061767578125,
"learning_rate": 9.21851744887288e-06,
"loss": 0.095,
"step": 903
},
{
"epoch": 0.6206659800892551,
"grad_norm": 0.0546875,
"learning_rate": 9.216656235111463e-06,
"loss": 0.0905,
"step": 904
},
{
"epoch": 0.6213525575008583,
"grad_norm": 0.06298828125,
"learning_rate": 9.214793018886666e-06,
"loss": 0.095,
"step": 905
},
{
"epoch": 0.6220391349124614,
"grad_norm": 0.050537109375,
"learning_rate": 9.212927801202361e-06,
"loss": 0.0948,
"step": 906
},
{
"epoch": 0.6227257123240645,
"grad_norm": 0.049072265625,
"learning_rate": 9.211060583063489e-06,
"loss": 0.0764,
"step": 907
},
{
"epoch": 0.6234122897356676,
"grad_norm": 0.056884765625,
"learning_rate": 9.209191365476074e-06,
"loss": 0.1005,
"step": 908
},
{
"epoch": 0.6240988671472708,
"grad_norm": 0.049560546875,
"learning_rate": 9.207320149447212e-06,
"loss": 0.0921,
"step": 909
},
{
"epoch": 0.624785444558874,
"grad_norm": 0.0517578125,
"learning_rate": 9.20544693598508e-06,
"loss": 0.0889,
"step": 910
},
{
"epoch": 0.6254720219704771,
"grad_norm": 0.060791015625,
"learning_rate": 9.20357172609893e-06,
"loss": 0.0899,
"step": 911
},
{
"epoch": 0.6261585993820803,
"grad_norm": 0.053955078125,
"learning_rate": 9.201694520799086e-06,
"loss": 0.0969,
"step": 912
},
{
"epoch": 0.6268451767936835,
"grad_norm": 0.05615234375,
"learning_rate": 9.199815321096953e-06,
"loss": 0.0914,
"step": 913
},
{
"epoch": 0.6275317542052866,
"grad_norm": 0.06201171875,
"learning_rate": 9.19793412800501e-06,
"loss": 0.0913,
"step": 914
},
{
"epoch": 0.6282183316168898,
"grad_norm": 0.05908203125,
"learning_rate": 9.196050942536806e-06,
"loss": 0.0856,
"step": 915
},
{
"epoch": 0.6289049090284929,
"grad_norm": 0.050048828125,
"learning_rate": 9.194165765706963e-06,
"loss": 0.0999,
"step": 916
},
{
"epoch": 0.6295914864400961,
"grad_norm": 0.0595703125,
"learning_rate": 9.192278598531182e-06,
"loss": 0.0952,
"step": 917
},
{
"epoch": 0.6302780638516993,
"grad_norm": 0.06298828125,
"learning_rate": 9.19038944202623e-06,
"loss": 0.0897,
"step": 918
},
{
"epoch": 0.6309646412633024,
"grad_norm": 0.054931640625,
"learning_rate": 9.18849829720995e-06,
"loss": 0.0901,
"step": 919
},
{
"epoch": 0.6316512186749056,
"grad_norm": 0.056884765625,
"learning_rate": 9.186605165101253e-06,
"loss": 0.1012,
"step": 920
},
{
"epoch": 0.6323377960865088,
"grad_norm": 0.055908203125,
"learning_rate": 9.184710046720123e-06,
"loss": 0.0927,
"step": 921
},
{
"epoch": 0.6330243734981119,
"grad_norm": 0.05126953125,
"learning_rate": 9.182812943087614e-06,
"loss": 0.0838,
"step": 922
},
{
"epoch": 0.6337109509097151,
"grad_norm": 0.053955078125,
"learning_rate": 9.18091385522585e-06,
"loss": 0.0855,
"step": 923
},
{
"epoch": 0.6343975283213182,
"grad_norm": 0.049072265625,
"learning_rate": 9.179012784158023e-06,
"loss": 0.0889,
"step": 924
},
{
"epoch": 0.6350841057329214,
"grad_norm": 0.0556640625,
"learning_rate": 9.177109730908393e-06,
"loss": 0.0857,
"step": 925
},
{
"epoch": 0.6357706831445246,
"grad_norm": 0.047607421875,
"learning_rate": 9.175204696502288e-06,
"loss": 0.103,
"step": 926
},
{
"epoch": 0.6364572605561277,
"grad_norm": 0.054931640625,
"learning_rate": 9.173297681966105e-06,
"loss": 0.0952,
"step": 927
},
{
"epoch": 0.6371438379677309,
"grad_norm": 0.05126953125,
"learning_rate": 9.171388688327307e-06,
"loss": 0.0926,
"step": 928
},
{
"epoch": 0.637830415379334,
"grad_norm": 0.06396484375,
"learning_rate": 9.169477716614425e-06,
"loss": 0.0887,
"step": 929
},
{
"epoch": 0.6385169927909372,
"grad_norm": 0.05126953125,
"learning_rate": 9.167564767857052e-06,
"loss": 0.0798,
"step": 930
},
{
"epoch": 0.6392035702025404,
"grad_norm": 0.05322265625,
"learning_rate": 9.165649843085848e-06,
"loss": 0.0948,
"step": 931
},
{
"epoch": 0.6398901476141435,
"grad_norm": 0.050048828125,
"learning_rate": 9.163732943332539e-06,
"loss": 0.0893,
"step": 932
},
{
"epoch": 0.6405767250257467,
"grad_norm": 0.06494140625,
"learning_rate": 9.161814069629914e-06,
"loss": 0.0857,
"step": 933
},
{
"epoch": 0.6412633024373499,
"grad_norm": 0.06494140625,
"learning_rate": 9.159893223011824e-06,
"loss": 0.0977,
"step": 934
},
{
"epoch": 0.641949879848953,
"grad_norm": 0.052734375,
"learning_rate": 9.157970404513185e-06,
"loss": 0.0938,
"step": 935
},
{
"epoch": 0.6426364572605562,
"grad_norm": 0.059326171875,
"learning_rate": 9.156045615169978e-06,
"loss": 0.0931,
"step": 936
},
{
"epoch": 0.6433230346721592,
"grad_norm": 0.06298828125,
"learning_rate": 9.154118856019239e-06,
"loss": 0.0912,
"step": 937
},
{
"epoch": 0.6440096120837624,
"grad_norm": 0.054443359375,
"learning_rate": 9.15219012809907e-06,
"loss": 0.0838,
"step": 938
},
{
"epoch": 0.6446961894953656,
"grad_norm": 0.0654296875,
"learning_rate": 9.150259432448632e-06,
"loss": 0.0969,
"step": 939
},
{
"epoch": 0.6453827669069687,
"grad_norm": 0.0556640625,
"learning_rate": 9.148326770108147e-06,
"loss": 0.0904,
"step": 940
},
{
"epoch": 0.6460693443185719,
"grad_norm": 0.05712890625,
"learning_rate": 9.146392142118899e-06,
"loss": 0.092,
"step": 941
},
{
"epoch": 0.646755921730175,
"grad_norm": 0.0654296875,
"learning_rate": 9.144455549523227e-06,
"loss": 0.0849,
"step": 942
},
{
"epoch": 0.6474424991417782,
"grad_norm": 0.053466796875,
"learning_rate": 9.14251699336453e-06,
"loss": 0.0963,
"step": 943
},
{
"epoch": 0.6481290765533814,
"grad_norm": 0.0537109375,
"learning_rate": 9.140576474687263e-06,
"loss": 0.087,
"step": 944
},
{
"epoch": 0.6488156539649845,
"grad_norm": 0.052734375,
"learning_rate": 9.138633994536945e-06,
"loss": 0.0896,
"step": 945
},
{
"epoch": 0.6495022313765877,
"grad_norm": 0.051513671875,
"learning_rate": 9.136689553960144e-06,
"loss": 0.0901,
"step": 946
},
{
"epoch": 0.6501888087881909,
"grad_norm": 0.058837890625,
"learning_rate": 9.134743154004488e-06,
"loss": 0.0904,
"step": 947
},
{
"epoch": 0.650875386199794,
"grad_norm": 0.06396484375,
"learning_rate": 9.132794795718662e-06,
"loss": 0.0975,
"step": 948
},
{
"epoch": 0.6515619636113972,
"grad_norm": 0.06298828125,
"learning_rate": 9.1308444801524e-06,
"loss": 0.1025,
"step": 949
},
{
"epoch": 0.6522485410230003,
"grad_norm": 0.055419921875,
"learning_rate": 9.128892208356496e-06,
"loss": 0.0894,
"step": 950
},
{
"epoch": 0.6529351184346035,
"grad_norm": 0.0498046875,
"learning_rate": 9.126937981382802e-06,
"loss": 0.0826,
"step": 951
},
{
"epoch": 0.6536216958462067,
"grad_norm": 0.04931640625,
"learning_rate": 9.12498180028421e-06,
"loss": 0.092,
"step": 952
},
{
"epoch": 0.6543082732578098,
"grad_norm": 0.056396484375,
"learning_rate": 9.12302366611468e-06,
"loss": 0.0983,
"step": 953
},
{
"epoch": 0.654994850669413,
"grad_norm": 0.0517578125,
"learning_rate": 9.121063579929214e-06,
"loss": 0.0926,
"step": 954
},
{
"epoch": 0.6556814280810161,
"grad_norm": 0.054443359375,
"learning_rate": 9.119101542783868e-06,
"loss": 0.0969,
"step": 955
},
{
"epoch": 0.6563680054926193,
"grad_norm": 0.05419921875,
"learning_rate": 9.117137555735753e-06,
"loss": 0.0981,
"step": 956
},
{
"epoch": 0.6570545829042225,
"grad_norm": 0.05126953125,
"learning_rate": 9.115171619843025e-06,
"loss": 0.0978,
"step": 957
},
{
"epoch": 0.6577411603158256,
"grad_norm": 0.05859375,
"learning_rate": 9.113203736164894e-06,
"loss": 0.0913,
"step": 958
},
{
"epoch": 0.6584277377274288,
"grad_norm": 0.05810546875,
"learning_rate": 9.111233905761618e-06,
"loss": 0.0937,
"step": 959
},
{
"epoch": 0.659114315139032,
"grad_norm": 0.062255859375,
"learning_rate": 9.109262129694506e-06,
"loss": 0.0891,
"step": 960
},
{
"epoch": 0.6598008925506351,
"grad_norm": 0.05712890625,
"learning_rate": 9.107288409025909e-06,
"loss": 0.1115,
"step": 961
},
{
"epoch": 0.6604874699622383,
"grad_norm": 0.059814453125,
"learning_rate": 9.105312744819232e-06,
"loss": 0.0859,
"step": 962
},
{
"epoch": 0.6611740473738414,
"grad_norm": 0.052001953125,
"learning_rate": 9.103335138138926e-06,
"loss": 0.1066,
"step": 963
},
{
"epoch": 0.6618606247854446,
"grad_norm": 0.05126953125,
"learning_rate": 9.101355590050489e-06,
"loss": 0.092,
"step": 964
},
{
"epoch": 0.6625472021970478,
"grad_norm": 0.05615234375,
"learning_rate": 9.09937410162046e-06,
"loss": 0.0964,
"step": 965
},
{
"epoch": 0.6632337796086509,
"grad_norm": 0.056640625,
"learning_rate": 9.09739067391643e-06,
"loss": 0.093,
"step": 966
},
{
"epoch": 0.663920357020254,
"grad_norm": 0.052978515625,
"learning_rate": 9.095405308007029e-06,
"loss": 0.0849,
"step": 967
},
{
"epoch": 0.6646069344318571,
"grad_norm": 0.056640625,
"learning_rate": 9.093418004961939e-06,
"loss": 0.0884,
"step": 968
},
{
"epoch": 0.6652935118434603,
"grad_norm": 0.05224609375,
"learning_rate": 9.091428765851877e-06,
"loss": 0.0883,
"step": 969
},
{
"epoch": 0.6659800892550635,
"grad_norm": 0.056640625,
"learning_rate": 9.089437591748607e-06,
"loss": 0.09,
"step": 970
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.053466796875,
"learning_rate": 9.087444483724937e-06,
"loss": 0.0873,
"step": 971
},
{
"epoch": 0.6673532440782698,
"grad_norm": 0.053466796875,
"learning_rate": 9.085449442854715e-06,
"loss": 0.0878,
"step": 972
},
{
"epoch": 0.668039821489873,
"grad_norm": 0.049560546875,
"learning_rate": 9.08345247021283e-06,
"loss": 0.0842,
"step": 973
},
{
"epoch": 0.6687263989014761,
"grad_norm": 0.0498046875,
"learning_rate": 9.081453566875215e-06,
"loss": 0.1038,
"step": 974
},
{
"epoch": 0.6694129763130793,
"grad_norm": 0.06103515625,
"learning_rate": 9.079452733918841e-06,
"loss": 0.0974,
"step": 975
},
{
"epoch": 0.6700995537246824,
"grad_norm": 0.05615234375,
"learning_rate": 9.077449972421716e-06,
"loss": 0.0938,
"step": 976
},
{
"epoch": 0.6707861311362856,
"grad_norm": 0.05615234375,
"learning_rate": 9.07544528346289e-06,
"loss": 0.1028,
"step": 977
},
{
"epoch": 0.6714727085478888,
"grad_norm": 0.052001953125,
"learning_rate": 9.073438668122454e-06,
"loss": 0.0789,
"step": 978
},
{
"epoch": 0.6721592859594919,
"grad_norm": 0.049072265625,
"learning_rate": 9.07143012748153e-06,
"loss": 0.0985,
"step": 979
},
{
"epoch": 0.6728458633710951,
"grad_norm": 0.056396484375,
"learning_rate": 9.069419662622284e-06,
"loss": 0.096,
"step": 980
},
{
"epoch": 0.6735324407826982,
"grad_norm": 0.06591796875,
"learning_rate": 9.067407274627917e-06,
"loss": 0.0943,
"step": 981
},
{
"epoch": 0.6742190181943014,
"grad_norm": 0.06640625,
"learning_rate": 9.06539296458266e-06,
"loss": 0.0937,
"step": 982
},
{
"epoch": 0.6749055956059046,
"grad_norm": 0.052490234375,
"learning_rate": 9.063376733571791e-06,
"loss": 0.091,
"step": 983
},
{
"epoch": 0.6755921730175077,
"grad_norm": 0.056396484375,
"learning_rate": 9.061358582681614e-06,
"loss": 0.092,
"step": 984
},
{
"epoch": 0.6762787504291109,
"grad_norm": 0.05712890625,
"learning_rate": 9.05933851299947e-06,
"loss": 0.0982,
"step": 985
},
{
"epoch": 0.6769653278407141,
"grad_norm": 0.0634765625,
"learning_rate": 9.057316525613735e-06,
"loss": 0.0859,
"step": 986
},
{
"epoch": 0.6776519052523172,
"grad_norm": 0.060546875,
"learning_rate": 9.055292621613815e-06,
"loss": 0.1053,
"step": 987
},
{
"epoch": 0.6783384826639204,
"grad_norm": 0.05419921875,
"learning_rate": 9.053266802090152e-06,
"loss": 0.0977,
"step": 988
},
{
"epoch": 0.6790250600755235,
"grad_norm": 0.052978515625,
"learning_rate": 9.051239068134221e-06,
"loss": 0.0913,
"step": 989
},
{
"epoch": 0.6797116374871267,
"grad_norm": 0.06005859375,
"learning_rate": 9.049209420838522e-06,
"loss": 0.0874,
"step": 990
},
{
"epoch": 0.6803982148987299,
"grad_norm": 0.056640625,
"learning_rate": 9.047177861296595e-06,
"loss": 0.0895,
"step": 991
},
{
"epoch": 0.681084792310333,
"grad_norm": 0.057373046875,
"learning_rate": 9.045144390603e-06,
"loss": 0.0886,
"step": 992
},
{
"epoch": 0.6817713697219362,
"grad_norm": 0.054931640625,
"learning_rate": 9.043109009853337e-06,
"loss": 0.0862,
"step": 993
},
{
"epoch": 0.6824579471335394,
"grad_norm": 0.047119140625,
"learning_rate": 9.041071720144232e-06,
"loss": 0.0853,
"step": 994
},
{
"epoch": 0.6831445245451425,
"grad_norm": 0.060546875,
"learning_rate": 9.039032522573328e-06,
"loss": 0.1006,
"step": 995
},
{
"epoch": 0.6838311019567456,
"grad_norm": 0.0712890625,
"learning_rate": 9.036991418239316e-06,
"loss": 0.0803,
"step": 996
},
{
"epoch": 0.6845176793683487,
"grad_norm": 0.0576171875,
"learning_rate": 9.034948408241898e-06,
"loss": 0.0875,
"step": 997
},
{
"epoch": 0.6852042567799519,
"grad_norm": 0.06689453125,
"learning_rate": 9.032903493681811e-06,
"loss": 0.085,
"step": 998
},
{
"epoch": 0.685890834191555,
"grad_norm": 0.05615234375,
"learning_rate": 9.030856675660816e-06,
"loss": 0.0925,
"step": 999
},
{
"epoch": 0.6865774116031582,
"grad_norm": 0.053466796875,
"learning_rate": 9.028807955281701e-06,
"loss": 0.0915,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 4368,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.08857892274176e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}