{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1141552511415525,
"eval_steps": 0,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00022831050228310502,
"grad_norm": 59.67314529418945,
"learning_rate": 0.0,
"loss": 10.6752,
"step": 1
},
{
"epoch": 0.00045662100456621003,
"grad_norm": 17.693151473999023,
"learning_rate": 0.0002559580248098155,
"loss": 0.6248,
"step": 2
},
{
"epoch": 0.0006849315068493151,
"grad_norm": 18.046871185302734,
"learning_rate": 0.00040568387108221287,
"loss": 0.6678,
"step": 3
},
{
"epoch": 0.0009132420091324201,
"grad_norm": 0.9820695519447327,
"learning_rate": 0.000511916049619631,
"loss": 0.0564,
"step": 4
},
{
"epoch": 0.001141552511415525,
"grad_norm": 0.2884497046470642,
"learning_rate": 0.000594316128917787,
"loss": 0.034,
"step": 5
},
{
"epoch": 0.0013698630136986301,
"grad_norm": 0.0833420529961586,
"learning_rate": 0.0006616418958920283,
"loss": 0.0307,
"step": 6
},
{
"epoch": 0.0015981735159817352,
"grad_norm": 0.10984054207801819,
"learning_rate": 0.0007185650207899778,
"loss": 0.0311,
"step": 7
},
{
"epoch": 0.0018264840182648401,
"grad_norm": 0.13283254206180573,
"learning_rate": 0.0007678740744294463,
"loss": 0.032,
"step": 8
},
{
"epoch": 0.002054794520547945,
"grad_norm": 0.14496515691280365,
"learning_rate": 0.0008113677421644257,
"loss": 0.0303,
"step": 9
},
{
"epoch": 0.00228310502283105,
"grad_norm": 0.14172254502773285,
"learning_rate": 0.0008502741537276026,
"loss": 0.0294,
"step": 10
},
{
"epoch": 0.002511415525114155,
"grad_norm": 0.11957576870918274,
"learning_rate": 0.0008854692840710254,
"loss": 0.0302,
"step": 11
},
{
"epoch": 0.0027397260273972603,
"grad_norm": 0.11714160442352295,
"learning_rate": 0.0009175999207018438,
"loss": 0.0283,
"step": 12
},
{
"epoch": 0.0029680365296803654,
"grad_norm": 0.12587222456932068,
"learning_rate": 0.0009471572411831842,
"loss": 0.0284,
"step": 13
},
{
"epoch": 0.0031963470319634705,
"grad_norm": 0.13972364366054535,
"learning_rate": 0.0009745230455997932,
"loss": 0.0292,
"step": 14
},
{
"epoch": 0.003424657534246575,
"grad_norm": 0.1309085339307785,
"learning_rate": 0.0009999999999999998,
"loss": 0.0272,
"step": 15
},
{
"epoch": 0.0036529680365296802,
"grad_norm": 0.18927231431007385,
"learning_rate": 0.001,
"loss": 0.0254,
"step": 16
},
{
"epoch": 0.0038812785388127853,
"grad_norm": 0.16019679605960846,
"learning_rate": 0.001,
"loss": 0.0285,
"step": 17
},
{
"epoch": 0.00410958904109589,
"grad_norm": 0.14211571216583252,
"learning_rate": 0.001,
"loss": 0.026,
"step": 18
},
{
"epoch": 0.0043378995433789955,
"grad_norm": 0.17546725273132324,
"learning_rate": 0.001,
"loss": 0.0271,
"step": 19
},
{
"epoch": 0.0045662100456621,
"grad_norm": 0.12021715939044952,
"learning_rate": 0.001,
"loss": 0.024,
"step": 20
},
{
"epoch": 0.004794520547945206,
"grad_norm": 0.13626410067081451,
"learning_rate": 0.001,
"loss": 0.0261,
"step": 21
},
{
"epoch": 0.00502283105022831,
"grad_norm": 0.12207438051700592,
"learning_rate": 0.001,
"loss": 0.0244,
"step": 22
},
{
"epoch": 0.005251141552511416,
"grad_norm": 0.17474311590194702,
"learning_rate": 0.001,
"loss": 0.0237,
"step": 23
},
{
"epoch": 0.005479452054794521,
"grad_norm": 0.042765919119119644,
"learning_rate": 0.001,
"loss": 0.0206,
"step": 24
},
{
"epoch": 0.005707762557077625,
"grad_norm": 0.0823250487446785,
"learning_rate": 0.001,
"loss": 0.0218,
"step": 25
},
{
"epoch": 0.005936073059360731,
"grad_norm": 0.09036653488874435,
"learning_rate": 0.001,
"loss": 0.0211,
"step": 26
},
{
"epoch": 0.0061643835616438354,
"grad_norm": 0.06528954952955246,
"learning_rate": 0.001,
"loss": 0.0205,
"step": 27
},
{
"epoch": 0.006392694063926941,
"grad_norm": 0.07076761871576309,
"learning_rate": 0.001,
"loss": 0.0205,
"step": 28
},
{
"epoch": 0.006621004566210046,
"grad_norm": 0.08131472766399384,
"learning_rate": 0.001,
"loss": 0.0191,
"step": 29
},
{
"epoch": 0.00684931506849315,
"grad_norm": 0.097812220454216,
"learning_rate": 0.001,
"loss": 0.0191,
"step": 30
},
{
"epoch": 0.007077625570776256,
"grad_norm": 0.06373079121112823,
"learning_rate": 0.001,
"loss": 0.0191,
"step": 31
},
{
"epoch": 0.0073059360730593605,
"grad_norm": 0.05190230533480644,
"learning_rate": 0.001,
"loss": 0.018,
"step": 32
},
{
"epoch": 0.007534246575342466,
"grad_norm": 0.059811294078826904,
"learning_rate": 0.001,
"loss": 0.0191,
"step": 33
},
{
"epoch": 0.007762557077625571,
"grad_norm": 0.06886769086122513,
"learning_rate": 0.001,
"loss": 0.0172,
"step": 34
},
{
"epoch": 0.007990867579908675,
"grad_norm": 0.06065753847360611,
"learning_rate": 0.001,
"loss": 0.0179,
"step": 35
},
{
"epoch": 0.00821917808219178,
"grad_norm": 0.047076545655727386,
"learning_rate": 0.001,
"loss": 0.0165,
"step": 36
},
{
"epoch": 0.008447488584474886,
"grad_norm": 0.07710444182157516,
"learning_rate": 0.001,
"loss": 0.016,
"step": 37
},
{
"epoch": 0.008675799086757991,
"grad_norm": 0.050819285213947296,
"learning_rate": 0.001,
"loss": 0.0161,
"step": 38
},
{
"epoch": 0.008904109589041096,
"grad_norm": 0.04452894255518913,
"learning_rate": 0.001,
"loss": 0.0148,
"step": 39
},
{
"epoch": 0.0091324200913242,
"grad_norm": 0.06119012087583542,
"learning_rate": 0.001,
"loss": 0.0147,
"step": 40
},
{
"epoch": 0.009360730593607305,
"grad_norm": 0.043577950447797775,
"learning_rate": 0.001,
"loss": 0.0134,
"step": 41
},
{
"epoch": 0.009589041095890411,
"grad_norm": 0.06228714436292648,
"learning_rate": 0.001,
"loss": 0.0134,
"step": 42
},
{
"epoch": 0.009817351598173516,
"grad_norm": 0.08107709139585495,
"learning_rate": 0.001,
"loss": 0.0119,
"step": 43
},
{
"epoch": 0.01004566210045662,
"grad_norm": 0.08609241992235184,
"learning_rate": 0.001,
"loss": 0.0117,
"step": 44
},
{
"epoch": 0.010273972602739725,
"grad_norm": 0.08933087438344955,
"learning_rate": 0.001,
"loss": 0.0101,
"step": 45
},
{
"epoch": 0.010502283105022832,
"grad_norm": 0.23321422934532166,
"learning_rate": 0.001,
"loss": 0.0103,
"step": 46
},
{
"epoch": 0.010730593607305937,
"grad_norm": 0.1518358290195465,
"learning_rate": 0.001,
"loss": 0.0101,
"step": 47
},
{
"epoch": 0.010958904109589041,
"grad_norm": 0.15060600638389587,
"learning_rate": 0.001,
"loss": 0.009,
"step": 48
},
{
"epoch": 0.011187214611872146,
"grad_norm": 0.2696841359138489,
"learning_rate": 0.001,
"loss": 0.0087,
"step": 49
},
{
"epoch": 0.01141552511415525,
"grad_norm": 0.08441965281963348,
"learning_rate": 0.001,
"loss": 0.009,
"step": 50
},
{
"epoch": 0.011643835616438357,
"grad_norm": 0.1832842081785202,
"learning_rate": 0.001,
"loss": 0.0199,
"step": 51
},
{
"epoch": 0.011872146118721462,
"grad_norm": 0.21883782744407654,
"learning_rate": 0.001,
"loss": 0.0158,
"step": 52
},
{
"epoch": 0.012100456621004566,
"grad_norm": 12.722305297851562,
"learning_rate": 0.001,
"loss": 0.0915,
"step": 53
},
{
"epoch": 0.012328767123287671,
"grad_norm": 0.2270480841398239,
"learning_rate": 0.001,
"loss": 0.0383,
"step": 54
},
{
"epoch": 0.012557077625570776,
"grad_norm": 1.0806418657302856,
"learning_rate": 0.001,
"loss": 0.072,
"step": 55
},
{
"epoch": 0.012785388127853882,
"grad_norm": 0.42152509093284607,
"learning_rate": 0.001,
"loss": 0.051,
"step": 56
},
{
"epoch": 0.013013698630136987,
"grad_norm": 0.19152699410915375,
"learning_rate": 0.001,
"loss": 0.0437,
"step": 57
},
{
"epoch": 0.013242009132420091,
"grad_norm": 0.15559057891368866,
"learning_rate": 0.001,
"loss": 0.0407,
"step": 58
},
{
"epoch": 0.013470319634703196,
"grad_norm": 0.18103821575641632,
"learning_rate": 0.001,
"loss": 0.0378,
"step": 59
},
{
"epoch": 0.0136986301369863,
"grad_norm": 0.2188289612531662,
"learning_rate": 0.001,
"loss": 0.0382,
"step": 60
},
{
"epoch": 0.013926940639269407,
"grad_norm": 0.22403009235858917,
"learning_rate": 0.001,
"loss": 0.0368,
"step": 61
},
{
"epoch": 0.014155251141552512,
"grad_norm": 0.23726648092269897,
"learning_rate": 0.001,
"loss": 0.0308,
"step": 62
},
{
"epoch": 0.014383561643835616,
"grad_norm": 0.5590624809265137,
"learning_rate": 0.001,
"loss": 0.0265,
"step": 63
},
{
"epoch": 0.014611872146118721,
"grad_norm": 0.20665256679058075,
"learning_rate": 0.001,
"loss": 0.0249,
"step": 64
},
{
"epoch": 0.014840182648401826,
"grad_norm": 0.2618805170059204,
"learning_rate": 0.001,
"loss": 0.0241,
"step": 65
},
{
"epoch": 0.015068493150684932,
"grad_norm": 0.2558732330799103,
"learning_rate": 0.001,
"loss": 0.0222,
"step": 66
},
{
"epoch": 0.015296803652968037,
"grad_norm": 0.24830466508865356,
"learning_rate": 0.001,
"loss": 0.0234,
"step": 67
},
{
"epoch": 0.015525114155251141,
"grad_norm": 0.658237874507904,
"learning_rate": 0.001,
"loss": 0.0205,
"step": 68
},
{
"epoch": 0.015753424657534248,
"grad_norm": 0.264330118894577,
"learning_rate": 0.001,
"loss": 0.0225,
"step": 69
},
{
"epoch": 0.01598173515981735,
"grad_norm": 0.2591581642627716,
"learning_rate": 0.001,
"loss": 0.0243,
"step": 70
},
{
"epoch": 0.016210045662100457,
"grad_norm": 0.20444399118423462,
"learning_rate": 0.001,
"loss": 0.0209,
"step": 71
},
{
"epoch": 0.01643835616438356,
"grad_norm": 0.15570659935474396,
"learning_rate": 0.001,
"loss": 0.0176,
"step": 72
},
{
"epoch": 0.016666666666666666,
"grad_norm": 0.9550731778144836,
"learning_rate": 0.001,
"loss": 0.0209,
"step": 73
},
{
"epoch": 0.016894977168949773,
"grad_norm": 0.17412568628787994,
"learning_rate": 0.001,
"loss": 0.0173,
"step": 74
},
{
"epoch": 0.017123287671232876,
"grad_norm": 0.17629070580005646,
"learning_rate": 0.001,
"loss": 0.0188,
"step": 75
},
{
"epoch": 0.017351598173515982,
"grad_norm": 0.1633068323135376,
"learning_rate": 0.001,
"loss": 0.02,
"step": 76
},
{
"epoch": 0.017579908675799085,
"grad_norm": 0.15935851633548737,
"learning_rate": 0.001,
"loss": 0.0168,
"step": 77
},
{
"epoch": 0.01780821917808219,
"grad_norm": 0.12234501540660858,
"learning_rate": 0.001,
"loss": 0.0154,
"step": 78
},
{
"epoch": 0.018036529680365298,
"grad_norm": 0.21797019243240356,
"learning_rate": 0.001,
"loss": 0.0134,
"step": 79
},
{
"epoch": 0.0182648401826484,
"grad_norm": 0.14621035754680634,
"learning_rate": 0.001,
"loss": 0.013,
"step": 80
},
{
"epoch": 0.018493150684931507,
"grad_norm": 0.07467932254076004,
"learning_rate": 0.001,
"loss": 0.0118,
"step": 81
},
{
"epoch": 0.01872146118721461,
"grad_norm": 0.07849911600351334,
"learning_rate": 0.001,
"loss": 0.0109,
"step": 82
},
{
"epoch": 0.018949771689497717,
"grad_norm": 0.12953932583332062,
"learning_rate": 0.001,
"loss": 0.0108,
"step": 83
},
{
"epoch": 0.019178082191780823,
"grad_norm": 0.07049839198589325,
"learning_rate": 0.001,
"loss": 0.0101,
"step": 84
},
{
"epoch": 0.019406392694063926,
"grad_norm": 0.06369508057832718,
"learning_rate": 0.001,
"loss": 0.0095,
"step": 85
},
{
"epoch": 0.019634703196347032,
"grad_norm": 0.057968154549598694,
"learning_rate": 0.001,
"loss": 0.0095,
"step": 86
},
{
"epoch": 0.01986301369863014,
"grad_norm": 0.07910202443599701,
"learning_rate": 0.001,
"loss": 0.0097,
"step": 87
},
{
"epoch": 0.02009132420091324,
"grad_norm": 0.049049049615859985,
"learning_rate": 0.001,
"loss": 0.009,
"step": 88
},
{
"epoch": 0.020319634703196348,
"grad_norm": 0.041860196739435196,
"learning_rate": 0.001,
"loss": 0.008,
"step": 89
},
{
"epoch": 0.02054794520547945,
"grad_norm": 0.056602053344249725,
"learning_rate": 0.001,
"loss": 0.0093,
"step": 90
},
{
"epoch": 0.020776255707762557,
"grad_norm": 0.08956869691610336,
"learning_rate": 0.001,
"loss": 0.0107,
"step": 91
},
{
"epoch": 0.021004566210045664,
"grad_norm": 0.033224668353796005,
"learning_rate": 0.001,
"loss": 0.007,
"step": 92
},
{
"epoch": 0.021232876712328767,
"grad_norm": 0.047221846878528595,
"learning_rate": 0.001,
"loss": 0.0065,
"step": 93
},
{
"epoch": 0.021461187214611873,
"grad_norm": 0.05241613835096359,
"learning_rate": 0.001,
"loss": 0.0073,
"step": 94
},
{
"epoch": 0.021689497716894976,
"grad_norm": 0.05120820179581642,
"learning_rate": 0.001,
"loss": 0.0075,
"step": 95
},
{
"epoch": 0.021917808219178082,
"grad_norm": 0.042824823409318924,
"learning_rate": 0.001,
"loss": 0.0072,
"step": 96
},
{
"epoch": 0.02214611872146119,
"grad_norm": 0.037190262228250504,
"learning_rate": 0.001,
"loss": 0.0061,
"step": 97
},
{
"epoch": 0.02237442922374429,
"grad_norm": 0.03563378378748894,
"learning_rate": 0.001,
"loss": 0.007,
"step": 98
},
{
"epoch": 0.022602739726027398,
"grad_norm": 0.03606602922081947,
"learning_rate": 0.001,
"loss": 0.0059,
"step": 99
},
{
"epoch": 0.0228310502283105,
"grad_norm": 0.03840276971459389,
"learning_rate": 0.001,
"loss": 0.0061,
"step": 100
},
{
"epoch": 0.023059360730593607,
"grad_norm": 0.12917055189609528,
"learning_rate": 0.001,
"loss": 0.0103,
"step": 101
},
{
"epoch": 0.023287671232876714,
"grad_norm": 0.09650158882141113,
"learning_rate": 0.001,
"loss": 0.0092,
"step": 102
},
{
"epoch": 0.023515981735159817,
"grad_norm": 0.7314733862876892,
"learning_rate": 0.001,
"loss": 0.0113,
"step": 103
},
{
"epoch": 0.023744292237442923,
"grad_norm": 0.1915358155965805,
"learning_rate": 0.001,
"loss": 0.0094,
"step": 104
},
{
"epoch": 0.023972602739726026,
"grad_norm": 0.23454691469669342,
"learning_rate": 0.001,
"loss": 0.014,
"step": 105
},
{
"epoch": 0.024200913242009132,
"grad_norm": 0.1961510330438614,
"learning_rate": 0.001,
"loss": 0.0132,
"step": 106
},
{
"epoch": 0.02442922374429224,
"grad_norm": 0.12320326268672943,
"learning_rate": 0.001,
"loss": 0.0097,
"step": 107
},
{
"epoch": 0.024657534246575342,
"grad_norm": 0.05942022427916527,
"learning_rate": 0.001,
"loss": 0.0093,
"step": 108
},
{
"epoch": 0.024885844748858448,
"grad_norm": 0.04996173083782196,
"learning_rate": 0.001,
"loss": 0.008,
"step": 109
},
{
"epoch": 0.02511415525114155,
"grad_norm": 0.048785947263240814,
"learning_rate": 0.001,
"loss": 0.0087,
"step": 110
},
{
"epoch": 0.025342465753424658,
"grad_norm": 0.15529130399227142,
"learning_rate": 0.001,
"loss": 0.0094,
"step": 111
},
{
"epoch": 0.025570776255707764,
"grad_norm": 0.06682206690311432,
"learning_rate": 0.001,
"loss": 0.0087,
"step": 112
},
{
"epoch": 0.025799086757990867,
"grad_norm": 0.07254649698734283,
"learning_rate": 0.001,
"loss": 0.0098,
"step": 113
},
{
"epoch": 0.026027397260273973,
"grad_norm": 0.03909542039036751,
"learning_rate": 0.001,
"loss": 0.0065,
"step": 114
},
{
"epoch": 0.026255707762557076,
"grad_norm": 0.03716771677136421,
"learning_rate": 0.001,
"loss": 0.0086,
"step": 115
},
{
"epoch": 0.026484018264840183,
"grad_norm": 0.04341251775622368,
"learning_rate": 0.001,
"loss": 0.007,
"step": 116
},
{
"epoch": 0.02671232876712329,
"grad_norm": 0.0455278642475605,
"learning_rate": 0.001,
"loss": 0.0069,
"step": 117
},
{
"epoch": 0.026940639269406392,
"grad_norm": 0.0869159922003746,
"learning_rate": 0.001,
"loss": 0.0086,
"step": 118
},
{
"epoch": 0.0271689497716895,
"grad_norm": 0.05491505563259125,
"learning_rate": 0.001,
"loss": 0.0068,
"step": 119
},
{
"epoch": 0.0273972602739726,
"grad_norm": 0.05067432299256325,
"learning_rate": 0.001,
"loss": 0.0062,
"step": 120
},
{
"epoch": 0.027625570776255708,
"grad_norm": 0.06873013079166412,
"learning_rate": 0.001,
"loss": 0.008,
"step": 121
},
{
"epoch": 0.027853881278538814,
"grad_norm": 0.03151897341012955,
"learning_rate": 0.001,
"loss": 0.0055,
"step": 122
},
{
"epoch": 0.028082191780821917,
"grad_norm": 0.027348244562745094,
"learning_rate": 0.001,
"loss": 0.006,
"step": 123
},
{
"epoch": 0.028310502283105023,
"grad_norm": 0.04307318106293678,
"learning_rate": 0.001,
"loss": 0.0064,
"step": 124
},
{
"epoch": 0.028538812785388126,
"grad_norm": 0.0409172885119915,
"learning_rate": 0.001,
"loss": 0.0065,
"step": 125
},
{
"epoch": 0.028767123287671233,
"grad_norm": 0.042198970913887024,
"learning_rate": 0.001,
"loss": 0.0072,
"step": 126
},
{
"epoch": 0.02899543378995434,
"grad_norm": 0.046845417469739914,
"learning_rate": 0.001,
"loss": 0.0067,
"step": 127
},
{
"epoch": 0.029223744292237442,
"grad_norm": 0.03862365707755089,
"learning_rate": 0.001,
"loss": 0.0067,
"step": 128
},
{
"epoch": 0.02945205479452055,
"grad_norm": 0.04204321652650833,
"learning_rate": 0.001,
"loss": 0.0074,
"step": 129
},
{
"epoch": 0.02968036529680365,
"grad_norm": 0.03613033518195152,
"learning_rate": 0.001,
"loss": 0.0067,
"step": 130
},
{
"epoch": 0.029908675799086758,
"grad_norm": 0.03899417817592621,
"learning_rate": 0.001,
"loss": 0.0061,
"step": 131
},
{
"epoch": 0.030136986301369864,
"grad_norm": 0.03047838620841503,
"learning_rate": 0.001,
"loss": 0.0059,
"step": 132
},
{
"epoch": 0.030365296803652967,
"grad_norm": 0.04626467451453209,
"learning_rate": 0.001,
"loss": 0.0061,
"step": 133
},
{
"epoch": 0.030593607305936073,
"grad_norm": 0.04004530981183052,
"learning_rate": 0.001,
"loss": 0.0077,
"step": 134
},
{
"epoch": 0.030821917808219176,
"grad_norm": 0.03990226984024048,
"learning_rate": 0.001,
"loss": 0.0061,
"step": 135
},
{
"epoch": 0.031050228310502283,
"grad_norm": 0.035800885409116745,
"learning_rate": 0.001,
"loss": 0.0042,
"step": 136
},
{
"epoch": 0.03127853881278539,
"grad_norm": 0.03377184644341469,
"learning_rate": 0.001,
"loss": 0.0044,
"step": 137
},
{
"epoch": 0.031506849315068496,
"grad_norm": 0.031017042696475983,
"learning_rate": 0.001,
"loss": 0.0044,
"step": 138
},
{
"epoch": 0.031735159817351595,
"grad_norm": 0.027331147342920303,
"learning_rate": 0.001,
"loss": 0.0039,
"step": 139
},
{
"epoch": 0.0319634703196347,
"grad_norm": 0.034048646688461304,
"learning_rate": 0.001,
"loss": 0.0042,
"step": 140
},
{
"epoch": 0.03219178082191781,
"grad_norm": 0.03277864679694176,
"learning_rate": 0.001,
"loss": 0.0053,
"step": 141
},
{
"epoch": 0.032420091324200914,
"grad_norm": 0.04241342470049858,
"learning_rate": 0.001,
"loss": 0.0041,
"step": 142
},
{
"epoch": 0.03264840182648402,
"grad_norm": 0.026137417182326317,
"learning_rate": 0.001,
"loss": 0.0034,
"step": 143
},
{
"epoch": 0.03287671232876712,
"grad_norm": 0.03562963008880615,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 144
},
{
"epoch": 0.033105022831050226,
"grad_norm": 0.026813900098204613,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 145
},
{
"epoch": 0.03333333333333333,
"grad_norm": 0.030897343531250954,
"learning_rate": 0.001,
"loss": 0.0044,
"step": 146
},
{
"epoch": 0.03356164383561644,
"grad_norm": 0.02891898714005947,
"learning_rate": 0.001,
"loss": 0.0034,
"step": 147
},
{
"epoch": 0.033789954337899546,
"grad_norm": 0.03819667547941208,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 148
},
{
"epoch": 0.034018264840182645,
"grad_norm": 0.02293401025235653,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 149
},
{
"epoch": 0.03424657534246575,
"grad_norm": 0.02600831165909767,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 150
},
{
"epoch": 0.03447488584474886,
"grad_norm": 0.040420051664114,
"learning_rate": 0.001,
"loss": 0.0051,
"step": 151
},
{
"epoch": 0.034703196347031964,
"grad_norm": 0.03907687962055206,
"learning_rate": 0.001,
"loss": 0.0047,
"step": 152
},
{
"epoch": 0.03493150684931507,
"grad_norm": 0.03037801943719387,
"learning_rate": 0.001,
"loss": 0.0034,
"step": 153
},
{
"epoch": 0.03515981735159817,
"grad_norm": 0.05104570835828781,
"learning_rate": 0.001,
"loss": 0.0061,
"step": 154
},
{
"epoch": 0.03538812785388128,
"grad_norm": 0.033460833132267,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 155
},
{
"epoch": 0.03561643835616438,
"grad_norm": 0.034624133259058,
"learning_rate": 0.001,
"loss": 0.0038,
"step": 156
},
{
"epoch": 0.03584474885844749,
"grad_norm": 0.03363336622714996,
"learning_rate": 0.001,
"loss": 0.0046,
"step": 157
},
{
"epoch": 0.036073059360730596,
"grad_norm": 0.03651309013366699,
"learning_rate": 0.001,
"loss": 0.0042,
"step": 158
},
{
"epoch": 0.036301369863013695,
"grad_norm": 0.031121717765927315,
"learning_rate": 0.001,
"loss": 0.0034,
"step": 159
},
{
"epoch": 0.0365296803652968,
"grad_norm": 0.03925270959734917,
"learning_rate": 0.001,
"loss": 0.0045,
"step": 160
},
{
"epoch": 0.03675799086757991,
"grad_norm": 0.02922016754746437,
"learning_rate": 0.001,
"loss": 0.004,
"step": 161
},
{
"epoch": 0.036986301369863014,
"grad_norm": 0.03618766367435455,
"learning_rate": 0.001,
"loss": 0.004,
"step": 162
},
{
"epoch": 0.03721461187214612,
"grad_norm": 0.05399168282747269,
"learning_rate": 0.001,
"loss": 0.0049,
"step": 163
},
{
"epoch": 0.03744292237442922,
"grad_norm": 0.047811247408390045,
"learning_rate": 0.001,
"loss": 0.0045,
"step": 164
},
{
"epoch": 0.03767123287671233,
"grad_norm": 0.041643090546131134,
"learning_rate": 0.001,
"loss": 0.0038,
"step": 165
},
{
"epoch": 0.03789954337899543,
"grad_norm": 0.03867914155125618,
"learning_rate": 0.001,
"loss": 0.0045,
"step": 166
},
{
"epoch": 0.03812785388127854,
"grad_norm": 0.0361204594373703,
"learning_rate": 0.001,
"loss": 0.0041,
"step": 167
},
{
"epoch": 0.038356164383561646,
"grad_norm": 0.036205410957336426,
"learning_rate": 0.001,
"loss": 0.0048,
"step": 168
},
{
"epoch": 0.03858447488584475,
"grad_norm": 0.03310992196202278,
"learning_rate": 0.001,
"loss": 0.0032,
"step": 169
},
{
"epoch": 0.03881278538812785,
"grad_norm": 0.027686715126037598,
"learning_rate": 0.001,
"loss": 0.002,
"step": 170
},
{
"epoch": 0.03904109589041096,
"grad_norm": 0.029249897226691246,
"learning_rate": 0.001,
"loss": 0.0038,
"step": 171
},
{
"epoch": 0.039269406392694065,
"grad_norm": 0.03591005504131317,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 172
},
{
"epoch": 0.03949771689497717,
"grad_norm": 0.030710754916071892,
"learning_rate": 0.001,
"loss": 0.0029,
"step": 173
},
{
"epoch": 0.03972602739726028,
"grad_norm": 0.03295068442821503,
"learning_rate": 0.001,
"loss": 0.002,
"step": 174
},
{
"epoch": 0.03995433789954338,
"grad_norm": 0.02918722666800022,
"learning_rate": 0.001,
"loss": 0.0032,
"step": 175
},
{
"epoch": 0.04018264840182648,
"grad_norm": 0.035701602697372437,
"learning_rate": 0.001,
"loss": 0.004,
"step": 176
},
{
"epoch": 0.04041095890410959,
"grad_norm": 0.03620489314198494,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 177
},
{
"epoch": 0.040639269406392696,
"grad_norm": 0.04025309905409813,
"learning_rate": 0.001,
"loss": 0.0036,
"step": 178
},
{
"epoch": 0.0408675799086758,
"grad_norm": 0.03256874904036522,
"learning_rate": 0.001,
"loss": 0.0045,
"step": 179
},
{
"epoch": 0.0410958904109589,
"grad_norm": 0.03545399010181427,
"learning_rate": 0.001,
"loss": 0.003,
"step": 180
},
{
"epoch": 0.04132420091324201,
"grad_norm": 0.04845140874385834,
"learning_rate": 0.001,
"loss": 0.0047,
"step": 181
},
{
"epoch": 0.041552511415525115,
"grad_norm": 0.045855190604925156,
"learning_rate": 0.001,
"loss": 0.0041,
"step": 182
},
{
"epoch": 0.04178082191780822,
"grad_norm": 0.026962406933307648,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 183
},
{
"epoch": 0.04200913242009133,
"grad_norm": 0.028487997129559517,
"learning_rate": 0.001,
"loss": 0.0039,
"step": 184
},
{
"epoch": 0.04223744292237443,
"grad_norm": 0.038144659250974655,
"learning_rate": 0.001,
"loss": 0.0036,
"step": 185
},
{
"epoch": 0.04246575342465753,
"grad_norm": 0.0443580225110054,
"learning_rate": 0.001,
"loss": 0.0038,
"step": 186
},
{
"epoch": 0.04269406392694064,
"grad_norm": 0.035410862416028976,
"learning_rate": 0.001,
"loss": 0.0036,
"step": 187
},
{
"epoch": 0.042922374429223746,
"grad_norm": 0.0394715741276741,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 188
},
{
"epoch": 0.04315068493150685,
"grad_norm": 0.03207629173994064,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 189
},
{
"epoch": 0.04337899543378995,
"grad_norm": 0.02908760868012905,
"learning_rate": 0.001,
"loss": 0.002,
"step": 190
},
{
"epoch": 0.04360730593607306,
"grad_norm": 0.0272049680352211,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 191
},
{
"epoch": 0.043835616438356165,
"grad_norm": 0.02282743901014328,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 192
},
{
"epoch": 0.04406392694063927,
"grad_norm": 0.024958152323961258,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 193
},
{
"epoch": 0.04429223744292238,
"grad_norm": 0.029786400496959686,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 194
},
{
"epoch": 0.04452054794520548,
"grad_norm": 0.023932697251439095,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 195
},
{
"epoch": 0.04474885844748858,
"grad_norm": 0.02262377366423607,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 196
},
{
"epoch": 0.04497716894977169,
"grad_norm": 0.033051978796720505,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 197
},
{
"epoch": 0.045205479452054796,
"grad_norm": 0.029031749814748764,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 198
},
{
"epoch": 0.0454337899543379,
"grad_norm": 0.030305176973342896,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 199
},
{
"epoch": 0.045662100456621,
"grad_norm": 0.02067619003355503,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 200
},
{
"epoch": 0.04589041095890411,
"grad_norm": 0.03859075903892517,
"learning_rate": 0.001,
"loss": 0.0041,
"step": 201
},
{
"epoch": 0.046118721461187215,
"grad_norm": 0.03318578004837036,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 202
},
{
"epoch": 0.04634703196347032,
"grad_norm": 0.03525965288281441,
"learning_rate": 0.001,
"loss": 0.0029,
"step": 203
},
{
"epoch": 0.04657534246575343,
"grad_norm": 0.041064050048589706,
"learning_rate": 0.001,
"loss": 0.0045,
"step": 204
},
{
"epoch": 0.04680365296803653,
"grad_norm": 0.045857496559619904,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 205
},
{
"epoch": 0.047031963470319633,
"grad_norm": 0.037904538214206696,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 206
},
{
"epoch": 0.04726027397260274,
"grad_norm": 0.03063504584133625,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 207
},
{
"epoch": 0.047488584474885846,
"grad_norm": 0.040485553443431854,
"learning_rate": 0.001,
"loss": 0.004,
"step": 208
},
{
"epoch": 0.04771689497716895,
"grad_norm": 0.034435346722602844,
"learning_rate": 0.001,
"loss": 0.0046,
"step": 209
},
{
"epoch": 0.04794520547945205,
"grad_norm": 0.027862414717674255,
"learning_rate": 0.001,
"loss": 0.002,
"step": 210
},
{
"epoch": 0.04817351598173516,
"grad_norm": 0.034713245928287506,
"learning_rate": 0.001,
"loss": 0.0039,
"step": 211
},
{
"epoch": 0.048401826484018265,
"grad_norm": 0.038781870156526566,
"learning_rate": 0.001,
"loss": 0.0046,
"step": 212
},
{
"epoch": 0.04863013698630137,
"grad_norm": 0.025890646502375603,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 213
},
{
"epoch": 0.04885844748858448,
"grad_norm": 0.0285344235599041,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 214
},
{
"epoch": 0.04908675799086758,
"grad_norm": 0.032012905925512314,
"learning_rate": 0.001,
"loss": 0.003,
"step": 215
},
{
"epoch": 0.049315068493150684,
"grad_norm": 0.04779508709907532,
"learning_rate": 0.001,
"loss": 0.0037,
"step": 216
},
{
"epoch": 0.04954337899543379,
"grad_norm": 0.039367783814668655,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 217
},
{
"epoch": 0.049771689497716896,
"grad_norm": 0.02745324745774269,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 218
},
{
"epoch": 0.05,
"grad_norm": 0.03268812596797943,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 219
},
{
"epoch": 0.0502283105022831,
"grad_norm": 0.023665225133299828,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 220
},
{
"epoch": 0.05045662100456621,
"grad_norm": 0.0373012013733387,
"learning_rate": 0.001,
"loss": 0.0039,
"step": 221
},
{
"epoch": 0.050684931506849315,
"grad_norm": 0.033793918788433075,
"learning_rate": 0.001,
"loss": 0.0036,
"step": 222
},
{
"epoch": 0.05091324200913242,
"grad_norm": 0.0297444686293602,
"learning_rate": 0.001,
"loss": 0.003,
"step": 223
},
{
"epoch": 0.05114155251141553,
"grad_norm": 0.05024491623044014,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 224
},
{
"epoch": 0.05136986301369863,
"grad_norm": 0.03143681213259697,
"learning_rate": 0.001,
"loss": 0.0032,
"step": 225
},
{
"epoch": 0.051598173515981734,
"grad_norm": 0.023645315319299698,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 226
},
{
"epoch": 0.05182648401826484,
"grad_norm": 0.02782478556036949,
"learning_rate": 0.001,
"loss": 0.002,
"step": 227
},
{
"epoch": 0.052054794520547946,
"grad_norm": 0.0307586882263422,
"learning_rate": 0.001,
"loss": 0.0032,
"step": 228
},
{
"epoch": 0.05228310502283105,
"grad_norm": 0.04269454628229141,
"learning_rate": 0.001,
"loss": 0.0039,
"step": 229
},
{
"epoch": 0.05251141552511415,
"grad_norm": 0.035806287080049515,
"learning_rate": 0.001,
"loss": 0.003,
"step": 230
},
{
"epoch": 0.05273972602739726,
"grad_norm": 0.03528301417827606,
"learning_rate": 0.001,
"loss": 0.0032,
"step": 231
},
{
"epoch": 0.052968036529680365,
"grad_norm": 0.029358338564634323,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 232
},
{
"epoch": 0.05319634703196347,
"grad_norm": 0.021077649667859077,
"learning_rate": 0.001,
"loss": 0.003,
"step": 233
},
{
"epoch": 0.05342465753424658,
"grad_norm": 0.029840657487511635,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 234
},
{
"epoch": 0.05365296803652968,
"grad_norm": 0.028463926166296005,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 235
},
{
"epoch": 0.053881278538812784,
"grad_norm": 0.026239361613988876,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 236
},
{
"epoch": 0.05410958904109589,
"grad_norm": 0.02149251475930214,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 237
},
{
"epoch": 0.054337899543379,
"grad_norm": 0.02750280313193798,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 238
},
{
"epoch": 0.0545662100456621,
"grad_norm": 0.028853842988610268,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 239
},
{
"epoch": 0.0547945205479452,
"grad_norm": 0.03062448836863041,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 240
},
{
"epoch": 0.05502283105022831,
"grad_norm": 0.021715497598052025,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 241
},
{
"epoch": 0.055251141552511415,
"grad_norm": 0.03351881727576256,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 242
},
{
"epoch": 0.05547945205479452,
"grad_norm": 0.025600440800189972,
"learning_rate": 0.001,
"loss": 0.002,
"step": 243
},
{
"epoch": 0.05570776255707763,
"grad_norm": 0.03094620630145073,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 244
},
{
"epoch": 0.05593607305936073,
"grad_norm": 0.03529248386621475,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 245
},
{
"epoch": 0.056164383561643834,
"grad_norm": 0.026421545073390007,
"learning_rate": 0.001,
"loss": 0.002,
"step": 246
},
{
"epoch": 0.05639269406392694,
"grad_norm": 0.018347790464758873,
"learning_rate": 0.001,
"loss": 0.0012,
"step": 247
},
{
"epoch": 0.05662100456621005,
"grad_norm": 0.02605101279914379,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 248
},
{
"epoch": 0.05684931506849315,
"grad_norm": 0.027538320049643517,
"learning_rate": 0.001,
"loss": 0.003,
"step": 249
},
{
"epoch": 0.05707762557077625,
"grad_norm": 0.030089175328612328,
"learning_rate": 0.001,
"loss": 0.002,
"step": 250
},
{
"epoch": 0.05730593607305936,
"grad_norm": 0.02568584680557251,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 251
},
{
"epoch": 0.057534246575342465,
"grad_norm": 0.043693918734788895,
"learning_rate": 0.001,
"loss": 0.0039,
"step": 252
},
{
"epoch": 0.05776255707762557,
"grad_norm": 0.025515882298350334,
"learning_rate": 0.001,
"loss": 0.0029,
"step": 253
},
{
"epoch": 0.05799086757990868,
"grad_norm": 0.023086579516530037,
"learning_rate": 0.001,
"loss": 0.002,
"step": 254
},
{
"epoch": 0.05821917808219178,
"grad_norm": 0.03552839159965515,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 255
},
{
"epoch": 0.058447488584474884,
"grad_norm": 0.030602211132645607,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 256
},
{
"epoch": 0.05867579908675799,
"grad_norm": 0.02757362276315689,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 257
},
{
"epoch": 0.0589041095890411,
"grad_norm": 0.04006500914692879,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 258
},
{
"epoch": 0.0591324200913242,
"grad_norm": 0.039859503507614136,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 259
},
{
"epoch": 0.0593607305936073,
"grad_norm": 0.02268202416598797,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 260
},
{
"epoch": 0.05958904109589041,
"grad_norm": 0.020849550142884254,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 261
},
{
"epoch": 0.059817351598173515,
"grad_norm": 0.026384403929114342,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 262
},
{
"epoch": 0.06004566210045662,
"grad_norm": 0.029226887971162796,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 263
},
{
"epoch": 0.06027397260273973,
"grad_norm": 0.029352016746997833,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 264
},
{
"epoch": 0.06050228310502283,
"grad_norm": 0.023828251287341118,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 265
},
{
"epoch": 0.060730593607305934,
"grad_norm": 0.050515275448560715,
"learning_rate": 0.001,
"loss": 0.0065,
"step": 266
},
{
"epoch": 0.06095890410958904,
"grad_norm": 0.3609565198421478,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 267
},
{
"epoch": 0.06118721461187215,
"grad_norm": 0.030405467376112938,
"learning_rate": 0.001,
"loss": 0.002,
"step": 268
},
{
"epoch": 0.06141552511415525,
"grad_norm": 0.07481672614812851,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 269
},
{
"epoch": 0.06164383561643835,
"grad_norm": 0.09166887402534485,
"learning_rate": 0.001,
"loss": 0.003,
"step": 270
},
{
"epoch": 0.06187214611872146,
"grad_norm": 0.06070258840918541,
"learning_rate": 0.001,
"loss": 0.0038,
"step": 271
},
{
"epoch": 0.062100456621004566,
"grad_norm": 0.02546994574368,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 272
},
{
"epoch": 0.06232876712328767,
"grad_norm": 0.028366973623633385,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 273
},
{
"epoch": 0.06255707762557078,
"grad_norm": 0.02752639539539814,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 274
},
{
"epoch": 0.06278538812785388,
"grad_norm": 0.02514069154858589,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 275
},
{
"epoch": 0.06301369863013699,
"grad_norm": 0.03297794982790947,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 276
},
{
"epoch": 0.06324200913242009,
"grad_norm": 0.03322751075029373,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 277
},
{
"epoch": 0.06347031963470319,
"grad_norm": 0.028292890638113022,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 278
},
{
"epoch": 0.0636986301369863,
"grad_norm": 0.04020245000720024,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 279
},
{
"epoch": 0.0639269406392694,
"grad_norm": 0.03231251239776611,
"learning_rate": 0.001,
"loss": 0.0039,
"step": 280
},
{
"epoch": 0.06415525114155252,
"grad_norm": 0.0225644800812006,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 281
},
{
"epoch": 0.06438356164383562,
"grad_norm": 0.028778597712516785,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 282
},
{
"epoch": 0.06461187214611872,
"grad_norm": 0.02618185058236122,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 283
},
{
"epoch": 0.06484018264840183,
"grad_norm": 0.03890310227870941,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 284
},
{
"epoch": 0.06506849315068493,
"grad_norm": 0.029423601925373077,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 285
},
{
"epoch": 0.06529680365296804,
"grad_norm": 0.04089478775858879,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 286
},
{
"epoch": 0.06552511415525114,
"grad_norm": 0.031911611557006836,
"learning_rate": 0.001,
"loss": 0.0029,
"step": 287
},
{
"epoch": 0.06575342465753424,
"grad_norm": 0.02856455370783806,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 288
},
{
"epoch": 0.06598173515981735,
"grad_norm": 0.02316523902118206,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 289
},
{
"epoch": 0.06621004566210045,
"grad_norm": 0.021586967632174492,
"learning_rate": 0.001,
"loss": 0.002,
"step": 290
},
{
"epoch": 0.06643835616438357,
"grad_norm": 0.020875398069620132,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 291
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.025591716170310974,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 292
},
{
"epoch": 0.06689497716894977,
"grad_norm": 0.02905621938407421,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 293
},
{
"epoch": 0.06712328767123288,
"grad_norm": 0.03460671007633209,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 294
},
{
"epoch": 0.06735159817351598,
"grad_norm": 0.014558055438101292,
"learning_rate": 0.001,
"loss": 0.0008,
"step": 295
},
{
"epoch": 0.06757990867579909,
"grad_norm": 0.021651627495884895,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 296
},
{
"epoch": 0.06780821917808219,
"grad_norm": 0.020275374874472618,
"learning_rate": 0.001,
"loss": 0.0012,
"step": 297
},
{
"epoch": 0.06803652968036529,
"grad_norm": 0.030108539387583733,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 298
},
{
"epoch": 0.0682648401826484,
"grad_norm": 0.02870999090373516,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 299
},
{
"epoch": 0.0684931506849315,
"grad_norm": 0.030189916491508484,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 300
},
{
"epoch": 0.06872146118721462,
"grad_norm": 0.048917006701231,
"learning_rate": 0.001,
"loss": 0.0051,
"step": 301
},
{
"epoch": 0.06894977168949772,
"grad_norm": 0.0351158082485199,
"learning_rate": 0.001,
"loss": 0.003,
"step": 302
},
{
"epoch": 0.06917808219178082,
"grad_norm": 0.0304318368434906,
"learning_rate": 0.001,
"loss": 0.003,
"step": 303
},
{
"epoch": 0.06940639269406393,
"grad_norm": 0.02364553138613701,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 304
},
{
"epoch": 0.06963470319634703,
"grad_norm": 0.025430144742131233,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 305
},
{
"epoch": 0.06986301369863014,
"grad_norm": 0.028122954070568085,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 306
},
{
"epoch": 0.07009132420091324,
"grad_norm": 0.04655618220567703,
"learning_rate": 0.001,
"loss": 0.0038,
"step": 307
},
{
"epoch": 0.07031963470319634,
"grad_norm": 0.03192426636815071,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 308
},
{
"epoch": 0.07054794520547945,
"grad_norm": 0.03930205851793289,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 309
},
{
"epoch": 0.07077625570776255,
"grad_norm": 0.0391114316880703,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 310
},
{
"epoch": 0.07100456621004567,
"grad_norm": 0.02882283739745617,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 311
},
{
"epoch": 0.07123287671232877,
"grad_norm": 0.025312229990959167,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 312
},
{
"epoch": 0.07146118721461187,
"grad_norm": 0.03631848841905594,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 313
},
{
"epoch": 0.07168949771689498,
"grad_norm": 0.02449788525700569,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 314
},
{
"epoch": 0.07191780821917808,
"grad_norm": 0.0258337315171957,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 315
},
{
"epoch": 0.07214611872146119,
"grad_norm": 0.023845955729484558,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 316
},
{
"epoch": 0.07237442922374429,
"grad_norm": 0.024546071887016296,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 317
},
{
"epoch": 0.07260273972602739,
"grad_norm": 0.0188372153788805,
"learning_rate": 0.001,
"loss": 0.0013,
"step": 318
},
{
"epoch": 0.0728310502283105,
"grad_norm": 0.03890606015920639,
"learning_rate": 0.001,
"loss": 0.0047,
"step": 319
},
{
"epoch": 0.0730593607305936,
"grad_norm": 0.02590329386293888,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 320
},
{
"epoch": 0.07328767123287672,
"grad_norm": 0.036657921969890594,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 321
},
{
"epoch": 0.07351598173515982,
"grad_norm": 0.04023008793592453,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 322
},
{
"epoch": 0.07374429223744292,
"grad_norm": 0.025426125153899193,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 323
},
{
"epoch": 0.07397260273972603,
"grad_norm": 0.02883792109787464,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 324
},
{
"epoch": 0.07420091324200913,
"grad_norm": 0.02551659569144249,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 325
},
{
"epoch": 0.07442922374429224,
"grad_norm": 0.023540591821074486,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 326
},
{
"epoch": 0.07465753424657534,
"grad_norm": 0.02690877579152584,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 327
},
{
"epoch": 0.07488584474885844,
"grad_norm": 0.020135624334216118,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 328
},
{
"epoch": 0.07511415525114155,
"grad_norm": 0.026753783226013184,
"learning_rate": 0.001,
"loss": 0.003,
"step": 329
},
{
"epoch": 0.07534246575342465,
"grad_norm": 0.0383230559527874,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 330
},
{
"epoch": 0.07557077625570777,
"grad_norm": 0.03493601456284523,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 331
},
{
"epoch": 0.07579908675799087,
"grad_norm": 0.02847091108560562,
"learning_rate": 0.001,
"loss": 0.0036,
"step": 332
},
{
"epoch": 0.07602739726027398,
"grad_norm": 0.023921307176351547,
"learning_rate": 0.001,
"loss": 0.0029,
"step": 333
},
{
"epoch": 0.07625570776255708,
"grad_norm": 0.03113155998289585,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 334
},
{
"epoch": 0.07648401826484018,
"grad_norm": 0.024777159094810486,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 335
},
{
"epoch": 0.07671232876712329,
"grad_norm": 0.02515614964067936,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 336
},
{
"epoch": 0.07694063926940639,
"grad_norm": 0.023284632712602615,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 337
},
{
"epoch": 0.0771689497716895,
"grad_norm": 0.023549994453787804,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 338
},
{
"epoch": 0.0773972602739726,
"grad_norm": 0.026529377326369286,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 339
},
{
"epoch": 0.0776255707762557,
"grad_norm": 0.02118872106075287,
"learning_rate": 0.001,
"loss": 0.002,
"step": 340
},
{
"epoch": 0.07785388127853882,
"grad_norm": 0.0226143728941679,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 341
},
{
"epoch": 0.07808219178082192,
"grad_norm": 0.022813035175204277,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 342
},
{
"epoch": 0.07831050228310503,
"grad_norm": 0.019757017493247986,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 343
},
{
"epoch": 0.07853881278538813,
"grad_norm": 0.02227397821843624,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 344
},
{
"epoch": 0.07876712328767123,
"grad_norm": 0.022303001955151558,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 345
},
{
"epoch": 0.07899543378995434,
"grad_norm": 0.025369267910718918,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 346
},
{
"epoch": 0.07922374429223744,
"grad_norm": 0.022909611463546753,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 347
},
{
"epoch": 0.07945205479452055,
"grad_norm": 0.02747984044253826,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 348
},
{
"epoch": 0.07968036529680365,
"grad_norm": 0.028999097645282745,
"learning_rate": 0.001,
"loss": 0.003,
"step": 349
},
{
"epoch": 0.07990867579908675,
"grad_norm": 0.013709438033401966,
"learning_rate": 0.001,
"loss": 0.001,
"step": 350
},
{
"epoch": 0.08013698630136987,
"grad_norm": 0.03311995416879654,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 351
},
{
"epoch": 0.08036529680365297,
"grad_norm": 0.030428579077124596,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 352
},
{
"epoch": 0.08059360730593608,
"grad_norm": 0.02569733001291752,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 353
},
{
"epoch": 0.08082191780821918,
"grad_norm": 0.03375837951898575,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 354
},
{
"epoch": 0.08105022831050228,
"grad_norm": 0.02408471703529358,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 355
},
{
"epoch": 0.08127853881278539,
"grad_norm": 0.025053909048438072,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 356
},
{
"epoch": 0.08150684931506849,
"grad_norm": 0.03166033327579498,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 357
},
{
"epoch": 0.0817351598173516,
"grad_norm": 0.023597661405801773,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 358
},
{
"epoch": 0.0819634703196347,
"grad_norm": 0.02543063834309578,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 359
},
{
"epoch": 0.0821917808219178,
"grad_norm": 0.024594414979219437,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 360
},
{
"epoch": 0.08242009132420092,
"grad_norm": 0.026880159974098206,
"learning_rate": 0.001,
"loss": 0.0032,
"step": 361
},
{
"epoch": 0.08264840182648402,
"grad_norm": 0.0315290130674839,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 362
},
{
"epoch": 0.08287671232876713,
"grad_norm": 0.027256738394498825,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 363
},
{
"epoch": 0.08310502283105023,
"grad_norm": 0.022752612829208374,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 364
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.013999447226524353,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 365
},
{
"epoch": 0.08356164383561644,
"grad_norm": 0.026544874534010887,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 366
},
{
"epoch": 0.08378995433789954,
"grad_norm": 0.018856002017855644,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 367
},
{
"epoch": 0.08401826484018265,
"grad_norm": 0.04184157773852348,
"learning_rate": 0.001,
"loss": 0.0043,
"step": 368
},
{
"epoch": 0.08424657534246575,
"grad_norm": 0.027606133371591568,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 369
},
{
"epoch": 0.08447488584474885,
"grad_norm": 0.0274574626237154,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 370
},
{
"epoch": 0.08470319634703197,
"grad_norm": 0.029858067631721497,
"learning_rate": 0.001,
"loss": 0.0033,
"step": 371
},
{
"epoch": 0.08493150684931507,
"grad_norm": 0.026789812371134758,
"learning_rate": 0.001,
"loss": 0.0034,
"step": 372
},
{
"epoch": 0.08515981735159818,
"grad_norm": 0.029502468183636665,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 373
},
{
"epoch": 0.08538812785388128,
"grad_norm": 0.025616176426410675,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 374
},
{
"epoch": 0.08561643835616438,
"grad_norm": 0.016593433916568756,
"learning_rate": 0.001,
"loss": 0.0013,
"step": 375
},
{
"epoch": 0.08584474885844749,
"grad_norm": 0.026096921414136887,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 376
},
{
"epoch": 0.08607305936073059,
"grad_norm": 0.034800466150045395,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 377
},
{
"epoch": 0.0863013698630137,
"grad_norm": 0.025603458285331726,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 378
},
{
"epoch": 0.0865296803652968,
"grad_norm": 0.01851038821041584,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 379
},
{
"epoch": 0.0867579908675799,
"grad_norm": 0.028083520010113716,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 380
},
{
"epoch": 0.08698630136986302,
"grad_norm": 0.022135423496365547,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 381
},
{
"epoch": 0.08721461187214612,
"grad_norm": 0.02563360147178173,
"learning_rate": 0.001,
"loss": 0.003,
"step": 382
},
{
"epoch": 0.08744292237442923,
"grad_norm": 0.03189925476908684,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 383
},
{
"epoch": 0.08767123287671233,
"grad_norm": 0.026175467297434807,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 384
},
{
"epoch": 0.08789954337899543,
"grad_norm": 0.019512465223670006,
"learning_rate": 0.001,
"loss": 0.002,
"step": 385
},
{
"epoch": 0.08812785388127854,
"grad_norm": 0.013086398132145405,
"learning_rate": 0.001,
"loss": 0.0012,
"step": 386
},
{
"epoch": 0.08835616438356164,
"grad_norm": 0.018814057111740112,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 387
},
{
"epoch": 0.08858447488584476,
"grad_norm": 0.018231388181447983,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 388
},
{
"epoch": 0.08881278538812785,
"grad_norm": 0.0169826727360487,
"learning_rate": 0.001,
"loss": 0.0013,
"step": 389
},
{
"epoch": 0.08904109589041095,
"grad_norm": 0.03351948410272598,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 390
},
{
"epoch": 0.08926940639269407,
"grad_norm": 0.023230386897921562,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 391
},
{
"epoch": 0.08949771689497717,
"grad_norm": 0.02241365611553192,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 392
},
{
"epoch": 0.08972602739726028,
"grad_norm": 0.021022368222475052,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 393
},
{
"epoch": 0.08995433789954338,
"grad_norm": 0.022241264581680298,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 394
},
{
"epoch": 0.09018264840182648,
"grad_norm": 0.02163674309849739,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 395
},
{
"epoch": 0.09041095890410959,
"grad_norm": 0.020653806626796722,
"learning_rate": 0.001,
"loss": 0.0013,
"step": 396
},
{
"epoch": 0.09063926940639269,
"grad_norm": 0.020344195887446404,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 397
},
{
"epoch": 0.0908675799086758,
"grad_norm": 0.015474921092391014,
"learning_rate": 0.001,
"loss": 0.001,
"step": 398
},
{
"epoch": 0.0910958904109589,
"grad_norm": 0.017434895038604736,
"learning_rate": 0.001,
"loss": 0.0012,
"step": 399
},
{
"epoch": 0.091324200913242,
"grad_norm": 0.02458396926522255,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 400
},
{
"epoch": 0.09155251141552512,
"grad_norm": 0.03149225190281868,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 401
},
{
"epoch": 0.09178082191780822,
"grad_norm": 0.026796750724315643,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 402
},
{
"epoch": 0.09200913242009133,
"grad_norm": 0.020359905436635017,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 403
},
{
"epoch": 0.09223744292237443,
"grad_norm": 0.024055240675807,
"learning_rate": 0.001,
"loss": 0.0013,
"step": 404
},
{
"epoch": 0.09246575342465753,
"grad_norm": 0.026445262134075165,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 405
},
{
"epoch": 0.09269406392694064,
"grad_norm": 0.02413698472082615,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 406
},
{
"epoch": 0.09292237442922374,
"grad_norm": 0.024934392422437668,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 407
},
{
"epoch": 0.09315068493150686,
"grad_norm": 0.024041904136538506,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 408
},
{
"epoch": 0.09337899543378995,
"grad_norm": 0.029535695910453796,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 409
},
{
"epoch": 0.09360730593607305,
"grad_norm": 0.022993121296167374,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 410
},
{
"epoch": 0.09383561643835617,
"grad_norm": 0.018401680514216423,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 411
},
{
"epoch": 0.09406392694063927,
"grad_norm": 0.018391454592347145,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 412
},
{
"epoch": 0.09429223744292238,
"grad_norm": 0.03675055503845215,
"learning_rate": 0.001,
"loss": 0.004,
"step": 413
},
{
"epoch": 0.09452054794520548,
"grad_norm": 0.026887210085988045,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 414
},
{
"epoch": 0.09474885844748858,
"grad_norm": 0.02171693742275238,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 415
},
{
"epoch": 0.09497716894977169,
"grad_norm": 0.036046102643013,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 416
},
{
"epoch": 0.09520547945205479,
"grad_norm": 0.02878933772444725,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 417
},
{
"epoch": 0.0954337899543379,
"grad_norm": 0.017262322828173637,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 418
},
{
"epoch": 0.095662100456621,
"grad_norm": 0.028725091367959976,
"learning_rate": 0.001,
"loss": 0.0029,
"step": 419
},
{
"epoch": 0.0958904109589041,
"grad_norm": 0.03320247679948807,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 420
},
{
"epoch": 0.09611872146118722,
"grad_norm": 0.025160877034068108,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 421
},
{
"epoch": 0.09634703196347032,
"grad_norm": 0.023186132311820984,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 422
},
{
"epoch": 0.09657534246575343,
"grad_norm": 0.03161732107400894,
"learning_rate": 0.001,
"loss": 0.0036,
"step": 423
},
{
"epoch": 0.09680365296803653,
"grad_norm": 0.023892000317573547,
"learning_rate": 0.001,
"loss": 0.003,
"step": 424
},
{
"epoch": 0.09703196347031963,
"grad_norm": 0.04748233035206795,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 425
},
{
"epoch": 0.09726027397260274,
"grad_norm": 0.018185172230005264,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 426
},
{
"epoch": 0.09748858447488584,
"grad_norm": 0.024023696780204773,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 427
},
{
"epoch": 0.09771689497716896,
"grad_norm": 0.019455142319202423,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 428
},
{
"epoch": 0.09794520547945205,
"grad_norm": 0.02732614241540432,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 429
},
{
"epoch": 0.09817351598173515,
"grad_norm": 0.017890289425849915,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 430
},
{
"epoch": 0.09840182648401827,
"grad_norm": 0.028596822172403336,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 431
},
{
"epoch": 0.09863013698630137,
"grad_norm": 0.03205295652151108,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 432
},
{
"epoch": 0.09885844748858448,
"grad_norm": 0.03697388991713524,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 433
},
{
"epoch": 0.09908675799086758,
"grad_norm": 0.03635745123028755,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 434
},
{
"epoch": 0.09931506849315068,
"grad_norm": 0.023816758766770363,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 435
},
{
"epoch": 0.09954337899543379,
"grad_norm": 0.019579321146011353,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 436
},
{
"epoch": 0.09977168949771689,
"grad_norm": 0.023318948224186897,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 437
},
{
"epoch": 0.1,
"grad_norm": 0.022768663242459297,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 438
},
{
"epoch": 0.1002283105022831,
"grad_norm": 0.015700766816735268,
"learning_rate": 0.001,
"loss": 0.0014,
"step": 439
},
{
"epoch": 0.1004566210045662,
"grad_norm": 0.01778263971209526,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 440
},
{
"epoch": 0.10068493150684932,
"grad_norm": 0.028968170285224915,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 441
},
{
"epoch": 0.10091324200913242,
"grad_norm": 0.01981866918504238,
"learning_rate": 0.001,
"loss": 0.0012,
"step": 442
},
{
"epoch": 0.10114155251141553,
"grad_norm": 0.022714197635650635,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 443
},
{
"epoch": 0.10136986301369863,
"grad_norm": 0.024588901549577713,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 444
},
{
"epoch": 0.10159817351598173,
"grad_norm": 0.02210937812924385,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 445
},
{
"epoch": 0.10182648401826484,
"grad_norm": 0.015890007838606834,
"learning_rate": 0.001,
"loss": 0.001,
"step": 446
},
{
"epoch": 0.10205479452054794,
"grad_norm": 0.02576160989701748,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 447
},
{
"epoch": 0.10228310502283106,
"grad_norm": 0.025480084121227264,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 448
},
{
"epoch": 0.10251141552511416,
"grad_norm": 0.020510738715529442,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 449
},
{
"epoch": 0.10273972602739725,
"grad_norm": 0.026737291365861893,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 450
},
{
"epoch": 0.10296803652968037,
"grad_norm": 0.03111446276307106,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 451
},
{
"epoch": 0.10319634703196347,
"grad_norm": 0.029617153108119965,
"learning_rate": 0.001,
"loss": 0.0028,
"step": 452
},
{
"epoch": 0.10342465753424658,
"grad_norm": 0.033933065831661224,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 453
},
{
"epoch": 0.10365296803652968,
"grad_norm": 0.029769249260425568,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 454
},
{
"epoch": 0.10388127853881278,
"grad_norm": 0.029685623943805695,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 455
},
{
"epoch": 0.10410958904109589,
"grad_norm": 0.03061087615787983,
"learning_rate": 0.001,
"loss": 0.0034,
"step": 456
},
{
"epoch": 0.10433789954337899,
"grad_norm": 0.02060793712735176,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 457
},
{
"epoch": 0.1045662100456621,
"grad_norm": 0.02304467186331749,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 458
},
{
"epoch": 0.1047945205479452,
"grad_norm": 0.0261305570602417,
"learning_rate": 0.001,
"loss": 0.0026,
"step": 459
},
{
"epoch": 0.1050228310502283,
"grad_norm": 0.023978248238563538,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 460
},
{
"epoch": 0.10525114155251142,
"grad_norm": 0.02428649179637432,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 461
},
{
"epoch": 0.10547945205479452,
"grad_norm": 0.0215776227414608,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 462
},
{
"epoch": 0.10570776255707763,
"grad_norm": 0.020924601703882217,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 463
},
{
"epoch": 0.10593607305936073,
"grad_norm": 0.020037012174725533,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 464
},
{
"epoch": 0.10616438356164383,
"grad_norm": 0.021177353337407112,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 465
},
{
"epoch": 0.10639269406392694,
"grad_norm": 0.021240398287773132,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 466
},
{
"epoch": 0.10662100456621004,
"grad_norm": 0.022526200860738754,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 467
},
{
"epoch": 0.10684931506849316,
"grad_norm": 0.02899310737848282,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 468
},
{
"epoch": 0.10707762557077626,
"grad_norm": 0.021294210106134415,
"learning_rate": 0.001,
"loss": 0.0025,
"step": 469
},
{
"epoch": 0.10730593607305935,
"grad_norm": 0.019539158791303635,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 470
},
{
"epoch": 0.10753424657534247,
"grad_norm": 0.03813247010111809,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 471
},
{
"epoch": 0.10776255707762557,
"grad_norm": 0.027778642252087593,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 472
},
{
"epoch": 0.10799086757990868,
"grad_norm": 0.023844033479690552,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 473
},
{
"epoch": 0.10821917808219178,
"grad_norm": 0.023807501420378685,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 474
},
{
"epoch": 0.10844748858447488,
"grad_norm": 0.023057186976075172,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 475
},
{
"epoch": 0.108675799086758,
"grad_norm": 0.018374644219875336,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 476
},
{
"epoch": 0.10890410958904109,
"grad_norm": 0.022881170734763145,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 477
},
{
"epoch": 0.1091324200913242,
"grad_norm": 0.017999105155467987,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 478
},
{
"epoch": 0.1093607305936073,
"grad_norm": 0.026413699612021446,
"learning_rate": 0.001,
"loss": 0.0035,
"step": 479
},
{
"epoch": 0.1095890410958904,
"grad_norm": 0.026815691962838173,
"learning_rate": 0.001,
"loss": 0.0031,
"step": 480
},
{
"epoch": 0.10981735159817352,
"grad_norm": 0.01882576383650303,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 481
},
{
"epoch": 0.11004566210045662,
"grad_norm": 0.022626416757702827,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 482
},
{
"epoch": 0.11027397260273973,
"grad_norm": 0.0262600127607584,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 483
},
{
"epoch": 0.11050228310502283,
"grad_norm": 0.017802784219384193,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 484
},
{
"epoch": 0.11073059360730593,
"grad_norm": 0.017433062195777893,
"learning_rate": 0.001,
"loss": 0.0011,
"step": 485
},
{
"epoch": 0.11095890410958904,
"grad_norm": 0.023387275636196136,
"learning_rate": 0.001,
"loss": 0.0027,
"step": 486
},
{
"epoch": 0.11118721461187214,
"grad_norm": 0.021118011325597763,
"learning_rate": 0.001,
"loss": 0.0022,
"step": 487
},
{
"epoch": 0.11141552511415526,
"grad_norm": 0.01577088050544262,
"learning_rate": 0.001,
"loss": 0.0017,
"step": 488
},
{
"epoch": 0.11164383561643836,
"grad_norm": 0.020268132910132408,
"learning_rate": 0.001,
"loss": 0.0013,
"step": 489
},
{
"epoch": 0.11187214611872145,
"grad_norm": 0.01911369152367115,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 490
},
{
"epoch": 0.11210045662100457,
"grad_norm": 0.02497555874288082,
"learning_rate": 0.001,
"loss": 0.0024,
"step": 491
},
{
"epoch": 0.11232876712328767,
"grad_norm": 0.02308499813079834,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 492
},
{
"epoch": 0.11255707762557078,
"grad_norm": 0.01704687625169754,
"learning_rate": 0.001,
"loss": 0.0015,
"step": 493
},
{
"epoch": 0.11278538812785388,
"grad_norm": 0.01520821824669838,
"learning_rate": 0.001,
"loss": 0.0012,
"step": 494
},
{
"epoch": 0.11301369863013698,
"grad_norm": 0.021169276908040047,
"learning_rate": 0.001,
"loss": 0.0016,
"step": 495
},
{
"epoch": 0.1132420091324201,
"grad_norm": 0.02852361463010311,
"learning_rate": 0.001,
"loss": 0.0019,
"step": 496
},
{
"epoch": 0.11347031963470319,
"grad_norm": 0.02134719118475914,
"learning_rate": 0.001,
"loss": 0.0021,
"step": 497
},
{
"epoch": 0.1136986301369863,
"grad_norm": 0.02251187339425087,
"learning_rate": 0.001,
"loss": 0.0023,
"step": 498
},
{
"epoch": 0.1139269406392694,
"grad_norm": 0.01491115614771843,
"learning_rate": 0.001,
"loss": 0.0011,
"step": 499
},
{
"epoch": 0.1141552511415525,
"grad_norm": 0.02773105911910534,
"learning_rate": 0.001,
"loss": 0.0018,
"step": 500
},
{
"epoch": 0.1141552511415525,
"step": 500,
"total_flos": 1.2027275771904e+17,
"train_loss": 0.030520909884246068,
"train_runtime": 3128.3367,
"train_samples_per_second": 3.197,
"train_steps_per_second": 0.16
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2027275771904e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}