{
"best_metric": 0.723136305809021,
"best_model_checkpoint": "miner_id_24/checkpoint-900",
"epoch": 0.06728190841023855,
"eval_steps": 100,
"global_step": 966,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 6.965000870625109e-05,
"grad_norm": 1.0377936363220215,
"learning_rate": 2e-05,
"loss": 2.3142,
"step": 1
},
{
"epoch": 6.965000870625109e-05,
"eval_loss": 2.563941717147827,
"eval_runtime": 694.6699,
"eval_samples_per_second": 7.198,
"eval_steps_per_second": 1.799,
"step": 1
},
{
"epoch": 0.00013930001741250218,
"grad_norm": 1.5463413000106812,
"learning_rate": 4e-05,
"loss": 2.6219,
"step": 2
},
{
"epoch": 0.00020895002611875328,
"grad_norm": 0.9576510787010193,
"learning_rate": 6e-05,
"loss": 2.4328,
"step": 3
},
{
"epoch": 0.00027860003482500437,
"grad_norm": 0.9721766710281372,
"learning_rate": 8e-05,
"loss": 2.3784,
"step": 4
},
{
"epoch": 0.00034825004353125546,
"grad_norm": 1.0480666160583496,
"learning_rate": 0.0001,
"loss": 2.5397,
"step": 5
},
{
"epoch": 0.00041790005223750655,
"grad_norm": 8.270813941955566,
"learning_rate": 0.00012,
"loss": 2.3505,
"step": 6
},
{
"epoch": 0.00048755006094375764,
"grad_norm": 0.8568385243415833,
"learning_rate": 0.00014,
"loss": 2.2338,
"step": 7
},
{
"epoch": 0.0005572000696500087,
"grad_norm": 0.7186582088470459,
"learning_rate": 0.00016,
"loss": 2.1361,
"step": 8
},
{
"epoch": 0.0006268500783562598,
"grad_norm": 0.8668791055679321,
"learning_rate": 0.00018,
"loss": 2.1404,
"step": 9
},
{
"epoch": 0.0006965000870625109,
"grad_norm": 0.6644730567932129,
"learning_rate": 0.0002,
"loss": 1.9522,
"step": 10
},
{
"epoch": 0.000766150095768762,
"grad_norm": 0.7633213996887207,
"learning_rate": 0.00019999946004996418,
"loss": 1.9414,
"step": 11
},
{
"epoch": 0.0008358001044750131,
"grad_norm": 0.8425551652908325,
"learning_rate": 0.00019999784020568754,
"loss": 1.7541,
"step": 12
},
{
"epoch": 0.0009054501131812642,
"grad_norm": 0.7430157661437988,
"learning_rate": 0.00019999514048466284,
"loss": 2.0221,
"step": 13
},
{
"epoch": 0.0009751001218875153,
"grad_norm": 0.6478707194328308,
"learning_rate": 0.00019999136091604434,
"loss": 2.061,
"step": 14
},
{
"epoch": 0.0010447501305937664,
"grad_norm": 0.6233001351356506,
"learning_rate": 0.00019998650154064764,
"loss": 1.7651,
"step": 15
},
{
"epoch": 0.0011144001393000175,
"grad_norm": 0.5294800400733948,
"learning_rate": 0.0001999805624109491,
"loss": 1.8399,
"step": 16
},
{
"epoch": 0.0011840501480062686,
"grad_norm": 0.5066989660263062,
"learning_rate": 0.0001999735435910854,
"loss": 1.9775,
"step": 17
},
{
"epoch": 0.0012537001567125197,
"grad_norm": 0.5490643978118896,
"learning_rate": 0.00019996544515685281,
"loss": 1.7321,
"step": 18
},
{
"epoch": 0.0013233501654187707,
"grad_norm": 0.8576249480247498,
"learning_rate": 0.00019995626719570626,
"loss": 1.7238,
"step": 19
},
{
"epoch": 0.0013930001741250218,
"grad_norm": 0.6412836313247681,
"learning_rate": 0.00019994600980675862,
"loss": 1.9291,
"step": 20
},
{
"epoch": 0.001462650182831273,
"grad_norm": 0.5579408407211304,
"learning_rate": 0.0001999346731007794,
"loss": 1.7642,
"step": 21
},
{
"epoch": 0.001532300191537524,
"grad_norm": 0.5578712224960327,
"learning_rate": 0.00019992225720019376,
"loss": 1.7988,
"step": 22
},
{
"epoch": 0.0016019502002437751,
"grad_norm": 0.5795004963874817,
"learning_rate": 0.00019990876223908093,
"loss": 1.8818,
"step": 23
},
{
"epoch": 0.0016716002089500262,
"grad_norm": 0.48596304655075073,
"learning_rate": 0.00019989418836317304,
"loss": 1.7715,
"step": 24
},
{
"epoch": 0.0017412502176562773,
"grad_norm": 0.672593355178833,
"learning_rate": 0.00019987853572985342,
"loss": 1.6647,
"step": 25
},
{
"epoch": 0.0018109002263625284,
"grad_norm": 0.6527593731880188,
"learning_rate": 0.00019986180450815485,
"loss": 1.6806,
"step": 26
},
{
"epoch": 0.0018805502350687795,
"grad_norm": 0.6159369945526123,
"learning_rate": 0.00019984399487875778,
"loss": 1.6252,
"step": 27
},
{
"epoch": 0.0019502002437750306,
"grad_norm": 0.6747246384620667,
"learning_rate": 0.00019982510703398843,
"loss": 1.6697,
"step": 28
},
{
"epoch": 0.0020198502524812817,
"grad_norm": 0.6250666975975037,
"learning_rate": 0.00019980514117781667,
"loss": 1.7791,
"step": 29
},
{
"epoch": 0.0020895002611875328,
"grad_norm": 0.5189153552055359,
"learning_rate": 0.00019978409752585376,
"loss": 1.6126,
"step": 30
},
{
"epoch": 0.002159150269893784,
"grad_norm": 0.6016886234283447,
"learning_rate": 0.00019976197630535014,
"loss": 1.8378,
"step": 31
},
{
"epoch": 0.002228800278600035,
"grad_norm": 0.6658028364181519,
"learning_rate": 0.00019973877775519285,
"loss": 1.5398,
"step": 32
},
{
"epoch": 0.002298450287306286,
"grad_norm": 0.6278268098831177,
"learning_rate": 0.0001997145021259031,
"loss": 1.6046,
"step": 33
},
{
"epoch": 0.002368100296012537,
"grad_norm": 0.6449319124221802,
"learning_rate": 0.00019968914967963337,
"loss": 1.6949,
"step": 34
},
{
"epoch": 0.002437750304718788,
"grad_norm": 0.5705320239067078,
"learning_rate": 0.0001996627206901648,
"loss": 1.699,
"step": 35
},
{
"epoch": 0.0025074003134250393,
"grad_norm": 0.698817253112793,
"learning_rate": 0.00019963521544290403,
"loss": 1.3933,
"step": 36
},
{
"epoch": 0.0025770503221312904,
"grad_norm": 0.6723275780677795,
"learning_rate": 0.00019960663423488026,
"loss": 1.3995,
"step": 37
},
{
"epoch": 0.0026467003308375415,
"grad_norm": 0.6986438632011414,
"learning_rate": 0.00019957697737474196,
"loss": 1.5379,
"step": 38
},
{
"epoch": 0.0027163503395437926,
"grad_norm": 0.7800816297531128,
"learning_rate": 0.0001995462451827536,
"loss": 1.5991,
"step": 39
},
{
"epoch": 0.0027860003482500437,
"grad_norm": 0.7049386501312256,
"learning_rate": 0.00019951443799079215,
"loss": 1.4532,
"step": 40
},
{
"epoch": 0.0028556503569562948,
"grad_norm": 0.7777565717697144,
"learning_rate": 0.0001994815561423435,
"loss": 1.8033,
"step": 41
},
{
"epoch": 0.002925300365662546,
"grad_norm": 0.7464177012443542,
"learning_rate": 0.00019944759999249872,
"loss": 1.5926,
"step": 42
},
{
"epoch": 0.002994950374368797,
"grad_norm": 0.5269952416419983,
"learning_rate": 0.0001994125699079503,
"loss": 1.7358,
"step": 43
},
{
"epoch": 0.003064600383075048,
"grad_norm": 0.6710164546966553,
"learning_rate": 0.00019937646626698823,
"loss": 1.3895,
"step": 44
},
{
"epoch": 0.003134250391781299,
"grad_norm": 0.6867531538009644,
"learning_rate": 0.00019933928945949564,
"loss": 1.3977,
"step": 45
},
{
"epoch": 0.0032039004004875502,
"grad_norm": 0.6760386824607849,
"learning_rate": 0.000199301039886945,
"loss": 1.5334,
"step": 46
},
{
"epoch": 0.0032735504091938013,
"grad_norm": 0.8017314076423645,
"learning_rate": 0.0001992617179623934,
"loss": 1.3485,
"step": 47
},
{
"epoch": 0.0033432004179000524,
"grad_norm": 0.8706843256950378,
"learning_rate": 0.00019922132411047833,
"loss": 1.6267,
"step": 48
},
{
"epoch": 0.0034128504266063035,
"grad_norm": 0.8783407807350159,
"learning_rate": 0.0001991798587674131,
"loss": 1.5161,
"step": 49
},
{
"epoch": 0.0034825004353125546,
"grad_norm": 0.7308568954467773,
"learning_rate": 0.0001991373223809819,
"loss": 1.6129,
"step": 50
},
{
"epoch": 0.0035521504440188057,
"grad_norm": 0.7637537717819214,
"learning_rate": 0.00019909371541053524,
"loss": 1.5135,
"step": 51
},
{
"epoch": 0.0036218004527250568,
"grad_norm": 0.7845759987831116,
"learning_rate": 0.00019904903832698484,
"loss": 1.5176,
"step": 52
},
{
"epoch": 0.003691450461431308,
"grad_norm": 0.7081618309020996,
"learning_rate": 0.0001990032916127985,
"loss": 1.5891,
"step": 53
},
{
"epoch": 0.003761100470137559,
"grad_norm": 0.7322244048118591,
"learning_rate": 0.00019895647576199506,
"loss": 1.3892,
"step": 54
},
{
"epoch": 0.00383075047884381,
"grad_norm": 0.8247037529945374,
"learning_rate": 0.0001989085912801389,
"loss": 1.229,
"step": 55
},
{
"epoch": 0.003900400487550061,
"grad_norm": 0.7730288505554199,
"learning_rate": 0.00019885963868433463,
"loss": 1.4962,
"step": 56
},
{
"epoch": 0.003970050496256312,
"grad_norm": 0.8732311129570007,
"learning_rate": 0.00019880961850322128,
"loss": 1.507,
"step": 57
},
{
"epoch": 0.004039700504962563,
"grad_norm": 0.7709734439849854,
"learning_rate": 0.00019875853127696692,
"loss": 1.5573,
"step": 58
},
{
"epoch": 0.004109350513668814,
"grad_norm": 0.6652419567108154,
"learning_rate": 0.00019870637755726244,
"loss": 1.4967,
"step": 59
},
{
"epoch": 0.0041790005223750655,
"grad_norm": 0.7002225518226624,
"learning_rate": 0.000198653157907316,
"loss": 1.6385,
"step": 60
},
{
"epoch": 0.004248650531081317,
"grad_norm": 0.7703307867050171,
"learning_rate": 0.00019859887290184656,
"loss": 1.4653,
"step": 61
},
{
"epoch": 0.004318300539787568,
"grad_norm": 0.7544863820075989,
"learning_rate": 0.00019854352312707798,
"loss": 1.492,
"step": 62
},
{
"epoch": 0.004387950548493819,
"grad_norm": 0.8162996768951416,
"learning_rate": 0.00019848710918073247,
"loss": 1.1976,
"step": 63
},
{
"epoch": 0.00445760055720007,
"grad_norm": 0.5825150012969971,
"learning_rate": 0.00019842963167202433,
"loss": 1.4162,
"step": 64
},
{
"epoch": 0.004527250565906321,
"grad_norm": 0.6794354319572449,
"learning_rate": 0.00019837109122165317,
"loss": 1.4261,
"step": 65
},
{
"epoch": 0.004596900574612572,
"grad_norm": 0.724295437335968,
"learning_rate": 0.0001983114884617974,
"loss": 1.4105,
"step": 66
},
{
"epoch": 0.004666550583318823,
"grad_norm": 0.8312812447547913,
"learning_rate": 0.00019825082403610725,
"loss": 1.4328,
"step": 67
},
{
"epoch": 0.004736200592025074,
"grad_norm": 0.7422550320625305,
"learning_rate": 0.0001981890985996979,
"loss": 1.4478,
"step": 68
},
{
"epoch": 0.004805850600731325,
"grad_norm": 0.8899093866348267,
"learning_rate": 0.00019812631281914233,
"loss": 1.1302,
"step": 69
},
{
"epoch": 0.004875500609437576,
"grad_norm": 0.838991105556488,
"learning_rate": 0.0001980624673724643,
"loss": 1.5665,
"step": 70
},
{
"epoch": 0.0049451506181438275,
"grad_norm": 0.7630224823951721,
"learning_rate": 0.0001979975629491308,
"loss": 1.3839,
"step": 71
},
{
"epoch": 0.005014800626850079,
"grad_norm": 0.7271626591682434,
"learning_rate": 0.00019793160025004475,
"loss": 1.1867,
"step": 72
},
{
"epoch": 0.00508445063555633,
"grad_norm": 0.6928589344024658,
"learning_rate": 0.00019786457998753737,
"loss": 1.6149,
"step": 73
},
{
"epoch": 0.005154100644262581,
"grad_norm": 0.8479191660881042,
"learning_rate": 0.00019779650288536058,
"loss": 1.2566,
"step": 74
},
{
"epoch": 0.005223750652968832,
"grad_norm": 0.7954538464546204,
"learning_rate": 0.000197727369678679,
"loss": 1.1289,
"step": 75
},
{
"epoch": 0.005293400661675083,
"grad_norm": 0.8336564302444458,
"learning_rate": 0.00019765718111406218,
"loss": 1.227,
"step": 76
},
{
"epoch": 0.005363050670381334,
"grad_norm": 0.7440236806869507,
"learning_rate": 0.00019758593794947648,
"loss": 1.4401,
"step": 77
},
{
"epoch": 0.005432700679087585,
"grad_norm": 0.5975192785263062,
"learning_rate": 0.00019751364095427692,
"loss": 1.4655,
"step": 78
},
{
"epoch": 0.005502350687793836,
"grad_norm": 0.7023612260818481,
"learning_rate": 0.0001974402909091988,
"loss": 1.5098,
"step": 79
},
{
"epoch": 0.005572000696500087,
"grad_norm": 0.6060627698898315,
"learning_rate": 0.00019736588860634925,
"loss": 1.4346,
"step": 80
},
{
"epoch": 0.005641650705206338,
"grad_norm": 0.6663565039634705,
"learning_rate": 0.00019729043484919883,
"loss": 1.1718,
"step": 81
},
{
"epoch": 0.0057113007139125895,
"grad_norm": 0.7931796908378601,
"learning_rate": 0.00019721393045257277,
"loss": 1.2598,
"step": 82
},
{
"epoch": 0.005780950722618841,
"grad_norm": 0.8470779061317444,
"learning_rate": 0.000197136376242642,
"loss": 1.0741,
"step": 83
},
{
"epoch": 0.005850600731325092,
"grad_norm": 0.6171009540557861,
"learning_rate": 0.00019705777305691456,
"loss": 1.4427,
"step": 84
},
{
"epoch": 0.005920250740031343,
"grad_norm": 0.8087684512138367,
"learning_rate": 0.00019697812174422632,
"loss": 1.4372,
"step": 85
},
{
"epoch": 0.005989900748737594,
"grad_norm": 0.6931564211845398,
"learning_rate": 0.00019689742316473182,
"loss": 1.1907,
"step": 86
},
{
"epoch": 0.006059550757443845,
"grad_norm": 0.6536969542503357,
"learning_rate": 0.00019681567818989506,
"loss": 1.3734,
"step": 87
},
{
"epoch": 0.006129200766150096,
"grad_norm": 0.6640751361846924,
"learning_rate": 0.00019673288770248013,
"loss": 1.4367,
"step": 88
},
{
"epoch": 0.006198850774856347,
"grad_norm": 0.564035177230835,
"learning_rate": 0.00019664905259654156,
"loss": 1.4644,
"step": 89
},
{
"epoch": 0.006268500783562598,
"grad_norm": 0.8123689889907837,
"learning_rate": 0.0001965641737774147,
"loss": 1.5373,
"step": 90
},
{
"epoch": 0.006338150792268849,
"grad_norm": 0.7990655899047852,
"learning_rate": 0.00019647825216170597,
"loss": 1.4824,
"step": 91
},
{
"epoch": 0.0064078008009751004,
"grad_norm": 0.7879489660263062,
"learning_rate": 0.00019639128867728298,
"loss": 1.3882,
"step": 92
},
{
"epoch": 0.0064774508096813515,
"grad_norm": 0.7157430648803711,
"learning_rate": 0.00019630328426326448,
"loss": 1.5377,
"step": 93
},
{
"epoch": 0.006547100818387603,
"grad_norm": 0.7268451452255249,
"learning_rate": 0.00019621423987001014,
"loss": 1.2801,
"step": 94
},
{
"epoch": 0.006616750827093854,
"grad_norm": 0.8534408807754517,
"learning_rate": 0.00019612415645911047,
"loss": 1.2232,
"step": 95
},
{
"epoch": 0.006686400835800105,
"grad_norm": 0.7566258907318115,
"learning_rate": 0.00019603303500337628,
"loss": 1.0665,
"step": 96
},
{
"epoch": 0.006756050844506356,
"grad_norm": 0.764929473400116,
"learning_rate": 0.00019594087648682824,
"loss": 1.0974,
"step": 97
},
{
"epoch": 0.006825700853212607,
"grad_norm": 0.8856674432754517,
"learning_rate": 0.00019584768190468625,
"loss": 1.2374,
"step": 98
},
{
"epoch": 0.006895350861918858,
"grad_norm": 0.8611932396888733,
"learning_rate": 0.0001957534522633586,
"loss": 1.5207,
"step": 99
},
{
"epoch": 0.006965000870625109,
"grad_norm": 0.7111679315567017,
"learning_rate": 0.00019565818858043136,
"loss": 1.3399,
"step": 100
},
{
"epoch": 0.006965000870625109,
"eval_loss": 1.2061141729354858,
"eval_runtime": 699.7303,
"eval_samples_per_second": 7.146,
"eval_steps_per_second": 1.786,
"step": 100
},
{
"epoch": 0.00703465087933136,
"grad_norm": 0.7039173245429993,
"learning_rate": 0.00019556189188465702,
"loss": 1.4391,
"step": 101
},
{
"epoch": 0.007104300888037611,
"grad_norm": 0.8350788354873657,
"learning_rate": 0.00019546456321594376,
"loss": 1.1431,
"step": 102
},
{
"epoch": 0.0071739508967438624,
"grad_norm": 0.6535744667053223,
"learning_rate": 0.0001953662036253438,
"loss": 1.296,
"step": 103
},
{
"epoch": 0.0072436009054501135,
"grad_norm": 0.7496301531791687,
"learning_rate": 0.00019526681417504258,
"loss": 1.311,
"step": 104
},
{
"epoch": 0.007313250914156365,
"grad_norm": 0.7061691880226135,
"learning_rate": 0.0001951663959383468,
"loss": 1.3601,
"step": 105
},
{
"epoch": 0.007382900922862616,
"grad_norm": 0.8221380114555359,
"learning_rate": 0.00019506494999967298,
"loss": 1.3149,
"step": 106
},
{
"epoch": 0.007452550931568867,
"grad_norm": 0.9544386267662048,
"learning_rate": 0.000194962477454536,
"loss": 1.2967,
"step": 107
},
{
"epoch": 0.007522200940275118,
"grad_norm": 0.8127594590187073,
"learning_rate": 0.00019485897940953688,
"loss": 1.4015,
"step": 108
},
{
"epoch": 0.007591850948981369,
"grad_norm": 0.7376645803451538,
"learning_rate": 0.0001947544569823511,
"loss": 1.4958,
"step": 109
},
{
"epoch": 0.00766150095768762,
"grad_norm": 0.6602767705917358,
"learning_rate": 0.00019464891130171647,
"loss": 1.3593,
"step": 110
},
{
"epoch": 0.007731150966393871,
"grad_norm": 0.9318028092384338,
"learning_rate": 0.0001945423435074208,
"loss": 1.0125,
"step": 111
},
{
"epoch": 0.007800800975100122,
"grad_norm": 0.7048940062522888,
"learning_rate": 0.00019443475475028983,
"loss": 1.4342,
"step": 112
},
{
"epoch": 0.007870450983806372,
"grad_norm": 0.9778817892074585,
"learning_rate": 0.00019432614619217459,
"loss": 1.0368,
"step": 113
},
{
"epoch": 0.007940100992512624,
"grad_norm": 0.808047890663147,
"learning_rate": 0.000194216519005939,
"loss": 1.105,
"step": 114
},
{
"epoch": 0.008009751001218875,
"grad_norm": 0.7996501326560974,
"learning_rate": 0.0001941058743754471,
"loss": 1.1383,
"step": 115
},
{
"epoch": 0.008079401009925127,
"grad_norm": 1.0752230882644653,
"learning_rate": 0.00019399421349555035,
"loss": 1.3508,
"step": 116
},
{
"epoch": 0.008149051018631377,
"grad_norm": 0.7151166200637817,
"learning_rate": 0.00019388153757207471,
"loss": 1.4086,
"step": 117
},
{
"epoch": 0.008218701027337629,
"grad_norm": 0.7622511386871338,
"learning_rate": 0.00019376784782180746,
"loss": 1.1942,
"step": 118
},
{
"epoch": 0.008288351036043879,
"grad_norm": 0.6896407008171082,
"learning_rate": 0.0001936531454724844,
"loss": 1.2571,
"step": 119
},
{
"epoch": 0.008358001044750131,
"grad_norm": 0.7991106510162354,
"learning_rate": 0.00019353743176277622,
"loss": 1.2531,
"step": 120
},
{
"epoch": 0.008427651053456381,
"grad_norm": 0.8540248870849609,
"learning_rate": 0.00019342070794227536,
"loss": 1.223,
"step": 121
},
{
"epoch": 0.008497301062162633,
"grad_norm": 0.8329891562461853,
"learning_rate": 0.00019330297527148246,
"loss": 0.9099,
"step": 122
},
{
"epoch": 0.008566951070868883,
"grad_norm": 0.7838830351829529,
"learning_rate": 0.00019318423502179272,
"loss": 1.3098,
"step": 123
},
{
"epoch": 0.008636601079575135,
"grad_norm": 0.7665576338768005,
"learning_rate": 0.00019306448847548216,
"loss": 1.3633,
"step": 124
},
{
"epoch": 0.008706251088281386,
"grad_norm": 0.7157841324806213,
"learning_rate": 0.00019294373692569383,
"loss": 0.9222,
"step": 125
},
{
"epoch": 0.008775901096987638,
"grad_norm": 0.944957971572876,
"learning_rate": 0.0001928219816764238,
"loss": 1.0901,
"step": 126
},
{
"epoch": 0.008845551105693888,
"grad_norm": 0.636736273765564,
"learning_rate": 0.0001926992240425071,
"loss": 1.3484,
"step": 127
},
{
"epoch": 0.00891520111440014,
"grad_norm": 0.6209918260574341,
"learning_rate": 0.0001925754653496035,
"loss": 1.3551,
"step": 128
},
{
"epoch": 0.00898485112310639,
"grad_norm": 0.7056594491004944,
"learning_rate": 0.00019245070693418322,
"loss": 1.4229,
"step": 129
},
{
"epoch": 0.009054501131812642,
"grad_norm": 0.7279839515686035,
"learning_rate": 0.00019232495014351246,
"loss": 1.0699,
"step": 130
},
{
"epoch": 0.009124151140518892,
"grad_norm": 0.6324151754379272,
"learning_rate": 0.00019219819633563891,
"loss": 1.3833,
"step": 131
},
{
"epoch": 0.009193801149225144,
"grad_norm": 0.7449592351913452,
"learning_rate": 0.00019207044687937703,
"loss": 1.2067,
"step": 132
},
{
"epoch": 0.009263451157931394,
"grad_norm": 0.939274787902832,
"learning_rate": 0.0001919417031542933,
"loss": 1.3229,
"step": 133
},
{
"epoch": 0.009333101166637646,
"grad_norm": 0.8192336559295654,
"learning_rate": 0.00019181196655069127,
"loss": 1.1575,
"step": 134
},
{
"epoch": 0.009402751175343897,
"grad_norm": 0.7507984638214111,
"learning_rate": 0.00019168123846959666,
"loss": 1.0461,
"step": 135
},
{
"epoch": 0.009472401184050148,
"grad_norm": 0.6593666672706604,
"learning_rate": 0.00019154952032274206,
"loss": 1.3806,
"step": 136
},
{
"epoch": 0.009542051192756399,
"grad_norm": 0.6475424766540527,
"learning_rate": 0.00019141681353255184,
"loss": 0.9218,
"step": 137
},
{
"epoch": 0.00961170120146265,
"grad_norm": 0.7746126651763916,
"learning_rate": 0.00019128311953212678,
"loss": 0.8967,
"step": 138
},
{
"epoch": 0.009681351210168901,
"grad_norm": 0.7104780673980713,
"learning_rate": 0.00019114843976522842,
"loss": 1.1855,
"step": 139
},
{
"epoch": 0.009751001218875153,
"grad_norm": 0.597457230091095,
"learning_rate": 0.00019101277568626374,
"loss": 1.0809,
"step": 140
},
{
"epoch": 0.009820651227581403,
"grad_norm": 0.8071316480636597,
"learning_rate": 0.00019087612876026908,
"loss": 1.0129,
"step": 141
},
{
"epoch": 0.009890301236287655,
"grad_norm": 0.8741605877876282,
"learning_rate": 0.00019073850046289484,
"loss": 0.8784,
"step": 142
},
{
"epoch": 0.009959951244993905,
"grad_norm": 0.7503401637077332,
"learning_rate": 0.00019059989228038902,
"loss": 1.1498,
"step": 143
},
{
"epoch": 0.010029601253700157,
"grad_norm": 0.7068141102790833,
"learning_rate": 0.0001904603057095815,
"loss": 1.2644,
"step": 144
},
{
"epoch": 0.010099251262406407,
"grad_norm": 0.7954654097557068,
"learning_rate": 0.0001903197422578678,
"loss": 1.1108,
"step": 145
},
{
"epoch": 0.01016890127111266,
"grad_norm": 0.7548302412033081,
"learning_rate": 0.0001901782034431927,
"loss": 0.9177,
"step": 146
},
{
"epoch": 0.01023855127981891,
"grad_norm": 0.7617766261100769,
"learning_rate": 0.00019003569079403395,
"loss": 1.256,
"step": 147
},
{
"epoch": 0.010308201288525162,
"grad_norm": 0.7205716967582703,
"learning_rate": 0.00018989220584938573,
"loss": 1.3767,
"step": 148
},
{
"epoch": 0.010377851297231412,
"grad_norm": 0.6221201419830322,
"learning_rate": 0.00018974775015874213,
"loss": 1.3329,
"step": 149
},
{
"epoch": 0.010447501305937664,
"grad_norm": 0.565428614616394,
"learning_rate": 0.00018960232528208022,
"loss": 1.1155,
"step": 150
},
{
"epoch": 0.010517151314643914,
"grad_norm": 0.7672913074493408,
"learning_rate": 0.00018945593278984333,
"loss": 0.9654,
"step": 151
},
{
"epoch": 0.010586801323350166,
"grad_norm": 0.737074077129364,
"learning_rate": 0.00018930857426292412,
"loss": 1.0644,
"step": 152
},
{
"epoch": 0.010656451332056416,
"grad_norm": 0.6545393466949463,
"learning_rate": 0.0001891602512926474,
"loss": 1.2058,
"step": 153
},
{
"epoch": 0.010726101340762668,
"grad_norm": 0.8019453287124634,
"learning_rate": 0.00018901096548075305,
"loss": 1.3134,
"step": 154
},
{
"epoch": 0.010795751349468918,
"grad_norm": 0.8307440876960754,
"learning_rate": 0.00018886071843937866,
"loss": 1.152,
"step": 155
},
{
"epoch": 0.01086540135817517,
"grad_norm": 0.8050329089164734,
"learning_rate": 0.00018870951179104212,
"loss": 0.9473,
"step": 156
},
{
"epoch": 0.01093505136688142,
"grad_norm": 0.7510560154914856,
"learning_rate": 0.00018855734716862417,
"loss": 1.2265,
"step": 157
},
{
"epoch": 0.011004701375587672,
"grad_norm": 0.7653977274894714,
"learning_rate": 0.00018840422621535066,
"loss": 1.3356,
"step": 158
},
{
"epoch": 0.011074351384293923,
"grad_norm": 0.7661434412002563,
"learning_rate": 0.00018825015058477481,
"loss": 0.9601,
"step": 159
},
{
"epoch": 0.011144001393000175,
"grad_norm": 0.7829368114471436,
"learning_rate": 0.00018809512194075957,
"loss": 1.0675,
"step": 160
},
{
"epoch": 0.011213651401706425,
"grad_norm": 0.6673858761787415,
"learning_rate": 0.00018793914195745933,
"loss": 1.4312,
"step": 161
},
{
"epoch": 0.011283301410412677,
"grad_norm": 0.8060672879219055,
"learning_rate": 0.00018778221231930203,
"loss": 1.0241,
"step": 162
},
{
"epoch": 0.011352951419118927,
"grad_norm": 1.0137969255447388,
"learning_rate": 0.00018762433472097097,
"loss": 1.1867,
"step": 163
},
{
"epoch": 0.011422601427825179,
"grad_norm": 0.9313655495643616,
"learning_rate": 0.0001874655108673864,
"loss": 1.3046,
"step": 164
},
{
"epoch": 0.01149225143653143,
"grad_norm": 0.9493317008018494,
"learning_rate": 0.00018730574247368732,
"loss": 1.1123,
"step": 165
},
{
"epoch": 0.011561901445237681,
"grad_norm": 0.8069944977760315,
"learning_rate": 0.0001871450312652126,
"loss": 1.0592,
"step": 166
},
{
"epoch": 0.011631551453943931,
"grad_norm": 0.6559287905693054,
"learning_rate": 0.00018698337897748283,
"loss": 1.2388,
"step": 167
},
{
"epoch": 0.011701201462650183,
"grad_norm": 0.650059700012207,
"learning_rate": 0.0001868207873561811,
"loss": 0.9891,
"step": 168
},
{
"epoch": 0.011770851471356434,
"grad_norm": 0.6247674822807312,
"learning_rate": 0.00018665725815713443,
"loss": 1.2925,
"step": 169
},
{
"epoch": 0.011840501480062686,
"grad_norm": 0.7453685402870178,
"learning_rate": 0.00018649279314629483,
"loss": 1.06,
"step": 170
},
{
"epoch": 0.011910151488768936,
"grad_norm": 0.826835572719574,
"learning_rate": 0.00018632739409972003,
"loss": 0.9637,
"step": 171
},
{
"epoch": 0.011979801497475188,
"grad_norm": 0.7538785338401794,
"learning_rate": 0.00018616106280355444,
"loss": 1.0126,
"step": 172
},
{
"epoch": 0.012049451506181438,
"grad_norm": 0.8348299264907837,
"learning_rate": 0.00018599380105400982,
"loss": 0.988,
"step": 173
},
{
"epoch": 0.01211910151488769,
"grad_norm": 0.8298357725143433,
"learning_rate": 0.00018582561065734604,
"loss": 1.0608,
"step": 174
},
{
"epoch": 0.01218875152359394,
"grad_norm": 0.6961440443992615,
"learning_rate": 0.00018565649342985118,
"loss": 1.1564,
"step": 175
},
{
"epoch": 0.012258401532300192,
"grad_norm": 0.664256751537323,
"learning_rate": 0.00018548645119782238,
"loss": 1.1865,
"step": 176
},
{
"epoch": 0.012328051541006442,
"grad_norm": 0.7857444882392883,
"learning_rate": 0.0001853154857975458,
"loss": 0.9903,
"step": 177
},
{
"epoch": 0.012397701549712694,
"grad_norm": 0.758602499961853,
"learning_rate": 0.0001851435990752769,
"loss": 1.3456,
"step": 178
},
{
"epoch": 0.012467351558418945,
"grad_norm": 0.768666684627533,
"learning_rate": 0.0001849707928872206,
"loss": 0.9773,
"step": 179
},
{
"epoch": 0.012537001567125197,
"grad_norm": 0.8674852848052979,
"learning_rate": 0.00018479706909951094,
"loss": 1.0203,
"step": 180
},
{
"epoch": 0.012606651575831447,
"grad_norm": 0.6384921669960022,
"learning_rate": 0.0001846224295881913,
"loss": 1.1004,
"step": 181
},
{
"epoch": 0.012676301584537699,
"grad_norm": 0.6848528981208801,
"learning_rate": 0.00018444687623919386,
"loss": 1.0699,
"step": 182
},
{
"epoch": 0.012745951593243949,
"grad_norm": 0.6943731307983398,
"learning_rate": 0.00018427041094831937,
"loss": 1.1812,
"step": 183
},
{
"epoch": 0.012815601601950201,
"grad_norm": 1.0284762382507324,
"learning_rate": 0.00018409303562121662,
"loss": 1.1307,
"step": 184
},
{
"epoch": 0.012885251610656451,
"grad_norm": 0.7977420091629028,
"learning_rate": 0.00018391475217336193,
"loss": 1.0772,
"step": 185
},
{
"epoch": 0.012954901619362703,
"grad_norm": 0.678799569606781,
"learning_rate": 0.0001837355625300383,
"loss": 1.1816,
"step": 186
},
{
"epoch": 0.013024551628068953,
"grad_norm": 0.7933035492897034,
"learning_rate": 0.00018355546862631493,
"loss": 1.2014,
"step": 187
},
{
"epoch": 0.013094201636775205,
"grad_norm": 0.7373278737068176,
"learning_rate": 0.00018337447240702594,
"loss": 0.9163,
"step": 188
},
{
"epoch": 0.013163851645481455,
"grad_norm": 0.7306934595108032,
"learning_rate": 0.00018319257582674964,
"loss": 0.8467,
"step": 189
},
{
"epoch": 0.013233501654187707,
"grad_norm": 0.6722437739372253,
"learning_rate": 0.00018300978084978735,
"loss": 1.1145,
"step": 190
},
{
"epoch": 0.013303151662893958,
"grad_norm": 0.8375574350357056,
"learning_rate": 0.00018282608945014217,
"loss": 0.8763,
"step": 191
},
{
"epoch": 0.01337280167160021,
"grad_norm": 0.6876571774482727,
"learning_rate": 0.0001826415036114976,
"loss": 1.3694,
"step": 192
},
{
"epoch": 0.01344245168030646,
"grad_norm": 0.5936222076416016,
"learning_rate": 0.0001824560253271963,
"loss": 1.4071,
"step": 193
},
{
"epoch": 0.013512101689012712,
"grad_norm": 0.6679614782333374,
"learning_rate": 0.00018226965660021836,
"loss": 0.8098,
"step": 194
},
{
"epoch": 0.013581751697718962,
"grad_norm": 0.8226193189620972,
"learning_rate": 0.00018208239944315978,
"loss": 0.6594,
"step": 195
},
{
"epoch": 0.013651401706425214,
"grad_norm": 0.8376763463020325,
"learning_rate": 0.0001818942558782108,
"loss": 1.0417,
"step": 196
},
{
"epoch": 0.013721051715131464,
"grad_norm": 0.773747444152832,
"learning_rate": 0.00018170522793713387,
"loss": 0.7496,
"step": 197
},
{
"epoch": 0.013790701723837716,
"grad_norm": 0.8213014006614685,
"learning_rate": 0.00018151531766124186,
"loss": 0.842,
"step": 198
},
{
"epoch": 0.013860351732543966,
"grad_norm": 0.6993326544761658,
"learning_rate": 0.000181324527101376,
"loss": 1.1651,
"step": 199
},
{
"epoch": 0.013930001741250218,
"grad_norm": 0.550957977771759,
"learning_rate": 0.00018113285831788365,
"loss": 1.2762,
"step": 200
},
{
"epoch": 0.013930001741250218,
"eval_loss": 0.993212103843689,
"eval_runtime": 699.7494,
"eval_samples_per_second": 7.145,
"eval_steps_per_second": 1.786,
"step": 200
},
{
"epoch": 0.013999651749956469,
"grad_norm": 0.6803005933761597,
"learning_rate": 0.00018094031338059617,
"loss": 1.2403,
"step": 201
},
{
"epoch": 0.01406930175866272,
"grad_norm": 0.6137078404426575,
"learning_rate": 0.00018074689436880644,
"loss": 0.9294,
"step": 202
},
{
"epoch": 0.01413895176736897,
"grad_norm": 0.6511885523796082,
"learning_rate": 0.00018055260337124652,
"loss": 1.2509,
"step": 203
},
{
"epoch": 0.014208601776075223,
"grad_norm": 0.6647017598152161,
"learning_rate": 0.0001803574424860651,
"loss": 1.1067,
"step": 204
},
{
"epoch": 0.014278251784781473,
"grad_norm": 0.7390187382698059,
"learning_rate": 0.0001801614138208046,
"loss": 1.0816,
"step": 205
},
{
"epoch": 0.014347901793487725,
"grad_norm": 0.7152518033981323,
"learning_rate": 0.0001799645194923788,
"loss": 0.9844,
"step": 206
},
{
"epoch": 0.014417551802193975,
"grad_norm": 0.8229650855064392,
"learning_rate": 0.00017976676162704966,
"loss": 1.1316,
"step": 207
},
{
"epoch": 0.014487201810900227,
"grad_norm": 0.7085878252983093,
"learning_rate": 0.0001795681423604045,
"loss": 0.9282,
"step": 208
},
{
"epoch": 0.014556851819606477,
"grad_norm": 0.8368147015571594,
"learning_rate": 0.00017936866383733298,
"loss": 0.8718,
"step": 209
},
{
"epoch": 0.01462650182831273,
"grad_norm": 0.7303407192230225,
"learning_rate": 0.00017916832821200375,
"loss": 0.8913,
"step": 210
},
{
"epoch": 0.01469615183701898,
"grad_norm": 0.6697463989257812,
"learning_rate": 0.00017896713764784143,
"loss": 1.0783,
"step": 211
},
{
"epoch": 0.014765801845725231,
"grad_norm": 0.5616613030433655,
"learning_rate": 0.000178765094317503,
"loss": 1.2869,
"step": 212
},
{
"epoch": 0.014835451854431482,
"grad_norm": 0.5711467266082764,
"learning_rate": 0.00017856220040285458,
"loss": 1.0144,
"step": 213
},
{
"epoch": 0.014905101863137734,
"grad_norm": 0.7759966850280762,
"learning_rate": 0.00017835845809494768,
"loss": 1.117,
"step": 214
},
{
"epoch": 0.014974751871843984,
"grad_norm": 0.5759698152542114,
"learning_rate": 0.00017815386959399565,
"loss": 1.1662,
"step": 215
},
{
"epoch": 0.015044401880550236,
"grad_norm": 0.6275411248207092,
"learning_rate": 0.0001779484371093498,
"loss": 1.2339,
"step": 216
},
{
"epoch": 0.015114051889256486,
"grad_norm": 0.803784191608429,
"learning_rate": 0.00017774216285947576,
"loss": 0.8127,
"step": 217
},
{
"epoch": 0.015183701897962738,
"grad_norm": 0.7878329157829285,
"learning_rate": 0.00017753504907192923,
"loss": 0.7944,
"step": 218
},
{
"epoch": 0.015253351906668988,
"grad_norm": 0.753667950630188,
"learning_rate": 0.00017732709798333221,
"loss": 1.2632,
"step": 219
},
{
"epoch": 0.01532300191537524,
"grad_norm": 0.6178960204124451,
"learning_rate": 0.0001771183118393486,
"loss": 0.9552,
"step": 220
},
{
"epoch": 0.01539265192408149,
"grad_norm": 0.6457561254501343,
"learning_rate": 0.00017690869289466017,
"loss": 0.9573,
"step": 221
},
{
"epoch": 0.015462301932787742,
"grad_norm": 0.7319156527519226,
"learning_rate": 0.00017669824341294202,
"loss": 0.8473,
"step": 222
},
{
"epoch": 0.015531951941493993,
"grad_norm": 0.6461290717124939,
"learning_rate": 0.00017648696566683824,
"loss": 1.0797,
"step": 223
},
{
"epoch": 0.015601601950200245,
"grad_norm": 0.7656479477882385,
"learning_rate": 0.00017627486193793742,
"loss": 0.9595,
"step": 224
},
{
"epoch": 0.015671251958906496,
"grad_norm": 0.7314528226852417,
"learning_rate": 0.00017606193451674785,
"loss": 1.1522,
"step": 225
},
{
"epoch": 0.015740901967612745,
"grad_norm": 0.5844183564186096,
"learning_rate": 0.00017584818570267284,
"loss": 0.6874,
"step": 226
},
{
"epoch": 0.015810551976318997,
"grad_norm": 0.756650447845459,
"learning_rate": 0.00017563361780398613,
"loss": 1.1152,
"step": 227
},
{
"epoch": 0.01588020198502525,
"grad_norm": 0.7920497059822083,
"learning_rate": 0.00017541823313780647,
"loss": 0.7904,
"step": 228
},
{
"epoch": 0.0159498519937315,
"grad_norm": 0.7280418872833252,
"learning_rate": 0.00017520203403007312,
"loss": 0.9489,
"step": 229
},
{
"epoch": 0.01601950200243775,
"grad_norm": 0.6644127368927002,
"learning_rate": 0.0001749850228155203,
"loss": 1.0123,
"step": 230
},
{
"epoch": 0.016089152011144,
"grad_norm": 0.6218852996826172,
"learning_rate": 0.0001747672018376524,
"loss": 1.1297,
"step": 231
},
{
"epoch": 0.016158802019850253,
"grad_norm": 0.7259179949760437,
"learning_rate": 0.00017454857344871824,
"loss": 1.2077,
"step": 232
},
{
"epoch": 0.016228452028556505,
"grad_norm": 0.6896301507949829,
"learning_rate": 0.00017432914000968592,
"loss": 1.4735,
"step": 233
},
{
"epoch": 0.016298102037262754,
"grad_norm": 0.6918095350265503,
"learning_rate": 0.00017410890389021736,
"loss": 1.1311,
"step": 234
},
{
"epoch": 0.016367752045969006,
"grad_norm": 0.7965865731239319,
"learning_rate": 0.00017388786746864256,
"loss": 1.2436,
"step": 235
},
{
"epoch": 0.016437402054675258,
"grad_norm": 0.7081993222236633,
"learning_rate": 0.000173666033131934,
"loss": 1.0674,
"step": 236
},
{
"epoch": 0.01650705206338151,
"grad_norm": 0.6959885358810425,
"learning_rate": 0.00017344340327568082,
"loss": 1.1808,
"step": 237
},
{
"epoch": 0.016576702072087758,
"grad_norm": 0.6657646298408508,
"learning_rate": 0.000173219980304063,
"loss": 0.9132,
"step": 238
},
{
"epoch": 0.01664635208079401,
"grad_norm": 0.5461063385009766,
"learning_rate": 0.0001729957666298254,
"loss": 1.2554,
"step": 239
},
{
"epoch": 0.016716002089500262,
"grad_norm": 0.5713803768157959,
"learning_rate": 0.0001727707646742516,
"loss": 1.236,
"step": 240
},
{
"epoch": 0.016785652098206514,
"grad_norm": 0.6570878624916077,
"learning_rate": 0.00017254497686713797,
"loss": 1.1216,
"step": 241
},
{
"epoch": 0.016855302106912762,
"grad_norm": 0.7191223502159119,
"learning_rate": 0.0001723184056467671,
"loss": 1.2225,
"step": 242
},
{
"epoch": 0.016924952115619014,
"grad_norm": 0.6774346232414246,
"learning_rate": 0.0001720910534598818,
"loss": 1.4341,
"step": 243
},
{
"epoch": 0.016994602124325266,
"grad_norm": 0.7842647433280945,
"learning_rate": 0.0001718629227616585,
"loss": 1.2086,
"step": 244
},
{
"epoch": 0.01706425213303152,
"grad_norm": 0.6781778931617737,
"learning_rate": 0.00017163401601568077,
"loss": 0.9324,
"step": 245
},
{
"epoch": 0.017133902141737767,
"grad_norm": 0.7419726252555847,
"learning_rate": 0.00017140433569391275,
"loss": 0.8826,
"step": 246
},
{
"epoch": 0.01720355215044402,
"grad_norm": 0.6957391500473022,
"learning_rate": 0.00017117388427667236,
"loss": 0.5565,
"step": 247
},
{
"epoch": 0.01727320215915027,
"grad_norm": 0.6904794573783875,
"learning_rate": 0.0001709426642526046,
"loss": 1.0979,
"step": 248
},
{
"epoch": 0.017342852167856523,
"grad_norm": 0.7743323445320129,
"learning_rate": 0.00017071067811865476,
"loss": 0.6322,
"step": 249
},
{
"epoch": 0.01741250217656277,
"grad_norm": 0.6866056323051453,
"learning_rate": 0.0001704779283800412,
"loss": 0.9873,
"step": 250
},
{
"epoch": 0.017482152185269023,
"grad_norm": 0.5904546976089478,
"learning_rate": 0.00017024441755022856,
"loss": 1.0898,
"step": 251
},
{
"epoch": 0.017551802193975275,
"grad_norm": 0.6349841952323914,
"learning_rate": 0.00017001014815090038,
"loss": 1.0947,
"step": 252
},
{
"epoch": 0.017621452202681527,
"grad_norm": 0.6754809617996216,
"learning_rate": 0.0001697751227119322,
"loss": 0.9881,
"step": 253
},
{
"epoch": 0.017691102211387776,
"grad_norm": 0.6565687656402588,
"learning_rate": 0.00016953934377136377,
"loss": 1.0908,
"step": 254
},
{
"epoch": 0.017760752220094028,
"grad_norm": 0.5469555854797363,
"learning_rate": 0.0001693028138753721,
"loss": 0.8385,
"step": 255
},
{
"epoch": 0.01783040222880028,
"grad_norm": 0.6178275942802429,
"learning_rate": 0.0001690655355782437,
"loss": 0.9317,
"step": 256
},
{
"epoch": 0.017900052237506528,
"grad_norm": 0.8108107447624207,
"learning_rate": 0.0001688275114423471,
"loss": 0.8016,
"step": 257
},
{
"epoch": 0.01796970224621278,
"grad_norm": 0.6483268141746521,
"learning_rate": 0.00016858874403810509,
"loss": 1.0697,
"step": 258
},
{
"epoch": 0.018039352254919032,
"grad_norm": 0.7654364109039307,
"learning_rate": 0.00016834923594396698,
"loss": 1.1524,
"step": 259
},
{
"epoch": 0.018109002263625284,
"grad_norm": 0.6824004650115967,
"learning_rate": 0.00016810898974638097,
"loss": 1.31,
"step": 260
},
{
"epoch": 0.018178652272331532,
"grad_norm": 0.6116809248924255,
"learning_rate": 0.00016786800803976585,
"loss": 1.0788,
"step": 261
},
{
"epoch": 0.018248302281037784,
"grad_norm": 0.7678197026252747,
"learning_rate": 0.00016762629342648318,
"loss": 0.7855,
"step": 262
},
{
"epoch": 0.018317952289744036,
"grad_norm": 0.6764957904815674,
"learning_rate": 0.00016738384851680937,
"loss": 0.9709,
"step": 263
},
{
"epoch": 0.018387602298450288,
"grad_norm": 0.6751796007156372,
"learning_rate": 0.0001671406759289071,
"loss": 1.2517,
"step": 264
},
{
"epoch": 0.018457252307156537,
"grad_norm": 0.7578874230384827,
"learning_rate": 0.00016689677828879738,
"loss": 1.0033,
"step": 265
},
{
"epoch": 0.01852690231586279,
"grad_norm": 0.5653178095817566,
"learning_rate": 0.0001666521582303309,
"loss": 1.1913,
"step": 266
},
{
"epoch": 0.01859655232456904,
"grad_norm": 0.7313902974128723,
"learning_rate": 0.00016640681839515993,
"loss": 1.0418,
"step": 267
},
{
"epoch": 0.018666202333275293,
"grad_norm": 0.5821707248687744,
"learning_rate": 0.0001661607614327095,
"loss": 0.886,
"step": 268
},
{
"epoch": 0.01873585234198154,
"grad_norm": 0.6478776335716248,
"learning_rate": 0.0001659139900001489,
"loss": 1.2479,
"step": 269
},
{
"epoch": 0.018805502350687793,
"grad_norm": 0.6471793055534363,
"learning_rate": 0.00016566650676236305,
"loss": 0.9999,
"step": 270
},
{
"epoch": 0.018875152359394045,
"grad_norm": 0.6918301582336426,
"learning_rate": 0.0001654183143919236,
"loss": 0.8315,
"step": 271
},
{
"epoch": 0.018944802368100297,
"grad_norm": 0.62820965051651,
"learning_rate": 0.0001651694155690601,
"loss": 1.0534,
"step": 272
},
{
"epoch": 0.019014452376806545,
"grad_norm": 0.5358027219772339,
"learning_rate": 0.00016491981298163118,
"loss": 1.1642,
"step": 273
},
{
"epoch": 0.019084102385512797,
"grad_norm": 0.6293304562568665,
"learning_rate": 0.0001646695093250953,
"loss": 0.8443,
"step": 274
},
{
"epoch": 0.01915375239421905,
"grad_norm": 0.6544604301452637,
"learning_rate": 0.00016441850730248184,
"loss": 0.7902,
"step": 275
},
{
"epoch": 0.0192234024029253,
"grad_norm": 0.723544716835022,
"learning_rate": 0.0001641668096243619,
"loss": 0.7972,
"step": 276
},
{
"epoch": 0.01929305241163155,
"grad_norm": 0.6971920728683472,
"learning_rate": 0.00016391441900881875,
"loss": 1.0068,
"step": 277
},
{
"epoch": 0.019362702420337802,
"grad_norm": 0.6442938446998596,
"learning_rate": 0.00016366133818141893,
"loss": 0.9171,
"step": 278
},
{
"epoch": 0.019432352429044054,
"grad_norm": 0.5508981347084045,
"learning_rate": 0.00016340756987518243,
"loss": 1.2581,
"step": 279
},
{
"epoch": 0.019502002437750306,
"grad_norm": 0.6451659798622131,
"learning_rate": 0.0001631531168305534,
"loss": 0.692,
"step": 280
},
{
"epoch": 0.019571652446456554,
"grad_norm": 0.719409704208374,
"learning_rate": 0.00016289798179537046,
"loss": 1.0723,
"step": 281
},
{
"epoch": 0.019641302455162806,
"grad_norm": 0.6584640145301819,
"learning_rate": 0.00016264216752483697,
"loss": 1.0083,
"step": 282
},
{
"epoch": 0.019710952463869058,
"grad_norm": 0.6936922669410706,
"learning_rate": 0.00016238567678149147,
"loss": 1.1018,
"step": 283
},
{
"epoch": 0.01978060247257531,
"grad_norm": 0.8725325465202332,
"learning_rate": 0.00016212851233517772,
"loss": 1.0276,
"step": 284
},
{
"epoch": 0.01985025248128156,
"grad_norm": 0.6702690720558167,
"learning_rate": 0.0001618706769630147,
"loss": 1.0521,
"step": 285
},
{
"epoch": 0.01991990248998781,
"grad_norm": 0.604901909828186,
"learning_rate": 0.0001616121734493668,
"loss": 0.8782,
"step": 286
},
{
"epoch": 0.019989552498694062,
"grad_norm": 0.5754973292350769,
"learning_rate": 0.00016135300458581365,
"loss": 1.1281,
"step": 287
},
{
"epoch": 0.020059202507400314,
"grad_norm": 0.6314234137535095,
"learning_rate": 0.00016109317317111995,
"loss": 0.8964,
"step": 288
},
{
"epoch": 0.020128852516106563,
"grad_norm": 0.5530171990394592,
"learning_rate": 0.0001608326820112054,
"loss": 1.278,
"step": 289
},
{
"epoch": 0.020198502524812815,
"grad_norm": 0.7363768219947815,
"learning_rate": 0.00016057153391911422,
"loss": 1.0563,
"step": 290
},
{
"epoch": 0.020268152533519067,
"grad_norm": 0.634734570980072,
"learning_rate": 0.00016030973171498477,
"loss": 0.9834,
"step": 291
},
{
"epoch": 0.02033780254222532,
"grad_norm": 0.5349484086036682,
"learning_rate": 0.00016004727822601934,
"loss": 1.1927,
"step": 292
},
{
"epoch": 0.020407452550931567,
"grad_norm": 0.6138120889663696,
"learning_rate": 0.00015978417628645326,
"loss": 0.8267,
"step": 293
},
{
"epoch": 0.02047710255963782,
"grad_norm": 0.5792511701583862,
"learning_rate": 0.0001595204287375246,
"loss": 1.317,
"step": 294
},
{
"epoch": 0.02054675256834407,
"grad_norm": 0.648102879524231,
"learning_rate": 0.00015925603842744334,
"loss": 0.7643,
"step": 295
},
{
"epoch": 0.020616402577050323,
"grad_norm": 0.6310989856719971,
"learning_rate": 0.00015899100821136064,
"loss": 0.8994,
"step": 296
},
{
"epoch": 0.02068605258575657,
"grad_norm": 0.6773801445960999,
"learning_rate": 0.00015872534095133793,
"loss": 0.961,
"step": 297
},
{
"epoch": 0.020755702594462824,
"grad_norm": 0.6812910437583923,
"learning_rate": 0.00015845903951631623,
"loss": 0.8269,
"step": 298
},
{
"epoch": 0.020825352603169076,
"grad_norm": 0.7168356776237488,
"learning_rate": 0.00015819210678208484,
"loss": 1.2156,
"step": 299
},
{
"epoch": 0.020895002611875328,
"grad_norm": 0.6270495653152466,
"learning_rate": 0.0001579245456312506,
"loss": 1.029,
"step": 300
},
{
"epoch": 0.020895002611875328,
"eval_loss": 0.9014175534248352,
"eval_runtime": 700.0853,
"eval_samples_per_second": 7.142,
"eval_steps_per_second": 1.785,
"step": 300
},
{
"epoch": 0.020964652620581576,
"grad_norm": 0.6509414315223694,
"learning_rate": 0.00015765635895320656,
"loss": 1.1077,
"step": 301
},
{
"epoch": 0.021034302629287828,
"grad_norm": 0.7492027282714844,
"learning_rate": 0.00015738754964410084,
"loss": 0.5395,
"step": 302
},
{
"epoch": 0.02110395263799408,
"grad_norm": 0.601356029510498,
"learning_rate": 0.00015711812060680534,
"loss": 1.0082,
"step": 303
},
{
"epoch": 0.021173602646700332,
"grad_norm": 0.7457994818687439,
"learning_rate": 0.00015684807475088453,
"loss": 1.318,
"step": 304
},
{
"epoch": 0.02124325265540658,
"grad_norm": 0.7976076602935791,
"learning_rate": 0.00015657741499256367,
"loss": 0.7,
"step": 305
},
{
"epoch": 0.021312902664112832,
"grad_norm": 0.7381129264831543,
"learning_rate": 0.00015630614425469775,
"loss": 0.9987,
"step": 306
},
{
"epoch": 0.021382552672819084,
"grad_norm": 0.8430412411689758,
"learning_rate": 0.00015603426546673967,
"loss": 1.0874,
"step": 307
},
{
"epoch": 0.021452202681525336,
"grad_norm": 0.6384485363960266,
"learning_rate": 0.00015576178156470862,
"loss": 1.2032,
"step": 308
},
{
"epoch": 0.021521852690231585,
"grad_norm": 0.788506031036377,
"learning_rate": 0.0001554886954911585,
"loss": 1.3688,
"step": 309
},
{
"epoch": 0.021591502698937837,
"grad_norm": 0.6341352462768555,
"learning_rate": 0.00015521501019514597,
"loss": 1.4594,
"step": 310
},
{
"epoch": 0.02166115270764409,
"grad_norm": 0.6707578897476196,
"learning_rate": 0.00015494072863219874,
"loss": 1.1494,
"step": 311
},
{
"epoch": 0.02173080271635034,
"grad_norm": 0.609851598739624,
"learning_rate": 0.00015466585376428365,
"loss": 0.9684,
"step": 312
},
{
"epoch": 0.02180045272505659,
"grad_norm": 0.7177265882492065,
"learning_rate": 0.00015439038855977454,
"loss": 0.8522,
"step": 313
},
{
"epoch": 0.02187010273376284,
"grad_norm": 0.6207813024520874,
"learning_rate": 0.00015411433599342038,
"loss": 0.4699,
"step": 314
},
{
"epoch": 0.021939752742469093,
"grad_norm": 0.6561682820320129,
"learning_rate": 0.00015383769904631306,
"loss": 0.7518,
"step": 315
},
{
"epoch": 0.022009402751175345,
"grad_norm": 0.7517587542533875,
"learning_rate": 0.00015356048070585513,
"loss": 1.2278,
"step": 316
},
{
"epoch": 0.022079052759881593,
"grad_norm": 0.6116645932197571,
"learning_rate": 0.00015328268396572762,
"loss": 0.9742,
"step": 317
},
{
"epoch": 0.022148702768587845,
"grad_norm": 0.5882527232170105,
"learning_rate": 0.00015300431182585777,
"loss": 0.8036,
"step": 318
},
{
"epoch": 0.022218352777294097,
"grad_norm": 0.5738014578819275,
"learning_rate": 0.00015272536729238654,
"loss": 0.7848,
"step": 319
},
{
"epoch": 0.02228800278600035,
"grad_norm": 0.7317819595336914,
"learning_rate": 0.0001524458533776361,
"loss": 1.0656,
"step": 320
},
{
"epoch": 0.022357652794706598,
"grad_norm": 0.6275020837783813,
"learning_rate": 0.00015216577310007745,
"loss": 0.9123,
"step": 321
},
{
"epoch": 0.02242730280341285,
"grad_norm": 0.8332412838935852,
"learning_rate": 0.00015188512948429765,
"loss": 1.1836,
"step": 322
},
{
"epoch": 0.022496952812119102,
"grad_norm": 0.6414222121238708,
"learning_rate": 0.00015160392556096735,
"loss": 0.8959,
"step": 323
},
{
"epoch": 0.022566602820825354,
"grad_norm": 0.6147682070732117,
"learning_rate": 0.00015132216436680796,
"loss": 0.937,
"step": 324
},
{
"epoch": 0.022636252829531602,
"grad_norm": 0.5949112176895142,
"learning_rate": 0.00015103984894455878,
"loss": 1.1365,
"step": 325
},
{
"epoch": 0.022705902838237854,
"grad_norm": 0.6494925022125244,
"learning_rate": 0.00015075698234294423,
"loss": 0.9603,
"step": 326
},
{
"epoch": 0.022775552846944106,
"grad_norm": 0.6222386956214905,
"learning_rate": 0.00015047356761664098,
"loss": 1.1083,
"step": 327
},
{
"epoch": 0.022845202855650358,
"grad_norm": 0.6448621153831482,
"learning_rate": 0.00015018960782624486,
"loss": 0.8984,
"step": 328
},
{
"epoch": 0.022914852864356607,
"grad_norm": 0.7695071697235107,
"learning_rate": 0.00014990510603823782,
"loss": 0.9996,
"step": 329
},
{
"epoch": 0.02298450287306286,
"grad_norm": 0.7322002649307251,
"learning_rate": 0.00014962006532495488,
"loss": 0.9976,
"step": 330
},
{
"epoch": 0.02305415288176911,
"grad_norm": 0.5676226615905762,
"learning_rate": 0.00014933448876455096,
"loss": 1.0891,
"step": 331
},
{
"epoch": 0.023123802890475362,
"grad_norm": 0.839449405670166,
"learning_rate": 0.00014904837944096743,
"loss": 0.6213,
"step": 332
},
{
"epoch": 0.02319345289918161,
"grad_norm": 0.6786718964576721,
"learning_rate": 0.00014876174044389922,
"loss": 1.0854,
"step": 333
},
{
"epoch": 0.023263102907887863,
"grad_norm": 0.7376294732093811,
"learning_rate": 0.00014847457486876097,
"loss": 0.9289,
"step": 334
},
{
"epoch": 0.023332752916594115,
"grad_norm": 0.71031653881073,
"learning_rate": 0.00014818688581665396,
"loss": 1.0325,
"step": 335
},
{
"epoch": 0.023402402925300367,
"grad_norm": 0.6212656497955322,
"learning_rate": 0.00014789867639433248,
"loss": 1.0627,
"step": 336
},
{
"epoch": 0.023472052934006615,
"grad_norm": 0.698070228099823,
"learning_rate": 0.00014760994971417022,
"loss": 1.1891,
"step": 337
},
{
"epoch": 0.023541702942712867,
"grad_norm": 0.7134040594100952,
"learning_rate": 0.00014732070889412693,
"loss": 1.0185,
"step": 338
},
{
"epoch": 0.02361135295141912,
"grad_norm": 0.5352413058280945,
"learning_rate": 0.00014703095705771434,
"loss": 0.3684,
"step": 339
},
{
"epoch": 0.02368100296012537,
"grad_norm": 0.6988404393196106,
"learning_rate": 0.00014674069733396276,
"loss": 0.947,
"step": 340
},
{
"epoch": 0.02375065296883162,
"grad_norm": 0.7194476127624512,
"learning_rate": 0.00014644993285738717,
"loss": 0.8271,
"step": 341
},
{
"epoch": 0.02382030297753787,
"grad_norm": 0.6885733604431152,
"learning_rate": 0.00014615866676795334,
"loss": 0.7825,
"step": 342
},
{
"epoch": 0.023889952986244124,
"grad_norm": 0.6990646123886108,
"learning_rate": 0.00014586690221104397,
"loss": 0.9145,
"step": 343
},
{
"epoch": 0.023959602994950376,
"grad_norm": 0.7719680070877075,
"learning_rate": 0.00014557464233742477,
"loss": 0.5737,
"step": 344
},
{
"epoch": 0.024029253003656624,
"grad_norm": 0.7187089323997498,
"learning_rate": 0.00014528189030321029,
"loss": 0.7873,
"step": 345
},
{
"epoch": 0.024098903012362876,
"grad_norm": 0.6850745677947998,
"learning_rate": 0.00014498864926982996,
"loss": 0.9,
"step": 346
},
{
"epoch": 0.024168553021069128,
"grad_norm": 0.8452913761138916,
"learning_rate": 0.0001446949224039939,
"loss": 0.9123,
"step": 347
},
{
"epoch": 0.02423820302977538,
"grad_norm": 0.6649196147918701,
"learning_rate": 0.00014440071287765875,
"loss": 0.8189,
"step": 348
},
{
"epoch": 0.02430785303848163,
"grad_norm": 0.7251694798469543,
"learning_rate": 0.0001441060238679934,
"loss": 1.0816,
"step": 349
},
{
"epoch": 0.02437750304718788,
"grad_norm": 0.6829720139503479,
"learning_rate": 0.00014381085855734468,
"loss": 0.9725,
"step": 350
},
{
"epoch": 0.024447153055894132,
"grad_norm": 0.7007995843887329,
"learning_rate": 0.00014351522013320302,
"loss": 1.047,
"step": 351
},
{
"epoch": 0.024516803064600384,
"grad_norm": 0.7575050592422485,
"learning_rate": 0.0001432191117881679,
"loss": 0.7961,
"step": 352
},
{
"epoch": 0.024586453073306633,
"grad_norm": 0.6370393633842468,
"learning_rate": 0.0001429225367199136,
"loss": 0.9137,
"step": 353
},
{
"epoch": 0.024656103082012885,
"grad_norm": 0.6170664429664612,
"learning_rate": 0.0001426254981311545,
"loss": 0.8138,
"step": 354
},
{
"epoch": 0.024725753090719137,
"grad_norm": 0.7749223709106445,
"learning_rate": 0.00014232799922961052,
"loss": 1.1226,
"step": 355
},
{
"epoch": 0.02479540309942539,
"grad_norm": 0.6036125421524048,
"learning_rate": 0.00014203004322797252,
"loss": 1.204,
"step": 356
},
{
"epoch": 0.024865053108131637,
"grad_norm": 0.6835645437240601,
"learning_rate": 0.00014173163334386753,
"loss": 0.8434,
"step": 357
},
{
"epoch": 0.02493470311683789,
"grad_norm": 0.6302729249000549,
"learning_rate": 0.00014143277279982414,
"loss": 0.6518,
"step": 358
},
{
"epoch": 0.02500435312554414,
"grad_norm": 0.5898759365081787,
"learning_rate": 0.00014113346482323762,
"loss": 0.6565,
"step": 359
},
{
"epoch": 0.025074003134250393,
"grad_norm": 0.6143885254859924,
"learning_rate": 0.00014083371264633497,
"loss": 1.2938,
"step": 360
},
{
"epoch": 0.02514365314295664,
"grad_norm": 0.5840321183204651,
"learning_rate": 0.00014053351950614018,
"loss": 0.7797,
"step": 361
},
{
"epoch": 0.025213303151662893,
"grad_norm": 0.6148191690444946,
"learning_rate": 0.00014023288864443916,
"loss": 0.7165,
"step": 362
},
{
"epoch": 0.025282953160369145,
"grad_norm": 0.6650532484054565,
"learning_rate": 0.0001399318233077448,
"loss": 1.0991,
"step": 363
},
{
"epoch": 0.025352603169075397,
"grad_norm": 0.5263816714286804,
"learning_rate": 0.00013963032674726197,
"loss": 0.5039,
"step": 364
},
{
"epoch": 0.025422253177781646,
"grad_norm": 0.8048628568649292,
"learning_rate": 0.00013932840221885217,
"loss": 1.19,
"step": 365
},
{
"epoch": 0.025491903186487898,
"grad_norm": 0.6668381094932556,
"learning_rate": 0.0001390260529829986,
"loss": 0.9708,
"step": 366
},
{
"epoch": 0.02556155319519415,
"grad_norm": 0.6639387607574463,
"learning_rate": 0.00013872328230477086,
"loss": 0.9414,
"step": 367
},
{
"epoch": 0.025631203203900402,
"grad_norm": 0.696017324924469,
"learning_rate": 0.00013842009345378976,
"loss": 0.9,
"step": 368
},
{
"epoch": 0.02570085321260665,
"grad_norm": 0.584456205368042,
"learning_rate": 0.00013811648970419194,
"loss": 1.0158,
"step": 369
},
{
"epoch": 0.025770503221312902,
"grad_norm": 0.7759786248207092,
"learning_rate": 0.00013781247433459449,
"loss": 0.9564,
"step": 370
},
{
"epoch": 0.025840153230019154,
"grad_norm": 0.7399227619171143,
"learning_rate": 0.00013750805062805955,
"loss": 0.887,
"step": 371
},
{
"epoch": 0.025909803238725406,
"grad_norm": 0.6674394607543945,
"learning_rate": 0.00013720322187205897,
"loss": 1.1418,
"step": 372
},
{
"epoch": 0.025979453247431655,
"grad_norm": 0.591126561164856,
"learning_rate": 0.00013689799135843875,
"loss": 1.1361,
"step": 373
},
{
"epoch": 0.026049103256137907,
"grad_norm": 0.6162034273147583,
"learning_rate": 0.0001365923623833834,
"loss": 0.9725,
"step": 374
},
{
"epoch": 0.02611875326484416,
"grad_norm": 0.6250083446502686,
"learning_rate": 0.0001362863382473804,
"loss": 0.8571,
"step": 375
},
{
"epoch": 0.02618840327355041,
"grad_norm": 0.5744304060935974,
"learning_rate": 0.00013597992225518465,
"loss": 1.2338,
"step": 376
},
{
"epoch": 0.02625805328225666,
"grad_norm": 0.6333332061767578,
"learning_rate": 0.0001356731177157827,
"loss": 1.0476,
"step": 377
},
{
"epoch": 0.02632770329096291,
"grad_norm": 0.7278969883918762,
"learning_rate": 0.00013536592794235696,
"loss": 0.9087,
"step": 378
},
{
"epoch": 0.026397353299669163,
"grad_norm": 0.6979010701179504,
"learning_rate": 0.00013505835625225,
"loss": 0.952,
"step": 379
},
{
"epoch": 0.026467003308375415,
"grad_norm": 0.6789504289627075,
"learning_rate": 0.00013475040596692877,
"loss": 1.0368,
"step": 380
},
{
"epoch": 0.026536653317081663,
"grad_norm": 0.7653933763504028,
"learning_rate": 0.00013444208041194855,
"loss": 0.8965,
"step": 381
},
{
"epoch": 0.026606303325787915,
"grad_norm": 0.5833761096000671,
"learning_rate": 0.00013413338291691726,
"loss": 0.8849,
"step": 382
},
{
"epoch": 0.026675953334494167,
"grad_norm": 0.742056131362915,
"learning_rate": 0.00013382431681545942,
"loss": 1.0168,
"step": 383
},
{
"epoch": 0.02674560334320042,
"grad_norm": 0.6038824915885925,
"learning_rate": 0.00013351488544518004,
"loss": 0.7484,
"step": 384
},
{
"epoch": 0.026815253351906668,
"grad_norm": 0.7503067851066589,
"learning_rate": 0.00013320509214762868,
"loss": 0.7915,
"step": 385
},
{
"epoch": 0.02688490336061292,
"grad_norm": 0.6701642274856567,
"learning_rate": 0.00013289494026826336,
"loss": 0.791,
"step": 386
},
{
"epoch": 0.02695455336931917,
"grad_norm": 0.6913783550262451,
"learning_rate": 0.0001325844331564146,
"loss": 0.7336,
"step": 387
},
{
"epoch": 0.027024203378025424,
"grad_norm": 0.5814367532730103,
"learning_rate": 0.00013227357416524876,
"loss": 0.9077,
"step": 388
},
{
"epoch": 0.027093853386731672,
"grad_norm": 0.6972191333770752,
"learning_rate": 0.0001319623666517324,
"loss": 0.9515,
"step": 389
},
{
"epoch": 0.027163503395437924,
"grad_norm": 0.6530499458312988,
"learning_rate": 0.00013165081397659563,
"loss": 0.6957,
"step": 390
},
{
"epoch": 0.027233153404144176,
"grad_norm": 0.5678091645240784,
"learning_rate": 0.00013133891950429605,
"loss": 0.8997,
"step": 391
},
{
"epoch": 0.027302803412850428,
"grad_norm": 0.6870533227920532,
"learning_rate": 0.00013102668660298228,
"loss": 1.0608,
"step": 392
},
{
"epoch": 0.027372453421556676,
"grad_norm": 0.8118611574172974,
"learning_rate": 0.00013071411864445763,
"loss": 0.7108,
"step": 393
},
{
"epoch": 0.02744210343026293,
"grad_norm": 0.6881155967712402,
"learning_rate": 0.0001304012190041437,
"loss": 1.0917,
"step": 394
},
{
"epoch": 0.02751175343896918,
"grad_norm": 0.647470235824585,
"learning_rate": 0.00013008799106104397,
"loss": 0.7477,
"step": 395
},
{
"epoch": 0.027581403447675432,
"grad_norm": 0.653819739818573,
"learning_rate": 0.00012977443819770716,
"loss": 0.8722,
"step": 396
},
{
"epoch": 0.02765105345638168,
"grad_norm": 0.6762019395828247,
"learning_rate": 0.00012946056380019094,
"loss": 1.0542,
"step": 397
},
{
"epoch": 0.027720703465087933,
"grad_norm": 0.5804311037063599,
"learning_rate": 0.00012914637125802512,
"loss": 1.2926,
"step": 398
},
{
"epoch": 0.027790353473794185,
"grad_norm": 0.6955252885818481,
"learning_rate": 0.0001288318639641752,
"loss": 1.0947,
"step": 399
},
{
"epoch": 0.027860003482500437,
"grad_norm": 0.7045977711677551,
"learning_rate": 0.00012851704531500563,
"loss": 1.1416,
"step": 400
},
{
"epoch": 0.027860003482500437,
"eval_loss": 0.8443693518638611,
"eval_runtime": 700.1995,
"eval_samples_per_second": 7.141,
"eval_steps_per_second": 1.785,
"step": 400
},
{
"epoch": 0.027929653491206685,
"grad_norm": 0.6152036786079407,
"learning_rate": 0.00012820191871024328,
"loss": 0.8517,
"step": 401
},
{
"epoch": 0.027999303499912937,
"grad_norm": 0.6213567852973938,
"learning_rate": 0.00012788648755294055,
"loss": 0.861,
"step": 402
},
{
"epoch": 0.02806895350861919,
"grad_norm": 0.6279333233833313,
"learning_rate": 0.00012757075524943873,
"loss": 1.1324,
"step": 403
},
{
"epoch": 0.02813860351732544,
"grad_norm": 0.5852387547492981,
"learning_rate": 0.0001272547252093312,
"loss": 0.9501,
"step": 404
},
{
"epoch": 0.02820825352603169,
"grad_norm": 0.6280404329299927,
"learning_rate": 0.00012693840084542662,
"loss": 1.1233,
"step": 405
},
{
"epoch": 0.02827790353473794,
"grad_norm": 0.6563053131103516,
"learning_rate": 0.00012662178557371198,
"loss": 1.1278,
"step": 406
},
{
"epoch": 0.028347553543444193,
"grad_norm": 0.6248413920402527,
"learning_rate": 0.00012630488281331585,
"loss": 0.9008,
"step": 407
},
{
"epoch": 0.028417203552150445,
"grad_norm": 0.5682319402694702,
"learning_rate": 0.00012598769598647135,
"loss": 0.9898,
"step": 408
},
{
"epoch": 0.028486853560856694,
"grad_norm": 0.6207916736602783,
"learning_rate": 0.00012567022851847927,
"loss": 1.0291,
"step": 409
},
{
"epoch": 0.028556503569562946,
"grad_norm": 0.7249537706375122,
"learning_rate": 0.000125352483837671,
"loss": 0.9478,
"step": 410
},
{
"epoch": 0.028626153578269198,
"grad_norm": 0.8715054988861084,
"learning_rate": 0.00012503446537537162,
"loss": 1.0623,
"step": 411
},
{
"epoch": 0.02869580358697545,
"grad_norm": 0.6953936815261841,
"learning_rate": 0.0001247161765658627,
"loss": 1.089,
"step": 412
},
{
"epoch": 0.0287654535956817,
"grad_norm": 0.5827656388282776,
"learning_rate": 0.0001243976208463453,
"loss": 0.8708,
"step": 413
},
{
"epoch": 0.02883510360438795,
"grad_norm": 0.7496638298034668,
"learning_rate": 0.00012407880165690287,
"loss": 0.8053,
"step": 414
},
{
"epoch": 0.028904753613094202,
"grad_norm": 0.7032145261764526,
"learning_rate": 0.00012375972244046415,
"loss": 1.0352,
"step": 415
},
{
"epoch": 0.028974403621800454,
"grad_norm": 0.7112724184989929,
"learning_rate": 0.00012344038664276568,
"loss": 0.7082,
"step": 416
},
{
"epoch": 0.029044053630506703,
"grad_norm": 0.6337069869041443,
"learning_rate": 0.0001231207977123151,
"loss": 0.7147,
"step": 417
},
{
"epoch": 0.029113703639212955,
"grad_norm": 0.639981210231781,
"learning_rate": 0.00012280095910035342,
"loss": 0.4832,
"step": 418
},
{
"epoch": 0.029183353647919207,
"grad_norm": 0.6611121892929077,
"learning_rate": 0.00012248087426081812,
"loss": 0.9912,
"step": 419
},
{
"epoch": 0.02925300365662546,
"grad_norm": 0.5735837817192078,
"learning_rate": 0.00012216054665030552,
"loss": 1.2525,
"step": 420
},
{
"epoch": 0.029322653665331707,
"grad_norm": 0.7706820964813232,
"learning_rate": 0.00012183997972803374,
"loss": 0.8705,
"step": 421
},
{
"epoch": 0.02939230367403796,
"grad_norm": 0.5474764108657837,
"learning_rate": 0.00012151917695580523,
"loss": 0.7432,
"step": 422
},
{
"epoch": 0.02946195368274421,
"grad_norm": 0.5462170243263245,
"learning_rate": 0.00012119814179796935,
"loss": 1.0711,
"step": 423
},
{
"epoch": 0.029531603691450463,
"grad_norm": 0.673670768737793,
"learning_rate": 0.000120876877721385,
"loss": 1.3386,
"step": 424
},
{
"epoch": 0.02960125370015671,
"grad_norm": 0.7265173196792603,
"learning_rate": 0.00012055538819538319,
"loss": 1.1199,
"step": 425
},
{
"epoch": 0.029670903708862963,
"grad_norm": 0.5875483751296997,
"learning_rate": 0.00012023367669172946,
"loss": 1.0887,
"step": 426
},
{
"epoch": 0.029740553717569215,
"grad_norm": 0.6158230304718018,
"learning_rate": 0.00011991174668458666,
"loss": 0.9483,
"step": 427
},
{
"epoch": 0.029810203726275467,
"grad_norm": 0.6764160990715027,
"learning_rate": 0.00011958960165047717,
"loss": 0.9178,
"step": 428
},
{
"epoch": 0.029879853734981716,
"grad_norm": 0.6038265824317932,
"learning_rate": 0.00011926724506824538,
"loss": 0.9309,
"step": 429
},
{
"epoch": 0.029949503743687968,
"grad_norm": 0.5902111530303955,
"learning_rate": 0.0001189446804190203,
"loss": 0.8358,
"step": 430
},
{
"epoch": 0.03001915375239422,
"grad_norm": 0.6535676121711731,
"learning_rate": 0.00011862191118617775,
"loss": 0.6587,
"step": 431
},
{
"epoch": 0.03008880376110047,
"grad_norm": 0.6216766834259033,
"learning_rate": 0.00011829894085530298,
"loss": 0.7479,
"step": 432
},
{
"epoch": 0.03015845376980672,
"grad_norm": 0.6829842925071716,
"learning_rate": 0.0001179757729141528,
"loss": 0.6207,
"step": 433
},
{
"epoch": 0.030228103778512972,
"grad_norm": 0.7262370586395264,
"learning_rate": 0.00011765241085261802,
"loss": 1.0663,
"step": 434
},
{
"epoch": 0.030297753787219224,
"grad_norm": 0.6845910549163818,
"learning_rate": 0.00011732885816268582,
"loss": 0.7484,
"step": 435
},
{
"epoch": 0.030367403795925476,
"grad_norm": 0.7333625555038452,
"learning_rate": 0.00011700511833840186,
"loss": 0.8087,
"step": 436
},
{
"epoch": 0.030437053804631724,
"grad_norm": 0.6632218360900879,
"learning_rate": 0.00011668119487583277,
"loss": 1.2482,
"step": 437
},
{
"epoch": 0.030506703813337976,
"grad_norm": 0.5340752601623535,
"learning_rate": 0.00011635709127302829,
"loss": 0.866,
"step": 438
},
{
"epoch": 0.03057635382204423,
"grad_norm": 0.7423261404037476,
"learning_rate": 0.0001160328110299834,
"loss": 0.94,
"step": 439
},
{
"epoch": 0.03064600383075048,
"grad_norm": 0.5144674777984619,
"learning_rate": 0.0001157083576486007,
"loss": 0.9346,
"step": 440
},
{
"epoch": 0.03071565383945673,
"grad_norm": 0.5007227063179016,
"learning_rate": 0.00011538373463265248,
"loss": 1.0962,
"step": 441
},
{
"epoch": 0.03078530384816298,
"grad_norm": 0.5233269929885864,
"learning_rate": 0.00011505894548774294,
"loss": 0.6513,
"step": 442
},
{
"epoch": 0.030854953856869233,
"grad_norm": 0.6934007406234741,
"learning_rate": 0.0001147339937212703,
"loss": 0.7084,
"step": 443
},
{
"epoch": 0.030924603865575485,
"grad_norm": 0.6242351531982422,
"learning_rate": 0.00011440888284238888,
"loss": 0.6915,
"step": 444
},
{
"epoch": 0.030994253874281733,
"grad_norm": 0.5059527158737183,
"learning_rate": 0.00011408361636197133,
"loss": 1.2365,
"step": 445
},
{
"epoch": 0.031063903882987985,
"grad_norm": 0.5710117220878601,
"learning_rate": 0.00011375819779257057,
"loss": 0.7813,
"step": 446
},
{
"epoch": 0.031133553891694237,
"grad_norm": 0.5397061705589294,
"learning_rate": 0.000113432630648382,
"loss": 0.5191,
"step": 447
},
{
"epoch": 0.03120320390040049,
"grad_norm": 0.6234595775604248,
"learning_rate": 0.00011310691844520543,
"loss": 0.7069,
"step": 448
},
{
"epoch": 0.03127285390910674,
"grad_norm": 0.5587515830993652,
"learning_rate": 0.00011278106470040717,
"loss": 0.8174,
"step": 449
},
{
"epoch": 0.03134250391781299,
"grad_norm": 0.4725956618785858,
"learning_rate": 0.00011245507293288204,
"loss": 1.1901,
"step": 450
},
{
"epoch": 0.03141215392651924,
"grad_norm": 0.7420422434806824,
"learning_rate": 0.00011212894666301536,
"loss": 1.136,
"step": 451
},
{
"epoch": 0.03148180393522549,
"grad_norm": 0.6457960605621338,
"learning_rate": 0.000111802689412645,
"loss": 0.6502,
"step": 452
},
{
"epoch": 0.031551453943931745,
"grad_norm": 0.672398567199707,
"learning_rate": 0.00011147630470502319,
"loss": 0.9223,
"step": 453
},
{
"epoch": 0.031621103952637994,
"grad_norm": 0.7210835218429565,
"learning_rate": 0.00011114979606477866,
"loss": 1.151,
"step": 454
},
{
"epoch": 0.03169075396134424,
"grad_norm": 0.7231703996658325,
"learning_rate": 0.00011082316701787843,
"loss": 0.8565,
"step": 455
},
{
"epoch": 0.0317604039700505,
"grad_norm": 0.6620053648948669,
"learning_rate": 0.00011049642109158981,
"loss": 1.088,
"step": 456
},
{
"epoch": 0.031830053978756746,
"grad_norm": 0.8204821348190308,
"learning_rate": 0.00011016956181444231,
"loss": 1.1381,
"step": 457
},
{
"epoch": 0.031899703987463,
"grad_norm": 0.6240087747573853,
"learning_rate": 0.00010984259271618947,
"loss": 0.8316,
"step": 458
},
{
"epoch": 0.03196935399616925,
"grad_norm": 0.6648886203765869,
"learning_rate": 0.00010951551732777083,
"loss": 1.0288,
"step": 459
},
{
"epoch": 0.0320390040048755,
"grad_norm": 0.8034060001373291,
"learning_rate": 0.00010918833918127376,
"loss": 0.93,
"step": 460
},
{
"epoch": 0.032108654013581754,
"grad_norm": 0.5740483999252319,
"learning_rate": 0.00010886106180989526,
"loss": 0.7948,
"step": 461
},
{
"epoch": 0.032178304022288,
"grad_norm": 0.5168555378913879,
"learning_rate": 0.00010853368874790392,
"loss": 0.7923,
"step": 462
},
{
"epoch": 0.03224795403099425,
"grad_norm": 0.5505993962287903,
"learning_rate": 0.0001082062235306017,
"loss": 1.2188,
"step": 463
},
{
"epoch": 0.03231760403970051,
"grad_norm": 0.5564302206039429,
"learning_rate": 0.00010787866969428569,
"loss": 0.8798,
"step": 464
},
{
"epoch": 0.032387254048406755,
"grad_norm": 0.6746006011962891,
"learning_rate": 0.00010755103077620998,
"loss": 1.0295,
"step": 465
},
{
"epoch": 0.03245690405711301,
"grad_norm": 0.6252794861793518,
"learning_rate": 0.00010722331031454748,
"loss": 1.0924,
"step": 466
},
{
"epoch": 0.03252655406581926,
"grad_norm": 0.6156384944915771,
"learning_rate": 0.00010689551184835176,
"loss": 0.732,
"step": 467
},
{
"epoch": 0.03259620407452551,
"grad_norm": 0.7271072268486023,
"learning_rate": 0.00010656763891751865,
"loss": 0.5997,
"step": 468
},
{
"epoch": 0.03266585408323176,
"grad_norm": 0.6153301000595093,
"learning_rate": 0.00010623969506274813,
"loss": 0.9489,
"step": 469
},
{
"epoch": 0.03273550409193801,
"grad_norm": 0.7981113791465759,
"learning_rate": 0.00010591168382550616,
"loss": 0.8335,
"step": 470
},
{
"epoch": 0.03280515410064426,
"grad_norm": 0.781737744808197,
"learning_rate": 0.00010558360874798631,
"loss": 1.1474,
"step": 471
},
{
"epoch": 0.032874804109350515,
"grad_norm": 0.7384591102600098,
"learning_rate": 0.0001052554733730716,
"loss": 1.0917,
"step": 472
},
{
"epoch": 0.032944454118056764,
"grad_norm": 0.7052910923957825,
"learning_rate": 0.00010492728124429618,
"loss": 1.068,
"step": 473
},
{
"epoch": 0.03301410412676302,
"grad_norm": 0.6287469267845154,
"learning_rate": 0.00010459903590580706,
"loss": 0.6939,
"step": 474
},
{
"epoch": 0.03308375413546927,
"grad_norm": 0.5639947652816772,
"learning_rate": 0.00010427074090232592,
"loss": 0.737,
"step": 475
},
{
"epoch": 0.033153404144175516,
"grad_norm": 0.7723355293273926,
"learning_rate": 0.00010394239977911068,
"loss": 1.145,
"step": 476
},
{
"epoch": 0.03322305415288177,
"grad_norm": 0.7035319209098816,
"learning_rate": 0.00010361401608191741,
"loss": 0.584,
"step": 477
},
{
"epoch": 0.03329270416158802,
"grad_norm": 0.6127707362174988,
"learning_rate": 0.00010328559335696188,
"loss": 0.9795,
"step": 478
},
{
"epoch": 0.03336235417029427,
"grad_norm": 0.5730832815170288,
"learning_rate": 0.00010295713515088134,
"loss": 0.8133,
"step": 479
},
{
"epoch": 0.033432004179000524,
"grad_norm": 0.7129435539245605,
"learning_rate": 0.00010262864501069617,
"loss": 1.1408,
"step": 480
},
{
"epoch": 0.03350165418770677,
"grad_norm": 0.5180230736732483,
"learning_rate": 0.00010230012648377162,
"loss": 0.9543,
"step": 481
},
{
"epoch": 0.03357130419641303,
"grad_norm": 0.6325164437294006,
"learning_rate": 0.00010197158311777957,
"loss": 0.8672,
"step": 482
},
{
"epoch": 0.033640954205119276,
"grad_norm": 0.7068666815757751,
"learning_rate": 0.00010164301846066,
"loss": 0.9489,
"step": 483
},
{
"epoch": 0.033710604213825525,
"grad_norm": 0.6100176572799683,
"learning_rate": 0.0001013144360605829,
"loss": 0.9124,
"step": 484
},
{
"epoch": 0.03378025422253178,
"grad_norm": 0.6595302820205688,
"learning_rate": 0.00010098583946590985,
"loss": 0.6994,
"step": 485
},
{
"epoch": 0.03384990423123803,
"grad_norm": 0.6590490341186523,
"learning_rate": 0.00010065723222515566,
"loss": 0.6314,
"step": 486
},
{
"epoch": 0.03391955423994428,
"grad_norm": 0.619118869304657,
"learning_rate": 0.00010032861788695024,
"loss": 0.7488,
"step": 487
},
{
"epoch": 0.03398920424865053,
"grad_norm": 0.6756129264831543,
"learning_rate": 0.0001,
"loss": 0.6419,
"step": 488
},
{
"epoch": 0.03405885425735678,
"grad_norm": 0.7198984026908875,
"learning_rate": 9.967138211304978e-05,
"loss": 0.8794,
"step": 489
},
{
"epoch": 0.03412850426606304,
"grad_norm": 0.684007465839386,
"learning_rate": 9.934276777484436e-05,
"loss": 1.1634,
"step": 490
},
{
"epoch": 0.034198154274769285,
"grad_norm": 0.5058736801147461,
"learning_rate": 9.90141605340902e-05,
"loss": 0.7194,
"step": 491
},
{
"epoch": 0.034267804283475534,
"grad_norm": 0.6622017025947571,
"learning_rate": 9.868556393941713e-05,
"loss": 1.059,
"step": 492
},
{
"epoch": 0.03433745429218179,
"grad_norm": 0.6841214895248413,
"learning_rate": 9.835698153933999e-05,
"loss": 0.8254,
"step": 493
},
{
"epoch": 0.03440710430088804,
"grad_norm": 0.6854826807975769,
"learning_rate": 9.802841688222043e-05,
"loss": 0.8211,
"step": 494
},
{
"epoch": 0.034476754309594286,
"grad_norm": 0.6080586314201355,
"learning_rate": 9.769987351622836e-05,
"loss": 0.8337,
"step": 495
},
{
"epoch": 0.03454640431830054,
"grad_norm": 0.5680797696113586,
"learning_rate": 9.737135498930385e-05,
"loss": 0.9282,
"step": 496
},
{
"epoch": 0.03461605432700679,
"grad_norm": 0.5402217507362366,
"learning_rate": 9.704286484911868e-05,
"loss": 0.7917,
"step": 497
},
{
"epoch": 0.034685704335713045,
"grad_norm": 0.5929046273231506,
"learning_rate": 9.671440664303814e-05,
"loss": 0.9316,
"step": 498
},
{
"epoch": 0.034755354344419294,
"grad_norm": 0.5998024940490723,
"learning_rate": 9.638598391808261e-05,
"loss": 1.173,
"step": 499
},
{
"epoch": 0.03482500435312554,
"grad_norm": 0.6345599889755249,
"learning_rate": 9.605760022088934e-05,
"loss": 0.7952,
"step": 500
},
{
"epoch": 0.03482500435312554,
"eval_loss": 0.7969969511032104,
"eval_runtime": 700.3094,
"eval_samples_per_second": 7.14,
"eval_steps_per_second": 1.785,
"step": 500
},
{
"epoch": 0.0348946543618318,
"grad_norm": 0.5795607566833496,
"learning_rate": 9.572925909767412e-05,
"loss": 0.4495,
"step": 501
},
{
"epoch": 0.034964304370538046,
"grad_norm": 0.6874101161956787,
"learning_rate": 9.540096409419296e-05,
"loss": 0.8444,
"step": 502
},
{
"epoch": 0.035033954379244295,
"grad_norm": 0.5595911145210266,
"learning_rate": 9.507271875570381e-05,
"loss": 0.9391,
"step": 503
},
{
"epoch": 0.03510360438795055,
"grad_norm": 0.525644063949585,
"learning_rate": 9.474452662692838e-05,
"loss": 0.7833,
"step": 504
},
{
"epoch": 0.0351732543966568,
"grad_norm": 0.6366891264915466,
"learning_rate": 9.441639125201368e-05,
"loss": 1.0472,
"step": 505
},
{
"epoch": 0.035242904405363054,
"grad_norm": 0.8487269878387451,
"learning_rate": 9.408831617449385e-05,
"loss": 1.0513,
"step": 506
},
{
"epoch": 0.0353125544140693,
"grad_norm": 0.7027648091316223,
"learning_rate": 9.376030493725189e-05,
"loss": 0.9505,
"step": 507
},
{
"epoch": 0.03538220442277555,
"grad_norm": 0.6772575974464417,
"learning_rate": 9.343236108248139e-05,
"loss": 1.0417,
"step": 508
},
{
"epoch": 0.03545185443148181,
"grad_norm": 0.5657368898391724,
"learning_rate": 9.310448815164826e-05,
"loss": 0.9236,
"step": 509
},
{
"epoch": 0.035521504440188055,
"grad_norm": 0.64215087890625,
"learning_rate": 9.277668968545253e-05,
"loss": 1.0035,
"step": 510
},
{
"epoch": 0.035591154448894304,
"grad_norm": 0.6276829242706299,
"learning_rate": 9.244896922379007e-05,
"loss": 0.8375,
"step": 511
},
{
"epoch": 0.03566080445760056,
"grad_norm": 0.5804170966148376,
"learning_rate": 9.212133030571437e-05,
"loss": 0.4934,
"step": 512
},
{
"epoch": 0.03573045446630681,
"grad_norm": 0.7230868935585022,
"learning_rate": 9.17937764693983e-05,
"loss": 0.9427,
"step": 513
},
{
"epoch": 0.035800104475013056,
"grad_norm": 0.6632394194602966,
"learning_rate": 9.146631125209607e-05,
"loss": 0.4176,
"step": 514
},
{
"epoch": 0.03586975448371931,
"grad_norm": 0.5885234475135803,
"learning_rate": 9.113893819010475e-05,
"loss": 0.6042,
"step": 515
},
{
"epoch": 0.03593940449242556,
"grad_norm": 0.5666863322257996,
"learning_rate": 9.081166081872626e-05,
"loss": 1.5152,
"step": 516
},
{
"epoch": 0.036009054501131815,
"grad_norm": 0.7007538676261902,
"learning_rate": 9.048448267222918e-05,
"loss": 0.9444,
"step": 517
},
{
"epoch": 0.036078704509838064,
"grad_norm": 0.6212923526763916,
"learning_rate": 9.015740728381054e-05,
"loss": 0.634,
"step": 518
},
{
"epoch": 0.03614835451854431,
"grad_norm": 0.6189596056938171,
"learning_rate": 8.98304381855577e-05,
"loss": 1.1091,
"step": 519
},
{
"epoch": 0.03621800452725057,
"grad_norm": 0.6159670948982239,
"learning_rate": 8.95035789084102e-05,
"loss": 0.787,
"step": 520
},
{
"epoch": 0.036287654535956816,
"grad_norm": 0.6371515989303589,
"learning_rate": 8.917683298212158e-05,
"loss": 0.6172,
"step": 521
},
{
"epoch": 0.036357304544663065,
"grad_norm": 0.6314066052436829,
"learning_rate": 8.885020393522135e-05,
"loss": 0.9702,
"step": 522
},
{
"epoch": 0.03642695455336932,
"grad_norm": 0.6285626888275146,
"learning_rate": 8.852369529497679e-05,
"loss": 0.9819,
"step": 523
},
{
"epoch": 0.03649660456207557,
"grad_norm": 0.5257949233055115,
"learning_rate": 8.819731058735501e-05,
"loss": 0.8288,
"step": 524
},
{
"epoch": 0.036566254570781824,
"grad_norm": 0.611438512802124,
"learning_rate": 8.787105333698465e-05,
"loss": 0.9246,
"step": 525
},
{
"epoch": 0.03663590457948807,
"grad_norm": 0.5995710492134094,
"learning_rate": 8.754492706711798e-05,
"loss": 0.6855,
"step": 526
},
{
"epoch": 0.03670555458819432,
"grad_norm": 0.681425154209137,
"learning_rate": 8.721893529959287e-05,
"loss": 1.1644,
"step": 527
},
{
"epoch": 0.036775204596900576,
"grad_norm": 0.7111718654632568,
"learning_rate": 8.68930815547946e-05,
"loss": 0.9181,
"step": 528
},
{
"epoch": 0.036844854605606825,
"grad_norm": 0.5794047713279724,
"learning_rate": 8.656736935161802e-05,
"loss": 1.061,
"step": 529
},
{
"epoch": 0.03691450461431307,
"grad_norm": 0.5971503257751465,
"learning_rate": 8.624180220742946e-05,
"loss": 0.5903,
"step": 530
},
{
"epoch": 0.03698415462301933,
"grad_norm": 0.7091482281684875,
"learning_rate": 8.59163836380287e-05,
"loss": 0.8907,
"step": 531
},
{
"epoch": 0.03705380463172558,
"grad_norm": 0.6185580492019653,
"learning_rate": 8.559111715761114e-05,
"loss": 0.8452,
"step": 532
},
{
"epoch": 0.03712345464043183,
"grad_norm": 0.68827223777771,
"learning_rate": 8.52660062787297e-05,
"loss": 0.8711,
"step": 533
},
{
"epoch": 0.03719310464913808,
"grad_norm": 0.6279632449150085,
"learning_rate": 8.494105451225704e-05,
"loss": 0.6453,
"step": 534
},
{
"epoch": 0.03726275465784433,
"grad_norm": 0.7252237200737,
"learning_rate": 8.461626536734753e-05,
"loss": 1.1148,
"step": 535
},
{
"epoch": 0.037332404666550585,
"grad_norm": 0.6377342939376831,
"learning_rate": 8.429164235139931e-05,
"loss": 1.0532,
"step": 536
},
{
"epoch": 0.037402054675256834,
"grad_norm": 0.7409278154373169,
"learning_rate": 8.396718897001663e-05,
"loss": 1.0161,
"step": 537
},
{
"epoch": 0.03747170468396308,
"grad_norm": 0.6048555970191956,
"learning_rate": 8.364290872697173e-05,
"loss": 1.012,
"step": 538
},
{
"epoch": 0.03754135469266934,
"grad_norm": 0.7676815390586853,
"learning_rate": 8.331880512416724e-05,
"loss": 0.9402,
"step": 539
},
{
"epoch": 0.037611004701375586,
"grad_norm": 0.6360906958580017,
"learning_rate": 8.299488166159817e-05,
"loss": 0.4591,
"step": 540
},
{
"epoch": 0.03768065471008184,
"grad_norm": 0.6816183924674988,
"learning_rate": 8.267114183731421e-05,
"loss": 0.661,
"step": 541
},
{
"epoch": 0.03775030471878809,
"grad_norm": 0.6955873966217041,
"learning_rate": 8.234758914738199e-05,
"loss": 0.8015,
"step": 542
},
{
"epoch": 0.03781995472749434,
"grad_norm": 0.787493884563446,
"learning_rate": 8.20242270858472e-05,
"loss": 0.6941,
"step": 543
},
{
"epoch": 0.037889604736200594,
"grad_norm": 0.5939062833786011,
"learning_rate": 8.170105914469702e-05,
"loss": 0.9034,
"step": 544
},
{
"epoch": 0.03795925474490684,
"grad_norm": 0.5235042572021484,
"learning_rate": 8.137808881382226e-05,
"loss": 1.0283,
"step": 545
},
{
"epoch": 0.03802890475361309,
"grad_norm": 0.7017082571983337,
"learning_rate": 8.105531958097972e-05,
"loss": 1.0407,
"step": 546
},
{
"epoch": 0.038098554762319346,
"grad_norm": 0.7762130498886108,
"learning_rate": 8.073275493175464e-05,
"loss": 0.7814,
"step": 547
},
{
"epoch": 0.038168204771025595,
"grad_norm": 0.588405191898346,
"learning_rate": 8.041039834952287e-05,
"loss": 0.8832,
"step": 548
},
{
"epoch": 0.03823785477973185,
"grad_norm": 0.7792285084724426,
"learning_rate": 8.008825331541335e-05,
"loss": 1.051,
"step": 549
},
{
"epoch": 0.0383075047884381,
"grad_norm": 0.6209467649459839,
"learning_rate": 7.976632330827056e-05,
"loss": 0.8802,
"step": 550
},
{
"epoch": 0.03837715479714435,
"grad_norm": 0.5231680274009705,
"learning_rate": 7.944461180461686e-05,
"loss": 0.7529,
"step": 551
},
{
"epoch": 0.0384468048058506,
"grad_norm": 0.6021607518196106,
"learning_rate": 7.912312227861503e-05,
"loss": 1.1235,
"step": 552
},
{
"epoch": 0.03851645481455685,
"grad_norm": 0.5573668479919434,
"learning_rate": 7.880185820203065e-05,
"loss": 0.6753,
"step": 553
},
{
"epoch": 0.0385861048232631,
"grad_norm": 0.5354910492897034,
"learning_rate": 7.848082304419478e-05,
"loss": 0.6843,
"step": 554
},
{
"epoch": 0.038655754831969355,
"grad_norm": 0.606436014175415,
"learning_rate": 7.816002027196627e-05,
"loss": 1.0557,
"step": 555
},
{
"epoch": 0.038725404840675604,
"grad_norm": 0.6580552458763123,
"learning_rate": 7.783945334969451e-05,
"loss": 0.6222,
"step": 556
},
{
"epoch": 0.03879505484938186,
"grad_norm": 0.6174128651618958,
"learning_rate": 7.751912573918193e-05,
"loss": 0.8194,
"step": 557
},
{
"epoch": 0.03886470485808811,
"grad_norm": 0.6724019646644592,
"learning_rate": 7.719904089964658e-05,
"loss": 1.0095,
"step": 558
},
{
"epoch": 0.038934354866794356,
"grad_norm": 0.7200993299484253,
"learning_rate": 7.687920228768493e-05,
"loss": 0.8115,
"step": 559
},
{
"epoch": 0.03900400487550061,
"grad_norm": 0.5682472586631775,
"learning_rate": 7.655961335723433e-05,
"loss": 0.7034,
"step": 560
},
{
"epoch": 0.03907365488420686,
"grad_norm": 0.7236086130142212,
"learning_rate": 7.624027755953592e-05,
"loss": 0.9028,
"step": 561
},
{
"epoch": 0.03914330489291311,
"grad_norm": 0.5866789221763611,
"learning_rate": 7.592119834309715e-05,
"loss": 0.8919,
"step": 562
},
{
"epoch": 0.039212954901619364,
"grad_norm": 0.6271937489509583,
"learning_rate": 7.560237915365472e-05,
"loss": 0.6447,
"step": 563
},
{
"epoch": 0.03928260491032561,
"grad_norm": 0.5319473147392273,
"learning_rate": 7.528382343413734e-05,
"loss": 1.0977,
"step": 564
},
{
"epoch": 0.03935225491903187,
"grad_norm": 0.673537015914917,
"learning_rate": 7.49655346246284e-05,
"loss": 0.6669,
"step": 565
},
{
"epoch": 0.039421904927738116,
"grad_norm": 0.7043957114219666,
"learning_rate": 7.464751616232902e-05,
"loss": 0.6334,
"step": 566
},
{
"epoch": 0.039491554936444365,
"grad_norm": 0.6532731652259827,
"learning_rate": 7.432977148152074e-05,
"loss": 0.659,
"step": 567
},
{
"epoch": 0.03956120494515062,
"grad_norm": 0.6882482767105103,
"learning_rate": 7.401230401352866e-05,
"loss": 0.711,
"step": 568
},
{
"epoch": 0.03963085495385687,
"grad_norm": 0.7171745896339417,
"learning_rate": 7.369511718668418e-05,
"loss": 0.941,
"step": 569
},
{
"epoch": 0.03970050496256312,
"grad_norm": 0.6474679708480835,
"learning_rate": 7.337821442628805e-05,
"loss": 0.8192,
"step": 570
},
{
"epoch": 0.03977015497126937,
"grad_norm": 0.7054280042648315,
"learning_rate": 7.306159915457342e-05,
"loss": 0.6327,
"step": 571
},
{
"epoch": 0.03983980497997562,
"grad_norm": 0.7624709606170654,
"learning_rate": 7.274527479066883e-05,
"loss": 0.8132,
"step": 572
},
{
"epoch": 0.039909454988681876,
"grad_norm": 0.6930527687072754,
"learning_rate": 7.242924475056127e-05,
"loss": 0.8482,
"step": 573
},
{
"epoch": 0.039979104997388125,
"grad_norm": 0.6599513292312622,
"learning_rate": 7.211351244705946e-05,
"loss": 0.6787,
"step": 574
},
{
"epoch": 0.04004875500609437,
"grad_norm": 0.7311400771141052,
"learning_rate": 7.179808128975674e-05,
"loss": 0.9747,
"step": 575
},
{
"epoch": 0.04011840501480063,
"grad_norm": 0.615138828754425,
"learning_rate": 7.148295468499438e-05,
"loss": 0.9404,
"step": 576
},
{
"epoch": 0.04018805502350688,
"grad_norm": 0.6401761174201965,
"learning_rate": 7.116813603582482e-05,
"loss": 0.4915,
"step": 577
},
{
"epoch": 0.040257705032213126,
"grad_norm": 0.6191440224647522,
"learning_rate": 7.08536287419749e-05,
"loss": 0.6031,
"step": 578
},
{
"epoch": 0.04032735504091938,
"grad_norm": 0.5751050710678101,
"learning_rate": 7.053943619980907e-05,
"loss": 0.8371,
"step": 579
},
{
"epoch": 0.04039700504962563,
"grad_norm": 0.518409252166748,
"learning_rate": 7.022556180229285e-05,
"loss": 0.4333,
"step": 580
},
{
"epoch": 0.040466655058331885,
"grad_norm": 0.5712803602218628,
"learning_rate": 6.991200893895608e-05,
"loss": 0.796,
"step": 581
},
{
"epoch": 0.040536305067038134,
"grad_norm": 0.661482036113739,
"learning_rate": 6.959878099585635e-05,
"loss": 0.8585,
"step": 582
},
{
"epoch": 0.04060595507574438,
"grad_norm": 0.6602011322975159,
"learning_rate": 6.92858813555424e-05,
"loss": 0.9474,
"step": 583
},
{
"epoch": 0.04067560508445064,
"grad_norm": 0.5971815586090088,
"learning_rate": 6.897331339701776e-05,
"loss": 0.7689,
"step": 584
},
{
"epoch": 0.040745255093156886,
"grad_norm": 0.571740448474884,
"learning_rate": 6.866108049570397e-05,
"loss": 0.9023,
"step": 585
},
{
"epoch": 0.040814905101863135,
"grad_norm": 0.6928638219833374,
"learning_rate": 6.834918602340438e-05,
"loss": 0.8899,
"step": 586
},
{
"epoch": 0.04088455511056939,
"grad_norm": 0.6468199491500854,
"learning_rate": 6.803763334826763e-05,
"loss": 0.8841,
"step": 587
},
{
"epoch": 0.04095420511927564,
"grad_norm": 0.6777251362800598,
"learning_rate": 6.772642583475126e-05,
"loss": 0.8491,
"step": 588
},
{
"epoch": 0.041023855127981894,
"grad_norm": 0.5866687297821045,
"learning_rate": 6.741556684358545e-05,
"loss": 0.6435,
"step": 589
},
{
"epoch": 0.04109350513668814,
"grad_norm": 0.5522730350494385,
"learning_rate": 6.710505973173664e-05,
"loss": 0.9188,
"step": 590
},
{
"epoch": 0.04116315514539439,
"grad_norm": 0.7048250436782837,
"learning_rate": 6.679490785237137e-05,
"loss": 0.911,
"step": 591
},
{
"epoch": 0.041232805154100646,
"grad_norm": 0.849677324295044,
"learning_rate": 6.648511455482003e-05,
"loss": 1.0408,
"step": 592
},
{
"epoch": 0.041302455162806895,
"grad_norm": 0.653287947177887,
"learning_rate": 6.617568318454059e-05,
"loss": 1.187,
"step": 593
},
{
"epoch": 0.04137210517151314,
"grad_norm": 0.5278560519218445,
"learning_rate": 6.586661708308272e-05,
"loss": 0.8789,
"step": 594
},
{
"epoch": 0.0414417551802194,
"grad_norm": 0.7803817987442017,
"learning_rate": 6.555791958805147e-05,
"loss": 0.8788,
"step": 595
},
{
"epoch": 0.04151140518892565,
"grad_norm": 0.6425774097442627,
"learning_rate": 6.524959403307125e-05,
"loss": 0.9296,
"step": 596
},
{
"epoch": 0.0415810551976319,
"grad_norm": 0.5787883400917053,
"learning_rate": 6.494164374775e-05,
"loss": 1.0127,
"step": 597
},
{
"epoch": 0.04165070520633815,
"grad_norm": 0.5686517357826233,
"learning_rate": 6.463407205764305e-05,
"loss": 0.7869,
"step": 598
},
{
"epoch": 0.0417203552150444,
"grad_norm": 0.5126462578773499,
"learning_rate": 6.43268822842173e-05,
"loss": 1.2029,
"step": 599
},
{
"epoch": 0.041790005223750655,
"grad_norm": 0.5618976950645447,
"learning_rate": 6.402007774481536e-05,
"loss": 0.5725,
"step": 600
},
{
"epoch": 0.041790005223750655,
"eval_loss": 0.7635987401008606,
"eval_runtime": 701.6781,
"eval_samples_per_second": 7.126,
"eval_steps_per_second": 1.781,
"step": 600
},
{
"epoch": 0.041859655232456904,
"grad_norm": 0.6774680018424988,
"learning_rate": 6.371366175261964e-05,
"loss": 0.9805,
"step": 601
},
{
"epoch": 0.04192930524116315,
"grad_norm": 0.7227701544761658,
"learning_rate": 6.340763761661665e-05,
"loss": 0.933,
"step": 602
},
{
"epoch": 0.04199895524986941,
"grad_norm": 0.7895076870918274,
"learning_rate": 6.310200864156126e-05,
"loss": 0.9677,
"step": 603
},
{
"epoch": 0.042068605258575656,
"grad_norm": 0.6837015748023987,
"learning_rate": 6.279677812794103e-05,
"loss": 1.1069,
"step": 604
},
{
"epoch": 0.04213825526728191,
"grad_norm": 0.8501606583595276,
"learning_rate": 6.249194937194047e-05,
"loss": 0.961,
"step": 605
},
{
"epoch": 0.04220790527598816,
"grad_norm": 0.7296304106712341,
"learning_rate": 6.218752566540554e-05,
"loss": 0.9667,
"step": 606
},
{
"epoch": 0.04227755528469441,
"grad_norm": 0.5765381455421448,
"learning_rate": 6.188351029580805e-05,
"loss": 1.0982,
"step": 607
},
{
"epoch": 0.042347205293400664,
"grad_norm": 0.7557181119918823,
"learning_rate": 6.157990654621024e-05,
"loss": 0.9381,
"step": 608
},
{
"epoch": 0.04241685530210691,
"grad_norm": 0.6191427707672119,
"learning_rate": 6.127671769522916e-05,
"loss": 0.9322,
"step": 609
},
{
"epoch": 0.04248650531081316,
"grad_norm": 0.5968077778816223,
"learning_rate": 6.097394701700145e-05,
"loss": 0.9394,
"step": 610
},
{
"epoch": 0.042556155319519416,
"grad_norm": 0.5749527812004089,
"learning_rate": 6.067159778114788e-05,
"loss": 0.7593,
"step": 611
},
{
"epoch": 0.042625805328225665,
"grad_norm": 0.5655612945556641,
"learning_rate": 6.036967325273807e-05,
"loss": 1.0865,
"step": 612
},
{
"epoch": 0.04269545533693192,
"grad_norm": 0.7150444984436035,
"learning_rate": 6.0068176692255175e-05,
"loss": 0.612,
"step": 613
},
{
"epoch": 0.04276510534563817,
"grad_norm": 0.6594777703285217,
"learning_rate": 5.976711135556086e-05,
"loss": 0.6786,
"step": 614
},
{
"epoch": 0.04283475535434442,
"grad_norm": 0.6561244130134583,
"learning_rate": 5.946648049385985e-05,
"loss": 0.9041,
"step": 615
},
{
"epoch": 0.04290440536305067,
"grad_norm": 0.5820670127868652,
"learning_rate": 5.916628735366505e-05,
"loss": 0.6228,
"step": 616
},
{
"epoch": 0.04297405537175692,
"grad_norm": 0.7414914965629578,
"learning_rate": 5.886653517676239e-05,
"loss": 0.7384,
"step": 617
},
{
"epoch": 0.04304370538046317,
"grad_norm": 0.7077262997627258,
"learning_rate": 5.8567227200175865e-05,
"loss": 1.0201,
"step": 618
},
{
"epoch": 0.043113355389169425,
"grad_norm": 0.6975839734077454,
"learning_rate": 5.8268366656132476e-05,
"loss": 0.6453,
"step": 619
},
{
"epoch": 0.04318300539787567,
"grad_norm": 0.6871505379676819,
"learning_rate": 5.796995677202753e-05,
"loss": 1.0648,
"step": 620
},
{
"epoch": 0.04325265540658193,
"grad_norm": 0.6167171001434326,
"learning_rate": 5.76720007703895e-05,
"loss": 0.7303,
"step": 621
},
{
"epoch": 0.04332230541528818,
"grad_norm": 0.7851260900497437,
"learning_rate": 5.7374501868845544e-05,
"loss": 0.7858,
"step": 622
},
{
"epoch": 0.043391955423994426,
"grad_norm": 0.5275984406471252,
"learning_rate": 5.7077463280086415e-05,
"loss": 0.7998,
"step": 623
},
{
"epoch": 0.04346160543270068,
"grad_norm": 0.7553796768188477,
"learning_rate": 5.6780888211832116e-05,
"loss": 0.6115,
"step": 624
},
{
"epoch": 0.04353125544140693,
"grad_norm": 0.7186095118522644,
"learning_rate": 5.648477986679703e-05,
"loss": 0.9616,
"step": 625
},
{
"epoch": 0.04360090545011318,
"grad_norm": 0.7424410581588745,
"learning_rate": 5.6189141442655325e-05,
"loss": 0.8707,
"step": 626
},
{
"epoch": 0.043670555458819434,
"grad_norm": 0.6303914189338684,
"learning_rate": 5.589397613200662e-05,
"loss": 0.8386,
"step": 627
},
{
"epoch": 0.04374020546752568,
"grad_norm": 0.7636226415634155,
"learning_rate": 5.559928712234126e-05,
"loss": 0.8905,
"step": 628
},
{
"epoch": 0.04380985547623194,
"grad_norm": 0.6990499496459961,
"learning_rate": 5.530507759600614e-05,
"loss": 0.964,
"step": 629
},
{
"epoch": 0.043879505484938186,
"grad_norm": 0.6701223254203796,
"learning_rate": 5.501135073017008e-05,
"loss": 0.8774,
"step": 630
},
{
"epoch": 0.043949155493644435,
"grad_norm": 0.5796250104904175,
"learning_rate": 5.471810969678975e-05,
"loss": 0.6749,
"step": 631
},
{
"epoch": 0.04401880550235069,
"grad_norm": 0.6239587664604187,
"learning_rate": 5.442535766257525e-05,
"loss": 0.9801,
"step": 632
},
{
"epoch": 0.04408845551105694,
"grad_norm": 0.8477646112442017,
"learning_rate": 5.413309778895602e-05,
"loss": 0.6404,
"step": 633
},
{
"epoch": 0.04415810551976319,
"grad_norm": 0.7139285802841187,
"learning_rate": 5.3841333232046654e-05,
"loss": 1.1062,
"step": 634
},
{
"epoch": 0.04422775552846944,
"grad_norm": 0.5378491878509521,
"learning_rate": 5.355006714261285e-05,
"loss": 1.2571,
"step": 635
},
{
"epoch": 0.04429740553717569,
"grad_norm": 0.647861659526825,
"learning_rate": 5.325930266603724e-05,
"loss": 1.2096,
"step": 636
},
{
"epoch": 0.044367055545881946,
"grad_norm": 0.7343048453330994,
"learning_rate": 5.296904294228569e-05,
"loss": 0.9278,
"step": 637
},
{
"epoch": 0.044436705554588195,
"grad_norm": 0.5826293230056763,
"learning_rate": 5.267929110587307e-05,
"loss": 1.0683,
"step": 638
},
{
"epoch": 0.04450635556329444,
"grad_norm": 0.6172500848770142,
"learning_rate": 5.2390050285829786e-05,
"loss": 0.9441,
"step": 639
},
{
"epoch": 0.0445760055720007,
"grad_norm": 0.7326881885528564,
"learning_rate": 5.210132360566755e-05,
"loss": 0.7529,
"step": 640
},
{
"epoch": 0.04464565558070695,
"grad_norm": 0.7021967768669128,
"learning_rate": 5.181311418334608e-05,
"loss": 0.606,
"step": 641
},
{
"epoch": 0.044715305589413196,
"grad_norm": 0.6962524652481079,
"learning_rate": 5.1525425131239056e-05,
"loss": 0.8838,
"step": 642
},
{
"epoch": 0.04478495559811945,
"grad_norm": 0.535213828086853,
"learning_rate": 5.123825955610079e-05,
"loss": 0.8108,
"step": 643
},
{
"epoch": 0.0448546056068257,
"grad_norm": 0.5601661801338196,
"learning_rate": 5.0951620559032573e-05,
"loss": 0.5116,
"step": 644
},
{
"epoch": 0.044924255615531955,
"grad_norm": 0.6015167832374573,
"learning_rate": 5.066551123544907e-05,
"loss": 0.7486,
"step": 645
},
{
"epoch": 0.044993905624238204,
"grad_norm": 0.8018868565559387,
"learning_rate": 5.0379934675045145e-05,
"loss": 0.9923,
"step": 646
},
{
"epoch": 0.04506355563294445,
"grad_norm": 0.6844683289527893,
"learning_rate": 5.009489396176221e-05,
"loss": 0.9141,
"step": 647
},
{
"epoch": 0.04513320564165071,
"grad_norm": 0.5720611810684204,
"learning_rate": 4.9810392173755194e-05,
"loss": 0.7879,
"step": 648
},
{
"epoch": 0.045202855650356956,
"grad_norm": 0.5712713599205017,
"learning_rate": 4.9526432383359036e-05,
"loss": 0.9627,
"step": 649
},
{
"epoch": 0.045272505659063204,
"grad_norm": 0.5877520442008972,
"learning_rate": 4.92430176570558e-05,
"loss": 0.6014,
"step": 650
},
{
"epoch": 0.04534215566776946,
"grad_norm": 0.639779806137085,
"learning_rate": 4.896015105544124e-05,
"loss": 0.6532,
"step": 651
},
{
"epoch": 0.04541180567647571,
"grad_norm": 0.5214322209358215,
"learning_rate": 4.867783563319206e-05,
"loss": 0.6277,
"step": 652
},
{
"epoch": 0.045481455685181964,
"grad_norm": 0.6788254380226135,
"learning_rate": 4.8396074439032604e-05,
"loss": 0.5997,
"step": 653
},
{
"epoch": 0.04555110569388821,
"grad_norm": 0.7286319732666016,
"learning_rate": 4.811487051570235e-05,
"loss": 0.9064,
"step": 654
},
{
"epoch": 0.04562075570259446,
"grad_norm": 0.6942530870437622,
"learning_rate": 4.783422689992256e-05,
"loss": 1.2174,
"step": 655
},
{
"epoch": 0.045690405711300716,
"grad_norm": 0.6202605366706848,
"learning_rate": 4.7554146622363914e-05,
"loss": 0.9942,
"step": 656
},
{
"epoch": 0.045760055720006965,
"grad_norm": 0.6402217745780945,
"learning_rate": 4.727463270761346e-05,
"loss": 0.9941,
"step": 657
},
{
"epoch": 0.04582970572871321,
"grad_norm": 0.5262777209281921,
"learning_rate": 4.699568817414224e-05,
"loss": 0.8669,
"step": 658
},
{
"epoch": 0.04589935573741947,
"grad_norm": 0.6133191585540771,
"learning_rate": 4.6717316034272394e-05,
"loss": 0.9069,
"step": 659
},
{
"epoch": 0.04596900574612572,
"grad_norm": 0.7493846416473389,
"learning_rate": 4.643951929414493e-05,
"loss": 0.6228,
"step": 660
},
{
"epoch": 0.04603865575483197,
"grad_norm": 0.642196774482727,
"learning_rate": 4.616230095368697e-05,
"loss": 1.012,
"step": 661
},
{
"epoch": 0.04610830576353822,
"grad_norm": 0.726894736289978,
"learning_rate": 4.5885664006579645e-05,
"loss": 1.0356,
"step": 662
},
{
"epoch": 0.04617795577224447,
"grad_norm": 0.7074050307273865,
"learning_rate": 4.5609611440225474e-05,
"loss": 1.0333,
"step": 663
},
{
"epoch": 0.046247605780950725,
"grad_norm": 0.7056405544281006,
"learning_rate": 4.533414623571637e-05,
"loss": 0.5944,
"step": 664
},
{
"epoch": 0.04631725578965697,
"grad_norm": 0.7887142896652222,
"learning_rate": 4.505927136780128e-05,
"loss": 0.8546,
"step": 665
},
{
"epoch": 0.04638690579836322,
"grad_norm": 0.5718196034431458,
"learning_rate": 4.478498980485405e-05,
"loss": 0.7971,
"step": 666
},
{
"epoch": 0.04645655580706948,
"grad_norm": 0.4922311007976532,
"learning_rate": 4.4511304508841544e-05,
"loss": 0.4773,
"step": 667
},
{
"epoch": 0.046526205815775726,
"grad_norm": 0.5427528619766235,
"learning_rate": 4.423821843529139e-05,
"loss": 0.5889,
"step": 668
},
{
"epoch": 0.04659585582448198,
"grad_norm": 0.5341909527778625,
"learning_rate": 4.396573453326037e-05,
"loss": 0.7427,
"step": 669
},
{
"epoch": 0.04666550583318823,
"grad_norm": 0.7404798269271851,
"learning_rate": 4.369385574530227e-05,
"loss": 1.1909,
"step": 670
},
{
"epoch": 0.04673515584189448,
"grad_norm": 0.6806610226631165,
"learning_rate": 4.342258500743638e-05,
"loss": 0.9576,
"step": 671
},
{
"epoch": 0.046804805850600734,
"grad_norm": 0.6135253310203552,
"learning_rate": 4.315192524911551e-05,
"loss": 0.7204,
"step": 672
},
{
"epoch": 0.04687445585930698,
"grad_norm": 0.8514856100082397,
"learning_rate": 4.288187939319465e-05,
"loss": 0.9307,
"step": 673
},
{
"epoch": 0.04694410586801323,
"grad_norm": 0.6521239280700684,
"learning_rate": 4.261245035589917e-05,
"loss": 0.6885,
"step": 674
},
{
"epoch": 0.047013755876719486,
"grad_norm": 0.6027514338493347,
"learning_rate": 4.234364104679347e-05,
"loss": 0.9786,
"step": 675
},
{
"epoch": 0.047083405885425735,
"grad_norm": 0.6285941004753113,
"learning_rate": 4.207545436874941e-05,
"loss": 0.6983,
"step": 676
},
{
"epoch": 0.04715305589413199,
"grad_norm": 0.6285765767097473,
"learning_rate": 4.1807893217915195e-05,
"loss": 0.8987,
"step": 677
},
{
"epoch": 0.04722270590283824,
"grad_norm": 0.7090179324150085,
"learning_rate": 4.15409604836838e-05,
"loss": 1.0551,
"step": 678
},
{
"epoch": 0.04729235591154449,
"grad_norm": 0.6713972091674805,
"learning_rate": 4.127465904866209e-05,
"loss": 0.7779,
"step": 679
},
{
"epoch": 0.04736200592025074,
"grad_norm": 0.6123691201210022,
"learning_rate": 4.1008991788639386e-05,
"loss": 0.6502,
"step": 680
},
{
"epoch": 0.04743165592895699,
"grad_norm": 0.8065311312675476,
"learning_rate": 4.0743961572556686e-05,
"loss": 0.6814,
"step": 681
},
{
"epoch": 0.04750130593766324,
"grad_norm": 0.6417213082313538,
"learning_rate": 4.047957126247541e-05,
"loss": 0.8127,
"step": 682
},
{
"epoch": 0.047570955946369495,
"grad_norm": 0.7060418725013733,
"learning_rate": 4.021582371354674e-05,
"loss": 0.9657,
"step": 683
},
{
"epoch": 0.04764060595507574,
"grad_norm": 0.6365180015563965,
"learning_rate": 3.99527217739807e-05,
"loss": 0.8965,
"step": 684
},
{
"epoch": 0.047710255963782,
"grad_norm": 0.7569335103034973,
"learning_rate": 3.969026828501523e-05,
"loss": 0.9742,
"step": 685
},
{
"epoch": 0.04777990597248825,
"grad_norm": 0.6113385558128357,
"learning_rate": 3.942846608088583e-05,
"loss": 0.8562,
"step": 686
},
{
"epoch": 0.047849555981194496,
"grad_norm": 0.5718615651130676,
"learning_rate": 3.916731798879462e-05,
"loss": 0.6826,
"step": 687
},
{
"epoch": 0.04791920598990075,
"grad_norm": 0.718606173992157,
"learning_rate": 3.8906826828880085e-05,
"loss": 0.5029,
"step": 688
},
{
"epoch": 0.047988855998607,
"grad_norm": 0.745060384273529,
"learning_rate": 3.8646995414186396e-05,
"loss": 0.4777,
"step": 689
},
{
"epoch": 0.04805850600731325,
"grad_norm": 0.6253296136856079,
"learning_rate": 3.838782655063325e-05,
"loss": 0.4763,
"step": 690
},
{
"epoch": 0.048128156016019503,
"grad_norm": 0.7446655631065369,
"learning_rate": 3.812932303698533e-05,
"loss": 0.7823,
"step": 691
},
{
"epoch": 0.04819780602472575,
"grad_norm": 0.7678576111793518,
"learning_rate": 3.7871487664822326e-05,
"loss": 0.7656,
"step": 692
},
{
"epoch": 0.04826745603343201,
"grad_norm": 0.7170537710189819,
"learning_rate": 3.7614323218508506e-05,
"loss": 1.0093,
"step": 693
},
{
"epoch": 0.048337106042138256,
"grad_norm": 0.7178253531455994,
"learning_rate": 3.7357832475163045e-05,
"loss": 0.9605,
"step": 694
},
{
"epoch": 0.048406756050844504,
"grad_norm": 0.6666684746742249,
"learning_rate": 3.710201820462956e-05,
"loss": 0.9654,
"step": 695
},
{
"epoch": 0.04847640605955076,
"grad_norm": 0.6459413766860962,
"learning_rate": 3.6846883169446625e-05,
"loss": 0.6705,
"step": 696
},
{
"epoch": 0.04854605606825701,
"grad_norm": 0.6586235165596008,
"learning_rate": 3.659243012481757e-05,
"loss": 1.0915,
"step": 697
},
{
"epoch": 0.04861570607696326,
"grad_norm": 0.6067480444908142,
"learning_rate": 3.63386618185811e-05,
"loss": 0.8191,
"step": 698
},
{
"epoch": 0.04868535608566951,
"grad_norm": 0.7405864000320435,
"learning_rate": 3.6085580991181256e-05,
"loss": 0.9778,
"step": 699
},
{
"epoch": 0.04875500609437576,
"grad_norm": 0.6318597197532654,
"learning_rate": 3.583319037563816e-05,
"loss": 0.6675,
"step": 700
},
{
"epoch": 0.04875500609437576,
"eval_loss": 0.7419635653495789,
"eval_runtime": 700.4042,
"eval_samples_per_second": 7.139,
"eval_steps_per_second": 1.785,
"step": 700
},
{
"epoch": 0.048824656103082016,
"grad_norm": 0.6579747200012207,
"learning_rate": 3.558149269751816e-05,
"loss": 0.64,
"step": 701
},
{
"epoch": 0.048894306111788265,
"grad_norm": 0.6741796731948853,
"learning_rate": 3.5330490674904735e-05,
"loss": 0.7894,
"step": 702
},
{
"epoch": 0.04896395612049451,
"grad_norm": 0.691154956817627,
"learning_rate": 3.5080187018368846e-05,
"loss": 0.8126,
"step": 703
},
{
"epoch": 0.04903360612920077,
"grad_norm": 0.5884422659873962,
"learning_rate": 3.483058443093989e-05,
"loss": 0.4997,
"step": 704
},
{
"epoch": 0.04910325613790702,
"grad_norm": 0.8021077513694763,
"learning_rate": 3.458168560807643e-05,
"loss": 0.9094,
"step": 705
},
{
"epoch": 0.049172906146613266,
"grad_norm": 0.6837207674980164,
"learning_rate": 3.433349323763696e-05,
"loss": 0.8385,
"step": 706
},
{
"epoch": 0.04924255615531952,
"grad_norm": 0.815160870552063,
"learning_rate": 3.408600999985112e-05,
"loss": 0.7504,
"step": 707
},
{
"epoch": 0.04931220616402577,
"grad_norm": 0.6362173557281494,
"learning_rate": 3.383923856729052e-05,
"loss": 0.962,
"step": 708
},
{
"epoch": 0.049381856172732025,
"grad_norm": 0.7275608777999878,
"learning_rate": 3.359318160484011e-05,
"loss": 1.1645,
"step": 709
},
{
"epoch": 0.04945150618143827,
"grad_norm": 0.7200846672058105,
"learning_rate": 3.334784176966912e-05,
"loss": 1.1489,
"step": 710
},
{
"epoch": 0.04952115619014452,
"grad_norm": 0.7058080434799194,
"learning_rate": 3.310322171120267e-05,
"loss": 0.7897,
"step": 711
},
{
"epoch": 0.04959080619885078,
"grad_norm": 0.6900257468223572,
"learning_rate": 3.28593240710929e-05,
"loss": 0.8203,
"step": 712
},
{
"epoch": 0.049660456207557026,
"grad_norm": 0.6234864592552185,
"learning_rate": 3.261615148319063e-05,
"loss": 0.8475,
"step": 713
},
{
"epoch": 0.049730106216263274,
"grad_norm": 0.7157082557678223,
"learning_rate": 3.2373706573516794e-05,
"loss": 1.1521,
"step": 714
},
{
"epoch": 0.04979975622496953,
"grad_norm": 0.6452792286872864,
"learning_rate": 3.21319919602342e-05,
"loss": 0.7429,
"step": 715
},
{
"epoch": 0.04986940623367578,
"grad_norm": 0.6651695966720581,
"learning_rate": 3.189101025361905e-05,
"loss": 0.7481,
"step": 716
},
{
"epoch": 0.049939056242382034,
"grad_norm": 0.5767229199409485,
"learning_rate": 3.165076405603303e-05,
"loss": 1.2513,
"step": 717
},
{
"epoch": 0.05000870625108828,
"grad_norm": 0.6223350763320923,
"learning_rate": 3.141125596189494e-05,
"loss": 1.0635,
"step": 718
},
{
"epoch": 0.05007835625979453,
"grad_norm": 0.6872287392616272,
"learning_rate": 3.117248855765294e-05,
"loss": 0.6846,
"step": 719
},
{
"epoch": 0.050148006268500786,
"grad_norm": 0.6780046224594116,
"learning_rate": 3.093446442175631e-05,
"loss": 0.7238,
"step": 720
},
{
"epoch": 0.050217656277207035,
"grad_norm": 0.5555802583694458,
"learning_rate": 3.069718612462793e-05,
"loss": 0.8503,
"step": 721
},
{
"epoch": 0.05028730628591328,
"grad_norm": 0.7299566268920898,
"learning_rate": 3.0460656228636254e-05,
"loss": 0.8579,
"step": 722
},
{
"epoch": 0.05035695629461954,
"grad_norm": 0.6805000305175781,
"learning_rate": 3.022487728806783e-05,
"loss": 0.8994,
"step": 723
},
{
"epoch": 0.05042660630332579,
"grad_norm": 0.5568419098854065,
"learning_rate": 2.9989851849099594e-05,
"loss": 0.9992,
"step": 724
},
{
"epoch": 0.05049625631203204,
"grad_norm": 0.7006337642669678,
"learning_rate": 2.9755582449771457e-05,
"loss": 0.9476,
"step": 725
},
{
"epoch": 0.05056590632073829,
"grad_norm": 0.7835425734519958,
"learning_rate": 2.952207161995879e-05,
"loss": 1.0143,
"step": 726
},
{
"epoch": 0.05063555632944454,
"grad_norm": 0.6196465492248535,
"learning_rate": 2.9289321881345254e-05,
"loss": 0.7623,
"step": 727
},
{
"epoch": 0.050705206338150795,
"grad_norm": 0.7238385677337646,
"learning_rate": 2.905733574739542e-05,
"loss": 0.9173,
"step": 728
},
{
"epoch": 0.05077485634685704,
"grad_norm": 0.45640066266059875,
"learning_rate": 2.8826115723327684e-05,
"loss": 0.3747,
"step": 729
},
{
"epoch": 0.05084450635556329,
"grad_norm": 0.7860556840896606,
"learning_rate": 2.8595664306087312e-05,
"loss": 0.677,
"step": 730
},
{
"epoch": 0.05091415636426955,
"grad_norm": 0.7076509594917297,
"learning_rate": 2.8365983984319254e-05,
"loss": 0.6773,
"step": 731
},
{
"epoch": 0.050983806372975796,
"grad_norm": 0.5683595538139343,
"learning_rate": 2.8137077238341525e-05,
"loss": 0.7685,
"step": 732
},
{
"epoch": 0.05105345638168205,
"grad_norm": 0.6466002464294434,
"learning_rate": 2.7908946540118208e-05,
"loss": 0.6539,
"step": 733
},
{
"epoch": 0.0511231063903883,
"grad_norm": 0.7310590147972107,
"learning_rate": 2.7681594353232932e-05,
"loss": 0.6498,
"step": 734
},
{
"epoch": 0.05119275639909455,
"grad_norm": 0.6998217701911926,
"learning_rate": 2.7455023132862044e-05,
"loss": 0.827,
"step": 735
},
{
"epoch": 0.051262406407800803,
"grad_norm": 0.6120029091835022,
"learning_rate": 2.7229235325748393e-05,
"loss": 0.7574,
"step": 736
},
{
"epoch": 0.05133205641650705,
"grad_norm": 0.6969332695007324,
"learning_rate": 2.7004233370174603e-05,
"loss": 0.9495,
"step": 737
},
{
"epoch": 0.0514017064252133,
"grad_norm": 0.5970465540885925,
"learning_rate": 2.6780019695937008e-05,
"loss": 0.826,
"step": 738
},
{
"epoch": 0.051471356433919556,
"grad_norm": 0.5893230438232422,
"learning_rate": 2.6556596724319193e-05,
"loss": 0.5827,
"step": 739
},
{
"epoch": 0.051541006442625804,
"grad_norm": 0.6217379570007324,
"learning_rate": 2.6333966868066042e-05,
"loss": 0.833,
"step": 740
},
{
"epoch": 0.05161065645133206,
"grad_norm": 0.7289059162139893,
"learning_rate": 2.6112132531357457e-05,
"loss": 0.6796,
"step": 741
},
{
"epoch": 0.05168030646003831,
"grad_norm": 0.6685306429862976,
"learning_rate": 2.5891096109782642e-05,
"loss": 0.8579,
"step": 742
},
{
"epoch": 0.05174995646874456,
"grad_norm": 0.6785428524017334,
"learning_rate": 2.567085999031408e-05,
"loss": 1.1535,
"step": 743
},
{
"epoch": 0.05181960647745081,
"grad_norm": 0.5720734000205994,
"learning_rate": 2.5451426551281798e-05,
"loss": 0.8504,
"step": 744
},
{
"epoch": 0.05188925648615706,
"grad_norm": 0.8368062376976013,
"learning_rate": 2.5232798162347604e-05,
"loss": 0.866,
"step": 745
},
{
"epoch": 0.05195890649486331,
"grad_norm": 0.5373237133026123,
"learning_rate": 2.5014977184479694e-05,
"loss": 1.1392,
"step": 746
},
{
"epoch": 0.052028556503569565,
"grad_norm": 0.9247710704803467,
"learning_rate": 2.4797965969926907e-05,
"loss": 0.8317,
"step": 747
},
{
"epoch": 0.05209820651227581,
"grad_norm": 0.6235398650169373,
"learning_rate": 2.4581766862193556e-05,
"loss": 0.889,
"step": 748
},
{
"epoch": 0.05216785652098207,
"grad_norm": 0.5890073776245117,
"learning_rate": 2.4366382196013892e-05,
"loss": 1.0977,
"step": 749
},
{
"epoch": 0.05223750652968832,
"grad_norm": 0.5582912564277649,
"learning_rate": 2.4151814297327158e-05,
"loss": 0.6759,
"step": 750
},
{
"epoch": 0.052307156538394566,
"grad_norm": 0.6418405771255493,
"learning_rate": 2.3938065483252183e-05,
"loss": 0.5678,
"step": 751
},
{
"epoch": 0.05237680654710082,
"grad_norm": 0.5797872543334961,
"learning_rate": 2.372513806206258e-05,
"loss": 0.6385,
"step": 752
},
{
"epoch": 0.05244645655580707,
"grad_norm": 0.6586098074913025,
"learning_rate": 2.3513034333161765e-05,
"loss": 0.8608,
"step": 753
},
{
"epoch": 0.05251610656451332,
"grad_norm": 0.5528561472892761,
"learning_rate": 2.3301756587057987e-05,
"loss": 0.6811,
"step": 754
},
{
"epoch": 0.05258575657321957,
"grad_norm": 0.5883040428161621,
"learning_rate": 2.3091307105339856e-05,
"loss": 0.6142,
"step": 755
},
{
"epoch": 0.05265540658192582,
"grad_norm": 0.9445425271987915,
"learning_rate": 2.2881688160651405e-05,
"loss": 0.8142,
"step": 756
},
{
"epoch": 0.05272505659063208,
"grad_norm": 0.6835020184516907,
"learning_rate": 2.267290201666782e-05,
"loss": 0.8235,
"step": 757
},
{
"epoch": 0.052794706599338326,
"grad_norm": 0.6816075444221497,
"learning_rate": 2.246495092807077e-05,
"loss": 1.0772,
"step": 758
},
{
"epoch": 0.052864356608044574,
"grad_norm": 0.5880750417709351,
"learning_rate": 2.2257837140524274e-05,
"loss": 1.0342,
"step": 759
},
{
"epoch": 0.05293400661675083,
"grad_norm": 0.6749791502952576,
"learning_rate": 2.20515628906502e-05,
"loss": 0.6126,
"step": 760
},
{
"epoch": 0.05300365662545708,
"grad_norm": 0.7459970712661743,
"learning_rate": 2.1846130406004396e-05,
"loss": 0.6544,
"step": 761
},
{
"epoch": 0.05307330663416333,
"grad_norm": 0.5859512686729431,
"learning_rate": 2.164154190505231e-05,
"loss": 0.7144,
"step": 762
},
{
"epoch": 0.05314295664286958,
"grad_norm": 0.6339436173439026,
"learning_rate": 2.1437799597145425e-05,
"loss": 0.5725,
"step": 763
},
{
"epoch": 0.05321260665157583,
"grad_norm": 0.7248126268386841,
"learning_rate": 2.1234905682496986e-05,
"loss": 0.7997,
"step": 764
},
{
"epoch": 0.05328225666028208,
"grad_norm": 0.6739416718482971,
"learning_rate": 2.103286235215859e-05,
"loss": 0.7482,
"step": 765
},
{
"epoch": 0.053351906668988335,
"grad_norm": 0.7312667369842529,
"learning_rate": 2.083167178799623e-05,
"loss": 1.0439,
"step": 766
},
{
"epoch": 0.05342155667769458,
"grad_norm": 0.6655896902084351,
"learning_rate": 2.0631336162667035e-05,
"loss": 0.8695,
"step": 767
},
{
"epoch": 0.05349120668640084,
"grad_norm": 0.6517478823661804,
"learning_rate": 2.0431857639595486e-05,
"loss": 0.6283,
"step": 768
},
{
"epoch": 0.05356085669510709,
"grad_norm": 0.5833168029785156,
"learning_rate": 2.023323837295037e-05,
"loss": 1.2862,
"step": 769
},
{
"epoch": 0.053630506703813335,
"grad_norm": 0.45417115092277527,
"learning_rate": 2.0035480507621218e-05,
"loss": 0.4238,
"step": 770
},
{
"epoch": 0.05370015671251959,
"grad_norm": 0.6575907468795776,
"learning_rate": 1.983858617919543e-05,
"loss": 1.034,
"step": 771
},
{
"epoch": 0.05376980672122584,
"grad_norm": 0.606704831123352,
"learning_rate": 1.9642557513934933e-05,
"loss": 0.8014,
"step": 772
},
{
"epoch": 0.05383945672993209,
"grad_norm": 0.594321608543396,
"learning_rate": 1.9447396628753467e-05,
"loss": 0.5752,
"step": 773
},
{
"epoch": 0.05390910673863834,
"grad_norm": 0.7383103966712952,
"learning_rate": 1.925310563119358e-05,
"loss": 0.7493,
"step": 774
},
{
"epoch": 0.05397875674734459,
"grad_norm": 0.636978268623352,
"learning_rate": 1.905968661940385e-05,
"loss": 0.4319,
"step": 775
},
{
"epoch": 0.05404840675605085,
"grad_norm": 0.6960916519165039,
"learning_rate": 1.8867141682116374e-05,
"loss": 0.9924,
"step": 776
},
{
"epoch": 0.054118056764757096,
"grad_norm": 0.649654746055603,
"learning_rate": 1.8675472898624014e-05,
"loss": 0.7308,
"step": 777
},
{
"epoch": 0.054187706773463344,
"grad_norm": 0.6827317476272583,
"learning_rate": 1.8484682338758152e-05,
"loss": 0.7227,
"step": 778
},
{
"epoch": 0.0542573567821696,
"grad_norm": 0.6983030438423157,
"learning_rate": 1.8294772062866138e-05,
"loss": 0.8553,
"step": 779
},
{
"epoch": 0.05432700679087585,
"grad_norm": 0.5816463232040405,
"learning_rate": 1.8105744121789225e-05,
"loss": 0.7053,
"step": 780
},
{
"epoch": 0.0543966567995821,
"grad_norm": 0.8149849772453308,
"learning_rate": 1.791760055684023e-05,
"loss": 0.7378,
"step": 781
},
{
"epoch": 0.05446630680828835,
"grad_norm": 0.626234233379364,
"learning_rate": 1.7730343399781668e-05,
"loss": 0.8566,
"step": 782
},
{
"epoch": 0.0545359568169946,
"grad_norm": 0.7223556637763977,
"learning_rate": 1.754397467280372e-05,
"loss": 0.7798,
"step": 783
},
{
"epoch": 0.054605606825700856,
"grad_norm": 0.6546375155448914,
"learning_rate": 1.735849638850242e-05,
"loss": 1.0634,
"step": 784
},
{
"epoch": 0.054675256834407104,
"grad_norm": 0.6382943987846375,
"learning_rate": 1.7173910549857854e-05,
"loss": 0.7336,
"step": 785
},
{
"epoch": 0.05474490684311335,
"grad_norm": 0.592207133769989,
"learning_rate": 1.699021915021266e-05,
"loss": 0.5601,
"step": 786
},
{
"epoch": 0.05481455685181961,
"grad_norm": 0.6741936206817627,
"learning_rate": 1.6807424173250354e-05,
"loss": 0.9638,
"step": 787
},
{
"epoch": 0.05488420686052586,
"grad_norm": 0.5983725190162659,
"learning_rate": 1.6625527592974077e-05,
"loss": 0.7403,
"step": 788
},
{
"epoch": 0.054953856869232105,
"grad_norm": 0.5087631940841675,
"learning_rate": 1.6444531373685078e-05,
"loss": 0.9725,
"step": 789
},
{
"epoch": 0.05502350687793836,
"grad_norm": 0.7693138122558594,
"learning_rate": 1.6264437469961703e-05,
"loss": 0.6232,
"step": 790
},
{
"epoch": 0.05509315688664461,
"grad_norm": 0.9830653071403503,
"learning_rate": 1.6085247826638093e-05,
"loss": 0.7752,
"step": 791
},
{
"epoch": 0.055162806895350865,
"grad_norm": 0.6889302134513855,
"learning_rate": 1.5906964378783373e-05,
"loss": 0.6974,
"step": 792
},
{
"epoch": 0.05523245690405711,
"grad_norm": 0.6805455088615417,
"learning_rate": 1.5729589051680647e-05,
"loss": 0.9143,
"step": 793
},
{
"epoch": 0.05530210691276336,
"grad_norm": 0.6505549550056458,
"learning_rate": 1.5553123760806143e-05,
"loss": 0.6784,
"step": 794
},
{
"epoch": 0.05537175692146962,
"grad_norm": 0.6062676310539246,
"learning_rate": 1.5377570411808718e-05,
"loss": 0.8088,
"step": 795
},
{
"epoch": 0.055441406930175866,
"grad_norm": 0.5329009890556335,
"learning_rate": 1.5202930900489054e-05,
"loss": 0.4477,
"step": 796
},
{
"epoch": 0.055511056938882114,
"grad_norm": 0.6530266404151917,
"learning_rate": 1.502920711277943e-05,
"loss": 0.7462,
"step": 797
},
{
"epoch": 0.05558070694758837,
"grad_norm": 0.6333693861961365,
"learning_rate": 1.4856400924723079e-05,
"loss": 1.1035,
"step": 798
},
{
"epoch": 0.05565035695629462,
"grad_norm": 0.7612791061401367,
"learning_rate": 1.4684514202454225e-05,
"loss": 0.9053,
"step": 799
},
{
"epoch": 0.05572000696500087,
"grad_norm": 0.6711084842681885,
"learning_rate": 1.4513548802177634e-05,
"loss": 1.0815,
"step": 800
},
{
"epoch": 0.05572000696500087,
"eval_loss": 0.7279470562934875,
"eval_runtime": 700.3911,
"eval_samples_per_second": 7.139,
"eval_steps_per_second": 1.785,
"step": 800
},
{
"epoch": 0.05578965697370712,
"grad_norm": 0.8243626356124878,
"learning_rate": 1.4343506570148846e-05,
"loss": 0.9067,
"step": 801
},
{
"epoch": 0.05585930698241337,
"grad_norm": 0.740206241607666,
"learning_rate": 1.4174389342653971e-05,
"loss": 1.0956,
"step": 802
},
{
"epoch": 0.055928956991119626,
"grad_norm": 0.6383155584335327,
"learning_rate": 1.4006198945990168e-05,
"loss": 0.9274,
"step": 803
},
{
"epoch": 0.055998606999825874,
"grad_norm": 0.7425148487091064,
"learning_rate": 1.3838937196445579e-05,
"loss": 1.083,
"step": 804
},
{
"epoch": 0.05606825700853212,
"grad_norm": 0.6034273505210876,
"learning_rate": 1.367260590028e-05,
"loss": 0.7125,
"step": 805
},
{
"epoch": 0.05613790701723838,
"grad_norm": 0.7047588229179382,
"learning_rate": 1.3507206853705178e-05,
"loss": 0.7749,
"step": 806
},
{
"epoch": 0.05620755702594463,
"grad_norm": 0.7387014627456665,
"learning_rate": 1.334274184286558e-05,
"loss": 0.7397,
"step": 807
},
{
"epoch": 0.05627720703465088,
"grad_norm": 0.6060226559638977,
"learning_rate": 1.3179212643818929e-05,
"loss": 0.5144,
"step": 808
},
{
"epoch": 0.05634685704335713,
"grad_norm": 0.7422417402267456,
"learning_rate": 1.3016621022517206e-05,
"loss": 0.7739,
"step": 809
},
{
"epoch": 0.05641650705206338,
"grad_norm": 0.6336711645126343,
"learning_rate": 1.2854968734787398e-05,
"loss": 0.471,
"step": 810
},
{
"epoch": 0.056486157060769634,
"grad_norm": 0.667668879032135,
"learning_rate": 1.2694257526312725e-05,
"loss": 0.4143,
"step": 811
},
{
"epoch": 0.05655580706947588,
"grad_norm": 0.6936927437782288,
"learning_rate": 1.2534489132613603e-05,
"loss": 0.8842,
"step": 812
},
{
"epoch": 0.05662545707818213,
"grad_norm": 0.6019664406776428,
"learning_rate": 1.2375665279029048e-05,
"loss": 0.7445,
"step": 813
},
{
"epoch": 0.05669510708688839,
"grad_norm": 0.7595625519752502,
"learning_rate": 1.221778768069799e-05,
"loss": 0.8676,
"step": 814
},
{
"epoch": 0.056764757095594635,
"grad_norm": 0.593315839767456,
"learning_rate": 1.206085804254069e-05,
"loss": 0.7546,
"step": 815
},
{
"epoch": 0.05683440710430089,
"grad_norm": 0.7907949090003967,
"learning_rate": 1.1904878059240442e-05,
"loss": 1.0131,
"step": 816
},
{
"epoch": 0.05690405711300714,
"grad_norm": 0.6472040414810181,
"learning_rate": 1.174984941522519e-05,
"loss": 0.6795,
"step": 817
},
{
"epoch": 0.05697370712171339,
"grad_norm": 0.6748494505882263,
"learning_rate": 1.1595773784649389e-05,
"loss": 0.9777,
"step": 818
},
{
"epoch": 0.05704335713041964,
"grad_norm": 0.7594382166862488,
"learning_rate": 1.1442652831375855e-05,
"loss": 0.8305,
"step": 819
},
{
"epoch": 0.05711300713912589,
"grad_norm": 0.5605437159538269,
"learning_rate": 1.1290488208957895e-05,
"loss": 0.9774,
"step": 820
},
{
"epoch": 0.05718265714783214,
"grad_norm": 0.7108663320541382,
"learning_rate": 1.1139281560621362e-05,
"loss": 1.1447,
"step": 821
},
{
"epoch": 0.057252307156538396,
"grad_norm": 0.7549561858177185,
"learning_rate": 1.0989034519246954e-05,
"loss": 1.0838,
"step": 822
},
{
"epoch": 0.057321957165244644,
"grad_norm": 0.5975289940834045,
"learning_rate": 1.0839748707352603e-05,
"loss": 1.0126,
"step": 823
},
{
"epoch": 0.0573916071739509,
"grad_norm": 0.6680442094802856,
"learning_rate": 1.06914257370759e-05,
"loss": 0.5809,
"step": 824
},
{
"epoch": 0.05746125718265715,
"grad_norm": 0.7288407683372498,
"learning_rate": 1.0544067210156671e-05,
"loss": 0.9369,
"step": 825
},
{
"epoch": 0.0575309071913634,
"grad_norm": 0.7064124345779419,
"learning_rate": 1.0397674717919802e-05,
"loss": 0.8142,
"step": 826
},
{
"epoch": 0.05760055720006965,
"grad_norm": 0.6422365307807922,
"learning_rate": 1.0252249841257877e-05,
"loss": 0.5993,
"step": 827
},
{
"epoch": 0.0576702072087759,
"grad_norm": 0.6080381870269775,
"learning_rate": 1.0107794150614281e-05,
"loss": 0.6939,
"step": 828
},
{
"epoch": 0.05773985721748215,
"grad_norm": 0.6256659030914307,
"learning_rate": 9.964309205966083e-06,
"loss": 0.4506,
"step": 829
},
{
"epoch": 0.057809507226188404,
"grad_norm": 0.6198416352272034,
"learning_rate": 9.821796556807339e-06,
"loss": 0.6324,
"step": 830
},
{
"epoch": 0.05787915723489465,
"grad_norm": 0.6347202658653259,
"learning_rate": 9.680257742132215e-06,
"loss": 0.6047,
"step": 831
},
{
"epoch": 0.05794880724360091,
"grad_norm": 0.60918128490448,
"learning_rate": 9.539694290418488e-06,
"loss": 0.9085,
"step": 832
},
{
"epoch": 0.05801845725230716,
"grad_norm": 0.6706361174583435,
"learning_rate": 9.400107719610995e-06,
"loss": 0.9078,
"step": 833
},
{
"epoch": 0.058088107261013405,
"grad_norm": 0.7337279915809631,
"learning_rate": 9.261499537105177e-06,
"loss": 1.0197,
"step": 834
},
{
"epoch": 0.05815775726971966,
"grad_norm": 0.5747254490852356,
"learning_rate": 9.12387123973093e-06,
"loss": 0.8288,
"step": 835
},
{
"epoch": 0.05822740727842591,
"grad_norm": 0.6484262347221375,
"learning_rate": 8.98722431373631e-06,
"loss": 1.1276,
"step": 836
},
{
"epoch": 0.05829705728713216,
"grad_norm": 0.6793870329856873,
"learning_rate": 8.851560234771594e-06,
"loss": 0.5941,
"step": 837
},
{
"epoch": 0.05836670729583841,
"grad_norm": 0.6910689473152161,
"learning_rate": 8.716880467873234e-06,
"loss": 0.9097,
"step": 838
},
{
"epoch": 0.05843635730454466,
"grad_norm": 0.7062430381774902,
"learning_rate": 8.583186467448167e-06,
"loss": 0.9619,
"step": 839
},
{
"epoch": 0.05850600731325092,
"grad_norm": 0.8270265460014343,
"learning_rate": 8.45047967725796e-06,
"loss": 1.0196,
"step": 840
},
{
"epoch": 0.058575657321957166,
"grad_norm": 0.6949748992919922,
"learning_rate": 8.318761530403374e-06,
"loss": 0.5329,
"step": 841
},
{
"epoch": 0.058645307330663414,
"grad_norm": 0.7285637855529785,
"learning_rate": 8.188033449308719e-06,
"loss": 0.6849,
"step": 842
},
{
"epoch": 0.05871495733936967,
"grad_norm": 0.5861655473709106,
"learning_rate": 8.058296845706715e-06,
"loss": 0.8638,
"step": 843
},
{
"epoch": 0.05878460734807592,
"grad_norm": 0.7448881268501282,
"learning_rate": 7.929553120622968e-06,
"loss": 0.8458,
"step": 844
},
{
"epoch": 0.058854257356782166,
"grad_norm": 0.5610641241073608,
"learning_rate": 7.801803664361095e-06,
"loss": 0.4706,
"step": 845
},
{
"epoch": 0.05892390736548842,
"grad_norm": 0.5610293745994568,
"learning_rate": 7.675049856487549e-06,
"loss": 0.5503,
"step": 846
},
{
"epoch": 0.05899355737419467,
"grad_norm": 0.6175963282585144,
"learning_rate": 7.5492930658168096e-06,
"loss": 0.6195,
"step": 847
},
{
"epoch": 0.059063207382900926,
"grad_norm": 0.6749705672264099,
"learning_rate": 7.42453465039652e-06,
"loss": 0.7353,
"step": 848
},
{
"epoch": 0.059132857391607174,
"grad_norm": 0.6812541484832764,
"learning_rate": 7.300775957492923e-06,
"loss": 0.6882,
"step": 849
},
{
"epoch": 0.05920250740031342,
"grad_norm": 0.6131837368011475,
"learning_rate": 7.178018323576208e-06,
"loss": 0.9945,
"step": 850
},
{
"epoch": 0.05927215740901968,
"grad_norm": 0.6159570217132568,
"learning_rate": 7.056263074306191e-06,
"loss": 0.7943,
"step": 851
},
{
"epoch": 0.05934180741772593,
"grad_norm": 0.7175585627555847,
"learning_rate": 6.935511524517835e-06,
"loss": 0.8498,
"step": 852
},
{
"epoch": 0.059411457426432175,
"grad_norm": 0.7083918452262878,
"learning_rate": 6.815764978207284e-06,
"loss": 0.9473,
"step": 853
},
{
"epoch": 0.05948110743513843,
"grad_norm": 0.7349149584770203,
"learning_rate": 6.6970247285175315e-06,
"loss": 0.9025,
"step": 854
},
{
"epoch": 0.05955075744384468,
"grad_norm": 0.6739192008972168,
"learning_rate": 6.579292057724639e-06,
"loss": 0.8435,
"step": 855
},
{
"epoch": 0.059620407452550934,
"grad_norm": 0.6588095426559448,
"learning_rate": 6.4625682372237874e-06,
"loss": 0.6966,
"step": 856
},
{
"epoch": 0.05969005746125718,
"grad_norm": 0.5185966491699219,
"learning_rate": 6.346854527515622e-06,
"loss": 0.6977,
"step": 857
},
{
"epoch": 0.05975970746996343,
"grad_norm": 0.5705149173736572,
"learning_rate": 6.23215217819253e-06,
"loss": 0.6574,
"step": 858
},
{
"epoch": 0.05982935747866969,
"grad_norm": 0.5465989112854004,
"learning_rate": 6.11846242792532e-06,
"loss": 0.5492,
"step": 859
},
{
"epoch": 0.059899007487375935,
"grad_norm": 0.7820805311203003,
"learning_rate": 6.005786504449651e-06,
"loss": 0.8664,
"step": 860
},
{
"epoch": 0.059968657496082184,
"grad_norm": 0.7436554431915283,
"learning_rate": 5.894125624552915e-06,
"loss": 0.9035,
"step": 861
},
{
"epoch": 0.06003830750478844,
"grad_norm": 0.7402638792991638,
"learning_rate": 5.7834809940610195e-06,
"loss": 0.7703,
"step": 862
},
{
"epoch": 0.06010795751349469,
"grad_norm": 0.6208961009979248,
"learning_rate": 5.673853807825424e-06,
"loss": 0.7226,
"step": 863
},
{
"epoch": 0.06017760752220094,
"grad_norm": 0.5884114503860474,
"learning_rate": 5.565245249710194e-06,
"loss": 1.0493,
"step": 864
},
{
"epoch": 0.06024725753090719,
"grad_norm": 0.7064511179924011,
"learning_rate": 5.457656492579211e-06,
"loss": 1.0538,
"step": 865
},
{
"epoch": 0.06031690753961344,
"grad_norm": 0.714733362197876,
"learning_rate": 5.351088698283558e-06,
"loss": 0.7942,
"step": 866
},
{
"epoch": 0.060386557548319696,
"grad_norm": 0.6394374966621399,
"learning_rate": 5.2455430176489014e-06,
"loss": 0.7437,
"step": 867
},
{
"epoch": 0.060456207557025944,
"grad_norm": 0.6636267900466919,
"learning_rate": 5.1410205904631415e-06,
"loss": 0.8204,
"step": 868
},
{
"epoch": 0.06052585756573219,
"grad_norm": 0.6036087274551392,
"learning_rate": 5.037522545464024e-06,
"loss": 0.7066,
"step": 869
},
{
"epoch": 0.06059550757443845,
"grad_norm": 0.6227147579193115,
"learning_rate": 4.9350500003270465e-06,
"loss": 0.7101,
"step": 870
},
{
"epoch": 0.060665157583144697,
"grad_norm": 0.5791090130805969,
"learning_rate": 4.833604061653252e-06,
"loss": 0.6439,
"step": 871
},
{
"epoch": 0.06073480759185095,
"grad_norm": 0.5661488771438599,
"learning_rate": 4.73318582495742e-06,
"loss": 0.5134,
"step": 872
},
{
"epoch": 0.0608044576005572,
"grad_norm": 0.7721818089485168,
"learning_rate": 4.633796374656174e-06,
"loss": 0.8566,
"step": 873
},
{
"epoch": 0.06087410760926345,
"grad_norm": 0.7348571419715881,
"learning_rate": 4.535436784056269e-06,
"loss": 0.6653,
"step": 874
},
{
"epoch": 0.060943757617969704,
"grad_norm": 0.6881682872772217,
"learning_rate": 4.438108115342965e-06,
"loss": 0.7876,
"step": 875
},
{
"epoch": 0.06101340762667595,
"grad_norm": 0.6156147718429565,
"learning_rate": 4.3418114195686536e-06,
"loss": 0.8429,
"step": 876
},
{
"epoch": 0.0610830576353822,
"grad_norm": 0.6420087218284607,
"learning_rate": 4.246547736641382e-06,
"loss": 0.7274,
"step": 877
},
{
"epoch": 0.06115270764408846,
"grad_norm": 0.5134680271148682,
"learning_rate": 4.152318095313778e-06,
"loss": 0.5185,
"step": 878
},
{
"epoch": 0.061222357652794705,
"grad_norm": 0.6913058757781982,
"learning_rate": 4.05912351317177e-06,
"loss": 0.9036,
"step": 879
},
{
"epoch": 0.06129200766150096,
"grad_norm": 0.5641781091690063,
"learning_rate": 3.966964996623735e-06,
"loss": 0.8567,
"step": 880
},
{
"epoch": 0.06136165767020721,
"grad_norm": 0.5682424306869507,
"learning_rate": 3.875843540889546e-06,
"loss": 0.7562,
"step": 881
},
{
"epoch": 0.06143130767891346,
"grad_norm": 0.5852996110916138,
"learning_rate": 3.785760129989868e-06,
"loss": 0.4581,
"step": 882
},
{
"epoch": 0.06150095768761971,
"grad_norm": 0.6625421047210693,
"learning_rate": 3.6967157367355567e-06,
"loss": 1.0613,
"step": 883
},
{
"epoch": 0.06157060769632596,
"grad_norm": 0.7365720868110657,
"learning_rate": 3.6087113227170287e-06,
"loss": 0.8548,
"step": 884
},
{
"epoch": 0.06164025770503221,
"grad_norm": 0.596820592880249,
"learning_rate": 3.5217478382940426e-06,
"loss": 0.7301,
"step": 885
},
{
"epoch": 0.061709907713738466,
"grad_norm": 0.7230522632598877,
"learning_rate": 3.4358262225853254e-06,
"loss": 1.0264,
"step": 886
},
{
"epoch": 0.061779557722444714,
"grad_norm": 0.550679087638855,
"learning_rate": 3.3509474034584596e-06,
"loss": 0.6914,
"step": 887
},
{
"epoch": 0.06184920773115097,
"grad_norm": 0.6080251932144165,
"learning_rate": 3.267112297519881e-06,
"loss": 0.8706,
"step": 888
},
{
"epoch": 0.06191885773985722,
"grad_norm": 0.6070705056190491,
"learning_rate": 3.184321810104962e-06,
"loss": 1.0111,
"step": 889
},
{
"epoch": 0.061988507748563466,
"grad_norm": 0.6949368715286255,
"learning_rate": 3.102576835268212e-06,
"loss": 0.9892,
"step": 890
},
{
"epoch": 0.06205815775726972,
"grad_norm": 0.7588335275650024,
"learning_rate": 3.0218782557737136e-06,
"loss": 0.8309,
"step": 891
},
{
"epoch": 0.06212780776597597,
"grad_norm": 0.5684018135070801,
"learning_rate": 2.9422269430854245e-06,
"loss": 0.8553,
"step": 892
},
{
"epoch": 0.06219745777468222,
"grad_norm": 0.554639995098114,
"learning_rate": 2.863623757357992e-06,
"loss": 0.7984,
"step": 893
},
{
"epoch": 0.062267107783388474,
"grad_norm": 0.653669536113739,
"learning_rate": 2.7860695474272392e-06,
"loss": 0.8296,
"step": 894
},
{
"epoch": 0.06233675779209472,
"grad_norm": 0.610150158405304,
"learning_rate": 2.709565150801152e-06,
"loss": 0.5203,
"step": 895
},
{
"epoch": 0.06240640780080098,
"grad_norm": 0.6130475401878357,
"learning_rate": 2.634111393650751e-06,
"loss": 0.8298,
"step": 896
},
{
"epoch": 0.06247605780950723,
"grad_norm": 0.5449431538581848,
"learning_rate": 2.559709090801221e-06,
"loss": 0.7497,
"step": 897
},
{
"epoch": 0.06254570781821348,
"grad_norm": 0.6247503757476807,
"learning_rate": 2.4863590457230743e-06,
"loss": 1.1263,
"step": 898
},
{
"epoch": 0.06261535782691972,
"grad_norm": 0.7267642617225647,
"learning_rate": 2.4140620505235135e-06,
"loss": 0.7873,
"step": 899
},
{
"epoch": 0.06268500783562599,
"grad_norm": 0.7534024119377136,
"learning_rate": 2.342818885937825e-06,
"loss": 1.0745,
"step": 900
},
{
"epoch": 0.06268500783562599,
"eval_loss": 0.723136305809021,
"eval_runtime": 700.403,
"eval_samples_per_second": 7.139,
"eval_steps_per_second": 1.785,
"step": 900
},
{
"epoch": 0.06275465784433223,
"grad_norm": 0.7109830379486084,
"learning_rate": 2.272630321321023e-06,
"loss": 0.704,
"step": 901
},
{
"epoch": 0.06282430785303848,
"grad_norm": 0.4886980950832367,
"learning_rate": 2.20349711463943e-06,
"loss": 0.4915,
"step": 902
},
{
"epoch": 0.06289395786174473,
"grad_norm": 0.6534592509269714,
"learning_rate": 2.135420012462619e-06,
"loss": 0.6073,
"step": 903
},
{
"epoch": 0.06296360787045098,
"grad_norm": 0.5471417903900146,
"learning_rate": 2.0683997499552632e-06,
"loss": 0.6319,
"step": 904
},
{
"epoch": 0.06303325787915723,
"grad_norm": 0.765691876411438,
"learning_rate": 2.0024370508692104e-06,
"loss": 0.9544,
"step": 905
},
{
"epoch": 0.06310290788786349,
"grad_norm": 0.6834742426872253,
"learning_rate": 1.9375326275357208e-06,
"loss": 0.8162,
"step": 906
},
{
"epoch": 0.06317255789656974,
"grad_norm": 0.7233893871307373,
"learning_rate": 1.8736871808576861e-06,
"loss": 1.0311,
"step": 907
},
{
"epoch": 0.06324220790527599,
"grad_norm": 0.6150738000869751,
"learning_rate": 1.8109014003021452e-06,
"loss": 0.9241,
"step": 908
},
{
"epoch": 0.06331185791398224,
"grad_norm": 0.7470687031745911,
"learning_rate": 1.7491759638927686e-06,
"loss": 1.1686,
"step": 909
},
{
"epoch": 0.06338150792268848,
"grad_norm": 0.7098023295402527,
"learning_rate": 1.6885115382026085e-06,
"loss": 1.1531,
"step": 910
},
{
"epoch": 0.06345115793139475,
"grad_norm": 0.6397354006767273,
"learning_rate": 1.628908778346827e-06,
"loss": 0.9153,
"step": 911
},
{
"epoch": 0.063520807940101,
"grad_norm": 0.6609793305397034,
"learning_rate": 1.5703683279756797e-06,
"loss": 0.641,
"step": 912
},
{
"epoch": 0.06359045794880724,
"grad_norm": 0.7062059640884399,
"learning_rate": 1.5128908192675318e-06,
"loss": 0.7182,
"step": 913
},
{
"epoch": 0.06366010795751349,
"grad_norm": 0.6093196272850037,
"learning_rate": 1.4564768729220412e-06,
"loss": 0.6793,
"step": 914
},
{
"epoch": 0.06372975796621974,
"grad_norm": 0.6978054642677307,
"learning_rate": 1.401127098153443e-06,
"loss": 0.7592,
"step": 915
},
{
"epoch": 0.063799407974926,
"grad_norm": 0.5635403394699097,
"learning_rate": 1.3468420926840197e-06,
"loss": 0.869,
"step": 916
},
{
"epoch": 0.06386905798363225,
"grad_norm": 0.6903446912765503,
"learning_rate": 1.2936224427375521e-06,
"loss": 0.7401,
"step": 917
},
{
"epoch": 0.0639387079923385,
"grad_norm": 0.6210869550704956,
"learning_rate": 1.2414687230331123e-06,
"loss": 0.5908,
"step": 918
},
{
"epoch": 0.06400835800104475,
"grad_norm": 0.6113409399986267,
"learning_rate": 1.1903814967787253e-06,
"loss": 0.5493,
"step": 919
},
{
"epoch": 0.064078008009751,
"grad_norm": 0.9400643706321716,
"learning_rate": 1.1403613156654059e-06,
"loss": 1.0418,
"step": 920
},
{
"epoch": 0.06414765801845725,
"grad_norm": 0.683574378490448,
"learning_rate": 1.091408719861109e-06,
"loss": 0.9345,
"step": 921
},
{
"epoch": 0.06421730802716351,
"grad_norm": 0.7595987915992737,
"learning_rate": 1.0435242380049559e-06,
"loss": 0.8716,
"step": 922
},
{
"epoch": 0.06428695803586976,
"grad_norm": 0.6851724982261658,
"learning_rate": 9.967083872015282e-07,
"loss": 0.5158,
"step": 923
},
{
"epoch": 0.064356608044576,
"grad_norm": 0.6724770069122314,
"learning_rate": 9.509616730151827e-07,
"loss": 0.5133,
"step": 924
},
{
"epoch": 0.06442625805328225,
"grad_norm": 0.6596947312355042,
"learning_rate": 9.062845894647676e-07,
"loss": 0.6722,
"step": 925
},
{
"epoch": 0.0644959080619885,
"grad_norm": 0.5619158148765564,
"learning_rate": 8.626776190181041e-07,
"loss": 0.9499,
"step": 926
},
{
"epoch": 0.06456555807069476,
"grad_norm": 0.7573150992393494,
"learning_rate": 8.20141232586924e-07,
"loss": 0.7521,
"step": 927
},
{
"epoch": 0.06463520807940101,
"grad_norm": 0.6126770377159119,
"learning_rate": 7.786758895216629e-07,
"loss": 0.6616,
"step": 928
},
{
"epoch": 0.06470485808810726,
"grad_norm": 0.7481774687767029,
"learning_rate": 7.382820376066302e-07,
"loss": 0.8779,
"step": 929
},
{
"epoch": 0.06477450809681351,
"grad_norm": 0.7029200792312622,
"learning_rate": 6.98960113055025e-07,
"loss": 0.7685,
"step": 930
},
{
"epoch": 0.06484415810551976,
"grad_norm": 0.6455416679382324,
"learning_rate": 6.607105405043612e-07,
"loss": 1.0069,
"step": 931
},
{
"epoch": 0.06491380811422602,
"grad_norm": 0.7011751532554626,
"learning_rate": 6.23533733011783e-07,
"loss": 0.6548,
"step": 932
},
{
"epoch": 0.06498345812293227,
"grad_norm": 0.7533524036407471,
"learning_rate": 5.8743009204969e-07,
"loss": 0.7463,
"step": 933
},
{
"epoch": 0.06505310813163852,
"grad_norm": 0.5586950182914734,
"learning_rate": 5.52400007501297e-07,
"loss": 0.6125,
"step": 934
},
{
"epoch": 0.06512275814034477,
"grad_norm": 0.6539096832275391,
"learning_rate": 5.184438576565253e-07,
"loss": 0.8559,
"step": 935
},
{
"epoch": 0.06519240814905101,
"grad_norm": 0.7584323883056641,
"learning_rate": 4.855620092078627e-07,
"loss": 1.1142,
"step": 936
},
{
"epoch": 0.06526205815775726,
"grad_norm": 0.6609397530555725,
"learning_rate": 4.537548172464101e-07,
"loss": 0.8978,
"step": 937
},
{
"epoch": 0.06533170816646353,
"grad_norm": 0.6159988641738892,
"learning_rate": 4.230226252580516e-07,
"loss": 0.6993,
"step": 938
},
{
"epoch": 0.06540135817516977,
"grad_norm": 0.6153664588928223,
"learning_rate": 3.9336576511976863e-07,
"loss": 0.4574,
"step": 939
},
{
"epoch": 0.06547100818387602,
"grad_norm": 0.6489300727844238,
"learning_rate": 3.6478455709598734e-07,
"loss": 0.7568,
"step": 940
},
{
"epoch": 0.06554065819258227,
"grad_norm": 0.6248874664306641,
"learning_rate": 3.372793098352256e-07,
"loss": 0.6879,
"step": 941
},
{
"epoch": 0.06561030820128852,
"grad_norm": 0.5801978707313538,
"learning_rate": 3.108503203666402e-07,
"loss": 0.7331,
"step": 942
},
{
"epoch": 0.06567995820999478,
"grad_norm": 0.605501115322113,
"learning_rate": 2.8549787409691833e-07,
"loss": 0.6179,
"step": 943
},
{
"epoch": 0.06574960821870103,
"grad_norm": 0.5972608327865601,
"learning_rate": 2.6122224480715775e-07,
"loss": 0.6514,
"step": 944
},
{
"epoch": 0.06581925822740728,
"grad_norm": 0.7556172609329224,
"learning_rate": 2.380236946498693e-07,
"loss": 0.8719,
"step": 945
},
{
"epoch": 0.06588890823611353,
"grad_norm": 0.6486802101135254,
"learning_rate": 2.1590247414624566e-07,
"loss": 0.5719,
"step": 946
},
{
"epoch": 0.06595855824481978,
"grad_norm": 0.638469398021698,
"learning_rate": 1.948588221833303e-07,
"loss": 0.6393,
"step": 947
},
{
"epoch": 0.06602820825352604,
"grad_norm": 0.7082604765892029,
"learning_rate": 1.7489296601156392e-07,
"loss": 1.0018,
"step": 948
},
{
"epoch": 0.06609785826223229,
"grad_norm": 0.6530460119247437,
"learning_rate": 1.5600512124221978e-07,
"loss": 0.7418,
"step": 949
},
{
"epoch": 0.06616750827093854,
"grad_norm": 0.653685986995697,
"learning_rate": 1.3819549184516112e-07,
"loss": 0.9309,
"step": 950
},
{
"epoch": 0.06623715827964478,
"grad_norm": 0.5263675451278687,
"learning_rate": 1.2146427014657625e-07,
"loss": 0.7189,
"step": 951
},
{
"epoch": 0.06630680828835103,
"grad_norm": 0.6783672571182251,
"learning_rate": 1.0581163682695793e-07,
"loss": 0.5871,
"step": 952
},
{
"epoch": 0.06637645829705728,
"grad_norm": 0.4727168083190918,
"learning_rate": 9.123776091908287e-08,
"loss": 0.3484,
"step": 953
},
{
"epoch": 0.06644610830576354,
"grad_norm": 0.5385925769805908,
"learning_rate": 7.774279980626853e-08,
"loss": 0.5899,
"step": 954
},
{
"epoch": 0.06651575831446979,
"grad_norm": 0.6668855547904968,
"learning_rate": 6.532689922059687e-08,
"loss": 1.0131,
"step": 955
},
{
"epoch": 0.06658540832317604,
"grad_norm": 0.6244344115257263,
"learning_rate": 5.3990193241393313e-08,
"loss": 0.7458,
"step": 956
},
{
"epoch": 0.06665505833188229,
"grad_norm": 0.6702743768692017,
"learning_rate": 4.373280429375015e-08,
"loss": 0.8924,
"step": 957
},
{
"epoch": 0.06672470834058854,
"grad_norm": 0.6103947758674622,
"learning_rate": 3.4554843147216464e-08,
"loss": 1.0036,
"step": 958
},
{
"epoch": 0.0667943583492948,
"grad_norm": 0.622797966003418,
"learning_rate": 2.6456408914599108e-08,
"loss": 0.8497,
"step": 959
},
{
"epoch": 0.06686400835800105,
"grad_norm": 0.7076674699783325,
"learning_rate": 1.9437589050907977e-08,
"loss": 0.5629,
"step": 960
},
{
"epoch": 0.0669336583667073,
"grad_norm": 0.7682867050170898,
"learning_rate": 1.3498459352367931e-08,
"loss": 0.7463,
"step": 961
},
{
"epoch": 0.06700330837541355,
"grad_norm": 0.7987236380577087,
"learning_rate": 8.639083955663818e-09,
"loss": 1.1664,
"step": 962
},
{
"epoch": 0.0670729583841198,
"grad_norm": 0.7837391495704651,
"learning_rate": 4.859515337174436e-09,
"loss": 0.6505,
"step": 963
},
{
"epoch": 0.06714260839282606,
"grad_norm": 0.6566223502159119,
"learning_rate": 2.1597943124729292e-09,
"loss": 0.8524,
"step": 964
},
{
"epoch": 0.0672122584015323,
"grad_norm": 0.6998875737190247,
"learning_rate": 5.399500358493903e-10,
"loss": 0.8817,
"step": 965
},
{
"epoch": 0.06728190841023855,
"grad_norm": 0.6083624362945557,
"learning_rate": 0.0,
"loss": 0.8767,
"step": 966
}
],
"logging_steps": 1,
"max_steps": 966,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.628352553502376e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}