xzuyn's picture
Upload Step 400/4099
e0e3a46 verified
raw
history blame
79.3 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.09758477677482313,
"eval_steps": 10,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00024396194193705782,
"grad_norm": 4.8585286140441895,
"learning_rate": 2.4999420463141455e-07,
"loss": 2.9081,
"step": 1
},
{
"epoch": 0.00024396194193705782,
"eval_loss": 2.639136552810669,
"eval_runtime": 157.6053,
"eval_samples_per_second": 1.624,
"eval_steps_per_second": 0.812,
"step": 1
},
{
"epoch": 0.00048792388387411563,
"grad_norm": 3.586596965789795,
"learning_rate": 2.4998840671678217e-07,
"loss": 2.4085,
"step": 2
},
{
"epoch": 0.0007318858258111735,
"grad_norm": 4.514856815338135,
"learning_rate": 2.499826062544247e-07,
"loss": 2.867,
"step": 3
},
{
"epoch": 0.0009758477677482313,
"grad_norm": 3.343158483505249,
"learning_rate": 2.4997680324266246e-07,
"loss": 2.5093,
"step": 4
},
{
"epoch": 0.0012198097096852891,
"grad_norm": 4.163078784942627,
"learning_rate": 2.499709976798144e-07,
"loss": 2.9917,
"step": 5
},
{
"epoch": 0.001463771651622347,
"grad_norm": 4.113401889801025,
"learning_rate": 2.4996518956419777e-07,
"loss": 2.8629,
"step": 6
},
{
"epoch": 0.0017077335935594047,
"grad_norm": 2.110043525695801,
"learning_rate": 2.499593788941286e-07,
"loss": 2.3666,
"step": 7
},
{
"epoch": 0.0019516955354964625,
"grad_norm": 3.960318088531494,
"learning_rate": 2.499535656679212e-07,
"loss": 2.6438,
"step": 8
},
{
"epoch": 0.0021956574774335204,
"grad_norm": 3.959432601928711,
"learning_rate": 2.499477498838886e-07,
"loss": 2.6457,
"step": 9
},
{
"epoch": 0.0024396194193705783,
"grad_norm": 2.219346523284912,
"learning_rate": 2.4994193154034227e-07,
"loss": 2.3086,
"step": 10
},
{
"epoch": 0.0024396194193705783,
"eval_loss": 2.3810582160949707,
"eval_runtime": 157.7847,
"eval_samples_per_second": 1.622,
"eval_steps_per_second": 0.811,
"step": 10
},
{
"epoch": 0.002683581361307636,
"grad_norm": 2.600377082824707,
"learning_rate": 2.499361106355922e-07,
"loss": 2.3537,
"step": 11
},
{
"epoch": 0.002927543303244694,
"grad_norm": 3.251347303390503,
"learning_rate": 2.499302871679468e-07,
"loss": 2.483,
"step": 12
},
{
"epoch": 0.0031715052451817514,
"grad_norm": 2.1139895915985107,
"learning_rate": 2.4992446113571303e-07,
"loss": 2.288,
"step": 13
},
{
"epoch": 0.0034154671871188093,
"grad_norm": 3.138744592666626,
"learning_rate": 2.4991863253719657e-07,
"loss": 2.4845,
"step": 14
},
{
"epoch": 0.003659429129055867,
"grad_norm": 2.0805656909942627,
"learning_rate": 2.4991280137070126e-07,
"loss": 2.2727,
"step": 15
},
{
"epoch": 0.003903391070992925,
"grad_norm": 2.57004714012146,
"learning_rate": 2.499069676345297e-07,
"loss": 2.3858,
"step": 16
},
{
"epoch": 0.004147353012929983,
"grad_norm": 1.8521772623062134,
"learning_rate": 2.499011313269829e-07,
"loss": 2.2256,
"step": 17
},
{
"epoch": 0.004391314954867041,
"grad_norm": 2.250250816345215,
"learning_rate": 2.498952924463603e-07,
"loss": 2.375,
"step": 18
},
{
"epoch": 0.004635276896804099,
"grad_norm": 2.7878353595733643,
"learning_rate": 2.498894509909601e-07,
"loss": 2.0609,
"step": 19
},
{
"epoch": 0.0048792388387411565,
"grad_norm": 2.4599826335906982,
"learning_rate": 2.4988360695907864e-07,
"loss": 2.1944,
"step": 20
},
{
"epoch": 0.0048792388387411565,
"eval_loss": 2.1548237800598145,
"eval_runtime": 157.9,
"eval_samples_per_second": 1.621,
"eval_steps_per_second": 0.811,
"step": 20
},
{
"epoch": 0.005123200780678214,
"grad_norm": 2.118277072906494,
"learning_rate": 2.49877760349011e-07,
"loss": 2.1107,
"step": 21
},
{
"epoch": 0.005367162722615272,
"grad_norm": 1.5559130907058716,
"learning_rate": 2.498719111590508e-07,
"loss": 1.992,
"step": 22
},
{
"epoch": 0.00561112466455233,
"grad_norm": 2.998913049697876,
"learning_rate": 2.498660593874899e-07,
"loss": 2.2592,
"step": 23
},
{
"epoch": 0.005855086606489388,
"grad_norm": 1.370886206626892,
"learning_rate": 2.4986020503261886e-07,
"loss": 2.0988,
"step": 24
},
{
"epoch": 0.006099048548426446,
"grad_norm": 1.2692762613296509,
"learning_rate": 2.498543480927266e-07,
"loss": 2.1908,
"step": 25
},
{
"epoch": 0.006343010490363503,
"grad_norm": 1.6744440793991089,
"learning_rate": 2.4984848856610065e-07,
"loss": 2.2077,
"step": 26
},
{
"epoch": 0.006586972432300561,
"grad_norm": 1.3982892036437988,
"learning_rate": 2.4984262645102706e-07,
"loss": 2.2539,
"step": 27
},
{
"epoch": 0.006830934374237619,
"grad_norm": 1.3442888259887695,
"learning_rate": 2.4983676174579014e-07,
"loss": 2.2487,
"step": 28
},
{
"epoch": 0.0070748963161746765,
"grad_norm": 1.1121150255203247,
"learning_rate": 2.498308944486729e-07,
"loss": 2.024,
"step": 29
},
{
"epoch": 0.007318858258111734,
"grad_norm": 1.4833574295043945,
"learning_rate": 2.4982502455795676e-07,
"loss": 2.107,
"step": 30
},
{
"epoch": 0.007318858258111734,
"eval_loss": 2.051649570465088,
"eval_runtime": 158.0175,
"eval_samples_per_second": 1.62,
"eval_steps_per_second": 0.81,
"step": 30
},
{
"epoch": 0.007562820200048792,
"grad_norm": 1.5546934604644775,
"learning_rate": 2.498191520719216e-07,
"loss": 2.151,
"step": 31
},
{
"epoch": 0.00780678214198585,
"grad_norm": 1.101186752319336,
"learning_rate": 2.4981327698884575e-07,
"loss": 2.0822,
"step": 32
},
{
"epoch": 0.008050744083922909,
"grad_norm": 1.13623046875,
"learning_rate": 2.498073993070061e-07,
"loss": 2.0729,
"step": 33
},
{
"epoch": 0.008294706025859966,
"grad_norm": 1.3326915502548218,
"learning_rate": 2.49801519024678e-07,
"loss": 2.2334,
"step": 34
},
{
"epoch": 0.008538667967797023,
"grad_norm": 1.1969497203826904,
"learning_rate": 2.497956361401352e-07,
"loss": 2.1631,
"step": 35
},
{
"epoch": 0.008782629909734082,
"grad_norm": 1.0180652141571045,
"learning_rate": 2.4978975065165004e-07,
"loss": 2.0552,
"step": 36
},
{
"epoch": 0.009026591851671139,
"grad_norm": 1.7680776119232178,
"learning_rate": 2.497838625574932e-07,
"loss": 2.2854,
"step": 37
},
{
"epoch": 0.009270553793608197,
"grad_norm": 1.048871397972107,
"learning_rate": 2.497779718559339e-07,
"loss": 2.27,
"step": 38
},
{
"epoch": 0.009514515735545254,
"grad_norm": 1.0272551774978638,
"learning_rate": 2.497720785452398e-07,
"loss": 1.9276,
"step": 39
},
{
"epoch": 0.009758477677482313,
"grad_norm": 0.9949386119842529,
"learning_rate": 2.497661826236771e-07,
"loss": 2.1643,
"step": 40
},
{
"epoch": 0.009758477677482313,
"eval_loss": 1.9904688596725464,
"eval_runtime": 157.9911,
"eval_samples_per_second": 1.62,
"eval_steps_per_second": 0.81,
"step": 40
},
{
"epoch": 0.01000243961941937,
"grad_norm": 1.153521180152893,
"learning_rate": 2.497602840895103e-07,
"loss": 2.0555,
"step": 41
},
{
"epoch": 0.010246401561356429,
"grad_norm": 1.1365783214569092,
"learning_rate": 2.4975438294100266e-07,
"loss": 1.9699,
"step": 42
},
{
"epoch": 0.010490363503293486,
"grad_norm": 1.3392469882965088,
"learning_rate": 2.497484791764155e-07,
"loss": 2.1889,
"step": 43
},
{
"epoch": 0.010734325445230545,
"grad_norm": 1.1810263395309448,
"learning_rate": 2.4974257279400897e-07,
"loss": 1.9938,
"step": 44
},
{
"epoch": 0.010978287387167602,
"grad_norm": 0.8270505666732788,
"learning_rate": 2.497366637920414e-07,
"loss": 2.1701,
"step": 45
},
{
"epoch": 0.01122224932910466,
"grad_norm": 1.1721283197402954,
"learning_rate": 2.497307521687697e-07,
"loss": 2.0702,
"step": 46
},
{
"epoch": 0.011466211271041717,
"grad_norm": 0.8560613989830017,
"learning_rate": 2.497248379224492e-07,
"loss": 2.0357,
"step": 47
},
{
"epoch": 0.011710173212978776,
"grad_norm": 2.072547674179077,
"learning_rate": 2.497189210513339e-07,
"loss": 2.1774,
"step": 48
},
{
"epoch": 0.011954135154915833,
"grad_norm": 1.9676735401153564,
"learning_rate": 2.497130015536758e-07,
"loss": 2.1073,
"step": 49
},
{
"epoch": 0.012198097096852892,
"grad_norm": 0.868861198425293,
"learning_rate": 2.497070794277257e-07,
"loss": 2.0378,
"step": 50
},
{
"epoch": 0.012198097096852892,
"eval_loss": 1.958860993385315,
"eval_runtime": 157.3873,
"eval_samples_per_second": 1.627,
"eval_steps_per_second": 0.813,
"step": 50
},
{
"epoch": 0.012442059038789949,
"grad_norm": 1.0588116645812988,
"learning_rate": 2.497011546717327e-07,
"loss": 2.1439,
"step": 51
},
{
"epoch": 0.012686020980727006,
"grad_norm": 0.9421451687812805,
"learning_rate": 2.496952272839445e-07,
"loss": 1.9826,
"step": 52
},
{
"epoch": 0.012929982922664065,
"grad_norm": 0.88938969373703,
"learning_rate": 2.4968929726260705e-07,
"loss": 1.9675,
"step": 53
},
{
"epoch": 0.013173944864601122,
"grad_norm": 0.8794369101524353,
"learning_rate": 2.4968336460596485e-07,
"loss": 1.9546,
"step": 54
},
{
"epoch": 0.01341790680653818,
"grad_norm": 0.7067832350730896,
"learning_rate": 2.4967742931226075e-07,
"loss": 1.8798,
"step": 55
},
{
"epoch": 0.013661868748475237,
"grad_norm": 1.4922388792037964,
"learning_rate": 2.4967149137973625e-07,
"loss": 1.9596,
"step": 56
},
{
"epoch": 0.013905830690412296,
"grad_norm": 0.8123573660850525,
"learning_rate": 2.496655508066309e-07,
"loss": 1.9043,
"step": 57
},
{
"epoch": 0.014149792632349353,
"grad_norm": 0.8600869178771973,
"learning_rate": 2.4965960759118313e-07,
"loss": 1.9608,
"step": 58
},
{
"epoch": 0.014393754574286412,
"grad_norm": 0.7148178219795227,
"learning_rate": 2.4965366173162953e-07,
"loss": 2.0545,
"step": 59
},
{
"epoch": 0.014637716516223469,
"grad_norm": 0.8177701234817505,
"learning_rate": 2.4964771322620516e-07,
"loss": 2.0236,
"step": 60
},
{
"epoch": 0.014637716516223469,
"eval_loss": 1.934555172920227,
"eval_runtime": 157.5281,
"eval_samples_per_second": 1.625,
"eval_steps_per_second": 0.813,
"step": 60
},
{
"epoch": 0.014881678458160527,
"grad_norm": 0.6155992746353149,
"learning_rate": 2.4964176207314356e-07,
"loss": 2.066,
"step": 61
},
{
"epoch": 0.015125640400097584,
"grad_norm": 0.9341537356376648,
"learning_rate": 2.496358082706767e-07,
"loss": 1.9537,
"step": 62
},
{
"epoch": 0.015369602342034643,
"grad_norm": 1.3128167390823364,
"learning_rate": 2.4962985181703483e-07,
"loss": 2.0044,
"step": 63
},
{
"epoch": 0.0156135642839717,
"grad_norm": 1.2402898073196411,
"learning_rate": 2.496238927104469e-07,
"loss": 1.962,
"step": 64
},
{
"epoch": 0.015857526225908757,
"grad_norm": 0.8261551260948181,
"learning_rate": 2.4961793094913995e-07,
"loss": 2.1043,
"step": 65
},
{
"epoch": 0.016101488167845818,
"grad_norm": 1.3150850534439087,
"learning_rate": 2.4961196653133975e-07,
"loss": 2.1101,
"step": 66
},
{
"epoch": 0.016345450109782875,
"grad_norm": 0.5901480317115784,
"learning_rate": 2.4960599945527027e-07,
"loss": 1.7913,
"step": 67
},
{
"epoch": 0.01658941205171993,
"grad_norm": 1.4552851915359497,
"learning_rate": 2.49600029719154e-07,
"loss": 1.9979,
"step": 68
},
{
"epoch": 0.01683337399365699,
"grad_norm": 0.6188462376594543,
"learning_rate": 2.495940573212118e-07,
"loss": 1.759,
"step": 69
},
{
"epoch": 0.017077335935594046,
"grad_norm": 0.6212908029556274,
"learning_rate": 2.4958808225966306e-07,
"loss": 1.9251,
"step": 70
},
{
"epoch": 0.017077335935594046,
"eval_loss": 1.919191837310791,
"eval_runtime": 157.4683,
"eval_samples_per_second": 1.626,
"eval_steps_per_second": 0.813,
"step": 70
},
{
"epoch": 0.017321297877531106,
"grad_norm": 0.6586403250694275,
"learning_rate": 2.4958210453272533e-07,
"loss": 2.0447,
"step": 71
},
{
"epoch": 0.017565259819468163,
"grad_norm": 0.6836444139480591,
"learning_rate": 2.4957612413861483e-07,
"loss": 2.0525,
"step": 72
},
{
"epoch": 0.01780922176140522,
"grad_norm": 0.7636261582374573,
"learning_rate": 2.4957014107554603e-07,
"loss": 2.0984,
"step": 73
},
{
"epoch": 0.018053183703342277,
"grad_norm": 0.5293551683425903,
"learning_rate": 2.4956415534173195e-07,
"loss": 1.8238,
"step": 74
},
{
"epoch": 0.018297145645279338,
"grad_norm": 0.5500568151473999,
"learning_rate": 2.495581669353838e-07,
"loss": 1.8841,
"step": 75
},
{
"epoch": 0.018541107587216395,
"grad_norm": 0.7883771061897278,
"learning_rate": 2.4955217585471147e-07,
"loss": 1.9951,
"step": 76
},
{
"epoch": 0.01878506952915345,
"grad_norm": 0.6567949056625366,
"learning_rate": 2.495461820979229e-07,
"loss": 2.0119,
"step": 77
},
{
"epoch": 0.01902903147109051,
"grad_norm": 0.8867214918136597,
"learning_rate": 2.4954018566322477e-07,
"loss": 1.8826,
"step": 78
},
{
"epoch": 0.01927299341302757,
"grad_norm": 0.8271172642707825,
"learning_rate": 2.4953418654882195e-07,
"loss": 1.9226,
"step": 79
},
{
"epoch": 0.019516955354964626,
"grad_norm": 0.5612655878067017,
"learning_rate": 2.495281847529178e-07,
"loss": 1.9987,
"step": 80
},
{
"epoch": 0.019516955354964626,
"eval_loss": 1.9070545434951782,
"eval_runtime": 157.7755,
"eval_samples_per_second": 1.623,
"eval_steps_per_second": 0.811,
"step": 80
},
{
"epoch": 0.019760917296901683,
"grad_norm": 0.9746911525726318,
"learning_rate": 2.4952218027371403e-07,
"loss": 2.0771,
"step": 81
},
{
"epoch": 0.02000487923883874,
"grad_norm": 0.7961266040802002,
"learning_rate": 2.495161731094107e-07,
"loss": 1.9497,
"step": 82
},
{
"epoch": 0.0202488411807758,
"grad_norm": 0.5901756286621094,
"learning_rate": 2.4951016325820637e-07,
"loss": 1.9636,
"step": 83
},
{
"epoch": 0.020492803122712858,
"grad_norm": 0.572099506855011,
"learning_rate": 2.4950415071829794e-07,
"loss": 2.0077,
"step": 84
},
{
"epoch": 0.020736765064649915,
"grad_norm": 0.7444072961807251,
"learning_rate": 2.4949813548788067e-07,
"loss": 1.9713,
"step": 85
},
{
"epoch": 0.02098072700658697,
"grad_norm": 1.6917086839675903,
"learning_rate": 2.4949211756514816e-07,
"loss": 2.1275,
"step": 86
},
{
"epoch": 0.02122468894852403,
"grad_norm": 0.4941423535346985,
"learning_rate": 2.494860969482926e-07,
"loss": 2.0304,
"step": 87
},
{
"epoch": 0.02146865089046109,
"grad_norm": 0.7001515626907349,
"learning_rate": 2.4948007363550424e-07,
"loss": 2.0102,
"step": 88
},
{
"epoch": 0.021712612832398146,
"grad_norm": 0.6658152341842651,
"learning_rate": 2.4947404762497197e-07,
"loss": 1.6802,
"step": 89
},
{
"epoch": 0.021956574774335203,
"grad_norm": 0.7706289291381836,
"learning_rate": 2.49468018914883e-07,
"loss": 2.0452,
"step": 90
},
{
"epoch": 0.021956574774335203,
"eval_loss": 1.8989028930664062,
"eval_runtime": 158.0707,
"eval_samples_per_second": 1.62,
"eval_steps_per_second": 0.81,
"step": 90
},
{
"epoch": 0.02220053671627226,
"grad_norm": 0.4736054837703705,
"learning_rate": 2.4946198750342283e-07,
"loss": 1.9606,
"step": 91
},
{
"epoch": 0.02244449865820932,
"grad_norm": 0.6369607448577881,
"learning_rate": 2.4945595338877547e-07,
"loss": 1.9367,
"step": 92
},
{
"epoch": 0.022688460600146378,
"grad_norm": 0.780017614364624,
"learning_rate": 2.494499165691231e-07,
"loss": 1.8239,
"step": 93
},
{
"epoch": 0.022932422542083435,
"grad_norm": 1.0048651695251465,
"learning_rate": 2.4944387704264644e-07,
"loss": 1.851,
"step": 94
},
{
"epoch": 0.02317638448402049,
"grad_norm": 0.5539764165878296,
"learning_rate": 2.494378348075246e-07,
"loss": 1.7927,
"step": 95
},
{
"epoch": 0.023420346425957552,
"grad_norm": 0.5273501873016357,
"learning_rate": 2.494317898619349e-07,
"loss": 1.7911,
"step": 96
},
{
"epoch": 0.02366430836789461,
"grad_norm": 1.1313800811767578,
"learning_rate": 2.4942574220405314e-07,
"loss": 1.9152,
"step": 97
},
{
"epoch": 0.023908270309831666,
"grad_norm": 0.8607046604156494,
"learning_rate": 2.4941969183205344e-07,
"loss": 2.0688,
"step": 98
},
{
"epoch": 0.024152232251768723,
"grad_norm": 0.9859471321105957,
"learning_rate": 2.494136387441083e-07,
"loss": 2.0554,
"step": 99
},
{
"epoch": 0.024396194193705784,
"grad_norm": 0.5871405005455017,
"learning_rate": 2.494075829383886e-07,
"loss": 1.8362,
"step": 100
},
{
"epoch": 0.024396194193705784,
"eval_loss": 1.8896028995513916,
"eval_runtime": 157.8345,
"eval_samples_per_second": 1.622,
"eval_steps_per_second": 0.811,
"step": 100
},
{
"epoch": 0.02464015613564284,
"grad_norm": 0.5069964528083801,
"learning_rate": 2.494015244130635e-07,
"loss": 1.8013,
"step": 101
},
{
"epoch": 0.024884118077579898,
"grad_norm": 0.7139447927474976,
"learning_rate": 2.493954631663007e-07,
"loss": 1.8216,
"step": 102
},
{
"epoch": 0.025128080019516955,
"grad_norm": 0.48631080985069275,
"learning_rate": 2.493893991962659e-07,
"loss": 1.9325,
"step": 103
},
{
"epoch": 0.02537204196145401,
"grad_norm": 0.5576779842376709,
"learning_rate": 2.493833325011235e-07,
"loss": 2.0052,
"step": 104
},
{
"epoch": 0.025616003903391072,
"grad_norm": 0.6407865285873413,
"learning_rate": 2.4937726307903606e-07,
"loss": 1.9411,
"step": 105
},
{
"epoch": 0.02585996584532813,
"grad_norm": 0.7654765248298645,
"learning_rate": 2.493711909281646e-07,
"loss": 1.9438,
"step": 106
},
{
"epoch": 0.026103927787265186,
"grad_norm": 1.2607905864715576,
"learning_rate": 2.493651160466685e-07,
"loss": 2.0134,
"step": 107
},
{
"epoch": 0.026347889729202243,
"grad_norm": 0.8633036017417908,
"learning_rate": 2.493590384327053e-07,
"loss": 1.9775,
"step": 108
},
{
"epoch": 0.026591851671139304,
"grad_norm": 0.7568155527114868,
"learning_rate": 2.49352958084431e-07,
"loss": 1.9074,
"step": 109
},
{
"epoch": 0.02683581361307636,
"grad_norm": 0.5505961179733276,
"learning_rate": 2.49346875e-07,
"loss": 1.8467,
"step": 110
},
{
"epoch": 0.02683581361307636,
"eval_loss": 1.8828259706497192,
"eval_runtime": 158.4116,
"eval_samples_per_second": 1.616,
"eval_steps_per_second": 0.808,
"step": 110
},
{
"epoch": 0.027079775555013418,
"grad_norm": 0.5095446109771729,
"learning_rate": 2.49340789177565e-07,
"loss": 1.9961,
"step": 111
},
{
"epoch": 0.027323737496950475,
"grad_norm": 1.7097959518432617,
"learning_rate": 2.4933470061527687e-07,
"loss": 1.9335,
"step": 112
},
{
"epoch": 0.027567699438887535,
"grad_norm": 1.0115768909454346,
"learning_rate": 2.493286093112851e-07,
"loss": 1.8118,
"step": 113
},
{
"epoch": 0.027811661380824592,
"grad_norm": 0.6412175297737122,
"learning_rate": 2.493225152637374e-07,
"loss": 1.9623,
"step": 114
},
{
"epoch": 0.02805562332276165,
"grad_norm": 0.5357053875923157,
"learning_rate": 2.4931641847077963e-07,
"loss": 1.8131,
"step": 115
},
{
"epoch": 0.028299585264698706,
"grad_norm": 0.6828150153160095,
"learning_rate": 2.493103189305562e-07,
"loss": 1.767,
"step": 116
},
{
"epoch": 0.028543547206635766,
"grad_norm": 0.5804136395454407,
"learning_rate": 2.493042166412099e-07,
"loss": 1.9831,
"step": 117
},
{
"epoch": 0.028787509148572824,
"grad_norm": 0.6375969052314758,
"learning_rate": 2.492981116008816e-07,
"loss": 1.9651,
"step": 118
},
{
"epoch": 0.02903147109050988,
"grad_norm": 0.6621755957603455,
"learning_rate": 2.492920038077106e-07,
"loss": 2.1064,
"step": 119
},
{
"epoch": 0.029275433032446938,
"grad_norm": 0.7436494827270508,
"learning_rate": 2.492858932598346e-07,
"loss": 1.8961,
"step": 120
},
{
"epoch": 0.029275433032446938,
"eval_loss": 1.8782259225845337,
"eval_runtime": 158.1634,
"eval_samples_per_second": 1.619,
"eval_steps_per_second": 0.809,
"step": 120
},
{
"epoch": 0.029519394974383995,
"grad_norm": 0.5152058005332947,
"learning_rate": 2.4927977995538954e-07,
"loss": 1.875,
"step": 121
},
{
"epoch": 0.029763356916321055,
"grad_norm": 0.4640464782714844,
"learning_rate": 2.4927366389250973e-07,
"loss": 1.8429,
"step": 122
},
{
"epoch": 0.030007318858258112,
"grad_norm": 0.6126062273979187,
"learning_rate": 2.4926754506932774e-07,
"loss": 1.9581,
"step": 123
},
{
"epoch": 0.03025128080019517,
"grad_norm": 0.5338674187660217,
"learning_rate": 2.4926142348397453e-07,
"loss": 1.9682,
"step": 124
},
{
"epoch": 0.030495242742132226,
"grad_norm": 0.48220378160476685,
"learning_rate": 2.492552991345792e-07,
"loss": 1.9316,
"step": 125
},
{
"epoch": 0.030739204684069286,
"grad_norm": 1.0571016073226929,
"learning_rate": 2.4924917201926936e-07,
"loss": 1.9837,
"step": 126
},
{
"epoch": 0.030983166626006343,
"grad_norm": 0.5729621052742004,
"learning_rate": 2.492430421361708e-07,
"loss": 1.7242,
"step": 127
},
{
"epoch": 0.0312271285679434,
"grad_norm": 0.9092426896095276,
"learning_rate": 2.4923690948340783e-07,
"loss": 1.8327,
"step": 128
},
{
"epoch": 0.03147109050988046,
"grad_norm": 0.44636791944503784,
"learning_rate": 2.4923077405910264e-07,
"loss": 2.0464,
"step": 129
},
{
"epoch": 0.031715052451817514,
"grad_norm": 0.6733670830726624,
"learning_rate": 2.4922463586137616e-07,
"loss": 1.8564,
"step": 130
},
{
"epoch": 0.031715052451817514,
"eval_loss": 1.873685359954834,
"eval_runtime": 158.2193,
"eval_samples_per_second": 1.618,
"eval_steps_per_second": 0.809,
"step": 130
},
{
"epoch": 0.03195901439375457,
"grad_norm": 0.6245723366737366,
"learning_rate": 2.4921849488834745e-07,
"loss": 2.0072,
"step": 131
},
{
"epoch": 0.032202976335691635,
"grad_norm": 0.47369739413261414,
"learning_rate": 2.4921235113813376e-07,
"loss": 2.0033,
"step": 132
},
{
"epoch": 0.03244693827762869,
"grad_norm": 0.6961667537689209,
"learning_rate": 2.492062046088508e-07,
"loss": 1.8175,
"step": 133
},
{
"epoch": 0.03269090021956575,
"grad_norm": 0.7953224182128906,
"learning_rate": 2.4920005529861254e-07,
"loss": 1.8035,
"step": 134
},
{
"epoch": 0.032934862161502806,
"grad_norm": 0.516058087348938,
"learning_rate": 2.491939032055311e-07,
"loss": 1.8855,
"step": 135
},
{
"epoch": 0.03317882410343986,
"grad_norm": 0.6488027572631836,
"learning_rate": 2.491877483277171e-07,
"loss": 1.9622,
"step": 136
},
{
"epoch": 0.03342278604537692,
"grad_norm": 0.6827359199523926,
"learning_rate": 2.4918159066327943e-07,
"loss": 1.847,
"step": 137
},
{
"epoch": 0.03366674798731398,
"grad_norm": 0.4918162226676941,
"learning_rate": 2.49175430210325e-07,
"loss": 1.9214,
"step": 138
},
{
"epoch": 0.033910709929251034,
"grad_norm": 0.7824620008468628,
"learning_rate": 2.491692669669594e-07,
"loss": 1.8472,
"step": 139
},
{
"epoch": 0.03415467187118809,
"grad_norm": 0.7084971070289612,
"learning_rate": 2.4916310093128616e-07,
"loss": 1.8638,
"step": 140
},
{
"epoch": 0.03415467187118809,
"eval_loss": 1.869973063468933,
"eval_runtime": 157.6522,
"eval_samples_per_second": 1.624,
"eval_steps_per_second": 0.812,
"step": 140
},
{
"epoch": 0.034398633813125155,
"grad_norm": 0.4873005747795105,
"learning_rate": 2.491569321014073e-07,
"loss": 1.9326,
"step": 141
},
{
"epoch": 0.03464259575506221,
"grad_norm": 0.6483212113380432,
"learning_rate": 2.49150760475423e-07,
"loss": 1.9035,
"step": 142
},
{
"epoch": 0.03488655769699927,
"grad_norm": 0.46081703901290894,
"learning_rate": 2.4914458605143187e-07,
"loss": 1.9746,
"step": 143
},
{
"epoch": 0.035130519638936326,
"grad_norm": 0.683131754398346,
"learning_rate": 2.491384088275306e-07,
"loss": 1.8517,
"step": 144
},
{
"epoch": 0.03537448158087338,
"grad_norm": 0.4871167242527008,
"learning_rate": 2.491322288018143e-07,
"loss": 1.7198,
"step": 145
},
{
"epoch": 0.03561844352281044,
"grad_norm": 0.6227270364761353,
"learning_rate": 2.4912604597237626e-07,
"loss": 1.8555,
"step": 146
},
{
"epoch": 0.0358624054647475,
"grad_norm": 0.5372536182403564,
"learning_rate": 2.4911986033730807e-07,
"loss": 1.8245,
"step": 147
},
{
"epoch": 0.036106367406684554,
"grad_norm": 0.7428392171859741,
"learning_rate": 2.491136718946997e-07,
"loss": 2.0657,
"step": 148
},
{
"epoch": 0.03635032934862162,
"grad_norm": 0.9103279709815979,
"learning_rate": 2.4910748064263914e-07,
"loss": 1.9042,
"step": 149
},
{
"epoch": 0.036594291290558675,
"grad_norm": 1.1896861791610718,
"learning_rate": 2.491012865792129e-07,
"loss": 1.8883,
"step": 150
},
{
"epoch": 0.036594291290558675,
"eval_loss": 1.86661696434021,
"eval_runtime": 158.3624,
"eval_samples_per_second": 1.617,
"eval_steps_per_second": 0.808,
"step": 150
},
{
"epoch": 0.03683825323249573,
"grad_norm": 0.7221816182136536,
"learning_rate": 2.490950897025056e-07,
"loss": 1.8696,
"step": 151
},
{
"epoch": 0.03708221517443279,
"grad_norm": 0.5009371042251587,
"learning_rate": 2.4908889001060015e-07,
"loss": 1.923,
"step": 152
},
{
"epoch": 0.037326177116369846,
"grad_norm": 0.6172135472297668,
"learning_rate": 2.490826875015777e-07,
"loss": 1.9862,
"step": 153
},
{
"epoch": 0.0375701390583069,
"grad_norm": 0.9549673199653625,
"learning_rate": 2.490764821735178e-07,
"loss": 1.9981,
"step": 154
},
{
"epoch": 0.03781410100024396,
"grad_norm": 0.5264533758163452,
"learning_rate": 2.4907027402449803e-07,
"loss": 1.8822,
"step": 155
},
{
"epoch": 0.03805806294218102,
"grad_norm": 0.4591792821884155,
"learning_rate": 2.4906406305259434e-07,
"loss": 1.9013,
"step": 156
},
{
"epoch": 0.038302024884118074,
"grad_norm": 0.4885839819908142,
"learning_rate": 2.4905784925588094e-07,
"loss": 1.918,
"step": 157
},
{
"epoch": 0.03854598682605514,
"grad_norm": 0.5201852917671204,
"learning_rate": 2.4905163263243023e-07,
"loss": 1.9607,
"step": 158
},
{
"epoch": 0.038789948767992195,
"grad_norm": 0.7386835813522339,
"learning_rate": 2.4904541318031294e-07,
"loss": 1.8633,
"step": 159
},
{
"epoch": 0.03903391070992925,
"grad_norm": 0.5655650496482849,
"learning_rate": 2.49039190897598e-07,
"loss": 1.9402,
"step": 160
},
{
"epoch": 0.03903391070992925,
"eval_loss": 1.864166021347046,
"eval_runtime": 158.2099,
"eval_samples_per_second": 1.618,
"eval_steps_per_second": 0.809,
"step": 160
},
{
"epoch": 0.03927787265186631,
"grad_norm": 0.6714135408401489,
"learning_rate": 2.490329657823525e-07,
"loss": 1.7962,
"step": 161
},
{
"epoch": 0.039521834593803366,
"grad_norm": 0.685165524482727,
"learning_rate": 2.490267378326419e-07,
"loss": 1.9055,
"step": 162
},
{
"epoch": 0.03976579653574042,
"grad_norm": 0.5688671469688416,
"learning_rate": 2.490205070465299e-07,
"loss": 1.8434,
"step": 163
},
{
"epoch": 0.04000975847767748,
"grad_norm": 0.6001088619232178,
"learning_rate": 2.4901427342207823e-07,
"loss": 1.8715,
"step": 164
},
{
"epoch": 0.04025372041961454,
"grad_norm": 0.5576404929161072,
"learning_rate": 2.490080369573472e-07,
"loss": 1.8664,
"step": 165
},
{
"epoch": 0.0404976823615516,
"grad_norm": 0.4974159002304077,
"learning_rate": 2.4900179765039496e-07,
"loss": 1.7923,
"step": 166
},
{
"epoch": 0.04074164430348866,
"grad_norm": 0.48131653666496277,
"learning_rate": 2.489955554992782e-07,
"loss": 1.8561,
"step": 167
},
{
"epoch": 0.040985606245425715,
"grad_norm": 0.49776557087898254,
"learning_rate": 2.489893105020518e-07,
"loss": 1.798,
"step": 168
},
{
"epoch": 0.04122956818736277,
"grad_norm": 0.7587680220603943,
"learning_rate": 2.489830626567686e-07,
"loss": 1.9562,
"step": 169
},
{
"epoch": 0.04147353012929983,
"grad_norm": 0.6052951216697693,
"learning_rate": 2.4897681196148e-07,
"loss": 1.9305,
"step": 170
},
{
"epoch": 0.04147353012929983,
"eval_loss": 1.8620600700378418,
"eval_runtime": 157.5929,
"eval_samples_per_second": 1.624,
"eval_steps_per_second": 0.812,
"step": 170
},
{
"epoch": 0.041717492071236886,
"grad_norm": 0.5671830177307129,
"learning_rate": 2.4897055841423537e-07,
"loss": 1.8514,
"step": 171
},
{
"epoch": 0.04196145401317394,
"grad_norm": 0.4015696346759796,
"learning_rate": 2.489643020130825e-07,
"loss": 1.8889,
"step": 172
},
{
"epoch": 0.042205415955111,
"grad_norm": 0.8785597681999207,
"learning_rate": 2.4895804275606724e-07,
"loss": 1.8905,
"step": 173
},
{
"epoch": 0.04244937789704806,
"grad_norm": 0.573078453540802,
"learning_rate": 2.489517806412337e-07,
"loss": 2.0164,
"step": 174
},
{
"epoch": 0.04269333983898512,
"grad_norm": 0.48950624465942383,
"learning_rate": 2.4894551566662435e-07,
"loss": 2.0895,
"step": 175
},
{
"epoch": 0.04293730178092218,
"grad_norm": 0.5515138506889343,
"learning_rate": 2.4893924783027967e-07,
"loss": 1.9163,
"step": 176
},
{
"epoch": 0.043181263722859235,
"grad_norm": 0.4793028235435486,
"learning_rate": 2.4893297713023835e-07,
"loss": 1.8189,
"step": 177
},
{
"epoch": 0.04342522566479629,
"grad_norm": 0.5240328311920166,
"learning_rate": 2.4892670356453745e-07,
"loss": 1.9361,
"step": 178
},
{
"epoch": 0.04366918760673335,
"grad_norm": 0.5339527726173401,
"learning_rate": 2.4892042713121207e-07,
"loss": 1.9248,
"step": 179
},
{
"epoch": 0.043913149548670406,
"grad_norm": 0.468458890914917,
"learning_rate": 2.4891414782829566e-07,
"loss": 1.9061,
"step": 180
},
{
"epoch": 0.043913149548670406,
"eval_loss": 1.8581455945968628,
"eval_runtime": 157.6293,
"eval_samples_per_second": 1.624,
"eval_steps_per_second": 0.812,
"step": 180
},
{
"epoch": 0.04415711149060746,
"grad_norm": 0.5706861019134521,
"learning_rate": 2.4890786565381976e-07,
"loss": 1.8752,
"step": 181
},
{
"epoch": 0.04440107343254452,
"grad_norm": 0.573175311088562,
"learning_rate": 2.489015806058142e-07,
"loss": 1.9895,
"step": 182
},
{
"epoch": 0.044645035374481584,
"grad_norm": 1.2761479616165161,
"learning_rate": 2.4889529268230683e-07,
"loss": 1.9355,
"step": 183
},
{
"epoch": 0.04488899731641864,
"grad_norm": 3.7102456092834473,
"learning_rate": 2.4888900188132405e-07,
"loss": 1.9278,
"step": 184
},
{
"epoch": 0.0451329592583557,
"grad_norm": 0.5471494793891907,
"learning_rate": 2.4888270820089003e-07,
"loss": 1.9218,
"step": 185
},
{
"epoch": 0.045376921200292755,
"grad_norm": 0.9872457385063171,
"learning_rate": 2.488764116390274e-07,
"loss": 1.936,
"step": 186
},
{
"epoch": 0.04562088314222981,
"grad_norm": 0.528155505657196,
"learning_rate": 2.488701121937568e-07,
"loss": 1.9575,
"step": 187
},
{
"epoch": 0.04586484508416687,
"grad_norm": 0.51887446641922,
"learning_rate": 2.488638098630973e-07,
"loss": 1.8338,
"step": 188
},
{
"epoch": 0.046108807026103926,
"grad_norm": 0.4276951253414154,
"learning_rate": 2.4885750464506606e-07,
"loss": 2.0073,
"step": 189
},
{
"epoch": 0.04635276896804098,
"grad_norm": 0.5127749443054199,
"learning_rate": 2.488511965376782e-07,
"loss": 1.9237,
"step": 190
},
{
"epoch": 0.04635276896804098,
"eval_loss": 1.856198787689209,
"eval_runtime": 157.9524,
"eval_samples_per_second": 1.621,
"eval_steps_per_second": 0.81,
"step": 190
},
{
"epoch": 0.04659673090997804,
"grad_norm": 0.5734567046165466,
"learning_rate": 2.488448855389473e-07,
"loss": 1.955,
"step": 191
},
{
"epoch": 0.046840692851915104,
"grad_norm": 0.4853633940219879,
"learning_rate": 2.48838571646885e-07,
"loss": 1.9313,
"step": 192
},
{
"epoch": 0.04708465479385216,
"grad_norm": 0.8106932044029236,
"learning_rate": 2.488322548595012e-07,
"loss": 1.9164,
"step": 193
},
{
"epoch": 0.04732861673578922,
"grad_norm": 0.6387647986412048,
"learning_rate": 2.488259351748038e-07,
"loss": 2.0275,
"step": 194
},
{
"epoch": 0.047572578677726275,
"grad_norm": 0.48080340027809143,
"learning_rate": 2.48819612590799e-07,
"loss": 1.966,
"step": 195
},
{
"epoch": 0.04781654061966333,
"grad_norm": 0.464213103055954,
"learning_rate": 2.4881328710549126e-07,
"loss": 1.8753,
"step": 196
},
{
"epoch": 0.04806050256160039,
"grad_norm": 0.7000899314880371,
"learning_rate": 2.48806958716883e-07,
"loss": 2.0136,
"step": 197
},
{
"epoch": 0.048304464503537446,
"grad_norm": 0.474881112575531,
"learning_rate": 2.488006274229749e-07,
"loss": 1.9193,
"step": 198
},
{
"epoch": 0.0485484264454745,
"grad_norm": 0.5639634132385254,
"learning_rate": 2.4879429322176583e-07,
"loss": 1.8432,
"step": 199
},
{
"epoch": 0.04879238838741157,
"grad_norm": 0.41461923718452454,
"learning_rate": 2.4878795611125284e-07,
"loss": 1.8943,
"step": 200
},
{
"epoch": 0.04879238838741157,
"eval_loss": 1.8539921045303345,
"eval_runtime": 157.8624,
"eval_samples_per_second": 1.622,
"eval_steps_per_second": 0.811,
"step": 200
},
{
"epoch": 0.049036350329348624,
"grad_norm": 0.5546320080757141,
"learning_rate": 2.487816160894311e-07,
"loss": 1.8561,
"step": 201
},
{
"epoch": 0.04928031227128568,
"grad_norm": 0.4563431441783905,
"learning_rate": 2.4877527315429387e-07,
"loss": 1.9516,
"step": 202
},
{
"epoch": 0.04952427421322274,
"grad_norm": 0.48537513613700867,
"learning_rate": 2.4876892730383267e-07,
"loss": 2.0183,
"step": 203
},
{
"epoch": 0.049768236155159795,
"grad_norm": 0.5398459434509277,
"learning_rate": 2.4876257853603717e-07,
"loss": 1.9771,
"step": 204
},
{
"epoch": 0.05001219809709685,
"grad_norm": 0.47974419593811035,
"learning_rate": 2.4875622684889513e-07,
"loss": 1.8562,
"step": 205
},
{
"epoch": 0.05025616003903391,
"grad_norm": 0.42705652117729187,
"learning_rate": 2.4874987224039246e-07,
"loss": 1.9547,
"step": 206
},
{
"epoch": 0.050500121980970966,
"grad_norm": 1.4771904945373535,
"learning_rate": 2.4874351470851334e-07,
"loss": 1.9176,
"step": 207
},
{
"epoch": 0.05074408392290802,
"grad_norm": 0.4936388432979584,
"learning_rate": 2.4873715425123986e-07,
"loss": 1.986,
"step": 208
},
{
"epoch": 0.05098804586484509,
"grad_norm": 0.45525163412094116,
"learning_rate": 2.4873079086655244e-07,
"loss": 1.9623,
"step": 209
},
{
"epoch": 0.051232007806782144,
"grad_norm": 0.429779052734375,
"learning_rate": 2.487244245524296e-07,
"loss": 1.7466,
"step": 210
},
{
"epoch": 0.051232007806782144,
"eval_loss": 1.8527003526687622,
"eval_runtime": 157.4992,
"eval_samples_per_second": 1.625,
"eval_steps_per_second": 0.813,
"step": 210
},
{
"epoch": 0.0514759697487192,
"grad_norm": 0.4459904432296753,
"learning_rate": 2.487180553068481e-07,
"loss": 1.9284,
"step": 211
},
{
"epoch": 0.05171993169065626,
"grad_norm": 0.8717539310455322,
"learning_rate": 2.487116831277826e-07,
"loss": 1.7543,
"step": 212
},
{
"epoch": 0.051963893632593315,
"grad_norm": 0.7227014303207397,
"learning_rate": 2.4870530801320607e-07,
"loss": 1.8261,
"step": 213
},
{
"epoch": 0.05220785557453037,
"grad_norm": 0.4853971302509308,
"learning_rate": 2.486989299610895e-07,
"loss": 1.9214,
"step": 214
},
{
"epoch": 0.05245181751646743,
"grad_norm": 0.5626842975616455,
"learning_rate": 2.4869254896940207e-07,
"loss": 1.8116,
"step": 215
},
{
"epoch": 0.052695779458404486,
"grad_norm": 0.4326629340648651,
"learning_rate": 2.4868616503611124e-07,
"loss": 1.7844,
"step": 216
},
{
"epoch": 0.05293974140034155,
"grad_norm": 0.43978720903396606,
"learning_rate": 2.486797781591823e-07,
"loss": 1.7327,
"step": 217
},
{
"epoch": 0.05318370334227861,
"grad_norm": 1.3520264625549316,
"learning_rate": 2.4867338833657884e-07,
"loss": 1.9084,
"step": 218
},
{
"epoch": 0.053427665284215664,
"grad_norm": 1.791759967803955,
"learning_rate": 2.4866699556626256e-07,
"loss": 2.0314,
"step": 219
},
{
"epoch": 0.05367162722615272,
"grad_norm": 0.7393069267272949,
"learning_rate": 2.486605998461933e-07,
"loss": 1.8518,
"step": 220
},
{
"epoch": 0.05367162722615272,
"eval_loss": 1.850144386291504,
"eval_runtime": 156.9992,
"eval_samples_per_second": 1.631,
"eval_steps_per_second": 0.815,
"step": 220
},
{
"epoch": 0.05391558916808978,
"grad_norm": 0.4648591876029968,
"learning_rate": 2.4865420117432884e-07,
"loss": 1.9889,
"step": 221
},
{
"epoch": 0.054159551110026835,
"grad_norm": 0.4539943337440491,
"learning_rate": 2.4864779954862536e-07,
"loss": 1.8777,
"step": 222
},
{
"epoch": 0.05440351305196389,
"grad_norm": 23.188865661621094,
"learning_rate": 2.486413949670369e-07,
"loss": 1.9913,
"step": 223
},
{
"epoch": 0.05464747499390095,
"grad_norm": 0.5861213803291321,
"learning_rate": 2.486349874275158e-07,
"loss": 1.5643,
"step": 224
},
{
"epoch": 0.054891436935838006,
"grad_norm": 0.4710935056209564,
"learning_rate": 2.486285769280123e-07,
"loss": 1.9896,
"step": 225
},
{
"epoch": 0.05513539887777507,
"grad_norm": 0.5323078632354736,
"learning_rate": 2.48622163466475e-07,
"loss": 1.7714,
"step": 226
},
{
"epoch": 0.05537936081971213,
"grad_norm": 0.5247780680656433,
"learning_rate": 2.486157470408504e-07,
"loss": 1.9497,
"step": 227
},
{
"epoch": 0.055623322761649184,
"grad_norm": 0.48543304204940796,
"learning_rate": 2.4860932764908314e-07,
"loss": 1.9012,
"step": 228
},
{
"epoch": 0.05586728470358624,
"grad_norm": 0.5412744879722595,
"learning_rate": 2.486029052891161e-07,
"loss": 1.8044,
"step": 229
},
{
"epoch": 0.0561112466455233,
"grad_norm": 0.4210870563983917,
"learning_rate": 2.4859647995889003e-07,
"loss": 1.7522,
"step": 230
},
{
"epoch": 0.0561112466455233,
"eval_loss": 1.845929741859436,
"eval_runtime": 157.1033,
"eval_samples_per_second": 1.63,
"eval_steps_per_second": 0.815,
"step": 230
},
{
"epoch": 0.056355208587460355,
"grad_norm": 0.49198633432388306,
"learning_rate": 2.4859005165634397e-07,
"loss": 1.6787,
"step": 231
},
{
"epoch": 0.05659917052939741,
"grad_norm": 0.4444589912891388,
"learning_rate": 2.4858362037941493e-07,
"loss": 1.8522,
"step": 232
},
{
"epoch": 0.05684313247133447,
"grad_norm": 0.42611005902290344,
"learning_rate": 2.485771861260381e-07,
"loss": 1.773,
"step": 233
},
{
"epoch": 0.05708709441327153,
"grad_norm": 0.44933363795280457,
"learning_rate": 2.485707488941467e-07,
"loss": 1.839,
"step": 234
},
{
"epoch": 0.05733105635520859,
"grad_norm": 0.510879397392273,
"learning_rate": 2.48564308681672e-07,
"loss": 1.9736,
"step": 235
},
{
"epoch": 0.05757501829714565,
"grad_norm": 0.48234203457832336,
"learning_rate": 2.485578654865435e-07,
"loss": 1.8358,
"step": 236
},
{
"epoch": 0.057818980239082704,
"grad_norm": 0.5287805795669556,
"learning_rate": 2.485514193066886e-07,
"loss": 1.7455,
"step": 237
},
{
"epoch": 0.05806294218101976,
"grad_norm": 0.4200873374938965,
"learning_rate": 2.485449701400329e-07,
"loss": 1.8146,
"step": 238
},
{
"epoch": 0.05830690412295682,
"grad_norm": 0.42826953530311584,
"learning_rate": 2.485385179845001e-07,
"loss": 1.8783,
"step": 239
},
{
"epoch": 0.058550866064893875,
"grad_norm": 0.6160483360290527,
"learning_rate": 2.4853206283801187e-07,
"loss": 2.0157,
"step": 240
},
{
"epoch": 0.058550866064893875,
"eval_loss": 1.8428621292114258,
"eval_runtime": 157.1726,
"eval_samples_per_second": 1.629,
"eval_steps_per_second": 0.814,
"step": 240
},
{
"epoch": 0.05879482800683093,
"grad_norm": 0.517240047454834,
"learning_rate": 2.4852560469848794e-07,
"loss": 1.8066,
"step": 241
},
{
"epoch": 0.05903878994876799,
"grad_norm": 0.45431217551231384,
"learning_rate": 2.4851914356384624e-07,
"loss": 1.763,
"step": 242
},
{
"epoch": 0.05928275189070505,
"grad_norm": 0.5374858975410461,
"learning_rate": 2.485126794320027e-07,
"loss": 1.7991,
"step": 243
},
{
"epoch": 0.05952671383264211,
"grad_norm": 0.4840785562992096,
"learning_rate": 2.4850621230087125e-07,
"loss": 1.9219,
"step": 244
},
{
"epoch": 0.05977067577457917,
"grad_norm": 0.6035332083702087,
"learning_rate": 2.4849974216836405e-07,
"loss": 1.8103,
"step": 245
},
{
"epoch": 0.060014637716516224,
"grad_norm": 0.44333499670028687,
"learning_rate": 2.4849326903239115e-07,
"loss": 1.8412,
"step": 246
},
{
"epoch": 0.06025859965845328,
"grad_norm": 0.7768390774726868,
"learning_rate": 2.4848679289086074e-07,
"loss": 1.9089,
"step": 247
},
{
"epoch": 0.06050256160039034,
"grad_norm": 0.5787532329559326,
"learning_rate": 2.4848031374167913e-07,
"loss": 1.9024,
"step": 248
},
{
"epoch": 0.060746523542327395,
"grad_norm": 0.4455646276473999,
"learning_rate": 2.484738315827505e-07,
"loss": 1.9293,
"step": 249
},
{
"epoch": 0.06099048548426445,
"grad_norm": 0.48859095573425293,
"learning_rate": 2.484673464119773e-07,
"loss": 1.8183,
"step": 250
},
{
"epoch": 0.06099048548426445,
"eval_loss": 1.8416523933410645,
"eval_runtime": 156.2376,
"eval_samples_per_second": 1.639,
"eval_steps_per_second": 0.819,
"step": 250
},
{
"epoch": 0.061234447426201516,
"grad_norm": 0.4281693398952484,
"learning_rate": 2.484608582272598e-07,
"loss": 1.9258,
"step": 251
},
{
"epoch": 0.06147840936813857,
"grad_norm": 0.43426513671875,
"learning_rate": 2.4845436702649656e-07,
"loss": 2.0341,
"step": 252
},
{
"epoch": 0.06172237131007563,
"grad_norm": 0.5216272473335266,
"learning_rate": 2.48447872807584e-07,
"loss": 1.8391,
"step": 253
},
{
"epoch": 0.06196633325201269,
"grad_norm": 0.4329265356063843,
"learning_rate": 2.484413755684167e-07,
"loss": 1.8692,
"step": 254
},
{
"epoch": 0.062210295193949744,
"grad_norm": 1.1542620658874512,
"learning_rate": 2.484348753068872e-07,
"loss": 1.9009,
"step": 255
},
{
"epoch": 0.0624542571358868,
"grad_norm": 0.44065535068511963,
"learning_rate": 2.484283720208861e-07,
"loss": 1.7906,
"step": 256
},
{
"epoch": 0.06269821907782386,
"grad_norm": 0.4028589129447937,
"learning_rate": 2.4842186570830207e-07,
"loss": 1.821,
"step": 257
},
{
"epoch": 0.06294218101976091,
"grad_norm": 0.5287508964538574,
"learning_rate": 2.484153563670218e-07,
"loss": 1.6887,
"step": 258
},
{
"epoch": 0.06318614296169797,
"grad_norm": 0.472429096698761,
"learning_rate": 2.4840884399493006e-07,
"loss": 1.8086,
"step": 259
},
{
"epoch": 0.06343010490363503,
"grad_norm": 0.40466898679733276,
"learning_rate": 2.4840232858990943e-07,
"loss": 1.8095,
"step": 260
},
{
"epoch": 0.06343010490363503,
"eval_loss": 1.8428053855895996,
"eval_runtime": 156.6484,
"eval_samples_per_second": 1.634,
"eval_steps_per_second": 0.817,
"step": 260
},
{
"epoch": 0.06367406684557209,
"grad_norm": 0.5649131536483765,
"learning_rate": 2.4839581014984084e-07,
"loss": 1.8726,
"step": 261
},
{
"epoch": 0.06391802878750914,
"grad_norm": 0.5180754065513611,
"learning_rate": 2.48389288672603e-07,
"loss": 1.9934,
"step": 262
},
{
"epoch": 0.0641619907294462,
"grad_norm": 0.4884182810783386,
"learning_rate": 2.483827641560728e-07,
"loss": 1.7776,
"step": 263
},
{
"epoch": 0.06440595267138327,
"grad_norm": 0.5376865267753601,
"learning_rate": 2.48376236598125e-07,
"loss": 1.7831,
"step": 264
},
{
"epoch": 0.06464991461332033,
"grad_norm": 0.7305421829223633,
"learning_rate": 2.4836970599663255e-07,
"loss": 1.8499,
"step": 265
},
{
"epoch": 0.06489387655525738,
"grad_norm": 0.4067825376987457,
"learning_rate": 2.4836317234946626e-07,
"loss": 1.9762,
"step": 266
},
{
"epoch": 0.06513783849719444,
"grad_norm": 1.1095890998840332,
"learning_rate": 2.48356635654495e-07,
"loss": 1.884,
"step": 267
},
{
"epoch": 0.0653818004391315,
"grad_norm": 1.5947470664978027,
"learning_rate": 2.4835009590958575e-07,
"loss": 1.8838,
"step": 268
},
{
"epoch": 0.06562576238106856,
"grad_norm": 0.5433115363121033,
"learning_rate": 2.483435531126034e-07,
"loss": 1.9129,
"step": 269
},
{
"epoch": 0.06586972432300561,
"grad_norm": 0.43899622559547424,
"learning_rate": 2.483370072614108e-07,
"loss": 1.7831,
"step": 270
},
{
"epoch": 0.06586972432300561,
"eval_loss": 1.839111328125,
"eval_runtime": 156.1734,
"eval_samples_per_second": 1.639,
"eval_steps_per_second": 0.82,
"step": 270
},
{
"epoch": 0.06611368626494267,
"grad_norm": 0.44969475269317627,
"learning_rate": 2.483304583538689e-07,
"loss": 1.901,
"step": 271
},
{
"epoch": 0.06635764820687973,
"grad_norm": 0.42426538467407227,
"learning_rate": 2.4832390638783666e-07,
"loss": 1.8534,
"step": 272
},
{
"epoch": 0.06660161014881678,
"grad_norm": 0.511674702167511,
"learning_rate": 2.4831735136117095e-07,
"loss": 1.9139,
"step": 273
},
{
"epoch": 0.06684557209075384,
"grad_norm": 0.43454718589782715,
"learning_rate": 2.4831079327172674e-07,
"loss": 1.9442,
"step": 274
},
{
"epoch": 0.0670895340326909,
"grad_norm": 0.4460424780845642,
"learning_rate": 2.4830423211735686e-07,
"loss": 1.9378,
"step": 275
},
{
"epoch": 0.06733349597462795,
"grad_norm": 0.6298746466636658,
"learning_rate": 2.482976678959123e-07,
"loss": 1.8372,
"step": 276
},
{
"epoch": 0.06757745791656501,
"grad_norm": 0.44850224256515503,
"learning_rate": 2.4829110060524197e-07,
"loss": 1.8511,
"step": 277
},
{
"epoch": 0.06782141985850207,
"grad_norm": 0.4357118308544159,
"learning_rate": 2.482845302431927e-07,
"loss": 1.763,
"step": 278
},
{
"epoch": 0.06806538180043913,
"grad_norm": 0.3952440023422241,
"learning_rate": 2.4827795680760933e-07,
"loss": 1.9439,
"step": 279
},
{
"epoch": 0.06830934374237618,
"grad_norm": 0.4903910458087921,
"learning_rate": 2.482713802963348e-07,
"loss": 1.811,
"step": 280
},
{
"epoch": 0.06830934374237618,
"eval_loss": 1.8365715742111206,
"eval_runtime": 157.7942,
"eval_samples_per_second": 1.622,
"eval_steps_per_second": 0.811,
"step": 280
},
{
"epoch": 0.06855330568431325,
"grad_norm": 0.5027759075164795,
"learning_rate": 2.4826480070720985e-07,
"loss": 1.9209,
"step": 281
},
{
"epoch": 0.06879726762625031,
"grad_norm": 0.4530917704105377,
"learning_rate": 2.482582180380734e-07,
"loss": 1.8037,
"step": 282
},
{
"epoch": 0.06904122956818737,
"grad_norm": 0.4016598165035248,
"learning_rate": 2.482516322867622e-07,
"loss": 1.8756,
"step": 283
},
{
"epoch": 0.06928519151012442,
"grad_norm": 0.4351702630519867,
"learning_rate": 2.48245043451111e-07,
"loss": 2.0021,
"step": 284
},
{
"epoch": 0.06952915345206148,
"grad_norm": 0.4535478949546814,
"learning_rate": 2.482384515289525e-07,
"loss": 1.8903,
"step": 285
},
{
"epoch": 0.06977311539399854,
"grad_norm": 0.4296678304672241,
"learning_rate": 2.482318565181174e-07,
"loss": 1.916,
"step": 286
},
{
"epoch": 0.0700170773359356,
"grad_norm": 0.6348395347595215,
"learning_rate": 2.4822525841643453e-07,
"loss": 1.895,
"step": 287
},
{
"epoch": 0.07026103927787265,
"grad_norm": 0.4949493706226349,
"learning_rate": 2.482186572217303e-07,
"loss": 2.07,
"step": 288
},
{
"epoch": 0.07050500121980971,
"grad_norm": 0.4145565927028656,
"learning_rate": 2.482120529318294e-07,
"loss": 1.8886,
"step": 289
},
{
"epoch": 0.07074896316174677,
"grad_norm": 0.5197605490684509,
"learning_rate": 2.482054455445545e-07,
"loss": 1.876,
"step": 290
},
{
"epoch": 0.07074896316174677,
"eval_loss": 1.8359309434890747,
"eval_runtime": 156.5279,
"eval_samples_per_second": 1.635,
"eval_steps_per_second": 0.818,
"step": 290
},
{
"epoch": 0.07099292510368382,
"grad_norm": 0.42653581500053406,
"learning_rate": 2.481988350577259e-07,
"loss": 1.8605,
"step": 291
},
{
"epoch": 0.07123688704562088,
"grad_norm": 0.3822322189807892,
"learning_rate": 2.481922214691622e-07,
"loss": 1.844,
"step": 292
},
{
"epoch": 0.07148084898755794,
"grad_norm": 0.4121018946170807,
"learning_rate": 2.481856047766798e-07,
"loss": 1.9521,
"step": 293
},
{
"epoch": 0.071724810929495,
"grad_norm": 0.3980840742588043,
"learning_rate": 2.4817898497809304e-07,
"loss": 1.8008,
"step": 294
},
{
"epoch": 0.07196877287143205,
"grad_norm": 0.7482399344444275,
"learning_rate": 2.4817236207121427e-07,
"loss": 1.8344,
"step": 295
},
{
"epoch": 0.07221273481336911,
"grad_norm": 0.5517648458480835,
"learning_rate": 2.4816573605385374e-07,
"loss": 1.9856,
"step": 296
},
{
"epoch": 0.07245669675530617,
"grad_norm": 0.3954029381275177,
"learning_rate": 2.481591069238197e-07,
"loss": 1.7306,
"step": 297
},
{
"epoch": 0.07270065869724324,
"grad_norm": 0.6213473677635193,
"learning_rate": 2.481524746789182e-07,
"loss": 1.873,
"step": 298
},
{
"epoch": 0.0729446206391803,
"grad_norm": 0.42206960916519165,
"learning_rate": 2.4814583931695343e-07,
"loss": 1.9073,
"step": 299
},
{
"epoch": 0.07318858258111735,
"grad_norm": 0.4138680100440979,
"learning_rate": 2.4813920083572734e-07,
"loss": 1.7581,
"step": 300
},
{
"epoch": 0.07318858258111735,
"eval_loss": 1.8346822261810303,
"eval_runtime": 156.8712,
"eval_samples_per_second": 1.632,
"eval_steps_per_second": 0.816,
"step": 300
},
{
"epoch": 0.07343254452305441,
"grad_norm": 0.9438842535018921,
"learning_rate": 2.481325592330399e-07,
"loss": 1.8472,
"step": 301
},
{
"epoch": 0.07367650646499146,
"grad_norm": 0.3860412538051605,
"learning_rate": 2.4812591450668896e-07,
"loss": 1.8402,
"step": 302
},
{
"epoch": 0.07392046840692852,
"grad_norm": 0.33647987246513367,
"learning_rate": 2.4811926665447034e-07,
"loss": 1.9474,
"step": 303
},
{
"epoch": 0.07416443034886558,
"grad_norm": 0.3667222559452057,
"learning_rate": 2.481126156741779e-07,
"loss": 1.8661,
"step": 304
},
{
"epoch": 0.07440839229080264,
"grad_norm": 0.47111183404922485,
"learning_rate": 2.481059615636031e-07,
"loss": 1.7963,
"step": 305
},
{
"epoch": 0.07465235423273969,
"grad_norm": 0.4970519244670868,
"learning_rate": 2.480993043205356e-07,
"loss": 1.7931,
"step": 306
},
{
"epoch": 0.07489631617467675,
"grad_norm": 0.43172699213027954,
"learning_rate": 2.4809264394276297e-07,
"loss": 1.8096,
"step": 307
},
{
"epoch": 0.0751402781166138,
"grad_norm": 1.3444660902023315,
"learning_rate": 2.4808598042807057e-07,
"loss": 1.9013,
"step": 308
},
{
"epoch": 0.07538424005855086,
"grad_norm": 0.39566361904144287,
"learning_rate": 2.4807931377424167e-07,
"loss": 1.8494,
"step": 309
},
{
"epoch": 0.07562820200048792,
"grad_norm": 0.37536919116973877,
"learning_rate": 2.4807264397905757e-07,
"loss": 1.9214,
"step": 310
},
{
"epoch": 0.07562820200048792,
"eval_loss": 1.8326919078826904,
"eval_runtime": 156.8066,
"eval_samples_per_second": 1.633,
"eval_steps_per_second": 0.816,
"step": 310
},
{
"epoch": 0.07587216394242498,
"grad_norm": 0.515691339969635,
"learning_rate": 2.480659710402974e-07,
"loss": 1.8315,
"step": 311
},
{
"epoch": 0.07611612588436203,
"grad_norm": 0.5210254192352295,
"learning_rate": 2.480592949557383e-07,
"loss": 1.9244,
"step": 312
},
{
"epoch": 0.07636008782629909,
"grad_norm": 0.5208694338798523,
"learning_rate": 2.4805261572315513e-07,
"loss": 1.8838,
"step": 313
},
{
"epoch": 0.07660404976823615,
"grad_norm": 0.4405214786529541,
"learning_rate": 2.480459333403207e-07,
"loss": 1.816,
"step": 314
},
{
"epoch": 0.07684801171017322,
"grad_norm": 0.4438663423061371,
"learning_rate": 2.480392478050059e-07,
"loss": 1.7578,
"step": 315
},
{
"epoch": 0.07709197365211028,
"grad_norm": 0.4870030879974365,
"learning_rate": 2.4803255911497927e-07,
"loss": 2.0076,
"step": 316
},
{
"epoch": 0.07733593559404733,
"grad_norm": 0.44352516531944275,
"learning_rate": 2.4802586726800744e-07,
"loss": 1.8897,
"step": 317
},
{
"epoch": 0.07757989753598439,
"grad_norm": 0.40144485235214233,
"learning_rate": 2.4801917226185476e-07,
"loss": 1.9574,
"step": 318
},
{
"epoch": 0.07782385947792145,
"grad_norm": 0.4221437871456146,
"learning_rate": 2.480124740942837e-07,
"loss": 1.8748,
"step": 319
},
{
"epoch": 0.0780678214198585,
"grad_norm": 0.39843979477882385,
"learning_rate": 2.480057727630543e-07,
"loss": 1.996,
"step": 320
},
{
"epoch": 0.0780678214198585,
"eval_loss": 1.8313816785812378,
"eval_runtime": 156.6502,
"eval_samples_per_second": 1.634,
"eval_steps_per_second": 0.817,
"step": 320
},
{
"epoch": 0.07831178336179556,
"grad_norm": 0.7306655645370483,
"learning_rate": 2.479990682659248e-07,
"loss": 1.8732,
"step": 321
},
{
"epoch": 0.07855574530373262,
"grad_norm": 0.46410149335861206,
"learning_rate": 2.4799236060065104e-07,
"loss": 1.9037,
"step": 322
},
{
"epoch": 0.07879970724566968,
"grad_norm": 0.4528440833091736,
"learning_rate": 2.47985649764987e-07,
"loss": 1.8296,
"step": 323
},
{
"epoch": 0.07904366918760673,
"grad_norm": 0.5731680989265442,
"learning_rate": 2.4797893575668437e-07,
"loss": 1.839,
"step": 324
},
{
"epoch": 0.07928763112954379,
"grad_norm": 0.3977627456188202,
"learning_rate": 2.4797221857349267e-07,
"loss": 1.9664,
"step": 325
},
{
"epoch": 0.07953159307148085,
"grad_norm": 0.7255275249481201,
"learning_rate": 2.4796549821315954e-07,
"loss": 1.8649,
"step": 326
},
{
"epoch": 0.0797755550134179,
"grad_norm": 0.4904336929321289,
"learning_rate": 2.479587746734302e-07,
"loss": 1.945,
"step": 327
},
{
"epoch": 0.08001951695535496,
"grad_norm": 0.46819430589675903,
"learning_rate": 2.4795204795204794e-07,
"loss": 1.894,
"step": 328
},
{
"epoch": 0.08026347889729202,
"grad_norm": 0.8833802938461304,
"learning_rate": 2.479453180467538e-07,
"loss": 1.8628,
"step": 329
},
{
"epoch": 0.08050744083922907,
"grad_norm": 0.44334056973457336,
"learning_rate": 2.479385849552867e-07,
"loss": 1.8583,
"step": 330
},
{
"epoch": 0.08050744083922907,
"eval_loss": 1.8302311897277832,
"eval_runtime": 156.8163,
"eval_samples_per_second": 1.632,
"eval_steps_per_second": 0.816,
"step": 330
},
{
"epoch": 0.08075140278116613,
"grad_norm": 0.4154978394508362,
"learning_rate": 2.479318486753834e-07,
"loss": 1.7181,
"step": 331
},
{
"epoch": 0.0809953647231032,
"grad_norm": 0.5498473048210144,
"learning_rate": 2.479251092047787e-07,
"loss": 2.1092,
"step": 332
},
{
"epoch": 0.08123932666504026,
"grad_norm": 0.41959795355796814,
"learning_rate": 2.4791836654120494e-07,
"loss": 1.853,
"step": 333
},
{
"epoch": 0.08148328860697732,
"grad_norm": 0.48775970935821533,
"learning_rate": 2.4791162068239256e-07,
"loss": 1.878,
"step": 334
},
{
"epoch": 0.08172725054891437,
"grad_norm": 1.0387691259384155,
"learning_rate": 2.4790487162606977e-07,
"loss": 1.9639,
"step": 335
},
{
"epoch": 0.08197121249085143,
"grad_norm": 0.4307618737220764,
"learning_rate": 2.478981193699626e-07,
"loss": 1.798,
"step": 336
},
{
"epoch": 0.08221517443278849,
"grad_norm": 0.8073650598526001,
"learning_rate": 2.478913639117949e-07,
"loss": 1.8512,
"step": 337
},
{
"epoch": 0.08245913637472554,
"grad_norm": 0.785327136516571,
"learning_rate": 2.478846052492885e-07,
"loss": 1.8926,
"step": 338
},
{
"epoch": 0.0827030983166626,
"grad_norm": 0.4723658263683319,
"learning_rate": 2.478778433801629e-07,
"loss": 1.9997,
"step": 339
},
{
"epoch": 0.08294706025859966,
"grad_norm": 0.4107203185558319,
"learning_rate": 2.478710783021355e-07,
"loss": 1.8609,
"step": 340
},
{
"epoch": 0.08294706025859966,
"eval_loss": 1.829516887664795,
"eval_runtime": 156.5752,
"eval_samples_per_second": 1.635,
"eval_steps_per_second": 0.817,
"step": 340
},
{
"epoch": 0.08319102220053672,
"grad_norm": 0.40097326040267944,
"learning_rate": 2.4786431001292156e-07,
"loss": 1.7514,
"step": 341
},
{
"epoch": 0.08343498414247377,
"grad_norm": 0.39558151364326477,
"learning_rate": 2.478575385102342e-07,
"loss": 1.9019,
"step": 342
},
{
"epoch": 0.08367894608441083,
"grad_norm": 0.3937402367591858,
"learning_rate": 2.4785076379178427e-07,
"loss": 2.0703,
"step": 343
},
{
"epoch": 0.08392290802634789,
"grad_norm": 0.3737332820892334,
"learning_rate": 2.478439858552805e-07,
"loss": 1.8953,
"step": 344
},
{
"epoch": 0.08416686996828494,
"grad_norm": 0.3693140745162964,
"learning_rate": 2.4783720469842943e-07,
"loss": 1.8952,
"step": 345
},
{
"epoch": 0.084410831910222,
"grad_norm": 0.41011977195739746,
"learning_rate": 2.4783042031893544e-07,
"loss": 1.7306,
"step": 346
},
{
"epoch": 0.08465479385215906,
"grad_norm": 0.4407089352607727,
"learning_rate": 2.478236327145007e-07,
"loss": 1.8516,
"step": 347
},
{
"epoch": 0.08489875579409611,
"grad_norm": 0.4775758683681488,
"learning_rate": 2.4781684188282526e-07,
"loss": 1.8198,
"step": 348
},
{
"epoch": 0.08514271773603319,
"grad_norm": 0.37072694301605225,
"learning_rate": 2.4781004782160693e-07,
"loss": 1.9177,
"step": 349
},
{
"epoch": 0.08538667967797024,
"grad_norm": 0.3914446532726288,
"learning_rate": 2.478032505285412e-07,
"loss": 1.8334,
"step": 350
},
{
"epoch": 0.08538667967797024,
"eval_loss": 1.8291497230529785,
"eval_runtime": 157.2832,
"eval_samples_per_second": 1.628,
"eval_steps_per_second": 0.814,
"step": 350
},
{
"epoch": 0.0856306416199073,
"grad_norm": 0.40111953020095825,
"learning_rate": 2.4779645000132166e-07,
"loss": 1.9745,
"step": 351
},
{
"epoch": 0.08587460356184436,
"grad_norm": 0.4218769967556,
"learning_rate": 2.477896462376395e-07,
"loss": 1.7767,
"step": 352
},
{
"epoch": 0.08611856550378141,
"grad_norm": 1.2748806476593018,
"learning_rate": 2.4778283923518366e-07,
"loss": 1.9835,
"step": 353
},
{
"epoch": 0.08636252744571847,
"grad_norm": 0.9254433512687683,
"learning_rate": 2.477760289916411e-07,
"loss": 1.8909,
"step": 354
},
{
"epoch": 0.08660648938765553,
"grad_norm": 1.155629277229309,
"learning_rate": 2.477692155046964e-07,
"loss": 2.0672,
"step": 355
},
{
"epoch": 0.08685045132959258,
"grad_norm": 0.6299034357070923,
"learning_rate": 2.47762398772032e-07,
"loss": 1.9787,
"step": 356
},
{
"epoch": 0.08709441327152964,
"grad_norm": 0.7239134907722473,
"learning_rate": 2.4775557879132803e-07,
"loss": 1.7728,
"step": 357
},
{
"epoch": 0.0873383752134667,
"grad_norm": 0.4112605154514313,
"learning_rate": 2.4774875556026265e-07,
"loss": 1.824,
"step": 358
},
{
"epoch": 0.08758233715540376,
"grad_norm": 0.4959578812122345,
"learning_rate": 2.477419290765115e-07,
"loss": 1.7778,
"step": 359
},
{
"epoch": 0.08782629909734081,
"grad_norm": 0.4753192961215973,
"learning_rate": 2.4773509933774833e-07,
"loss": 1.6845,
"step": 360
},
{
"epoch": 0.08782629909734081,
"eval_loss": 1.8272368907928467,
"eval_runtime": 156.5455,
"eval_samples_per_second": 1.635,
"eval_steps_per_second": 0.818,
"step": 360
},
{
"epoch": 0.08807026103927787,
"grad_norm": 0.39284539222717285,
"learning_rate": 2.4772826634164435e-07,
"loss": 1.6858,
"step": 361
},
{
"epoch": 0.08831422298121493,
"grad_norm": 0.48466554284095764,
"learning_rate": 2.4772143008586876e-07,
"loss": 1.9059,
"step": 362
},
{
"epoch": 0.08855818492315198,
"grad_norm": 0.4809161424636841,
"learning_rate": 2.4771459056808844e-07,
"loss": 1.9083,
"step": 363
},
{
"epoch": 0.08880214686508904,
"grad_norm": 0.5406439900398254,
"learning_rate": 2.477077477859681e-07,
"loss": 1.8219,
"step": 364
},
{
"epoch": 0.0890461088070261,
"grad_norm": 0.5194385647773743,
"learning_rate": 2.4770090173717014e-07,
"loss": 1.7921,
"step": 365
},
{
"epoch": 0.08929007074896317,
"grad_norm": 0.412882536649704,
"learning_rate": 2.4769405241935484e-07,
"loss": 1.7941,
"step": 366
},
{
"epoch": 0.08953403269090023,
"grad_norm": 0.37151506543159485,
"learning_rate": 2.476871998301802e-07,
"loss": 1.7942,
"step": 367
},
{
"epoch": 0.08977799463283728,
"grad_norm": 0.4231220483779907,
"learning_rate": 2.476803439673019e-07,
"loss": 1.8722,
"step": 368
},
{
"epoch": 0.09002195657477434,
"grad_norm": 0.5867494344711304,
"learning_rate": 2.476734848283735e-07,
"loss": 1.9138,
"step": 369
},
{
"epoch": 0.0902659185167114,
"grad_norm": 0.3956262171268463,
"learning_rate": 2.476666224110462e-07,
"loss": 1.9813,
"step": 370
},
{
"epoch": 0.0902659185167114,
"eval_loss": 1.826444149017334,
"eval_runtime": 157.275,
"eval_samples_per_second": 1.628,
"eval_steps_per_second": 0.814,
"step": 370
},
{
"epoch": 0.09050988045864845,
"grad_norm": 0.42614656686782837,
"learning_rate": 2.476597567129691e-07,
"loss": 1.7726,
"step": 371
},
{
"epoch": 0.09075384240058551,
"grad_norm": 0.47062888741493225,
"learning_rate": 2.4765288773178894e-07,
"loss": 1.8998,
"step": 372
},
{
"epoch": 0.09099780434252257,
"grad_norm": 0.43838515877723694,
"learning_rate": 2.476460154651503e-07,
"loss": 1.8538,
"step": 373
},
{
"epoch": 0.09124176628445962,
"grad_norm": 0.6669487357139587,
"learning_rate": 2.4763913991069527e-07,
"loss": 1.8683,
"step": 374
},
{
"epoch": 0.09148572822639668,
"grad_norm": 0.4067532420158386,
"learning_rate": 2.4763226106606407e-07,
"loss": 1.8279,
"step": 375
},
{
"epoch": 0.09172969016833374,
"grad_norm": 1.4081276655197144,
"learning_rate": 2.476253789288943e-07,
"loss": 1.6806,
"step": 376
},
{
"epoch": 0.0919736521102708,
"grad_norm": 0.5126282572746277,
"learning_rate": 2.4761849349682154e-07,
"loss": 1.7196,
"step": 377
},
{
"epoch": 0.09221761405220785,
"grad_norm": 0.47513243556022644,
"learning_rate": 2.4761160476747895e-07,
"loss": 1.7233,
"step": 378
},
{
"epoch": 0.09246157599414491,
"grad_norm": 0.5680952072143555,
"learning_rate": 2.4760471273849755e-07,
"loss": 1.9624,
"step": 379
},
{
"epoch": 0.09270553793608197,
"grad_norm": 0.4912157654762268,
"learning_rate": 2.47597817407506e-07,
"loss": 1.961,
"step": 380
},
{
"epoch": 0.09270553793608197,
"eval_loss": 1.8258123397827148,
"eval_runtime": 156.3289,
"eval_samples_per_second": 1.638,
"eval_steps_per_second": 0.819,
"step": 380
},
{
"epoch": 0.09294949987801902,
"grad_norm": 0.5005534291267395,
"learning_rate": 2.475909187721307e-07,
"loss": 1.8626,
"step": 381
},
{
"epoch": 0.09319346181995608,
"grad_norm": 0.45611926913261414,
"learning_rate": 2.4758401682999573e-07,
"loss": 1.919,
"step": 382
},
{
"epoch": 0.09343742376189315,
"grad_norm": 0.5665335655212402,
"learning_rate": 2.475771115787231e-07,
"loss": 1.8476,
"step": 383
},
{
"epoch": 0.09368138570383021,
"grad_norm": 0.4179742634296417,
"learning_rate": 2.475702030159322e-07,
"loss": 1.7702,
"step": 384
},
{
"epoch": 0.09392534764576727,
"grad_norm": 0.44780439138412476,
"learning_rate": 2.475632911392405e-07,
"loss": 1.7905,
"step": 385
},
{
"epoch": 0.09416930958770432,
"grad_norm": 0.9271466732025146,
"learning_rate": 2.475563759462629e-07,
"loss": 1.976,
"step": 386
},
{
"epoch": 0.09441327152964138,
"grad_norm": 0.6895579099655151,
"learning_rate": 2.475494574346122e-07,
"loss": 1.9016,
"step": 387
},
{
"epoch": 0.09465723347157844,
"grad_norm": 0.4328395426273346,
"learning_rate": 2.475425356018988e-07,
"loss": 1.7875,
"step": 388
},
{
"epoch": 0.0949011954135155,
"grad_norm": 0.4196988344192505,
"learning_rate": 2.475356104457307e-07,
"loss": 1.7607,
"step": 389
},
{
"epoch": 0.09514515735545255,
"grad_norm": 0.4333524703979492,
"learning_rate": 2.4752868196371393e-07,
"loss": 1.9771,
"step": 390
},
{
"epoch": 0.09514515735545255,
"eval_loss": 1.8251597881317139,
"eval_runtime": 157.0151,
"eval_samples_per_second": 1.63,
"eval_steps_per_second": 0.815,
"step": 390
},
{
"epoch": 0.09538911929738961,
"grad_norm": 0.6076596975326538,
"learning_rate": 2.47521750153452e-07,
"loss": 2.1356,
"step": 391
},
{
"epoch": 0.09563308123932666,
"grad_norm": 0.43572092056274414,
"learning_rate": 2.4751481501254606e-07,
"loss": 1.9217,
"step": 392
},
{
"epoch": 0.09587704318126372,
"grad_norm": 23.73161506652832,
"learning_rate": 2.4750787653859505e-07,
"loss": 2.1093,
"step": 393
},
{
"epoch": 0.09612100512320078,
"grad_norm": 0.46901410818099976,
"learning_rate": 2.475009347291956e-07,
"loss": 1.9877,
"step": 394
},
{
"epoch": 0.09636496706513784,
"grad_norm": 0.4053335189819336,
"learning_rate": 2.47493989581942e-07,
"loss": 1.9272,
"step": 395
},
{
"epoch": 0.09660892900707489,
"grad_norm": 0.4614839255809784,
"learning_rate": 2.4748704109442635e-07,
"loss": 1.885,
"step": 396
},
{
"epoch": 0.09685289094901195,
"grad_norm": 0.4277932047843933,
"learning_rate": 2.4748008926423817e-07,
"loss": 1.808,
"step": 397
},
{
"epoch": 0.097096852890949,
"grad_norm": 0.41171425580978394,
"learning_rate": 2.474731340889649e-07,
"loss": 1.928,
"step": 398
},
{
"epoch": 0.09734081483288606,
"grad_norm": 0.41549429297447205,
"learning_rate": 2.4746617556619163e-07,
"loss": 1.7844,
"step": 399
},
{
"epoch": 0.09758477677482313,
"grad_norm": 0.4279956817626953,
"learning_rate": 2.4745921369350094e-07,
"loss": 1.9173,
"step": 400
},
{
"epoch": 0.09758477677482313,
"eval_loss": 1.823663353919983,
"eval_runtime": 157.0142,
"eval_samples_per_second": 1.63,
"eval_steps_per_second": 0.815,
"step": 400
}
],
"logging_steps": 1,
"max_steps": 4099,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0291845984681984e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}