{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.2468827930174564,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012468827930174563,
"grad_norm": 0.804787278175354,
"learning_rate": 0.0,
"loss": 1.1757,
"step": 1
},
{
"epoch": 0.0024937655860349127,
"grad_norm": 0.790920615196228,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.1733,
"step": 2
},
{
"epoch": 0.003740648379052369,
"grad_norm": 0.7582283616065979,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2245,
"step": 3
},
{
"epoch": 0.004987531172069825,
"grad_norm": 0.8788309097290039,
"learning_rate": 6e-06,
"loss": 1.2966,
"step": 4
},
{
"epoch": 0.006234413965087282,
"grad_norm": 0.8301644921302795,
"learning_rate": 8.000000000000001e-06,
"loss": 1.1881,
"step": 5
},
{
"epoch": 0.007481296758104738,
"grad_norm": 0.8226048946380615,
"learning_rate": 1e-05,
"loss": 1.2827,
"step": 6
},
{
"epoch": 0.008728179551122194,
"grad_norm": 0.9518041610717773,
"learning_rate": 9.98994974874372e-06,
"loss": 1.3434,
"step": 7
},
{
"epoch": 0.00997506234413965,
"grad_norm": 0.8201262354850769,
"learning_rate": 9.979899497487437e-06,
"loss": 1.325,
"step": 8
},
{
"epoch": 0.011221945137157107,
"grad_norm": 0.893440842628479,
"learning_rate": 9.969849246231156e-06,
"loss": 1.2632,
"step": 9
},
{
"epoch": 0.012468827930174564,
"grad_norm": 0.8964802026748657,
"learning_rate": 9.959798994974875e-06,
"loss": 1.443,
"step": 10
},
{
"epoch": 0.01371571072319202,
"grad_norm": 0.8286556005477905,
"learning_rate": 9.949748743718594e-06,
"loss": 1.2181,
"step": 11
},
{
"epoch": 0.014962593516209476,
"grad_norm": 0.8081548810005188,
"learning_rate": 9.939698492462311e-06,
"loss": 1.135,
"step": 12
},
{
"epoch": 0.016209476309226933,
"grad_norm": 0.9213963747024536,
"learning_rate": 9.929648241206032e-06,
"loss": 1.2343,
"step": 13
},
{
"epoch": 0.017456359102244388,
"grad_norm": 0.8860071301460266,
"learning_rate": 9.91959798994975e-06,
"loss": 1.1948,
"step": 14
},
{
"epoch": 0.018703241895261846,
"grad_norm": 0.862621545791626,
"learning_rate": 9.909547738693468e-06,
"loss": 1.2403,
"step": 15
},
{
"epoch": 0.0199501246882793,
"grad_norm": 0.8532464504241943,
"learning_rate": 9.899497487437186e-06,
"loss": 1.1682,
"step": 16
},
{
"epoch": 0.02119700748129676,
"grad_norm": 0.856429934501648,
"learning_rate": 9.889447236180906e-06,
"loss": 1.2942,
"step": 17
},
{
"epoch": 0.022443890274314215,
"grad_norm": 0.8162599802017212,
"learning_rate": 9.879396984924624e-06,
"loss": 1.1377,
"step": 18
},
{
"epoch": 0.02369077306733167,
"grad_norm": 0.8886470198631287,
"learning_rate": 9.869346733668343e-06,
"loss": 1.0809,
"step": 19
},
{
"epoch": 0.02493765586034913,
"grad_norm": 0.9593124985694885,
"learning_rate": 9.859296482412062e-06,
"loss": 1.2939,
"step": 20
},
{
"epoch": 0.026184538653366583,
"grad_norm": 0.7625219225883484,
"learning_rate": 9.84924623115578e-06,
"loss": 0.9588,
"step": 21
},
{
"epoch": 0.02743142144638404,
"grad_norm": 0.8982052206993103,
"learning_rate": 9.839195979899498e-06,
"loss": 1.1753,
"step": 22
},
{
"epoch": 0.028678304239401497,
"grad_norm": 0.7942224144935608,
"learning_rate": 9.829145728643217e-06,
"loss": 1.3045,
"step": 23
},
{
"epoch": 0.029925187032418952,
"grad_norm": 0.836660623550415,
"learning_rate": 9.819095477386936e-06,
"loss": 1.193,
"step": 24
},
{
"epoch": 0.03117206982543641,
"grad_norm": 0.8261688947677612,
"learning_rate": 9.809045226130655e-06,
"loss": 1.1704,
"step": 25
},
{
"epoch": 0.032418952618453865,
"grad_norm": 0.7656853795051575,
"learning_rate": 9.798994974874372e-06,
"loss": 1.1259,
"step": 26
},
{
"epoch": 0.03366583541147132,
"grad_norm": 0.7688207030296326,
"learning_rate": 9.788944723618091e-06,
"loss": 1.0714,
"step": 27
},
{
"epoch": 0.034912718204488775,
"grad_norm": 0.7970144152641296,
"learning_rate": 9.77889447236181e-06,
"loss": 1.0393,
"step": 28
},
{
"epoch": 0.03615960099750624,
"grad_norm": 0.7941057085990906,
"learning_rate": 9.768844221105529e-06,
"loss": 1.1779,
"step": 29
},
{
"epoch": 0.03740648379052369,
"grad_norm": 1.0016902685165405,
"learning_rate": 9.758793969849248e-06,
"loss": 1.225,
"step": 30
},
{
"epoch": 0.03865336658354115,
"grad_norm": 0.7756917476654053,
"learning_rate": 9.748743718592965e-06,
"loss": 1.0145,
"step": 31
},
{
"epoch": 0.0399002493765586,
"grad_norm": 0.7895839214324951,
"learning_rate": 9.738693467336684e-06,
"loss": 1.1148,
"step": 32
},
{
"epoch": 0.04114713216957606,
"grad_norm": 0.8240381479263306,
"learning_rate": 9.728643216080402e-06,
"loss": 1.2112,
"step": 33
},
{
"epoch": 0.04239401496259352,
"grad_norm": 0.8370051383972168,
"learning_rate": 9.718592964824122e-06,
"loss": 1.0704,
"step": 34
},
{
"epoch": 0.043640897755610975,
"grad_norm": 0.9508376121520996,
"learning_rate": 9.70854271356784e-06,
"loss": 0.9383,
"step": 35
},
{
"epoch": 0.04488778054862843,
"grad_norm": 0.8891584873199463,
"learning_rate": 9.698492462311559e-06,
"loss": 1.1037,
"step": 36
},
{
"epoch": 0.046134663341645885,
"grad_norm": NaN,
"learning_rate": 9.688442211055276e-06,
"loss": 1.0071,
"step": 37
},
{
"epoch": 0.04738154613466334,
"grad_norm": 0.9675924181938171,
"learning_rate": 9.688442211055276e-06,
"loss": 1.2434,
"step": 38
},
{
"epoch": 0.048628428927680795,
"grad_norm": 0.9512777924537659,
"learning_rate": 9.678391959798997e-06,
"loss": 1.3088,
"step": 39
},
{
"epoch": 0.04987531172069826,
"grad_norm": 0.8200197219848633,
"learning_rate": 9.668341708542714e-06,
"loss": 1.0938,
"step": 40
},
{
"epoch": 0.05112219451371571,
"grad_norm": 0.7867804169654846,
"learning_rate": 9.658291457286433e-06,
"loss": 1.0325,
"step": 41
},
{
"epoch": 0.05236907730673317,
"grad_norm": 0.845544159412384,
"learning_rate": 9.648241206030152e-06,
"loss": 1.1014,
"step": 42
},
{
"epoch": 0.05361596009975062,
"grad_norm": 0.957232654094696,
"learning_rate": 9.63819095477387e-06,
"loss": 1.1305,
"step": 43
},
{
"epoch": 0.05486284289276808,
"grad_norm": 0.9232934713363647,
"learning_rate": 9.628140703517588e-06,
"loss": 1.1946,
"step": 44
},
{
"epoch": 0.05610972568578554,
"grad_norm": 0.8494632244110107,
"learning_rate": 9.618090452261307e-06,
"loss": 0.9476,
"step": 45
},
{
"epoch": 0.057356608478802994,
"grad_norm": 0.9420594573020935,
"learning_rate": 9.608040201005026e-06,
"loss": 1.2314,
"step": 46
},
{
"epoch": 0.05860349127182045,
"grad_norm": 0.8446037173271179,
"learning_rate": 9.597989949748745e-06,
"loss": 1.0914,
"step": 47
},
{
"epoch": 0.059850374064837904,
"grad_norm": 0.8058528304100037,
"learning_rate": 9.587939698492464e-06,
"loss": 1.0774,
"step": 48
},
{
"epoch": 0.06109725685785536,
"grad_norm": 0.8828840255737305,
"learning_rate": 9.577889447236181e-06,
"loss": 1.0563,
"step": 49
},
{
"epoch": 0.06234413965087282,
"grad_norm": 0.8446970582008362,
"learning_rate": 9.5678391959799e-06,
"loss": 1.0121,
"step": 50
},
{
"epoch": 0.06359102244389027,
"grad_norm": 0.9123779535293579,
"learning_rate": 9.55778894472362e-06,
"loss": 1.0908,
"step": 51
},
{
"epoch": 0.06483790523690773,
"grad_norm": 0.8337482213973999,
"learning_rate": 9.547738693467338e-06,
"loss": 0.962,
"step": 52
},
{
"epoch": 0.06608478802992519,
"grad_norm": 0.9094334840774536,
"learning_rate": 9.537688442211056e-06,
"loss": 1.1041,
"step": 53
},
{
"epoch": 0.06733167082294264,
"grad_norm": 0.9099583625793457,
"learning_rate": 9.527638190954775e-06,
"loss": 1.17,
"step": 54
},
{
"epoch": 0.0685785536159601,
"grad_norm": 0.9041770100593567,
"learning_rate": 9.517587939698492e-06,
"loss": 1.1931,
"step": 55
},
{
"epoch": 0.06982543640897755,
"grad_norm": 0.8545364141464233,
"learning_rate": 9.507537688442213e-06,
"loss": 1.1039,
"step": 56
},
{
"epoch": 0.07107231920199501,
"grad_norm": 0.7713873982429504,
"learning_rate": 9.49748743718593e-06,
"loss": 1.0732,
"step": 57
},
{
"epoch": 0.07231920199501247,
"grad_norm": 0.8700547218322754,
"learning_rate": 9.487437185929649e-06,
"loss": 1.1166,
"step": 58
},
{
"epoch": 0.07356608478802992,
"grad_norm": 0.8497898578643799,
"learning_rate": 9.477386934673368e-06,
"loss": 1.0371,
"step": 59
},
{
"epoch": 0.07481296758104738,
"grad_norm": 0.8998361825942993,
"learning_rate": 9.467336683417087e-06,
"loss": 1.0501,
"step": 60
},
{
"epoch": 0.07605985037406483,
"grad_norm": 0.8359726071357727,
"learning_rate": 9.457286432160804e-06,
"loss": 1.0504,
"step": 61
},
{
"epoch": 0.0773067331670823,
"grad_norm": 0.8385725021362305,
"learning_rate": 9.447236180904523e-06,
"loss": 0.9689,
"step": 62
},
{
"epoch": 0.07855361596009976,
"grad_norm": 0.8776196837425232,
"learning_rate": 9.437185929648242e-06,
"loss": 1.2352,
"step": 63
},
{
"epoch": 0.0798004987531172,
"grad_norm": 0.8644843697547913,
"learning_rate": 9.427135678391961e-06,
"loss": 1.2404,
"step": 64
},
{
"epoch": 0.08104738154613467,
"grad_norm": 0.8087641000747681,
"learning_rate": 9.41708542713568e-06,
"loss": 1.1482,
"step": 65
},
{
"epoch": 0.08229426433915212,
"grad_norm": 0.8813942074775696,
"learning_rate": 9.407035175879397e-06,
"loss": 1.1132,
"step": 66
},
{
"epoch": 0.08354114713216958,
"grad_norm": 0.828109085559845,
"learning_rate": 9.396984924623116e-06,
"loss": 1.0426,
"step": 67
},
{
"epoch": 0.08478802992518704,
"grad_norm": 0.9150412678718567,
"learning_rate": 9.386934673366835e-06,
"loss": 1.0266,
"step": 68
},
{
"epoch": 0.08603491271820449,
"grad_norm": 0.8631511330604553,
"learning_rate": 9.376884422110554e-06,
"loss": 0.932,
"step": 69
},
{
"epoch": 0.08728179551122195,
"grad_norm": 0.9301472902297974,
"learning_rate": 9.366834170854272e-06,
"loss": 1.0432,
"step": 70
},
{
"epoch": 0.0885286783042394,
"grad_norm": 0.8076997399330139,
"learning_rate": 9.35678391959799e-06,
"loss": 1.0833,
"step": 71
},
{
"epoch": 0.08977556109725686,
"grad_norm": 0.8393658399581909,
"learning_rate": 9.34673366834171e-06,
"loss": 1.0647,
"step": 72
},
{
"epoch": 0.09102244389027432,
"grad_norm": 0.7714030742645264,
"learning_rate": 9.336683417085429e-06,
"loss": 0.9166,
"step": 73
},
{
"epoch": 0.09226932668329177,
"grad_norm": 0.9179011583328247,
"learning_rate": 9.326633165829146e-06,
"loss": 1.0333,
"step": 74
},
{
"epoch": 0.09351620947630923,
"grad_norm": 0.9402894973754883,
"learning_rate": 9.316582914572865e-06,
"loss": 0.853,
"step": 75
},
{
"epoch": 0.09476309226932668,
"grad_norm": 0.8588060736656189,
"learning_rate": 9.306532663316584e-06,
"loss": 1.0942,
"step": 76
},
{
"epoch": 0.09600997506234414,
"grad_norm": 0.8041732311248779,
"learning_rate": 9.296482412060303e-06,
"loss": 1.0404,
"step": 77
},
{
"epoch": 0.09725685785536159,
"grad_norm": 0.903870165348053,
"learning_rate": 9.28643216080402e-06,
"loss": 1.1191,
"step": 78
},
{
"epoch": 0.09850374064837905,
"grad_norm": 0.8884689211845398,
"learning_rate": 9.276381909547739e-06,
"loss": 1.0389,
"step": 79
},
{
"epoch": 0.09975062344139651,
"grad_norm": 0.8007616996765137,
"learning_rate": 9.266331658291458e-06,
"loss": 0.9541,
"step": 80
},
{
"epoch": 0.10099750623441396,
"grad_norm": 0.8733034729957581,
"learning_rate": 9.256281407035177e-06,
"loss": 1.0882,
"step": 81
},
{
"epoch": 0.10224438902743142,
"grad_norm": 0.8743678331375122,
"learning_rate": 9.246231155778896e-06,
"loss": 1.1188,
"step": 82
},
{
"epoch": 0.10349127182044887,
"grad_norm": 0.8868730664253235,
"learning_rate": 9.236180904522613e-06,
"loss": 1.0531,
"step": 83
},
{
"epoch": 0.10473815461346633,
"grad_norm": 0.9104211926460266,
"learning_rate": 9.226130653266332e-06,
"loss": 1.0754,
"step": 84
},
{
"epoch": 0.1059850374064838,
"grad_norm": 0.7882856726646423,
"learning_rate": 9.216080402010051e-06,
"loss": 0.9641,
"step": 85
},
{
"epoch": 0.10723192019950124,
"grad_norm": 0.9119329452514648,
"learning_rate": 9.20603015075377e-06,
"loss": 1.1104,
"step": 86
},
{
"epoch": 0.1084788029925187,
"grad_norm": 0.8091046810150146,
"learning_rate": 9.195979899497488e-06,
"loss": 0.9407,
"step": 87
},
{
"epoch": 0.10972568578553615,
"grad_norm": 0.8358633518218994,
"learning_rate": 9.185929648241207e-06,
"loss": 1.1229,
"step": 88
},
{
"epoch": 0.11097256857855362,
"grad_norm": 0.8730635046958923,
"learning_rate": 9.175879396984926e-06,
"loss": 1.0661,
"step": 89
},
{
"epoch": 0.11221945137157108,
"grad_norm": 0.8081979155540466,
"learning_rate": 9.165829145728645e-06,
"loss": 0.9716,
"step": 90
},
{
"epoch": 0.11346633416458853,
"grad_norm": 0.8541569709777832,
"learning_rate": 9.155778894472362e-06,
"loss": 1.0404,
"step": 91
},
{
"epoch": 0.11471321695760599,
"grad_norm": 0.910975992679596,
"learning_rate": 9.14572864321608e-06,
"loss": 0.9578,
"step": 92
},
{
"epoch": 0.11596009975062344,
"grad_norm": 1.0678647756576538,
"learning_rate": 9.1356783919598e-06,
"loss": 1.1944,
"step": 93
},
{
"epoch": 0.1172069825436409,
"grad_norm": 0.8805521726608276,
"learning_rate": 9.125628140703519e-06,
"loss": 1.0066,
"step": 94
},
{
"epoch": 0.11845386533665836,
"grad_norm": 0.9644659757614136,
"learning_rate": 9.115577889447236e-06,
"loss": 1.2971,
"step": 95
},
{
"epoch": 0.11970074812967581,
"grad_norm": 0.8603120446205139,
"learning_rate": 9.105527638190955e-06,
"loss": 1.0409,
"step": 96
},
{
"epoch": 0.12094763092269327,
"grad_norm": 0.9210130572319031,
"learning_rate": 9.095477386934674e-06,
"loss": 1.002,
"step": 97
},
{
"epoch": 0.12219451371571072,
"grad_norm": 1.0696684122085571,
"learning_rate": 9.085427135678393e-06,
"loss": 1.1809,
"step": 98
},
{
"epoch": 0.12344139650872818,
"grad_norm": 0.8545973896980286,
"learning_rate": 9.075376884422112e-06,
"loss": 0.9832,
"step": 99
},
{
"epoch": 0.12468827930174564,
"grad_norm": 0.8907841444015503,
"learning_rate": 9.06532663316583e-06,
"loss": 1.0469,
"step": 100
},
{
"epoch": 0.1259351620947631,
"grad_norm": 0.9196416735649109,
"learning_rate": 9.055276381909548e-06,
"loss": 1.1903,
"step": 101
},
{
"epoch": 0.12718204488778054,
"grad_norm": 0.9015523195266724,
"learning_rate": 9.045226130653267e-06,
"loss": 0.9898,
"step": 102
},
{
"epoch": 0.128428927680798,
"grad_norm": 0.9770464301109314,
"learning_rate": 9.035175879396986e-06,
"loss": 0.9703,
"step": 103
},
{
"epoch": 0.12967581047381546,
"grad_norm": 0.9229538440704346,
"learning_rate": 9.025125628140704e-06,
"loss": 1.1239,
"step": 104
},
{
"epoch": 0.13092269326683292,
"grad_norm": 0.9863462448120117,
"learning_rate": 9.015075376884423e-06,
"loss": 1.1099,
"step": 105
},
{
"epoch": 0.13216957605985039,
"grad_norm": 0.8445208668708801,
"learning_rate": 9.005025125628142e-06,
"loss": 0.8989,
"step": 106
},
{
"epoch": 0.13341645885286782,
"grad_norm": 0.9167593121528625,
"learning_rate": 8.99497487437186e-06,
"loss": 1.0772,
"step": 107
},
{
"epoch": 0.13466334164588528,
"grad_norm": 0.9626464247703552,
"learning_rate": 8.984924623115578e-06,
"loss": 1.0707,
"step": 108
},
{
"epoch": 0.13591022443890274,
"grad_norm": 0.8707993626594543,
"learning_rate": 8.974874371859297e-06,
"loss": 1.1505,
"step": 109
},
{
"epoch": 0.1371571072319202,
"grad_norm": 0.8417862057685852,
"learning_rate": 8.964824120603016e-06,
"loss": 0.9744,
"step": 110
},
{
"epoch": 0.13840399002493767,
"grad_norm": 0.9404187798500061,
"learning_rate": 8.954773869346735e-06,
"loss": 1.029,
"step": 111
},
{
"epoch": 0.1396508728179551,
"grad_norm": 0.930839478969574,
"learning_rate": 8.944723618090452e-06,
"loss": 1.0657,
"step": 112
},
{
"epoch": 0.14089775561097256,
"grad_norm": 0.9424152970314026,
"learning_rate": 8.934673366834171e-06,
"loss": 1.1423,
"step": 113
},
{
"epoch": 0.14214463840399003,
"grad_norm": 0.929474949836731,
"learning_rate": 8.92462311557789e-06,
"loss": 1.1445,
"step": 114
},
{
"epoch": 0.1433915211970075,
"grad_norm": 0.8301953077316284,
"learning_rate": 8.914572864321609e-06,
"loss": 0.9701,
"step": 115
},
{
"epoch": 0.14463840399002495,
"grad_norm": 0.8988891839981079,
"learning_rate": 8.904522613065328e-06,
"loss": 1.0065,
"step": 116
},
{
"epoch": 0.14588528678304238,
"grad_norm": 1.025470495223999,
"learning_rate": 8.894472361809045e-06,
"loss": 1.1352,
"step": 117
},
{
"epoch": 0.14713216957605985,
"grad_norm": 0.9278755784034729,
"learning_rate": 8.884422110552764e-06,
"loss": 1.0044,
"step": 118
},
{
"epoch": 0.1483790523690773,
"grad_norm": 0.8967325091362,
"learning_rate": 8.874371859296483e-06,
"loss": 0.9192,
"step": 119
},
{
"epoch": 0.14962593516209477,
"grad_norm": 0.9023865461349487,
"learning_rate": 8.864321608040202e-06,
"loss": 1.1131,
"step": 120
},
{
"epoch": 0.15087281795511223,
"grad_norm": 0.9116983413696289,
"learning_rate": 8.85427135678392e-06,
"loss": 1.0766,
"step": 121
},
{
"epoch": 0.15211970074812967,
"grad_norm": 0.8636501431465149,
"learning_rate": 8.84422110552764e-06,
"loss": 0.9834,
"step": 122
},
{
"epoch": 0.15336658354114713,
"grad_norm": 0.8848153352737427,
"learning_rate": 8.834170854271358e-06,
"loss": 1.0549,
"step": 123
},
{
"epoch": 0.1546134663341646,
"grad_norm": 0.9087468385696411,
"learning_rate": 8.824120603015077e-06,
"loss": 0.9653,
"step": 124
},
{
"epoch": 0.15586034912718205,
"grad_norm": 0.9463018774986267,
"learning_rate": 8.814070351758794e-06,
"loss": 1.1276,
"step": 125
},
{
"epoch": 0.1571072319201995,
"grad_norm": 0.8822473883628845,
"learning_rate": 8.804020100502513e-06,
"loss": 0.915,
"step": 126
},
{
"epoch": 0.15835411471321695,
"grad_norm": 0.7982257604598999,
"learning_rate": 8.793969849246232e-06,
"loss": 0.9285,
"step": 127
},
{
"epoch": 0.1596009975062344,
"grad_norm": 0.9931653738021851,
"learning_rate": 8.78391959798995e-06,
"loss": 1.0668,
"step": 128
},
{
"epoch": 0.16084788029925187,
"grad_norm": 0.9117117524147034,
"learning_rate": 8.773869346733668e-06,
"loss": 0.9205,
"step": 129
},
{
"epoch": 0.16209476309226933,
"grad_norm": 0.9098284244537354,
"learning_rate": 8.763819095477387e-06,
"loss": 1.0401,
"step": 130
},
{
"epoch": 0.1633416458852868,
"grad_norm": 0.8916042447090149,
"learning_rate": 8.753768844221106e-06,
"loss": 0.8941,
"step": 131
},
{
"epoch": 0.16458852867830423,
"grad_norm": 0.9162269234657288,
"learning_rate": 8.743718592964825e-06,
"loss": 0.9431,
"step": 132
},
{
"epoch": 0.1658354114713217,
"grad_norm": 0.8569380640983582,
"learning_rate": 8.733668341708544e-06,
"loss": 1.0078,
"step": 133
},
{
"epoch": 0.16708229426433915,
"grad_norm": 0.9323944449424744,
"learning_rate": 8.723618090452261e-06,
"loss": 1.0198,
"step": 134
},
{
"epoch": 0.16832917705735662,
"grad_norm": 0.8320883512496948,
"learning_rate": 8.71356783919598e-06,
"loss": 0.9819,
"step": 135
},
{
"epoch": 0.16957605985037408,
"grad_norm": 0.9712987542152405,
"learning_rate": 8.7035175879397e-06,
"loss": 1.102,
"step": 136
},
{
"epoch": 0.1708229426433915,
"grad_norm": 0.8580834865570068,
"learning_rate": 8.693467336683418e-06,
"loss": 1.0492,
"step": 137
},
{
"epoch": 0.17206982543640897,
"grad_norm": 0.8358916640281677,
"learning_rate": 8.683417085427136e-06,
"loss": 0.9639,
"step": 138
},
{
"epoch": 0.17331670822942644,
"grad_norm": 0.9156075716018677,
"learning_rate": 8.673366834170856e-06,
"loss": 0.9791,
"step": 139
},
{
"epoch": 0.1745635910224439,
"grad_norm": 0.8522980809211731,
"learning_rate": 8.663316582914574e-06,
"loss": 0.9836,
"step": 140
},
{
"epoch": 0.17581047381546136,
"grad_norm": 0.9722943305969238,
"learning_rate": 8.653266331658293e-06,
"loss": 0.9084,
"step": 141
},
{
"epoch": 0.1770573566084788,
"grad_norm": 0.9219420552253723,
"learning_rate": 8.64321608040201e-06,
"loss": 0.9007,
"step": 142
},
{
"epoch": 0.17830423940149626,
"grad_norm": 0.8741053342819214,
"learning_rate": 8.63316582914573e-06,
"loss": 0.8609,
"step": 143
},
{
"epoch": 0.17955112219451372,
"grad_norm": 0.8802844882011414,
"learning_rate": 8.623115577889448e-06,
"loss": 0.8339,
"step": 144
},
{
"epoch": 0.18079800498753118,
"grad_norm": 0.9138995409011841,
"learning_rate": 8.613065326633167e-06,
"loss": 0.9333,
"step": 145
},
{
"epoch": 0.18204488778054864,
"grad_norm": 0.9759301543235779,
"learning_rate": 8.603015075376884e-06,
"loss": 0.9451,
"step": 146
},
{
"epoch": 0.18329177057356608,
"grad_norm": 0.9661501049995422,
"learning_rate": 8.592964824120603e-06,
"loss": 1.0628,
"step": 147
},
{
"epoch": 0.18453865336658354,
"grad_norm": 0.868390679359436,
"learning_rate": 8.582914572864322e-06,
"loss": 0.8716,
"step": 148
},
{
"epoch": 0.185785536159601,
"grad_norm": 0.9796278476715088,
"learning_rate": 8.572864321608041e-06,
"loss": 1.1557,
"step": 149
},
{
"epoch": 0.18703241895261846,
"grad_norm": 0.9851312041282654,
"learning_rate": 8.56281407035176e-06,
"loss": 1.1572,
"step": 150
},
{
"epoch": 0.1882793017456359,
"grad_norm": 0.9189980626106262,
"learning_rate": 8.552763819095477e-06,
"loss": 0.8847,
"step": 151
},
{
"epoch": 0.18952618453865336,
"grad_norm": 1.0895061492919922,
"learning_rate": 8.542713567839196e-06,
"loss": 1.0452,
"step": 152
},
{
"epoch": 0.19077306733167082,
"grad_norm": 0.869541347026825,
"learning_rate": 8.532663316582915e-06,
"loss": 0.8575,
"step": 153
},
{
"epoch": 0.19201995012468828,
"grad_norm": 0.9798868298530579,
"learning_rate": 8.522613065326634e-06,
"loss": 1.049,
"step": 154
},
{
"epoch": 0.19326683291770574,
"grad_norm": 0.866492509841919,
"learning_rate": 8.512562814070352e-06,
"loss": 0.9961,
"step": 155
},
{
"epoch": 0.19451371571072318,
"grad_norm": 0.9212818741798401,
"learning_rate": 8.50251256281407e-06,
"loss": 0.8827,
"step": 156
},
{
"epoch": 0.19576059850374064,
"grad_norm": 0.9386404156684875,
"learning_rate": 8.49246231155779e-06,
"loss": 0.9415,
"step": 157
},
{
"epoch": 0.1970074812967581,
"grad_norm": 0.8929441571235657,
"learning_rate": 8.482412060301509e-06,
"loss": 0.9375,
"step": 158
},
{
"epoch": 0.19825436408977556,
"grad_norm": 0.8426975011825562,
"learning_rate": 8.472361809045226e-06,
"loss": 0.8832,
"step": 159
},
{
"epoch": 0.19950124688279303,
"grad_norm": 0.9047837257385254,
"learning_rate": 8.462311557788947e-06,
"loss": 0.9897,
"step": 160
},
{
"epoch": 0.20074812967581046,
"grad_norm": 0.9232057929039001,
"learning_rate": 8.452261306532664e-06,
"loss": 0.8688,
"step": 161
},
{
"epoch": 0.20199501246882792,
"grad_norm": 0.9078006744384766,
"learning_rate": 8.442211055276383e-06,
"loss": 0.9336,
"step": 162
},
{
"epoch": 0.20324189526184538,
"grad_norm": 1.0887279510498047,
"learning_rate": 8.4321608040201e-06,
"loss": 1.1665,
"step": 163
},
{
"epoch": 0.20448877805486285,
"grad_norm": 0.9234921336174011,
"learning_rate": 8.42211055276382e-06,
"loss": 0.9505,
"step": 164
},
{
"epoch": 0.2057356608478803,
"grad_norm": 1.1069387197494507,
"learning_rate": 8.412060301507538e-06,
"loss": 1.1186,
"step": 165
},
{
"epoch": 0.20698254364089774,
"grad_norm": 0.9209922552108765,
"learning_rate": 8.402010050251257e-06,
"loss": 0.9881,
"step": 166
},
{
"epoch": 0.2082294264339152,
"grad_norm": 0.981281578540802,
"learning_rate": 8.391959798994976e-06,
"loss": 0.994,
"step": 167
},
{
"epoch": 0.20947630922693267,
"grad_norm": 0.9972983002662659,
"learning_rate": 8.381909547738695e-06,
"loss": 1.1614,
"step": 168
},
{
"epoch": 0.21072319201995013,
"grad_norm": 0.8576176166534424,
"learning_rate": 8.371859296482412e-06,
"loss": 0.9102,
"step": 169
},
{
"epoch": 0.2119700748129676,
"grad_norm": 1.0340651273727417,
"learning_rate": 8.361809045226131e-06,
"loss": 1.0294,
"step": 170
},
{
"epoch": 0.21321695760598502,
"grad_norm": 0.8626962900161743,
"learning_rate": 8.35175879396985e-06,
"loss": 0.9308,
"step": 171
},
{
"epoch": 0.2144638403990025,
"grad_norm": 0.9029504656791687,
"learning_rate": 8.341708542713568e-06,
"loss": 1.0267,
"step": 172
},
{
"epoch": 0.21571072319201995,
"grad_norm": 0.9364102482795715,
"learning_rate": 8.331658291457287e-06,
"loss": 0.9364,
"step": 173
},
{
"epoch": 0.2169576059850374,
"grad_norm": 0.9712755680084229,
"learning_rate": 8.321608040201006e-06,
"loss": 1.0915,
"step": 174
},
{
"epoch": 0.21820448877805487,
"grad_norm": 0.895484447479248,
"learning_rate": 8.311557788944725e-06,
"loss": 0.9385,
"step": 175
},
{
"epoch": 0.2194513715710723,
"grad_norm": 1.0257028341293335,
"learning_rate": 8.301507537688442e-06,
"loss": 1.0074,
"step": 176
},
{
"epoch": 0.22069825436408977,
"grad_norm": 0.9275676012039185,
"learning_rate": 8.291457286432163e-06,
"loss": 0.934,
"step": 177
},
{
"epoch": 0.22194513715710723,
"grad_norm": 1.0876961946487427,
"learning_rate": 8.28140703517588e-06,
"loss": 0.9464,
"step": 178
},
{
"epoch": 0.2231920199501247,
"grad_norm": 1.0299268960952759,
"learning_rate": 8.271356783919599e-06,
"loss": 1.0997,
"step": 179
},
{
"epoch": 0.22443890274314215,
"grad_norm": 1.0264921188354492,
"learning_rate": 8.261306532663316e-06,
"loss": 1.1708,
"step": 180
},
{
"epoch": 0.2256857855361596,
"grad_norm": 1.0043810606002808,
"learning_rate": 8.251256281407037e-06,
"loss": 0.9285,
"step": 181
},
{
"epoch": 0.22693266832917705,
"grad_norm": 0.9285091161727905,
"learning_rate": 8.241206030150754e-06,
"loss": 0.9953,
"step": 182
},
{
"epoch": 0.2281795511221945,
"grad_norm": 1.0979065895080566,
"learning_rate": 8.231155778894473e-06,
"loss": 1.1952,
"step": 183
},
{
"epoch": 0.22942643391521197,
"grad_norm": 0.9747474789619446,
"learning_rate": 8.221105527638192e-06,
"loss": 0.9779,
"step": 184
},
{
"epoch": 0.23067331670822944,
"grad_norm": 0.920181393623352,
"learning_rate": 8.211055276381911e-06,
"loss": 0.8476,
"step": 185
},
{
"epoch": 0.23192019950124687,
"grad_norm": 0.9287520051002502,
"learning_rate": 8.201005025125628e-06,
"loss": 1.0632,
"step": 186
},
{
"epoch": 0.23316708229426433,
"grad_norm": 1.055479645729065,
"learning_rate": 8.190954773869347e-06,
"loss": 1.0071,
"step": 187
},
{
"epoch": 0.2344139650872818,
"grad_norm": 1.0627549886703491,
"learning_rate": 8.180904522613066e-06,
"loss": 0.9392,
"step": 188
},
{
"epoch": 0.23566084788029926,
"grad_norm": 0.9968197345733643,
"learning_rate": 8.170854271356785e-06,
"loss": 1.0351,
"step": 189
},
{
"epoch": 0.23690773067331672,
"grad_norm": 0.9990450739860535,
"learning_rate": 8.160804020100503e-06,
"loss": 0.9192,
"step": 190
},
{
"epoch": 0.23815461346633415,
"grad_norm": 0.9214025139808655,
"learning_rate": 8.150753768844222e-06,
"loss": 0.7776,
"step": 191
},
{
"epoch": 0.23940149625935161,
"grad_norm": 0.9826399683952332,
"learning_rate": 8.14070351758794e-06,
"loss": 0.995,
"step": 192
},
{
"epoch": 0.24064837905236908,
"grad_norm": 0.971366822719574,
"learning_rate": 8.130653266331658e-06,
"loss": 1.0092,
"step": 193
},
{
"epoch": 0.24189526184538654,
"grad_norm": 0.9440286159515381,
"learning_rate": 8.120603015075379e-06,
"loss": 0.9745,
"step": 194
},
{
"epoch": 0.243142144638404,
"grad_norm": 0.9012700915336609,
"learning_rate": 8.110552763819096e-06,
"loss": 0.9489,
"step": 195
},
{
"epoch": 0.24438902743142144,
"grad_norm": 0.9466804265975952,
"learning_rate": 8.100502512562815e-06,
"loss": 1.0844,
"step": 196
},
{
"epoch": 0.2456359102244389,
"grad_norm": 0.9686530232429504,
"learning_rate": 8.090452261306532e-06,
"loss": 1.0657,
"step": 197
},
{
"epoch": 0.24688279301745636,
"grad_norm": 0.8855727314949036,
"learning_rate": 8.080402010050253e-06,
"loss": 0.8664,
"step": 198
},
{
"epoch": 0.24812967581047382,
"grad_norm": 0.9230241775512695,
"learning_rate": 8.07035175879397e-06,
"loss": 0.8856,
"step": 199
},
{
"epoch": 0.24937655860349128,
"grad_norm": 0.9400664567947388,
"learning_rate": 8.060301507537689e-06,
"loss": 1.0216,
"step": 200
},
{
"epoch": 0.2506234413965087,
"grad_norm": 0.8937174081802368,
"learning_rate": 8.050251256281408e-06,
"loss": 1.0588,
"step": 201
},
{
"epoch": 0.2518703241895262,
"grad_norm": 1.0428383350372314,
"learning_rate": 8.040201005025127e-06,
"loss": 1.0089,
"step": 202
},
{
"epoch": 0.25311720698254364,
"grad_norm": 1.0297296047210693,
"learning_rate": 8.030150753768844e-06,
"loss": 0.9482,
"step": 203
},
{
"epoch": 0.2543640897755611,
"grad_norm": 0.8881064653396606,
"learning_rate": 8.020100502512563e-06,
"loss": 0.7866,
"step": 204
},
{
"epoch": 0.25561097256857856,
"grad_norm": 0.9601223468780518,
"learning_rate": 8.010050251256282e-06,
"loss": 0.9292,
"step": 205
},
{
"epoch": 0.256857855361596,
"grad_norm": 0.9894243478775024,
"learning_rate": 8.000000000000001e-06,
"loss": 1.041,
"step": 206
},
{
"epoch": 0.2581047381546135,
"grad_norm": 0.9173250198364258,
"learning_rate": 7.989949748743719e-06,
"loss": 0.9038,
"step": 207
},
{
"epoch": 0.2593516209476309,
"grad_norm": 0.9618817567825317,
"learning_rate": 7.979899497487438e-06,
"loss": 0.9921,
"step": 208
},
{
"epoch": 0.26059850374064836,
"grad_norm": 0.9067187309265137,
"learning_rate": 7.969849246231157e-06,
"loss": 0.9796,
"step": 209
},
{
"epoch": 0.26184538653366585,
"grad_norm": 1.0154800415039062,
"learning_rate": 7.959798994974876e-06,
"loss": 0.9552,
"step": 210
},
{
"epoch": 0.2630922693266833,
"grad_norm": 1.0074129104614258,
"learning_rate": 7.949748743718595e-06,
"loss": 0.937,
"step": 211
},
{
"epoch": 0.26433915211970077,
"grad_norm": 0.8649031519889832,
"learning_rate": 7.939698492462312e-06,
"loss": 0.8111,
"step": 212
},
{
"epoch": 0.2655860349127182,
"grad_norm": 0.9405893087387085,
"learning_rate": 7.929648241206031e-06,
"loss": 0.9264,
"step": 213
},
{
"epoch": 0.26683291770573564,
"grad_norm": 0.8886278867721558,
"learning_rate": 7.91959798994975e-06,
"loss": 0.9354,
"step": 214
},
{
"epoch": 0.26807980049875313,
"grad_norm": 0.9831581711769104,
"learning_rate": 7.909547738693469e-06,
"loss": 0.9102,
"step": 215
},
{
"epoch": 0.26932668329177056,
"grad_norm": 0.927852988243103,
"learning_rate": 7.899497487437186e-06,
"loss": 0.9489,
"step": 216
},
{
"epoch": 0.27057356608478805,
"grad_norm": 0.8811565041542053,
"learning_rate": 7.889447236180905e-06,
"loss": 0.879,
"step": 217
},
{
"epoch": 0.2718204488778055,
"grad_norm": 0.9356923699378967,
"learning_rate": 7.879396984924622e-06,
"loss": 1.0154,
"step": 218
},
{
"epoch": 0.2730673316708229,
"grad_norm": 0.8990209698677063,
"learning_rate": 7.869346733668343e-06,
"loss": 0.744,
"step": 219
},
{
"epoch": 0.2743142144638404,
"grad_norm": 0.8560864329338074,
"learning_rate": 7.85929648241206e-06,
"loss": 0.9409,
"step": 220
},
{
"epoch": 0.27556109725685785,
"grad_norm": 0.9691808223724365,
"learning_rate": 7.84924623115578e-06,
"loss": 0.9914,
"step": 221
},
{
"epoch": 0.27680798004987534,
"grad_norm": 0.9157412648200989,
"learning_rate": 7.839195979899498e-06,
"loss": 0.9293,
"step": 222
},
{
"epoch": 0.27805486284289277,
"grad_norm": 0.9317874312400818,
"learning_rate": 7.829145728643217e-06,
"loss": 0.974,
"step": 223
},
{
"epoch": 0.2793017456359102,
"grad_norm": 0.9577739834785461,
"learning_rate": 7.819095477386935e-06,
"loss": 0.9103,
"step": 224
},
{
"epoch": 0.2805486284289277,
"grad_norm": 1.081833839416504,
"learning_rate": 7.809045226130654e-06,
"loss": 1.0984,
"step": 225
},
{
"epoch": 0.2817955112219451,
"grad_norm": 1.0292571783065796,
"learning_rate": 7.798994974874373e-06,
"loss": 1.0348,
"step": 226
},
{
"epoch": 0.2830423940149626,
"grad_norm": 0.9041290283203125,
"learning_rate": 7.788944723618092e-06,
"loss": 1.0038,
"step": 227
},
{
"epoch": 0.28428927680798005,
"grad_norm": 1.130258560180664,
"learning_rate": 7.77889447236181e-06,
"loss": 0.8911,
"step": 228
},
{
"epoch": 0.2855361596009975,
"grad_norm": 0.9539062976837158,
"learning_rate": 7.768844221105528e-06,
"loss": 0.947,
"step": 229
},
{
"epoch": 0.286783042394015,
"grad_norm": 1.0099889039993286,
"learning_rate": 7.758793969849247e-06,
"loss": 0.9407,
"step": 230
},
{
"epoch": 0.2880299251870324,
"grad_norm": 0.9209743142127991,
"learning_rate": 7.748743718592966e-06,
"loss": 0.8845,
"step": 231
},
{
"epoch": 0.2892768079800499,
"grad_norm": 1.0240118503570557,
"learning_rate": 7.738693467336685e-06,
"loss": 0.9787,
"step": 232
},
{
"epoch": 0.29052369077306733,
"grad_norm": 0.9892071485519409,
"learning_rate": 7.728643216080402e-06,
"loss": 1.0175,
"step": 233
},
{
"epoch": 0.29177057356608477,
"grad_norm": 1.0961612462997437,
"learning_rate": 7.718592964824121e-06,
"loss": 1.1846,
"step": 234
},
{
"epoch": 0.29301745635910226,
"grad_norm": 0.9142261743545532,
"learning_rate": 7.70854271356784e-06,
"loss": 0.7115,
"step": 235
},
{
"epoch": 0.2942643391521197,
"grad_norm": 0.9984656572341919,
"learning_rate": 7.698492462311559e-06,
"loss": 1.0165,
"step": 236
},
{
"epoch": 0.2955112219451372,
"grad_norm": 1.1105763912200928,
"learning_rate": 7.688442211055276e-06,
"loss": 1.0942,
"step": 237
},
{
"epoch": 0.2967581047381546,
"grad_norm": 0.8985201120376587,
"learning_rate": 7.678391959798995e-06,
"loss": 1.0469,
"step": 238
},
{
"epoch": 0.29800498753117205,
"grad_norm": 0.9157400727272034,
"learning_rate": 7.668341708542714e-06,
"loss": 0.9511,
"step": 239
},
{
"epoch": 0.29925187032418954,
"grad_norm": 0.9150097966194153,
"learning_rate": 7.658291457286433e-06,
"loss": 0.8733,
"step": 240
},
{
"epoch": 0.300498753117207,
"grad_norm": 0.9809494614601135,
"learning_rate": 7.64824120603015e-06,
"loss": 0.8775,
"step": 241
},
{
"epoch": 0.30174563591022446,
"grad_norm": 1.1237106323242188,
"learning_rate": 7.63819095477387e-06,
"loss": 1.028,
"step": 242
},
{
"epoch": 0.3029925187032419,
"grad_norm": 1.0430853366851807,
"learning_rate": 7.628140703517588e-06,
"loss": 0.9584,
"step": 243
},
{
"epoch": 0.30423940149625933,
"grad_norm": 1.0607941150665283,
"learning_rate": 7.618090452261308e-06,
"loss": 1.0414,
"step": 244
},
{
"epoch": 0.3054862842892768,
"grad_norm": 0.9155808091163635,
"learning_rate": 7.608040201005026e-06,
"loss": 0.8695,
"step": 245
},
{
"epoch": 0.30673316708229426,
"grad_norm": 0.8979218006134033,
"learning_rate": 7.597989949748744e-06,
"loss": 0.7672,
"step": 246
},
{
"epoch": 0.30798004987531175,
"grad_norm": 0.9346151351928711,
"learning_rate": 7.587939698492463e-06,
"loss": 0.8253,
"step": 247
},
{
"epoch": 0.3092269326683292,
"grad_norm": 1.0465093851089478,
"learning_rate": 7.577889447236182e-06,
"loss": 0.9914,
"step": 248
},
{
"epoch": 0.3104738154613466,
"grad_norm": 0.9367716312408447,
"learning_rate": 7.5678391959799e-06,
"loss": 0.8551,
"step": 249
},
{
"epoch": 0.3117206982543641,
"grad_norm": 0.9532157778739929,
"learning_rate": 7.557788944723619e-06,
"loss": 0.8579,
"step": 250
},
{
"epoch": 0.31296758104738154,
"grad_norm": 0.9339020252227783,
"learning_rate": 7.547738693467337e-06,
"loss": 0.8881,
"step": 251
},
{
"epoch": 0.314214463840399,
"grad_norm": 0.8677574396133423,
"learning_rate": 7.537688442211056e-06,
"loss": 0.8123,
"step": 252
},
{
"epoch": 0.31546134663341646,
"grad_norm": 0.8985550999641418,
"learning_rate": 7.527638190954774e-06,
"loss": 0.9194,
"step": 253
},
{
"epoch": 0.3167082294264339,
"grad_norm": 0.9129196405410767,
"learning_rate": 7.517587939698493e-06,
"loss": 1.0114,
"step": 254
},
{
"epoch": 0.3179551122194514,
"grad_norm": 0.9237337112426758,
"learning_rate": 7.507537688442211e-06,
"loss": 0.8201,
"step": 255
},
{
"epoch": 0.3192019950124688,
"grad_norm": 0.9522047638893127,
"learning_rate": 7.49748743718593e-06,
"loss": 0.8514,
"step": 256
},
{
"epoch": 0.3204488778054863,
"grad_norm": 0.9452963471412659,
"learning_rate": 7.487437185929649e-06,
"loss": 0.8917,
"step": 257
},
{
"epoch": 0.32169576059850374,
"grad_norm": 1.0754653215408325,
"learning_rate": 7.4773869346733675e-06,
"loss": 1.0405,
"step": 258
},
{
"epoch": 0.3229426433915212,
"grad_norm": 0.9160327911376953,
"learning_rate": 7.467336683417086e-06,
"loss": 0.6823,
"step": 259
},
{
"epoch": 0.32418952618453867,
"grad_norm": 1.0155515670776367,
"learning_rate": 7.4572864321608055e-06,
"loss": 0.906,
"step": 260
},
{
"epoch": 0.3254364089775561,
"grad_norm": 0.9582393765449524,
"learning_rate": 7.447236180904524e-06,
"loss": 0.9716,
"step": 261
},
{
"epoch": 0.3266832917705736,
"grad_norm": 0.9741798043251038,
"learning_rate": 7.437185929648242e-06,
"loss": 0.7866,
"step": 262
},
{
"epoch": 0.327930174563591,
"grad_norm": 0.9519482254981995,
"learning_rate": 7.42713567839196e-06,
"loss": 0.7525,
"step": 263
},
{
"epoch": 0.32917705735660846,
"grad_norm": 1.07245671749115,
"learning_rate": 7.417085427135679e-06,
"loss": 0.9398,
"step": 264
},
{
"epoch": 0.33042394014962595,
"grad_norm": 1.0767868757247925,
"learning_rate": 7.407035175879398e-06,
"loss": 0.8896,
"step": 265
},
{
"epoch": 0.3316708229426434,
"grad_norm": 0.9522539377212524,
"learning_rate": 7.396984924623116e-06,
"loss": 0.8058,
"step": 266
},
{
"epoch": 0.3329177057356609,
"grad_norm": 1.0144683122634888,
"learning_rate": 7.386934673366835e-06,
"loss": 0.9157,
"step": 267
},
{
"epoch": 0.3341645885286783,
"grad_norm": 1.0524057149887085,
"learning_rate": 7.376884422110553e-06,
"loss": 1.0011,
"step": 268
},
{
"epoch": 0.33541147132169574,
"grad_norm": 0.8897056579589844,
"learning_rate": 7.366834170854272e-06,
"loss": 0.8157,
"step": 269
},
{
"epoch": 0.33665835411471323,
"grad_norm": 0.9304531216621399,
"learning_rate": 7.35678391959799e-06,
"loss": 1.0127,
"step": 270
},
{
"epoch": 0.33790523690773067,
"grad_norm": 0.9404273629188538,
"learning_rate": 7.346733668341709e-06,
"loss": 0.7511,
"step": 271
},
{
"epoch": 0.33915211970074816,
"grad_norm": 0.8715949058532715,
"learning_rate": 7.336683417085427e-06,
"loss": 0.8367,
"step": 272
},
{
"epoch": 0.3403990024937656,
"grad_norm": 0.9307643175125122,
"learning_rate": 7.326633165829146e-06,
"loss": 0.7443,
"step": 273
},
{
"epoch": 0.341645885286783,
"grad_norm": 0.9046649932861328,
"learning_rate": 7.316582914572865e-06,
"loss": 0.8355,
"step": 274
},
{
"epoch": 0.3428927680798005,
"grad_norm": 0.9491796493530273,
"learning_rate": 7.3065326633165835e-06,
"loss": 1.0413,
"step": 275
},
{
"epoch": 0.34413965087281795,
"grad_norm": 0.9755702018737793,
"learning_rate": 7.296482412060302e-06,
"loss": 0.8975,
"step": 276
},
{
"epoch": 0.34538653366583544,
"grad_norm": 0.9344161748886108,
"learning_rate": 7.2864321608040215e-06,
"loss": 0.9113,
"step": 277
},
{
"epoch": 0.34663341645885287,
"grad_norm": 0.9998181462287903,
"learning_rate": 7.27638190954774e-06,
"loss": 0.7695,
"step": 278
},
{
"epoch": 0.3478802992518703,
"grad_norm": 0.9748741984367371,
"learning_rate": 7.266331658291458e-06,
"loss": 0.9033,
"step": 279
},
{
"epoch": 0.3491271820448878,
"grad_norm": 1.0220352411270142,
"learning_rate": 7.256281407035176e-06,
"loss": 0.8415,
"step": 280
},
{
"epoch": 0.35037406483790523,
"grad_norm": 0.8541595935821533,
"learning_rate": 7.246231155778896e-06,
"loss": 0.8744,
"step": 281
},
{
"epoch": 0.3516209476309227,
"grad_norm": 0.9145262241363525,
"learning_rate": 7.236180904522614e-06,
"loss": 0.9193,
"step": 282
},
{
"epoch": 0.35286783042394015,
"grad_norm": 1.0143400430679321,
"learning_rate": 7.226130653266332e-06,
"loss": 0.6842,
"step": 283
},
{
"epoch": 0.3541147132169576,
"grad_norm": 0.9740715026855469,
"learning_rate": 7.21608040201005e-06,
"loss": 0.7454,
"step": 284
},
{
"epoch": 0.3553615960099751,
"grad_norm": 1.1085376739501953,
"learning_rate": 7.206030150753769e-06,
"loss": 0.8096,
"step": 285
},
{
"epoch": 0.3566084788029925,
"grad_norm": 1.0065027475357056,
"learning_rate": 7.195979899497488e-06,
"loss": 0.8284,
"step": 286
},
{
"epoch": 0.35785536159601,
"grad_norm": 0.9899936318397522,
"learning_rate": 7.185929648241206e-06,
"loss": 0.7753,
"step": 287
},
{
"epoch": 0.35910224438902744,
"grad_norm": 0.9796255230903625,
"learning_rate": 7.175879396984925e-06,
"loss": 0.8388,
"step": 288
},
{
"epoch": 0.36034912718204487,
"grad_norm": 0.9738274216651917,
"learning_rate": 7.165829145728643e-06,
"loss": 0.8133,
"step": 289
},
{
"epoch": 0.36159600997506236,
"grad_norm": 1.0395734310150146,
"learning_rate": 7.155778894472362e-06,
"loss": 0.9888,
"step": 290
},
{
"epoch": 0.3628428927680798,
"grad_norm": 1.0151019096374512,
"learning_rate": 7.145728643216081e-06,
"loss": 0.7418,
"step": 291
},
{
"epoch": 0.3640897755610973,
"grad_norm": 0.9445755481719971,
"learning_rate": 7.1356783919597995e-06,
"loss": 0.7979,
"step": 292
},
{
"epoch": 0.3653366583541147,
"grad_norm": 0.9288540482521057,
"learning_rate": 7.125628140703518e-06,
"loss": 0.7825,
"step": 293
},
{
"epoch": 0.36658354114713215,
"grad_norm": 1.0280643701553345,
"learning_rate": 7.1155778894472375e-06,
"loss": 0.9515,
"step": 294
},
{
"epoch": 0.36783042394014964,
"grad_norm": 0.9743002653121948,
"learning_rate": 7.105527638190956e-06,
"loss": 0.8183,
"step": 295
},
{
"epoch": 0.3690773067331671,
"grad_norm": 1.041757345199585,
"learning_rate": 7.095477386934674e-06,
"loss": 0.7621,
"step": 296
},
{
"epoch": 0.37032418952618457,
"grad_norm": 1.2918349504470825,
"learning_rate": 7.085427135678392e-06,
"loss": 1.1412,
"step": 297
},
{
"epoch": 0.371571072319202,
"grad_norm": 1.1440565586090088,
"learning_rate": 7.075376884422112e-06,
"loss": 1.0236,
"step": 298
},
{
"epoch": 0.37281795511221943,
"grad_norm": 1.126747488975525,
"learning_rate": 7.06532663316583e-06,
"loss": 1.0067,
"step": 299
},
{
"epoch": 0.3740648379052369,
"grad_norm": 0.9919477105140686,
"learning_rate": 7.055276381909548e-06,
"loss": 0.8802,
"step": 300
},
{
"epoch": 0.37531172069825436,
"grad_norm": 1.1108204126358032,
"learning_rate": 7.045226130653266e-06,
"loss": 0.867,
"step": 301
},
{
"epoch": 0.3765586034912718,
"grad_norm": 0.9554193615913391,
"learning_rate": 7.035175879396986e-06,
"loss": 0.809,
"step": 302
},
{
"epoch": 0.3778054862842893,
"grad_norm": 0.9657833576202393,
"learning_rate": 7.025125628140704e-06,
"loss": 0.8409,
"step": 303
},
{
"epoch": 0.3790523690773067,
"grad_norm": 1.0594302415847778,
"learning_rate": 7.015075376884422e-06,
"loss": 0.9006,
"step": 304
},
{
"epoch": 0.3802992518703242,
"grad_norm": 1.049154281616211,
"learning_rate": 7.005025125628141e-06,
"loss": 0.8555,
"step": 305
},
{
"epoch": 0.38154613466334164,
"grad_norm": 1.096117377281189,
"learning_rate": 6.99497487437186e-06,
"loss": 0.8898,
"step": 306
},
{
"epoch": 0.3827930174563591,
"grad_norm": 1.090014934539795,
"learning_rate": 6.984924623115578e-06,
"loss": 0.905,
"step": 307
},
{
"epoch": 0.38403990024937656,
"grad_norm": 1.0771458148956299,
"learning_rate": 6.974874371859297e-06,
"loss": 0.7606,
"step": 308
},
{
"epoch": 0.385286783042394,
"grad_norm": 0.9209988713264465,
"learning_rate": 6.9648241206030155e-06,
"loss": 0.8825,
"step": 309
},
{
"epoch": 0.3865336658354115,
"grad_norm": 0.9999722242355347,
"learning_rate": 6.954773869346734e-06,
"loss": 0.7956,
"step": 310
},
{
"epoch": 0.3877805486284289,
"grad_norm": 1.006635069847107,
"learning_rate": 6.9447236180904535e-06,
"loss": 0.8601,
"step": 311
},
{
"epoch": 0.38902743142144636,
"grad_norm": 1.0573351383209229,
"learning_rate": 6.934673366834172e-06,
"loss": 1.0764,
"step": 312
},
{
"epoch": 0.39027431421446385,
"grad_norm": 1.0885146856307983,
"learning_rate": 6.92462311557789e-06,
"loss": 0.897,
"step": 313
},
{
"epoch": 0.3915211970074813,
"grad_norm": 0.9416261911392212,
"learning_rate": 6.914572864321608e-06,
"loss": 0.8288,
"step": 314
},
{
"epoch": 0.39276807980049877,
"grad_norm": 1.2340617179870605,
"learning_rate": 6.904522613065328e-06,
"loss": 1.1464,
"step": 315
},
{
"epoch": 0.3940149625935162,
"grad_norm": 0.9569803476333618,
"learning_rate": 6.894472361809046e-06,
"loss": 0.7072,
"step": 316
},
{
"epoch": 0.39526184538653364,
"grad_norm": 1.1045235395431519,
"learning_rate": 6.884422110552764e-06,
"loss": 0.9001,
"step": 317
},
{
"epoch": 0.39650872817955113,
"grad_norm": 0.9991887211799622,
"learning_rate": 6.874371859296482e-06,
"loss": 0.8517,
"step": 318
},
{
"epoch": 0.39775561097256856,
"grad_norm": 1.051188588142395,
"learning_rate": 6.864321608040202e-06,
"loss": 0.8602,
"step": 319
},
{
"epoch": 0.39900249376558605,
"grad_norm": 1.045560598373413,
"learning_rate": 6.85427135678392e-06,
"loss": 0.8124,
"step": 320
},
{
"epoch": 0.4002493765586035,
"grad_norm": 0.9770002961158752,
"learning_rate": 6.844221105527638e-06,
"loss": 0.9043,
"step": 321
},
{
"epoch": 0.4014962593516209,
"grad_norm": 1.0477802753448486,
"learning_rate": 6.834170854271357e-06,
"loss": 0.8489,
"step": 322
},
{
"epoch": 0.4027431421446384,
"grad_norm": 0.9778210520744324,
"learning_rate": 6.824120603015076e-06,
"loss": 0.9421,
"step": 323
},
{
"epoch": 0.40399002493765584,
"grad_norm": 1.0833961963653564,
"learning_rate": 6.814070351758794e-06,
"loss": 0.766,
"step": 324
},
{
"epoch": 0.40523690773067333,
"grad_norm": 1.0004370212554932,
"learning_rate": 6.804020100502513e-06,
"loss": 0.8319,
"step": 325
},
{
"epoch": 0.40648379052369077,
"grad_norm": 1.0450783967971802,
"learning_rate": 6.7939698492462315e-06,
"loss": 0.8021,
"step": 326
},
{
"epoch": 0.4077306733167082,
"grad_norm": 1.0916366577148438,
"learning_rate": 6.7839195979899505e-06,
"loss": 1.0041,
"step": 327
},
{
"epoch": 0.4089775561097257,
"grad_norm": 0.9943376779556274,
"learning_rate": 6.7738693467336695e-06,
"loss": 0.7624,
"step": 328
},
{
"epoch": 0.4102244389027431,
"grad_norm": 0.9278281927108765,
"learning_rate": 6.763819095477388e-06,
"loss": 0.8407,
"step": 329
},
{
"epoch": 0.4114713216957606,
"grad_norm": 1.1130763292312622,
"learning_rate": 6.753768844221106e-06,
"loss": 0.8263,
"step": 330
},
{
"epoch": 0.41271820448877805,
"grad_norm": 1.0273207426071167,
"learning_rate": 6.743718592964824e-06,
"loss": 0.8396,
"step": 331
},
{
"epoch": 0.4139650872817955,
"grad_norm": 1.03300940990448,
"learning_rate": 6.733668341708544e-06,
"loss": 0.8851,
"step": 332
},
{
"epoch": 0.415211970074813,
"grad_norm": 0.9019345045089722,
"learning_rate": 6.723618090452262e-06,
"loss": 0.7098,
"step": 333
},
{
"epoch": 0.4164588528678304,
"grad_norm": 0.9586036801338196,
"learning_rate": 6.71356783919598e-06,
"loss": 0.7729,
"step": 334
},
{
"epoch": 0.4177057356608479,
"grad_norm": 1.0987576246261597,
"learning_rate": 6.703517587939698e-06,
"loss": 0.9049,
"step": 335
},
{
"epoch": 0.41895261845386533,
"grad_norm": 1.0312920808792114,
"learning_rate": 6.693467336683418e-06,
"loss": 0.9232,
"step": 336
},
{
"epoch": 0.42019950124688277,
"grad_norm": 0.8923942446708679,
"learning_rate": 6.683417085427136e-06,
"loss": 0.7087,
"step": 337
},
{
"epoch": 0.42144638403990026,
"grad_norm": 1.1419446468353271,
"learning_rate": 6.673366834170854e-06,
"loss": 1.057,
"step": 338
},
{
"epoch": 0.4226932668329177,
"grad_norm": 1.2685928344726562,
"learning_rate": 6.663316582914573e-06,
"loss": 0.9991,
"step": 339
},
{
"epoch": 0.4239401496259352,
"grad_norm": 0.9922438263893127,
"learning_rate": 6.653266331658292e-06,
"loss": 0.8301,
"step": 340
},
{
"epoch": 0.4251870324189526,
"grad_norm": 1.0687658786773682,
"learning_rate": 6.64321608040201e-06,
"loss": 1.0261,
"step": 341
},
{
"epoch": 0.42643391521197005,
"grad_norm": 1.0315366983413696,
"learning_rate": 6.633165829145729e-06,
"loss": 0.8198,
"step": 342
},
{
"epoch": 0.42768079800498754,
"grad_norm": 1.029801845550537,
"learning_rate": 6.6231155778894475e-06,
"loss": 0.8738,
"step": 343
},
{
"epoch": 0.428927680798005,
"grad_norm": 1.0056087970733643,
"learning_rate": 6.6130653266331665e-06,
"loss": 0.7687,
"step": 344
},
{
"epoch": 0.43017456359102246,
"grad_norm": 0.958608090877533,
"learning_rate": 6.6030150753768855e-06,
"loss": 0.7997,
"step": 345
},
{
"epoch": 0.4314214463840399,
"grad_norm": 1.0940543413162231,
"learning_rate": 6.592964824120604e-06,
"loss": 0.9541,
"step": 346
},
{
"epoch": 0.43266832917705733,
"grad_norm": 1.0316964387893677,
"learning_rate": 6.582914572864322e-06,
"loss": 0.9278,
"step": 347
},
{
"epoch": 0.4339152119700748,
"grad_norm": 1.2830352783203125,
"learning_rate": 6.572864321608042e-06,
"loss": 1.0554,
"step": 348
},
{
"epoch": 0.43516209476309226,
"grad_norm": 0.9945348501205444,
"learning_rate": 6.56281407035176e-06,
"loss": 0.7879,
"step": 349
},
{
"epoch": 0.43640897755610975,
"grad_norm": 1.0605803728103638,
"learning_rate": 6.552763819095478e-06,
"loss": 0.9316,
"step": 350
},
{
"epoch": 0.4376558603491272,
"grad_norm": 1.1072403192520142,
"learning_rate": 6.542713567839196e-06,
"loss": 0.9295,
"step": 351
},
{
"epoch": 0.4389027431421446,
"grad_norm": 1.1351176500320435,
"learning_rate": 6.532663316582916e-06,
"loss": 0.9723,
"step": 352
},
{
"epoch": 0.4401496259351621,
"grad_norm": 1.0387343168258667,
"learning_rate": 6.522613065326634e-06,
"loss": 0.7906,
"step": 353
},
{
"epoch": 0.44139650872817954,
"grad_norm": 1.0206201076507568,
"learning_rate": 6.512562814070352e-06,
"loss": 0.8631,
"step": 354
},
{
"epoch": 0.442643391521197,
"grad_norm": 0.9973576068878174,
"learning_rate": 6.50251256281407e-06,
"loss": 0.8078,
"step": 355
},
{
"epoch": 0.44389027431421446,
"grad_norm": 1.0855823755264282,
"learning_rate": 6.492462311557789e-06,
"loss": 1.0692,
"step": 356
},
{
"epoch": 0.4451371571072319,
"grad_norm": 1.0727882385253906,
"learning_rate": 6.482412060301508e-06,
"loss": 0.9002,
"step": 357
},
{
"epoch": 0.4463840399002494,
"grad_norm": 1.0234915018081665,
"learning_rate": 6.472361809045226e-06,
"loss": 0.9022,
"step": 358
},
{
"epoch": 0.4476309226932668,
"grad_norm": 1.0042359828948975,
"learning_rate": 6.462311557788945e-06,
"loss": 0.8119,
"step": 359
},
{
"epoch": 0.4488778054862843,
"grad_norm": 0.9693371057510376,
"learning_rate": 6.4522613065326635e-06,
"loss": 0.7467,
"step": 360
},
{
"epoch": 0.45012468827930174,
"grad_norm": 1.0523459911346436,
"learning_rate": 6.4422110552763825e-06,
"loss": 0.8219,
"step": 361
},
{
"epoch": 0.4513715710723192,
"grad_norm": 0.9389395713806152,
"learning_rate": 6.4321608040201015e-06,
"loss": 0.7628,
"step": 362
},
{
"epoch": 0.45261845386533667,
"grad_norm": 0.9867566823959351,
"learning_rate": 6.42211055276382e-06,
"loss": 0.7433,
"step": 363
},
{
"epoch": 0.4538653366583541,
"grad_norm": 1.0178903341293335,
"learning_rate": 6.412060301507538e-06,
"loss": 0.8021,
"step": 364
},
{
"epoch": 0.4551122194513716,
"grad_norm": 1.019600749015808,
"learning_rate": 6.402010050251258e-06,
"loss": 0.8883,
"step": 365
},
{
"epoch": 0.456359102244389,
"grad_norm": 1.0928678512573242,
"learning_rate": 6.391959798994976e-06,
"loss": 0.9591,
"step": 366
},
{
"epoch": 0.45760598503740646,
"grad_norm": 1.0122618675231934,
"learning_rate": 6.381909547738694e-06,
"loss": 0.7998,
"step": 367
},
{
"epoch": 0.45885286783042395,
"grad_norm": 1.0172462463378906,
"learning_rate": 6.371859296482412e-06,
"loss": 0.7293,
"step": 368
},
{
"epoch": 0.4600997506234414,
"grad_norm": 1.1669033765792847,
"learning_rate": 6.361809045226132e-06,
"loss": 0.9151,
"step": 369
},
{
"epoch": 0.4613466334164589,
"grad_norm": 1.0527026653289795,
"learning_rate": 6.35175879396985e-06,
"loss": 0.8687,
"step": 370
},
{
"epoch": 0.4625935162094763,
"grad_norm": 0.9200843572616577,
"learning_rate": 6.341708542713568e-06,
"loss": 0.6577,
"step": 371
},
{
"epoch": 0.46384039900249374,
"grad_norm": 1.089831829071045,
"learning_rate": 6.331658291457286e-06,
"loss": 0.9584,
"step": 372
},
{
"epoch": 0.46508728179551123,
"grad_norm": 1.0251846313476562,
"learning_rate": 6.321608040201006e-06,
"loss": 0.8594,
"step": 373
},
{
"epoch": 0.46633416458852867,
"grad_norm": 0.9644805788993835,
"learning_rate": 6.311557788944724e-06,
"loss": 0.8014,
"step": 374
},
{
"epoch": 0.46758104738154616,
"grad_norm": 1.163680911064148,
"learning_rate": 6.301507537688442e-06,
"loss": 0.8767,
"step": 375
},
{
"epoch": 0.4688279301745636,
"grad_norm": 0.9366633892059326,
"learning_rate": 6.291457286432161e-06,
"loss": 0.7751,
"step": 376
},
{
"epoch": 0.470074812967581,
"grad_norm": 0.9507472515106201,
"learning_rate": 6.28140703517588e-06,
"loss": 0.8121,
"step": 377
},
{
"epoch": 0.4713216957605985,
"grad_norm": 0.9354878067970276,
"learning_rate": 6.2713567839195985e-06,
"loss": 0.7697,
"step": 378
},
{
"epoch": 0.47256857855361595,
"grad_norm": 0.9445653557777405,
"learning_rate": 6.2613065326633175e-06,
"loss": 0.7846,
"step": 379
},
{
"epoch": 0.47381546134663344,
"grad_norm": 1.0566297769546509,
"learning_rate": 6.251256281407036e-06,
"loss": 0.838,
"step": 380
},
{
"epoch": 0.47506234413965087,
"grad_norm": 0.9639749526977539,
"learning_rate": 6.241206030150754e-06,
"loss": 0.7409,
"step": 381
},
{
"epoch": 0.4763092269326683,
"grad_norm": 0.9957520961761475,
"learning_rate": 6.231155778894474e-06,
"loss": 0.8416,
"step": 382
},
{
"epoch": 0.4775561097256858,
"grad_norm": 1.082114577293396,
"learning_rate": 6.221105527638192e-06,
"loss": 0.8489,
"step": 383
},
{
"epoch": 0.47880299251870323,
"grad_norm": 0.9237020611763,
"learning_rate": 6.21105527638191e-06,
"loss": 0.7696,
"step": 384
},
{
"epoch": 0.4800498753117207,
"grad_norm": 1.0793203115463257,
"learning_rate": 6.201005025125628e-06,
"loss": 0.8835,
"step": 385
},
{
"epoch": 0.48129675810473815,
"grad_norm": 0.9898436665534973,
"learning_rate": 6.190954773869348e-06,
"loss": 0.7432,
"step": 386
},
{
"epoch": 0.4825436408977556,
"grad_norm": 0.9922778010368347,
"learning_rate": 6.180904522613066e-06,
"loss": 0.8182,
"step": 387
},
{
"epoch": 0.4837905236907731,
"grad_norm": 0.9298007488250732,
"learning_rate": 6.170854271356784e-06,
"loss": 0.7286,
"step": 388
},
{
"epoch": 0.4850374064837905,
"grad_norm": 0.9403723478317261,
"learning_rate": 6.160804020100502e-06,
"loss": 0.7235,
"step": 389
},
{
"epoch": 0.486284289276808,
"grad_norm": 0.9329712986946106,
"learning_rate": 6.150753768844222e-06,
"loss": 0.7276,
"step": 390
},
{
"epoch": 0.48753117206982544,
"grad_norm": 1.0052450895309448,
"learning_rate": 6.14070351758794e-06,
"loss": 0.7996,
"step": 391
},
{
"epoch": 0.48877805486284287,
"grad_norm": 1.0098786354064941,
"learning_rate": 6.130653266331658e-06,
"loss": 0.9035,
"step": 392
},
{
"epoch": 0.49002493765586036,
"grad_norm": 1.0230143070220947,
"learning_rate": 6.120603015075377e-06,
"loss": 0.7571,
"step": 393
},
{
"epoch": 0.4912718204488778,
"grad_norm": 1.1021591424942017,
"learning_rate": 6.110552763819096e-06,
"loss": 0.7802,
"step": 394
},
{
"epoch": 0.4925187032418953,
"grad_norm": 1.154613971710205,
"learning_rate": 6.1005025125628145e-06,
"loss": 0.8112,
"step": 395
},
{
"epoch": 0.4937655860349127,
"grad_norm": 1.0493168830871582,
"learning_rate": 6.0904522613065335e-06,
"loss": 0.771,
"step": 396
},
{
"epoch": 0.49501246882793015,
"grad_norm": 1.172802209854126,
"learning_rate": 6.080402010050252e-06,
"loss": 0.8897,
"step": 397
},
{
"epoch": 0.49625935162094764,
"grad_norm": 1.0466126203536987,
"learning_rate": 6.070351758793971e-06,
"loss": 0.7515,
"step": 398
},
{
"epoch": 0.4975062344139651,
"grad_norm": 1.0226330757141113,
"learning_rate": 6.06030150753769e-06,
"loss": 0.7747,
"step": 399
},
{
"epoch": 0.49875311720698257,
"grad_norm": 1.0427227020263672,
"learning_rate": 6.050251256281408e-06,
"loss": 0.7962,
"step": 400
},
{
"epoch": 0.5,
"grad_norm": 1.104669451713562,
"learning_rate": 6.040201005025126e-06,
"loss": 0.9699,
"step": 401
},
{
"epoch": 0.5012468827930174,
"grad_norm": 1.0949004888534546,
"learning_rate": 6.030150753768844e-06,
"loss": 0.9231,
"step": 402
},
{
"epoch": 0.5024937655860349,
"grad_norm": 1.0837571620941162,
"learning_rate": 6.020100502512564e-06,
"loss": 0.9493,
"step": 403
},
{
"epoch": 0.5037406483790524,
"grad_norm": 1.4410837888717651,
"learning_rate": 6.010050251256282e-06,
"loss": 1.1473,
"step": 404
},
{
"epoch": 0.5049875311720698,
"grad_norm": 1.178643822669983,
"learning_rate": 6e-06,
"loss": 0.9554,
"step": 405
},
{
"epoch": 0.5062344139650873,
"grad_norm": 1.1893959045410156,
"learning_rate": 5.989949748743718e-06,
"loss": 0.8837,
"step": 406
},
{
"epoch": 0.5074812967581047,
"grad_norm": 1.5423134565353394,
"learning_rate": 5.979899497487438e-06,
"loss": 1.2201,
"step": 407
},
{
"epoch": 0.5087281795511222,
"grad_norm": 1.3641034364700317,
"learning_rate": 5.969849246231156e-06,
"loss": 1.0886,
"step": 408
},
{
"epoch": 0.5099750623441397,
"grad_norm": 1.4659836292266846,
"learning_rate": 5.959798994974874e-06,
"loss": 1.2189,
"step": 409
},
{
"epoch": 0.5112219451371571,
"grad_norm": 1.52463698387146,
"learning_rate": 5.949748743718593e-06,
"loss": 1.2287,
"step": 410
},
{
"epoch": 0.5124688279301746,
"grad_norm": 1.5492169857025146,
"learning_rate": 5.939698492462312e-06,
"loss": 1.2809,
"step": 411
},
{
"epoch": 0.513715710723192,
"grad_norm": 1.8154112100601196,
"learning_rate": 5.9296482412060305e-06,
"loss": 1.2805,
"step": 412
},
{
"epoch": 0.5149625935162094,
"grad_norm": 1.690354824066162,
"learning_rate": 5.9195979899497495e-06,
"loss": 1.3875,
"step": 413
},
{
"epoch": 0.516209476309227,
"grad_norm": 1.7147947549819946,
"learning_rate": 5.909547738693468e-06,
"loss": 1.363,
"step": 414
},
{
"epoch": 0.5174563591022444,
"grad_norm": 1.8920574188232422,
"learning_rate": 5.899497487437187e-06,
"loss": 1.4414,
"step": 415
},
{
"epoch": 0.5187032418952618,
"grad_norm": 1.6709070205688477,
"learning_rate": 5.889447236180905e-06,
"loss": 1.5424,
"step": 416
},
{
"epoch": 0.5199501246882793,
"grad_norm": 2.0101499557495117,
"learning_rate": 5.879396984924624e-06,
"loss": 1.551,
"step": 417
},
{
"epoch": 0.5211970074812967,
"grad_norm": 1.601006031036377,
"learning_rate": 5.869346733668342e-06,
"loss": 1.3572,
"step": 418
},
{
"epoch": 0.5224438902743143,
"grad_norm": 1.6006678342819214,
"learning_rate": 5.859296482412061e-06,
"loss": 1.2775,
"step": 419
},
{
"epoch": 0.5236907730673317,
"grad_norm": 1.3266113996505737,
"learning_rate": 5.84924623115578e-06,
"loss": 1.0825,
"step": 420
},
{
"epoch": 0.5249376558603491,
"grad_norm": 1.8975056409835815,
"learning_rate": 5.839195979899498e-06,
"loss": 1.3971,
"step": 421
},
{
"epoch": 0.5261845386533666,
"grad_norm": 1.5751831531524658,
"learning_rate": 5.829145728643216e-06,
"loss": 1.3774,
"step": 422
},
{
"epoch": 0.527431421446384,
"grad_norm": 1.7386391162872314,
"learning_rate": 5.819095477386936e-06,
"loss": 1.3489,
"step": 423
},
{
"epoch": 0.5286783042394015,
"grad_norm": 1.5445035696029663,
"learning_rate": 5.809045226130654e-06,
"loss": 1.4158,
"step": 424
},
{
"epoch": 0.529925187032419,
"grad_norm": 1.3711769580841064,
"learning_rate": 5.798994974874372e-06,
"loss": 1.2585,
"step": 425
},
{
"epoch": 0.5311720698254364,
"grad_norm": 1.5423415899276733,
"learning_rate": 5.78894472361809e-06,
"loss": 1.3683,
"step": 426
},
{
"epoch": 0.5324189526184538,
"grad_norm": 1.5914721488952637,
"learning_rate": 5.778894472361809e-06,
"loss": 1.255,
"step": 427
},
{
"epoch": 0.5336658354114713,
"grad_norm": 1.5025103092193604,
"learning_rate": 5.768844221105528e-06,
"loss": 1.2946,
"step": 428
},
{
"epoch": 0.5349127182044888,
"grad_norm": 1.662097454071045,
"learning_rate": 5.7587939698492465e-06,
"loss": 1.4691,
"step": 429
},
{
"epoch": 0.5361596009975063,
"grad_norm": 1.581234097480774,
"learning_rate": 5.7487437185929655e-06,
"loss": 1.6212,
"step": 430
},
{
"epoch": 0.5374064837905237,
"grad_norm": 1.2948893308639526,
"learning_rate": 5.738693467336684e-06,
"loss": 1.2803,
"step": 431
},
{
"epoch": 0.5386533665835411,
"grad_norm": 1.494753122329712,
"learning_rate": 5.728643216080403e-06,
"loss": 1.3346,
"step": 432
},
{
"epoch": 0.5399002493765586,
"grad_norm": 1.5078567266464233,
"learning_rate": 5.718592964824121e-06,
"loss": 1.3832,
"step": 433
},
{
"epoch": 0.5411471321695761,
"grad_norm": 1.5009632110595703,
"learning_rate": 5.70854271356784e-06,
"loss": 1.3823,
"step": 434
},
{
"epoch": 0.5423940149625935,
"grad_norm": 1.3894951343536377,
"learning_rate": 5.698492462311558e-06,
"loss": 1.2064,
"step": 435
},
{
"epoch": 0.543640897755611,
"grad_norm": 1.6113992929458618,
"learning_rate": 5.688442211055277e-06,
"loss": 1.2654,
"step": 436
},
{
"epoch": 0.5448877805486284,
"grad_norm": 1.2530064582824707,
"learning_rate": 5.678391959798996e-06,
"loss": 1.3238,
"step": 437
},
{
"epoch": 0.5461346633416458,
"grad_norm": 1.3698482513427734,
"learning_rate": 5.668341708542714e-06,
"loss": 1.2708,
"step": 438
},
{
"epoch": 0.5473815461346634,
"grad_norm": 1.546341896057129,
"learning_rate": 5.658291457286432e-06,
"loss": 1.3232,
"step": 439
},
{
"epoch": 0.5486284289276808,
"grad_norm": 1.5164636373519897,
"learning_rate": 5.648241206030152e-06,
"loss": 1.4437,
"step": 440
},
{
"epoch": 0.5498753117206983,
"grad_norm": 1.4249422550201416,
"learning_rate": 5.63819095477387e-06,
"loss": 1.3236,
"step": 441
},
{
"epoch": 0.5511221945137157,
"grad_norm": 1.3124446868896484,
"learning_rate": 5.628140703517588e-06,
"loss": 1.3035,
"step": 442
},
{
"epoch": 0.5523690773067331,
"grad_norm": 1.5412633419036865,
"learning_rate": 5.618090452261306e-06,
"loss": 1.4026,
"step": 443
},
{
"epoch": 0.5536159600997507,
"grad_norm": 1.53081214427948,
"learning_rate": 5.608040201005026e-06,
"loss": 1.2611,
"step": 444
},
{
"epoch": 0.5548628428927681,
"grad_norm": 1.1497313976287842,
"learning_rate": 5.597989949748744e-06,
"loss": 1.1849,
"step": 445
},
{
"epoch": 0.5561097256857855,
"grad_norm": 1.5226142406463623,
"learning_rate": 5.5879396984924625e-06,
"loss": 1.4378,
"step": 446
},
{
"epoch": 0.557356608478803,
"grad_norm": 1.5681039094924927,
"learning_rate": 5.577889447236181e-06,
"loss": 1.6733,
"step": 447
},
{
"epoch": 0.5586034912718204,
"grad_norm": 1.3931708335876465,
"learning_rate": 5.5678391959799e-06,
"loss": 1.3721,
"step": 448
},
{
"epoch": 0.559850374064838,
"grad_norm": 1.365352988243103,
"learning_rate": 5.557788944723619e-06,
"loss": 1.5183,
"step": 449
},
{
"epoch": 0.5610972568578554,
"grad_norm": 1.3289903402328491,
"learning_rate": 5.547738693467337e-06,
"loss": 1.5054,
"step": 450
},
{
"epoch": 0.5623441396508728,
"grad_norm": 1.2827537059783936,
"learning_rate": 5.537688442211056e-06,
"loss": 1.2045,
"step": 451
},
{
"epoch": 0.5635910224438903,
"grad_norm": 1.336918830871582,
"learning_rate": 5.527638190954774e-06,
"loss": 1.1425,
"step": 452
},
{
"epoch": 0.5648379052369077,
"grad_norm": 1.2265493869781494,
"learning_rate": 5.517587939698493e-06,
"loss": 1.2198,
"step": 453
},
{
"epoch": 0.5660847880299252,
"grad_norm": 1.4016344547271729,
"learning_rate": 5.507537688442212e-06,
"loss": 1.4,
"step": 454
},
{
"epoch": 0.5673316708229427,
"grad_norm": 1.3361009359359741,
"learning_rate": 5.49748743718593e-06,
"loss": 1.3654,
"step": 455
},
{
"epoch": 0.5685785536159601,
"grad_norm": 1.447028636932373,
"learning_rate": 5.487437185929648e-06,
"loss": 1.4682,
"step": 456
},
{
"epoch": 0.5698254364089775,
"grad_norm": 1.2176581621170044,
"learning_rate": 5.477386934673368e-06,
"loss": 1.2963,
"step": 457
},
{
"epoch": 0.571072319201995,
"grad_norm": 1.2136200666427612,
"learning_rate": 5.467336683417086e-06,
"loss": 1.2497,
"step": 458
},
{
"epoch": 0.5723192019950125,
"grad_norm": 1.420364260673523,
"learning_rate": 5.457286432160804e-06,
"loss": 1.2681,
"step": 459
},
{
"epoch": 0.57356608478803,
"grad_norm": 1.3635661602020264,
"learning_rate": 5.447236180904522e-06,
"loss": 1.3178,
"step": 460
},
{
"epoch": 0.5748129675810474,
"grad_norm": 1.3945159912109375,
"learning_rate": 5.437185929648242e-06,
"loss": 1.3192,
"step": 461
},
{
"epoch": 0.5760598503740648,
"grad_norm": 1.2764129638671875,
"learning_rate": 5.42713567839196e-06,
"loss": 1.4384,
"step": 462
},
{
"epoch": 0.5773067331670823,
"grad_norm": 1.2246918678283691,
"learning_rate": 5.4170854271356785e-06,
"loss": 1.1863,
"step": 463
},
{
"epoch": 0.5785536159600998,
"grad_norm": 1.3122367858886719,
"learning_rate": 5.407035175879397e-06,
"loss": 1.1194,
"step": 464
},
{
"epoch": 0.5798004987531172,
"grad_norm": 1.1648610830307007,
"learning_rate": 5.3969849246231165e-06,
"loss": 1.2397,
"step": 465
},
{
"epoch": 0.5810473815461347,
"grad_norm": 1.4122895002365112,
"learning_rate": 5.386934673366835e-06,
"loss": 1.5005,
"step": 466
},
{
"epoch": 0.5822942643391521,
"grad_norm": 1.1804543733596802,
"learning_rate": 5.376884422110553e-06,
"loss": 1.3592,
"step": 467
},
{
"epoch": 0.5835411471321695,
"grad_norm": 1.192355751991272,
"learning_rate": 5.366834170854272e-06,
"loss": 1.2864,
"step": 468
},
{
"epoch": 0.5847880299251871,
"grad_norm": 1.4562029838562012,
"learning_rate": 5.356783919597991e-06,
"loss": 1.5208,
"step": 469
},
{
"epoch": 0.5860349127182045,
"grad_norm": 1.123356580734253,
"learning_rate": 5.346733668341709e-06,
"loss": 1.1004,
"step": 470
},
{
"epoch": 0.587281795511222,
"grad_norm": 1.1915100812911987,
"learning_rate": 5.336683417085428e-06,
"loss": 1.2777,
"step": 471
},
{
"epoch": 0.5885286783042394,
"grad_norm": 1.2414828538894653,
"learning_rate": 5.326633165829146e-06,
"loss": 1.3355,
"step": 472
},
{
"epoch": 0.5897755610972568,
"grad_norm": 1.334938406944275,
"learning_rate": 5.316582914572864e-06,
"loss": 1.374,
"step": 473
},
{
"epoch": 0.5910224438902744,
"grad_norm": 1.1577836275100708,
"learning_rate": 5.306532663316584e-06,
"loss": 1.3759,
"step": 474
},
{
"epoch": 0.5922693266832918,
"grad_norm": 1.2841064929962158,
"learning_rate": 5.296482412060302e-06,
"loss": 1.3366,
"step": 475
},
{
"epoch": 0.5935162094763092,
"grad_norm": 1.5192992687225342,
"learning_rate": 5.28643216080402e-06,
"loss": 1.2881,
"step": 476
},
{
"epoch": 0.5947630922693267,
"grad_norm": 1.3636270761489868,
"learning_rate": 5.2763819095477384e-06,
"loss": 1.3256,
"step": 477
},
{
"epoch": 0.5960099750623441,
"grad_norm": 1.3187315464019775,
"learning_rate": 5.266331658291458e-06,
"loss": 1.6553,
"step": 478
},
{
"epoch": 0.5972568578553616,
"grad_norm": 1.5698240995407104,
"learning_rate": 5.256281407035176e-06,
"loss": 1.6228,
"step": 479
},
{
"epoch": 0.5985037406483791,
"grad_norm": 1.5735142230987549,
"learning_rate": 5.2462311557788945e-06,
"loss": 1.5562,
"step": 480
},
{
"epoch": 0.5997506234413965,
"grad_norm": 1.2758464813232422,
"learning_rate": 5.236180904522613e-06,
"loss": 1.3339,
"step": 481
},
{
"epoch": 0.600997506234414,
"grad_norm": 1.276182770729065,
"learning_rate": 5.2261306532663325e-06,
"loss": 1.3058,
"step": 482
},
{
"epoch": 0.6022443890274314,
"grad_norm": 1.3235769271850586,
"learning_rate": 5.216080402010051e-06,
"loss": 1.5037,
"step": 483
},
{
"epoch": 0.6034912718204489,
"grad_norm": 1.6739779710769653,
"learning_rate": 5.206030150753769e-06,
"loss": 1.6547,
"step": 484
},
{
"epoch": 0.6047381546134664,
"grad_norm": 1.2202621698379517,
"learning_rate": 5.195979899497488e-06,
"loss": 1.2164,
"step": 485
},
{
"epoch": 0.6059850374064838,
"grad_norm": 1.4694870710372925,
"learning_rate": 5.185929648241207e-06,
"loss": 1.469,
"step": 486
},
{
"epoch": 0.6072319201995012,
"grad_norm": 1.2256667613983154,
"learning_rate": 5.175879396984925e-06,
"loss": 1.4083,
"step": 487
},
{
"epoch": 0.6084788029925187,
"grad_norm": 1.1571601629257202,
"learning_rate": 5.165829145728644e-06,
"loss": 1.355,
"step": 488
},
{
"epoch": 0.6097256857855362,
"grad_norm": 1.1311854124069214,
"learning_rate": 5.155778894472362e-06,
"loss": 1.3116,
"step": 489
},
{
"epoch": 0.6109725685785536,
"grad_norm": 1.1487990617752075,
"learning_rate": 5.145728643216081e-06,
"loss": 1.2591,
"step": 490
},
{
"epoch": 0.6122194513715711,
"grad_norm": 1.2499967813491821,
"learning_rate": 5.1356783919598e-06,
"loss": 1.1438,
"step": 491
},
{
"epoch": 0.6134663341645885,
"grad_norm": 1.1885159015655518,
"learning_rate": 5.125628140703518e-06,
"loss": 1.2083,
"step": 492
},
{
"epoch": 0.614713216957606,
"grad_norm": 1.1704846620559692,
"learning_rate": 5.115577889447236e-06,
"loss": 1.3491,
"step": 493
},
{
"epoch": 0.6159600997506235,
"grad_norm": 1.457107663154602,
"learning_rate": 5.1055276381909544e-06,
"loss": 1.4857,
"step": 494
},
{
"epoch": 0.6172069825436409,
"grad_norm": 1.2045649290084839,
"learning_rate": 5.095477386934674e-06,
"loss": 1.3802,
"step": 495
},
{
"epoch": 0.6184538653366584,
"grad_norm": 1.1182193756103516,
"learning_rate": 5.085427135678392e-06,
"loss": 1.2746,
"step": 496
},
{
"epoch": 0.6197007481296758,
"grad_norm": 1.5713269710540771,
"learning_rate": 5.0753768844221105e-06,
"loss": 1.6005,
"step": 497
},
{
"epoch": 0.6209476309226932,
"grad_norm": 1.0437641143798828,
"learning_rate": 5.065326633165829e-06,
"loss": 1.2162,
"step": 498
},
{
"epoch": 0.6221945137157108,
"grad_norm": 1.1712435483932495,
"learning_rate": 5.0552763819095485e-06,
"loss": 1.329,
"step": 499
},
{
"epoch": 0.6234413965087282,
"grad_norm": 1.398896336555481,
"learning_rate": 5.045226130653267e-06,
"loss": 1.4741,
"step": 500
},
{
"epoch": 0.6246882793017456,
"grad_norm": 1.2904688119888306,
"learning_rate": 5.035175879396985e-06,
"loss": 1.3501,
"step": 501
},
{
"epoch": 0.6259351620947631,
"grad_norm": 1.2536730766296387,
"learning_rate": 5.025125628140704e-06,
"loss": 1.3771,
"step": 502
},
{
"epoch": 0.6271820448877805,
"grad_norm": 1.2482725381851196,
"learning_rate": 5.015075376884423e-06,
"loss": 1.348,
"step": 503
},
{
"epoch": 0.628428927680798,
"grad_norm": 1.1218414306640625,
"learning_rate": 5.005025125628141e-06,
"loss": 1.3244,
"step": 504
},
{
"epoch": 0.6296758104738155,
"grad_norm": 0.9919432997703552,
"learning_rate": 4.99497487437186e-06,
"loss": 1.2138,
"step": 505
},
{
"epoch": 0.6309226932668329,
"grad_norm": 1.1359126567840576,
"learning_rate": 4.984924623115578e-06,
"loss": 1.3458,
"step": 506
},
{
"epoch": 0.6321695760598504,
"grad_norm": 1.238451361656189,
"learning_rate": 4.974874371859297e-06,
"loss": 1.36,
"step": 507
},
{
"epoch": 0.6334164588528678,
"grad_norm": 1.025715947151184,
"learning_rate": 4.964824120603016e-06,
"loss": 1.294,
"step": 508
},
{
"epoch": 0.6346633416458853,
"grad_norm": 1.098476529121399,
"learning_rate": 4.954773869346734e-06,
"loss": 1.1711,
"step": 509
},
{
"epoch": 0.6359102244389028,
"grad_norm": 1.1356350183486938,
"learning_rate": 4.944723618090453e-06,
"loss": 1.3081,
"step": 510
},
{
"epoch": 0.6371571072319202,
"grad_norm": 1.0799708366394043,
"learning_rate": 4.934673366834171e-06,
"loss": 1.2362,
"step": 511
},
{
"epoch": 0.6384039900249376,
"grad_norm": 1.2346608638763428,
"learning_rate": 4.92462311557789e-06,
"loss": 1.3984,
"step": 512
},
{
"epoch": 0.6396508728179551,
"grad_norm": 1.131801724433899,
"learning_rate": 4.914572864321608e-06,
"loss": 1.299,
"step": 513
},
{
"epoch": 0.6408977556109726,
"grad_norm": 1.111167550086975,
"learning_rate": 4.904522613065327e-06,
"loss": 1.2996,
"step": 514
},
{
"epoch": 0.64214463840399,
"grad_norm": 0.9856006503105164,
"learning_rate": 4.8944723618090455e-06,
"loss": 1.1244,
"step": 515
},
{
"epoch": 0.6433915211970075,
"grad_norm": 1.3219460248947144,
"learning_rate": 4.8844221105527645e-06,
"loss": 1.5762,
"step": 516
},
{
"epoch": 0.6446384039900249,
"grad_norm": 1.373515248298645,
"learning_rate": 4.874371859296483e-06,
"loss": 1.4603,
"step": 517
},
{
"epoch": 0.6458852867830424,
"grad_norm": 1.4860966205596924,
"learning_rate": 4.864321608040201e-06,
"loss": 1.4246,
"step": 518
},
{
"epoch": 0.6471321695760599,
"grad_norm": 1.070772409439087,
"learning_rate": 4.85427135678392e-06,
"loss": 1.3235,
"step": 519
},
{
"epoch": 0.6483790523690773,
"grad_norm": 1.101556420326233,
"learning_rate": 4.844221105527638e-06,
"loss": 1.2472,
"step": 520
},
{
"epoch": 0.6496259351620948,
"grad_norm": 1.3003178834915161,
"learning_rate": 4.834170854271357e-06,
"loss": 1.3101,
"step": 521
},
{
"epoch": 0.6508728179551122,
"grad_norm": 1.1553622484207153,
"learning_rate": 4.824120603015076e-06,
"loss": 1.4617,
"step": 522
},
{
"epoch": 0.6521197007481296,
"grad_norm": 1.3094005584716797,
"learning_rate": 4.814070351758794e-06,
"loss": 1.4772,
"step": 523
},
{
"epoch": 0.6533665835411472,
"grad_norm": 1.0930010080337524,
"learning_rate": 4.804020100502513e-06,
"loss": 1.2367,
"step": 524
},
{
"epoch": 0.6546134663341646,
"grad_norm": 1.0848156213760376,
"learning_rate": 4.793969849246232e-06,
"loss": 1.1232,
"step": 525
},
{
"epoch": 0.655860349127182,
"grad_norm": 1.1442714929580688,
"learning_rate": 4.78391959798995e-06,
"loss": 1.1326,
"step": 526
},
{
"epoch": 0.6571072319201995,
"grad_norm": 1.0560922622680664,
"learning_rate": 4.773869346733669e-06,
"loss": 1.2323,
"step": 527
},
{
"epoch": 0.6583541147132169,
"grad_norm": 1.078669786453247,
"learning_rate": 4.763819095477387e-06,
"loss": 1.1893,
"step": 528
},
{
"epoch": 0.6596009975062345,
"grad_norm": 1.2652770280838013,
"learning_rate": 4.753768844221106e-06,
"loss": 1.3677,
"step": 529
},
{
"epoch": 0.6608478802992519,
"grad_norm": 1.1706942319869995,
"learning_rate": 4.743718592964824e-06,
"loss": 1.3092,
"step": 530
},
{
"epoch": 0.6620947630922693,
"grad_norm": 1.112500786781311,
"learning_rate": 4.733668341708543e-06,
"loss": 1.3348,
"step": 531
},
{
"epoch": 0.6633416458852868,
"grad_norm": 1.1695048809051514,
"learning_rate": 4.7236180904522615e-06,
"loss": 1.2274,
"step": 532
},
{
"epoch": 0.6645885286783042,
"grad_norm": 1.0850027799606323,
"learning_rate": 4.7135678391959805e-06,
"loss": 1.3211,
"step": 533
},
{
"epoch": 0.6658354114713217,
"grad_norm": 1.0453435182571411,
"learning_rate": 4.703517587939699e-06,
"loss": 1.2421,
"step": 534
},
{
"epoch": 0.6670822942643392,
"grad_norm": 1.064238429069519,
"learning_rate": 4.693467336683418e-06,
"loss": 1.2931,
"step": 535
},
{
"epoch": 0.6683291770573566,
"grad_norm": 1.073265790939331,
"learning_rate": 4.683417085427136e-06,
"loss": 1.332,
"step": 536
},
{
"epoch": 0.669576059850374,
"grad_norm": 1.1147503852844238,
"learning_rate": 4.673366834170855e-06,
"loss": 1.2703,
"step": 537
},
{
"epoch": 0.6708229426433915,
"grad_norm": 1.1737515926361084,
"learning_rate": 4.663316582914573e-06,
"loss": 1.337,
"step": 538
},
{
"epoch": 0.672069825436409,
"grad_norm": 1.122194766998291,
"learning_rate": 4.653266331658292e-06,
"loss": 1.356,
"step": 539
},
{
"epoch": 0.6733167082294265,
"grad_norm": 1.0874652862548828,
"learning_rate": 4.64321608040201e-06,
"loss": 1.2351,
"step": 540
},
{
"epoch": 0.6745635910224439,
"grad_norm": 1.030617117881775,
"learning_rate": 4.633165829145729e-06,
"loss": 1.1084,
"step": 541
},
{
"epoch": 0.6758104738154613,
"grad_norm": 1.0776344537734985,
"learning_rate": 4.623115577889448e-06,
"loss": 1.3709,
"step": 542
},
{
"epoch": 0.6770573566084788,
"grad_norm": 1.1286858320236206,
"learning_rate": 4.613065326633166e-06,
"loss": 1.1697,
"step": 543
},
{
"epoch": 0.6783042394014963,
"grad_norm": 1.2843730449676514,
"learning_rate": 4.603015075376885e-06,
"loss": 1.3009,
"step": 544
},
{
"epoch": 0.6795511221945137,
"grad_norm": 1.150352954864502,
"learning_rate": 4.592964824120603e-06,
"loss": 1.4233,
"step": 545
},
{
"epoch": 0.6807980049875312,
"grad_norm": 0.9618976712226868,
"learning_rate": 4.582914572864322e-06,
"loss": 1.0999,
"step": 546
},
{
"epoch": 0.6820448877805486,
"grad_norm": 1.1732901334762573,
"learning_rate": 4.57286432160804e-06,
"loss": 1.3423,
"step": 547
},
{
"epoch": 0.683291770573566,
"grad_norm": 1.0924255847930908,
"learning_rate": 4.562814070351759e-06,
"loss": 1.3029,
"step": 548
},
{
"epoch": 0.6845386533665836,
"grad_norm": 1.1003159284591675,
"learning_rate": 4.5527638190954775e-06,
"loss": 1.3181,
"step": 549
},
{
"epoch": 0.685785536159601,
"grad_norm": 1.1251845359802246,
"learning_rate": 4.5427135678391965e-06,
"loss": 1.276,
"step": 550
},
{
"epoch": 0.6870324189526185,
"grad_norm": 1.1196423768997192,
"learning_rate": 4.532663316582915e-06,
"loss": 1.3641,
"step": 551
},
{
"epoch": 0.6882793017456359,
"grad_norm": 1.332815408706665,
"learning_rate": 4.522613065326634e-06,
"loss": 1.4331,
"step": 552
},
{
"epoch": 0.6895261845386533,
"grad_norm": 1.5822710990905762,
"learning_rate": 4.512562814070352e-06,
"loss": 1.2518,
"step": 553
},
{
"epoch": 0.6907730673316709,
"grad_norm": 1.0803861618041992,
"learning_rate": 4.502512562814071e-06,
"loss": 1.1996,
"step": 554
},
{
"epoch": 0.6920199501246883,
"grad_norm": 1.1029341220855713,
"learning_rate": 4.492462311557789e-06,
"loss": 1.3841,
"step": 555
},
{
"epoch": 0.6932668329177057,
"grad_norm": 1.160920262336731,
"learning_rate": 4.482412060301508e-06,
"loss": 1.4129,
"step": 556
},
{
"epoch": 0.6945137157107232,
"grad_norm": 1.213218092918396,
"learning_rate": 4.472361809045226e-06,
"loss": 1.2498,
"step": 557
},
{
"epoch": 0.6957605985037406,
"grad_norm": 1.095435380935669,
"learning_rate": 4.462311557788945e-06,
"loss": 1.2925,
"step": 558
},
{
"epoch": 0.6970074812967582,
"grad_norm": 1.1194297075271606,
"learning_rate": 4.452261306532664e-06,
"loss": 1.1858,
"step": 559
},
{
"epoch": 0.6982543640897756,
"grad_norm": 1.2164180278778076,
"learning_rate": 4.442211055276382e-06,
"loss": 1.3955,
"step": 560
},
{
"epoch": 0.699501246882793,
"grad_norm": 1.169387936592102,
"learning_rate": 4.432160804020101e-06,
"loss": 1.3091,
"step": 561
},
{
"epoch": 0.7007481296758105,
"grad_norm": 1.1217790842056274,
"learning_rate": 4.42211055276382e-06,
"loss": 1.2544,
"step": 562
},
{
"epoch": 0.7019950124688279,
"grad_norm": 0.9259798526763916,
"learning_rate": 4.412060301507538e-06,
"loss": 1.1452,
"step": 563
},
{
"epoch": 0.7032418952618454,
"grad_norm": 1.0845868587493896,
"learning_rate": 4.4020100502512564e-06,
"loss": 1.1751,
"step": 564
},
{
"epoch": 0.7044887780548629,
"grad_norm": 1.0227254629135132,
"learning_rate": 4.391959798994975e-06,
"loss": 1.3191,
"step": 565
},
{
"epoch": 0.7057356608478803,
"grad_norm": 1.1065592765808105,
"learning_rate": 4.3819095477386936e-06,
"loss": 1.168,
"step": 566
},
{
"epoch": 0.7069825436408977,
"grad_norm": 0.9589051008224487,
"learning_rate": 4.3718592964824125e-06,
"loss": 1.1115,
"step": 567
},
{
"epoch": 0.7082294264339152,
"grad_norm": 0.984656035900116,
"learning_rate": 4.361809045226131e-06,
"loss": 1.1782,
"step": 568
},
{
"epoch": 0.7094763092269327,
"grad_norm": 1.153030514717102,
"learning_rate": 4.35175879396985e-06,
"loss": 1.4849,
"step": 569
},
{
"epoch": 0.7107231920199502,
"grad_norm": 1.1514592170715332,
"learning_rate": 4.341708542713568e-06,
"loss": 1.5695,
"step": 570
},
{
"epoch": 0.7119700748129676,
"grad_norm": 1.2167006731033325,
"learning_rate": 4.331658291457287e-06,
"loss": 1.6739,
"step": 571
},
{
"epoch": 0.713216957605985,
"grad_norm": 1.0637681484222412,
"learning_rate": 4.321608040201005e-06,
"loss": 1.1619,
"step": 572
},
{
"epoch": 0.7144638403990025,
"grad_norm": 1.095629096031189,
"learning_rate": 4.311557788944724e-06,
"loss": 1.3744,
"step": 573
},
{
"epoch": 0.71571072319202,
"grad_norm": 1.192861557006836,
"learning_rate": 4.301507537688442e-06,
"loss": 1.4814,
"step": 574
},
{
"epoch": 0.7169576059850374,
"grad_norm": 1.1575725078582764,
"learning_rate": 4.291457286432161e-06,
"loss": 1.3772,
"step": 575
},
{
"epoch": 0.7182044887780549,
"grad_norm": 1.2418595552444458,
"learning_rate": 4.28140703517588e-06,
"loss": 1.3419,
"step": 576
},
{
"epoch": 0.7194513715710723,
"grad_norm": 0.997127115726471,
"learning_rate": 4.271356783919598e-06,
"loss": 1.3651,
"step": 577
},
{
"epoch": 0.7206982543640897,
"grad_norm": 1.2453523874282837,
"learning_rate": 4.261306532663317e-06,
"loss": 1.532,
"step": 578
},
{
"epoch": 0.7219451371571073,
"grad_norm": 0.9462280869483948,
"learning_rate": 4.251256281407035e-06,
"loss": 1.1298,
"step": 579
},
{
"epoch": 0.7231920199501247,
"grad_norm": 1.0504893064498901,
"learning_rate": 4.241206030150754e-06,
"loss": 1.3086,
"step": 580
},
{
"epoch": 0.7244389027431422,
"grad_norm": 1.0261592864990234,
"learning_rate": 4.231155778894473e-06,
"loss": 1.2857,
"step": 581
},
{
"epoch": 0.7256857855361596,
"grad_norm": 1.2512131929397583,
"learning_rate": 4.221105527638191e-06,
"loss": 1.4445,
"step": 582
},
{
"epoch": 0.726932668329177,
"grad_norm": 1.1242586374282837,
"learning_rate": 4.21105527638191e-06,
"loss": 1.3446,
"step": 583
},
{
"epoch": 0.7281795511221946,
"grad_norm": 1.0649203062057495,
"learning_rate": 4.2010050251256285e-06,
"loss": 1.3638,
"step": 584
},
{
"epoch": 0.729426433915212,
"grad_norm": 1.1724040508270264,
"learning_rate": 4.1909547738693475e-06,
"loss": 1.4811,
"step": 585
},
{
"epoch": 0.7306733167082294,
"grad_norm": 1.0804775953292847,
"learning_rate": 4.180904522613066e-06,
"loss": 1.1316,
"step": 586
},
{
"epoch": 0.7319201995012469,
"grad_norm": 0.9892793297767639,
"learning_rate": 4.170854271356784e-06,
"loss": 1.1471,
"step": 587
},
{
"epoch": 0.7331670822942643,
"grad_norm": 1.151212215423584,
"learning_rate": 4.160804020100503e-06,
"loss": 1.4482,
"step": 588
},
{
"epoch": 0.7344139650872819,
"grad_norm": 1.1802674531936646,
"learning_rate": 4.150753768844221e-06,
"loss": 1.6117,
"step": 589
},
{
"epoch": 0.7356608478802993,
"grad_norm": 1.0757607221603394,
"learning_rate": 4.14070351758794e-06,
"loss": 1.3565,
"step": 590
},
{
"epoch": 0.7369077306733167,
"grad_norm": 1.0788909196853638,
"learning_rate": 4.130653266331658e-06,
"loss": 1.2429,
"step": 591
},
{
"epoch": 0.7381546134663342,
"grad_norm": 1.0227360725402832,
"learning_rate": 4.120603015075377e-06,
"loss": 1.1207,
"step": 592
},
{
"epoch": 0.7394014962593516,
"grad_norm": 1.2680491209030151,
"learning_rate": 4.110552763819096e-06,
"loss": 1.529,
"step": 593
},
{
"epoch": 0.7406483790523691,
"grad_norm": 1.0967222452163696,
"learning_rate": 4.100502512562814e-06,
"loss": 1.3779,
"step": 594
},
{
"epoch": 0.7418952618453866,
"grad_norm": 1.031927227973938,
"learning_rate": 4.090452261306533e-06,
"loss": 1.3051,
"step": 595
},
{
"epoch": 0.743142144638404,
"grad_norm": 1.0825140476226807,
"learning_rate": 4.080402010050251e-06,
"loss": 1.3006,
"step": 596
},
{
"epoch": 0.7443890274314214,
"grad_norm": 1.0165408849716187,
"learning_rate": 4.07035175879397e-06,
"loss": 1.2319,
"step": 597
},
{
"epoch": 0.7456359102244389,
"grad_norm": 1.074092984199524,
"learning_rate": 4.060301507537689e-06,
"loss": 1.3783,
"step": 598
},
{
"epoch": 0.7468827930174564,
"grad_norm": 1.0302280187606812,
"learning_rate": 4.0502512562814074e-06,
"loss": 1.2394,
"step": 599
},
{
"epoch": 0.7481296758104738,
"grad_norm": 1.197632908821106,
"learning_rate": 4.040201005025126e-06,
"loss": 1.4848,
"step": 600
},
{
"epoch": 0.7493765586034913,
"grad_norm": 1.1284730434417725,
"learning_rate": 4.0301507537688446e-06,
"loss": 1.3831,
"step": 601
},
{
"epoch": 0.7506234413965087,
"grad_norm": 0.9015758633613586,
"learning_rate": 4.0201005025125635e-06,
"loss": 1.1153,
"step": 602
},
{
"epoch": 0.7518703241895262,
"grad_norm": 1.0198216438293457,
"learning_rate": 4.010050251256282e-06,
"loss": 1.1892,
"step": 603
},
{
"epoch": 0.7531172069825436,
"grad_norm": 1.0180002450942993,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3982,
"step": 604
},
{
"epoch": 0.7543640897755611,
"grad_norm": 1.0819799900054932,
"learning_rate": 3.989949748743719e-06,
"loss": 1.3013,
"step": 605
},
{
"epoch": 0.7556109725685786,
"grad_norm": 1.1782817840576172,
"learning_rate": 3.979899497487438e-06,
"loss": 1.2649,
"step": 606
},
{
"epoch": 0.756857855361596,
"grad_norm": 1.2745203971862793,
"learning_rate": 3.969849246231156e-06,
"loss": 1.3252,
"step": 607
},
{
"epoch": 0.7581047381546134,
"grad_norm": 1.0373176336288452,
"learning_rate": 3.959798994974875e-06,
"loss": 1.1759,
"step": 608
},
{
"epoch": 0.7593516209476309,
"grad_norm": 1.1250362396240234,
"learning_rate": 3.949748743718593e-06,
"loss": 1.3396,
"step": 609
},
{
"epoch": 0.7605985037406484,
"grad_norm": 1.0422521829605103,
"learning_rate": 3.939698492462311e-06,
"loss": 1.2848,
"step": 610
},
{
"epoch": 0.7618453865336658,
"grad_norm": 1.1368873119354248,
"learning_rate": 3.92964824120603e-06,
"loss": 1.3126,
"step": 611
},
{
"epoch": 0.7630922693266833,
"grad_norm": 1.0962395668029785,
"learning_rate": 3.919597989949749e-06,
"loss": 1.2602,
"step": 612
},
{
"epoch": 0.7643391521197007,
"grad_norm": 1.0593301057815552,
"learning_rate": 3.909547738693467e-06,
"loss": 1.2048,
"step": 613
},
{
"epoch": 0.7655860349127181,
"grad_norm": 1.0383837223052979,
"learning_rate": 3.899497487437186e-06,
"loss": 1.1758,
"step": 614
},
{
"epoch": 0.7668329177057357,
"grad_norm": 1.099158525466919,
"learning_rate": 3.889447236180905e-06,
"loss": 1.3342,
"step": 615
},
{
"epoch": 0.7680798004987531,
"grad_norm": 1.0604772567749023,
"learning_rate": 3.8793969849246234e-06,
"loss": 1.352,
"step": 616
},
{
"epoch": 0.7693266832917706,
"grad_norm": 1.0652525424957275,
"learning_rate": 3.869346733668342e-06,
"loss": 1.1875,
"step": 617
},
{
"epoch": 0.770573566084788,
"grad_norm": 1.2529562711715698,
"learning_rate": 3.8592964824120606e-06,
"loss": 1.4779,
"step": 618
},
{
"epoch": 0.7718204488778054,
"grad_norm": 1.0753412246704102,
"learning_rate": 3.8492462311557795e-06,
"loss": 1.4986,
"step": 619
},
{
"epoch": 0.773067331670823,
"grad_norm": 1.0033202171325684,
"learning_rate": 3.839195979899498e-06,
"loss": 1.2085,
"step": 620
},
{
"epoch": 0.7743142144638404,
"grad_norm": 1.1038106679916382,
"learning_rate": 3.829145728643217e-06,
"loss": 1.4254,
"step": 621
},
{
"epoch": 0.7755610972568578,
"grad_norm": 0.9478548765182495,
"learning_rate": 3.819095477386935e-06,
"loss": 1.1043,
"step": 622
},
{
"epoch": 0.7768079800498753,
"grad_norm": 1.2829041481018066,
"learning_rate": 3.809045226130654e-06,
"loss": 1.4101,
"step": 623
},
{
"epoch": 0.7780548628428927,
"grad_norm": 1.0391925573349,
"learning_rate": 3.798994974874372e-06,
"loss": 1.3809,
"step": 624
},
{
"epoch": 0.7793017456359103,
"grad_norm": 1.0022780895233154,
"learning_rate": 3.788944723618091e-06,
"loss": 1.3742,
"step": 625
},
{
"epoch": 0.7805486284289277,
"grad_norm": 1.083511471748352,
"learning_rate": 3.7788944723618095e-06,
"loss": 1.2992,
"step": 626
},
{
"epoch": 0.7817955112219451,
"grad_norm": 1.0486705303192139,
"learning_rate": 3.768844221105528e-06,
"loss": 1.0928,
"step": 627
},
{
"epoch": 0.7830423940149626,
"grad_norm": 1.0774915218353271,
"learning_rate": 3.7587939698492466e-06,
"loss": 1.1697,
"step": 628
},
{
"epoch": 0.78428927680798,
"grad_norm": 0.9593791961669922,
"learning_rate": 3.748743718592965e-06,
"loss": 1.3415,
"step": 629
},
{
"epoch": 0.7855361596009975,
"grad_norm": 1.141543984413147,
"learning_rate": 3.7386934673366837e-06,
"loss": 1.4172,
"step": 630
},
{
"epoch": 0.786783042394015,
"grad_norm": 1.0077931880950928,
"learning_rate": 3.7286432160804027e-06,
"loss": 1.0945,
"step": 631
},
{
"epoch": 0.7880299251870324,
"grad_norm": 1.0111457109451294,
"learning_rate": 3.718592964824121e-06,
"loss": 1.0663,
"step": 632
},
{
"epoch": 0.7892768079800498,
"grad_norm": 1.1871503591537476,
"learning_rate": 3.7085427135678394e-06,
"loss": 1.5321,
"step": 633
},
{
"epoch": 0.7905236907730673,
"grad_norm": 1.2060590982437134,
"learning_rate": 3.698492462311558e-06,
"loss": 1.3901,
"step": 634
},
{
"epoch": 0.7917705735660848,
"grad_norm": 1.1934421062469482,
"learning_rate": 3.6884422110552766e-06,
"loss": 1.3379,
"step": 635
},
{
"epoch": 0.7930174563591023,
"grad_norm": 1.077783226966858,
"learning_rate": 3.678391959798995e-06,
"loss": 1.3164,
"step": 636
},
{
"epoch": 0.7942643391521197,
"grad_norm": 1.0920122861862183,
"learning_rate": 3.6683417085427137e-06,
"loss": 1.2686,
"step": 637
},
{
"epoch": 0.7955112219451371,
"grad_norm": 1.2184644937515259,
"learning_rate": 3.6582914572864327e-06,
"loss": 1.4722,
"step": 638
},
{
"epoch": 0.7967581047381546,
"grad_norm": 0.9998170137405396,
"learning_rate": 3.648241206030151e-06,
"loss": 1.2636,
"step": 639
},
{
"epoch": 0.7980049875311721,
"grad_norm": 0.9682722687721252,
"learning_rate": 3.63819095477387e-06,
"loss": 1.1168,
"step": 640
},
{
"epoch": 0.7992518703241895,
"grad_norm": 0.9886475801467896,
"learning_rate": 3.628140703517588e-06,
"loss": 1.2235,
"step": 641
},
{
"epoch": 0.800498753117207,
"grad_norm": 0.9414662718772888,
"learning_rate": 3.618090452261307e-06,
"loss": 1.2258,
"step": 642
},
{
"epoch": 0.8017456359102244,
"grad_norm": 0.9854826927185059,
"learning_rate": 3.608040201005025e-06,
"loss": 1.3498,
"step": 643
},
{
"epoch": 0.8029925187032418,
"grad_norm": 1.0102778673171997,
"learning_rate": 3.597989949748744e-06,
"loss": 1.0918,
"step": 644
},
{
"epoch": 0.8042394014962594,
"grad_norm": 0.9639750719070435,
"learning_rate": 3.5879396984924626e-06,
"loss": 1.207,
"step": 645
},
{
"epoch": 0.8054862842892768,
"grad_norm": 1.2800425291061401,
"learning_rate": 3.577889447236181e-06,
"loss": 1.3786,
"step": 646
},
{
"epoch": 0.8067331670822943,
"grad_norm": 1.0239495038986206,
"learning_rate": 3.5678391959798997e-06,
"loss": 1.1972,
"step": 647
},
{
"epoch": 0.8079800498753117,
"grad_norm": 1.0544627904891968,
"learning_rate": 3.5577889447236187e-06,
"loss": 1.2415,
"step": 648
},
{
"epoch": 0.8092269326683291,
"grad_norm": 1.0860525369644165,
"learning_rate": 3.547738693467337e-06,
"loss": 1.2335,
"step": 649
},
{
"epoch": 0.8104738154613467,
"grad_norm": 1.3814754486083984,
"learning_rate": 3.537688442211056e-06,
"loss": 1.3163,
"step": 650
},
{
"epoch": 0.8117206982543641,
"grad_norm": 1.1752089262008667,
"learning_rate": 3.527638190954774e-06,
"loss": 1.3965,
"step": 651
},
{
"epoch": 0.8129675810473815,
"grad_norm": 1.2307369709014893,
"learning_rate": 3.517587939698493e-06,
"loss": 1.4356,
"step": 652
},
{
"epoch": 0.814214463840399,
"grad_norm": 0.9724376201629639,
"learning_rate": 3.507537688442211e-06,
"loss": 1.2193,
"step": 653
},
{
"epoch": 0.8154613466334164,
"grad_norm": 1.1516051292419434,
"learning_rate": 3.49748743718593e-06,
"loss": 1.3796,
"step": 654
},
{
"epoch": 0.816708229426434,
"grad_norm": 1.0130220651626587,
"learning_rate": 3.4874371859296487e-06,
"loss": 1.2647,
"step": 655
},
{
"epoch": 0.8179551122194514,
"grad_norm": 1.0963294506072998,
"learning_rate": 3.477386934673367e-06,
"loss": 1.3587,
"step": 656
},
{
"epoch": 0.8192019950124688,
"grad_norm": 1.1782459020614624,
"learning_rate": 3.467336683417086e-06,
"loss": 1.3392,
"step": 657
},
{
"epoch": 0.8204488778054863,
"grad_norm": 1.1211333274841309,
"learning_rate": 3.457286432160804e-06,
"loss": 1.3198,
"step": 658
},
{
"epoch": 0.8216957605985037,
"grad_norm": 1.2690104246139526,
"learning_rate": 3.447236180904523e-06,
"loss": 1.5677,
"step": 659
},
{
"epoch": 0.8229426433915212,
"grad_norm": 1.1098829507827759,
"learning_rate": 3.437185929648241e-06,
"loss": 1.4978,
"step": 660
},
{
"epoch": 0.8241895261845387,
"grad_norm": 1.2354493141174316,
"learning_rate": 3.42713567839196e-06,
"loss": 1.39,
"step": 661
},
{
"epoch": 0.8254364089775561,
"grad_norm": 1.2133121490478516,
"learning_rate": 3.4170854271356786e-06,
"loss": 1.3905,
"step": 662
},
{
"epoch": 0.8266832917705735,
"grad_norm": 0.9457760453224182,
"learning_rate": 3.407035175879397e-06,
"loss": 1.1879,
"step": 663
},
{
"epoch": 0.827930174563591,
"grad_norm": 0.9937940239906311,
"learning_rate": 3.3969849246231158e-06,
"loss": 1.1324,
"step": 664
},
{
"epoch": 0.8291770573566085,
"grad_norm": 1.1607681512832642,
"learning_rate": 3.3869346733668347e-06,
"loss": 1.5416,
"step": 665
},
{
"epoch": 0.830423940149626,
"grad_norm": 1.069676160812378,
"learning_rate": 3.376884422110553e-06,
"loss": 1.2551,
"step": 666
},
{
"epoch": 0.8316708229426434,
"grad_norm": 1.0121736526489258,
"learning_rate": 3.366834170854272e-06,
"loss": 1.2695,
"step": 667
},
{
"epoch": 0.8329177057356608,
"grad_norm": 1.0263285636901855,
"learning_rate": 3.35678391959799e-06,
"loss": 1.2441,
"step": 668
},
{
"epoch": 0.8341645885286783,
"grad_norm": 1.028357744216919,
"learning_rate": 3.346733668341709e-06,
"loss": 1.2744,
"step": 669
},
{
"epoch": 0.8354114713216958,
"grad_norm": 1.0364710092544556,
"learning_rate": 3.336683417085427e-06,
"loss": 1.2813,
"step": 670
},
{
"epoch": 0.8366583541147132,
"grad_norm": 1.119977593421936,
"learning_rate": 3.326633165829146e-06,
"loss": 1.4322,
"step": 671
},
{
"epoch": 0.8379052369077307,
"grad_norm": 0.9980776309967041,
"learning_rate": 3.3165829145728647e-06,
"loss": 1.1649,
"step": 672
},
{
"epoch": 0.8391521197007481,
"grad_norm": 1.0738976001739502,
"learning_rate": 3.3065326633165833e-06,
"loss": 1.3231,
"step": 673
},
{
"epoch": 0.8403990024937655,
"grad_norm": 1.1432924270629883,
"learning_rate": 3.296482412060302e-06,
"loss": 1.207,
"step": 674
},
{
"epoch": 0.8416458852867831,
"grad_norm": 1.0316051244735718,
"learning_rate": 3.286432160804021e-06,
"loss": 1.294,
"step": 675
},
{
"epoch": 0.8428927680798005,
"grad_norm": 1.0117923021316528,
"learning_rate": 3.276381909547739e-06,
"loss": 1.2837,
"step": 676
},
{
"epoch": 0.844139650872818,
"grad_norm": 0.9859020113945007,
"learning_rate": 3.266331658291458e-06,
"loss": 1.2838,
"step": 677
},
{
"epoch": 0.8453865336658354,
"grad_norm": 1.038971185684204,
"learning_rate": 3.256281407035176e-06,
"loss": 1.2354,
"step": 678
},
{
"epoch": 0.8466334164588528,
"grad_norm": 1.1101958751678467,
"learning_rate": 3.2462311557788946e-06,
"loss": 1.3817,
"step": 679
},
{
"epoch": 0.8478802992518704,
"grad_norm": 1.0614064931869507,
"learning_rate": 3.236180904522613e-06,
"loss": 1.3577,
"step": 680
},
{
"epoch": 0.8491271820448878,
"grad_norm": 1.3358986377716064,
"learning_rate": 3.2261306532663318e-06,
"loss": 1.4581,
"step": 681
},
{
"epoch": 0.8503740648379052,
"grad_norm": 1.1418943405151367,
"learning_rate": 3.2160804020100507e-06,
"loss": 1.4387,
"step": 682
},
{
"epoch": 0.8516209476309227,
"grad_norm": 1.0399991273880005,
"learning_rate": 3.206030150753769e-06,
"loss": 1.3392,
"step": 683
},
{
"epoch": 0.8528678304239401,
"grad_norm": 1.194429874420166,
"learning_rate": 3.195979899497488e-06,
"loss": 1.3924,
"step": 684
},
{
"epoch": 0.8541147132169576,
"grad_norm": 1.3985058069229126,
"learning_rate": 3.185929648241206e-06,
"loss": 1.1775,
"step": 685
},
{
"epoch": 0.8553615960099751,
"grad_norm": 0.9131481051445007,
"learning_rate": 3.175879396984925e-06,
"loss": 1.2358,
"step": 686
},
{
"epoch": 0.8566084788029925,
"grad_norm": 1.034224033355713,
"learning_rate": 3.165829145728643e-06,
"loss": 1.2832,
"step": 687
},
{
"epoch": 0.85785536159601,
"grad_norm": 1.062367558479309,
"learning_rate": 3.155778894472362e-06,
"loss": 1.3703,
"step": 688
},
{
"epoch": 0.8591022443890274,
"grad_norm": 1.0593260526657104,
"learning_rate": 3.1457286432160807e-06,
"loss": 1.2351,
"step": 689
},
{
"epoch": 0.8603491271820449,
"grad_norm": 1.2291982173919678,
"learning_rate": 3.1356783919597993e-06,
"loss": 1.3969,
"step": 690
},
{
"epoch": 0.8615960099750624,
"grad_norm": 0.8859289884567261,
"learning_rate": 3.125628140703518e-06,
"loss": 1.172,
"step": 691
},
{
"epoch": 0.8628428927680798,
"grad_norm": 1.0447728633880615,
"learning_rate": 3.115577889447237e-06,
"loss": 1.3169,
"step": 692
},
{
"epoch": 0.8640897755610972,
"grad_norm": 1.0128638744354248,
"learning_rate": 3.105527638190955e-06,
"loss": 1.26,
"step": 693
},
{
"epoch": 0.8653366583541147,
"grad_norm": 1.0045210123062134,
"learning_rate": 3.095477386934674e-06,
"loss": 1.1433,
"step": 694
},
{
"epoch": 0.8665835411471322,
"grad_norm": 0.983322024345398,
"learning_rate": 3.085427135678392e-06,
"loss": 1.3336,
"step": 695
},
{
"epoch": 0.8678304239401496,
"grad_norm": 0.9351543188095093,
"learning_rate": 3.075376884422111e-06,
"loss": 1.1872,
"step": 696
},
{
"epoch": 0.8690773067331671,
"grad_norm": 0.9706789255142212,
"learning_rate": 3.065326633165829e-06,
"loss": 1.182,
"step": 697
},
{
"epoch": 0.8703241895261845,
"grad_norm": 0.9718403220176697,
"learning_rate": 3.055276381909548e-06,
"loss": 1.3236,
"step": 698
},
{
"epoch": 0.871571072319202,
"grad_norm": 1.0157040357589722,
"learning_rate": 3.0452261306532668e-06,
"loss": 1.2083,
"step": 699
},
{
"epoch": 0.8728179551122195,
"grad_norm": 1.0136648416519165,
"learning_rate": 3.0351758793969853e-06,
"loss": 1.3794,
"step": 700
},
{
"epoch": 0.8740648379052369,
"grad_norm": 1.012591004371643,
"learning_rate": 3.025125628140704e-06,
"loss": 1.237,
"step": 701
},
{
"epoch": 0.8753117206982544,
"grad_norm": 1.0355769395828247,
"learning_rate": 3.015075376884422e-06,
"loss": 1.3695,
"step": 702
},
{
"epoch": 0.8765586034912718,
"grad_norm": 0.9932037591934204,
"learning_rate": 3.005025125628141e-06,
"loss": 1.2643,
"step": 703
},
{
"epoch": 0.8778054862842892,
"grad_norm": 1.0271167755126953,
"learning_rate": 2.994974874371859e-06,
"loss": 1.2842,
"step": 704
},
{
"epoch": 0.8790523690773068,
"grad_norm": 1.0666383504867554,
"learning_rate": 2.984924623115578e-06,
"loss": 1.2272,
"step": 705
},
{
"epoch": 0.8802992518703242,
"grad_norm": 1.0097944736480713,
"learning_rate": 2.9748743718592967e-06,
"loss": 1.2491,
"step": 706
},
{
"epoch": 0.8815461346633416,
"grad_norm": 0.9316381812095642,
"learning_rate": 2.9648241206030153e-06,
"loss": 1.2248,
"step": 707
},
{
"epoch": 0.8827930174563591,
"grad_norm": 0.9848239421844482,
"learning_rate": 2.954773869346734e-06,
"loss": 1.2071,
"step": 708
},
{
"epoch": 0.8840399002493765,
"grad_norm": 1.0080039501190186,
"learning_rate": 2.9447236180904524e-06,
"loss": 1.2221,
"step": 709
},
{
"epoch": 0.885286783042394,
"grad_norm": 1.2530858516693115,
"learning_rate": 2.934673366834171e-06,
"loss": 1.4369,
"step": 710
},
{
"epoch": 0.8865336658354115,
"grad_norm": 0.9040453433990479,
"learning_rate": 2.92462311557789e-06,
"loss": 1.1381,
"step": 711
},
{
"epoch": 0.8877805486284289,
"grad_norm": 1.0322935581207275,
"learning_rate": 2.914572864321608e-06,
"loss": 1.334,
"step": 712
},
{
"epoch": 0.8890274314214464,
"grad_norm": 1.054388165473938,
"learning_rate": 2.904522613065327e-06,
"loss": 1.3207,
"step": 713
},
{
"epoch": 0.8902743142144638,
"grad_norm": 1.1435095071792603,
"learning_rate": 2.894472361809045e-06,
"loss": 1.3643,
"step": 714
},
{
"epoch": 0.8915211970074813,
"grad_norm": 0.9743549227714539,
"learning_rate": 2.884422110552764e-06,
"loss": 1.0902,
"step": 715
},
{
"epoch": 0.8927680798004988,
"grad_norm": 1.0644567012786865,
"learning_rate": 2.8743718592964828e-06,
"loss": 1.3784,
"step": 716
},
{
"epoch": 0.8940149625935162,
"grad_norm": 1.0366824865341187,
"learning_rate": 2.8643216080402013e-06,
"loss": 1.3163,
"step": 717
},
{
"epoch": 0.8952618453865336,
"grad_norm": 1.4961094856262207,
"learning_rate": 2.85427135678392e-06,
"loss": 1.3051,
"step": 718
},
{
"epoch": 0.8965087281795511,
"grad_norm": 0.9245879054069519,
"learning_rate": 2.8442211055276384e-06,
"loss": 1.1978,
"step": 719
},
{
"epoch": 0.8977556109725686,
"grad_norm": 1.0222971439361572,
"learning_rate": 2.834170854271357e-06,
"loss": 1.2455,
"step": 720
},
{
"epoch": 0.899002493765586,
"grad_norm": 1.0799505710601807,
"learning_rate": 2.824120603015076e-06,
"loss": 1.2106,
"step": 721
},
{
"epoch": 0.9002493765586035,
"grad_norm": 0.9997124075889587,
"learning_rate": 2.814070351758794e-06,
"loss": 1.2447,
"step": 722
},
{
"epoch": 0.9014962593516209,
"grad_norm": 1.0951141119003296,
"learning_rate": 2.804020100502513e-06,
"loss": 1.3694,
"step": 723
},
{
"epoch": 0.9027431421446384,
"grad_norm": 1.0227795839309692,
"learning_rate": 2.7939698492462313e-06,
"loss": 1.2869,
"step": 724
},
{
"epoch": 0.9039900249376559,
"grad_norm": 1.0229079723358154,
"learning_rate": 2.78391959798995e-06,
"loss": 1.3032,
"step": 725
},
{
"epoch": 0.9052369077306733,
"grad_norm": 1.0651566982269287,
"learning_rate": 2.7738693467336684e-06,
"loss": 1.2817,
"step": 726
},
{
"epoch": 0.9064837905236908,
"grad_norm": 1.0881601572036743,
"learning_rate": 2.763819095477387e-06,
"loss": 1.2633,
"step": 727
},
{
"epoch": 0.9077306733167082,
"grad_norm": 1.0890527963638306,
"learning_rate": 2.753768844221106e-06,
"loss": 1.3236,
"step": 728
},
{
"epoch": 0.9089775561097256,
"grad_norm": 1.1237494945526123,
"learning_rate": 2.743718592964824e-06,
"loss": 1.3812,
"step": 729
},
{
"epoch": 0.9102244389027432,
"grad_norm": 1.0216248035430908,
"learning_rate": 2.733668341708543e-06,
"loss": 1.2982,
"step": 730
},
{
"epoch": 0.9114713216957606,
"grad_norm": 1.0323688983917236,
"learning_rate": 2.723618090452261e-06,
"loss": 1.3214,
"step": 731
},
{
"epoch": 0.912718204488778,
"grad_norm": 0.9869887828826904,
"learning_rate": 2.71356783919598e-06,
"loss": 1.263,
"step": 732
},
{
"epoch": 0.9139650872817955,
"grad_norm": 1.0594792366027832,
"learning_rate": 2.7035175879396983e-06,
"loss": 1.353,
"step": 733
},
{
"epoch": 0.9152119700748129,
"grad_norm": 0.9987196326255798,
"learning_rate": 2.6934673366834173e-06,
"loss": 1.2559,
"step": 734
},
{
"epoch": 0.9164588528678305,
"grad_norm": 1.2770543098449707,
"learning_rate": 2.683417085427136e-06,
"loss": 1.4697,
"step": 735
},
{
"epoch": 0.9177057356608479,
"grad_norm": 1.1007297039031982,
"learning_rate": 2.6733668341708545e-06,
"loss": 1.3475,
"step": 736
},
{
"epoch": 0.9189526184538653,
"grad_norm": 0.9552852511405945,
"learning_rate": 2.663316582914573e-06,
"loss": 1.1784,
"step": 737
},
{
"epoch": 0.9201995012468828,
"grad_norm": 1.377469539642334,
"learning_rate": 2.653266331658292e-06,
"loss": 1.3585,
"step": 738
},
{
"epoch": 0.9214463840399002,
"grad_norm": 1.1127710342407227,
"learning_rate": 2.64321608040201e-06,
"loss": 1.5175,
"step": 739
},
{
"epoch": 0.9226932668329177,
"grad_norm": 1.0479665994644165,
"learning_rate": 2.633165829145729e-06,
"loss": 1.2823,
"step": 740
},
{
"epoch": 0.9239401496259352,
"grad_norm": 1.2476303577423096,
"learning_rate": 2.6231155778894473e-06,
"loss": 1.3308,
"step": 741
},
{
"epoch": 0.9251870324189526,
"grad_norm": 1.1280089616775513,
"learning_rate": 2.6130653266331663e-06,
"loss": 1.26,
"step": 742
},
{
"epoch": 0.92643391521197,
"grad_norm": 0.9391908049583435,
"learning_rate": 2.6030150753768844e-06,
"loss": 1.2002,
"step": 743
},
{
"epoch": 0.9276807980049875,
"grad_norm": 0.956329882144928,
"learning_rate": 2.5929648241206034e-06,
"loss": 1.1182,
"step": 744
},
{
"epoch": 0.928927680798005,
"grad_norm": 1.1058745384216309,
"learning_rate": 2.582914572864322e-06,
"loss": 1.4141,
"step": 745
},
{
"epoch": 0.9301745635910225,
"grad_norm": 1.0975598096847534,
"learning_rate": 2.5728643216080405e-06,
"loss": 1.2396,
"step": 746
},
{
"epoch": 0.9314214463840399,
"grad_norm": 0.9466800093650818,
"learning_rate": 2.562814070351759e-06,
"loss": 1.0547,
"step": 747
},
{
"epoch": 0.9326683291770573,
"grad_norm": 1.034044861793518,
"learning_rate": 2.5527638190954772e-06,
"loss": 1.2348,
"step": 748
},
{
"epoch": 0.9339152119700748,
"grad_norm": 1.1218293905258179,
"learning_rate": 2.542713567839196e-06,
"loss": 1.4085,
"step": 749
},
{
"epoch": 0.9351620947630923,
"grad_norm": 1.077072262763977,
"learning_rate": 2.5326633165829143e-06,
"loss": 1.3878,
"step": 750
},
{
"epoch": 0.9364089775561097,
"grad_norm": 1.10988187789917,
"learning_rate": 2.5226130653266333e-06,
"loss": 1.2204,
"step": 751
},
{
"epoch": 0.9376558603491272,
"grad_norm": 1.0233125686645508,
"learning_rate": 2.512562814070352e-06,
"loss": 1.2402,
"step": 752
},
{
"epoch": 0.9389027431421446,
"grad_norm": 1.1426713466644287,
"learning_rate": 2.5025125628140705e-06,
"loss": 1.2172,
"step": 753
},
{
"epoch": 0.940149625935162,
"grad_norm": 1.2279871702194214,
"learning_rate": 2.492462311557789e-06,
"loss": 1.6441,
"step": 754
},
{
"epoch": 0.9413965087281796,
"grad_norm": 1.0472629070281982,
"learning_rate": 2.482412060301508e-06,
"loss": 1.3554,
"step": 755
},
{
"epoch": 0.942643391521197,
"grad_norm": 0.9111616015434265,
"learning_rate": 2.4723618090452266e-06,
"loss": 1.0862,
"step": 756
},
{
"epoch": 0.9438902743142145,
"grad_norm": 0.9185367822647095,
"learning_rate": 2.462311557788945e-06,
"loss": 1.143,
"step": 757
},
{
"epoch": 0.9451371571072319,
"grad_norm": 1.0564497709274292,
"learning_rate": 2.4522613065326637e-06,
"loss": 1.4001,
"step": 758
},
{
"epoch": 0.9463840399002493,
"grad_norm": 1.0216621160507202,
"learning_rate": 2.4422110552763823e-06,
"loss": 1.0497,
"step": 759
},
{
"epoch": 0.9476309226932669,
"grad_norm": 0.9879399538040161,
"learning_rate": 2.4321608040201004e-06,
"loss": 1.2711,
"step": 760
},
{
"epoch": 0.9488778054862843,
"grad_norm": 1.1382951736450195,
"learning_rate": 2.422110552763819e-06,
"loss": 1.504,
"step": 761
},
{
"epoch": 0.9501246882793017,
"grad_norm": 1.0174823999404907,
"learning_rate": 2.412060301507538e-06,
"loss": 1.431,
"step": 762
},
{
"epoch": 0.9513715710723192,
"grad_norm": 1.011014461517334,
"learning_rate": 2.4020100502512565e-06,
"loss": 1.2351,
"step": 763
},
{
"epoch": 0.9526184538653366,
"grad_norm": 1.1895780563354492,
"learning_rate": 2.391959798994975e-06,
"loss": 1.2247,
"step": 764
},
{
"epoch": 0.9538653366583542,
"grad_norm": 1.028640627861023,
"learning_rate": 2.3819095477386936e-06,
"loss": 1.3066,
"step": 765
},
{
"epoch": 0.9551122194513716,
"grad_norm": 1.0584501028060913,
"learning_rate": 2.371859296482412e-06,
"loss": 1.4395,
"step": 766
},
{
"epoch": 0.956359102244389,
"grad_norm": 1.070300817489624,
"learning_rate": 2.3618090452261308e-06,
"loss": 1.3507,
"step": 767
},
{
"epoch": 0.9576059850374065,
"grad_norm": 1.1228018999099731,
"learning_rate": 2.3517587939698493e-06,
"loss": 1.3777,
"step": 768
},
{
"epoch": 0.9588528678304239,
"grad_norm": 1.0767176151275635,
"learning_rate": 2.341708542713568e-06,
"loss": 1.2773,
"step": 769
},
{
"epoch": 0.9600997506234414,
"grad_norm": 1.0168311595916748,
"learning_rate": 2.3316582914572865e-06,
"loss": 1.2264,
"step": 770
},
{
"epoch": 0.9613466334164589,
"grad_norm": 1.1517490148544312,
"learning_rate": 2.321608040201005e-06,
"loss": 1.462,
"step": 771
},
{
"epoch": 0.9625935162094763,
"grad_norm": 1.0027258396148682,
"learning_rate": 2.311557788944724e-06,
"loss": 1.2101,
"step": 772
},
{
"epoch": 0.9638403990024937,
"grad_norm": 1.0144740343093872,
"learning_rate": 2.3015075376884426e-06,
"loss": 1.4337,
"step": 773
},
{
"epoch": 0.9650872817955112,
"grad_norm": 0.9291934370994568,
"learning_rate": 2.291457286432161e-06,
"loss": 1.2853,
"step": 774
},
{
"epoch": 0.9663341645885287,
"grad_norm": 1.0499387979507446,
"learning_rate": 2.2814070351758797e-06,
"loss": 1.1576,
"step": 775
},
{
"epoch": 0.9675810473815462,
"grad_norm": 1.0553680658340454,
"learning_rate": 2.2713567839195983e-06,
"loss": 1.3076,
"step": 776
},
{
"epoch": 0.9688279301745636,
"grad_norm": 1.0661462545394897,
"learning_rate": 2.261306532663317e-06,
"loss": 1.201,
"step": 777
},
{
"epoch": 0.970074812967581,
"grad_norm": 1.1224738359451294,
"learning_rate": 2.2512562814070354e-06,
"loss": 1.3089,
"step": 778
},
{
"epoch": 0.9713216957605985,
"grad_norm": 1.0061415433883667,
"learning_rate": 2.241206030150754e-06,
"loss": 1.2459,
"step": 779
},
{
"epoch": 0.972568578553616,
"grad_norm": 1.1413079500198364,
"learning_rate": 2.2311557788944725e-06,
"loss": 1.4552,
"step": 780
},
{
"epoch": 0.9738154613466334,
"grad_norm": 0.9246280193328857,
"learning_rate": 2.221105527638191e-06,
"loss": 1.1705,
"step": 781
},
{
"epoch": 0.9750623441396509,
"grad_norm": 1.0393040180206299,
"learning_rate": 2.21105527638191e-06,
"loss": 1.2436,
"step": 782
},
{
"epoch": 0.9763092269326683,
"grad_norm": 0.9905415177345276,
"learning_rate": 2.2010050251256282e-06,
"loss": 1.3433,
"step": 783
},
{
"epoch": 0.9775561097256857,
"grad_norm": 1.1033501625061035,
"learning_rate": 2.1909547738693468e-06,
"loss": 1.5008,
"step": 784
},
{
"epoch": 0.9788029925187033,
"grad_norm": 1.008102297782898,
"learning_rate": 2.1809045226130653e-06,
"loss": 1.2053,
"step": 785
},
{
"epoch": 0.9800498753117207,
"grad_norm": 1.174916386604309,
"learning_rate": 2.170854271356784e-06,
"loss": 1.4446,
"step": 786
},
{
"epoch": 0.9812967581047382,
"grad_norm": 0.9445826411247253,
"learning_rate": 2.1608040201005025e-06,
"loss": 1.2386,
"step": 787
},
{
"epoch": 0.9825436408977556,
"grad_norm": 1.0598101615905762,
"learning_rate": 2.150753768844221e-06,
"loss": 1.1389,
"step": 788
},
{
"epoch": 0.983790523690773,
"grad_norm": 1.0747401714324951,
"learning_rate": 2.14070351758794e-06,
"loss": 1.3252,
"step": 789
},
{
"epoch": 0.9850374064837906,
"grad_norm": 1.1230816841125488,
"learning_rate": 2.1306532663316586e-06,
"loss": 1.5111,
"step": 790
},
{
"epoch": 0.986284289276808,
"grad_norm": 1.186692714691162,
"learning_rate": 2.120603015075377e-06,
"loss": 1.3256,
"step": 791
},
{
"epoch": 0.9875311720698254,
"grad_norm": 1.3021119832992554,
"learning_rate": 2.1105527638190957e-06,
"loss": 1.5464,
"step": 792
},
{
"epoch": 0.9887780548628429,
"grad_norm": 1.0297656059265137,
"learning_rate": 2.1005025125628143e-06,
"loss": 1.2611,
"step": 793
},
{
"epoch": 0.9900249376558603,
"grad_norm": 1.0473464727401733,
"learning_rate": 2.090452261306533e-06,
"loss": 1.4004,
"step": 794
},
{
"epoch": 0.9912718204488778,
"grad_norm": 1.0219793319702148,
"learning_rate": 2.0804020100502514e-06,
"loss": 1.2777,
"step": 795
},
{
"epoch": 0.9925187032418953,
"grad_norm": 2.672146797180176,
"learning_rate": 2.07035175879397e-06,
"loss": 1.4929,
"step": 796
},
{
"epoch": 0.9937655860349127,
"grad_norm": 0.9890034198760986,
"learning_rate": 2.0603015075376885e-06,
"loss": 1.1897,
"step": 797
},
{
"epoch": 0.9950124688279302,
"grad_norm": 1.0329457521438599,
"learning_rate": 2.050251256281407e-06,
"loss": 1.2733,
"step": 798
},
{
"epoch": 0.9962593516209476,
"grad_norm": 1.071303129196167,
"learning_rate": 2.0402010050251257e-06,
"loss": 1.4831,
"step": 799
},
{
"epoch": 0.9975062344139651,
"grad_norm": 1.0302478075027466,
"learning_rate": 2.0301507537688446e-06,
"loss": 1.3157,
"step": 800
},
{
"epoch": 0.9987531172069826,
"grad_norm": 1.0172933340072632,
"learning_rate": 2.020100502512563e-06,
"loss": 1.2603,
"step": 801
},
{
"epoch": 1.0,
"grad_norm": 1.048662543296814,
"learning_rate": 2.0100502512562818e-06,
"loss": 1.2641,
"step": 802
},
{
"epoch": 1.0012468827930174,
"grad_norm": 1.001355767250061,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.2497,
"step": 803
},
{
"epoch": 1.0024937655860349,
"grad_norm": 0.9234004616737366,
"learning_rate": 1.989949748743719e-06,
"loss": 1.1551,
"step": 804
},
{
"epoch": 1.0037406483790523,
"grad_norm": 0.8222893476486206,
"learning_rate": 1.9798994974874375e-06,
"loss": 1.0946,
"step": 805
},
{
"epoch": 1.0049875311720697,
"grad_norm": 0.9572646021842957,
"learning_rate": 1.9698492462311556e-06,
"loss": 1.013,
"step": 806
},
{
"epoch": 1.0062344139650872,
"grad_norm": 0.9779718518257141,
"learning_rate": 1.9597989949748746e-06,
"loss": 1.0529,
"step": 807
},
{
"epoch": 1.0074812967581048,
"grad_norm": 0.910854160785675,
"learning_rate": 1.949748743718593e-06,
"loss": 1.1047,
"step": 808
},
{
"epoch": 1.0087281795511223,
"grad_norm": 0.9201194047927856,
"learning_rate": 1.9396984924623117e-06,
"loss": 0.9817,
"step": 809
},
{
"epoch": 1.0099750623441397,
"grad_norm": 0.9647354483604431,
"learning_rate": 1.9296482412060303e-06,
"loss": 1.244,
"step": 810
},
{
"epoch": 1.0112219451371571,
"grad_norm": 1.0115735530853271,
"learning_rate": 1.919597989949749e-06,
"loss": 1.2198,
"step": 811
},
{
"epoch": 1.0124688279301746,
"grad_norm": 1.027173399925232,
"learning_rate": 1.9095477386934674e-06,
"loss": 1.239,
"step": 812
},
{
"epoch": 1.013715710723192,
"grad_norm": 0.9246682524681091,
"learning_rate": 1.899497487437186e-06,
"loss": 1.0748,
"step": 813
},
{
"epoch": 1.0149625935162094,
"grad_norm": 0.8469527959823608,
"learning_rate": 1.8894472361809047e-06,
"loss": 1.01,
"step": 814
},
{
"epoch": 1.0162094763092269,
"grad_norm": 0.9737430810928345,
"learning_rate": 1.8793969849246233e-06,
"loss": 1.1242,
"step": 815
},
{
"epoch": 1.0174563591022443,
"grad_norm": 0.9349671006202698,
"learning_rate": 1.8693467336683419e-06,
"loss": 0.9914,
"step": 816
},
{
"epoch": 1.018703241895262,
"grad_norm": 0.8969345092773438,
"learning_rate": 1.8592964824120604e-06,
"loss": 1.0868,
"step": 817
},
{
"epoch": 1.0199501246882794,
"grad_norm": 0.9075314402580261,
"learning_rate": 1.849246231155779e-06,
"loss": 1.066,
"step": 818
},
{
"epoch": 1.0211970074812968,
"grad_norm": 0.9441591501235962,
"learning_rate": 1.8391959798994976e-06,
"loss": 1.073,
"step": 819
},
{
"epoch": 1.0224438902743143,
"grad_norm": 1.0162612199783325,
"learning_rate": 1.8291457286432163e-06,
"loss": 1.19,
"step": 820
},
{
"epoch": 1.0236907730673317,
"grad_norm": 0.9536542296409607,
"learning_rate": 1.819095477386935e-06,
"loss": 1.1637,
"step": 821
},
{
"epoch": 1.0249376558603491,
"grad_norm": 0.8764417767524719,
"learning_rate": 1.8090452261306535e-06,
"loss": 1.1082,
"step": 822
},
{
"epoch": 1.0261845386533666,
"grad_norm": 0.91340172290802,
"learning_rate": 1.798994974874372e-06,
"loss": 1.0617,
"step": 823
},
{
"epoch": 1.027431421446384,
"grad_norm": 0.9357399940490723,
"learning_rate": 1.7889447236180906e-06,
"loss": 1.0514,
"step": 824
},
{
"epoch": 1.0286783042394014,
"grad_norm": 0.993704080581665,
"learning_rate": 1.7788944723618094e-06,
"loss": 1.0921,
"step": 825
},
{
"epoch": 1.0299251870324189,
"grad_norm": 0.9839049577713013,
"learning_rate": 1.768844221105528e-06,
"loss": 1.3393,
"step": 826
},
{
"epoch": 1.0311720698254363,
"grad_norm": 0.847509503364563,
"learning_rate": 1.7587939698492465e-06,
"loss": 0.8505,
"step": 827
},
{
"epoch": 1.032418952618454,
"grad_norm": 0.9607179164886475,
"learning_rate": 1.748743718592965e-06,
"loss": 1.077,
"step": 828
},
{
"epoch": 1.0336658354114714,
"grad_norm": 0.918692946434021,
"learning_rate": 1.7386934673366834e-06,
"loss": 1.2237,
"step": 829
},
{
"epoch": 1.0349127182044888,
"grad_norm": 0.9612240791320801,
"learning_rate": 1.728643216080402e-06,
"loss": 1.1001,
"step": 830
},
{
"epoch": 1.0361596009975063,
"grad_norm": 1.1265960931777954,
"learning_rate": 1.7185929648241205e-06,
"loss": 1.2717,
"step": 831
},
{
"epoch": 1.0374064837905237,
"grad_norm": 1.0081263780593872,
"learning_rate": 1.7085427135678393e-06,
"loss": 1.389,
"step": 832
},
{
"epoch": 1.0386533665835411,
"grad_norm": 0.9362667202949524,
"learning_rate": 1.6984924623115579e-06,
"loss": 1.2143,
"step": 833
},
{
"epoch": 1.0399002493765586,
"grad_norm": 0.9931688904762268,
"learning_rate": 1.6884422110552764e-06,
"loss": 1.3677,
"step": 834
},
{
"epoch": 1.041147132169576,
"grad_norm": 1.0642098188400269,
"learning_rate": 1.678391959798995e-06,
"loss": 1.1892,
"step": 835
},
{
"epoch": 1.0423940149625934,
"grad_norm": 0.9026042222976685,
"learning_rate": 1.6683417085427136e-06,
"loss": 1.0138,
"step": 836
},
{
"epoch": 1.043640897755611,
"grad_norm": 0.8133838772773743,
"learning_rate": 1.6582914572864323e-06,
"loss": 0.9461,
"step": 837
},
{
"epoch": 1.0448877805486285,
"grad_norm": 1.0019959211349487,
"learning_rate": 1.648241206030151e-06,
"loss": 1.2256,
"step": 838
},
{
"epoch": 1.046134663341646,
"grad_norm": 1.0445399284362793,
"learning_rate": 1.6381909547738695e-06,
"loss": 1.2502,
"step": 839
},
{
"epoch": 1.0473815461346634,
"grad_norm": 0.9480726718902588,
"learning_rate": 1.628140703517588e-06,
"loss": 1.0447,
"step": 840
},
{
"epoch": 1.0486284289276808,
"grad_norm": 0.9324421286582947,
"learning_rate": 1.6180904522613066e-06,
"loss": 0.9952,
"step": 841
},
{
"epoch": 1.0498753117206983,
"grad_norm": 0.8906459212303162,
"learning_rate": 1.6080402010050254e-06,
"loss": 1.1185,
"step": 842
},
{
"epoch": 1.0511221945137157,
"grad_norm": 0.9397410154342651,
"learning_rate": 1.597989949748744e-06,
"loss": 0.9051,
"step": 843
},
{
"epoch": 1.0523690773067331,
"grad_norm": 0.8315420150756836,
"learning_rate": 1.5879396984924625e-06,
"loss": 0.9294,
"step": 844
},
{
"epoch": 1.0536159600997506,
"grad_norm": 0.9962395429611206,
"learning_rate": 1.577889447236181e-06,
"loss": 1.2918,
"step": 845
},
{
"epoch": 1.054862842892768,
"grad_norm": 0.9400811195373535,
"learning_rate": 1.5678391959798996e-06,
"loss": 1.0574,
"step": 846
},
{
"epoch": 1.0561097256857854,
"grad_norm": 0.940241813659668,
"learning_rate": 1.5577889447236184e-06,
"loss": 1.2522,
"step": 847
},
{
"epoch": 1.057356608478803,
"grad_norm": 0.8882105350494385,
"learning_rate": 1.547738693467337e-06,
"loss": 1.094,
"step": 848
},
{
"epoch": 1.0586034912718205,
"grad_norm": 1.0110504627227783,
"learning_rate": 1.5376884422110555e-06,
"loss": 1.0726,
"step": 849
},
{
"epoch": 1.059850374064838,
"grad_norm": 0.9382202625274658,
"learning_rate": 1.527638190954774e-06,
"loss": 1.3236,
"step": 850
},
{
"epoch": 1.0610972568578554,
"grad_norm": 0.964337170124054,
"learning_rate": 1.5175879396984927e-06,
"loss": 1.1698,
"step": 851
},
{
"epoch": 1.0623441396508728,
"grad_norm": 1.0176410675048828,
"learning_rate": 1.507537688442211e-06,
"loss": 1.2585,
"step": 852
},
{
"epoch": 1.0635910224438903,
"grad_norm": 0.9323766231536865,
"learning_rate": 1.4974874371859296e-06,
"loss": 1.111,
"step": 853
},
{
"epoch": 1.0648379052369077,
"grad_norm": 1.0200289487838745,
"learning_rate": 1.4874371859296483e-06,
"loss": 1.2257,
"step": 854
},
{
"epoch": 1.0660847880299251,
"grad_norm": 0.9815992116928101,
"learning_rate": 1.477386934673367e-06,
"loss": 0.9851,
"step": 855
},
{
"epoch": 1.0673316708229426,
"grad_norm": 0.9349173903465271,
"learning_rate": 1.4673366834170855e-06,
"loss": 1.0439,
"step": 856
},
{
"epoch": 1.0685785536159602,
"grad_norm": 0.959049642086029,
"learning_rate": 1.457286432160804e-06,
"loss": 1.0026,
"step": 857
},
{
"epoch": 1.0698254364089776,
"grad_norm": 0.9442126750946045,
"learning_rate": 1.4472361809045226e-06,
"loss": 1.0292,
"step": 858
},
{
"epoch": 1.071072319201995,
"grad_norm": 0.8776487112045288,
"learning_rate": 1.4371859296482414e-06,
"loss": 1.0048,
"step": 859
},
{
"epoch": 1.0723192019950125,
"grad_norm": 0.9066770076751709,
"learning_rate": 1.42713567839196e-06,
"loss": 0.9421,
"step": 860
},
{
"epoch": 1.07356608478803,
"grad_norm": 0.9708060026168823,
"learning_rate": 1.4170854271356785e-06,
"loss": 1.1516,
"step": 861
},
{
"epoch": 1.0748129675810474,
"grad_norm": 0.9670248627662659,
"learning_rate": 1.407035175879397e-06,
"loss": 1.0676,
"step": 862
},
{
"epoch": 1.0760598503740648,
"grad_norm": 0.9366007447242737,
"learning_rate": 1.3969849246231156e-06,
"loss": 1.122,
"step": 863
},
{
"epoch": 1.0773067331670823,
"grad_norm": 0.8750181794166565,
"learning_rate": 1.3869346733668342e-06,
"loss": 0.9104,
"step": 864
},
{
"epoch": 1.0785536159600997,
"grad_norm": 0.9705141186714172,
"learning_rate": 1.376884422110553e-06,
"loss": 1.1194,
"step": 865
},
{
"epoch": 1.0798004987531171,
"grad_norm": 0.972750723361969,
"learning_rate": 1.3668341708542715e-06,
"loss": 1.1771,
"step": 866
},
{
"epoch": 1.0810473815461346,
"grad_norm": 0.988541841506958,
"learning_rate": 1.35678391959799e-06,
"loss": 1.1732,
"step": 867
},
{
"epoch": 1.0822942643391522,
"grad_norm": 1.050854206085205,
"learning_rate": 1.3467336683417087e-06,
"loss": 1.1185,
"step": 868
},
{
"epoch": 1.0835411471321696,
"grad_norm": 1.416115164756775,
"learning_rate": 1.3366834170854272e-06,
"loss": 1.105,
"step": 869
},
{
"epoch": 1.084788029925187,
"grad_norm": 0.9432351589202881,
"learning_rate": 1.326633165829146e-06,
"loss": 1.0184,
"step": 870
},
{
"epoch": 1.0860349127182045,
"grad_norm": 0.9624066948890686,
"learning_rate": 1.3165829145728646e-06,
"loss": 1.0313,
"step": 871
},
{
"epoch": 1.087281795511222,
"grad_norm": 1.0634950399398804,
"learning_rate": 1.3065326633165831e-06,
"loss": 1.0587,
"step": 872
},
{
"epoch": 1.0885286783042394,
"grad_norm": 1.0073400735855103,
"learning_rate": 1.2964824120603017e-06,
"loss": 1.292,
"step": 873
},
{
"epoch": 1.0897755610972568,
"grad_norm": 0.902108907699585,
"learning_rate": 1.2864321608040203e-06,
"loss": 1.0376,
"step": 874
},
{
"epoch": 1.0910224438902743,
"grad_norm": 1.0740609169006348,
"learning_rate": 1.2763819095477386e-06,
"loss": 1.3528,
"step": 875
},
{
"epoch": 1.0922693266832917,
"grad_norm": 0.9529274106025696,
"learning_rate": 1.2663316582914572e-06,
"loss": 1.0768,
"step": 876
},
{
"epoch": 1.0935162094763093,
"grad_norm": 0.9792256355285645,
"learning_rate": 1.256281407035176e-06,
"loss": 1.1694,
"step": 877
},
{
"epoch": 1.0947630922693268,
"grad_norm": 1.0662548542022705,
"learning_rate": 1.2462311557788945e-06,
"loss": 1.2568,
"step": 878
},
{
"epoch": 1.0960099750623442,
"grad_norm": 0.9529067873954773,
"learning_rate": 1.2361809045226133e-06,
"loss": 1.2098,
"step": 879
},
{
"epoch": 1.0972568578553616,
"grad_norm": 0.9482414126396179,
"learning_rate": 1.2261306532663318e-06,
"loss": 1.0585,
"step": 880
},
{
"epoch": 1.098503740648379,
"grad_norm": 0.9822997450828552,
"learning_rate": 1.2160804020100502e-06,
"loss": 1.3011,
"step": 881
},
{
"epoch": 1.0997506234413965,
"grad_norm": 0.9486746788024902,
"learning_rate": 1.206030150753769e-06,
"loss": 1.0877,
"step": 882
},
{
"epoch": 1.100997506234414,
"grad_norm": 0.8915067911148071,
"learning_rate": 1.1959798994974875e-06,
"loss": 1.0558,
"step": 883
},
{
"epoch": 1.1022443890274314,
"grad_norm": 1.0369261503219604,
"learning_rate": 1.185929648241206e-06,
"loss": 1.1102,
"step": 884
},
{
"epoch": 1.1034912718204488,
"grad_norm": 1.073757529258728,
"learning_rate": 1.1758793969849247e-06,
"loss": 1.2988,
"step": 885
},
{
"epoch": 1.1047381546134662,
"grad_norm": 0.967126727104187,
"learning_rate": 1.1658291457286432e-06,
"loss": 1.0799,
"step": 886
},
{
"epoch": 1.1059850374064837,
"grad_norm": 1.0032625198364258,
"learning_rate": 1.155778894472362e-06,
"loss": 1.1895,
"step": 887
},
{
"epoch": 1.1072319201995013,
"grad_norm": 0.896228015422821,
"learning_rate": 1.1457286432160806e-06,
"loss": 1.1643,
"step": 888
},
{
"epoch": 1.1084788029925188,
"grad_norm": 1.151524305343628,
"learning_rate": 1.1356783919597991e-06,
"loss": 1.0271,
"step": 889
},
{
"epoch": 1.1097256857855362,
"grad_norm": 0.9097614884376526,
"learning_rate": 1.1256281407035177e-06,
"loss": 1.0671,
"step": 890
},
{
"epoch": 1.1109725685785536,
"grad_norm": 0.9432956576347351,
"learning_rate": 1.1155778894472363e-06,
"loss": 1.0508,
"step": 891
},
{
"epoch": 1.112219451371571,
"grad_norm": 0.9730440974235535,
"learning_rate": 1.105527638190955e-06,
"loss": 1.1427,
"step": 892
},
{
"epoch": 1.1134663341645885,
"grad_norm": 0.9195826649665833,
"learning_rate": 1.0954773869346734e-06,
"loss": 0.9505,
"step": 893
},
{
"epoch": 1.114713216957606,
"grad_norm": 0.8559366464614868,
"learning_rate": 1.085427135678392e-06,
"loss": 1.0993,
"step": 894
},
{
"epoch": 1.1159600997506234,
"grad_norm": 1.0328896045684814,
"learning_rate": 1.0753768844221105e-06,
"loss": 1.1671,
"step": 895
},
{
"epoch": 1.1172069825436408,
"grad_norm": 0.9966070055961609,
"learning_rate": 1.0653266331658293e-06,
"loss": 0.8159,
"step": 896
},
{
"epoch": 1.1184538653366585,
"grad_norm": 0.8760312795639038,
"learning_rate": 1.0552763819095479e-06,
"loss": 1.0193,
"step": 897
},
{
"epoch": 1.119700748129676,
"grad_norm": 0.9578038454055786,
"learning_rate": 1.0452261306532664e-06,
"loss": 1.0885,
"step": 898
},
{
"epoch": 1.1209476309226933,
"grad_norm": 1.0279982089996338,
"learning_rate": 1.035175879396985e-06,
"loss": 1.3416,
"step": 899
},
{
"epoch": 1.1221945137157108,
"grad_norm": 1.0367847681045532,
"learning_rate": 1.0251256281407035e-06,
"loss": 1.1524,
"step": 900
},
{
"epoch": 1.1234413965087282,
"grad_norm": 0.9701456427574158,
"learning_rate": 1.0150753768844223e-06,
"loss": 1.1234,
"step": 901
},
{
"epoch": 1.1246882793017456,
"grad_norm": 0.8951571583747864,
"learning_rate": 1.0050251256281409e-06,
"loss": 1.0297,
"step": 902
},
{
"epoch": 1.125935162094763,
"grad_norm": 0.9758408665657043,
"learning_rate": 9.949748743718594e-07,
"loss": 1.2459,
"step": 903
},
{
"epoch": 1.1271820448877805,
"grad_norm": 1.001495361328125,
"learning_rate": 9.849246231155778e-07,
"loss": 1.1554,
"step": 904
},
{
"epoch": 1.128428927680798,
"grad_norm": 0.9806671142578125,
"learning_rate": 9.748743718592966e-07,
"loss": 1.3005,
"step": 905
},
{
"epoch": 1.1296758104738154,
"grad_norm": 1.1555672883987427,
"learning_rate": 9.648241206030151e-07,
"loss": 1.1304,
"step": 906
},
{
"epoch": 1.1309226932668328,
"grad_norm": 0.900561511516571,
"learning_rate": 9.547738693467337e-07,
"loss": 0.9279,
"step": 907
},
{
"epoch": 1.1321695760598505,
"grad_norm": 0.9637525677680969,
"learning_rate": 9.447236180904524e-07,
"loss": 1.1184,
"step": 908
},
{
"epoch": 1.133416458852868,
"grad_norm": 1.051108479499817,
"learning_rate": 9.346733668341709e-07,
"loss": 1.0561,
"step": 909
},
{
"epoch": 1.1346633416458853,
"grad_norm": 1.0213427543640137,
"learning_rate": 9.246231155778895e-07,
"loss": 1.0418,
"step": 910
},
{
"epoch": 1.1359102244389028,
"grad_norm": 1.1131244897842407,
"learning_rate": 9.145728643216082e-07,
"loss": 1.3049,
"step": 911
},
{
"epoch": 1.1371571072319202,
"grad_norm": 0.8933857083320618,
"learning_rate": 9.045226130653267e-07,
"loss": 0.942,
"step": 912
},
{
"epoch": 1.1384039900249376,
"grad_norm": 0.991568386554718,
"learning_rate": 8.944723618090453e-07,
"loss": 1.034,
"step": 913
},
{
"epoch": 1.139650872817955,
"grad_norm": 1.0073168277740479,
"learning_rate": 8.84422110552764e-07,
"loss": 1.2999,
"step": 914
},
{
"epoch": 1.1408977556109725,
"grad_norm": 0.941887617111206,
"learning_rate": 8.743718592964825e-07,
"loss": 1.0767,
"step": 915
},
{
"epoch": 1.14214463840399,
"grad_norm": 0.8970725536346436,
"learning_rate": 8.64321608040201e-07,
"loss": 0.9196,
"step": 916
},
{
"epoch": 1.1433915211970076,
"grad_norm": 0.8672690391540527,
"learning_rate": 8.542713567839197e-07,
"loss": 0.9363,
"step": 917
},
{
"epoch": 1.144638403990025,
"grad_norm": 1.0099691152572632,
"learning_rate": 8.442211055276382e-07,
"loss": 1.0098,
"step": 918
},
{
"epoch": 1.1458852867830425,
"grad_norm": 0.9472724199295044,
"learning_rate": 8.341708542713568e-07,
"loss": 1.0484,
"step": 919
},
{
"epoch": 1.14713216957606,
"grad_norm": 0.9121310114860535,
"learning_rate": 8.241206030150755e-07,
"loss": 1.1415,
"step": 920
},
{
"epoch": 1.1483790523690773,
"grad_norm": 0.8841565251350403,
"learning_rate": 8.14070351758794e-07,
"loss": 1.0377,
"step": 921
},
{
"epoch": 1.1496259351620948,
"grad_norm": 1.1179547309875488,
"learning_rate": 8.040201005025127e-07,
"loss": 1.3833,
"step": 922
},
{
"epoch": 1.1508728179551122,
"grad_norm": 0.9561689496040344,
"learning_rate": 7.939698492462313e-07,
"loss": 1.0964,
"step": 923
},
{
"epoch": 1.1521197007481296,
"grad_norm": 1.1863527297973633,
"learning_rate": 7.839195979899498e-07,
"loss": 1.259,
"step": 924
},
{
"epoch": 1.153366583541147,
"grad_norm": 0.9494880437850952,
"learning_rate": 7.738693467336685e-07,
"loss": 1.3203,
"step": 925
},
{
"epoch": 1.1546134663341645,
"grad_norm": 0.9367597103118896,
"learning_rate": 7.63819095477387e-07,
"loss": 1.1497,
"step": 926
},
{
"epoch": 1.155860349127182,
"grad_norm": 0.9247015714645386,
"learning_rate": 7.537688442211055e-07,
"loss": 1.2215,
"step": 927
},
{
"epoch": 1.1571072319201996,
"grad_norm": 0.9937137961387634,
"learning_rate": 7.437185929648242e-07,
"loss": 1.255,
"step": 928
},
{
"epoch": 1.158354114713217,
"grad_norm": 0.9087586998939514,
"learning_rate": 7.336683417085427e-07,
"loss": 1.0672,
"step": 929
},
{
"epoch": 1.1596009975062345,
"grad_norm": 0.9231095910072327,
"learning_rate": 7.236180904522613e-07,
"loss": 1.0545,
"step": 930
},
{
"epoch": 1.160847880299252,
"grad_norm": 0.9966895580291748,
"learning_rate": 7.1356783919598e-07,
"loss": 1.1914,
"step": 931
},
{
"epoch": 1.1620947630922693,
"grad_norm": 0.9739454388618469,
"learning_rate": 7.035175879396985e-07,
"loss": 1.1846,
"step": 932
},
{
"epoch": 1.1633416458852868,
"grad_norm": 0.832830548286438,
"learning_rate": 6.934673366834171e-07,
"loss": 1.0775,
"step": 933
},
{
"epoch": 1.1645885286783042,
"grad_norm": 0.9566227793693542,
"learning_rate": 6.834170854271358e-07,
"loss": 0.957,
"step": 934
},
{
"epoch": 1.1658354114713216,
"grad_norm": 1.0254335403442383,
"learning_rate": 6.733668341708543e-07,
"loss": 1.147,
"step": 935
},
{
"epoch": 1.167082294264339,
"grad_norm": 0.9612658619880676,
"learning_rate": 6.63316582914573e-07,
"loss": 1.2579,
"step": 936
},
{
"epoch": 1.1683291770573567,
"grad_norm": 0.804253101348877,
"learning_rate": 6.532663316582916e-07,
"loss": 0.881,
"step": 937
},
{
"epoch": 1.1695760598503742,
"grad_norm": 0.9291054606437683,
"learning_rate": 6.432160804020101e-07,
"loss": 1.0936,
"step": 938
},
{
"epoch": 1.1708229426433916,
"grad_norm": 0.838405191898346,
"learning_rate": 6.331658291457286e-07,
"loss": 0.8982,
"step": 939
},
{
"epoch": 1.172069825436409,
"grad_norm": 0.9158410429954529,
"learning_rate": 6.231155778894473e-07,
"loss": 1.1168,
"step": 940
},
{
"epoch": 1.1733167082294265,
"grad_norm": 0.9490392804145813,
"learning_rate": 6.130653266331659e-07,
"loss": 1.0928,
"step": 941
},
{
"epoch": 1.174563591022444,
"grad_norm": 0.9580014944076538,
"learning_rate": 6.030150753768845e-07,
"loss": 1.1794,
"step": 942
},
{
"epoch": 1.1758104738154613,
"grad_norm": 0.9477241635322571,
"learning_rate": 5.92964824120603e-07,
"loss": 0.9946,
"step": 943
},
{
"epoch": 1.1770573566084788,
"grad_norm": 0.9754717946052551,
"learning_rate": 5.829145728643216e-07,
"loss": 0.937,
"step": 944
},
{
"epoch": 1.1783042394014962,
"grad_norm": 1.1054816246032715,
"learning_rate": 5.728643216080403e-07,
"loss": 1.3298,
"step": 945
},
{
"epoch": 1.1795511221945136,
"grad_norm": 1.0003180503845215,
"learning_rate": 5.628140703517588e-07,
"loss": 1.2283,
"step": 946
},
{
"epoch": 1.180798004987531,
"grad_norm": 1.0288554430007935,
"learning_rate": 5.527638190954775e-07,
"loss": 1.0733,
"step": 947
},
{
"epoch": 1.1820448877805487,
"grad_norm": 0.9287012219429016,
"learning_rate": 5.42713567839196e-07,
"loss": 1.0559,
"step": 948
},
{
"epoch": 1.1832917705735662,
"grad_norm": 0.9414504766464233,
"learning_rate": 5.326633165829146e-07,
"loss": 1.0715,
"step": 949
},
{
"epoch": 1.1845386533665836,
"grad_norm": 1.2494735717773438,
"learning_rate": 5.226130653266332e-07,
"loss": 1.4612,
"step": 950
},
{
"epoch": 1.185785536159601,
"grad_norm": 1.0314311981201172,
"learning_rate": 5.125628140703518e-07,
"loss": 1.1663,
"step": 951
},
{
"epoch": 1.1870324189526185,
"grad_norm": 1.1451873779296875,
"learning_rate": 5.025125628140704e-07,
"loss": 1.2636,
"step": 952
},
{
"epoch": 1.188279301745636,
"grad_norm": 0.95524001121521,
"learning_rate": 4.924623115577889e-07,
"loss": 1.0053,
"step": 953
},
{
"epoch": 1.1895261845386533,
"grad_norm": 1.1086084842681885,
"learning_rate": 4.824120603015076e-07,
"loss": 1.3138,
"step": 954
},
{
"epoch": 1.1907730673316708,
"grad_norm": 1.0394378900527954,
"learning_rate": 4.723618090452262e-07,
"loss": 1.2855,
"step": 955
},
{
"epoch": 1.1920199501246882,
"grad_norm": 0.974128782749176,
"learning_rate": 4.6231155778894475e-07,
"loss": 1.1754,
"step": 956
},
{
"epoch": 1.1932668329177059,
"grad_norm": 0.989520788192749,
"learning_rate": 4.5226130653266337e-07,
"loss": 1.2944,
"step": 957
},
{
"epoch": 1.1945137157107233,
"grad_norm": 0.9629969596862793,
"learning_rate": 4.42211055276382e-07,
"loss": 1.0392,
"step": 958
},
{
"epoch": 1.1957605985037407,
"grad_norm": 1.0542442798614502,
"learning_rate": 4.321608040201005e-07,
"loss": 1.4224,
"step": 959
},
{
"epoch": 1.1970074812967582,
"grad_norm": 0.9659432172775269,
"learning_rate": 4.221105527638191e-07,
"loss": 1.1098,
"step": 960
},
{
"epoch": 1.1982543640897756,
"grad_norm": 0.869775116443634,
"learning_rate": 4.1206030150753773e-07,
"loss": 0.9089,
"step": 961
},
{
"epoch": 1.199501246882793,
"grad_norm": 1.1481077671051025,
"learning_rate": 4.0201005025125634e-07,
"loss": 1.1947,
"step": 962
},
{
"epoch": 1.2007481296758105,
"grad_norm": 0.9253513216972351,
"learning_rate": 3.919597989949749e-07,
"loss": 0.935,
"step": 963
},
{
"epoch": 1.201995012468828,
"grad_norm": 1.0706729888916016,
"learning_rate": 3.819095477386935e-07,
"loss": 0.9812,
"step": 964
},
{
"epoch": 1.2032418952618453,
"grad_norm": 0.9205166697502136,
"learning_rate": 3.718592964824121e-07,
"loss": 1.1668,
"step": 965
},
{
"epoch": 1.2044887780548628,
"grad_norm": 1.062045693397522,
"learning_rate": 3.6180904522613065e-07,
"loss": 1.1189,
"step": 966
},
{
"epoch": 1.2057356608478802,
"grad_norm": 1.0185612440109253,
"learning_rate": 3.5175879396984927e-07,
"loss": 1.263,
"step": 967
},
{
"epoch": 1.2069825436408976,
"grad_norm": 1.0928072929382324,
"learning_rate": 3.417085427135679e-07,
"loss": 1.1389,
"step": 968
},
{
"epoch": 1.2082294264339153,
"grad_norm": 1.0667775869369507,
"learning_rate": 3.316582914572865e-07,
"loss": 1.152,
"step": 969
},
{
"epoch": 1.2094763092269327,
"grad_norm": 0.9324660897254944,
"learning_rate": 3.2160804020100506e-07,
"loss": 0.9736,
"step": 970
},
{
"epoch": 1.2107231920199502,
"grad_norm": 0.8895764946937561,
"learning_rate": 3.1155778894472363e-07,
"loss": 1.1097,
"step": 971
},
{
"epoch": 1.2119700748129676,
"grad_norm": 0.9087288975715637,
"learning_rate": 3.0150753768844224e-07,
"loss": 1.1095,
"step": 972
},
{
"epoch": 1.213216957605985,
"grad_norm": 0.9338837265968323,
"learning_rate": 2.914572864321608e-07,
"loss": 0.9704,
"step": 973
},
{
"epoch": 1.2144638403990025,
"grad_norm": 0.9220953583717346,
"learning_rate": 2.814070351758794e-07,
"loss": 1.1147,
"step": 974
},
{
"epoch": 1.21571072319202,
"grad_norm": 0.9933000802993774,
"learning_rate": 2.71356783919598e-07,
"loss": 1.1277,
"step": 975
},
{
"epoch": 1.2169576059850373,
"grad_norm": 1.0079030990600586,
"learning_rate": 2.613065326633166e-07,
"loss": 0.9931,
"step": 976
},
{
"epoch": 1.218204488778055,
"grad_norm": 0.9079055786132812,
"learning_rate": 2.512562814070352e-07,
"loss": 1.0279,
"step": 977
},
{
"epoch": 1.2194513715710724,
"grad_norm": 0.9380108118057251,
"learning_rate": 2.412060301507538e-07,
"loss": 1.1186,
"step": 978
},
{
"epoch": 1.2206982543640899,
"grad_norm": 1.0020259618759155,
"learning_rate": 2.3115577889447237e-07,
"loss": 1.0412,
"step": 979
},
{
"epoch": 1.2219451371571073,
"grad_norm": 0.9419663548469543,
"learning_rate": 2.21105527638191e-07,
"loss": 1.0396,
"step": 980
},
{
"epoch": 1.2231920199501247,
"grad_norm": 1.0515295267105103,
"learning_rate": 2.1105527638190956e-07,
"loss": 1.2899,
"step": 981
},
{
"epoch": 1.2244389027431422,
"grad_norm": 0.9472441077232361,
"learning_rate": 2.0100502512562817e-07,
"loss": 1.0375,
"step": 982
},
{
"epoch": 1.2256857855361596,
"grad_norm": 0.8925400972366333,
"learning_rate": 1.9095477386934676e-07,
"loss": 0.8216,
"step": 983
},
{
"epoch": 1.226932668329177,
"grad_norm": 0.9674944281578064,
"learning_rate": 1.8090452261306533e-07,
"loss": 1.0872,
"step": 984
},
{
"epoch": 1.2281795511221945,
"grad_norm": 0.8792274594306946,
"learning_rate": 1.7085427135678394e-07,
"loss": 1.0083,
"step": 985
},
{
"epoch": 1.229426433915212,
"grad_norm": 0.929137110710144,
"learning_rate": 1.6080402010050253e-07,
"loss": 0.9598,
"step": 986
},
{
"epoch": 1.2306733167082293,
"grad_norm": 0.9648697376251221,
"learning_rate": 1.5075376884422112e-07,
"loss": 1.1901,
"step": 987
},
{
"epoch": 1.2319201995012468,
"grad_norm": 0.924969494342804,
"learning_rate": 1.407035175879397e-07,
"loss": 1.0915,
"step": 988
},
{
"epoch": 1.2331670822942644,
"grad_norm": 0.930644154548645,
"learning_rate": 1.306532663316583e-07,
"loss": 1.0326,
"step": 989
},
{
"epoch": 1.2344139650872819,
"grad_norm": 1.0661444664001465,
"learning_rate": 1.206030150753769e-07,
"loss": 1.1937,
"step": 990
},
{
"epoch": 1.2356608478802993,
"grad_norm": 0.9451942443847656,
"learning_rate": 1.105527638190955e-07,
"loss": 1.0124,
"step": 991
},
{
"epoch": 1.2369077306733167,
"grad_norm": 0.9993079304695129,
"learning_rate": 1.0050251256281409e-07,
"loss": 1.3394,
"step": 992
},
{
"epoch": 1.2381546134663342,
"grad_norm": 0.8736663460731506,
"learning_rate": 9.045226130653266e-08,
"loss": 1.0151,
"step": 993
},
{
"epoch": 1.2394014962593516,
"grad_norm": 1.0243713855743408,
"learning_rate": 8.040201005025127e-08,
"loss": 1.1272,
"step": 994
},
{
"epoch": 1.240648379052369,
"grad_norm": 0.9341378211975098,
"learning_rate": 7.035175879396986e-08,
"loss": 1.0561,
"step": 995
},
{
"epoch": 1.2418952618453865,
"grad_norm": 0.8320474624633789,
"learning_rate": 6.030150753768845e-08,
"loss": 0.9748,
"step": 996
},
{
"epoch": 1.2431421446384041,
"grad_norm": 0.9634078145027161,
"learning_rate": 5.025125628140704e-08,
"loss": 1.0482,
"step": 997
},
{
"epoch": 1.2443890274314215,
"grad_norm": 0.9500167965888977,
"learning_rate": 4.020100502512563e-08,
"loss": 1.0441,
"step": 998
},
{
"epoch": 1.245635910224439,
"grad_norm": 1.0087471008300781,
"learning_rate": 3.015075376884422e-08,
"loss": 1.1111,
"step": 999
},
{
"epoch": 1.2468827930174564,
"grad_norm": 0.8897825479507446,
"learning_rate": 2.0100502512562817e-08,
"loss": 0.9434,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1068487782293504e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}