{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999881495526456,
"eval_steps": 500,
"global_step": 56256,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017775671031581442,
"grad_norm": 1.7014890909194946,
"learning_rate": 2.5e-06,
"loss": 2.2156,
"step": 100
},
{
"epoch": 0.0035551342063162884,
"grad_norm": 1.0592787265777588,
"learning_rate": 5e-06,
"loss": 2.1147,
"step": 200
},
{
"epoch": 0.005332701309474433,
"grad_norm": 0.7293457388877869,
"learning_rate": 7.5e-06,
"loss": 1.9982,
"step": 300
},
{
"epoch": 0.007110268412632577,
"grad_norm": 0.5222712755203247,
"learning_rate": 1e-05,
"loss": 1.9311,
"step": 400
},
{
"epoch": 0.008887835515790721,
"grad_norm": 0.37243402004241943,
"learning_rate": 1.25e-05,
"loss": 1.8657,
"step": 500
},
{
"epoch": 0.010665402618948865,
"grad_norm": 0.47352147102355957,
"learning_rate": 1.5e-05,
"loss": 1.8218,
"step": 600
},
{
"epoch": 0.01244296972210701,
"grad_norm": 0.2529783248901367,
"learning_rate": 1.75e-05,
"loss": 1.7801,
"step": 700
},
{
"epoch": 0.014220536825265154,
"grad_norm": 0.2332669496536255,
"learning_rate": 2e-05,
"loss": 1.7856,
"step": 800
},
{
"epoch": 0.0159981039284233,
"grad_norm": 0.3814944326877594,
"learning_rate": 2.25e-05,
"loss": 1.7645,
"step": 900
},
{
"epoch": 0.017775671031581442,
"grad_norm": 0.17010626196861267,
"learning_rate": 2.5e-05,
"loss": 1.7822,
"step": 1000
},
{
"epoch": 0.019553238134739588,
"grad_norm": 0.1512402594089508,
"learning_rate": 2.7500000000000004e-05,
"loss": 1.763,
"step": 1100
},
{
"epoch": 0.02133080523789773,
"grad_norm": 0.18556953966617584,
"learning_rate": 3e-05,
"loss": 1.7465,
"step": 1200
},
{
"epoch": 0.023108372341055877,
"grad_norm": 0.16216696798801422,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.752,
"step": 1300
},
{
"epoch": 0.02488593944421402,
"grad_norm": 0.21479512751102448,
"learning_rate": 3.5e-05,
"loss": 1.7313,
"step": 1400
},
{
"epoch": 0.026663506547372165,
"grad_norm": 0.1630880981683731,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.7159,
"step": 1500
},
{
"epoch": 0.028441073650530307,
"grad_norm": 0.152820885181427,
"learning_rate": 4e-05,
"loss": 1.7233,
"step": 1600
},
{
"epoch": 0.030218640753688453,
"grad_norm": 0.15548893809318542,
"learning_rate": 4.25e-05,
"loss": 1.7333,
"step": 1700
},
{
"epoch": 0.0319962078568466,
"grad_norm": 0.437898188829422,
"learning_rate": 4.5e-05,
"loss": 1.7188,
"step": 1800
},
{
"epoch": 0.03377377496000474,
"grad_norm": 0.18985818326473236,
"learning_rate": 4.75e-05,
"loss": 1.7269,
"step": 1900
},
{
"epoch": 0.035551342063162884,
"grad_norm": 0.21630148589611053,
"learning_rate": 5e-05,
"loss": 1.7373,
"step": 2000
},
{
"epoch": 0.03732890916632103,
"grad_norm": 0.14671586453914642,
"learning_rate": 4.9999580904497634e-05,
"loss": 1.7236,
"step": 2100
},
{
"epoch": 0.039106476269479176,
"grad_norm": 0.209241583943367,
"learning_rate": 4.99983236320418e-05,
"loss": 1.7082,
"step": 2200
},
{
"epoch": 0.04088404337263732,
"grad_norm": 0.20861445367336273,
"learning_rate": 4.9996228224785886e-05,
"loss": 1.7319,
"step": 2300
},
{
"epoch": 0.04266161047579546,
"grad_norm": 0.1577518880367279,
"learning_rate": 4.999329475298396e-05,
"loss": 1.7084,
"step": 2400
},
{
"epoch": 0.044439177578953604,
"grad_norm": 0.14321212470531464,
"learning_rate": 4.998952331498839e-05,
"loss": 1.715,
"step": 2500
},
{
"epoch": 0.04621674468211175,
"grad_norm": 0.16167956590652466,
"learning_rate": 4.99849140372466e-05,
"loss": 1.7072,
"step": 2600
},
{
"epoch": 0.047994311785269896,
"grad_norm": 0.14405596256256104,
"learning_rate": 4.9979467074296805e-05,
"loss": 1.7354,
"step": 2700
},
{
"epoch": 0.04977187888842804,
"grad_norm": 0.18818779289722443,
"learning_rate": 4.9973182608762805e-05,
"loss": 1.7246,
"step": 2800
},
{
"epoch": 0.05154944599158618,
"grad_norm": 0.14068296551704407,
"learning_rate": 4.996606085134791e-05,
"loss": 1.7138,
"step": 2900
},
{
"epoch": 0.05332701309474433,
"grad_norm": 0.1438419222831726,
"learning_rate": 4.995810204082784e-05,
"loss": 1.7085,
"step": 3000
},
{
"epoch": 0.05510458019790247,
"grad_norm": 0.1487807333469391,
"learning_rate": 4.994930644404272e-05,
"loss": 1.703,
"step": 3100
},
{
"epoch": 0.056882147301060615,
"grad_norm": 0.1583404541015625,
"learning_rate": 4.993967435588816e-05,
"loss": 1.7036,
"step": 3200
},
{
"epoch": 0.05865971440421876,
"grad_norm": 0.25886544585227966,
"learning_rate": 4.992920609930535e-05,
"loss": 1.6993,
"step": 3300
},
{
"epoch": 0.06043728150737691,
"grad_norm": 0.18560856580734253,
"learning_rate": 4.991790202527022e-05,
"loss": 1.7111,
"step": 3400
},
{
"epoch": 0.06221484861053505,
"grad_norm": 0.14303149282932281,
"learning_rate": 4.990576251278172e-05,
"loss": 1.7104,
"step": 3500
},
{
"epoch": 0.0639924157136932,
"grad_norm": 0.1497068852186203,
"learning_rate": 4.9892787968849033e-05,
"loss": 1.7038,
"step": 3600
},
{
"epoch": 0.06576998281685134,
"grad_norm": 0.17193421721458435,
"learning_rate": 4.987897882847801e-05,
"loss": 1.6955,
"step": 3700
},
{
"epoch": 0.06754754992000948,
"grad_norm": 0.19913499057292938,
"learning_rate": 4.9864335554656526e-05,
"loss": 1.7009,
"step": 3800
},
{
"epoch": 0.06932511702316763,
"grad_norm": 0.15236733853816986,
"learning_rate": 4.984885863833901e-05,
"loss": 1.6994,
"step": 3900
},
{
"epoch": 0.07110268412632577,
"grad_norm": 0.18027468025684357,
"learning_rate": 4.9832548598429955e-05,
"loss": 1.6974,
"step": 4000
},
{
"epoch": 0.07288025122948391,
"grad_norm": 0.19036361575126648,
"learning_rate": 4.981540598176649e-05,
"loss": 1.6957,
"step": 4100
},
{
"epoch": 0.07465781833264205,
"grad_norm": 0.18389485776424408,
"learning_rate": 4.979743136310011e-05,
"loss": 1.7133,
"step": 4200
},
{
"epoch": 0.0764353854358002,
"grad_norm": 0.16271525621414185,
"learning_rate": 4.977862534507735e-05,
"loss": 1.7093,
"step": 4300
},
{
"epoch": 0.07821295253895835,
"grad_norm": 0.15076510608196259,
"learning_rate": 4.975898855821964e-05,
"loss": 1.7007,
"step": 4400
},
{
"epoch": 0.0799905196421165,
"grad_norm": 0.6439979672431946,
"learning_rate": 4.9738521660902074e-05,
"loss": 1.7049,
"step": 4500
},
{
"epoch": 0.08176808674527464,
"grad_norm": 0.23859179019927979,
"learning_rate": 4.971722533933144e-05,
"loss": 1.7128,
"step": 4600
},
{
"epoch": 0.08354565384843278,
"grad_norm": 0.18766574561595917,
"learning_rate": 4.969510030752314e-05,
"loss": 1.6976,
"step": 4700
},
{
"epoch": 0.08532322095159092,
"grad_norm": 0.1395421326160431,
"learning_rate": 4.9672147307277285e-05,
"loss": 1.6957,
"step": 4800
},
{
"epoch": 0.08710078805474906,
"grad_norm": 0.15366794168949127,
"learning_rate": 4.9648367108153795e-05,
"loss": 1.6966,
"step": 4900
},
{
"epoch": 0.08887835515790721,
"grad_norm": 0.14610810577869415,
"learning_rate": 4.9623760507446646e-05,
"loss": 1.6964,
"step": 5000
},
{
"epoch": 0.09065592226106535,
"grad_norm": 0.19104835391044617,
"learning_rate": 4.9598328330157084e-05,
"loss": 1.697,
"step": 5100
},
{
"epoch": 0.0924334893642235,
"grad_norm": 0.15467491745948792,
"learning_rate": 4.957207142896599e-05,
"loss": 1.7051,
"step": 5200
},
{
"epoch": 0.09421105646738165,
"grad_norm": 0.15119831264019012,
"learning_rate": 4.9544990684205324e-05,
"loss": 1.6961,
"step": 5300
},
{
"epoch": 0.09598862357053979,
"grad_norm": 0.13215667009353638,
"learning_rate": 4.951708700382853e-05,
"loss": 1.6961,
"step": 5400
},
{
"epoch": 0.09776619067369793,
"grad_norm": 0.1464473158121109,
"learning_rate": 4.948836132338017e-05,
"loss": 1.6968,
"step": 5500
},
{
"epoch": 0.09954375777685608,
"grad_norm": 0.17705056071281433,
"learning_rate": 4.945881460596453e-05,
"loss": 1.7023,
"step": 5600
},
{
"epoch": 0.10132132488001422,
"grad_norm": 0.1780652105808258,
"learning_rate": 4.942844784221331e-05,
"loss": 1.7127,
"step": 5700
},
{
"epoch": 0.10309889198317236,
"grad_norm": 0.13343891501426697,
"learning_rate": 4.9397262050252444e-05,
"loss": 1.6882,
"step": 5800
},
{
"epoch": 0.1048764590863305,
"grad_norm": 0.14272858202457428,
"learning_rate": 4.9365258275667935e-05,
"loss": 1.7006,
"step": 5900
},
{
"epoch": 0.10665402618948866,
"grad_norm": 0.1572321206331253,
"learning_rate": 4.933243759147084e-05,
"loss": 1.6909,
"step": 6000
},
{
"epoch": 0.1084315932926468,
"grad_norm": 0.1878873109817505,
"learning_rate": 4.9298801098061234e-05,
"loss": 1.7001,
"step": 6100
},
{
"epoch": 0.11020916039580494,
"grad_norm": 0.15412138402462006,
"learning_rate": 4.926434992319137e-05,
"loss": 1.7009,
"step": 6200
},
{
"epoch": 0.11198672749896309,
"grad_norm": 0.16579018533229828,
"learning_rate": 4.922908522192785e-05,
"loss": 1.6903,
"step": 6300
},
{
"epoch": 0.11376429460212123,
"grad_norm": 0.176075279712677,
"learning_rate": 4.919300817661288e-05,
"loss": 1.6814,
"step": 6400
},
{
"epoch": 0.11554186170527937,
"grad_norm": 0.1489766240119934,
"learning_rate": 4.9156119996824646e-05,
"loss": 1.6834,
"step": 6500
},
{
"epoch": 0.11731942880843751,
"grad_norm": 0.14244747161865234,
"learning_rate": 4.911842191933679e-05,
"loss": 1.698,
"step": 6600
},
{
"epoch": 0.11909699591159566,
"grad_norm": 0.18538010120391846,
"learning_rate": 4.9079915208076874e-05,
"loss": 1.7075,
"step": 6700
},
{
"epoch": 0.12087456301475381,
"grad_norm": 0.13722339272499084,
"learning_rate": 4.9040601154084064e-05,
"loss": 1.6904,
"step": 6800
},
{
"epoch": 0.12265213011791196,
"grad_norm": 0.14853331446647644,
"learning_rate": 4.900048107546581e-05,
"loss": 1.7006,
"step": 6900
},
{
"epoch": 0.1244296972210701,
"grad_norm": 0.1475294679403305,
"learning_rate": 4.895955631735369e-05,
"loss": 1.7084,
"step": 7000
},
{
"epoch": 0.12620726432422824,
"grad_norm": 0.16500729322433472,
"learning_rate": 4.8917828251858245e-05,
"loss": 1.6824,
"step": 7100
},
{
"epoch": 0.1279848314273864,
"grad_norm": 0.14395256340503693,
"learning_rate": 4.8875727542547924e-05,
"loss": 1.6846,
"step": 7200
},
{
"epoch": 0.12976239853054453,
"grad_norm": 0.14854487776756287,
"learning_rate": 4.8832405083980224e-05,
"loss": 1.72,
"step": 7300
},
{
"epoch": 0.13153996563370268,
"grad_norm": 0.13804668188095093,
"learning_rate": 4.8788283581110025e-05,
"loss": 1.6994,
"step": 7400
},
{
"epoch": 0.1333175327368608,
"grad_norm": 0.19897769391536713,
"learning_rate": 4.874336451322718e-05,
"loss": 1.6748,
"step": 7500
},
{
"epoch": 0.13509509984001897,
"grad_norm": 0.18809333443641663,
"learning_rate": 4.869764938636205e-05,
"loss": 1.7039,
"step": 7600
},
{
"epoch": 0.1368726669431771,
"grad_norm": 0.15036119520664215,
"learning_rate": 4.865113973323494e-05,
"loss": 1.6873,
"step": 7700
},
{
"epoch": 0.13865023404633525,
"grad_norm": 0.24881285429000854,
"learning_rate": 4.8603837113204786e-05,
"loss": 1.7069,
"step": 7800
},
{
"epoch": 0.14042780114949338,
"grad_norm": 0.13876497745513916,
"learning_rate": 4.85557431122168e-05,
"loss": 1.6825,
"step": 7900
},
{
"epoch": 0.14220536825265154,
"grad_norm": 0.1649981439113617,
"learning_rate": 4.850685934274935e-05,
"loss": 1.6943,
"step": 8000
},
{
"epoch": 0.1439829353558097,
"grad_norm": 0.14828725159168243,
"learning_rate": 4.845718744375987e-05,
"loss": 1.6928,
"step": 8100
},
{
"epoch": 0.14576050245896782,
"grad_norm": 0.15515898168087006,
"learning_rate": 4.84067290806299e-05,
"loss": 1.6938,
"step": 8200
},
{
"epoch": 0.14753806956212598,
"grad_norm": 0.21222877502441406,
"learning_rate": 4.83554859451093e-05,
"loss": 1.6775,
"step": 8300
},
{
"epoch": 0.1493156366652841,
"grad_norm": 0.14965397119522095,
"learning_rate": 4.830345975525948e-05,
"loss": 1.6952,
"step": 8400
},
{
"epoch": 0.15109320376844226,
"grad_norm": 0.1583070456981659,
"learning_rate": 4.8250652255395806e-05,
"loss": 1.6856,
"step": 8500
},
{
"epoch": 0.1528707708716004,
"grad_norm": 0.1827002763748169,
"learning_rate": 4.819706521602914e-05,
"loss": 1.696,
"step": 8600
},
{
"epoch": 0.15464833797475855,
"grad_norm": 0.21312415599822998,
"learning_rate": 4.8142700433806456e-05,
"loss": 1.6839,
"step": 8700
},
{
"epoch": 0.1564259050779167,
"grad_norm": 0.14075049757957458,
"learning_rate": 4.80875597314506e-05,
"loss": 1.6846,
"step": 8800
},
{
"epoch": 0.15820347218107483,
"grad_norm": 0.15312770009040833,
"learning_rate": 4.8031644957699214e-05,
"loss": 1.6856,
"step": 8900
},
{
"epoch": 0.159981039284233,
"grad_norm": 0.16638757288455963,
"learning_rate": 4.797495798724271e-05,
"loss": 1.6922,
"step": 9000
},
{
"epoch": 0.16175860638739112,
"grad_norm": 0.13447363674640656,
"learning_rate": 4.791750072066143e-05,
"loss": 1.6845,
"step": 9100
},
{
"epoch": 0.16353617349054927,
"grad_norm": 0.1486334651708603,
"learning_rate": 4.785927508436194e-05,
"loss": 1.6966,
"step": 9200
},
{
"epoch": 0.1653137405937074,
"grad_norm": 0.1405581384897232,
"learning_rate": 4.780028303051243e-05,
"loss": 1.6883,
"step": 9300
},
{
"epoch": 0.16709130769686556,
"grad_norm": 0.1692507416009903,
"learning_rate": 4.774052653697725e-05,
"loss": 1.6829,
"step": 9400
},
{
"epoch": 0.1688688748000237,
"grad_norm": 0.17827360332012177,
"learning_rate": 4.76800076072506e-05,
"loss": 1.698,
"step": 9500
},
{
"epoch": 0.17064644190318184,
"grad_norm": 0.1813431978225708,
"learning_rate": 4.7618728270389405e-05,
"loss": 1.6936,
"step": 9600
},
{
"epoch": 0.17242400900634,
"grad_norm": 0.15732981264591217,
"learning_rate": 4.755669058094521e-05,
"loss": 1.6756,
"step": 9700
},
{
"epoch": 0.17420157610949813,
"grad_norm": 0.1365622580051422,
"learning_rate": 4.749389661889535e-05,
"loss": 1.6869,
"step": 9800
},
{
"epoch": 0.17597914321265629,
"grad_norm": 0.14390863478183746,
"learning_rate": 4.7430348489573175e-05,
"loss": 1.6986,
"step": 9900
},
{
"epoch": 0.17775671031581441,
"grad_norm": 0.17032405734062195,
"learning_rate": 4.7366048323597524e-05,
"loss": 1.6997,
"step": 10000
},
{
"epoch": 0.17953427741897257,
"grad_norm": 0.15666988492012024,
"learning_rate": 4.73009982768012e-05,
"loss": 1.6908,
"step": 10100
},
{
"epoch": 0.1813118445221307,
"grad_norm": 0.12964856624603271,
"learning_rate": 4.723520053015879e-05,
"loss": 1.676,
"step": 10200
},
{
"epoch": 0.18308941162528886,
"grad_norm": 0.15315160155296326,
"learning_rate": 4.716865728971346e-05,
"loss": 1.6899,
"step": 10300
},
{
"epoch": 0.184866978728447,
"grad_norm": 0.17329467833042145,
"learning_rate": 4.710137078650302e-05,
"loss": 1.6755,
"step": 10400
},
{
"epoch": 0.18664454583160514,
"grad_norm": 0.16102010011672974,
"learning_rate": 4.703334327648516e-05,
"loss": 1.6779,
"step": 10500
},
{
"epoch": 0.1884221129347633,
"grad_norm": 0.170249804854393,
"learning_rate": 4.6964577040461745e-05,
"loss": 1.7001,
"step": 10600
},
{
"epoch": 0.19019968003792143,
"grad_norm": 0.14801470935344696,
"learning_rate": 4.689507438400239e-05,
"loss": 1.6881,
"step": 10700
},
{
"epoch": 0.19197724714107958,
"grad_norm": 0.2009027749300003,
"learning_rate": 4.682483763736718e-05,
"loss": 1.6944,
"step": 10800
},
{
"epoch": 0.1937548142442377,
"grad_norm": 0.15776540338993073,
"learning_rate": 4.6753869155428454e-05,
"loss": 1.6849,
"step": 10900
},
{
"epoch": 0.19553238134739587,
"grad_norm": 0.1666073054075241,
"learning_rate": 4.6682171317591947e-05,
"loss": 1.6986,
"step": 11000
},
{
"epoch": 0.197309948450554,
"grad_norm": 0.204326793551445,
"learning_rate": 4.660974652771698e-05,
"loss": 1.6927,
"step": 11100
},
{
"epoch": 0.19908751555371215,
"grad_norm": 0.17319276928901672,
"learning_rate": 4.653659721403583e-05,
"loss": 1.6804,
"step": 11200
},
{
"epoch": 0.2008650826568703,
"grad_norm": 0.19199158251285553,
"learning_rate": 4.6462725829072386e-05,
"loss": 1.6692,
"step": 11300
},
{
"epoch": 0.20264264976002844,
"grad_norm": 0.15492092072963715,
"learning_rate": 4.638813484955985e-05,
"loss": 1.695,
"step": 11400
},
{
"epoch": 0.2044202168631866,
"grad_norm": 0.2306402027606964,
"learning_rate": 4.631282677635775e-05,
"loss": 1.7068,
"step": 11500
},
{
"epoch": 0.20619778396634472,
"grad_norm": 0.20894396305084229,
"learning_rate": 4.62375678895541e-05,
"loss": 1.7145,
"step": 11600
},
{
"epoch": 0.20797535106950288,
"grad_norm": 0.31019458174705505,
"learning_rate": 4.616084033514059e-05,
"loss": 1.688,
"step": 11700
},
{
"epoch": 0.209752918172661,
"grad_norm": 0.22205297648906708,
"learning_rate": 4.6083403307686204e-05,
"loss": 1.6989,
"step": 11800
},
{
"epoch": 0.21153048527581916,
"grad_norm": 0.15302753448486328,
"learning_rate": 4.600525940347174e-05,
"loss": 1.6929,
"step": 11900
},
{
"epoch": 0.21330805237897732,
"grad_norm": 0.1468563824892044,
"learning_rate": 4.5926411242477904e-05,
"loss": 1.6924,
"step": 12000
},
{
"epoch": 0.21508561948213545,
"grad_norm": 0.1425103396177292,
"learning_rate": 4.584686146829748e-05,
"loss": 1.6904,
"step": 12100
},
{
"epoch": 0.2168631865852936,
"grad_norm": 0.1582684963941574,
"learning_rate": 4.5766612748046654e-05,
"loss": 1.6804,
"step": 12200
},
{
"epoch": 0.21864075368845173,
"grad_norm": 0.16768227517604828,
"learning_rate": 4.5685667772275654e-05,
"loss": 1.6796,
"step": 12300
},
{
"epoch": 0.2204183207916099,
"grad_norm": 0.1611669808626175,
"learning_rate": 4.56040292548785e-05,
"loss": 1.6749,
"step": 12400
},
{
"epoch": 0.22219588789476802,
"grad_norm": 0.13350994884967804,
"learning_rate": 4.5521699933002026e-05,
"loss": 1.7013,
"step": 12500
},
{
"epoch": 0.22397345499792617,
"grad_norm": 0.14940309524536133,
"learning_rate": 4.5438682566954124e-05,
"loss": 1.6814,
"step": 12600
},
{
"epoch": 0.2257510221010843,
"grad_norm": 0.13618171215057373,
"learning_rate": 4.5354979940111166e-05,
"loss": 1.6852,
"step": 12700
},
{
"epoch": 0.22752858920424246,
"grad_norm": 0.13858729600906372,
"learning_rate": 4.52705948588247e-05,
"loss": 1.7117,
"step": 12800
},
{
"epoch": 0.22930615630740062,
"grad_norm": 0.1507061868906021,
"learning_rate": 4.518553015232737e-05,
"loss": 1.6789,
"step": 12900
},
{
"epoch": 0.23108372341055874,
"grad_norm": 0.17016680538654327,
"learning_rate": 4.5099788672638064e-05,
"loss": 1.6925,
"step": 13000
},
{
"epoch": 0.2328612905137169,
"grad_norm": 0.1454281359910965,
"learning_rate": 4.501337329446625e-05,
"loss": 1.6942,
"step": 13100
},
{
"epoch": 0.23463885761687503,
"grad_norm": 0.13199830055236816,
"learning_rate": 4.492628691511563e-05,
"loss": 1.6844,
"step": 13200
},
{
"epoch": 0.23641642472003319,
"grad_norm": 0.1504441648721695,
"learning_rate": 4.483853245438702e-05,
"loss": 1.6803,
"step": 13300
},
{
"epoch": 0.23819399182319131,
"grad_norm": 0.14603202044963837,
"learning_rate": 4.4750112854480376e-05,
"loss": 1.6776,
"step": 13400
},
{
"epoch": 0.23997155892634947,
"grad_norm": 0.20005132257938385,
"learning_rate": 4.466103107989624e-05,
"loss": 1.6995,
"step": 13500
},
{
"epoch": 0.24174912602950763,
"grad_norm": 0.20756611227989197,
"learning_rate": 4.457129011733629e-05,
"loss": 1.691,
"step": 13600
},
{
"epoch": 0.24352669313266576,
"grad_norm": 0.1558232605457306,
"learning_rate": 4.448089297560325e-05,
"loss": 1.6815,
"step": 13700
},
{
"epoch": 0.2453042602358239,
"grad_norm": 0.18202444911003113,
"learning_rate": 4.4389842685499944e-05,
"loss": 1.6758,
"step": 13800
},
{
"epoch": 0.24708182733898204,
"grad_norm": 0.1685715764760971,
"learning_rate": 4.429814229972775e-05,
"loss": 1.684,
"step": 13900
},
{
"epoch": 0.2488593944421402,
"grad_norm": 0.1511525958776474,
"learning_rate": 4.420579489278419e-05,
"loss": 1.672,
"step": 14000
},
{
"epoch": 0.25063696154529835,
"grad_norm": 0.13901682198047638,
"learning_rate": 4.411280356085991e-05,
"loss": 1.6787,
"step": 14100
},
{
"epoch": 0.2524145286484565,
"grad_norm": 0.15039555728435516,
"learning_rate": 4.4019171421734826e-05,
"loss": 1.6854,
"step": 14200
},
{
"epoch": 0.2541920957516146,
"grad_norm": 0.14443428814411163,
"learning_rate": 4.392490161467361e-05,
"loss": 1.692,
"step": 14300
},
{
"epoch": 0.2559696628547728,
"grad_norm": 0.1846003532409668,
"learning_rate": 4.382999730032042e-05,
"loss": 1.6828,
"step": 14400
},
{
"epoch": 0.2577472299579309,
"grad_norm": 0.1854531168937683,
"learning_rate": 4.3734461660592985e-05,
"loss": 1.687,
"step": 14500
},
{
"epoch": 0.25952479706108905,
"grad_norm": 0.21927309036254883,
"learning_rate": 4.363829789857584e-05,
"loss": 1.6873,
"step": 14600
},
{
"epoch": 0.2613023641642472,
"grad_norm": 0.22467108070850372,
"learning_rate": 4.3541509238413e-05,
"loss": 1.6893,
"step": 14700
},
{
"epoch": 0.26307993126740536,
"grad_norm": 0.20354901254177094,
"learning_rate": 4.344409892519985e-05,
"loss": 1.6937,
"step": 14800
},
{
"epoch": 0.2648574983705635,
"grad_norm": 0.15710541605949402,
"learning_rate": 4.3346070224874304e-05,
"loss": 1.6897,
"step": 14900
},
{
"epoch": 0.2666350654737216,
"grad_norm": 0.16541948914527893,
"learning_rate": 4.3247426424107364e-05,
"loss": 1.6786,
"step": 15000
},
{
"epoch": 0.26841263257687975,
"grad_norm": 0.2642144560813904,
"learning_rate": 4.314817083019289e-05,
"loss": 1.6734,
"step": 15100
},
{
"epoch": 0.27019019968003793,
"grad_norm": 0.15868282318115234,
"learning_rate": 4.3048306770936716e-05,
"loss": 1.6839,
"step": 15200
},
{
"epoch": 0.27196776678319606,
"grad_norm": 0.20308874547481537,
"learning_rate": 4.2947837594545094e-05,
"loss": 1.6897,
"step": 15300
},
{
"epoch": 0.2737453338863542,
"grad_norm": 0.1677379161119461,
"learning_rate": 4.2847780346308484e-05,
"loss": 1.6795,
"step": 15400
},
{
"epoch": 0.2755229009895124,
"grad_norm": 0.14271363615989685,
"learning_rate": 4.27461170280642e-05,
"loss": 1.6788,
"step": 15500
},
{
"epoch": 0.2773004680926705,
"grad_norm": 0.16974543035030365,
"learning_rate": 4.2643858724393424e-05,
"loss": 1.6868,
"step": 15600
},
{
"epoch": 0.27907803519582863,
"grad_norm": 0.15350034832954407,
"learning_rate": 4.254100886377579e-05,
"loss": 1.6737,
"step": 15700
},
{
"epoch": 0.28085560229898676,
"grad_norm": 0.18880531191825867,
"learning_rate": 4.2437570894524404e-05,
"loss": 1.6816,
"step": 15800
},
{
"epoch": 0.28263316940214495,
"grad_norm": 0.14773619174957275,
"learning_rate": 4.233354828467028e-05,
"loss": 1.6799,
"step": 15900
},
{
"epoch": 0.2844107365053031,
"grad_norm": 0.1591775268316269,
"learning_rate": 4.2228944521846054e-05,
"loss": 1.6704,
"step": 16000
},
{
"epoch": 0.2861883036084612,
"grad_norm": 0.1422175019979477,
"learning_rate": 4.2123763113169053e-05,
"loss": 1.6882,
"step": 16100
},
{
"epoch": 0.2879658707116194,
"grad_norm": 0.1634337157011032,
"learning_rate": 4.2018007585123695e-05,
"loss": 1.6716,
"step": 16200
},
{
"epoch": 0.2897434378147775,
"grad_norm": 0.1616571843624115,
"learning_rate": 4.1911681483443284e-05,
"loss": 1.6814,
"step": 16300
},
{
"epoch": 0.29152100491793564,
"grad_norm": 0.1432926207780838,
"learning_rate": 4.180478837299109e-05,
"loss": 1.6781,
"step": 16400
},
{
"epoch": 0.2932985720210938,
"grad_norm": 0.14793144166469574,
"learning_rate": 4.1697331837640866e-05,
"loss": 1.675,
"step": 16500
},
{
"epoch": 0.29507613912425196,
"grad_norm": 0.14463911950588226,
"learning_rate": 4.158931548015665e-05,
"loss": 1.6866,
"step": 16600
},
{
"epoch": 0.2968537062274101,
"grad_norm": 0.14069664478302002,
"learning_rate": 4.148074292207203e-05,
"loss": 1.6848,
"step": 16700
},
{
"epoch": 0.2986312733305682,
"grad_norm": 0.16380813717842102,
"learning_rate": 4.137161780356866e-05,
"loss": 1.6676,
"step": 16800
},
{
"epoch": 0.3004088404337264,
"grad_norm": 0.16407877206802368,
"learning_rate": 4.126304322856126e-05,
"loss": 1.6757,
"step": 16900
},
{
"epoch": 0.3021864075368845,
"grad_norm": 0.1595907211303711,
"learning_rate": 4.1152829417731065e-05,
"loss": 1.6894,
"step": 17000
},
{
"epoch": 0.30396397464004266,
"grad_norm": 0.1606622189283371,
"learning_rate": 4.104207404064811e-05,
"loss": 1.675,
"step": 17100
},
{
"epoch": 0.3057415417432008,
"grad_norm": 0.1525093913078308,
"learning_rate": 4.093078081067882e-05,
"loss": 1.6864,
"step": 17200
},
{
"epoch": 0.30751910884635897,
"grad_norm": 0.18236620724201202,
"learning_rate": 4.081895345922257e-05,
"loss": 1.6756,
"step": 17300
},
{
"epoch": 0.3092966759495171,
"grad_norm": 0.1441909819841385,
"learning_rate": 4.070659573558656e-05,
"loss": 1.6889,
"step": 17400
},
{
"epoch": 0.3110742430526752,
"grad_norm": 0.182451993227005,
"learning_rate": 4.059371140686013e-05,
"loss": 1.6873,
"step": 17500
},
{
"epoch": 0.3128518101558334,
"grad_norm": 0.17770905792713165,
"learning_rate": 4.048030425778841e-05,
"loss": 1.6881,
"step": 17600
},
{
"epoch": 0.31462937725899154,
"grad_norm": 0.14115692675113678,
"learning_rate": 4.0366378090645516e-05,
"loss": 1.6789,
"step": 17700
},
{
"epoch": 0.31640694436214967,
"grad_norm": 0.1899385303258896,
"learning_rate": 4.0251936725106985e-05,
"loss": 1.6796,
"step": 17800
},
{
"epoch": 0.3181845114653078,
"grad_norm": 0.15735557675361633,
"learning_rate": 4.013698399812173e-05,
"loss": 1.6774,
"step": 17900
},
{
"epoch": 0.319962078568466,
"grad_norm": 0.19576773047447205,
"learning_rate": 4.002152376378343e-05,
"loss": 1.6815,
"step": 18000
},
{
"epoch": 0.3217396456716241,
"grad_norm": 0.17470435798168182,
"learning_rate": 3.9905559893201285e-05,
"loss": 1.6879,
"step": 18100
},
{
"epoch": 0.32351721277478224,
"grad_norm": 0.2007114738225937,
"learning_rate": 3.9789096274370205e-05,
"loss": 1.6728,
"step": 18200
},
{
"epoch": 0.32529477987794037,
"grad_norm": 0.13873660564422607,
"learning_rate": 3.967213681204051e-05,
"loss": 1.6911,
"step": 18300
},
{
"epoch": 0.32707234698109855,
"grad_norm": 0.15716473758220673,
"learning_rate": 3.955468542758697e-05,
"loss": 1.6881,
"step": 18400
},
{
"epoch": 0.3288499140842567,
"grad_norm": 0.15948426723480225,
"learning_rate": 3.9436746058877335e-05,
"loss": 1.7005,
"step": 18500
},
{
"epoch": 0.3306274811874148,
"grad_norm": 0.15321232378482819,
"learning_rate": 3.9318322660140324e-05,
"loss": 1.6858,
"step": 18600
},
{
"epoch": 0.332405048290573,
"grad_norm": 0.16375650465488434,
"learning_rate": 3.919941920183305e-05,
"loss": 1.6702,
"step": 18700
},
{
"epoch": 0.3341826153937311,
"grad_norm": 0.14579662680625916,
"learning_rate": 3.908003967050787e-05,
"loss": 1.6779,
"step": 18800
},
{
"epoch": 0.33596018249688925,
"grad_norm": 0.19252930581569672,
"learning_rate": 3.896018806867876e-05,
"loss": 1.6847,
"step": 18900
},
{
"epoch": 0.3377377496000474,
"grad_norm": 0.1748981475830078,
"learning_rate": 3.88398684146871e-05,
"loss": 1.6524,
"step": 19000
},
{
"epoch": 0.33951531670320556,
"grad_norm": 0.14768213033676147,
"learning_rate": 3.871908474256696e-05,
"loss": 1.6621,
"step": 19100
},
{
"epoch": 0.3412928838063637,
"grad_norm": 0.18400093913078308,
"learning_rate": 3.859784110190985e-05,
"loss": 1.6792,
"step": 19200
},
{
"epoch": 0.3430704509095218,
"grad_norm": 0.1892794668674469,
"learning_rate": 3.8476141557728906e-05,
"loss": 1.6883,
"step": 19300
},
{
"epoch": 0.34484801801268,
"grad_norm": 0.13941031694412231,
"learning_rate": 3.835399019032268e-05,
"loss": 1.6685,
"step": 19400
},
{
"epoch": 0.34662558511583813,
"grad_norm": 0.13327963650226593,
"learning_rate": 3.8231391095138236e-05,
"loss": 1.6791,
"step": 19500
},
{
"epoch": 0.34840315221899626,
"grad_norm": 0.14174780249595642,
"learning_rate": 3.810834838263396e-05,
"loss": 1.6789,
"step": 19600
},
{
"epoch": 0.3501807193221544,
"grad_norm": 0.2639550268650055,
"learning_rate": 3.798486617814162e-05,
"loss": 1.6694,
"step": 19700
},
{
"epoch": 0.35195828642531257,
"grad_norm": 0.14735499024391174,
"learning_rate": 3.786094862172816e-05,
"loss": 1.6751,
"step": 19800
},
{
"epoch": 0.3537358535284707,
"grad_norm": 0.1680241823196411,
"learning_rate": 3.7736599868056804e-05,
"loss": 1.6791,
"step": 19900
},
{
"epoch": 0.35551342063162883,
"grad_norm": 0.15196190774440765,
"learning_rate": 3.761182408624783e-05,
"loss": 1.6741,
"step": 20000
},
{
"epoch": 0.357290987734787,
"grad_norm": 0.14523537456989288,
"learning_rate": 3.748662545973876e-05,
"loss": 1.6732,
"step": 20100
},
{
"epoch": 0.35906855483794514,
"grad_norm": 0.1658225953578949,
"learning_rate": 3.7361008186144095e-05,
"loss": 1.6842,
"step": 20200
},
{
"epoch": 0.36084612194110327,
"grad_norm": 0.2060202807188034,
"learning_rate": 3.723497647711458e-05,
"loss": 1.6757,
"step": 20300
},
{
"epoch": 0.3626236890442614,
"grad_norm": 0.15790830552577972,
"learning_rate": 3.7108534558196005e-05,
"loss": 1.6613,
"step": 20400
},
{
"epoch": 0.3644012561474196,
"grad_norm": 0.15922047197818756,
"learning_rate": 3.6981686668687545e-05,
"loss": 1.6623,
"step": 20500
},
{
"epoch": 0.3661788232505777,
"grad_norm": 0.17766642570495605,
"learning_rate": 3.685443706149958e-05,
"loss": 1.6847,
"step": 20600
},
{
"epoch": 0.36795639035373584,
"grad_norm": 0.1501617580652237,
"learning_rate": 3.672679000301118e-05,
"loss": 1.6717,
"step": 20700
},
{
"epoch": 0.369733957456894,
"grad_norm": 0.1573089063167572,
"learning_rate": 3.659874977292696e-05,
"loss": 1.6723,
"step": 20800
},
{
"epoch": 0.37151152456005215,
"grad_norm": 0.15815529227256775,
"learning_rate": 3.647032066413372e-05,
"loss": 1.6782,
"step": 20900
},
{
"epoch": 0.3732890916632103,
"grad_norm": 0.16356757283210754,
"learning_rate": 3.634150698255639e-05,
"loss": 1.6694,
"step": 21000
},
{
"epoch": 0.3750666587663684,
"grad_norm": 0.14859165251255035,
"learning_rate": 3.6213606854414085e-05,
"loss": 1.6686,
"step": 21100
},
{
"epoch": 0.3768442258695266,
"grad_norm": 0.15533782541751862,
"learning_rate": 3.608404073421511e-05,
"loss": 1.6675,
"step": 21200
},
{
"epoch": 0.3786217929726847,
"grad_norm": 0.1758899837732315,
"learning_rate": 3.595410299228654e-05,
"loss": 1.6786,
"step": 21300
},
{
"epoch": 0.38039936007584285,
"grad_norm": 0.15762227773666382,
"learning_rate": 3.582379798513425e-05,
"loss": 1.6662,
"step": 21400
},
{
"epoch": 0.382176927179001,
"grad_norm": 0.1720816045999527,
"learning_rate": 3.569313008157762e-05,
"loss": 1.6942,
"step": 21500
},
{
"epoch": 0.38395449428215916,
"grad_norm": 0.17334651947021484,
"learning_rate": 3.556210366260312e-05,
"loss": 1.6791,
"step": 21600
},
{
"epoch": 0.3857320613853173,
"grad_norm": 0.15751953423023224,
"learning_rate": 3.5430723121217376e-05,
"loss": 1.6784,
"step": 21700
},
{
"epoch": 0.3875096284884754,
"grad_norm": 0.16097095608711243,
"learning_rate": 3.529899286229991e-05,
"loss": 1.6689,
"step": 21800
},
{
"epoch": 0.3892871955916336,
"grad_norm": 0.15864881873130798,
"learning_rate": 3.5166917302455425e-05,
"loss": 1.6738,
"step": 21900
},
{
"epoch": 0.39106476269479173,
"grad_norm": 0.1525215208530426,
"learning_rate": 3.5034500869865796e-05,
"loss": 1.6887,
"step": 22000
},
{
"epoch": 0.39284232979794986,
"grad_norm": 0.14002804458141327,
"learning_rate": 3.490174800414151e-05,
"loss": 1.6745,
"step": 22100
},
{
"epoch": 0.394619896901108,
"grad_norm": 0.17650793492794037,
"learning_rate": 3.47686631561729e-05,
"loss": 1.6713,
"step": 22200
},
{
"epoch": 0.3963974640042662,
"grad_norm": 0.16852478682994843,
"learning_rate": 3.463525078798085e-05,
"loss": 1.6872,
"step": 22300
},
{
"epoch": 0.3981750311074243,
"grad_norm": 0.16134943068027496,
"learning_rate": 3.450151537256725e-05,
"loss": 1.677,
"step": 22400
},
{
"epoch": 0.39995259821058243,
"grad_norm": 0.15445928275585175,
"learning_rate": 3.4367461393764976e-05,
"loss": 1.673,
"step": 22500
},
{
"epoch": 0.4017301653137406,
"grad_norm": 0.15707698464393616,
"learning_rate": 3.42330933460876e-05,
"loss": 1.6687,
"step": 22600
},
{
"epoch": 0.40350773241689875,
"grad_norm": 0.13525037467479706,
"learning_rate": 3.4098415734578684e-05,
"loss": 1.6729,
"step": 22700
},
{
"epoch": 0.4052852995200569,
"grad_norm": 0.15618863701820374,
"learning_rate": 3.3963433074660714e-05,
"loss": 1.684,
"step": 22800
},
{
"epoch": 0.407062866623215,
"grad_norm": 0.18125438690185547,
"learning_rate": 3.382814989198375e-05,
"loss": 1.6793,
"step": 22900
},
{
"epoch": 0.4088404337263732,
"grad_norm": 0.1549660563468933,
"learning_rate": 3.3692570722273676e-05,
"loss": 1.6848,
"step": 23000
},
{
"epoch": 0.4106180008295313,
"grad_norm": 0.18558810651302338,
"learning_rate": 3.35567001111801e-05,
"loss": 1.6687,
"step": 23100
},
{
"epoch": 0.41239556793268944,
"grad_norm": 0.18007346987724304,
"learning_rate": 3.3420542614123984e-05,
"loss": 1.6714,
"step": 23200
},
{
"epoch": 0.41417313503584763,
"grad_norm": 0.15658414363861084,
"learning_rate": 3.328683432967708e-05,
"loss": 1.6799,
"step": 23300
},
{
"epoch": 0.41595070213900576,
"grad_norm": 0.18134590983390808,
"learning_rate": 3.3150122275317875e-05,
"loss": 1.6743,
"step": 23400
},
{
"epoch": 0.4177282692421639,
"grad_norm": 0.15867780148983002,
"learning_rate": 3.3013136966591515e-05,
"loss": 1.6683,
"step": 23500
},
{
"epoch": 0.419505836345322,
"grad_norm": 0.17692945897579193,
"learning_rate": 3.287588299629216e-05,
"loss": 1.6685,
"step": 23600
},
{
"epoch": 0.4212834034484802,
"grad_norm": 0.13905645906925201,
"learning_rate": 3.273836496622152e-05,
"loss": 1.6715,
"step": 23700
},
{
"epoch": 0.4230609705516383,
"grad_norm": 0.1454002857208252,
"learning_rate": 3.260058748703464e-05,
"loss": 1.6773,
"step": 23800
},
{
"epoch": 0.42483853765479646,
"grad_norm": 0.13487789034843445,
"learning_rate": 3.2462555178085255e-05,
"loss": 1.655,
"step": 23900
},
{
"epoch": 0.42661610475795464,
"grad_norm": 0.1867651492357254,
"learning_rate": 3.2324272667270975e-05,
"loss": 1.6725,
"step": 24000
},
{
"epoch": 0.42839367186111277,
"grad_norm": 0.14305393397808075,
"learning_rate": 3.218574459087805e-05,
"loss": 1.6717,
"step": 24100
},
{
"epoch": 0.4301712389642709,
"grad_norm": 0.14234061539173126,
"learning_rate": 3.2046975593425975e-05,
"loss": 1.6917,
"step": 24200
},
{
"epoch": 0.431948806067429,
"grad_norm": 0.15563951432704926,
"learning_rate": 3.1907970327511786e-05,
"loss": 1.6725,
"step": 24300
},
{
"epoch": 0.4337263731705872,
"grad_norm": 0.14877410233020782,
"learning_rate": 3.176873345365402e-05,
"loss": 1.6802,
"step": 24400
},
{
"epoch": 0.43550394027374534,
"grad_norm": 0.16491292417049408,
"learning_rate": 3.162926964013648e-05,
"loss": 1.6671,
"step": 24500
},
{
"epoch": 0.43728150737690347,
"grad_norm": 0.1698901653289795,
"learning_rate": 3.1489583562851724e-05,
"loss": 1.6782,
"step": 24600
},
{
"epoch": 0.4390590744800616,
"grad_norm": 0.18841049075126648,
"learning_rate": 3.1349679905144285e-05,
"loss": 1.6671,
"step": 24700
},
{
"epoch": 0.4408366415832198,
"grad_norm": 0.14066390693187714,
"learning_rate": 3.120956335765367e-05,
"loss": 1.6597,
"step": 24800
},
{
"epoch": 0.4426142086863779,
"grad_norm": 0.14379048347473145,
"learning_rate": 3.1069238618157064e-05,
"loss": 1.6696,
"step": 24900
},
{
"epoch": 0.44439177578953604,
"grad_norm": 0.17776834964752197,
"learning_rate": 3.092871039141184e-05,
"loss": 1.6769,
"step": 25000
},
{
"epoch": 0.4461693428926942,
"grad_norm": 0.1451658457517624,
"learning_rate": 3.078798338899784e-05,
"loss": 1.6727,
"step": 25100
},
{
"epoch": 0.44794690999585235,
"grad_norm": 0.14523907005786896,
"learning_rate": 3.064706232915933e-05,
"loss": 1.6858,
"step": 25200
},
{
"epoch": 0.4497244770990105,
"grad_norm": 0.17594589293003082,
"learning_rate": 3.050595193664693e-05,
"loss": 1.6599,
"step": 25300
},
{
"epoch": 0.4515020442021686,
"grad_norm": 0.14906199276447296,
"learning_rate": 3.0364656942559087e-05,
"loss": 1.666,
"step": 25400
},
{
"epoch": 0.4532796113053268,
"grad_norm": 0.20227928459644318,
"learning_rate": 3.0223182084183545e-05,
"loss": 1.6799,
"step": 25500
},
{
"epoch": 0.4550571784084849,
"grad_norm": 0.15447662770748138,
"learning_rate": 3.0081532104838424e-05,
"loss": 1.6709,
"step": 25600
},
{
"epoch": 0.45683474551164305,
"grad_norm": 0.1919887661933899,
"learning_rate": 2.9939711753713285e-05,
"loss": 1.6863,
"step": 25700
},
{
"epoch": 0.45861231261480123,
"grad_norm": 0.23030731081962585,
"learning_rate": 2.9797725785709828e-05,
"loss": 1.68,
"step": 25800
},
{
"epoch": 0.46038987971795936,
"grad_norm": 0.14024241268634796,
"learning_rate": 2.9655578961282497e-05,
"loss": 1.6705,
"step": 25900
},
{
"epoch": 0.4621674468211175,
"grad_norm": 0.14363612234592438,
"learning_rate": 2.951327604627888e-05,
"loss": 1.6695,
"step": 26000
},
{
"epoch": 0.4639450139242756,
"grad_norm": 0.15318314731121063,
"learning_rate": 2.9370821811779908e-05,
"loss": 1.6665,
"step": 26100
},
{
"epoch": 0.4657225810274338,
"grad_norm": 0.16767314076423645,
"learning_rate": 2.9228221033939895e-05,
"loss": 1.6627,
"step": 26200
},
{
"epoch": 0.46750014813059193,
"grad_norm": 0.18546494841575623,
"learning_rate": 2.9085478493826413e-05,
"loss": 1.6892,
"step": 26300
},
{
"epoch": 0.46927771523375006,
"grad_norm": 0.14965227246284485,
"learning_rate": 2.8942598977259995e-05,
"loss": 1.6681,
"step": 26400
},
{
"epoch": 0.47105528233690824,
"grad_norm": 0.1423717737197876,
"learning_rate": 2.879958727465365e-05,
"loss": 1.662,
"step": 26500
},
{
"epoch": 0.47283284944006637,
"grad_norm": 0.154624342918396,
"learning_rate": 2.8656448180852285e-05,
"loss": 1.683,
"step": 26600
},
{
"epoch": 0.4746104165432245,
"grad_norm": 0.14358487725257874,
"learning_rate": 2.8513186494971944e-05,
"loss": 1.6704,
"step": 26700
},
{
"epoch": 0.47638798364638263,
"grad_norm": 0.14034679532051086,
"learning_rate": 2.836980702023888e-05,
"loss": 1.672,
"step": 26800
},
{
"epoch": 0.4781655507495408,
"grad_norm": 0.14413665235042572,
"learning_rate": 2.822631456382853e-05,
"loss": 1.6645,
"step": 26900
},
{
"epoch": 0.47994311785269894,
"grad_norm": 0.14380885660648346,
"learning_rate": 2.8082713936704348e-05,
"loss": 1.6671,
"step": 27000
},
{
"epoch": 0.48172068495585707,
"grad_norm": 0.15115170180797577,
"learning_rate": 2.7939009953456487e-05,
"loss": 1.6714,
"step": 27100
},
{
"epoch": 0.48349825205901525,
"grad_norm": 0.16167448461055756,
"learning_rate": 2.779520743214039e-05,
"loss": 1.6691,
"step": 27200
},
{
"epoch": 0.4852758191621734,
"grad_norm": 0.14348022639751434,
"learning_rate": 2.765131119411526e-05,
"loss": 1.6723,
"step": 27300
},
{
"epoch": 0.4870533862653315,
"grad_norm": 0.14456488192081451,
"learning_rate": 2.7507326063882376e-05,
"loss": 1.6724,
"step": 27400
},
{
"epoch": 0.48883095336848964,
"grad_norm": 0.15705521404743195,
"learning_rate": 2.7363256868923388e-05,
"loss": 1.6699,
"step": 27500
},
{
"epoch": 0.4906085204716478,
"grad_norm": 0.1625920534133911,
"learning_rate": 2.721910843953842e-05,
"loss": 1.6644,
"step": 27600
},
{
"epoch": 0.49238608757480595,
"grad_norm": 0.13969144225120544,
"learning_rate": 2.7074885608684154e-05,
"loss": 1.679,
"step": 27700
},
{
"epoch": 0.4941636546779641,
"grad_norm": 0.1736816018819809,
"learning_rate": 2.6930593211811763e-05,
"loss": 1.672,
"step": 27800
},
{
"epoch": 0.4959412217811222,
"grad_norm": 0.19090887904167175,
"learning_rate": 2.678767996247037e-05,
"loss": 1.6787,
"step": 27900
},
{
"epoch": 0.4977187888842804,
"grad_norm": 0.17863860726356506,
"learning_rate": 2.6643263524000922e-05,
"loss": 1.6841,
"step": 28000
},
{
"epoch": 0.4994963559874385,
"grad_norm": 0.1446855366230011,
"learning_rate": 2.64987919907833e-05,
"loss": 1.6655,
"step": 28100
},
{
"epoch": 0.5012739230905967,
"grad_norm": 0.1714792400598526,
"learning_rate": 2.6354270206607095e-05,
"loss": 1.6716,
"step": 28200
},
{
"epoch": 0.5030514901937548,
"grad_norm": 0.1719600409269333,
"learning_rate": 2.6209703016946675e-05,
"loss": 1.6551,
"step": 28300
},
{
"epoch": 0.504829057296913,
"grad_norm": 0.18065394461154938,
"learning_rate": 2.6065095268798772e-05,
"loss": 1.6647,
"step": 28400
},
{
"epoch": 0.5066066244000711,
"grad_norm": 0.14981447160243988,
"learning_rate": 2.5920451810519935e-05,
"loss": 1.6666,
"step": 28500
},
{
"epoch": 0.5083841915032292,
"grad_norm": 0.14168864488601685,
"learning_rate": 2.5775777491663976e-05,
"loss": 1.6619,
"step": 28600
},
{
"epoch": 0.5101617586063873,
"grad_norm": 0.1458740234375,
"learning_rate": 2.563107716281941e-05,
"loss": 1.6616,
"step": 28700
},
{
"epoch": 0.5119393257095456,
"grad_norm": 0.15625467896461487,
"learning_rate": 2.5486355675446804e-05,
"loss": 1.6606,
"step": 28800
},
{
"epoch": 0.5137168928127037,
"grad_norm": 0.1387881189584732,
"learning_rate": 2.5341617881716105e-05,
"loss": 1.6714,
"step": 28900
},
{
"epoch": 0.5154944599158618,
"grad_norm": 0.16523011028766632,
"learning_rate": 2.5196868634343986e-05,
"loss": 1.6772,
"step": 29000
},
{
"epoch": 0.51727202701902,
"grad_norm": 0.16577035188674927,
"learning_rate": 2.505211278643112e-05,
"loss": 1.6662,
"step": 29100
},
{
"epoch": 0.5190495941221781,
"grad_norm": 0.20573437213897705,
"learning_rate": 2.490735519129951e-05,
"loss": 1.6763,
"step": 29200
},
{
"epoch": 0.5208271612253362,
"grad_norm": 0.1646687239408493,
"learning_rate": 2.4762600702329707e-05,
"loss": 1.6713,
"step": 29300
},
{
"epoch": 0.5226047283284944,
"grad_norm": 0.15127506852149963,
"learning_rate": 2.461785417279814e-05,
"loss": 1.6754,
"step": 29400
},
{
"epoch": 0.5243822954316525,
"grad_norm": 0.13983801007270813,
"learning_rate": 2.4473120455714367e-05,
"loss": 1.6683,
"step": 29500
},
{
"epoch": 0.5261598625348107,
"grad_norm": 0.14342284202575684,
"learning_rate": 2.4328404403658382e-05,
"loss": 1.6817,
"step": 29600
},
{
"epoch": 0.5279374296379689,
"grad_norm": 0.16623562574386597,
"learning_rate": 2.41837108686179e-05,
"loss": 1.6819,
"step": 29700
},
{
"epoch": 0.529714996741127,
"grad_norm": 0.14822550117969513,
"learning_rate": 2.4039044701825705e-05,
"loss": 1.6691,
"step": 29800
},
{
"epoch": 0.5314925638442851,
"grad_norm": 0.28164225816726685,
"learning_rate": 2.3894410753596987e-05,
"loss": 1.6736,
"step": 29900
},
{
"epoch": 0.5332701309474432,
"grad_norm": 0.20413027703762054,
"learning_rate": 2.3751259642565925e-05,
"loss": 1.6758,
"step": 30000
},
{
"epoch": 0.5350476980506014,
"grad_norm": 0.15408293902873993,
"learning_rate": 2.36067042347753e-05,
"loss": 1.6683,
"step": 30100
},
{
"epoch": 0.5368252651537595,
"grad_norm": 0.14037184417247772,
"learning_rate": 2.346219554090377e-05,
"loss": 1.6692,
"step": 30200
},
{
"epoch": 0.5386028322569177,
"grad_norm": 0.15781673789024353,
"learning_rate": 2.3317738405986828e-05,
"loss": 1.6627,
"step": 30300
},
{
"epoch": 0.5403803993600759,
"grad_norm": 0.1486879140138626,
"learning_rate": 2.3173337673331313e-05,
"loss": 1.6728,
"step": 30400
},
{
"epoch": 0.542157966463234,
"grad_norm": 0.20719771087169647,
"learning_rate": 2.302899818435304e-05,
"loss": 1.665,
"step": 30500
},
{
"epoch": 0.5439355335663921,
"grad_norm": 0.16389068961143494,
"learning_rate": 2.288472477841445e-05,
"loss": 1.671,
"step": 30600
},
{
"epoch": 0.5457131006695503,
"grad_norm": 0.17496538162231445,
"learning_rate": 2.274052229266239e-05,
"loss": 1.6686,
"step": 30700
},
{
"epoch": 0.5474906677727084,
"grad_norm": 0.32813844084739685,
"learning_rate": 2.259639556186592e-05,
"loss": 1.6752,
"step": 30800
},
{
"epoch": 0.5492682348758665,
"grad_norm": 0.5157455205917358,
"learning_rate": 2.2452349418254213e-05,
"loss": 1.6838,
"step": 30900
},
{
"epoch": 0.5510458019790248,
"grad_norm": 0.14271293580532074,
"learning_rate": 2.2308388691354538e-05,
"loss": 1.6753,
"step": 31000
},
{
"epoch": 0.5528233690821829,
"grad_norm": 0.28928157687187195,
"learning_rate": 2.216451820783035e-05,
"loss": 1.6693,
"step": 31100
},
{
"epoch": 0.554600936185341,
"grad_norm": 0.144424170255661,
"learning_rate": 2.2020742791319452e-05,
"loss": 1.6622,
"step": 31200
},
{
"epoch": 0.5563785032884991,
"grad_norm": 0.16234175860881805,
"learning_rate": 2.1877067262272284e-05,
"loss": 1.6818,
"step": 31300
},
{
"epoch": 0.5581560703916573,
"grad_norm": 0.19830650091171265,
"learning_rate": 2.173349643779028e-05,
"loss": 1.6768,
"step": 31400
},
{
"epoch": 0.5599336374948154,
"grad_norm": Infinity,
"learning_rate": 2.159146918661628e-05,
"loss": 1.6824,
"step": 31500
},
{
"epoch": 0.5617112045979735,
"grad_norm": 0.16732582449913025,
"learning_rate": 2.144812104128816e-05,
"loss": 1.6745,
"step": 31600
},
{
"epoch": 0.5634887717011318,
"grad_norm": 0.1458720713853836,
"learning_rate": 2.130489198207977e-05,
"loss": 1.6559,
"step": 31700
},
{
"epoch": 0.5652663388042899,
"grad_norm": 0.1879144012928009,
"learning_rate": 2.1161786811123463e-05,
"loss": 1.6582,
"step": 31800
},
{
"epoch": 0.567043905907448,
"grad_norm": 0.15326225757598877,
"learning_rate": 2.1018810326397926e-05,
"loss": 1.6632,
"step": 31900
},
{
"epoch": 0.5688214730106061,
"grad_norm": 0.14871954917907715,
"learning_rate": 2.087596732156729e-05,
"loss": 1.6683,
"step": 32000
},
{
"epoch": 0.5705990401137643,
"grad_norm": 0.138087660074234,
"learning_rate": 2.073326258582043e-05,
"loss": 1.6687,
"step": 32100
},
{
"epoch": 0.5723766072169224,
"grad_norm": 0.1453862488269806,
"learning_rate": 2.0592125796717588e-05,
"loss": 1.6658,
"step": 32200
},
{
"epoch": 0.5741541743200805,
"grad_norm": 0.14493419229984283,
"learning_rate": 2.044971044602353e-05,
"loss": 1.6667,
"step": 32300
},
{
"epoch": 0.5759317414232388,
"grad_norm": 0.22620221972465515,
"learning_rate": 2.0307447655800402e-05,
"loss": 1.6818,
"step": 32400
},
{
"epoch": 0.5777093085263969,
"grad_norm": 0.14831425249576569,
"learning_rate": 2.016534219578384e-05,
"loss": 1.6518,
"step": 32500
},
{
"epoch": 0.579486875629555,
"grad_norm": 0.15642555058002472,
"learning_rate": 2.0023398830434578e-05,
"loss": 1.6578,
"step": 32600
},
{
"epoch": 0.5812644427327132,
"grad_norm": 0.18855425715446472,
"learning_rate": 1.9881622318778698e-05,
"loss": 1.6719,
"step": 32700
},
{
"epoch": 0.5830420098358713,
"grad_norm": 0.14109855890274048,
"learning_rate": 1.974001741424807e-05,
"loss": 1.6672,
"step": 32800
},
{
"epoch": 0.5848195769390294,
"grad_norm": 0.16638268530368805,
"learning_rate": 1.959858886452098e-05,
"loss": 1.6732,
"step": 32900
},
{
"epoch": 0.5865971440421875,
"grad_norm": 0.15555280447006226,
"learning_rate": 1.9457341411362953e-05,
"loss": 1.6738,
"step": 33000
},
{
"epoch": 0.5883747111453458,
"grad_norm": 0.19915728271007538,
"learning_rate": 1.9316279790467785e-05,
"loss": 1.6828,
"step": 33100
},
{
"epoch": 0.5901522782485039,
"grad_norm": 0.14633417129516602,
"learning_rate": 1.9175408731298737e-05,
"loss": 1.6582,
"step": 33200
},
{
"epoch": 0.591929845351662,
"grad_norm": 0.15195755660533905,
"learning_rate": 1.9034732956930004e-05,
"loss": 1.6722,
"step": 33300
},
{
"epoch": 0.5937074124548202,
"grad_norm": 0.1364789605140686,
"learning_rate": 1.8894257183888324e-05,
"loss": 1.6797,
"step": 33400
},
{
"epoch": 0.5954849795579783,
"grad_norm": 0.1638212352991104,
"learning_rate": 1.8753986121994874e-05,
"loss": 1.6607,
"step": 33500
},
{
"epoch": 0.5972625466611364,
"grad_norm": 0.16077277064323425,
"learning_rate": 1.8613924474207344e-05,
"loss": 1.6731,
"step": 33600
},
{
"epoch": 0.5990401137642946,
"grad_norm": 0.16311664879322052,
"learning_rate": 1.8474076936462277e-05,
"loss": 1.6604,
"step": 33700
},
{
"epoch": 0.6008176808674528,
"grad_norm": 0.16206273436546326,
"learning_rate": 1.833444819751758e-05,
"loss": 1.6769,
"step": 33800
},
{
"epoch": 0.6025952479706109,
"grad_norm": 0.1508202999830246,
"learning_rate": 1.8195042938795387e-05,
"loss": 1.6739,
"step": 33900
},
{
"epoch": 0.604372815073769,
"grad_norm": 0.16781938076019287,
"learning_rate": 1.8055865834225045e-05,
"loss": 1.6665,
"step": 34000
},
{
"epoch": 0.6061503821769272,
"grad_norm": 0.1427120417356491,
"learning_rate": 1.7916921550086444e-05,
"loss": 1.68,
"step": 34100
},
{
"epoch": 0.6079279492800853,
"grad_norm": 0.15128500759601593,
"learning_rate": 1.7778214744853537e-05,
"loss": 1.6666,
"step": 34200
},
{
"epoch": 0.6097055163832434,
"grad_norm": 0.19677314162254333,
"learning_rate": 1.76397500690382e-05,
"loss": 1.673,
"step": 34300
},
{
"epoch": 0.6114830834864016,
"grad_norm": 0.1646704375743866,
"learning_rate": 1.7501532165034255e-05,
"loss": 1.6748,
"step": 34400
},
{
"epoch": 0.6132606505895598,
"grad_norm": 0.15692903101444244,
"learning_rate": 1.736356566696186e-05,
"loss": 1.6713,
"step": 34500
},
{
"epoch": 0.6150382176927179,
"grad_norm": 0.15300609171390533,
"learning_rate": 1.7225855200512113e-05,
"loss": 1.6673,
"step": 34600
},
{
"epoch": 0.6168157847958761,
"grad_norm": 0.18005123734474182,
"learning_rate": 1.7088405382791988e-05,
"loss": 1.6638,
"step": 34700
},
{
"epoch": 0.6185933518990342,
"grad_norm": 0.13681212067604065,
"learning_rate": 1.6951220822169514e-05,
"loss": 1.6638,
"step": 34800
},
{
"epoch": 0.6203709190021923,
"grad_norm": 0.15197409689426422,
"learning_rate": 1.681430611811928e-05,
"loss": 1.6831,
"step": 34900
},
{
"epoch": 0.6221484861053505,
"grad_norm": 0.14276647567749023,
"learning_rate": 1.667766586106822e-05,
"loss": 1.6715,
"step": 35000
},
{
"epoch": 0.6239260532085086,
"grad_norm": 0.1536717563867569,
"learning_rate": 1.654130463224171e-05,
"loss": 1.6608,
"step": 35100
},
{
"epoch": 0.6257036203116668,
"grad_norm": 0.14087150990962982,
"learning_rate": 1.6405227003509966e-05,
"loss": 1.671,
"step": 35200
},
{
"epoch": 0.627481187414825,
"grad_norm": 0.1598573476076126,
"learning_rate": 1.6269437537234758e-05,
"loss": 1.6824,
"step": 35300
},
{
"epoch": 0.6292587545179831,
"grad_norm": 0.14146994054317474,
"learning_rate": 1.613394078611646e-05,
"loss": 1.6736,
"step": 35400
},
{
"epoch": 0.6310363216211412,
"grad_norm": 0.1572994738817215,
"learning_rate": 1.599874129304138e-05,
"loss": 1.6656,
"step": 35500
},
{
"epoch": 0.6328138887242993,
"grad_norm": 0.1504960060119629,
"learning_rate": 1.5863843590929483e-05,
"loss": 1.6639,
"step": 35600
},
{
"epoch": 0.6345914558274575,
"grad_norm": 0.17305798828601837,
"learning_rate": 1.572925220258239e-05,
"loss": 1.6563,
"step": 35700
},
{
"epoch": 0.6363690229306156,
"grad_norm": 0.19001583755016327,
"learning_rate": 1.5594971640531735e-05,
"loss": 1.6694,
"step": 35800
},
{
"epoch": 0.6381465900337737,
"grad_norm": 0.16306428611278534,
"learning_rate": 1.5461006406887892e-05,
"loss": 1.6597,
"step": 35900
},
{
"epoch": 0.639924157136932,
"grad_norm": 0.1538590043783188,
"learning_rate": 1.532736099318901e-05,
"loss": 1.6573,
"step": 36000
},
{
"epoch": 0.6417017242400901,
"grad_norm": 0.14714497327804565,
"learning_rate": 1.5194039880250432e-05,
"loss": 1.6647,
"step": 36100
},
{
"epoch": 0.6434792913432482,
"grad_norm": 0.1425635814666748,
"learning_rate": 1.5061047538014466e-05,
"loss": 1.67,
"step": 36200
},
{
"epoch": 0.6452568584464063,
"grad_norm": 0.14618100225925446,
"learning_rate": 1.4928388425400514e-05,
"loss": 1.6456,
"step": 36300
},
{
"epoch": 0.6470344255495645,
"grad_norm": 0.13993218541145325,
"learning_rate": 1.479606699015556e-05,
"loss": 1.6603,
"step": 36400
},
{
"epoch": 0.6488119926527226,
"grad_norm": 0.1687382161617279,
"learning_rate": 1.4664087668705082e-05,
"loss": 1.6684,
"step": 36500
},
{
"epoch": 0.6505895597558807,
"grad_norm": 0.1678340882062912,
"learning_rate": 1.453245488600427e-05,
"loss": 1.6762,
"step": 36600
},
{
"epoch": 0.652367126859039,
"grad_norm": 0.15924805402755737,
"learning_rate": 1.4401173055389722e-05,
"loss": 1.6775,
"step": 36700
},
{
"epoch": 0.6541446939621971,
"grad_norm": 0.149272158741951,
"learning_rate": 1.42702465784314e-05,
"loss": 1.6607,
"step": 36800
},
{
"epoch": 0.6559222610653552,
"grad_norm": 0.16075880825519562,
"learning_rate": 1.4139679844785125e-05,
"loss": 1.6526,
"step": 36900
},
{
"epoch": 0.6576998281685134,
"grad_norm": 0.15068253874778748,
"learning_rate": 1.4009477232045356e-05,
"loss": 1.6502,
"step": 37000
},
{
"epoch": 0.6594773952716715,
"grad_norm": 0.15616253018379211,
"learning_rate": 1.387964310559845e-05,
"loss": 1.6681,
"step": 37100
},
{
"epoch": 0.6612549623748296,
"grad_norm": 0.16503843665122986,
"learning_rate": 1.3750181818476282e-05,
"loss": 1.6583,
"step": 37200
},
{
"epoch": 0.6630325294779877,
"grad_norm": 0.1866072565317154,
"learning_rate": 1.36210977112103e-05,
"loss": 1.662,
"step": 37300
},
{
"epoch": 0.664810096581146,
"grad_norm": 0.14251679182052612,
"learning_rate": 1.3492395111686013e-05,
"loss": 1.6654,
"step": 37400
},
{
"epoch": 0.6665876636843041,
"grad_norm": 0.3741336464881897,
"learning_rate": 1.3364078334997871e-05,
"loss": 1.6506,
"step": 37500
},
{
"epoch": 0.6683652307874622,
"grad_norm": 0.15886838734149933,
"learning_rate": 1.3236151683304582e-05,
"loss": 1.6593,
"step": 37600
},
{
"epoch": 0.6701427978906204,
"grad_norm": 0.1486154943704605,
"learning_rate": 1.3108619445684905e-05,
"loss": 1.6752,
"step": 37700
},
{
"epoch": 0.6719203649937785,
"grad_norm": 0.14979256689548492,
"learning_rate": 1.2981485897993812e-05,
"loss": 1.6614,
"step": 37800
},
{
"epoch": 0.6736979320969366,
"grad_norm": 0.14133110642433167,
"learning_rate": 1.2854755302719146e-05,
"loss": 1.6715,
"step": 37900
},
{
"epoch": 0.6754754992000948,
"grad_norm": 0.16530480980873108,
"learning_rate": 1.2728431908838707e-05,
"loss": 1.6732,
"step": 38000
},
{
"epoch": 0.677253066303253,
"grad_norm": 0.13394702970981598,
"learning_rate": 1.2602519951677793e-05,
"loss": 1.6682,
"step": 38100
},
{
"epoch": 0.6790306334064111,
"grad_norm": 0.1656082421541214,
"learning_rate": 1.2477023652767197e-05,
"loss": 1.6655,
"step": 38200
},
{
"epoch": 0.6808082005095693,
"grad_norm": 0.18027155101299286,
"learning_rate": 1.2351947219701676e-05,
"loss": 1.6624,
"step": 38300
},
{
"epoch": 0.6825857676127274,
"grad_norm": 0.1868833750486374,
"learning_rate": 1.2227294845998873e-05,
"loss": 1.6671,
"step": 38400
},
{
"epoch": 0.6843633347158855,
"grad_norm": 0.14410994946956635,
"learning_rate": 1.2103070710958724e-05,
"loss": 1.6698,
"step": 38500
},
{
"epoch": 0.6861409018190436,
"grad_norm": 0.16073070466518402,
"learning_rate": 1.1980514742799768e-05,
"loss": 1.6697,
"step": 38600
},
{
"epoch": 0.6879184689222018,
"grad_norm": 0.13548509776592255,
"learning_rate": 1.1857155179376509e-05,
"loss": 1.6619,
"step": 38700
},
{
"epoch": 0.68969603602536,
"grad_norm": 0.20386448502540588,
"learning_rate": 1.1734236264525464e-05,
"loss": 1.6802,
"step": 38800
},
{
"epoch": 0.6914736031285181,
"grad_norm": 0.13660947978496552,
"learning_rate": 1.1611762119427785e-05,
"loss": 1.6718,
"step": 38900
},
{
"epoch": 0.6932511702316763,
"grad_norm": 0.16920311748981476,
"learning_rate": 1.1489736850352542e-05,
"loss": 1.6529,
"step": 39000
},
{
"epoch": 0.6950287373348344,
"grad_norm": 0.1494123786687851,
"learning_rate": 1.1368164548519047e-05,
"loss": 1.6876,
"step": 39100
},
{
"epoch": 0.6968063044379925,
"grad_norm": 0.16994184255599976,
"learning_rate": 1.1247049289959693e-05,
"loss": 1.6635,
"step": 39200
},
{
"epoch": 0.6985838715411506,
"grad_norm": 0.16828663647174835,
"learning_rate": 1.1126395135383297e-05,
"loss": 1.6505,
"step": 39300
},
{
"epoch": 0.7003614386443088,
"grad_norm": 0.1685681790113449,
"learning_rate": 1.1006206130038932e-05,
"loss": 1.6674,
"step": 39400
},
{
"epoch": 0.702139005747467,
"grad_norm": 0.14324048161506653,
"learning_rate": 1.0886486303580332e-05,
"loss": 1.6712,
"step": 39500
},
{
"epoch": 0.7039165728506251,
"grad_norm": 0.14247146248817444,
"learning_rate": 1.0767239669930756e-05,
"loss": 1.6638,
"step": 39600
},
{
"epoch": 0.7056941399537833,
"grad_norm": 0.18177978694438934,
"learning_rate": 1.0648470227148434e-05,
"loss": 1.6489,
"step": 39700
},
{
"epoch": 0.7074717070569414,
"grad_norm": 0.13986016809940338,
"learning_rate": 1.0530181957292506e-05,
"loss": 1.6603,
"step": 39800
},
{
"epoch": 0.7092492741600995,
"grad_norm": 0.22386716306209564,
"learning_rate": 1.0412378826289529e-05,
"loss": 1.6701,
"step": 39900
},
{
"epoch": 0.7110268412632577,
"grad_norm": 0.1482144594192505,
"learning_rate": 1.0295064783800485e-05,
"loss": 1.6711,
"step": 40000
},
{
"epoch": 0.7128044083664158,
"grad_norm": 0.1389176994562149,
"learning_rate": 1.0178243763088382e-05,
"loss": 1.6643,
"step": 40100
},
{
"epoch": 0.714581975469574,
"grad_norm": 0.16256819665431976,
"learning_rate": 1.0061919680886375e-05,
"loss": 1.6737,
"step": 40200
},
{
"epoch": 0.7163595425727322,
"grad_norm": 0.1738821119070053,
"learning_rate": 9.946096437266427e-06,
"loss": 1.6854,
"step": 40300
},
{
"epoch": 0.7181371096758903,
"grad_norm": 0.1526288390159607,
"learning_rate": 9.830777915508584e-06,
"loss": 1.6535,
"step": 40400
},
{
"epoch": 0.7199146767790484,
"grad_norm": 0.13686831295490265,
"learning_rate": 9.71596798197075e-06,
"loss": 1.6807,
"step": 40500
},
{
"epoch": 0.7216922438822065,
"grad_norm": 0.1469413936138153,
"learning_rate": 9.602810911756332e-06,
"loss": 1.675,
"step": 40600
},
{
"epoch": 0.7234698109853647,
"grad_norm": 0.14766348898410797,
"learning_rate": 9.489024503787308e-06,
"loss": 1.6554,
"step": 40700
},
{
"epoch": 0.7252473780885228,
"grad_norm": 0.15909671783447266,
"learning_rate": 9.375758142223712e-06,
"loss": 1.6665,
"step": 40800
},
{
"epoch": 0.727024945191681,
"grad_norm": 0.17711246013641357,
"learning_rate": 9.263015624619362e-06,
"loss": 1.66,
"step": 40900
},
{
"epoch": 0.7288025122948392,
"grad_norm": 0.15886163711547852,
"learning_rate": 9.150800730964821e-06,
"loss": 1.6639,
"step": 41000
},
{
"epoch": 0.7305800793979973,
"grad_norm": 0.14557142555713654,
"learning_rate": 9.039117223560666e-06,
"loss": 1.677,
"step": 41100
},
{
"epoch": 0.7323576465011554,
"grad_norm": 0.15475749969482422,
"learning_rate": 8.927968846891351e-06,
"loss": 1.6677,
"step": 41200
},
{
"epoch": 0.7341352136043136,
"grad_norm": 0.1995362639427185,
"learning_rate": 8.817359327499659e-06,
"loss": 1.6635,
"step": 41300
},
{
"epoch": 0.7359127807074717,
"grad_norm": 0.16061349213123322,
"learning_rate": 8.70729237386175e-06,
"loss": 1.6696,
"step": 41400
},
{
"epoch": 0.7376903478106298,
"grad_norm": 0.1676475554704666,
"learning_rate": 8.597771676262848e-06,
"loss": 1.6609,
"step": 41500
},
{
"epoch": 0.739467914913788,
"grad_norm": 0.28265243768692017,
"learning_rate": 8.488800906673493e-06,
"loss": 1.657,
"step": 41600
},
{
"epoch": 0.7412454820169462,
"grad_norm": 0.15924739837646484,
"learning_rate": 8.380383718626441e-06,
"loss": 1.661,
"step": 41700
},
{
"epoch": 0.7430230491201043,
"grad_norm": 0.15197895467281342,
"learning_rate": 8.27252374709416e-06,
"loss": 1.6663,
"step": 41800
},
{
"epoch": 0.7448006162232624,
"grad_norm": 0.15987786650657654,
"learning_rate": 8.165224608366981e-06,
"loss": 1.6657,
"step": 41900
},
{
"epoch": 0.7465781833264206,
"grad_norm": 0.14073099195957184,
"learning_rate": 8.058489899931795e-06,
"loss": 1.6596,
"step": 42000
},
{
"epoch": 0.7483557504295787,
"grad_norm": 0.148057222366333,
"learning_rate": 7.95232320035152e-06,
"loss": 1.6642,
"step": 42100
},
{
"epoch": 0.7501333175327368,
"grad_norm": 0.17104440927505493,
"learning_rate": 7.846728069145052e-06,
"loss": 1.6587,
"step": 42200
},
{
"epoch": 0.7519108846358951,
"grad_norm": 0.1572682410478592,
"learning_rate": 7.741708046667947e-06,
"loss": 1.6748,
"step": 42300
},
{
"epoch": 0.7536884517390532,
"grad_norm": 0.17384777963161469,
"learning_rate": 7.637266653993755e-06,
"loss": 1.6731,
"step": 42400
},
{
"epoch": 0.7554660188422113,
"grad_norm": 0.15057361125946045,
"learning_rate": 7.533407392795896e-06,
"loss": 1.6753,
"step": 42500
},
{
"epoch": 0.7572435859453694,
"grad_norm": 0.17582474648952484,
"learning_rate": 7.431163571532962e-06,
"loss": 1.6614,
"step": 42600
},
{
"epoch": 0.7590211530485276,
"grad_norm": 0.16274411976337433,
"learning_rate": 7.328473092285082e-06,
"loss": 1.6622,
"step": 42700
},
{
"epoch": 0.7607987201516857,
"grad_norm": 0.14647479355335236,
"learning_rate": 7.226375097632967e-06,
"loss": 1.6672,
"step": 42800
},
{
"epoch": 0.7625762872548438,
"grad_norm": 0.14855672419071198,
"learning_rate": 7.124873010681446e-06,
"loss": 1.6734,
"step": 42900
},
{
"epoch": 0.764353854358002,
"grad_norm": 0.14453125,
"learning_rate": 7.0239702345559766e-06,
"loss": 1.6629,
"step": 43000
},
{
"epoch": 0.7661314214611602,
"grad_norm": 0.15817640721797943,
"learning_rate": 6.923670152288514e-06,
"loss": 1.6726,
"step": 43100
},
{
"epoch": 0.7679089885643183,
"grad_norm": 0.14252114295959473,
"learning_rate": 6.823976126704137e-06,
"loss": 1.6561,
"step": 43200
},
{
"epoch": 0.7696865556674765,
"grad_norm": 0.16966678202152252,
"learning_rate": 6.724891500308264e-06,
"loss": 1.6703,
"step": 43300
},
{
"epoch": 0.7714641227706346,
"grad_norm": 0.1773873120546341,
"learning_rate": 6.626419595174596e-06,
"loss": 1.6519,
"step": 43400
},
{
"epoch": 0.7732416898737927,
"grad_norm": 0.15773586928844452,
"learning_rate": 6.528563712833738e-06,
"loss": 1.6652,
"step": 43500
},
{
"epoch": 0.7750192569769508,
"grad_norm": 0.13970991969108582,
"learning_rate": 6.431327134162498e-06,
"loss": 1.6402,
"step": 43600
},
{
"epoch": 0.776796824080109,
"grad_norm": 0.20692099630832672,
"learning_rate": 6.3347131192739105e-06,
"loss": 1.6667,
"step": 43700
},
{
"epoch": 0.7785743911832672,
"grad_norm": 0.14494654536247253,
"learning_rate": 6.238724907407897e-06,
"loss": 1.6598,
"step": 43800
},
{
"epoch": 0.7803519582864253,
"grad_norm": 0.15285950899124146,
"learning_rate": 6.143365716822691e-06,
"loss": 1.6675,
"step": 43900
},
{
"epoch": 0.7821295253895835,
"grad_norm": 0.1541031152009964,
"learning_rate": 6.048638744686922e-06,
"loss": 1.6609,
"step": 44000
},
{
"epoch": 0.7839070924927416,
"grad_norm": 0.16380751132965088,
"learning_rate": 5.954547166972424e-06,
"loss": 1.6751,
"step": 44100
},
{
"epoch": 0.7856846595958997,
"grad_norm": 0.1482234001159668,
"learning_rate": 5.8610941383477615e-06,
"loss": 1.6584,
"step": 44200
},
{
"epoch": 0.7874622266990579,
"grad_norm": 0.15207888185977936,
"learning_rate": 5.768282792072455e-06,
"loss": 1.6495,
"step": 44300
},
{
"epoch": 0.789239793802216,
"grad_norm": 0.14984245598316193,
"learning_rate": 5.6761162398919264e-06,
"loss": 1.6837,
"step": 44400
},
{
"epoch": 0.7910173609053742,
"grad_norm": 0.18983450531959534,
"learning_rate": 5.584597571933176e-06,
"loss": 1.6604,
"step": 44500
},
{
"epoch": 0.7927949280085324,
"grad_norm": 0.14986811578273773,
"learning_rate": 5.493729856601171e-06,
"loss": 1.6734,
"step": 44600
},
{
"epoch": 0.7945724951116905,
"grad_norm": 0.16871729493141174,
"learning_rate": 5.4035161404759755e-06,
"loss": 1.6535,
"step": 44700
},
{
"epoch": 0.7963500622148486,
"grad_norm": 0.1682252287864685,
"learning_rate": 5.313959448210609e-06,
"loss": 1.654,
"step": 44800
},
{
"epoch": 0.7981276293180067,
"grad_norm": 0.16398753225803375,
"learning_rate": 5.225062782429624e-06,
"loss": 1.6665,
"step": 44900
},
{
"epoch": 0.7999051964211649,
"grad_norm": 0.16041302680969238,
"learning_rate": 5.136829123628442e-06,
"loss": 1.6668,
"step": 45000
},
{
"epoch": 0.801682763524323,
"grad_norm": 0.14892232418060303,
"learning_rate": 5.049261430073432e-06,
"loss": 1.6698,
"step": 45100
},
{
"epoch": 0.8034603306274812,
"grad_norm": 0.15795736014842987,
"learning_rate": 4.962362637702711e-06,
"loss": 1.6449,
"step": 45200
},
{
"epoch": 0.8052378977306394,
"grad_norm": 0.14031356573104858,
"learning_rate": 4.8761356600277284e-06,
"loss": 1.6736,
"step": 45300
},
{
"epoch": 0.8070154648337975,
"grad_norm": 0.14954744279384613,
"learning_rate": 4.790583388035561e-06,
"loss": 1.6595,
"step": 45400
},
{
"epoch": 0.8087930319369556,
"grad_norm": 0.1448160856962204,
"learning_rate": 4.705708690092006e-06,
"loss": 1.6697,
"step": 45500
},
{
"epoch": 0.8105705990401137,
"grad_norm": 0.1565515398979187,
"learning_rate": 4.621514411845399e-06,
"loss": 1.6642,
"step": 45600
},
{
"epoch": 0.8123481661432719,
"grad_norm": 0.17093238234519958,
"learning_rate": 4.5380033761312e-06,
"loss": 1.6697,
"step": 45700
},
{
"epoch": 0.81412573324643,
"grad_norm": 0.14756350219249725,
"learning_rate": 4.4560032277625644e-06,
"loss": 1.6726,
"step": 45800
},
{
"epoch": 0.8159033003495882,
"grad_norm": 0.15235283970832825,
"learning_rate": 4.373860152031772e-06,
"loss": 1.6617,
"step": 45900
},
{
"epoch": 0.8176808674527464,
"grad_norm": 0.16607356071472168,
"learning_rate": 4.292408622096306e-06,
"loss": 1.6648,
"step": 46000
},
{
"epoch": 0.8194584345559045,
"grad_norm": 0.1486140936613083,
"learning_rate": 4.211651368833752e-06,
"loss": 1.6664,
"step": 46100
},
{
"epoch": 0.8212360016590626,
"grad_norm": 0.1572180539369583,
"learning_rate": 4.131591099844242e-06,
"loss": 1.667,
"step": 46200
},
{
"epoch": 0.8230135687622208,
"grad_norm": 0.1429453045129776,
"learning_rate": 4.052230499359672e-06,
"loss": 1.6601,
"step": 46300
},
{
"epoch": 0.8247911358653789,
"grad_norm": 0.2002544403076172,
"learning_rate": 3.973572228153693e-06,
"loss": 1.6767,
"step": 46400
},
{
"epoch": 0.826568702968537,
"grad_norm": 0.2108883261680603,
"learning_rate": 3.895618923452526e-06,
"loss": 1.6682,
"step": 46500
},
{
"epoch": 0.8283462700716953,
"grad_norm": 0.16697534918785095,
"learning_rate": 3.818373198846526e-06,
"loss": 1.6739,
"step": 46600
},
{
"epoch": 0.8301238371748534,
"grad_norm": 0.21570728719234467,
"learning_rate": 3.741837644202542e-06,
"loss": 1.6602,
"step": 46700
},
{
"epoch": 0.8319014042780115,
"grad_norm": 0.14422467350959778,
"learning_rate": 3.6660148255771187e-06,
"loss": 1.6702,
"step": 46800
},
{
"epoch": 0.8336789713811696,
"grad_norm": 0.19726932048797607,
"learning_rate": 3.590907285130435e-06,
"loss": 1.6776,
"step": 46900
},
{
"epoch": 0.8354565384843278,
"grad_norm": 0.15185341238975525,
"learning_rate": 3.5165175410410838e-06,
"loss": 1.6568,
"step": 47000
},
{
"epoch": 0.8372341055874859,
"grad_norm": 0.13279466331005096,
"learning_rate": 3.4428480874216407e-06,
"loss": 1.6847,
"step": 47100
},
{
"epoch": 0.839011672690644,
"grad_norm": 0.15089605748653412,
"learning_rate": 3.3699013942350367e-06,
"loss": 1.6742,
"step": 47200
},
{
"epoch": 0.8407892397938023,
"grad_norm": 0.17505663633346558,
"learning_rate": 3.2976799072117564e-06,
"loss": 1.6749,
"step": 47300
},
{
"epoch": 0.8425668068969604,
"grad_norm": 0.1557130366563797,
"learning_rate": 3.226186047767829e-06,
"loss": 1.6768,
"step": 47400
},
{
"epoch": 0.8443443740001185,
"grad_norm": 0.17593321204185486,
"learning_rate": 3.1554222129236505e-06,
"loss": 1.6651,
"step": 47500
},
{
"epoch": 0.8461219411032767,
"grad_norm": 0.14146077632904053,
"learning_rate": 3.0853907752236123e-06,
"loss": 1.6463,
"step": 47600
},
{
"epoch": 0.8478995082064348,
"grad_norm": 0.16316835582256317,
"learning_rate": 3.0160940826565566e-06,
"loss": 1.665,
"step": 47700
},
{
"epoch": 0.8496770753095929,
"grad_norm": 0.15772958099842072,
"learning_rate": 2.947534458577067e-06,
"loss": 1.6691,
"step": 47800
},
{
"epoch": 0.851454642412751,
"grad_norm": 0.15084761381149292,
"learning_rate": 2.879714201627548e-06,
"loss": 1.6562,
"step": 47900
},
{
"epoch": 0.8532322095159093,
"grad_norm": 0.1439027339220047,
"learning_rate": 2.812635585661169e-06,
"loss": 1.6736,
"step": 48000
},
{
"epoch": 0.8550097766190674,
"grad_norm": 0.14739972352981567,
"learning_rate": 2.746300859665632e-06,
"loss": 1.656,
"step": 48100
},
{
"epoch": 0.8567873437222255,
"grad_norm": 0.14110083878040314,
"learning_rate": 2.6807122476877637e-06,
"loss": 1.6568,
"step": 48200
},
{
"epoch": 0.8585649108253837,
"grad_norm": 0.17874117195606232,
"learning_rate": 2.6158719487589467e-06,
"loss": 1.6855,
"step": 48300
},
{
"epoch": 0.8603424779285418,
"grad_norm": 0.20830568671226501,
"learning_rate": 2.5517821368213927e-06,
"loss": 1.6613,
"step": 48400
},
{
"epoch": 0.8621200450316999,
"grad_norm": 0.24184918403625488,
"learning_rate": 2.4884449606552564e-06,
"loss": 1.6665,
"step": 48500
},
{
"epoch": 0.863897612134858,
"grad_norm": 0.18940667808055878,
"learning_rate": 2.4258625438065898e-06,
"loss": 1.6668,
"step": 48600
},
{
"epoch": 0.8656751792380163,
"grad_norm": 0.1848708689212799,
"learning_rate": 2.3640369845161464e-06,
"loss": 1.6668,
"step": 48700
},
{
"epoch": 0.8674527463411744,
"grad_norm": 0.14313864707946777,
"learning_rate": 2.302970355649034e-06,
"loss": 1.6648,
"step": 48800
},
{
"epoch": 0.8692303134443325,
"grad_norm": 0.144060418009758,
"learning_rate": 2.242664704625216e-06,
"loss": 1.6684,
"step": 48900
},
{
"epoch": 0.8710078805474907,
"grad_norm": 0.1597507745027542,
"learning_rate": 2.1831220533508556e-06,
"loss": 1.6655,
"step": 49000
},
{
"epoch": 0.8727854476506488,
"grad_norm": 0.1441722959280014,
"learning_rate": 2.124344398150546e-06,
"loss": 1.6778,
"step": 49100
},
{
"epoch": 0.8745630147538069,
"grad_norm": 0.16209940612316132,
"learning_rate": 2.0663337097003576e-06,
"loss": 1.6608,
"step": 49200
},
{
"epoch": 0.8763405818569651,
"grad_norm": 0.33218371868133545,
"learning_rate": 2.0090919329617876e-06,
"loss": 1.6411,
"step": 49300
},
{
"epoch": 0.8781181489601232,
"grad_norm": 0.1453067809343338,
"learning_rate": 1.9526209871165184e-06,
"loss": 1.6652,
"step": 49400
},
{
"epoch": 0.8798957160632814,
"grad_norm": 0.18067798018455505,
"learning_rate": 1.8969227655021098e-06,
"loss": 1.6777,
"step": 49500
},
{
"epoch": 0.8816732831664396,
"grad_norm": 0.14852333068847656,
"learning_rate": 1.8419991355484945e-06,
"loss": 1.6616,
"step": 49600
},
{
"epoch": 0.8834508502695977,
"grad_norm": 0.17207376658916473,
"learning_rate": 1.7878519387153763e-06,
"loss": 1.6693,
"step": 49700
},
{
"epoch": 0.8852284173727558,
"grad_norm": Infinity,
"learning_rate": 1.7350128216860744e-06,
"loss": 1.6699,
"step": 49800
},
{
"epoch": 0.8870059844759139,
"grad_norm": 0.1677992343902588,
"learning_rate": 1.6824161021340963e-06,
"loss": 1.662,
"step": 49900
},
{
"epoch": 0.8887835515790721,
"grad_norm": 0.1469300240278244,
"learning_rate": 1.6306011661451375e-06,
"loss": 1.6804,
"step": 50000
},
{
"epoch": 0.8905611186822302,
"grad_norm": 0.17192143201828003,
"learning_rate": 1.5795697509517316e-06,
"loss": 1.6682,
"step": 50100
},
{
"epoch": 0.8923386857853884,
"grad_norm": 0.16549953818321228,
"learning_rate": 1.529323567516805e-06,
"loss": 1.6442,
"step": 50200
},
{
"epoch": 0.8941162528885466,
"grad_norm": 0.21391679346561432,
"learning_rate": 1.4803549924437943e-06,
"loss": 1.6649,
"step": 50300
},
{
"epoch": 0.8958938199917047,
"grad_norm": 0.14870049059391022,
"learning_rate": 1.4316764061822001e-06,
"loss": 1.6526,
"step": 50400
},
{
"epoch": 0.8976713870948628,
"grad_norm": 0.14293646812438965,
"learning_rate": 1.3837880101939342e-06,
"loss": 1.6585,
"step": 50500
},
{
"epoch": 0.899448954198021,
"grad_norm": 0.19570215046405792,
"learning_rate": 1.3366914100639061e-06,
"loss": 1.6568,
"step": 50600
},
{
"epoch": 0.9012265213011791,
"grad_norm": 0.13882361352443695,
"learning_rate": 1.2903881848299797e-06,
"loss": 1.6541,
"step": 50700
},
{
"epoch": 0.9030040884043372,
"grad_norm": 0.1370488852262497,
"learning_rate": 1.244879886930031e-06,
"loss": 1.6625,
"step": 50800
},
{
"epoch": 0.9047816555074955,
"grad_norm": 0.18005253374576569,
"learning_rate": 1.200168042149899e-06,
"loss": 1.6708,
"step": 50900
},
{
"epoch": 0.9065592226106536,
"grad_norm": 0.18435104191303253,
"learning_rate": 1.156254149572225e-06,
"loss": 1.6642,
"step": 51000
},
{
"epoch": 0.9083367897138117,
"grad_norm": 0.1873762458562851,
"learning_rate": 1.1131396815261985e-06,
"loss": 1.6561,
"step": 51100
},
{
"epoch": 0.9101143568169698,
"grad_norm": 0.1366182565689087,
"learning_rate": 1.0708260835381927e-06,
"loss": 1.6456,
"step": 51200
},
{
"epoch": 0.911891923920128,
"grad_norm": 0.16981613636016846,
"learning_rate": 1.0293147742832966e-06,
"loss": 1.6732,
"step": 51300
},
{
"epoch": 0.9136694910232861,
"grad_norm": 0.18389706313610077,
"learning_rate": 9.88607145537751e-07,
"loss": 1.6679,
"step": 51400
},
{
"epoch": 0.9154470581264442,
"grad_norm": 0.15548266470432281,
"learning_rate": 9.487045621322799e-07,
"loss": 1.6619,
"step": 51500
},
{
"epoch": 0.9172246252296025,
"grad_norm": 0.18912291526794434,
"learning_rate": 9.096083619063473e-07,
"loss": 1.6736,
"step": 51600
},
{
"epoch": 0.9190021923327606,
"grad_norm": 0.16964443027973175,
"learning_rate": 8.713198556632885e-07,
"loss": 1.6748,
"step": 51700
},
{
"epoch": 0.9207797594359187,
"grad_norm": 0.14487318694591522,
"learning_rate": 8.338403271263589e-07,
"loss": 1.6692,
"step": 51800
},
{
"epoch": 0.9225573265390768,
"grad_norm": 0.15898752212524414,
"learning_rate": 7.971710328957132e-07,
"loss": 1.6646,
"step": 51900
},
{
"epoch": 0.924334893642235,
"grad_norm": 0.15839669108390808,
"learning_rate": 7.613132024062469e-07,
"loss": 1.662,
"step": 52000
},
{
"epoch": 0.9261124607453931,
"grad_norm": 0.15475721657276154,
"learning_rate": 7.262680378864017e-07,
"loss": 1.6671,
"step": 52100
},
{
"epoch": 0.9278900278485512,
"grad_norm": 0.1529570072889328,
"learning_rate": 6.920367143178452e-07,
"loss": 1.6726,
"step": 52200
},
{
"epoch": 0.9296675949517095,
"grad_norm": 0.1656341701745987,
"learning_rate": 6.586203793960771e-07,
"loss": 1.6623,
"step": 52300
},
{
"epoch": 0.9314451620548676,
"grad_norm": 0.1463242918252945,
"learning_rate": 6.260201534919491e-07,
"loss": 1.6662,
"step": 52400
},
{
"epoch": 0.9332227291580257,
"grad_norm": 0.13823845982551575,
"learning_rate": 5.942371296141058e-07,
"loss": 1.6453,
"step": 52500
},
{
"epoch": 0.9350002962611839,
"grad_norm": 0.14740724861621857,
"learning_rate": 5.632723733723366e-07,
"loss": 1.6574,
"step": 52600
},
{
"epoch": 0.936777863364342,
"grad_norm": 0.13956210017204285,
"learning_rate": 5.331269229418484e-07,
"loss": 1.6577,
"step": 52700
},
{
"epoch": 0.9385554304675001,
"grad_norm": 0.14831486344337463,
"learning_rate": 5.038017890284547e-07,
"loss": 1.6491,
"step": 52800
},
{
"epoch": 0.9403329975706582,
"grad_norm": 0.17318527400493622,
"learning_rate": 4.75297954834697e-07,
"loss": 1.6704,
"step": 52900
},
{
"epoch": 0.9421105646738165,
"grad_norm": 0.15295392274856567,
"learning_rate": 4.476163760268659e-07,
"loss": 1.6386,
"step": 53000
},
{
"epoch": 0.9438881317769746,
"grad_norm": 0.1569519191980362,
"learning_rate": 4.207579807029821e-07,
"loss": 1.6618,
"step": 53100
},
{
"epoch": 0.9456656988801327,
"grad_norm": 0.14801190793514252,
"learning_rate": 3.947236693616574e-07,
"loss": 1.6625,
"step": 53200
},
{
"epoch": 0.9474432659832909,
"grad_norm": 0.16047972440719604,
"learning_rate": 3.697623220822066e-07,
"loss": 1.6702,
"step": 53300
},
{
"epoch": 0.949220833086449,
"grad_norm": 0.17775191366672516,
"learning_rate": 3.453705075406932e-07,
"loss": 1.6671,
"step": 53400
},
{
"epoch": 0.9509984001896071,
"grad_norm": 0.15378819406032562,
"learning_rate": 3.218053045458136e-07,
"loss": 1.6644,
"step": 53500
},
{
"epoch": 0.9527759672927653,
"grad_norm": 0.14244551956653595,
"learning_rate": 2.990675031832174e-07,
"loss": 1.6791,
"step": 53600
},
{
"epoch": 0.9545535343959235,
"grad_norm": 0.1715448796749115,
"learning_rate": 2.7715786579772527e-07,
"loss": 1.6614,
"step": 53700
},
{
"epoch": 0.9563311014990816,
"grad_norm": 0.15638667345046997,
"learning_rate": 2.560771269677742e-07,
"loss": 1.6782,
"step": 53800
},
{
"epoch": 0.9581086686022398,
"grad_norm": 0.18975552916526794,
"learning_rate": 2.358259934807927e-07,
"loss": 1.6656,
"step": 53900
},
{
"epoch": 0.9598862357053979,
"grad_norm": 0.14271163940429688,
"learning_rate": 2.1640514430950055e-07,
"loss": 1.6574,
"step": 54000
},
{
"epoch": 0.961663802808556,
"grad_norm": 0.20691907405853271,
"learning_rate": 1.978152305891351e-07,
"loss": 1.6523,
"step": 54100
},
{
"epoch": 0.9634413699117141,
"grad_norm": 0.15907599031925201,
"learning_rate": 1.8005687559563834e-07,
"loss": 1.6763,
"step": 54200
},
{
"epoch": 0.9652189370148723,
"grad_norm": 0.13600093126296997,
"learning_rate": 1.6313067472474576e-07,
"loss": 1.6771,
"step": 54300
},
{
"epoch": 0.9669965041180305,
"grad_norm": 0.157552108168602,
"learning_rate": 1.470371954720301e-07,
"loss": 1.6601,
"step": 54400
},
{
"epoch": 0.9687740712211886,
"grad_norm": 0.14566664397716522,
"learning_rate": 1.3177697741387218e-07,
"loss": 1.6758,
"step": 54500
},
{
"epoch": 0.9705516383243468,
"grad_norm": 0.16759639978408813,
"learning_rate": 1.1735053218937808e-07,
"loss": 1.6591,
"step": 54600
},
{
"epoch": 0.9723292054275049,
"grad_norm": 0.16849561035633087,
"learning_rate": 1.0375834348320401e-07,
"loss": 1.6756,
"step": 54700
},
{
"epoch": 0.974106772530663,
"grad_norm": 0.17653703689575195,
"learning_rate": 9.100086700936649e-08,
"loss": 1.6621,
"step": 54800
},
{
"epoch": 0.9758843396338212,
"grad_norm": 0.17266112565994263,
"learning_rate": 7.907853049594905e-08,
"loss": 1.673,
"step": 54900
},
{
"epoch": 0.9776619067369793,
"grad_norm": 0.1719112992286682,
"learning_rate": 6.799173367075528e-08,
"loss": 1.6574,
"step": 55000
},
{
"epoch": 0.9794394738401375,
"grad_norm": 0.1552852988243103,
"learning_rate": 5.774084824792247e-08,
"loss": 1.6673,
"step": 55100
},
{
"epoch": 0.9812170409432956,
"grad_norm": 0.1883048564195633,
"learning_rate": 4.8326217915448114e-08,
"loss": 1.6688,
"step": 55200
},
{
"epoch": 0.9829946080464538,
"grad_norm": 0.16423700749874115,
"learning_rate": 3.97481583236714e-08,
"loss": 1.6706,
"step": 55300
},
{
"epoch": 0.9847721751496119,
"grad_norm": 0.14842714369297028,
"learning_rate": 3.2006957074690035e-08,
"loss": 1.6586,
"step": 55400
},
{
"epoch": 0.98654974225277,
"grad_norm": 0.13827410340309143,
"learning_rate": 2.510287371270681e-08,
"loss": 1.6537,
"step": 55500
},
{
"epoch": 0.9883273093559282,
"grad_norm": 0.14455120265483856,
"learning_rate": 1.903613971535323e-08,
"loss": 1.6689,
"step": 55600
},
{
"epoch": 0.9901048764590863,
"grad_norm": 0.14416253566741943,
"learning_rate": 1.385510381303745e-08,
"loss": 1.6618,
"step": 55700
},
{
"epoch": 0.9918824435622444,
"grad_norm": 0.14247511327266693,
"learning_rate": 9.455272617062139e-09,
"loss": 1.6753,
"step": 55800
},
{
"epoch": 0.9936600106654027,
"grad_norm": 0.15325996279716492,
"learning_rate": 5.893315412855427e-09,
"loss": 1.6689,
"step": 55900
},
{
"epoch": 0.9954375777685608,
"grad_norm": 0.17999306321144104,
"learning_rate": 3.169351624432437e-09,
"loss": 1.6623,
"step": 56000
},
{
"epoch": 0.9972151448717189,
"grad_norm": 0.27596476674079895,
"learning_rate": 1.283472579871603e-09,
"loss": 1.67,
"step": 56100
},
{
"epoch": 0.998992711974877,
"grad_norm": 0.14970412850379944,
"learning_rate": 2.3574150824490215e-10,
"loss": 1.6558,
"step": 56200
}
],
"logging_steps": 100,
"max_steps": 56256,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6446037739742167e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}