{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 5959,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033562678301728476,
"grad_norm": 3.027224540710449,
"learning_rate": 4.984896794764222e-05,
"loss": 2.5909,
"step": 20
},
{
"epoch": 0.006712535660345695,
"grad_norm": 2.6114368438720703,
"learning_rate": 4.968115455613358e-05,
"loss": 2.367,
"step": 40
},
{
"epoch": 0.010068803490518544,
"grad_norm": 18.757535934448242,
"learning_rate": 4.951334116462494e-05,
"loss": 2.286,
"step": 60
},
{
"epoch": 0.01342507132069139,
"grad_norm": 2.032654047012329,
"learning_rate": 4.9345527773116296e-05,
"loss": 2.2961,
"step": 80
},
{
"epoch": 0.01678133915086424,
"grad_norm": 1.8827602863311768,
"learning_rate": 4.9177714381607655e-05,
"loss": 2.2622,
"step": 100
},
{
"epoch": 0.020137606981037087,
"grad_norm": 1.8459181785583496,
"learning_rate": 4.9009900990099014e-05,
"loss": 2.2569,
"step": 120
},
{
"epoch": 0.023493874811209934,
"grad_norm": 1.9432493448257446,
"learning_rate": 4.8842087598590366e-05,
"loss": 2.2069,
"step": 140
},
{
"epoch": 0.02685014264138278,
"grad_norm": 1.9693349599838257,
"learning_rate": 4.8674274207081725e-05,
"loss": 2.2437,
"step": 160
},
{
"epoch": 0.03020641047155563,
"grad_norm": 1.8286640644073486,
"learning_rate": 4.8506460815573084e-05,
"loss": 2.1793,
"step": 180
},
{
"epoch": 0.03356267830172848,
"grad_norm": 1.883164644241333,
"learning_rate": 4.833864742406444e-05,
"loss": 2.2013,
"step": 200
},
{
"epoch": 0.036918946131901324,
"grad_norm": 10.042224884033203,
"learning_rate": 4.817922470213123e-05,
"loss": 2.2325,
"step": 220
},
{
"epoch": 0.040275213962074174,
"grad_norm": 1.909852147102356,
"learning_rate": 4.801141131062259e-05,
"loss": 2.2372,
"step": 240
},
{
"epoch": 0.043631481792247025,
"grad_norm": 1.8228334188461304,
"learning_rate": 4.784359791911395e-05,
"loss": 2.236,
"step": 260
},
{
"epoch": 0.04698774962241987,
"grad_norm": 1.8686975240707397,
"learning_rate": 4.767578452760531e-05,
"loss": 2.1993,
"step": 280
},
{
"epoch": 0.05034401745259272,
"grad_norm": 1.6455596685409546,
"learning_rate": 4.750797113609666e-05,
"loss": 2.2244,
"step": 300
},
{
"epoch": 0.05370028528276556,
"grad_norm": 1.5980720520019531,
"learning_rate": 4.734015774458802e-05,
"loss": 2.1412,
"step": 320
},
{
"epoch": 0.05705655311293841,
"grad_norm": 1.7958396673202515,
"learning_rate": 4.717234435307938e-05,
"loss": 2.1516,
"step": 340
},
{
"epoch": 0.06041282094311126,
"grad_norm": 1.6902369260787964,
"learning_rate": 4.700453096157074e-05,
"loss": 2.1842,
"step": 360
},
{
"epoch": 0.0637690887732841,
"grad_norm": 1.7754089832305908,
"learning_rate": 4.6836717570062096e-05,
"loss": 2.1591,
"step": 380
},
{
"epoch": 0.06712535660345696,
"grad_norm": 1.718556523323059,
"learning_rate": 4.6668904178553455e-05,
"loss": 2.155,
"step": 400
},
{
"epoch": 0.0704816244336298,
"grad_norm": 1.6255844831466675,
"learning_rate": 4.650109078704481e-05,
"loss": 2.2235,
"step": 420
},
{
"epoch": 0.07383789226380265,
"grad_norm": 1.6990654468536377,
"learning_rate": 4.6333277395536166e-05,
"loss": 2.1942,
"step": 440
},
{
"epoch": 0.0771941600939755,
"grad_norm": 1.7159547805786133,
"learning_rate": 4.6165464004027525e-05,
"loss": 2.2133,
"step": 460
},
{
"epoch": 0.08055042792414835,
"grad_norm": 1.8792229890823364,
"learning_rate": 4.5997650612518884e-05,
"loss": 2.1534,
"step": 480
},
{
"epoch": 0.08390669575432119,
"grad_norm": 1.4530068635940552,
"learning_rate": 4.582983722101024e-05,
"loss": 2.1365,
"step": 500
},
{
"epoch": 0.08726296358449405,
"grad_norm": 1.5878769159317017,
"learning_rate": 4.56620238295016e-05,
"loss": 2.1513,
"step": 520
},
{
"epoch": 0.09061923141466689,
"grad_norm": 1.6161599159240723,
"learning_rate": 4.5494210437992954e-05,
"loss": 2.1434,
"step": 540
},
{
"epoch": 0.09397549924483974,
"grad_norm": 1.5275565385818481,
"learning_rate": 4.5326397046484306e-05,
"loss": 2.1995,
"step": 560
},
{
"epoch": 0.09733176707501258,
"grad_norm": 1.3771929740905762,
"learning_rate": 4.5158583654975665e-05,
"loss": 2.1546,
"step": 580
},
{
"epoch": 0.10068803490518544,
"grad_norm": 1.630591630935669,
"learning_rate": 4.4990770263467024e-05,
"loss": 2.1317,
"step": 600
},
{
"epoch": 0.10404430273535828,
"grad_norm": 1.5199378728866577,
"learning_rate": 4.482295687195838e-05,
"loss": 2.116,
"step": 620
},
{
"epoch": 0.10740057056553112,
"grad_norm": 1.7431048154830933,
"learning_rate": 4.465514348044974e-05,
"loss": 2.1379,
"step": 640
},
{
"epoch": 0.11075683839570398,
"grad_norm": 1.6243093013763428,
"learning_rate": 4.44873300889411e-05,
"loss": 2.1478,
"step": 660
},
{
"epoch": 0.11411310622587682,
"grad_norm": 1.4836941957473755,
"learning_rate": 4.431951669743245e-05,
"loss": 2.1522,
"step": 680
},
{
"epoch": 0.11746937405604967,
"grad_norm": 1.5157350301742554,
"learning_rate": 4.415170330592381e-05,
"loss": 2.104,
"step": 700
},
{
"epoch": 0.12082564188622252,
"grad_norm": 1.3875478506088257,
"learning_rate": 4.398388991441517e-05,
"loss": 2.1495,
"step": 720
},
{
"epoch": 0.12418190971639537,
"grad_norm": 1.4270449876785278,
"learning_rate": 4.381607652290653e-05,
"loss": 2.1275,
"step": 740
},
{
"epoch": 0.1275381775465682,
"grad_norm": 1.522594690322876,
"learning_rate": 4.364826313139789e-05,
"loss": 2.1439,
"step": 760
},
{
"epoch": 0.13089444537674105,
"grad_norm": 1.3996539115905762,
"learning_rate": 4.348044973988925e-05,
"loss": 2.1384,
"step": 780
},
{
"epoch": 0.13425071320691392,
"grad_norm": 1.5052591562271118,
"learning_rate": 4.33126363483806e-05,
"loss": 2.1005,
"step": 800
},
{
"epoch": 0.13760698103708677,
"grad_norm": 1.4939976930618286,
"learning_rate": 4.314482295687196e-05,
"loss": 2.078,
"step": 820
},
{
"epoch": 0.1409632488672596,
"grad_norm": 1.5155856609344482,
"learning_rate": 4.297700956536332e-05,
"loss": 2.0766,
"step": 840
},
{
"epoch": 0.14431951669743245,
"grad_norm": 1.5032541751861572,
"learning_rate": 4.280919617385468e-05,
"loss": 2.1383,
"step": 860
},
{
"epoch": 0.1476757845276053,
"grad_norm": 1.5815625190734863,
"learning_rate": 4.2641382782346036e-05,
"loss": 2.138,
"step": 880
},
{
"epoch": 0.15103205235777814,
"grad_norm": 1.5529719591140747,
"learning_rate": 4.2473569390837395e-05,
"loss": 2.1252,
"step": 900
},
{
"epoch": 0.154388320187951,
"grad_norm": 1.5002542734146118,
"learning_rate": 4.230575599932875e-05,
"loss": 2.1158,
"step": 920
},
{
"epoch": 0.15774458801812385,
"grad_norm": 1.4248254299163818,
"learning_rate": 4.2137942607820106e-05,
"loss": 2.1441,
"step": 940
},
{
"epoch": 0.1611008558482967,
"grad_norm": 1.3779915571212769,
"learning_rate": 4.1970129216311465e-05,
"loss": 2.1105,
"step": 960
},
{
"epoch": 0.16445712367846954,
"grad_norm": 1.439987301826477,
"learning_rate": 4.1802315824802824e-05,
"loss": 2.0988,
"step": 980
},
{
"epoch": 0.16781339150864238,
"grad_norm": 1.3819688558578491,
"learning_rate": 4.163450243329418e-05,
"loss": 2.0594,
"step": 1000
},
{
"epoch": 0.17116965933881523,
"grad_norm": 1.4762790203094482,
"learning_rate": 4.146668904178554e-05,
"loss": 2.1448,
"step": 1020
},
{
"epoch": 0.1745259271689881,
"grad_norm": 1.3910346031188965,
"learning_rate": 4.1298875650276894e-05,
"loss": 2.1189,
"step": 1040
},
{
"epoch": 0.17788219499916094,
"grad_norm": 1.3728772401809692,
"learning_rate": 4.1131062258768246e-05,
"loss": 2.1025,
"step": 1060
},
{
"epoch": 0.18123846282933379,
"grad_norm": 1.466235637664795,
"learning_rate": 4.0963248867259605e-05,
"loss": 2.1163,
"step": 1080
},
{
"epoch": 0.18459473065950663,
"grad_norm": 1.418556809425354,
"learning_rate": 4.0795435475750964e-05,
"loss": 2.11,
"step": 1100
},
{
"epoch": 0.18795099848967947,
"grad_norm": 1.3588992357254028,
"learning_rate": 4.062762208424232e-05,
"loss": 2.1174,
"step": 1120
},
{
"epoch": 0.19130726631985231,
"grad_norm": 1.490565538406372,
"learning_rate": 4.045980869273368e-05,
"loss": 2.1176,
"step": 1140
},
{
"epoch": 0.19466353415002516,
"grad_norm": 1.4399744272232056,
"learning_rate": 4.029199530122504e-05,
"loss": 2.1222,
"step": 1160
},
{
"epoch": 0.19801980198019803,
"grad_norm": 1.379418134689331,
"learning_rate": 4.012418190971639e-05,
"loss": 2.1003,
"step": 1180
},
{
"epoch": 0.20137606981037087,
"grad_norm": 1.4052506685256958,
"learning_rate": 3.995636851820775e-05,
"loss": 2.0951,
"step": 1200
},
{
"epoch": 0.20473233764054372,
"grad_norm": 1.3870118856430054,
"learning_rate": 3.978855512669911e-05,
"loss": 2.1201,
"step": 1220
},
{
"epoch": 0.20808860547071656,
"grad_norm": 1.334796667098999,
"learning_rate": 3.962074173519047e-05,
"loss": 2.1252,
"step": 1240
},
{
"epoch": 0.2114448733008894,
"grad_norm": 1.2433503866195679,
"learning_rate": 3.945292834368183e-05,
"loss": 2.1048,
"step": 1260
},
{
"epoch": 0.21480114113106225,
"grad_norm": 1.422275185585022,
"learning_rate": 3.928511495217319e-05,
"loss": 2.0829,
"step": 1280
},
{
"epoch": 0.21815740896123512,
"grad_norm": 1.429117202758789,
"learning_rate": 3.911730156066454e-05,
"loss": 2.0545,
"step": 1300
},
{
"epoch": 0.22151367679140796,
"grad_norm": 1.40293550491333,
"learning_rate": 3.89494881691559e-05,
"loss": 2.0176,
"step": 1320
},
{
"epoch": 0.2248699446215808,
"grad_norm": 1.2488166093826294,
"learning_rate": 3.878167477764726e-05,
"loss": 2.0846,
"step": 1340
},
{
"epoch": 0.22822621245175365,
"grad_norm": 1.2881702184677124,
"learning_rate": 3.861386138613862e-05,
"loss": 2.069,
"step": 1360
},
{
"epoch": 0.2315824802819265,
"grad_norm": 1.3793479204177856,
"learning_rate": 3.8446047994629976e-05,
"loss": 2.0808,
"step": 1380
},
{
"epoch": 0.23493874811209933,
"grad_norm": 1.3845285177230835,
"learning_rate": 3.8278234603121335e-05,
"loss": 2.1008,
"step": 1400
},
{
"epoch": 0.2382950159422722,
"grad_norm": 1.4609571695327759,
"learning_rate": 3.811042121161269e-05,
"loss": 2.085,
"step": 1420
},
{
"epoch": 0.24165128377244505,
"grad_norm": 1.3259775638580322,
"learning_rate": 3.7942607820104046e-05,
"loss": 2.0631,
"step": 1440
},
{
"epoch": 0.2450075516026179,
"grad_norm": 1.248612880706787,
"learning_rate": 3.7774794428595405e-05,
"loss": 2.0908,
"step": 1460
},
{
"epoch": 0.24836381943279073,
"grad_norm": 1.4042353630065918,
"learning_rate": 3.7606981037086764e-05,
"loss": 2.0705,
"step": 1480
},
{
"epoch": 0.2517200872629636,
"grad_norm": 1.2640959024429321,
"learning_rate": 3.743916764557812e-05,
"loss": 2.0626,
"step": 1500
},
{
"epoch": 0.2550763550931364,
"grad_norm": 1.355383038520813,
"learning_rate": 3.7271354254069475e-05,
"loss": 2.1025,
"step": 1520
},
{
"epoch": 0.25843262292330926,
"grad_norm": 1.2184921503067017,
"learning_rate": 3.7103540862560834e-05,
"loss": 2.0187,
"step": 1540
},
{
"epoch": 0.2617888907534821,
"grad_norm": 1.2813303470611572,
"learning_rate": 3.6935727471052186e-05,
"loss": 2.087,
"step": 1560
},
{
"epoch": 0.26514515858365495,
"grad_norm": 1.3968662023544312,
"learning_rate": 3.6767914079543545e-05,
"loss": 2.0643,
"step": 1580
},
{
"epoch": 0.26850142641382785,
"grad_norm": 1.337203025817871,
"learning_rate": 3.6600100688034904e-05,
"loss": 2.0377,
"step": 1600
},
{
"epoch": 0.2718576942440007,
"grad_norm": 1.3844518661499023,
"learning_rate": 3.643228729652626e-05,
"loss": 2.0807,
"step": 1620
},
{
"epoch": 0.27521396207417353,
"grad_norm": 1.787477731704712,
"learning_rate": 3.626447390501762e-05,
"loss": 2.0775,
"step": 1640
},
{
"epoch": 0.2785702299043464,
"grad_norm": 1.3247798681259155,
"learning_rate": 3.609666051350898e-05,
"loss": 2.0996,
"step": 1660
},
{
"epoch": 0.2819264977345192,
"grad_norm": 1.2748069763183594,
"learning_rate": 3.592884712200033e-05,
"loss": 2.0737,
"step": 1680
},
{
"epoch": 0.28528276556469206,
"grad_norm": 1.346238374710083,
"learning_rate": 3.576103373049169e-05,
"loss": 2.1076,
"step": 1700
},
{
"epoch": 0.2886390333948649,
"grad_norm": 1.3545295000076294,
"learning_rate": 3.559322033898305e-05,
"loss": 2.0663,
"step": 1720
},
{
"epoch": 0.29199530122503775,
"grad_norm": 1.3100897073745728,
"learning_rate": 3.542540694747441e-05,
"loss": 2.0872,
"step": 1740
},
{
"epoch": 0.2953515690552106,
"grad_norm": 1.3519947528839111,
"learning_rate": 3.525759355596577e-05,
"loss": 2.0269,
"step": 1760
},
{
"epoch": 0.29870783688538344,
"grad_norm": 1.2966337203979492,
"learning_rate": 3.508978016445713e-05,
"loss": 2.0784,
"step": 1780
},
{
"epoch": 0.3020641047155563,
"grad_norm": 1.3702917098999023,
"learning_rate": 3.492196677294848e-05,
"loss": 2.0906,
"step": 1800
},
{
"epoch": 0.3054203725457291,
"grad_norm": 1.338409185409546,
"learning_rate": 3.475415338143984e-05,
"loss": 2.0899,
"step": 1820
},
{
"epoch": 0.308776640375902,
"grad_norm": 1.3070242404937744,
"learning_rate": 3.45863399899312e-05,
"loss": 2.0514,
"step": 1840
},
{
"epoch": 0.31213290820607487,
"grad_norm": 1.2753440141677856,
"learning_rate": 3.441852659842256e-05,
"loss": 2.0809,
"step": 1860
},
{
"epoch": 0.3154891760362477,
"grad_norm": 1.3125340938568115,
"learning_rate": 3.4250713206913916e-05,
"loss": 2.0372,
"step": 1880
},
{
"epoch": 0.31884544386642055,
"grad_norm": 1.3050023317337036,
"learning_rate": 3.4082899815405275e-05,
"loss": 2.072,
"step": 1900
},
{
"epoch": 0.3222017116965934,
"grad_norm": 1.4283207654953003,
"learning_rate": 3.391508642389663e-05,
"loss": 2.0489,
"step": 1920
},
{
"epoch": 0.32555797952676624,
"grad_norm": 1.4320636987686157,
"learning_rate": 3.3747273032387986e-05,
"loss": 2.1004,
"step": 1940
},
{
"epoch": 0.3289142473569391,
"grad_norm": 1.3432669639587402,
"learning_rate": 3.3579459640879345e-05,
"loss": 2.0837,
"step": 1960
},
{
"epoch": 0.3322705151871119,
"grad_norm": 1.22812819480896,
"learning_rate": 3.3411646249370704e-05,
"loss": 2.0651,
"step": 1980
},
{
"epoch": 0.33562678301728477,
"grad_norm": 1.3818988800048828,
"learning_rate": 3.324383285786206e-05,
"loss": 2.0446,
"step": 2000
},
{
"epoch": 0.3389830508474576,
"grad_norm": 1.2890549898147583,
"learning_rate": 3.3076019466353415e-05,
"loss": 2.0326,
"step": 2020
},
{
"epoch": 0.34233931867763046,
"grad_norm": 1.341260552406311,
"learning_rate": 3.2908206074844774e-05,
"loss": 2.0733,
"step": 2040
},
{
"epoch": 0.3456955865078033,
"grad_norm": 1.2970625162124634,
"learning_rate": 3.274039268333613e-05,
"loss": 2.0749,
"step": 2060
},
{
"epoch": 0.3490518543379762,
"grad_norm": 1.3300013542175293,
"learning_rate": 3.2572579291827485e-05,
"loss": 2.0283,
"step": 2080
},
{
"epoch": 0.35240812216814904,
"grad_norm": 1.372938871383667,
"learning_rate": 3.2404765900318844e-05,
"loss": 2.0703,
"step": 2100
},
{
"epoch": 0.3557643899983219,
"grad_norm": 1.355000615119934,
"learning_rate": 3.22369525088102e-05,
"loss": 2.0641,
"step": 2120
},
{
"epoch": 0.3591206578284947,
"grad_norm": 1.3288871049880981,
"learning_rate": 3.206913911730156e-05,
"loss": 2.0169,
"step": 2140
},
{
"epoch": 0.36247692565866757,
"grad_norm": 1.2701308727264404,
"learning_rate": 3.190132572579292e-05,
"loss": 2.1025,
"step": 2160
},
{
"epoch": 0.3658331934888404,
"grad_norm": 1.3620569705963135,
"learning_rate": 3.173351233428428e-05,
"loss": 2.0621,
"step": 2180
},
{
"epoch": 0.36918946131901326,
"grad_norm": 1.1960408687591553,
"learning_rate": 3.156569894277563e-05,
"loss": 2.0452,
"step": 2200
},
{
"epoch": 0.3725457291491861,
"grad_norm": 1.3412033319473267,
"learning_rate": 3.139788555126699e-05,
"loss": 2.0655,
"step": 2220
},
{
"epoch": 0.37590199697935894,
"grad_norm": 1.3558080196380615,
"learning_rate": 3.123007215975835e-05,
"loss": 2.0277,
"step": 2240
},
{
"epoch": 0.3792582648095318,
"grad_norm": 1.2877964973449707,
"learning_rate": 3.106225876824971e-05,
"loss": 2.0384,
"step": 2260
},
{
"epoch": 0.38261453263970463,
"grad_norm": 1.3294323682785034,
"learning_rate": 3.089444537674107e-05,
"loss": 1.9997,
"step": 2280
},
{
"epoch": 0.3859708004698775,
"grad_norm": 1.2727768421173096,
"learning_rate": 3.072663198523243e-05,
"loss": 1.9849,
"step": 2300
},
{
"epoch": 0.3893270683000503,
"grad_norm": 1.2134991884231567,
"learning_rate": 3.055881859372378e-05,
"loss": 2.0725,
"step": 2320
},
{
"epoch": 0.3926833361302232,
"grad_norm": 1.2667819261550903,
"learning_rate": 3.0391005202215138e-05,
"loss": 2.0429,
"step": 2340
},
{
"epoch": 0.39603960396039606,
"grad_norm": 1.3160877227783203,
"learning_rate": 3.0223191810706497e-05,
"loss": 2.0154,
"step": 2360
},
{
"epoch": 0.3993958717905689,
"grad_norm": 1.2551796436309814,
"learning_rate": 3.0055378419197856e-05,
"loss": 2.0383,
"step": 2380
},
{
"epoch": 0.40275213962074174,
"grad_norm": 1.3122833967208862,
"learning_rate": 2.988756502768921e-05,
"loss": 2.0263,
"step": 2400
},
{
"epoch": 0.4061084074509146,
"grad_norm": 1.269384503364563,
"learning_rate": 2.971975163618057e-05,
"loss": 2.1241,
"step": 2420
},
{
"epoch": 0.40946467528108743,
"grad_norm": 1.3302291631698608,
"learning_rate": 2.955193824467193e-05,
"loss": 2.0616,
"step": 2440
},
{
"epoch": 0.4128209431112603,
"grad_norm": 1.2562564611434937,
"learning_rate": 2.9384124853163285e-05,
"loss": 2.0613,
"step": 2460
},
{
"epoch": 0.4161772109414331,
"grad_norm": 1.2373179197311401,
"learning_rate": 2.9216311461654644e-05,
"loss": 2.0202,
"step": 2480
},
{
"epoch": 0.41953347877160596,
"grad_norm": 1.3266092538833618,
"learning_rate": 2.9048498070145996e-05,
"loss": 2.0174,
"step": 2500
},
{
"epoch": 0.4228897466017788,
"grad_norm": 1.27751624584198,
"learning_rate": 2.8880684678637355e-05,
"loss": 2.0549,
"step": 2520
},
{
"epoch": 0.42624601443195165,
"grad_norm": 1.2350589036941528,
"learning_rate": 2.871287128712871e-05,
"loss": 2.0126,
"step": 2540
},
{
"epoch": 0.4296022822621245,
"grad_norm": 1.3619853258132935,
"learning_rate": 2.854505789562007e-05,
"loss": 1.9725,
"step": 2560
},
{
"epoch": 0.4329585500922974,
"grad_norm": 1.354687213897705,
"learning_rate": 2.837724450411143e-05,
"loss": 2.0492,
"step": 2580
},
{
"epoch": 0.43631481792247023,
"grad_norm": 1.3418916463851929,
"learning_rate": 2.8209431112602784e-05,
"loss": 1.976,
"step": 2600
},
{
"epoch": 0.4396710857526431,
"grad_norm": 1.232704520225525,
"learning_rate": 2.8041617721094143e-05,
"loss": 2.0035,
"step": 2620
},
{
"epoch": 0.4430273535828159,
"grad_norm": 1.3459244966506958,
"learning_rate": 2.7873804329585502e-05,
"loss": 2.0612,
"step": 2640
},
{
"epoch": 0.44638362141298876,
"grad_norm": 1.2673430442810059,
"learning_rate": 2.7705990938076858e-05,
"loss": 2.0134,
"step": 2660
},
{
"epoch": 0.4497398892431616,
"grad_norm": 1.229880928993225,
"learning_rate": 2.7538177546568216e-05,
"loss": 2.0324,
"step": 2680
},
{
"epoch": 0.45309615707333445,
"grad_norm": 1.3053526878356934,
"learning_rate": 2.7370364155059575e-05,
"loss": 2.0031,
"step": 2700
},
{
"epoch": 0.4564524249035073,
"grad_norm": 1.3416264057159424,
"learning_rate": 2.720255076355093e-05,
"loss": 2.0513,
"step": 2720
},
{
"epoch": 0.45980869273368014,
"grad_norm": 1.3494229316711426,
"learning_rate": 2.703473737204229e-05,
"loss": 2.0527,
"step": 2740
},
{
"epoch": 0.463164960563853,
"grad_norm": 1.2861367464065552,
"learning_rate": 2.686692398053365e-05,
"loss": 2.0327,
"step": 2760
},
{
"epoch": 0.4665212283940258,
"grad_norm": 1.260968565940857,
"learning_rate": 2.6699110589025004e-05,
"loss": 2.0399,
"step": 2780
},
{
"epoch": 0.46987749622419867,
"grad_norm": 1.4496228694915771,
"learning_rate": 2.6531297197516363e-05,
"loss": 2.0532,
"step": 2800
},
{
"epoch": 0.47323376405437156,
"grad_norm": 1.2266364097595215,
"learning_rate": 2.6363483806007722e-05,
"loss": 2.0196,
"step": 2820
},
{
"epoch": 0.4765900318845444,
"grad_norm": 1.4289458990097046,
"learning_rate": 2.6195670414499078e-05,
"loss": 2.0379,
"step": 2840
},
{
"epoch": 0.47994629971471725,
"grad_norm": 1.3267526626586914,
"learning_rate": 2.6027857022990437e-05,
"loss": 2.0068,
"step": 2860
},
{
"epoch": 0.4833025675448901,
"grad_norm": 1.308477520942688,
"learning_rate": 2.5860043631481796e-05,
"loss": 2.0204,
"step": 2880
},
{
"epoch": 0.48665883537506294,
"grad_norm": 1.2613425254821777,
"learning_rate": 2.569223023997315e-05,
"loss": 2.0319,
"step": 2900
},
{
"epoch": 0.4900151032052358,
"grad_norm": 1.2851049900054932,
"learning_rate": 2.552441684846451e-05,
"loss": 2.044,
"step": 2920
},
{
"epoch": 0.4933713710354086,
"grad_norm": 1.2507027387619019,
"learning_rate": 2.535660345695587e-05,
"loss": 1.9964,
"step": 2940
},
{
"epoch": 0.49672763886558147,
"grad_norm": 1.3747198581695557,
"learning_rate": 2.5188790065447225e-05,
"loss": 1.9995,
"step": 2960
},
{
"epoch": 0.5000839066957543,
"grad_norm": 1.2839736938476562,
"learning_rate": 2.5020976673938584e-05,
"loss": 2.0137,
"step": 2980
},
{
"epoch": 0.5034401745259272,
"grad_norm": 1.428585410118103,
"learning_rate": 2.485316328242994e-05,
"loss": 2.0149,
"step": 3000
},
{
"epoch": 0.5067964423561,
"grad_norm": 1.3017884492874146,
"learning_rate": 2.46853498909213e-05,
"loss": 2.0023,
"step": 3020
},
{
"epoch": 0.5101527101862728,
"grad_norm": 1.2209047079086304,
"learning_rate": 2.4517536499412654e-05,
"loss": 2.0092,
"step": 3040
},
{
"epoch": 0.5135089780164457,
"grad_norm": 1.2542091608047485,
"learning_rate": 2.4349723107904013e-05,
"loss": 2.0137,
"step": 3060
},
{
"epoch": 0.5168652458466185,
"grad_norm": 1.1981834173202515,
"learning_rate": 2.4181909716395372e-05,
"loss": 2.0013,
"step": 3080
},
{
"epoch": 0.5202215136767914,
"grad_norm": 1.3641618490219116,
"learning_rate": 2.4014096324886727e-05,
"loss": 2.0412,
"step": 3100
},
{
"epoch": 0.5235777815069642,
"grad_norm": 1.2675738334655762,
"learning_rate": 2.3846282933378083e-05,
"loss": 2.0088,
"step": 3120
},
{
"epoch": 0.5269340493371371,
"grad_norm": 1.2499597072601318,
"learning_rate": 2.3678469541869442e-05,
"loss": 1.9926,
"step": 3140
},
{
"epoch": 0.5302903171673099,
"grad_norm": 1.260705828666687,
"learning_rate": 2.3510656150360798e-05,
"loss": 1.9775,
"step": 3160
},
{
"epoch": 0.5336465849974829,
"grad_norm": 1.2815592288970947,
"learning_rate": 2.3342842758852156e-05,
"loss": 1.9964,
"step": 3180
},
{
"epoch": 0.5370028528276557,
"grad_norm": 1.191841721534729,
"learning_rate": 2.3175029367343515e-05,
"loss": 1.9858,
"step": 3200
},
{
"epoch": 0.5403591206578285,
"grad_norm": 1.2903523445129395,
"learning_rate": 2.300721597583487e-05,
"loss": 2.0101,
"step": 3220
},
{
"epoch": 0.5437153884880014,
"grad_norm": 1.257596731185913,
"learning_rate": 2.283940258432623e-05,
"loss": 1.9916,
"step": 3240
},
{
"epoch": 0.5470716563181742,
"grad_norm": 1.2740247249603271,
"learning_rate": 2.267158919281759e-05,
"loss": 1.9724,
"step": 3260
},
{
"epoch": 0.5504279241483471,
"grad_norm": 1.2834815979003906,
"learning_rate": 2.2503775801308944e-05,
"loss": 2.0144,
"step": 3280
},
{
"epoch": 0.5537841919785199,
"grad_norm": 1.2792032957077026,
"learning_rate": 2.2335962409800303e-05,
"loss": 2.0241,
"step": 3300
},
{
"epoch": 0.5571404598086928,
"grad_norm": 1.3966797590255737,
"learning_rate": 2.2168149018291662e-05,
"loss": 2.0202,
"step": 3320
},
{
"epoch": 0.5604967276388656,
"grad_norm": 1.2389239072799683,
"learning_rate": 2.2000335626783018e-05,
"loss": 2.0221,
"step": 3340
},
{
"epoch": 0.5638529954690384,
"grad_norm": 1.2616690397262573,
"learning_rate": 2.1832522235274374e-05,
"loss": 2.0266,
"step": 3360
},
{
"epoch": 0.5672092632992113,
"grad_norm": 1.2490557432174683,
"learning_rate": 2.1664708843765732e-05,
"loss": 2.011,
"step": 3380
},
{
"epoch": 0.5705655311293841,
"grad_norm": 1.2576720714569092,
"learning_rate": 2.149689545225709e-05,
"loss": 2.0368,
"step": 3400
},
{
"epoch": 0.573921798959557,
"grad_norm": 1.2379933595657349,
"learning_rate": 2.1329082060748447e-05,
"loss": 1.9964,
"step": 3420
},
{
"epoch": 0.5772780667897298,
"grad_norm": 1.259509801864624,
"learning_rate": 2.1161268669239806e-05,
"loss": 1.9827,
"step": 3440
},
{
"epoch": 0.5806343346199027,
"grad_norm": 1.2831110954284668,
"learning_rate": 2.0993455277731165e-05,
"loss": 2.0438,
"step": 3460
},
{
"epoch": 0.5839906024500755,
"grad_norm": 1.331018090248108,
"learning_rate": 2.082564188622252e-05,
"loss": 2.0088,
"step": 3480
},
{
"epoch": 0.5873468702802483,
"grad_norm": 1.2574256658554077,
"learning_rate": 2.065782849471388e-05,
"loss": 2.0474,
"step": 3500
},
{
"epoch": 0.5907031381104212,
"grad_norm": 1.283894658088684,
"learning_rate": 2.049001510320524e-05,
"loss": 2.0543,
"step": 3520
},
{
"epoch": 0.594059405940594,
"grad_norm": 1.2675976753234863,
"learning_rate": 2.0322201711696594e-05,
"loss": 2.0182,
"step": 3540
},
{
"epoch": 0.5974156737707669,
"grad_norm": 1.2579065561294556,
"learning_rate": 2.0154388320187953e-05,
"loss": 1.9765,
"step": 3560
},
{
"epoch": 0.6007719416009397,
"grad_norm": 1.2290009260177612,
"learning_rate": 1.9986574928679312e-05,
"loss": 1.9856,
"step": 3580
},
{
"epoch": 0.6041282094311126,
"grad_norm": 1.3220741748809814,
"learning_rate": 1.9818761537170667e-05,
"loss": 1.9789,
"step": 3600
},
{
"epoch": 0.6074844772612854,
"grad_norm": 1.3620175123214722,
"learning_rate": 1.9650948145662023e-05,
"loss": 1.973,
"step": 3620
},
{
"epoch": 0.6108407450914582,
"grad_norm": 1.3105921745300293,
"learning_rate": 1.9483134754153382e-05,
"loss": 2.0177,
"step": 3640
},
{
"epoch": 0.6141970129216311,
"grad_norm": 1.303818941116333,
"learning_rate": 1.931532136264474e-05,
"loss": 2.0125,
"step": 3660
},
{
"epoch": 0.617553280751804,
"grad_norm": 1.2042673826217651,
"learning_rate": 1.9147507971136096e-05,
"loss": 2.0166,
"step": 3680
},
{
"epoch": 0.6209095485819769,
"grad_norm": 1.2778286933898926,
"learning_rate": 1.8979694579627455e-05,
"loss": 1.9743,
"step": 3700
},
{
"epoch": 0.6242658164121497,
"grad_norm": 1.3580365180969238,
"learning_rate": 1.8811881188118814e-05,
"loss": 2.0165,
"step": 3720
},
{
"epoch": 0.6276220842423226,
"grad_norm": 1.2921738624572754,
"learning_rate": 1.864406779661017e-05,
"loss": 1.96,
"step": 3740
},
{
"epoch": 0.6309783520724954,
"grad_norm": 1.347383737564087,
"learning_rate": 1.847625440510153e-05,
"loss": 1.9895,
"step": 3760
},
{
"epoch": 0.6343346199026683,
"grad_norm": 1.2058087587356567,
"learning_rate": 1.8308441013592888e-05,
"loss": 1.9735,
"step": 3780
},
{
"epoch": 0.6376908877328411,
"grad_norm": 1.3013418912887573,
"learning_rate": 1.8140627622084243e-05,
"loss": 1.96,
"step": 3800
},
{
"epoch": 0.641047155563014,
"grad_norm": 1.2196553945541382,
"learning_rate": 1.7972814230575602e-05,
"loss": 1.9808,
"step": 3820
},
{
"epoch": 0.6444034233931868,
"grad_norm": 1.2661124467849731,
"learning_rate": 1.7805000839066958e-05,
"loss": 2.0022,
"step": 3840
},
{
"epoch": 0.6477596912233596,
"grad_norm": 1.3183765411376953,
"learning_rate": 1.7637187447558313e-05,
"loss": 1.9696,
"step": 3860
},
{
"epoch": 0.6511159590535325,
"grad_norm": 1.2164355516433716,
"learning_rate": 1.7469374056049672e-05,
"loss": 1.9387,
"step": 3880
},
{
"epoch": 0.6544722268837053,
"grad_norm": 1.2561441659927368,
"learning_rate": 1.730156066454103e-05,
"loss": 2.0252,
"step": 3900
},
{
"epoch": 0.6578284947138782,
"grad_norm": 1.240861177444458,
"learning_rate": 1.7133747273032387e-05,
"loss": 1.9737,
"step": 3920
},
{
"epoch": 0.661184762544051,
"grad_norm": 1.2271203994750977,
"learning_rate": 1.6965933881523746e-05,
"loss": 1.991,
"step": 3940
},
{
"epoch": 0.6645410303742239,
"grad_norm": 1.224165678024292,
"learning_rate": 1.6798120490015105e-05,
"loss": 1.9771,
"step": 3960
},
{
"epoch": 0.6678972982043967,
"grad_norm": 1.222956657409668,
"learning_rate": 1.663030709850646e-05,
"loss": 1.9659,
"step": 3980
},
{
"epoch": 0.6712535660345695,
"grad_norm": 1.2539464235305786,
"learning_rate": 1.646249370699782e-05,
"loss": 1.9811,
"step": 4000
},
{
"epoch": 0.6746098338647424,
"grad_norm": 1.272801160812378,
"learning_rate": 1.629468031548918e-05,
"loss": 2.0171,
"step": 4020
},
{
"epoch": 0.6779661016949152,
"grad_norm": 1.2539173364639282,
"learning_rate": 1.6126866923980534e-05,
"loss": 1.953,
"step": 4040
},
{
"epoch": 0.6813223695250881,
"grad_norm": 1.2918198108673096,
"learning_rate": 1.5959053532471893e-05,
"loss": 1.9742,
"step": 4060
},
{
"epoch": 0.6846786373552609,
"grad_norm": 1.3085472583770752,
"learning_rate": 1.5791240140963252e-05,
"loss": 1.9693,
"step": 4080
},
{
"epoch": 0.6880349051854338,
"grad_norm": 1.2754298448562622,
"learning_rate": 1.5623426749454607e-05,
"loss": 1.9819,
"step": 4100
},
{
"epoch": 0.6913911730156066,
"grad_norm": 1.3267842531204224,
"learning_rate": 1.5455613357945963e-05,
"loss": 1.9549,
"step": 4120
},
{
"epoch": 0.6947474408457794,
"grad_norm": 1.1842241287231445,
"learning_rate": 1.5287799966437322e-05,
"loss": 1.9816,
"step": 4140
},
{
"epoch": 0.6981037086759524,
"grad_norm": 1.2266135215759277,
"learning_rate": 1.511998657492868e-05,
"loss": 1.956,
"step": 4160
},
{
"epoch": 0.7014599765061252,
"grad_norm": 1.2384594678878784,
"learning_rate": 1.4952173183420038e-05,
"loss": 1.9761,
"step": 4180
},
{
"epoch": 0.7048162443362981,
"grad_norm": 1.2703733444213867,
"learning_rate": 1.4784359791911395e-05,
"loss": 1.9602,
"step": 4200
},
{
"epoch": 0.7081725121664709,
"grad_norm": 1.3347383737564087,
"learning_rate": 1.4616546400402753e-05,
"loss": 1.9863,
"step": 4220
},
{
"epoch": 0.7115287799966438,
"grad_norm": 1.2556087970733643,
"learning_rate": 1.4448733008894112e-05,
"loss": 1.9937,
"step": 4240
},
{
"epoch": 0.7148850478268166,
"grad_norm": 1.2546402215957642,
"learning_rate": 1.4280919617385469e-05,
"loss": 1.9428,
"step": 4260
},
{
"epoch": 0.7182413156569895,
"grad_norm": 1.20867121219635,
"learning_rate": 1.4113106225876826e-05,
"loss": 1.9702,
"step": 4280
},
{
"epoch": 0.7215975834871623,
"grad_norm": 1.2617672681808472,
"learning_rate": 1.3945292834368185e-05,
"loss": 1.9688,
"step": 4300
},
{
"epoch": 0.7249538513173351,
"grad_norm": 1.306674599647522,
"learning_rate": 1.3777479442859542e-05,
"loss": 1.9497,
"step": 4320
},
{
"epoch": 0.728310119147508,
"grad_norm": 1.3713186979293823,
"learning_rate": 1.3609666051350898e-05,
"loss": 2.0179,
"step": 4340
},
{
"epoch": 0.7316663869776808,
"grad_norm": 1.1903387308120728,
"learning_rate": 1.3441852659842255e-05,
"loss": 1.9861,
"step": 4360
},
{
"epoch": 0.7350226548078537,
"grad_norm": 1.2360427379608154,
"learning_rate": 1.3274039268333612e-05,
"loss": 1.9742,
"step": 4380
},
{
"epoch": 0.7383789226380265,
"grad_norm": 1.1744493246078491,
"learning_rate": 1.3106225876824971e-05,
"loss": 2.0021,
"step": 4400
},
{
"epoch": 0.7417351904681994,
"grad_norm": 1.25652015209198,
"learning_rate": 1.2938412485316329e-05,
"loss": 2.0034,
"step": 4420
},
{
"epoch": 0.7450914582983722,
"grad_norm": 1.2795733213424683,
"learning_rate": 1.2770599093807686e-05,
"loss": 2.0015,
"step": 4440
},
{
"epoch": 0.748447726128545,
"grad_norm": 1.2552342414855957,
"learning_rate": 1.2602785702299045e-05,
"loss": 1.9703,
"step": 4460
},
{
"epoch": 0.7518039939587179,
"grad_norm": 1.196215271949768,
"learning_rate": 1.2434972310790402e-05,
"loss": 1.9434,
"step": 4480
},
{
"epoch": 0.7551602617888907,
"grad_norm": 1.2752282619476318,
"learning_rate": 1.226715891928176e-05,
"loss": 2.0011,
"step": 4500
},
{
"epoch": 0.7585165296190636,
"grad_norm": 1.361426591873169,
"learning_rate": 1.2099345527773117e-05,
"loss": 2.001,
"step": 4520
},
{
"epoch": 0.7618727974492364,
"grad_norm": 1.295401930809021,
"learning_rate": 1.1931532136264474e-05,
"loss": 1.9785,
"step": 4540
},
{
"epoch": 0.7652290652794093,
"grad_norm": 1.1889725923538208,
"learning_rate": 1.1763718744755831e-05,
"loss": 2.0203,
"step": 4560
},
{
"epoch": 0.7685853331095821,
"grad_norm": 1.2194198369979858,
"learning_rate": 1.159590535324719e-05,
"loss": 1.954,
"step": 4580
},
{
"epoch": 0.771941600939755,
"grad_norm": 1.1998612880706787,
"learning_rate": 1.1428091961738547e-05,
"loss": 1.9129,
"step": 4600
},
{
"epoch": 0.7752978687699278,
"grad_norm": 1.1526966094970703,
"learning_rate": 1.1260278570229905e-05,
"loss": 1.941,
"step": 4620
},
{
"epoch": 0.7786541366001006,
"grad_norm": 1.2833491563796997,
"learning_rate": 1.1092465178721262e-05,
"loss": 1.9606,
"step": 4640
},
{
"epoch": 0.7820104044302736,
"grad_norm": 1.3405060768127441,
"learning_rate": 1.0924651787212619e-05,
"loss": 1.9828,
"step": 4660
},
{
"epoch": 0.7853666722604464,
"grad_norm": 1.2940865755081177,
"learning_rate": 1.0756838395703978e-05,
"loss": 1.9309,
"step": 4680
},
{
"epoch": 0.7887229400906193,
"grad_norm": 1.3541054725646973,
"learning_rate": 1.0589025004195335e-05,
"loss": 1.9213,
"step": 4700
},
{
"epoch": 0.7920792079207921,
"grad_norm": 1.2736974954605103,
"learning_rate": 1.0421211612686693e-05,
"loss": 1.9526,
"step": 4720
},
{
"epoch": 0.795435475750965,
"grad_norm": 1.2431901693344116,
"learning_rate": 1.0253398221178052e-05,
"loss": 1.9348,
"step": 4740
},
{
"epoch": 0.7987917435811378,
"grad_norm": 1.245354175567627,
"learning_rate": 1.0085584829669407e-05,
"loss": 1.9294,
"step": 4760
},
{
"epoch": 0.8021480114113106,
"grad_norm": 1.2285710573196411,
"learning_rate": 9.917771438160766e-06,
"loss": 1.9766,
"step": 4780
},
{
"epoch": 0.8055042792414835,
"grad_norm": 1.1929337978363037,
"learning_rate": 9.749958046652123e-06,
"loss": 1.9265,
"step": 4800
},
{
"epoch": 0.8088605470716563,
"grad_norm": 1.3306503295898438,
"learning_rate": 9.58214465514348e-06,
"loss": 1.966,
"step": 4820
},
{
"epoch": 0.8122168149018292,
"grad_norm": 1.203856110572815,
"learning_rate": 9.41433126363484e-06,
"loss": 1.9775,
"step": 4840
},
{
"epoch": 0.815573082732002,
"grad_norm": 1.2907806634902954,
"learning_rate": 9.246517872126197e-06,
"loss": 1.9544,
"step": 4860
},
{
"epoch": 0.8189293505621749,
"grad_norm": 1.36453378200531,
"learning_rate": 9.078704480617552e-06,
"loss": 1.935,
"step": 4880
},
{
"epoch": 0.8222856183923477,
"grad_norm": 1.228378415107727,
"learning_rate": 8.910891089108911e-06,
"loss": 1.9942,
"step": 4900
},
{
"epoch": 0.8256418862225205,
"grad_norm": 1.2804518938064575,
"learning_rate": 8.743077697600269e-06,
"loss": 1.9865,
"step": 4920
},
{
"epoch": 0.8289981540526934,
"grad_norm": 1.2092355489730835,
"learning_rate": 8.575264306091626e-06,
"loss": 1.9931,
"step": 4940
},
{
"epoch": 0.8323544218828662,
"grad_norm": 1.248852252960205,
"learning_rate": 8.407450914582985e-06,
"loss": 1.9831,
"step": 4960
},
{
"epoch": 0.8357106897130391,
"grad_norm": 1.3584181070327759,
"learning_rate": 8.239637523074342e-06,
"loss": 1.9706,
"step": 4980
},
{
"epoch": 0.8390669575432119,
"grad_norm": 1.250568151473999,
"learning_rate": 8.0718241315657e-06,
"loss": 1.9449,
"step": 5000
},
{
"epoch": 0.8424232253733848,
"grad_norm": 1.3448213338851929,
"learning_rate": 7.904010740057057e-06,
"loss": 1.9221,
"step": 5020
},
{
"epoch": 0.8457794932035576,
"grad_norm": 1.2519199848175049,
"learning_rate": 7.736197348548414e-06,
"loss": 1.9444,
"step": 5040
},
{
"epoch": 0.8491357610337305,
"grad_norm": 1.2377424240112305,
"learning_rate": 7.568383957039772e-06,
"loss": 1.9438,
"step": 5060
},
{
"epoch": 0.8524920288639033,
"grad_norm": 1.3130027055740356,
"learning_rate": 7.40057056553113e-06,
"loss": 1.9791,
"step": 5080
},
{
"epoch": 0.8558482966940761,
"grad_norm": 1.2841044664382935,
"learning_rate": 7.232757174022488e-06,
"loss": 1.9539,
"step": 5100
},
{
"epoch": 0.859204564524249,
"grad_norm": 1.2105425596237183,
"learning_rate": 7.064943782513845e-06,
"loss": 1.9738,
"step": 5120
},
{
"epoch": 0.8625608323544219,
"grad_norm": 1.2560657262802124,
"learning_rate": 6.897130391005202e-06,
"loss": 1.9486,
"step": 5140
},
{
"epoch": 0.8659171001845948,
"grad_norm": 1.2118816375732422,
"learning_rate": 6.72931699949656e-06,
"loss": 1.9107,
"step": 5160
},
{
"epoch": 0.8692733680147676,
"grad_norm": 1.2164198160171509,
"learning_rate": 6.561503607987918e-06,
"loss": 2.0168,
"step": 5180
},
{
"epoch": 0.8726296358449405,
"grad_norm": 1.358729362487793,
"learning_rate": 6.393690216479275e-06,
"loss": 1.9401,
"step": 5200
},
{
"epoch": 0.8759859036751133,
"grad_norm": 1.2905333042144775,
"learning_rate": 6.225876824970633e-06,
"loss": 1.9235,
"step": 5220
},
{
"epoch": 0.8793421715052862,
"grad_norm": 1.2965201139450073,
"learning_rate": 6.058063433461991e-06,
"loss": 1.9166,
"step": 5240
},
{
"epoch": 0.882698439335459,
"grad_norm": 1.3677794933319092,
"learning_rate": 5.890250041953348e-06,
"loss": 1.9327,
"step": 5260
},
{
"epoch": 0.8860547071656318,
"grad_norm": 1.312015175819397,
"learning_rate": 5.722436650444705e-06,
"loss": 1.9032,
"step": 5280
},
{
"epoch": 0.8894109749958047,
"grad_norm": 1.2666916847229004,
"learning_rate": 5.554623258936063e-06,
"loss": 1.9014,
"step": 5300
},
{
"epoch": 0.8927672428259775,
"grad_norm": 1.2646892070770264,
"learning_rate": 5.3868098674274214e-06,
"loss": 1.9586,
"step": 5320
},
{
"epoch": 0.8961235106561504,
"grad_norm": 1.2514413595199585,
"learning_rate": 5.218996475918779e-06,
"loss": 1.959,
"step": 5340
},
{
"epoch": 0.8994797784863232,
"grad_norm": 1.28076171875,
"learning_rate": 5.051183084410136e-06,
"loss": 1.9393,
"step": 5360
},
{
"epoch": 0.902836046316496,
"grad_norm": 1.3480361700057983,
"learning_rate": 4.883369692901494e-06,
"loss": 1.9074,
"step": 5380
},
{
"epoch": 0.9061923141466689,
"grad_norm": 1.3494782447814941,
"learning_rate": 4.715556301392851e-06,
"loss": 1.9162,
"step": 5400
},
{
"epoch": 0.9095485819768417,
"grad_norm": 1.3703207969665527,
"learning_rate": 4.547742909884209e-06,
"loss": 1.9728,
"step": 5420
},
{
"epoch": 0.9129048498070146,
"grad_norm": 1.2115719318389893,
"learning_rate": 4.379929518375567e-06,
"loss": 1.9381,
"step": 5440
},
{
"epoch": 0.9162611176371874,
"grad_norm": 1.2599093914031982,
"learning_rate": 4.212116126866924e-06,
"loss": 1.9405,
"step": 5460
},
{
"epoch": 0.9196173854673603,
"grad_norm": 1.1931716203689575,
"learning_rate": 4.044302735358282e-06,
"loss": 1.9205,
"step": 5480
},
{
"epoch": 0.9229736532975331,
"grad_norm": 1.2630369663238525,
"learning_rate": 3.876489343849639e-06,
"loss": 1.9257,
"step": 5500
},
{
"epoch": 0.926329921127706,
"grad_norm": 1.26536226272583,
"learning_rate": 3.7086759523409966e-06,
"loss": 1.9136,
"step": 5520
},
{
"epoch": 0.9296861889578788,
"grad_norm": 1.25338876247406,
"learning_rate": 3.5408625608323547e-06,
"loss": 1.875,
"step": 5540
},
{
"epoch": 0.9330424567880516,
"grad_norm": 1.2348542213439941,
"learning_rate": 3.3730491693237124e-06,
"loss": 1.959,
"step": 5560
},
{
"epoch": 0.9363987246182245,
"grad_norm": 1.3011400699615479,
"learning_rate": 3.2052357778150696e-06,
"loss": 1.9571,
"step": 5580
},
{
"epoch": 0.9397549924483973,
"grad_norm": 1.343929409980774,
"learning_rate": 3.0374223863064273e-06,
"loss": 1.9492,
"step": 5600
},
{
"epoch": 0.9431112602785703,
"grad_norm": 1.281829833984375,
"learning_rate": 2.869608994797785e-06,
"loss": 1.9429,
"step": 5620
},
{
"epoch": 0.9464675281087431,
"grad_norm": 1.1878955364227295,
"learning_rate": 2.7017956032891427e-06,
"loss": 1.9288,
"step": 5640
},
{
"epoch": 0.949823795938916,
"grad_norm": 1.2426503896713257,
"learning_rate": 2.5339822117805e-06,
"loss": 1.9498,
"step": 5660
},
{
"epoch": 0.9531800637690888,
"grad_norm": 1.3009203672409058,
"learning_rate": 2.366168820271858e-06,
"loss": 1.9254,
"step": 5680
},
{
"epoch": 0.9565363315992617,
"grad_norm": 1.3365298509597778,
"learning_rate": 2.1983554287632153e-06,
"loss": 1.9745,
"step": 5700
},
{
"epoch": 0.9598925994294345,
"grad_norm": 1.2553895711898804,
"learning_rate": 2.030542037254573e-06,
"loss": 1.9636,
"step": 5720
},
{
"epoch": 0.9632488672596073,
"grad_norm": 1.168750286102295,
"learning_rate": 1.8627286457459307e-06,
"loss": 1.9333,
"step": 5740
},
{
"epoch": 0.9666051350897802,
"grad_norm": 1.3157403469085693,
"learning_rate": 1.6949152542372882e-06,
"loss": 1.9635,
"step": 5760
},
{
"epoch": 0.969961402919953,
"grad_norm": 1.2229833602905273,
"learning_rate": 1.5271018627286458e-06,
"loss": 1.9534,
"step": 5780
},
{
"epoch": 0.9733176707501259,
"grad_norm": 1.3683537244796753,
"learning_rate": 1.3592884712200033e-06,
"loss": 1.9916,
"step": 5800
},
{
"epoch": 0.9766739385802987,
"grad_norm": 1.1630358695983887,
"learning_rate": 1.191475079711361e-06,
"loss": 1.9098,
"step": 5820
},
{
"epoch": 0.9800302064104716,
"grad_norm": 1.249770164489746,
"learning_rate": 1.0236616882027187e-06,
"loss": 1.9411,
"step": 5840
},
{
"epoch": 0.9833864742406444,
"grad_norm": 1.2611877918243408,
"learning_rate": 8.558482966940762e-07,
"loss": 1.9093,
"step": 5860
},
{
"epoch": 0.9867427420708172,
"grad_norm": 1.2660034894943237,
"learning_rate": 6.880349051854338e-07,
"loss": 1.9,
"step": 5880
},
{
"epoch": 0.9900990099009901,
"grad_norm": 1.263708233833313,
"learning_rate": 5.202215136767914e-07,
"loss": 1.9256,
"step": 5900
},
{
"epoch": 0.9934552777311629,
"grad_norm": 1.257265329360962,
"learning_rate": 3.5240812216814904e-07,
"loss": 1.9474,
"step": 5920
},
{
"epoch": 0.9968115455613358,
"grad_norm": 1.3897913694381714,
"learning_rate": 1.8459473065950665e-07,
"loss": 1.9567,
"step": 5940
}
],
"logging_steps": 20,
"max_steps": 5959,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5959,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.81749288803369e+17,
"train_batch_size": 18,
"trial_name": null,
"trial_params": null
}