Llama-Poro-2-70B-SFT / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9997936016511868,
"eval_steps": 500,
"global_step": 4844,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000412796697626419,
"grad_norm": 19.038115839779415,
"learning_rate": 3.424657534246575e-08,
"loss": 0.9872,
"step": 1
},
{
"epoch": 0.008255933952528379,
"grad_norm": 2.2532416397119936,
"learning_rate": 6.849315068493151e-07,
"loss": 0.9325,
"step": 20
},
{
"epoch": 0.016511867905056758,
"grad_norm": 1.273128125464113,
"learning_rate": 1.3698630136986302e-06,
"loss": 0.7772,
"step": 40
},
{
"epoch": 0.02476780185758514,
"grad_norm": 1.7056377918705128,
"learning_rate": 2.0547945205479454e-06,
"loss": 0.6878,
"step": 60
},
{
"epoch": 0.033023735810113516,
"grad_norm": 1.1237902715212524,
"learning_rate": 2.7397260273972604e-06,
"loss": 0.6306,
"step": 80
},
{
"epoch": 0.0412796697626419,
"grad_norm": 0.7107592373281232,
"learning_rate": 3.4246575342465754e-06,
"loss": 0.5945,
"step": 100
},
{
"epoch": 0.04953560371517028,
"grad_norm": 0.5435287549705509,
"learning_rate": 4.109589041095891e-06,
"loss": 0.5678,
"step": 120
},
{
"epoch": 0.05779153766769866,
"grad_norm": 0.5519820327604449,
"learning_rate": 4.7945205479452054e-06,
"loss": 0.5645,
"step": 140
},
{
"epoch": 0.06604747162022703,
"grad_norm": 0.45781758347158724,
"learning_rate": 4.9998904438825655e-06,
"loss": 0.5435,
"step": 160
},
{
"epoch": 0.07430340557275542,
"grad_norm": 0.4113409226796853,
"learning_rate": 4.9993538656061865e-06,
"loss": 0.5384,
"step": 180
},
{
"epoch": 0.0825593395252838,
"grad_norm": 0.4061348667993117,
"learning_rate": 4.998370238474193e-06,
"loss": 0.5251,
"step": 200
},
{
"epoch": 0.09081527347781218,
"grad_norm": 0.3595538605503618,
"learning_rate": 4.996939738423808e-06,
"loss": 0.5268,
"step": 220
},
{
"epoch": 0.09907120743034056,
"grad_norm": 0.43476988938274347,
"learning_rate": 4.995062621322529e-06,
"loss": 0.5218,
"step": 240
},
{
"epoch": 0.10732714138286893,
"grad_norm": 0.37354960760902434,
"learning_rate": 4.992739222922349e-06,
"loss": 0.5092,
"step": 260
},
{
"epoch": 0.11558307533539731,
"grad_norm": 0.541197152311886,
"learning_rate": 4.989969958799716e-06,
"loss": 0.5164,
"step": 280
},
{
"epoch": 0.1238390092879257,
"grad_norm": 0.6905500202450845,
"learning_rate": 4.9867553242811925e-06,
"loss": 0.5093,
"step": 300
},
{
"epoch": 0.13209494324045407,
"grad_norm": 0.35631823372676763,
"learning_rate": 4.983095894354858e-06,
"loss": 0.5029,
"step": 320
},
{
"epoch": 0.14035087719298245,
"grad_norm": 0.41074164917056266,
"learning_rate": 4.978992323567467e-06,
"loss": 0.5081,
"step": 340
},
{
"epoch": 0.14860681114551083,
"grad_norm": 0.38311089041727175,
"learning_rate": 4.974445345907372e-06,
"loss": 0.4942,
"step": 360
},
{
"epoch": 0.1568627450980392,
"grad_norm": 1.276850727632135,
"learning_rate": 4.9694557746732365e-06,
"loss": 0.4941,
"step": 380
},
{
"epoch": 0.1651186790505676,
"grad_norm": 0.37634442822337005,
"learning_rate": 4.9640245023285645e-06,
"loss": 0.488,
"step": 400
},
{
"epoch": 0.17337461300309598,
"grad_norm": 0.34093150208574735,
"learning_rate": 4.958152500342071e-06,
"loss": 0.4936,
"step": 420
},
{
"epoch": 0.18163054695562436,
"grad_norm": 0.3697170253248781,
"learning_rate": 4.951840819013918e-06,
"loss": 0.4823,
"step": 440
},
{
"epoch": 0.18988648090815274,
"grad_norm": 0.3673753466126514,
"learning_rate": 4.94509058728785e-06,
"loss": 0.4865,
"step": 460
},
{
"epoch": 0.19814241486068113,
"grad_norm": 0.36105295583859276,
"learning_rate": 4.937903012549266e-06,
"loss": 0.4754,
"step": 480
},
{
"epoch": 0.20639834881320948,
"grad_norm": 0.36846225805717936,
"learning_rate": 4.930279380409266e-06,
"loss": 0.4704,
"step": 500
},
{
"epoch": 0.21465428276573786,
"grad_norm": 0.3173068336889516,
"learning_rate": 4.922221054474686e-06,
"loss": 0.473,
"step": 520
},
{
"epoch": 0.22291021671826625,
"grad_norm": 0.35259900729321336,
"learning_rate": 4.913729476104205e-06,
"loss": 0.4595,
"step": 540
},
{
"epoch": 0.23116615067079463,
"grad_norm": 0.36729347937445916,
"learning_rate": 4.9048061641505355e-06,
"loss": 0.4741,
"step": 560
},
{
"epoch": 0.239422084623323,
"grad_norm": 0.3318610632934225,
"learning_rate": 4.8954527146887455e-06,
"loss": 0.4648,
"step": 580
},
{
"epoch": 0.2476780185758514,
"grad_norm": 5.235550050863072,
"learning_rate": 4.885670800730784e-06,
"loss": 0.4768,
"step": 600
},
{
"epoch": 0.25593395252837975,
"grad_norm": 0.3182067180223638,
"learning_rate": 4.87546217192623e-06,
"loss": 0.4659,
"step": 620
},
{
"epoch": 0.26418988648090813,
"grad_norm": 0.3656873978361783,
"learning_rate": 4.864828654249344e-06,
"loss": 0.4598,
"step": 640
},
{
"epoch": 0.2724458204334365,
"grad_norm": 0.3501798939645083,
"learning_rate": 4.853772149672461e-06,
"loss": 0.4681,
"step": 660
},
{
"epoch": 0.2807017543859649,
"grad_norm": 0.3777815400079612,
"learning_rate": 4.842294635825794e-06,
"loss": 0.4615,
"step": 680
},
{
"epoch": 0.2889576883384933,
"grad_norm": 0.38852070579564624,
"learning_rate": 4.830398165643704e-06,
"loss": 0.4518,
"step": 700
},
{
"epoch": 0.29721362229102166,
"grad_norm": 0.32960545022053456,
"learning_rate": 4.818084866997499e-06,
"loss": 0.4551,
"step": 720
},
{
"epoch": 0.30546955624355004,
"grad_norm": 0.3581229581442626,
"learning_rate": 4.805356942314833e-06,
"loss": 0.4585,
"step": 740
},
{
"epoch": 0.3137254901960784,
"grad_norm": 0.32377521697576256,
"learning_rate": 4.792216668185765e-06,
"loss": 0.4469,
"step": 760
},
{
"epoch": 0.3219814241486068,
"grad_norm": 0.32217232079348435,
"learning_rate": 4.778666394955554e-06,
"loss": 0.4626,
"step": 780
},
{
"epoch": 0.3302373581011352,
"grad_norm": 0.4595767097639659,
"learning_rate": 4.764708546304267e-06,
"loss": 0.4624,
"step": 800
},
{
"epoch": 0.3384932920536636,
"grad_norm": 0.32949086041842535,
"learning_rate": 4.75034561881326e-06,
"loss": 0.446,
"step": 820
},
{
"epoch": 0.34674922600619196,
"grad_norm": 0.4894422691515238,
"learning_rate": 4.735580181518631e-06,
"loss": 0.4541,
"step": 840
},
{
"epoch": 0.35500515995872034,
"grad_norm": 0.3280899378337465,
"learning_rate": 4.720414875451701e-06,
"loss": 0.4487,
"step": 860
},
{
"epoch": 0.3632610939112487,
"grad_norm": 1.837588625073277,
"learning_rate": 4.704852413166629e-06,
"loss": 0.4518,
"step": 880
},
{
"epoch": 0.3715170278637771,
"grad_norm": 0.40018979567066226,
"learning_rate": 4.688895578255228e-06,
"loss": 0.4465,
"step": 900
},
{
"epoch": 0.3797729618163055,
"grad_norm": 0.3242631003772359,
"learning_rate": 4.672547224849072e-06,
"loss": 0.4411,
"step": 920
},
{
"epoch": 0.38802889576883387,
"grad_norm": 0.3442813940284623,
"learning_rate": 4.655810277108994e-06,
"loss": 0.4457,
"step": 940
},
{
"epoch": 0.39628482972136225,
"grad_norm": 0.32602733959556546,
"learning_rate": 4.638687728702054e-06,
"loss": 0.4433,
"step": 960
},
{
"epoch": 0.40454076367389064,
"grad_norm": 0.354500321094183,
"learning_rate": 4.6211826422660685e-06,
"loss": 0.4383,
"step": 980
},
{
"epoch": 0.41279669762641896,
"grad_norm": 0.348616547530873,
"learning_rate": 4.6032981488618155e-06,
"loss": 0.4413,
"step": 1000
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.32717814527815386,
"learning_rate": 4.585037447412993e-06,
"loss": 0.4368,
"step": 1020
},
{
"epoch": 0.4293085655314757,
"grad_norm": 0.6796110151903153,
"learning_rate": 4.566403804134042e-06,
"loss": 0.4436,
"step": 1040
},
{
"epoch": 0.4375644994840041,
"grad_norm": 0.3316261638741136,
"learning_rate": 4.547400551945927e-06,
"loss": 0.4425,
"step": 1060
},
{
"epoch": 0.4458204334365325,
"grad_norm": 0.40026371472445244,
"learning_rate": 4.528031089880001e-06,
"loss": 0.4457,
"step": 1080
},
{
"epoch": 0.4540763673890609,
"grad_norm": 0.32173726498871624,
"learning_rate": 4.508298882470025e-06,
"loss": 0.4437,
"step": 1100
},
{
"epoch": 0.46233230134158926,
"grad_norm": 0.33503635231733775,
"learning_rate": 4.488207459132484e-06,
"loss": 0.4379,
"step": 1120
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.35244768504197327,
"learning_rate": 4.467760413535303e-06,
"loss": 0.44,
"step": 1140
},
{
"epoch": 0.478844169246646,
"grad_norm": 0.31115577007431877,
"learning_rate": 4.44696140295505e-06,
"loss": 0.4329,
"step": 1160
},
{
"epoch": 0.4871001031991744,
"grad_norm": 0.3238091738004505,
"learning_rate": 4.425814147622786e-06,
"loss": 0.4337,
"step": 1180
},
{
"epoch": 0.4953560371517028,
"grad_norm": 0.49541016985287156,
"learning_rate": 4.404322430058634e-06,
"loss": 0.4287,
"step": 1200
},
{
"epoch": 0.5036119711042312,
"grad_norm": 0.37221572328367447,
"learning_rate": 4.382490094395223e-06,
"loss": 0.464,
"step": 1220
},
{
"epoch": 0.5118679050567595,
"grad_norm": 0.33878190692897947,
"learning_rate": 4.360321045690098e-06,
"loss": 0.4409,
"step": 1240
},
{
"epoch": 0.5201238390092879,
"grad_norm": 0.3936174788244509,
"learning_rate": 4.337819249227243e-06,
"loss": 0.4343,
"step": 1260
},
{
"epoch": 0.5283797729618163,
"grad_norm": 0.33598616505477236,
"learning_rate": 4.3149887298078275e-06,
"loss": 0.431,
"step": 1280
},
{
"epoch": 0.5366357069143447,
"grad_norm": 0.3519965938188954,
"learning_rate": 4.2918335710303035e-06,
"loss": 0.426,
"step": 1300
},
{
"epoch": 0.544891640866873,
"grad_norm": 0.320454217828392,
"learning_rate": 4.268357914559994e-06,
"loss": 0.4304,
"step": 1320
},
{
"epoch": 0.5531475748194015,
"grad_norm": 0.3719935953584569,
"learning_rate": 4.244565959388287e-06,
"loss": 0.436,
"step": 1340
},
{
"epoch": 0.5614035087719298,
"grad_norm": 0.3613570450407545,
"learning_rate": 4.2204619610815846e-06,
"loss": 0.4335,
"step": 1360
},
{
"epoch": 0.5696594427244582,
"grad_norm": 0.3126539947940401,
"learning_rate": 4.19605023102012e-06,
"loss": 0.4263,
"step": 1380
},
{
"epoch": 0.5779153766769866,
"grad_norm": 0.6458529290452903,
"learning_rate": 4.171335135626809e-06,
"loss": 0.4265,
"step": 1400
},
{
"epoch": 0.586171310629515,
"grad_norm": 0.3604632525755521,
"learning_rate": 4.146321095586238e-06,
"loss": 0.4362,
"step": 1420
},
{
"epoch": 0.5944272445820433,
"grad_norm": 0.30925366125534925,
"learning_rate": 4.121012585053958e-06,
"loss": 0.4233,
"step": 1440
},
{
"epoch": 0.6026831785345718,
"grad_norm": 0.3100306182604728,
"learning_rate": 4.095414130856215e-06,
"loss": 0.4274,
"step": 1460
},
{
"epoch": 0.6109391124871001,
"grad_norm": 0.3253333926411185,
"learning_rate": 4.069530311680247e-06,
"loss": 0.4215,
"step": 1480
},
{
"epoch": 0.6191950464396285,
"grad_norm": 0.3161203579495709,
"learning_rate": 4.043365757255323e-06,
"loss": 0.4195,
"step": 1500
},
{
"epoch": 0.6274509803921569,
"grad_norm": 0.5372305576307022,
"learning_rate": 4.016925147524638e-06,
"loss": 0.4207,
"step": 1520
},
{
"epoch": 0.6357069143446853,
"grad_norm": 0.4217012521666707,
"learning_rate": 3.99021321180823e-06,
"loss": 0.4189,
"step": 1540
},
{
"epoch": 0.6439628482972136,
"grad_norm": 0.31058380585253026,
"learning_rate": 3.96323472795707e-06,
"loss": 0.4224,
"step": 1560
},
{
"epoch": 0.6522187822497421,
"grad_norm": 0.3096196075520948,
"learning_rate": 3.935994521498468e-06,
"loss": 0.4298,
"step": 1580
},
{
"epoch": 0.6604747162022704,
"grad_norm": 0.31321817530598023,
"learning_rate": 3.908497464772946e-06,
"loss": 0.4165,
"step": 1600
},
{
"epoch": 0.6687306501547987,
"grad_norm": 0.34189314788820424,
"learning_rate": 3.880748476062751e-06,
"loss": 0.4209,
"step": 1620
},
{
"epoch": 0.6769865841073271,
"grad_norm": 0.314020426119392,
"learning_rate": 3.852752518712135e-06,
"loss": 0.4235,
"step": 1640
},
{
"epoch": 0.6852425180598555,
"grad_norm": 0.33309865122629745,
"learning_rate": 3.824514600239591e-06,
"loss": 0.4272,
"step": 1660
},
{
"epoch": 0.6934984520123839,
"grad_norm": 0.31011244271423327,
"learning_rate": 3.796039771442169e-06,
"loss": 0.4195,
"step": 1680
},
{
"epoch": 0.7017543859649122,
"grad_norm": 0.31754715779100456,
"learning_rate": 3.767333125492072e-06,
"loss": 0.4177,
"step": 1700
},
{
"epoch": 0.7100103199174407,
"grad_norm": 0.325717634618246,
"learning_rate": 3.7383997970256525e-06,
"loss": 0.4208,
"step": 1720
},
{
"epoch": 0.718266253869969,
"grad_norm": 0.3221005127147903,
"learning_rate": 3.7092449612250083e-06,
"loss": 0.4258,
"step": 1740
},
{
"epoch": 0.7265221878224974,
"grad_norm": 2.3793212357938245,
"learning_rate": 3.6798738328923162e-06,
"loss": 0.4117,
"step": 1760
},
{
"epoch": 0.7347781217750258,
"grad_norm": 0.34615783300395964,
"learning_rate": 3.650291665517085e-06,
"loss": 0.4259,
"step": 1780
},
{
"epoch": 0.7430340557275542,
"grad_norm": 0.30258388313281476,
"learning_rate": 3.6205037503364827e-06,
"loss": 0.4138,
"step": 1800
},
{
"epoch": 0.7512899896800825,
"grad_norm": 0.319420203738432,
"learning_rate": 3.590515415388919e-06,
"loss": 0.414,
"step": 1820
},
{
"epoch": 0.759545923632611,
"grad_norm": 0.3131116585624073,
"learning_rate": 3.5603320245610375e-06,
"loss": 0.4171,
"step": 1840
},
{
"epoch": 0.7678018575851393,
"grad_norm": 0.44357805104504766,
"learning_rate": 3.529958976628304e-06,
"loss": 0.4149,
"step": 1860
},
{
"epoch": 0.7760577915376677,
"grad_norm": 0.3482318385617704,
"learning_rate": 3.499401704289347e-06,
"loss": 0.4068,
"step": 1880
},
{
"epoch": 0.7843137254901961,
"grad_norm": 1.4359924674280429,
"learning_rate": 3.468665673194237e-06,
"loss": 0.412,
"step": 1900
},
{
"epoch": 0.7925696594427245,
"grad_norm": 0.30332274823266964,
"learning_rate": 3.437756380966866e-06,
"loss": 0.4136,
"step": 1920
},
{
"epoch": 0.8008255933952528,
"grad_norm": 0.30856700243712654,
"learning_rate": 3.4066793562216135e-06,
"loss": 0.41,
"step": 1940
},
{
"epoch": 0.8090815273477813,
"grad_norm": 0.42071490847162873,
"learning_rate": 3.375440157574462e-06,
"loss": 0.4061,
"step": 1960
},
{
"epoch": 0.8173374613003096,
"grad_norm": 2.6302213294970933,
"learning_rate": 3.3440443726487583e-06,
"loss": 0.4025,
"step": 1980
},
{
"epoch": 0.8255933952528379,
"grad_norm": 0.29774501528511826,
"learning_rate": 3.3124976170757694e-06,
"loss": 0.4201,
"step": 2000
},
{
"epoch": 0.8338493292053664,
"grad_norm": 0.32604189076380047,
"learning_rate": 3.2808055334902487e-06,
"loss": 0.4091,
"step": 2020
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.3285267181834793,
"learning_rate": 3.2489737905211537e-06,
"loss": 0.4175,
"step": 2040
},
{
"epoch": 0.8503611971104231,
"grad_norm": 0.310713417485478,
"learning_rate": 3.217008081777726e-06,
"loss": 0.4092,
"step": 2060
},
{
"epoch": 0.8586171310629515,
"grad_norm": 0.3073088712514811,
"learning_rate": 3.184914124831098e-06,
"loss": 0.4053,
"step": 2080
},
{
"epoch": 0.8668730650154799,
"grad_norm": 0.337054135152426,
"learning_rate": 3.1526976601916153e-06,
"loss": 0.4112,
"step": 2100
},
{
"epoch": 0.8751289989680082,
"grad_norm": 0.35614170658863653,
"learning_rate": 3.1203644502820592e-06,
"loss": 0.4111,
"step": 2120
},
{
"epoch": 0.8833849329205367,
"grad_norm": 0.31220727299527445,
"learning_rate": 3.0879202784069407e-06,
"loss": 0.4143,
"step": 2140
},
{
"epoch": 0.891640866873065,
"grad_norm": 0.31989316355962877,
"learning_rate": 3.05537094771807e-06,
"loss": 0.4207,
"step": 2160
},
{
"epoch": 0.8998968008255934,
"grad_norm": 0.36471398705088465,
"learning_rate": 3.022722280176568e-06,
"loss": 0.4121,
"step": 2180
},
{
"epoch": 0.9081527347781218,
"grad_norm": 0.3241199840126995,
"learning_rate": 2.9899801155115204e-06,
"loss": 0.418,
"step": 2200
},
{
"epoch": 0.9164086687306502,
"grad_norm": 0.414035145600113,
"learning_rate": 2.9571503101754466e-06,
"loss": 0.4097,
"step": 2220
},
{
"epoch": 0.9246646026831785,
"grad_norm": 0.3068665528352179,
"learning_rate": 2.9242387362967815e-06,
"loss": 0.4087,
"step": 2240
},
{
"epoch": 0.932920536635707,
"grad_norm": 0.3120720460499234,
"learning_rate": 2.8912512806295573e-06,
"loss": 0.4074,
"step": 2260
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.31140736933032415,
"learning_rate": 2.858193843500455e-06,
"loss": 0.4048,
"step": 2280
},
{
"epoch": 0.9494324045407637,
"grad_norm": 0.30731820437254637,
"learning_rate": 2.8250723377534474e-06,
"loss": 0.4025,
"step": 2300
},
{
"epoch": 0.957688338493292,
"grad_norm": 0.31008178937315156,
"learning_rate": 2.791892687692189e-06,
"loss": 0.4017,
"step": 2320
},
{
"epoch": 0.9659442724458205,
"grad_norm": 0.31453985857721983,
"learning_rate": 2.7586608280203632e-06,
"loss": 0.3964,
"step": 2340
},
{
"epoch": 0.9742002063983488,
"grad_norm": 0.3454533037677527,
"learning_rate": 2.725382702780164e-06,
"loss": 0.4019,
"step": 2360
},
{
"epoch": 0.9824561403508771,
"grad_norm": 0.31086090786087217,
"learning_rate": 2.6920642642891114e-06,
"loss": 0.4078,
"step": 2380
},
{
"epoch": 0.9907120743034056,
"grad_norm": 0.29964942829401026,
"learning_rate": 2.6587114720753882e-06,
"loss": 0.4054,
"step": 2400
},
{
"epoch": 0.9989680082559339,
"grad_norm": 0.31845271001148817,
"learning_rate": 2.625330291811882e-06,
"loss": 0.4017,
"step": 2420
},
{
"epoch": 0.9997936016511868,
"eval_loss": 0.5107099413871765,
"eval_runtime": 88.8692,
"eval_samples_per_second": 6.054,
"eval_steps_per_second": 0.056,
"step": 2422
},
{
"epoch": 1.0074303405572755,
"grad_norm": 0.33713877028152195,
"learning_rate": 2.591926694249128e-06,
"loss": 0.3594,
"step": 2440
},
{
"epoch": 1.0156862745098039,
"grad_norm": 0.35458325001613433,
"learning_rate": 2.558506654147353e-06,
"loss": 0.3297,
"step": 2460
},
{
"epoch": 1.0239422084623322,
"grad_norm": 0.33361671982328645,
"learning_rate": 2.525076149207788e-06,
"loss": 0.3327,
"step": 2480
},
{
"epoch": 1.0321981424148607,
"grad_norm": 0.41547804329917404,
"learning_rate": 2.4916411590034672e-06,
"loss": 0.3307,
"step": 2500
},
{
"epoch": 1.040454076367389,
"grad_norm": 0.3280076602568464,
"learning_rate": 2.4582076639096837e-06,
"loss": 0.3295,
"step": 2520
},
{
"epoch": 1.0487100103199174,
"grad_norm": 0.3249157740900077,
"learning_rate": 2.4247816440343078e-06,
"loss": 0.322,
"step": 2540
},
{
"epoch": 1.0569659442724457,
"grad_norm": 0.35605530558146004,
"learning_rate": 2.391369078148148e-06,
"loss": 0.335,
"step": 2560
},
{
"epoch": 1.0652218782249743,
"grad_norm": 0.3549484263180951,
"learning_rate": 2.3579759426155552e-06,
"loss": 0.3153,
"step": 2580
},
{
"epoch": 1.0734778121775026,
"grad_norm": 0.3465115375264916,
"learning_rate": 2.324608210325456e-06,
"loss": 0.3264,
"step": 2600
},
{
"epoch": 1.081733746130031,
"grad_norm": 0.3357910072373825,
"learning_rate": 2.291271849623004e-06,
"loss": 0.3212,
"step": 2620
},
{
"epoch": 1.0899896800825593,
"grad_norm": 0.344383755229491,
"learning_rate": 2.2579728232420524e-06,
"loss": 0.3157,
"step": 2640
},
{
"epoch": 1.0982456140350878,
"grad_norm": 0.31538428598087054,
"learning_rate": 2.2247170872386205e-06,
"loss": 0.3229,
"step": 2660
},
{
"epoch": 1.1065015479876161,
"grad_norm": 0.3246184372803657,
"learning_rate": 2.1915105899255617e-06,
"loss": 0.3232,
"step": 2680
},
{
"epoch": 1.1147574819401445,
"grad_norm": 0.3443636135756491,
"learning_rate": 2.158359270808612e-06,
"loss": 0.3203,
"step": 2700
},
{
"epoch": 1.1230134158926728,
"grad_norm": 0.34084061773858154,
"learning_rate": 2.125269059524018e-06,
"loss": 0.3276,
"step": 2720
},
{
"epoch": 1.1312693498452013,
"grad_norm": 0.46655604655860133,
"learning_rate": 2.092245874777926e-06,
"loss": 0.3267,
"step": 2740
},
{
"epoch": 1.1395252837977297,
"grad_norm": 0.33228746141272103,
"learning_rate": 2.059295623287729e-06,
"loss": 0.3272,
"step": 2760
},
{
"epoch": 1.147781217750258,
"grad_norm": 0.348538665454056,
"learning_rate": 2.026424198725557e-06,
"loss": 0.3228,
"step": 2780
},
{
"epoch": 1.1560371517027863,
"grad_norm": 0.3708880778253579,
"learning_rate": 1.9936374806641016e-06,
"loss": 0.3268,
"step": 2800
},
{
"epoch": 1.1642930856553146,
"grad_norm": 0.3467837537594021,
"learning_rate": 1.96094133352496e-06,
"loss": 0.3287,
"step": 2820
},
{
"epoch": 1.1725490196078432,
"grad_norm": 0.331776837754727,
"learning_rate": 1.9283416055296906e-06,
"loss": 0.3135,
"step": 2840
},
{
"epoch": 1.1808049535603715,
"grad_norm": 0.33972138880439906,
"learning_rate": 1.8958441276537695e-06,
"loss": 0.3165,
"step": 2860
},
{
"epoch": 1.1890608875128998,
"grad_norm": 0.3290544472328714,
"learning_rate": 1.8634547125836246e-06,
"loss": 0.3231,
"step": 2880
},
{
"epoch": 1.1973168214654284,
"grad_norm": 0.4324023615254428,
"learning_rate": 1.8311791536769485e-06,
"loss": 0.3191,
"step": 2900
},
{
"epoch": 1.2055727554179567,
"grad_norm": 0.34414515738152696,
"learning_rate": 1.799023223926461e-06,
"loss": 0.3214,
"step": 2920
},
{
"epoch": 1.213828689370485,
"grad_norm": 0.3522816282310391,
"learning_rate": 1.766992674927322e-06,
"loss": 0.322,
"step": 2940
},
{
"epoch": 1.2220846233230134,
"grad_norm": 0.33214309648209184,
"learning_rate": 1.7350932358483675e-06,
"loss": 0.3245,
"step": 2960
},
{
"epoch": 1.2303405572755417,
"grad_norm": 0.32354241105977966,
"learning_rate": 1.703330612407355e-06,
"loss": 0.3189,
"step": 2980
},
{
"epoch": 1.2385964912280703,
"grad_norm": 0.33884732729314554,
"learning_rate": 1.6717104858504088e-06,
"loss": 0.3211,
"step": 3000
},
{
"epoch": 1.2468524251805986,
"grad_norm": 0.3735533087692056,
"learning_rate": 1.6402385119358372e-06,
"loss": 0.3188,
"step": 3020
},
{
"epoch": 1.255108359133127,
"grad_norm": 0.3379937818585682,
"learning_rate": 1.6089203199225117e-06,
"loss": 0.3201,
"step": 3040
},
{
"epoch": 1.2633642930856552,
"grad_norm": 0.33385226604310425,
"learning_rate": 1.5777615115629874e-06,
"loss": 0.3197,
"step": 3060
},
{
"epoch": 1.2716202270381838,
"grad_norm": 0.3353612788825473,
"learning_rate": 1.546767660101537e-06,
"loss": 0.3178,
"step": 3080
},
{
"epoch": 1.279876160990712,
"grad_norm": 0.3384674558918743,
"learning_rate": 1.5159443092772924e-06,
"loss": 0.3235,
"step": 3100
},
{
"epoch": 1.2881320949432404,
"grad_norm": 0.384875131380503,
"learning_rate": 1.4852969723326555e-06,
"loss": 0.3187,
"step": 3120
},
{
"epoch": 1.2963880288957688,
"grad_norm": 0.44608590681938926,
"learning_rate": 1.4548311310271724e-06,
"loss": 0.3246,
"step": 3140
},
{
"epoch": 1.304643962848297,
"grad_norm": 0.33842522169860273,
"learning_rate": 1.4245522346570289e-06,
"loss": 0.3226,
"step": 3160
},
{
"epoch": 1.3128998968008256,
"grad_norm": 0.38547488897682075,
"learning_rate": 1.3944656990803645e-06,
"loss": 0.3198,
"step": 3180
},
{
"epoch": 1.321155830753354,
"grad_norm": 0.3552101566974137,
"learning_rate": 1.364576905748553e-06,
"loss": 0.3199,
"step": 3200
},
{
"epoch": 1.3294117647058823,
"grad_norm": 0.3225112166653656,
"learning_rate": 1.3348912007436538e-06,
"loss": 0.3194,
"step": 3220
},
{
"epoch": 1.3376676986584108,
"grad_norm": 0.34375670285803067,
"learning_rate": 1.3054138938221708e-06,
"loss": 0.3279,
"step": 3240
},
{
"epoch": 1.3459236326109392,
"grad_norm": 0.330557856124175,
"learning_rate": 1.2761502574653286e-06,
"loss": 0.3199,
"step": 3260
},
{
"epoch": 1.3541795665634675,
"grad_norm": 0.3309833353922965,
"learning_rate": 1.247105525936001e-06,
"loss": 0.3146,
"step": 3280
},
{
"epoch": 1.3624355005159958,
"grad_norm": 0.3452194374171664,
"learning_rate": 1.2182848943424786e-06,
"loss": 0.3176,
"step": 3300
},
{
"epoch": 1.3706914344685242,
"grad_norm": 0.3216278030301951,
"learning_rate": 1.1896935177092456e-06,
"loss": 0.3185,
"step": 3320
},
{
"epoch": 1.3789473684210527,
"grad_norm": 0.4265735872664729,
"learning_rate": 1.16133651005492e-06,
"loss": 0.3207,
"step": 3340
},
{
"epoch": 1.387203302373581,
"grad_norm": 0.32503296466693565,
"learning_rate": 1.1332189434775354e-06,
"loss": 0.3199,
"step": 3360
},
{
"epoch": 1.3954592363261094,
"grad_norm": 0.33776061146175834,
"learning_rate": 1.1053458472473078e-06,
"loss": 0.317,
"step": 3380
},
{
"epoch": 1.403715170278638,
"grad_norm": 0.33951723955583724,
"learning_rate": 1.0777222069070797e-06,
"loss": 0.3152,
"step": 3400
},
{
"epoch": 1.4119711042311662,
"grad_norm": 0.35231254237908666,
"learning_rate": 1.0503529633805711e-06,
"loss": 0.3156,
"step": 3420
},
{
"epoch": 1.4202270381836946,
"grad_norm": 0.34046838697292375,
"learning_rate": 1.0232430120886227e-06,
"loss": 0.3198,
"step": 3440
},
{
"epoch": 1.4284829721362229,
"grad_norm": 0.31629713031455653,
"learning_rate": 9.963972020735658e-07,
"loss": 0.3149,
"step": 3460
},
{
"epoch": 1.4367389060887512,
"grad_norm": 0.8014048883476281,
"learning_rate": 9.698203351319019e-07,
"loss": 0.3235,
"step": 3480
},
{
"epoch": 1.4449948400412795,
"grad_norm": 0.32201920421165664,
"learning_rate": 9.435171649554234e-07,
"loss": 0.3217,
"step": 3500
},
{
"epoch": 1.453250773993808,
"grad_norm": 0.36221640610133427,
"learning_rate": 9.17492396280934e-07,
"loss": 0.3156,
"step": 3520
},
{
"epoch": 1.4615067079463364,
"grad_norm": 0.32962860003755834,
"learning_rate": 8.917506840487448e-07,
"loss": 0.3152,
"step": 3540
},
{
"epoch": 1.4697626418988647,
"grad_norm": 0.3605181380604464,
"learning_rate": 8.662966325700531e-07,
"loss": 0.3206,
"step": 3560
},
{
"epoch": 1.4780185758513933,
"grad_norm": 0.32779483961721695,
"learning_rate": 8.411347947033982e-07,
"loss": 0.3095,
"step": 3580
},
{
"epoch": 1.4862745098039216,
"grad_norm": 0.3270784844703272,
"learning_rate": 8.162696710403026e-07,
"loss": 0.319,
"step": 3600
},
{
"epoch": 1.49453044375645,
"grad_norm": 0.3265816952545498,
"learning_rate": 7.917057091002772e-07,
"loss": 0.3161,
"step": 3620
},
{
"epoch": 1.5027863777089783,
"grad_norm": 0.32668356632045076,
"learning_rate": 7.674473025353063e-07,
"loss": 0.3136,
"step": 3640
},
{
"epoch": 1.5110423116615066,
"grad_norm": 0.3329149198486246,
"learning_rate": 7.434987903439795e-07,
"loss": 0.3145,
"step": 3660
},
{
"epoch": 1.519298245614035,
"grad_norm": 0.33015476593592213,
"learning_rate": 7.198644560953877e-07,
"loss": 0.3154,
"step": 3680
},
{
"epoch": 1.5275541795665635,
"grad_norm": 0.3427897311331206,
"learning_rate": 6.965485271629426e-07,
"loss": 0.3219,
"step": 3700
},
{
"epoch": 1.5358101135190918,
"grad_norm": 0.31710290567990695,
"learning_rate": 6.735551739682458e-07,
"loss": 0.3118,
"step": 3720
},
{
"epoch": 1.5440660474716204,
"grad_norm": 0.3709576851610617,
"learning_rate": 6.508885092351374e-07,
"loss": 0.3147,
"step": 3740
},
{
"epoch": 1.5523219814241487,
"grad_norm": 0.34606775925501615,
"learning_rate": 6.285525872540777e-07,
"loss": 0.3151,
"step": 3760
},
{
"epoch": 1.560577915376677,
"grad_norm": 0.32914754720880296,
"learning_rate": 6.065514031569658e-07,
"loss": 0.319,
"step": 3780
},
{
"epoch": 1.5688338493292053,
"grad_norm": 0.32959782478095573,
"learning_rate": 5.848888922025553e-07,
"loss": 0.3109,
"step": 3800
},
{
"epoch": 1.5770897832817337,
"grad_norm": 0.3461782278815208,
"learning_rate": 5.635689290725629e-07,
"loss": 0.3118,
"step": 3820
},
{
"epoch": 1.585345717234262,
"grad_norm": 0.33719762865745856,
"learning_rate": 5.425953271786289e-07,
"loss": 0.3142,
"step": 3840
},
{
"epoch": 1.5936016511867905,
"grad_norm": 0.3229411688412049,
"learning_rate": 5.219718379802203e-07,
"loss": 0.3074,
"step": 3860
},
{
"epoch": 1.6018575851393189,
"grad_norm": 0.339153428971675,
"learning_rate": 5.017021503136299e-07,
"loss": 0.3085,
"step": 3880
},
{
"epoch": 1.6101135190918474,
"grad_norm": 0.328366872614432,
"learning_rate": 4.817898897321648e-07,
"loss": 0.3211,
"step": 3900
},
{
"epoch": 1.6183694530443757,
"grad_norm": 0.3308504300204108,
"learning_rate": 4.6223861785766184e-07,
"loss": 0.3219,
"step": 3920
},
{
"epoch": 1.626625386996904,
"grad_norm": 0.3290571513653433,
"learning_rate": 4.430518317434351e-07,
"loss": 0.3098,
"step": 3940
},
{
"epoch": 1.6348813209494324,
"grad_norm": 0.32391898185345264,
"learning_rate": 4.242329632487707e-07,
"loss": 0.3165,
"step": 3960
},
{
"epoch": 1.6431372549019607,
"grad_norm": 0.32636444036143686,
"learning_rate": 4.057853784250884e-07,
"loss": 0.3114,
"step": 3980
},
{
"epoch": 1.651393188854489,
"grad_norm": 0.4579751480579007,
"learning_rate": 3.877123769138652e-07,
"loss": 0.3157,
"step": 4000
},
{
"epoch": 1.6596491228070176,
"grad_norm": 0.3305784303773594,
"learning_rate": 3.7001719135644793e-07,
"loss": 0.3178,
"step": 4020
},
{
"epoch": 1.667905056759546,
"grad_norm": 0.33328615826236435,
"learning_rate": 3.527029868158394e-07,
"loss": 0.3149,
"step": 4040
},
{
"epoch": 1.6761609907120743,
"grad_norm": 0.3376266856154567,
"learning_rate": 3.3577286021058085e-07,
"loss": 0.3214,
"step": 4060
},
{
"epoch": 1.6844169246646028,
"grad_norm": 0.36052770349787855,
"learning_rate": 3.192298397608165e-07,
"loss": 0.3181,
"step": 4080
},
{
"epoch": 1.6926728586171311,
"grad_norm": 0.3186034103836647,
"learning_rate": 3.0307688444664975e-07,
"loss": 0.3109,
"step": 4100
},
{
"epoch": 1.7009287925696595,
"grad_norm": 0.32364083786767023,
"learning_rate": 2.873168834788842e-07,
"loss": 0.3164,
"step": 4120
},
{
"epoch": 1.7091847265221878,
"grad_norm": 0.3352263407283556,
"learning_rate": 2.719526557822391e-07,
"loss": 0.3135,
"step": 4140
},
{
"epoch": 1.717440660474716,
"grad_norm": 0.324247569201673,
"learning_rate": 2.5698694949114504e-07,
"loss": 0.3172,
"step": 4160
},
{
"epoch": 1.7256965944272444,
"grad_norm": 0.3241547440096801,
"learning_rate": 2.4242244145819187e-07,
"loss": 0.311,
"step": 4180
},
{
"epoch": 1.733952528379773,
"grad_norm": 0.3383830207704809,
"learning_rate": 2.2826173677533593e-07,
"loss": 0.3155,
"step": 4200
},
{
"epoch": 1.7422084623323013,
"grad_norm": 0.3950633617255487,
"learning_rate": 2.1450736830793405e-07,
"loss": 0.3142,
"step": 4220
},
{
"epoch": 1.7504643962848299,
"grad_norm": 0.5099914758815567,
"learning_rate": 2.0116179624170478e-07,
"loss": 0.3107,
"step": 4240
},
{
"epoch": 1.7587203302373582,
"grad_norm": 1.0026514809885159,
"learning_rate": 1.8822740764268098e-07,
"loss": 0.3087,
"step": 4260
},
{
"epoch": 1.7669762641898865,
"grad_norm": 1662.5837568982583,
"learning_rate": 1.757065160302504e-07,
"loss": 0.3182,
"step": 4280
},
{
"epoch": 1.7752321981424148,
"grad_norm": 0.3278171902579247,
"learning_rate": 1.6360136096334107e-07,
"loss": 0.3066,
"step": 4300
},
{
"epoch": 1.7834881320949432,
"grad_norm": 0.3533064691586645,
"learning_rate": 1.519141076398442e-07,
"loss": 0.3166,
"step": 4320
},
{
"epoch": 1.7917440660474715,
"grad_norm": 0.8178680644268199,
"learning_rate": 1.406468465093344e-07,
"loss": 0.3167,
"step": 4340
},
{
"epoch": 1.8,
"grad_norm": 0.5677100980648624,
"learning_rate": 1.2980159289915805e-07,
"loss": 0.3139,
"step": 4360
},
{
"epoch": 1.8082559339525284,
"grad_norm": 0.327287746933213,
"learning_rate": 1.1938028665396172e-07,
"loss": 0.3123,
"step": 4380
},
{
"epoch": 1.816511867905057,
"grad_norm": 0.3594205243876799,
"learning_rate": 1.0938479178871892e-07,
"loss": 0.3158,
"step": 4400
},
{
"epoch": 1.8247678018575852,
"grad_norm": 0.3265638946104959,
"learning_rate": 9.981689615532364e-08,
"loss": 0.3124,
"step": 4420
},
{
"epoch": 1.8330237358101136,
"grad_norm": 0.33268480190653027,
"learning_rate": 9.06783111228024e-08,
"loss": 0.3161,
"step": 4440
},
{
"epoch": 1.841279669762642,
"grad_norm": 0.3434703383531059,
"learning_rate": 8.19706712712115e-08,
"loss": 0.3139,
"step": 4460
},
{
"epoch": 1.8495356037151702,
"grad_norm": 0.3476346041767014,
"learning_rate": 7.369553409926427e-08,
"loss": 0.313,
"step": 4480
},
{
"epoch": 1.8577915376676986,
"grad_norm": 0.32495041685434667,
"learning_rate": 6.585437974574921e-08,
"loss": 0.308,
"step": 4500
},
{
"epoch": 1.8660474716202269,
"grad_norm": 0.334105753613722,
"learning_rate": 5.844861072478336e-08,
"loss": 0.3146,
"step": 4520
},
{
"epoch": 1.8743034055727554,
"grad_norm": 0.32248303496350533,
"learning_rate": 5.147955167495111e-08,
"loss": 0.3164,
"step": 4540
},
{
"epoch": 1.8825593395252838,
"grad_norm": 0.3170520025481871,
"learning_rate": 4.494844912237145e-08,
"loss": 0.309,
"step": 4560
},
{
"epoch": 1.8908152734778123,
"grad_norm": 0.32425657312680345,
"learning_rate": 3.885647125773578e-08,
"loss": 0.3158,
"step": 4580
},
{
"epoch": 1.8990712074303406,
"grad_norm": 0.32676487500274765,
"learning_rate": 3.320470772736062e-08,
"loss": 0.3176,
"step": 4600
},
{
"epoch": 1.907327141382869,
"grad_norm": 0.3366864111509815,
"learning_rate": 2.799416943828598e-08,
"loss": 0.318,
"step": 4620
},
{
"epoch": 1.9155830753353973,
"grad_norm": 0.33022963805900657,
"learning_rate": 2.3225788377459478e-08,
"loss": 0.3161,
"step": 4640
},
{
"epoch": 1.9238390092879256,
"grad_norm": 0.4334852710323891,
"learning_rate": 1.890041744503468e-08,
"loss": 0.3207,
"step": 4660
},
{
"epoch": 1.932094943240454,
"grad_norm": 0.3368172513862861,
"learning_rate": 1.5018830301817277e-08,
"loss": 0.3112,
"step": 4680
},
{
"epoch": 1.9403508771929825,
"grad_norm": 0.328478614234708,
"learning_rate": 1.1581721230883302e-08,
"loss": 0.3175,
"step": 4700
},
{
"epoch": 1.9486068111455108,
"grad_norm": 0.3003501782030781,
"learning_rate": 8.589705013396509e-09,
"loss": 0.3056,
"step": 4720
},
{
"epoch": 1.9568627450980394,
"grad_norm": 0.33048912522337326,
"learning_rate": 6.043316818643008e-09,
"loss": 0.3189,
"step": 4740
},
{
"epoch": 1.9651186790505677,
"grad_norm": 0.3300692526475954,
"learning_rate": 3.9430121083106065e-09,
"loss": 0.3153,
"step": 4760
},
{
"epoch": 1.973374613003096,
"grad_norm": 0.3265423101136748,
"learning_rate": 2.2891665550200946e-09,
"loss": 0.3142,
"step": 4780
},
{
"epoch": 1.9816305469556244,
"grad_norm": 0.3321586725232749,
"learning_rate": 1.0820759751309363e-09,
"loss": 0.3133,
"step": 4800
},
{
"epoch": 1.9898864809081527,
"grad_norm": 0.3312240597270669,
"learning_rate": 3.219562758302597e-10,
"loss": 0.3094,
"step": 4820
},
{
"epoch": 1.998142414860681,
"grad_norm": 0.32676874574809356,
"learning_rate": 8.943416513751412e-12,
"loss": 0.3136,
"step": 4840
},
{
"epoch": 1.9997936016511868,
"eval_loss": 0.49619075655937195,
"eval_runtime": 93.5055,
"eval_samples_per_second": 5.754,
"eval_steps_per_second": 0.053,
"step": 4844
},
{
"epoch": 1.9997936016511868,
"step": 4844,
"total_flos": 2.009123424043008e+16,
"train_loss": 0.08715950149479826,
"train_runtime": 137060.7981,
"train_samples_per_second": 4.524,
"train_steps_per_second": 0.035
}
],
"logging_steps": 20,
"max_steps": 4844,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.009123424043008e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}