{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9997936016511868, "eval_steps": 500, "global_step": 4844, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000412796697626419, "grad_norm": 19.038115839779415, "learning_rate": 3.424657534246575e-08, "loss": 0.9872, "step": 1 }, { "epoch": 0.008255933952528379, "grad_norm": 2.2532416397119936, "learning_rate": 6.849315068493151e-07, "loss": 0.9325, "step": 20 }, { "epoch": 0.016511867905056758, "grad_norm": 1.273128125464113, "learning_rate": 1.3698630136986302e-06, "loss": 0.7772, "step": 40 }, { "epoch": 0.02476780185758514, "grad_norm": 1.7056377918705128, "learning_rate": 2.0547945205479454e-06, "loss": 0.6878, "step": 60 }, { "epoch": 0.033023735810113516, "grad_norm": 1.1237902715212524, "learning_rate": 2.7397260273972604e-06, "loss": 0.6306, "step": 80 }, { "epoch": 0.0412796697626419, "grad_norm": 0.7107592373281232, "learning_rate": 3.4246575342465754e-06, "loss": 0.5945, "step": 100 }, { "epoch": 0.04953560371517028, "grad_norm": 0.5435287549705509, "learning_rate": 4.109589041095891e-06, "loss": 0.5678, "step": 120 }, { "epoch": 0.05779153766769866, "grad_norm": 0.5519820327604449, "learning_rate": 4.7945205479452054e-06, "loss": 0.5645, "step": 140 }, { "epoch": 0.06604747162022703, "grad_norm": 0.45781758347158724, "learning_rate": 4.9998904438825655e-06, "loss": 0.5435, "step": 160 }, { "epoch": 0.07430340557275542, "grad_norm": 0.4113409226796853, "learning_rate": 4.9993538656061865e-06, "loss": 0.5384, "step": 180 }, { "epoch": 0.0825593395252838, "grad_norm": 0.4061348667993117, "learning_rate": 4.998370238474193e-06, "loss": 0.5251, "step": 200 }, { "epoch": 0.09081527347781218, "grad_norm": 0.3595538605503618, "learning_rate": 4.996939738423808e-06, "loss": 0.5268, "step": 220 }, { "epoch": 0.09907120743034056, "grad_norm": 0.43476988938274347, "learning_rate": 4.995062621322529e-06, "loss": 0.5218, "step": 240 }, { "epoch": 0.10732714138286893, "grad_norm": 0.37354960760902434, "learning_rate": 4.992739222922349e-06, "loss": 0.5092, "step": 260 }, { "epoch": 0.11558307533539731, "grad_norm": 0.541197152311886, "learning_rate": 4.989969958799716e-06, "loss": 0.5164, "step": 280 }, { "epoch": 0.1238390092879257, "grad_norm": 0.6905500202450845, "learning_rate": 4.9867553242811925e-06, "loss": 0.5093, "step": 300 }, { "epoch": 0.13209494324045407, "grad_norm": 0.35631823372676763, "learning_rate": 4.983095894354858e-06, "loss": 0.5029, "step": 320 }, { "epoch": 0.14035087719298245, "grad_norm": 0.41074164917056266, "learning_rate": 4.978992323567467e-06, "loss": 0.5081, "step": 340 }, { "epoch": 0.14860681114551083, "grad_norm": 0.38311089041727175, "learning_rate": 4.974445345907372e-06, "loss": 0.4942, "step": 360 }, { "epoch": 0.1568627450980392, "grad_norm": 1.276850727632135, "learning_rate": 4.9694557746732365e-06, "loss": 0.4941, "step": 380 }, { "epoch": 0.1651186790505676, "grad_norm": 0.37634442822337005, "learning_rate": 4.9640245023285645e-06, "loss": 0.488, "step": 400 }, { "epoch": 0.17337461300309598, "grad_norm": 0.34093150208574735, "learning_rate": 4.958152500342071e-06, "loss": 0.4936, "step": 420 }, { "epoch": 0.18163054695562436, "grad_norm": 0.3697170253248781, "learning_rate": 4.951840819013918e-06, "loss": 0.4823, "step": 440 }, { "epoch": 0.18988648090815274, "grad_norm": 0.3673753466126514, "learning_rate": 4.94509058728785e-06, "loss": 0.4865, "step": 460 }, { "epoch": 0.19814241486068113, "grad_norm": 0.36105295583859276, "learning_rate": 4.937903012549266e-06, "loss": 0.4754, "step": 480 }, { "epoch": 0.20639834881320948, "grad_norm": 0.36846225805717936, "learning_rate": 4.930279380409266e-06, "loss": 0.4704, "step": 500 }, { "epoch": 0.21465428276573786, "grad_norm": 0.3173068336889516, "learning_rate": 4.922221054474686e-06, "loss": 0.473, "step": 520 }, { "epoch": 0.22291021671826625, "grad_norm": 0.35259900729321336, "learning_rate": 4.913729476104205e-06, "loss": 0.4595, "step": 540 }, { "epoch": 0.23116615067079463, "grad_norm": 0.36729347937445916, "learning_rate": 4.9048061641505355e-06, "loss": 0.4741, "step": 560 }, { "epoch": 0.239422084623323, "grad_norm": 0.3318610632934225, "learning_rate": 4.8954527146887455e-06, "loss": 0.4648, "step": 580 }, { "epoch": 0.2476780185758514, "grad_norm": 5.235550050863072, "learning_rate": 4.885670800730784e-06, "loss": 0.4768, "step": 600 }, { "epoch": 0.25593395252837975, "grad_norm": 0.3182067180223638, "learning_rate": 4.87546217192623e-06, "loss": 0.4659, "step": 620 }, { "epoch": 0.26418988648090813, "grad_norm": 0.3656873978361783, "learning_rate": 4.864828654249344e-06, "loss": 0.4598, "step": 640 }, { "epoch": 0.2724458204334365, "grad_norm": 0.3501798939645083, "learning_rate": 4.853772149672461e-06, "loss": 0.4681, "step": 660 }, { "epoch": 0.2807017543859649, "grad_norm": 0.3777815400079612, "learning_rate": 4.842294635825794e-06, "loss": 0.4615, "step": 680 }, { "epoch": 0.2889576883384933, "grad_norm": 0.38852070579564624, "learning_rate": 4.830398165643704e-06, "loss": 0.4518, "step": 700 }, { "epoch": 0.29721362229102166, "grad_norm": 0.32960545022053456, "learning_rate": 4.818084866997499e-06, "loss": 0.4551, "step": 720 }, { "epoch": 0.30546955624355004, "grad_norm": 0.3581229581442626, "learning_rate": 4.805356942314833e-06, "loss": 0.4585, "step": 740 }, { "epoch": 0.3137254901960784, "grad_norm": 0.32377521697576256, "learning_rate": 4.792216668185765e-06, "loss": 0.4469, "step": 760 }, { "epoch": 0.3219814241486068, "grad_norm": 0.32217232079348435, "learning_rate": 4.778666394955554e-06, "loss": 0.4626, "step": 780 }, { "epoch": 0.3302373581011352, "grad_norm": 0.4595767097639659, "learning_rate": 4.764708546304267e-06, "loss": 0.4624, "step": 800 }, { "epoch": 0.3384932920536636, "grad_norm": 0.32949086041842535, "learning_rate": 4.75034561881326e-06, "loss": 0.446, "step": 820 }, { "epoch": 0.34674922600619196, "grad_norm": 0.4894422691515238, "learning_rate": 4.735580181518631e-06, "loss": 0.4541, "step": 840 }, { "epoch": 0.35500515995872034, "grad_norm": 0.3280899378337465, "learning_rate": 4.720414875451701e-06, "loss": 0.4487, "step": 860 }, { "epoch": 0.3632610939112487, "grad_norm": 1.837588625073277, "learning_rate": 4.704852413166629e-06, "loss": 0.4518, "step": 880 }, { "epoch": 0.3715170278637771, "grad_norm": 0.40018979567066226, "learning_rate": 4.688895578255228e-06, "loss": 0.4465, "step": 900 }, { "epoch": 0.3797729618163055, "grad_norm": 0.3242631003772359, "learning_rate": 4.672547224849072e-06, "loss": 0.4411, "step": 920 }, { "epoch": 0.38802889576883387, "grad_norm": 0.3442813940284623, "learning_rate": 4.655810277108994e-06, "loss": 0.4457, "step": 940 }, { "epoch": 0.39628482972136225, "grad_norm": 0.32602733959556546, "learning_rate": 4.638687728702054e-06, "loss": 0.4433, "step": 960 }, { "epoch": 0.40454076367389064, "grad_norm": 0.354500321094183, "learning_rate": 4.6211826422660685e-06, "loss": 0.4383, "step": 980 }, { "epoch": 0.41279669762641896, "grad_norm": 0.348616547530873, "learning_rate": 4.6032981488618155e-06, "loss": 0.4413, "step": 1000 }, { "epoch": 0.42105263157894735, "grad_norm": 0.32717814527815386, "learning_rate": 4.585037447412993e-06, "loss": 0.4368, "step": 1020 }, { "epoch": 0.4293085655314757, "grad_norm": 0.6796110151903153, "learning_rate": 4.566403804134042e-06, "loss": 0.4436, "step": 1040 }, { "epoch": 0.4375644994840041, "grad_norm": 0.3316261638741136, "learning_rate": 4.547400551945927e-06, "loss": 0.4425, "step": 1060 }, { "epoch": 0.4458204334365325, "grad_norm": 0.40026371472445244, "learning_rate": 4.528031089880001e-06, "loss": 0.4457, "step": 1080 }, { "epoch": 0.4540763673890609, "grad_norm": 0.32173726498871624, "learning_rate": 4.508298882470025e-06, "loss": 0.4437, "step": 1100 }, { "epoch": 0.46233230134158926, "grad_norm": 0.33503635231733775, "learning_rate": 4.488207459132484e-06, "loss": 0.4379, "step": 1120 }, { "epoch": 0.47058823529411764, "grad_norm": 0.35244768504197327, "learning_rate": 4.467760413535303e-06, "loss": 0.44, "step": 1140 }, { "epoch": 0.478844169246646, "grad_norm": 0.31115577007431877, "learning_rate": 4.44696140295505e-06, "loss": 0.4329, "step": 1160 }, { "epoch": 0.4871001031991744, "grad_norm": 0.3238091738004505, "learning_rate": 4.425814147622786e-06, "loss": 0.4337, "step": 1180 }, { "epoch": 0.4953560371517028, "grad_norm": 0.49541016985287156, "learning_rate": 4.404322430058634e-06, "loss": 0.4287, "step": 1200 }, { "epoch": 0.5036119711042312, "grad_norm": 0.37221572328367447, "learning_rate": 4.382490094395223e-06, "loss": 0.464, "step": 1220 }, { "epoch": 0.5118679050567595, "grad_norm": 0.33878190692897947, "learning_rate": 4.360321045690098e-06, "loss": 0.4409, "step": 1240 }, { "epoch": 0.5201238390092879, "grad_norm": 0.3936174788244509, "learning_rate": 4.337819249227243e-06, "loss": 0.4343, "step": 1260 }, { "epoch": 0.5283797729618163, "grad_norm": 0.33598616505477236, "learning_rate": 4.3149887298078275e-06, "loss": 0.431, "step": 1280 }, { "epoch": 0.5366357069143447, "grad_norm": 0.3519965938188954, "learning_rate": 4.2918335710303035e-06, "loss": 0.426, "step": 1300 }, { "epoch": 0.544891640866873, "grad_norm": 0.320454217828392, "learning_rate": 4.268357914559994e-06, "loss": 0.4304, "step": 1320 }, { "epoch": 0.5531475748194015, "grad_norm": 0.3719935953584569, "learning_rate": 4.244565959388287e-06, "loss": 0.436, "step": 1340 }, { "epoch": 0.5614035087719298, "grad_norm": 0.3613570450407545, "learning_rate": 4.2204619610815846e-06, "loss": 0.4335, "step": 1360 }, { "epoch": 0.5696594427244582, "grad_norm": 0.3126539947940401, "learning_rate": 4.19605023102012e-06, "loss": 0.4263, "step": 1380 }, { "epoch": 0.5779153766769866, "grad_norm": 0.6458529290452903, "learning_rate": 4.171335135626809e-06, "loss": 0.4265, "step": 1400 }, { "epoch": 0.586171310629515, "grad_norm": 0.3604632525755521, "learning_rate": 4.146321095586238e-06, "loss": 0.4362, "step": 1420 }, { "epoch": 0.5944272445820433, "grad_norm": 0.30925366125534925, "learning_rate": 4.121012585053958e-06, "loss": 0.4233, "step": 1440 }, { "epoch": 0.6026831785345718, "grad_norm": 0.3100306182604728, "learning_rate": 4.095414130856215e-06, "loss": 0.4274, "step": 1460 }, { "epoch": 0.6109391124871001, "grad_norm": 0.3253333926411185, "learning_rate": 4.069530311680247e-06, "loss": 0.4215, "step": 1480 }, { "epoch": 0.6191950464396285, "grad_norm": 0.3161203579495709, "learning_rate": 4.043365757255323e-06, "loss": 0.4195, "step": 1500 }, { "epoch": 0.6274509803921569, "grad_norm": 0.5372305576307022, "learning_rate": 4.016925147524638e-06, "loss": 0.4207, "step": 1520 }, { "epoch": 0.6357069143446853, "grad_norm": 0.4217012521666707, "learning_rate": 3.99021321180823e-06, "loss": 0.4189, "step": 1540 }, { "epoch": 0.6439628482972136, "grad_norm": 0.31058380585253026, "learning_rate": 3.96323472795707e-06, "loss": 0.4224, "step": 1560 }, { "epoch": 0.6522187822497421, "grad_norm": 0.3096196075520948, "learning_rate": 3.935994521498468e-06, "loss": 0.4298, "step": 1580 }, { "epoch": 0.6604747162022704, "grad_norm": 0.31321817530598023, "learning_rate": 3.908497464772946e-06, "loss": 0.4165, "step": 1600 }, { "epoch": 0.6687306501547987, "grad_norm": 0.34189314788820424, "learning_rate": 3.880748476062751e-06, "loss": 0.4209, "step": 1620 }, { "epoch": 0.6769865841073271, "grad_norm": 0.314020426119392, "learning_rate": 3.852752518712135e-06, "loss": 0.4235, "step": 1640 }, { "epoch": 0.6852425180598555, "grad_norm": 0.33309865122629745, "learning_rate": 3.824514600239591e-06, "loss": 0.4272, "step": 1660 }, { "epoch": 0.6934984520123839, "grad_norm": 0.31011244271423327, "learning_rate": 3.796039771442169e-06, "loss": 0.4195, "step": 1680 }, { "epoch": 0.7017543859649122, "grad_norm": 0.31754715779100456, "learning_rate": 3.767333125492072e-06, "loss": 0.4177, "step": 1700 }, { "epoch": 0.7100103199174407, "grad_norm": 0.325717634618246, "learning_rate": 3.7383997970256525e-06, "loss": 0.4208, "step": 1720 }, { "epoch": 0.718266253869969, "grad_norm": 0.3221005127147903, "learning_rate": 3.7092449612250083e-06, "loss": 0.4258, "step": 1740 }, { "epoch": 0.7265221878224974, "grad_norm": 2.3793212357938245, "learning_rate": 3.6798738328923162e-06, "loss": 0.4117, "step": 1760 }, { "epoch": 0.7347781217750258, "grad_norm": 0.34615783300395964, "learning_rate": 3.650291665517085e-06, "loss": 0.4259, "step": 1780 }, { "epoch": 0.7430340557275542, "grad_norm": 0.30258388313281476, "learning_rate": 3.6205037503364827e-06, "loss": 0.4138, "step": 1800 }, { "epoch": 0.7512899896800825, "grad_norm": 0.319420203738432, "learning_rate": 3.590515415388919e-06, "loss": 0.414, "step": 1820 }, { "epoch": 0.759545923632611, "grad_norm": 0.3131116585624073, "learning_rate": 3.5603320245610375e-06, "loss": 0.4171, "step": 1840 }, { "epoch": 0.7678018575851393, "grad_norm": 0.44357805104504766, "learning_rate": 3.529958976628304e-06, "loss": 0.4149, "step": 1860 }, { "epoch": 0.7760577915376677, "grad_norm": 0.3482318385617704, "learning_rate": 3.499401704289347e-06, "loss": 0.4068, "step": 1880 }, { "epoch": 0.7843137254901961, "grad_norm": 1.4359924674280429, "learning_rate": 3.468665673194237e-06, "loss": 0.412, "step": 1900 }, { "epoch": 0.7925696594427245, "grad_norm": 0.30332274823266964, "learning_rate": 3.437756380966866e-06, "loss": 0.4136, "step": 1920 }, { "epoch": 0.8008255933952528, "grad_norm": 0.30856700243712654, "learning_rate": 3.4066793562216135e-06, "loss": 0.41, "step": 1940 }, { "epoch": 0.8090815273477813, "grad_norm": 0.42071490847162873, "learning_rate": 3.375440157574462e-06, "loss": 0.4061, "step": 1960 }, { "epoch": 0.8173374613003096, "grad_norm": 2.6302213294970933, "learning_rate": 3.3440443726487583e-06, "loss": 0.4025, "step": 1980 }, { "epoch": 0.8255933952528379, "grad_norm": 0.29774501528511826, "learning_rate": 3.3124976170757694e-06, "loss": 0.4201, "step": 2000 }, { "epoch": 0.8338493292053664, "grad_norm": 0.32604189076380047, "learning_rate": 3.2808055334902487e-06, "loss": 0.4091, "step": 2020 }, { "epoch": 0.8421052631578947, "grad_norm": 0.3285267181834793, "learning_rate": 3.2489737905211537e-06, "loss": 0.4175, "step": 2040 }, { "epoch": 0.8503611971104231, "grad_norm": 0.310713417485478, "learning_rate": 3.217008081777726e-06, "loss": 0.4092, "step": 2060 }, { "epoch": 0.8586171310629515, "grad_norm": 0.3073088712514811, "learning_rate": 3.184914124831098e-06, "loss": 0.4053, "step": 2080 }, { "epoch": 0.8668730650154799, "grad_norm": 0.337054135152426, "learning_rate": 3.1526976601916153e-06, "loss": 0.4112, "step": 2100 }, { "epoch": 0.8751289989680082, "grad_norm": 0.35614170658863653, "learning_rate": 3.1203644502820592e-06, "loss": 0.4111, "step": 2120 }, { "epoch": 0.8833849329205367, "grad_norm": 0.31220727299527445, "learning_rate": 3.0879202784069407e-06, "loss": 0.4143, "step": 2140 }, { "epoch": 0.891640866873065, "grad_norm": 0.31989316355962877, "learning_rate": 3.05537094771807e-06, "loss": 0.4207, "step": 2160 }, { "epoch": 0.8998968008255934, "grad_norm": 0.36471398705088465, "learning_rate": 3.022722280176568e-06, "loss": 0.4121, "step": 2180 }, { "epoch": 0.9081527347781218, "grad_norm": 0.3241199840126995, "learning_rate": 2.9899801155115204e-06, "loss": 0.418, "step": 2200 }, { "epoch": 0.9164086687306502, "grad_norm": 0.414035145600113, "learning_rate": 2.9571503101754466e-06, "loss": 0.4097, "step": 2220 }, { "epoch": 0.9246646026831785, "grad_norm": 0.3068665528352179, "learning_rate": 2.9242387362967815e-06, "loss": 0.4087, "step": 2240 }, { "epoch": 0.932920536635707, "grad_norm": 0.3120720460499234, "learning_rate": 2.8912512806295573e-06, "loss": 0.4074, "step": 2260 }, { "epoch": 0.9411764705882353, "grad_norm": 0.31140736933032415, "learning_rate": 2.858193843500455e-06, "loss": 0.4048, "step": 2280 }, { "epoch": 0.9494324045407637, "grad_norm": 0.30731820437254637, "learning_rate": 2.8250723377534474e-06, "loss": 0.4025, "step": 2300 }, { "epoch": 0.957688338493292, "grad_norm": 0.31008178937315156, "learning_rate": 2.791892687692189e-06, "loss": 0.4017, "step": 2320 }, { "epoch": 0.9659442724458205, "grad_norm": 0.31453985857721983, "learning_rate": 2.7586608280203632e-06, "loss": 0.3964, "step": 2340 }, { "epoch": 0.9742002063983488, "grad_norm": 0.3454533037677527, "learning_rate": 2.725382702780164e-06, "loss": 0.4019, "step": 2360 }, { "epoch": 0.9824561403508771, "grad_norm": 0.31086090786087217, "learning_rate": 2.6920642642891114e-06, "loss": 0.4078, "step": 2380 }, { "epoch": 0.9907120743034056, "grad_norm": 0.29964942829401026, "learning_rate": 2.6587114720753882e-06, "loss": 0.4054, "step": 2400 }, { "epoch": 0.9989680082559339, "grad_norm": 0.31845271001148817, "learning_rate": 2.625330291811882e-06, "loss": 0.4017, "step": 2420 }, { "epoch": 0.9997936016511868, "eval_loss": 0.5107099413871765, "eval_runtime": 88.8692, "eval_samples_per_second": 6.054, "eval_steps_per_second": 0.056, "step": 2422 }, { "epoch": 1.0074303405572755, "grad_norm": 0.33713877028152195, "learning_rate": 2.591926694249128e-06, "loss": 0.3594, "step": 2440 }, { "epoch": 1.0156862745098039, "grad_norm": 0.35458325001613433, "learning_rate": 2.558506654147353e-06, "loss": 0.3297, "step": 2460 }, { "epoch": 1.0239422084623322, "grad_norm": 0.33361671982328645, "learning_rate": 2.525076149207788e-06, "loss": 0.3327, "step": 2480 }, { "epoch": 1.0321981424148607, "grad_norm": 0.41547804329917404, "learning_rate": 2.4916411590034672e-06, "loss": 0.3307, "step": 2500 }, { "epoch": 1.040454076367389, "grad_norm": 0.3280076602568464, "learning_rate": 2.4582076639096837e-06, "loss": 0.3295, "step": 2520 }, { "epoch": 1.0487100103199174, "grad_norm": 0.3249157740900077, "learning_rate": 2.4247816440343078e-06, "loss": 0.322, "step": 2540 }, { "epoch": 1.0569659442724457, "grad_norm": 0.35605530558146004, "learning_rate": 2.391369078148148e-06, "loss": 0.335, "step": 2560 }, { "epoch": 1.0652218782249743, "grad_norm": 0.3549484263180951, "learning_rate": 2.3579759426155552e-06, "loss": 0.3153, "step": 2580 }, { "epoch": 1.0734778121775026, "grad_norm": 0.3465115375264916, "learning_rate": 2.324608210325456e-06, "loss": 0.3264, "step": 2600 }, { "epoch": 1.081733746130031, "grad_norm": 0.3357910072373825, "learning_rate": 2.291271849623004e-06, "loss": 0.3212, "step": 2620 }, { "epoch": 1.0899896800825593, "grad_norm": 0.344383755229491, "learning_rate": 2.2579728232420524e-06, "loss": 0.3157, "step": 2640 }, { "epoch": 1.0982456140350878, "grad_norm": 0.31538428598087054, "learning_rate": 2.2247170872386205e-06, "loss": 0.3229, "step": 2660 }, { "epoch": 1.1065015479876161, "grad_norm": 0.3246184372803657, "learning_rate": 2.1915105899255617e-06, "loss": 0.3232, "step": 2680 }, { "epoch": 1.1147574819401445, "grad_norm": 0.3443636135756491, "learning_rate": 2.158359270808612e-06, "loss": 0.3203, "step": 2700 }, { "epoch": 1.1230134158926728, "grad_norm": 0.34084061773858154, "learning_rate": 2.125269059524018e-06, "loss": 0.3276, "step": 2720 }, { "epoch": 1.1312693498452013, "grad_norm": 0.46655604655860133, "learning_rate": 2.092245874777926e-06, "loss": 0.3267, "step": 2740 }, { "epoch": 1.1395252837977297, "grad_norm": 0.33228746141272103, "learning_rate": 2.059295623287729e-06, "loss": 0.3272, "step": 2760 }, { "epoch": 1.147781217750258, "grad_norm": 0.348538665454056, "learning_rate": 2.026424198725557e-06, "loss": 0.3228, "step": 2780 }, { "epoch": 1.1560371517027863, "grad_norm": 0.3708880778253579, "learning_rate": 1.9936374806641016e-06, "loss": 0.3268, "step": 2800 }, { "epoch": 1.1642930856553146, "grad_norm": 0.3467837537594021, "learning_rate": 1.96094133352496e-06, "loss": 0.3287, "step": 2820 }, { "epoch": 1.1725490196078432, "grad_norm": 0.331776837754727, "learning_rate": 1.9283416055296906e-06, "loss": 0.3135, "step": 2840 }, { "epoch": 1.1808049535603715, "grad_norm": 0.33972138880439906, "learning_rate": 1.8958441276537695e-06, "loss": 0.3165, "step": 2860 }, { "epoch": 1.1890608875128998, "grad_norm": 0.3290544472328714, "learning_rate": 1.8634547125836246e-06, "loss": 0.3231, "step": 2880 }, { "epoch": 1.1973168214654284, "grad_norm": 0.4324023615254428, "learning_rate": 1.8311791536769485e-06, "loss": 0.3191, "step": 2900 }, { "epoch": 1.2055727554179567, "grad_norm": 0.34414515738152696, "learning_rate": 1.799023223926461e-06, "loss": 0.3214, "step": 2920 }, { "epoch": 1.213828689370485, "grad_norm": 0.3522816282310391, "learning_rate": 1.766992674927322e-06, "loss": 0.322, "step": 2940 }, { "epoch": 1.2220846233230134, "grad_norm": 0.33214309648209184, "learning_rate": 1.7350932358483675e-06, "loss": 0.3245, "step": 2960 }, { "epoch": 1.2303405572755417, "grad_norm": 0.32354241105977966, "learning_rate": 1.703330612407355e-06, "loss": 0.3189, "step": 2980 }, { "epoch": 1.2385964912280703, "grad_norm": 0.33884732729314554, "learning_rate": 1.6717104858504088e-06, "loss": 0.3211, "step": 3000 }, { "epoch": 1.2468524251805986, "grad_norm": 0.3735533087692056, "learning_rate": 1.6402385119358372e-06, "loss": 0.3188, "step": 3020 }, { "epoch": 1.255108359133127, "grad_norm": 0.3379937818585682, "learning_rate": 1.6089203199225117e-06, "loss": 0.3201, "step": 3040 }, { "epoch": 1.2633642930856552, "grad_norm": 0.33385226604310425, "learning_rate": 1.5777615115629874e-06, "loss": 0.3197, "step": 3060 }, { "epoch": 1.2716202270381838, "grad_norm": 0.3353612788825473, "learning_rate": 1.546767660101537e-06, "loss": 0.3178, "step": 3080 }, { "epoch": 1.279876160990712, "grad_norm": 0.3384674558918743, "learning_rate": 1.5159443092772924e-06, "loss": 0.3235, "step": 3100 }, { "epoch": 1.2881320949432404, "grad_norm": 0.384875131380503, "learning_rate": 1.4852969723326555e-06, "loss": 0.3187, "step": 3120 }, { "epoch": 1.2963880288957688, "grad_norm": 0.44608590681938926, "learning_rate": 1.4548311310271724e-06, "loss": 0.3246, "step": 3140 }, { "epoch": 1.304643962848297, "grad_norm": 0.33842522169860273, "learning_rate": 1.4245522346570289e-06, "loss": 0.3226, "step": 3160 }, { "epoch": 1.3128998968008256, "grad_norm": 0.38547488897682075, "learning_rate": 1.3944656990803645e-06, "loss": 0.3198, "step": 3180 }, { "epoch": 1.321155830753354, "grad_norm": 0.3552101566974137, "learning_rate": 1.364576905748553e-06, "loss": 0.3199, "step": 3200 }, { "epoch": 1.3294117647058823, "grad_norm": 0.3225112166653656, "learning_rate": 1.3348912007436538e-06, "loss": 0.3194, "step": 3220 }, { "epoch": 1.3376676986584108, "grad_norm": 0.34375670285803067, "learning_rate": 1.3054138938221708e-06, "loss": 0.3279, "step": 3240 }, { "epoch": 1.3459236326109392, "grad_norm": 0.330557856124175, "learning_rate": 1.2761502574653286e-06, "loss": 0.3199, "step": 3260 }, { "epoch": 1.3541795665634675, "grad_norm": 0.3309833353922965, "learning_rate": 1.247105525936001e-06, "loss": 0.3146, "step": 3280 }, { "epoch": 1.3624355005159958, "grad_norm": 0.3452194374171664, "learning_rate": 1.2182848943424786e-06, "loss": 0.3176, "step": 3300 }, { "epoch": 1.3706914344685242, "grad_norm": 0.3216278030301951, "learning_rate": 1.1896935177092456e-06, "loss": 0.3185, "step": 3320 }, { "epoch": 1.3789473684210527, "grad_norm": 0.4265735872664729, "learning_rate": 1.16133651005492e-06, "loss": 0.3207, "step": 3340 }, { "epoch": 1.387203302373581, "grad_norm": 0.32503296466693565, "learning_rate": 1.1332189434775354e-06, "loss": 0.3199, "step": 3360 }, { "epoch": 1.3954592363261094, "grad_norm": 0.33776061146175834, "learning_rate": 1.1053458472473078e-06, "loss": 0.317, "step": 3380 }, { "epoch": 1.403715170278638, "grad_norm": 0.33951723955583724, "learning_rate": 1.0777222069070797e-06, "loss": 0.3152, "step": 3400 }, { "epoch": 1.4119711042311662, "grad_norm": 0.35231254237908666, "learning_rate": 1.0503529633805711e-06, "loss": 0.3156, "step": 3420 }, { "epoch": 1.4202270381836946, "grad_norm": 0.34046838697292375, "learning_rate": 1.0232430120886227e-06, "loss": 0.3198, "step": 3440 }, { "epoch": 1.4284829721362229, "grad_norm": 0.31629713031455653, "learning_rate": 9.963972020735658e-07, "loss": 0.3149, "step": 3460 }, { "epoch": 1.4367389060887512, "grad_norm": 0.8014048883476281, "learning_rate": 9.698203351319019e-07, "loss": 0.3235, "step": 3480 }, { "epoch": 1.4449948400412795, "grad_norm": 0.32201920421165664, "learning_rate": 9.435171649554234e-07, "loss": 0.3217, "step": 3500 }, { "epoch": 1.453250773993808, "grad_norm": 0.36221640610133427, "learning_rate": 9.17492396280934e-07, "loss": 0.3156, "step": 3520 }, { "epoch": 1.4615067079463364, "grad_norm": 0.32962860003755834, "learning_rate": 8.917506840487448e-07, "loss": 0.3152, "step": 3540 }, { "epoch": 1.4697626418988647, "grad_norm": 0.3605181380604464, "learning_rate": 8.662966325700531e-07, "loss": 0.3206, "step": 3560 }, { "epoch": 1.4780185758513933, "grad_norm": 0.32779483961721695, "learning_rate": 8.411347947033982e-07, "loss": 0.3095, "step": 3580 }, { "epoch": 1.4862745098039216, "grad_norm": 0.3270784844703272, "learning_rate": 8.162696710403026e-07, "loss": 0.319, "step": 3600 }, { "epoch": 1.49453044375645, "grad_norm": 0.3265816952545498, "learning_rate": 7.917057091002772e-07, "loss": 0.3161, "step": 3620 }, { "epoch": 1.5027863777089783, "grad_norm": 0.32668356632045076, "learning_rate": 7.674473025353063e-07, "loss": 0.3136, "step": 3640 }, { "epoch": 1.5110423116615066, "grad_norm": 0.3329149198486246, "learning_rate": 7.434987903439795e-07, "loss": 0.3145, "step": 3660 }, { "epoch": 1.519298245614035, "grad_norm": 0.33015476593592213, "learning_rate": 7.198644560953877e-07, "loss": 0.3154, "step": 3680 }, { "epoch": 1.5275541795665635, "grad_norm": 0.3427897311331206, "learning_rate": 6.965485271629426e-07, "loss": 0.3219, "step": 3700 }, { "epoch": 1.5358101135190918, "grad_norm": 0.31710290567990695, "learning_rate": 6.735551739682458e-07, "loss": 0.3118, "step": 3720 }, { "epoch": 1.5440660474716204, "grad_norm": 0.3709576851610617, "learning_rate": 6.508885092351374e-07, "loss": 0.3147, "step": 3740 }, { "epoch": 1.5523219814241487, "grad_norm": 0.34606775925501615, "learning_rate": 6.285525872540777e-07, "loss": 0.3151, "step": 3760 }, { "epoch": 1.560577915376677, "grad_norm": 0.32914754720880296, "learning_rate": 6.065514031569658e-07, "loss": 0.319, "step": 3780 }, { "epoch": 1.5688338493292053, "grad_norm": 0.32959782478095573, "learning_rate": 5.848888922025553e-07, "loss": 0.3109, "step": 3800 }, { "epoch": 1.5770897832817337, "grad_norm": 0.3461782278815208, "learning_rate": 5.635689290725629e-07, "loss": 0.3118, "step": 3820 }, { "epoch": 1.585345717234262, "grad_norm": 0.33719762865745856, "learning_rate": 5.425953271786289e-07, "loss": 0.3142, "step": 3840 }, { "epoch": 1.5936016511867905, "grad_norm": 0.3229411688412049, "learning_rate": 5.219718379802203e-07, "loss": 0.3074, "step": 3860 }, { "epoch": 1.6018575851393189, "grad_norm": 0.339153428971675, "learning_rate": 5.017021503136299e-07, "loss": 0.3085, "step": 3880 }, { "epoch": 1.6101135190918474, "grad_norm": 0.328366872614432, "learning_rate": 4.817898897321648e-07, "loss": 0.3211, "step": 3900 }, { "epoch": 1.6183694530443757, "grad_norm": 0.3308504300204108, "learning_rate": 4.6223861785766184e-07, "loss": 0.3219, "step": 3920 }, { "epoch": 1.626625386996904, "grad_norm": 0.3290571513653433, "learning_rate": 4.430518317434351e-07, "loss": 0.3098, "step": 3940 }, { "epoch": 1.6348813209494324, "grad_norm": 0.32391898185345264, "learning_rate": 4.242329632487707e-07, "loss": 0.3165, "step": 3960 }, { "epoch": 1.6431372549019607, "grad_norm": 0.32636444036143686, "learning_rate": 4.057853784250884e-07, "loss": 0.3114, "step": 3980 }, { "epoch": 1.651393188854489, "grad_norm": 0.4579751480579007, "learning_rate": 3.877123769138652e-07, "loss": 0.3157, "step": 4000 }, { "epoch": 1.6596491228070176, "grad_norm": 0.3305784303773594, "learning_rate": 3.7001719135644793e-07, "loss": 0.3178, "step": 4020 }, { "epoch": 1.667905056759546, "grad_norm": 0.33328615826236435, "learning_rate": 3.527029868158394e-07, "loss": 0.3149, "step": 4040 }, { "epoch": 1.6761609907120743, "grad_norm": 0.3376266856154567, "learning_rate": 3.3577286021058085e-07, "loss": 0.3214, "step": 4060 }, { "epoch": 1.6844169246646028, "grad_norm": 0.36052770349787855, "learning_rate": 3.192298397608165e-07, "loss": 0.3181, "step": 4080 }, { "epoch": 1.6926728586171311, "grad_norm": 0.3186034103836647, "learning_rate": 3.0307688444664975e-07, "loss": 0.3109, "step": 4100 }, { "epoch": 1.7009287925696595, "grad_norm": 0.32364083786767023, "learning_rate": 2.873168834788842e-07, "loss": 0.3164, "step": 4120 }, { "epoch": 1.7091847265221878, "grad_norm": 0.3352263407283556, "learning_rate": 2.719526557822391e-07, "loss": 0.3135, "step": 4140 }, { "epoch": 1.717440660474716, "grad_norm": 0.324247569201673, "learning_rate": 2.5698694949114504e-07, "loss": 0.3172, "step": 4160 }, { "epoch": 1.7256965944272444, "grad_norm": 0.3241547440096801, "learning_rate": 2.4242244145819187e-07, "loss": 0.311, "step": 4180 }, { "epoch": 1.733952528379773, "grad_norm": 0.3383830207704809, "learning_rate": 2.2826173677533593e-07, "loss": 0.3155, "step": 4200 }, { "epoch": 1.7422084623323013, "grad_norm": 0.3950633617255487, "learning_rate": 2.1450736830793405e-07, "loss": 0.3142, "step": 4220 }, { "epoch": 1.7504643962848299, "grad_norm": 0.5099914758815567, "learning_rate": 2.0116179624170478e-07, "loss": 0.3107, "step": 4240 }, { "epoch": 1.7587203302373582, "grad_norm": 1.0026514809885159, "learning_rate": 1.8822740764268098e-07, "loss": 0.3087, "step": 4260 }, { "epoch": 1.7669762641898865, "grad_norm": 1662.5837568982583, "learning_rate": 1.757065160302504e-07, "loss": 0.3182, "step": 4280 }, { "epoch": 1.7752321981424148, "grad_norm": 0.3278171902579247, "learning_rate": 1.6360136096334107e-07, "loss": 0.3066, "step": 4300 }, { "epoch": 1.7834881320949432, "grad_norm": 0.3533064691586645, "learning_rate": 1.519141076398442e-07, "loss": 0.3166, "step": 4320 }, { "epoch": 1.7917440660474715, "grad_norm": 0.8178680644268199, "learning_rate": 1.406468465093344e-07, "loss": 0.3167, "step": 4340 }, { "epoch": 1.8, "grad_norm": 0.5677100980648624, "learning_rate": 1.2980159289915805e-07, "loss": 0.3139, "step": 4360 }, { "epoch": 1.8082559339525284, "grad_norm": 0.327287746933213, "learning_rate": 1.1938028665396172e-07, "loss": 0.3123, "step": 4380 }, { "epoch": 1.816511867905057, "grad_norm": 0.3594205243876799, "learning_rate": 1.0938479178871892e-07, "loss": 0.3158, "step": 4400 }, { "epoch": 1.8247678018575852, "grad_norm": 0.3265638946104959, "learning_rate": 9.981689615532364e-08, "loss": 0.3124, "step": 4420 }, { "epoch": 1.8330237358101136, "grad_norm": 0.33268480190653027, "learning_rate": 9.06783111228024e-08, "loss": 0.3161, "step": 4440 }, { "epoch": 1.841279669762642, "grad_norm": 0.3434703383531059, "learning_rate": 8.19706712712115e-08, "loss": 0.3139, "step": 4460 }, { "epoch": 1.8495356037151702, "grad_norm": 0.3476346041767014, "learning_rate": 7.369553409926427e-08, "loss": 0.313, "step": 4480 }, { "epoch": 1.8577915376676986, "grad_norm": 0.32495041685434667, "learning_rate": 6.585437974574921e-08, "loss": 0.308, "step": 4500 }, { "epoch": 1.8660474716202269, "grad_norm": 0.334105753613722, "learning_rate": 5.844861072478336e-08, "loss": 0.3146, "step": 4520 }, { "epoch": 1.8743034055727554, "grad_norm": 0.32248303496350533, "learning_rate": 5.147955167495111e-08, "loss": 0.3164, "step": 4540 }, { "epoch": 1.8825593395252838, "grad_norm": 0.3170520025481871, "learning_rate": 4.494844912237145e-08, "loss": 0.309, "step": 4560 }, { "epoch": 1.8908152734778123, "grad_norm": 0.32425657312680345, "learning_rate": 3.885647125773578e-08, "loss": 0.3158, "step": 4580 }, { "epoch": 1.8990712074303406, "grad_norm": 0.32676487500274765, "learning_rate": 3.320470772736062e-08, "loss": 0.3176, "step": 4600 }, { "epoch": 1.907327141382869, "grad_norm": 0.3366864111509815, "learning_rate": 2.799416943828598e-08, "loss": 0.318, "step": 4620 }, { "epoch": 1.9155830753353973, "grad_norm": 0.33022963805900657, "learning_rate": 2.3225788377459478e-08, "loss": 0.3161, "step": 4640 }, { "epoch": 1.9238390092879256, "grad_norm": 0.4334852710323891, "learning_rate": 1.890041744503468e-08, "loss": 0.3207, "step": 4660 }, { "epoch": 1.932094943240454, "grad_norm": 0.3368172513862861, "learning_rate": 1.5018830301817277e-08, "loss": 0.3112, "step": 4680 }, { "epoch": 1.9403508771929825, "grad_norm": 0.328478614234708, "learning_rate": 1.1581721230883302e-08, "loss": 0.3175, "step": 4700 }, { "epoch": 1.9486068111455108, "grad_norm": 0.3003501782030781, "learning_rate": 8.589705013396509e-09, "loss": 0.3056, "step": 4720 }, { "epoch": 1.9568627450980394, "grad_norm": 0.33048912522337326, "learning_rate": 6.043316818643008e-09, "loss": 0.3189, "step": 4740 }, { "epoch": 1.9651186790505677, "grad_norm": 0.3300692526475954, "learning_rate": 3.9430121083106065e-09, "loss": 0.3153, "step": 4760 }, { "epoch": 1.973374613003096, "grad_norm": 0.3265423101136748, "learning_rate": 2.2891665550200946e-09, "loss": 0.3142, "step": 4780 }, { "epoch": 1.9816305469556244, "grad_norm": 0.3321586725232749, "learning_rate": 1.0820759751309363e-09, "loss": 0.3133, "step": 4800 }, { "epoch": 1.9898864809081527, "grad_norm": 0.3312240597270669, "learning_rate": 3.219562758302597e-10, "loss": 0.3094, "step": 4820 }, { "epoch": 1.998142414860681, "grad_norm": 0.32676874574809356, "learning_rate": 8.943416513751412e-12, "loss": 0.3136, "step": 4840 }, { "epoch": 1.9997936016511868, "eval_loss": 0.49619075655937195, "eval_runtime": 93.5055, "eval_samples_per_second": 5.754, "eval_steps_per_second": 0.053, "step": 4844 }, { "epoch": 1.9997936016511868, "step": 4844, "total_flos": 2.009123424043008e+16, "train_loss": 0.08715950149479826, "train_runtime": 137060.7981, "train_samples_per_second": 4.524, "train_steps_per_second": 0.035 } ], "logging_steps": 20, "max_steps": 4844, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.009123424043008e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }