|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9997936016511868, |
|
"eval_steps": 500, |
|
"global_step": 4844, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000412796697626419, |
|
"grad_norm": 19.038115839779415, |
|
"learning_rate": 3.424657534246575e-08, |
|
"loss": 0.9872, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008255933952528379, |
|
"grad_norm": 2.2532416397119936, |
|
"learning_rate": 6.849315068493151e-07, |
|
"loss": 0.9325, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.016511867905056758, |
|
"grad_norm": 1.273128125464113, |
|
"learning_rate": 1.3698630136986302e-06, |
|
"loss": 0.7772, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02476780185758514, |
|
"grad_norm": 1.7056377918705128, |
|
"learning_rate": 2.0547945205479454e-06, |
|
"loss": 0.6878, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.033023735810113516, |
|
"grad_norm": 1.1237902715212524, |
|
"learning_rate": 2.7397260273972604e-06, |
|
"loss": 0.6306, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0412796697626419, |
|
"grad_norm": 0.7107592373281232, |
|
"learning_rate": 3.4246575342465754e-06, |
|
"loss": 0.5945, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04953560371517028, |
|
"grad_norm": 0.5435287549705509, |
|
"learning_rate": 4.109589041095891e-06, |
|
"loss": 0.5678, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05779153766769866, |
|
"grad_norm": 0.5519820327604449, |
|
"learning_rate": 4.7945205479452054e-06, |
|
"loss": 0.5645, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06604747162022703, |
|
"grad_norm": 0.45781758347158724, |
|
"learning_rate": 4.9998904438825655e-06, |
|
"loss": 0.5435, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07430340557275542, |
|
"grad_norm": 0.4113409226796853, |
|
"learning_rate": 4.9993538656061865e-06, |
|
"loss": 0.5384, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0825593395252838, |
|
"grad_norm": 0.4061348667993117, |
|
"learning_rate": 4.998370238474193e-06, |
|
"loss": 0.5251, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09081527347781218, |
|
"grad_norm": 0.3595538605503618, |
|
"learning_rate": 4.996939738423808e-06, |
|
"loss": 0.5268, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09907120743034056, |
|
"grad_norm": 0.43476988938274347, |
|
"learning_rate": 4.995062621322529e-06, |
|
"loss": 0.5218, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.10732714138286893, |
|
"grad_norm": 0.37354960760902434, |
|
"learning_rate": 4.992739222922349e-06, |
|
"loss": 0.5092, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.11558307533539731, |
|
"grad_norm": 0.541197152311886, |
|
"learning_rate": 4.989969958799716e-06, |
|
"loss": 0.5164, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1238390092879257, |
|
"grad_norm": 0.6905500202450845, |
|
"learning_rate": 4.9867553242811925e-06, |
|
"loss": 0.5093, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13209494324045407, |
|
"grad_norm": 0.35631823372676763, |
|
"learning_rate": 4.983095894354858e-06, |
|
"loss": 0.5029, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.14035087719298245, |
|
"grad_norm": 0.41074164917056266, |
|
"learning_rate": 4.978992323567467e-06, |
|
"loss": 0.5081, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14860681114551083, |
|
"grad_norm": 0.38311089041727175, |
|
"learning_rate": 4.974445345907372e-06, |
|
"loss": 0.4942, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1568627450980392, |
|
"grad_norm": 1.276850727632135, |
|
"learning_rate": 4.9694557746732365e-06, |
|
"loss": 0.4941, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.1651186790505676, |
|
"grad_norm": 0.37634442822337005, |
|
"learning_rate": 4.9640245023285645e-06, |
|
"loss": 0.488, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.17337461300309598, |
|
"grad_norm": 0.34093150208574735, |
|
"learning_rate": 4.958152500342071e-06, |
|
"loss": 0.4936, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.18163054695562436, |
|
"grad_norm": 0.3697170253248781, |
|
"learning_rate": 4.951840819013918e-06, |
|
"loss": 0.4823, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18988648090815274, |
|
"grad_norm": 0.3673753466126514, |
|
"learning_rate": 4.94509058728785e-06, |
|
"loss": 0.4865, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19814241486068113, |
|
"grad_norm": 0.36105295583859276, |
|
"learning_rate": 4.937903012549266e-06, |
|
"loss": 0.4754, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.20639834881320948, |
|
"grad_norm": 0.36846225805717936, |
|
"learning_rate": 4.930279380409266e-06, |
|
"loss": 0.4704, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.21465428276573786, |
|
"grad_norm": 0.3173068336889516, |
|
"learning_rate": 4.922221054474686e-06, |
|
"loss": 0.473, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.22291021671826625, |
|
"grad_norm": 0.35259900729321336, |
|
"learning_rate": 4.913729476104205e-06, |
|
"loss": 0.4595, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.23116615067079463, |
|
"grad_norm": 0.36729347937445916, |
|
"learning_rate": 4.9048061641505355e-06, |
|
"loss": 0.4741, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.239422084623323, |
|
"grad_norm": 0.3318610632934225, |
|
"learning_rate": 4.8954527146887455e-06, |
|
"loss": 0.4648, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2476780185758514, |
|
"grad_norm": 5.235550050863072, |
|
"learning_rate": 4.885670800730784e-06, |
|
"loss": 0.4768, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.25593395252837975, |
|
"grad_norm": 0.3182067180223638, |
|
"learning_rate": 4.87546217192623e-06, |
|
"loss": 0.4659, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.26418988648090813, |
|
"grad_norm": 0.3656873978361783, |
|
"learning_rate": 4.864828654249344e-06, |
|
"loss": 0.4598, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2724458204334365, |
|
"grad_norm": 0.3501798939645083, |
|
"learning_rate": 4.853772149672461e-06, |
|
"loss": 0.4681, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.2807017543859649, |
|
"grad_norm": 0.3777815400079612, |
|
"learning_rate": 4.842294635825794e-06, |
|
"loss": 0.4615, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.2889576883384933, |
|
"grad_norm": 0.38852070579564624, |
|
"learning_rate": 4.830398165643704e-06, |
|
"loss": 0.4518, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.29721362229102166, |
|
"grad_norm": 0.32960545022053456, |
|
"learning_rate": 4.818084866997499e-06, |
|
"loss": 0.4551, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.30546955624355004, |
|
"grad_norm": 0.3581229581442626, |
|
"learning_rate": 4.805356942314833e-06, |
|
"loss": 0.4585, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.3137254901960784, |
|
"grad_norm": 0.32377521697576256, |
|
"learning_rate": 4.792216668185765e-06, |
|
"loss": 0.4469, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.3219814241486068, |
|
"grad_norm": 0.32217232079348435, |
|
"learning_rate": 4.778666394955554e-06, |
|
"loss": 0.4626, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.3302373581011352, |
|
"grad_norm": 0.4595767097639659, |
|
"learning_rate": 4.764708546304267e-06, |
|
"loss": 0.4624, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3384932920536636, |
|
"grad_norm": 0.32949086041842535, |
|
"learning_rate": 4.75034561881326e-06, |
|
"loss": 0.446, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.34674922600619196, |
|
"grad_norm": 0.4894422691515238, |
|
"learning_rate": 4.735580181518631e-06, |
|
"loss": 0.4541, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.35500515995872034, |
|
"grad_norm": 0.3280899378337465, |
|
"learning_rate": 4.720414875451701e-06, |
|
"loss": 0.4487, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.3632610939112487, |
|
"grad_norm": 1.837588625073277, |
|
"learning_rate": 4.704852413166629e-06, |
|
"loss": 0.4518, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3715170278637771, |
|
"grad_norm": 0.40018979567066226, |
|
"learning_rate": 4.688895578255228e-06, |
|
"loss": 0.4465, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3797729618163055, |
|
"grad_norm": 0.3242631003772359, |
|
"learning_rate": 4.672547224849072e-06, |
|
"loss": 0.4411, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.38802889576883387, |
|
"grad_norm": 0.3442813940284623, |
|
"learning_rate": 4.655810277108994e-06, |
|
"loss": 0.4457, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.39628482972136225, |
|
"grad_norm": 0.32602733959556546, |
|
"learning_rate": 4.638687728702054e-06, |
|
"loss": 0.4433, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.40454076367389064, |
|
"grad_norm": 0.354500321094183, |
|
"learning_rate": 4.6211826422660685e-06, |
|
"loss": 0.4383, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.41279669762641896, |
|
"grad_norm": 0.348616547530873, |
|
"learning_rate": 4.6032981488618155e-06, |
|
"loss": 0.4413, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.42105263157894735, |
|
"grad_norm": 0.32717814527815386, |
|
"learning_rate": 4.585037447412993e-06, |
|
"loss": 0.4368, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.4293085655314757, |
|
"grad_norm": 0.6796110151903153, |
|
"learning_rate": 4.566403804134042e-06, |
|
"loss": 0.4436, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.4375644994840041, |
|
"grad_norm": 0.3316261638741136, |
|
"learning_rate": 4.547400551945927e-06, |
|
"loss": 0.4425, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.4458204334365325, |
|
"grad_norm": 0.40026371472445244, |
|
"learning_rate": 4.528031089880001e-06, |
|
"loss": 0.4457, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.4540763673890609, |
|
"grad_norm": 0.32173726498871624, |
|
"learning_rate": 4.508298882470025e-06, |
|
"loss": 0.4437, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.46233230134158926, |
|
"grad_norm": 0.33503635231733775, |
|
"learning_rate": 4.488207459132484e-06, |
|
"loss": 0.4379, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.35244768504197327, |
|
"learning_rate": 4.467760413535303e-06, |
|
"loss": 0.44, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.478844169246646, |
|
"grad_norm": 0.31115577007431877, |
|
"learning_rate": 4.44696140295505e-06, |
|
"loss": 0.4329, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.4871001031991744, |
|
"grad_norm": 0.3238091738004505, |
|
"learning_rate": 4.425814147622786e-06, |
|
"loss": 0.4337, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.4953560371517028, |
|
"grad_norm": 0.49541016985287156, |
|
"learning_rate": 4.404322430058634e-06, |
|
"loss": 0.4287, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5036119711042312, |
|
"grad_norm": 0.37221572328367447, |
|
"learning_rate": 4.382490094395223e-06, |
|
"loss": 0.464, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5118679050567595, |
|
"grad_norm": 0.33878190692897947, |
|
"learning_rate": 4.360321045690098e-06, |
|
"loss": 0.4409, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.5201238390092879, |
|
"grad_norm": 0.3936174788244509, |
|
"learning_rate": 4.337819249227243e-06, |
|
"loss": 0.4343, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.5283797729618163, |
|
"grad_norm": 0.33598616505477236, |
|
"learning_rate": 4.3149887298078275e-06, |
|
"loss": 0.431, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.5366357069143447, |
|
"grad_norm": 0.3519965938188954, |
|
"learning_rate": 4.2918335710303035e-06, |
|
"loss": 0.426, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.544891640866873, |
|
"grad_norm": 0.320454217828392, |
|
"learning_rate": 4.268357914559994e-06, |
|
"loss": 0.4304, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.5531475748194015, |
|
"grad_norm": 0.3719935953584569, |
|
"learning_rate": 4.244565959388287e-06, |
|
"loss": 0.436, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.5614035087719298, |
|
"grad_norm": 0.3613570450407545, |
|
"learning_rate": 4.2204619610815846e-06, |
|
"loss": 0.4335, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.5696594427244582, |
|
"grad_norm": 0.3126539947940401, |
|
"learning_rate": 4.19605023102012e-06, |
|
"loss": 0.4263, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.5779153766769866, |
|
"grad_norm": 0.6458529290452903, |
|
"learning_rate": 4.171335135626809e-06, |
|
"loss": 0.4265, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.586171310629515, |
|
"grad_norm": 0.3604632525755521, |
|
"learning_rate": 4.146321095586238e-06, |
|
"loss": 0.4362, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.5944272445820433, |
|
"grad_norm": 0.30925366125534925, |
|
"learning_rate": 4.121012585053958e-06, |
|
"loss": 0.4233, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.6026831785345718, |
|
"grad_norm": 0.3100306182604728, |
|
"learning_rate": 4.095414130856215e-06, |
|
"loss": 0.4274, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6109391124871001, |
|
"grad_norm": 0.3253333926411185, |
|
"learning_rate": 4.069530311680247e-06, |
|
"loss": 0.4215, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6191950464396285, |
|
"grad_norm": 0.3161203579495709, |
|
"learning_rate": 4.043365757255323e-06, |
|
"loss": 0.4195, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6274509803921569, |
|
"grad_norm": 0.5372305576307022, |
|
"learning_rate": 4.016925147524638e-06, |
|
"loss": 0.4207, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.6357069143446853, |
|
"grad_norm": 0.4217012521666707, |
|
"learning_rate": 3.99021321180823e-06, |
|
"loss": 0.4189, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.6439628482972136, |
|
"grad_norm": 0.31058380585253026, |
|
"learning_rate": 3.96323472795707e-06, |
|
"loss": 0.4224, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.6522187822497421, |
|
"grad_norm": 0.3096196075520948, |
|
"learning_rate": 3.935994521498468e-06, |
|
"loss": 0.4298, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.6604747162022704, |
|
"grad_norm": 0.31321817530598023, |
|
"learning_rate": 3.908497464772946e-06, |
|
"loss": 0.4165, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6687306501547987, |
|
"grad_norm": 0.34189314788820424, |
|
"learning_rate": 3.880748476062751e-06, |
|
"loss": 0.4209, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.6769865841073271, |
|
"grad_norm": 0.314020426119392, |
|
"learning_rate": 3.852752518712135e-06, |
|
"loss": 0.4235, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6852425180598555, |
|
"grad_norm": 0.33309865122629745, |
|
"learning_rate": 3.824514600239591e-06, |
|
"loss": 0.4272, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6934984520123839, |
|
"grad_norm": 0.31011244271423327, |
|
"learning_rate": 3.796039771442169e-06, |
|
"loss": 0.4195, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.7017543859649122, |
|
"grad_norm": 0.31754715779100456, |
|
"learning_rate": 3.767333125492072e-06, |
|
"loss": 0.4177, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7100103199174407, |
|
"grad_norm": 0.325717634618246, |
|
"learning_rate": 3.7383997970256525e-06, |
|
"loss": 0.4208, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.718266253869969, |
|
"grad_norm": 0.3221005127147903, |
|
"learning_rate": 3.7092449612250083e-06, |
|
"loss": 0.4258, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.7265221878224974, |
|
"grad_norm": 2.3793212357938245, |
|
"learning_rate": 3.6798738328923162e-06, |
|
"loss": 0.4117, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.7347781217750258, |
|
"grad_norm": 0.34615783300395964, |
|
"learning_rate": 3.650291665517085e-06, |
|
"loss": 0.4259, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.7430340557275542, |
|
"grad_norm": 0.30258388313281476, |
|
"learning_rate": 3.6205037503364827e-06, |
|
"loss": 0.4138, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7512899896800825, |
|
"grad_norm": 0.319420203738432, |
|
"learning_rate": 3.590515415388919e-06, |
|
"loss": 0.414, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.759545923632611, |
|
"grad_norm": 0.3131116585624073, |
|
"learning_rate": 3.5603320245610375e-06, |
|
"loss": 0.4171, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.7678018575851393, |
|
"grad_norm": 0.44357805104504766, |
|
"learning_rate": 3.529958976628304e-06, |
|
"loss": 0.4149, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7760577915376677, |
|
"grad_norm": 0.3482318385617704, |
|
"learning_rate": 3.499401704289347e-06, |
|
"loss": 0.4068, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 1.4359924674280429, |
|
"learning_rate": 3.468665673194237e-06, |
|
"loss": 0.412, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7925696594427245, |
|
"grad_norm": 0.30332274823266964, |
|
"learning_rate": 3.437756380966866e-06, |
|
"loss": 0.4136, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.8008255933952528, |
|
"grad_norm": 0.30856700243712654, |
|
"learning_rate": 3.4066793562216135e-06, |
|
"loss": 0.41, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.8090815273477813, |
|
"grad_norm": 0.42071490847162873, |
|
"learning_rate": 3.375440157574462e-06, |
|
"loss": 0.4061, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.8173374613003096, |
|
"grad_norm": 2.6302213294970933, |
|
"learning_rate": 3.3440443726487583e-06, |
|
"loss": 0.4025, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.8255933952528379, |
|
"grad_norm": 0.29774501528511826, |
|
"learning_rate": 3.3124976170757694e-06, |
|
"loss": 0.4201, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8338493292053664, |
|
"grad_norm": 0.32604189076380047, |
|
"learning_rate": 3.2808055334902487e-06, |
|
"loss": 0.4091, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.8421052631578947, |
|
"grad_norm": 0.3285267181834793, |
|
"learning_rate": 3.2489737905211537e-06, |
|
"loss": 0.4175, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.8503611971104231, |
|
"grad_norm": 0.310713417485478, |
|
"learning_rate": 3.217008081777726e-06, |
|
"loss": 0.4092, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.8586171310629515, |
|
"grad_norm": 0.3073088712514811, |
|
"learning_rate": 3.184914124831098e-06, |
|
"loss": 0.4053, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.8668730650154799, |
|
"grad_norm": 0.337054135152426, |
|
"learning_rate": 3.1526976601916153e-06, |
|
"loss": 0.4112, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8751289989680082, |
|
"grad_norm": 0.35614170658863653, |
|
"learning_rate": 3.1203644502820592e-06, |
|
"loss": 0.4111, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.8833849329205367, |
|
"grad_norm": 0.31220727299527445, |
|
"learning_rate": 3.0879202784069407e-06, |
|
"loss": 0.4143, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.891640866873065, |
|
"grad_norm": 0.31989316355962877, |
|
"learning_rate": 3.05537094771807e-06, |
|
"loss": 0.4207, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.8998968008255934, |
|
"grad_norm": 0.36471398705088465, |
|
"learning_rate": 3.022722280176568e-06, |
|
"loss": 0.4121, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.9081527347781218, |
|
"grad_norm": 0.3241199840126995, |
|
"learning_rate": 2.9899801155115204e-06, |
|
"loss": 0.418, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9164086687306502, |
|
"grad_norm": 0.414035145600113, |
|
"learning_rate": 2.9571503101754466e-06, |
|
"loss": 0.4097, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.9246646026831785, |
|
"grad_norm": 0.3068665528352179, |
|
"learning_rate": 2.9242387362967815e-06, |
|
"loss": 0.4087, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.932920536635707, |
|
"grad_norm": 0.3120720460499234, |
|
"learning_rate": 2.8912512806295573e-06, |
|
"loss": 0.4074, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.31140736933032415, |
|
"learning_rate": 2.858193843500455e-06, |
|
"loss": 0.4048, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.9494324045407637, |
|
"grad_norm": 0.30731820437254637, |
|
"learning_rate": 2.8250723377534474e-06, |
|
"loss": 0.4025, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.957688338493292, |
|
"grad_norm": 0.31008178937315156, |
|
"learning_rate": 2.791892687692189e-06, |
|
"loss": 0.4017, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.9659442724458205, |
|
"grad_norm": 0.31453985857721983, |
|
"learning_rate": 2.7586608280203632e-06, |
|
"loss": 0.3964, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.9742002063983488, |
|
"grad_norm": 0.3454533037677527, |
|
"learning_rate": 2.725382702780164e-06, |
|
"loss": 0.4019, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.9824561403508771, |
|
"grad_norm": 0.31086090786087217, |
|
"learning_rate": 2.6920642642891114e-06, |
|
"loss": 0.4078, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.9907120743034056, |
|
"grad_norm": 0.29964942829401026, |
|
"learning_rate": 2.6587114720753882e-06, |
|
"loss": 0.4054, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9989680082559339, |
|
"grad_norm": 0.31845271001148817, |
|
"learning_rate": 2.625330291811882e-06, |
|
"loss": 0.4017, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.9997936016511868, |
|
"eval_loss": 0.5107099413871765, |
|
"eval_runtime": 88.8692, |
|
"eval_samples_per_second": 6.054, |
|
"eval_steps_per_second": 0.056, |
|
"step": 2422 |
|
}, |
|
{ |
|
"epoch": 1.0074303405572755, |
|
"grad_norm": 0.33713877028152195, |
|
"learning_rate": 2.591926694249128e-06, |
|
"loss": 0.3594, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.0156862745098039, |
|
"grad_norm": 0.35458325001613433, |
|
"learning_rate": 2.558506654147353e-06, |
|
"loss": 0.3297, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.0239422084623322, |
|
"grad_norm": 0.33361671982328645, |
|
"learning_rate": 2.525076149207788e-06, |
|
"loss": 0.3327, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.0321981424148607, |
|
"grad_norm": 0.41547804329917404, |
|
"learning_rate": 2.4916411590034672e-06, |
|
"loss": 0.3307, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.040454076367389, |
|
"grad_norm": 0.3280076602568464, |
|
"learning_rate": 2.4582076639096837e-06, |
|
"loss": 0.3295, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.0487100103199174, |
|
"grad_norm": 0.3249157740900077, |
|
"learning_rate": 2.4247816440343078e-06, |
|
"loss": 0.322, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.0569659442724457, |
|
"grad_norm": 0.35605530558146004, |
|
"learning_rate": 2.391369078148148e-06, |
|
"loss": 0.335, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.0652218782249743, |
|
"grad_norm": 0.3549484263180951, |
|
"learning_rate": 2.3579759426155552e-06, |
|
"loss": 0.3153, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.0734778121775026, |
|
"grad_norm": 0.3465115375264916, |
|
"learning_rate": 2.324608210325456e-06, |
|
"loss": 0.3264, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.081733746130031, |
|
"grad_norm": 0.3357910072373825, |
|
"learning_rate": 2.291271849623004e-06, |
|
"loss": 0.3212, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.0899896800825593, |
|
"grad_norm": 0.344383755229491, |
|
"learning_rate": 2.2579728232420524e-06, |
|
"loss": 0.3157, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.0982456140350878, |
|
"grad_norm": 0.31538428598087054, |
|
"learning_rate": 2.2247170872386205e-06, |
|
"loss": 0.3229, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.1065015479876161, |
|
"grad_norm": 0.3246184372803657, |
|
"learning_rate": 2.1915105899255617e-06, |
|
"loss": 0.3232, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.1147574819401445, |
|
"grad_norm": 0.3443636135756491, |
|
"learning_rate": 2.158359270808612e-06, |
|
"loss": 0.3203, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.1230134158926728, |
|
"grad_norm": 0.34084061773858154, |
|
"learning_rate": 2.125269059524018e-06, |
|
"loss": 0.3276, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.1312693498452013, |
|
"grad_norm": 0.46655604655860133, |
|
"learning_rate": 2.092245874777926e-06, |
|
"loss": 0.3267, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.1395252837977297, |
|
"grad_norm": 0.33228746141272103, |
|
"learning_rate": 2.059295623287729e-06, |
|
"loss": 0.3272, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.147781217750258, |
|
"grad_norm": 0.348538665454056, |
|
"learning_rate": 2.026424198725557e-06, |
|
"loss": 0.3228, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.1560371517027863, |
|
"grad_norm": 0.3708880778253579, |
|
"learning_rate": 1.9936374806641016e-06, |
|
"loss": 0.3268, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.1642930856553146, |
|
"grad_norm": 0.3467837537594021, |
|
"learning_rate": 1.96094133352496e-06, |
|
"loss": 0.3287, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.1725490196078432, |
|
"grad_norm": 0.331776837754727, |
|
"learning_rate": 1.9283416055296906e-06, |
|
"loss": 0.3135, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.1808049535603715, |
|
"grad_norm": 0.33972138880439906, |
|
"learning_rate": 1.8958441276537695e-06, |
|
"loss": 0.3165, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.1890608875128998, |
|
"grad_norm": 0.3290544472328714, |
|
"learning_rate": 1.8634547125836246e-06, |
|
"loss": 0.3231, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.1973168214654284, |
|
"grad_norm": 0.4324023615254428, |
|
"learning_rate": 1.8311791536769485e-06, |
|
"loss": 0.3191, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.2055727554179567, |
|
"grad_norm": 0.34414515738152696, |
|
"learning_rate": 1.799023223926461e-06, |
|
"loss": 0.3214, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.213828689370485, |
|
"grad_norm": 0.3522816282310391, |
|
"learning_rate": 1.766992674927322e-06, |
|
"loss": 0.322, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.2220846233230134, |
|
"grad_norm": 0.33214309648209184, |
|
"learning_rate": 1.7350932358483675e-06, |
|
"loss": 0.3245, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.2303405572755417, |
|
"grad_norm": 0.32354241105977966, |
|
"learning_rate": 1.703330612407355e-06, |
|
"loss": 0.3189, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.2385964912280703, |
|
"grad_norm": 0.33884732729314554, |
|
"learning_rate": 1.6717104858504088e-06, |
|
"loss": 0.3211, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.2468524251805986, |
|
"grad_norm": 0.3735533087692056, |
|
"learning_rate": 1.6402385119358372e-06, |
|
"loss": 0.3188, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.255108359133127, |
|
"grad_norm": 0.3379937818585682, |
|
"learning_rate": 1.6089203199225117e-06, |
|
"loss": 0.3201, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.2633642930856552, |
|
"grad_norm": 0.33385226604310425, |
|
"learning_rate": 1.5777615115629874e-06, |
|
"loss": 0.3197, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.2716202270381838, |
|
"grad_norm": 0.3353612788825473, |
|
"learning_rate": 1.546767660101537e-06, |
|
"loss": 0.3178, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.279876160990712, |
|
"grad_norm": 0.3384674558918743, |
|
"learning_rate": 1.5159443092772924e-06, |
|
"loss": 0.3235, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.2881320949432404, |
|
"grad_norm": 0.384875131380503, |
|
"learning_rate": 1.4852969723326555e-06, |
|
"loss": 0.3187, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.2963880288957688, |
|
"grad_norm": 0.44608590681938926, |
|
"learning_rate": 1.4548311310271724e-06, |
|
"loss": 0.3246, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.304643962848297, |
|
"grad_norm": 0.33842522169860273, |
|
"learning_rate": 1.4245522346570289e-06, |
|
"loss": 0.3226, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.3128998968008256, |
|
"grad_norm": 0.38547488897682075, |
|
"learning_rate": 1.3944656990803645e-06, |
|
"loss": 0.3198, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.321155830753354, |
|
"grad_norm": 0.3552101566974137, |
|
"learning_rate": 1.364576905748553e-06, |
|
"loss": 0.3199, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.3294117647058823, |
|
"grad_norm": 0.3225112166653656, |
|
"learning_rate": 1.3348912007436538e-06, |
|
"loss": 0.3194, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.3376676986584108, |
|
"grad_norm": 0.34375670285803067, |
|
"learning_rate": 1.3054138938221708e-06, |
|
"loss": 0.3279, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.3459236326109392, |
|
"grad_norm": 0.330557856124175, |
|
"learning_rate": 1.2761502574653286e-06, |
|
"loss": 0.3199, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.3541795665634675, |
|
"grad_norm": 0.3309833353922965, |
|
"learning_rate": 1.247105525936001e-06, |
|
"loss": 0.3146, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.3624355005159958, |
|
"grad_norm": 0.3452194374171664, |
|
"learning_rate": 1.2182848943424786e-06, |
|
"loss": 0.3176, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.3706914344685242, |
|
"grad_norm": 0.3216278030301951, |
|
"learning_rate": 1.1896935177092456e-06, |
|
"loss": 0.3185, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.3789473684210527, |
|
"grad_norm": 0.4265735872664729, |
|
"learning_rate": 1.16133651005492e-06, |
|
"loss": 0.3207, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.387203302373581, |
|
"grad_norm": 0.32503296466693565, |
|
"learning_rate": 1.1332189434775354e-06, |
|
"loss": 0.3199, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.3954592363261094, |
|
"grad_norm": 0.33776061146175834, |
|
"learning_rate": 1.1053458472473078e-06, |
|
"loss": 0.317, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.403715170278638, |
|
"grad_norm": 0.33951723955583724, |
|
"learning_rate": 1.0777222069070797e-06, |
|
"loss": 0.3152, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.4119711042311662, |
|
"grad_norm": 0.35231254237908666, |
|
"learning_rate": 1.0503529633805711e-06, |
|
"loss": 0.3156, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.4202270381836946, |
|
"grad_norm": 0.34046838697292375, |
|
"learning_rate": 1.0232430120886227e-06, |
|
"loss": 0.3198, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.4284829721362229, |
|
"grad_norm": 0.31629713031455653, |
|
"learning_rate": 9.963972020735658e-07, |
|
"loss": 0.3149, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.4367389060887512, |
|
"grad_norm": 0.8014048883476281, |
|
"learning_rate": 9.698203351319019e-07, |
|
"loss": 0.3235, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.4449948400412795, |
|
"grad_norm": 0.32201920421165664, |
|
"learning_rate": 9.435171649554234e-07, |
|
"loss": 0.3217, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.453250773993808, |
|
"grad_norm": 0.36221640610133427, |
|
"learning_rate": 9.17492396280934e-07, |
|
"loss": 0.3156, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 1.4615067079463364, |
|
"grad_norm": 0.32962860003755834, |
|
"learning_rate": 8.917506840487448e-07, |
|
"loss": 0.3152, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.4697626418988647, |
|
"grad_norm": 0.3605181380604464, |
|
"learning_rate": 8.662966325700531e-07, |
|
"loss": 0.3206, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.4780185758513933, |
|
"grad_norm": 0.32779483961721695, |
|
"learning_rate": 8.411347947033982e-07, |
|
"loss": 0.3095, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.4862745098039216, |
|
"grad_norm": 0.3270784844703272, |
|
"learning_rate": 8.162696710403026e-07, |
|
"loss": 0.319, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.49453044375645, |
|
"grad_norm": 0.3265816952545498, |
|
"learning_rate": 7.917057091002772e-07, |
|
"loss": 0.3161, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 1.5027863777089783, |
|
"grad_norm": 0.32668356632045076, |
|
"learning_rate": 7.674473025353063e-07, |
|
"loss": 0.3136, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.5110423116615066, |
|
"grad_norm": 0.3329149198486246, |
|
"learning_rate": 7.434987903439795e-07, |
|
"loss": 0.3145, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.519298245614035, |
|
"grad_norm": 0.33015476593592213, |
|
"learning_rate": 7.198644560953877e-07, |
|
"loss": 0.3154, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.5275541795665635, |
|
"grad_norm": 0.3427897311331206, |
|
"learning_rate": 6.965485271629426e-07, |
|
"loss": 0.3219, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.5358101135190918, |
|
"grad_norm": 0.31710290567990695, |
|
"learning_rate": 6.735551739682458e-07, |
|
"loss": 0.3118, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.5440660474716204, |
|
"grad_norm": 0.3709576851610617, |
|
"learning_rate": 6.508885092351374e-07, |
|
"loss": 0.3147, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.5523219814241487, |
|
"grad_norm": 0.34606775925501615, |
|
"learning_rate": 6.285525872540777e-07, |
|
"loss": 0.3151, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.560577915376677, |
|
"grad_norm": 0.32914754720880296, |
|
"learning_rate": 6.065514031569658e-07, |
|
"loss": 0.319, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.5688338493292053, |
|
"grad_norm": 0.32959782478095573, |
|
"learning_rate": 5.848888922025553e-07, |
|
"loss": 0.3109, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.5770897832817337, |
|
"grad_norm": 0.3461782278815208, |
|
"learning_rate": 5.635689290725629e-07, |
|
"loss": 0.3118, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.585345717234262, |
|
"grad_norm": 0.33719762865745856, |
|
"learning_rate": 5.425953271786289e-07, |
|
"loss": 0.3142, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.5936016511867905, |
|
"grad_norm": 0.3229411688412049, |
|
"learning_rate": 5.219718379802203e-07, |
|
"loss": 0.3074, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.6018575851393189, |
|
"grad_norm": 0.339153428971675, |
|
"learning_rate": 5.017021503136299e-07, |
|
"loss": 0.3085, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.6101135190918474, |
|
"grad_norm": 0.328366872614432, |
|
"learning_rate": 4.817898897321648e-07, |
|
"loss": 0.3211, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.6183694530443757, |
|
"grad_norm": 0.3308504300204108, |
|
"learning_rate": 4.6223861785766184e-07, |
|
"loss": 0.3219, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.626625386996904, |
|
"grad_norm": 0.3290571513653433, |
|
"learning_rate": 4.430518317434351e-07, |
|
"loss": 0.3098, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.6348813209494324, |
|
"grad_norm": 0.32391898185345264, |
|
"learning_rate": 4.242329632487707e-07, |
|
"loss": 0.3165, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.6431372549019607, |
|
"grad_norm": 0.32636444036143686, |
|
"learning_rate": 4.057853784250884e-07, |
|
"loss": 0.3114, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.651393188854489, |
|
"grad_norm": 0.4579751480579007, |
|
"learning_rate": 3.877123769138652e-07, |
|
"loss": 0.3157, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.6596491228070176, |
|
"grad_norm": 0.3305784303773594, |
|
"learning_rate": 3.7001719135644793e-07, |
|
"loss": 0.3178, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.667905056759546, |
|
"grad_norm": 0.33328615826236435, |
|
"learning_rate": 3.527029868158394e-07, |
|
"loss": 0.3149, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.6761609907120743, |
|
"grad_norm": 0.3376266856154567, |
|
"learning_rate": 3.3577286021058085e-07, |
|
"loss": 0.3214, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.6844169246646028, |
|
"grad_norm": 0.36052770349787855, |
|
"learning_rate": 3.192298397608165e-07, |
|
"loss": 0.3181, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.6926728586171311, |
|
"grad_norm": 0.3186034103836647, |
|
"learning_rate": 3.0307688444664975e-07, |
|
"loss": 0.3109, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.7009287925696595, |
|
"grad_norm": 0.32364083786767023, |
|
"learning_rate": 2.873168834788842e-07, |
|
"loss": 0.3164, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.7091847265221878, |
|
"grad_norm": 0.3352263407283556, |
|
"learning_rate": 2.719526557822391e-07, |
|
"loss": 0.3135, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.717440660474716, |
|
"grad_norm": 0.324247569201673, |
|
"learning_rate": 2.5698694949114504e-07, |
|
"loss": 0.3172, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.7256965944272444, |
|
"grad_norm": 0.3241547440096801, |
|
"learning_rate": 2.4242244145819187e-07, |
|
"loss": 0.311, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.733952528379773, |
|
"grad_norm": 0.3383830207704809, |
|
"learning_rate": 2.2826173677533593e-07, |
|
"loss": 0.3155, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.7422084623323013, |
|
"grad_norm": 0.3950633617255487, |
|
"learning_rate": 2.1450736830793405e-07, |
|
"loss": 0.3142, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.7504643962848299, |
|
"grad_norm": 0.5099914758815567, |
|
"learning_rate": 2.0116179624170478e-07, |
|
"loss": 0.3107, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.7587203302373582, |
|
"grad_norm": 1.0026514809885159, |
|
"learning_rate": 1.8822740764268098e-07, |
|
"loss": 0.3087, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.7669762641898865, |
|
"grad_norm": 1662.5837568982583, |
|
"learning_rate": 1.757065160302504e-07, |
|
"loss": 0.3182, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.7752321981424148, |
|
"grad_norm": 0.3278171902579247, |
|
"learning_rate": 1.6360136096334107e-07, |
|
"loss": 0.3066, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.7834881320949432, |
|
"grad_norm": 0.3533064691586645, |
|
"learning_rate": 1.519141076398442e-07, |
|
"loss": 0.3166, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.7917440660474715, |
|
"grad_norm": 0.8178680644268199, |
|
"learning_rate": 1.406468465093344e-07, |
|
"loss": 0.3167, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.5677100980648624, |
|
"learning_rate": 1.2980159289915805e-07, |
|
"loss": 0.3139, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.8082559339525284, |
|
"grad_norm": 0.327287746933213, |
|
"learning_rate": 1.1938028665396172e-07, |
|
"loss": 0.3123, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 1.816511867905057, |
|
"grad_norm": 0.3594205243876799, |
|
"learning_rate": 1.0938479178871892e-07, |
|
"loss": 0.3158, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.8247678018575852, |
|
"grad_norm": 0.3265638946104959, |
|
"learning_rate": 9.981689615532364e-08, |
|
"loss": 0.3124, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 1.8330237358101136, |
|
"grad_norm": 0.33268480190653027, |
|
"learning_rate": 9.06783111228024e-08, |
|
"loss": 0.3161, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 1.841279669762642, |
|
"grad_norm": 0.3434703383531059, |
|
"learning_rate": 8.19706712712115e-08, |
|
"loss": 0.3139, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 1.8495356037151702, |
|
"grad_norm": 0.3476346041767014, |
|
"learning_rate": 7.369553409926427e-08, |
|
"loss": 0.313, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 1.8577915376676986, |
|
"grad_norm": 0.32495041685434667, |
|
"learning_rate": 6.585437974574921e-08, |
|
"loss": 0.308, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.8660474716202269, |
|
"grad_norm": 0.334105753613722, |
|
"learning_rate": 5.844861072478336e-08, |
|
"loss": 0.3146, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 1.8743034055727554, |
|
"grad_norm": 0.32248303496350533, |
|
"learning_rate": 5.147955167495111e-08, |
|
"loss": 0.3164, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 1.8825593395252838, |
|
"grad_norm": 0.3170520025481871, |
|
"learning_rate": 4.494844912237145e-08, |
|
"loss": 0.309, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 1.8908152734778123, |
|
"grad_norm": 0.32425657312680345, |
|
"learning_rate": 3.885647125773578e-08, |
|
"loss": 0.3158, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 1.8990712074303406, |
|
"grad_norm": 0.32676487500274765, |
|
"learning_rate": 3.320470772736062e-08, |
|
"loss": 0.3176, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.907327141382869, |
|
"grad_norm": 0.3366864111509815, |
|
"learning_rate": 2.799416943828598e-08, |
|
"loss": 0.318, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 1.9155830753353973, |
|
"grad_norm": 0.33022963805900657, |
|
"learning_rate": 2.3225788377459478e-08, |
|
"loss": 0.3161, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 1.9238390092879256, |
|
"grad_norm": 0.4334852710323891, |
|
"learning_rate": 1.890041744503468e-08, |
|
"loss": 0.3207, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 1.932094943240454, |
|
"grad_norm": 0.3368172513862861, |
|
"learning_rate": 1.5018830301817277e-08, |
|
"loss": 0.3112, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 1.9403508771929825, |
|
"grad_norm": 0.328478614234708, |
|
"learning_rate": 1.1581721230883302e-08, |
|
"loss": 0.3175, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.9486068111455108, |
|
"grad_norm": 0.3003501782030781, |
|
"learning_rate": 8.589705013396509e-09, |
|
"loss": 0.3056, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 1.9568627450980394, |
|
"grad_norm": 0.33048912522337326, |
|
"learning_rate": 6.043316818643008e-09, |
|
"loss": 0.3189, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 1.9651186790505677, |
|
"grad_norm": 0.3300692526475954, |
|
"learning_rate": 3.9430121083106065e-09, |
|
"loss": 0.3153, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 1.973374613003096, |
|
"grad_norm": 0.3265423101136748, |
|
"learning_rate": 2.2891665550200946e-09, |
|
"loss": 0.3142, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 1.9816305469556244, |
|
"grad_norm": 0.3321586725232749, |
|
"learning_rate": 1.0820759751309363e-09, |
|
"loss": 0.3133, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.9898864809081527, |
|
"grad_norm": 0.3312240597270669, |
|
"learning_rate": 3.219562758302597e-10, |
|
"loss": 0.3094, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 1.998142414860681, |
|
"grad_norm": 0.32676874574809356, |
|
"learning_rate": 8.943416513751412e-12, |
|
"loss": 0.3136, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 1.9997936016511868, |
|
"eval_loss": 0.49619075655937195, |
|
"eval_runtime": 93.5055, |
|
"eval_samples_per_second": 5.754, |
|
"eval_steps_per_second": 0.053, |
|
"step": 4844 |
|
}, |
|
{ |
|
"epoch": 1.9997936016511868, |
|
"step": 4844, |
|
"total_flos": 2.009123424043008e+16, |
|
"train_loss": 0.08715950149479826, |
|
"train_runtime": 137060.7981, |
|
"train_samples_per_second": 4.524, |
|
"train_steps_per_second": 0.035 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 4844, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.009123424043008e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|