|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5420054200542005, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0018066847335140017, |
|
"grad_norm": 2.8394027200091747, |
|
"learning_rate": 0.0, |
|
"loss": 0.8251, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0036133694670280035, |
|
"grad_norm": 2.8846288476633135, |
|
"learning_rate": 1.0714285714285714e-06, |
|
"loss": 0.8284, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005420054200542005, |
|
"grad_norm": 2.9025021758600973, |
|
"learning_rate": 2.1428571428571427e-06, |
|
"loss": 0.8427, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007226738934056007, |
|
"grad_norm": 2.8006966406467435, |
|
"learning_rate": 3.2142857142857143e-06, |
|
"loss": 0.8377, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009033423667570008, |
|
"grad_norm": 2.5933707261583208, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 0.8227, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01084010840108401, |
|
"grad_norm": 2.0871248987589857, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 0.8095, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.012646793134598013, |
|
"grad_norm": 1.5051624332871105, |
|
"learning_rate": 6.428571428571429e-06, |
|
"loss": 0.7739, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.014453477868112014, |
|
"grad_norm": 1.390065658542279, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.7734, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.016260162601626018, |
|
"grad_norm": 1.3346375437162221, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.7655, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.018066847335140017, |
|
"grad_norm": 2.368502454719072, |
|
"learning_rate": 9.642857142857144e-06, |
|
"loss": 0.7502, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01987353206865402, |
|
"grad_norm": 2.355019739749437, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 0.7355, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02168021680216802, |
|
"grad_norm": 2.235397645486187, |
|
"learning_rate": 1.1785714285714286e-05, |
|
"loss": 0.7489, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.023486901535682024, |
|
"grad_norm": 1.8443725862230025, |
|
"learning_rate": 1.2857142857142857e-05, |
|
"loss": 0.7253, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.025293586269196026, |
|
"grad_norm": 3.9119844600464275, |
|
"learning_rate": 1.3928571428571429e-05, |
|
"loss": 0.7285, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02710027100271003, |
|
"grad_norm": 1.6411422206825859, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.7284, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.028906955736224028, |
|
"grad_norm": 1.3411860085380753, |
|
"learning_rate": 1.6071428571428572e-05, |
|
"loss": 0.7105, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.03071364046973803, |
|
"grad_norm": 1.157320872714633, |
|
"learning_rate": 1.7142857142857142e-05, |
|
"loss": 0.7068, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.032520325203252036, |
|
"grad_norm": 0.7462236915744604, |
|
"learning_rate": 1.8214285714285712e-05, |
|
"loss": 0.6926, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03432700993676603, |
|
"grad_norm": 0.8473541980236234, |
|
"learning_rate": 1.928571428571429e-05, |
|
"loss": 0.6858, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.036133694670280034, |
|
"grad_norm": 0.7215438560743268, |
|
"learning_rate": 2.0357142857142858e-05, |
|
"loss": 0.6605, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.037940379403794036, |
|
"grad_norm": 0.6345550993430565, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 0.6668, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.03974706413730804, |
|
"grad_norm": 0.6182590458724911, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.6559, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04155374887082204, |
|
"grad_norm": 0.623617204927979, |
|
"learning_rate": 2.357142857142857e-05, |
|
"loss": 0.6561, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04336043360433604, |
|
"grad_norm": 0.5049024834505677, |
|
"learning_rate": 2.464285714285714e-05, |
|
"loss": 0.6613, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.045167118337850046, |
|
"grad_norm": 0.5485283866894338, |
|
"learning_rate": 2.5714285714285714e-05, |
|
"loss": 0.6429, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04697380307136405, |
|
"grad_norm": 0.5146774550210349, |
|
"learning_rate": 2.6785714285714288e-05, |
|
"loss": 0.6572, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.04878048780487805, |
|
"grad_norm": 0.4849533483489942, |
|
"learning_rate": 2.7857142857142858e-05, |
|
"loss": 0.6352, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.05058717253839205, |
|
"grad_norm": 0.4679753904023602, |
|
"learning_rate": 2.892857142857143e-05, |
|
"loss": 0.6372, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.052393857271906055, |
|
"grad_norm": 0.49823982670847244, |
|
"learning_rate": 3e-05, |
|
"loss": 0.649, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05420054200542006, |
|
"grad_norm": 0.3998487451158294, |
|
"learning_rate": 2.9999731440137413e-05, |
|
"loss": 0.6492, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05600722673893405, |
|
"grad_norm": 0.4037683931810622, |
|
"learning_rate": 2.9998925770166232e-05, |
|
"loss": 0.6384, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.057813911472448055, |
|
"grad_norm": 0.3869283159888842, |
|
"learning_rate": 2.9997583018935875e-05, |
|
"loss": 0.6266, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.05962059620596206, |
|
"grad_norm": 0.4105676527710418, |
|
"learning_rate": 2.9995703234527553e-05, |
|
"loss": 0.6365, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06142728093947606, |
|
"grad_norm": 0.3428905600355113, |
|
"learning_rate": 2.999328648425255e-05, |
|
"loss": 0.6236, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06323396567299007, |
|
"grad_norm": 0.3666139696974658, |
|
"learning_rate": 2.999033285464982e-05, |
|
"loss": 0.6293, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06504065040650407, |
|
"grad_norm": 0.369662441500822, |
|
"learning_rate": 2.9986842451482876e-05, |
|
"loss": 0.6251, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.06684733514001806, |
|
"grad_norm": 0.3677980333288224, |
|
"learning_rate": 2.9982815399736008e-05, |
|
"loss": 0.6297, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.06865401987353206, |
|
"grad_norm": 0.3223035029779702, |
|
"learning_rate": 2.9978251843609816e-05, |
|
"loss": 0.6219, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07046070460704607, |
|
"grad_norm": 0.37293367784153375, |
|
"learning_rate": 2.9973151946516027e-05, |
|
"loss": 0.6242, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07226738934056007, |
|
"grad_norm": 0.322511907065973, |
|
"learning_rate": 2.996751589107167e-05, |
|
"loss": 0.6122, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.34111037181843756, |
|
"learning_rate": 2.9961343879092512e-05, |
|
"loss": 0.6186, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.07588075880758807, |
|
"grad_norm": 0.30130338237811194, |
|
"learning_rate": 2.9954636131585845e-05, |
|
"loss": 0.6113, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.07768744354110207, |
|
"grad_norm": 0.32753158641617225, |
|
"learning_rate": 2.9947392888742566e-05, |
|
"loss": 0.6112, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.07949412827461608, |
|
"grad_norm": 0.3060506987492754, |
|
"learning_rate": 2.993961440992859e-05, |
|
"loss": 0.6056, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08130081300813008, |
|
"grad_norm": 0.5813127875924191, |
|
"learning_rate": 2.993130097367553e-05, |
|
"loss": 0.6131, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08310749774164408, |
|
"grad_norm": 0.30546110146813144, |
|
"learning_rate": 2.9922452877670775e-05, |
|
"loss": 0.6144, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08491418247515808, |
|
"grad_norm": 0.3009821627626054, |
|
"learning_rate": 2.991307043874677e-05, |
|
"loss": 0.617, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.08672086720867209, |
|
"grad_norm": 0.2835661481876977, |
|
"learning_rate": 2.9903153992869734e-05, |
|
"loss": 0.6031, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.08852755194218609, |
|
"grad_norm": 0.31548723118407335, |
|
"learning_rate": 2.989270389512756e-05, |
|
"loss": 0.6107, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09033423667570009, |
|
"grad_norm": 0.2864309330552123, |
|
"learning_rate": 2.988172051971717e-05, |
|
"loss": 0.6097, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0921409214092141, |
|
"grad_norm": 0.29953498642613263, |
|
"learning_rate": 2.9870204259931062e-05, |
|
"loss": 0.5998, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0939476061427281, |
|
"grad_norm": 0.2782739210196595, |
|
"learning_rate": 2.9858155528143256e-05, |
|
"loss": 0.6169, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0957542908762421, |
|
"grad_norm": 0.3431216549638938, |
|
"learning_rate": 2.9845574755794522e-05, |
|
"loss": 0.6024, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.0975609756097561, |
|
"grad_norm": 0.26425217740885376, |
|
"learning_rate": 2.9832462393376926e-05, |
|
"loss": 0.5921, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0993676603432701, |
|
"grad_norm": 0.3422702392148149, |
|
"learning_rate": 2.9818818910417706e-05, |
|
"loss": 0.6079, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1011743450767841, |
|
"grad_norm": 0.3197424015440345, |
|
"learning_rate": 2.9804644795462437e-05, |
|
"loss": 0.612, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10298102981029811, |
|
"grad_norm": 0.3035603795716621, |
|
"learning_rate": 2.9789940556057574e-05, |
|
"loss": 0.5975, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.10478771454381211, |
|
"grad_norm": 0.3410992160925094, |
|
"learning_rate": 2.9774706718732255e-05, |
|
"loss": 0.6058, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.10659439927732611, |
|
"grad_norm": 0.27322743279862205, |
|
"learning_rate": 2.9758943828979444e-05, |
|
"loss": 0.592, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.10840108401084012, |
|
"grad_norm": 0.3639122458989281, |
|
"learning_rate": 2.9742652451236414e-05, |
|
"loss": 0.5932, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1102077687443541, |
|
"grad_norm": 0.3014533566173352, |
|
"learning_rate": 2.972583316886451e-05, |
|
"loss": 0.597, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1120144534778681, |
|
"grad_norm": 0.34457895149113055, |
|
"learning_rate": 2.9708486584128303e-05, |
|
"loss": 0.6017, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.11382113821138211, |
|
"grad_norm": 0.30295456441359353, |
|
"learning_rate": 2.9690613318173966e-05, |
|
"loss": 0.6015, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.11562782294489611, |
|
"grad_norm": 0.2899317610304254, |
|
"learning_rate": 2.9672214011007087e-05, |
|
"loss": 0.5999, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11743450767841011, |
|
"grad_norm": 0.29733861402573497, |
|
"learning_rate": 2.9653289321469715e-05, |
|
"loss": 0.5939, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.11924119241192412, |
|
"grad_norm": 0.27257724173917824, |
|
"learning_rate": 2.9633839927216793e-05, |
|
"loss": 0.5969, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.12104787714543812, |
|
"grad_norm": 0.2898338043822156, |
|
"learning_rate": 2.9613866524691867e-05, |
|
"loss": 0.5873, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.12285456187895212, |
|
"grad_norm": 0.2884348823630205, |
|
"learning_rate": 2.9593369829102173e-05, |
|
"loss": 0.6026, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12466124661246612, |
|
"grad_norm": 0.28168278457326457, |
|
"learning_rate": 2.957235057439301e-05, |
|
"loss": 0.6042, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.12646793134598014, |
|
"grad_norm": 0.2869178339893195, |
|
"learning_rate": 2.955080951322147e-05, |
|
"loss": 0.6085, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12827461607949414, |
|
"grad_norm": 0.3037977880321082, |
|
"learning_rate": 2.9528747416929467e-05, |
|
"loss": 0.5817, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.13008130081300814, |
|
"grad_norm": 0.28914281369381906, |
|
"learning_rate": 2.9506165075516148e-05, |
|
"loss": 0.5985, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13188798554652212, |
|
"grad_norm": 0.3680574307953626, |
|
"learning_rate": 2.9483063297609577e-05, |
|
"loss": 0.5975, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.13369467028003612, |
|
"grad_norm": 0.3174888148098996, |
|
"learning_rate": 2.9459442910437798e-05, |
|
"loss": 0.5891, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.13550135501355012, |
|
"grad_norm": 0.31913893930146975, |
|
"learning_rate": 2.94353047597992e-05, |
|
"loss": 0.5888, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.13730803974706413, |
|
"grad_norm": 0.28949770075545406, |
|
"learning_rate": 2.941064971003224e-05, |
|
"loss": 0.5841, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.13911472448057813, |
|
"grad_norm": 0.3247755531851961, |
|
"learning_rate": 2.9385478643984484e-05, |
|
"loss": 0.5795, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.14092140921409213, |
|
"grad_norm": 0.3511957059688905, |
|
"learning_rate": 2.9359792462981007e-05, |
|
"loss": 0.5905, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.14272809394760613, |
|
"grad_norm": 0.302521463157134, |
|
"learning_rate": 2.9333592086792113e-05, |
|
"loss": 0.5985, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.14453477868112014, |
|
"grad_norm": 0.3549679822021618, |
|
"learning_rate": 2.9306878453600382e-05, |
|
"loss": 0.6074, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14634146341463414, |
|
"grad_norm": 0.3198459509652125, |
|
"learning_rate": 2.9279652519967105e-05, |
|
"loss": 0.5922, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.3576854570877617, |
|
"learning_rate": 2.9251915260798024e-05, |
|
"loss": 0.5846, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.14995483288166214, |
|
"grad_norm": 0.3452717739532111, |
|
"learning_rate": 2.9223667669308395e-05, |
|
"loss": 0.5949, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.15176151761517614, |
|
"grad_norm": 0.35130798066442276, |
|
"learning_rate": 2.9194910756987464e-05, |
|
"loss": 0.5882, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.15356820234869015, |
|
"grad_norm": 0.2979218241802279, |
|
"learning_rate": 2.9165645553562215e-05, |
|
"loss": 0.5911, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.15537488708220415, |
|
"grad_norm": 0.30352540649162485, |
|
"learning_rate": 2.9135873106960525e-05, |
|
"loss": 0.5902, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.15718157181571815, |
|
"grad_norm": 0.3047300397649292, |
|
"learning_rate": 2.9105594483273603e-05, |
|
"loss": 0.5927, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.15898825654923215, |
|
"grad_norm": 0.3034444492388091, |
|
"learning_rate": 2.9074810766717865e-05, |
|
"loss": 0.5969, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.16079494128274616, |
|
"grad_norm": 0.31726687931722786, |
|
"learning_rate": 2.904352305959606e-05, |
|
"loss": 0.5938, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.16260162601626016, |
|
"grad_norm": 0.2844238571820103, |
|
"learning_rate": 2.9011732482257835e-05, |
|
"loss": 0.5856, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16440831074977416, |
|
"grad_norm": 0.33861384709236475, |
|
"learning_rate": 2.89794401730596e-05, |
|
"loss": 0.5835, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.16621499548328816, |
|
"grad_norm": 0.31986755243990583, |
|
"learning_rate": 2.894664728832377e-05, |
|
"loss": 0.5875, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.16802168021680217, |
|
"grad_norm": 0.31772953202959486, |
|
"learning_rate": 2.8913355002297367e-05, |
|
"loss": 0.5937, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.16982836495031617, |
|
"grad_norm": 0.3404713775893871, |
|
"learning_rate": 2.887956450710995e-05, |
|
"loss": 0.5933, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.17163504968383017, |
|
"grad_norm": 0.3270438057428047, |
|
"learning_rate": 2.8845277012730963e-05, |
|
"loss": 0.5854, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17344173441734417, |
|
"grad_norm": 0.2813265419298624, |
|
"learning_rate": 2.8810493746926364e-05, |
|
"loss": 0.5849, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.17524841915085818, |
|
"grad_norm": 0.30908563683812523, |
|
"learning_rate": 2.87752159552147e-05, |
|
"loss": 0.5906, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.17705510388437218, |
|
"grad_norm": 0.28172567784588337, |
|
"learning_rate": 2.87394449008225e-05, |
|
"loss": 0.5836, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.17886178861788618, |
|
"grad_norm": 0.32776100679221937, |
|
"learning_rate": 2.8703181864639013e-05, |
|
"loss": 0.5968, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.18066847335140018, |
|
"grad_norm": 0.2829820594340673, |
|
"learning_rate": 2.8666428145170385e-05, |
|
"loss": 0.5698, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18247515808491419, |
|
"grad_norm": 0.2634972832550828, |
|
"learning_rate": 2.8629185058493116e-05, |
|
"loss": 0.5858, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.1842818428184282, |
|
"grad_norm": 0.33074374876902657, |
|
"learning_rate": 2.8591453938206985e-05, |
|
"loss": 0.6042, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.1860885275519422, |
|
"grad_norm": 0.2755244711155326, |
|
"learning_rate": 2.8553236135387247e-05, |
|
"loss": 0.5827, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.1878952122854562, |
|
"grad_norm": 0.3306960703512347, |
|
"learning_rate": 2.8514533018536286e-05, |
|
"loss": 0.5805, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1897018970189702, |
|
"grad_norm": 0.27767659256895755, |
|
"learning_rate": 2.8475345973534605e-05, |
|
"loss": 0.5731, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.1915085817524842, |
|
"grad_norm": 0.43715096447176, |
|
"learning_rate": 2.8435676403591193e-05, |
|
"loss": 0.5759, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.1933152664859982, |
|
"grad_norm": 0.28202224774674794, |
|
"learning_rate": 2.8395525729193284e-05, |
|
"loss": 0.5821, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.1951219512195122, |
|
"grad_norm": 0.2549341033504584, |
|
"learning_rate": 2.835489538805548e-05, |
|
"loss": 0.5801, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.1969286359530262, |
|
"grad_norm": 0.2885151330973423, |
|
"learning_rate": 2.8313786835068314e-05, |
|
"loss": 0.5891, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.1987353206865402, |
|
"grad_norm": 0.2795891601436777, |
|
"learning_rate": 2.8272201542246077e-05, |
|
"loss": 0.5707, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2005420054200542, |
|
"grad_norm": 0.2955873960209813, |
|
"learning_rate": 2.8230140998674185e-05, |
|
"loss": 0.5903, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2023486901535682, |
|
"grad_norm": 0.2686848479774233, |
|
"learning_rate": 2.8187606710455807e-05, |
|
"loss": 0.5836, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2041553748870822, |
|
"grad_norm": 0.30130327978633054, |
|
"learning_rate": 2.8144600200657953e-05, |
|
"loss": 0.5882, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.20596205962059622, |
|
"grad_norm": 0.3094839897177016, |
|
"learning_rate": 2.8101123009256946e-05, |
|
"loss": 0.5712, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.20776874435411022, |
|
"grad_norm": 0.26683135581406914, |
|
"learning_rate": 2.8057176693083253e-05, |
|
"loss": 0.5873, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.20957542908762422, |
|
"grad_norm": 0.2990360414829194, |
|
"learning_rate": 2.8012762825765763e-05, |
|
"loss": 0.5804, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.21138211382113822, |
|
"grad_norm": 0.3133148730057432, |
|
"learning_rate": 2.7967882997675424e-05, |
|
"loss": 0.5919, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.21318879855465223, |
|
"grad_norm": 0.3038806250211378, |
|
"learning_rate": 2.7922538815868287e-05, |
|
"loss": 0.5792, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.21499548328816623, |
|
"grad_norm": 0.26808000519659025, |
|
"learning_rate": 2.7876731904027994e-05, |
|
"loss": 0.5694, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.21680216802168023, |
|
"grad_norm": 0.3124504773400534, |
|
"learning_rate": 2.78304639024076e-05, |
|
"loss": 0.5885, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2186088527551942, |
|
"grad_norm": 0.2537155548055984, |
|
"learning_rate": 2.7783736467770863e-05, |
|
"loss": 0.5837, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2204155374887082, |
|
"grad_norm": 0.2976068165908906, |
|
"learning_rate": 2.7736551273332908e-05, |
|
"loss": 0.5888, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.28922047785965427, |
|
"learning_rate": 2.7688910008700305e-05, |
|
"loss": 0.5789, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.2240289069557362, |
|
"grad_norm": 0.24080562112321108, |
|
"learning_rate": 2.764081437981059e-05, |
|
"loss": 0.5647, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.22583559168925021, |
|
"grad_norm": 0.27448072752877173, |
|
"learning_rate": 2.7592266108871158e-05, |
|
"loss": 0.572, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.22764227642276422, |
|
"grad_norm": 0.2926601943246854, |
|
"learning_rate": 2.754326693429761e-05, |
|
"loss": 0.5845, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.22944896115627822, |
|
"grad_norm": 0.2556901465713744, |
|
"learning_rate": 2.7493818610651493e-05, |
|
"loss": 0.5772, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.23125564588979222, |
|
"grad_norm": 0.2561501825860112, |
|
"learning_rate": 2.744392290857747e-05, |
|
"loss": 0.5765, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.23306233062330622, |
|
"grad_norm": 0.27007071971247465, |
|
"learning_rate": 2.7393581614739924e-05, |
|
"loss": 0.5715, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.23486901535682023, |
|
"grad_norm": 0.2720070067806271, |
|
"learning_rate": 2.7342796531758984e-05, |
|
"loss": 0.5899, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23667570009033423, |
|
"grad_norm": 0.27867638614924434, |
|
"learning_rate": 2.729156947814598e-05, |
|
"loss": 0.5847, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.23848238482384823, |
|
"grad_norm": 0.26134114963816046, |
|
"learning_rate": 2.7239902288238297e-05, |
|
"loss": 0.5632, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.24028906955736223, |
|
"grad_norm": 0.26639406666272125, |
|
"learning_rate": 2.7187796812133733e-05, |
|
"loss": 0.579, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.24209575429087624, |
|
"grad_norm": 0.2876739206328272, |
|
"learning_rate": 2.7135254915624213e-05, |
|
"loss": 0.5795, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.24390243902439024, |
|
"grad_norm": 0.27235240180814757, |
|
"learning_rate": 2.708227848012901e-05, |
|
"loss": 0.5771, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24570912375790424, |
|
"grad_norm": 0.2720723951174645, |
|
"learning_rate": 2.7028869402627357e-05, |
|
"loss": 0.5801, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.24751580849141824, |
|
"grad_norm": 0.26926686974517616, |
|
"learning_rate": 2.6975029595590523e-05, |
|
"loss": 0.5639, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.24932249322493225, |
|
"grad_norm": 0.29325483042034345, |
|
"learning_rate": 2.6920760986913332e-05, |
|
"loss": 0.5808, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.25112917795844625, |
|
"grad_norm": 0.2698556596124732, |
|
"learning_rate": 2.6866065519845124e-05, |
|
"loss": 0.584, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.2529358626919603, |
|
"grad_norm": 0.2786316780374937, |
|
"learning_rate": 2.681094515292018e-05, |
|
"loss": 0.583, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.25474254742547425, |
|
"grad_norm": 0.24933279116696466, |
|
"learning_rate": 2.6755401859887598e-05, |
|
"loss": 0.5641, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.2565492321589883, |
|
"grad_norm": 0.3122326056143099, |
|
"learning_rate": 2.6699437629640595e-05, |
|
"loss": 0.5841, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.25835591689250226, |
|
"grad_norm": 0.28090053633091877, |
|
"learning_rate": 2.6643054466145297e-05, |
|
"loss": 0.5758, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.2601626016260163, |
|
"grad_norm": 0.27419051211590195, |
|
"learning_rate": 2.6586254388368995e-05, |
|
"loss": 0.5642, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.26196928635953026, |
|
"grad_norm": 0.25632244502638984, |
|
"learning_rate": 2.652903943020783e-05, |
|
"loss": 0.5769, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26377597109304424, |
|
"grad_norm": 0.27493807993448327, |
|
"learning_rate": 2.647141164041398e-05, |
|
"loss": 0.5814, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.26558265582655827, |
|
"grad_norm": 0.2683678706046532, |
|
"learning_rate": 2.641337308252228e-05, |
|
"loss": 0.571, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.26738934056007224, |
|
"grad_norm": 0.29628453279860983, |
|
"learning_rate": 2.6354925834776346e-05, |
|
"loss": 0.5726, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.26919602529358627, |
|
"grad_norm": 0.28402654957400403, |
|
"learning_rate": 2.6296071990054167e-05, |
|
"loss": 0.5696, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.27100271002710025, |
|
"grad_norm": 0.2607317023496585, |
|
"learning_rate": 2.6236813655793123e-05, |
|
"loss": 0.5798, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2728093947606143, |
|
"grad_norm": 0.273647796185402, |
|
"learning_rate": 2.617715295391457e-05, |
|
"loss": 0.5614, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.27461607949412825, |
|
"grad_norm": 0.2545529593791399, |
|
"learning_rate": 2.6117092020747824e-05, |
|
"loss": 0.5692, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2764227642276423, |
|
"grad_norm": 0.274532875914137, |
|
"learning_rate": 2.6056633006953677e-05, |
|
"loss": 0.5694, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.27822944896115626, |
|
"grad_norm": 0.26288128426350693, |
|
"learning_rate": 2.5995778077447393e-05, |
|
"loss": 0.5731, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.2800361336946703, |
|
"grad_norm": 0.30284194692462474, |
|
"learning_rate": 2.5934529411321174e-05, |
|
"loss": 0.5658, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.28184281842818426, |
|
"grad_norm": 0.24452133079312888, |
|
"learning_rate": 2.587288920176613e-05, |
|
"loss": 0.5657, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2836495031616983, |
|
"grad_norm": 0.28048503748226455, |
|
"learning_rate": 2.581085965599375e-05, |
|
"loss": 0.5787, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.28545618789521227, |
|
"grad_norm": 0.26599106457147237, |
|
"learning_rate": 2.5748442995156882e-05, |
|
"loss": 0.5763, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.2872628726287263, |
|
"grad_norm": 0.2839977556276154, |
|
"learning_rate": 2.5685641454270172e-05, |
|
"loss": 0.557, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.28906955736224027, |
|
"grad_norm": 0.3026219403510213, |
|
"learning_rate": 2.5622457282130046e-05, |
|
"loss": 0.5662, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2908762420957543, |
|
"grad_norm": 0.25173949399581536, |
|
"learning_rate": 2.5558892741234173e-05, |
|
"loss": 0.5742, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.2926829268292683, |
|
"grad_norm": 0.3113881337024894, |
|
"learning_rate": 2.5494950107700482e-05, |
|
"loss": 0.5602, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.2944896115627823, |
|
"grad_norm": 0.26454819092843895, |
|
"learning_rate": 2.5430631671185616e-05, |
|
"loss": 0.5682, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.27612792011239423, |
|
"learning_rate": 2.5365939734802973e-05, |
|
"loss": 0.5777, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.2981029810298103, |
|
"grad_norm": 0.2546081323722588, |
|
"learning_rate": 2.5300876615040223e-05, |
|
"loss": 0.5686, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2999096657633243, |
|
"grad_norm": 0.2963445091229743, |
|
"learning_rate": 2.523544464167637e-05, |
|
"loss": 0.5752, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3017163504968383, |
|
"grad_norm": 0.256196797189933, |
|
"learning_rate": 2.5169646157698313e-05, |
|
"loss": 0.5738, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3035230352303523, |
|
"grad_norm": 0.27645696746126036, |
|
"learning_rate": 2.5103483519216964e-05, |
|
"loss": 0.5724, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3053297199638663, |
|
"grad_norm": 0.26064430234348385, |
|
"learning_rate": 2.5036959095382875e-05, |
|
"loss": 0.5644, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.3071364046973803, |
|
"grad_norm": 0.2883428318862628, |
|
"learning_rate": 2.4970075268301388e-05, |
|
"loss": 0.5781, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3089430894308943, |
|
"grad_norm": 0.2632924938897706, |
|
"learning_rate": 2.4902834432947353e-05, |
|
"loss": 0.5612, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.3107497741644083, |
|
"grad_norm": 0.27223503577264474, |
|
"learning_rate": 2.4835238997079382e-05, |
|
"loss": 0.5681, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.31255645889792233, |
|
"grad_norm": 0.2777996613880595, |
|
"learning_rate": 2.4767291381153603e-05, |
|
"loss": 0.5713, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3143631436314363, |
|
"grad_norm": 0.23698042977211534, |
|
"learning_rate": 2.4698994018236994e-05, |
|
"loss": 0.5624, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.31616982836495033, |
|
"grad_norm": 0.2942107184208593, |
|
"learning_rate": 2.4630349353920284e-05, |
|
"loss": 0.567, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3179765130984643, |
|
"grad_norm": 0.25517257945021476, |
|
"learning_rate": 2.4561359846230346e-05, |
|
"loss": 0.5809, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.31978319783197834, |
|
"grad_norm": 0.24673263637056791, |
|
"learning_rate": 2.4492027965542217e-05, |
|
"loss": 0.5566, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.3215898825654923, |
|
"grad_norm": 0.2654696054555115, |
|
"learning_rate": 2.44223561944906e-05, |
|
"loss": 0.5636, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.32339656729900634, |
|
"grad_norm": 0.2509205757784683, |
|
"learning_rate": 2.4352347027881003e-05, |
|
"loss": 0.573, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.3252032520325203, |
|
"grad_norm": 0.2782257679089532, |
|
"learning_rate": 2.4282002972600382e-05, |
|
"loss": 0.5811, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32700993676603435, |
|
"grad_norm": 0.261975491450884, |
|
"learning_rate": 2.4211326547527377e-05, |
|
"loss": 0.5595, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.3288166214995483, |
|
"grad_norm": 0.2732747607543431, |
|
"learning_rate": 2.4140320283442125e-05, |
|
"loss": 0.5698, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.33062330623306235, |
|
"grad_norm": 0.2654409094285421, |
|
"learning_rate": 2.4068986722935625e-05, |
|
"loss": 0.5802, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.3324299909665763, |
|
"grad_norm": 0.27364065775420887, |
|
"learning_rate": 2.3997328420318705e-05, |
|
"loss": 0.5763, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.33423667570009036, |
|
"grad_norm": 0.26988406932399184, |
|
"learning_rate": 2.3925347941530556e-05, |
|
"loss": 0.573, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.33604336043360433, |
|
"grad_norm": 0.24913498548586777, |
|
"learning_rate": 2.3853047864046843e-05, |
|
"loss": 0.5678, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.33785004516711836, |
|
"grad_norm": 0.26662821751854754, |
|
"learning_rate": 2.3780430776787413e-05, |
|
"loss": 0.5652, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.33965672990063234, |
|
"grad_norm": 0.24027260391804556, |
|
"learning_rate": 2.3707499280023604e-05, |
|
"loss": 0.5654, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.34146341463414637, |
|
"grad_norm": 0.26313325975109614, |
|
"learning_rate": 2.3634255985285104e-05, |
|
"loss": 0.5726, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.34327009936766034, |
|
"grad_norm": 0.28321211914229444, |
|
"learning_rate": 2.356070351526648e-05, |
|
"loss": 0.5615, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.34507678410117437, |
|
"grad_norm": 0.2472498045788604, |
|
"learning_rate": 2.348684450373322e-05, |
|
"loss": 0.5593, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.34688346883468835, |
|
"grad_norm": 0.240143859643124, |
|
"learning_rate": 2.3412681595427467e-05, |
|
"loss": 0.5667, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3486901535682023, |
|
"grad_norm": 0.27894029380988317, |
|
"learning_rate": 2.3338217445973268e-05, |
|
"loss": 0.5683, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.35049683830171635, |
|
"grad_norm": 0.2514644044619394, |
|
"learning_rate": 2.3263454721781537e-05, |
|
"loss": 0.5699, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.3523035230352303, |
|
"grad_norm": 0.2802978993269217, |
|
"learning_rate": 2.318839609995453e-05, |
|
"loss": 0.5534, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.35411020776874436, |
|
"grad_norm": 0.25311337469847905, |
|
"learning_rate": 2.3113044268189995e-05, |
|
"loss": 0.5664, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.35591689250225833, |
|
"grad_norm": 0.24577349249807215, |
|
"learning_rate": 2.303740192468495e-05, |
|
"loss": 0.5661, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.35772357723577236, |
|
"grad_norm": 0.26381231166122326, |
|
"learning_rate": 2.2961471778039045e-05, |
|
"loss": 0.5434, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.35953026196928634, |
|
"grad_norm": 0.2605543346247411, |
|
"learning_rate": 2.288525654715757e-05, |
|
"loss": 0.5666, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.36133694670280037, |
|
"grad_norm": 0.27062115319955804, |
|
"learning_rate": 2.280875896115413e-05, |
|
"loss": 0.5554, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36314363143631434, |
|
"grad_norm": 0.2528702856534622, |
|
"learning_rate": 2.2731981759252876e-05, |
|
"loss": 0.55, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.36495031616982837, |
|
"grad_norm": 0.24594198622850097, |
|
"learning_rate": 2.2654927690690445e-05, |
|
"loss": 0.5766, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.36675700090334235, |
|
"grad_norm": 0.254972571523128, |
|
"learning_rate": 2.257759951461752e-05, |
|
"loss": 0.5636, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.3685636856368564, |
|
"grad_norm": 0.2806803673734258, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.5768, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.23720480295379937, |
|
"learning_rate": 2.24221319255199e-05, |
|
"loss": 0.5672, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3721770551038844, |
|
"grad_norm": 0.2694240775770153, |
|
"learning_rate": 2.234399807947579e-05, |
|
"loss": 0.554, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.37398373983739835, |
|
"grad_norm": 0.23479458581037854, |
|
"learning_rate": 2.2265601259683e-05, |
|
"loss": 0.5619, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.3757904245709124, |
|
"grad_norm": 0.24511933133656108, |
|
"learning_rate": 2.2186944273373426e-05, |
|
"loss": 0.5494, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.37759710930442636, |
|
"grad_norm": 0.2457782899671268, |
|
"learning_rate": 2.210802993709498e-05, |
|
"loss": 0.5709, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.3794037940379404, |
|
"grad_norm": 0.2573918967778832, |
|
"learning_rate": 2.202886107661078e-05, |
|
"loss": 0.5671, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.38121047877145436, |
|
"grad_norm": 0.24827518412990313, |
|
"learning_rate": 2.1949440526797928e-05, |
|
"loss": 0.5664, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.3830171635049684, |
|
"grad_norm": 0.2510685180298552, |
|
"learning_rate": 2.1869771131546015e-05, |
|
"loss": 0.5742, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.38482384823848237, |
|
"grad_norm": 0.26651654550259046, |
|
"learning_rate": 2.178985574365529e-05, |
|
"loss": 0.5745, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.3866305329719964, |
|
"grad_norm": 0.2582963988078575, |
|
"learning_rate": 2.170969722473449e-05, |
|
"loss": 0.5604, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.3884372177055104, |
|
"grad_norm": 0.3321141523101486, |
|
"learning_rate": 2.1629298445098403e-05, |
|
"loss": 0.5872, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.3902439024390244, |
|
"grad_norm": 0.24254906410847127, |
|
"learning_rate": 2.154866228366505e-05, |
|
"loss": 0.5759, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.3920505871725384, |
|
"grad_norm": 0.2429155474316322, |
|
"learning_rate": 2.146779162785263e-05, |
|
"loss": 0.576, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.3938572719060524, |
|
"grad_norm": 0.25427260612915886, |
|
"learning_rate": 2.138668937347609e-05, |
|
"loss": 0.5677, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.3956639566395664, |
|
"grad_norm": 0.23326106828082938, |
|
"learning_rate": 2.1305358424643484e-05, |
|
"loss": 0.5629, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.3974706413730804, |
|
"grad_norm": 0.2436559501381673, |
|
"learning_rate": 2.1223801693651927e-05, |
|
"loss": 0.5653, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3992773261065944, |
|
"grad_norm": 0.25318225968369956, |
|
"learning_rate": 2.114202210088336e-05, |
|
"loss": 0.5542, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.4010840108401084, |
|
"grad_norm": 0.2462046635865081, |
|
"learning_rate": 2.106002257469993e-05, |
|
"loss": 0.5714, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.4028906955736224, |
|
"grad_norm": 0.2360999410411674, |
|
"learning_rate": 2.0977806051339172e-05, |
|
"loss": 0.5542, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4046973803071364, |
|
"grad_norm": 0.24470631417214034, |
|
"learning_rate": 2.0895375474808857e-05, |
|
"loss": 0.5584, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4065040650406504, |
|
"grad_norm": 0.265905351487726, |
|
"learning_rate": 2.0812733796781544e-05, |
|
"loss": 0.5638, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.4083107497741644, |
|
"grad_norm": 0.23676860373034014, |
|
"learning_rate": 2.0729883976488936e-05, |
|
"loss": 0.5694, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.4101174345076784, |
|
"grad_norm": 0.26382680535395553, |
|
"learning_rate": 2.064682898061588e-05, |
|
"loss": 0.5739, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.41192411924119243, |
|
"grad_norm": 0.2552385632003772, |
|
"learning_rate": 2.0563571783194146e-05, |
|
"loss": 0.5606, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4137308039747064, |
|
"grad_norm": 0.22943590877340414, |
|
"learning_rate": 2.0480115365495928e-05, |
|
"loss": 0.563, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.41553748870822044, |
|
"grad_norm": 0.2594052703130071, |
|
"learning_rate": 2.0396462715927107e-05, |
|
"loss": 0.5651, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4173441734417344, |
|
"grad_norm": 0.23337937854939458, |
|
"learning_rate": 2.0312616829920222e-05, |
|
"loss": 0.5643, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.41915085817524844, |
|
"grad_norm": 0.23235825280419914, |
|
"learning_rate": 2.022858070982723e-05, |
|
"loss": 0.5673, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.4209575429087624, |
|
"grad_norm": 0.22585464244623496, |
|
"learning_rate": 2.0144357364811973e-05, |
|
"loss": 0.5493, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.42276422764227645, |
|
"grad_norm": 0.24147478621396193, |
|
"learning_rate": 2.0059949810742452e-05, |
|
"loss": 0.5643, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.4245709123757904, |
|
"grad_norm": 0.211585416294672, |
|
"learning_rate": 1.997536107008281e-05, |
|
"loss": 0.5598, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.42637759710930445, |
|
"grad_norm": 0.24840733707932, |
|
"learning_rate": 1.9890594171785128e-05, |
|
"loss": 0.5712, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4281842818428184, |
|
"grad_norm": 0.23294417474994988, |
|
"learning_rate": 1.9805652151180945e-05, |
|
"loss": 0.571, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.42999096657633246, |
|
"grad_norm": 0.22241956591220233, |
|
"learning_rate": 1.972053804987258e-05, |
|
"loss": 0.5791, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.43179765130984643, |
|
"grad_norm": 0.2432446757540741, |
|
"learning_rate": 1.963525491562421e-05, |
|
"loss": 0.5603, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.43360433604336046, |
|
"grad_norm": 0.23289202395477654, |
|
"learning_rate": 1.954980580225275e-05, |
|
"loss": 0.5607, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.43541102077687444, |
|
"grad_norm": 0.2423884205399499, |
|
"learning_rate": 1.946419376951848e-05, |
|
"loss": 0.5595, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.4372177055103884, |
|
"grad_norm": 0.24367683500187448, |
|
"learning_rate": 1.9378421883015505e-05, |
|
"loss": 0.5599, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.43902439024390244, |
|
"grad_norm": 0.2448864048948071, |
|
"learning_rate": 1.9292493214061953e-05, |
|
"loss": 0.5647, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4408310749774164, |
|
"grad_norm": 0.23594363597702456, |
|
"learning_rate": 1.9206410839590042e-05, |
|
"loss": 0.5711, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.44263775971093045, |
|
"grad_norm": 0.29161925076798506, |
|
"learning_rate": 1.9120177842035853e-05, |
|
"loss": 0.5632, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.24178286345391872, |
|
"learning_rate": 1.9033797309228984e-05, |
|
"loss": 0.5698, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.44625112917795845, |
|
"grad_norm": 0.2386264364631626, |
|
"learning_rate": 1.8947272334281977e-05, |
|
"loss": 0.5515, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.4480578139114724, |
|
"grad_norm": 0.23546044963734702, |
|
"learning_rate": 1.8860606015479537e-05, |
|
"loss": 0.5702, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.44986449864498645, |
|
"grad_norm": 0.24602939520620193, |
|
"learning_rate": 1.877380145616763e-05, |
|
"loss": 0.5539, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.45167118337850043, |
|
"grad_norm": 0.24927796394572446, |
|
"learning_rate": 1.868686176464232e-05, |
|
"loss": 0.5685, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.45347786811201446, |
|
"grad_norm": 0.23348430670004636, |
|
"learning_rate": 1.8599790054038487e-05, |
|
"loss": 0.5556, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.45528455284552843, |
|
"grad_norm": 0.23038751620953404, |
|
"learning_rate": 1.8512589442218358e-05, |
|
"loss": 0.5379, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.45709123757904246, |
|
"grad_norm": 0.2470432372608671, |
|
"learning_rate": 1.8425263051659838e-05, |
|
"loss": 0.5611, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.45889792231255644, |
|
"grad_norm": 0.2752755428251915, |
|
"learning_rate": 1.8337814009344716e-05, |
|
"loss": 0.5596, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.46070460704607047, |
|
"grad_norm": 0.26345960590354556, |
|
"learning_rate": 1.8250245446646707e-05, |
|
"loss": 0.5532, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.46251129177958444, |
|
"grad_norm": 0.2590290719006861, |
|
"learning_rate": 1.8162560499219286e-05, |
|
"loss": 0.5532, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.4643179765130985, |
|
"grad_norm": 0.3788681013271154, |
|
"learning_rate": 1.807476230688346e-05, |
|
"loss": 0.5632, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.46612466124661245, |
|
"grad_norm": 0.26287326052859095, |
|
"learning_rate": 1.7986854013515274e-05, |
|
"loss": 0.5588, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.4679313459801265, |
|
"grad_norm": 0.2452706292889911, |
|
"learning_rate": 1.78988387669333e-05, |
|
"loss": 0.5623, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.46973803071364045, |
|
"grad_norm": 0.2711441295103095, |
|
"learning_rate": 1.781071971878587e-05, |
|
"loss": 0.5758, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4715447154471545, |
|
"grad_norm": 0.25871117471585714, |
|
"learning_rate": 1.7722500024438244e-05, |
|
"loss": 0.5664, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.47335140018066846, |
|
"grad_norm": 0.26452477947450626, |
|
"learning_rate": 1.7634182842859628e-05, |
|
"loss": 0.5551, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.4751580849141825, |
|
"grad_norm": 0.26255317163283814, |
|
"learning_rate": 1.7545771336510033e-05, |
|
"loss": 0.56, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.47696476964769646, |
|
"grad_norm": 0.24584008611588468, |
|
"learning_rate": 1.7457268671227067e-05, |
|
"loss": 0.566, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4787714543812105, |
|
"grad_norm": 0.28058112893938664, |
|
"learning_rate": 1.736867801611254e-05, |
|
"loss": 0.5476, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.48057813911472447, |
|
"grad_norm": 0.2398588888650874, |
|
"learning_rate": 1.728000254341901e-05, |
|
"loss": 0.5714, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.4823848238482385, |
|
"grad_norm": 0.23159723895248227, |
|
"learning_rate": 1.7191245428436175e-05, |
|
"loss": 0.556, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.48419150858175247, |
|
"grad_norm": 0.242023204376101, |
|
"learning_rate": 1.7102409849377188e-05, |
|
"loss": 0.556, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.4859981933152665, |
|
"grad_norm": 0.23092092883029253, |
|
"learning_rate": 1.7013498987264832e-05, |
|
"loss": 0.563, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 0.2338290007415461, |
|
"learning_rate": 1.6924516025817636e-05, |
|
"loss": 0.5447, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4896115627822945, |
|
"grad_norm": 0.23878083199278635, |
|
"learning_rate": 1.683546415133584e-05, |
|
"loss": 0.566, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.4914182475158085, |
|
"grad_norm": 0.26791667689770354, |
|
"learning_rate": 1.6746346552587342e-05, |
|
"loss": 0.5606, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.4932249322493225, |
|
"grad_norm": 0.2567814206559349, |
|
"learning_rate": 1.665716642069349e-05, |
|
"loss": 0.5686, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.4950316169828365, |
|
"grad_norm": 0.25936103085206763, |
|
"learning_rate": 1.6567926949014805e-05, |
|
"loss": 0.5563, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.4968383017163505, |
|
"grad_norm": 0.22934274716661407, |
|
"learning_rate": 1.6478631333036655e-05, |
|
"loss": 0.5505, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.4986449864498645, |
|
"grad_norm": 0.22246496169380997, |
|
"learning_rate": 1.638928277025482e-05, |
|
"loss": 0.5427, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5004516711833785, |
|
"grad_norm": 0.2411640165990185, |
|
"learning_rate": 1.6299884460061005e-05, |
|
"loss": 0.5544, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5022583559168925, |
|
"grad_norm": 0.22032966667707543, |
|
"learning_rate": 1.621043960362826e-05, |
|
"loss": 0.5414, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5040650406504065, |
|
"grad_norm": 0.2338531785284466, |
|
"learning_rate": 1.6120951403796367e-05, |
|
"loss": 0.5562, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5058717253839206, |
|
"grad_norm": 0.2404049446380252, |
|
"learning_rate": 1.603142306495714e-05, |
|
"loss": 0.5528, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5076784101174345, |
|
"grad_norm": 0.21543759278974306, |
|
"learning_rate": 1.5941857792939702e-05, |
|
"loss": 0.5535, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5094850948509485, |
|
"grad_norm": 0.22174685531345867, |
|
"learning_rate": 1.585225879489567e-05, |
|
"loss": 0.5482, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5112917795844625, |
|
"grad_norm": 0.23505724653269477, |
|
"learning_rate": 1.5762629279184326e-05, |
|
"loss": 0.5629, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5130984643179766, |
|
"grad_norm": 0.2190752071568156, |
|
"learning_rate": 1.5672972455257726e-05, |
|
"loss": 0.5512, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5149051490514905, |
|
"grad_norm": 0.23014057552115058, |
|
"learning_rate": 1.5583291533545775e-05, |
|
"loss": 0.5462, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5167118337850045, |
|
"grad_norm": 0.24787471158863109, |
|
"learning_rate": 1.549358972534128e-05, |
|
"loss": 0.5435, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.22237933857949138, |
|
"learning_rate": 1.5403870242684942e-05, |
|
"loss": 0.5649, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.5203252032520326, |
|
"grad_norm": 0.22788208111760372, |
|
"learning_rate": 1.5314136298250355e-05, |
|
"loss": 0.5622, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5221318879855466, |
|
"grad_norm": 0.2151133182754434, |
|
"learning_rate": 1.5224391105228956e-05, |
|
"loss": 0.5468, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5239385727190605, |
|
"grad_norm": 0.2084276943561875, |
|
"learning_rate": 1.5134637877214968e-05, |
|
"loss": 0.5585, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5257452574525745, |
|
"grad_norm": 0.2474715604768919, |
|
"learning_rate": 1.5044879828090346e-05, |
|
"loss": 0.5626, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5275519421860885, |
|
"grad_norm": 0.22981615890512452, |
|
"learning_rate": 1.4955120171909658e-05, |
|
"loss": 0.5666, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5293586269196026, |
|
"grad_norm": 0.2002529633522432, |
|
"learning_rate": 1.4865362122785031e-05, |
|
"loss": 0.5594, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5311653116531165, |
|
"grad_norm": 0.2361248388500811, |
|
"learning_rate": 1.4775608894771048e-05, |
|
"loss": 0.5515, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5329719963866305, |
|
"grad_norm": 0.2566454036838341, |
|
"learning_rate": 1.4685863701749648e-05, |
|
"loss": 0.5494, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5347786811201445, |
|
"grad_norm": 0.23551528554343792, |
|
"learning_rate": 1.4596129757315062e-05, |
|
"loss": 0.5533, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5365853658536586, |
|
"grad_norm": 0.24065601220182203, |
|
"learning_rate": 1.4506410274658718e-05, |
|
"loss": 0.5581, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5383920505871725, |
|
"grad_norm": 0.23351047682185289, |
|
"learning_rate": 1.441670846645423e-05, |
|
"loss": 0.5531, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5401987353206865, |
|
"grad_norm": 0.23435195585103996, |
|
"learning_rate": 1.4327027544742281e-05, |
|
"loss": 0.5683, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5420054200542005, |
|
"grad_norm": 0.2242996313471287, |
|
"learning_rate": 1.4237370720815675e-05, |
|
"loss": 0.5562, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 553, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 347755278106624.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|