{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7226738934056007, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0018066847335140017, |
|
"grad_norm": 2.8330893538644304, |
|
"learning_rate": 0.0, |
|
"loss": 0.8251, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0036133694670280035, |
|
"grad_norm": 2.881435997205295, |
|
"learning_rate": 3.5714285714285716e-07, |
|
"loss": 0.8284, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005420054200542005, |
|
"grad_norm": 2.913444175898846, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 0.843, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007226738934056007, |
|
"grad_norm": 2.8476328344852626, |
|
"learning_rate": 1.0714285714285714e-06, |
|
"loss": 0.8396, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.009033423667570008, |
|
"grad_norm": 2.7681799600000607, |
|
"learning_rate": 1.4285714285714286e-06, |
|
"loss": 0.8292, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01084010840108401, |
|
"grad_norm": 2.7322260951975927, |
|
"learning_rate": 1.7857142857142859e-06, |
|
"loss": 0.829, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.012646793134598013, |
|
"grad_norm": 2.597806254256109, |
|
"learning_rate": 2.1428571428571427e-06, |
|
"loss": 0.808, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.014453477868112014, |
|
"grad_norm": 2.594208882541123, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.8153, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.016260162601626018, |
|
"grad_norm": 2.1559466678394696, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.8042, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.018066847335140017, |
|
"grad_norm": 2.046189870534073, |
|
"learning_rate": 3.2142857142857147e-06, |
|
"loss": 0.798, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01987353206865402, |
|
"grad_norm": 1.489092346853261, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 0.7766, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02168021680216802, |
|
"grad_norm": 1.422343449560607, |
|
"learning_rate": 3.928571428571429e-06, |
|
"loss": 0.7866, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.023486901535682024, |
|
"grad_norm": 1.3533723550860928, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 0.7737, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.025293586269196026, |
|
"grad_norm": 1.2487216948113553, |
|
"learning_rate": 4.642857142857144e-06, |
|
"loss": 0.7669, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02710027100271003, |
|
"grad_norm": 1.4826389765971164, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7523, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.028906955736224028, |
|
"grad_norm": 1.6838425063979934, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 0.7431, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.03071364046973803, |
|
"grad_norm": 1.8944059999030647, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.7505, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.032520325203252036, |
|
"grad_norm": 1.7093633044809413, |
|
"learning_rate": 6.071428571428571e-06, |
|
"loss": 0.7413, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03432700993676603, |
|
"grad_norm": 1.4480856814923126, |
|
"learning_rate": 6.4285714285714295e-06, |
|
"loss": 0.7326, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.036133694670280034, |
|
"grad_norm": 0.8989567363750163, |
|
"learning_rate": 6.785714285714287e-06, |
|
"loss": 0.7034, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.037940379403794036, |
|
"grad_norm": 0.9336921226547814, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.7054, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.03974706413730804, |
|
"grad_norm": 0.9964488635721185, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.6939, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04155374887082204, |
|
"grad_norm": 0.8799604592025886, |
|
"learning_rate": 7.857142857142858e-06, |
|
"loss": 0.6911, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04336043360433604, |
|
"grad_norm": 0.8240418742797938, |
|
"learning_rate": 8.214285714285714e-06, |
|
"loss": 0.6967, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.045167118337850046, |
|
"grad_norm": 0.6769494865754769, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.6755, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.04697380307136405, |
|
"grad_norm": 0.6032918245818127, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 0.6895, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.04878048780487805, |
|
"grad_norm": 0.6155544869504126, |
|
"learning_rate": 9.285714285714288e-06, |
|
"loss": 0.6658, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.05058717253839205, |
|
"grad_norm": 0.621356627949306, |
|
"learning_rate": 9.642857142857144e-06, |
|
"loss": 0.6675, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.052393857271906055, |
|
"grad_norm": 0.5810118247332029, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6791, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.05420054200542006, |
|
"grad_norm": 0.5247538671601636, |
|
"learning_rate": 9.999910480045805e-06, |
|
"loss": 0.6778, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05600722673893405, |
|
"grad_norm": 0.5714434850034724, |
|
"learning_rate": 9.999641923388745e-06, |
|
"loss": 0.6666, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.057813911472448055, |
|
"grad_norm": 0.4712383108751352, |
|
"learning_rate": 9.999194339645292e-06, |
|
"loss": 0.6538, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.05962059620596206, |
|
"grad_norm": 0.4617901106156676, |
|
"learning_rate": 9.998567744842518e-06, |
|
"loss": 0.6638, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06142728093947606, |
|
"grad_norm": 0.4666096297215481, |
|
"learning_rate": 9.997762161417517e-06, |
|
"loss": 0.6507, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06323396567299007, |
|
"grad_norm": 0.4177170379633248, |
|
"learning_rate": 9.996777618216608e-06, |
|
"loss": 0.6558, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.06504065040650407, |
|
"grad_norm": 0.3853857815173069, |
|
"learning_rate": 9.995614150494293e-06, |
|
"loss": 0.6501, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.06684733514001806, |
|
"grad_norm": 0.4750342442561457, |
|
"learning_rate": 9.994271799912004e-06, |
|
"loss": 0.654, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.06865401987353206, |
|
"grad_norm": 0.4687734149491767, |
|
"learning_rate": 9.992750614536606e-06, |
|
"loss": 0.6468, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07046070460704607, |
|
"grad_norm": 0.48644381395947056, |
|
"learning_rate": 9.991050648838676e-06, |
|
"loss": 0.6473, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07226738934056007, |
|
"grad_norm": 0.31783178912190924, |
|
"learning_rate": 9.989171963690556e-06, |
|
"loss": 0.6366, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.33895045962254544, |
|
"learning_rate": 9.987114626364172e-06, |
|
"loss": 0.6427, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.07588075880758807, |
|
"grad_norm": 0.3719545945498325, |
|
"learning_rate": 9.984878710528615e-06, |
|
"loss": 0.634, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.07768744354110207, |
|
"grad_norm": 0.38317415749657263, |
|
"learning_rate": 9.982464296247523e-06, |
|
"loss": 0.6341, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.07949412827461608, |
|
"grad_norm": 0.31029398525665997, |
|
"learning_rate": 9.979871469976197e-06, |
|
"loss": 0.6272, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08130081300813008, |
|
"grad_norm": 0.2908522040574856, |
|
"learning_rate": 9.97710032455851e-06, |
|
"loss": 0.6342, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08310749774164408, |
|
"grad_norm": 0.2748406344962639, |
|
"learning_rate": 9.974150959223591e-06, |
|
"loss": 0.6358, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08491418247515808, |
|
"grad_norm": 0.3149168698937494, |
|
"learning_rate": 9.971023479582258e-06, |
|
"loss": 0.6387, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.08672086720867209, |
|
"grad_norm": 0.2884824606148138, |
|
"learning_rate": 9.967717997623245e-06, |
|
"loss": 0.6257, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.08852755194218609, |
|
"grad_norm": 0.27614043410709116, |
|
"learning_rate": 9.964234631709188e-06, |
|
"loss": 0.6313, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.09033423667570009, |
|
"grad_norm": 0.2556625842505664, |
|
"learning_rate": 9.960573506572391e-06, |
|
"loss": 0.63, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0921409214092141, |
|
"grad_norm": 0.26971687477967615, |
|
"learning_rate": 9.956734753310355e-06, |
|
"loss": 0.6193, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.0939476061427281, |
|
"grad_norm": 0.2549513910301006, |
|
"learning_rate": 9.952718509381086e-06, |
|
"loss": 0.6377, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.0957542908762421, |
|
"grad_norm": 0.2828412847060681, |
|
"learning_rate": 9.948524918598175e-06, |
|
"loss": 0.6219, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.0975609756097561, |
|
"grad_norm": 0.2622887143638149, |
|
"learning_rate": 9.944154131125643e-06, |
|
"loss": 0.6126, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.0993676603432701, |
|
"grad_norm": 0.25966122737232883, |
|
"learning_rate": 9.93960630347257e-06, |
|
"loss": 0.6265, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1011743450767841, |
|
"grad_norm": 0.2433496106029979, |
|
"learning_rate": 9.934881598487478e-06, |
|
"loss": 0.6316, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10298102981029811, |
|
"grad_norm": 0.24558757040432388, |
|
"learning_rate": 9.929980185352525e-06, |
|
"loss": 0.6173, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.10478771454381211, |
|
"grad_norm": 0.3572659265600395, |
|
"learning_rate": 9.924902239577419e-06, |
|
"loss": 0.6249, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.10659439927732611, |
|
"grad_norm": 0.24691672110135698, |
|
"learning_rate": 9.91964794299315e-06, |
|
"loss": 0.6106, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.10840108401084012, |
|
"grad_norm": 0.2628093882374342, |
|
"learning_rate": 9.914217483745472e-06, |
|
"loss": 0.6119, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1102077687443541, |
|
"grad_norm": 0.2576012071557905, |
|
"learning_rate": 9.90861105628817e-06, |
|
"loss": 0.6158, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1120144534778681, |
|
"grad_norm": 0.2520164585110792, |
|
"learning_rate": 9.902828861376101e-06, |
|
"loss": 0.6209, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.11382113821138211, |
|
"grad_norm": 0.24714468913812493, |
|
"learning_rate": 9.896871106057989e-06, |
|
"loss": 0.6203, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.11562782294489611, |
|
"grad_norm": 0.27939059621136214, |
|
"learning_rate": 9.890738003669029e-06, |
|
"loss": 0.6186, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.11743450767841011, |
|
"grad_norm": 0.30469002315407645, |
|
"learning_rate": 9.884429773823238e-06, |
|
"loss": 0.6132, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.11924119241192412, |
|
"grad_norm": 0.256255619416014, |
|
"learning_rate": 9.877946642405598e-06, |
|
"loss": 0.6151, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.12104787714543812, |
|
"grad_norm": 0.27333424534726836, |
|
"learning_rate": 9.871288841563956e-06, |
|
"loss": 0.6054, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.12285456187895212, |
|
"grad_norm": 0.2576979129157107, |
|
"learning_rate": 9.864456609700726e-06, |
|
"loss": 0.6212, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12466124661246612, |
|
"grad_norm": 0.25853307753594634, |
|
"learning_rate": 9.857450191464337e-06, |
|
"loss": 0.6231, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.12646793134598014, |
|
"grad_norm": 0.2656560556389431, |
|
"learning_rate": 9.85026983774049e-06, |
|
"loss": 0.6284, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12827461607949414, |
|
"grad_norm": 0.23027981167540387, |
|
"learning_rate": 9.842915805643156e-06, |
|
"loss": 0.5994, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.13008130081300814, |
|
"grad_norm": 0.22295142007678526, |
|
"learning_rate": 9.835388358505383e-06, |
|
"loss": 0.6168, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13188798554652212, |
|
"grad_norm": 0.24893134247531343, |
|
"learning_rate": 9.827687765869859e-06, |
|
"loss": 0.6158, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.13369467028003612, |
|
"grad_norm": 0.24768240929937196, |
|
"learning_rate": 9.819814303479268e-06, |
|
"loss": 0.6079, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.13550135501355012, |
|
"grad_norm": 0.25510089913616635, |
|
"learning_rate": 9.811768253266401e-06, |
|
"loss": 0.6058, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.13730803974706413, |
|
"grad_norm": 0.2566035915807109, |
|
"learning_rate": 9.803549903344081e-06, |
|
"loss": 0.6015, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.13911472448057813, |
|
"grad_norm": 0.2498155436503847, |
|
"learning_rate": 9.79515954799483e-06, |
|
"loss": 0.5961, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.14092140921409213, |
|
"grad_norm": 0.25315720275933445, |
|
"learning_rate": 9.786597487660336e-06, |
|
"loss": 0.6077, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.14272809394760613, |
|
"grad_norm": 0.2506783920655026, |
|
"learning_rate": 9.777864028930705e-06, |
|
"loss": 0.6169, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.14453477868112014, |
|
"grad_norm": 0.24357522442887738, |
|
"learning_rate": 9.768959484533461e-06, |
|
"loss": 0.6258, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14634146341463414, |
|
"grad_norm": 0.2368292397855168, |
|
"learning_rate": 9.75988417332237e-06, |
|
"loss": 0.6084, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.24319176665961809, |
|
"learning_rate": 9.750638420266008e-06, |
|
"loss": 0.602, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.14995483288166214, |
|
"grad_norm": 0.26083997132385134, |
|
"learning_rate": 9.741222556436132e-06, |
|
"loss": 0.6131, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.15176151761517614, |
|
"grad_norm": 0.23313509037048255, |
|
"learning_rate": 9.731636918995821e-06, |
|
"loss": 0.6059, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.15356820234869015, |
|
"grad_norm": 0.2561353779177137, |
|
"learning_rate": 9.721881851187406e-06, |
|
"loss": 0.608, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.15537488708220415, |
|
"grad_norm": 0.24665820987089132, |
|
"learning_rate": 9.711957702320176e-06, |
|
"loss": 0.6079, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.15718157181571815, |
|
"grad_norm": 0.24260951004614914, |
|
"learning_rate": 9.701864827757868e-06, |
|
"loss": 0.6101, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.15898825654923215, |
|
"grad_norm": 0.23580290946970783, |
|
"learning_rate": 9.691603588905956e-06, |
|
"loss": 0.6145, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.16079494128274616, |
|
"grad_norm": 0.23843217719986562, |
|
"learning_rate": 9.681174353198687e-06, |
|
"loss": 0.6101, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.16260162601626016, |
|
"grad_norm": 0.23674563433604787, |
|
"learning_rate": 9.670577494085945e-06, |
|
"loss": 0.6032, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16440831074977416, |
|
"grad_norm": 0.22347295635871733, |
|
"learning_rate": 9.659813391019867e-06, |
|
"loss": 0.6012, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.16621499548328816, |
|
"grad_norm": 0.22774371040996969, |
|
"learning_rate": 9.648882429441258e-06, |
|
"loss": 0.6049, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.16802168021680217, |
|
"grad_norm": 0.24860302708545573, |
|
"learning_rate": 9.637785000765789e-06, |
|
"loss": 0.6113, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.16982836495031617, |
|
"grad_norm": 0.2514244920841384, |
|
"learning_rate": 9.626521502369984e-06, |
|
"loss": 0.6101, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.17163504968383017, |
|
"grad_norm": 0.2615146913938569, |
|
"learning_rate": 9.615092337576987e-06, |
|
"loss": 0.6024, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.17344173441734417, |
|
"grad_norm": 0.24221345499794553, |
|
"learning_rate": 9.603497915642122e-06, |
|
"loss": 0.6012, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.17524841915085818, |
|
"grad_norm": 0.3115318810789579, |
|
"learning_rate": 9.591738651738235e-06, |
|
"loss": 0.6073, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.17705510388437218, |
|
"grad_norm": 0.244045694442119, |
|
"learning_rate": 9.579814966940833e-06, |
|
"loss": 0.6011, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.17886178861788618, |
|
"grad_norm": 0.2376023672791741, |
|
"learning_rate": 9.567727288213005e-06, |
|
"loss": 0.6136, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.18066847335140018, |
|
"grad_norm": 0.21793476185622473, |
|
"learning_rate": 9.55547604839013e-06, |
|
"loss": 0.587, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18247515808491419, |
|
"grad_norm": 0.2337281277152664, |
|
"learning_rate": 9.543061686164374e-06, |
|
"loss": 0.6032, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.1842818428184282, |
|
"grad_norm": 0.23686576311395632, |
|
"learning_rate": 9.530484646068996e-06, |
|
"loss": 0.6213, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.1860885275519422, |
|
"grad_norm": 0.24261565879784514, |
|
"learning_rate": 9.517745378462417e-06, |
|
"loss": 0.6003, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.1878952122854562, |
|
"grad_norm": 0.2524487307231951, |
|
"learning_rate": 9.504844339512096e-06, |
|
"loss": 0.5985, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1897018970189702, |
|
"grad_norm": 0.22875411685789507, |
|
"learning_rate": 9.491781991178203e-06, |
|
"loss": 0.5907, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.1915085817524842, |
|
"grad_norm": 0.23168788790974576, |
|
"learning_rate": 9.478558801197065e-06, |
|
"loss": 0.5927, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.1933152664859982, |
|
"grad_norm": 0.24440785625645858, |
|
"learning_rate": 9.465175243064428e-06, |
|
"loss": 0.5985, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.1951219512195122, |
|
"grad_norm": 0.2358000622956773, |
|
"learning_rate": 9.451631796018495e-06, |
|
"loss": 0.597, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.1969286359530262, |
|
"grad_norm": 0.2476890175323139, |
|
"learning_rate": 9.437928945022772e-06, |
|
"loss": 0.6066, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.1987353206865402, |
|
"grad_norm": 0.24611295267738714, |
|
"learning_rate": 9.424067180748692e-06, |
|
"loss": 0.5878, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2005420054200542, |
|
"grad_norm": 0.2881495774555611, |
|
"learning_rate": 9.410046999558062e-06, |
|
"loss": 0.6072, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.2023486901535682, |
|
"grad_norm": 0.24596539710053744, |
|
"learning_rate": 9.395868903485269e-06, |
|
"loss": 0.6005, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2041553748870822, |
|
"grad_norm": 0.2386416433130278, |
|
"learning_rate": 9.381533400219319e-06, |
|
"loss": 0.6041, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.20596205962059622, |
|
"grad_norm": 0.3575913544704138, |
|
"learning_rate": 9.36704100308565e-06, |
|
"loss": 0.5872, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.20776874435411022, |
|
"grad_norm": 0.23192198774830253, |
|
"learning_rate": 9.352392231027752e-06, |
|
"loss": 0.6032, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.20957542908762422, |
|
"grad_norm": 0.23101719819824798, |
|
"learning_rate": 9.337587608588588e-06, |
|
"loss": 0.5974, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.21138211382113822, |
|
"grad_norm": 0.25528392059554045, |
|
"learning_rate": 9.322627665891807e-06, |
|
"loss": 0.6076, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.21318879855465223, |
|
"grad_norm": 0.24287946493861107, |
|
"learning_rate": 9.307512938622762e-06, |
|
"loss": 0.5952, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.21499548328816623, |
|
"grad_norm": 0.22892112715952465, |
|
"learning_rate": 9.292243968009332e-06, |
|
"loss": 0.5864, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.21680216802168023, |
|
"grad_norm": 0.23875546778816065, |
|
"learning_rate": 9.276821300802535e-06, |
|
"loss": 0.6042, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2186088527551942, |
|
"grad_norm": 0.23651334894824905, |
|
"learning_rate": 9.261245489256956e-06, |
|
"loss": 0.6, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2204155374887082, |
|
"grad_norm": 0.2526347555924926, |
|
"learning_rate": 9.24551709111097e-06, |
|
"loss": 0.6049, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.26377863716117955, |
|
"learning_rate": 9.229636669566769e-06, |
|
"loss": 0.5961, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.2240289069557362, |
|
"grad_norm": 0.2632856019025536, |
|
"learning_rate": 9.213604793270196e-06, |
|
"loss": 0.5818, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.22583559168925021, |
|
"grad_norm": 0.26119773624805764, |
|
"learning_rate": 9.197422036290386e-06, |
|
"loss": 0.5887, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.22764227642276422, |
|
"grad_norm": 0.22525425992658815, |
|
"learning_rate": 9.181088978099203e-06, |
|
"loss": 0.601, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.22944896115627822, |
|
"grad_norm": 0.25288660956040915, |
|
"learning_rate": 9.164606203550498e-06, |
|
"loss": 0.5933, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.23125564588979222, |
|
"grad_norm": 0.23943780402862191, |
|
"learning_rate": 9.147974302859158e-06, |
|
"loss": 0.5925, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.23306233062330622, |
|
"grad_norm": 0.25270951927417573, |
|
"learning_rate": 9.131193871579975e-06, |
|
"loss": 0.588, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.23486901535682023, |
|
"grad_norm": 0.23629117048544068, |
|
"learning_rate": 9.114265510586329e-06, |
|
"loss": 0.6066, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23667570009033423, |
|
"grad_norm": 0.25827287096558593, |
|
"learning_rate": 9.09718982604866e-06, |
|
"loss": 0.6005, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.23848238482384823, |
|
"grad_norm": 0.2267410893620417, |
|
"learning_rate": 9.079967429412766e-06, |
|
"loss": 0.5795, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.24028906955736223, |
|
"grad_norm": 0.2550015941357386, |
|
"learning_rate": 9.062598937377911e-06, |
|
"loss": 0.5951, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.24209575429087624, |
|
"grad_norm": 0.24549928967178372, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.5958, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.24390243902439024, |
|
"grad_norm": 0.23869896137391008, |
|
"learning_rate": 9.027426160043005e-06, |
|
"loss": 0.5925, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.24570912375790424, |
|
"grad_norm": 0.23603926331716685, |
|
"learning_rate": 9.00962313420912e-06, |
|
"loss": 0.5967, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.24751580849141824, |
|
"grad_norm": 0.21954443180366723, |
|
"learning_rate": 8.991676531863507e-06, |
|
"loss": 0.5799, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.24932249322493225, |
|
"grad_norm": 0.24573994311601394, |
|
"learning_rate": 8.973586995637778e-06, |
|
"loss": 0.5974, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.25112917795844625, |
|
"grad_norm": 0.23867846548393645, |
|
"learning_rate": 8.955355173281709e-06, |
|
"loss": 0.6006, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.2529358626919603, |
|
"grad_norm": 0.2581948484162573, |
|
"learning_rate": 8.936981717640061e-06, |
|
"loss": 0.5999, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.25474254742547425, |
|
"grad_norm": 0.267121573221628, |
|
"learning_rate": 8.9184672866292e-06, |
|
"loss": 0.5805, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.2565492321589883, |
|
"grad_norm": 0.2470490796537526, |
|
"learning_rate": 8.899812543213532e-06, |
|
"loss": 0.6006, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.25835591689250226, |
|
"grad_norm": 0.3179114771380903, |
|
"learning_rate": 8.881018155381766e-06, |
|
"loss": 0.592, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.2601626016260163, |
|
"grad_norm": 0.23894724426899414, |
|
"learning_rate": 8.862084796122998e-06, |
|
"loss": 0.5813, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.26196928635953026, |
|
"grad_norm": 0.2550194178662108, |
|
"learning_rate": 8.84301314340261e-06, |
|
"loss": 0.5938, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.26377597109304424, |
|
"grad_norm": 0.23756009297770989, |
|
"learning_rate": 8.823803880137993e-06, |
|
"loss": 0.5967, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.26558265582655827, |
|
"grad_norm": 0.23830080522997849, |
|
"learning_rate": 8.804457694174093e-06, |
|
"loss": 0.5884, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.26738934056007224, |
|
"grad_norm": 0.2541492506135848, |
|
"learning_rate": 8.784975278258783e-06, |
|
"loss": 0.5895, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.26919602529358627, |
|
"grad_norm": 0.2248990020625438, |
|
"learning_rate": 8.765357330018056e-06, |
|
"loss": 0.5867, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.27100271002710025, |
|
"grad_norm": 0.23467307143176355, |
|
"learning_rate": 8.745604551931042e-06, |
|
"loss": 0.5955, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2728093947606143, |
|
"grad_norm": 0.25171242957304724, |
|
"learning_rate": 8.725717651304856e-06, |
|
"loss": 0.5794, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.27461607949412825, |
|
"grad_norm": 0.23300717704512258, |
|
"learning_rate": 8.705697340249275e-06, |
|
"loss": 0.5852, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2764227642276423, |
|
"grad_norm": 0.2477278416478864, |
|
"learning_rate": 8.685544335651226e-06, |
|
"loss": 0.586, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.27822944896115626, |
|
"grad_norm": 0.23445523989787243, |
|
"learning_rate": 8.665259359149132e-06, |
|
"loss": 0.591, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.2800361336946703, |
|
"grad_norm": 0.22970328615556354, |
|
"learning_rate": 8.644843137107058e-06, |
|
"loss": 0.5819, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.28184281842818426, |
|
"grad_norm": 0.23879183212303057, |
|
"learning_rate": 8.62429640058871e-06, |
|
"loss": 0.5829, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.2836495031616983, |
|
"grad_norm": 0.25663229334164944, |
|
"learning_rate": 8.603619885331251e-06, |
|
"loss": 0.5955, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.28545618789521227, |
|
"grad_norm": 0.24837164619570978, |
|
"learning_rate": 8.582814331718961e-06, |
|
"loss": 0.5928, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.2872628726287263, |
|
"grad_norm": 0.23588125642758875, |
|
"learning_rate": 8.561880484756726e-06, |
|
"loss": 0.5741, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.28906955736224027, |
|
"grad_norm": 0.24350097611405788, |
|
"learning_rate": 8.540819094043349e-06, |
|
"loss": 0.5829, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2908762420957543, |
|
"grad_norm": 0.2553683709091329, |
|
"learning_rate": 8.519630913744726e-06, |
|
"loss": 0.5899, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.2926829268292683, |
|
"grad_norm": 0.22785571039238608, |
|
"learning_rate": 8.498316702566828e-06, |
|
"loss": 0.5761, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.2944896115627823, |
|
"grad_norm": 0.25965181875075294, |
|
"learning_rate": 8.476877223728539e-06, |
|
"loss": 0.5856, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.24290266021805976, |
|
"learning_rate": 8.455313244934324e-06, |
|
"loss": 0.5944, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.2981029810298103, |
|
"grad_norm": 0.22927751586664705, |
|
"learning_rate": 8.433625538346742e-06, |
|
"loss": 0.5859, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2999096657633243, |
|
"grad_norm": 0.24535687152116695, |
|
"learning_rate": 8.41181488055879e-06, |
|
"loss": 0.5921, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3017163504968383, |
|
"grad_norm": 0.23751101243273856, |
|
"learning_rate": 8.389882052566106e-06, |
|
"loss": 0.591, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.3035230352303523, |
|
"grad_norm": 0.23630246240385233, |
|
"learning_rate": 8.36782783973899e-06, |
|
"loss": 0.5894, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3053297199638663, |
|
"grad_norm": 0.22799677339250804, |
|
"learning_rate": 8.345653031794292e-06, |
|
"loss": 0.5818, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.3071364046973803, |
|
"grad_norm": 0.22405689087807926, |
|
"learning_rate": 8.32335842276713e-06, |
|
"loss": 0.5959, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3089430894308943, |
|
"grad_norm": 0.22813844475300996, |
|
"learning_rate": 8.300944810982452e-06, |
|
"loss": 0.5786, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.3107497741644083, |
|
"grad_norm": 0.2640722980858363, |
|
"learning_rate": 8.278412999026462e-06, |
|
"loss": 0.5853, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.31255645889792233, |
|
"grad_norm": 0.2233536921872013, |
|
"learning_rate": 8.255763793717868e-06, |
|
"loss": 0.5887, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3143631436314363, |
|
"grad_norm": 0.23154045983028293, |
|
"learning_rate": 8.232998006078998e-06, |
|
"loss": 0.5799, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.31616982836495033, |
|
"grad_norm": 0.23034930998436073, |
|
"learning_rate": 8.210116451306762e-06, |
|
"loss": 0.5842, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.3179765130984643, |
|
"grad_norm": 0.23658108883929088, |
|
"learning_rate": 8.18711994874345e-06, |
|
"loss": 0.5985, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.31978319783197834, |
|
"grad_norm": 0.21745109308108737, |
|
"learning_rate": 8.164009321847405e-06, |
|
"loss": 0.5734, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.3215898825654923, |
|
"grad_norm": 0.2293512030663522, |
|
"learning_rate": 8.140785398163535e-06, |
|
"loss": 0.58, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.32339656729900634, |
|
"grad_norm": 0.24597935781904853, |
|
"learning_rate": 8.117449009293668e-06, |
|
"loss": 0.5901, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.3252032520325203, |
|
"grad_norm": 0.21247364482115172, |
|
"learning_rate": 8.094000990866795e-06, |
|
"loss": 0.5981, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32700993676603435, |
|
"grad_norm": 0.2372685877924778, |
|
"learning_rate": 8.070442182509127e-06, |
|
"loss": 0.576, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.3288166214995483, |
|
"grad_norm": 0.3447763183753473, |
|
"learning_rate": 8.046773427814043e-06, |
|
"loss": 0.586, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.33062330623306235, |
|
"grad_norm": 0.23582522162922892, |
|
"learning_rate": 8.022995574311876e-06, |
|
"loss": 0.5973, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.3324299909665763, |
|
"grad_norm": 0.23703989170533815, |
|
"learning_rate": 7.99910947343957e-06, |
|
"loss": 0.5939, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.33423667570009036, |
|
"grad_norm": 0.23664320383509757, |
|
"learning_rate": 7.975115980510187e-06, |
|
"loss": 0.5902, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.33604336043360433, |
|
"grad_norm": 0.23010205857606106, |
|
"learning_rate": 7.951015954682281e-06, |
|
"loss": 0.5857, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.33785004516711836, |
|
"grad_norm": 0.2663491724216408, |
|
"learning_rate": 7.926810258929138e-06, |
|
"loss": 0.5831, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.33965672990063234, |
|
"grad_norm": 0.23096764617799878, |
|
"learning_rate": 7.902499760007867e-06, |
|
"loss": 0.5828, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.34146341463414637, |
|
"grad_norm": 0.23869913090614076, |
|
"learning_rate": 7.87808532842837e-06, |
|
"loss": 0.5901, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.34327009936766034, |
|
"grad_norm": 0.22415991273320168, |
|
"learning_rate": 7.85356783842216e-06, |
|
"loss": 0.5787, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.34507678410117437, |
|
"grad_norm": 0.24057830595652646, |
|
"learning_rate": 7.828948167911073e-06, |
|
"loss": 0.577, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.34688346883468835, |
|
"grad_norm": 0.8377691682291619, |
|
"learning_rate": 7.804227198475823e-06, |
|
"loss": 0.5838, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3486901535682023, |
|
"grad_norm": 0.22688859107583473, |
|
"learning_rate": 7.779405815324424e-06, |
|
"loss": 0.5857, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.35049683830171635, |
|
"grad_norm": 0.22785735734765863, |
|
"learning_rate": 7.754484907260513e-06, |
|
"loss": 0.5874, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.3523035230352303, |
|
"grad_norm": 0.2177502560490473, |
|
"learning_rate": 7.72946536665151e-06, |
|
"loss": 0.5707, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.35411020776874436, |
|
"grad_norm": 0.24597755253541856, |
|
"learning_rate": 7.704348089396667e-06, |
|
"loss": 0.5838, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.35591689250225833, |
|
"grad_norm": 0.2292296000848298, |
|
"learning_rate": 7.679133974894984e-06, |
|
"loss": 0.5833, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.35772357723577236, |
|
"grad_norm": 0.23517427703242888, |
|
"learning_rate": 7.653823926013016e-06, |
|
"loss": 0.5604, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.35953026196928634, |
|
"grad_norm": 0.23076838848026066, |
|
"learning_rate": 7.628418849052523e-06, |
|
"loss": 0.5831, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.36133694670280037, |
|
"grad_norm": 0.23076094009385115, |
|
"learning_rate": 7.602919653718044e-06, |
|
"loss": 0.573, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.36314363143631434, |
|
"grad_norm": 0.22528349334552952, |
|
"learning_rate": 7.577327253084292e-06, |
|
"loss": 0.5675, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.36495031616982837, |
|
"grad_norm": 0.25919326541454196, |
|
"learning_rate": 7.551642563563481e-06, |
|
"loss": 0.5944, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.36675700090334235, |
|
"grad_norm": 0.2475750487466084, |
|
"learning_rate": 7.5258665048725065e-06, |
|
"loss": 0.5816, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.3685636856368564, |
|
"grad_norm": 0.23639856935126805, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.5941, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.2312059720413921, |
|
"learning_rate": 7.4740439751732994e-06, |
|
"loss": 0.5841, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.3721770551038844, |
|
"grad_norm": 0.23815007414755204, |
|
"learning_rate": 7.447999359825263e-06, |
|
"loss": 0.5714, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.37398373983739835, |
|
"grad_norm": 0.2559897289254791, |
|
"learning_rate": 7.421867086561001e-06, |
|
"loss": 0.5798, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.3757904245709124, |
|
"grad_norm": 0.23431179392042337, |
|
"learning_rate": 7.395648091124476e-06, |
|
"loss": 0.5668, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.37759710930442636, |
|
"grad_norm": 0.2512828254268562, |
|
"learning_rate": 7.369343312364994e-06, |
|
"loss": 0.5881, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.3794037940379404, |
|
"grad_norm": 0.24502883889971508, |
|
"learning_rate": 7.342953692203594e-06, |
|
"loss": 0.5836, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.38121047877145436, |
|
"grad_norm": 0.21556942382237343, |
|
"learning_rate": 7.31648017559931e-06, |
|
"loss": 0.5845, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.3830171635049684, |
|
"grad_norm": 0.23583974310453942, |
|
"learning_rate": 7.289923710515338e-06, |
|
"loss": 0.5927, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.38482384823848237, |
|
"grad_norm": 0.23705623638550383, |
|
"learning_rate": 7.263285247885097e-06, |
|
"loss": 0.5917, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.3866305329719964, |
|
"grad_norm": 0.22991114203917198, |
|
"learning_rate": 7.236565741578163e-06, |
|
"loss": 0.5778, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.3884372177055104, |
|
"grad_norm": 0.23180178819150027, |
|
"learning_rate": 7.2097661483661355e-06, |
|
"loss": 0.6044, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.3902439024390244, |
|
"grad_norm": 0.26613899638149857, |
|
"learning_rate": 7.182887427888351e-06, |
|
"loss": 0.5936, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.3920505871725384, |
|
"grad_norm": 0.2312933670885305, |
|
"learning_rate": 7.155930542617543e-06, |
|
"loss": 0.5935, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.3938572719060524, |
|
"grad_norm": 0.22978225434942193, |
|
"learning_rate": 7.128896457825364e-06, |
|
"loss": 0.5854, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.3956639566395664, |
|
"grad_norm": 0.23602529887683144, |
|
"learning_rate": 7.101786141547829e-06, |
|
"loss": 0.5801, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.3974706413730804, |
|
"grad_norm": 0.2703924915547822, |
|
"learning_rate": 7.074600564550643e-06, |
|
"loss": 0.5833, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3992773261065944, |
|
"grad_norm": 0.2617465948292489, |
|
"learning_rate": 7.047340700294454e-06, |
|
"loss": 0.5716, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.4010840108401084, |
|
"grad_norm": 0.24382884586289413, |
|
"learning_rate": 7.020007524899976e-06, |
|
"loss": 0.5886, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.4028906955736224, |
|
"grad_norm": 0.21935282499736788, |
|
"learning_rate": 6.992602017113058e-06, |
|
"loss": 0.5714, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4046973803071364, |
|
"grad_norm": 0.25790526082244813, |
|
"learning_rate": 6.965125158269619e-06, |
|
"loss": 0.5766, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4065040650406504, |
|
"grad_norm": 0.31271609267639955, |
|
"learning_rate": 6.9375779322605154e-06, |
|
"loss": 0.5815, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.4083107497741644, |
|
"grad_norm": 0.25644160028108604, |
|
"learning_rate": 6.909961325496312e-06, |
|
"loss": 0.5876, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.4101174345076784, |
|
"grad_norm": 0.23592094628675797, |
|
"learning_rate": 6.88227632687196e-06, |
|
"loss": 0.5922, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.41192411924119243, |
|
"grad_norm": 0.2466791961563765, |
|
"learning_rate": 6.854523927731383e-06, |
|
"loss": 0.5786, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.4137308039747064, |
|
"grad_norm": 0.23617198599914752, |
|
"learning_rate": 6.8267051218319766e-06, |
|
"loss": 0.5808, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.41553748870822044, |
|
"grad_norm": 0.2416898307177306, |
|
"learning_rate": 6.798820905309036e-06, |
|
"loss": 0.583, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4173441734417344, |
|
"grad_norm": 0.24268134289433835, |
|
"learning_rate": 6.7708722766400745e-06, |
|
"loss": 0.5831, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.41915085817524844, |
|
"grad_norm": 0.2228715513161369, |
|
"learning_rate": 6.7428602366090764e-06, |
|
"loss": 0.5859, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.4209575429087624, |
|
"grad_norm": 0.249377876574396, |
|
"learning_rate": 6.714785788270658e-06, |
|
"loss": 0.567, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.42276422764227645, |
|
"grad_norm": 0.26017782806271494, |
|
"learning_rate": 6.686649936914151e-06, |
|
"loss": 0.5833, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.4245709123757904, |
|
"grad_norm": 0.22238035207177997, |
|
"learning_rate": 6.658453690027604e-06, |
|
"loss": 0.578, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.42637759710930445, |
|
"grad_norm": 0.2251727087649973, |
|
"learning_rate": 6.63019805726171e-06, |
|
"loss": 0.5897, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.4281842818428184, |
|
"grad_norm": 0.24334747524670824, |
|
"learning_rate": 6.601884050393649e-06, |
|
"loss": 0.5883, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.42999096657633246, |
|
"grad_norm": 0.23018321868830866, |
|
"learning_rate": 6.57351268329086e-06, |
|
"loss": 0.5973, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.43179765130984643, |
|
"grad_norm": 0.2328779372011691, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.5789, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.43360433604336046, |
|
"grad_norm": 0.23242378577030887, |
|
"learning_rate": 6.51660193408425e-06, |
|
"loss": 0.5794, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.43541102077687444, |
|
"grad_norm": 0.2161659491395485, |
|
"learning_rate": 6.4880645898394935e-06, |
|
"loss": 0.5778, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.4372177055103884, |
|
"grad_norm": 0.2299276453855878, |
|
"learning_rate": 6.459473961005168e-06, |
|
"loss": 0.5786, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.43902439024390244, |
|
"grad_norm": 0.2476235320127899, |
|
"learning_rate": 6.4308310713539845e-06, |
|
"loss": 0.5828, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4408310749774164, |
|
"grad_norm": 0.22170490563024103, |
|
"learning_rate": 6.402136946530014e-06, |
|
"loss": 0.5882, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.44263775971093045, |
|
"grad_norm": 0.228282250322248, |
|
"learning_rate": 6.373392614011952e-06, |
|
"loss": 0.5814, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.24503809348111755, |
|
"learning_rate": 6.344599103076329e-06, |
|
"loss": 0.5878, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.44625112917795845, |
|
"grad_norm": 0.217515748990581, |
|
"learning_rate": 6.315757444760659e-06, |
|
"loss": 0.5703, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.4480578139114724, |
|
"grad_norm": 0.2330555687580969, |
|
"learning_rate": 6.286868671826513e-06, |
|
"loss": 0.5885, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.44986449864498645, |
|
"grad_norm": 0.21335194723381823, |
|
"learning_rate": 6.257933818722544e-06, |
|
"loss": 0.5717, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.45167118337850043, |
|
"grad_norm": 0.22486747840550073, |
|
"learning_rate": 6.228953921547441e-06, |
|
"loss": 0.5866, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.45347786811201446, |
|
"grad_norm": 0.25899034502299706, |
|
"learning_rate": 6.19993001801283e-06, |
|
"loss": 0.5745, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.45528455284552843, |
|
"grad_norm": 0.21794726924055616, |
|
"learning_rate": 6.17086314740612e-06, |
|
"loss": 0.5559, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.45709123757904246, |
|
"grad_norm": 0.24234352061206263, |
|
"learning_rate": 6.141754350553279e-06, |
|
"loss": 0.5788, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.45889792231255644, |
|
"grad_norm": 0.2274414978394944, |
|
"learning_rate": 6.112604669781572e-06, |
|
"loss": 0.5774, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.46070460704607047, |
|
"grad_norm": 0.22998778589918353, |
|
"learning_rate": 6.083415148882236e-06, |
|
"loss": 0.5716, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.46251129177958444, |
|
"grad_norm": 0.22485661414287983, |
|
"learning_rate": 6.054186833073096e-06, |
|
"loss": 0.572, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.4643179765130985, |
|
"grad_norm": 0.22811563640884158, |
|
"learning_rate": 6.024920768961153e-06, |
|
"loss": 0.581, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.46612466124661245, |
|
"grad_norm": 0.22052113306724444, |
|
"learning_rate": 5.995618004505091e-06, |
|
"loss": 0.5766, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.4679313459801265, |
|
"grad_norm": 0.21805381304894822, |
|
"learning_rate": 5.9662795889777666e-06, |
|
"loss": 0.5803, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.46973803071364045, |
|
"grad_norm": 0.23376637006655057, |
|
"learning_rate": 5.936906572928625e-06, |
|
"loss": 0.5945, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4715447154471545, |
|
"grad_norm": 0.2623609821320283, |
|
"learning_rate": 5.907500008146082e-06, |
|
"loss": 0.5855, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.47335140018066846, |
|
"grad_norm": 0.20712796845127485, |
|
"learning_rate": 5.878060947619877e-06, |
|
"loss": 0.5739, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.4751580849141825, |
|
"grad_norm": 0.21143577704362118, |
|
"learning_rate": 5.848590445503345e-06, |
|
"loss": 0.5782, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.47696476964769646, |
|
"grad_norm": 0.21994485672746086, |
|
"learning_rate": 5.819089557075689e-06, |
|
"loss": 0.585, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4787714543812105, |
|
"grad_norm": 0.21069182355242835, |
|
"learning_rate": 5.78955933870418e-06, |
|
"loss": 0.5659, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.48057813911472447, |
|
"grad_norm": 0.23778394000563338, |
|
"learning_rate": 5.760000847806337e-06, |
|
"loss": 0.5902, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.4823848238482385, |
|
"grad_norm": 0.21210561161504926, |
|
"learning_rate": 5.730415142812059e-06, |
|
"loss": 0.5745, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.48419150858175247, |
|
"grad_norm": 0.23173733623724077, |
|
"learning_rate": 5.70080328312573e-06, |
|
"loss": 0.5752, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.4859981933152665, |
|
"grad_norm": 0.21851750516316734, |
|
"learning_rate": 5.671166329088278e-06, |
|
"loss": 0.581, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 0.230353098776304, |
|
"learning_rate": 5.641505341939212e-06, |
|
"loss": 0.5633, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4896115627822945, |
|
"grad_norm": 0.23007026101437933, |
|
"learning_rate": 5.611821383778614e-06, |
|
"loss": 0.5847, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.4914182475158085, |
|
"grad_norm": 0.23404979176928067, |
|
"learning_rate": 5.582115517529114e-06, |
|
"loss": 0.5792, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.4932249322493225, |
|
"grad_norm": 0.2340510902231734, |
|
"learning_rate": 5.55238880689783e-06, |
|
"loss": 0.5873, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.4950316169828365, |
|
"grad_norm": 0.2515187234434128, |
|
"learning_rate": 5.522642316338268e-06, |
|
"loss": 0.5747, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.4968383017163505, |
|
"grad_norm": 0.22982520627329517, |
|
"learning_rate": 5.4928771110122185e-06, |
|
"loss": 0.5691, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.4986449864498645, |
|
"grad_norm": 0.21964980040640775, |
|
"learning_rate": 5.463094256751608e-06, |
|
"loss": 0.5616, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5004516711833785, |
|
"grad_norm": 0.22067905722359166, |
|
"learning_rate": 5.433294820020335e-06, |
|
"loss": 0.5736, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5022583559168925, |
|
"grad_norm": 0.22091279640365025, |
|
"learning_rate": 5.403479867876087e-06, |
|
"loss": 0.5603, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5040650406504065, |
|
"grad_norm": 0.2313691575132618, |
|
"learning_rate": 5.373650467932122e-06, |
|
"loss": 0.575, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5058717253839206, |
|
"grad_norm": 0.21483775764620186, |
|
"learning_rate": 5.343807688319047e-06, |
|
"loss": 0.5716, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5076784101174345, |
|
"grad_norm": 0.23350539595994096, |
|
"learning_rate": 5.3139525976465675e-06, |
|
"loss": 0.5725, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.5094850948509485, |
|
"grad_norm": 0.20419037267861948, |
|
"learning_rate": 5.284086264965224e-06, |
|
"loss": 0.5664, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5112917795844625, |
|
"grad_norm": 0.2206929643409549, |
|
"learning_rate": 5.2542097597281095e-06, |
|
"loss": 0.5824, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5130984643179766, |
|
"grad_norm": 0.2328716449251837, |
|
"learning_rate": 5.224324151752575e-06, |
|
"loss": 0.5704, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5149051490514905, |
|
"grad_norm": 0.22646920035996798, |
|
"learning_rate": 5.194430511181925e-06, |
|
"loss": 0.5637, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5167118337850045, |
|
"grad_norm": 0.2338999611062683, |
|
"learning_rate": 5.1645299084470936e-06, |
|
"loss": 0.563, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.23348201201542426, |
|
"learning_rate": 5.134623414228315e-06, |
|
"loss": 0.5846, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.5203252032520326, |
|
"grad_norm": 0.2066768548623096, |
|
"learning_rate": 5.1047120994167855e-06, |
|
"loss": 0.5814, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5221318879855466, |
|
"grad_norm": 0.22067476291358168, |
|
"learning_rate": 5.074797035076319e-06, |
|
"loss": 0.5658, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5239385727190605, |
|
"grad_norm": 0.2307755650814999, |
|
"learning_rate": 5.04487929240499e-06, |
|
"loss": 0.5777, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5257452574525745, |
|
"grad_norm": 0.21733975598284586, |
|
"learning_rate": 5.014959942696782e-06, |
|
"loss": 0.5822, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5275519421860885, |
|
"grad_norm": 0.2110494515215823, |
|
"learning_rate": 4.98504005730322e-06, |
|
"loss": 0.5852, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5293586269196026, |
|
"grad_norm": 0.21376160889279272, |
|
"learning_rate": 4.955120707595011e-06, |
|
"loss": 0.5786, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5311653116531165, |
|
"grad_norm": 0.22255506256387717, |
|
"learning_rate": 4.9252029649236835e-06, |
|
"loss": 0.5707, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5329719963866305, |
|
"grad_norm": 0.22357316134025623, |
|
"learning_rate": 4.895287900583216e-06, |
|
"loss": 0.569, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.5347786811201445, |
|
"grad_norm": 0.2303749633454824, |
|
"learning_rate": 4.865376585771687e-06, |
|
"loss": 0.572, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5365853658536586, |
|
"grad_norm": 0.24691170554512507, |
|
"learning_rate": 4.835470091552906e-06, |
|
"loss": 0.578, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5383920505871725, |
|
"grad_norm": 0.2184305772497302, |
|
"learning_rate": 4.805569488818077e-06, |
|
"loss": 0.5722, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5401987353206865, |
|
"grad_norm": 0.21437152967432271, |
|
"learning_rate": 4.775675848247427e-06, |
|
"loss": 0.5884, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5420054200542005, |
|
"grad_norm": 0.21552542470264674, |
|
"learning_rate": 4.745790240271892e-06, |
|
"loss": 0.5761, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5438121047877146, |
|
"grad_norm": 0.2125660770635089, |
|
"learning_rate": 4.715913735034779e-06, |
|
"loss": 0.5773, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.5456187895212286, |
|
"grad_norm": 0.21488086772508092, |
|
"learning_rate": 4.686047402353433e-06, |
|
"loss": 0.5897, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.5474254742547425, |
|
"grad_norm": 0.22966510979007831, |
|
"learning_rate": 4.6561923116809545e-06, |
|
"loss": 0.5708, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.5492321589882565, |
|
"grad_norm": 0.21097665896626477, |
|
"learning_rate": 4.626349532067879e-06, |
|
"loss": 0.5656, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5510388437217706, |
|
"grad_norm": 0.2101261903441546, |
|
"learning_rate": 4.596520132123915e-06, |
|
"loss": 0.5722, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.5528455284552846, |
|
"grad_norm": 0.22436513473996944, |
|
"learning_rate": 4.566705179979665e-06, |
|
"loss": 0.5698, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.5546522131887985, |
|
"grad_norm": 0.20902719330700562, |
|
"learning_rate": 4.536905743248394e-06, |
|
"loss": 0.5878, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.5564588979223125, |
|
"grad_norm": 0.21097784366583347, |
|
"learning_rate": 4.507122888987782e-06, |
|
"loss": 0.5671, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5582655826558266, |
|
"grad_norm": 0.21970798388898458, |
|
"learning_rate": 4.477357683661734e-06, |
|
"loss": 0.5762, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.5600722673893406, |
|
"grad_norm": 0.23843978575881658, |
|
"learning_rate": 4.447611193102171e-06, |
|
"loss": 0.5595, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5618789521228545, |
|
"grad_norm": 0.2085020569553253, |
|
"learning_rate": 4.417884482470887e-06, |
|
"loss": 0.5776, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.5636856368563685, |
|
"grad_norm": 0.20561180891816055, |
|
"learning_rate": 4.388178616221389e-06, |
|
"loss": 0.5771, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5654923215898826, |
|
"grad_norm": 0.21551536323301249, |
|
"learning_rate": 4.35849465806079e-06, |
|
"loss": 0.5788, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.5672990063233966, |
|
"grad_norm": 0.20574458895157158, |
|
"learning_rate": 4.3288336709117246e-06, |
|
"loss": 0.5707, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.5691056910569106, |
|
"grad_norm": 0.21500847742483045, |
|
"learning_rate": 4.299196716874271e-06, |
|
"loss": 0.5706, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.5709123757904245, |
|
"grad_norm": 0.20544086197673322, |
|
"learning_rate": 4.269584857187942e-06, |
|
"loss": 0.5676, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.5727190605239386, |
|
"grad_norm": 0.19987644528439158, |
|
"learning_rate": 4.239999152193664e-06, |
|
"loss": 0.5621, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.5745257452574526, |
|
"grad_norm": 0.20629701462913586, |
|
"learning_rate": 4.2104406612958216e-06, |
|
"loss": 0.5744, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.5763324299909666, |
|
"grad_norm": 0.22689443352313568, |
|
"learning_rate": 4.180910442924312e-06, |
|
"loss": 0.5841, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.5781391147244805, |
|
"grad_norm": 0.2199680438163748, |
|
"learning_rate": 4.1514095544966556e-06, |
|
"loss": 0.5671, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5799457994579946, |
|
"grad_norm": 0.22719849829451663, |
|
"learning_rate": 4.121939052380125e-06, |
|
"loss": 0.5634, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.5817524841915086, |
|
"grad_norm": 0.24575016551621642, |
|
"learning_rate": 4.092499991853919e-06, |
|
"loss": 0.585, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.5835591689250226, |
|
"grad_norm": 0.2165593982049528, |
|
"learning_rate": 4.063093427071376e-06, |
|
"loss": 0.5705, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.5853658536585366, |
|
"grad_norm": 0.21038397261423286, |
|
"learning_rate": 4.033720411022235e-06, |
|
"loss": 0.5509, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.5871725383920506, |
|
"grad_norm": 0.23381630506120754, |
|
"learning_rate": 4.0043819954949105e-06, |
|
"loss": 0.5692, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5889792231255646, |
|
"grad_norm": 0.22274092699155776, |
|
"learning_rate": 3.975079231038848e-06, |
|
"loss": 0.578, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.5907859078590786, |
|
"grad_norm": 0.23312487246943384, |
|
"learning_rate": 3.9458131669269066e-06, |
|
"loss": 0.5655, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.20940039382277698, |
|
"learning_rate": 3.916584851117766e-06, |
|
"loss": 0.5713, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.5943992773261066, |
|
"grad_norm": 0.20083788756986765, |
|
"learning_rate": 3.887395330218429e-06, |
|
"loss": 0.5623, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.5962059620596206, |
|
"grad_norm": 0.22293875603937713, |
|
"learning_rate": 3.8582456494467214e-06, |
|
"loss": 0.5694, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5980126467931346, |
|
"grad_norm": 0.21580894775676748, |
|
"learning_rate": 3.829136852593881e-06, |
|
"loss": 0.5741, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.5998193315266486, |
|
"grad_norm": 0.2084327717713985, |
|
"learning_rate": 3.8000699819871704e-06, |
|
"loss": 0.5568, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6016260162601627, |
|
"grad_norm": 0.19133614379073072, |
|
"learning_rate": 3.7710460784525617e-06, |
|
"loss": 0.5777, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.6034327009936766, |
|
"grad_norm": 0.21749963390685656, |
|
"learning_rate": 3.7420661812774577e-06, |
|
"loss": 0.5904, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6052393857271906, |
|
"grad_norm": 0.19792142717136593, |
|
"learning_rate": 3.7131313281734895e-06, |
|
"loss": 0.5727, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6070460704607046, |
|
"grad_norm": 0.20808382531187122, |
|
"learning_rate": 3.6842425552393424e-06, |
|
"loss": 0.5701, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6088527551942186, |
|
"grad_norm": 0.20007357870443254, |
|
"learning_rate": 3.655400896923672e-06, |
|
"loss": 0.5713, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6106594399277326, |
|
"grad_norm": 0.21439635973420199, |
|
"learning_rate": 3.62660738598805e-06, |
|
"loss": 0.5669, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6124661246612466, |
|
"grad_norm": 0.19649367243591512, |
|
"learning_rate": 3.5978630534699873e-06, |
|
"loss": 0.5756, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6142728093947606, |
|
"grad_norm": 0.1993688643471656, |
|
"learning_rate": 3.5691689286460172e-06, |
|
"loss": 0.571, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6160794941282746, |
|
"grad_norm": 0.1993509321428296, |
|
"learning_rate": 3.540526038994834e-06, |
|
"loss": 0.5695, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6178861788617886, |
|
"grad_norm": 0.21021124016240533, |
|
"learning_rate": 3.5119354101605086e-06, |
|
"loss": 0.573, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6196928635953026, |
|
"grad_norm": 0.20907305981232832, |
|
"learning_rate": 3.4833980659157507e-06, |
|
"loss": 0.5673, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6214995483288166, |
|
"grad_norm": 0.19471235958062905, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.5569, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6233062330623306, |
|
"grad_norm": 0.22694431095992104, |
|
"learning_rate": 3.4264873167091405e-06, |
|
"loss": 0.5711, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6251129177958447, |
|
"grad_norm": 0.20864419691174535, |
|
"learning_rate": 3.398115949606352e-06, |
|
"loss": 0.5725, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6269196025293586, |
|
"grad_norm": 0.192989073538916, |
|
"learning_rate": 3.3698019427382912e-06, |
|
"loss": 0.5577, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6287262872628726, |
|
"grad_norm": 0.22659064089939246, |
|
"learning_rate": 3.341546309972398e-06, |
|
"loss": 0.5589, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6305329719963866, |
|
"grad_norm": 0.19775912613412447, |
|
"learning_rate": 3.3133500630858507e-06, |
|
"loss": 0.5618, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.6323396567299007, |
|
"grad_norm": 0.21221541860013762, |
|
"learning_rate": 3.2852142117293435e-06, |
|
"loss": 0.5742, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6341463414634146, |
|
"grad_norm": 0.1965105061000691, |
|
"learning_rate": 3.2571397633909252e-06, |
|
"loss": 0.5641, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.6359530261969286, |
|
"grad_norm": 0.21779012635428768, |
|
"learning_rate": 3.229127723359927e-06, |
|
"loss": 0.578, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.6377597109304426, |
|
"grad_norm": 0.19836139294521862, |
|
"learning_rate": 3.2011790946909673e-06, |
|
"loss": 0.5781, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.6395663956639567, |
|
"grad_norm": 0.2122021344625228, |
|
"learning_rate": 3.173294878168025e-06, |
|
"loss": 0.5732, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.6413730803974707, |
|
"grad_norm": 0.2194958795560653, |
|
"learning_rate": 3.1454760722686206e-06, |
|
"loss": 0.5625, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.6431797651309846, |
|
"grad_norm": 0.19693232693468157, |
|
"learning_rate": 3.11772367312804e-06, |
|
"loss": 0.5772, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.6449864498644986, |
|
"grad_norm": 0.20175014232350474, |
|
"learning_rate": 3.090038674503688e-06, |
|
"loss": 0.5778, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.6467931345980127, |
|
"grad_norm": 0.2005114679348494, |
|
"learning_rate": 3.0624220677394854e-06, |
|
"loss": 0.5835, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.6485998193315267, |
|
"grad_norm": 0.21305517551950778, |
|
"learning_rate": 3.0348748417303826e-06, |
|
"loss": 0.5562, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.6504065040650406, |
|
"grad_norm": 0.2387797393224941, |
|
"learning_rate": 3.007397982886942e-06, |
|
"loss": 0.5649, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6522131887985546, |
|
"grad_norm": 0.19835214640988658, |
|
"learning_rate": 2.979992475100024e-06, |
|
"loss": 0.5707, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.6540198735320687, |
|
"grad_norm": 0.19057623785322997, |
|
"learning_rate": 2.9526592997055488e-06, |
|
"loss": 0.582, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.6558265582655827, |
|
"grad_norm": 0.21048961626325866, |
|
"learning_rate": 2.9253994354493575e-06, |
|
"loss": 0.5726, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.6576332429990966, |
|
"grad_norm": 0.19890524691407938, |
|
"learning_rate": 2.8982138584521734e-06, |
|
"loss": 0.5714, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.6594399277326106, |
|
"grad_norm": 0.20817182309526563, |
|
"learning_rate": 2.871103542174637e-06, |
|
"loss": 0.5634, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.6612466124661247, |
|
"grad_norm": 0.20749634910701936, |
|
"learning_rate": 2.844069457382459e-06, |
|
"loss": 0.5851, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.6630532971996387, |
|
"grad_norm": 0.19371505947939419, |
|
"learning_rate": 2.817112572111651e-06, |
|
"loss": 0.5665, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.6648599819331527, |
|
"grad_norm": 0.1985051207678445, |
|
"learning_rate": 2.790233851633868e-06, |
|
"loss": 0.5781, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.2075475783976387, |
|
"learning_rate": 2.7634342584218364e-06, |
|
"loss": 0.579, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.6684733514001807, |
|
"grad_norm": 0.20221005818139545, |
|
"learning_rate": 2.7367147521149052e-06, |
|
"loss": 0.5775, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6702800361336947, |
|
"grad_norm": 0.20777755987287125, |
|
"learning_rate": 2.7100762894846633e-06, |
|
"loss": 0.5656, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.6720867208672087, |
|
"grad_norm": 0.19802371508565084, |
|
"learning_rate": 2.683519824400693e-06, |
|
"loss": 0.5832, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.6738934056007226, |
|
"grad_norm": 0.19817132297037005, |
|
"learning_rate": 2.657046307796407e-06, |
|
"loss": 0.5691, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.6757000903342367, |
|
"grad_norm": 0.18886995958743352, |
|
"learning_rate": 2.6306566876350072e-06, |
|
"loss": 0.5583, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.6775067750677507, |
|
"grad_norm": 0.19326300664336102, |
|
"learning_rate": 2.6043519088755263e-06, |
|
"loss": 0.5731, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.6793134598012647, |
|
"grad_norm": 0.19042051149412062, |
|
"learning_rate": 2.578132913439e-06, |
|
"loss": 0.5577, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.6811201445347786, |
|
"grad_norm": 0.18993813802274045, |
|
"learning_rate": 2.55200064017474e-06, |
|
"loss": 0.5736, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.6829268292682927, |
|
"grad_norm": 0.18809021990235358, |
|
"learning_rate": 2.5259560248267022e-06, |
|
"loss": 0.5744, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.6847335140018067, |
|
"grad_norm": 0.20746435742973107, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 0.5772, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.6865401987353207, |
|
"grad_norm": 0.181205508307499, |
|
"learning_rate": 2.4741334951274948e-06, |
|
"loss": 0.5579, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6883468834688347, |
|
"grad_norm": 0.21976859288827372, |
|
"learning_rate": 2.448357436436519e-06, |
|
"loss": 0.5743, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.6901535682023487, |
|
"grad_norm": 0.18988860188680556, |
|
"learning_rate": 2.4226727469157097e-06, |
|
"loss": 0.5619, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.6919602529358627, |
|
"grad_norm": 0.18800207830401067, |
|
"learning_rate": 2.3970803462819586e-06, |
|
"loss": 0.581, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.6937669376693767, |
|
"grad_norm": 0.20574302994251628, |
|
"learning_rate": 2.371581150947476e-06, |
|
"loss": 0.5792, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.6955736224028907, |
|
"grad_norm": 0.1997912442763758, |
|
"learning_rate": 2.3461760739869865e-06, |
|
"loss": 0.5613, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6973803071364046, |
|
"grad_norm": 0.19311780562623543, |
|
"learning_rate": 2.320866025105016e-06, |
|
"loss": 0.5727, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.6991869918699187, |
|
"grad_norm": 0.20388178567890022, |
|
"learning_rate": 2.2956519106033366e-06, |
|
"loss": 0.5729, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.7009936766034327, |
|
"grad_norm": 0.2078954929763357, |
|
"learning_rate": 2.2705346333484925e-06, |
|
"loss": 0.5723, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.7028003613369467, |
|
"grad_norm": 0.23477627938842424, |
|
"learning_rate": 2.245515092739488e-06, |
|
"loss": 0.5752, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.7046070460704607, |
|
"grad_norm": 0.19482153687355794, |
|
"learning_rate": 2.2205941846755787e-06, |
|
"loss": 0.5685, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7064137308039747, |
|
"grad_norm": 0.19531470723741354, |
|
"learning_rate": 2.1957728015241793e-06, |
|
"loss": 0.5691, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.7082204155374887, |
|
"grad_norm": 0.20096947462073975, |
|
"learning_rate": 2.171051832088928e-06, |
|
"loss": 0.575, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7100271002710027, |
|
"grad_norm": 0.20294226078460895, |
|
"learning_rate": 2.146432161577842e-06, |
|
"loss": 0.5798, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.7118337850045167, |
|
"grad_norm": 0.1993751736574143, |
|
"learning_rate": 2.1219146715716332e-06, |
|
"loss": 0.582, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7136404697380307, |
|
"grad_norm": 0.20494626377292469, |
|
"learning_rate": 2.097500239992132e-06, |
|
"loss": 0.5774, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7154471544715447, |
|
"grad_norm": 0.21134391941418917, |
|
"learning_rate": 2.0731897410708618e-06, |
|
"loss": 0.571, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7172538392050587, |
|
"grad_norm": 0.19571464617790202, |
|
"learning_rate": 2.0489840453177198e-06, |
|
"loss": 0.5703, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7190605239385727, |
|
"grad_norm": 0.2116541881285577, |
|
"learning_rate": 2.0248840194898155e-06, |
|
"loss": 0.5717, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7208672086720868, |
|
"grad_norm": 0.1826612419863307, |
|
"learning_rate": 2.0008905265604316e-06, |
|
"loss": 0.5756, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7226738934056007, |
|
"grad_norm": 0.19748539528420223, |
|
"learning_rate": 1.977004425688126e-06, |
|
"loss": 0.5719, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 553, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 463977396436992.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |