{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9781021897810218,
  "eval_steps": 68,
  "global_step": 136,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014598540145985401,
      "grad_norm": 6.6188509609874355,
      "learning_rate": 1e-05,
      "loss": 0.7522,
      "step": 1
    },
    {
      "epoch": 0.029197080291970802,
      "grad_norm": 6.574902892362282,
      "learning_rate": 2e-05,
      "loss": 0.7555,
      "step": 2
    },
    {
      "epoch": 0.043795620437956206,
      "grad_norm": 4.116760676302817,
      "learning_rate": 3e-05,
      "loss": 0.7337,
      "step": 3
    },
    {
      "epoch": 0.058394160583941604,
      "grad_norm": 6.487316576987083,
      "learning_rate": 4e-05,
      "loss": 0.752,
      "step": 4
    },
    {
      "epoch": 0.072992700729927,
      "grad_norm": 15.034094104997072,
      "learning_rate": 5e-05,
      "loss": 0.8254,
      "step": 5
    },
    {
      "epoch": 0.08759124087591241,
      "grad_norm": 8.121100633280987,
      "learning_rate": 4.9992811366328926e-05,
      "loss": 0.9564,
      "step": 6
    },
    {
      "epoch": 0.10218978102189781,
      "grad_norm": 8.8258406941248,
      "learning_rate": 4.997124959943201e-05,
      "loss": 0.8104,
      "step": 7
    },
    {
      "epoch": 0.11678832116788321,
      "grad_norm": 6.295661645639767,
      "learning_rate": 4.993532709928075e-05,
      "loss": 0.7748,
      "step": 8
    },
    {
      "epoch": 0.13138686131386862,
      "grad_norm": 8.793469060407903,
      "learning_rate": 4.9885064524570665e-05,
      "loss": 0.7102,
      "step": 9
    },
    {
      "epoch": 0.145985401459854,
      "grad_norm": 4.105212126064049,
      "learning_rate": 4.982049078084071e-05,
      "loss": 0.6743,
      "step": 10
    },
    {
      "epoch": 0.16058394160583941,
      "grad_norm": 2.039360680236883,
      "learning_rate": 4.974164300384998e-05,
      "loss": 0.6023,
      "step": 11
    },
    {
      "epoch": 0.17518248175182483,
      "grad_norm": 2.0703841635505706,
      "learning_rate": 4.964856653822122e-05,
      "loss": 0.5914,
      "step": 12
    },
    {
      "epoch": 0.1897810218978102,
      "grad_norm": 2.216652524604031,
      "learning_rate": 4.954131491136362e-05,
      "loss": 0.5223,
      "step": 13
    },
    {
      "epoch": 0.20437956204379562,
      "grad_norm": 2.068081403621057,
      "learning_rate": 4.9419949802689666e-05,
      "loss": 0.4807,
      "step": 14
    },
    {
      "epoch": 0.21897810218978103,
      "grad_norm": 2.2137900547871485,
      "learning_rate": 4.92845410081439e-05,
      "loss": 0.481,
      "step": 15
    },
    {
      "epoch": 0.23357664233576642,
      "grad_norm": 3.863818843522576,
      "learning_rate": 4.913516640006392e-05,
      "loss": 0.4943,
      "step": 16
    },
    {
      "epoch": 0.24817518248175183,
      "grad_norm": 5.538121594510211,
      "learning_rate": 4.897191188239667e-05,
      "loss": 0.4939,
      "step": 17
    },
    {
      "epoch": 0.26277372262773724,
      "grad_norm": 2.110050058368509,
      "learning_rate": 4.8794871341296e-05,
      "loss": 0.4579,
      "step": 18
    },
    {
      "epoch": 0.2773722627737226,
      "grad_norm": 1.6455776595109672,
      "learning_rate": 4.8604146591129485e-05,
      "loss": 0.4315,
      "step": 19
    },
    {
      "epoch": 0.291970802919708,
      "grad_norm": 1.6699338063986153,
      "learning_rate": 4.8399847315926e-05,
      "loss": 0.4436,
      "step": 20
    },
    {
      "epoch": 0.30656934306569344,
      "grad_norm": 1.6382590699916768,
      "learning_rate": 4.818209100629745e-05,
      "loss": 0.4024,
      "step": 21
    },
    {
      "epoch": 0.32116788321167883,
      "grad_norm": 1.473696036952172,
      "learning_rate": 4.795100289187099e-05,
      "loss": 0.407,
      "step": 22
    },
    {
      "epoch": 0.3357664233576642,
      "grad_norm": 1.5431679072262796,
      "learning_rate": 4.7706715869270635e-05,
      "loss": 0.407,
      "step": 23
    },
    {
      "epoch": 0.35036496350364965,
      "grad_norm": 1.5667303006045037,
      "learning_rate": 4.74493704256897e-05,
      "loss": 0.3976,
      "step": 24
    },
    {
      "epoch": 0.36496350364963503,
      "grad_norm": 1.4086778511120994,
      "learning_rate": 4.717911455809782e-05,
      "loss": 0.3932,
      "step": 25
    },
    {
      "epoch": 0.3795620437956204,
      "grad_norm": 1.255371117436458,
      "learning_rate": 4.6896103688129385e-05,
      "loss": 0.3514,
      "step": 26
    },
    {
      "epoch": 0.39416058394160586,
      "grad_norm": 1.580105274585517,
      "learning_rate": 4.660050057270191e-05,
      "loss": 0.3679,
      "step": 27
    },
    {
      "epoch": 0.40875912408759124,
      "grad_norm": 1.5330542124088484,
      "learning_rate": 4.6292475210416106e-05,
      "loss": 0.3809,
      "step": 28
    },
    {
      "epoch": 0.4233576642335766,
      "grad_norm": 1.36659727223936,
      "learning_rate": 4.597220474379125e-05,
      "loss": 0.3396,
      "step": 29
    },
    {
      "epoch": 0.43795620437956206,
      "grad_norm": 1.4830662677819917,
      "learning_rate": 4.563987335739216e-05,
      "loss": 0.3581,
      "step": 30
    },
    {
      "epoch": 0.45255474452554745,
      "grad_norm": 1.579952170056348,
      "learning_rate": 4.5295672171906364e-05,
      "loss": 0.3494,
      "step": 31
    },
    {
      "epoch": 0.46715328467153283,
      "grad_norm": 1.5223723528568243,
      "learning_rate": 4.49397991342324e-05,
      "loss": 0.3534,
      "step": 32
    },
    {
      "epoch": 0.48175182481751827,
      "grad_norm": 1.4533190235735434,
      "learning_rate": 4.4572458903642354e-05,
      "loss": 0.35,
      "step": 33
    },
    {
      "epoch": 0.49635036496350365,
      "grad_norm": 1.273536091739147,
      "learning_rate": 4.419386273408428e-05,
      "loss": 0.3539,
      "step": 34
    },
    {
      "epoch": 0.5109489051094891,
      "grad_norm": 1.2519284173239225,
      "learning_rate": 4.3804228352691935e-05,
      "loss": 0.3434,
      "step": 35
    },
    {
      "epoch": 0.5255474452554745,
      "grad_norm": 1.2947790542911022,
      "learning_rate": 4.3403779834572004e-05,
      "loss": 0.3446,
      "step": 36
    },
    {
      "epoch": 0.5401459854014599,
      "grad_norm": 1.4271186712542432,
      "learning_rate": 4.2992747473940556e-05,
      "loss": 0.3485,
      "step": 37
    },
    {
      "epoch": 0.5547445255474452,
      "grad_norm": 1.1763950013950975,
      "learning_rate": 4.2571367651683e-05,
      "loss": 0.3117,
      "step": 38
    },
    {
      "epoch": 0.5693430656934306,
      "grad_norm": 1.1117128867059132,
      "learning_rate": 4.213988269941362e-05,
      "loss": 0.2661,
      "step": 39
    },
    {
      "epoch": 0.583941605839416,
      "grad_norm": 1.2354957130115665,
      "learning_rate": 4.169854076011292e-05,
      "loss": 0.3013,
      "step": 40
    },
    {
      "epoch": 0.5985401459854015,
      "grad_norm": 1.163629462061701,
      "learning_rate": 4.124759564542295e-05,
      "loss": 0.293,
      "step": 41
    },
    {
      "epoch": 0.6131386861313869,
      "grad_norm": 1.1963752057859445,
      "learning_rate": 4.078730668968253e-05,
      "loss": 0.2704,
      "step": 42
    },
    {
      "epoch": 0.6277372262773723,
      "grad_norm": 1.1433957305246072,
      "learning_rate": 4.031793860078649e-05,
      "loss": 0.2798,
      "step": 43
    },
    {
      "epoch": 0.6423357664233577,
      "grad_norm": 1.0742244561363925,
      "learning_rate": 3.9839761307954675e-05,
      "loss": 0.2668,
      "step": 44
    },
    {
      "epoch": 0.656934306569343,
      "grad_norm": 1.139472353645151,
      "learning_rate": 3.935304980649813e-05,
      "loss": 0.3206,
      "step": 45
    },
    {
      "epoch": 0.6715328467153284,
      "grad_norm": 1.1118011686610685,
      "learning_rate": 3.8858083999671855e-05,
      "loss": 0.28,
      "step": 46
    },
    {
      "epoch": 0.6861313868613139,
      "grad_norm": 1.105867383261182,
      "learning_rate": 3.835514853770505e-05,
      "loss": 0.2861,
      "step": 47
    },
    {
      "epoch": 0.7007299270072993,
      "grad_norm": 1.0643135219882822,
      "learning_rate": 3.784453265410141e-05,
      "loss": 0.2677,
      "step": 48
    },
    {
      "epoch": 0.7153284671532847,
      "grad_norm": 0.970941214477793,
      "learning_rate": 3.732652999930364e-05,
      "loss": 0.2419,
      "step": 49
    },
    {
      "epoch": 0.7299270072992701,
      "grad_norm": 1.2039034119373921,
      "learning_rate": 3.680143847181783e-05,
      "loss": 0.2817,
      "step": 50
    },
    {
      "epoch": 0.7445255474452555,
      "grad_norm": 1.3763613598969262,
      "learning_rate": 3.6269560046894766e-05,
      "loss": 0.2813,
      "step": 51
    },
    {
      "epoch": 0.7591240875912408,
      "grad_norm": 1.0917986372749406,
      "learning_rate": 3.573120060286679e-05,
      "loss": 0.2434,
      "step": 52
    },
    {
      "epoch": 0.7737226277372263,
      "grad_norm": 1.2010408408369964,
      "learning_rate": 3.5186669745240026e-05,
      "loss": 0.266,
      "step": 53
    },
    {
      "epoch": 0.7883211678832117,
      "grad_norm": 0.9851709556799068,
      "learning_rate": 3.463628062864312e-05,
      "loss": 0.2401,
      "step": 54
    },
    {
      "epoch": 0.8029197080291971,
      "grad_norm": 1.027902148221402,
      "learning_rate": 3.4080349776734925e-05,
      "loss": 0.2396,
      "step": 55
    },
    {
      "epoch": 0.8175182481751825,
      "grad_norm": 1.038222274485028,
      "learning_rate": 3.351919690017473e-05,
      "loss": 0.2396,
      "step": 56
    },
    {
      "epoch": 0.8321167883211679,
      "grad_norm": 0.9906398418415747,
      "learning_rate": 3.2953144712759545e-05,
      "loss": 0.2435,
      "step": 57
    },
    {
      "epoch": 0.8467153284671532,
      "grad_norm": 0.9958710262970785,
      "learning_rate": 3.238251874583452e-05,
      "loss": 0.2585,
      "step": 58
    },
    {
      "epoch": 0.8613138686131386,
      "grad_norm": 0.9280508074490356,
      "learning_rate": 3.1807647161082795e-05,
      "loss": 0.2328,
      "step": 59
    },
    {
      "epoch": 0.8759124087591241,
      "grad_norm": 0.9551807399682243,
      "learning_rate": 3.122886056180284e-05,
      "loss": 0.248,
      "step": 60
    },
    {
      "epoch": 0.8905109489051095,
      "grad_norm": 1.0680823587174857,
      "learning_rate": 3.064649180278152e-05,
      "loss": 0.2766,
      "step": 61
    },
    {
      "epoch": 0.9051094890510949,
      "grad_norm": 2.730287207993988,
      "learning_rate": 3.006087579887244e-05,
      "loss": 0.2274,
      "step": 62
    },
    {
      "epoch": 0.9197080291970803,
      "grad_norm": 1.0477423658369303,
      "learning_rate": 2.9472349332389525e-05,
      "loss": 0.2163,
      "step": 63
    },
    {
      "epoch": 0.9343065693430657,
      "grad_norm": 1.0633922453008755,
      "learning_rate": 2.8881250859426646e-05,
      "loss": 0.2552,
      "step": 64
    },
    {
      "epoch": 0.948905109489051,
      "grad_norm": 0.940535973392031,
      "learning_rate": 2.8287920315214643e-05,
      "loss": 0.2268,
      "step": 65
    },
    {
      "epoch": 0.9635036496350365,
      "grad_norm": 0.8335596155443756,
      "learning_rate": 2.7692698918627778e-05,
      "loss": 0.2051,
      "step": 66
    },
    {
      "epoch": 0.9781021897810219,
      "grad_norm": 0.9709259192327347,
      "learning_rate": 2.7095928975951913e-05,
      "loss": 0.238,
      "step": 67
    },
    {
      "epoch": 0.9927007299270073,
      "grad_norm": 0.9181878310139981,
      "learning_rate": 2.649795368402735e-05,
      "loss": 0.2607,
      "step": 68
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.9181878310139981,
      "learning_rate": 2.649795368402735e-05,
      "loss": 0.1126,
      "step": 69
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.5867471694946289,
      "eval_runtime": 2.5685,
      "eval_samples_per_second": 27.643,
      "eval_steps_per_second": 1.168,
      "step": 69
    },
    {
      "epoch": 1.0145985401459854,
      "grad_norm": 0.817428076729955,
      "learning_rate": 2.5899116932879534e-05,
      "loss": 0.1198,
      "step": 70
    },
    {
      "epoch": 1.0291970802919708,
      "grad_norm": 0.7442926018853524,
      "learning_rate": 2.529976310795108e-05,
      "loss": 0.1285,
      "step": 71
    },
    {
      "epoch": 1.0437956204379562,
      "grad_norm": 0.782523876321553,
      "learning_rate": 2.470023689204893e-05,
      "loss": 0.1272,
      "step": 72
    },
    {
      "epoch": 1.0583941605839415,
      "grad_norm": 0.7879156799249153,
      "learning_rate": 2.4100883067120475e-05,
      "loss": 0.114,
      "step": 73
    },
    {
      "epoch": 1.072992700729927,
      "grad_norm": 0.6738289101740561,
      "learning_rate": 2.3502046315972656e-05,
      "loss": 0.1161,
      "step": 74
    },
    {
      "epoch": 1.0875912408759123,
      "grad_norm": 0.7921210574551151,
      "learning_rate": 2.290407102404809e-05,
      "loss": 0.1078,
      "step": 75
    },
    {
      "epoch": 1.102189781021898,
      "grad_norm": 0.7872824090927025,
      "learning_rate": 2.2307301081372224e-05,
      "loss": 0.1311,
      "step": 76
    },
    {
      "epoch": 1.1167883211678833,
      "grad_norm": 0.8331285832919668,
      "learning_rate": 2.1712079684785363e-05,
      "loss": 0.0932,
      "step": 77
    },
    {
      "epoch": 1.1313868613138687,
      "grad_norm": 0.7662201225372701,
      "learning_rate": 2.111874914057336e-05,
      "loss": 0.1339,
      "step": 78
    },
    {
      "epoch": 1.145985401459854,
      "grad_norm": 0.8748763622681786,
      "learning_rate": 2.0527650667610478e-05,
      "loss": 0.1328,
      "step": 79
    },
    {
      "epoch": 1.1605839416058394,
      "grad_norm": 0.764695112873272,
      "learning_rate": 1.993912420112756e-05,
      "loss": 0.0968,
      "step": 80
    },
    {
      "epoch": 1.1751824817518248,
      "grad_norm": 0.8003697459148763,
      "learning_rate": 1.935350819721849e-05,
      "loss": 0.1303,
      "step": 81
    },
    {
      "epoch": 1.1897810218978102,
      "grad_norm": 0.8866466859845482,
      "learning_rate": 1.8771139438197168e-05,
      "loss": 0.1273,
      "step": 82
    },
    {
      "epoch": 1.2043795620437956,
      "grad_norm": 0.7202427078538033,
      "learning_rate": 1.819235283891721e-05,
      "loss": 0.1036,
      "step": 83
    },
    {
      "epoch": 1.218978102189781,
      "grad_norm": 0.6939994466496473,
      "learning_rate": 1.7617481254165487e-05,
      "loss": 0.1166,
      "step": 84
    },
    {
      "epoch": 1.2335766423357664,
      "grad_norm": 0.6773010302740605,
      "learning_rate": 1.704685528724046e-05,
      "loss": 0.1239,
      "step": 85
    },
    {
      "epoch": 1.2481751824817517,
      "grad_norm": 0.8057614185871913,
      "learning_rate": 1.648080309982528e-05,
      "loss": 0.1269,
      "step": 86
    },
    {
      "epoch": 1.2627737226277373,
      "grad_norm": 0.738081394228655,
      "learning_rate": 1.591965022326507e-05,
      "loss": 0.1184,
      "step": 87
    },
    {
      "epoch": 1.2773722627737225,
      "grad_norm": 0.7034144989855043,
      "learning_rate": 1.536371937135688e-05,
      "loss": 0.105,
      "step": 88
    },
    {
      "epoch": 1.2919708029197081,
      "grad_norm": 0.809614248272801,
      "learning_rate": 1.4813330254759985e-05,
      "loss": 0.1299,
      "step": 89
    },
    {
      "epoch": 1.3065693430656935,
      "grad_norm": 0.7359581131995093,
      "learning_rate": 1.426879939713322e-05,
      "loss": 0.0931,
      "step": 90
    },
    {
      "epoch": 1.3211678832116789,
      "grad_norm": 0.6271750011013629,
      "learning_rate": 1.3730439953105243e-05,
      "loss": 0.1,
      "step": 91
    },
    {
      "epoch": 1.3357664233576643,
      "grad_norm": 0.6681653257507313,
      "learning_rate": 1.3198561528182183e-05,
      "loss": 0.0882,
      "step": 92
    },
    {
      "epoch": 1.3503649635036497,
      "grad_norm": 0.5892983169851506,
      "learning_rate": 1.2673470000696364e-05,
      "loss": 0.119,
      "step": 93
    },
    {
      "epoch": 1.364963503649635,
      "grad_norm": 0.7781672091856281,
      "learning_rate": 1.2155467345898602e-05,
      "loss": 0.1235,
      "step": 94
    },
    {
      "epoch": 1.3795620437956204,
      "grad_norm": 0.7457533748551523,
      "learning_rate": 1.1644851462294957e-05,
      "loss": 0.1153,
      "step": 95
    },
    {
      "epoch": 1.3941605839416058,
      "grad_norm": 0.7719451070162108,
      "learning_rate": 1.114191600032815e-05,
      "loss": 0.1108,
      "step": 96
    },
    {
      "epoch": 1.4087591240875912,
      "grad_norm": 0.6456397546412238,
      "learning_rate": 1.064695019350187e-05,
      "loss": 0.0966,
      "step": 97
    },
    {
      "epoch": 1.4233576642335766,
      "grad_norm": 0.7075931478021869,
      "learning_rate": 1.0160238692045332e-05,
      "loss": 0.122,
      "step": 98
    },
    {
      "epoch": 1.437956204379562,
      "grad_norm": 0.7164358739819071,
      "learning_rate": 9.682061399213525e-06,
      "loss": 0.1073,
      "step": 99
    },
    {
      "epoch": 1.4525547445255476,
      "grad_norm": 0.6742795850736939,
      "learning_rate": 9.21269331031748e-06,
      "loss": 0.1082,
      "step": 100
    },
    {
      "epoch": 1.4671532846715327,
      "grad_norm": 0.664349783270638,
      "learning_rate": 8.752404354577052e-06,
      "loss": 0.1086,
      "step": 101
    },
    {
      "epoch": 1.4817518248175183,
      "grad_norm": 0.6104029576807147,
      "learning_rate": 8.301459239887074e-06,
      "loss": 0.095,
      "step": 102
    },
    {
      "epoch": 1.4963503649635037,
      "grad_norm": 0.9865461135511884,
      "learning_rate": 7.860117300586383e-06,
      "loss": 0.1213,
      "step": 103
    },
    {
      "epoch": 1.510948905109489,
      "grad_norm": 0.6388824796222101,
      "learning_rate": 7.428632348317005e-06,
      "loss": 0.082,
      "step": 104
    },
    {
      "epoch": 1.5255474452554745,
      "grad_norm": 0.5887544819341681,
      "learning_rate": 7.007252526059446e-06,
      "loss": 0.0807,
      "step": 105
    },
    {
      "epoch": 1.5401459854014599,
      "grad_norm": 0.5287062328074978,
      "learning_rate": 6.596220165428002e-06,
      "loss": 0.0946,
      "step": 106
    },
    {
      "epoch": 1.5547445255474452,
      "grad_norm": 0.693208580525385,
      "learning_rate": 6.195771647308074e-06,
      "loss": 0.116,
      "step": 107
    },
    {
      "epoch": 1.5693430656934306,
      "grad_norm": 0.6615214534660775,
      "learning_rate": 5.806137265915732e-06,
      "loss": 0.1029,
      "step": 108
    },
    {
      "epoch": 1.583941605839416,
      "grad_norm": 0.6252962266934661,
      "learning_rate": 5.427541096357647e-06,
      "loss": 0.0999,
      "step": 109
    },
    {
      "epoch": 1.5985401459854014,
      "grad_norm": 0.6524052423711888,
      "learning_rate": 5.060200865767606e-06,
      "loss": 0.1018,
      "step": 110
    },
    {
      "epoch": 1.613138686131387,
      "grad_norm": 0.67321527870827,
      "learning_rate": 4.704327828093641e-06,
      "loss": 0.1015,
      "step": 111
    },
    {
      "epoch": 1.6277372262773722,
      "grad_norm": 0.6002708673392801,
      "learning_rate": 4.3601266426078426e-06,
      "loss": 0.0919,
      "step": 112
    },
    {
      "epoch": 1.6423357664233578,
      "grad_norm": 0.6354891189727261,
      "learning_rate": 4.02779525620875e-06,
      "loss": 0.0962,
      "step": 113
    },
    {
      "epoch": 1.656934306569343,
      "grad_norm": 0.6211165567663863,
      "learning_rate": 3.707524789583891e-06,
      "loss": 0.0838,
      "step": 114
    },
    {
      "epoch": 1.6715328467153285,
      "grad_norm": 0.6058146661351359,
      "learning_rate": 3.3994994272980946e-06,
      "loss": 0.0965,
      "step": 115
    },
    {
      "epoch": 1.686131386861314,
      "grad_norm": 0.6098982651990246,
      "learning_rate": 3.1038963118706244e-06,
      "loss": 0.1097,
      "step": 116
    },
    {
      "epoch": 1.7007299270072993,
      "grad_norm": 0.6644009843756008,
      "learning_rate": 2.8208854419021824e-06,
      "loss": 0.0903,
      "step": 117
    },
    {
      "epoch": 1.7153284671532847,
      "grad_norm": 0.5210745143934205,
      "learning_rate": 2.5506295743103094e-06,
      "loss": 0.0717,
      "step": 118
    },
    {
      "epoch": 1.72992700729927,
      "grad_norm": 0.6089605504043503,
      "learning_rate": 2.2932841307293644e-06,
      "loss": 0.103,
      "step": 119
    },
    {
      "epoch": 1.7445255474452555,
      "grad_norm": 0.6090432120427985,
      "learning_rate": 2.0489971081290195e-06,
      "loss": 0.0895,
      "step": 120
    },
    {
      "epoch": 1.7591240875912408,
      "grad_norm": 0.5490697461707983,
      "learning_rate": 1.817908993702555e-06,
      "loss": 0.0843,
      "step": 121
    },
    {
      "epoch": 1.7737226277372264,
      "grad_norm": 0.5710673269130835,
      "learning_rate": 1.6001526840740049e-06,
      "loss": 0.0906,
      "step": 122
    },
    {
      "epoch": 1.7883211678832116,
      "grad_norm": 0.6245928434878661,
      "learning_rate": 1.3958534088705206e-06,
      "loss": 0.0776,
      "step": 123
    },
    {
      "epoch": 1.8029197080291972,
      "grad_norm": 0.549963273284548,
      "learning_rate": 1.205128658704005e-06,
      "loss": 0.0803,
      "step": 124
    },
    {
      "epoch": 1.8175182481751824,
      "grad_norm": 0.6057327330974007,
      "learning_rate": 1.028088117603332e-06,
      "loss": 0.0714,
      "step": 125
    },
    {
      "epoch": 1.832116788321168,
      "grad_norm": 0.5865246963245718,
      "learning_rate": 8.648335999360935e-07,
      "loss": 0.0967,
      "step": 126
    },
    {
      "epoch": 1.8467153284671531,
      "grad_norm": 0.5746297279399645,
      "learning_rate": 7.154589918561022e-07,
      "loss": 0.0849,
      "step": 127
    },
    {
      "epoch": 1.8613138686131387,
      "grad_norm": 0.600256652314103,
      "learning_rate": 5.800501973103362e-07,
      "loss": 0.0916,
      "step": 128
    },
    {
      "epoch": 1.8759124087591241,
      "grad_norm": 0.6555734310326506,
      "learning_rate": 4.586850886363875e-07,
      "loss": 0.1204,
      "step": 129
    },
    {
      "epoch": 1.8905109489051095,
      "grad_norm": 0.6566157654207085,
      "learning_rate": 3.514334617787857e-07,
      "loss": 0.081,
      "step": 130
    },
    {
      "epoch": 1.905109489051095,
      "grad_norm": 0.5292071962356375,
      "learning_rate": 2.5835699615002764e-07,
      "loss": 0.0727,
      "step": 131
    },
    {
      "epoch": 1.9197080291970803,
      "grad_norm": 0.6235513391783781,
      "learning_rate": 1.7950921915928788e-07,
      "loss": 0.0967,
      "step": 132
    },
    {
      "epoch": 1.9343065693430657,
      "grad_norm": 0.6604731273435182,
      "learning_rate": 1.1493547542933969e-07,
      "loss": 0.1037,
      "step": 133
    },
    {
      "epoch": 1.948905109489051,
      "grad_norm": 0.6217262060263744,
      "learning_rate": 6.467290071925647e-08,
      "loss": 0.0711,
      "step": 134
    },
    {
      "epoch": 1.9635036496350367,
      "grad_norm": 0.535719020023098,
      "learning_rate": 2.8750400567992274e-08,
      "loss": 0.0799,
      "step": 135
    },
    {
      "epoch": 1.9781021897810218,
      "grad_norm": 0.6249296896177375,
      "learning_rate": 7.188633671079137e-09,
      "loss": 0.097,
      "step": 136
    },
    {
      "epoch": 1.9781021897810218,
      "eval_loss": 0.6955251693725586,
      "eval_runtime": 1.1203,
      "eval_samples_per_second": 63.374,
      "eval_steps_per_second": 2.678,
      "step": 136
    },
    {
      "epoch": 1.9781021897810218,
      "step": 136,
      "total_flos": 79316025114624.0,
      "train_loss": 0.2505467278773294,
      "train_runtime": 551.7554,
      "train_samples_per_second": 19.886,
      "train_steps_per_second": 0.246
    }
  ],
  "logging_steps": 1,
  "max_steps": 136,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 1,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 79316025114624.0,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}