|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 93, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03225806451612903, |
|
"grad_norm": 14.220000481200906, |
|
"learning_rate": 5e-06, |
|
"loss": 13.9265, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.06451612903225806, |
|
"grad_norm": 14.702762596013635, |
|
"learning_rate": 1e-05, |
|
"loss": 13.95, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0967741935483871, |
|
"grad_norm": 14.297305924795248, |
|
"learning_rate": 1.5e-05, |
|
"loss": 13.9116, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.12903225806451613, |
|
"grad_norm": 14.484767601202535, |
|
"learning_rate": 2e-05, |
|
"loss": 13.753, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 14.480323510985746, |
|
"learning_rate": 2.5e-05, |
|
"loss": 13.222, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1935483870967742, |
|
"grad_norm": 14.002264166506258, |
|
"learning_rate": 3e-05, |
|
"loss": 12.1647, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.22580645161290322, |
|
"grad_norm": 7.226601449948956, |
|
"learning_rate": 3.5e-05, |
|
"loss": 11.1642, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25806451612903225, |
|
"grad_norm": 13.608621977990989, |
|
"learning_rate": 4e-05, |
|
"loss": 10.8504, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.2903225806451613, |
|
"grad_norm": 6.436610267622404, |
|
"learning_rate": 4.5e-05, |
|
"loss": 10.4279, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 4.922039728837155, |
|
"learning_rate": 5e-05, |
|
"loss": 10.1287, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3548387096774194, |
|
"grad_norm": 12.033200559896779, |
|
"learning_rate": 4.9397590361445786e-05, |
|
"loss": 9.9607, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.3870967741935484, |
|
"grad_norm": 3.5396711201156807, |
|
"learning_rate": 4.879518072289157e-05, |
|
"loss": 9.6616, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.41935483870967744, |
|
"grad_norm": 3.4751992441084534, |
|
"learning_rate": 4.8192771084337354e-05, |
|
"loss": 9.3783, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.45161290322580644, |
|
"grad_norm": 3.9005409440562735, |
|
"learning_rate": 4.759036144578313e-05, |
|
"loss": 9.1355, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 2.8756405244423915, |
|
"learning_rate": 4.698795180722892e-05, |
|
"loss": 8.8203, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5161290322580645, |
|
"grad_norm": 3.9300784221492027, |
|
"learning_rate": 4.63855421686747e-05, |
|
"loss": 8.5065, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5483870967741935, |
|
"grad_norm": 5.84606546682727, |
|
"learning_rate": 4.578313253012048e-05, |
|
"loss": 8.3534, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5806451612903226, |
|
"grad_norm": 5.025102319247832, |
|
"learning_rate": 4.5180722891566266e-05, |
|
"loss": 8.1611, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.6129032258064516, |
|
"grad_norm": 3.5261942209741486, |
|
"learning_rate": 4.457831325301205e-05, |
|
"loss": 8.0223, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 8.46799283954891, |
|
"learning_rate": 4.3975903614457834e-05, |
|
"loss": 7.8982, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6774193548387096, |
|
"grad_norm": 3.926166227638816, |
|
"learning_rate": 4.337349397590362e-05, |
|
"loss": 7.6319, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.7096774193548387, |
|
"grad_norm": 3.358443458515684, |
|
"learning_rate": 4.27710843373494e-05, |
|
"loss": 7.4718, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7419354838709677, |
|
"grad_norm": 2.7982062504728487, |
|
"learning_rate": 4.2168674698795186e-05, |
|
"loss": 7.3488, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.7741935483870968, |
|
"grad_norm": 3.1113638072919216, |
|
"learning_rate": 4.156626506024097e-05, |
|
"loss": 7.15, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 3.1595449829251945, |
|
"learning_rate": 4.0963855421686746e-05, |
|
"loss": 7.0563, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8387096774193549, |
|
"grad_norm": 2.108212146858006, |
|
"learning_rate": 4.036144578313254e-05, |
|
"loss": 6.9565, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8709677419354839, |
|
"grad_norm": 2.461122936407565, |
|
"learning_rate": 3.9759036144578314e-05, |
|
"loss": 6.8685, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.9032258064516129, |
|
"grad_norm": 1.6920822731314087, |
|
"learning_rate": 3.91566265060241e-05, |
|
"loss": 6.7245, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.9354838709677419, |
|
"grad_norm": 2.2327133352179276, |
|
"learning_rate": 3.855421686746988e-05, |
|
"loss": 6.6712, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 2.398221999231477, |
|
"learning_rate": 3.7951807228915666e-05, |
|
"loss": 6.5071, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.9848914786193277, |
|
"learning_rate": 3.734939759036144e-05, |
|
"loss": 6.3974, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.032258064516129, |
|
"grad_norm": 1.8070671857380076, |
|
"learning_rate": 3.674698795180723e-05, |
|
"loss": 6.3319, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.064516129032258, |
|
"grad_norm": 1.6370922118025715, |
|
"learning_rate": 3.614457831325301e-05, |
|
"loss": 6.1884, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.096774193548387, |
|
"grad_norm": 1.3131279804848621, |
|
"learning_rate": 3.5542168674698794e-05, |
|
"loss": 6.1036, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.129032258064516, |
|
"grad_norm": 1.340531475202813, |
|
"learning_rate": 3.4939759036144585e-05, |
|
"loss": 5.9671, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1612903225806452, |
|
"grad_norm": 2.005454028622789, |
|
"learning_rate": 3.433734939759036e-05, |
|
"loss": 5.8771, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.1935483870967742, |
|
"grad_norm": 1.4142674915557123, |
|
"learning_rate": 3.3734939759036146e-05, |
|
"loss": 5.8714, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.2258064516129032, |
|
"grad_norm": 1.2956798900198585, |
|
"learning_rate": 3.313253012048193e-05, |
|
"loss": 5.7898, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.2580645161290323, |
|
"grad_norm": 1.6593235984258043, |
|
"learning_rate": 3.253012048192771e-05, |
|
"loss": 5.6873, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 1.380163595182406, |
|
"learning_rate": 3.192771084337349e-05, |
|
"loss": 5.7097, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.3225806451612903, |
|
"grad_norm": 1.0401057745638052, |
|
"learning_rate": 3.132530120481928e-05, |
|
"loss": 5.6556, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3548387096774195, |
|
"grad_norm": 0.8473367054648464, |
|
"learning_rate": 3.072289156626506e-05, |
|
"loss": 5.4949, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3870967741935485, |
|
"grad_norm": 1.1572154596782513, |
|
"learning_rate": 3.012048192771085e-05, |
|
"loss": 5.4822, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.4193548387096775, |
|
"grad_norm": 1.1443583002498423, |
|
"learning_rate": 2.951807228915663e-05, |
|
"loss": 5.4141, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.4516129032258065, |
|
"grad_norm": 1.1052929131923555, |
|
"learning_rate": 2.891566265060241e-05, |
|
"loss": 5.3635, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4838709677419355, |
|
"grad_norm": 0.8970565525154285, |
|
"learning_rate": 2.8313253012048197e-05, |
|
"loss": 5.2672, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.5161290322580645, |
|
"grad_norm": 0.756063967066019, |
|
"learning_rate": 2.7710843373493977e-05, |
|
"loss": 5.2908, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5483870967741935, |
|
"grad_norm": 0.7264373375167542, |
|
"learning_rate": 2.7108433734939758e-05, |
|
"loss": 5.2698, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5806451612903225, |
|
"grad_norm": 0.7011676067381816, |
|
"learning_rate": 2.6506024096385545e-05, |
|
"loss": 5.2454, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"grad_norm": 0.6927754707271319, |
|
"learning_rate": 2.5903614457831325e-05, |
|
"loss": 5.1169, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.6451612903225805, |
|
"grad_norm": 0.6274016321054947, |
|
"learning_rate": 2.530120481927711e-05, |
|
"loss": 5.079, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.6774193548387095, |
|
"grad_norm": 0.6260513529033087, |
|
"learning_rate": 2.4698795180722893e-05, |
|
"loss": 5.0318, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.7096774193548387, |
|
"grad_norm": 0.6122295776355817, |
|
"learning_rate": 2.4096385542168677e-05, |
|
"loss": 5.0809, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.7419354838709677, |
|
"grad_norm": 0.6219482405443587, |
|
"learning_rate": 2.349397590361446e-05, |
|
"loss": 4.9977, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.7741935483870968, |
|
"grad_norm": 0.7127219759114208, |
|
"learning_rate": 2.289156626506024e-05, |
|
"loss": 4.9713, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.8064516129032258, |
|
"grad_norm": 0.6728184125308878, |
|
"learning_rate": 2.2289156626506025e-05, |
|
"loss": 4.9564, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.838709677419355, |
|
"grad_norm": 0.6658692158816993, |
|
"learning_rate": 2.168674698795181e-05, |
|
"loss": 4.8953, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.870967741935484, |
|
"grad_norm": 0.6480363491872242, |
|
"learning_rate": 2.1084337349397593e-05, |
|
"loss": 4.8271, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.903225806451613, |
|
"grad_norm": 0.6798514027495196, |
|
"learning_rate": 2.0481927710843373e-05, |
|
"loss": 4.856, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.935483870967742, |
|
"grad_norm": 0.5868913336163518, |
|
"learning_rate": 1.9879518072289157e-05, |
|
"loss": 4.7935, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.967741935483871, |
|
"grad_norm": 0.5793335785631702, |
|
"learning_rate": 1.927710843373494e-05, |
|
"loss": 4.8151, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.5816955191304313, |
|
"learning_rate": 1.867469879518072e-05, |
|
"loss": 4.7828, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.032258064516129, |
|
"grad_norm": 0.5845389898807549, |
|
"learning_rate": 1.8072289156626505e-05, |
|
"loss": 4.6459, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.064516129032258, |
|
"grad_norm": 0.5946763020956095, |
|
"learning_rate": 1.7469879518072292e-05, |
|
"loss": 4.7297, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.096774193548387, |
|
"grad_norm": 0.5627139324820443, |
|
"learning_rate": 1.6867469879518073e-05, |
|
"loss": 4.6863, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.129032258064516, |
|
"grad_norm": 0.5636104401228851, |
|
"learning_rate": 1.6265060240963857e-05, |
|
"loss": 4.6437, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.161290322580645, |
|
"grad_norm": 0.5966948986254044, |
|
"learning_rate": 1.566265060240964e-05, |
|
"loss": 4.6102, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.193548387096774, |
|
"grad_norm": 0.5635496928050323, |
|
"learning_rate": 1.5060240963855424e-05, |
|
"loss": 4.56, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.225806451612903, |
|
"grad_norm": 0.6201573994580546, |
|
"learning_rate": 1.4457831325301205e-05, |
|
"loss": 4.5522, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.258064516129032, |
|
"grad_norm": 0.5283310806265233, |
|
"learning_rate": 1.3855421686746989e-05, |
|
"loss": 4.5434, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.2903225806451615, |
|
"grad_norm": 0.5830200387240102, |
|
"learning_rate": 1.3253012048192772e-05, |
|
"loss": 4.5213, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.3225806451612905, |
|
"grad_norm": 0.5366398051729224, |
|
"learning_rate": 1.2650602409638555e-05, |
|
"loss": 4.4882, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.3548387096774195, |
|
"grad_norm": 0.5278747342683787, |
|
"learning_rate": 1.2048192771084338e-05, |
|
"loss": 4.4285, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.3870967741935485, |
|
"grad_norm": 0.5187375955155756, |
|
"learning_rate": 1.144578313253012e-05, |
|
"loss": 4.4649, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.4193548387096775, |
|
"grad_norm": 0.49022439094573594, |
|
"learning_rate": 1.0843373493975904e-05, |
|
"loss": 4.4827, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.4516129032258065, |
|
"grad_norm": 0.4832270774924177, |
|
"learning_rate": 1.0240963855421687e-05, |
|
"loss": 4.4521, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.4838709677419355, |
|
"grad_norm": 0.5104329783830706, |
|
"learning_rate": 9.63855421686747e-06, |
|
"loss": 4.4009, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 2.5161290322580645, |
|
"grad_norm": 0.46199522908208795, |
|
"learning_rate": 9.036144578313253e-06, |
|
"loss": 4.3962, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 2.5483870967741935, |
|
"grad_norm": 0.4752588993383047, |
|
"learning_rate": 8.433734939759036e-06, |
|
"loss": 4.4422, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 2.5806451612903225, |
|
"grad_norm": 0.48368008070516033, |
|
"learning_rate": 7.83132530120482e-06, |
|
"loss": 4.2568, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 2.6129032258064515, |
|
"grad_norm": 0.47323578510274494, |
|
"learning_rate": 7.228915662650602e-06, |
|
"loss": 4.3716, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 2.6451612903225805, |
|
"grad_norm": 0.47907919622898615, |
|
"learning_rate": 6.626506024096386e-06, |
|
"loss": 4.4467, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 2.6774193548387095, |
|
"grad_norm": 0.5132262686017864, |
|
"learning_rate": 6.024096385542169e-06, |
|
"loss": 4.353, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 2.709677419354839, |
|
"grad_norm": 0.4512163239687551, |
|
"learning_rate": 5.421686746987952e-06, |
|
"loss": 4.3904, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 2.741935483870968, |
|
"grad_norm": 0.4613864288737105, |
|
"learning_rate": 4.819277108433735e-06, |
|
"loss": 4.3699, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 2.774193548387097, |
|
"grad_norm": 0.4834904636291854, |
|
"learning_rate": 4.216867469879518e-06, |
|
"loss": 4.3382, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 2.806451612903226, |
|
"grad_norm": 0.45137040545065016, |
|
"learning_rate": 3.614457831325301e-06, |
|
"loss": 4.3651, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 2.838709677419355, |
|
"grad_norm": 0.44625483484835893, |
|
"learning_rate": 3.0120481927710846e-06, |
|
"loss": 4.4046, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 2.870967741935484, |
|
"grad_norm": 0.4306895833715559, |
|
"learning_rate": 2.4096385542168676e-06, |
|
"loss": 4.3594, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 2.903225806451613, |
|
"grad_norm": 0.46583862244193375, |
|
"learning_rate": 1.8072289156626506e-06, |
|
"loss": 4.3189, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.935483870967742, |
|
"grad_norm": 0.4698345242151301, |
|
"learning_rate": 1.2048192771084338e-06, |
|
"loss": 4.3348, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 2.967741935483871, |
|
"grad_norm": 0.4355936118573381, |
|
"learning_rate": 6.024096385542169e-07, |
|
"loss": 4.3989, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.4741459705347069, |
|
"learning_rate": 0.0, |
|
"loss": 4.3099, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 93, |
|
"total_flos": 42389716598784.0, |
|
"train_loss": 6.370550432512837, |
|
"train_runtime": 17767.6162, |
|
"train_samples_per_second": 0.083, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 93, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 42389716598784.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|