{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 93, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03225806451612903, "grad_norm": 14.220000481200906, "learning_rate": 5e-06, "loss": 13.9265, "step": 1 }, { "epoch": 0.06451612903225806, "grad_norm": 14.702762596013635, "learning_rate": 1e-05, "loss": 13.95, "step": 2 }, { "epoch": 0.0967741935483871, "grad_norm": 14.297305924795248, "learning_rate": 1.5e-05, "loss": 13.9116, "step": 3 }, { "epoch": 0.12903225806451613, "grad_norm": 14.484767601202535, "learning_rate": 2e-05, "loss": 13.753, "step": 4 }, { "epoch": 0.16129032258064516, "grad_norm": 14.480323510985746, "learning_rate": 2.5e-05, "loss": 13.222, "step": 5 }, { "epoch": 0.1935483870967742, "grad_norm": 14.002264166506258, "learning_rate": 3e-05, "loss": 12.1647, "step": 6 }, { "epoch": 0.22580645161290322, "grad_norm": 7.226601449948956, "learning_rate": 3.5e-05, "loss": 11.1642, "step": 7 }, { "epoch": 0.25806451612903225, "grad_norm": 13.608621977990989, "learning_rate": 4e-05, "loss": 10.8504, "step": 8 }, { "epoch": 0.2903225806451613, "grad_norm": 6.436610267622404, "learning_rate": 4.5e-05, "loss": 10.4279, "step": 9 }, { "epoch": 0.3225806451612903, "grad_norm": 4.922039728837155, "learning_rate": 5e-05, "loss": 10.1287, "step": 10 }, { "epoch": 0.3548387096774194, "grad_norm": 12.033200559896779, "learning_rate": 4.9397590361445786e-05, "loss": 9.9607, "step": 11 }, { "epoch": 0.3870967741935484, "grad_norm": 3.5396711201156807, "learning_rate": 4.879518072289157e-05, "loss": 9.6616, "step": 12 }, { "epoch": 0.41935483870967744, "grad_norm": 3.4751992441084534, "learning_rate": 4.8192771084337354e-05, "loss": 9.3783, "step": 13 }, { "epoch": 0.45161290322580644, "grad_norm": 3.9005409440562735, "learning_rate": 4.759036144578313e-05, "loss": 9.1355, "step": 14 }, { "epoch": 0.4838709677419355, "grad_norm": 2.8756405244423915, "learning_rate": 4.698795180722892e-05, "loss": 8.8203, "step": 15 }, { "epoch": 0.5161290322580645, "grad_norm": 3.9300784221492027, "learning_rate": 4.63855421686747e-05, "loss": 8.5065, "step": 16 }, { "epoch": 0.5483870967741935, "grad_norm": 5.84606546682727, "learning_rate": 4.578313253012048e-05, "loss": 8.3534, "step": 17 }, { "epoch": 0.5806451612903226, "grad_norm": 5.025102319247832, "learning_rate": 4.5180722891566266e-05, "loss": 8.1611, "step": 18 }, { "epoch": 0.6129032258064516, "grad_norm": 3.5261942209741486, "learning_rate": 4.457831325301205e-05, "loss": 8.0223, "step": 19 }, { "epoch": 0.6451612903225806, "grad_norm": 8.46799283954891, "learning_rate": 4.3975903614457834e-05, "loss": 7.8982, "step": 20 }, { "epoch": 0.6774193548387096, "grad_norm": 3.926166227638816, "learning_rate": 4.337349397590362e-05, "loss": 7.6319, "step": 21 }, { "epoch": 0.7096774193548387, "grad_norm": 3.358443458515684, "learning_rate": 4.27710843373494e-05, "loss": 7.4718, "step": 22 }, { "epoch": 0.7419354838709677, "grad_norm": 2.7982062504728487, "learning_rate": 4.2168674698795186e-05, "loss": 7.3488, "step": 23 }, { "epoch": 0.7741935483870968, "grad_norm": 3.1113638072919216, "learning_rate": 4.156626506024097e-05, "loss": 7.15, "step": 24 }, { "epoch": 0.8064516129032258, "grad_norm": 3.1595449829251945, "learning_rate": 4.0963855421686746e-05, "loss": 7.0563, "step": 25 }, { "epoch": 0.8387096774193549, "grad_norm": 2.108212146858006, "learning_rate": 4.036144578313254e-05, "loss": 6.9565, "step": 26 }, { "epoch": 0.8709677419354839, "grad_norm": 2.461122936407565, "learning_rate": 3.9759036144578314e-05, "loss": 6.8685, "step": 27 }, { "epoch": 0.9032258064516129, "grad_norm": 1.6920822731314087, "learning_rate": 3.91566265060241e-05, "loss": 6.7245, "step": 28 }, { "epoch": 0.9354838709677419, "grad_norm": 2.2327133352179276, "learning_rate": 3.855421686746988e-05, "loss": 6.6712, "step": 29 }, { "epoch": 0.967741935483871, "grad_norm": 2.398221999231477, "learning_rate": 3.7951807228915666e-05, "loss": 6.5071, "step": 30 }, { "epoch": 1.0, "grad_norm": 1.9848914786193277, "learning_rate": 3.734939759036144e-05, "loss": 6.3974, "step": 31 }, { "epoch": 1.032258064516129, "grad_norm": 1.8070671857380076, "learning_rate": 3.674698795180723e-05, "loss": 6.3319, "step": 32 }, { "epoch": 1.064516129032258, "grad_norm": 1.6370922118025715, "learning_rate": 3.614457831325301e-05, "loss": 6.1884, "step": 33 }, { "epoch": 1.096774193548387, "grad_norm": 1.3131279804848621, "learning_rate": 3.5542168674698794e-05, "loss": 6.1036, "step": 34 }, { "epoch": 1.129032258064516, "grad_norm": 1.340531475202813, "learning_rate": 3.4939759036144585e-05, "loss": 5.9671, "step": 35 }, { "epoch": 1.1612903225806452, "grad_norm": 2.005454028622789, "learning_rate": 3.433734939759036e-05, "loss": 5.8771, "step": 36 }, { "epoch": 1.1935483870967742, "grad_norm": 1.4142674915557123, "learning_rate": 3.3734939759036146e-05, "loss": 5.8714, "step": 37 }, { "epoch": 1.2258064516129032, "grad_norm": 1.2956798900198585, "learning_rate": 3.313253012048193e-05, "loss": 5.7898, "step": 38 }, { "epoch": 1.2580645161290323, "grad_norm": 1.6593235984258043, "learning_rate": 3.253012048192771e-05, "loss": 5.6873, "step": 39 }, { "epoch": 1.2903225806451613, "grad_norm": 1.380163595182406, "learning_rate": 3.192771084337349e-05, "loss": 5.7097, "step": 40 }, { "epoch": 1.3225806451612903, "grad_norm": 1.0401057745638052, "learning_rate": 3.132530120481928e-05, "loss": 5.6556, "step": 41 }, { "epoch": 1.3548387096774195, "grad_norm": 0.8473367054648464, "learning_rate": 3.072289156626506e-05, "loss": 5.4949, "step": 42 }, { "epoch": 1.3870967741935485, "grad_norm": 1.1572154596782513, "learning_rate": 3.012048192771085e-05, "loss": 5.4822, "step": 43 }, { "epoch": 1.4193548387096775, "grad_norm": 1.1443583002498423, "learning_rate": 2.951807228915663e-05, "loss": 5.4141, "step": 44 }, { "epoch": 1.4516129032258065, "grad_norm": 1.1052929131923555, "learning_rate": 2.891566265060241e-05, "loss": 5.3635, "step": 45 }, { "epoch": 1.4838709677419355, "grad_norm": 0.8970565525154285, "learning_rate": 2.8313253012048197e-05, "loss": 5.2672, "step": 46 }, { "epoch": 1.5161290322580645, "grad_norm": 0.756063967066019, "learning_rate": 2.7710843373493977e-05, "loss": 5.2908, "step": 47 }, { "epoch": 1.5483870967741935, "grad_norm": 0.7264373375167542, "learning_rate": 2.7108433734939758e-05, "loss": 5.2698, "step": 48 }, { "epoch": 1.5806451612903225, "grad_norm": 0.7011676067381816, "learning_rate": 2.6506024096385545e-05, "loss": 5.2454, "step": 49 }, { "epoch": 1.6129032258064515, "grad_norm": 0.6927754707271319, "learning_rate": 2.5903614457831325e-05, "loss": 5.1169, "step": 50 }, { "epoch": 1.6451612903225805, "grad_norm": 0.6274016321054947, "learning_rate": 2.530120481927711e-05, "loss": 5.079, "step": 51 }, { "epoch": 1.6774193548387095, "grad_norm": 0.6260513529033087, "learning_rate": 2.4698795180722893e-05, "loss": 5.0318, "step": 52 }, { "epoch": 1.7096774193548387, "grad_norm": 0.6122295776355817, "learning_rate": 2.4096385542168677e-05, "loss": 5.0809, "step": 53 }, { "epoch": 1.7419354838709677, "grad_norm": 0.6219482405443587, "learning_rate": 2.349397590361446e-05, "loss": 4.9977, "step": 54 }, { "epoch": 1.7741935483870968, "grad_norm": 0.7127219759114208, "learning_rate": 2.289156626506024e-05, "loss": 4.9713, "step": 55 }, { "epoch": 1.8064516129032258, "grad_norm": 0.6728184125308878, "learning_rate": 2.2289156626506025e-05, "loss": 4.9564, "step": 56 }, { "epoch": 1.838709677419355, "grad_norm": 0.6658692158816993, "learning_rate": 2.168674698795181e-05, "loss": 4.8953, "step": 57 }, { "epoch": 1.870967741935484, "grad_norm": 0.6480363491872242, "learning_rate": 2.1084337349397593e-05, "loss": 4.8271, "step": 58 }, { "epoch": 1.903225806451613, "grad_norm": 0.6798514027495196, "learning_rate": 2.0481927710843373e-05, "loss": 4.856, "step": 59 }, { "epoch": 1.935483870967742, "grad_norm": 0.5868913336163518, "learning_rate": 1.9879518072289157e-05, "loss": 4.7935, "step": 60 }, { "epoch": 1.967741935483871, "grad_norm": 0.5793335785631702, "learning_rate": 1.927710843373494e-05, "loss": 4.8151, "step": 61 }, { "epoch": 2.0, "grad_norm": 0.5816955191304313, "learning_rate": 1.867469879518072e-05, "loss": 4.7828, "step": 62 }, { "epoch": 2.032258064516129, "grad_norm": 0.5845389898807549, "learning_rate": 1.8072289156626505e-05, "loss": 4.6459, "step": 63 }, { "epoch": 2.064516129032258, "grad_norm": 0.5946763020956095, "learning_rate": 1.7469879518072292e-05, "loss": 4.7297, "step": 64 }, { "epoch": 2.096774193548387, "grad_norm": 0.5627139324820443, "learning_rate": 1.6867469879518073e-05, "loss": 4.6863, "step": 65 }, { "epoch": 2.129032258064516, "grad_norm": 0.5636104401228851, "learning_rate": 1.6265060240963857e-05, "loss": 4.6437, "step": 66 }, { "epoch": 2.161290322580645, "grad_norm": 0.5966948986254044, "learning_rate": 1.566265060240964e-05, "loss": 4.6102, "step": 67 }, { "epoch": 2.193548387096774, "grad_norm": 0.5635496928050323, "learning_rate": 1.5060240963855424e-05, "loss": 4.56, "step": 68 }, { "epoch": 2.225806451612903, "grad_norm": 0.6201573994580546, "learning_rate": 1.4457831325301205e-05, "loss": 4.5522, "step": 69 }, { "epoch": 2.258064516129032, "grad_norm": 0.5283310806265233, "learning_rate": 1.3855421686746989e-05, "loss": 4.5434, "step": 70 }, { "epoch": 2.2903225806451615, "grad_norm": 0.5830200387240102, "learning_rate": 1.3253012048192772e-05, "loss": 4.5213, "step": 71 }, { "epoch": 2.3225806451612905, "grad_norm": 0.5366398051729224, "learning_rate": 1.2650602409638555e-05, "loss": 4.4882, "step": 72 }, { "epoch": 2.3548387096774195, "grad_norm": 0.5278747342683787, "learning_rate": 1.2048192771084338e-05, "loss": 4.4285, "step": 73 }, { "epoch": 2.3870967741935485, "grad_norm": 0.5187375955155756, "learning_rate": 1.144578313253012e-05, "loss": 4.4649, "step": 74 }, { "epoch": 2.4193548387096775, "grad_norm": 0.49022439094573594, "learning_rate": 1.0843373493975904e-05, "loss": 4.4827, "step": 75 }, { "epoch": 2.4516129032258065, "grad_norm": 0.4832270774924177, "learning_rate": 1.0240963855421687e-05, "loss": 4.4521, "step": 76 }, { "epoch": 2.4838709677419355, "grad_norm": 0.5104329783830706, "learning_rate": 9.63855421686747e-06, "loss": 4.4009, "step": 77 }, { "epoch": 2.5161290322580645, "grad_norm": 0.46199522908208795, "learning_rate": 9.036144578313253e-06, "loss": 4.3962, "step": 78 }, { "epoch": 2.5483870967741935, "grad_norm": 0.4752588993383047, "learning_rate": 8.433734939759036e-06, "loss": 4.4422, "step": 79 }, { "epoch": 2.5806451612903225, "grad_norm": 0.48368008070516033, "learning_rate": 7.83132530120482e-06, "loss": 4.2568, "step": 80 }, { "epoch": 2.6129032258064515, "grad_norm": 0.47323578510274494, "learning_rate": 7.228915662650602e-06, "loss": 4.3716, "step": 81 }, { "epoch": 2.6451612903225805, "grad_norm": 0.47907919622898615, "learning_rate": 6.626506024096386e-06, "loss": 4.4467, "step": 82 }, { "epoch": 2.6774193548387095, "grad_norm": 0.5132262686017864, "learning_rate": 6.024096385542169e-06, "loss": 4.353, "step": 83 }, { "epoch": 2.709677419354839, "grad_norm": 0.4512163239687551, "learning_rate": 5.421686746987952e-06, "loss": 4.3904, "step": 84 }, { "epoch": 2.741935483870968, "grad_norm": 0.4613864288737105, "learning_rate": 4.819277108433735e-06, "loss": 4.3699, "step": 85 }, { "epoch": 2.774193548387097, "grad_norm": 0.4834904636291854, "learning_rate": 4.216867469879518e-06, "loss": 4.3382, "step": 86 }, { "epoch": 2.806451612903226, "grad_norm": 0.45137040545065016, "learning_rate": 3.614457831325301e-06, "loss": 4.3651, "step": 87 }, { "epoch": 2.838709677419355, "grad_norm": 0.44625483484835893, "learning_rate": 3.0120481927710846e-06, "loss": 4.4046, "step": 88 }, { "epoch": 2.870967741935484, "grad_norm": 0.4306895833715559, "learning_rate": 2.4096385542168676e-06, "loss": 4.3594, "step": 89 }, { "epoch": 2.903225806451613, "grad_norm": 0.46583862244193375, "learning_rate": 1.8072289156626506e-06, "loss": 4.3189, "step": 90 }, { "epoch": 2.935483870967742, "grad_norm": 0.4698345242151301, "learning_rate": 1.2048192771084338e-06, "loss": 4.3348, "step": 91 }, { "epoch": 2.967741935483871, "grad_norm": 0.4355936118573381, "learning_rate": 6.024096385542169e-07, "loss": 4.3989, "step": 92 }, { "epoch": 3.0, "grad_norm": 0.4741459705347069, "learning_rate": 0.0, "loss": 4.3099, "step": 93 }, { "epoch": 3.0, "step": 93, "total_flos": 42389716598784.0, "train_loss": 6.370550432512837, "train_runtime": 17767.6162, "train_samples_per_second": 0.083, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 93, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 42389716598784.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }