{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.75609756097561, "eval_steps": 5, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04878048780487805, "grad_norm": 5.151791580979949, "learning_rate": 5e-08, "loss": 2.9815, "step": 1 }, { "epoch": 0.24390243902439024, "grad_norm": 5.13763305558627, "learning_rate": 2.5e-07, "loss": 2.9732, "step": 5 }, { "epoch": 0.24390243902439024, "eval_loss": 2.9549503326416016, "eval_runtime": 23.256, "eval_samples_per_second": 31.734, "eval_steps_per_second": 0.817, "step": 5 }, { "epoch": 0.4878048780487805, "grad_norm": 4.3040377415100535, "learning_rate": 5e-07, "loss": 2.9685, "step": 10 }, { "epoch": 0.4878048780487805, "eval_loss": 2.9328665733337402, "eval_runtime": 19.8689, "eval_samples_per_second": 37.144, "eval_steps_per_second": 0.956, "step": 10 }, { "epoch": 0.7317073170731707, "grad_norm": 3.403638208876821, "learning_rate": 7.5e-07, "loss": 2.9341, "step": 15 }, { "epoch": 0.7317073170731707, "eval_loss": 2.886631488800049, "eval_runtime": 18.4607, "eval_samples_per_second": 39.977, "eval_steps_per_second": 1.029, "step": 15 }, { "epoch": 0.975609756097561, "grad_norm": 3.00111206949746, "learning_rate": 1e-06, "loss": 2.8788, "step": 20 }, { "epoch": 0.975609756097561, "eval_loss": 2.8079330921173096, "eval_runtime": 19.9381, "eval_samples_per_second": 37.014, "eval_steps_per_second": 0.953, "step": 20 }, { "epoch": 1.2195121951219512, "grad_norm": 2.7591628779527766, "learning_rate": 9.980973490458728e-07, "loss": 2.8082, "step": 25 }, { "epoch": 1.2195121951219512, "eval_loss": 2.7484195232391357, "eval_runtime": 18.8326, "eval_samples_per_second": 39.187, "eval_steps_per_second": 1.009, "step": 25 }, { "epoch": 1.4634146341463414, "grad_norm": 2.388152230770368, "learning_rate": 9.92403876506104e-07, "loss": 2.7341, "step": 30 }, { "epoch": 1.4634146341463414, "eval_loss": 2.6838204860687256, "eval_runtime": 20.3367, "eval_samples_per_second": 36.289, "eval_steps_per_second": 0.934, "step": 30 }, { "epoch": 1.7073170731707317, "grad_norm": 2.339030304927524, "learning_rate": 9.82962913144534e-07, "loss": 2.6784, "step": 35 }, { "epoch": 1.7073170731707317, "eval_loss": 2.633502244949341, "eval_runtime": 18.7828, "eval_samples_per_second": 39.291, "eval_steps_per_second": 1.012, "step": 35 }, { "epoch": 1.951219512195122, "grad_norm": 2.289213281990378, "learning_rate": 9.698463103929541e-07, "loss": 2.6326, "step": 40 }, { "epoch": 1.951219512195122, "eval_loss": 2.5951168537139893, "eval_runtime": 20.4763, "eval_samples_per_second": 36.042, "eval_steps_per_second": 0.928, "step": 40 }, { "epoch": 2.1951219512195124, "grad_norm": 2.224029018392997, "learning_rate": 9.531538935183249e-07, "loss": 2.5934, "step": 45 }, { "epoch": 2.1951219512195124, "eval_loss": 2.5593512058258057, "eval_runtime": 18.9816, "eval_samples_per_second": 38.88, "eval_steps_per_second": 1.001, "step": 45 }, { "epoch": 2.4390243902439024, "grad_norm": 2.2412373397938583, "learning_rate": 9.330127018922193e-07, "loss": 2.5543, "step": 50 }, { "epoch": 2.4390243902439024, "eval_loss": 2.521718740463257, "eval_runtime": 20.215, "eval_samples_per_second": 36.508, "eval_steps_per_second": 0.94, "step": 50 }, { "epoch": 2.682926829268293, "grad_norm": 2.209083408074763, "learning_rate": 9.095760221444959e-07, "loss": 2.513, "step": 55 }, { "epoch": 2.682926829268293, "eval_loss": 2.4829368591308594, "eval_runtime": 19.4395, "eval_samples_per_second": 37.964, "eval_steps_per_second": 0.977, "step": 55 }, { "epoch": 2.926829268292683, "grad_norm": 2.297068739774513, "learning_rate": 8.83022221559489e-07, "loss": 2.4712, "step": 60 }, { "epoch": 2.926829268292683, "eval_loss": 2.446091890335083, "eval_runtime": 19.007, "eval_samples_per_second": 38.828, "eval_steps_per_second": 1.0, "step": 60 }, { "epoch": 3.1707317073170733, "grad_norm": 2.3349163677591314, "learning_rate": 8.535533905932737e-07, "loss": 2.4365, "step": 65 }, { "epoch": 3.1707317073170733, "eval_loss": 2.413790225982666, "eval_runtime": 20.2301, "eval_samples_per_second": 36.48, "eval_steps_per_second": 0.939, "step": 65 }, { "epoch": 3.4146341463414633, "grad_norm": 2.285003975033001, "learning_rate": 8.213938048432696e-07, "loss": 2.4066, "step": 70 }, { "epoch": 3.4146341463414633, "eval_loss": 2.3858845233917236, "eval_runtime": 18.3399, "eval_samples_per_second": 40.24, "eval_steps_per_second": 1.036, "step": 70 }, { "epoch": 3.658536585365854, "grad_norm": 2.1725408410741203, "learning_rate": 7.86788218175523e-07, "loss": 2.375, "step": 75 }, { "epoch": 3.658536585365854, "eval_loss": 2.3606066703796387, "eval_runtime": 19.9461, "eval_samples_per_second": 37.0, "eval_steps_per_second": 0.953, "step": 75 }, { "epoch": 3.902439024390244, "grad_norm": 2.2674210698852133, "learning_rate": 7.5e-07, "loss": 2.3415, "step": 80 }, { "epoch": 3.902439024390244, "eval_loss": 2.336864709854126, "eval_runtime": 18.4703, "eval_samples_per_second": 39.956, "eval_steps_per_second": 1.029, "step": 80 }, { "epoch": 4.146341463414634, "grad_norm": 2.22603925687789, "learning_rate": 7.113091308703497e-07, "loss": 2.3225, "step": 85 }, { "epoch": 4.146341463414634, "eval_loss": 2.3142693042755127, "eval_runtime": 19.8254, "eval_samples_per_second": 37.225, "eval_steps_per_second": 0.958, "step": 85 }, { "epoch": 4.390243902439025, "grad_norm": 2.250461857867485, "learning_rate": 6.710100716628344e-07, "loss": 2.2989, "step": 90 }, { "epoch": 4.390243902439025, "eval_loss": 2.292672634124756, "eval_runtime": 19.1763, "eval_samples_per_second": 38.485, "eval_steps_per_second": 0.991, "step": 90 }, { "epoch": 4.634146341463414, "grad_norm": 2.2063059930566107, "learning_rate": 6.294095225512604e-07, "loss": 2.2748, "step": 95 }, { "epoch": 4.634146341463414, "eval_loss": 2.2731637954711914, "eval_runtime": 18.4462, "eval_samples_per_second": 40.008, "eval_steps_per_second": 1.03, "step": 95 }, { "epoch": 4.878048780487805, "grad_norm": 2.2114255470751205, "learning_rate": 5.868240888334652e-07, "loss": 2.2513, "step": 100 }, { "epoch": 4.878048780487805, "eval_loss": 2.2562241554260254, "eval_runtime": 19.9755, "eval_samples_per_second": 36.945, "eval_steps_per_second": 0.951, "step": 100 }, { "epoch": 5.121951219512195, "grad_norm": 2.1470949247310247, "learning_rate": 5.435778713738292e-07, "loss": 2.2401, "step": 105 }, { "epoch": 5.121951219512195, "eval_loss": 2.241244077682495, "eval_runtime": 18.4347, "eval_samples_per_second": 40.033, "eval_steps_per_second": 1.031, "step": 105 }, { "epoch": 5.365853658536586, "grad_norm": 2.2399669699712543, "learning_rate": 5e-07, "loss": 2.2172, "step": 110 }, { "epoch": 5.365853658536586, "eval_loss": 2.228184461593628, "eval_runtime": 19.9167, "eval_samples_per_second": 37.054, "eval_steps_per_second": 0.954, "step": 110 }, { "epoch": 5.609756097560975, "grad_norm": 2.248609346835798, "learning_rate": 4.5642212862617085e-07, "loss": 2.204, "step": 115 }, { "epoch": 5.609756097560975, "eval_loss": 2.216791868209839, "eval_runtime": 18.6439, "eval_samples_per_second": 39.584, "eval_steps_per_second": 1.019, "step": 115 }, { "epoch": 5.853658536585366, "grad_norm": 2.2843160705737393, "learning_rate": 4.131759111665348e-07, "loss": 2.1893, "step": 120 }, { "epoch": 5.853658536585366, "eval_loss": 2.2068991661071777, "eval_runtime": 19.7257, "eval_samples_per_second": 37.413, "eval_steps_per_second": 0.963, "step": 120 }, { "epoch": 6.097560975609756, "grad_norm": 2.198659146419105, "learning_rate": 3.7059047744873955e-07, "loss": 2.1784, "step": 125 }, { "epoch": 6.097560975609756, "eval_loss": 2.1983890533447266, "eval_runtime": 20.2531, "eval_samples_per_second": 36.439, "eval_steps_per_second": 0.938, "step": 125 }, { "epoch": 6.341463414634147, "grad_norm": 2.2746004739227996, "learning_rate": 3.2898992833716563e-07, "loss": 2.1646, "step": 130 }, { "epoch": 6.341463414634147, "eval_loss": 2.191380500793457, "eval_runtime": 18.8387, "eval_samples_per_second": 39.175, "eval_steps_per_second": 1.009, "step": 130 }, { "epoch": 6.585365853658536, "grad_norm": 2.182140726336947, "learning_rate": 2.8869086912965036e-07, "loss": 2.1673, "step": 135 }, { "epoch": 6.585365853658536, "eval_loss": 2.185249090194702, "eval_runtime": 20.3689, "eval_samples_per_second": 36.232, "eval_steps_per_second": 0.933, "step": 135 }, { "epoch": 6.829268292682927, "grad_norm": 2.2128884344288244, "learning_rate": 2.500000000000001e-07, "loss": 2.1555, "step": 140 }, { "epoch": 6.829268292682927, "eval_loss": 2.180130958557129, "eval_runtime": 19.0502, "eval_samples_per_second": 38.74, "eval_steps_per_second": 0.997, "step": 140 }, { "epoch": 7.073170731707317, "grad_norm": 2.102015386581095, "learning_rate": 2.1321178182447709e-07, "loss": 2.1599, "step": 145 }, { "epoch": 7.073170731707317, "eval_loss": 2.175684690475464, "eval_runtime": 20.511, "eval_samples_per_second": 35.981, "eval_steps_per_second": 0.926, "step": 145 }, { "epoch": 7.317073170731708, "grad_norm": 2.313350074598611, "learning_rate": 1.7860619515673032e-07, "loss": 2.145, "step": 150 }, { "epoch": 7.317073170731708, "eval_loss": 2.1720824241638184, "eval_runtime": 18.8834, "eval_samples_per_second": 39.082, "eval_steps_per_second": 1.006, "step": 150 }, { "epoch": 7.560975609756097, "grad_norm": 2.253715272870049, "learning_rate": 1.4644660940672627e-07, "loss": 2.1359, "step": 155 }, { "epoch": 7.560975609756097, "eval_loss": 2.169234275817871, "eval_runtime": 19.3523, "eval_samples_per_second": 38.135, "eval_steps_per_second": 0.982, "step": 155 }, { "epoch": 7.804878048780488, "grad_norm": 2.216308681434654, "learning_rate": 1.1697777844051104e-07, "loss": 2.1391, "step": 160 }, { "epoch": 7.804878048780488, "eval_loss": 2.1668455600738525, "eval_runtime": 20.0953, "eval_samples_per_second": 36.725, "eval_steps_per_second": 0.945, "step": 160 }, { "epoch": 8.048780487804878, "grad_norm": 2.2398335056684116, "learning_rate": 9.042397785550404e-08, "loss": 2.1274, "step": 165 }, { "epoch": 8.048780487804878, "eval_loss": 2.1650218963623047, "eval_runtime": 18.6515, "eval_samples_per_second": 39.568, "eval_steps_per_second": 1.019, "step": 165 }, { "epoch": 8.292682926829269, "grad_norm": 2.183896419200447, "learning_rate": 6.698729810778064e-08, "loss": 2.1342, "step": 170 }, { "epoch": 8.292682926829269, "eval_loss": 2.163686752319336, "eval_runtime": 20.1658, "eval_samples_per_second": 36.597, "eval_steps_per_second": 0.942, "step": 170 }, { "epoch": 8.536585365853659, "grad_norm": 2.260276175787966, "learning_rate": 4.684610648167503e-08, "loss": 2.1272, "step": 175 }, { "epoch": 8.536585365853659, "eval_loss": 2.1627187728881836, "eval_runtime": 18.4069, "eval_samples_per_second": 40.094, "eval_steps_per_second": 1.032, "step": 175 }, { "epoch": 8.78048780487805, "grad_norm": 2.2467910530017043, "learning_rate": 3.015368960704584e-08, "loss": 2.133, "step": 180 }, { "epoch": 8.78048780487805, "eval_loss": 2.162067413330078, "eval_runtime": 19.691, "eval_samples_per_second": 37.479, "eval_steps_per_second": 0.965, "step": 180 }, { "epoch": 9.024390243902438, "grad_norm": 2.2631951574285387, "learning_rate": 1.7037086855465898e-08, "loss": 2.1286, "step": 185 }, { "epoch": 9.024390243902438, "eval_loss": 2.1616575717926025, "eval_runtime": 19.1797, "eval_samples_per_second": 38.478, "eval_steps_per_second": 0.991, "step": 185 }, { "epoch": 9.268292682926829, "grad_norm": 2.2305961635805214, "learning_rate": 7.59612349389599e-09, "loss": 2.1296, "step": 190 }, { "epoch": 9.268292682926829, "eval_loss": 2.161451578140259, "eval_runtime": 18.6125, "eval_samples_per_second": 39.651, "eval_steps_per_second": 1.021, "step": 190 }, { "epoch": 9.512195121951219, "grad_norm": 2.209641020058134, "learning_rate": 1.9026509541272273e-09, "loss": 2.1256, "step": 195 }, { "epoch": 9.512195121951219, "eval_loss": 2.161362409591675, "eval_runtime": 20.0337, "eval_samples_per_second": 36.838, "eval_steps_per_second": 0.948, "step": 195 }, { "epoch": 9.75609756097561, "grad_norm": 2.2480648233150617, "learning_rate": 0.0, "loss": 2.1267, "step": 200 }, { "epoch": 9.75609756097561, "eval_loss": 2.1613569259643555, "eval_runtime": 18.4232, "eval_samples_per_second": 40.058, "eval_steps_per_second": 1.031, "step": 200 }, { "epoch": 9.75609756097561, "step": 200, "total_flos": 1.742036445167616e+16, "train_loss": 2.3644245743751524, "train_runtime": 7118.2054, "train_samples_per_second": 9.167, "train_steps_per_second": 0.028 } ], "logging_steps": 5, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.742036445167616e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }