|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.75609756097561, |
|
"eval_steps": 5, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04878048780487805, |
|
"grad_norm": 5.151791580979949, |
|
"learning_rate": 5e-08, |
|
"loss": 2.9815, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.24390243902439024, |
|
"grad_norm": 5.13763305558627, |
|
"learning_rate": 2.5e-07, |
|
"loss": 2.9732, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.24390243902439024, |
|
"eval_loss": 2.9549503326416016, |
|
"eval_runtime": 23.256, |
|
"eval_samples_per_second": 31.734, |
|
"eval_steps_per_second": 0.817, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 4.3040377415100535, |
|
"learning_rate": 5e-07, |
|
"loss": 2.9685, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"eval_loss": 2.9328665733337402, |
|
"eval_runtime": 19.8689, |
|
"eval_samples_per_second": 37.144, |
|
"eval_steps_per_second": 0.956, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.7317073170731707, |
|
"grad_norm": 3.403638208876821, |
|
"learning_rate": 7.5e-07, |
|
"loss": 2.9341, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.7317073170731707, |
|
"eval_loss": 2.886631488800049, |
|
"eval_runtime": 18.4607, |
|
"eval_samples_per_second": 39.977, |
|
"eval_steps_per_second": 1.029, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.975609756097561, |
|
"grad_norm": 3.00111206949746, |
|
"learning_rate": 1e-06, |
|
"loss": 2.8788, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.975609756097561, |
|
"eval_loss": 2.8079330921173096, |
|
"eval_runtime": 19.9381, |
|
"eval_samples_per_second": 37.014, |
|
"eval_steps_per_second": 0.953, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.2195121951219512, |
|
"grad_norm": 2.7591628779527766, |
|
"learning_rate": 9.980973490458728e-07, |
|
"loss": 2.8082, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.2195121951219512, |
|
"eval_loss": 2.7484195232391357, |
|
"eval_runtime": 18.8326, |
|
"eval_samples_per_second": 39.187, |
|
"eval_steps_per_second": 1.009, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.4634146341463414, |
|
"grad_norm": 2.388152230770368, |
|
"learning_rate": 9.92403876506104e-07, |
|
"loss": 2.7341, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.4634146341463414, |
|
"eval_loss": 2.6838204860687256, |
|
"eval_runtime": 20.3367, |
|
"eval_samples_per_second": 36.289, |
|
"eval_steps_per_second": 0.934, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.7073170731707317, |
|
"grad_norm": 2.339030304927524, |
|
"learning_rate": 9.82962913144534e-07, |
|
"loss": 2.6784, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.7073170731707317, |
|
"eval_loss": 2.633502244949341, |
|
"eval_runtime": 18.7828, |
|
"eval_samples_per_second": 39.291, |
|
"eval_steps_per_second": 1.012, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.951219512195122, |
|
"grad_norm": 2.289213281990378, |
|
"learning_rate": 9.698463103929541e-07, |
|
"loss": 2.6326, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.951219512195122, |
|
"eval_loss": 2.5951168537139893, |
|
"eval_runtime": 20.4763, |
|
"eval_samples_per_second": 36.042, |
|
"eval_steps_per_second": 0.928, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.1951219512195124, |
|
"grad_norm": 2.224029018392997, |
|
"learning_rate": 9.531538935183249e-07, |
|
"loss": 2.5934, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.1951219512195124, |
|
"eval_loss": 2.5593512058258057, |
|
"eval_runtime": 18.9816, |
|
"eval_samples_per_second": 38.88, |
|
"eval_steps_per_second": 1.001, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.4390243902439024, |
|
"grad_norm": 2.2412373397938583, |
|
"learning_rate": 9.330127018922193e-07, |
|
"loss": 2.5543, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.4390243902439024, |
|
"eval_loss": 2.521718740463257, |
|
"eval_runtime": 20.215, |
|
"eval_samples_per_second": 36.508, |
|
"eval_steps_per_second": 0.94, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.682926829268293, |
|
"grad_norm": 2.209083408074763, |
|
"learning_rate": 9.095760221444959e-07, |
|
"loss": 2.513, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.682926829268293, |
|
"eval_loss": 2.4829368591308594, |
|
"eval_runtime": 19.4395, |
|
"eval_samples_per_second": 37.964, |
|
"eval_steps_per_second": 0.977, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.926829268292683, |
|
"grad_norm": 2.297068739774513, |
|
"learning_rate": 8.83022221559489e-07, |
|
"loss": 2.4712, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.926829268292683, |
|
"eval_loss": 2.446091890335083, |
|
"eval_runtime": 19.007, |
|
"eval_samples_per_second": 38.828, |
|
"eval_steps_per_second": 1.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.1707317073170733, |
|
"grad_norm": 2.3349163677591314, |
|
"learning_rate": 8.535533905932737e-07, |
|
"loss": 2.4365, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 3.1707317073170733, |
|
"eval_loss": 2.413790225982666, |
|
"eval_runtime": 20.2301, |
|
"eval_samples_per_second": 36.48, |
|
"eval_steps_per_second": 0.939, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 3.4146341463414633, |
|
"grad_norm": 2.285003975033001, |
|
"learning_rate": 8.213938048432696e-07, |
|
"loss": 2.4066, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.4146341463414633, |
|
"eval_loss": 2.3858845233917236, |
|
"eval_runtime": 18.3399, |
|
"eval_samples_per_second": 40.24, |
|
"eval_steps_per_second": 1.036, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.658536585365854, |
|
"grad_norm": 2.1725408410741203, |
|
"learning_rate": 7.86788218175523e-07, |
|
"loss": 2.375, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 3.658536585365854, |
|
"eval_loss": 2.3606066703796387, |
|
"eval_runtime": 19.9461, |
|
"eval_samples_per_second": 37.0, |
|
"eval_steps_per_second": 0.953, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 3.902439024390244, |
|
"grad_norm": 2.2674210698852133, |
|
"learning_rate": 7.5e-07, |
|
"loss": 2.3415, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.902439024390244, |
|
"eval_loss": 2.336864709854126, |
|
"eval_runtime": 18.4703, |
|
"eval_samples_per_second": 39.956, |
|
"eval_steps_per_second": 1.029, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 4.146341463414634, |
|
"grad_norm": 2.22603925687789, |
|
"learning_rate": 7.113091308703497e-07, |
|
"loss": 2.3225, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 4.146341463414634, |
|
"eval_loss": 2.3142693042755127, |
|
"eval_runtime": 19.8254, |
|
"eval_samples_per_second": 37.225, |
|
"eval_steps_per_second": 0.958, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 4.390243902439025, |
|
"grad_norm": 2.250461857867485, |
|
"learning_rate": 6.710100716628344e-07, |
|
"loss": 2.2989, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.390243902439025, |
|
"eval_loss": 2.292672634124756, |
|
"eval_runtime": 19.1763, |
|
"eval_samples_per_second": 38.485, |
|
"eval_steps_per_second": 0.991, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.634146341463414, |
|
"grad_norm": 2.2063059930566107, |
|
"learning_rate": 6.294095225512604e-07, |
|
"loss": 2.2748, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 4.634146341463414, |
|
"eval_loss": 2.2731637954711914, |
|
"eval_runtime": 18.4462, |
|
"eval_samples_per_second": 40.008, |
|
"eval_steps_per_second": 1.03, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 4.878048780487805, |
|
"grad_norm": 2.2114255470751205, |
|
"learning_rate": 5.868240888334652e-07, |
|
"loss": 2.2513, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.878048780487805, |
|
"eval_loss": 2.2562241554260254, |
|
"eval_runtime": 19.9755, |
|
"eval_samples_per_second": 36.945, |
|
"eval_steps_per_second": 0.951, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 5.121951219512195, |
|
"grad_norm": 2.1470949247310247, |
|
"learning_rate": 5.435778713738292e-07, |
|
"loss": 2.2401, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 5.121951219512195, |
|
"eval_loss": 2.241244077682495, |
|
"eval_runtime": 18.4347, |
|
"eval_samples_per_second": 40.033, |
|
"eval_steps_per_second": 1.031, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 5.365853658536586, |
|
"grad_norm": 2.2399669699712543, |
|
"learning_rate": 5e-07, |
|
"loss": 2.2172, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.365853658536586, |
|
"eval_loss": 2.228184461593628, |
|
"eval_runtime": 19.9167, |
|
"eval_samples_per_second": 37.054, |
|
"eval_steps_per_second": 0.954, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.609756097560975, |
|
"grad_norm": 2.248609346835798, |
|
"learning_rate": 4.5642212862617085e-07, |
|
"loss": 2.204, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.609756097560975, |
|
"eval_loss": 2.216791868209839, |
|
"eval_runtime": 18.6439, |
|
"eval_samples_per_second": 39.584, |
|
"eval_steps_per_second": 1.019, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.853658536585366, |
|
"grad_norm": 2.2843160705737393, |
|
"learning_rate": 4.131759111665348e-07, |
|
"loss": 2.1893, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.853658536585366, |
|
"eval_loss": 2.2068991661071777, |
|
"eval_runtime": 19.7257, |
|
"eval_samples_per_second": 37.413, |
|
"eval_steps_per_second": 0.963, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 6.097560975609756, |
|
"grad_norm": 2.198659146419105, |
|
"learning_rate": 3.7059047744873955e-07, |
|
"loss": 2.1784, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 6.097560975609756, |
|
"eval_loss": 2.1983890533447266, |
|
"eval_runtime": 20.2531, |
|
"eval_samples_per_second": 36.439, |
|
"eval_steps_per_second": 0.938, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 6.341463414634147, |
|
"grad_norm": 2.2746004739227996, |
|
"learning_rate": 3.2898992833716563e-07, |
|
"loss": 2.1646, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.341463414634147, |
|
"eval_loss": 2.191380500793457, |
|
"eval_runtime": 18.8387, |
|
"eval_samples_per_second": 39.175, |
|
"eval_steps_per_second": 1.009, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.585365853658536, |
|
"grad_norm": 2.182140726336947, |
|
"learning_rate": 2.8869086912965036e-07, |
|
"loss": 2.1673, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 6.585365853658536, |
|
"eval_loss": 2.185249090194702, |
|
"eval_runtime": 20.3689, |
|
"eval_samples_per_second": 36.232, |
|
"eval_steps_per_second": 0.933, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 6.829268292682927, |
|
"grad_norm": 2.2128884344288244, |
|
"learning_rate": 2.500000000000001e-07, |
|
"loss": 2.1555, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 6.829268292682927, |
|
"eval_loss": 2.180130958557129, |
|
"eval_runtime": 19.0502, |
|
"eval_samples_per_second": 38.74, |
|
"eval_steps_per_second": 0.997, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 7.073170731707317, |
|
"grad_norm": 2.102015386581095, |
|
"learning_rate": 2.1321178182447709e-07, |
|
"loss": 2.1599, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 7.073170731707317, |
|
"eval_loss": 2.175684690475464, |
|
"eval_runtime": 20.511, |
|
"eval_samples_per_second": 35.981, |
|
"eval_steps_per_second": 0.926, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 7.317073170731708, |
|
"grad_norm": 2.313350074598611, |
|
"learning_rate": 1.7860619515673032e-07, |
|
"loss": 2.145, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 7.317073170731708, |
|
"eval_loss": 2.1720824241638184, |
|
"eval_runtime": 18.8834, |
|
"eval_samples_per_second": 39.082, |
|
"eval_steps_per_second": 1.006, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 7.560975609756097, |
|
"grad_norm": 2.253715272870049, |
|
"learning_rate": 1.4644660940672627e-07, |
|
"loss": 2.1359, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 7.560975609756097, |
|
"eval_loss": 2.169234275817871, |
|
"eval_runtime": 19.3523, |
|
"eval_samples_per_second": 38.135, |
|
"eval_steps_per_second": 0.982, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 7.804878048780488, |
|
"grad_norm": 2.216308681434654, |
|
"learning_rate": 1.1697777844051104e-07, |
|
"loss": 2.1391, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 7.804878048780488, |
|
"eval_loss": 2.1668455600738525, |
|
"eval_runtime": 20.0953, |
|
"eval_samples_per_second": 36.725, |
|
"eval_steps_per_second": 0.945, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 8.048780487804878, |
|
"grad_norm": 2.2398335056684116, |
|
"learning_rate": 9.042397785550404e-08, |
|
"loss": 2.1274, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 8.048780487804878, |
|
"eval_loss": 2.1650218963623047, |
|
"eval_runtime": 18.6515, |
|
"eval_samples_per_second": 39.568, |
|
"eval_steps_per_second": 1.019, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 8.292682926829269, |
|
"grad_norm": 2.183896419200447, |
|
"learning_rate": 6.698729810778064e-08, |
|
"loss": 2.1342, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 8.292682926829269, |
|
"eval_loss": 2.163686752319336, |
|
"eval_runtime": 20.1658, |
|
"eval_samples_per_second": 36.597, |
|
"eval_steps_per_second": 0.942, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 8.536585365853659, |
|
"grad_norm": 2.260276175787966, |
|
"learning_rate": 4.684610648167503e-08, |
|
"loss": 2.1272, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 8.536585365853659, |
|
"eval_loss": 2.1627187728881836, |
|
"eval_runtime": 18.4069, |
|
"eval_samples_per_second": 40.094, |
|
"eval_steps_per_second": 1.032, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 8.78048780487805, |
|
"grad_norm": 2.2467910530017043, |
|
"learning_rate": 3.015368960704584e-08, |
|
"loss": 2.133, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.78048780487805, |
|
"eval_loss": 2.162067413330078, |
|
"eval_runtime": 19.691, |
|
"eval_samples_per_second": 37.479, |
|
"eval_steps_per_second": 0.965, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 9.024390243902438, |
|
"grad_norm": 2.2631951574285387, |
|
"learning_rate": 1.7037086855465898e-08, |
|
"loss": 2.1286, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 9.024390243902438, |
|
"eval_loss": 2.1616575717926025, |
|
"eval_runtime": 19.1797, |
|
"eval_samples_per_second": 38.478, |
|
"eval_steps_per_second": 0.991, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 9.268292682926829, |
|
"grad_norm": 2.2305961635805214, |
|
"learning_rate": 7.59612349389599e-09, |
|
"loss": 2.1296, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 9.268292682926829, |
|
"eval_loss": 2.161451578140259, |
|
"eval_runtime": 18.6125, |
|
"eval_samples_per_second": 39.651, |
|
"eval_steps_per_second": 1.021, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 9.512195121951219, |
|
"grad_norm": 2.209641020058134, |
|
"learning_rate": 1.9026509541272273e-09, |
|
"loss": 2.1256, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 9.512195121951219, |
|
"eval_loss": 2.161362409591675, |
|
"eval_runtime": 20.0337, |
|
"eval_samples_per_second": 36.838, |
|
"eval_steps_per_second": 0.948, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 9.75609756097561, |
|
"grad_norm": 2.2480648233150617, |
|
"learning_rate": 0.0, |
|
"loss": 2.1267, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 9.75609756097561, |
|
"eval_loss": 2.1613569259643555, |
|
"eval_runtime": 18.4232, |
|
"eval_samples_per_second": 40.058, |
|
"eval_steps_per_second": 1.031, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 9.75609756097561, |
|
"step": 200, |
|
"total_flos": 1.742036445167616e+16, |
|
"train_loss": 2.3644245743751524, |
|
"train_runtime": 7118.2054, |
|
"train_samples_per_second": 9.167, |
|
"train_steps_per_second": 0.028 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.742036445167616e+16, |
|
"train_batch_size": 10, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|