|
{ |
|
"best_global_step": 60, |
|
"best_metric": 0.4450100064277649, |
|
"best_model_checkpoint": "outputs/checkpoint-60", |
|
"epoch": 0.08571428571428572, |
|
"eval_steps": 5, |
|
"global_step": 60, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014285714285714286, |
|
"grad_norm": 4.179355621337891, |
|
"learning_rate": 0.0, |
|
"loss": 1.3819, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.002857142857142857, |
|
"grad_norm": 4.235444068908691, |
|
"learning_rate": 4e-05, |
|
"loss": 1.5352, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.004285714285714286, |
|
"grad_norm": 3.5132126808166504, |
|
"learning_rate": 8e-05, |
|
"loss": 0.996, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 2.1232669353485107, |
|
"learning_rate": 0.00012, |
|
"loss": 0.3471, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.007142857142857143, |
|
"grad_norm": 3.057875633239746, |
|
"learning_rate": 0.00016, |
|
"loss": 0.5202, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.007142857142857143, |
|
"eval_loss": 0.729008674621582, |
|
"eval_runtime": 61.1607, |
|
"eval_samples_per_second": 4.578, |
|
"eval_steps_per_second": 1.145, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.008571428571428572, |
|
"grad_norm": 5.017824649810791, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6374, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 3.8987300395965576, |
|
"learning_rate": 0.00019555555555555556, |
|
"loss": 0.7946, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 2.3041818141937256, |
|
"learning_rate": 0.00019111111111111114, |
|
"loss": 0.475, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.012857142857142857, |
|
"grad_norm": 1.1344828605651855, |
|
"learning_rate": 0.0001866666666666667, |
|
"loss": 0.499, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 1.3779399394989014, |
|
"learning_rate": 0.00018222222222222224, |
|
"loss": 0.5737, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.014285714285714285, |
|
"eval_loss": 0.4982975423336029, |
|
"eval_runtime": 58.9374, |
|
"eval_samples_per_second": 4.751, |
|
"eval_steps_per_second": 1.188, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.015714285714285715, |
|
"grad_norm": 1.039328932762146, |
|
"learning_rate": 0.00017777777777777779, |
|
"loss": 0.7145, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 1.0184112787246704, |
|
"learning_rate": 0.00017333333333333334, |
|
"loss": 0.5086, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.018571428571428572, |
|
"grad_norm": 3.169090986251831, |
|
"learning_rate": 0.00016888888888888889, |
|
"loss": 0.3931, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8889961242675781, |
|
"learning_rate": 0.00016444444444444444, |
|
"loss": 0.4175, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02142857142857143, |
|
"grad_norm": 1.040206789970398, |
|
"learning_rate": 0.00016, |
|
"loss": 0.7937, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02142857142857143, |
|
"eval_loss": 0.47482773661613464, |
|
"eval_runtime": 58.0691, |
|
"eval_samples_per_second": 4.822, |
|
"eval_steps_per_second": 1.205, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.022857142857142857, |
|
"grad_norm": 1.05618417263031, |
|
"learning_rate": 0.00015555555555555556, |
|
"loss": 0.4657, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.024285714285714285, |
|
"grad_norm": 1.636629343032837, |
|
"learning_rate": 0.0001511111111111111, |
|
"loss": 0.496, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.025714285714285714, |
|
"grad_norm": 0.8520965576171875, |
|
"learning_rate": 0.00014666666666666666, |
|
"loss": 0.2923, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.027142857142857142, |
|
"grad_norm": 1.2350469827651978, |
|
"learning_rate": 0.00014222222222222224, |
|
"loss": 0.6657, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.02857142857142857, |
|
"grad_norm": 0.8397138118743896, |
|
"learning_rate": 0.0001377777777777778, |
|
"loss": 0.2923, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02857142857142857, |
|
"eval_loss": 0.4646180272102356, |
|
"eval_runtime": 58.7341, |
|
"eval_samples_per_second": 4.767, |
|
"eval_steps_per_second": 1.192, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.4164972305297852, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.5831, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.03142857142857143, |
|
"grad_norm": 1.0668251514434814, |
|
"learning_rate": 0.00012888888888888892, |
|
"loss": 0.5977, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.032857142857142856, |
|
"grad_norm": 1.0122352838516235, |
|
"learning_rate": 0.00012444444444444444, |
|
"loss": 0.3958, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03428571428571429, |
|
"grad_norm": 1.1400679349899292, |
|
"learning_rate": 0.00012, |
|
"loss": 0.3899, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03571428571428571, |
|
"grad_norm": 1.161012887954712, |
|
"learning_rate": 0.00011555555555555555, |
|
"loss": 0.6196, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03571428571428571, |
|
"eval_loss": 0.4567541480064392, |
|
"eval_runtime": 58.108, |
|
"eval_samples_per_second": 4.819, |
|
"eval_steps_per_second": 1.205, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.037142857142857144, |
|
"grad_norm": 1.1181843280792236, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 0.3504, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.03857142857142857, |
|
"grad_norm": 1.0887891054153442, |
|
"learning_rate": 0.00010666666666666667, |
|
"loss": 0.474, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8779735565185547, |
|
"learning_rate": 0.00010222222222222222, |
|
"loss": 0.2359, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.041428571428571426, |
|
"grad_norm": 1.2299634218215942, |
|
"learning_rate": 9.777777777777778e-05, |
|
"loss": 0.5917, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.04285714285714286, |
|
"grad_norm": 0.6770172715187073, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 0.2978, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04285714285714286, |
|
"eval_loss": 0.4585675895214081, |
|
"eval_runtime": 58.9389, |
|
"eval_samples_per_second": 4.751, |
|
"eval_steps_per_second": 1.188, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04428571428571428, |
|
"grad_norm": 1.2675914764404297, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 0.464, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.045714285714285714, |
|
"grad_norm": 0.9487901926040649, |
|
"learning_rate": 8.444444444444444e-05, |
|
"loss": 0.6331, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.047142857142857146, |
|
"grad_norm": 1.002474069595337, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5226, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.04857142857142857, |
|
"grad_norm": 2.1608269214630127, |
|
"learning_rate": 7.555555555555556e-05, |
|
"loss": 0.9663, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7519217133522034, |
|
"learning_rate": 7.111111111111112e-05, |
|
"loss": 0.2157, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.4550328850746155, |
|
"eval_runtime": 57.9729, |
|
"eval_samples_per_second": 4.83, |
|
"eval_steps_per_second": 1.207, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.05142857142857143, |
|
"grad_norm": 0.7981988787651062, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.3595, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.05285714285714286, |
|
"grad_norm": 1.1520148515701294, |
|
"learning_rate": 6.222222222222222e-05, |
|
"loss": 0.4654, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.054285714285714284, |
|
"grad_norm": 0.7494262456893921, |
|
"learning_rate": 5.7777777777777776e-05, |
|
"loss": 0.2049, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.055714285714285716, |
|
"grad_norm": 0.8117587566375732, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 0.4791, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 0.9275745749473572, |
|
"learning_rate": 4.888888888888889e-05, |
|
"loss": 0.4328, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05714285714285714, |
|
"eval_loss": 0.44896164536476135, |
|
"eval_runtime": 59.566, |
|
"eval_samples_per_second": 4.701, |
|
"eval_steps_per_second": 1.175, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05857142857142857, |
|
"grad_norm": 0.9272159337997437, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.3348, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.161618947982788, |
|
"learning_rate": 4e-05, |
|
"loss": 0.7938, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.06142857142857143, |
|
"grad_norm": 0.6889943480491638, |
|
"learning_rate": 3.555555555555556e-05, |
|
"loss": 0.3177, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.06285714285714286, |
|
"grad_norm": 1.577309250831604, |
|
"learning_rate": 3.111111111111111e-05, |
|
"loss": 0.5701, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.06428571428571428, |
|
"grad_norm": 1.2045623064041138, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.912, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06428571428571428, |
|
"eval_loss": 0.4481067657470703, |
|
"eval_runtime": 58.0782, |
|
"eval_samples_per_second": 4.821, |
|
"eval_steps_per_second": 1.205, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06571428571428571, |
|
"grad_norm": 1.6550114154815674, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 0.8362, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.06714285714285714, |
|
"grad_norm": 0.6368440985679626, |
|
"learning_rate": 1.777777777777778e-05, |
|
"loss": 0.291, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.06857142857142857, |
|
"grad_norm": 0.8126080632209778, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.21, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.02597975730896, |
|
"learning_rate": 8.88888888888889e-06, |
|
"loss": 0.4724, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.9014645218849182, |
|
"learning_rate": 4.444444444444445e-06, |
|
"loss": 0.351, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"eval_loss": 0.4469253420829773, |
|
"eval_runtime": 59.9788, |
|
"eval_samples_per_second": 4.668, |
|
"eval_steps_per_second": 1.167, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07285714285714286, |
|
"grad_norm": 0.8374767303466797, |
|
"learning_rate": 0.0, |
|
"loss": 0.4678, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.07428571428571429, |
|
"grad_norm": 1.0087125301361084, |
|
"learning_rate": 3.272727272727273e-05, |
|
"loss": 0.8477, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.07571428571428572, |
|
"grad_norm": 0.7490191459655762, |
|
"learning_rate": 2.909090909090909e-05, |
|
"loss": 0.2679, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.07714285714285714, |
|
"grad_norm": 0.8489861488342285, |
|
"learning_rate": 2.5454545454545454e-05, |
|
"loss": 0.3833, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.07857142857142857, |
|
"grad_norm": 0.897487223148346, |
|
"learning_rate": 2.1818181818181818e-05, |
|
"loss": 0.5637, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07857142857142857, |
|
"eval_loss": 0.44575706124305725, |
|
"eval_runtime": 57.9488, |
|
"eval_samples_per_second": 4.832, |
|
"eval_steps_per_second": 1.208, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9447337985038757, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 0.5945, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.08142857142857143, |
|
"grad_norm": 0.8487027287483215, |
|
"learning_rate": 1.4545454545454545e-05, |
|
"loss": 0.4971, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.08285714285714285, |
|
"grad_norm": 1.3720009326934814, |
|
"learning_rate": 1.0909090909090909e-05, |
|
"loss": 0.905, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.08428571428571428, |
|
"grad_norm": 0.8870661854743958, |
|
"learning_rate": 7.272727272727272e-06, |
|
"loss": 0.4392, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.08571428571428572, |
|
"grad_norm": 0.8729221224784851, |
|
"learning_rate": 3.636363636363636e-06, |
|
"loss": 0.3077, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08571428571428572, |
|
"eval_loss": 0.4450100064277649, |
|
"eval_runtime": 59.8757, |
|
"eval_samples_per_second": 4.676, |
|
"eval_steps_per_second": 1.169, |
|
"step": 60 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 60, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3217787051458560.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|