|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 63, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 2.3387899124656393, |
|
"learning_rate": 0.0, |
|
"loss": 1.1321, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 2.2739175729395282, |
|
"learning_rate": 3.2258064516129035e-07, |
|
"loss": 0.9759, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 3.2333006230979437, |
|
"learning_rate": 6.451612903225807e-07, |
|
"loss": 1.0141, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 2.350357276823565, |
|
"learning_rate": 9.67741935483871e-07, |
|
"loss": 0.9947, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.1839271581694755, |
|
"learning_rate": 1.2903225806451614e-06, |
|
"loss": 1.023, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 2.153599307977408, |
|
"learning_rate": 1.6129032258064516e-06, |
|
"loss": 0.9609, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 2.1555129140427067, |
|
"learning_rate": 1.935483870967742e-06, |
|
"loss": 0.9622, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 2.008278114649717, |
|
"learning_rate": 2.2580645161290324e-06, |
|
"loss": 0.9711, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.9242182990110015, |
|
"learning_rate": 2.580645161290323e-06, |
|
"loss": 0.9673, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.750252408219067, |
|
"learning_rate": 2.903225806451613e-06, |
|
"loss": 0.9438, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.6939070396617073, |
|
"learning_rate": 3.225806451612903e-06, |
|
"loss": 0.8756, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 1.2834939133740737, |
|
"learning_rate": 3.548387096774194e-06, |
|
"loss": 0.9823, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 1.4367519479529964, |
|
"learning_rate": 3.870967741935484e-06, |
|
"loss": 0.9596, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 1.3144994907230427, |
|
"learning_rate": 4.193548387096774e-06, |
|
"loss": 1.0521, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0689628463958285, |
|
"learning_rate": 4.516129032258065e-06, |
|
"loss": 0.938, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.2021712620582392, |
|
"learning_rate": 4.838709677419355e-06, |
|
"loss": 1.0955, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.2539951248917753, |
|
"learning_rate": 5.161290322580646e-06, |
|
"loss": 0.9417, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.4053233581800462, |
|
"learning_rate": 5.483870967741935e-06, |
|
"loss": 0.9956, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 1.0278616263126648, |
|
"learning_rate": 5.806451612903226e-06, |
|
"loss": 0.7439, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.256128877382818, |
|
"learning_rate": 6.129032258064517e-06, |
|
"loss": 0.9542, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.9487549369343321, |
|
"learning_rate": 6.451612903225806e-06, |
|
"loss": 0.9816, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.0891896971338342, |
|
"learning_rate": 6.774193548387097e-06, |
|
"loss": 1.0704, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 1.1660258370107575, |
|
"learning_rate": 7.096774193548388e-06, |
|
"loss": 0.8931, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.9415137340919071, |
|
"learning_rate": 7.4193548387096784e-06, |
|
"loss": 0.8683, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.0798354285638785, |
|
"learning_rate": 7.741935483870968e-06, |
|
"loss": 1.0478, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.8096142802637235, |
|
"learning_rate": 8.064516129032258e-06, |
|
"loss": 0.8466, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.7447999799600428, |
|
"learning_rate": 8.387096774193549e-06, |
|
"loss": 0.7486, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.8621800122661538, |
|
"learning_rate": 8.70967741935484e-06, |
|
"loss": 0.9581, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.7940321530754582, |
|
"learning_rate": 9.03225806451613e-06, |
|
"loss": 0.8867, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.8496750615573763, |
|
"learning_rate": 9.35483870967742e-06, |
|
"loss": 0.9179, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.8798601594137285, |
|
"learning_rate": 9.67741935483871e-06, |
|
"loss": 0.8834, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.8818390794513048, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8952, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.8322934845468171, |
|
"learning_rate": 9.999683023724021e-06, |
|
"loss": 1.029, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.7007834456821566, |
|
"learning_rate": 9.998732135085665e-06, |
|
"loss": 0.8438, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6137020406246253, |
|
"learning_rate": 9.99714745464859e-06, |
|
"loss": 0.8435, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.7231796555332799, |
|
"learning_rate": 9.994929183335237e-06, |
|
"loss": 0.9646, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.6180598082755802, |
|
"learning_rate": 9.992077602401358e-06, |
|
"loss": 0.9616, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.6567886708960547, |
|
"learning_rate": 9.988593073400354e-06, |
|
"loss": 0.8522, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.7454067800699115, |
|
"learning_rate": 9.984476038137437e-06, |
|
"loss": 0.9052, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6714316518124167, |
|
"learning_rate": 9.979727018613607e-06, |
|
"loss": 0.8103, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.6229146249540587, |
|
"learning_rate": 9.974346616959476e-06, |
|
"loss": 0.8401, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.6448840489157996, |
|
"learning_rate": 9.968335515358916e-06, |
|
"loss": 0.9144, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.528060268374605, |
|
"learning_rate": 9.961694475962562e-06, |
|
"loss": 0.8348, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.5902102953939741, |
|
"learning_rate": 9.954424340791195e-06, |
|
"loss": 0.7721, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5526442876969153, |
|
"learning_rate": 9.94652603162896e-06, |
|
"loss": 0.8438, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.5331236922634145, |
|
"learning_rate": 9.938000549906509e-06, |
|
"loss": 0.8023, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.6339240906385807, |
|
"learning_rate": 9.92884897657402e-06, |
|
"loss": 0.894, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.5283804877111584, |
|
"learning_rate": 9.919072471964146e-06, |
|
"loss": 0.9069, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.5170169653511614, |
|
"learning_rate": 9.908672275644898e-06, |
|
"loss": 0.8051, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.4763529065504662, |
|
"learning_rate": 9.897649706262474e-06, |
|
"loss": 0.8301, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.4890239660294139, |
|
"learning_rate": 9.88600616137407e-06, |
|
"loss": 0.8629, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.37392633165922323, |
|
"learning_rate": 9.873743117270691e-06, |
|
"loss": 0.8343, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.5097372006252004, |
|
"learning_rate": 9.860862128789954e-06, |
|
"loss": 0.9025, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.5331140780823508, |
|
"learning_rate": 9.847364829118963e-06, |
|
"loss": 0.7832, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5371583880849503, |
|
"learning_rate": 9.833252929587231e-06, |
|
"loss": 0.7241, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.45993890737280607, |
|
"learning_rate": 9.818528219449705e-06, |
|
"loss": 0.8964, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.4715847632418153, |
|
"learning_rate": 9.803192565659898e-06, |
|
"loss": 0.8623, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.4942244137822645, |
|
"learning_rate": 9.78724791263318e-06, |
|
"loss": 0.7657, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.45536932211394177, |
|
"learning_rate": 9.770696282000245e-06, |
|
"loss": 0.8766, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.4891485391099054, |
|
"learning_rate": 9.753539772350792e-06, |
|
"loss": 0.8621, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.45887131278496746, |
|
"learning_rate": 9.735780558967434e-06, |
|
"loss": 0.7718, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.5002555111850944, |
|
"learning_rate": 9.717420893549902e-06, |
|
"loss": 0.8746, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5002555111850944, |
|
"learning_rate": 9.698463103929542e-06, |
|
"loss": 0.7591, |
|
"step": 63 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 310, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 12613769428992.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|