|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 415, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.060350030175015085, |
|
"grad_norm": 0.17640693485736847, |
|
"learning_rate": 9.599999999999999e-05, |
|
"loss": 1.7813, |
|
"mean_token_accuracy": 0.6300852990150452, |
|
"num_tokens": 156656.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.12070006035003017, |
|
"grad_norm": 0.24193964898586273, |
|
"learning_rate": 0.00019599999999999997, |
|
"loss": 0.9197, |
|
"mean_token_accuracy": 0.768382026553154, |
|
"num_tokens": 282982.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18105009052504525, |
|
"grad_norm": 0.16499435901641846, |
|
"learning_rate": 0.000296, |
|
"loss": 0.5906, |
|
"mean_token_accuracy": 0.8341364151239395, |
|
"num_tokens": 441181.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.24140012070006034, |
|
"grad_norm": 0.23563049733638763, |
|
"learning_rate": 0.0002999269005776963, |
|
"loss": 0.4832, |
|
"mean_token_accuracy": 0.8592967188358307, |
|
"num_tokens": 567644.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.30175015087507545, |
|
"grad_norm": 0.22556591033935547, |
|
"learning_rate": 0.0002996953705789175, |
|
"loss": 0.3612, |
|
"mean_token_accuracy": 0.8925437909364701, |
|
"num_tokens": 725987.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3621001810500905, |
|
"grad_norm": 0.33429527282714844, |
|
"learning_rate": 0.00029930552794275785, |
|
"loss": 0.3126, |
|
"mean_token_accuracy": 0.9086851555109025, |
|
"num_tokens": 853185.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4224502112251056, |
|
"grad_norm": 0.27340370416641235, |
|
"learning_rate": 0.0002987577849532824, |
|
"loss": 0.2343, |
|
"mean_token_accuracy": 0.9301495373249054, |
|
"num_tokens": 1011232.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4828002414001207, |
|
"grad_norm": 0.2711191475391388, |
|
"learning_rate": 0.00029805272088449905, |
|
"loss": 0.2021, |
|
"mean_token_accuracy": 0.9406860828399658, |
|
"num_tokens": 1138074.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5431502715751357, |
|
"grad_norm": 0.19240038096904755, |
|
"learning_rate": 0.00029719108138773827, |
|
"loss": 0.1508, |
|
"mean_token_accuracy": 0.9550948125123978, |
|
"num_tokens": 1293601.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6035003017501509, |
|
"grad_norm": 0.27221739292144775, |
|
"learning_rate": 0.00029617377770307837, |
|
"loss": 0.1563, |
|
"mean_token_accuracy": 0.9542003554105759, |
|
"num_tokens": 1418074.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.663850331925166, |
|
"grad_norm": 0.25977134704589844, |
|
"learning_rate": 0.0002950018856956494, |
|
"loss": 0.1228, |
|
"mean_token_accuracy": 0.9640595990419388, |
|
"num_tokens": 1577856.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.724200362100181, |
|
"grad_norm": 0.2311161458492279, |
|
"learning_rate": 0.0002936766447178356, |
|
"loss": 0.1229, |
|
"mean_token_accuracy": 0.9646531140804291, |
|
"num_tokens": 1704393.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7845503922751962, |
|
"grad_norm": 0.1375264674425125, |
|
"learning_rate": 0.0002921994562985788, |
|
"loss": 0.0972, |
|
"mean_token_accuracy": 0.9722524845600128, |
|
"num_tokens": 1860935.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8449004224502112, |
|
"grad_norm": 0.2860631048679352, |
|
"learning_rate": 0.0002905718826611708, |
|
"loss": 0.0853, |
|
"mean_token_accuracy": 0.9756521546840667, |
|
"num_tokens": 1988266.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9052504526252263, |
|
"grad_norm": 0.11123040318489075, |
|
"learning_rate": 0.00028879564507109946, |
|
"loss": 0.0814, |
|
"mean_token_accuracy": 0.9769123244285584, |
|
"num_tokens": 2146122.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9656004828002414, |
|
"grad_norm": 0.239139586687088, |
|
"learning_rate": 0.0002868726220156981, |
|
"loss": 0.0696, |
|
"mean_token_accuracy": 0.9802538651227951, |
|
"num_tokens": 2273996.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.07211296260356903, |
|
"eval_mean_token_accuracy": 0.979879263285044, |
|
"eval_num_tokens": 2354180.0, |
|
"eval_runtime": 29.4819, |
|
"eval_samples_per_second": 12.516, |
|
"eval_steps_per_second": 6.275, |
|
"step": 415 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 2490, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.998178519084032e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|