|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 12.470588235294118, |
|
"eval_steps": 500, |
|
"global_step": 112, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.8250285070833755, |
|
"learning_rate": 2.0833333333333336e-05, |
|
"loss": 1.1514, |
|
"mean_token_accuracy": 0.7545319080352784, |
|
"num_tokens": 2505918.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 1.1176470588235294, |
|
"grad_norm": 0.769774425261519, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 1.0097, |
|
"mean_token_accuracy": 0.7732286784383986, |
|
"num_tokens": 4791423.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.7058823529411766, |
|
"grad_norm": 0.38521393276298865, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.8391, |
|
"mean_token_accuracy": 0.806722441315651, |
|
"num_tokens": 7330272.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.235294117647059, |
|
"grad_norm": 0.3989534309948224, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.7915, |
|
"mean_token_accuracy": 0.8170362280474769, |
|
"num_tokens": 9618952.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.8235294117647056, |
|
"grad_norm": 0.35223023869070996, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.6523, |
|
"mean_token_accuracy": 0.8425902664661408, |
|
"num_tokens": 12114377.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 3.3529411764705883, |
|
"grad_norm": 0.3419690748616924, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.6196, |
|
"mean_token_accuracy": 0.8515104750792185, |
|
"num_tokens": 14409009.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.9411764705882355, |
|
"grad_norm": 1.0636105938490006, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.5473, |
|
"mean_token_accuracy": 0.8684158384799957, |
|
"num_tokens": 16918082.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 4.470588235294118, |
|
"grad_norm": 0.8104215493367165, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.5313, |
|
"mean_token_accuracy": 0.8752750555674235, |
|
"num_tokens": 19217454.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.4155095547714891, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.3995, |
|
"mean_token_accuracy": 0.8990385002560086, |
|
"num_tokens": 21475135.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 5.588235294117647, |
|
"grad_norm": 0.38830329565569643, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.3932, |
|
"mean_token_accuracy": 0.9030953556299209, |
|
"num_tokens": 23999790.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.117647058823529, |
|
"grad_norm": 0.6003093985868039, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.3304, |
|
"mean_token_accuracy": 0.9205087688234117, |
|
"num_tokens": 26264984.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 6.705882352941177, |
|
"grad_norm": 0.8142075697801212, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.2441, |
|
"mean_token_accuracy": 0.9367312312126159, |
|
"num_tokens": 28792279.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.235294117647059, |
|
"grad_norm": 0.44738871439201344, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.3344, |
|
"mean_token_accuracy": 0.9259601334730784, |
|
"num_tokens": 31085796.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 7.823529411764706, |
|
"grad_norm": 0.5170032581958464, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.2198, |
|
"mean_token_accuracy": 0.9499045610427856, |
|
"num_tokens": 33579925.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.352941176470589, |
|
"grad_norm": 0.41980493493671023, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.243, |
|
"mean_token_accuracy": 0.9482089314195845, |
|
"num_tokens": 35872214.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 8.941176470588236, |
|
"grad_norm": 1.2078125761983167, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.1887, |
|
"mean_token_accuracy": 0.9552906274795532, |
|
"num_tokens": 38392486.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 9.470588235294118, |
|
"grad_norm": 0.4806987993157492, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.1776, |
|
"mean_token_accuracy": 0.9594514999124739, |
|
"num_tokens": 40676022.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.626687130036329, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.1525, |
|
"mean_token_accuracy": 0.9653611448076036, |
|
"num_tokens": 42948208.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 10.588235294117647, |
|
"grad_norm": 0.3484261346541397, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.1488, |
|
"mean_token_accuracy": 0.9670574784278869, |
|
"num_tokens": 45475525.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 11.117647058823529, |
|
"grad_norm": 0.43421046707207084, |
|
"learning_rate": 6e-06, |
|
"loss": 0.1424, |
|
"mean_token_accuracy": 0.9697046114338769, |
|
"num_tokens": 47733376.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 11.705882352941176, |
|
"grad_norm": 0.24969104543475654, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 0.1501, |
|
"mean_token_accuracy": 0.9680212080478668, |
|
"num_tokens": 50275121.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 12.235294117647058, |
|
"grad_norm": 0.1876144299655879, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0969, |
|
"mean_token_accuracy": 0.9780076709058549, |
|
"num_tokens": 52549898.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 12.470588235294118, |
|
"mean_token_accuracy": 0.9807853102684021, |
|
"num_tokens": 53548989.0, |
|
"step": 112, |
|
"total_flos": 80266967384064.0, |
|
"train_loss": 0.4191697733476758, |
|
"train_runtime": 1031.2092, |
|
"train_samples_per_second": 7.318, |
|
"train_steps_per_second": 0.109 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 112, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 14, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 80266967384064.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|