|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.8209806157354618, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04561003420752566, |
|
"grad_norm": 1.0824084281921387, |
|
"learning_rate": 4.9981876195011844e-05, |
|
"loss": 0.4811, |
|
"num_input_tokens_seen": 110848, |
|
"step": 5, |
|
"train_runtime": 30.3319, |
|
"train_tokens_per_second": 3654.501 |
|
}, |
|
{ |
|
"epoch": 0.09122006841505131, |
|
"grad_norm": 0.6754106879234314, |
|
"learning_rate": 4.9908293271567286e-05, |
|
"loss": 0.2124, |
|
"num_input_tokens_seen": 220880, |
|
"step": 10, |
|
"train_runtime": 60.3932, |
|
"train_tokens_per_second": 3657.363 |
|
}, |
|
{ |
|
"epoch": 0.13683010262257697, |
|
"grad_norm": 0.39871442317962646, |
|
"learning_rate": 4.977828505250903e-05, |
|
"loss": 0.1393, |
|
"num_input_tokens_seen": 332160, |
|
"step": 15, |
|
"train_runtime": 90.9477, |
|
"train_tokens_per_second": 3652.209 |
|
}, |
|
{ |
|
"epoch": 0.18244013683010263, |
|
"grad_norm": 0.3741743564605713, |
|
"learning_rate": 4.959214604826831e-05, |
|
"loss": 0.1043, |
|
"num_input_tokens_seen": 442608, |
|
"step": 20, |
|
"train_runtime": 121.2095, |
|
"train_tokens_per_second": 3651.596 |
|
}, |
|
{ |
|
"epoch": 0.22805017103762829, |
|
"grad_norm": 0.43203699588775635, |
|
"learning_rate": 4.935029792355834e-05, |
|
"loss": 0.0953, |
|
"num_input_tokens_seen": 552768, |
|
"step": 25, |
|
"train_runtime": 151.3115, |
|
"train_tokens_per_second": 3653.179 |
|
}, |
|
{ |
|
"epoch": 0.27366020524515394, |
|
"grad_norm": 0.40135976672172546, |
|
"learning_rate": 4.9053288542168185e-05, |
|
"loss": 0.0867, |
|
"num_input_tokens_seen": 663056, |
|
"step": 30, |
|
"train_runtime": 181.4665, |
|
"train_tokens_per_second": 3653.876 |
|
}, |
|
{ |
|
"epoch": 0.31927023945267957, |
|
"grad_norm": 0.33862611651420593, |
|
"learning_rate": 4.870179072587499e-05, |
|
"loss": 0.0625, |
|
"num_input_tokens_seen": 773936, |
|
"step": 35, |
|
"train_runtime": 211.7267, |
|
"train_tokens_per_second": 3655.353 |
|
}, |
|
{ |
|
"epoch": 0.36488027366020526, |
|
"grad_norm": 0.41058799624443054, |
|
"learning_rate": 4.829660073028631e-05, |
|
"loss": 0.0727, |
|
"num_input_tokens_seen": 884832, |
|
"step": 40, |
|
"train_runtime": 241.9754, |
|
"train_tokens_per_second": 3656.702 |
|
}, |
|
{ |
|
"epoch": 0.4104903078677309, |
|
"grad_norm": 0.5706074833869934, |
|
"learning_rate": 4.783863644106502e-05, |
|
"loss": 0.0727, |
|
"num_input_tokens_seen": 995744, |
|
"step": 45, |
|
"train_runtime": 272.301, |
|
"train_tokens_per_second": 3656.777 |
|
}, |
|
{ |
|
"epoch": 0.45610034207525657, |
|
"grad_norm": 0.31977686285972595, |
|
"learning_rate": 4.73289352946231e-05, |
|
"loss": 0.0676, |
|
"num_input_tokens_seen": 1106256, |
|
"step": 50, |
|
"train_runtime": 302.3755, |
|
"train_tokens_per_second": 3658.551 |
|
}, |
|
{ |
|
"epoch": 0.5017103762827823, |
|
"grad_norm": 0.29765012860298157, |
|
"learning_rate": 4.6768651927994434e-05, |
|
"loss": 0.0635, |
|
"num_input_tokens_seen": 1217840, |
|
"step": 55, |
|
"train_runtime": 332.9163, |
|
"train_tokens_per_second": 3658.097 |
|
}, |
|
{ |
|
"epoch": 0.5473204104903079, |
|
"grad_norm": 0.3982709050178528, |
|
"learning_rate": 4.6159055563210604e-05, |
|
"loss": 0.0586, |
|
"num_input_tokens_seen": 1328544, |
|
"step": 60, |
|
"train_runtime": 363.1815, |
|
"train_tokens_per_second": 3658.072 |
|
}, |
|
{ |
|
"epoch": 0.5929304446978335, |
|
"grad_norm": 0.24241122603416443, |
|
"learning_rate": 4.550152713210478e-05, |
|
"loss": 0.0572, |
|
"num_input_tokens_seen": 1438848, |
|
"step": 65, |
|
"train_runtime": 393.3602, |
|
"train_tokens_per_second": 3657.838 |
|
}, |
|
{ |
|
"epoch": 0.6385404789053591, |
|
"grad_norm": 0.2856714427471161, |
|
"learning_rate": 4.479755614805688e-05, |
|
"loss": 0.054, |
|
"num_input_tokens_seen": 1547904, |
|
"step": 70, |
|
"train_runtime": 423.1722, |
|
"train_tokens_per_second": 3657.858 |
|
}, |
|
{ |
|
"epoch": 0.6841505131128849, |
|
"grad_norm": 0.3355858325958252, |
|
"learning_rate": 4.404873733176678e-05, |
|
"loss": 0.047, |
|
"num_input_tokens_seen": 1658736, |
|
"step": 75, |
|
"train_runtime": 453.4813, |
|
"train_tokens_per_second": 3657.782 |
|
}, |
|
{ |
|
"epoch": 0.7297605473204105, |
|
"grad_norm": 0.3131825029850006, |
|
"learning_rate": 4.3256766998698936e-05, |
|
"loss": 0.0471, |
|
"num_input_tokens_seen": 1769248, |
|
"step": 80, |
|
"train_runtime": 483.7293, |
|
"train_tokens_per_second": 3657.517 |
|
}, |
|
{ |
|
"epoch": 0.7753705815279361, |
|
"grad_norm": 0.2504868805408478, |
|
"learning_rate": 4.242343921638234e-05, |
|
"loss": 0.0408, |
|
"num_input_tokens_seen": 1879024, |
|
"step": 85, |
|
"train_runtime": 513.7413, |
|
"train_tokens_per_second": 3657.529 |
|
}, |
|
{ |
|
"epoch": 0.8209806157354618, |
|
"grad_norm": 0.4044341742992401, |
|
"learning_rate": 4.155064174027047e-05, |
|
"loss": 0.0398, |
|
"num_input_tokens_seen": 1991024, |
|
"step": 90, |
|
"train_runtime": 544.3342, |
|
"train_tokens_per_second": 3657.724 |
|
}, |
|
{ |
|
"epoch": 0.8665906499429875, |
|
"grad_norm": 0.3132927119731903, |
|
"learning_rate": 4.064035173736804e-05, |
|
"loss": 0.0369, |
|
"num_input_tokens_seen": 2102064, |
|
"step": 95, |
|
"train_runtime": 574.6371, |
|
"train_tokens_per_second": 3658.072 |
|
}, |
|
{ |
|
"epoch": 0.9122006841505131, |
|
"grad_norm": 0.3075104057788849, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 0.0408, |
|
"num_input_tokens_seen": 2212432, |
|
"step": 100, |
|
"train_runtime": 604.7406, |
|
"train_tokens_per_second": 3658.481 |
|
}, |
|
{ |
|
"epoch": 0.9578107183580388, |
|
"grad_norm": 0.3352231979370117, |
|
"learning_rate": 3.871562281105175e-05, |
|
"loss": 0.0441, |
|
"num_input_tokens_seen": 2322032, |
|
"step": 105, |
|
"train_runtime": 635.488, |
|
"train_tokens_per_second": 3653.936 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.36991724371910095, |
|
"learning_rate": 3.770554401771423e-05, |
|
"loss": 0.0356, |
|
"num_input_tokens_seen": 2423872, |
|
"step": 110, |
|
"train_runtime": 663.2539, |
|
"train_tokens_per_second": 3654.516 |
|
}, |
|
{ |
|
"epoch": 1.0456100342075256, |
|
"grad_norm": 0.31266242265701294, |
|
"learning_rate": 3.6666683080641846e-05, |
|
"loss": 0.0358, |
|
"num_input_tokens_seen": 2534752, |
|
"step": 115, |
|
"train_runtime": 693.4736, |
|
"train_tokens_per_second": 3655.153 |
|
}, |
|
{ |
|
"epoch": 1.0912200684150513, |
|
"grad_norm": 0.29535725712776184, |
|
"learning_rate": 3.5601393353990046e-05, |
|
"loss": 0.0388, |
|
"num_input_tokens_seen": 2644464, |
|
"step": 120, |
|
"train_runtime": 723.4647, |
|
"train_tokens_per_second": 3655.277 |
|
}, |
|
{ |
|
"epoch": 1.1368301026225769, |
|
"grad_norm": 0.37989553809165955, |
|
"learning_rate": 3.4512088061623075e-05, |
|
"loss": 0.0323, |
|
"num_input_tokens_seen": 2755568, |
|
"step": 125, |
|
"train_runtime": 753.8099, |
|
"train_tokens_per_second": 3655.521 |
|
}, |
|
{ |
|
"epoch": 1.1824401368301025, |
|
"grad_norm": 0.2450297474861145, |
|
"learning_rate": 3.3401234830385756e-05, |
|
"loss": 0.031, |
|
"num_input_tokens_seen": 2865744, |
|
"step": 130, |
|
"train_runtime": 783.9325, |
|
"train_tokens_per_second": 3655.6 |
|
}, |
|
{ |
|
"epoch": 1.2280501710376284, |
|
"grad_norm": 0.35642582178115845, |
|
"learning_rate": 3.2271350100134975e-05, |
|
"loss": 0.0279, |
|
"num_input_tokens_seen": 2976304, |
|
"step": 135, |
|
"train_runtime": 814.2192, |
|
"train_tokens_per_second": 3655.409 |
|
}, |
|
{ |
|
"epoch": 1.273660205245154, |
|
"grad_norm": 0.34205862879753113, |
|
"learning_rate": 3.11249934231941e-05, |
|
"loss": 0.0285, |
|
"num_input_tokens_seen": 3086224, |
|
"step": 140, |
|
"train_runtime": 844.255, |
|
"train_tokens_per_second": 3655.559 |
|
}, |
|
{ |
|
"epoch": 1.3192702394526796, |
|
"grad_norm": 0.4949014484882355, |
|
"learning_rate": 2.996476166614364e-05, |
|
"loss": 0.0292, |
|
"num_input_tokens_seen": 3196352, |
|
"step": 145, |
|
"train_runtime": 874.2926, |
|
"train_tokens_per_second": 3655.929 |
|
}, |
|
{ |
|
"epoch": 1.3648802736602053, |
|
"grad_norm": 0.3114229142665863, |
|
"learning_rate": 2.8793283127083292e-05, |
|
"loss": 0.0299, |
|
"num_input_tokens_seen": 3307744, |
|
"step": 150, |
|
"train_runtime": 904.5859, |
|
"train_tokens_per_second": 3656.639 |
|
}, |
|
{ |
|
"epoch": 1.4104903078677309, |
|
"grad_norm": 0.3953075706958771, |
|
"learning_rate": 2.761321158169134e-05, |
|
"loss": 0.0345, |
|
"num_input_tokens_seen": 3418016, |
|
"step": 155, |
|
"train_runtime": 934.6229, |
|
"train_tokens_per_second": 3657.107 |
|
}, |
|
{ |
|
"epoch": 1.4561003420752565, |
|
"grad_norm": 0.5266784429550171, |
|
"learning_rate": 2.6427220271569203e-05, |
|
"loss": 0.0338, |
|
"num_input_tokens_seen": 3528544, |
|
"step": 160, |
|
"train_runtime": 964.715, |
|
"train_tokens_per_second": 3657.602 |
|
}, |
|
{ |
|
"epoch": 1.5017103762827824, |
|
"grad_norm": 0.4755711853504181, |
|
"learning_rate": 2.523799584848942e-05, |
|
"loss": 0.0363, |
|
"num_input_tokens_seen": 3639056, |
|
"step": 165, |
|
"train_runtime": 994.801, |
|
"train_tokens_per_second": 3658.074 |
|
}, |
|
{ |
|
"epoch": 1.547320410490308, |
|
"grad_norm": 0.4198484420776367, |
|
"learning_rate": 2.4048232288265253e-05, |
|
"loss": 0.0294, |
|
"num_input_tokens_seen": 3749824, |
|
"step": 170, |
|
"train_runtime": 1025.0451, |
|
"train_tokens_per_second": 3658.204 |
|
}, |
|
{ |
|
"epoch": 1.5929304446978336, |
|
"grad_norm": 0.40441790223121643, |
|
"learning_rate": 2.2860624788029013e-05, |
|
"loss": 0.0334, |
|
"num_input_tokens_seen": 3859776, |
|
"step": 175, |
|
"train_runtime": 1055.0684, |
|
"train_tokens_per_second": 3658.318 |
|
}, |
|
{ |
|
"epoch": 1.6385404789053593, |
|
"grad_norm": 0.7088226079940796, |
|
"learning_rate": 2.167786366074365e-05, |
|
"loss": 0.0296, |
|
"num_input_tokens_seen": 3970576, |
|
"step": 180, |
|
"train_runtime": 1085.3469, |
|
"train_tokens_per_second": 3658.347 |
|
}, |
|
{ |
|
"epoch": 1.6841505131128849, |
|
"grad_norm": 0.40556278824806213, |
|
"learning_rate": 2.0502628240778655e-05, |
|
"loss": 0.0319, |
|
"num_input_tokens_seen": 4081568, |
|
"step": 185, |
|
"train_runtime": 1115.6872, |
|
"train_tokens_per_second": 3658.344 |
|
}, |
|
{ |
|
"epoch": 1.7297605473204105, |
|
"grad_norm": 0.32311075925827026, |
|
"learning_rate": 1.9337580814355888e-05, |
|
"loss": 0.0324, |
|
"num_input_tokens_seen": 4191264, |
|
"step": 190, |
|
"train_runtime": 1145.6081, |
|
"train_tokens_per_second": 3658.55 |
|
}, |
|
{ |
|
"epoch": 1.7753705815279361, |
|
"grad_norm": 0.3284579813480377, |
|
"learning_rate": 1.8185360588615058e-05, |
|
"loss": 0.0337, |
|
"num_input_tokens_seen": 4301440, |
|
"step": 195, |
|
"train_runtime": 1175.6929, |
|
"train_tokens_per_second": 3658.642 |
|
}, |
|
{ |
|
"epoch": 1.8209806157354618, |
|
"grad_norm": 0.2036353498697281, |
|
"learning_rate": 1.7048577712960627e-05, |
|
"loss": 0.0356, |
|
"num_input_tokens_seen": 4412000, |
|
"step": 200, |
|
"train_runtime": 1206.0273, |
|
"train_tokens_per_second": 3658.292 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 330, |
|
"num_input_tokens_seen": 4412000, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.99225523994624e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|