{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04561003420752566, "grad_norm": 1.0824084281921387, "learning_rate": 4.9981876195011844e-05, "loss": 0.4811, "num_input_tokens_seen": 110848, "step": 5, "train_runtime": 30.3319, "train_tokens_per_second": 3654.501 }, { "epoch": 0.09122006841505131, "grad_norm": 0.6754106879234314, "learning_rate": 4.9908293271567286e-05, "loss": 0.2124, "num_input_tokens_seen": 220880, "step": 10, "train_runtime": 60.3932, "train_tokens_per_second": 3657.363 }, { "epoch": 0.13683010262257697, "grad_norm": 0.39871442317962646, "learning_rate": 4.977828505250903e-05, "loss": 0.1393, "num_input_tokens_seen": 332160, "step": 15, "train_runtime": 90.9477, "train_tokens_per_second": 3652.209 }, { "epoch": 0.18244013683010263, "grad_norm": 0.3741743564605713, "learning_rate": 4.959214604826831e-05, "loss": 0.1043, "num_input_tokens_seen": 442608, "step": 20, "train_runtime": 121.2095, "train_tokens_per_second": 3651.596 }, { "epoch": 0.22805017103762829, "grad_norm": 0.43203699588775635, "learning_rate": 4.935029792355834e-05, "loss": 0.0953, "num_input_tokens_seen": 552768, "step": 25, "train_runtime": 151.3115, "train_tokens_per_second": 3653.179 }, { "epoch": 0.27366020524515394, "grad_norm": 0.40135976672172546, "learning_rate": 4.9053288542168185e-05, "loss": 0.0867, "num_input_tokens_seen": 663056, "step": 30, "train_runtime": 181.4665, "train_tokens_per_second": 3653.876 }, { "epoch": 0.31927023945267957, "grad_norm": 0.33862611651420593, "learning_rate": 4.870179072587499e-05, "loss": 0.0625, "num_input_tokens_seen": 773936, "step": 35, "train_runtime": 211.7267, "train_tokens_per_second": 3655.353 }, { "epoch": 0.36488027366020526, "grad_norm": 0.41058799624443054, "learning_rate": 4.829660073028631e-05, "loss": 0.0727, "num_input_tokens_seen": 884832, "step": 40, "train_runtime": 241.9754, "train_tokens_per_second": 3656.702 }, { "epoch": 0.4104903078677309, "grad_norm": 0.5706074833869934, "learning_rate": 4.783863644106502e-05, "loss": 0.0727, "num_input_tokens_seen": 995744, "step": 45, "train_runtime": 272.301, "train_tokens_per_second": 3656.777 }, { "epoch": 0.45610034207525657, "grad_norm": 0.31977686285972595, "learning_rate": 4.73289352946231e-05, "loss": 0.0676, "num_input_tokens_seen": 1106256, "step": 50, "train_runtime": 302.3755, "train_tokens_per_second": 3658.551 }, { "epoch": 0.5017103762827823, "grad_norm": 0.29765012860298157, "learning_rate": 4.6768651927994434e-05, "loss": 0.0635, "num_input_tokens_seen": 1217840, "step": 55, "train_runtime": 332.9163, "train_tokens_per_second": 3658.097 }, { "epoch": 0.5473204104903079, "grad_norm": 0.3982709050178528, "learning_rate": 4.6159055563210604e-05, "loss": 0.0586, "num_input_tokens_seen": 1328544, "step": 60, "train_runtime": 363.1815, "train_tokens_per_second": 3658.072 }, { "epoch": 0.5929304446978335, "grad_norm": 0.24241122603416443, "learning_rate": 4.550152713210478e-05, "loss": 0.0572, "num_input_tokens_seen": 1438848, "step": 65, "train_runtime": 393.3602, "train_tokens_per_second": 3657.838 }, { "epoch": 0.6385404789053591, "grad_norm": 0.2856714427471161, "learning_rate": 4.479755614805688e-05, "loss": 0.054, "num_input_tokens_seen": 1547904, "step": 70, "train_runtime": 423.1722, "train_tokens_per_second": 3657.858 }, { "epoch": 0.6841505131128849, "grad_norm": 0.3355858325958252, "learning_rate": 4.404873733176678e-05, "loss": 0.047, "num_input_tokens_seen": 1658736, "step": 75, "train_runtime": 453.4813, "train_tokens_per_second": 3657.782 }, { "epoch": 0.7297605473204105, "grad_norm": 0.3131825029850006, "learning_rate": 4.3256766998698936e-05, "loss": 0.0471, "num_input_tokens_seen": 1769248, "step": 80, "train_runtime": 483.7293, "train_tokens_per_second": 3657.517 }, { "epoch": 0.7753705815279361, "grad_norm": 0.2504868805408478, "learning_rate": 4.242343921638234e-05, "loss": 0.0408, "num_input_tokens_seen": 1879024, "step": 85, "train_runtime": 513.7413, "train_tokens_per_second": 3657.529 }, { "epoch": 0.8209806157354618, "grad_norm": 0.4044341742992401, "learning_rate": 4.155064174027047e-05, "loss": 0.0398, "num_input_tokens_seen": 1991024, "step": 90, "train_runtime": 544.3342, "train_tokens_per_second": 3657.724 }, { "epoch": 0.8665906499429875, "grad_norm": 0.3132927119731903, "learning_rate": 4.064035173736804e-05, "loss": 0.0369, "num_input_tokens_seen": 2102064, "step": 95, "train_runtime": 574.6371, "train_tokens_per_second": 3658.072 }, { "epoch": 0.9122006841505131, "grad_norm": 0.3075104057788849, "learning_rate": 3.969463130731183e-05, "loss": 0.0408, "num_input_tokens_seen": 2212432, "step": 100, "train_runtime": 604.7406, "train_tokens_per_second": 3658.481 }, { "epoch": 0.9578107183580388, "grad_norm": 0.3352231979370117, "learning_rate": 3.871562281105175e-05, "loss": 0.0441, "num_input_tokens_seen": 2322032, "step": 105, "train_runtime": 635.488, "train_tokens_per_second": 3653.936 }, { "epoch": 1.0, "grad_norm": 0.36991724371910095, "learning_rate": 3.770554401771423e-05, "loss": 0.0356, "num_input_tokens_seen": 2423872, "step": 110, "train_runtime": 663.2539, "train_tokens_per_second": 3654.516 }, { "epoch": 1.0456100342075256, "grad_norm": 0.31266242265701294, "learning_rate": 3.6666683080641846e-05, "loss": 0.0358, "num_input_tokens_seen": 2534752, "step": 115, "train_runtime": 693.4736, "train_tokens_per_second": 3655.153 }, { "epoch": 1.0912200684150513, "grad_norm": 0.29535725712776184, "learning_rate": 3.5601393353990046e-05, "loss": 0.0388, "num_input_tokens_seen": 2644464, "step": 120, "train_runtime": 723.4647, "train_tokens_per_second": 3655.277 }, { "epoch": 1.1368301026225769, "grad_norm": 0.37989553809165955, "learning_rate": 3.4512088061623075e-05, "loss": 0.0323, "num_input_tokens_seen": 2755568, "step": 125, "train_runtime": 753.8099, "train_tokens_per_second": 3655.521 }, { "epoch": 1.1824401368301025, "grad_norm": 0.2450297474861145, "learning_rate": 3.3401234830385756e-05, "loss": 0.031, "num_input_tokens_seen": 2865744, "step": 130, "train_runtime": 783.9325, "train_tokens_per_second": 3655.6 }, { "epoch": 1.2280501710376284, "grad_norm": 0.35642582178115845, "learning_rate": 3.2271350100134975e-05, "loss": 0.0279, "num_input_tokens_seen": 2976304, "step": 135, "train_runtime": 814.2192, "train_tokens_per_second": 3655.409 }, { "epoch": 1.273660205245154, "grad_norm": 0.34205862879753113, "learning_rate": 3.11249934231941e-05, "loss": 0.0285, "num_input_tokens_seen": 3086224, "step": 140, "train_runtime": 844.255, "train_tokens_per_second": 3655.559 }, { "epoch": 1.3192702394526796, "grad_norm": 0.4949014484882355, "learning_rate": 2.996476166614364e-05, "loss": 0.0292, "num_input_tokens_seen": 3196352, "step": 145, "train_runtime": 874.2926, "train_tokens_per_second": 3655.929 }, { "epoch": 1.3648802736602053, "grad_norm": 0.3114229142665863, "learning_rate": 2.8793283127083292e-05, "loss": 0.0299, "num_input_tokens_seen": 3307744, "step": 150, "train_runtime": 904.5859, "train_tokens_per_second": 3656.639 }, { "epoch": 1.4104903078677309, "grad_norm": 0.3953075706958771, "learning_rate": 2.761321158169134e-05, "loss": 0.0345, "num_input_tokens_seen": 3418016, "step": 155, "train_runtime": 934.6229, "train_tokens_per_second": 3657.107 }, { "epoch": 1.4561003420752565, "grad_norm": 0.5266784429550171, "learning_rate": 2.6427220271569203e-05, "loss": 0.0338, "num_input_tokens_seen": 3528544, "step": 160, "train_runtime": 964.715, "train_tokens_per_second": 3657.602 }, { "epoch": 1.5017103762827824, "grad_norm": 0.4755711853504181, "learning_rate": 2.523799584848942e-05, "loss": 0.0363, "num_input_tokens_seen": 3639056, "step": 165, "train_runtime": 994.801, "train_tokens_per_second": 3658.074 }, { "epoch": 1.547320410490308, "grad_norm": 0.4198484420776367, "learning_rate": 2.4048232288265253e-05, "loss": 0.0294, "num_input_tokens_seen": 3749824, "step": 170, "train_runtime": 1025.0451, "train_tokens_per_second": 3658.204 }, { "epoch": 1.5929304446978336, "grad_norm": 0.40441790223121643, "learning_rate": 2.2860624788029013e-05, "loss": 0.0334, "num_input_tokens_seen": 3859776, "step": 175, "train_runtime": 1055.0684, "train_tokens_per_second": 3658.318 }, { "epoch": 1.6385404789053593, "grad_norm": 0.7088226079940796, "learning_rate": 2.167786366074365e-05, "loss": 0.0296, "num_input_tokens_seen": 3970576, "step": 180, "train_runtime": 1085.3469, "train_tokens_per_second": 3658.347 }, { "epoch": 1.6841505131128849, "grad_norm": 0.40556278824806213, "learning_rate": 2.0502628240778655e-05, "loss": 0.0319, "num_input_tokens_seen": 4081568, "step": 185, "train_runtime": 1115.6872, "train_tokens_per_second": 3658.344 }, { "epoch": 1.7297605473204105, "grad_norm": 0.32311075925827026, "learning_rate": 1.9337580814355888e-05, "loss": 0.0324, "num_input_tokens_seen": 4191264, "step": 190, "train_runtime": 1145.6081, "train_tokens_per_second": 3658.55 }, { "epoch": 1.7753705815279361, "grad_norm": 0.3284579813480377, "learning_rate": 1.8185360588615058e-05, "loss": 0.0337, "num_input_tokens_seen": 4301440, "step": 195, "train_runtime": 1175.6929, "train_tokens_per_second": 3658.642 }, { "epoch": 1.8209806157354618, "grad_norm": 0.2036353498697281, "learning_rate": 1.7048577712960627e-05, "loss": 0.0356, "num_input_tokens_seen": 4412000, "step": 200, "train_runtime": 1206.0273, "train_tokens_per_second": 3658.292 }, { "epoch": 1.8665906499429874, "grad_norm": 0.2839590907096863, "learning_rate": 1.5929807366233977e-05, "loss": 0.0259, "num_input_tokens_seen": 4521984, "step": 205, "train_runtime": 1237.1367, "train_tokens_per_second": 3655.202 }, { "epoch": 1.912200684150513, "grad_norm": 0.3934095799922943, "learning_rate": 1.4831583923104999e-05, "loss": 0.0372, "num_input_tokens_seen": 4632176, "step": 210, "train_runtime": 1267.3175, "train_tokens_per_second": 3655.103 }, { "epoch": 1.9578107183580387, "grad_norm": 0.30704551935195923, "learning_rate": 1.3756395212898359e-05, "loss": 0.0261, "num_input_tokens_seen": 4743552, "step": 215, "train_runtime": 1297.7646, "train_tokens_per_second": 3655.171 }, { "epoch": 2.0, "grad_norm": 0.2414654642343521, "learning_rate": 1.2706676883859903e-05, "loss": 0.0256, "num_input_tokens_seen": 4845664, "step": 220, "train_runtime": 1325.6139, "train_tokens_per_second": 3655.411 }, { "epoch": 2.0456100342075256, "grad_norm": 0.271589457988739, "learning_rate": 1.1684806885630004e-05, "loss": 0.024, "num_input_tokens_seen": 4955440, "step": 225, "train_runtime": 1355.6355, "train_tokens_per_second": 3655.437 }, { "epoch": 2.0912200684150513, "grad_norm": 0.3638203740119934, "learning_rate": 1.0693100082422763e-05, "loss": 0.022, "num_input_tokens_seen": 5065744, "step": 230, "train_runtime": 1385.6557, "train_tokens_per_second": 3655.846 }, { "epoch": 2.136830102622577, "grad_norm": 0.22529903054237366, "learning_rate": 9.733803009114045e-06, "loss": 0.0223, "num_input_tokens_seen": 5177424, "step": 235, "train_runtime": 1416.0752, "train_tokens_per_second": 3656.179 }, { "epoch": 2.1824401368301025, "grad_norm": 0.41837063431739807, "learning_rate": 8.809088782117452e-06, "loss": 0.0251, "num_input_tokens_seen": 5287824, "step": 240, "train_runtime": 1446.1526, "train_tokens_per_second": 3656.477 }, { "epoch": 2.228050171037628, "grad_norm": 0.38458940386772156, "learning_rate": 7.921052176576644e-06, "loss": 0.0223, "num_input_tokens_seen": 5398032, "step": 245, "train_runtime": 1476.1489, "train_tokens_per_second": 3656.834 }, { "epoch": 2.2736602052451538, "grad_norm": 0.3993557095527649, "learning_rate": 7.071704881025915e-06, "loss": 0.0255, "num_input_tokens_seen": 5509248, "step": 250, "train_runtime": 1506.4711, "train_tokens_per_second": 3657.055 }, { "epoch": 2.3192702394526794, "grad_norm": 0.311338871717453, "learning_rate": 6.2629709402686535e-06, "loss": 0.0211, "num_input_tokens_seen": 5619920, "step": 255, "train_runtime": 1536.6024, "train_tokens_per_second": 3657.368 }, { "epoch": 2.364880273660205, "grad_norm": 0.3177950382232666, "learning_rate": 5.49668239679699e-06, "loss": 0.0235, "num_input_tokens_seen": 5729536, "step": 260, "train_runtime": 1566.4697, "train_tokens_per_second": 3657.61 }, { "epoch": 2.4104903078677307, "grad_norm": 0.2807183265686035, "learning_rate": 4.7745751406263165e-06, "loss": 0.0249, "num_input_tokens_seen": 5840848, "step": 265, "train_runtime": 1596.7614, "train_tokens_per_second": 3657.934 }, { "epoch": 2.4561003420752567, "grad_norm": 0.3168954849243164, "learning_rate": 4.098284976946101e-06, "loss": 0.0228, "num_input_tokens_seen": 5951840, "step": 270, "train_runtime": 1626.9716, "train_tokens_per_second": 3658.232 }, { "epoch": 2.5017103762827824, "grad_norm": 0.5031352639198303, "learning_rate": 3.4693439204949858e-06, "loss": 0.0304, "num_input_tokens_seen": 6062160, "step": 275, "train_runtime": 1656.9898, "train_tokens_per_second": 3658.538 }, { "epoch": 2.547320410490308, "grad_norm": 0.28105977177619934, "learning_rate": 2.889176725054643e-06, "loss": 0.023, "num_input_tokens_seen": 6172976, "step": 280, "train_runtime": 1687.1404, "train_tokens_per_second": 3658.84 }, { "epoch": 2.5929304446978336, "grad_norm": 0.3535321056842804, "learning_rate": 2.3590976559242278e-06, "loss": 0.0263, "num_input_tokens_seen": 6283600, "step": 285, "train_runtime": 1717.249, "train_tokens_per_second": 3659.108 }, { "epoch": 2.6385404789053593, "grad_norm": 0.3748933970928192, "learning_rate": 1.8803075126867715e-06, "loss": 0.0259, "num_input_tokens_seen": 6394176, "step": 290, "train_runtime": 1747.3641, "train_tokens_per_second": 3659.327 }, { "epoch": 2.684150513112885, "grad_norm": 0.2608453035354614, "learning_rate": 1.4538909090118846e-06, "loss": 0.0217, "num_input_tokens_seen": 6503616, "step": 295, "train_runtime": 1777.1238, "train_tokens_per_second": 3659.63 }, { "epoch": 2.7297605473204105, "grad_norm": 0.2588518559932709, "learning_rate": 1.0808138156569614e-06, "loss": 0.0259, "num_input_tokens_seen": 6614736, "step": 300, "train_runtime": 1807.3109, "train_tokens_per_second": 3659.988 }, { "epoch": 2.775370581527936, "grad_norm": 0.38416263461112976, "learning_rate": 7.619213722327185e-07, "loss": 0.0259, "num_input_tokens_seen": 6725088, "step": 305, "train_runtime": 1838.354, "train_tokens_per_second": 3658.212 }, { "epoch": 2.8209806157354618, "grad_norm": 0.3531897962093353, "learning_rate": 4.979359726901639e-07, "loss": 0.0279, "num_input_tokens_seen": 6834816, "step": 310, "train_runtime": 1868.2865, "train_tokens_per_second": 3658.334 }, { "epoch": 2.8665906499429874, "grad_norm": 0.4269830584526062, "learning_rate": 2.894556288659395e-07, "loss": 0.0221, "num_input_tokens_seen": 6945024, "step": 315, "train_runtime": 1898.3866, "train_tokens_per_second": 3658.382 }, { "epoch": 2.912200684150513, "grad_norm": 0.5062984228134155, "learning_rate": 1.3695261579316777e-07, "loss": 0.0246, "num_input_tokens_seen": 7055120, "step": 320, "train_runtime": 1928.4784, "train_tokens_per_second": 3658.387 }, { "epoch": 2.9578107183580387, "grad_norm": 0.20750342309474945, "learning_rate": 4.07724018466088e-08, "loss": 0.0189, "num_input_tokens_seen": 7166944, "step": 325, "train_runtime": 1959.0815, "train_tokens_per_second": 3658.318 }, { "epoch": 3.0, "grad_norm": 0.6835935115814209, "learning_rate": 1.132866145678313e-09, "loss": 0.0228, "num_input_tokens_seen": 7269760, "step": 330, "train_runtime": 1987.0937, "train_tokens_per_second": 3658.489 } ], "logging_steps": 5, "max_steps": 330, "num_input_tokens_seen": 7269760, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.282687546045235e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }