{ "best_metric": 1.4004004001617432, "best_model_checkpoint": "finetuned_student_model/checkpoint-900", "epoch": 1.9969604863221884, "eval_steps": 100, "global_step": 904, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022105554020447636, "grad_norm": 9.875, "learning_rate": 0.0002973392461197339, "loss": 3.2514, "mean_token_accuracy": 0.4204087435267866, "step": 10 }, { "epoch": 0.04421110804089527, "grad_norm": 3.984375, "learning_rate": 0.0002940133037694013, "loss": 2.3718, "mean_token_accuracy": 0.5071762848645449, "step": 20 }, { "epoch": 0.06631666206134292, "grad_norm": 3.65625, "learning_rate": 0.0002906873614190687, "loss": 2.1087, "mean_token_accuracy": 0.5472177878022194, "step": 30 }, { "epoch": 0.08842221608179054, "grad_norm": 2.9375, "learning_rate": 0.0002873614190687361, "loss": 2.0888, "mean_token_accuracy": 0.5551092017441988, "step": 40 }, { "epoch": 0.11052777010223819, "grad_norm": 2.9375, "learning_rate": 0.0002840354767184035, "loss": 2.1327, "mean_token_accuracy": 0.5446394145488739, "step": 50 }, { "epoch": 0.13263332412268583, "grad_norm": 2.78125, "learning_rate": 0.0002807095343680709, "loss": 2.0511, "mean_token_accuracy": 0.5573025777935982, "step": 60 }, { "epoch": 0.15473887814313347, "grad_norm": 2.921875, "learning_rate": 0.0002773835920177383, "loss": 2.0515, "mean_token_accuracy": 0.5619197305291891, "step": 70 }, { "epoch": 0.1768444321635811, "grad_norm": 2.609375, "learning_rate": 0.0002740576496674058, "loss": 1.9854, "mean_token_accuracy": 0.5682128138840199, "step": 80 }, { "epoch": 0.19894998618402873, "grad_norm": 2.40625, "learning_rate": 0.00027073170731707315, "loss": 1.9689, "mean_token_accuracy": 0.5708833433687687, "step": 90 }, { "epoch": 0.22105554020447638, "grad_norm": 2.78125, "learning_rate": 0.0002674057649667406, "loss": 1.9959, "mean_token_accuracy": 0.5728900354355574, "step": 100 }, { "epoch": 0.22105554020447638, "eval_loss": 1.9453095197677612, "eval_mean_token_accuracy": 0.5785076600132566, "eval_runtime": 23.0365, "eval_samples_per_second": 34.293, "eval_steps_per_second": 4.298, "step": 100 }, { "epoch": 0.24316109422492402, "grad_norm": 2.5, "learning_rate": 0.00026407982261640795, "loss": 1.967, "mean_token_accuracy": 0.5708974912762642, "step": 110 }, { "epoch": 0.26526664824537166, "grad_norm": 2.6875, "learning_rate": 0.0002607538802660754, "loss": 1.9176, "mean_token_accuracy": 0.5783881578594446, "step": 120 }, { "epoch": 0.2873722022658193, "grad_norm": 2.3125, "learning_rate": 0.00025742793791574275, "loss": 1.8481, "mean_token_accuracy": 0.5897452015429735, "step": 130 }, { "epoch": 0.30947775628626695, "grad_norm": 2.484375, "learning_rate": 0.0002541019955654102, "loss": 1.8706, "mean_token_accuracy": 0.5862758502364158, "step": 140 }, { "epoch": 0.33158331030671456, "grad_norm": 2.15625, "learning_rate": 0.0002507760532150776, "loss": 1.8667, "mean_token_accuracy": 0.5883878566324711, "step": 150 }, { "epoch": 0.3536888643271622, "grad_norm": 2.421875, "learning_rate": 0.000247450110864745, "loss": 1.8101, "mean_token_accuracy": 0.5959903877228498, "step": 160 }, { "epoch": 0.37579441834760985, "grad_norm": 2.34375, "learning_rate": 0.00024412416851441238, "loss": 1.8353, "mean_token_accuracy": 0.5912164930254221, "step": 170 }, { "epoch": 0.39789997236805746, "grad_norm": 2.25, "learning_rate": 0.00024079822616407978, "loss": 1.7821, "mean_token_accuracy": 0.6001696892082691, "step": 180 }, { "epoch": 0.42000552638850513, "grad_norm": 2.21875, "learning_rate": 0.0002374722838137472, "loss": 1.8351, "mean_token_accuracy": 0.5922294020652771, "step": 190 }, { "epoch": 0.44211108040895275, "grad_norm": 2.296875, "learning_rate": 0.0002341463414634146, "loss": 1.7342, "mean_token_accuracy": 0.6008941765874625, "step": 200 }, { "epoch": 0.44211108040895275, "eval_loss": 1.7812447547912598, "eval_mean_token_accuracy": 0.6040364732645979, "eval_runtime": 21.5006, "eval_samples_per_second": 36.743, "eval_steps_per_second": 4.605, "step": 200 }, { "epoch": 0.46421663442940037, "grad_norm": 2.015625, "learning_rate": 0.000230820399113082, "loss": 1.7778, "mean_token_accuracy": 0.6000470589846373, "step": 210 }, { "epoch": 0.48632218844984804, "grad_norm": 1.890625, "learning_rate": 0.0002274944567627494, "loss": 1.7515, "mean_token_accuracy": 0.6070282235741615, "step": 220 }, { "epoch": 0.5084277424702957, "grad_norm": 2.109375, "learning_rate": 0.0002241685144124168, "loss": 1.7348, "mean_token_accuracy": 0.6033936321735383, "step": 230 }, { "epoch": 0.5305332964907433, "grad_norm": 1.8515625, "learning_rate": 0.0002208425720620842, "loss": 1.7198, "mean_token_accuracy": 0.6069234035909176, "step": 240 }, { "epoch": 0.5526388505111909, "grad_norm": 1.984375, "learning_rate": 0.00021751662971175166, "loss": 1.6866, "mean_token_accuracy": 0.6128835912793875, "step": 250 }, { "epoch": 0.5747444045316386, "grad_norm": 2.0625, "learning_rate": 0.00021419068736141907, "loss": 1.6751, "mean_token_accuracy": 0.6139085631817579, "step": 260 }, { "epoch": 0.5968499585520862, "grad_norm": 2.203125, "learning_rate": 0.00021086474501108647, "loss": 1.7237, "mean_token_accuracy": 0.6062099590897561, "step": 270 }, { "epoch": 0.6189555125725339, "grad_norm": 2.09375, "learning_rate": 0.00020753880266075387, "loss": 1.6894, "mean_token_accuracy": 0.6122250266373157, "step": 280 }, { "epoch": 0.6410610665929815, "grad_norm": 1.7734375, "learning_rate": 0.00020421286031042127, "loss": 1.6325, "mean_token_accuracy": 0.6204121351242066, "step": 290 }, { "epoch": 0.6631666206134291, "grad_norm": 2.0625, "learning_rate": 0.0002008869179600887, "loss": 1.6668, "mean_token_accuracy": 0.615133136883378, "step": 300 }, { "epoch": 0.6631666206134291, "eval_loss": 1.6475698947906494, "eval_mean_token_accuracy": 0.6219914049813242, "eval_runtime": 21.4767, "eval_samples_per_second": 36.784, "eval_steps_per_second": 4.61, "step": 300 }, { "epoch": 0.6852721746338768, "grad_norm": 1.8359375, "learning_rate": 0.0001975609756097561, "loss": 1.614, "mean_token_accuracy": 0.6179928559809923, "step": 310 }, { "epoch": 0.7073777286543244, "grad_norm": 10.6875, "learning_rate": 0.0001942350332594235, "loss": 1.7166, "mean_token_accuracy": 0.6136065050959587, "step": 320 }, { "epoch": 0.729483282674772, "grad_norm": 1.8203125, "learning_rate": 0.0001909090909090909, "loss": 1.657, "mean_token_accuracy": 0.6190799050033092, "step": 330 }, { "epoch": 0.7515888366952197, "grad_norm": 2.078125, "learning_rate": 0.0001875831485587583, "loss": 1.6038, "mean_token_accuracy": 0.6287192285060883, "step": 340 }, { "epoch": 0.7736943907156673, "grad_norm": 1.71875, "learning_rate": 0.0001842572062084257, "loss": 1.5914, "mean_token_accuracy": 0.6284744247794152, "step": 350 }, { "epoch": 0.7957999447361149, "grad_norm": 2.0, "learning_rate": 0.00018093126385809312, "loss": 1.6075, "mean_token_accuracy": 0.6283752031624317, "step": 360 }, { "epoch": 0.8179054987565626, "grad_norm": 2.0, "learning_rate": 0.00017760532150776052, "loss": 1.6082, "mean_token_accuracy": 0.6261149801313877, "step": 370 }, { "epoch": 0.8400110527770103, "grad_norm": 1.8984375, "learning_rate": 0.00017427937915742792, "loss": 1.5879, "mean_token_accuracy": 0.6261836618185044, "step": 380 }, { "epoch": 0.8621166067974578, "grad_norm": 2.515625, "learning_rate": 0.00017095343680709532, "loss": 1.5785, "mean_token_accuracy": 0.631008780002594, "step": 390 }, { "epoch": 0.8842221608179055, "grad_norm": 2.015625, "learning_rate": 0.00016762749445676272, "loss": 1.4931, "mean_token_accuracy": 0.6441986732184887, "step": 400 }, { "epoch": 0.8842221608179055, "eval_loss": 1.5408236980438232, "eval_mean_token_accuracy": 0.6406004489070237, "eval_runtime": 21.4648, "eval_samples_per_second": 36.804, "eval_steps_per_second": 4.612, "step": 400 }, { "epoch": 0.9063277148383532, "grad_norm": 1.6328125, "learning_rate": 0.00016430155210643015, "loss": 1.5441, "mean_token_accuracy": 0.6400343291461468, "step": 410 }, { "epoch": 0.9284332688588007, "grad_norm": 1.796875, "learning_rate": 0.00016097560975609755, "loss": 1.5299, "mean_token_accuracy": 0.6389802560210228, "step": 420 }, { "epoch": 0.9505388228792484, "grad_norm": 1.7890625, "learning_rate": 0.00015764966740576495, "loss": 1.524, "mean_token_accuracy": 0.6369420018047094, "step": 430 }, { "epoch": 0.9726443768996961, "grad_norm": 1.734375, "learning_rate": 0.00015432372505543235, "loss": 1.5423, "mean_token_accuracy": 0.6327314972877502, "step": 440 }, { "epoch": 0.9947499309201436, "grad_norm": 1.84375, "learning_rate": 0.00015099778270509975, "loss": 1.5033, "mean_token_accuracy": 0.6431983485817909, "step": 450 }, { "epoch": 1.0154738878143132, "grad_norm": 1.6640625, "learning_rate": 0.00014767184035476718, "loss": 1.2191, "mean_token_accuracy": 0.6965674503644308, "step": 460 }, { "epoch": 1.037579441834761, "grad_norm": 1.7890625, "learning_rate": 0.00014434589800443458, "loss": 1.0348, "mean_token_accuracy": 0.7245359934866429, "step": 470 }, { "epoch": 1.0596849958552086, "grad_norm": 1.703125, "learning_rate": 0.00014101995565410198, "loss": 1.0563, "mean_token_accuracy": 0.7208410792052746, "step": 480 }, { "epoch": 1.0817905498756564, "grad_norm": 1.5390625, "learning_rate": 0.00013769401330376938, "loss": 1.0467, "mean_token_accuracy": 0.7243687815964222, "step": 490 }, { "epoch": 1.103896103896104, "grad_norm": 1.734375, "learning_rate": 0.00013436807095343678, "loss": 1.048, "mean_token_accuracy": 0.718181136995554, "step": 500 }, { "epoch": 1.103896103896104, "eval_loss": 1.5222158432006836, "eval_mean_token_accuracy": 0.6474175338793282, "eval_runtime": 21.4789, "eval_samples_per_second": 36.78, "eval_steps_per_second": 4.609, "step": 500 }, { "epoch": 1.1260016579165515, "grad_norm": 1.7421875, "learning_rate": 0.0001310421286031042, "loss": 1.0292, "mean_token_accuracy": 0.724801865965128, "step": 510 }, { "epoch": 1.148107211936999, "grad_norm": 1.5546875, "learning_rate": 0.0001277161862527716, "loss": 1.0431, "mean_token_accuracy": 0.7216434337198734, "step": 520 }, { "epoch": 1.1702127659574468, "grad_norm": 1.671875, "learning_rate": 0.000124390243902439, "loss": 1.0261, "mean_token_accuracy": 0.7233444675803185, "step": 530 }, { "epoch": 1.1923183199778944, "grad_norm": 1.734375, "learning_rate": 0.00012106430155210642, "loss": 1.0001, "mean_token_accuracy": 0.7310704313218593, "step": 540 }, { "epoch": 1.2144238739983422, "grad_norm": 1.78125, "learning_rate": 0.00011773835920177382, "loss": 0.9996, "mean_token_accuracy": 0.7315604917705059, "step": 550 }, { "epoch": 1.2365294280187897, "grad_norm": 1.8203125, "learning_rate": 0.00011441241685144122, "loss": 0.9732, "mean_token_accuracy": 0.7363451808691025, "step": 560 }, { "epoch": 1.2586349820392373, "grad_norm": 1.796875, "learning_rate": 0.00011108647450110864, "loss": 1.0185, "mean_token_accuracy": 0.7294765569269657, "step": 570 }, { "epoch": 1.280740536059685, "grad_norm": 1.6953125, "learning_rate": 0.00010776053215077604, "loss": 0.9948, "mean_token_accuracy": 0.7329611636698246, "step": 580 }, { "epoch": 1.3028460900801326, "grad_norm": 1.7265625, "learning_rate": 0.00010443458980044345, "loss": 0.9808, "mean_token_accuracy": 0.7348393484950065, "step": 590 }, { "epoch": 1.3249516441005802, "grad_norm": 1.6875, "learning_rate": 0.00010110864745011085, "loss": 0.9809, "mean_token_accuracy": 0.7370587438344955, "step": 600 }, { "epoch": 1.3249516441005802, "eval_loss": 1.4869917631149292, "eval_mean_token_accuracy": 0.6575891068487456, "eval_runtime": 21.4699, "eval_samples_per_second": 36.796, "eval_steps_per_second": 4.611, "step": 600 }, { "epoch": 1.347057198121028, "grad_norm": 1.7578125, "learning_rate": 9.778270509977825e-05, "loss": 0.9891, "mean_token_accuracy": 0.7354318417608738, "step": 610 }, { "epoch": 1.3691627521414755, "grad_norm": 1.703125, "learning_rate": 9.445676274944568e-05, "loss": 0.9653, "mean_token_accuracy": 0.7393273778259755, "step": 620 }, { "epoch": 1.391268306161923, "grad_norm": 1.671875, "learning_rate": 9.113082039911308e-05, "loss": 0.9552, "mean_token_accuracy": 0.7432896822690964, "step": 630 }, { "epoch": 1.4133738601823709, "grad_norm": 1.6015625, "learning_rate": 8.780487804878047e-05, "loss": 0.9713, "mean_token_accuracy": 0.7419402815401555, "step": 640 }, { "epoch": 1.4354794142028184, "grad_norm": 1.640625, "learning_rate": 8.44789356984479e-05, "loss": 0.9548, "mean_token_accuracy": 0.7421382986009121, "step": 650 }, { "epoch": 1.4575849682232662, "grad_norm": 1.6796875, "learning_rate": 8.11529933481153e-05, "loss": 0.9824, "mean_token_accuracy": 0.737388264387846, "step": 660 }, { "epoch": 1.4796905222437138, "grad_norm": 1.7734375, "learning_rate": 7.78270509977827e-05, "loss": 0.937, "mean_token_accuracy": 0.7477126508951187, "step": 670 }, { "epoch": 1.5017960762641613, "grad_norm": 1.8984375, "learning_rate": 7.450110864745011e-05, "loss": 0.9433, "mean_token_accuracy": 0.744731155782938, "step": 680 }, { "epoch": 1.523901630284609, "grad_norm": 1.765625, "learning_rate": 7.117516629711751e-05, "loss": 0.9259, "mean_token_accuracy": 0.745546705648303, "step": 690 }, { "epoch": 1.5460071843050567, "grad_norm": 1.625, "learning_rate": 6.784922394678491e-05, "loss": 0.9625, "mean_token_accuracy": 0.7406142510473728, "step": 700 }, { "epoch": 1.5460071843050567, "eval_loss": 1.429708480834961, "eval_mean_token_accuracy": 0.6674101424939705, "eval_runtime": 21.4679, "eval_samples_per_second": 36.799, "eval_steps_per_second": 4.612, "step": 700 }, { "epoch": 1.5681127383255042, "grad_norm": 1.75, "learning_rate": 6.452328159645232e-05, "loss": 0.9285, "mean_token_accuracy": 0.7476192332804203, "step": 710 }, { "epoch": 1.590218292345952, "grad_norm": 1.6640625, "learning_rate": 6.119733924611973e-05, "loss": 0.9173, "mean_token_accuracy": 0.75327173396945, "step": 720 }, { "epoch": 1.6123238463663996, "grad_norm": 1.765625, "learning_rate": 5.787139689578713e-05, "loss": 0.9392, "mean_token_accuracy": 0.7470508739352226, "step": 730 }, { "epoch": 1.6344294003868471, "grad_norm": 1.484375, "learning_rate": 5.454545454545454e-05, "loss": 0.9148, "mean_token_accuracy": 0.7503846064209938, "step": 740 }, { "epoch": 1.6565349544072947, "grad_norm": 1.578125, "learning_rate": 5.121951219512195e-05, "loss": 0.9286, "mean_token_accuracy": 0.749035281687975, "step": 750 }, { "epoch": 1.6786405084277425, "grad_norm": 1.6484375, "learning_rate": 4.7893569844789354e-05, "loss": 0.9312, "mean_token_accuracy": 0.748369749635458, "step": 760 }, { "epoch": 1.7007460624481903, "grad_norm": 1.6484375, "learning_rate": 4.456762749445676e-05, "loss": 0.9585, "mean_token_accuracy": 0.742963894456625, "step": 770 }, { "epoch": 1.7228516164686378, "grad_norm": 1.484375, "learning_rate": 4.124168514412417e-05, "loss": 1.0002, "mean_token_accuracy": 0.7539634991437196, "step": 780 }, { "epoch": 1.7449571704890854, "grad_norm": 1.6484375, "learning_rate": 3.791574279379157e-05, "loss": 0.93, "mean_token_accuracy": 0.7481286890804768, "step": 790 }, { "epoch": 1.767062724509533, "grad_norm": 1.7421875, "learning_rate": 3.4589800443458975e-05, "loss": 0.9119, "mean_token_accuracy": 0.7515073113143445, "step": 800 }, { "epoch": 1.767062724509533, "eval_loss": 1.4062376022338867, "eval_mean_token_accuracy": 0.6735276628022242, "eval_runtime": 21.4799, "eval_samples_per_second": 36.779, "eval_steps_per_second": 4.609, "step": 800 }, { "epoch": 1.7891682785299805, "grad_norm": 1.7734375, "learning_rate": 3.126385809312638e-05, "loss": 0.9018, "mean_token_accuracy": 0.7531212449073792, "step": 810 }, { "epoch": 1.8112738325504283, "grad_norm": 1.7421875, "learning_rate": 2.793791574279379e-05, "loss": 0.9048, "mean_token_accuracy": 0.7560021050274373, "step": 820 }, { "epoch": 1.833379386570876, "grad_norm": 1.6953125, "learning_rate": 2.4611973392461197e-05, "loss": 0.9028, "mean_token_accuracy": 0.7558302395045757, "step": 830 }, { "epoch": 1.8554849405913236, "grad_norm": 1.578125, "learning_rate": 2.12860310421286e-05, "loss": 0.9158, "mean_token_accuracy": 0.7508154392242432, "step": 840 }, { "epoch": 1.8775904946117712, "grad_norm": 1.734375, "learning_rate": 1.7960088691796008e-05, "loss": 0.8864, "mean_token_accuracy": 0.7576524101197719, "step": 850 }, { "epoch": 1.8996960486322187, "grad_norm": 1.8203125, "learning_rate": 1.4634146341463413e-05, "loss": 0.9228, "mean_token_accuracy": 0.7508851245045662, "step": 860 }, { "epoch": 1.9218016026526665, "grad_norm": 1.6015625, "learning_rate": 1.130820399113082e-05, "loss": 0.8975, "mean_token_accuracy": 0.7541297495365142, "step": 870 }, { "epoch": 1.943907156673114, "grad_norm": 1.75, "learning_rate": 7.982261640798226e-06, "loss": 0.8971, "mean_token_accuracy": 0.7586074694991112, "step": 880 }, { "epoch": 1.9660127106935619, "grad_norm": 1.5546875, "learning_rate": 4.656319290465632e-06, "loss": 0.9251, "mean_token_accuracy": 0.7490280956029892, "step": 890 }, { "epoch": 1.9881182647140094, "grad_norm": 1.671875, "learning_rate": 1.3303769401330375e-06, "loss": 0.9005, "mean_token_accuracy": 0.7555492661893368, "step": 900 }, { "epoch": 1.9881182647140094, "eval_loss": 1.4004004001617432, "eval_mean_token_accuracy": 0.6744482210188201, "eval_runtime": 21.4879, "eval_samples_per_second": 36.765, "eval_steps_per_second": 4.607, "step": 900 } ], "logging_steps": 10, "max_steps": 904, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7777625251882496e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }