{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7152436298614215, "eval_steps": 100, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001788109074653554, "eval_loss": 2.046928882598877, "eval_runtime": 19.3171, "eval_samples_per_second": 12.217, "eval_steps_per_second": 6.109, "step": 1 }, { "epoch": 0.008940545373267769, "grad_norm": 12.01391887664795, "learning_rate": 1.6666666666666667e-05, "loss": 7.1881, "step": 5 }, { "epoch": 0.017881090746535537, "grad_norm": 16.71553611755371, "learning_rate": 3.3333333333333335e-05, "loss": 6.7995, "step": 10 }, { "epoch": 0.02682163611980331, "grad_norm": 31.485000610351562, "learning_rate": 5e-05, "loss": 4.923, "step": 15 }, { "epoch": 0.035762181493071074, "grad_norm": 23.379528045654297, "learning_rate": 6.666666666666667e-05, "loss": 5.7787, "step": 20 }, { "epoch": 0.044702726866338846, "grad_norm": 18.306440353393555, "learning_rate": 8.333333333333334e-05, "loss": 4.6506, "step": 25 }, { "epoch": 0.05364327223960662, "grad_norm": 25.282123565673828, "learning_rate": 0.0001, "loss": 4.8731, "step": 30 }, { "epoch": 0.06258381761287439, "grad_norm": 22.050247192382812, "learning_rate": 9.995494831023409e-05, "loss": 4.7905, "step": 35 }, { "epoch": 0.07152436298614215, "grad_norm": 36.01411056518555, "learning_rate": 9.981987442712633e-05, "loss": 6.3107, "step": 40 }, { "epoch": 0.08046490835940992, "grad_norm": 41.16471481323242, "learning_rate": 9.959502176294383e-05, "loss": 5.1572, "step": 45 }, { "epoch": 0.08940545373267769, "grad_norm": 50.630348205566406, "learning_rate": 9.928079551738543e-05, "loss": 5.9515, "step": 50 }, { "epoch": 0.09834599910594546, "grad_norm": 10.866789817810059, "learning_rate": 9.887776194738432e-05, "loss": 6.7468, "step": 55 }, { "epoch": 0.10728654447921324, "grad_norm": 11.329707145690918, "learning_rate": 9.838664734667495e-05, "loss": 6.4092, "step": 60 }, { "epoch": 0.11622708985248101, "grad_norm": 12.150516510009766, "learning_rate": 9.780833673696254e-05, "loss": 4.6159, "step": 65 }, { "epoch": 0.12516763522574878, "grad_norm": 12.043376922607422, "learning_rate": 9.714387227305422e-05, "loss": 4.6607, "step": 70 }, { "epoch": 0.13410818059901655, "grad_norm": 13.809493064880371, "learning_rate": 9.639445136482548e-05, "loss": 4.6828, "step": 75 }, { "epoch": 0.1430487259722843, "grad_norm": 17.044633865356445, "learning_rate": 9.55614245194068e-05, "loss": 4.1347, "step": 80 }, { "epoch": 0.15198927134555207, "grad_norm": 17.997037887573242, "learning_rate": 9.464629290747842e-05, "loss": 4.9318, "step": 85 }, { "epoch": 0.16092981671881984, "grad_norm": 29.060178756713867, "learning_rate": 9.365070565805941e-05, "loss": 5.6566, "step": 90 }, { "epoch": 0.1698703620920876, "grad_norm": 37.676151275634766, "learning_rate": 9.257645688666556e-05, "loss": 4.9255, "step": 95 }, { "epoch": 0.17881090746535538, "grad_norm": 40.094573974609375, "learning_rate": 9.142548246219212e-05, "loss": 5.421, "step": 100 }, { "epoch": 0.17881090746535538, "eval_loss": 1.4111511707305908, "eval_runtime": 19.669, "eval_samples_per_second": 11.999, "eval_steps_per_second": 5.999, "step": 100 }, { "epoch": 0.18775145283862316, "grad_norm": 8.953372955322266, "learning_rate": 9.019985651834703e-05, "loss": 6.6067, "step": 105 }, { "epoch": 0.19669199821189093, "grad_norm": 9.89356803894043, "learning_rate": 8.890178771592199e-05, "loss": 5.5489, "step": 110 }, { "epoch": 0.2056325435851587, "grad_norm": 10.751019477844238, "learning_rate": 8.753361526263621e-05, "loss": 4.145, "step": 115 }, { "epoch": 0.21457308895842647, "grad_norm": 11.124737739562988, "learning_rate": 8.609780469772623e-05, "loss": 3.9116, "step": 120 }, { "epoch": 0.22351363433169424, "grad_norm": 13.146486282348633, "learning_rate": 8.459694344887732e-05, "loss": 4.5305, "step": 125 }, { "epoch": 0.23245417970496202, "grad_norm": 20.332517623901367, "learning_rate": 8.303373616950408e-05, "loss": 4.6298, "step": 130 }, { "epoch": 0.24139472507822976, "grad_norm": 15.850321769714355, "learning_rate": 8.141099986478212e-05, "loss": 5.1285, "step": 135 }, { "epoch": 0.25033527045149756, "grad_norm": 25.98038673400879, "learning_rate": 7.973165881521434e-05, "loss": 4.8565, "step": 140 }, { "epoch": 0.25927581582476533, "grad_norm": 41.79204559326172, "learning_rate": 7.799873930687978e-05, "loss": 5.2209, "step": 145 }, { "epoch": 0.2682163611980331, "grad_norm": 66.45893096923828, "learning_rate": 7.621536417786159e-05, "loss": 5.4458, "step": 150 }, { "epoch": 0.2771569065713009, "grad_norm": 7.945077896118164, "learning_rate": 7.438474719068173e-05, "loss": 6.6603, "step": 155 }, { "epoch": 0.2860974519445686, "grad_norm": 11.32933521270752, "learning_rate": 7.251018724088367e-05, "loss": 5.9139, "step": 160 }, { "epoch": 0.29503799731783636, "grad_norm": 9.906847953796387, "learning_rate": 7.059506241219965e-05, "loss": 5.6972, "step": 165 }, { "epoch": 0.30397854269110414, "grad_norm": 11.90020751953125, "learning_rate": 6.864282388901544e-05, "loss": 4.1571, "step": 170 }, { "epoch": 0.3129190880643719, "grad_norm": 11.035331726074219, "learning_rate": 6.665698973710288e-05, "loss": 4.3225, "step": 175 }, { "epoch": 0.3218596334376397, "grad_norm": 16.212894439697266, "learning_rate": 6.464113856382752e-05, "loss": 4.0366, "step": 180 }, { "epoch": 0.33080017881090745, "grad_norm": 14.783008575439453, "learning_rate": 6.259890306925627e-05, "loss": 3.9975, "step": 185 }, { "epoch": 0.3397407241841752, "grad_norm": 20.191490173339844, "learning_rate": 6.0533963499786314e-05, "loss": 4.8397, "step": 190 }, { "epoch": 0.348681269557443, "grad_norm": 19.51316261291504, "learning_rate": 5.8450041016092464e-05, "loss": 3.8329, "step": 195 }, { "epoch": 0.35762181493071077, "grad_norm": 97.10255432128906, "learning_rate": 5.6350890987343944e-05, "loss": 6.9008, "step": 200 }, { "epoch": 0.35762181493071077, "eval_loss": 1.2790147066116333, "eval_runtime": 19.6612, "eval_samples_per_second": 12.003, "eval_steps_per_second": 6.002, "step": 200 }, { "epoch": 0.36656236030397854, "grad_norm": 9.973484992980957, "learning_rate": 5.4240296223775465e-05, "loss": 6.283, "step": 205 }, { "epoch": 0.3755029056772463, "grad_norm": 9.57100772857666, "learning_rate": 5.212206015980742e-05, "loss": 5.2025, "step": 210 }, { "epoch": 0.3844434510505141, "grad_norm": 11.266950607299805, "learning_rate": 5e-05, "loss": 4.9933, "step": 215 }, { "epoch": 0.39338399642378186, "grad_norm": 15.667479515075684, "learning_rate": 4.78779398401926e-05, "loss": 4.0976, "step": 220 }, { "epoch": 0.40232454179704963, "grad_norm": 14.551920890808105, "learning_rate": 4.575970377622456e-05, "loss": 3.9731, "step": 225 }, { "epoch": 0.4112650871703174, "grad_norm": 9.969139099121094, "learning_rate": 4.364910901265606e-05, "loss": 4.0724, "step": 230 }, { "epoch": 0.4202056325435852, "grad_norm": 13.29488754272461, "learning_rate": 4.1549958983907555e-05, "loss": 4.0002, "step": 235 }, { "epoch": 0.42914617791685294, "grad_norm": 17.779531478881836, "learning_rate": 3.94660365002137e-05, "loss": 4.5389, "step": 240 }, { "epoch": 0.4380867232901207, "grad_norm": 39.08097839355469, "learning_rate": 3.740109693074375e-05, "loss": 4.8595, "step": 245 }, { "epoch": 0.4470272686633885, "grad_norm": 51.12818908691406, "learning_rate": 3.5358861436172485e-05, "loss": 5.6399, "step": 250 }, { "epoch": 0.45596781403665626, "grad_norm": 8.17031192779541, "learning_rate": 3.334301026289712e-05, "loss": 6.488, "step": 255 }, { "epoch": 0.46490835940992403, "grad_norm": 11.224337577819824, "learning_rate": 3.135717611098458e-05, "loss": 4.4233, "step": 260 }, { "epoch": 0.47384890478319175, "grad_norm": 11.139561653137207, "learning_rate": 2.9404937587800375e-05, "loss": 4.3712, "step": 265 }, { "epoch": 0.4827894501564595, "grad_norm": 9.91527271270752, "learning_rate": 2.748981275911633e-05, "loss": 4.235, "step": 270 }, { "epoch": 0.4917299955297273, "grad_norm": 11.947907447814941, "learning_rate": 2.5615252809318284e-05, "loss": 3.5619, "step": 275 }, { "epoch": 0.5006705409029951, "grad_norm": 14.57845687866211, "learning_rate": 2.3784635822138424e-05, "loss": 4.2477, "step": 280 }, { "epoch": 0.5096110862762628, "grad_norm": 16.487865447998047, "learning_rate": 2.2001260693120233e-05, "loss": 4.7886, "step": 285 }, { "epoch": 0.5185516316495307, "grad_norm": 25.24747657775879, "learning_rate": 2.026834118478567e-05, "loss": 3.643, "step": 290 }, { "epoch": 0.5274921770227984, "grad_norm": 29.675588607788086, "learning_rate": 1.858900013521788e-05, "loss": 4.3066, "step": 295 }, { "epoch": 0.5364327223960662, "grad_norm": 42.390953063964844, "learning_rate": 1.6966263830495936e-05, "loss": 4.9912, "step": 300 }, { "epoch": 0.5364327223960662, "eval_loss": 1.2248698472976685, "eval_runtime": 19.6654, "eval_samples_per_second": 12.001, "eval_steps_per_second": 6.0, "step": 300 }, { "epoch": 0.5453732677693339, "grad_norm": 6.869641304016113, "learning_rate": 1.5403056551122697e-05, "loss": 5.6695, "step": 305 }, { "epoch": 0.5543138131426018, "grad_norm": 12.490229606628418, "learning_rate": 1.3902195302273779e-05, "loss": 4.9411, "step": 310 }, { "epoch": 0.5632543585158695, "grad_norm": 12.478997230529785, "learning_rate": 1.246638473736378e-05, "loss": 4.2023, "step": 315 }, { "epoch": 0.5721949038891372, "grad_norm": 12.402631759643555, "learning_rate": 1.1098212284078036e-05, "loss": 4.0082, "step": 320 }, { "epoch": 0.581135449262405, "grad_norm": 12.362221717834473, "learning_rate": 9.800143481652979e-06, "loss": 4.2276, "step": 325 }, { "epoch": 0.5900759946356727, "grad_norm": 12.496500015258789, "learning_rate": 8.574517537807897e-06, "loss": 3.4944, "step": 330 }, { "epoch": 0.5990165400089406, "grad_norm": 25.300518035888672, "learning_rate": 7.423543113334436e-06, "loss": 4.4454, "step": 335 }, { "epoch": 0.6079570853822083, "grad_norm": 19.98192596435547, "learning_rate": 6.349294341940593e-06, "loss": 4.4782, "step": 340 }, { "epoch": 0.6168976307554761, "grad_norm": 34.964942932128906, "learning_rate": 5.353707092521582e-06, "loss": 3.9177, "step": 345 }, { "epoch": 0.6258381761287438, "grad_norm": 37.618648529052734, "learning_rate": 4.43857548059321e-06, "loss": 5.8998, "step": 350 }, { "epoch": 0.6347787215020116, "grad_norm": 12.334329605102539, "learning_rate": 3.605548635174533e-06, "loss": 5.8214, "step": 355 }, { "epoch": 0.6437192668752794, "grad_norm": 11.229202270507812, "learning_rate": 2.85612772694579e-06, "loss": 4.0393, "step": 360 }, { "epoch": 0.6526598122485472, "grad_norm": 10.08092212677002, "learning_rate": 2.191663263037458e-06, "loss": 4.3505, "step": 365 }, { "epoch": 0.6616003576218149, "grad_norm": 9.676584243774414, "learning_rate": 1.6133526533250565e-06, "loss": 3.7247, "step": 370 }, { "epoch": 0.6705409029950827, "grad_norm": 10.292547225952148, "learning_rate": 1.1222380526156928e-06, "loss": 3.5345, "step": 375 }, { "epoch": 0.6794814483683504, "grad_norm": 15.131571769714355, "learning_rate": 7.192044826145771e-07, "loss": 3.9341, "step": 380 }, { "epoch": 0.6884219937416183, "grad_norm": 19.556400299072266, "learning_rate": 4.049782370561583e-07, "loss": 4.0314, "step": 385 }, { "epoch": 0.697362539114886, "grad_norm": 22.79786491394043, "learning_rate": 1.8012557287367392e-07, "loss": 4.756, "step": 390 }, { "epoch": 0.7063030844881538, "grad_norm": 29.830158233642578, "learning_rate": 4.5051689765929214e-08, "loss": 4.7687, "step": 395 }, { "epoch": 0.7152436298614215, "grad_norm": 40.145057678222656, "learning_rate": 0.0, "loss": 4.1833, "step": 400 }, { "epoch": 0.7152436298614215, "eval_loss": 1.181886076927185, "eval_runtime": 19.6499, "eval_samples_per_second": 12.01, "eval_steps_per_second": 6.005, "step": 400 } ], "logging_steps": 5, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4091489359010202e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }