{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 817, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006119951040391677, "grad_norm": 15.50185489654541, "learning_rate": 5e-05, "loss": 5.9855, "num_input_tokens_seen": 92864, "step": 5 }, { "epoch": 0.012239902080783354, "grad_norm": 3.99252986907959, "learning_rate": 5e-05, "loss": 1.0463, "num_input_tokens_seen": 166552, "step": 10 }, { "epoch": 0.01835985312117503, "grad_norm": 3.7689640522003174, "learning_rate": 5e-05, "loss": 0.8016, "num_input_tokens_seen": 247024, "step": 15 }, { "epoch": 0.02447980416156671, "grad_norm": 2.2368924617767334, "learning_rate": 5e-05, "loss": 0.7061, "num_input_tokens_seen": 320328, "step": 20 }, { "epoch": 0.030599755201958383, "grad_norm": 2.7950894832611084, "learning_rate": 5e-05, "loss": 0.5543, "num_input_tokens_seen": 394664, "step": 25 }, { "epoch": 0.03671970624235006, "grad_norm": 2.344271659851074, "learning_rate": 5e-05, "loss": 0.4609, "num_input_tokens_seen": 481928, "step": 30 }, { "epoch": 0.042839657282741736, "grad_norm": 1.4321657419204712, "learning_rate": 5e-05, "loss": 0.4659, "num_input_tokens_seen": 560424, "step": 35 }, { "epoch": 0.04895960832313342, "grad_norm": 1.6118403673171997, "learning_rate": 5e-05, "loss": 0.5474, "num_input_tokens_seen": 643896, "step": 40 }, { "epoch": 0.05507955936352509, "grad_norm": 1.3398417234420776, "learning_rate": 5e-05, "loss": 0.369, "num_input_tokens_seen": 718456, "step": 45 }, { "epoch": 0.06119951040391677, "grad_norm": 2.494168281555176, "learning_rate": 5e-05, "loss": 0.3453, "num_input_tokens_seen": 798904, "step": 50 }, { "epoch": 0.06731946144430845, "grad_norm": 2.230762481689453, "learning_rate": 5e-05, "loss": 0.4311, "num_input_tokens_seen": 874288, "step": 55 }, { "epoch": 0.07343941248470012, "grad_norm": 1.0016298294067383, "learning_rate": 5e-05, "loss": 0.311, "num_input_tokens_seen": 957776, "step": 60 }, { "epoch": 0.0795593635250918, "grad_norm": 1.7588512897491455, "learning_rate": 5e-05, "loss": 0.3786, "num_input_tokens_seen": 1020136, "step": 65 }, { "epoch": 0.08567931456548347, "grad_norm": 1.8847310543060303, "learning_rate": 5e-05, "loss": 0.3514, "num_input_tokens_seen": 1098480, "step": 70 }, { "epoch": 0.09179926560587515, "grad_norm": 2.2706146240234375, "learning_rate": 5e-05, "loss": 0.3355, "num_input_tokens_seen": 1172240, "step": 75 }, { "epoch": 0.09791921664626684, "grad_norm": 2.4245944023132324, "learning_rate": 5e-05, "loss": 0.412, "num_input_tokens_seen": 1247888, "step": 80 }, { "epoch": 0.10403916768665851, "grad_norm": 2.491201639175415, "learning_rate": 5e-05, "loss": 0.4127, "num_input_tokens_seen": 1319664, "step": 85 }, { "epoch": 0.11015911872705018, "grad_norm": 1.311059832572937, "learning_rate": 5e-05, "loss": 0.302, "num_input_tokens_seen": 1398192, "step": 90 }, { "epoch": 0.11627906976744186, "grad_norm": 1.2040491104125977, "learning_rate": 5e-05, "loss": 0.3489, "num_input_tokens_seen": 1475704, "step": 95 }, { "epoch": 0.12239902080783353, "grad_norm": 3.255805253982544, "learning_rate": 5e-05, "loss": 0.4049, "num_input_tokens_seen": 1562344, "step": 100 }, { "epoch": 0.12851897184822522, "grad_norm": 2.2746737003326416, "learning_rate": 5e-05, "loss": 0.2916, "num_input_tokens_seen": 1638240, "step": 105 }, { "epoch": 0.1346389228886169, "grad_norm": 2.090926170349121, "learning_rate": 5e-05, "loss": 0.4657, "num_input_tokens_seen": 1711056, "step": 110 }, { "epoch": 0.14075887392900857, "grad_norm": 1.9717285633087158, "learning_rate": 5e-05, "loss": 0.326, "num_input_tokens_seen": 1791240, "step": 115 }, { "epoch": 0.14687882496940025, "grad_norm": 2.5442869663238525, "learning_rate": 5e-05, "loss": 0.4545, "num_input_tokens_seen": 1867944, "step": 120 }, { "epoch": 0.15299877600979192, "grad_norm": 1.0716873407363892, "learning_rate": 5e-05, "loss": 0.3623, "num_input_tokens_seen": 1947744, "step": 125 }, { "epoch": 0.1591187270501836, "grad_norm": 1.7015032768249512, "learning_rate": 5e-05, "loss": 0.3906, "num_input_tokens_seen": 2025720, "step": 130 }, { "epoch": 0.16523867809057527, "grad_norm": 0.9654507040977478, "learning_rate": 5e-05, "loss": 0.3797, "num_input_tokens_seen": 2104992, "step": 135 }, { "epoch": 0.17135862913096694, "grad_norm": 2.8033182621002197, "learning_rate": 5e-05, "loss": 0.3875, "num_input_tokens_seen": 2180648, "step": 140 }, { "epoch": 0.17747858017135862, "grad_norm": 1.4507533311843872, "learning_rate": 5e-05, "loss": 0.4188, "num_input_tokens_seen": 2246640, "step": 145 }, { "epoch": 0.1835985312117503, "grad_norm": 0.5426273941993713, "learning_rate": 5e-05, "loss": 0.4039, "num_input_tokens_seen": 2324624, "step": 150 }, { "epoch": 0.189718482252142, "grad_norm": 0.5102578401565552, "learning_rate": 5e-05, "loss": 0.3005, "num_input_tokens_seen": 2406352, "step": 155 }, { "epoch": 0.19583843329253367, "grad_norm": 1.6424686908721924, "learning_rate": 5e-05, "loss": 0.3075, "num_input_tokens_seen": 2478120, "step": 160 }, { "epoch": 0.20195838433292534, "grad_norm": 0.8794494271278381, "learning_rate": 5e-05, "loss": 0.317, "num_input_tokens_seen": 2562888, "step": 165 }, { "epoch": 0.20807833537331702, "grad_norm": 0.8234019875526428, "learning_rate": 5e-05, "loss": 0.3388, "num_input_tokens_seen": 2635304, "step": 170 }, { "epoch": 0.2141982864137087, "grad_norm": 0.4053242802619934, "learning_rate": 5e-05, "loss": 0.3495, "num_input_tokens_seen": 2707136, "step": 175 }, { "epoch": 0.22031823745410037, "grad_norm": 0.6473408937454224, "learning_rate": 5e-05, "loss": 0.3467, "num_input_tokens_seen": 2766208, "step": 180 }, { "epoch": 0.22643818849449204, "grad_norm": 0.8537346720695496, "learning_rate": 5e-05, "loss": 0.2882, "num_input_tokens_seen": 2840536, "step": 185 }, { "epoch": 0.23255813953488372, "grad_norm": 0.6824322938919067, "learning_rate": 5e-05, "loss": 0.2664, "num_input_tokens_seen": 2907464, "step": 190 }, { "epoch": 0.2386780905752754, "grad_norm": 0.6884396076202393, "learning_rate": 5e-05, "loss": 0.2957, "num_input_tokens_seen": 2976088, "step": 195 }, { "epoch": 0.24479804161566707, "grad_norm": 1.038341760635376, "learning_rate": 5e-05, "loss": 0.3627, "num_input_tokens_seen": 3049880, "step": 200 }, { "epoch": 0.25091799265605874, "grad_norm": 0.6371074914932251, "learning_rate": 5e-05, "loss": 0.4283, "num_input_tokens_seen": 3136440, "step": 205 }, { "epoch": 0.25703794369645044, "grad_norm": 1.1879721879959106, "learning_rate": 5e-05, "loss": 0.3298, "num_input_tokens_seen": 3209024, "step": 210 }, { "epoch": 0.2631578947368421, "grad_norm": 1.514265537261963, "learning_rate": 5e-05, "loss": 0.3975, "num_input_tokens_seen": 3290192, "step": 215 }, { "epoch": 0.2692778457772338, "grad_norm": 1.2970479726791382, "learning_rate": 5e-05, "loss": 0.2331, "num_input_tokens_seen": 3368808, "step": 220 }, { "epoch": 0.27539779681762544, "grad_norm": 5.48612117767334, "learning_rate": 5e-05, "loss": 0.3176, "num_input_tokens_seen": 3444128, "step": 225 }, { "epoch": 0.28151774785801714, "grad_norm": 1.9169118404388428, "learning_rate": 5e-05, "loss": 0.2638, "num_input_tokens_seen": 3522216, "step": 230 }, { "epoch": 0.2876376988984088, "grad_norm": 1.0575088262557983, "learning_rate": 5e-05, "loss": 0.3282, "num_input_tokens_seen": 3602256, "step": 235 }, { "epoch": 0.2937576499388005, "grad_norm": 1.955322265625, "learning_rate": 5e-05, "loss": 0.2492, "num_input_tokens_seen": 3674936, "step": 240 }, { "epoch": 0.2998776009791922, "grad_norm": 0.7224699854850769, "learning_rate": 5e-05, "loss": 0.3157, "num_input_tokens_seen": 3749888, "step": 245 }, { "epoch": 0.30599755201958384, "grad_norm": 1.2078381776809692, "learning_rate": 5e-05, "loss": 0.3591, "num_input_tokens_seen": 3821408, "step": 250 }, { "epoch": 0.31211750305997554, "grad_norm": 1.230710744857788, "learning_rate": 5e-05, "loss": 0.2546, "num_input_tokens_seen": 3900024, "step": 255 }, { "epoch": 0.3182374541003672, "grad_norm": 0.38521382212638855, "learning_rate": 5e-05, "loss": 0.2471, "num_input_tokens_seen": 3972872, "step": 260 }, { "epoch": 0.3243574051407589, "grad_norm": 1.9589159488677979, "learning_rate": 5e-05, "loss": 0.2279, "num_input_tokens_seen": 4047152, "step": 265 }, { "epoch": 0.33047735618115054, "grad_norm": 0.90382319688797, "learning_rate": 5e-05, "loss": 0.3831, "num_input_tokens_seen": 4139888, "step": 270 }, { "epoch": 0.33659730722154224, "grad_norm": 1.3298989534378052, "learning_rate": 5e-05, "loss": 0.4124, "num_input_tokens_seen": 4222680, "step": 275 }, { "epoch": 0.3427172582619339, "grad_norm": 1.2357605695724487, "learning_rate": 5e-05, "loss": 0.2841, "num_input_tokens_seen": 4298800, "step": 280 }, { "epoch": 0.3488372093023256, "grad_norm": 0.7269086241722107, "learning_rate": 5e-05, "loss": 0.2493, "num_input_tokens_seen": 4371032, "step": 285 }, { "epoch": 0.35495716034271724, "grad_norm": 0.6951619386672974, "learning_rate": 5e-05, "loss": 0.1735, "num_input_tokens_seen": 4447448, "step": 290 }, { "epoch": 0.36107711138310894, "grad_norm": 1.7031666040420532, "learning_rate": 5e-05, "loss": 0.3252, "num_input_tokens_seen": 4527088, "step": 295 }, { "epoch": 0.3671970624235006, "grad_norm": 1.7470440864562988, "learning_rate": 5e-05, "loss": 0.3145, "num_input_tokens_seen": 4606888, "step": 300 }, { "epoch": 0.3733170134638923, "grad_norm": 0.6903861165046692, "learning_rate": 5e-05, "loss": 0.2633, "num_input_tokens_seen": 4672608, "step": 305 }, { "epoch": 0.379436964504284, "grad_norm": 0.8705160021781921, "learning_rate": 5e-05, "loss": 0.3045, "num_input_tokens_seen": 4746504, "step": 310 }, { "epoch": 0.38555691554467564, "grad_norm": 0.4258286654949188, "learning_rate": 5e-05, "loss": 0.2162, "num_input_tokens_seen": 4820096, "step": 315 }, { "epoch": 0.39167686658506734, "grad_norm": 1.1658859252929688, "learning_rate": 5e-05, "loss": 0.3229, "num_input_tokens_seen": 4908704, "step": 320 }, { "epoch": 0.397796817625459, "grad_norm": 1.2161023616790771, "learning_rate": 5e-05, "loss": 0.4275, "num_input_tokens_seen": 4987840, "step": 325 }, { "epoch": 0.4039167686658507, "grad_norm": 1.4946047067642212, "learning_rate": 5e-05, "loss": 0.2701, "num_input_tokens_seen": 5060504, "step": 330 }, { "epoch": 0.41003671970624234, "grad_norm": 0.9652360677719116, "learning_rate": 5e-05, "loss": 0.3254, "num_input_tokens_seen": 5152448, "step": 335 }, { "epoch": 0.41615667074663404, "grad_norm": 0.7417214512825012, "learning_rate": 5e-05, "loss": 0.27, "num_input_tokens_seen": 5230600, "step": 340 }, { "epoch": 0.4222766217870257, "grad_norm": 0.37863102555274963, "learning_rate": 5e-05, "loss": 0.2715, "num_input_tokens_seen": 5308488, "step": 345 }, { "epoch": 0.4283965728274174, "grad_norm": 1.1890950202941895, "learning_rate": 5e-05, "loss": 0.2518, "num_input_tokens_seen": 5384648, "step": 350 }, { "epoch": 0.43451652386780903, "grad_norm": 0.6195424199104309, "learning_rate": 5e-05, "loss": 0.3159, "num_input_tokens_seen": 5459000, "step": 355 }, { "epoch": 0.44063647490820074, "grad_norm": 0.9810832142829895, "learning_rate": 5e-05, "loss": 0.383, "num_input_tokens_seen": 5540016, "step": 360 }, { "epoch": 0.4467564259485924, "grad_norm": 1.2738054990768433, "learning_rate": 5e-05, "loss": 0.259, "num_input_tokens_seen": 5613864, "step": 365 }, { "epoch": 0.4528763769889841, "grad_norm": 1.968427300453186, "learning_rate": 5e-05, "loss": 0.2683, "num_input_tokens_seen": 5691464, "step": 370 }, { "epoch": 0.4589963280293758, "grad_norm": 0.9270569086074829, "learning_rate": 5e-05, "loss": 0.3312, "num_input_tokens_seen": 5775440, "step": 375 }, { "epoch": 0.46511627906976744, "grad_norm": 1.2682478427886963, "learning_rate": 5e-05, "loss": 0.2864, "num_input_tokens_seen": 5860576, "step": 380 }, { "epoch": 0.47123623011015914, "grad_norm": 1.4553344249725342, "learning_rate": 5e-05, "loss": 0.2775, "num_input_tokens_seen": 5935600, "step": 385 }, { "epoch": 0.4773561811505508, "grad_norm": 0.8053863048553467, "learning_rate": 5e-05, "loss": 0.2987, "num_input_tokens_seen": 6012008, "step": 390 }, { "epoch": 0.4834761321909425, "grad_norm": 2.0067827701568604, "learning_rate": 5e-05, "loss": 0.2989, "num_input_tokens_seen": 6090440, "step": 395 }, { "epoch": 0.48959608323133413, "grad_norm": 1.08030366897583, "learning_rate": 5e-05, "loss": 0.3515, "num_input_tokens_seen": 6160544, "step": 400 }, { "epoch": 0.49571603427172584, "grad_norm": 2.7822110652923584, "learning_rate": 5e-05, "loss": 0.3286, "num_input_tokens_seen": 6234352, "step": 405 }, { "epoch": 0.5018359853121175, "grad_norm": 1.6549819707870483, "learning_rate": 5e-05, "loss": 0.2768, "num_input_tokens_seen": 6308968, "step": 410 }, { "epoch": 0.5079559363525091, "grad_norm": 1.4953241348266602, "learning_rate": 5e-05, "loss": 0.409, "num_input_tokens_seen": 6383784, "step": 415 }, { "epoch": 0.5140758873929009, "grad_norm": 0.37719306349754333, "learning_rate": 5e-05, "loss": 0.2401, "num_input_tokens_seen": 6460816, "step": 420 }, { "epoch": 0.5201958384332925, "grad_norm": 0.5301929712295532, "learning_rate": 5e-05, "loss": 0.3234, "num_input_tokens_seen": 6527408, "step": 425 }, { "epoch": 0.5263157894736842, "grad_norm": 1.024317979812622, "learning_rate": 5e-05, "loss": 0.2835, "num_input_tokens_seen": 6596496, "step": 430 }, { "epoch": 0.5324357405140759, "grad_norm": 1.5636080503463745, "learning_rate": 5e-05, "loss": 0.3964, "num_input_tokens_seen": 6668272, "step": 435 }, { "epoch": 0.5385556915544676, "grad_norm": 0.6953904628753662, "learning_rate": 5e-05, "loss": 0.3196, "num_input_tokens_seen": 6745928, "step": 440 }, { "epoch": 0.5446756425948592, "grad_norm": 1.1809886693954468, "learning_rate": 5e-05, "loss": 0.2127, "num_input_tokens_seen": 6818536, "step": 445 }, { "epoch": 0.5507955936352509, "grad_norm": 1.18154776096344, "learning_rate": 5e-05, "loss": 0.3542, "num_input_tokens_seen": 6887592, "step": 450 }, { "epoch": 0.5569155446756426, "grad_norm": 0.8108683824539185, "learning_rate": 5e-05, "loss": 0.3328, "num_input_tokens_seen": 6962408, "step": 455 }, { "epoch": 0.5630354957160343, "grad_norm": 1.0861562490463257, "learning_rate": 5e-05, "loss": 0.3721, "num_input_tokens_seen": 7044480, "step": 460 }, { "epoch": 0.5691554467564259, "grad_norm": 0.5014681816101074, "learning_rate": 5e-05, "loss": 0.2219, "num_input_tokens_seen": 7123000, "step": 465 }, { "epoch": 0.5752753977968176, "grad_norm": 1.4004812240600586, "learning_rate": 5e-05, "loss": 0.2463, "num_input_tokens_seen": 7214808, "step": 470 }, { "epoch": 0.5813953488372093, "grad_norm": 0.7245849967002869, "learning_rate": 5e-05, "loss": 0.2206, "num_input_tokens_seen": 7297144, "step": 475 }, { "epoch": 0.587515299877601, "grad_norm": 2.269347667694092, "learning_rate": 5e-05, "loss": 0.3, "num_input_tokens_seen": 7365368, "step": 480 }, { "epoch": 0.5936352509179926, "grad_norm": 1.4849718809127808, "learning_rate": 5e-05, "loss": 0.4515, "num_input_tokens_seen": 7442408, "step": 485 }, { "epoch": 0.5997552019583844, "grad_norm": 0.6952480673789978, "learning_rate": 5e-05, "loss": 0.1743, "num_input_tokens_seen": 7515008, "step": 490 }, { "epoch": 0.605875152998776, "grad_norm": 0.6444818377494812, "learning_rate": 5e-05, "loss": 0.2391, "num_input_tokens_seen": 7590720, "step": 495 }, { "epoch": 0.6119951040391677, "grad_norm": 1.1886272430419922, "learning_rate": 5e-05, "loss": 0.3442, "num_input_tokens_seen": 7674104, "step": 500 }, { "epoch": 0.6181150550795593, "grad_norm": 0.6505646109580994, "learning_rate": 5e-05, "loss": 0.3187, "num_input_tokens_seen": 7757176, "step": 505 }, { "epoch": 0.6242350061199511, "grad_norm": 2.536336898803711, "learning_rate": 5e-05, "loss": 0.2686, "num_input_tokens_seen": 7830024, "step": 510 }, { "epoch": 0.6303549571603427, "grad_norm": 2.330808639526367, "learning_rate": 5e-05, "loss": 0.2953, "num_input_tokens_seen": 7902824, "step": 515 }, { "epoch": 0.6364749082007344, "grad_norm": 1.786583662033081, "learning_rate": 5e-05, "loss": 0.2318, "num_input_tokens_seen": 7975880, "step": 520 }, { "epoch": 0.642594859241126, "grad_norm": 1.5765174627304077, "learning_rate": 5e-05, "loss": 0.1621, "num_input_tokens_seen": 8049216, "step": 525 }, { "epoch": 0.6487148102815178, "grad_norm": 1.1585646867752075, "learning_rate": 5e-05, "loss": 0.2007, "num_input_tokens_seen": 8115720, "step": 530 }, { "epoch": 0.6548347613219094, "grad_norm": 0.8964922428131104, "learning_rate": 5e-05, "loss": 0.1959, "num_input_tokens_seen": 8182608, "step": 535 }, { "epoch": 0.6609547123623011, "grad_norm": 1.195090889930725, "learning_rate": 5e-05, "loss": 0.2871, "num_input_tokens_seen": 8275536, "step": 540 }, { "epoch": 0.6670746634026927, "grad_norm": 1.71815824508667, "learning_rate": 5e-05, "loss": 0.1991, "num_input_tokens_seen": 8345352, "step": 545 }, { "epoch": 0.6731946144430845, "grad_norm": 1.8731549978256226, "learning_rate": 5e-05, "loss": 0.327, "num_input_tokens_seen": 8421296, "step": 550 }, { "epoch": 0.6793145654834761, "grad_norm": 0.8870775103569031, "learning_rate": 5e-05, "loss": 0.2886, "num_input_tokens_seen": 8503432, "step": 555 }, { "epoch": 0.6854345165238678, "grad_norm": 1.113157868385315, "learning_rate": 5e-05, "loss": 0.2306, "num_input_tokens_seen": 8584616, "step": 560 }, { "epoch": 0.6915544675642595, "grad_norm": 1.1967592239379883, "learning_rate": 5e-05, "loss": 0.2687, "num_input_tokens_seen": 8662720, "step": 565 }, { "epoch": 0.6976744186046512, "grad_norm": 1.5223525762557983, "learning_rate": 5e-05, "loss": 0.2938, "num_input_tokens_seen": 8733952, "step": 570 }, { "epoch": 0.7037943696450428, "grad_norm": 2.2871458530426025, "learning_rate": 5e-05, "loss": 0.2867, "num_input_tokens_seen": 8813480, "step": 575 }, { "epoch": 0.7099143206854345, "grad_norm": 1.0057543516159058, "learning_rate": 5e-05, "loss": 0.3609, "num_input_tokens_seen": 8890096, "step": 580 }, { "epoch": 0.7160342717258262, "grad_norm": 0.7304249405860901, "learning_rate": 5e-05, "loss": 0.3394, "num_input_tokens_seen": 8962264, "step": 585 }, { "epoch": 0.7221542227662179, "grad_norm": 0.7682881355285645, "learning_rate": 5e-05, "loss": 0.2729, "num_input_tokens_seen": 9046040, "step": 590 }, { "epoch": 0.7282741738066095, "grad_norm": 1.031092643737793, "learning_rate": 5e-05, "loss": 0.3099, "num_input_tokens_seen": 9108680, "step": 595 }, { "epoch": 0.7343941248470012, "grad_norm": 1.1240421533584595, "learning_rate": 5e-05, "loss": 0.2264, "num_input_tokens_seen": 9179368, "step": 600 }, { "epoch": 0.7405140758873929, "grad_norm": 1.188981533050537, "learning_rate": 5e-05, "loss": 0.3204, "num_input_tokens_seen": 9252496, "step": 605 }, { "epoch": 0.7466340269277846, "grad_norm": 0.8609564304351807, "learning_rate": 5e-05, "loss": 0.3297, "num_input_tokens_seen": 9332992, "step": 610 }, { "epoch": 0.7527539779681762, "grad_norm": 0.5670002102851868, "learning_rate": 5e-05, "loss": 0.3259, "num_input_tokens_seen": 9401552, "step": 615 }, { "epoch": 0.758873929008568, "grad_norm": 1.380366563796997, "learning_rate": 5e-05, "loss": 0.2458, "num_input_tokens_seen": 9486880, "step": 620 }, { "epoch": 0.7649938800489596, "grad_norm": 1.0020085573196411, "learning_rate": 5e-05, "loss": 0.2189, "num_input_tokens_seen": 9563392, "step": 625 }, { "epoch": 0.7711138310893513, "grad_norm": 0.8492164015769958, "learning_rate": 5e-05, "loss": 0.315, "num_input_tokens_seen": 9633968, "step": 630 }, { "epoch": 0.7772337821297429, "grad_norm": 1.0888570547103882, "learning_rate": 5e-05, "loss": 0.3934, "num_input_tokens_seen": 9715240, "step": 635 }, { "epoch": 0.7833537331701347, "grad_norm": 1.028193712234497, "learning_rate": 5e-05, "loss": 0.2971, "num_input_tokens_seen": 9788488, "step": 640 }, { "epoch": 0.7894736842105263, "grad_norm": 1.152001976966858, "learning_rate": 5e-05, "loss": 0.2677, "num_input_tokens_seen": 9871456, "step": 645 }, { "epoch": 0.795593635250918, "grad_norm": 0.9151332378387451, "learning_rate": 5e-05, "loss": 0.226, "num_input_tokens_seen": 9954072, "step": 650 }, { "epoch": 0.8017135862913096, "grad_norm": 0.9921525120735168, "learning_rate": 5e-05, "loss": 0.231, "num_input_tokens_seen": 10026008, "step": 655 }, { "epoch": 0.8078335373317014, "grad_norm": 1.1160526275634766, "learning_rate": 5e-05, "loss": 0.2447, "num_input_tokens_seen": 10110664, "step": 660 }, { "epoch": 0.813953488372093, "grad_norm": 1.1362825632095337, "learning_rate": 5e-05, "loss": 0.3693, "num_input_tokens_seen": 10176016, "step": 665 }, { "epoch": 0.8200734394124847, "grad_norm": 1.020122766494751, "learning_rate": 5e-05, "loss": 0.3211, "num_input_tokens_seen": 10249848, "step": 670 }, { "epoch": 0.8261933904528764, "grad_norm": 1.5490026473999023, "learning_rate": 5e-05, "loss": 0.2148, "num_input_tokens_seen": 10337456, "step": 675 }, { "epoch": 0.8323133414932681, "grad_norm": 1.5763447284698486, "learning_rate": 5e-05, "loss": 0.2896, "num_input_tokens_seen": 10415576, "step": 680 }, { "epoch": 0.8384332925336597, "grad_norm": 0.7152131199836731, "learning_rate": 5e-05, "loss": 0.2144, "num_input_tokens_seen": 10493344, "step": 685 }, { "epoch": 0.8445532435740514, "grad_norm": 0.7920908331871033, "learning_rate": 5e-05, "loss": 0.1782, "num_input_tokens_seen": 10564376, "step": 690 }, { "epoch": 0.8506731946144431, "grad_norm": 0.612686812877655, "learning_rate": 5e-05, "loss": 0.1408, "num_input_tokens_seen": 10634712, "step": 695 }, { "epoch": 0.8567931456548348, "grad_norm": 2.0629584789276123, "learning_rate": 5e-05, "loss": 0.2171, "num_input_tokens_seen": 10711520, "step": 700 }, { "epoch": 0.8629130966952264, "grad_norm": 0.23964820802211761, "learning_rate": 5e-05, "loss": 0.1531, "num_input_tokens_seen": 10796816, "step": 705 }, { "epoch": 0.8690330477356181, "grad_norm": 1.496092677116394, "learning_rate": 5e-05, "loss": 0.2001, "num_input_tokens_seen": 10880712, "step": 710 }, { "epoch": 0.8751529987760098, "grad_norm": 1.120675802230835, "learning_rate": 5e-05, "loss": 0.2839, "num_input_tokens_seen": 10958952, "step": 715 }, { "epoch": 0.8812729498164015, "grad_norm": 0.5255671739578247, "learning_rate": 5e-05, "loss": 0.2218, "num_input_tokens_seen": 11039976, "step": 720 }, { "epoch": 0.8873929008567931, "grad_norm": 0.9678085446357727, "learning_rate": 5e-05, "loss": 0.1788, "num_input_tokens_seen": 11111552, "step": 725 }, { "epoch": 0.8935128518971848, "grad_norm": 0.5875991582870483, "learning_rate": 5e-05, "loss": 0.1406, "num_input_tokens_seen": 11181248, "step": 730 }, { "epoch": 0.8996328029375765, "grad_norm": 0.34448331594467163, "learning_rate": 5e-05, "loss": 0.2181, "num_input_tokens_seen": 11258064, "step": 735 }, { "epoch": 0.9057527539779682, "grad_norm": 0.8798747658729553, "learning_rate": 5e-05, "loss": 0.3332, "num_input_tokens_seen": 11340760, "step": 740 }, { "epoch": 0.9118727050183598, "grad_norm": 0.8203728795051575, "learning_rate": 5e-05, "loss": 0.2806, "num_input_tokens_seen": 11412328, "step": 745 }, { "epoch": 0.9179926560587516, "grad_norm": 0.4957216680049896, "learning_rate": 5e-05, "loss": 0.3153, "num_input_tokens_seen": 11495808, "step": 750 }, { "epoch": 0.9241126070991432, "grad_norm": 0.20371459424495697, "learning_rate": 5e-05, "loss": 0.3194, "num_input_tokens_seen": 11571296, "step": 755 }, { "epoch": 0.9302325581395349, "grad_norm": 0.5156471729278564, "learning_rate": 5e-05, "loss": 0.2576, "num_input_tokens_seen": 11642864, "step": 760 }, { "epoch": 0.9363525091799265, "grad_norm": 2.4537665843963623, "learning_rate": 5e-05, "loss": 0.3595, "num_input_tokens_seen": 11712568, "step": 765 }, { "epoch": 0.9424724602203183, "grad_norm": 1.3034874200820923, "learning_rate": 5e-05, "loss": 0.3284, "num_input_tokens_seen": 11781728, "step": 770 }, { "epoch": 0.9485924112607099, "grad_norm": 1.508277177810669, "learning_rate": 5e-05, "loss": 0.2667, "num_input_tokens_seen": 11855544, "step": 775 }, { "epoch": 0.9547123623011016, "grad_norm": 0.7294648885726929, "learning_rate": 5e-05, "loss": 0.2876, "num_input_tokens_seen": 11924032, "step": 780 }, { "epoch": 0.9608323133414932, "grad_norm": 1.7540596723556519, "learning_rate": 5e-05, "loss": 0.4058, "num_input_tokens_seen": 12004648, "step": 785 }, { "epoch": 0.966952264381885, "grad_norm": 1.4596582651138306, "learning_rate": 5e-05, "loss": 0.2826, "num_input_tokens_seen": 12084400, "step": 790 }, { "epoch": 0.9730722154222766, "grad_norm": 0.36397644877433777, "learning_rate": 5e-05, "loss": 0.3458, "num_input_tokens_seen": 12166608, "step": 795 }, { "epoch": 0.9791921664626683, "grad_norm": 0.8607336282730103, "learning_rate": 5e-05, "loss": 0.2151, "num_input_tokens_seen": 12243688, "step": 800 }, { "epoch": 0.98531211750306, "grad_norm": 1.2912824153900146, "learning_rate": 5e-05, "loss": 0.2501, "num_input_tokens_seen": 12329600, "step": 805 }, { "epoch": 0.9914320685434517, "grad_norm": 0.7768998146057129, "learning_rate": 5e-05, "loss": 0.236, "num_input_tokens_seen": 12401032, "step": 810 }, { "epoch": 0.9975520195838433, "grad_norm": 0.43636301159858704, "learning_rate": 5e-05, "loss": 0.3012, "num_input_tokens_seen": 12474488, "step": 815 } ], "logging_steps": 5, "max_steps": 817, "num_input_tokens_seen": 12505688, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2137320769650688e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }