{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7316017316017316, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01443001443001443, "grad_norm": 56.83119360154837, "learning_rate": 4.9997137491585e-05, "loss": 1.3624, "num_input_tokens_seen": 359024, "step": 5 }, { "epoch": 0.02886002886002886, "grad_norm": 3.369976030864084, "learning_rate": 4.9988550621856334e-05, "loss": 0.4676, "num_input_tokens_seen": 704936, "step": 10 }, { "epoch": 0.04329004329004329, "grad_norm": 4.096562689130303, "learning_rate": 4.997424135721297e-05, "loss": 0.2693, "num_input_tokens_seen": 1054072, "step": 15 }, { "epoch": 0.05772005772005772, "grad_norm": 3.267048245468216, "learning_rate": 4.9954212974486133e-05, "loss": 0.1972, "num_input_tokens_seen": 1407008, "step": 20 }, { "epoch": 0.07215007215007214, "grad_norm": 1.4062210868604832, "learning_rate": 4.9928470060188954e-05, "loss": 0.1583, "num_input_tokens_seen": 1758688, "step": 25 }, { "epoch": 0.08658008658008658, "grad_norm": 1.435503762940731, "learning_rate": 4.989701850946613e-05, "loss": 0.3325, "num_input_tokens_seen": 2115360, "step": 30 }, { "epoch": 0.10101010101010101, "grad_norm": 1.4056756129974017, "learning_rate": 4.985986552474396e-05, "loss": 0.2568, "num_input_tokens_seen": 2465168, "step": 35 }, { "epoch": 0.11544011544011544, "grad_norm": 2.4297584032149038, "learning_rate": 4.9817019614080956e-05, "loss": 0.2166, "num_input_tokens_seen": 2824680, "step": 40 }, { "epoch": 0.12987012987012986, "grad_norm": 2.079558907831912, "learning_rate": 4.97684905892195e-05, "loss": 0.1564, "num_input_tokens_seen": 3186688, "step": 45 }, { "epoch": 0.1443001443001443, "grad_norm": 4.289559037743566, "learning_rate": 4.9714289563338956e-05, "loss": 0.2399, "num_input_tokens_seen": 3539368, "step": 50 }, { "epoch": 0.15873015873015872, "grad_norm": 1.709391455489484, "learning_rate": 4.9654428948510733e-05, "loss": 0.1786, "num_input_tokens_seen": 3892272, "step": 55 }, { "epoch": 0.17316017316017315, "grad_norm": 4.105516922389285, "learning_rate": 4.9588922452855935e-05, "loss": 0.1634, "num_input_tokens_seen": 4247888, "step": 60 }, { "epoch": 0.18759018759018758, "grad_norm": 14.337965174289707, "learning_rate": 4.9517785077406154e-05, "loss": 0.2301, "num_input_tokens_seen": 4600504, "step": 65 }, { "epoch": 0.20202020202020202, "grad_norm": 3.4495197481454194, "learning_rate": 4.9441033112668264e-05, "loss": 0.1836, "num_input_tokens_seen": 4954360, "step": 70 }, { "epoch": 0.21645021645021645, "grad_norm": 2.9433687548388106, "learning_rate": 4.9358684134893875e-05, "loss": 0.2348, "num_input_tokens_seen": 5307224, "step": 75 }, { "epoch": 0.23088023088023088, "grad_norm": 1.9991837308587015, "learning_rate": 4.927075700205431e-05, "loss": 0.1776, "num_input_tokens_seen": 5665880, "step": 80 }, { "epoch": 0.2453102453102453, "grad_norm": 1.0758975822927606, "learning_rate": 4.917727184952219e-05, "loss": 0.153, "num_input_tokens_seen": 6013968, "step": 85 }, { "epoch": 0.2597402597402597, "grad_norm": 0.9976424589406766, "learning_rate": 4.9078250085460384e-05, "loss": 0.1538, "num_input_tokens_seen": 6362696, "step": 90 }, { "epoch": 0.2741702741702742, "grad_norm": 1.554715189619398, "learning_rate": 4.897371438591952e-05, "loss": 0.1166, "num_input_tokens_seen": 6707576, "step": 95 }, { "epoch": 0.2886002886002886, "grad_norm": 2.122029208713052, "learning_rate": 4.8863688689645164e-05, "loss": 0.1719, "num_input_tokens_seen": 7056720, "step": 100 }, { "epoch": 0.30303030303030304, "grad_norm": 1.214256916538219, "learning_rate": 4.874819819259584e-05, "loss": 0.1858, "num_input_tokens_seen": 7412576, "step": 105 }, { "epoch": 0.31746031746031744, "grad_norm": 1.3100116612480939, "learning_rate": 4.862726934217311e-05, "loss": 0.1949, "num_input_tokens_seen": 7772560, "step": 110 }, { "epoch": 0.3318903318903319, "grad_norm": 1.1161659945835543, "learning_rate": 4.850092983116514e-05, "loss": 0.1788, "num_input_tokens_seen": 8131176, "step": 115 }, { "epoch": 0.3463203463203463, "grad_norm": 1.0722413071734969, "learning_rate": 4.8369208591404997e-05, "loss": 0.1625, "num_input_tokens_seen": 8485328, "step": 120 }, { "epoch": 0.36075036075036077, "grad_norm": 1.2139674135231018, "learning_rate": 4.823213578714526e-05, "loss": 0.1156, "num_input_tokens_seen": 8833696, "step": 125 }, { "epoch": 0.37518037518037517, "grad_norm": 1.6739499644681717, "learning_rate": 4.8089742808150384e-05, "loss": 0.172, "num_input_tokens_seen": 9184616, "step": 130 }, { "epoch": 0.38961038961038963, "grad_norm": 1.483447317449199, "learning_rate": 4.7942062262508425e-05, "loss": 0.1966, "num_input_tokens_seen": 9539992, "step": 135 }, { "epoch": 0.40404040404040403, "grad_norm": 1.1509455037627738, "learning_rate": 4.778912796916374e-05, "loss": 0.1628, "num_input_tokens_seen": 9887200, "step": 140 }, { "epoch": 0.4184704184704185, "grad_norm": 1.420110660393153, "learning_rate": 4.763097495017247e-05, "loss": 0.1336, "num_input_tokens_seen": 10242808, "step": 145 }, { "epoch": 0.4329004329004329, "grad_norm": 1.4519100138720278, "learning_rate": 4.746763942268243e-05, "loss": 0.1703, "num_input_tokens_seen": 10594344, "step": 150 }, { "epoch": 0.44733044733044736, "grad_norm": 1.303306860048612, "learning_rate": 4.7299158790639365e-05, "loss": 0.1553, "num_input_tokens_seen": 10948808, "step": 155 }, { "epoch": 0.46176046176046176, "grad_norm": 0.834125896322133, "learning_rate": 4.712557163622145e-05, "loss": 0.1514, "num_input_tokens_seen": 11307176, "step": 160 }, { "epoch": 0.47619047619047616, "grad_norm": 1.090377119591504, "learning_rate": 4.694691771100389e-05, "loss": 0.1689, "num_input_tokens_seen": 11664048, "step": 165 }, { "epoch": 0.4906204906204906, "grad_norm": 1.1504944334378613, "learning_rate": 4.676323792685584e-05, "loss": 0.1943, "num_input_tokens_seen": 12024008, "step": 170 }, { "epoch": 0.5050505050505051, "grad_norm": 1.5052046184655268, "learning_rate": 4.657457434657152e-05, "loss": 0.1416, "num_input_tokens_seen": 12374176, "step": 175 }, { "epoch": 0.5194805194805194, "grad_norm": 1.250782472648046, "learning_rate": 4.638097017423783e-05, "loss": 0.1572, "num_input_tokens_seen": 12726528, "step": 180 }, { "epoch": 0.5339105339105339, "grad_norm": 1.4846786443672924, "learning_rate": 4.618246974534055e-05, "loss": 0.1752, "num_input_tokens_seen": 13092552, "step": 185 }, { "epoch": 0.5483405483405484, "grad_norm": 1.209336870267204, "learning_rate": 4.597911851661155e-05, "loss": 0.2137, "num_input_tokens_seen": 13450656, "step": 190 }, { "epoch": 0.5627705627705628, "grad_norm": 0.900006892425402, "learning_rate": 4.5770963055619095e-05, "loss": 0.1534, "num_input_tokens_seen": 13801680, "step": 195 }, { "epoch": 0.5772005772005772, "grad_norm": 1.7634935350790797, "learning_rate": 4.5558051030103876e-05, "loss": 0.1604, "num_input_tokens_seen": 14153496, "step": 200 }, { "epoch": 0.5916305916305916, "grad_norm": 1.3464012143723911, "learning_rate": 4.5340431197063084e-05, "loss": 0.1793, "num_input_tokens_seen": 14510352, "step": 205 }, { "epoch": 0.6060606060606061, "grad_norm": 0.8869022258852858, "learning_rate": 4.5118153391584974e-05, "loss": 0.1541, "num_input_tokens_seen": 14859280, "step": 210 }, { "epoch": 0.6204906204906205, "grad_norm": 1.0128792509826028, "learning_rate": 4.489126851543664e-05, "loss": 0.1612, "num_input_tokens_seen": 15220952, "step": 215 }, { "epoch": 0.6349206349206349, "grad_norm": 1.7855902267859547, "learning_rate": 4.465982852540747e-05, "loss": 0.2029, "num_input_tokens_seen": 15585584, "step": 220 }, { "epoch": 0.6493506493506493, "grad_norm": 1.1762565216888077, "learning_rate": 4.442388642141097e-05, "loss": 0.1213, "num_input_tokens_seen": 15932344, "step": 225 }, { "epoch": 0.6637806637806638, "grad_norm": 1.5774565711682704, "learning_rate": 4.4183496234347796e-05, "loss": 0.1808, "num_input_tokens_seen": 16288200, "step": 230 }, { "epoch": 0.6782106782106783, "grad_norm": 1.4243380964648475, "learning_rate": 4.393871301373262e-05, "loss": 0.1502, "num_input_tokens_seen": 16637448, "step": 235 }, { "epoch": 0.6926406926406926, "grad_norm": 0.9512374605634504, "learning_rate": 4.3689592815087764e-05, "loss": 0.1557, "num_input_tokens_seen": 16992200, "step": 240 }, { "epoch": 0.7070707070707071, "grad_norm": 1.3279436403523264, "learning_rate": 4.3436192687106406e-05, "loss": 0.1607, "num_input_tokens_seen": 17347112, "step": 245 }, { "epoch": 0.7215007215007215, "grad_norm": 1.750549734106104, "learning_rate": 4.317857065858844e-05, "loss": 0.2099, "num_input_tokens_seen": 17699392, "step": 250 }, { "epoch": 0.7359307359307359, "grad_norm": 1.1251441881988402, "learning_rate": 4.291678572515184e-05, "loss": 0.1543, "num_input_tokens_seen": 18056608, "step": 255 }, { "epoch": 0.7503607503607503, "grad_norm": 1.0416765811260265, "learning_rate": 4.26508978357226e-05, "loss": 0.1784, "num_input_tokens_seen": 18411256, "step": 260 }, { "epoch": 0.7647907647907648, "grad_norm": 1.201198812934987, "learning_rate": 4.238096787880638e-05, "loss": 0.1857, "num_input_tokens_seen": 18767664, "step": 265 }, { "epoch": 0.7792207792207793, "grad_norm": 1.4819563873601835, "learning_rate": 4.2107057668545044e-05, "loss": 0.136, "num_input_tokens_seen": 19132320, "step": 270 }, { "epoch": 0.7936507936507936, "grad_norm": 1.2547051865192014, "learning_rate": 4.182922993056113e-05, "loss": 0.1058, "num_input_tokens_seen": 19488160, "step": 275 }, { "epoch": 0.8080808080808081, "grad_norm": 1.5166739134010474, "learning_rate": 4.154754828759368e-05, "loss": 0.1823, "num_input_tokens_seen": 19844064, "step": 280 }, { "epoch": 0.8225108225108225, "grad_norm": 1.1491639114248267, "learning_rate": 4.126207724492855e-05, "loss": 0.1587, "num_input_tokens_seen": 20200488, "step": 285 }, { "epoch": 0.836940836940837, "grad_norm": 1.797485180499581, "learning_rate": 4.097288217562669e-05, "loss": 0.203, "num_input_tokens_seen": 20557248, "step": 290 }, { "epoch": 0.8513708513708513, "grad_norm": 1.929792036515502, "learning_rate": 4.0680029305553674e-05, "loss": 0.2322, "num_input_tokens_seen": 20921800, "step": 295 }, { "epoch": 0.8658008658008658, "grad_norm": 0.7667283264695735, "learning_rate": 4.0383585698213876e-05, "loss": 0.1355, "num_input_tokens_seen": 21269448, "step": 300 }, { "epoch": 0.8802308802308803, "grad_norm": 0.729775915381155, "learning_rate": 4.008361923939295e-05, "loss": 0.1873, "num_input_tokens_seen": 21625040, "step": 305 }, { "epoch": 0.8946608946608947, "grad_norm": 1.2721263119411592, "learning_rate": 3.978019862161191e-05, "loss": 0.2325, "num_input_tokens_seen": 21973600, "step": 310 }, { "epoch": 0.9090909090909091, "grad_norm": 1.40284206796357, "learning_rate": 3.9473393328396484e-05, "loss": 0.1754, "num_input_tokens_seen": 22327832, "step": 315 }, { "epoch": 0.9235209235209235, "grad_norm": 1.4456006541134594, "learning_rate": 3.916327361836536e-05, "loss": 0.1967, "num_input_tokens_seen": 22686432, "step": 320 }, { "epoch": 0.937950937950938, "grad_norm": 0.5527227312593487, "learning_rate": 3.884991050914091e-05, "loss": 0.1457, "num_input_tokens_seen": 23043784, "step": 325 }, { "epoch": 0.9523809523809523, "grad_norm": 1.3930212264797546, "learning_rate": 3.85333757610861e-05, "loss": 0.2194, "num_input_tokens_seen": 23411560, "step": 330 }, { "epoch": 0.9668109668109668, "grad_norm": 1.4476303074289294, "learning_rate": 3.821374186087133e-05, "loss": 0.1148, "num_input_tokens_seen": 23765000, "step": 335 }, { "epoch": 0.9812409812409812, "grad_norm": 3.292955863226407, "learning_rate": 3.789108200487493e-05, "loss": 0.1348, "num_input_tokens_seen": 24119024, "step": 340 }, { "epoch": 0.9956709956709957, "grad_norm": 1.1327523117828926, "learning_rate": 3.756547008242112e-05, "loss": 0.1762, "num_input_tokens_seen": 24475120, "step": 345 }, { "epoch": 1.0101010101010102, "grad_norm": 0.6731553914954855, "learning_rate": 3.723698065885936e-05, "loss": 0.0941, "num_input_tokens_seen": 24834408, "step": 350 }, { "epoch": 1.0245310245310246, "grad_norm": 0.9750510929970303, "learning_rate": 3.690568895848879e-05, "loss": 0.0694, "num_input_tokens_seen": 25195312, "step": 355 }, { "epoch": 1.0389610389610389, "grad_norm": 0.6125336557821428, "learning_rate": 3.65716708473318e-05, "loss": 0.0736, "num_input_tokens_seen": 25555472, "step": 360 }, { "epoch": 1.0533910533910533, "grad_norm": 1.1303634424790558, "learning_rate": 3.623500281576073e-05, "loss": 0.054, "num_input_tokens_seen": 25907632, "step": 365 }, { "epoch": 1.0678210678210678, "grad_norm": 0.8264622226623303, "learning_rate": 3.589576196098142e-05, "loss": 0.0555, "num_input_tokens_seen": 26255856, "step": 370 }, { "epoch": 1.0822510822510822, "grad_norm": 0.7804657972204446, "learning_rate": 3.5554025969378034e-05, "loss": 0.0781, "num_input_tokens_seen": 26614912, "step": 375 }, { "epoch": 1.0966810966810967, "grad_norm": 0.6498854003200126, "learning_rate": 3.520987309872269e-05, "loss": 0.0633, "num_input_tokens_seen": 26973272, "step": 380 }, { "epoch": 1.1111111111111112, "grad_norm": 1.3530620649043212, "learning_rate": 3.486338216025444e-05, "loss": 0.0626, "num_input_tokens_seen": 27333584, "step": 385 }, { "epoch": 1.1255411255411256, "grad_norm": 0.8465897427898971, "learning_rate": 3.451463250063146e-05, "loss": 0.0583, "num_input_tokens_seen": 27686384, "step": 390 }, { "epoch": 1.13997113997114, "grad_norm": 0.9339277337141088, "learning_rate": 3.416370398376057e-05, "loss": 0.0902, "num_input_tokens_seen": 28042656, "step": 395 }, { "epoch": 1.1544011544011543, "grad_norm": 0.6813215436255746, "learning_rate": 3.38106769725084e-05, "loss": 0.0629, "num_input_tokens_seen": 28395936, "step": 400 }, { "epoch": 1.1688311688311688, "grad_norm": 0.6152635426287013, "learning_rate": 3.345563231029818e-05, "loss": 0.0792, "num_input_tokens_seen": 28752264, "step": 405 }, { "epoch": 1.1832611832611832, "grad_norm": 0.5791814399404469, "learning_rate": 3.309865130259656e-05, "loss": 0.0538, "num_input_tokens_seen": 29104512, "step": 410 }, { "epoch": 1.1976911976911977, "grad_norm": 1.227354622086928, "learning_rate": 3.2739815698294635e-05, "loss": 0.0806, "num_input_tokens_seen": 29460048, "step": 415 }, { "epoch": 1.2121212121212122, "grad_norm": 1.014705815120655, "learning_rate": 3.237920767098735e-05, "loss": 0.0654, "num_input_tokens_seen": 29815240, "step": 420 }, { "epoch": 1.2265512265512266, "grad_norm": 0.6935986036942643, "learning_rate": 3.201690980015572e-05, "loss": 0.0631, "num_input_tokens_seen": 30168648, "step": 425 }, { "epoch": 1.240981240981241, "grad_norm": 0.5742221282988151, "learning_rate": 3.165300505225608e-05, "loss": 0.0454, "num_input_tokens_seen": 30515984, "step": 430 }, { "epoch": 1.2554112554112553, "grad_norm": 0.8521717779753476, "learning_rate": 3.128757676172065e-05, "loss": 0.0435, "num_input_tokens_seen": 30856848, "step": 435 }, { "epoch": 1.2698412698412698, "grad_norm": 0.6676462028746246, "learning_rate": 3.092070861187401e-05, "loss": 0.079, "num_input_tokens_seen": 31210856, "step": 440 }, { "epoch": 1.2842712842712842, "grad_norm": 0.4953272050872759, "learning_rate": 3.0552484615769404e-05, "loss": 0.0551, "num_input_tokens_seen": 31565760, "step": 445 }, { "epoch": 1.2987012987012987, "grad_norm": 0.8296764277086711, "learning_rate": 3.018298909694986e-05, "loss": 0.0607, "num_input_tokens_seen": 31920664, "step": 450 }, { "epoch": 1.3131313131313131, "grad_norm": 0.7341929187486326, "learning_rate": 2.9812306670137928e-05, "loss": 0.0683, "num_input_tokens_seen": 32277696, "step": 455 }, { "epoch": 1.3275613275613276, "grad_norm": 0.5799627106422043, "learning_rate": 2.9440522221858885e-05, "loss": 0.0672, "num_input_tokens_seen": 32629688, "step": 460 }, { "epoch": 1.341991341991342, "grad_norm": 0.892667216375801, "learning_rate": 2.9067720891001676e-05, "loss": 0.0675, "num_input_tokens_seen": 32979664, "step": 465 }, { "epoch": 1.3564213564213565, "grad_norm": 0.3708623827189489, "learning_rate": 2.869398804932204e-05, "loss": 0.0673, "num_input_tokens_seen": 33336624, "step": 470 }, { "epoch": 1.370851370851371, "grad_norm": 0.7639296039850831, "learning_rate": 2.8319409281892307e-05, "loss": 0.0843, "num_input_tokens_seen": 33698032, "step": 475 }, { "epoch": 1.3852813852813852, "grad_norm": 0.659221228832128, "learning_rate": 2.7944070367502402e-05, "loss": 0.0438, "num_input_tokens_seen": 34043384, "step": 480 }, { "epoch": 1.3997113997113997, "grad_norm": 0.6103194296481118, "learning_rate": 2.7568057259016384e-05, "loss": 0.0568, "num_input_tokens_seen": 34400944, "step": 485 }, { "epoch": 1.4141414141414141, "grad_norm": 0.5955688127258445, "learning_rate": 2.7191456063689236e-05, "loss": 0.0673, "num_input_tokens_seen": 34763888, "step": 490 }, { "epoch": 1.4285714285714286, "grad_norm": 0.7048448509220415, "learning_rate": 2.6814353023448213e-05, "loss": 0.0712, "num_input_tokens_seen": 35122880, "step": 495 }, { "epoch": 1.443001443001443, "grad_norm": 0.8954659143802416, "learning_rate": 2.6436834495143396e-05, "loss": 0.0672, "num_input_tokens_seen": 35476128, "step": 500 }, { "epoch": 1.4574314574314573, "grad_norm": 0.5357540884810665, "learning_rate": 2.6058986930771923e-05, "loss": 0.0697, "num_input_tokens_seen": 35826824, "step": 505 }, { "epoch": 1.4718614718614718, "grad_norm": 0.6403871525105113, "learning_rate": 2.568089685768038e-05, "loss": 0.075, "num_input_tokens_seen": 36176528, "step": 510 }, { "epoch": 1.4862914862914862, "grad_norm": 0.6086257743807054, "learning_rate": 2.530265085875005e-05, "loss": 0.0583, "num_input_tokens_seen": 36531584, "step": 515 }, { "epoch": 1.5007215007215007, "grad_norm": 0.7284156072158536, "learning_rate": 2.492433555256933e-05, "loss": 0.0887, "num_input_tokens_seen": 36887632, "step": 520 }, { "epoch": 1.5151515151515151, "grad_norm": 0.5833690078341504, "learning_rate": 2.4546037573598003e-05, "loss": 0.0697, "num_input_tokens_seen": 37237360, "step": 525 }, { "epoch": 1.5295815295815296, "grad_norm": 1.068934721386313, "learning_rate": 2.4167843552327932e-05, "loss": 0.0633, "num_input_tokens_seen": 37594456, "step": 530 }, { "epoch": 1.544011544011544, "grad_norm": 0.6914421570316827, "learning_rate": 2.3789840095444584e-05, "loss": 0.0831, "num_input_tokens_seen": 37943432, "step": 535 }, { "epoch": 1.5584415584415585, "grad_norm": 0.5411649106235956, "learning_rate": 2.341211376599406e-05, "loss": 0.0896, "num_input_tokens_seen": 38309480, "step": 540 }, { "epoch": 1.572871572871573, "grad_norm": 0.7808054192274716, "learning_rate": 2.303475106356009e-05, "loss": 0.075, "num_input_tokens_seen": 38670552, "step": 545 }, { "epoch": 1.5873015873015874, "grad_norm": 0.5377374336741765, "learning_rate": 2.265783840445557e-05, "loss": 0.0661, "num_input_tokens_seen": 39022944, "step": 550 }, { "epoch": 1.601731601731602, "grad_norm": 0.37966039726527356, "learning_rate": 2.2281462101933174e-05, "loss": 0.0525, "num_input_tokens_seen": 39370928, "step": 555 }, { "epoch": 1.6161616161616161, "grad_norm": 1.6803346686839633, "learning_rate": 2.1905708346419553e-05, "loss": 0.0755, "num_input_tokens_seen": 39717904, "step": 560 }, { "epoch": 1.6305916305916306, "grad_norm": 0.5133393164983202, "learning_rate": 2.1530663185777686e-05, "loss": 0.0522, "num_input_tokens_seen": 40067856, "step": 565 }, { "epoch": 1.645021645021645, "grad_norm": 0.7107107176574299, "learning_rate": 2.115641250560183e-05, "loss": 0.063, "num_input_tokens_seen": 40420928, "step": 570 }, { "epoch": 1.6594516594516593, "grad_norm": 0.37375269780433457, "learning_rate": 2.0783042009549696e-05, "loss": 0.0572, "num_input_tokens_seen": 40775672, "step": 575 }, { "epoch": 1.6738816738816737, "grad_norm": 0.4542968746133499, "learning_rate": 2.0410637199716236e-05, "loss": 0.0664, "num_input_tokens_seen": 41132536, "step": 580 }, { "epoch": 1.6883116883116882, "grad_norm": 1.6546823865399398, "learning_rate": 2.00392833570536e-05, "loss": 0.0563, "num_input_tokens_seen": 41492840, "step": 585 }, { "epoch": 1.7027417027417027, "grad_norm": 0.7762350084544962, "learning_rate": 1.9669065521841758e-05, "loss": 0.0754, "num_input_tokens_seen": 41849832, "step": 590 }, { "epoch": 1.7171717171717171, "grad_norm": 0.5851162333943368, "learning_rate": 1.9300068474214195e-05, "loss": 0.0677, "num_input_tokens_seen": 42201136, "step": 595 }, { "epoch": 1.7316017316017316, "grad_norm": 0.9931889138260699, "learning_rate": 1.8932376714743236e-05, "loss": 0.0818, "num_input_tokens_seen": 42558776, "step": 600 } ], "logging_steps": 5, "max_steps": 1038, "num_input_tokens_seen": 42558776, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 67969436221440.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }