{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9781021897810218, "eval_steps": 20, "global_step": 136, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014598540145985401, "grad_norm": 5.558824092078288, "learning_rate": 1e-05, "loss": 0.7048, "step": 1 }, { "epoch": 0.029197080291970802, "grad_norm": 5.39781753446926, "learning_rate": 2e-05, "loss": 0.706, "step": 2 }, { "epoch": 0.043795620437956206, "grad_norm": 2.5767987530959293, "learning_rate": 3e-05, "loss": 0.6427, "step": 3 }, { "epoch": 0.058394160583941604, "grad_norm": 5.332582144004997, "learning_rate": 4e-05, "loss": 0.7227, "step": 4 }, { "epoch": 0.072992700729927, "grad_norm": 6.654038960215542, "learning_rate": 5e-05, "loss": 0.8235, "step": 5 }, { "epoch": 0.08759124087591241, "grad_norm": 6.676666009964554, "learning_rate": 4.9992811366328926e-05, "loss": 0.8702, "step": 6 }, { "epoch": 0.10218978102189781, "grad_norm": 4.274269796378038, "learning_rate": 4.997124959943201e-05, "loss": 0.7634, "step": 7 }, { "epoch": 0.11678832116788321, "grad_norm": 9.488201233376888, "learning_rate": 4.993532709928075e-05, "loss": 0.7925, "step": 8 }, { "epoch": 0.13138686131386862, "grad_norm": 5.052354667424013, "learning_rate": 4.9885064524570665e-05, "loss": 0.7246, "step": 9 }, { "epoch": 0.145985401459854, "grad_norm": 1.5775620837406379, "learning_rate": 4.982049078084071e-05, "loss": 0.6298, "step": 10 }, { "epoch": 0.16058394160583941, "grad_norm": 2.0338294097415694, "learning_rate": 4.974164300384998e-05, "loss": 0.5719, "step": 11 }, { "epoch": 0.17518248175182483, "grad_norm": 1.4271467759265182, "learning_rate": 4.964856653822122e-05, "loss": 0.5577, "step": 12 }, { "epoch": 0.1897810218978102, "grad_norm": 1.4553670927449545, "learning_rate": 4.954131491136362e-05, "loss": 0.484, "step": 13 }, { "epoch": 0.20437956204379562, "grad_norm": 1.270113333054875, "learning_rate": 4.9419949802689666e-05, "loss": 0.4394, "step": 14 }, { "epoch": 0.21897810218978103, "grad_norm": 1.2876560129600205, "learning_rate": 4.92845410081439e-05, "loss": 0.4575, "step": 15 }, { "epoch": 0.23357664233576642, "grad_norm": 1.3643309071212184, "learning_rate": 4.913516640006392e-05, "loss": 0.4642, "step": 16 }, { "epoch": 0.24817518248175183, "grad_norm": 1.1729936051973566, "learning_rate": 4.897191188239667e-05, "loss": 0.434, "step": 17 }, { "epoch": 0.26277372262773724, "grad_norm": 1.1621979050371054, "learning_rate": 4.8794871341296e-05, "loss": 0.405, "step": 18 }, { "epoch": 0.2773722627737226, "grad_norm": 1.1218709581501245, "learning_rate": 4.8604146591129485e-05, "loss": 0.3909, "step": 19 }, { "epoch": 0.291970802919708, "grad_norm": 1.0395959826090466, "learning_rate": 4.8399847315926e-05, "loss": 0.4086, "step": 20 }, { "epoch": 0.291970802919708, "eval_loss": 0.5058628916740417, "eval_runtime": 2.3522, "eval_samples_per_second": 30.184, "eval_steps_per_second": 1.275, "step": 20 }, { "epoch": 0.30656934306569344, "grad_norm": 1.1417380012049692, "learning_rate": 4.818209100629745e-05, "loss": 0.3901, "step": 21 }, { "epoch": 0.32116788321167883, "grad_norm": 1.0254963744794567, "learning_rate": 4.795100289187099e-05, "loss": 0.3723, "step": 22 }, { "epoch": 0.3357664233576642, "grad_norm": 1.0470804323692136, "learning_rate": 4.7706715869270635e-05, "loss": 0.3757, "step": 23 }, { "epoch": 0.35036496350364965, "grad_norm": 0.9566801844725584, "learning_rate": 4.74493704256897e-05, "loss": 0.377, "step": 24 }, { "epoch": 0.36496350364963503, "grad_norm": 0.8719622478856044, "learning_rate": 4.717911455809782e-05, "loss": 0.3599, "step": 25 }, { "epoch": 0.3795620437956204, "grad_norm": 0.8993416168257213, "learning_rate": 4.6896103688129385e-05, "loss": 0.3235, "step": 26 }, { "epoch": 0.39416058394160586, "grad_norm": 0.9295183450455919, "learning_rate": 4.660050057270191e-05, "loss": 0.3538, "step": 27 }, { "epoch": 0.40875912408759124, "grad_norm": 0.8210997173864658, "learning_rate": 4.6292475210416106e-05, "loss": 0.347, "step": 28 }, { "epoch": 0.4233576642335766, "grad_norm": 0.9109711747983845, "learning_rate": 4.597220474379125e-05, "loss": 0.3239, "step": 29 }, { "epoch": 0.43795620437956206, "grad_norm": 0.917876136961733, "learning_rate": 4.563987335739216e-05, "loss": 0.3172, "step": 30 }, { "epoch": 0.45255474452554745, "grad_norm": 0.8348657603950833, "learning_rate": 4.5295672171906364e-05, "loss": 0.3161, "step": 31 }, { "epoch": 0.46715328467153283, "grad_norm": 0.9628144830881412, "learning_rate": 4.49397991342324e-05, "loss": 0.3231, "step": 32 }, { "epoch": 0.48175182481751827, "grad_norm": 0.8758205697486683, "learning_rate": 4.4572458903642354e-05, "loss": 0.3305, "step": 33 }, { "epoch": 0.49635036496350365, "grad_norm": 0.8177550425000464, "learning_rate": 4.419386273408428e-05, "loss": 0.3384, "step": 34 }, { "epoch": 0.5109489051094891, "grad_norm": 0.7707655080363717, "learning_rate": 4.3804228352691935e-05, "loss": 0.3178, "step": 35 }, { "epoch": 0.5255474452554745, "grad_norm": 0.8191298252366446, "learning_rate": 4.3403779834572004e-05, "loss": 0.3346, "step": 36 }, { "epoch": 0.5401459854014599, "grad_norm": 0.9406397953710217, "learning_rate": 4.2992747473940556e-05, "loss": 0.3268, "step": 37 }, { "epoch": 0.5547445255474452, "grad_norm": 0.757783885677172, "learning_rate": 4.2571367651683e-05, "loss": 0.2861, "step": 38 }, { "epoch": 0.5693430656934306, "grad_norm": 0.7275760878949353, "learning_rate": 4.213988269941362e-05, "loss": 0.2516, "step": 39 }, { "epoch": 0.583941605839416, "grad_norm": 0.9299663609616579, "learning_rate": 4.169854076011292e-05, "loss": 0.2844, "step": 40 }, { "epoch": 0.583941605839416, "eval_loss": 0.5329129099845886, "eval_runtime": 1.6671, "eval_samples_per_second": 42.589, "eval_steps_per_second": 1.8, "step": 40 }, { "epoch": 0.5985401459854015, "grad_norm": 0.7884853681211518, "learning_rate": 4.124759564542295e-05, "loss": 0.2759, "step": 41 }, { "epoch": 0.6131386861313869, "grad_norm": 0.7356673142072975, "learning_rate": 4.078730668968253e-05, "loss": 0.2614, "step": 42 }, { "epoch": 0.6277372262773723, "grad_norm": 0.7411930478686042, "learning_rate": 4.031793860078649e-05, "loss": 0.2577, "step": 43 }, { "epoch": 0.6423357664233577, "grad_norm": 0.6945647014688631, "learning_rate": 3.9839761307954675e-05, "loss": 0.2518, "step": 44 }, { "epoch": 0.656934306569343, "grad_norm": 0.758841305493356, "learning_rate": 3.935304980649813e-05, "loss": 0.3, "step": 45 }, { "epoch": 0.6715328467153284, "grad_norm": 0.7083671932224866, "learning_rate": 3.8858083999671855e-05, "loss": 0.2604, "step": 46 }, { "epoch": 0.6861313868613139, "grad_norm": 0.7105677438159481, "learning_rate": 3.835514853770505e-05, "loss": 0.2716, "step": 47 }, { "epoch": 0.7007299270072993, "grad_norm": 0.6955300432925063, "learning_rate": 3.784453265410141e-05, "loss": 0.2535, "step": 48 }, { "epoch": 0.7153284671532847, "grad_norm": 0.6251239165395376, "learning_rate": 3.732652999930364e-05, "loss": 0.2341, "step": 49 }, { "epoch": 0.7299270072992701, "grad_norm": 0.7101720555562038, "learning_rate": 3.680143847181783e-05, "loss": 0.2702, "step": 50 }, { "epoch": 0.7445255474452555, "grad_norm": 0.7982281837079658, "learning_rate": 3.6269560046894766e-05, "loss": 0.2579, "step": 51 }, { "epoch": 0.7591240875912408, "grad_norm": 0.6679783854698054, "learning_rate": 3.573120060286679e-05, "loss": 0.2287, "step": 52 }, { "epoch": 0.7737226277372263, "grad_norm": 0.6971062669945208, "learning_rate": 3.5186669745240026e-05, "loss": 0.2501, "step": 53 }, { "epoch": 0.7883211678832117, "grad_norm": 0.6212428454824916, "learning_rate": 3.463628062864312e-05, "loss": 0.2195, "step": 54 }, { "epoch": 0.8029197080291971, "grad_norm": 0.6621251726886289, "learning_rate": 3.4080349776734925e-05, "loss": 0.2243, "step": 55 }, { "epoch": 0.8175182481751825, "grad_norm": 0.618969299596716, "learning_rate": 3.351919690017473e-05, "loss": 0.2184, "step": 56 }, { "epoch": 0.8321167883211679, "grad_norm": 0.6210248745374214, "learning_rate": 3.2953144712759545e-05, "loss": 0.2255, "step": 57 }, { "epoch": 0.8467153284671532, "grad_norm": 0.648626857719657, "learning_rate": 3.238251874583452e-05, "loss": 0.24, "step": 58 }, { "epoch": 0.8613138686131386, "grad_norm": 0.6442995895157195, "learning_rate": 3.1807647161082795e-05, "loss": 0.2246, "step": 59 }, { "epoch": 0.8759124087591241, "grad_norm": 0.5822022754916901, "learning_rate": 3.122886056180284e-05, "loss": 0.232, "step": 60 }, { "epoch": 0.8759124087591241, "eval_loss": 0.5338236093521118, "eval_runtime": 1.6696, "eval_samples_per_second": 42.525, "eval_steps_per_second": 1.797, "step": 60 }, { "epoch": 0.8905109489051095, "grad_norm": 0.6753004591751348, "learning_rate": 3.064649180278152e-05, "loss": 0.2636, "step": 61 }, { "epoch": 0.9051094890510949, "grad_norm": 0.6326351355678477, "learning_rate": 3.006087579887244e-05, "loss": 0.2112, "step": 62 }, { "epoch": 0.9197080291970803, "grad_norm": 0.5944744601386913, "learning_rate": 2.9472349332389525e-05, "loss": 0.1924, "step": 63 }, { "epoch": 0.9343065693430657, "grad_norm": 0.6089460479407833, "learning_rate": 2.8881250859426646e-05, "loss": 0.2254, "step": 64 }, { "epoch": 0.948905109489051, "grad_norm": 0.5901015804515078, "learning_rate": 2.8287920315214643e-05, "loss": 0.2074, "step": 65 }, { "epoch": 0.9635036496350365, "grad_norm": 0.5482779301116911, "learning_rate": 2.7692698918627778e-05, "loss": 0.1833, "step": 66 }, { "epoch": 0.9781021897810219, "grad_norm": 0.619204984698149, "learning_rate": 2.7095928975951913e-05, "loss": 0.2125, "step": 67 }, { "epoch": 0.9927007299270073, "grad_norm": 0.5754857994127854, "learning_rate": 2.649795368402735e-05, "loss": 0.2373, "step": 68 }, { "epoch": 1.0, "grad_norm": 0.5754857994127854, "learning_rate": 2.649795368402735e-05, "loss": 0.1027, "step": 69 }, { "epoch": 1.0145985401459854, "grad_norm": 0.5481032621072542, "learning_rate": 2.5899116932879534e-05, "loss": 0.1039, "step": 70 }, { "epoch": 1.0291970802919708, "grad_norm": 0.44573568671990305, "learning_rate": 2.529976310795108e-05, "loss": 0.1075, "step": 71 }, { "epoch": 1.0437956204379562, "grad_norm": 0.46800129257717743, "learning_rate": 2.470023689204893e-05, "loss": 0.1053, "step": 72 }, { "epoch": 1.0583941605839415, "grad_norm": 0.5217427141578048, "learning_rate": 2.4100883067120475e-05, "loss": 0.1048, "step": 73 }, { "epoch": 1.072992700729927, "grad_norm": 0.46349892807310117, "learning_rate": 2.3502046315972656e-05, "loss": 0.0987, "step": 74 }, { "epoch": 1.0875912408759123, "grad_norm": 0.4835213983318123, "learning_rate": 2.290407102404809e-05, "loss": 0.0836, "step": 75 }, { "epoch": 1.102189781021898, "grad_norm": 0.5194340907634253, "learning_rate": 2.2307301081372224e-05, "loss": 0.1159, "step": 76 }, { "epoch": 1.1167883211678833, "grad_norm": 0.5807960446197283, "learning_rate": 2.1712079684785363e-05, "loss": 0.0816, "step": 77 }, { "epoch": 1.1313868613138687, "grad_norm": 0.5578387906848328, "learning_rate": 2.111874914057336e-05, "loss": 0.1141, "step": 78 }, { "epoch": 1.145985401459854, "grad_norm": 0.5639933149192473, "learning_rate": 2.0527650667610478e-05, "loss": 0.1146, "step": 79 }, { "epoch": 1.1605839416058394, "grad_norm": 0.5239001485998342, "learning_rate": 1.993912420112756e-05, "loss": 0.0847, "step": 80 }, { "epoch": 1.1605839416058394, "eval_loss": 0.6174606084823608, "eval_runtime": 1.6663, "eval_samples_per_second": 42.61, "eval_steps_per_second": 1.8, "step": 80 }, { "epoch": 1.1751824817518248, "grad_norm": 0.4929761742504611, "learning_rate": 1.935350819721849e-05, "loss": 0.1019, "step": 81 }, { "epoch": 1.1897810218978102, "grad_norm": 0.5544859652703089, "learning_rate": 1.8771139438197168e-05, "loss": 0.1077, "step": 82 }, { "epoch": 1.2043795620437956, "grad_norm": 0.5164965039818554, "learning_rate": 1.819235283891721e-05, "loss": 0.091, "step": 83 }, { "epoch": 1.218978102189781, "grad_norm": 0.49567303589016076, "learning_rate": 1.7617481254165487e-05, "loss": 0.1081, "step": 84 }, { "epoch": 1.2335766423357664, "grad_norm": 0.4587304943562019, "learning_rate": 1.704685528724046e-05, "loss": 0.1087, "step": 85 }, { "epoch": 1.2481751824817517, "grad_norm": 0.5758635229758376, "learning_rate": 1.648080309982528e-05, "loss": 0.1094, "step": 86 }, { "epoch": 1.2627737226277373, "grad_norm": 0.4790913429309353, "learning_rate": 1.591965022326507e-05, "loss": 0.1018, "step": 87 }, { "epoch": 1.2773722627737225, "grad_norm": 0.42743049770737657, "learning_rate": 1.536371937135688e-05, "loss": 0.0921, "step": 88 }, { "epoch": 1.2919708029197081, "grad_norm": 0.4874948888356134, "learning_rate": 1.4813330254759985e-05, "loss": 0.1044, "step": 89 }, { "epoch": 1.3065693430656935, "grad_norm": 0.4772485634630385, "learning_rate": 1.426879939713322e-05, "loss": 0.0816, "step": 90 }, { "epoch": 1.3211678832116789, "grad_norm": 0.40079927908899615, "learning_rate": 1.3730439953105243e-05, "loss": 0.0842, "step": 91 }, { "epoch": 1.3357664233576643, "grad_norm": 0.4275504239180154, "learning_rate": 1.3198561528182183e-05, "loss": 0.0744, "step": 92 }, { "epoch": 1.3503649635036497, "grad_norm": 0.3793648773284397, "learning_rate": 1.2673470000696364e-05, "loss": 0.1033, "step": 93 }, { "epoch": 1.364963503649635, "grad_norm": 0.5181460659792264, "learning_rate": 1.2155467345898602e-05, "loss": 0.1001, "step": 94 }, { "epoch": 1.3795620437956204, "grad_norm": 0.45318669096336833, "learning_rate": 1.1644851462294957e-05, "loss": 0.101, "step": 95 }, { "epoch": 1.3941605839416058, "grad_norm": 0.4766867331350184, "learning_rate": 1.114191600032815e-05, "loss": 0.0863, "step": 96 }, { "epoch": 1.4087591240875912, "grad_norm": 0.4080689197803639, "learning_rate": 1.064695019350187e-05, "loss": 0.0855, "step": 97 }, { "epoch": 1.4233576642335766, "grad_norm": 0.4653770430519792, "learning_rate": 1.0160238692045332e-05, "loss": 0.111, "step": 98 }, { "epoch": 1.437956204379562, "grad_norm": 0.45097632263026527, "learning_rate": 9.682061399213525e-06, "loss": 0.0939, "step": 99 }, { "epoch": 1.4525547445255476, "grad_norm": 0.4496977114474888, "learning_rate": 9.21269331031748e-06, "loss": 0.0882, "step": 100 }, { "epoch": 1.4525547445255476, "eval_loss": 0.5955010652542114, "eval_runtime": 1.6733, "eval_samples_per_second": 42.432, "eval_steps_per_second": 1.793, "step": 100 }, { "epoch": 1.4671532846715327, "grad_norm": 0.45643797377810924, "learning_rate": 8.752404354577052e-06, "loss": 0.0937, "step": 101 }, { "epoch": 1.4817518248175183, "grad_norm": 0.38809063898842117, "learning_rate": 8.301459239887074e-06, "loss": 0.0803, "step": 102 }, { "epoch": 1.4963503649635037, "grad_norm": 0.4599787928563747, "learning_rate": 7.860117300586383e-06, "loss": 0.1008, "step": 103 }, { "epoch": 1.510948905109489, "grad_norm": 0.4265168032214748, "learning_rate": 7.428632348317005e-06, "loss": 0.0731, "step": 104 }, { "epoch": 1.5255474452554745, "grad_norm": 0.4408034300546994, "learning_rate": 7.007252526059446e-06, "loss": 0.075, "step": 105 }, { "epoch": 1.5401459854014599, "grad_norm": 0.36837598605703786, "learning_rate": 6.596220165428002e-06, "loss": 0.0797, "step": 106 }, { "epoch": 1.5547445255474452, "grad_norm": 0.45686833811722544, "learning_rate": 6.195771647308074e-06, "loss": 0.1031, "step": 107 }, { "epoch": 1.5693430656934306, "grad_norm": 0.43293441847585606, "learning_rate": 5.806137265915732e-06, "loss": 0.0876, "step": 108 }, { "epoch": 1.583941605839416, "grad_norm": 0.4018622232704113, "learning_rate": 5.427541096357647e-06, "loss": 0.0859, "step": 109 }, { "epoch": 1.5985401459854014, "grad_norm": 0.42651961163568564, "learning_rate": 5.060200865767606e-06, "loss": 0.0914, "step": 110 }, { "epoch": 1.613138686131387, "grad_norm": 0.45501350150501924, "learning_rate": 4.704327828093641e-06, "loss": 0.0891, "step": 111 }, { "epoch": 1.6277372262773722, "grad_norm": 0.4018716741861915, "learning_rate": 4.3601266426078426e-06, "loss": 0.0802, "step": 112 }, { "epoch": 1.6423357664233578, "grad_norm": 0.4381477464449764, "learning_rate": 4.02779525620875e-06, "loss": 0.0837, "step": 113 }, { "epoch": 1.656934306569343, "grad_norm": 0.38056775027295703, "learning_rate": 3.707524789583891e-06, "loss": 0.0733, "step": 114 }, { "epoch": 1.6715328467153285, "grad_norm": 0.38615940067266047, "learning_rate": 3.3994994272980946e-06, "loss": 0.0816, "step": 115 }, { "epoch": 1.686131386861314, "grad_norm": 0.4083027079839302, "learning_rate": 3.1038963118706244e-06, "loss": 0.0947, "step": 116 }, { "epoch": 1.7007299270072993, "grad_norm": 0.4281395329269055, "learning_rate": 2.8208854419021824e-06, "loss": 0.0807, "step": 117 }, { "epoch": 1.7153284671532847, "grad_norm": 0.33771309316638104, "learning_rate": 2.5506295743103094e-06, "loss": 0.0576, "step": 118 }, { "epoch": 1.72992700729927, "grad_norm": 0.3862388702821623, "learning_rate": 2.2932841307293644e-06, "loss": 0.0857, "step": 119 }, { "epoch": 1.7445255474452555, "grad_norm": 0.3825839529168523, "learning_rate": 2.0489971081290195e-06, "loss": 0.0868, "step": 120 }, { "epoch": 1.7445255474452555, "eval_loss": 0.6069221496582031, "eval_runtime": 1.6811, "eval_samples_per_second": 42.233, "eval_steps_per_second": 1.785, "step": 120 }, { "epoch": 1.7591240875912408, "grad_norm": 0.40516989643017143, "learning_rate": 1.817908993702555e-06, "loss": 0.0781, "step": 121 }, { "epoch": 1.7737226277372264, "grad_norm": 0.37108997015823786, "learning_rate": 1.6001526840740049e-06, "loss": 0.0773, "step": 122 }, { "epoch": 1.7883211678832116, "grad_norm": 0.4152153354127059, "learning_rate": 1.3958534088705206e-06, "loss": 0.0648, "step": 123 }, { "epoch": 1.8029197080291972, "grad_norm": 0.3350032112266357, "learning_rate": 1.205128658704005e-06, "loss": 0.0685, "step": 124 }, { "epoch": 1.8175182481751824, "grad_norm": 0.37253550313302386, "learning_rate": 1.028088117603332e-06, "loss": 0.0597, "step": 125 }, { "epoch": 1.832116788321168, "grad_norm": 0.3646815056116574, "learning_rate": 8.648335999360935e-07, "loss": 0.0851, "step": 126 }, { "epoch": 1.8467153284671531, "grad_norm": 0.40006480391539845, "learning_rate": 7.154589918561022e-07, "loss": 0.0775, "step": 127 }, { "epoch": 1.8613138686131387, "grad_norm": 0.3998882921725406, "learning_rate": 5.800501973103362e-07, "loss": 0.0837, "step": 128 }, { "epoch": 1.8759124087591241, "grad_norm": 0.45939594997920835, "learning_rate": 4.586850886363875e-07, "loss": 0.104, "step": 129 }, { "epoch": 1.8905109489051095, "grad_norm": 0.415499645321787, "learning_rate": 3.514334617787857e-07, "loss": 0.0694, "step": 130 }, { "epoch": 1.905109489051095, "grad_norm": 0.3572513685399802, "learning_rate": 2.5835699615002764e-07, "loss": 0.0617, "step": 131 }, { "epoch": 1.9197080291970803, "grad_norm": 0.3824291555094829, "learning_rate": 1.7950921915928788e-07, "loss": 0.0792, "step": 132 }, { "epoch": 1.9343065693430657, "grad_norm": 0.42192592913917887, "learning_rate": 1.1493547542933969e-07, "loss": 0.0919, "step": 133 }, { "epoch": 1.948905109489051, "grad_norm": 0.40497490119047314, "learning_rate": 6.467290071925647e-08, "loss": 0.0655, "step": 134 }, { "epoch": 1.9635036496350367, "grad_norm": 0.3653261951630353, "learning_rate": 2.8750400567992274e-08, "loss": 0.0607, "step": 135 }, { "epoch": 1.9781021897810218, "grad_norm": 0.40679528870310416, "learning_rate": 7.188633671079137e-09, "loss": 0.0823, "step": 136 }, { "epoch": 1.9781021897810218, "step": 136, "total_flos": 89096184856576.0, "train_loss": 0.23073186133714282, "train_runtime": 917.4487, "train_samples_per_second": 11.959, "train_steps_per_second": 0.148 } ], "logging_steps": 1, "max_steps": 136, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 89096184856576.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }