{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 4900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02040816326530612, "grad_norm": 10.628430366516113, "learning_rate": 3.6734693877551024e-06, "loss": 1.0152, "step": 10 }, { "epoch": 0.04081632653061224, "grad_norm": 3.175971269607544, "learning_rate": 7.755102040816327e-06, "loss": 0.5298, "step": 20 }, { "epoch": 0.061224489795918366, "grad_norm": 2.9172680377960205, "learning_rate": 1.1836734693877552e-05, "loss": 0.337, "step": 30 }, { "epoch": 0.08163265306122448, "grad_norm": 2.0697555541992188, "learning_rate": 1.5918367346938776e-05, "loss": 0.255, "step": 40 }, { "epoch": 0.10204081632653061, "grad_norm": 1.7489535808563232, "learning_rate": 2e-05, "loss": 0.2925, "step": 50 }, { "epoch": 0.12244897959183673, "grad_norm": 2.501734495162964, "learning_rate": 2.4081632653061224e-05, "loss": 0.2364, "step": 60 }, { "epoch": 0.14285714285714285, "grad_norm": 1.3701616525650024, "learning_rate": 2.816326530612245e-05, "loss": 0.2546, "step": 70 }, { "epoch": 0.16326530612244897, "grad_norm": 2.0604536533355713, "learning_rate": 3.224489795918367e-05, "loss": 0.2513, "step": 80 }, { "epoch": 0.1836734693877551, "grad_norm": 1.1776974201202393, "learning_rate": 3.63265306122449e-05, "loss": 0.2447, "step": 90 }, { "epoch": 0.20408163265306123, "grad_norm": 1.3436124324798584, "learning_rate": 4.0408163265306124e-05, "loss": 0.2096, "step": 100 }, { "epoch": 0.22448979591836735, "grad_norm": 1.3945685625076294, "learning_rate": 4.448979591836735e-05, "loss": 0.1879, "step": 110 }, { "epoch": 0.24489795918367346, "grad_norm": 1.264801263809204, "learning_rate": 4.8571428571428576e-05, "loss": 0.1622, "step": 120 }, { "epoch": 0.2653061224489796, "grad_norm": 1.1072674989700317, "learning_rate": 5.26530612244898e-05, "loss": 0.1611, "step": 130 }, { "epoch": 0.2857142857142857, "grad_norm": 1.289307713508606, "learning_rate": 5.673469387755103e-05, "loss": 0.133, "step": 140 }, { "epoch": 0.30612244897959184, "grad_norm": 1.7481720447540283, "learning_rate": 6.081632653061224e-05, "loss": 0.1362, "step": 150 }, { "epoch": 0.32653061224489793, "grad_norm": 1.594106912612915, "learning_rate": 6.489795918367347e-05, "loss": 0.1247, "step": 160 }, { "epoch": 0.3469387755102041, "grad_norm": 1.075558066368103, "learning_rate": 6.897959183673471e-05, "loss": 0.125, "step": 170 }, { "epoch": 0.3673469387755102, "grad_norm": 1.093751311302185, "learning_rate": 7.306122448979592e-05, "loss": 0.1133, "step": 180 }, { "epoch": 0.3877551020408163, "grad_norm": 0.7638540863990784, "learning_rate": 7.714285714285715e-05, "loss": 0.1103, "step": 190 }, { "epoch": 0.40816326530612246, "grad_norm": 0.8106374740600586, "learning_rate": 8.122448979591836e-05, "loss": 0.1062, "step": 200 }, { "epoch": 0.42857142857142855, "grad_norm": 0.8688479065895081, "learning_rate": 8.53061224489796e-05, "loss": 0.1145, "step": 210 }, { "epoch": 0.4489795918367347, "grad_norm": 1.0158841609954834, "learning_rate": 8.938775510204082e-05, "loss": 0.1099, "step": 220 }, { "epoch": 0.46938775510204084, "grad_norm": 0.9198583364486694, "learning_rate": 9.346938775510204e-05, "loss": 0.1003, "step": 230 }, { "epoch": 0.4897959183673469, "grad_norm": 1.484209418296814, "learning_rate": 9.755102040816328e-05, "loss": 0.105, "step": 240 }, { "epoch": 0.5102040816326531, "grad_norm": 0.7159306406974792, "learning_rate": 9.999981781185989e-05, "loss": 0.104, "step": 250 }, { "epoch": 0.5306122448979592, "grad_norm": 0.8900401592254639, "learning_rate": 9.999776821053133e-05, "loss": 0.0946, "step": 260 }, { "epoch": 0.5510204081632653, "grad_norm": 0.9417662620544434, "learning_rate": 9.999344136636323e-05, "loss": 0.1161, "step": 270 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7933630347251892, "learning_rate": 9.998683747642985e-05, "loss": 0.0966, "step": 280 }, { "epoch": 0.5918367346938775, "grad_norm": 1.1042319536209106, "learning_rate": 9.997795684151788e-05, "loss": 0.113, "step": 290 }, { "epoch": 0.6122448979591837, "grad_norm": 0.4663528501987457, "learning_rate": 9.996679986611258e-05, "loss": 0.096, "step": 300 }, { "epoch": 0.6326530612244898, "grad_norm": 1.089492678642273, "learning_rate": 9.99533670583795e-05, "loss": 0.1061, "step": 310 }, { "epoch": 0.6530612244897959, "grad_norm": 0.8293689489364624, "learning_rate": 9.993765903014125e-05, "loss": 0.0948, "step": 320 }, { "epoch": 0.673469387755102, "grad_norm": 0.7057058811187744, "learning_rate": 9.991967649684967e-05, "loss": 0.0966, "step": 330 }, { "epoch": 0.6938775510204082, "grad_norm": 0.5784198641777039, "learning_rate": 9.98994202775532e-05, "loss": 0.0875, "step": 340 }, { "epoch": 0.7142857142857143, "grad_norm": 0.6503363847732544, "learning_rate": 9.987689129485964e-05, "loss": 0.0929, "step": 350 }, { "epoch": 0.7346938775510204, "grad_norm": 0.7261630892753601, "learning_rate": 9.98520905748941e-05, "loss": 0.0823, "step": 360 }, { "epoch": 0.7551020408163265, "grad_norm": 0.8498600721359253, "learning_rate": 9.982501924725222e-05, "loss": 0.0918, "step": 370 }, { "epoch": 0.7755102040816326, "grad_norm": 0.5623912215232849, "learning_rate": 9.979567854494877e-05, "loss": 0.0873, "step": 380 }, { "epoch": 0.7959183673469388, "grad_norm": 0.5773722529411316, "learning_rate": 9.97640698043615e-05, "loss": 0.0921, "step": 390 }, { "epoch": 0.8163265306122449, "grad_norm": 0.6975867748260498, "learning_rate": 9.973019446517023e-05, "loss": 0.0821, "step": 400 }, { "epoch": 0.8367346938775511, "grad_norm": 0.7360525131225586, "learning_rate": 9.96940540702913e-05, "loss": 0.0965, "step": 410 }, { "epoch": 0.8571428571428571, "grad_norm": 1.1155353784561157, "learning_rate": 9.96556502658073e-05, "loss": 0.0879, "step": 420 }, { "epoch": 0.8775510204081632, "grad_norm": 0.43246451020240784, "learning_rate": 9.961498480089208e-05, "loss": 0.0892, "step": 430 }, { "epoch": 0.8979591836734694, "grad_norm": 0.6282567381858826, "learning_rate": 9.957205952773108e-05, "loss": 0.0826, "step": 440 }, { "epoch": 0.9183673469387755, "grad_norm": 0.9212186932563782, "learning_rate": 9.952687640143699e-05, "loss": 0.0901, "step": 450 }, { "epoch": 0.9387755102040817, "grad_norm": 0.7539145350456238, "learning_rate": 9.947943747996069e-05, "loss": 0.0881, "step": 460 }, { "epoch": 0.9591836734693877, "grad_norm": 0.4982381761074066, "learning_rate": 9.94297449239975e-05, "loss": 0.0895, "step": 470 }, { "epoch": 0.9795918367346939, "grad_norm": 0.6672772169113159, "learning_rate": 9.93778009968888e-05, "loss": 0.081, "step": 480 }, { "epoch": 1.0, "grad_norm": 0.8654345870018005, "learning_rate": 9.932360806451893e-05, "loss": 0.084, "step": 490 }, { "epoch": 1.0204081632653061, "grad_norm": 0.7786339521408081, "learning_rate": 9.926716859520737e-05, "loss": 0.0846, "step": 500 }, { "epoch": 1.0408163265306123, "grad_norm": 1.0392732620239258, "learning_rate": 9.920848515959645e-05, "loss": 0.0846, "step": 510 }, { "epoch": 1.0612244897959184, "grad_norm": 0.7900115847587585, "learning_rate": 9.91475604305341e-05, "loss": 0.0749, "step": 520 }, { "epoch": 1.0816326530612246, "grad_norm": 0.5556976795196533, "learning_rate": 9.908439718295229e-05, "loss": 0.0785, "step": 530 }, { "epoch": 1.1020408163265305, "grad_norm": 0.5236167311668396, "learning_rate": 9.901899829374047e-05, "loss": 0.0773, "step": 540 }, { "epoch": 1.1224489795918366, "grad_norm": 1.0668035745620728, "learning_rate": 9.895136674161465e-05, "loss": 0.0769, "step": 550 }, { "epoch": 1.1428571428571428, "grad_norm": 0.6634009480476379, "learning_rate": 9.888150560698169e-05, "loss": 0.065, "step": 560 }, { "epoch": 1.163265306122449, "grad_norm": 0.6299061179161072, "learning_rate": 9.880941807179895e-05, "loss": 0.0791, "step": 570 }, { "epoch": 1.183673469387755, "grad_norm": 0.8009123802185059, "learning_rate": 9.873510741942951e-05, "loss": 0.0854, "step": 580 }, { "epoch": 1.2040816326530612, "grad_norm": 0.7957753539085388, "learning_rate": 9.865857703449242e-05, "loss": 0.0826, "step": 590 }, { "epoch": 1.2244897959183674, "grad_norm": 0.7319406270980835, "learning_rate": 9.857983040270872e-05, "loss": 0.0849, "step": 600 }, { "epoch": 1.2448979591836735, "grad_norm": 0.7303819060325623, "learning_rate": 9.849887111074256e-05, "loss": 0.0856, "step": 610 }, { "epoch": 1.2653061224489797, "grad_norm": 0.5338705778121948, "learning_rate": 9.841570284603787e-05, "loss": 0.0742, "step": 620 }, { "epoch": 1.2857142857142856, "grad_norm": 0.5050308704376221, "learning_rate": 9.833032939665047e-05, "loss": 0.0693, "step": 630 }, { "epoch": 1.306122448979592, "grad_norm": 0.6604131460189819, "learning_rate": 9.824275465107545e-05, "loss": 0.0695, "step": 640 }, { "epoch": 1.3265306122448979, "grad_norm": 0.691947877407074, "learning_rate": 9.815298259807008e-05, "loss": 0.0713, "step": 650 }, { "epoch": 1.346938775510204, "grad_norm": 0.5263407826423645, "learning_rate": 9.806101732647217e-05, "loss": 0.0743, "step": 660 }, { "epoch": 1.3673469387755102, "grad_norm": 0.8018398880958557, "learning_rate": 9.796686302501382e-05, "loss": 0.0791, "step": 670 }, { "epoch": 1.3877551020408163, "grad_norm": 0.7871843576431274, "learning_rate": 9.787052398213062e-05, "loss": 0.0792, "step": 680 }, { "epoch": 1.4081632653061225, "grad_norm": 0.509185254573822, "learning_rate": 9.777200458576634e-05, "loss": 0.0778, "step": 690 }, { "epoch": 1.4285714285714286, "grad_norm": 0.6663973927497864, "learning_rate": 9.767130932317305e-05, "loss": 0.0769, "step": 700 }, { "epoch": 1.4489795918367347, "grad_norm": 0.6002593636512756, "learning_rate": 9.756844278070682e-05, "loss": 0.0775, "step": 710 }, { "epoch": 1.469387755102041, "grad_norm": 0.5615339875221252, "learning_rate": 9.746340964361871e-05, "loss": 0.0672, "step": 720 }, { "epoch": 1.489795918367347, "grad_norm": 1.005101203918457, "learning_rate": 9.735621469584144e-05, "loss": 0.0762, "step": 730 }, { "epoch": 1.510204081632653, "grad_norm": 0.7237843871116638, "learning_rate": 9.724686281977146e-05, "loss": 0.0643, "step": 740 }, { "epoch": 1.5306122448979593, "grad_norm": 0.4733352065086365, "learning_rate": 9.713535899604666e-05, "loss": 0.0717, "step": 750 }, { "epoch": 1.5510204081632653, "grad_norm": 0.5426393151283264, "learning_rate": 9.70217083033194e-05, "loss": 0.0648, "step": 760 }, { "epoch": 1.5714285714285714, "grad_norm": 0.7964388132095337, "learning_rate": 9.690591591802524e-05, "loss": 0.0701, "step": 770 }, { "epoch": 1.5918367346938775, "grad_norm": 0.6255714893341064, "learning_rate": 9.678798711414721e-05, "loss": 0.0694, "step": 780 }, { "epoch": 1.6122448979591837, "grad_norm": 0.5886476635932922, "learning_rate": 9.666792726297555e-05, "loss": 0.0652, "step": 790 }, { "epoch": 1.6326530612244898, "grad_norm": 0.5296547412872314, "learning_rate": 9.654574183286307e-05, "loss": 0.0653, "step": 800 }, { "epoch": 1.6530612244897958, "grad_norm": 0.7035044431686401, "learning_rate": 9.642143638897609e-05, "loss": 0.0718, "step": 810 }, { "epoch": 1.6734693877551021, "grad_norm": 0.554286539554596, "learning_rate": 9.629501659304096e-05, "loss": 0.0687, "step": 820 }, { "epoch": 1.693877551020408, "grad_norm": 0.5010990500450134, "learning_rate": 9.61664882030862e-05, "loss": 0.0717, "step": 830 }, { "epoch": 1.7142857142857144, "grad_norm": 0.9288004040718079, "learning_rate": 9.60358570731802e-05, "loss": 0.068, "step": 840 }, { "epoch": 1.7346938775510203, "grad_norm": 0.7913525104522705, "learning_rate": 9.59031291531647e-05, "loss": 0.0753, "step": 850 }, { "epoch": 1.7551020408163265, "grad_norm": 0.6899034380912781, "learning_rate": 9.57683104883836e-05, "loss": 0.0667, "step": 860 }, { "epoch": 1.7755102040816326, "grad_norm": 0.4607239365577698, "learning_rate": 9.563140721940779e-05, "loss": 0.068, "step": 870 }, { "epoch": 1.7959183673469388, "grad_norm": 0.7755603790283203, "learning_rate": 9.549242558175536e-05, "loss": 0.065, "step": 880 }, { "epoch": 1.816326530612245, "grad_norm": 0.7670177817344666, "learning_rate": 9.535137190560766e-05, "loss": 0.0641, "step": 890 }, { "epoch": 1.836734693877551, "grad_norm": 0.7770594954490662, "learning_rate": 9.520825261552092e-05, "loss": 0.0734, "step": 900 }, { "epoch": 1.8571428571428572, "grad_norm": 0.5837225317955017, "learning_rate": 9.50630742301337e-05, "loss": 0.0613, "step": 910 }, { "epoch": 1.8775510204081631, "grad_norm": 0.7210603952407837, "learning_rate": 9.491584336186989e-05, "loss": 0.0688, "step": 920 }, { "epoch": 1.8979591836734695, "grad_norm": 0.38964253664016724, "learning_rate": 9.476656671663765e-05, "loss": 0.0647, "step": 930 }, { "epoch": 1.9183673469387754, "grad_norm": 0.8442269563674927, "learning_rate": 9.46152510935239e-05, "loss": 0.0641, "step": 940 }, { "epoch": 1.9387755102040818, "grad_norm": 0.8859763741493225, "learning_rate": 9.446190338448463e-05, "loss": 0.0718, "step": 950 }, { "epoch": 1.9591836734693877, "grad_norm": 0.670879065990448, "learning_rate": 9.430653057403105e-05, "loss": 0.0608, "step": 960 }, { "epoch": 1.9795918367346939, "grad_norm": 0.5757398009300232, "learning_rate": 9.414913973891144e-05, "loss": 0.0581, "step": 970 }, { "epoch": 2.0, "grad_norm": 0.6478833556175232, "learning_rate": 9.398973804778882e-05, "loss": 0.0636, "step": 980 }, { "epoch": 2.020408163265306, "grad_norm": 0.46475401520729065, "learning_rate": 9.382833276091447e-05, "loss": 0.0616, "step": 990 }, { "epoch": 2.0408163265306123, "grad_norm": 0.8922365307807922, "learning_rate": 9.366493122979718e-05, "loss": 0.0684, "step": 1000 }, { "epoch": 2.061224489795918, "grad_norm": 0.5000820755958557, "learning_rate": 9.349954089686852e-05, "loss": 0.0563, "step": 1010 }, { "epoch": 2.0816326530612246, "grad_norm": 0.5674407482147217, "learning_rate": 9.333216929514372e-05, "loss": 0.0558, "step": 1020 }, { "epoch": 2.1020408163265305, "grad_norm": 0.7039157748222351, "learning_rate": 9.316282404787871e-05, "loss": 0.0653, "step": 1030 }, { "epoch": 2.122448979591837, "grad_norm": 0.56775963306427, "learning_rate": 9.299151286822278e-05, "loss": 0.0661, "step": 1040 }, { "epoch": 2.142857142857143, "grad_norm": 0.6371131539344788, "learning_rate": 9.281824355886737e-05, "loss": 0.0594, "step": 1050 }, { "epoch": 2.163265306122449, "grad_norm": 0.6409271359443665, "learning_rate": 9.264302401169063e-05, "loss": 0.0658, "step": 1060 }, { "epoch": 2.183673469387755, "grad_norm": 0.3617221415042877, "learning_rate": 9.246586220739794e-05, "loss": 0.0631, "step": 1070 }, { "epoch": 2.204081632653061, "grad_norm": 0.5386372804641724, "learning_rate": 9.228676621515853e-05, "loss": 0.0649, "step": 1080 }, { "epoch": 2.2244897959183674, "grad_norm": 0.6013124585151672, "learning_rate": 9.210574419223777e-05, "loss": 0.0617, "step": 1090 }, { "epoch": 2.2448979591836733, "grad_norm": 0.5672731399536133, "learning_rate": 9.192280438362582e-05, "loss": 0.0657, "step": 1100 }, { "epoch": 2.2653061224489797, "grad_norm": 0.7263081669807434, "learning_rate": 9.173795512166197e-05, "loss": 0.0662, "step": 1110 }, { "epoch": 2.2857142857142856, "grad_norm": 0.6938877701759338, "learning_rate": 9.155120482565521e-05, "loss": 0.0644, "step": 1120 }, { "epoch": 2.306122448979592, "grad_norm": 0.40912479162216187, "learning_rate": 9.136256200150066e-05, "loss": 0.0678, "step": 1130 }, { "epoch": 2.326530612244898, "grad_norm": 0.6514697074890137, "learning_rate": 9.117203524129228e-05, "loss": 0.0623, "step": 1140 }, { "epoch": 2.3469387755102042, "grad_norm": 0.5389600992202759, "learning_rate": 9.097963322293142e-05, "loss": 0.0648, "step": 1150 }, { "epoch": 2.36734693877551, "grad_norm": 0.5755404829978943, "learning_rate": 9.078536470973158e-05, "loss": 0.0604, "step": 1160 }, { "epoch": 2.387755102040816, "grad_norm": 0.7156921029090881, "learning_rate": 9.058923855001935e-05, "loss": 0.0587, "step": 1170 }, { "epoch": 2.4081632653061225, "grad_norm": 0.6002137660980225, "learning_rate": 9.039126367673132e-05, "loss": 0.0579, "step": 1180 }, { "epoch": 2.4285714285714284, "grad_norm": 0.4457235038280487, "learning_rate": 9.01914491070072e-05, "loss": 0.0586, "step": 1190 }, { "epoch": 2.4489795918367347, "grad_norm": 0.522216796875, "learning_rate": 8.99898039417792e-05, "loss": 0.0685, "step": 1200 }, { "epoch": 2.4693877551020407, "grad_norm": 0.4636695683002472, "learning_rate": 8.978633736535746e-05, "loss": 0.0577, "step": 1210 }, { "epoch": 2.489795918367347, "grad_norm": 0.40519633889198303, "learning_rate": 8.95810586450117e-05, "loss": 0.0532, "step": 1220 }, { "epoch": 2.510204081632653, "grad_norm": 0.7298558950424194, "learning_rate": 8.937397713054916e-05, "loss": 0.0646, "step": 1230 }, { "epoch": 2.5306122448979593, "grad_norm": 0.4737094044685364, "learning_rate": 8.916510225388878e-05, "loss": 0.0627, "step": 1240 }, { "epoch": 2.5510204081632653, "grad_norm": 0.3282434940338135, "learning_rate": 8.895444352863154e-05, "loss": 0.0617, "step": 1250 }, { "epoch": 2.571428571428571, "grad_norm": 0.6259459257125854, "learning_rate": 8.874201054962721e-05, "loss": 0.0628, "step": 1260 }, { "epoch": 2.5918367346938775, "grad_norm": 0.5925482511520386, "learning_rate": 8.852781299253724e-05, "loss": 0.0618, "step": 1270 }, { "epoch": 2.612244897959184, "grad_norm": 0.4080643951892853, "learning_rate": 8.83118606133942e-05, "loss": 0.0587, "step": 1280 }, { "epoch": 2.63265306122449, "grad_norm": 0.5757149457931519, "learning_rate": 8.809416324815729e-05, "loss": 0.0647, "step": 1290 }, { "epoch": 2.6530612244897958, "grad_norm": 0.6882585287094116, "learning_rate": 8.78747308122644e-05, "loss": 0.0611, "step": 1300 }, { "epoch": 2.673469387755102, "grad_norm": 0.4955786466598511, "learning_rate": 8.765357330018056e-05, "loss": 0.0595, "step": 1310 }, { "epoch": 2.693877551020408, "grad_norm": 0.425704687833786, "learning_rate": 8.743070078494255e-05, "loss": 0.069, "step": 1320 }, { "epoch": 2.7142857142857144, "grad_norm": 0.6342069506645203, "learning_rate": 8.720612341770027e-05, "loss": 0.0605, "step": 1330 }, { "epoch": 2.7346938775510203, "grad_norm": 0.5021179914474487, "learning_rate": 8.697985142725435e-05, "loss": 0.0632, "step": 1340 }, { "epoch": 2.7551020408163263, "grad_norm": 0.4925266206264496, "learning_rate": 8.67518951195902e-05, "loss": 0.0598, "step": 1350 }, { "epoch": 2.7755102040816326, "grad_norm": 0.5765326023101807, "learning_rate": 8.652226487740863e-05, "loss": 0.0577, "step": 1360 }, { "epoch": 2.795918367346939, "grad_norm": 0.8558735847473145, "learning_rate": 8.629097115965298e-05, "loss": 0.0614, "step": 1370 }, { "epoch": 2.816326530612245, "grad_norm": 0.6366243958473206, "learning_rate": 8.605802450103275e-05, "loss": 0.0692, "step": 1380 }, { "epoch": 2.836734693877551, "grad_norm": 0.97010737657547, "learning_rate": 8.58234355115437e-05, "loss": 0.0567, "step": 1390 }, { "epoch": 2.857142857142857, "grad_norm": 0.48445919156074524, "learning_rate": 8.558721487598471e-05, "loss": 0.054, "step": 1400 }, { "epoch": 2.877551020408163, "grad_norm": 0.5952962040901184, "learning_rate": 8.534937335347102e-05, "loss": 0.0529, "step": 1410 }, { "epoch": 2.8979591836734695, "grad_norm": 0.5294446349143982, "learning_rate": 8.510992177694429e-05, "loss": 0.055, "step": 1420 }, { "epoch": 2.9183673469387754, "grad_norm": 0.47300493717193604, "learning_rate": 8.486887105267902e-05, "loss": 0.0598, "step": 1430 }, { "epoch": 2.938775510204082, "grad_norm": 0.37820449471473694, "learning_rate": 8.462623215978605e-05, "loss": 0.0618, "step": 1440 }, { "epoch": 2.9591836734693877, "grad_norm": 0.46813079714775085, "learning_rate": 8.438201614971226e-05, "loss": 0.059, "step": 1450 }, { "epoch": 2.979591836734694, "grad_norm": 0.5875042080879211, "learning_rate": 8.41362341457374e-05, "loss": 0.0564, "step": 1460 }, { "epoch": 3.0, "grad_norm": 0.7085863947868347, "learning_rate": 8.388889734246725e-05, "loss": 0.058, "step": 1470 }, { "epoch": 3.020408163265306, "grad_norm": 0.5409305095672607, "learning_rate": 8.364001700532401e-05, "loss": 0.0666, "step": 1480 }, { "epoch": 3.0408163265306123, "grad_norm": 0.7195011377334595, "learning_rate": 8.338960447003292e-05, "loss": 0.0578, "step": 1490 }, { "epoch": 3.061224489795918, "grad_norm": 0.4446586072444916, "learning_rate": 8.313767114210615e-05, "loss": 0.0634, "step": 1500 }, { "epoch": 3.0816326530612246, "grad_norm": 0.4142863154411316, "learning_rate": 8.288422849632325e-05, "loss": 0.0652, "step": 1510 }, { "epoch": 3.1020408163265305, "grad_norm": 0.47294557094573975, "learning_rate": 8.262928807620843e-05, "loss": 0.0552, "step": 1520 }, { "epoch": 3.122448979591837, "grad_norm": 0.4253334403038025, "learning_rate": 8.237286149350495e-05, "loss": 0.0596, "step": 1530 }, { "epoch": 3.142857142857143, "grad_norm": 0.4108365774154663, "learning_rate": 8.211496042764612e-05, "loss": 0.059, "step": 1540 }, { "epoch": 3.163265306122449, "grad_norm": 0.4678657650947571, "learning_rate": 8.185559662522337e-05, "loss": 0.0617, "step": 1550 }, { "epoch": 3.183673469387755, "grad_norm": 0.5301884412765503, "learning_rate": 8.159478189945121e-05, "loss": 0.0513, "step": 1560 }, { "epoch": 3.204081632653061, "grad_norm": 0.8106349110603333, "learning_rate": 8.133252812962922e-05, "loss": 0.0567, "step": 1570 }, { "epoch": 3.2244897959183674, "grad_norm": 0.4195539355278015, "learning_rate": 8.106884726060098e-05, "loss": 0.053, "step": 1580 }, { "epoch": 3.2448979591836733, "grad_norm": 0.5291970372200012, "learning_rate": 8.080375130220994e-05, "loss": 0.0549, "step": 1590 }, { "epoch": 3.2653061224489797, "grad_norm": 0.45250171422958374, "learning_rate": 8.053725232875253e-05, "loss": 0.0563, "step": 1600 }, { "epoch": 3.2857142857142856, "grad_norm": 0.5225273966789246, "learning_rate": 8.026936247842813e-05, "loss": 0.0547, "step": 1610 }, { "epoch": 3.306122448979592, "grad_norm": 0.4152168929576874, "learning_rate": 8.000009395278624e-05, "loss": 0.0491, "step": 1620 }, { "epoch": 3.326530612244898, "grad_norm": 0.4513435959815979, "learning_rate": 7.972945901617072e-05, "loss": 0.0522, "step": 1630 }, { "epoch": 3.3469387755102042, "grad_norm": 0.38231614232063293, "learning_rate": 7.945746999516119e-05, "loss": 0.0496, "step": 1640 }, { "epoch": 3.36734693877551, "grad_norm": 0.47759726643562317, "learning_rate": 7.918413927801164e-05, "loss": 0.0555, "step": 1650 }, { "epoch": 3.387755102040816, "grad_norm": 0.47755759954452515, "learning_rate": 7.890947931408613e-05, "loss": 0.0468, "step": 1660 }, { "epoch": 3.4081632653061225, "grad_norm": 0.34949928522109985, "learning_rate": 7.863350261329177e-05, "loss": 0.0459, "step": 1670 }, { "epoch": 3.4285714285714284, "grad_norm": 0.39500531554222107, "learning_rate": 7.835622174550891e-05, "loss": 0.0545, "step": 1680 }, { "epoch": 3.4489795918367347, "grad_norm": 0.3671438992023468, "learning_rate": 7.807764934001874e-05, "loss": 0.0542, "step": 1690 }, { "epoch": 3.4693877551020407, "grad_norm": 0.4870457351207733, "learning_rate": 7.779779808492788e-05, "loss": 0.0525, "step": 1700 }, { "epoch": 3.489795918367347, "grad_norm": 0.6565643548965454, "learning_rate": 7.751668072659061e-05, "loss": 0.0559, "step": 1710 }, { "epoch": 3.510204081632653, "grad_norm": 0.4305172860622406, "learning_rate": 7.723431006902829e-05, "loss": 0.0516, "step": 1720 }, { "epoch": 3.5306122448979593, "grad_norm": 0.44627729058265686, "learning_rate": 7.695069897334612e-05, "loss": 0.0495, "step": 1730 }, { "epoch": 3.5510204081632653, "grad_norm": 0.39987713098526, "learning_rate": 7.666586035714745e-05, "loss": 0.0521, "step": 1740 }, { "epoch": 3.571428571428571, "grad_norm": 0.39626452326774597, "learning_rate": 7.637980719394533e-05, "loss": 0.0605, "step": 1750 }, { "epoch": 3.5918367346938775, "grad_norm": 0.7232034206390381, "learning_rate": 7.609255251257168e-05, "loss": 0.0626, "step": 1760 }, { "epoch": 3.612244897959184, "grad_norm": 0.4168018102645874, "learning_rate": 7.580410939658383e-05, "loss": 0.0525, "step": 1770 }, { "epoch": 3.63265306122449, "grad_norm": 0.45826172828674316, "learning_rate": 7.551449098366857e-05, "loss": 0.0587, "step": 1780 }, { "epoch": 3.6530612244897958, "grad_norm": 0.5224451422691345, "learning_rate": 7.522371046504385e-05, "loss": 0.0541, "step": 1790 }, { "epoch": 3.673469387755102, "grad_norm": 0.577448308467865, "learning_rate": 7.493178108485792e-05, "loss": 0.0591, "step": 1800 }, { "epoch": 3.693877551020408, "grad_norm": 0.5018607974052429, "learning_rate": 7.463871613958608e-05, "loss": 0.0466, "step": 1810 }, { "epoch": 3.7142857142857144, "grad_norm": 0.38134491443634033, "learning_rate": 7.434452897742513e-05, "loss": 0.0527, "step": 1820 }, { "epoch": 3.7346938775510203, "grad_norm": 0.4851766526699066, "learning_rate": 7.40492329976853e-05, "loss": 0.0524, "step": 1830 }, { "epoch": 3.7551020408163263, "grad_norm": 0.4203612804412842, "learning_rate": 7.375284165018003e-05, "loss": 0.0548, "step": 1840 }, { "epoch": 3.7755102040816326, "grad_norm": 0.41804400086402893, "learning_rate": 7.345536843461342e-05, "loss": 0.0526, "step": 1850 }, { "epoch": 3.795918367346939, "grad_norm": 0.45135030150413513, "learning_rate": 7.31568268999652e-05, "loss": 0.0602, "step": 1860 }, { "epoch": 3.816326530612245, "grad_norm": 0.4034428894519806, "learning_rate": 7.285723064387373e-05, "loss": 0.0529, "step": 1870 }, { "epoch": 3.836734693877551, "grad_norm": 0.48290595412254333, "learning_rate": 7.255659331201673e-05, "loss": 0.0547, "step": 1880 }, { "epoch": 3.857142857142857, "grad_norm": 0.7128674387931824, "learning_rate": 7.22549285974896e-05, "loss": 0.0522, "step": 1890 }, { "epoch": 3.877551020408163, "grad_norm": 0.5425318479537964, "learning_rate": 7.195225024018187e-05, "loss": 0.0582, "step": 1900 }, { "epoch": 3.8979591836734695, "grad_norm": 0.4822893440723419, "learning_rate": 7.164857202615129e-05, "loss": 0.0508, "step": 1910 }, { "epoch": 3.9183673469387754, "grad_norm": 0.5935354828834534, "learning_rate": 7.134390778699604e-05, "loss": 0.0482, "step": 1920 }, { "epoch": 3.938775510204082, "grad_norm": 0.372458279132843, "learning_rate": 7.103827139922465e-05, "loss": 0.0503, "step": 1930 }, { "epoch": 3.9591836734693877, "grad_norm": 0.46793851256370544, "learning_rate": 7.073167678362401e-05, "loss": 0.0567, "step": 1940 }, { "epoch": 3.979591836734694, "grad_norm": 0.5541433095932007, "learning_rate": 7.042413790462528e-05, "loss": 0.0564, "step": 1950 }, { "epoch": 4.0, "grad_norm": 0.5569531917572021, "learning_rate": 7.011566876966786e-05, "loss": 0.0536, "step": 1960 }, { "epoch": 4.020408163265306, "grad_norm": 0.48454782366752625, "learning_rate": 6.980628342856152e-05, "loss": 0.0545, "step": 1970 }, { "epoch": 4.040816326530612, "grad_norm": 0.5605842471122742, "learning_rate": 6.949599597284625e-05, "loss": 0.0533, "step": 1980 }, { "epoch": 4.061224489795919, "grad_norm": 0.5089229941368103, "learning_rate": 6.91848205351506e-05, "loss": 0.0497, "step": 1990 }, { "epoch": 4.081632653061225, "grad_norm": 0.42554935812950134, "learning_rate": 6.8872771288548e-05, "loss": 0.0495, "step": 2000 }, { "epoch": 4.1020408163265305, "grad_norm": 0.6508166790008545, "learning_rate": 6.855986244591104e-05, "loss": 0.0499, "step": 2010 }, { "epoch": 4.122448979591836, "grad_norm": 0.46225705742836, "learning_rate": 6.824610825926433e-05, "loss": 0.0484, "step": 2020 }, { "epoch": 4.142857142857143, "grad_norm": 0.38654595613479614, "learning_rate": 6.793152301913523e-05, "loss": 0.0525, "step": 2030 }, { "epoch": 4.163265306122449, "grad_norm": 0.43045929074287415, "learning_rate": 6.7616121053903e-05, "loss": 0.0601, "step": 2040 }, { "epoch": 4.183673469387755, "grad_norm": 0.29230445623397827, "learning_rate": 6.72999167291462e-05, "loss": 0.0442, "step": 2050 }, { "epoch": 4.204081632653061, "grad_norm": 0.4340389370918274, "learning_rate": 6.698292444698839e-05, "loss": 0.0455, "step": 2060 }, { "epoch": 4.224489795918367, "grad_norm": 0.6728600859642029, "learning_rate": 6.666515864544209e-05, "loss": 0.0419, "step": 2070 }, { "epoch": 4.244897959183674, "grad_norm": 0.4668474495410919, "learning_rate": 6.634663379775125e-05, "loss": 0.0466, "step": 2080 }, { "epoch": 4.26530612244898, "grad_norm": 0.35625386238098145, "learning_rate": 6.602736441173204e-05, "loss": 0.0408, "step": 2090 }, { "epoch": 4.285714285714286, "grad_norm": 0.29537519812583923, "learning_rate": 6.570736502911197e-05, "loss": 0.0401, "step": 2100 }, { "epoch": 4.3061224489795915, "grad_norm": 0.4634284973144531, "learning_rate": 6.538665022486767e-05, "loss": 0.0476, "step": 2110 }, { "epoch": 4.326530612244898, "grad_norm": 0.3917020559310913, "learning_rate": 6.506523460656103e-05, "loss": 0.0441, "step": 2120 }, { "epoch": 4.346938775510204, "grad_norm": 0.48713311553001404, "learning_rate": 6.474313281367373e-05, "loss": 0.0481, "step": 2130 }, { "epoch": 4.36734693877551, "grad_norm": 0.4484109580516815, "learning_rate": 6.442035951694068e-05, "loss": 0.044, "step": 2140 }, { "epoch": 4.387755102040816, "grad_norm": 0.5104283690452576, "learning_rate": 6.409692941768166e-05, "loss": 0.0562, "step": 2150 }, { "epoch": 4.408163265306122, "grad_norm": 0.5283732414245605, "learning_rate": 6.377285724713176e-05, "loss": 0.045, "step": 2160 }, { "epoch": 4.428571428571429, "grad_norm": 0.4185742735862732, "learning_rate": 6.344815776577041e-05, "loss": 0.044, "step": 2170 }, { "epoch": 4.448979591836735, "grad_norm": 0.5266509056091309, "learning_rate": 6.312284576264913e-05, "loss": 0.0475, "step": 2180 }, { "epoch": 4.469387755102041, "grad_norm": 0.34054675698280334, "learning_rate": 6.279693605471787e-05, "loss": 0.0508, "step": 2190 }, { "epoch": 4.489795918367347, "grad_norm": 0.42366960644721985, "learning_rate": 6.24704434861502e-05, "loss": 0.0483, "step": 2200 }, { "epoch": 4.510204081632653, "grad_norm": 0.46188119053840637, "learning_rate": 6.214338292766715e-05, "loss": 0.0413, "step": 2210 }, { "epoch": 4.530612244897959, "grad_norm": 0.5456094741821289, "learning_rate": 6.181576927585992e-05, "loss": 0.0362, "step": 2220 }, { "epoch": 4.551020408163265, "grad_norm": 0.3864874243736267, "learning_rate": 6.148761745251146e-05, "loss": 0.0413, "step": 2230 }, { "epoch": 4.571428571428571, "grad_norm": 0.3080037534236908, "learning_rate": 6.115894240391666e-05, "loss": 0.0442, "step": 2240 }, { "epoch": 4.591836734693878, "grad_norm": 0.3093554675579071, "learning_rate": 6.082975910020179e-05, "loss": 0.0455, "step": 2250 }, { "epoch": 4.612244897959184, "grad_norm": 0.3666916489601135, "learning_rate": 6.0500082534642464e-05, "loss": 0.0465, "step": 2260 }, { "epoch": 4.63265306122449, "grad_norm": 0.3994399607181549, "learning_rate": 6.0169927722980935e-05, "loss": 0.045, "step": 2270 }, { "epoch": 4.653061224489796, "grad_norm": 0.4581643044948578, "learning_rate": 5.9839309702741995e-05, "loss": 0.0477, "step": 2280 }, { "epoch": 4.673469387755102, "grad_norm": 0.47960394620895386, "learning_rate": 5.950824353254818e-05, "loss": 0.0451, "step": 2290 }, { "epoch": 4.6938775510204085, "grad_norm": 0.4794948399066925, "learning_rate": 5.9176744291433895e-05, "loss": 0.0426, "step": 2300 }, { "epoch": 4.714285714285714, "grad_norm": 0.39106351137161255, "learning_rate": 5.8844827078158525e-05, "loss": 0.0473, "step": 2310 }, { "epoch": 4.73469387755102, "grad_norm": 0.48323217034339905, "learning_rate": 5.851250701051881e-05, "loss": 0.0439, "step": 2320 }, { "epoch": 4.755102040816326, "grad_norm": 0.397773802280426, "learning_rate": 5.817979922466028e-05, "loss": 0.0481, "step": 2330 }, { "epoch": 4.775510204081632, "grad_norm": 0.9115492105484009, "learning_rate": 5.784671887438775e-05, "loss": 0.047, "step": 2340 }, { "epoch": 4.795918367346939, "grad_norm": 0.4305925965309143, "learning_rate": 5.751328113047527e-05, "loss": 0.0492, "step": 2350 }, { "epoch": 4.816326530612245, "grad_norm": 0.43962588906288147, "learning_rate": 5.717950117997501e-05, "loss": 0.0457, "step": 2360 }, { "epoch": 4.836734693877551, "grad_norm": 0.47304484248161316, "learning_rate": 5.6845394225525605e-05, "loss": 0.0435, "step": 2370 }, { "epoch": 4.857142857142857, "grad_norm": 0.3434034287929535, "learning_rate": 5.6510975484659675e-05, "loss": 0.0399, "step": 2380 }, { "epoch": 4.877551020408164, "grad_norm": 0.4160457253456116, "learning_rate": 5.617626018911079e-05, "loss": 0.0494, "step": 2390 }, { "epoch": 4.8979591836734695, "grad_norm": 0.3816176652908325, "learning_rate": 5.5841263584119594e-05, "loss": 0.048, "step": 2400 }, { "epoch": 4.918367346938775, "grad_norm": 0.5131191611289978, "learning_rate": 5.550600092773959e-05, "loss": 0.0409, "step": 2410 }, { "epoch": 4.938775510204081, "grad_norm": 0.37583550810813904, "learning_rate": 5.5170487490142006e-05, "loss": 0.0445, "step": 2420 }, { "epoch": 4.959183673469388, "grad_norm": 0.397798091173172, "learning_rate": 5.483473855292043e-05, "loss": 0.04, "step": 2430 }, { "epoch": 4.979591836734694, "grad_norm": 0.5067810416221619, "learning_rate": 5.449876940839472e-05, "loss": 0.0476, "step": 2440 }, { "epoch": 5.0, "grad_norm": 0.5477608442306519, "learning_rate": 5.416259535891447e-05, "loss": 0.0454, "step": 2450 }, { "epoch": 5.020408163265306, "grad_norm": 0.5660824179649353, "learning_rate": 5.3826231716162076e-05, "loss": 0.048, "step": 2460 }, { "epoch": 5.040816326530612, "grad_norm": 0.2889501750469208, "learning_rate": 5.348969380045531e-05, "loss": 0.0438, "step": 2470 }, { "epoch": 5.061224489795919, "grad_norm": 0.426714152097702, "learning_rate": 5.315299694004954e-05, "loss": 0.0404, "step": 2480 }, { "epoch": 5.081632653061225, "grad_norm": 0.34856680035591125, "learning_rate": 5.281615647043956e-05, "loss": 0.0394, "step": 2490 }, { "epoch": 5.1020408163265305, "grad_norm": 0.5608141422271729, "learning_rate": 5.247918773366112e-05, "loss": 0.0402, "step": 2500 }, { "epoch": 5.122448979591836, "grad_norm": 0.5281340479850769, "learning_rate": 5.2142106077592155e-05, "loss": 0.0397, "step": 2510 }, { "epoch": 5.142857142857143, "grad_norm": 0.48721784353256226, "learning_rate": 5.18049268552537e-05, "loss": 0.0435, "step": 2520 }, { "epoch": 5.163265306122449, "grad_norm": 0.4792710542678833, "learning_rate": 5.1467665424110657e-05, "loss": 0.0406, "step": 2530 }, { "epoch": 5.183673469387755, "grad_norm": 0.5936989784240723, "learning_rate": 5.113033714537226e-05, "loss": 0.0393, "step": 2540 }, { "epoch": 5.204081632653061, "grad_norm": 0.5217839479446411, "learning_rate": 5.079295738329245e-05, "loss": 0.042, "step": 2550 }, { "epoch": 5.224489795918367, "grad_norm": 0.4166584610939026, "learning_rate": 5.0455541504470086e-05, "loss": 0.0416, "step": 2560 }, { "epoch": 5.244897959183674, "grad_norm": 0.3131585419178009, "learning_rate": 5.011810487714901e-05, "loss": 0.0435, "step": 2570 }, { "epoch": 5.26530612244898, "grad_norm": 0.39630407094955444, "learning_rate": 4.97806628705181e-05, "loss": 0.0423, "step": 2580 }, { "epoch": 5.285714285714286, "grad_norm": 0.5689300298690796, "learning_rate": 4.9443230854011255e-05, "loss": 0.0443, "step": 2590 }, { "epoch": 5.3061224489795915, "grad_norm": 0.36981523036956787, "learning_rate": 4.910582419660735e-05, "loss": 0.0381, "step": 2600 }, { "epoch": 5.326530612244898, "grad_norm": 0.5300806164741516, "learning_rate": 4.876845826613026e-05, "loss": 0.0385, "step": 2610 }, { "epoch": 5.346938775510204, "grad_norm": 0.37739285826683044, "learning_rate": 4.843114842854881e-05, "loss": 0.0389, "step": 2620 }, { "epoch": 5.36734693877551, "grad_norm": 0.288785457611084, "learning_rate": 4.8093910047277e-05, "loss": 0.0397, "step": 2630 }, { "epoch": 5.387755102040816, "grad_norm": 0.37196066975593567, "learning_rate": 4.775675848247427e-05, "loss": 0.0365, "step": 2640 }, { "epoch": 5.408163265306122, "grad_norm": 0.4045736789703369, "learning_rate": 4.7419709090345734e-05, "loss": 0.0357, "step": 2650 }, { "epoch": 5.428571428571429, "grad_norm": 0.3250954747200012, "learning_rate": 4.7082777222442954e-05, "loss": 0.041, "step": 2660 }, { "epoch": 5.448979591836735, "grad_norm": 0.4290127754211426, "learning_rate": 4.6745978224964574e-05, "loss": 0.0419, "step": 2670 }, { "epoch": 5.469387755102041, "grad_norm": 0.40228599309921265, "learning_rate": 4.640932743805745e-05, "loss": 0.0406, "step": 2680 }, { "epoch": 5.489795918367347, "grad_norm": 0.42489856481552124, "learning_rate": 4.607284019511784e-05, "loss": 0.0353, "step": 2690 }, { "epoch": 5.510204081632653, "grad_norm": 0.6084537506103516, "learning_rate": 4.5736531822093136e-05, "loss": 0.0419, "step": 2700 }, { "epoch": 5.530612244897959, "grad_norm": 0.4374919533729553, "learning_rate": 4.540041763678378e-05, "loss": 0.0377, "step": 2710 }, { "epoch": 5.551020408163265, "grad_norm": 0.4828069508075714, "learning_rate": 4.5064512948145474e-05, "loss": 0.0408, "step": 2720 }, { "epoch": 5.571428571428571, "grad_norm": 0.4128870964050293, "learning_rate": 4.472883305559209e-05, "loss": 0.043, "step": 2730 }, { "epoch": 5.591836734693878, "grad_norm": 0.47663477063179016, "learning_rate": 4.439339324829872e-05, "loss": 0.0429, "step": 2740 }, { "epoch": 5.612244897959184, "grad_norm": 0.46140581369400024, "learning_rate": 4.405820880450529e-05, "loss": 0.0392, "step": 2750 }, { "epoch": 5.63265306122449, "grad_norm": 0.4380878806114197, "learning_rate": 4.372329499082072e-05, "loss": 0.042, "step": 2760 }, { "epoch": 5.653061224489796, "grad_norm": 0.34445661306381226, "learning_rate": 4.33886670615276e-05, "loss": 0.0449, "step": 2770 }, { "epoch": 5.673469387755102, "grad_norm": 0.4100647270679474, "learning_rate": 4.305434025788734e-05, "loss": 0.0401, "step": 2780 }, { "epoch": 5.6938775510204085, "grad_norm": 0.31862297654151917, "learning_rate": 4.272032980744603e-05, "loss": 0.0344, "step": 2790 }, { "epoch": 5.714285714285714, "grad_norm": 0.5393098592758179, "learning_rate": 4.238665092334084e-05, "loss": 0.039, "step": 2800 }, { "epoch": 5.73469387755102, "grad_norm": 0.42914947867393494, "learning_rate": 4.2053318803607156e-05, "loss": 0.0425, "step": 2810 }, { "epoch": 5.755102040816326, "grad_norm": 0.3658852279186249, "learning_rate": 4.172034863048631e-05, "loss": 0.029, "step": 2820 }, { "epoch": 5.775510204081632, "grad_norm": 0.3325256109237671, "learning_rate": 4.138775556973406e-05, "loss": 0.0407, "step": 2830 }, { "epoch": 5.795918367346939, "grad_norm": 0.3700428903102875, "learning_rate": 4.105555476992992e-05, "loss": 0.0393, "step": 2840 }, { "epoch": 5.816326530612245, "grad_norm": 0.4134577810764313, "learning_rate": 4.0723761361787116e-05, "loss": 0.0388, "step": 2850 }, { "epoch": 5.836734693877551, "grad_norm": 0.4173426032066345, "learning_rate": 4.039239045746342e-05, "loss": 0.0404, "step": 2860 }, { "epoch": 5.857142857142857, "grad_norm": 0.3706321716308594, "learning_rate": 4.006145714987293e-05, "loss": 0.0367, "step": 2870 }, { "epoch": 5.877551020408164, "grad_norm": 0.34077945351600647, "learning_rate": 3.973097651199853e-05, "loss": 0.0349, "step": 2880 }, { "epoch": 5.8979591836734695, "grad_norm": 0.48339715600013733, "learning_rate": 3.9400963596205434e-05, "loss": 0.0359, "step": 2890 }, { "epoch": 5.918367346938775, "grad_norm": 0.47534608840942383, "learning_rate": 3.9071433433555525e-05, "loss": 0.04, "step": 2900 }, { "epoch": 5.938775510204081, "grad_norm": 0.4137221872806549, "learning_rate": 3.874240103312282e-05, "loss": 0.0489, "step": 2910 }, { "epoch": 5.959183673469388, "grad_norm": 0.3204796314239502, "learning_rate": 3.841388138130984e-05, "loss": 0.0341, "step": 2920 }, { "epoch": 5.979591836734694, "grad_norm": 0.5732039213180542, "learning_rate": 3.808588944116491e-05, "loss": 0.0342, "step": 2930 }, { "epoch": 6.0, "grad_norm": 0.7128877639770508, "learning_rate": 3.775844015170083e-05, "loss": 0.0374, "step": 2940 }, { "epoch": 6.020408163265306, "grad_norm": 0.33346930146217346, "learning_rate": 3.7431548427214295e-05, "loss": 0.0358, "step": 2950 }, { "epoch": 6.040816326530612, "grad_norm": 0.41604581475257874, "learning_rate": 3.710522915660666e-05, "loss": 0.0456, "step": 2960 }, { "epoch": 6.061224489795919, "grad_norm": 0.344349205493927, "learning_rate": 3.677949720270578e-05, "loss": 0.0384, "step": 2970 }, { "epoch": 6.081632653061225, "grad_norm": 1.0503120422363281, "learning_rate": 3.645436740158908e-05, "loss": 0.0383, "step": 2980 }, { "epoch": 6.1020408163265305, "grad_norm": 0.35643085837364197, "learning_rate": 3.612985456190778e-05, "loss": 0.0312, "step": 2990 }, { "epoch": 6.122448979591836, "grad_norm": 0.2821815609931946, "learning_rate": 3.580597346421243e-05, "loss": 0.0342, "step": 3000 }, { "epoch": 6.142857142857143, "grad_norm": 0.3137131631374359, "learning_rate": 3.54827388602797e-05, "loss": 0.038, "step": 3010 }, { "epoch": 6.163265306122449, "grad_norm": 0.44174015522003174, "learning_rate": 3.5160165472440473e-05, "loss": 0.0379, "step": 3020 }, { "epoch": 6.183673469387755, "grad_norm": 0.3963649868965149, "learning_rate": 3.4838267992909325e-05, "loss": 0.035, "step": 3030 }, { "epoch": 6.204081632653061, "grad_norm": 0.3708857595920563, "learning_rate": 3.451706108311525e-05, "loss": 0.0342, "step": 3040 }, { "epoch": 6.224489795918367, "grad_norm": 0.3640486001968384, "learning_rate": 3.4196559373034006e-05, "loss": 0.0327, "step": 3050 }, { "epoch": 6.244897959183674, "grad_norm": 0.5147858262062073, "learning_rate": 3.387677746052165e-05, "loss": 0.0453, "step": 3060 }, { "epoch": 6.26530612244898, "grad_norm": 0.5431321859359741, "learning_rate": 3.3557729910649734e-05, "loss": 0.0376, "step": 3070 }, { "epoch": 6.285714285714286, "grad_norm": 0.41198182106018066, "learning_rate": 3.323943125504184e-05, "loss": 0.0354, "step": 3080 }, { "epoch": 6.3061224489795915, "grad_norm": 0.43326935172080994, "learning_rate": 3.292189599121176e-05, "loss": 0.0327, "step": 3090 }, { "epoch": 6.326530612244898, "grad_norm": 0.38192853331565857, "learning_rate": 3.260513858190319e-05, "loss": 0.0352, "step": 3100 }, { "epoch": 6.346938775510204, "grad_norm": 0.41123682260513306, "learning_rate": 3.228917345443092e-05, "loss": 0.0377, "step": 3110 }, { "epoch": 6.36734693877551, "grad_norm": 0.5325567126274109, "learning_rate": 3.19740150000238e-05, "loss": 0.0356, "step": 3120 }, { "epoch": 6.387755102040816, "grad_norm": 0.3802492916584015, "learning_rate": 3.165967757316925e-05, "loss": 0.0351, "step": 3130 }, { "epoch": 6.408163265306122, "grad_norm": 0.26050621271133423, "learning_rate": 3.134617549095941e-05, "loss": 0.0363, "step": 3140 }, { "epoch": 6.428571428571429, "grad_norm": 0.3772394359111786, "learning_rate": 3.103352303243905e-05, "loss": 0.0353, "step": 3150 }, { "epoch": 6.448979591836735, "grad_norm": 0.36528316140174866, "learning_rate": 3.072173443795527e-05, "loss": 0.0312, "step": 3160 }, { "epoch": 6.469387755102041, "grad_norm": 0.41683340072631836, "learning_rate": 3.0410823908508816e-05, "loss": 0.0447, "step": 3170 }, { "epoch": 6.489795918367347, "grad_norm": 0.41402310132980347, "learning_rate": 3.0100805605107312e-05, "loss": 0.0311, "step": 3180 }, { "epoch": 6.510204081632653, "grad_norm": 0.4392901062965393, "learning_rate": 2.979169364812028e-05, "loss": 0.0325, "step": 3190 }, { "epoch": 6.530612244897959, "grad_norm": 0.3072589933872223, "learning_rate": 2.948350211663594e-05, "loss": 0.0385, "step": 3200 }, { "epoch": 6.551020408163265, "grad_norm": 0.3144962787628174, "learning_rate": 2.917624504782006e-05, "loss": 0.0298, "step": 3210 }, { "epoch": 6.571428571428571, "grad_norm": 0.33626478910446167, "learning_rate": 2.886993643627644e-05, "loss": 0.033, "step": 3220 }, { "epoch": 6.591836734693878, "grad_norm": 0.4536080062389374, "learning_rate": 2.8564590233409693e-05, "loss": 0.0313, "step": 3230 }, { "epoch": 6.612244897959184, "grad_norm": 0.25697794556617737, "learning_rate": 2.826022034678969e-05, "loss": 0.0286, "step": 3240 }, { "epoch": 6.63265306122449, "grad_norm": 0.295259952545166, "learning_rate": 2.7956840639518077e-05, "loss": 0.0239, "step": 3250 }, { "epoch": 6.653061224489796, "grad_norm": 0.3460862934589386, "learning_rate": 2.7654464929596975e-05, "loss": 0.037, "step": 3260 }, { "epoch": 6.673469387755102, "grad_norm": 0.3951895534992218, "learning_rate": 2.7353106989299528e-05, "loss": 0.0306, "step": 3270 }, { "epoch": 6.6938775510204085, "grad_norm": 0.268476665019989, "learning_rate": 2.705278054454265e-05, "loss": 0.0299, "step": 3280 }, { "epoch": 6.714285714285714, "grad_norm": 0.37184420228004456, "learning_rate": 2.6753499274261795e-05, "loss": 0.0326, "step": 3290 }, { "epoch": 6.73469387755102, "grad_norm": 0.33349689841270447, "learning_rate": 2.6455276809788023e-05, "loss": 0.0269, "step": 3300 }, { "epoch": 6.755102040816326, "grad_norm": 0.30202341079711914, "learning_rate": 2.6158126734227072e-05, "loss": 0.0291, "step": 3310 }, { "epoch": 6.775510204081632, "grad_norm": 0.3205631971359253, "learning_rate": 2.5862062581840674e-05, "loss": 0.0308, "step": 3320 }, { "epoch": 6.795918367346939, "grad_norm": 0.2775018513202667, "learning_rate": 2.5567097837430155e-05, "loss": 0.0287, "step": 3330 }, { "epoch": 6.816326530612245, "grad_norm": 0.3472177982330322, "learning_rate": 2.5273245935722227e-05, "loss": 0.0356, "step": 3340 }, { "epoch": 6.836734693877551, "grad_norm": 0.40243059396743774, "learning_rate": 2.498052026075711e-05, "loss": 0.0327, "step": 3350 }, { "epoch": 6.857142857142857, "grad_norm": 0.34365713596343994, "learning_rate": 2.4688934145278825e-05, "loss": 0.0325, "step": 3360 }, { "epoch": 6.877551020408164, "grad_norm": 0.6802788972854614, "learning_rate": 2.439850087012806e-05, "loss": 0.0382, "step": 3370 }, { "epoch": 6.8979591836734695, "grad_norm": 0.3928966522216797, "learning_rate": 2.4109233663637215e-05, "loss": 0.0289, "step": 3380 }, { "epoch": 6.918367346938775, "grad_norm": 0.35136857628822327, "learning_rate": 2.3821145701027836e-05, "loss": 0.0375, "step": 3390 }, { "epoch": 6.938775510204081, "grad_norm": 0.42939406633377075, "learning_rate": 2.353425010381063e-05, "loss": 0.03, "step": 3400 }, { "epoch": 6.959183673469388, "grad_norm": 0.2758863866329193, "learning_rate": 2.3248559939187748e-05, "loss": 0.0339, "step": 3410 }, { "epoch": 6.979591836734694, "grad_norm": 0.4242722690105438, "learning_rate": 2.2964088219457664e-05, "loss": 0.0317, "step": 3420 }, { "epoch": 7.0, "grad_norm": 0.31365418434143066, "learning_rate": 2.2680847901422403e-05, "loss": 0.0316, "step": 3430 }, { "epoch": 7.020408163265306, "grad_norm": 0.3401188254356384, "learning_rate": 2.239885188579755e-05, "loss": 0.025, "step": 3440 }, { "epoch": 7.040816326530612, "grad_norm": 0.4100947678089142, "learning_rate": 2.211811301662457e-05, "loss": 0.0273, "step": 3450 }, { "epoch": 7.061224489795919, "grad_norm": 0.4056410491466522, "learning_rate": 2.183864408068577e-05, "loss": 0.0357, "step": 3460 }, { "epoch": 7.081632653061225, "grad_norm": 0.42886030673980713, "learning_rate": 2.1560457806921997e-05, "loss": 0.0299, "step": 3470 }, { "epoch": 7.1020408163265305, "grad_norm": 0.4000697731971741, "learning_rate": 2.128356686585282e-05, "loss": 0.0296, "step": 3480 }, { "epoch": 7.122448979591836, "grad_norm": 0.29271119832992554, "learning_rate": 2.1007983868999405e-05, "loss": 0.0294, "step": 3490 }, { "epoch": 7.142857142857143, "grad_norm": 0.3026485741138458, "learning_rate": 2.0733721368310144e-05, "loss": 0.0349, "step": 3500 }, { "epoch": 7.163265306122449, "grad_norm": 0.5303738713264465, "learning_rate": 2.0460791855588957e-05, "loss": 0.0292, "step": 3510 }, { "epoch": 7.183673469387755, "grad_norm": 0.47380366921424866, "learning_rate": 2.018920776192626e-05, "loss": 0.0275, "step": 3520 }, { "epoch": 7.204081632653061, "grad_norm": 0.4064404368400574, "learning_rate": 1.991898145713287e-05, "loss": 0.0229, "step": 3530 }, { "epoch": 7.224489795918367, "grad_norm": 0.38069799542427063, "learning_rate": 1.9650125249176475e-05, "loss": 0.0291, "step": 3540 }, { "epoch": 7.244897959183674, "grad_norm": 0.3179313540458679, "learning_rate": 1.938265138362118e-05, "loss": 0.0315, "step": 3550 }, { "epoch": 7.26530612244898, "grad_norm": 0.3878839612007141, "learning_rate": 1.9116572043069687e-05, "loss": 0.0313, "step": 3560 }, { "epoch": 7.285714285714286, "grad_norm": 0.32467424869537354, "learning_rate": 1.885189934660836e-05, "loss": 0.0285, "step": 3570 }, { "epoch": 7.3061224489795915, "grad_norm": 0.3442543148994446, "learning_rate": 1.8588645349255364e-05, "loss": 0.0305, "step": 3580 }, { "epoch": 7.326530612244898, "grad_norm": 0.44325685501098633, "learning_rate": 1.8326822041411524e-05, "loss": 0.0275, "step": 3590 }, { "epoch": 7.346938775510204, "grad_norm": 0.3210112750530243, "learning_rate": 1.806644134831415e-05, "loss": 0.0262, "step": 3600 }, { "epoch": 7.36734693877551, "grad_norm": 0.31204965710639954, "learning_rate": 1.7807515129494014e-05, "loss": 0.0293, "step": 3610 }, { "epoch": 7.387755102040816, "grad_norm": 0.33807334303855896, "learning_rate": 1.7550055178235058e-05, "loss": 0.0299, "step": 3620 }, { "epoch": 7.408163265306122, "grad_norm": 0.3368700444698334, "learning_rate": 1.7294073221037344e-05, "loss": 0.0321, "step": 3630 }, { "epoch": 7.428571428571429, "grad_norm": 0.401540607213974, "learning_rate": 1.703958091708282e-05, "loss": 0.0273, "step": 3640 }, { "epoch": 7.448979591836735, "grad_norm": 0.28700941801071167, "learning_rate": 1.678658985770445e-05, "loss": 0.0304, "step": 3650 }, { "epoch": 7.469387755102041, "grad_norm": 0.3458382487297058, "learning_rate": 1.6535111565858135e-05, "loss": 0.0286, "step": 3660 }, { "epoch": 7.489795918367347, "grad_norm": 0.4366629123687744, "learning_rate": 1.6285157495597904e-05, "loss": 0.0301, "step": 3670 }, { "epoch": 7.510204081632653, "grad_norm": 0.37988075613975525, "learning_rate": 1.6036739031554266e-05, "loss": 0.0341, "step": 3680 }, { "epoch": 7.530612244897959, "grad_norm": 0.3958030343055725, "learning_rate": 1.5789867488415632e-05, "loss": 0.0351, "step": 3690 }, { "epoch": 7.551020408163265, "grad_norm": 0.30942267179489136, "learning_rate": 1.5544554110412985e-05, "loss": 0.0334, "step": 3700 }, { "epoch": 7.571428571428571, "grad_norm": 0.4118305444717407, "learning_rate": 1.5300810070807697e-05, "loss": 0.0267, "step": 3710 }, { "epoch": 7.591836734693878, "grad_norm": 0.3243233859539032, "learning_rate": 1.5058646471382698e-05, "loss": 0.0374, "step": 3720 }, { "epoch": 7.612244897959184, "grad_norm": 0.32483312487602234, "learning_rate": 1.4818074341936782e-05, "loss": 0.0264, "step": 3730 }, { "epoch": 7.63265306122449, "grad_norm": 0.2870495915412903, "learning_rate": 1.4579104639782188e-05, "loss": 0.029, "step": 3740 }, { "epoch": 7.653061224489796, "grad_norm": 0.3924682140350342, "learning_rate": 1.4341748249245623e-05, "loss": 0.0317, "step": 3750 }, { "epoch": 7.673469387755102, "grad_norm": 0.30662891268730164, "learning_rate": 1.410601598117246e-05, "loss": 0.0284, "step": 3760 }, { "epoch": 7.6938775510204085, "grad_norm": 0.22942852973937988, "learning_rate": 1.3871918572434345e-05, "loss": 0.0328, "step": 3770 }, { "epoch": 7.714285714285714, "grad_norm": 0.4252789616584778, "learning_rate": 1.3639466685440132e-05, "loss": 0.0322, "step": 3780 }, { "epoch": 7.73469387755102, "grad_norm": 0.40231239795684814, "learning_rate": 1.3408670907650322e-05, "loss": 0.0276, "step": 3790 }, { "epoch": 7.755102040816326, "grad_norm": 0.25347328186035156, "learning_rate": 1.3179541751094787e-05, "loss": 0.0277, "step": 3800 }, { "epoch": 7.775510204081632, "grad_norm": 0.3201022148132324, "learning_rate": 1.2952089651893945e-05, "loss": 0.0262, "step": 3810 }, { "epoch": 7.795918367346939, "grad_norm": 0.3606759011745453, "learning_rate": 1.2726324969783515e-05, "loss": 0.0304, "step": 3820 }, { "epoch": 7.816326530612245, "grad_norm": 0.3817797601222992, "learning_rate": 1.2502257987642591e-05, "loss": 0.0325, "step": 3830 }, { "epoch": 7.836734693877551, "grad_norm": 0.3720795512199402, "learning_rate": 1.2279898911025333e-05, "loss": 0.027, "step": 3840 }, { "epoch": 7.857142857142857, "grad_norm": 0.5197123885154724, "learning_rate": 1.2059257867696073e-05, "loss": 0.0284, "step": 3850 }, { "epoch": 7.877551020408164, "grad_norm": 0.26620540022850037, "learning_rate": 1.1840344907168122e-05, "loss": 0.0236, "step": 3860 }, { "epoch": 7.8979591836734695, "grad_norm": 0.5102793574333191, "learning_rate": 1.1623170000245975e-05, "loss": 0.033, "step": 3870 }, { "epoch": 7.918367346938775, "grad_norm": 0.47227033972740173, "learning_rate": 1.1407743038571172e-05, "loss": 0.0328, "step": 3880 }, { "epoch": 7.938775510204081, "grad_norm": 0.17635785043239594, "learning_rate": 1.1194073834171804e-05, "loss": 0.0223, "step": 3890 }, { "epoch": 7.959183673469388, "grad_norm": 0.6158728003501892, "learning_rate": 1.0982172119015593e-05, "loss": 0.0322, "step": 3900 }, { "epoch": 7.979591836734694, "grad_norm": 0.29061493277549744, "learning_rate": 1.0772047544566589e-05, "loss": 0.0269, "step": 3910 }, { "epoch": 8.0, "grad_norm": 0.36250999569892883, "learning_rate": 1.056370968134564e-05, "loss": 0.0258, "step": 3920 }, { "epoch": 8.020408163265307, "grad_norm": 0.288967490196228, "learning_rate": 1.0357168018494462e-05, "loss": 0.0242, "step": 3930 }, { "epoch": 8.040816326530612, "grad_norm": 0.35259634256362915, "learning_rate": 1.0152431963343384e-05, "loss": 0.0248, "step": 3940 }, { "epoch": 8.061224489795919, "grad_norm": 0.2822493314743042, "learning_rate": 9.949510840982985e-06, "loss": 0.0249, "step": 3950 }, { "epoch": 8.081632653061224, "grad_norm": 0.3533857762813568, "learning_rate": 9.748413893839231e-06, "loss": 0.0272, "step": 3960 }, { "epoch": 8.10204081632653, "grad_norm": 0.4431054890155792, "learning_rate": 9.549150281252633e-06, "loss": 0.0276, "step": 3970 }, { "epoch": 8.122448979591837, "grad_norm": 0.5681186318397522, "learning_rate": 9.351729079061005e-06, "loss": 0.0263, "step": 3980 }, { "epoch": 8.142857142857142, "grad_norm": 0.26654306054115295, "learning_rate": 9.156159279186078e-06, "loss": 0.0254, "step": 3990 }, { "epoch": 8.16326530612245, "grad_norm": 0.9877157211303711, "learning_rate": 8.96244978922397e-06, "loss": 0.0239, "step": 4000 }, { "epoch": 8.183673469387756, "grad_norm": 0.30778566002845764, "learning_rate": 8.770609432039505e-06, "loss": 0.0281, "step": 4010 }, { "epoch": 8.204081632653061, "grad_norm": 0.5464140176773071, "learning_rate": 8.580646945364256e-06, "loss": 0.0243, "step": 4020 }, { "epoch": 8.224489795918368, "grad_norm": 0.34788644313812256, "learning_rate": 8.392570981398695e-06, "loss": 0.0307, "step": 4030 }, { "epoch": 8.244897959183673, "grad_norm": 0.31038105487823486, "learning_rate": 8.206390106418027e-06, "loss": 0.0249, "step": 4040 }, { "epoch": 8.26530612244898, "grad_norm": 0.2261275053024292, "learning_rate": 8.022112800382059e-06, "loss": 0.0234, "step": 4050 }, { "epoch": 8.285714285714286, "grad_norm": 0.2555391192436218, "learning_rate": 7.839747456548918e-06, "loss": 0.0298, "step": 4060 }, { "epoch": 8.306122448979592, "grad_norm": 0.2868110239505768, "learning_rate": 7.659302381092842e-06, "loss": 0.0295, "step": 4070 }, { "epoch": 8.326530612244898, "grad_norm": 0.2840665578842163, "learning_rate": 7.480785792725809e-06, "loss": 0.0214, "step": 4080 }, { "epoch": 8.346938775510203, "grad_norm": 0.3482186198234558, "learning_rate": 7.304205822323173e-06, "loss": 0.0265, "step": 4090 }, { "epoch": 8.36734693877551, "grad_norm": 0.3534049093723297, "learning_rate": 7.129570512553407e-06, "loss": 0.0267, "step": 4100 }, { "epoch": 8.387755102040817, "grad_norm": 0.2970373034477234, "learning_rate": 6.95688781751172e-06, "loss": 0.0271, "step": 4110 }, { "epoch": 8.408163265306122, "grad_norm": 0.3268330693244934, "learning_rate": 6.786165602357808e-06, "loss": 0.0275, "step": 4120 }, { "epoch": 8.428571428571429, "grad_norm": 0.5232375264167786, "learning_rate": 6.617411642957566e-06, "loss": 0.0248, "step": 4130 }, { "epoch": 8.448979591836734, "grad_norm": 0.37294477224349976, "learning_rate": 6.450633625529001e-06, "loss": 0.0283, "step": 4140 }, { "epoch": 8.46938775510204, "grad_norm": 0.3570898771286011, "learning_rate": 6.285839146292111e-06, "loss": 0.0276, "step": 4150 }, { "epoch": 8.489795918367347, "grad_norm": 0.28744009137153625, "learning_rate": 6.12303571112286e-06, "loss": 0.0203, "step": 4160 }, { "epoch": 8.510204081632653, "grad_norm": 0.4438728988170624, "learning_rate": 5.962230735211383e-06, "loss": 0.026, "step": 4170 }, { "epoch": 8.53061224489796, "grad_norm": 0.24973392486572266, "learning_rate": 5.803431542724192e-06, "loss": 0.0284, "step": 4180 }, { "epoch": 8.551020408163264, "grad_norm": 0.5999603271484375, "learning_rate": 5.646645366470621e-06, "loss": 0.0285, "step": 4190 }, { "epoch": 8.571428571428571, "grad_norm": 0.3535723090171814, "learning_rate": 5.491879347573353e-06, "loss": 0.0261, "step": 4200 }, { "epoch": 8.591836734693878, "grad_norm": 0.22815968096256256, "learning_rate": 5.3391405351432e-06, "loss": 0.0223, "step": 4210 }, { "epoch": 8.612244897959183, "grad_norm": 0.2869364023208618, "learning_rate": 5.188435885958037e-06, "loss": 0.0258, "step": 4220 }, { "epoch": 8.63265306122449, "grad_norm": 0.25079235434532166, "learning_rate": 5.039772264145887e-06, "loss": 0.0198, "step": 4230 }, { "epoch": 8.653061224489797, "grad_norm": 0.3177327811717987, "learning_rate": 4.8931564408723656e-06, "loss": 0.023, "step": 4240 }, { "epoch": 8.673469387755102, "grad_norm": 0.3457529544830322, "learning_rate": 4.748595094032221e-06, "loss": 0.0274, "step": 4250 }, { "epoch": 8.693877551020408, "grad_norm": 0.4598150849342346, "learning_rate": 4.606094807945199e-06, "loss": 0.0333, "step": 4260 }, { "epoch": 8.714285714285714, "grad_norm": 0.2446356564760208, "learning_rate": 4.465662073056109e-06, "loss": 0.0238, "step": 4270 }, { "epoch": 8.73469387755102, "grad_norm": 0.16839979588985443, "learning_rate": 4.327303285639262e-06, "loss": 0.0218, "step": 4280 }, { "epoch": 8.755102040816327, "grad_norm": 0.2948671281337738, "learning_rate": 4.1910247475071104e-06, "loss": 0.0262, "step": 4290 }, { "epoch": 8.775510204081632, "grad_norm": 0.28787535429000854, "learning_rate": 4.0568326657231985e-06, "loss": 0.0255, "step": 4300 }, { "epoch": 8.795918367346939, "grad_norm": 0.25958192348480225, "learning_rate": 3.924733152319493e-06, "loss": 0.0207, "step": 4310 }, { "epoch": 8.816326530612244, "grad_norm": 0.3317425549030304, "learning_rate": 3.794732224017994e-06, "loss": 0.0268, "step": 4320 }, { "epoch": 8.83673469387755, "grad_norm": 0.3428516685962677, "learning_rate": 3.6668358019566305e-06, "loss": 0.0212, "step": 4330 }, { "epoch": 8.857142857142858, "grad_norm": 0.365178644657135, "learning_rate": 3.541049711419664e-06, "loss": 0.0219, "step": 4340 }, { "epoch": 8.877551020408163, "grad_norm": 0.2565886378288269, "learning_rate": 3.417379681572297e-06, "loss": 0.0244, "step": 4350 }, { "epoch": 8.89795918367347, "grad_norm": 0.3833511471748352, "learning_rate": 3.295831345199746e-06, "loss": 0.021, "step": 4360 }, { "epoch": 8.918367346938776, "grad_norm": 0.2949051260948181, "learning_rate": 3.1764102384507e-06, "loss": 0.0259, "step": 4370 }, { "epoch": 8.938775510204081, "grad_norm": 0.5416157841682434, "learning_rate": 3.059121800585152e-06, "loss": 0.0196, "step": 4380 }, { "epoch": 8.959183673469388, "grad_norm": 0.327238529920578, "learning_rate": 2.9439713737266504e-06, "loss": 0.0256, "step": 4390 }, { "epoch": 8.979591836734693, "grad_norm": 0.4477047622203827, "learning_rate": 2.8309642026190075e-06, "loss": 0.0275, "step": 4400 }, { "epoch": 9.0, "grad_norm": 0.42429378628730774, "learning_rate": 2.720105434387382e-06, "loss": 0.0292, "step": 4410 }, { "epoch": 9.020408163265307, "grad_norm": 0.35936760902404785, "learning_rate": 2.6114001183038806e-06, "loss": 0.0228, "step": 4420 }, { "epoch": 9.040816326530612, "grad_norm": 0.4515119194984436, "learning_rate": 2.5048532055575615e-06, "loss": 0.0275, "step": 4430 }, { "epoch": 9.061224489795919, "grad_norm": 0.28619876503944397, "learning_rate": 2.400469549028922e-06, "loss": 0.0256, "step": 4440 }, { "epoch": 9.081632653061224, "grad_norm": 0.3474380075931549, "learning_rate": 2.2982539030688475e-06, "loss": 0.0307, "step": 4450 }, { "epoch": 9.10204081632653, "grad_norm": 0.21155385673046112, "learning_rate": 2.1982109232821178e-06, "loss": 0.0246, "step": 4460 }, { "epoch": 9.122448979591837, "grad_norm": 0.33730846643447876, "learning_rate": 2.1003451663153095e-06, "loss": 0.0203, "step": 4470 }, { "epoch": 9.142857142857142, "grad_norm": 0.3035857379436493, "learning_rate": 2.0046610896492624e-06, "loss": 0.021, "step": 4480 }, { "epoch": 9.16326530612245, "grad_norm": 0.7528371214866638, "learning_rate": 1.9111630513960755e-06, "loss": 0.0211, "step": 4490 }, { "epoch": 9.183673469387756, "grad_norm": 0.3337666392326355, "learning_rate": 1.8198553101005987e-06, "loss": 0.0278, "step": 4500 }, { "epoch": 9.204081632653061, "grad_norm": 0.7967633605003357, "learning_rate": 1.7307420245464667e-06, "loss": 0.0315, "step": 4510 }, { "epoch": 9.224489795918368, "grad_norm": 0.22648100554943085, "learning_rate": 1.643827253566671e-06, "loss": 0.0268, "step": 4520 }, { "epoch": 9.244897959183673, "grad_norm": 0.2347017526626587, "learning_rate": 1.5591149558587038e-06, "loss": 0.0232, "step": 4530 }, { "epoch": 9.26530612244898, "grad_norm": 0.38450396060943604, "learning_rate": 1.4766089898042678e-06, "loss": 0.023, "step": 4540 }, { "epoch": 9.285714285714286, "grad_norm": 0.49666064977645874, "learning_rate": 1.3963131132935015e-06, "loss": 0.021, "step": 4550 }, { "epoch": 9.306122448979592, "grad_norm": 0.35134318470954895, "learning_rate": 1.3182309835538365e-06, "loss": 0.0213, "step": 4560 }, { "epoch": 9.326530612244898, "grad_norm": 0.24812936782836914, "learning_rate": 1.2423661569834489e-06, "loss": 0.0248, "step": 4570 }, { "epoch": 9.346938775510203, "grad_norm": 0.2637794315814972, "learning_rate": 1.168722088989227e-06, "loss": 0.022, "step": 4580 }, { "epoch": 9.36734693877551, "grad_norm": 0.25408467650413513, "learning_rate": 1.0973021338294143e-06, "loss": 0.025, "step": 4590 }, { "epoch": 9.387755102040817, "grad_norm": 0.2591371536254883, "learning_rate": 1.0281095444608423e-06, "loss": 0.0228, "step": 4600 }, { "epoch": 9.408163265306122, "grad_norm": 0.2628161609172821, "learning_rate": 9.611474723907666e-07, "loss": 0.0274, "step": 4610 }, { "epoch": 9.428571428571429, "grad_norm": 0.24642103910446167, "learning_rate": 8.964189675332912e-07, "loss": 0.0218, "step": 4620 }, { "epoch": 9.448979591836734, "grad_norm": 0.2833108901977539, "learning_rate": 8.33926978070504e-07, "loss": 0.0242, "step": 4630 }, { "epoch": 9.46938775510204, "grad_norm": 0.17190630733966827, "learning_rate": 7.736743503181543e-07, "loss": 0.0226, "step": 4640 }, { "epoch": 9.489795918367347, "grad_norm": 0.28679272532463074, "learning_rate": 7.156638285960471e-07, "loss": 0.0245, "step": 4650 }, { "epoch": 9.510204081632653, "grad_norm": 0.3385928273200989, "learning_rate": 6.598980551030143e-07, "loss": 0.0271, "step": 4660 }, { "epoch": 9.53061224489796, "grad_norm": 0.3173186779022217, "learning_rate": 6.063795697966057e-07, "loss": 0.0251, "step": 4670 }, { "epoch": 9.551020408163264, "grad_norm": 0.28356119990348816, "learning_rate": 5.551108102773761e-07, "loss": 0.0207, "step": 4680 }, { "epoch": 9.571428571428571, "grad_norm": 0.3257679045200348, "learning_rate": 5.060941116778572e-07, "loss": 0.0272, "step": 4690 }, { "epoch": 9.591836734693878, "grad_norm": 0.36799612641334534, "learning_rate": 4.5933170655622083e-07, "loss": 0.0263, "step": 4700 }, { "epoch": 9.612244897959183, "grad_norm": 0.5192159414291382, "learning_rate": 4.1482572479458193e-07, "loss": 0.0241, "step": 4710 }, { "epoch": 9.63265306122449, "grad_norm": 0.24428972601890564, "learning_rate": 3.725781935019934e-07, "loss": 0.025, "step": 4720 }, { "epoch": 9.653061224489797, "grad_norm": 0.35796719789505005, "learning_rate": 3.3259103692209747e-07, "loss": 0.0223, "step": 4730 }, { "epoch": 9.673469387755102, "grad_norm": 0.264077365398407, "learning_rate": 2.9486607634551797e-07, "loss": 0.0225, "step": 4740 }, { "epoch": 9.693877551020408, "grad_norm": 0.6665878295898438, "learning_rate": 2.594050300268769e-07, "loss": 0.0228, "step": 4750 }, { "epoch": 9.714285714285714, "grad_norm": 0.3441040813922882, "learning_rate": 2.2620951310653448e-07, "loss": 0.0259, "step": 4760 }, { "epoch": 9.73469387755102, "grad_norm": 0.322299599647522, "learning_rate": 1.9528103753704842e-07, "loss": 0.0254, "step": 4770 }, { "epoch": 9.755102040816327, "grad_norm": 0.3630422055721283, "learning_rate": 1.666210120142897e-07, "loss": 0.0256, "step": 4780 }, { "epoch": 9.775510204081632, "grad_norm": 0.298095166683197, "learning_rate": 1.4023074191327756e-07, "loss": 0.0231, "step": 4790 }, { "epoch": 9.795918367346939, "grad_norm": 0.30586209893226624, "learning_rate": 1.1611142922874352e-07, "loss": 0.0199, "step": 4800 }, { "epoch": 9.816326530612244, "grad_norm": 0.4290463924407959, "learning_rate": 9.426417252035858e-08, "loss": 0.0234, "step": 4810 }, { "epoch": 9.83673469387755, "grad_norm": 0.2478075474500656, "learning_rate": 7.468996686271768e-08, "loss": 0.0234, "step": 4820 }, { "epoch": 9.857142857142858, "grad_norm": 0.40217646956443787, "learning_rate": 5.73897038000093e-08, "loss": 0.0252, "step": 4830 }, { "epoch": 9.877551020408163, "grad_norm": 0.3873520791530609, "learning_rate": 4.236417130540904e-08, "loss": 0.0325, "step": 4840 }, { "epoch": 9.89795918367347, "grad_norm": 0.32106974720954895, "learning_rate": 2.961405374519166e-08, "loss": 0.0237, "step": 4850 }, { "epoch": 9.918367346938776, "grad_norm": 0.28574123978614807, "learning_rate": 1.9139931847556026e-08, "loss": 0.0239, "step": 4860 }, { "epoch": 9.938775510204081, "grad_norm": 0.19580473005771637, "learning_rate": 1.0942282676174032e-08, "loss": 0.026, "step": 4870 }, { "epoch": 9.959183673469388, "grad_norm": 0.2628851532936096, "learning_rate": 5.021479608474655e-09, "loss": 0.0271, "step": 4880 }, { "epoch": 9.979591836734693, "grad_norm": 0.2211391031742096, "learning_rate": 1.377792318624227e-09, "loss": 0.0261, "step": 4890 }, { "epoch": 10.0, "grad_norm": 0.34537559747695923, "learning_rate": 1.1386765241816477e-11, "loss": 0.0276, "step": 4900 }, { "epoch": 10.0, "step": 4900, "total_flos": 0.0, "train_loss": 0.05459458509270026, "train_runtime": 4855.4962, "train_samples_per_second": 32.258, "train_steps_per_second": 1.009 } ], "logging_steps": 10, "max_steps": 4900, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }