diff --git "a/checkpoint-20000/trainer_state.json" "b/checkpoint-20000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-20000/trainer_state.json" @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.398496240601503, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004699248120300752, + "grad_norm": 4.17242956161499, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.0329, + "step": 10 + }, + { + "epoch": 0.009398496240601503, + "grad_norm": 6.51262092590332, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.0723, + "step": 20 + }, + { + "epoch": 0.014097744360902255, + "grad_norm": 4.060946464538574, + "learning_rate": 3e-06, + "loss": 1.0744, + "step": 30 + }, + { + "epoch": 0.018796992481203006, + "grad_norm": 3.7678067684173584, + "learning_rate": 4.000000000000001e-06, + "loss": 1.0152, + "step": 40 + }, + { + "epoch": 0.023496240601503758, + "grad_norm": 3.8775312900543213, + "learning_rate": 5e-06, + "loss": 0.8718, + "step": 50 + }, + { + "epoch": 0.02819548872180451, + "grad_norm": 3.488631010055542, + "learning_rate": 6e-06, + "loss": 0.7328, + "step": 60 + }, + { + "epoch": 0.03289473684210526, + "grad_norm": 2.0852737426757812, + "learning_rate": 7.000000000000001e-06, + "loss": 0.5037, + "step": 70 + }, + { + "epoch": 0.03759398496240601, + "grad_norm": 1.5253463983535767, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4341, + "step": 80 + }, + { + "epoch": 0.042293233082706765, + "grad_norm": 1.1347906589508057, + "learning_rate": 9e-06, + "loss": 0.287, + "step": 90 + }, + { + "epoch": 0.046992481203007516, + "grad_norm": 1.715220332145691, + "learning_rate": 1e-05, + "loss": 0.2604, + "step": 100 + }, + { + "epoch": 0.05169172932330827, + "grad_norm": 1.325265645980835, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.2482, + "step": 110 + }, + { + "epoch": 0.05639097744360902, + "grad_norm": 1.0217766761779785, + "learning_rate": 1.2e-05, + "loss": 0.217, + "step": 120 + }, + { + "epoch": 0.06109022556390977, + "grad_norm": 0.9909769296646118, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.1512, + "step": 130 + }, + { + "epoch": 0.06578947368421052, + "grad_norm": 0.549028217792511, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.1569, + "step": 140 + }, + { + "epoch": 0.07048872180451128, + "grad_norm": 0.8695999383926392, + "learning_rate": 1.5e-05, + "loss": 0.1652, + "step": 150 + }, + { + "epoch": 0.07518796992481203, + "grad_norm": 0.6757495403289795, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.1339, + "step": 160 + }, + { + "epoch": 0.07988721804511278, + "grad_norm": 0.4814220666885376, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.1316, + "step": 170 + }, + { + "epoch": 0.08458646616541353, + "grad_norm": 0.8050209283828735, + "learning_rate": 1.8e-05, + "loss": 0.1224, + "step": 180 + }, + { + "epoch": 0.08928571428571429, + "grad_norm": 0.49900326132774353, + "learning_rate": 1.9e-05, + "loss": 0.1269, + "step": 190 + }, + { + "epoch": 0.09398496240601503, + "grad_norm": 0.7031654119491577, + "learning_rate": 2e-05, + "loss": 0.1288, + "step": 200 + }, + { + "epoch": 0.09868421052631579, + "grad_norm": 0.5220195055007935, + "learning_rate": 2.1e-05, + "loss": 0.1111, + "step": 210 + }, + { + "epoch": 0.10338345864661654, + "grad_norm": 0.4195871353149414, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.114, + "step": 220 + }, + { + "epoch": 0.1080827067669173, + "grad_norm": 0.5394448637962341, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.1086, + "step": 230 + }, + { + "epoch": 0.11278195488721804, + "grad_norm": 0.5818265676498413, + "learning_rate": 2.4e-05, + "loss": 0.1034, + "step": 240 + }, + { + "epoch": 0.1174812030075188, + "grad_norm": 0.5092581510543823, + "learning_rate": 2.5e-05, + "loss": 0.1001, + "step": 250 + }, + { + "epoch": 0.12218045112781954, + "grad_norm": 0.5820969939231873, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.0985, + "step": 260 + }, + { + "epoch": 0.12687969924812031, + "grad_norm": 0.4616420269012451, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.095, + "step": 270 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.5095752477645874, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.1115, + "step": 280 + }, + { + "epoch": 0.1362781954887218, + "grad_norm": 0.6476600170135498, + "learning_rate": 2.9e-05, + "loss": 0.0831, + "step": 290 + }, + { + "epoch": 0.14097744360902256, + "grad_norm": 0.5189246535301208, + "learning_rate": 3e-05, + "loss": 0.1043, + "step": 300 + }, + { + "epoch": 0.14567669172932332, + "grad_norm": 0.5447798371315002, + "learning_rate": 3.1e-05, + "loss": 0.0829, + "step": 310 + }, + { + "epoch": 0.15037593984962405, + "grad_norm": 0.47478917241096497, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.0844, + "step": 320 + }, + { + "epoch": 0.1550751879699248, + "grad_norm": 0.577869713306427, + "learning_rate": 3.3e-05, + "loss": 0.0935, + "step": 330 + }, + { + "epoch": 0.15977443609022557, + "grad_norm": 0.6907890439033508, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.0773, + "step": 340 + }, + { + "epoch": 0.16447368421052633, + "grad_norm": 0.5402064919471741, + "learning_rate": 3.5e-05, + "loss": 0.0852, + "step": 350 + }, + { + "epoch": 0.16917293233082706, + "grad_norm": 0.5711252689361572, + "learning_rate": 3.6e-05, + "loss": 0.0821, + "step": 360 + }, + { + "epoch": 0.17387218045112782, + "grad_norm": 0.5073336362838745, + "learning_rate": 3.7e-05, + "loss": 0.0793, + "step": 370 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 0.4687930941581726, + "learning_rate": 3.8e-05, + "loss": 0.0788, + "step": 380 + }, + { + "epoch": 0.18327067669172933, + "grad_norm": 0.46393144130706787, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.0849, + "step": 390 + }, + { + "epoch": 0.18796992481203006, + "grad_norm": 0.43402764201164246, + "learning_rate": 4e-05, + "loss": 0.0667, + "step": 400 + }, + { + "epoch": 0.19266917293233082, + "grad_norm": 0.4862448275089264, + "learning_rate": 4.1e-05, + "loss": 0.074, + "step": 410 + }, + { + "epoch": 0.19736842105263158, + "grad_norm": 0.4262375831604004, + "learning_rate": 4.2e-05, + "loss": 0.0825, + "step": 420 + }, + { + "epoch": 0.20206766917293234, + "grad_norm": 0.4565044641494751, + "learning_rate": 4.3e-05, + "loss": 0.0786, + "step": 430 + }, + { + "epoch": 0.20676691729323307, + "grad_norm": 0.43929043412208557, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.0726, + "step": 440 + }, + { + "epoch": 0.21146616541353383, + "grad_norm": 0.5057302713394165, + "learning_rate": 4.5e-05, + "loss": 0.0673, + "step": 450 + }, + { + "epoch": 0.2161654135338346, + "grad_norm": 0.5209115147590637, + "learning_rate": 4.600000000000001e-05, + "loss": 0.0639, + "step": 460 + }, + { + "epoch": 0.22086466165413535, + "grad_norm": 0.544666051864624, + "learning_rate": 4.7e-05, + "loss": 0.0716, + "step": 470 + }, + { + "epoch": 0.22556390977443608, + "grad_norm": 0.47278276085853577, + "learning_rate": 4.8e-05, + "loss": 0.0631, + "step": 480 + }, + { + "epoch": 0.23026315789473684, + "grad_norm": 0.5331622362136841, + "learning_rate": 4.9e-05, + "loss": 0.0668, + "step": 490 + }, + { + "epoch": 0.2349624060150376, + "grad_norm": 0.5460582971572876, + "learning_rate": 5e-05, + "loss": 0.055, + "step": 500 + }, + { + "epoch": 0.23966165413533835, + "grad_norm": 0.511077880859375, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.0609, + "step": 510 + }, + { + "epoch": 0.24436090225563908, + "grad_norm": 0.33869466185569763, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.0532, + "step": 520 + }, + { + "epoch": 0.24906015037593984, + "grad_norm": 0.4660898745059967, + "learning_rate": 5.300000000000001e-05, + "loss": 0.0613, + "step": 530 + }, + { + "epoch": 0.25375939849624063, + "grad_norm": 0.5432083010673523, + "learning_rate": 5.4000000000000005e-05, + "loss": 0.0702, + "step": 540 + }, + { + "epoch": 0.25845864661654133, + "grad_norm": 0.5141884684562683, + "learning_rate": 5.500000000000001e-05, + "loss": 0.0513, + "step": 550 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.3961157500743866, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.0609, + "step": 560 + }, + { + "epoch": 0.26785714285714285, + "grad_norm": 0.5757367014884949, + "learning_rate": 5.6999999999999996e-05, + "loss": 0.0717, + "step": 570 + }, + { + "epoch": 0.2725563909774436, + "grad_norm": 0.4370141625404358, + "learning_rate": 5.8e-05, + "loss": 0.0637, + "step": 580 + }, + { + "epoch": 0.27725563909774437, + "grad_norm": 0.3731731176376343, + "learning_rate": 5.9e-05, + "loss": 0.0517, + "step": 590 + }, + { + "epoch": 0.2819548872180451, + "grad_norm": 0.5891894102096558, + "learning_rate": 6e-05, + "loss": 0.0664, + "step": 600 + }, + { + "epoch": 0.2866541353383459, + "grad_norm": 0.49936580657958984, + "learning_rate": 6.1e-05, + "loss": 0.071, + "step": 610 + }, + { + "epoch": 0.29135338345864664, + "grad_norm": 0.5268176198005676, + "learning_rate": 6.2e-05, + "loss": 0.063, + "step": 620 + }, + { + "epoch": 0.29605263157894735, + "grad_norm": 0.33853551745414734, + "learning_rate": 6.3e-05, + "loss": 0.057, + "step": 630 + }, + { + "epoch": 0.3007518796992481, + "grad_norm": 0.47726792097091675, + "learning_rate": 6.400000000000001e-05, + "loss": 0.059, + "step": 640 + }, + { + "epoch": 0.30545112781954886, + "grad_norm": 0.4239175021648407, + "learning_rate": 6.500000000000001e-05, + "loss": 0.0601, + "step": 650 + }, + { + "epoch": 0.3101503759398496, + "grad_norm": 0.46040597558021545, + "learning_rate": 6.6e-05, + "loss": 0.0598, + "step": 660 + }, + { + "epoch": 0.3148496240601504, + "grad_norm": 0.374403715133667, + "learning_rate": 6.7e-05, + "loss": 0.0422, + "step": 670 + }, + { + "epoch": 0.31954887218045114, + "grad_norm": 0.5622545480728149, + "learning_rate": 6.800000000000001e-05, + "loss": 0.0642, + "step": 680 + }, + { + "epoch": 0.3242481203007519, + "grad_norm": 0.4140852391719818, + "learning_rate": 6.9e-05, + "loss": 0.0475, + "step": 690 + }, + { + "epoch": 0.32894736842105265, + "grad_norm": 0.46566590666770935, + "learning_rate": 7e-05, + "loss": 0.0576, + "step": 700 + }, + { + "epoch": 0.33364661654135336, + "grad_norm": 0.6023309826850891, + "learning_rate": 7.1e-05, + "loss": 0.0566, + "step": 710 + }, + { + "epoch": 0.3383458646616541, + "grad_norm": 0.5072154402732849, + "learning_rate": 7.2e-05, + "loss": 0.0493, + "step": 720 + }, + { + "epoch": 0.3430451127819549, + "grad_norm": 0.439280241727829, + "learning_rate": 7.3e-05, + "loss": 0.0642, + "step": 730 + }, + { + "epoch": 0.34774436090225563, + "grad_norm": 0.5976812839508057, + "learning_rate": 7.4e-05, + "loss": 0.0608, + "step": 740 + }, + { + "epoch": 0.3524436090225564, + "grad_norm": 0.3581954538822174, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0522, + "step": 750 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.5385236740112305, + "learning_rate": 7.6e-05, + "loss": 0.0556, + "step": 760 + }, + { + "epoch": 0.3618421052631579, + "grad_norm": 0.48683884739875793, + "learning_rate": 7.7e-05, + "loss": 0.0545, + "step": 770 + }, + { + "epoch": 0.36654135338345867, + "grad_norm": 0.4413968622684479, + "learning_rate": 7.800000000000001e-05, + "loss": 0.0562, + "step": 780 + }, + { + "epoch": 0.37124060150375937, + "grad_norm": 0.6093789935112, + "learning_rate": 7.900000000000001e-05, + "loss": 0.0414, + "step": 790 + }, + { + "epoch": 0.37593984962406013, + "grad_norm": 0.32487279176712036, + "learning_rate": 8e-05, + "loss": 0.0547, + "step": 800 + }, + { + "epoch": 0.3806390977443609, + "grad_norm": 0.43237757682800293, + "learning_rate": 8.1e-05, + "loss": 0.0473, + "step": 810 + }, + { + "epoch": 0.38533834586466165, + "grad_norm": 0.3570401072502136, + "learning_rate": 8.2e-05, + "loss": 0.0638, + "step": 820 + }, + { + "epoch": 0.3900375939849624, + "grad_norm": 0.3672104477882385, + "learning_rate": 8.3e-05, + "loss": 0.0554, + "step": 830 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.4324614107608795, + "learning_rate": 8.4e-05, + "loss": 0.0432, + "step": 840 + }, + { + "epoch": 0.3994360902255639, + "grad_norm": 0.31276965141296387, + "learning_rate": 8.5e-05, + "loss": 0.0458, + "step": 850 + }, + { + "epoch": 0.4041353383458647, + "grad_norm": 0.36087697744369507, + "learning_rate": 8.6e-05, + "loss": 0.0525, + "step": 860 + }, + { + "epoch": 0.40883458646616544, + "grad_norm": 0.33883997797966003, + "learning_rate": 8.7e-05, + "loss": 0.0468, + "step": 870 + }, + { + "epoch": 0.41353383458646614, + "grad_norm": 0.43214815855026245, + "learning_rate": 8.800000000000001e-05, + "loss": 0.0565, + "step": 880 + }, + { + "epoch": 0.4182330827067669, + "grad_norm": 0.34577423334121704, + "learning_rate": 8.900000000000001e-05, + "loss": 0.0518, + "step": 890 + }, + { + "epoch": 0.42293233082706766, + "grad_norm": 0.45119839906692505, + "learning_rate": 9e-05, + "loss": 0.0496, + "step": 900 + }, + { + "epoch": 0.4276315789473684, + "grad_norm": 0.47580137848854065, + "learning_rate": 9.1e-05, + "loss": 0.0555, + "step": 910 + }, + { + "epoch": 0.4323308270676692, + "grad_norm": 0.4175976812839508, + "learning_rate": 9.200000000000001e-05, + "loss": 0.0567, + "step": 920 + }, + { + "epoch": 0.43703007518796994, + "grad_norm": 0.4780498445034027, + "learning_rate": 9.300000000000001e-05, + "loss": 0.0678, + "step": 930 + }, + { + "epoch": 0.4417293233082707, + "grad_norm": 0.45834606885910034, + "learning_rate": 9.4e-05, + "loss": 0.0379, + "step": 940 + }, + { + "epoch": 0.44642857142857145, + "grad_norm": 0.33210793137550354, + "learning_rate": 9.5e-05, + "loss": 0.0461, + "step": 950 + }, + { + "epoch": 0.45112781954887216, + "grad_norm": 0.5226761102676392, + "learning_rate": 9.6e-05, + "loss": 0.0542, + "step": 960 + }, + { + "epoch": 0.4558270676691729, + "grad_norm": 0.4086574912071228, + "learning_rate": 9.7e-05, + "loss": 0.0443, + "step": 970 + }, + { + "epoch": 0.4605263157894737, + "grad_norm": 0.2859014868736267, + "learning_rate": 9.8e-05, + "loss": 0.0458, + "step": 980 + }, + { + "epoch": 0.46522556390977443, + "grad_norm": 0.3826181888580322, + "learning_rate": 9.900000000000001e-05, + "loss": 0.044, + "step": 990 + }, + { + "epoch": 0.4699248120300752, + "grad_norm": 0.3559493124485016, + "learning_rate": 0.0001, + "loss": 0.0407, + "step": 1000 + }, + { + "epoch": 0.47462406015037595, + "grad_norm": 0.2889259457588196, + "learning_rate": 9.999993165095463e-05, + "loss": 0.0568, + "step": 1010 + }, + { + "epoch": 0.4793233082706767, + "grad_norm": 0.26828300952911377, + "learning_rate": 9.999972660400536e-05, + "loss": 0.0459, + "step": 1020 + }, + { + "epoch": 0.48402255639097747, + "grad_norm": 0.45884090662002563, + "learning_rate": 9.999938485971279e-05, + "loss": 0.054, + "step": 1030 + }, + { + "epoch": 0.48872180451127817, + "grad_norm": 0.5988617539405823, + "learning_rate": 9.999890641901125e-05, + "loss": 0.0455, + "step": 1040 + }, + { + "epoch": 0.4934210526315789, + "grad_norm": 0.320173442363739, + "learning_rate": 9.999829128320874e-05, + "loss": 0.047, + "step": 1050 + }, + { + "epoch": 0.4981203007518797, + "grad_norm": 0.34646356105804443, + "learning_rate": 9.999753945398704e-05, + "loss": 0.0567, + "step": 1060 + }, + { + "epoch": 0.5028195488721805, + "grad_norm": 0.321715384721756, + "learning_rate": 9.999665093340165e-05, + "loss": 0.0418, + "step": 1070 + }, + { + "epoch": 0.5075187969924813, + "grad_norm": 0.29327085614204407, + "learning_rate": 9.99956257238817e-05, + "loss": 0.0498, + "step": 1080 + }, + { + "epoch": 0.5122180451127819, + "grad_norm": 0.2684969902038574, + "learning_rate": 9.999446382823013e-05, + "loss": 0.0507, + "step": 1090 + }, + { + "epoch": 0.5169172932330827, + "grad_norm": 0.1975565105676651, + "learning_rate": 9.999316524962345e-05, + "loss": 0.0387, + "step": 1100 + }, + { + "epoch": 0.5216165413533834, + "grad_norm": 0.31090304255485535, + "learning_rate": 9.999172999161198e-05, + "loss": 0.0378, + "step": 1110 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.30241793394088745, + "learning_rate": 9.999015805811965e-05, + "loss": 0.0398, + "step": 1120 + }, + { + "epoch": 0.5310150375939849, + "grad_norm": 0.28800079226493835, + "learning_rate": 9.998844945344405e-05, + "loss": 0.0409, + "step": 1130 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 0.41590771079063416, + "learning_rate": 9.998660418225645e-05, + "loss": 0.0352, + "step": 1140 + }, + { + "epoch": 0.5404135338345865, + "grad_norm": 0.3479129374027252, + "learning_rate": 9.998462224960175e-05, + "loss": 0.049, + "step": 1150 + }, + { + "epoch": 0.5451127819548872, + "grad_norm": 0.26915863156318665, + "learning_rate": 9.998250366089848e-05, + "loss": 0.0347, + "step": 1160 + }, + { + "epoch": 0.549812030075188, + "grad_norm": 0.2776716351509094, + "learning_rate": 9.998024842193876e-05, + "loss": 0.0461, + "step": 1170 + }, + { + "epoch": 0.5545112781954887, + "grad_norm": 0.49566450715065, + "learning_rate": 9.997785653888835e-05, + "loss": 0.0437, + "step": 1180 + }, + { + "epoch": 0.5592105263157895, + "grad_norm": 0.2881394922733307, + "learning_rate": 9.997532801828658e-05, + "loss": 0.0401, + "step": 1190 + }, + { + "epoch": 0.5639097744360902, + "grad_norm": 0.30325883626937866, + "learning_rate": 9.997266286704631e-05, + "loss": 0.0484, + "step": 1200 + }, + { + "epoch": 0.568609022556391, + "grad_norm": 0.25720658898353577, + "learning_rate": 9.996986109245395e-05, + "loss": 0.0572, + "step": 1210 + }, + { + "epoch": 0.5733082706766918, + "grad_norm": 0.33928200602531433, + "learning_rate": 9.996692270216947e-05, + "loss": 0.0468, + "step": 1220 + }, + { + "epoch": 0.5780075187969925, + "grad_norm": 0.21968646347522736, + "learning_rate": 9.996384770422629e-05, + "loss": 0.0401, + "step": 1230 + }, + { + "epoch": 0.5827067669172933, + "grad_norm": 0.3939560651779175, + "learning_rate": 9.996063610703137e-05, + "loss": 0.0504, + "step": 1240 + }, + { + "epoch": 0.5874060150375939, + "grad_norm": 0.23372384905815125, + "learning_rate": 9.995728791936504e-05, + "loss": 0.0371, + "step": 1250 + }, + { + "epoch": 0.5921052631578947, + "grad_norm": 0.3492465019226074, + "learning_rate": 9.995380315038119e-05, + "loss": 0.0359, + "step": 1260 + }, + { + "epoch": 0.5968045112781954, + "grad_norm": 0.31815361976623535, + "learning_rate": 9.9950181809607e-05, + "loss": 0.0387, + "step": 1270 + }, + { + "epoch": 0.6015037593984962, + "grad_norm": 0.44413354992866516, + "learning_rate": 9.994642390694308e-05, + "loss": 0.0435, + "step": 1280 + }, + { + "epoch": 0.606203007518797, + "grad_norm": 0.1992567628622055, + "learning_rate": 9.99425294526634e-05, + "loss": 0.0403, + "step": 1290 + }, + { + "epoch": 0.6109022556390977, + "grad_norm": 0.2784936726093292, + "learning_rate": 9.993849845741524e-05, + "loss": 0.0441, + "step": 1300 + }, + { + "epoch": 0.6156015037593985, + "grad_norm": 0.22886349260807037, + "learning_rate": 9.99343309322192e-05, + "loss": 0.0361, + "step": 1310 + }, + { + "epoch": 0.6203007518796992, + "grad_norm": 0.29100221395492554, + "learning_rate": 9.993002688846913e-05, + "loss": 0.0439, + "step": 1320 + }, + { + "epoch": 0.625, + "grad_norm": 0.356252521276474, + "learning_rate": 9.992558633793212e-05, + "loss": 0.0498, + "step": 1330 + }, + { + "epoch": 0.6296992481203008, + "grad_norm": 0.3578437864780426, + "learning_rate": 9.992100929274846e-05, + "loss": 0.0572, + "step": 1340 + }, + { + "epoch": 0.6343984962406015, + "grad_norm": 0.2747027575969696, + "learning_rate": 9.991629576543163e-05, + "loss": 0.0395, + "step": 1350 + }, + { + "epoch": 0.6390977443609023, + "grad_norm": 0.31512099504470825, + "learning_rate": 9.991144576886823e-05, + "loss": 0.0532, + "step": 1360 + }, + { + "epoch": 0.643796992481203, + "grad_norm": 0.34956079721450806, + "learning_rate": 9.990645931631796e-05, + "loss": 0.0433, + "step": 1370 + }, + { + "epoch": 0.6484962406015038, + "grad_norm": 0.3578788638114929, + "learning_rate": 9.990133642141359e-05, + "loss": 0.0399, + "step": 1380 + }, + { + "epoch": 0.6531954887218046, + "grad_norm": 0.4128391146659851, + "learning_rate": 9.989607709816091e-05, + "loss": 0.0438, + "step": 1390 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.32965388894081116, + "learning_rate": 9.989068136093873e-05, + "loss": 0.0385, + "step": 1400 + }, + { + "epoch": 0.6625939849624061, + "grad_norm": 0.345131516456604, + "learning_rate": 9.988514922449879e-05, + "loss": 0.0583, + "step": 1410 + }, + { + "epoch": 0.6672932330827067, + "grad_norm": 0.24946953356266022, + "learning_rate": 9.987948070396571e-05, + "loss": 0.0432, + "step": 1420 + }, + { + "epoch": 0.6719924812030075, + "grad_norm": 0.3567699193954468, + "learning_rate": 9.987367581483705e-05, + "loss": 0.0477, + "step": 1430 + }, + { + "epoch": 0.6766917293233082, + "grad_norm": 0.3243180215358734, + "learning_rate": 9.986773457298311e-05, + "loss": 0.0356, + "step": 1440 + }, + { + "epoch": 0.681390977443609, + "grad_norm": 0.31001994013786316, + "learning_rate": 9.986165699464705e-05, + "loss": 0.0485, + "step": 1450 + }, + { + "epoch": 0.6860902255639098, + "grad_norm": 0.22086216509342194, + "learning_rate": 9.985544309644475e-05, + "loss": 0.041, + "step": 1460 + }, + { + "epoch": 0.6907894736842105, + "grad_norm": 0.2946653962135315, + "learning_rate": 9.984909289536473e-05, + "loss": 0.0299, + "step": 1470 + }, + { + "epoch": 0.6954887218045113, + "grad_norm": 0.29980820417404175, + "learning_rate": 9.984260640876821e-05, + "loss": 0.0369, + "step": 1480 + }, + { + "epoch": 0.700187969924812, + "grad_norm": 0.31546637415885925, + "learning_rate": 9.983598365438902e-05, + "loss": 0.0365, + "step": 1490 + }, + { + "epoch": 0.7048872180451128, + "grad_norm": 0.2951902747154236, + "learning_rate": 9.98292246503335e-05, + "loss": 0.0433, + "step": 1500 + }, + { + "epoch": 0.7095864661654135, + "grad_norm": 0.24987751245498657, + "learning_rate": 9.98223294150805e-05, + "loss": 0.0322, + "step": 1510 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.25791454315185547, + "learning_rate": 9.981529796748134e-05, + "loss": 0.0347, + "step": 1520 + }, + { + "epoch": 0.7189849624060151, + "grad_norm": 0.2819271385669708, + "learning_rate": 9.980813032675974e-05, + "loss": 0.0393, + "step": 1530 + }, + { + "epoch": 0.7236842105263158, + "grad_norm": 0.2781679630279541, + "learning_rate": 9.980082651251175e-05, + "loss": 0.032, + "step": 1540 + }, + { + "epoch": 0.7283834586466166, + "grad_norm": 0.2625151574611664, + "learning_rate": 9.979338654470569e-05, + "loss": 0.0457, + "step": 1550 + }, + { + "epoch": 0.7330827067669173, + "grad_norm": 0.23519033193588257, + "learning_rate": 9.97858104436822e-05, + "loss": 0.0293, + "step": 1560 + }, + { + "epoch": 0.7377819548872181, + "grad_norm": 0.3268503248691559, + "learning_rate": 9.977809823015401e-05, + "loss": 0.0351, + "step": 1570 + }, + { + "epoch": 0.7424812030075187, + "grad_norm": 0.26714998483657837, + "learning_rate": 9.977024992520602e-05, + "loss": 0.0354, + "step": 1580 + }, + { + "epoch": 0.7471804511278195, + "grad_norm": 0.2875531017780304, + "learning_rate": 9.976226555029522e-05, + "loss": 0.0375, + "step": 1590 + }, + { + "epoch": 0.7518796992481203, + "grad_norm": 0.29909199476242065, + "learning_rate": 9.975414512725057e-05, + "loss": 0.042, + "step": 1600 + }, + { + "epoch": 0.756578947368421, + "grad_norm": 0.35534903407096863, + "learning_rate": 9.974588867827301e-05, + "loss": 0.0392, + "step": 1610 + }, + { + "epoch": 0.7612781954887218, + "grad_norm": 0.2016129046678543, + "learning_rate": 9.973749622593534e-05, + "loss": 0.0386, + "step": 1620 + }, + { + "epoch": 0.7659774436090225, + "grad_norm": 0.3945503234863281, + "learning_rate": 9.972896779318219e-05, + "loss": 0.0468, + "step": 1630 + }, + { + "epoch": 0.7706766917293233, + "grad_norm": 0.2441464513540268, + "learning_rate": 9.972030340333001e-05, + "loss": 0.0378, + "step": 1640 + }, + { + "epoch": 0.775375939849624, + "grad_norm": 0.24610991775989532, + "learning_rate": 9.97115030800669e-05, + "loss": 0.0324, + "step": 1650 + }, + { + "epoch": 0.7800751879699248, + "grad_norm": 0.2868049144744873, + "learning_rate": 9.970256684745258e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.7847744360902256, + "grad_norm": 0.36867856979370117, + "learning_rate": 9.969349472991838e-05, + "loss": 0.036, + "step": 1670 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.2507390081882477, + "learning_rate": 9.968428675226714e-05, + "loss": 0.0325, + "step": 1680 + }, + { + "epoch": 0.7941729323308271, + "grad_norm": 0.29328516125679016, + "learning_rate": 9.967494293967312e-05, + "loss": 0.0491, + "step": 1690 + }, + { + "epoch": 0.7988721804511278, + "grad_norm": 0.26543503999710083, + "learning_rate": 9.966546331768191e-05, + "loss": 0.0397, + "step": 1700 + }, + { + "epoch": 0.8035714285714286, + "grad_norm": 0.2858276963233948, + "learning_rate": 9.965584791221048e-05, + "loss": 0.0328, + "step": 1710 + }, + { + "epoch": 0.8082706766917294, + "grad_norm": 0.3322197496891022, + "learning_rate": 9.964609674954696e-05, + "loss": 0.0437, + "step": 1720 + }, + { + "epoch": 0.8129699248120301, + "grad_norm": 0.29589325189590454, + "learning_rate": 9.963620985635065e-05, + "loss": 0.033, + "step": 1730 + }, + { + "epoch": 0.8176691729323309, + "grad_norm": 0.2513675093650818, + "learning_rate": 9.962618725965196e-05, + "loss": 0.0466, + "step": 1740 + }, + { + "epoch": 0.8223684210526315, + "grad_norm": 0.24024534225463867, + "learning_rate": 9.961602898685226e-05, + "loss": 0.0316, + "step": 1750 + }, + { + "epoch": 0.8270676691729323, + "grad_norm": 0.2825106680393219, + "learning_rate": 9.96057350657239e-05, + "loss": 0.0335, + "step": 1760 + }, + { + "epoch": 0.831766917293233, + "grad_norm": 0.286940336227417, + "learning_rate": 9.959530552441005e-05, + "loss": 0.0373, + "step": 1770 + }, + { + "epoch": 0.8364661654135338, + "grad_norm": 0.25305864214897156, + "learning_rate": 9.95847403914247e-05, + "loss": 0.0341, + "step": 1780 + }, + { + "epoch": 0.8411654135338346, + "grad_norm": 0.3127809464931488, + "learning_rate": 9.95740396956525e-05, + "loss": 0.0382, + "step": 1790 + }, + { + "epoch": 0.8458646616541353, + "grad_norm": 0.2425500899553299, + "learning_rate": 9.956320346634876e-05, + "loss": 0.0386, + "step": 1800 + }, + { + "epoch": 0.8505639097744361, + "grad_norm": 0.3123094439506531, + "learning_rate": 9.955223173313931e-05, + "loss": 0.0335, + "step": 1810 + }, + { + "epoch": 0.8552631578947368, + "grad_norm": 0.20537501573562622, + "learning_rate": 9.954112452602045e-05, + "loss": 0.025, + "step": 1820 + }, + { + "epoch": 0.8599624060150376, + "grad_norm": 0.24950020015239716, + "learning_rate": 9.952988187535886e-05, + "loss": 0.0344, + "step": 1830 + }, + { + "epoch": 0.8646616541353384, + "grad_norm": 0.27342480421066284, + "learning_rate": 9.95185038118915e-05, + "loss": 0.0414, + "step": 1840 + }, + { + "epoch": 0.8693609022556391, + "grad_norm": 0.31957921385765076, + "learning_rate": 9.950699036672559e-05, + "loss": 0.0314, + "step": 1850 + }, + { + "epoch": 0.8740601503759399, + "grad_norm": 0.2214190661907196, + "learning_rate": 9.949534157133844e-05, + "loss": 0.0328, + "step": 1860 + }, + { + "epoch": 0.8787593984962406, + "grad_norm": 0.2547666132450104, + "learning_rate": 9.948355745757741e-05, + "loss": 0.0296, + "step": 1870 + }, + { + "epoch": 0.8834586466165414, + "grad_norm": 0.20601588487625122, + "learning_rate": 9.94716380576598e-05, + "loss": 0.0319, + "step": 1880 + }, + { + "epoch": 0.8881578947368421, + "grad_norm": 0.30237144231796265, + "learning_rate": 9.945958340417283e-05, + "loss": 0.0417, + "step": 1890 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 0.24439987540245056, + "learning_rate": 9.944739353007344e-05, + "loss": 0.0414, + "step": 1900 + }, + { + "epoch": 0.8975563909774437, + "grad_norm": 0.32059434056282043, + "learning_rate": 9.943506846868826e-05, + "loss": 0.0337, + "step": 1910 + }, + { + "epoch": 0.9022556390977443, + "grad_norm": 0.24147386848926544, + "learning_rate": 9.942260825371358e-05, + "loss": 0.0257, + "step": 1920 + }, + { + "epoch": 0.9069548872180451, + "grad_norm": 0.3102846145629883, + "learning_rate": 9.941001291921512e-05, + "loss": 0.0391, + "step": 1930 + }, + { + "epoch": 0.9116541353383458, + "grad_norm": 0.2683519124984741, + "learning_rate": 9.939728249962807e-05, + "loss": 0.0249, + "step": 1940 + }, + { + "epoch": 0.9163533834586466, + "grad_norm": 0.2480572909116745, + "learning_rate": 9.938441702975689e-05, + "loss": 0.0369, + "step": 1950 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.26705268025398254, + "learning_rate": 9.937141654477528e-05, + "loss": 0.0374, + "step": 1960 + }, + { + "epoch": 0.9257518796992481, + "grad_norm": 0.2704300880432129, + "learning_rate": 9.93582810802261e-05, + "loss": 0.0384, + "step": 1970 + }, + { + "epoch": 0.9304511278195489, + "grad_norm": 0.3017674386501312, + "learning_rate": 9.934501067202117e-05, + "loss": 0.0393, + "step": 1980 + }, + { + "epoch": 0.9351503759398496, + "grad_norm": 0.3015788495540619, + "learning_rate": 9.93316053564413e-05, + "loss": 0.0326, + "step": 1990 + }, + { + "epoch": 0.9398496240601504, + "grad_norm": 0.30426859855651855, + "learning_rate": 9.931806517013612e-05, + "loss": 0.0329, + "step": 2000 + }, + { + "epoch": 0.9445488721804511, + "grad_norm": 0.3226216435432434, + "learning_rate": 9.930439015012396e-05, + "loss": 0.0327, + "step": 2010 + }, + { + "epoch": 0.9492481203007519, + "grad_norm": 0.31180641055107117, + "learning_rate": 9.929058033379181e-05, + "loss": 0.0284, + "step": 2020 + }, + { + "epoch": 0.9539473684210527, + "grad_norm": 0.26425114274024963, + "learning_rate": 9.927663575889521e-05, + "loss": 0.033, + "step": 2030 + }, + { + "epoch": 0.9586466165413534, + "grad_norm": 0.15367713570594788, + "learning_rate": 9.926255646355804e-05, + "loss": 0.0311, + "step": 2040 + }, + { + "epoch": 0.9633458646616542, + "grad_norm": 0.21648705005645752, + "learning_rate": 9.92483424862726e-05, + "loss": 0.031, + "step": 2050 + }, + { + "epoch": 0.9680451127819549, + "grad_norm": 0.2765688896179199, + "learning_rate": 9.923399386589933e-05, + "loss": 0.0389, + "step": 2060 + }, + { + "epoch": 0.9727443609022557, + "grad_norm": 0.21116332709789276, + "learning_rate": 9.921951064166684e-05, + "loss": 0.0348, + "step": 2070 + }, + { + "epoch": 0.9774436090225563, + "grad_norm": 0.2599218189716339, + "learning_rate": 9.92048928531717e-05, + "loss": 0.043, + "step": 2080 + }, + { + "epoch": 0.9821428571428571, + "grad_norm": 0.2683297395706177, + "learning_rate": 9.919014054037836e-05, + "loss": 0.0285, + "step": 2090 + }, + { + "epoch": 0.9868421052631579, + "grad_norm": 0.20241263508796692, + "learning_rate": 9.917525374361912e-05, + "loss": 0.033, + "step": 2100 + }, + { + "epoch": 0.9915413533834586, + "grad_norm": 0.19710877537727356, + "learning_rate": 9.91602325035939e-05, + "loss": 0.0375, + "step": 2110 + }, + { + "epoch": 0.9962406015037594, + "grad_norm": 0.29469940066337585, + "learning_rate": 9.914507686137019e-05, + "loss": 0.0435, + "step": 2120 + }, + { + "epoch": 1.0009398496240602, + "grad_norm": 0.40180471539497375, + "learning_rate": 9.912978685838294e-05, + "loss": 0.0425, + "step": 2130 + }, + { + "epoch": 1.005639097744361, + "grad_norm": 0.26152098178863525, + "learning_rate": 9.911436253643445e-05, + "loss": 0.0326, + "step": 2140 + }, + { + "epoch": 1.0103383458646618, + "grad_norm": 0.31987690925598145, + "learning_rate": 9.90988039376942e-05, + "loss": 0.0318, + "step": 2150 + }, + { + "epoch": 1.0150375939849625, + "grad_norm": 0.22263208031654358, + "learning_rate": 9.90831111046988e-05, + "loss": 0.0424, + "step": 2160 + }, + { + "epoch": 1.019736842105263, + "grad_norm": 0.23681917786598206, + "learning_rate": 9.90672840803519e-05, + "loss": 0.0272, + "step": 2170 + }, + { + "epoch": 1.0244360902255638, + "grad_norm": 0.210203617811203, + "learning_rate": 9.905132290792394e-05, + "loss": 0.0205, + "step": 2180 + }, + { + "epoch": 1.0291353383458646, + "grad_norm": 0.3350231945514679, + "learning_rate": 9.903522763105218e-05, + "loss": 0.0308, + "step": 2190 + }, + { + "epoch": 1.0338345864661653, + "grad_norm": 0.18159183859825134, + "learning_rate": 9.901899829374047e-05, + "loss": 0.032, + "step": 2200 + }, + { + "epoch": 1.038533834586466, + "grad_norm": 0.23585166037082672, + "learning_rate": 9.900263494035921e-05, + "loss": 0.0381, + "step": 2210 + }, + { + "epoch": 1.0432330827067668, + "grad_norm": 0.21487107872962952, + "learning_rate": 9.89861376156452e-05, + "loss": 0.0332, + "step": 2220 + }, + { + "epoch": 1.0479323308270676, + "grad_norm": 0.29074567556381226, + "learning_rate": 9.896950636470147e-05, + "loss": 0.0363, + "step": 2230 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.2568870782852173, + "learning_rate": 9.895274123299723e-05, + "loss": 0.0305, + "step": 2240 + }, + { + "epoch": 1.0573308270676691, + "grad_norm": 0.3047011196613312, + "learning_rate": 9.893584226636772e-05, + "loss": 0.0335, + "step": 2250 + }, + { + "epoch": 1.0620300751879699, + "grad_norm": 0.2922953963279724, + "learning_rate": 9.891880951101407e-05, + "loss": 0.0443, + "step": 2260 + }, + { + "epoch": 1.0667293233082706, + "grad_norm": 0.29236820340156555, + "learning_rate": 9.890164301350318e-05, + "loss": 0.0339, + "step": 2270 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.2855212688446045, + "learning_rate": 9.888434282076758e-05, + "loss": 0.0341, + "step": 2280 + }, + { + "epoch": 1.0761278195488722, + "grad_norm": 0.2906155586242676, + "learning_rate": 9.886690898010535e-05, + "loss": 0.0366, + "step": 2290 + }, + { + "epoch": 1.080827067669173, + "grad_norm": 0.24896125495433807, + "learning_rate": 9.884934153917997e-05, + "loss": 0.0386, + "step": 2300 + }, + { + "epoch": 1.0855263157894737, + "grad_norm": 0.2523311674594879, + "learning_rate": 9.883164054602012e-05, + "loss": 0.0322, + "step": 2310 + }, + { + "epoch": 1.0902255639097744, + "grad_norm": 0.3600609004497528, + "learning_rate": 9.881380604901964e-05, + "loss": 0.0394, + "step": 2320 + }, + { + "epoch": 1.0949248120300752, + "grad_norm": 0.2090110331773758, + "learning_rate": 9.879583809693738e-05, + "loss": 0.0273, + "step": 2330 + }, + { + "epoch": 1.099624060150376, + "grad_norm": 0.27230265736579895, + "learning_rate": 9.877773673889701e-05, + "loss": 0.0313, + "step": 2340 + }, + { + "epoch": 1.1043233082706767, + "grad_norm": 0.24328522384166718, + "learning_rate": 9.8759502024387e-05, + "loss": 0.0366, + "step": 2350 + }, + { + "epoch": 1.1090225563909775, + "grad_norm": 0.24177910387516022, + "learning_rate": 9.87411340032603e-05, + "loss": 0.0295, + "step": 2360 + }, + { + "epoch": 1.1137218045112782, + "grad_norm": 0.22861960530281067, + "learning_rate": 9.872263272573443e-05, + "loss": 0.0359, + "step": 2370 + }, + { + "epoch": 1.118421052631579, + "grad_norm": 0.2622787654399872, + "learning_rate": 9.870399824239117e-05, + "loss": 0.0274, + "step": 2380 + }, + { + "epoch": 1.1231203007518797, + "grad_norm": 0.22869399189949036, + "learning_rate": 9.868523060417646e-05, + "loss": 0.0437, + "step": 2390 + }, + { + "epoch": 1.1278195488721805, + "grad_norm": 0.22369208931922913, + "learning_rate": 9.86663298624003e-05, + "loss": 0.0313, + "step": 2400 + }, + { + "epoch": 1.1325187969924813, + "grad_norm": 0.3054512143135071, + "learning_rate": 9.864729606873663e-05, + "loss": 0.0343, + "step": 2410 + }, + { + "epoch": 1.137218045112782, + "grad_norm": 0.21700331568717957, + "learning_rate": 9.862812927522309e-05, + "loss": 0.0356, + "step": 2420 + }, + { + "epoch": 1.1419172932330828, + "grad_norm": 0.1657762974500656, + "learning_rate": 9.860882953426099e-05, + "loss": 0.0273, + "step": 2430 + }, + { + "epoch": 1.1466165413533835, + "grad_norm": 0.3012523949146271, + "learning_rate": 9.858939689861506e-05, + "loss": 0.0278, + "step": 2440 + }, + { + "epoch": 1.1513157894736843, + "grad_norm": 0.1556554138660431, + "learning_rate": 9.856983142141339e-05, + "loss": 0.032, + "step": 2450 + }, + { + "epoch": 1.156015037593985, + "grad_norm": 0.2980904281139374, + "learning_rate": 9.855013315614725e-05, + "loss": 0.0359, + "step": 2460 + }, + { + "epoch": 1.1607142857142858, + "grad_norm": 0.23628155887126923, + "learning_rate": 9.853030215667093e-05, + "loss": 0.0253, + "step": 2470 + }, + { + "epoch": 1.1654135338345863, + "grad_norm": 0.23128274083137512, + "learning_rate": 9.851033847720166e-05, + "loss": 0.0307, + "step": 2480 + }, + { + "epoch": 1.170112781954887, + "grad_norm": 0.3112635314464569, + "learning_rate": 9.849024217231935e-05, + "loss": 0.0337, + "step": 2490 + }, + { + "epoch": 1.1748120300751879, + "grad_norm": 0.30317866802215576, + "learning_rate": 9.847001329696653e-05, + "loss": 0.0349, + "step": 2500 + }, + { + "epoch": 1.1795112781954886, + "grad_norm": 0.3040902614593506, + "learning_rate": 9.844965190644817e-05, + "loss": 0.035, + "step": 2510 + }, + { + "epoch": 1.1842105263157894, + "grad_norm": 0.22820954024791718, + "learning_rate": 9.842915805643155e-05, + "loss": 0.0293, + "step": 2520 + }, + { + "epoch": 1.1889097744360901, + "grad_norm": 0.25117453932762146, + "learning_rate": 9.840853180294608e-05, + "loss": 0.0295, + "step": 2530 + }, + { + "epoch": 1.193609022556391, + "grad_norm": 0.28759878873825073, + "learning_rate": 9.838777320238312e-05, + "loss": 0.0423, + "step": 2540 + }, + { + "epoch": 1.1983082706766917, + "grad_norm": 0.286913126707077, + "learning_rate": 9.836688231149592e-05, + "loss": 0.0284, + "step": 2550 + }, + { + "epoch": 1.2030075187969924, + "grad_norm": 0.2655298411846161, + "learning_rate": 9.834585918739936e-05, + "loss": 0.0389, + "step": 2560 + }, + { + "epoch": 1.2077067669172932, + "grad_norm": 0.2870301604270935, + "learning_rate": 9.832470388756987e-05, + "loss": 0.0354, + "step": 2570 + }, + { + "epoch": 1.212406015037594, + "grad_norm": 0.4012272357940674, + "learning_rate": 9.830341646984521e-05, + "loss": 0.0331, + "step": 2580 + }, + { + "epoch": 1.2171052631578947, + "grad_norm": 0.29325100779533386, + "learning_rate": 9.82819969924244e-05, + "loss": 0.0286, + "step": 2590 + }, + { + "epoch": 1.2218045112781954, + "grad_norm": 0.27856117486953735, + "learning_rate": 9.826044551386744e-05, + "loss": 0.0342, + "step": 2600 + }, + { + "epoch": 1.2265037593984962, + "grad_norm": 0.3191596567630768, + "learning_rate": 9.823876209309527e-05, + "loss": 0.0299, + "step": 2610 + }, + { + "epoch": 1.231203007518797, + "grad_norm": 0.2671605944633484, + "learning_rate": 9.821694678938953e-05, + "loss": 0.0305, + "step": 2620 + }, + { + "epoch": 1.2359022556390977, + "grad_norm": 0.29545503854751587, + "learning_rate": 9.819499966239243e-05, + "loss": 0.033, + "step": 2630 + }, + { + "epoch": 1.2406015037593985, + "grad_norm": 0.3468911647796631, + "learning_rate": 9.817292077210659e-05, + "loss": 0.0362, + "step": 2640 + }, + { + "epoch": 1.2453007518796992, + "grad_norm": 0.21935796737670898, + "learning_rate": 9.815071017889482e-05, + "loss": 0.0275, + "step": 2650 + }, + { + "epoch": 1.25, + "grad_norm": 0.20226982235908508, + "learning_rate": 9.812836794348004e-05, + "loss": 0.0303, + "step": 2660 + }, + { + "epoch": 1.2546992481203008, + "grad_norm": 0.24741894006729126, + "learning_rate": 9.81058941269451e-05, + "loss": 0.0333, + "step": 2670 + }, + { + "epoch": 1.2593984962406015, + "grad_norm": 0.28749677538871765, + "learning_rate": 9.808328879073251e-05, + "loss": 0.0297, + "step": 2680 + }, + { + "epoch": 1.2640977443609023, + "grad_norm": 0.14903762936592102, + "learning_rate": 9.806055199664446e-05, + "loss": 0.0261, + "step": 2690 + }, + { + "epoch": 1.268796992481203, + "grad_norm": 0.2174089550971985, + "learning_rate": 9.803768380684242e-05, + "loss": 0.0336, + "step": 2700 + }, + { + "epoch": 1.2734962406015038, + "grad_norm": 0.21511636674404144, + "learning_rate": 9.801468428384716e-05, + "loss": 0.028, + "step": 2710 + }, + { + "epoch": 1.2781954887218046, + "grad_norm": 0.2507420778274536, + "learning_rate": 9.799155349053851e-05, + "loss": 0.0315, + "step": 2720 + }, + { + "epoch": 1.2828947368421053, + "grad_norm": 0.22325477004051208, + "learning_rate": 9.796829149015517e-05, + "loss": 0.0242, + "step": 2730 + }, + { + "epoch": 1.287593984962406, + "grad_norm": 0.324486643075943, + "learning_rate": 9.794489834629455e-05, + "loss": 0.0243, + "step": 2740 + }, + { + "epoch": 1.2922932330827068, + "grad_norm": 0.24349677562713623, + "learning_rate": 9.792137412291265e-05, + "loss": 0.0294, + "step": 2750 + }, + { + "epoch": 1.2969924812030076, + "grad_norm": 0.2508913576602936, + "learning_rate": 9.789771888432375e-05, + "loss": 0.0283, + "step": 2760 + }, + { + "epoch": 1.3016917293233083, + "grad_norm": 0.210478737950325, + "learning_rate": 9.787393269520039e-05, + "loss": 0.0269, + "step": 2770 + }, + { + "epoch": 1.306390977443609, + "grad_norm": 0.2785148620605469, + "learning_rate": 9.785001562057309e-05, + "loss": 0.0277, + "step": 2780 + }, + { + "epoch": 1.3110902255639099, + "grad_norm": 0.30427059531211853, + "learning_rate": 9.782596772583026e-05, + "loss": 0.0335, + "step": 2790 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 0.3535986840724945, + "learning_rate": 9.780178907671789e-05, + "loss": 0.035, + "step": 2800 + }, + { + "epoch": 1.3204887218045114, + "grad_norm": 0.3639542758464813, + "learning_rate": 9.777747973933948e-05, + "loss": 0.0263, + "step": 2810 + }, + { + "epoch": 1.3251879699248121, + "grad_norm": 0.3313891589641571, + "learning_rate": 9.775303978015585e-05, + "loss": 0.0302, + "step": 2820 + }, + { + "epoch": 1.329887218045113, + "grad_norm": 0.266083300113678, + "learning_rate": 9.772846926598491e-05, + "loss": 0.0354, + "step": 2830 + }, + { + "epoch": 1.3345864661654137, + "grad_norm": 0.3054597079753876, + "learning_rate": 9.77037682640015e-05, + "loss": 0.0344, + "step": 2840 + }, + { + "epoch": 1.3392857142857144, + "grad_norm": 0.1686822921037674, + "learning_rate": 9.767893684173721e-05, + "loss": 0.0314, + "step": 2850 + }, + { + "epoch": 1.3439849624060152, + "grad_norm": 0.3133353292942047, + "learning_rate": 9.765397506708023e-05, + "loss": 0.0314, + "step": 2860 + }, + { + "epoch": 1.3486842105263157, + "grad_norm": 0.3505442142486572, + "learning_rate": 9.762888300827507e-05, + "loss": 0.0371, + "step": 2870 + }, + { + "epoch": 1.3533834586466165, + "grad_norm": 0.23840002715587616, + "learning_rate": 9.760366073392246e-05, + "loss": 0.0278, + "step": 2880 + }, + { + "epoch": 1.3580827067669172, + "grad_norm": 0.35968664288520813, + "learning_rate": 9.757830831297914e-05, + "loss": 0.0315, + "step": 2890 + }, + { + "epoch": 1.362781954887218, + "grad_norm": 0.2002311795949936, + "learning_rate": 9.755282581475769e-05, + "loss": 0.0296, + "step": 2900 + }, + { + "epoch": 1.3674812030075187, + "grad_norm": 0.31238964200019836, + "learning_rate": 9.752721330892624e-05, + "loss": 0.0279, + "step": 2910 + }, + { + "epoch": 1.3721804511278195, + "grad_norm": 0.16220539808273315, + "learning_rate": 9.750147086550844e-05, + "loss": 0.031, + "step": 2920 + }, + { + "epoch": 1.3768796992481203, + "grad_norm": 0.20878033339977264, + "learning_rate": 9.747559855488313e-05, + "loss": 0.0275, + "step": 2930 + }, + { + "epoch": 1.381578947368421, + "grad_norm": 0.2526220679283142, + "learning_rate": 9.744959644778422e-05, + "loss": 0.0348, + "step": 2940 + }, + { + "epoch": 1.3862781954887218, + "grad_norm": 0.15938736498355865, + "learning_rate": 9.742346461530048e-05, + "loss": 0.0393, + "step": 2950 + }, + { + "epoch": 1.3909774436090225, + "grad_norm": 0.2751128375530243, + "learning_rate": 9.739720312887535e-05, + "loss": 0.031, + "step": 2960 + }, + { + "epoch": 1.3956766917293233, + "grad_norm": 0.1849040985107422, + "learning_rate": 9.73708120603067e-05, + "loss": 0.0316, + "step": 2970 + }, + { + "epoch": 1.400375939849624, + "grad_norm": 0.2610875964164734, + "learning_rate": 9.734429148174675e-05, + "loss": 0.0291, + "step": 2980 + }, + { + "epoch": 1.4050751879699248, + "grad_norm": 0.23159807920455933, + "learning_rate": 9.731764146570173e-05, + "loss": 0.0322, + "step": 2990 + }, + { + "epoch": 1.4097744360902256, + "grad_norm": 0.3438315689563751, + "learning_rate": 9.729086208503174e-05, + "loss": 0.0446, + "step": 3000 + }, + { + "epoch": 1.4144736842105263, + "grad_norm": 0.1952996701002121, + "learning_rate": 9.726395341295062e-05, + "loss": 0.0291, + "step": 3010 + }, + { + "epoch": 1.419172932330827, + "grad_norm": 0.29903092980384827, + "learning_rate": 9.723691552302562e-05, + "loss": 0.0291, + "step": 3020 + }, + { + "epoch": 1.4238721804511278, + "grad_norm": 0.19270353019237518, + "learning_rate": 9.720974848917735e-05, + "loss": 0.0322, + "step": 3030 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.24853135645389557, + "learning_rate": 9.718245238567939e-05, + "loss": 0.0352, + "step": 3040 + }, + { + "epoch": 1.4332706766917294, + "grad_norm": 0.28170156478881836, + "learning_rate": 9.715502728715826e-05, + "loss": 0.0402, + "step": 3050 + }, + { + "epoch": 1.4379699248120301, + "grad_norm": 0.2433410882949829, + "learning_rate": 9.712747326859315e-05, + "loss": 0.029, + "step": 3060 + }, + { + "epoch": 1.4426691729323309, + "grad_norm": 0.272994726896286, + "learning_rate": 9.709979040531569e-05, + "loss": 0.0255, + "step": 3070 + }, + { + "epoch": 1.4473684210526316, + "grad_norm": 0.3101017475128174, + "learning_rate": 9.707197877300974e-05, + "loss": 0.0464, + "step": 3080 + }, + { + "epoch": 1.4520676691729324, + "grad_norm": 0.22507302463054657, + "learning_rate": 9.704403844771128e-05, + "loss": 0.0334, + "step": 3090 + }, + { + "epoch": 1.4567669172932332, + "grad_norm": 0.17489475011825562, + "learning_rate": 9.701596950580806e-05, + "loss": 0.0357, + "step": 3100 + }, + { + "epoch": 1.461466165413534, + "grad_norm": 0.20901988446712494, + "learning_rate": 9.698777202403953e-05, + "loss": 0.028, + "step": 3110 + }, + { + "epoch": 1.4661654135338344, + "grad_norm": 0.22741763293743134, + "learning_rate": 9.695944607949649e-05, + "loss": 0.0329, + "step": 3120 + }, + { + "epoch": 1.4708646616541352, + "grad_norm": 0.26588672399520874, + "learning_rate": 9.693099174962103e-05, + "loss": 0.029, + "step": 3130 + }, + { + "epoch": 1.475563909774436, + "grad_norm": 0.25058892369270325, + "learning_rate": 9.690240911220618e-05, + "loss": 0.0273, + "step": 3140 + }, + { + "epoch": 1.4802631578947367, + "grad_norm": 0.2134590893983841, + "learning_rate": 9.687369824539577e-05, + "loss": 0.034, + "step": 3150 + }, + { + "epoch": 1.4849624060150375, + "grad_norm": 0.19278165698051453, + "learning_rate": 9.684485922768422e-05, + "loss": 0.0254, + "step": 3160 + }, + { + "epoch": 1.4896616541353382, + "grad_norm": 0.17773672938346863, + "learning_rate": 9.681589213791633e-05, + "loss": 0.0324, + "step": 3170 + }, + { + "epoch": 1.494360902255639, + "grad_norm": 0.2836746573448181, + "learning_rate": 9.6786797055287e-05, + "loss": 0.0331, + "step": 3180 + }, + { + "epoch": 1.4990601503759398, + "grad_norm": 0.1892741471529007, + "learning_rate": 9.675757405934103e-05, + "loss": 0.0266, + "step": 3190 + }, + { + "epoch": 1.5037593984962405, + "grad_norm": 0.26725977659225464, + "learning_rate": 9.672822322997305e-05, + "loss": 0.0392, + "step": 3200 + }, + { + "epoch": 1.5084586466165413, + "grad_norm": 0.25514504313468933, + "learning_rate": 9.669874464742705e-05, + "loss": 0.0274, + "step": 3210 + }, + { + "epoch": 1.513157894736842, + "grad_norm": 0.2787286043167114, + "learning_rate": 9.66691383922964e-05, + "loss": 0.0262, + "step": 3220 + }, + { + "epoch": 1.5178571428571428, + "grad_norm": 0.16668666899204254, + "learning_rate": 9.663940454552342e-05, + "loss": 0.0244, + "step": 3230 + }, + { + "epoch": 1.5225563909774436, + "grad_norm": 0.19877758622169495, + "learning_rate": 9.660954318839933e-05, + "loss": 0.0244, + "step": 3240 + }, + { + "epoch": 1.5272556390977443, + "grad_norm": 0.19377648830413818, + "learning_rate": 9.657955440256395e-05, + "loss": 0.0302, + "step": 3250 + }, + { + "epoch": 1.531954887218045, + "grad_norm": 0.2523846924304962, + "learning_rate": 9.654943827000548e-05, + "loss": 0.0231, + "step": 3260 + }, + { + "epoch": 1.5366541353383458, + "grad_norm": 0.2247912436723709, + "learning_rate": 9.651919487306025e-05, + "loss": 0.0303, + "step": 3270 + }, + { + "epoch": 1.5413533834586466, + "grad_norm": 0.28465062379837036, + "learning_rate": 9.648882429441257e-05, + "loss": 0.0264, + "step": 3280 + }, + { + "epoch": 1.5460526315789473, + "grad_norm": 0.24437738955020905, + "learning_rate": 9.645832661709444e-05, + "loss": 0.0311, + "step": 3290 + }, + { + "epoch": 1.550751879699248, + "grad_norm": 0.320959210395813, + "learning_rate": 9.642770192448536e-05, + "loss": 0.041, + "step": 3300 + }, + { + "epoch": 1.5554511278195489, + "grad_norm": 0.19710564613342285, + "learning_rate": 9.639695030031204e-05, + "loss": 0.0249, + "step": 3310 + }, + { + "epoch": 1.5601503759398496, + "grad_norm": 0.2785632908344269, + "learning_rate": 9.636607182864827e-05, + "loss": 0.026, + "step": 3320 + }, + { + "epoch": 1.5648496240601504, + "grad_norm": 0.24171295762062073, + "learning_rate": 9.63350665939146e-05, + "loss": 0.0307, + "step": 3330 + }, + { + "epoch": 1.5695488721804511, + "grad_norm": 0.25583142042160034, + "learning_rate": 9.630393468087818e-05, + "loss": 0.0258, + "step": 3340 + }, + { + "epoch": 1.574248120300752, + "grad_norm": 0.20863236486911774, + "learning_rate": 9.627267617465243e-05, + "loss": 0.0271, + "step": 3350 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.1731235533952713, + "learning_rate": 9.624129116069694e-05, + "loss": 0.0365, + "step": 3360 + }, + { + "epoch": 1.5836466165413534, + "grad_norm": 0.3141409754753113, + "learning_rate": 9.620977972481716e-05, + "loss": 0.0375, + "step": 3370 + }, + { + "epoch": 1.5883458646616542, + "grad_norm": 0.1895882934331894, + "learning_rate": 9.617814195316411e-05, + "loss": 0.0244, + "step": 3380 + }, + { + "epoch": 1.593045112781955, + "grad_norm": 0.26948222517967224, + "learning_rate": 9.614637793223425e-05, + "loss": 0.0269, + "step": 3390 + }, + { + "epoch": 1.5977443609022557, + "grad_norm": 0.2704524099826813, + "learning_rate": 9.611448774886924e-05, + "loss": 0.0305, + "step": 3400 + }, + { + "epoch": 1.6024436090225564, + "grad_norm": 0.24672971665859222, + "learning_rate": 9.60824714902556e-05, + "loss": 0.0242, + "step": 3410 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 0.2219143658876419, + "learning_rate": 9.605032924392457e-05, + "loss": 0.0323, + "step": 3420 + }, + { + "epoch": 1.611842105263158, + "grad_norm": 0.29584020376205444, + "learning_rate": 9.601806109775179e-05, + "loss": 0.0279, + "step": 3430 + }, + { + "epoch": 1.6165413533834587, + "grad_norm": 0.20713065564632416, + "learning_rate": 9.598566713995718e-05, + "loss": 0.0361, + "step": 3440 + }, + { + "epoch": 1.6212406015037595, + "grad_norm": 0.23049211502075195, + "learning_rate": 9.595314745910456e-05, + "loss": 0.0302, + "step": 3450 + }, + { + "epoch": 1.6259398496240602, + "grad_norm": 0.2782370150089264, + "learning_rate": 9.59205021441015e-05, + "loss": 0.0266, + "step": 3460 + }, + { + "epoch": 1.630639097744361, + "grad_norm": 0.14473502337932587, + "learning_rate": 9.588773128419906e-05, + "loss": 0.0284, + "step": 3470 + }, + { + "epoch": 1.6353383458646618, + "grad_norm": 0.16135524213314056, + "learning_rate": 9.58548349689915e-05, + "loss": 0.0254, + "step": 3480 + }, + { + "epoch": 1.6400375939849625, + "grad_norm": 0.2047393023967743, + "learning_rate": 9.582181328841611e-05, + "loss": 0.0317, + "step": 3490 + }, + { + "epoch": 1.6447368421052633, + "grad_norm": 0.1870441883802414, + "learning_rate": 9.578866633275288e-05, + "loss": 0.0285, + "step": 3500 + }, + { + "epoch": 1.649436090225564, + "grad_norm": 0.2362082153558731, + "learning_rate": 9.575539419262434e-05, + "loss": 0.0299, + "step": 3510 + }, + { + "epoch": 1.6541353383458648, + "grad_norm": 0.3200538158416748, + "learning_rate": 9.572199695899522e-05, + "loss": 0.0334, + "step": 3520 + }, + { + "epoch": 1.6588345864661656, + "grad_norm": 0.22211411595344543, + "learning_rate": 9.568847472317232e-05, + "loss": 0.0243, + "step": 3530 + }, + { + "epoch": 1.6635338345864663, + "grad_norm": 0.27358049154281616, + "learning_rate": 9.565482757680415e-05, + "loss": 0.037, + "step": 3540 + }, + { + "epoch": 1.668233082706767, + "grad_norm": 0.23407623171806335, + "learning_rate": 9.562105561188069e-05, + "loss": 0.0278, + "step": 3550 + }, + { + "epoch": 1.6729323308270678, + "grad_norm": 0.17386211454868317, + "learning_rate": 9.558715892073323e-05, + "loss": 0.0315, + "step": 3560 + }, + { + "epoch": 1.6776315789473686, + "grad_norm": 0.2037278711795807, + "learning_rate": 9.555313759603402e-05, + "loss": 0.0359, + "step": 3570 + }, + { + "epoch": 1.6823308270676691, + "grad_norm": 0.2268248200416565, + "learning_rate": 9.551899173079607e-05, + "loss": 0.0376, + "step": 3580 + }, + { + "epoch": 1.6870300751879699, + "grad_norm": 0.23089633882045746, + "learning_rate": 9.548472141837286e-05, + "loss": 0.0376, + "step": 3590 + }, + { + "epoch": 1.6917293233082706, + "grad_norm": 0.19939178228378296, + "learning_rate": 9.545032675245813e-05, + "loss": 0.0341, + "step": 3600 + }, + { + "epoch": 1.6964285714285714, + "grad_norm": 0.24425296485424042, + "learning_rate": 9.541580782708557e-05, + "loss": 0.0333, + "step": 3610 + }, + { + "epoch": 1.7011278195488722, + "grad_norm": 0.3158224821090698, + "learning_rate": 9.538116473662861e-05, + "loss": 0.0311, + "step": 3620 + }, + { + "epoch": 1.705827067669173, + "grad_norm": 0.2468244433403015, + "learning_rate": 9.534639757580013e-05, + "loss": 0.0325, + "step": 3630 + }, + { + "epoch": 1.7105263157894737, + "grad_norm": 0.27000561356544495, + "learning_rate": 9.531150643965223e-05, + "loss": 0.0258, + "step": 3640 + }, + { + "epoch": 1.7152255639097744, + "grad_norm": 0.22154955565929413, + "learning_rate": 9.527649142357596e-05, + "loss": 0.0334, + "step": 3650 + }, + { + "epoch": 1.7199248120300752, + "grad_norm": 0.2872529625892639, + "learning_rate": 9.524135262330098e-05, + "loss": 0.0276, + "step": 3660 + }, + { + "epoch": 1.724624060150376, + "grad_norm": 0.2597917914390564, + "learning_rate": 9.520609013489547e-05, + "loss": 0.0333, + "step": 3670 + }, + { + "epoch": 1.7293233082706767, + "grad_norm": 0.2070426195859909, + "learning_rate": 9.517070405476575e-05, + "loss": 0.0224, + "step": 3680 + }, + { + "epoch": 1.7340225563909775, + "grad_norm": 0.25321757793426514, + "learning_rate": 9.513519447965595e-05, + "loss": 0.0283, + "step": 3690 + }, + { + "epoch": 1.7387218045112782, + "grad_norm": 0.25070440769195557, + "learning_rate": 9.509956150664796e-05, + "loss": 0.0299, + "step": 3700 + }, + { + "epoch": 1.743421052631579, + "grad_norm": 0.1794224977493286, + "learning_rate": 9.50638052331609e-05, + "loss": 0.033, + "step": 3710 + }, + { + "epoch": 1.7481203007518797, + "grad_norm": 0.24292895197868347, + "learning_rate": 9.502792575695112e-05, + "loss": 0.0347, + "step": 3720 + }, + { + "epoch": 1.7528195488721805, + "grad_norm": 0.16750149428844452, + "learning_rate": 9.499192317611167e-05, + "loss": 0.0269, + "step": 3730 + }, + { + "epoch": 1.7575187969924813, + "grad_norm": 0.2226608693599701, + "learning_rate": 9.49557975890723e-05, + "loss": 0.032, + "step": 3740 + }, + { + "epoch": 1.7622180451127818, + "grad_norm": 0.1929808109998703, + "learning_rate": 9.491954909459895e-05, + "loss": 0.0362, + "step": 3750 + }, + { + "epoch": 1.7669172932330826, + "grad_norm": 0.2228640466928482, + "learning_rate": 9.488317779179361e-05, + "loss": 0.0216, + "step": 3760 + }, + { + "epoch": 1.7716165413533833, + "grad_norm": 0.20736654102802277, + "learning_rate": 9.484668378009408e-05, + "loss": 0.0231, + "step": 3770 + }, + { + "epoch": 1.776315789473684, + "grad_norm": 0.1721428632736206, + "learning_rate": 9.481006715927351e-05, + "loss": 0.0255, + "step": 3780 + }, + { + "epoch": 1.7810150375939848, + "grad_norm": 0.21083158254623413, + "learning_rate": 9.477332802944044e-05, + "loss": 0.0303, + "step": 3790 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.2370145171880722, + "learning_rate": 9.473646649103818e-05, + "loss": 0.0417, + "step": 3800 + }, + { + "epoch": 1.7904135338345863, + "grad_norm": 0.2074321210384369, + "learning_rate": 9.46994826448448e-05, + "loss": 0.0337, + "step": 3810 + }, + { + "epoch": 1.795112781954887, + "grad_norm": 0.32189542055130005, + "learning_rate": 9.46623765919727e-05, + "loss": 0.0285, + "step": 3820 + }, + { + "epoch": 1.7998120300751879, + "grad_norm": 0.12926824390888214, + "learning_rate": 9.462514843386845e-05, + "loss": 0.0278, + "step": 3830 + }, + { + "epoch": 1.8045112781954886, + "grad_norm": 0.2261073887348175, + "learning_rate": 9.458779827231237e-05, + "loss": 0.0286, + "step": 3840 + }, + { + "epoch": 1.8092105263157894, + "grad_norm": 0.18350914120674133, + "learning_rate": 9.45503262094184e-05, + "loss": 0.0258, + "step": 3850 + }, + { + "epoch": 1.8139097744360901, + "grad_norm": 0.2196865826845169, + "learning_rate": 9.451273234763371e-05, + "loss": 0.0376, + "step": 3860 + }, + { + "epoch": 1.818609022556391, + "grad_norm": 0.18123769760131836, + "learning_rate": 9.447501678973852e-05, + "loss": 0.0356, + "step": 3870 + }, + { + "epoch": 1.8233082706766917, + "grad_norm": 0.21060392260551453, + "learning_rate": 9.443717963884569e-05, + "loss": 0.0317, + "step": 3880 + }, + { + "epoch": 1.8280075187969924, + "grad_norm": 0.2036607414484024, + "learning_rate": 9.439922099840054e-05, + "loss": 0.0315, + "step": 3890 + }, + { + "epoch": 1.8327067669172932, + "grad_norm": 0.18484705686569214, + "learning_rate": 9.43611409721806e-05, + "loss": 0.0307, + "step": 3900 + }, + { + "epoch": 1.837406015037594, + "grad_norm": 0.20937314629554749, + "learning_rate": 9.432293966429514e-05, + "loss": 0.026, + "step": 3910 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.2676234245300293, + "learning_rate": 9.428461717918511e-05, + "loss": 0.0255, + "step": 3920 + }, + { + "epoch": 1.8468045112781954, + "grad_norm": 0.20652443170547485, + "learning_rate": 9.424617362162271e-05, + "loss": 0.0288, + "step": 3930 + }, + { + "epoch": 1.8515037593984962, + "grad_norm": 0.3014736473560333, + "learning_rate": 9.420760909671118e-05, + "loss": 0.0337, + "step": 3940 + }, + { + "epoch": 1.856203007518797, + "grad_norm": 0.2299966812133789, + "learning_rate": 9.416892370988444e-05, + "loss": 0.0317, + "step": 3950 + }, + { + "epoch": 1.8609022556390977, + "grad_norm": 0.2134593427181244, + "learning_rate": 9.413011756690685e-05, + "loss": 0.0416, + "step": 3960 + }, + { + "epoch": 1.8656015037593985, + "grad_norm": 0.18940375745296478, + "learning_rate": 9.409119077387294e-05, + "loss": 0.0316, + "step": 3970 + }, + { + "epoch": 1.8703007518796992, + "grad_norm": 0.25739434361457825, + "learning_rate": 9.405214343720707e-05, + "loss": 0.0252, + "step": 3980 + }, + { + "epoch": 1.875, + "grad_norm": 0.21260878443717957, + "learning_rate": 9.401297566366318e-05, + "loss": 0.0241, + "step": 3990 + }, + { + "epoch": 1.8796992481203008, + "grad_norm": 0.26923254132270813, + "learning_rate": 9.397368756032445e-05, + "loss": 0.0239, + "step": 4000 + }, + { + "epoch": 1.8843984962406015, + "grad_norm": 0.18807107210159302, + "learning_rate": 9.393427923460308e-05, + "loss": 0.0266, + "step": 4010 + }, + { + "epoch": 1.8890977443609023, + "grad_norm": 0.18463344871997833, + "learning_rate": 9.389475079423988e-05, + "loss": 0.0228, + "step": 4020 + }, + { + "epoch": 1.893796992481203, + "grad_norm": 0.1900807023048401, + "learning_rate": 9.385510234730415e-05, + "loss": 0.0271, + "step": 4030 + }, + { + "epoch": 1.8984962406015038, + "grad_norm": 0.22632406651973724, + "learning_rate": 9.381533400219318e-05, + "loss": 0.0262, + "step": 4040 + }, + { + "epoch": 1.9031954887218046, + "grad_norm": 0.22867454588413239, + "learning_rate": 9.377544586763215e-05, + "loss": 0.0386, + "step": 4050 + }, + { + "epoch": 1.9078947368421053, + "grad_norm": 0.21906429529190063, + "learning_rate": 9.373543805267368e-05, + "loss": 0.0313, + "step": 4060 + }, + { + "epoch": 1.912593984962406, + "grad_norm": 0.21118290722370148, + "learning_rate": 9.369531066669758e-05, + "loss": 0.0368, + "step": 4070 + }, + { + "epoch": 1.9172932330827068, + "grad_norm": 0.2828110456466675, + "learning_rate": 9.365506381941066e-05, + "loss": 0.0313, + "step": 4080 + }, + { + "epoch": 1.9219924812030076, + "grad_norm": 0.29733067750930786, + "learning_rate": 9.36146976208462e-05, + "loss": 0.0287, + "step": 4090 + }, + { + "epoch": 1.9266917293233083, + "grad_norm": 0.17516322433948517, + "learning_rate": 9.357421218136386e-05, + "loss": 0.0313, + "step": 4100 + }, + { + "epoch": 1.931390977443609, + "grad_norm": 0.13433349132537842, + "learning_rate": 9.353360761164931e-05, + "loss": 0.0235, + "step": 4110 + }, + { + "epoch": 1.9360902255639099, + "grad_norm": 0.19060957431793213, + "learning_rate": 9.349288402271388e-05, + "loss": 0.0267, + "step": 4120 + }, + { + "epoch": 1.9407894736842106, + "grad_norm": 0.2099418193101883, + "learning_rate": 9.345204152589428e-05, + "loss": 0.022, + "step": 4130 + }, + { + "epoch": 1.9454887218045114, + "grad_norm": 0.16266584396362305, + "learning_rate": 9.341108023285238e-05, + "loss": 0.023, + "step": 4140 + }, + { + "epoch": 1.9501879699248121, + "grad_norm": 0.21736647188663483, + "learning_rate": 9.337000025557476e-05, + "loss": 0.03, + "step": 4150 + }, + { + "epoch": 1.954887218045113, + "grad_norm": 0.19137585163116455, + "learning_rate": 9.332880170637252e-05, + "loss": 0.024, + "step": 4160 + }, + { + "epoch": 1.9595864661654137, + "grad_norm": 0.22384802997112274, + "learning_rate": 9.328748469788093e-05, + "loss": 0.0251, + "step": 4170 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 0.21163040399551392, + "learning_rate": 9.32460493430591e-05, + "loss": 0.0279, + "step": 4180 + }, + { + "epoch": 1.9689849624060152, + "grad_norm": 0.21178165078163147, + "learning_rate": 9.320449575518972e-05, + "loss": 0.0279, + "step": 4190 + }, + { + "epoch": 1.973684210526316, + "grad_norm": 0.17716725170612335, + "learning_rate": 9.316282404787871e-05, + "loss": 0.0211, + "step": 4200 + }, + { + "epoch": 1.9783834586466167, + "grad_norm": 0.20710819959640503, + "learning_rate": 9.31210343350549e-05, + "loss": 0.0227, + "step": 4210 + }, + { + "epoch": 1.9830827067669174, + "grad_norm": 0.21356646716594696, + "learning_rate": 9.30791267309698e-05, + "loss": 0.0262, + "step": 4220 + }, + { + "epoch": 1.9877819548872182, + "grad_norm": 0.24351871013641357, + "learning_rate": 9.30371013501972e-05, + "loss": 0.0231, + "step": 4230 + }, + { + "epoch": 1.9924812030075187, + "grad_norm": 0.1973067820072174, + "learning_rate": 9.299495830763286e-05, + "loss": 0.02, + "step": 4240 + }, + { + "epoch": 1.9971804511278195, + "grad_norm": 0.24003633856773376, + "learning_rate": 9.295269771849427e-05, + "loss": 0.0296, + "step": 4250 + }, + { + "epoch": 2.0018796992481205, + "grad_norm": 0.18152223527431488, + "learning_rate": 9.291031969832026e-05, + "loss": 0.0231, + "step": 4260 + }, + { + "epoch": 2.0065789473684212, + "grad_norm": 0.18007569015026093, + "learning_rate": 9.286782436297073e-05, + "loss": 0.0223, + "step": 4270 + }, + { + "epoch": 2.011278195488722, + "grad_norm": 0.1332777440547943, + "learning_rate": 9.282521182862629e-05, + "loss": 0.0263, + "step": 4280 + }, + { + "epoch": 2.0159774436090228, + "grad_norm": 0.2537460923194885, + "learning_rate": 9.278248221178798e-05, + "loss": 0.0421, + "step": 4290 + }, + { + "epoch": 2.0206766917293235, + "grad_norm": 0.25093963742256165, + "learning_rate": 9.273963562927695e-05, + "loss": 0.0238, + "step": 4300 + }, + { + "epoch": 2.0253759398496243, + "grad_norm": 0.1842799037694931, + "learning_rate": 9.269667219823412e-05, + "loss": 0.0209, + "step": 4310 + }, + { + "epoch": 2.030075187969925, + "grad_norm": 0.2580346465110779, + "learning_rate": 9.265359203611987e-05, + "loss": 0.0257, + "step": 4320 + }, + { + "epoch": 2.0347744360902253, + "grad_norm": 0.17334118485450745, + "learning_rate": 9.261039526071374e-05, + "loss": 0.0273, + "step": 4330 + }, + { + "epoch": 2.039473684210526, + "grad_norm": 0.19151276350021362, + "learning_rate": 9.256708199011401e-05, + "loss": 0.0212, + "step": 4340 + }, + { + "epoch": 2.044172932330827, + "grad_norm": 0.23879149556159973, + "learning_rate": 9.252365234273755e-05, + "loss": 0.0221, + "step": 4350 + }, + { + "epoch": 2.0488721804511276, + "grad_norm": 0.2724449634552002, + "learning_rate": 9.248010643731935e-05, + "loss": 0.0298, + "step": 4360 + }, + { + "epoch": 2.0535714285714284, + "grad_norm": 0.16645994782447815, + "learning_rate": 9.243644439291223e-05, + "loss": 0.0253, + "step": 4370 + }, + { + "epoch": 2.058270676691729, + "grad_norm": 0.19133131206035614, + "learning_rate": 9.239266632888659e-05, + "loss": 0.0262, + "step": 4380 + }, + { + "epoch": 2.06296992481203, + "grad_norm": 0.2156103402376175, + "learning_rate": 9.234877236492997e-05, + "loss": 0.0334, + "step": 4390 + }, + { + "epoch": 2.0676691729323307, + "grad_norm": 0.17188550531864166, + "learning_rate": 9.230476262104677e-05, + "loss": 0.0189, + "step": 4400 + }, + { + "epoch": 2.0723684210526314, + "grad_norm": 0.2865235507488251, + "learning_rate": 9.226063721755799e-05, + "loss": 0.0203, + "step": 4410 + }, + { + "epoch": 2.077067669172932, + "grad_norm": 0.16109667718410492, + "learning_rate": 9.221639627510076e-05, + "loss": 0.0212, + "step": 4420 + }, + { + "epoch": 2.081766917293233, + "grad_norm": 0.22182904183864594, + "learning_rate": 9.217203991462815e-05, + "loss": 0.0242, + "step": 4430 + }, + { + "epoch": 2.0864661654135337, + "grad_norm": 0.20680855214595795, + "learning_rate": 9.212756825740873e-05, + "loss": 0.0283, + "step": 4440 + }, + { + "epoch": 2.0911654135338344, + "grad_norm": 0.12871260941028595, + "learning_rate": 9.208298142502636e-05, + "loss": 0.0268, + "step": 4450 + }, + { + "epoch": 2.095864661654135, + "grad_norm": 0.2089470773935318, + "learning_rate": 9.20382795393797e-05, + "loss": 0.028, + "step": 4460 + }, + { + "epoch": 2.100563909774436, + "grad_norm": 0.20934775471687317, + "learning_rate": 9.199346272268199e-05, + "loss": 0.0298, + "step": 4470 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.24874700605869293, + "learning_rate": 9.194853109746074e-05, + "loss": 0.02, + "step": 4480 + }, + { + "epoch": 2.1099624060150375, + "grad_norm": 0.2033921480178833, + "learning_rate": 9.190348478655724e-05, + "loss": 0.0226, + "step": 4490 + }, + { + "epoch": 2.1146616541353382, + "grad_norm": 0.29149314761161804, + "learning_rate": 9.185832391312644e-05, + "loss": 0.0397, + "step": 4500 + }, + { + "epoch": 2.119360902255639, + "grad_norm": 0.180942565202713, + "learning_rate": 9.18130486006364e-05, + "loss": 0.0206, + "step": 4510 + }, + { + "epoch": 2.1240601503759398, + "grad_norm": 0.21293392777442932, + "learning_rate": 9.176765897286813e-05, + "loss": 0.0356, + "step": 4520 + }, + { + "epoch": 2.1287593984962405, + "grad_norm": 0.23218363523483276, + "learning_rate": 9.17221551539151e-05, + "loss": 0.029, + "step": 4530 + }, + { + "epoch": 2.1334586466165413, + "grad_norm": 0.19981394708156586, + "learning_rate": 9.167653726818305e-05, + "loss": 0.0204, + "step": 4540 + }, + { + "epoch": 2.138157894736842, + "grad_norm": 0.1897270381450653, + "learning_rate": 9.163080544038952e-05, + "loss": 0.0321, + "step": 4550 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.20262902975082397, + "learning_rate": 9.158495979556358e-05, + "loss": 0.0284, + "step": 4560 + }, + { + "epoch": 2.1475563909774436, + "grad_norm": 0.1939176321029663, + "learning_rate": 9.153900045904549e-05, + "loss": 0.0283, + "step": 4570 + }, + { + "epoch": 2.1522556390977443, + "grad_norm": 0.23823733627796173, + "learning_rate": 9.14929275564863e-05, + "loss": 0.0291, + "step": 4580 + }, + { + "epoch": 2.156954887218045, + "grad_norm": 0.22697608172893524, + "learning_rate": 9.144674121384757e-05, + "loss": 0.0226, + "step": 4590 + }, + { + "epoch": 2.161654135338346, + "grad_norm": 0.19017039239406586, + "learning_rate": 9.140044155740101e-05, + "loss": 0.02, + "step": 4600 + }, + { + "epoch": 2.1663533834586466, + "grad_norm": 0.16114512085914612, + "learning_rate": 9.135402871372808e-05, + "loss": 0.0298, + "step": 4610 + }, + { + "epoch": 2.1710526315789473, + "grad_norm": 0.24711167812347412, + "learning_rate": 9.130750280971978e-05, + "loss": 0.0312, + "step": 4620 + }, + { + "epoch": 2.175751879699248, + "grad_norm": 0.1635618507862091, + "learning_rate": 9.126086397257612e-05, + "loss": 0.0192, + "step": 4630 + }, + { + "epoch": 2.180451127819549, + "grad_norm": 0.23969772458076477, + "learning_rate": 9.121411232980588e-05, + "loss": 0.0307, + "step": 4640 + }, + { + "epoch": 2.1851503759398496, + "grad_norm": 0.21272121369838715, + "learning_rate": 9.116724800922629e-05, + "loss": 0.0295, + "step": 4650 + }, + { + "epoch": 2.1898496240601504, + "grad_norm": 0.3185397684574127, + "learning_rate": 9.112027113896262e-05, + "loss": 0.0323, + "step": 4660 + }, + { + "epoch": 2.194548872180451, + "grad_norm": 0.1866607964038849, + "learning_rate": 9.107318184744781e-05, + "loss": 0.0334, + "step": 4670 + }, + { + "epoch": 2.199248120300752, + "grad_norm": 0.20084838569164276, + "learning_rate": 9.102598026342222e-05, + "loss": 0.0185, + "step": 4680 + }, + { + "epoch": 2.2039473684210527, + "grad_norm": 0.13514482975006104, + "learning_rate": 9.097866651593317e-05, + "loss": 0.0297, + "step": 4690 + }, + { + "epoch": 2.2086466165413534, + "grad_norm": 0.22178612649440765, + "learning_rate": 9.093124073433463e-05, + "loss": 0.0288, + "step": 4700 + }, + { + "epoch": 2.213345864661654, + "grad_norm": 0.22371554374694824, + "learning_rate": 9.088370304828685e-05, + "loss": 0.0245, + "step": 4710 + }, + { + "epoch": 2.218045112781955, + "grad_norm": 0.14162443578243256, + "learning_rate": 9.083605358775612e-05, + "loss": 0.0227, + "step": 4720 + }, + { + "epoch": 2.2227443609022557, + "grad_norm": 0.1731516569852829, + "learning_rate": 9.078829248301417e-05, + "loss": 0.0211, + "step": 4730 + }, + { + "epoch": 2.2274436090225564, + "grad_norm": 0.17966926097869873, + "learning_rate": 9.074041986463808e-05, + "loss": 0.0309, + "step": 4740 + }, + { + "epoch": 2.232142857142857, + "grad_norm": 0.20831747353076935, + "learning_rate": 9.069243586350975e-05, + "loss": 0.0227, + "step": 4750 + }, + { + "epoch": 2.236842105263158, + "grad_norm": 0.1533919721841812, + "learning_rate": 9.064434061081562e-05, + "loss": 0.0223, + "step": 4760 + }, + { + "epoch": 2.2415413533834587, + "grad_norm": 0.2533239722251892, + "learning_rate": 9.059613423804623e-05, + "loss": 0.0257, + "step": 4770 + }, + { + "epoch": 2.2462406015037595, + "grad_norm": 0.1808866560459137, + "learning_rate": 9.0547816876996e-05, + "loss": 0.0323, + "step": 4780 + }, + { + "epoch": 2.2509398496240602, + "grad_norm": 0.2061844766139984, + "learning_rate": 9.049938865976275e-05, + "loss": 0.0365, + "step": 4790 + }, + { + "epoch": 2.255639097744361, + "grad_norm": 0.18600964546203613, + "learning_rate": 9.045084971874738e-05, + "loss": 0.0202, + "step": 4800 + }, + { + "epoch": 2.2603383458646618, + "grad_norm": 0.25376129150390625, + "learning_rate": 9.040220018665347e-05, + "loss": 0.0259, + "step": 4810 + }, + { + "epoch": 2.2650375939849625, + "grad_norm": 0.2140718698501587, + "learning_rate": 9.035344019648702e-05, + "loss": 0.0196, + "step": 4820 + }, + { + "epoch": 2.2697368421052633, + "grad_norm": 0.21682094037532806, + "learning_rate": 9.030456988155596e-05, + "loss": 0.0224, + "step": 4830 + }, + { + "epoch": 2.274436090225564, + "grad_norm": 0.2152978926897049, + "learning_rate": 9.025558937546988e-05, + "loss": 0.0245, + "step": 4840 + }, + { + "epoch": 2.279135338345865, + "grad_norm": 0.20505715906620026, + "learning_rate": 9.020649881213958e-05, + "loss": 0.0196, + "step": 4850 + }, + { + "epoch": 2.2838345864661656, + "grad_norm": 0.19191820919513702, + "learning_rate": 9.015729832577681e-05, + "loss": 0.021, + "step": 4860 + }, + { + "epoch": 2.2885338345864663, + "grad_norm": 0.2543664276599884, + "learning_rate": 9.010798805089384e-05, + "loss": 0.025, + "step": 4870 + }, + { + "epoch": 2.293233082706767, + "grad_norm": 0.241551473736763, + "learning_rate": 9.005856812230304e-05, + "loss": 0.0261, + "step": 4880 + }, + { + "epoch": 2.297932330827068, + "grad_norm": 0.19452232122421265, + "learning_rate": 9.000903867511666e-05, + "loss": 0.0256, + "step": 4890 + }, + { + "epoch": 2.3026315789473686, + "grad_norm": 0.16211168467998505, + "learning_rate": 8.995939984474624e-05, + "loss": 0.0231, + "step": 4900 + }, + { + "epoch": 2.3073308270676693, + "grad_norm": 0.15496976673603058, + "learning_rate": 8.990965176690252e-05, + "loss": 0.027, + "step": 4910 + }, + { + "epoch": 2.31203007518797, + "grad_norm": 0.2289806455373764, + "learning_rate": 8.98597945775948e-05, + "loss": 0.0303, + "step": 4920 + }, + { + "epoch": 2.316729323308271, + "grad_norm": 0.15668641030788422, + "learning_rate": 8.980982841313074e-05, + "loss": 0.0255, + "step": 4930 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 0.2543923258781433, + "learning_rate": 8.975975341011596e-05, + "loss": 0.029, + "step": 4940 + }, + { + "epoch": 2.326127819548872, + "grad_norm": 0.2210584133863449, + "learning_rate": 8.970956970545355e-05, + "loss": 0.0217, + "step": 4950 + }, + { + "epoch": 2.3308270676691727, + "grad_norm": 0.25968605279922485, + "learning_rate": 8.965927743634391e-05, + "loss": 0.0283, + "step": 4960 + }, + { + "epoch": 2.3355263157894735, + "grad_norm": 0.22940151393413544, + "learning_rate": 8.96088767402841e-05, + "loss": 0.0216, + "step": 4970 + }, + { + "epoch": 2.340225563909774, + "grad_norm": 0.1916259378194809, + "learning_rate": 8.955836775506776e-05, + "loss": 0.0197, + "step": 4980 + }, + { + "epoch": 2.344924812030075, + "grad_norm": 0.14167390763759613, + "learning_rate": 8.950775061878453e-05, + "loss": 0.0244, + "step": 4990 + }, + { + "epoch": 2.3496240601503757, + "grad_norm": 0.22982220351696014, + "learning_rate": 8.945702546981969e-05, + "loss": 0.0267, + "step": 5000 + }, + { + "epoch": 2.3543233082706765, + "grad_norm": 0.21236097812652588, + "learning_rate": 8.940619244685388e-05, + "loss": 0.0268, + "step": 5010 + }, + { + "epoch": 2.3590225563909772, + "grad_norm": 0.16472011804580688, + "learning_rate": 8.935525168886262e-05, + "loss": 0.0279, + "step": 5020 + }, + { + "epoch": 2.363721804511278, + "grad_norm": 0.18868985772132874, + "learning_rate": 8.930420333511606e-05, + "loss": 0.0232, + "step": 5030 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.2023637294769287, + "learning_rate": 8.92530475251784e-05, + "loss": 0.0208, + "step": 5040 + }, + { + "epoch": 2.3731203007518795, + "grad_norm": 0.22184379398822784, + "learning_rate": 8.920178439890765e-05, + "loss": 0.0249, + "step": 5050 + }, + { + "epoch": 2.3778195488721803, + "grad_norm": 0.23359708487987518, + "learning_rate": 8.91504140964553e-05, + "loss": 0.0182, + "step": 5060 + }, + { + "epoch": 2.382518796992481, + "grad_norm": 0.17309461534023285, + "learning_rate": 8.909893675826574e-05, + "loss": 0.0276, + "step": 5070 + }, + { + "epoch": 2.387218045112782, + "grad_norm": 0.2259601503610611, + "learning_rate": 8.90473525250761e-05, + "loss": 0.0271, + "step": 5080 + }, + { + "epoch": 2.3919172932330826, + "grad_norm": 0.24056588113307953, + "learning_rate": 8.899566153791566e-05, + "loss": 0.0313, + "step": 5090 + }, + { + "epoch": 2.3966165413533833, + "grad_norm": 0.1999393254518509, + "learning_rate": 8.894386393810563e-05, + "loss": 0.0329, + "step": 5100 + }, + { + "epoch": 2.401315789473684, + "grad_norm": 0.1775711178779602, + "learning_rate": 8.889195986725865e-05, + "loss": 0.0236, + "step": 5110 + }, + { + "epoch": 2.406015037593985, + "grad_norm": 0.2337716966867447, + "learning_rate": 8.883994946727849e-05, + "loss": 0.0267, + "step": 5120 + }, + { + "epoch": 2.4107142857142856, + "grad_norm": 0.3311557471752167, + "learning_rate": 8.878783288035957e-05, + "loss": 0.0206, + "step": 5130 + }, + { + "epoch": 2.4154135338345863, + "grad_norm": 0.21677523851394653, + "learning_rate": 8.873561024898668e-05, + "loss": 0.028, + "step": 5140 + }, + { + "epoch": 2.420112781954887, + "grad_norm": 0.2683987617492676, + "learning_rate": 8.868328171593448e-05, + "loss": 0.0251, + "step": 5150 + }, + { + "epoch": 2.424812030075188, + "grad_norm": 0.18326067924499512, + "learning_rate": 8.863084742426719e-05, + "loss": 0.0245, + "step": 5160 + }, + { + "epoch": 2.4295112781954886, + "grad_norm": 0.16746392846107483, + "learning_rate": 8.857830751733815e-05, + "loss": 0.0248, + "step": 5170 + }, + { + "epoch": 2.4342105263157894, + "grad_norm": 0.1843288242816925, + "learning_rate": 8.852566213878947e-05, + "loss": 0.0283, + "step": 5180 + }, + { + "epoch": 2.43890977443609, + "grad_norm": 0.2047811597585678, + "learning_rate": 8.84729114325516e-05, + "loss": 0.0205, + "step": 5190 + }, + { + "epoch": 2.443609022556391, + "grad_norm": 0.2268257886171341, + "learning_rate": 8.842005554284296e-05, + "loss": 0.0255, + "step": 5200 + }, + { + "epoch": 2.4483082706766917, + "grad_norm": 0.27279675006866455, + "learning_rate": 8.836709461416952e-05, + "loss": 0.0281, + "step": 5210 + }, + { + "epoch": 2.4530075187969924, + "grad_norm": 0.14802409708499908, + "learning_rate": 8.831402879132446e-05, + "loss": 0.0195, + "step": 5220 + }, + { + "epoch": 2.457706766917293, + "grad_norm": 0.1476491093635559, + "learning_rate": 8.82608582193877e-05, + "loss": 0.0255, + "step": 5230 + }, + { + "epoch": 2.462406015037594, + "grad_norm": 0.1853826940059662, + "learning_rate": 8.820758304372557e-05, + "loss": 0.0245, + "step": 5240 + }, + { + "epoch": 2.4671052631578947, + "grad_norm": 0.2378029078245163, + "learning_rate": 8.815420340999033e-05, + "loss": 0.0205, + "step": 5250 + }, + { + "epoch": 2.4718045112781954, + "grad_norm": 0.1497645527124405, + "learning_rate": 8.810071946411989e-05, + "loss": 0.0239, + "step": 5260 + }, + { + "epoch": 2.476503759398496, + "grad_norm": 0.23548933863639832, + "learning_rate": 8.804713135233731e-05, + "loss": 0.0217, + "step": 5270 + }, + { + "epoch": 2.481203007518797, + "grad_norm": 0.2269853800535202, + "learning_rate": 8.799343922115044e-05, + "loss": 0.0183, + "step": 5280 + }, + { + "epoch": 2.4859022556390977, + "grad_norm": 0.22214250266551971, + "learning_rate": 8.79396432173515e-05, + "loss": 0.0221, + "step": 5290 + }, + { + "epoch": 2.4906015037593985, + "grad_norm": 0.2275986224412918, + "learning_rate": 8.788574348801675e-05, + "loss": 0.0311, + "step": 5300 + }, + { + "epoch": 2.4953007518796992, + "grad_norm": 0.21562987565994263, + "learning_rate": 8.783174018050594e-05, + "loss": 0.0388, + "step": 5310 + }, + { + "epoch": 2.5, + "grad_norm": 0.20339468121528625, + "learning_rate": 8.77776334424621e-05, + "loss": 0.0264, + "step": 5320 + }, + { + "epoch": 2.5046992481203008, + "grad_norm": 0.1768190860748291, + "learning_rate": 8.772342342181095e-05, + "loss": 0.0278, + "step": 5330 + }, + { + "epoch": 2.5093984962406015, + "grad_norm": 0.13602979481220245, + "learning_rate": 8.766911026676064e-05, + "loss": 0.0241, + "step": 5340 + }, + { + "epoch": 2.5140977443609023, + "grad_norm": 0.34090307354927063, + "learning_rate": 8.761469412580125e-05, + "loss": 0.0304, + "step": 5350 + }, + { + "epoch": 2.518796992481203, + "grad_norm": 0.2651212811470032, + "learning_rate": 8.756017514770443e-05, + "loss": 0.0264, + "step": 5360 + }, + { + "epoch": 2.523496240601504, + "grad_norm": 0.1824772208929062, + "learning_rate": 8.750555348152298e-05, + "loss": 0.0189, + "step": 5370 + }, + { + "epoch": 2.5281954887218046, + "grad_norm": 0.2285076379776001, + "learning_rate": 8.745082927659047e-05, + "loss": 0.0282, + "step": 5380 + }, + { + "epoch": 2.5328947368421053, + "grad_norm": 0.23857127130031586, + "learning_rate": 8.739600268252078e-05, + "loss": 0.0309, + "step": 5390 + }, + { + "epoch": 2.537593984962406, + "grad_norm": 0.20476020872592926, + "learning_rate": 8.73410738492077e-05, + "loss": 0.0265, + "step": 5400 + }, + { + "epoch": 2.542293233082707, + "grad_norm": 0.2645573616027832, + "learning_rate": 8.728604292682459e-05, + "loss": 0.0245, + "step": 5410 + }, + { + "epoch": 2.5469924812030076, + "grad_norm": 0.18955717980861664, + "learning_rate": 8.723091006582389e-05, + "loss": 0.0279, + "step": 5420 + }, + { + "epoch": 2.5516917293233083, + "grad_norm": 0.20659516751766205, + "learning_rate": 8.717567541693673e-05, + "loss": 0.0191, + "step": 5430 + }, + { + "epoch": 2.556390977443609, + "grad_norm": 0.19456426799297333, + "learning_rate": 8.71203391311725e-05, + "loss": 0.0286, + "step": 5440 + }, + { + "epoch": 2.56109022556391, + "grad_norm": 0.15492534637451172, + "learning_rate": 8.706490135981855e-05, + "loss": 0.0258, + "step": 5450 + }, + { + "epoch": 2.5657894736842106, + "grad_norm": 0.21852245926856995, + "learning_rate": 8.700936225443959e-05, + "loss": 0.0227, + "step": 5460 + }, + { + "epoch": 2.5704887218045114, + "grad_norm": 0.17760106921195984, + "learning_rate": 8.695372196687743e-05, + "loss": 0.0284, + "step": 5470 + }, + { + "epoch": 2.575187969924812, + "grad_norm": 0.20838047564029694, + "learning_rate": 8.689798064925049e-05, + "loss": 0.0239, + "step": 5480 + }, + { + "epoch": 2.579887218045113, + "grad_norm": 0.19678470492362976, + "learning_rate": 8.684213845395339e-05, + "loss": 0.022, + "step": 5490 + }, + { + "epoch": 2.5845864661654137, + "grad_norm": 0.15934813022613525, + "learning_rate": 8.678619553365659e-05, + "loss": 0.0192, + "step": 5500 + }, + { + "epoch": 2.5892857142857144, + "grad_norm": 0.21788392961025238, + "learning_rate": 8.673015204130586e-05, + "loss": 0.0209, + "step": 5510 + }, + { + "epoch": 2.593984962406015, + "grad_norm": 0.19440729916095734, + "learning_rate": 8.6674008130122e-05, + "loss": 0.0224, + "step": 5520 + }, + { + "epoch": 2.598684210526316, + "grad_norm": 0.2586307227611542, + "learning_rate": 8.661776395360029e-05, + "loss": 0.0246, + "step": 5530 + }, + { + "epoch": 2.6033834586466167, + "grad_norm": 0.22773458063602448, + "learning_rate": 8.656141966551019e-05, + "loss": 0.0232, + "step": 5540 + }, + { + "epoch": 2.6080827067669174, + "grad_norm": 0.1501941978931427, + "learning_rate": 8.650497541989482e-05, + "loss": 0.0189, + "step": 5550 + }, + { + "epoch": 2.612781954887218, + "grad_norm": 0.2053869515657425, + "learning_rate": 8.644843137107059e-05, + "loss": 0.0278, + "step": 5560 + }, + { + "epoch": 2.617481203007519, + "grad_norm": 0.20903167128562927, + "learning_rate": 8.639178767362676e-05, + "loss": 0.0265, + "step": 5570 + }, + { + "epoch": 2.6221804511278197, + "grad_norm": 0.16303761303424835, + "learning_rate": 8.633504448242505e-05, + "loss": 0.0174, + "step": 5580 + }, + { + "epoch": 2.6268796992481205, + "grad_norm": 0.18065503239631653, + "learning_rate": 8.627820195259918e-05, + "loss": 0.0223, + "step": 5590 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.18412037193775177, + "learning_rate": 8.622126023955446e-05, + "loss": 0.0287, + "step": 5600 + }, + { + "epoch": 2.636278195488722, + "grad_norm": 0.1771506518125534, + "learning_rate": 8.616421949896734e-05, + "loss": 0.0221, + "step": 5610 + }, + { + "epoch": 2.6409774436090228, + "grad_norm": 0.20367956161499023, + "learning_rate": 8.610707988678503e-05, + "loss": 0.0262, + "step": 5620 + }, + { + "epoch": 2.6456766917293235, + "grad_norm": 0.2133282572031021, + "learning_rate": 8.604984155922506e-05, + "loss": 0.016, + "step": 5630 + }, + { + "epoch": 2.6503759398496243, + "grad_norm": 0.2280464917421341, + "learning_rate": 8.599250467277483e-05, + "loss": 0.0226, + "step": 5640 + }, + { + "epoch": 2.655075187969925, + "grad_norm": 0.20852112770080566, + "learning_rate": 8.59350693841912e-05, + "loss": 0.0273, + "step": 5650 + }, + { + "epoch": 2.659774436090226, + "grad_norm": 0.11491677910089493, + "learning_rate": 8.587753585050004e-05, + "loss": 0.0231, + "step": 5660 + }, + { + "epoch": 2.6644736842105265, + "grad_norm": 0.13895131647586823, + "learning_rate": 8.581990422899585e-05, + "loss": 0.0159, + "step": 5670 + }, + { + "epoch": 2.6691729323308273, + "grad_norm": 0.1562396138906479, + "learning_rate": 8.576217467724128e-05, + "loss": 0.029, + "step": 5680 + }, + { + "epoch": 2.673872180451128, + "grad_norm": 0.16841505467891693, + "learning_rate": 8.570434735306671e-05, + "loss": 0.0164, + "step": 5690 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 0.19897368550300598, + "learning_rate": 8.564642241456986e-05, + "loss": 0.03, + "step": 5700 + }, + { + "epoch": 2.6832706766917296, + "grad_norm": 0.21792681515216827, + "learning_rate": 8.558840002011528e-05, + "loss": 0.031, + "step": 5710 + }, + { + "epoch": 2.6879699248120303, + "grad_norm": 0.15453846752643585, + "learning_rate": 8.553028032833397e-05, + "loss": 0.0211, + "step": 5720 + }, + { + "epoch": 2.692669172932331, + "grad_norm": 0.129106804728508, + "learning_rate": 8.547206349812298e-05, + "loss": 0.0238, + "step": 5730 + }, + { + "epoch": 2.6973684210526314, + "grad_norm": 0.2648111581802368, + "learning_rate": 8.541374968864487e-05, + "loss": 0.0249, + "step": 5740 + }, + { + "epoch": 2.702067669172932, + "grad_norm": 0.23966850340366364, + "learning_rate": 8.535533905932738e-05, + "loss": 0.0224, + "step": 5750 + }, + { + "epoch": 2.706766917293233, + "grad_norm": 0.2106933295726776, + "learning_rate": 8.529683176986295e-05, + "loss": 0.02, + "step": 5760 + }, + { + "epoch": 2.7114661654135337, + "grad_norm": 0.20059479773044586, + "learning_rate": 8.523822798020827e-05, + "loss": 0.0172, + "step": 5770 + }, + { + "epoch": 2.7161654135338344, + "grad_norm": 0.14424027502536774, + "learning_rate": 8.517952785058385e-05, + "loss": 0.0256, + "step": 5780 + }, + { + "epoch": 2.720864661654135, + "grad_norm": 0.1661170870065689, + "learning_rate": 8.512073154147362e-05, + "loss": 0.0246, + "step": 5790 + }, + { + "epoch": 2.725563909774436, + "grad_norm": 0.23051202297210693, + "learning_rate": 8.506183921362443e-05, + "loss": 0.0223, + "step": 5800 + }, + { + "epoch": 2.7302631578947367, + "grad_norm": 0.24159590899944305, + "learning_rate": 8.500285102804568e-05, + "loss": 0.0368, + "step": 5810 + }, + { + "epoch": 2.7349624060150375, + "grad_norm": 0.20334158837795258, + "learning_rate": 8.494376714600878e-05, + "loss": 0.0337, + "step": 5820 + }, + { + "epoch": 2.7396616541353382, + "grad_norm": 0.15019096434116364, + "learning_rate": 8.488458772904684e-05, + "loss": 0.0246, + "step": 5830 + }, + { + "epoch": 2.744360902255639, + "grad_norm": 0.15891961753368378, + "learning_rate": 8.482531293895412e-05, + "loss": 0.0215, + "step": 5840 + }, + { + "epoch": 2.7490601503759398, + "grad_norm": 0.16506938636302948, + "learning_rate": 8.476594293778561e-05, + "loss": 0.0256, + "step": 5850 + }, + { + "epoch": 2.7537593984962405, + "grad_norm": 0.18463429808616638, + "learning_rate": 8.470647788785665e-05, + "loss": 0.0197, + "step": 5860 + }, + { + "epoch": 2.7584586466165413, + "grad_norm": 0.186926007270813, + "learning_rate": 8.46469179517424e-05, + "loss": 0.0199, + "step": 5870 + }, + { + "epoch": 2.763157894736842, + "grad_norm": 0.19735975563526154, + "learning_rate": 8.458726329227747e-05, + "loss": 0.0269, + "step": 5880 + }, + { + "epoch": 2.767857142857143, + "grad_norm": 0.17075762152671814, + "learning_rate": 8.452751407255541e-05, + "loss": 0.0213, + "step": 5890 + }, + { + "epoch": 2.7725563909774436, + "grad_norm": 0.25880053639411926, + "learning_rate": 8.44676704559283e-05, + "loss": 0.0222, + "step": 5900 + }, + { + "epoch": 2.7772556390977443, + "grad_norm": 0.34592244029045105, + "learning_rate": 8.44077326060063e-05, + "loss": 0.0295, + "step": 5910 + }, + { + "epoch": 2.781954887218045, + "grad_norm": 0.16668987274169922, + "learning_rate": 8.434770068665723e-05, + "loss": 0.0228, + "step": 5920 + }, + { + "epoch": 2.786654135338346, + "grad_norm": 0.14265859127044678, + "learning_rate": 8.428757486200603e-05, + "loss": 0.0252, + "step": 5930 + }, + { + "epoch": 2.7913533834586466, + "grad_norm": 0.2153574824333191, + "learning_rate": 8.422735529643444e-05, + "loss": 0.0268, + "step": 5940 + }, + { + "epoch": 2.7960526315789473, + "grad_norm": 0.20310519635677338, + "learning_rate": 8.416704215458043e-05, + "loss": 0.0205, + "step": 5950 + }, + { + "epoch": 2.800751879699248, + "grad_norm": 0.20665310323238373, + "learning_rate": 8.410663560133784e-05, + "loss": 0.022, + "step": 5960 + }, + { + "epoch": 2.805451127819549, + "grad_norm": 0.23281921446323395, + "learning_rate": 8.404613580185585e-05, + "loss": 0.0215, + "step": 5970 + }, + { + "epoch": 2.8101503759398496, + "grad_norm": 0.16719530522823334, + "learning_rate": 8.398554292153866e-05, + "loss": 0.0158, + "step": 5980 + }, + { + "epoch": 2.8148496240601504, + "grad_norm": 0.15998616814613342, + "learning_rate": 8.392485712604483e-05, + "loss": 0.0224, + "step": 5990 + }, + { + "epoch": 2.819548872180451, + "grad_norm": 0.15419815480709076, + "learning_rate": 8.386407858128706e-05, + "loss": 0.0285, + "step": 6000 + }, + { + "epoch": 2.824248120300752, + "grad_norm": 0.18911899626255035, + "learning_rate": 8.380320745343153e-05, + "loss": 0.0227, + "step": 6010 + }, + { + "epoch": 2.8289473684210527, + "grad_norm": 0.1339092254638672, + "learning_rate": 8.37422439088976e-05, + "loss": 0.0219, + "step": 6020 + }, + { + "epoch": 2.8336466165413534, + "grad_norm": 0.22458699345588684, + "learning_rate": 8.368118811435726e-05, + "loss": 0.0334, + "step": 6030 + }, + { + "epoch": 2.838345864661654, + "grad_norm": 0.1807672381401062, + "learning_rate": 8.362004023673474e-05, + "loss": 0.0302, + "step": 6040 + }, + { + "epoch": 2.843045112781955, + "grad_norm": 0.19045840203762054, + "learning_rate": 8.355880044320598e-05, + "loss": 0.0262, + "step": 6050 + }, + { + "epoch": 2.8477443609022557, + "grad_norm": 0.1385623812675476, + "learning_rate": 8.349746890119826e-05, + "loss": 0.0204, + "step": 6060 + }, + { + "epoch": 2.8524436090225564, + "grad_norm": 0.13099630177021027, + "learning_rate": 8.343604577838964e-05, + "loss": 0.0219, + "step": 6070 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.16968640685081482, + "learning_rate": 8.337453124270863e-05, + "loss": 0.0179, + "step": 6080 + }, + { + "epoch": 2.861842105263158, + "grad_norm": 0.19834963977336884, + "learning_rate": 8.331292546233362e-05, + "loss": 0.0246, + "step": 6090 + }, + { + "epoch": 2.8665413533834587, + "grad_norm": 0.21443118155002594, + "learning_rate": 8.32512286056924e-05, + "loss": 0.0263, + "step": 6100 + }, + { + "epoch": 2.8712406015037595, + "grad_norm": 0.21712978184223175, + "learning_rate": 8.318944084146192e-05, + "loss": 0.0311, + "step": 6110 + }, + { + "epoch": 2.8759398496240602, + "grad_norm": 0.18117016553878784, + "learning_rate": 8.31275623385675e-05, + "loss": 0.0205, + "step": 6120 + }, + { + "epoch": 2.880639097744361, + "grad_norm": 0.18793556094169617, + "learning_rate": 8.306559326618259e-05, + "loss": 0.0314, + "step": 6130 + }, + { + "epoch": 2.8853383458646618, + "grad_norm": 0.18074850738048553, + "learning_rate": 8.300353379372834e-05, + "loss": 0.0219, + "step": 6140 + }, + { + "epoch": 2.8900375939849625, + "grad_norm": 0.22259654104709625, + "learning_rate": 8.29413840908729e-05, + "loss": 0.0226, + "step": 6150 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.24867089092731476, + "learning_rate": 8.287914432753123e-05, + "loss": 0.0231, + "step": 6160 + }, + { + "epoch": 2.899436090225564, + "grad_norm": 0.21988849341869354, + "learning_rate": 8.281681467386446e-05, + "loss": 0.0257, + "step": 6170 + }, + { + "epoch": 2.904135338345865, + "grad_norm": 0.18946348130702972, + "learning_rate": 8.275439530027948e-05, + "loss": 0.0238, + "step": 6180 + }, + { + "epoch": 2.9088345864661656, + "grad_norm": 0.21483662724494934, + "learning_rate": 8.269188637742846e-05, + "loss": 0.0232, + "step": 6190 + }, + { + "epoch": 2.9135338345864663, + "grad_norm": 0.24274252355098724, + "learning_rate": 8.262928807620843e-05, + "loss": 0.0292, + "step": 6200 + }, + { + "epoch": 2.918233082706767, + "grad_norm": 0.1813192218542099, + "learning_rate": 8.256660056776076e-05, + "loss": 0.0223, + "step": 6210 + }, + { + "epoch": 2.922932330827068, + "grad_norm": 0.25920167565345764, + "learning_rate": 8.250382402347065e-05, + "loss": 0.0341, + "step": 6220 + }, + { + "epoch": 2.9276315789473686, + "grad_norm": 0.19504626095294952, + "learning_rate": 8.244095861496686e-05, + "loss": 0.0217, + "step": 6230 + }, + { + "epoch": 2.932330827067669, + "grad_norm": 0.1617734581232071, + "learning_rate": 8.237800451412095e-05, + "loss": 0.0238, + "step": 6240 + }, + { + "epoch": 2.9370300751879697, + "grad_norm": 0.15244753658771515, + "learning_rate": 8.231496189304704e-05, + "loss": 0.0244, + "step": 6250 + }, + { + "epoch": 2.9417293233082704, + "grad_norm": 0.20108844339847565, + "learning_rate": 8.225183092410128e-05, + "loss": 0.0304, + "step": 6260 + }, + { + "epoch": 2.946428571428571, + "grad_norm": 0.23991245031356812, + "learning_rate": 8.218861177988129e-05, + "loss": 0.0286, + "step": 6270 + }, + { + "epoch": 2.951127819548872, + "grad_norm": 0.17502142488956451, + "learning_rate": 8.212530463322583e-05, + "loss": 0.0241, + "step": 6280 + }, + { + "epoch": 2.9558270676691727, + "grad_norm": 0.1822666972875595, + "learning_rate": 8.206190965721419e-05, + "loss": 0.0276, + "step": 6290 + }, + { + "epoch": 2.9605263157894735, + "grad_norm": 0.22813008725643158, + "learning_rate": 8.199842702516583e-05, + "loss": 0.026, + "step": 6300 + }, + { + "epoch": 2.965225563909774, + "grad_norm": 0.18035642802715302, + "learning_rate": 8.193485691063985e-05, + "loss": 0.0196, + "step": 6310 + }, + { + "epoch": 2.969924812030075, + "grad_norm": 0.20626391470432281, + "learning_rate": 8.18711994874345e-05, + "loss": 0.021, + "step": 6320 + }, + { + "epoch": 2.9746240601503757, + "grad_norm": 0.23911024630069733, + "learning_rate": 8.180745492958674e-05, + "loss": 0.0269, + "step": 6330 + }, + { + "epoch": 2.9793233082706765, + "grad_norm": 0.21398058533668518, + "learning_rate": 8.174362341137177e-05, + "loss": 0.0282, + "step": 6340 + }, + { + "epoch": 2.9840225563909772, + "grad_norm": 0.14263466000556946, + "learning_rate": 8.167970510730253e-05, + "loss": 0.0192, + "step": 6350 + }, + { + "epoch": 2.988721804511278, + "grad_norm": 0.11814016848802567, + "learning_rate": 8.161570019212921e-05, + "loss": 0.018, + "step": 6360 + }, + { + "epoch": 2.9934210526315788, + "grad_norm": 0.17552949488162994, + "learning_rate": 8.155160884083881e-05, + "loss": 0.0198, + "step": 6370 + }, + { + "epoch": 2.9981203007518795, + "grad_norm": 0.2032817304134369, + "learning_rate": 8.148743122865463e-05, + "loss": 0.0202, + "step": 6380 + }, + { + "epoch": 3.0028195488721803, + "grad_norm": 0.21947944164276123, + "learning_rate": 8.14231675310358e-05, + "loss": 0.0322, + "step": 6390 + }, + { + "epoch": 3.007518796992481, + "grad_norm": 0.18210892379283905, + "learning_rate": 8.135881792367686e-05, + "loss": 0.0278, + "step": 6400 + }, + { + "epoch": 3.012218045112782, + "grad_norm": 0.16313527524471283, + "learning_rate": 8.129438258250712e-05, + "loss": 0.0277, + "step": 6410 + }, + { + "epoch": 3.0169172932330826, + "grad_norm": 0.167706698179245, + "learning_rate": 8.12298616836904e-05, + "loss": 0.0217, + "step": 6420 + }, + { + "epoch": 3.0216165413533833, + "grad_norm": 0.16999609768390656, + "learning_rate": 8.116525540362434e-05, + "loss": 0.0227, + "step": 6430 + }, + { + "epoch": 3.026315789473684, + "grad_norm": 0.13280713558197021, + "learning_rate": 8.110056391894005e-05, + "loss": 0.0179, + "step": 6440 + }, + { + "epoch": 3.031015037593985, + "grad_norm": 0.2560625970363617, + "learning_rate": 8.103578740650156e-05, + "loss": 0.0299, + "step": 6450 + }, + { + "epoch": 3.0357142857142856, + "grad_norm": 0.12923632562160492, + "learning_rate": 8.097092604340542e-05, + "loss": 0.0177, + "step": 6460 + }, + { + "epoch": 3.0404135338345863, + "grad_norm": 0.1939212530851364, + "learning_rate": 8.090598000698009e-05, + "loss": 0.0223, + "step": 6470 + }, + { + "epoch": 3.045112781954887, + "grad_norm": 0.17978033423423767, + "learning_rate": 8.084094947478556e-05, + "loss": 0.0146, + "step": 6480 + }, + { + "epoch": 3.049812030075188, + "grad_norm": 0.1513349711894989, + "learning_rate": 8.077583462461283e-05, + "loss": 0.0154, + "step": 6490 + }, + { + "epoch": 3.0545112781954886, + "grad_norm": 0.20076999068260193, + "learning_rate": 8.07106356344834e-05, + "loss": 0.0193, + "step": 6500 + }, + { + "epoch": 3.0592105263157894, + "grad_norm": 0.19262535870075226, + "learning_rate": 8.064535268264883e-05, + "loss": 0.0338, + "step": 6510 + }, + { + "epoch": 3.06390977443609, + "grad_norm": 0.16921037435531616, + "learning_rate": 8.057998594759022e-05, + "loss": 0.0314, + "step": 6520 + }, + { + "epoch": 3.068609022556391, + "grad_norm": 0.18066401779651642, + "learning_rate": 8.051453560801772e-05, + "loss": 0.0173, + "step": 6530 + }, + { + "epoch": 3.0733082706766917, + "grad_norm": 0.1616375744342804, + "learning_rate": 8.044900184287007e-05, + "loss": 0.0286, + "step": 6540 + }, + { + "epoch": 3.0780075187969924, + "grad_norm": 0.15176372230052948, + "learning_rate": 8.038338483131407e-05, + "loss": 0.0238, + "step": 6550 + }, + { + "epoch": 3.082706766917293, + "grad_norm": 0.21256138384342194, + "learning_rate": 8.031768475274413e-05, + "loss": 0.014, + "step": 6560 + }, + { + "epoch": 3.087406015037594, + "grad_norm": 0.19960719347000122, + "learning_rate": 8.025190178678175e-05, + "loss": 0.0279, + "step": 6570 + }, + { + "epoch": 3.0921052631578947, + "grad_norm": 0.1822323501110077, + "learning_rate": 8.018603611327504e-05, + "loss": 0.0201, + "step": 6580 + }, + { + "epoch": 3.0968045112781954, + "grad_norm": 0.16403251886367798, + "learning_rate": 8.012008791229826e-05, + "loss": 0.0194, + "step": 6590 + }, + { + "epoch": 3.101503759398496, + "grad_norm": 0.19543980062007904, + "learning_rate": 8.005405736415126e-05, + "loss": 0.0208, + "step": 6600 + }, + { + "epoch": 3.106203007518797, + "grad_norm": 0.19898360967636108, + "learning_rate": 7.998794464935904e-05, + "loss": 0.0185, + "step": 6610 + }, + { + "epoch": 3.1109022556390977, + "grad_norm": 0.13113637268543243, + "learning_rate": 7.992174994867123e-05, + "loss": 0.0284, + "step": 6620 + }, + { + "epoch": 3.1156015037593985, + "grad_norm": 0.21924848854541779, + "learning_rate": 7.985547344306161e-05, + "loss": 0.019, + "step": 6630 + }, + { + "epoch": 3.1203007518796992, + "grad_norm": 0.13814175128936768, + "learning_rate": 7.978911531372765e-05, + "loss": 0.0192, + "step": 6640 + }, + { + "epoch": 3.125, + "grad_norm": 0.17765717208385468, + "learning_rate": 7.972267574208991e-05, + "loss": 0.0165, + "step": 6650 + }, + { + "epoch": 3.1296992481203008, + "grad_norm": 0.1574385166168213, + "learning_rate": 7.965615490979163e-05, + "loss": 0.0271, + "step": 6660 + }, + { + "epoch": 3.1343984962406015, + "grad_norm": 0.17050959169864655, + "learning_rate": 7.958955299869825e-05, + "loss": 0.0246, + "step": 6670 + }, + { + "epoch": 3.1390977443609023, + "grad_norm": 0.17193661630153656, + "learning_rate": 7.952287019089685e-05, + "loss": 0.0184, + "step": 6680 + }, + { + "epoch": 3.143796992481203, + "grad_norm": 0.2032015323638916, + "learning_rate": 7.945610666869568e-05, + "loss": 0.0198, + "step": 6690 + }, + { + "epoch": 3.148496240601504, + "grad_norm": 0.2095039039850235, + "learning_rate": 7.938926261462366e-05, + "loss": 0.0335, + "step": 6700 + }, + { + "epoch": 3.1531954887218046, + "grad_norm": 0.18302005529403687, + "learning_rate": 7.932233821142987e-05, + "loss": 0.0215, + "step": 6710 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.18510419130325317, + "learning_rate": 7.925533364208309e-05, + "loss": 0.0163, + "step": 6720 + }, + { + "epoch": 3.162593984962406, + "grad_norm": 0.22432270646095276, + "learning_rate": 7.918824908977123e-05, + "loss": 0.0324, + "step": 6730 + }, + { + "epoch": 3.167293233082707, + "grad_norm": 0.14236007630825043, + "learning_rate": 7.912108473790092e-05, + "loss": 0.0251, + "step": 6740 + }, + { + "epoch": 3.1719924812030076, + "grad_norm": 0.21034064888954163, + "learning_rate": 7.905384077009693e-05, + "loss": 0.0255, + "step": 6750 + }, + { + "epoch": 3.1766917293233083, + "grad_norm": 0.18089409172534943, + "learning_rate": 7.898651737020166e-05, + "loss": 0.0301, + "step": 6760 + }, + { + "epoch": 3.181390977443609, + "grad_norm": 0.1281910240650177, + "learning_rate": 7.891911472227478e-05, + "loss": 0.0271, + "step": 6770 + }, + { + "epoch": 3.18609022556391, + "grad_norm": 0.2349163144826889, + "learning_rate": 7.88516330105925e-05, + "loss": 0.0231, + "step": 6780 + }, + { + "epoch": 3.1907894736842106, + "grad_norm": 0.15001404285430908, + "learning_rate": 7.878407241964729e-05, + "loss": 0.0196, + "step": 6790 + }, + { + "epoch": 3.1954887218045114, + "grad_norm": 0.20565535128116608, + "learning_rate": 7.871643313414718e-05, + "loss": 0.0339, + "step": 6800 + }, + { + "epoch": 3.200187969924812, + "grad_norm": 0.24226559698581696, + "learning_rate": 7.864871533901544e-05, + "loss": 0.0239, + "step": 6810 + }, + { + "epoch": 3.204887218045113, + "grad_norm": 0.17688481509685516, + "learning_rate": 7.858091921938988e-05, + "loss": 0.021, + "step": 6820 + }, + { + "epoch": 3.2095864661654137, + "grad_norm": 0.22673030197620392, + "learning_rate": 7.851304496062254e-05, + "loss": 0.0267, + "step": 6830 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 0.21045391261577606, + "learning_rate": 7.844509274827907e-05, + "loss": 0.0199, + "step": 6840 + }, + { + "epoch": 3.218984962406015, + "grad_norm": 0.14403589069843292, + "learning_rate": 7.837706276813819e-05, + "loss": 0.0214, + "step": 6850 + }, + { + "epoch": 3.223684210526316, + "grad_norm": 0.1579636186361313, + "learning_rate": 7.830895520619128e-05, + "loss": 0.022, + "step": 6860 + }, + { + "epoch": 3.2283834586466167, + "grad_norm": 0.1557644158601761, + "learning_rate": 7.824077024864179e-05, + "loss": 0.0192, + "step": 6870 + }, + { + "epoch": 3.2330827067669174, + "grad_norm": 0.22842907905578613, + "learning_rate": 7.817250808190483e-05, + "loss": 0.0312, + "step": 6880 + }, + { + "epoch": 3.237781954887218, + "grad_norm": 0.09012676775455475, + "learning_rate": 7.810416889260653e-05, + "loss": 0.0202, + "step": 6890 + }, + { + "epoch": 3.242481203007519, + "grad_norm": 0.14062803983688354, + "learning_rate": 7.803575286758364e-05, + "loss": 0.0193, + "step": 6900 + }, + { + "epoch": 3.2471804511278197, + "grad_norm": 0.20320133864879608, + "learning_rate": 7.796726019388295e-05, + "loss": 0.0269, + "step": 6910 + }, + { + "epoch": 3.2518796992481205, + "grad_norm": 0.21432797610759735, + "learning_rate": 7.789869105876083e-05, + "loss": 0.0225, + "step": 6920 + }, + { + "epoch": 3.2565789473684212, + "grad_norm": 0.1976686418056488, + "learning_rate": 7.783004564968263e-05, + "loss": 0.0225, + "step": 6930 + }, + { + "epoch": 3.261278195488722, + "grad_norm": 0.12431466579437256, + "learning_rate": 7.776132415432234e-05, + "loss": 0.0201, + "step": 6940 + }, + { + "epoch": 3.2659774436090228, + "grad_norm": 0.11760783195495605, + "learning_rate": 7.769252676056187e-05, + "loss": 0.0213, + "step": 6950 + }, + { + "epoch": 3.2706766917293235, + "grad_norm": 0.2153691202402115, + "learning_rate": 7.762365365649067e-05, + "loss": 0.0239, + "step": 6960 + }, + { + "epoch": 3.2753759398496243, + "grad_norm": 0.12900856137275696, + "learning_rate": 7.755470503040516e-05, + "loss": 0.0204, + "step": 6970 + }, + { + "epoch": 3.280075187969925, + "grad_norm": 0.16941407322883606, + "learning_rate": 7.748568107080832e-05, + "loss": 0.0211, + "step": 6980 + }, + { + "epoch": 3.284774436090226, + "grad_norm": 0.2066490203142166, + "learning_rate": 7.741658196640892e-05, + "loss": 0.0321, + "step": 6990 + }, + { + "epoch": 3.2894736842105265, + "grad_norm": 0.1458861082792282, + "learning_rate": 7.734740790612136e-05, + "loss": 0.0245, + "step": 7000 + }, + { + "epoch": 3.2941729323308273, + "grad_norm": 0.19808563590049744, + "learning_rate": 7.727815907906481e-05, + "loss": 0.0218, + "step": 7010 + }, + { + "epoch": 3.298872180451128, + "grad_norm": 0.1738310009241104, + "learning_rate": 7.720883567456298e-05, + "loss": 0.02, + "step": 7020 + }, + { + "epoch": 3.3035714285714284, + "grad_norm": 0.18204271793365479, + "learning_rate": 7.713943788214337e-05, + "loss": 0.0187, + "step": 7030 + }, + { + "epoch": 3.308270676691729, + "grad_norm": 0.19523444771766663, + "learning_rate": 7.70699658915369e-05, + "loss": 0.0225, + "step": 7040 + }, + { + "epoch": 3.31296992481203, + "grad_norm": 0.15334999561309814, + "learning_rate": 7.700041989267736e-05, + "loss": 0.0215, + "step": 7050 + }, + { + "epoch": 3.3176691729323307, + "grad_norm": 0.19023378193378448, + "learning_rate": 7.693080007570084e-05, + "loss": 0.0205, + "step": 7060 + }, + { + "epoch": 3.3223684210526314, + "grad_norm": 0.14410124719142914, + "learning_rate": 7.686110663094525e-05, + "loss": 0.0283, + "step": 7070 + }, + { + "epoch": 3.327067669172932, + "grad_norm": 0.17559373378753662, + "learning_rate": 7.679133974894983e-05, + "loss": 0.0256, + "step": 7080 + }, + { + "epoch": 3.331766917293233, + "grad_norm": 0.17834362387657166, + "learning_rate": 7.672149962045457e-05, + "loss": 0.0236, + "step": 7090 + }, + { + "epoch": 3.3364661654135337, + "grad_norm": 0.22900669276714325, + "learning_rate": 7.66515864363997e-05, + "loss": 0.0251, + "step": 7100 + }, + { + "epoch": 3.3411654135338344, + "grad_norm": 0.15852832794189453, + "learning_rate": 7.658160038792518e-05, + "loss": 0.0239, + "step": 7110 + }, + { + "epoch": 3.345864661654135, + "grad_norm": 0.16738687455654144, + "learning_rate": 7.651154166637025e-05, + "loss": 0.0204, + "step": 7120 + }, + { + "epoch": 3.350563909774436, + "grad_norm": 0.1924857199192047, + "learning_rate": 7.644141046327271e-05, + "loss": 0.0271, + "step": 7130 + }, + { + "epoch": 3.3552631578947367, + "grad_norm": 0.15029247105121613, + "learning_rate": 7.637120697036866e-05, + "loss": 0.0249, + "step": 7140 + }, + { + "epoch": 3.3599624060150375, + "grad_norm": 0.16167303919792175, + "learning_rate": 7.630093137959171e-05, + "loss": 0.0193, + "step": 7150 + }, + { + "epoch": 3.3646616541353382, + "grad_norm": 0.16990002989768982, + "learning_rate": 7.623058388307269e-05, + "loss": 0.0305, + "step": 7160 + }, + { + "epoch": 3.369360902255639, + "grad_norm": 0.22318102419376373, + "learning_rate": 7.616016467313891e-05, + "loss": 0.0186, + "step": 7170 + }, + { + "epoch": 3.3740601503759398, + "grad_norm": 0.15061348676681519, + "learning_rate": 7.608967394231387e-05, + "loss": 0.0213, + "step": 7180 + }, + { + "epoch": 3.3787593984962405, + "grad_norm": 0.2030361294746399, + "learning_rate": 7.60191118833165e-05, + "loss": 0.019, + "step": 7190 + }, + { + "epoch": 3.3834586466165413, + "grad_norm": 0.09996183216571808, + "learning_rate": 7.594847868906076e-05, + "loss": 0.0297, + "step": 7200 + }, + { + "epoch": 3.388157894736842, + "grad_norm": 0.22531284391880035, + "learning_rate": 7.587777455265515e-05, + "loss": 0.0189, + "step": 7210 + }, + { + "epoch": 3.392857142857143, + "grad_norm": 0.10503160953521729, + "learning_rate": 7.580699966740201e-05, + "loss": 0.0174, + "step": 7220 + }, + { + "epoch": 3.3975563909774436, + "grad_norm": 0.11209885030984879, + "learning_rate": 7.573615422679726e-05, + "loss": 0.0212, + "step": 7230 + }, + { + "epoch": 3.4022556390977443, + "grad_norm": 0.16246545314788818, + "learning_rate": 7.566523842452958e-05, + "loss": 0.019, + "step": 7240 + }, + { + "epoch": 3.406954887218045, + "grad_norm": 0.1651008427143097, + "learning_rate": 7.559425245448006e-05, + "loss": 0.0241, + "step": 7250 + }, + { + "epoch": 3.411654135338346, + "grad_norm": 0.12217399477958679, + "learning_rate": 7.552319651072164e-05, + "loss": 0.0218, + "step": 7260 + }, + { + "epoch": 3.4163533834586466, + "grad_norm": 0.16352419555187225, + "learning_rate": 7.545207078751857e-05, + "loss": 0.0269, + "step": 7270 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.16248418390750885, + "learning_rate": 7.538087547932585e-05, + "loss": 0.0177, + "step": 7280 + }, + { + "epoch": 3.425751879699248, + "grad_norm": 0.17357391119003296, + "learning_rate": 7.530961078078873e-05, + "loss": 0.0195, + "step": 7290 + }, + { + "epoch": 3.430451127819549, + "grad_norm": 0.19342300295829773, + "learning_rate": 7.52382768867422e-05, + "loss": 0.029, + "step": 7300 + }, + { + "epoch": 3.4351503759398496, + "grad_norm": 0.1559091955423355, + "learning_rate": 7.516687399221037e-05, + "loss": 0.0156, + "step": 7310 + }, + { + "epoch": 3.4398496240601504, + "grad_norm": 0.156753808259964, + "learning_rate": 7.509540229240601e-05, + "loss": 0.0218, + "step": 7320 + }, + { + "epoch": 3.444548872180451, + "grad_norm": 0.25587204098701477, + "learning_rate": 7.50238619827301e-05, + "loss": 0.0195, + "step": 7330 + }, + { + "epoch": 3.449248120300752, + "grad_norm": 0.14930440485477448, + "learning_rate": 7.495225325877103e-05, + "loss": 0.0213, + "step": 7340 + }, + { + "epoch": 3.4539473684210527, + "grad_norm": 0.1230517104268074, + "learning_rate": 7.488057631630437e-05, + "loss": 0.0253, + "step": 7350 + }, + { + "epoch": 3.4586466165413534, + "grad_norm": 0.2551042437553406, + "learning_rate": 7.480883135129211e-05, + "loss": 0.0337, + "step": 7360 + }, + { + "epoch": 3.463345864661654, + "grad_norm": 0.2262675166130066, + "learning_rate": 7.473701855988227e-05, + "loss": 0.0245, + "step": 7370 + }, + { + "epoch": 3.468045112781955, + "grad_norm": 0.24242740869522095, + "learning_rate": 7.466513813840825e-05, + "loss": 0.0215, + "step": 7380 + }, + { + "epoch": 3.4727443609022557, + "grad_norm": 0.13138991594314575, + "learning_rate": 7.45931902833884e-05, + "loss": 0.0215, + "step": 7390 + }, + { + "epoch": 3.4774436090225564, + "grad_norm": 0.1672552078962326, + "learning_rate": 7.452117519152542e-05, + "loss": 0.0163, + "step": 7400 + }, + { + "epoch": 3.482142857142857, + "grad_norm": 0.1623755842447281, + "learning_rate": 7.444909305970578e-05, + "loss": 0.0241, + "step": 7410 + }, + { + "epoch": 3.486842105263158, + "grad_norm": 0.13752855360507965, + "learning_rate": 7.437694408499933e-05, + "loss": 0.0303, + "step": 7420 + }, + { + "epoch": 3.4915413533834587, + "grad_norm": 0.23802992701530457, + "learning_rate": 7.430472846465856e-05, + "loss": 0.0263, + "step": 7430 + }, + { + "epoch": 3.4962406015037595, + "grad_norm": 0.19350318610668182, + "learning_rate": 7.423244639611826e-05, + "loss": 0.0204, + "step": 7440 + }, + { + "epoch": 3.5009398496240602, + "grad_norm": 0.15366598963737488, + "learning_rate": 7.416009807699482e-05, + "loss": 0.021, + "step": 7450 + }, + { + "epoch": 3.505639097744361, + "grad_norm": 0.16275855898857117, + "learning_rate": 7.408768370508576e-05, + "loss": 0.0181, + "step": 7460 + }, + { + "epoch": 3.5103383458646618, + "grad_norm": 0.22359305620193481, + "learning_rate": 7.401520347836926e-05, + "loss": 0.0238, + "step": 7470 + }, + { + "epoch": 3.5150375939849625, + "grad_norm": 0.143357053399086, + "learning_rate": 7.394265759500348e-05, + "loss": 0.0231, + "step": 7480 + }, + { + "epoch": 3.5197368421052633, + "grad_norm": 0.14645427465438843, + "learning_rate": 7.387004625332608e-05, + "loss": 0.0231, + "step": 7490 + }, + { + "epoch": 3.524436090225564, + "grad_norm": 0.17870672047138214, + "learning_rate": 7.379736965185368e-05, + "loss": 0.0253, + "step": 7500 + }, + { + "epoch": 3.529135338345865, + "grad_norm": 0.12523587048053741, + "learning_rate": 7.372462798928137e-05, + "loss": 0.0172, + "step": 7510 + }, + { + "epoch": 3.5338345864661656, + "grad_norm": 0.15407587587833405, + "learning_rate": 7.365182146448205e-05, + "loss": 0.0302, + "step": 7520 + }, + { + "epoch": 3.5385338345864663, + "grad_norm": 0.178106427192688, + "learning_rate": 7.357895027650598e-05, + "loss": 0.0266, + "step": 7530 + }, + { + "epoch": 3.543233082706767, + "grad_norm": 0.1787080615758896, + "learning_rate": 7.350601462458024e-05, + "loss": 0.0244, + "step": 7540 + }, + { + "epoch": 3.547932330827068, + "grad_norm": 0.12952303886413574, + "learning_rate": 7.343301470810808e-05, + "loss": 0.0274, + "step": 7550 + }, + { + "epoch": 3.5526315789473686, + "grad_norm": 0.14345309138298035, + "learning_rate": 7.335995072666848e-05, + "loss": 0.0189, + "step": 7560 + }, + { + "epoch": 3.557330827067669, + "grad_norm": 0.15413424372673035, + "learning_rate": 7.328682288001561e-05, + "loss": 0.0142, + "step": 7570 + }, + { + "epoch": 3.5620300751879697, + "grad_norm": 0.19909968972206116, + "learning_rate": 7.32136313680782e-05, + "loss": 0.0193, + "step": 7580 + }, + { + "epoch": 3.5667293233082704, + "grad_norm": 0.18164218962192535, + "learning_rate": 7.3140376390959e-05, + "loss": 0.0241, + "step": 7590 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.12364038825035095, + "learning_rate": 7.30670581489344e-05, + "loss": 0.023, + "step": 7600 + }, + { + "epoch": 3.576127819548872, + "grad_norm": 0.14471431076526642, + "learning_rate": 7.299367684245362e-05, + "loss": 0.0196, + "step": 7610 + }, + { + "epoch": 3.5808270676691727, + "grad_norm": 0.13688594102859497, + "learning_rate": 7.292023267213835e-05, + "loss": 0.0186, + "step": 7620 + }, + { + "epoch": 3.5855263157894735, + "grad_norm": 0.13687384128570557, + "learning_rate": 7.284672583878219e-05, + "loss": 0.0287, + "step": 7630 + }, + { + "epoch": 3.590225563909774, + "grad_norm": 0.16403521597385406, + "learning_rate": 7.277315654334997e-05, + "loss": 0.0195, + "step": 7640 + }, + { + "epoch": 3.594924812030075, + "grad_norm": 0.15605804324150085, + "learning_rate": 7.269952498697734e-05, + "loss": 0.027, + "step": 7650 + }, + { + "epoch": 3.5996240601503757, + "grad_norm": 0.16706325113773346, + "learning_rate": 7.262583137097018e-05, + "loss": 0.0186, + "step": 7660 + }, + { + "epoch": 3.6043233082706765, + "grad_norm": 0.18399161100387573, + "learning_rate": 7.255207589680402e-05, + "loss": 0.0199, + "step": 7670 + }, + { + "epoch": 3.6090225563909772, + "grad_norm": 0.14956754446029663, + "learning_rate": 7.247825876612353e-05, + "loss": 0.0223, + "step": 7680 + }, + { + "epoch": 3.613721804511278, + "grad_norm": 0.16335678100585938, + "learning_rate": 7.240438018074189e-05, + "loss": 0.0205, + "step": 7690 + }, + { + "epoch": 3.6184210526315788, + "grad_norm": 0.15078043937683105, + "learning_rate": 7.233044034264034e-05, + "loss": 0.0166, + "step": 7700 + }, + { + "epoch": 3.6231203007518795, + "grad_norm": 0.1938415914773941, + "learning_rate": 7.225643945396757e-05, + "loss": 0.0218, + "step": 7710 + }, + { + "epoch": 3.6278195488721803, + "grad_norm": 0.1588267982006073, + "learning_rate": 7.218237771703921e-05, + "loss": 0.031, + "step": 7720 + }, + { + "epoch": 3.632518796992481, + "grad_norm": 0.17583388090133667, + "learning_rate": 7.210825533433719e-05, + "loss": 0.0244, + "step": 7730 + }, + { + "epoch": 3.637218045112782, + "grad_norm": 0.2322985827922821, + "learning_rate": 7.203407250850928e-05, + "loss": 0.0181, + "step": 7740 + }, + { + "epoch": 3.6419172932330826, + "grad_norm": 0.16796962916851044, + "learning_rate": 7.195982944236851e-05, + "loss": 0.0227, + "step": 7750 + }, + { + "epoch": 3.6466165413533833, + "grad_norm": 0.14361505210399628, + "learning_rate": 7.188552633889259e-05, + "loss": 0.0223, + "step": 7760 + }, + { + "epoch": 3.651315789473684, + "grad_norm": 0.13321304321289062, + "learning_rate": 7.181116340122336e-05, + "loss": 0.0165, + "step": 7770 + }, + { + "epoch": 3.656015037593985, + "grad_norm": 0.1332216113805771, + "learning_rate": 7.173674083266624e-05, + "loss": 0.0223, + "step": 7780 + }, + { + "epoch": 3.6607142857142856, + "grad_norm": 0.15823520720005035, + "learning_rate": 7.166225883668969e-05, + "loss": 0.0282, + "step": 7790 + }, + { + "epoch": 3.6654135338345863, + "grad_norm": 0.18819200992584229, + "learning_rate": 7.158771761692464e-05, + "loss": 0.0156, + "step": 7800 + }, + { + "epoch": 3.670112781954887, + "grad_norm": 0.1621289998292923, + "learning_rate": 7.151311737716397e-05, + "loss": 0.0222, + "step": 7810 + }, + { + "epoch": 3.674812030075188, + "grad_norm": 0.12522225081920624, + "learning_rate": 7.143845832136188e-05, + "loss": 0.0204, + "step": 7820 + }, + { + "epoch": 3.6795112781954886, + "grad_norm": 0.12633934617042542, + "learning_rate": 7.136374065363334e-05, + "loss": 0.0197, + "step": 7830 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.12287107110023499, + "learning_rate": 7.128896457825364e-05, + "loss": 0.0218, + "step": 7840 + }, + { + "epoch": 3.68890977443609, + "grad_norm": 0.17396919429302216, + "learning_rate": 7.121413029965769e-05, + "loss": 0.0214, + "step": 7850 + }, + { + "epoch": 3.693609022556391, + "grad_norm": 0.15268878638744354, + "learning_rate": 7.113923802243957e-05, + "loss": 0.0225, + "step": 7860 + }, + { + "epoch": 3.6983082706766917, + "grad_norm": 0.14335058629512787, + "learning_rate": 7.10642879513519e-05, + "loss": 0.0239, + "step": 7870 + }, + { + "epoch": 3.7030075187969924, + "grad_norm": 0.13077248632907867, + "learning_rate": 7.09892802913053e-05, + "loss": 0.0208, + "step": 7880 + }, + { + "epoch": 3.707706766917293, + "grad_norm": 0.16183945536613464, + "learning_rate": 7.091421524736784e-05, + "loss": 0.0232, + "step": 7890 + }, + { + "epoch": 3.712406015037594, + "grad_norm": 0.20572522282600403, + "learning_rate": 7.083909302476453e-05, + "loss": 0.0263, + "step": 7900 + }, + { + "epoch": 3.7171052631578947, + "grad_norm": 0.17196118831634521, + "learning_rate": 7.076391382887661e-05, + "loss": 0.0193, + "step": 7910 + }, + { + "epoch": 3.7218045112781954, + "grad_norm": 0.16815915703773499, + "learning_rate": 7.068867786524116e-05, + "loss": 0.0175, + "step": 7920 + }, + { + "epoch": 3.726503759398496, + "grad_norm": 0.20678561925888062, + "learning_rate": 7.061338533955043e-05, + "loss": 0.0229, + "step": 7930 + }, + { + "epoch": 3.731203007518797, + "grad_norm": 0.12356437742710114, + "learning_rate": 7.053803645765128e-05, + "loss": 0.0273, + "step": 7940 + }, + { + "epoch": 3.7359022556390977, + "grad_norm": 0.1457894891500473, + "learning_rate": 7.04626314255447e-05, + "loss": 0.0187, + "step": 7950 + }, + { + "epoch": 3.7406015037593985, + "grad_norm": 0.149841770529747, + "learning_rate": 7.038717044938519e-05, + "loss": 0.0201, + "step": 7960 + }, + { + "epoch": 3.7453007518796992, + "grad_norm": 0.22033792734146118, + "learning_rate": 7.031165373548014e-05, + "loss": 0.0246, + "step": 7970 + }, + { + "epoch": 3.75, + "grad_norm": 0.13069063425064087, + "learning_rate": 7.023608149028937e-05, + "loss": 0.0155, + "step": 7980 + }, + { + "epoch": 3.7546992481203008, + "grad_norm": 0.15247632563114166, + "learning_rate": 7.016045392042452e-05, + "loss": 0.0217, + "step": 7990 + }, + { + "epoch": 3.7593984962406015, + "grad_norm": 0.1565304547548294, + "learning_rate": 7.008477123264848e-05, + "loss": 0.0212, + "step": 8000 + }, + { + "epoch": 3.7640977443609023, + "grad_norm": 0.08946457505226135, + "learning_rate": 7.000903363387482e-05, + "loss": 0.0178, + "step": 8010 + }, + { + "epoch": 3.768796992481203, + "grad_norm": 0.16338147222995758, + "learning_rate": 6.993324133116726e-05, + "loss": 0.0188, + "step": 8020 + }, + { + "epoch": 3.773496240601504, + "grad_norm": 0.15769213438034058, + "learning_rate": 6.985739453173903e-05, + "loss": 0.0183, + "step": 8030 + }, + { + "epoch": 3.7781954887218046, + "grad_norm": 0.18680426478385925, + "learning_rate": 6.978149344295242e-05, + "loss": 0.0183, + "step": 8040 + }, + { + "epoch": 3.7828947368421053, + "grad_norm": 0.15789374709129333, + "learning_rate": 6.97055382723181e-05, + "loss": 0.0226, + "step": 8050 + }, + { + "epoch": 3.787593984962406, + "grad_norm": 0.14424045383930206, + "learning_rate": 6.962952922749457e-05, + "loss": 0.0205, + "step": 8060 + }, + { + "epoch": 3.792293233082707, + "grad_norm": 0.2000490128993988, + "learning_rate": 6.955346651628771e-05, + "loss": 0.022, + "step": 8070 + }, + { + "epoch": 3.7969924812030076, + "grad_norm": 0.11632633209228516, + "learning_rate": 6.947735034665002e-05, + "loss": 0.03, + "step": 8080 + }, + { + "epoch": 3.8016917293233083, + "grad_norm": 0.15219521522521973, + "learning_rate": 6.940118092668022e-05, + "loss": 0.0171, + "step": 8090 + }, + { + "epoch": 3.806390977443609, + "grad_norm": 0.16450823843479156, + "learning_rate": 6.932495846462261e-05, + "loss": 0.0203, + "step": 8100 + }, + { + "epoch": 3.81109022556391, + "grad_norm": 0.2347194105386734, + "learning_rate": 6.924868316886649e-05, + "loss": 0.0257, + "step": 8110 + }, + { + "epoch": 3.8157894736842106, + "grad_norm": 0.14800488948822021, + "learning_rate": 6.917235524794558e-05, + "loss": 0.0263, + "step": 8120 + }, + { + "epoch": 3.8204887218045114, + "grad_norm": 0.18087433278560638, + "learning_rate": 6.909597491053751e-05, + "loss": 0.0192, + "step": 8130 + }, + { + "epoch": 3.825187969924812, + "grad_norm": 0.14640933275222778, + "learning_rate": 6.901954236546323e-05, + "loss": 0.0248, + "step": 8140 + }, + { + "epoch": 3.829887218045113, + "grad_norm": 0.12826332449913025, + "learning_rate": 6.894305782168638e-05, + "loss": 0.0148, + "step": 8150 + }, + { + "epoch": 3.8345864661654137, + "grad_norm": 0.165438711643219, + "learning_rate": 6.886652148831279e-05, + "loss": 0.0189, + "step": 8160 + }, + { + "epoch": 3.8392857142857144, + "grad_norm": 0.17549659311771393, + "learning_rate": 6.878993357458986e-05, + "loss": 0.024, + "step": 8170 + }, + { + "epoch": 3.843984962406015, + "grad_norm": 0.10705628246068954, + "learning_rate": 6.871329428990602e-05, + "loss": 0.0171, + "step": 8180 + }, + { + "epoch": 3.848684210526316, + "grad_norm": 0.13803797960281372, + "learning_rate": 6.863660384379017e-05, + "loss": 0.0254, + "step": 8190 + }, + { + "epoch": 3.8533834586466167, + "grad_norm": 0.21638810634613037, + "learning_rate": 6.855986244591104e-05, + "loss": 0.0247, + "step": 8200 + }, + { + "epoch": 3.8580827067669174, + "grad_norm": 0.1485230177640915, + "learning_rate": 6.84830703060767e-05, + "loss": 0.0173, + "step": 8210 + }, + { + "epoch": 3.862781954887218, + "grad_norm": 0.12228238582611084, + "learning_rate": 6.840622763423391e-05, + "loss": 0.0222, + "step": 8220 + }, + { + "epoch": 3.867481203007519, + "grad_norm": 0.15566863119602203, + "learning_rate": 6.83293346404676e-05, + "loss": 0.0179, + "step": 8230 + }, + { + "epoch": 3.8721804511278197, + "grad_norm": 0.1872120499610901, + "learning_rate": 6.825239153500029e-05, + "loss": 0.0245, + "step": 8240 + }, + { + "epoch": 3.8768796992481205, + "grad_norm": 0.12243503332138062, + "learning_rate": 6.817539852819149e-05, + "loss": 0.0208, + "step": 8250 + }, + { + "epoch": 3.8815789473684212, + "grad_norm": 0.204155832529068, + "learning_rate": 6.809835583053715e-05, + "loss": 0.0204, + "step": 8260 + }, + { + "epoch": 3.886278195488722, + "grad_norm": 0.1609678864479065, + "learning_rate": 6.802126365266905e-05, + "loss": 0.0214, + "step": 8270 + }, + { + "epoch": 3.8909774436090228, + "grad_norm": 0.17930862307548523, + "learning_rate": 6.794412220535426e-05, + "loss": 0.0268, + "step": 8280 + }, + { + "epoch": 3.8956766917293235, + "grad_norm": 0.16851861774921417, + "learning_rate": 6.786693169949455e-05, + "loss": 0.0266, + "step": 8290 + }, + { + "epoch": 3.9003759398496243, + "grad_norm": 0.16229747235774994, + "learning_rate": 6.778969234612584e-05, + "loss": 0.0138, + "step": 8300 + }, + { + "epoch": 3.905075187969925, + "grad_norm": 0.17356090247631073, + "learning_rate": 6.771240435641754e-05, + "loss": 0.013, + "step": 8310 + }, + { + "epoch": 3.909774436090226, + "grad_norm": 0.1390371471643448, + "learning_rate": 6.763506794167208e-05, + "loss": 0.0163, + "step": 8320 + }, + { + "epoch": 3.9144736842105265, + "grad_norm": 0.17728669941425323, + "learning_rate": 6.755768331332424e-05, + "loss": 0.0254, + "step": 8330 + }, + { + "epoch": 3.9191729323308273, + "grad_norm": 0.13212069869041443, + "learning_rate": 6.748025068294067e-05, + "loss": 0.0239, + "step": 8340 + }, + { + "epoch": 3.923872180451128, + "grad_norm": 0.1477879285812378, + "learning_rate": 6.740277026221923e-05, + "loss": 0.0211, + "step": 8350 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 0.17585650086402893, + "learning_rate": 6.732524226298841e-05, + "loss": 0.0282, + "step": 8360 + }, + { + "epoch": 3.9332706766917296, + "grad_norm": 0.20357094705104828, + "learning_rate": 6.72476668972068e-05, + "loss": 0.0249, + "step": 8370 + }, + { + "epoch": 3.9379699248120303, + "grad_norm": 0.14865775406360626, + "learning_rate": 6.71700443769625e-05, + "loss": 0.0248, + "step": 8380 + }, + { + "epoch": 3.942669172932331, + "grad_norm": 0.24721617996692657, + "learning_rate": 6.709237491447249e-05, + "loss": 0.019, + "step": 8390 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.1690632849931717, + "learning_rate": 6.701465872208216e-05, + "loss": 0.0199, + "step": 8400 + }, + { + "epoch": 3.952067669172932, + "grad_norm": 0.1842128187417984, + "learning_rate": 6.693689601226458e-05, + "loss": 0.0185, + "step": 8410 + }, + { + "epoch": 3.956766917293233, + "grad_norm": 0.15533077716827393, + "learning_rate": 6.685908699762002e-05, + "loss": 0.0171, + "step": 8420 + }, + { + "epoch": 3.9614661654135337, + "grad_norm": 0.19935789704322815, + "learning_rate": 6.67812318908754e-05, + "loss": 0.0166, + "step": 8430 + }, + { + "epoch": 3.9661654135338344, + "grad_norm": 0.16483817994594574, + "learning_rate": 6.670333090488356e-05, + "loss": 0.0167, + "step": 8440 + }, + { + "epoch": 3.970864661654135, + "grad_norm": 0.13967077434062958, + "learning_rate": 6.662538425262285e-05, + "loss": 0.0189, + "step": 8450 + }, + { + "epoch": 3.975563909774436, + "grad_norm": 0.08983052521944046, + "learning_rate": 6.654739214719641e-05, + "loss": 0.0158, + "step": 8460 + }, + { + "epoch": 3.9802631578947367, + "grad_norm": 0.11961586773395538, + "learning_rate": 6.646935480183173e-05, + "loss": 0.0195, + "step": 8470 + }, + { + "epoch": 3.9849624060150375, + "grad_norm": 0.14519299566745758, + "learning_rate": 6.639127242987988e-05, + "loss": 0.0227, + "step": 8480 + }, + { + "epoch": 3.9896616541353382, + "grad_norm": 0.1847597360610962, + "learning_rate": 6.631314524481513e-05, + "loss": 0.0215, + "step": 8490 + }, + { + "epoch": 3.994360902255639, + "grad_norm": 0.16919434070587158, + "learning_rate": 6.623497346023418e-05, + "loss": 0.0233, + "step": 8500 + }, + { + "epoch": 3.9990601503759398, + "grad_norm": 0.18585987389087677, + "learning_rate": 6.615675728985572e-05, + "loss": 0.0317, + "step": 8510 + }, + { + "epoch": 4.003759398496241, + "grad_norm": 0.11600866168737411, + "learning_rate": 6.607849694751977e-05, + "loss": 0.0221, + "step": 8520 + }, + { + "epoch": 4.008458646616542, + "grad_norm": 0.1685023158788681, + "learning_rate": 6.600019264718713e-05, + "loss": 0.0164, + "step": 8530 + }, + { + "epoch": 4.0131578947368425, + "grad_norm": 0.13364779949188232, + "learning_rate": 6.592184460293877e-05, + "loss": 0.024, + "step": 8540 + }, + { + "epoch": 4.017857142857143, + "grad_norm": 0.1409081667661667, + "learning_rate": 6.584345302897523e-05, + "loss": 0.0197, + "step": 8550 + }, + { + "epoch": 4.022556390977444, + "grad_norm": 0.18445076048374176, + "learning_rate": 6.576501813961609e-05, + "loss": 0.0186, + "step": 8560 + }, + { + "epoch": 4.027255639097745, + "grad_norm": 0.13780328631401062, + "learning_rate": 6.568654014929932e-05, + "loss": 0.0158, + "step": 8570 + }, + { + "epoch": 4.0319548872180455, + "grad_norm": 0.1404091864824295, + "learning_rate": 6.56080192725808e-05, + "loss": 0.0178, + "step": 8580 + }, + { + "epoch": 4.036654135338346, + "grad_norm": 0.1806766241788864, + "learning_rate": 6.552945572413358e-05, + "loss": 0.0212, + "step": 8590 + }, + { + "epoch": 4.041353383458647, + "grad_norm": 0.12911942601203918, + "learning_rate": 6.545084971874738e-05, + "loss": 0.0162, + "step": 8600 + }, + { + "epoch": 4.046052631578948, + "grad_norm": 0.1130770891904831, + "learning_rate": 6.537220147132805e-05, + "loss": 0.0204, + "step": 8610 + }, + { + "epoch": 4.0507518796992485, + "grad_norm": 0.1793549507856369, + "learning_rate": 6.529351119689688e-05, + "loss": 0.0149, + "step": 8620 + }, + { + "epoch": 4.055451127819549, + "grad_norm": 0.12300092726945877, + "learning_rate": 6.521477911059008e-05, + "loss": 0.0273, + "step": 8630 + }, + { + "epoch": 4.06015037593985, + "grad_norm": 0.11498741805553436, + "learning_rate": 6.513600542765817e-05, + "loss": 0.0163, + "step": 8640 + }, + { + "epoch": 4.06484962406015, + "grad_norm": 0.17557263374328613, + "learning_rate": 6.505719036346539e-05, + "loss": 0.0187, + "step": 8650 + }, + { + "epoch": 4.069548872180451, + "grad_norm": 0.12561099231243134, + "learning_rate": 6.497833413348909e-05, + "loss": 0.0167, + "step": 8660 + }, + { + "epoch": 4.0742481203007515, + "grad_norm": 0.14439953863620758, + "learning_rate": 6.489943695331923e-05, + "loss": 0.0181, + "step": 8670 + }, + { + "epoch": 4.078947368421052, + "grad_norm": 0.2207750380039215, + "learning_rate": 6.48204990386577e-05, + "loss": 0.0187, + "step": 8680 + }, + { + "epoch": 4.083646616541353, + "grad_norm": 0.1530761420726776, + "learning_rate": 6.474152060531768e-05, + "loss": 0.0188, + "step": 8690 + }, + { + "epoch": 4.088345864661654, + "grad_norm": 0.18468138575553894, + "learning_rate": 6.466250186922325e-05, + "loss": 0.0153, + "step": 8700 + }, + { + "epoch": 4.0930451127819545, + "grad_norm": 0.19658444821834564, + "learning_rate": 6.458344304640858e-05, + "loss": 0.0144, + "step": 8710 + }, + { + "epoch": 4.097744360902255, + "grad_norm": 0.16969050467014313, + "learning_rate": 6.450434435301751e-05, + "loss": 0.0167, + "step": 8720 + }, + { + "epoch": 4.102443609022556, + "grad_norm": 0.20715074241161346, + "learning_rate": 6.44252060053028e-05, + "loss": 0.0188, + "step": 8730 + }, + { + "epoch": 4.107142857142857, + "grad_norm": 0.19607798755168915, + "learning_rate": 6.43460282196257e-05, + "loss": 0.0176, + "step": 8740 + }, + { + "epoch": 4.1118421052631575, + "grad_norm": 0.13354948163032532, + "learning_rate": 6.426681121245527e-05, + "loss": 0.0175, + "step": 8750 + }, + { + "epoch": 4.116541353383458, + "grad_norm": 0.16963709890842438, + "learning_rate": 6.418755520036775e-05, + "loss": 0.0184, + "step": 8760 + }, + { + "epoch": 4.121240601503759, + "grad_norm": 0.17850276827812195, + "learning_rate": 6.410826040004607e-05, + "loss": 0.0206, + "step": 8770 + }, + { + "epoch": 4.12593984962406, + "grad_norm": 0.13571204245090485, + "learning_rate": 6.402892702827916e-05, + "loss": 0.0206, + "step": 8780 + }, + { + "epoch": 4.1306390977443606, + "grad_norm": 0.14652928709983826, + "learning_rate": 6.394955530196147e-05, + "loss": 0.0285, + "step": 8790 + }, + { + "epoch": 4.135338345864661, + "grad_norm": 0.20904070138931274, + "learning_rate": 6.387014543809223e-05, + "loss": 0.0227, + "step": 8800 + }, + { + "epoch": 4.140037593984962, + "grad_norm": 0.11015600711107254, + "learning_rate": 6.3790697653775e-05, + "loss": 0.0283, + "step": 8810 + }, + { + "epoch": 4.144736842105263, + "grad_norm": 0.2260255068540573, + "learning_rate": 6.371121216621698e-05, + "loss": 0.018, + "step": 8820 + }, + { + "epoch": 4.149436090225564, + "grad_norm": 0.16859593987464905, + "learning_rate": 6.363168919272846e-05, + "loss": 0.0179, + "step": 8830 + }, + { + "epoch": 4.154135338345864, + "grad_norm": 0.11148626357316971, + "learning_rate": 6.355212895072223e-05, + "loss": 0.0303, + "step": 8840 + }, + { + "epoch": 4.158834586466165, + "grad_norm": 0.1172458827495575, + "learning_rate": 6.34725316577129e-05, + "loss": 0.023, + "step": 8850 + }, + { + "epoch": 4.163533834586466, + "grad_norm": 0.17377308011054993, + "learning_rate": 6.339289753131649e-05, + "loss": 0.0229, + "step": 8860 + }, + { + "epoch": 4.168233082706767, + "grad_norm": 0.1693037748336792, + "learning_rate": 6.331322678924962e-05, + "loss": 0.0224, + "step": 8870 + }, + { + "epoch": 4.172932330827067, + "grad_norm": 0.1303318440914154, + "learning_rate": 6.323351964932908e-05, + "loss": 0.0265, + "step": 8880 + }, + { + "epoch": 4.177631578947368, + "grad_norm": 0.1451072245836258, + "learning_rate": 6.315377632947115e-05, + "loss": 0.0261, + "step": 8890 + }, + { + "epoch": 4.182330827067669, + "grad_norm": 0.11508966982364655, + "learning_rate": 6.307399704769099e-05, + "loss": 0.0175, + "step": 8900 + }, + { + "epoch": 4.18703007518797, + "grad_norm": 0.15271732211112976, + "learning_rate": 6.299418202210214e-05, + "loss": 0.0212, + "step": 8910 + }, + { + "epoch": 4.19172932330827, + "grad_norm": 0.16247068345546722, + "learning_rate": 6.291433147091583e-05, + "loss": 0.0188, + "step": 8920 + }, + { + "epoch": 4.196428571428571, + "grad_norm": 0.1361701935529709, + "learning_rate": 6.283444561244042e-05, + "loss": 0.0173, + "step": 8930 + }, + { + "epoch": 4.201127819548872, + "grad_norm": 0.1367974579334259, + "learning_rate": 6.275452466508077e-05, + "loss": 0.0185, + "step": 8940 + }, + { + "epoch": 4.205827067669173, + "grad_norm": 0.2000769078731537, + "learning_rate": 6.26745688473377e-05, + "loss": 0.0196, + "step": 8950 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.18851491808891296, + "learning_rate": 6.259457837780742e-05, + "loss": 0.0224, + "step": 8960 + }, + { + "epoch": 4.215225563909774, + "grad_norm": 0.13504041731357574, + "learning_rate": 6.251455347518073e-05, + "loss": 0.0185, + "step": 8970 + }, + { + "epoch": 4.219924812030075, + "grad_norm": 0.21460330486297607, + "learning_rate": 6.243449435824276e-05, + "loss": 0.0231, + "step": 8980 + }, + { + "epoch": 4.224624060150376, + "grad_norm": 0.18271034955978394, + "learning_rate": 6.235440124587198e-05, + "loss": 0.0208, + "step": 8990 + }, + { + "epoch": 4.2293233082706765, + "grad_norm": 0.14157791435718536, + "learning_rate": 6.227427435703997e-05, + "loss": 0.0194, + "step": 9000 + }, + { + "epoch": 4.234022556390977, + "grad_norm": 0.1823650449514389, + "learning_rate": 6.219411391081055e-05, + "loss": 0.025, + "step": 9010 + }, + { + "epoch": 4.238721804511278, + "grad_norm": 0.10560489445924759, + "learning_rate": 6.211392012633932e-05, + "loss": 0.0209, + "step": 9020 + }, + { + "epoch": 4.243421052631579, + "grad_norm": 0.13004441559314728, + "learning_rate": 6.203369322287306e-05, + "loss": 0.022, + "step": 9030 + }, + { + "epoch": 4.2481203007518795, + "grad_norm": 0.189819797873497, + "learning_rate": 6.195343341974899e-05, + "loss": 0.018, + "step": 9040 + }, + { + "epoch": 4.25281954887218, + "grad_norm": 0.1904393583536148, + "learning_rate": 6.187314093639444e-05, + "loss": 0.0225, + "step": 9050 + }, + { + "epoch": 4.257518796992481, + "grad_norm": 0.1637134999036789, + "learning_rate": 6.179281599232591e-05, + "loss": 0.0229, + "step": 9060 + }, + { + "epoch": 4.262218045112782, + "grad_norm": 0.1223156750202179, + "learning_rate": 6.17124588071488e-05, + "loss": 0.0159, + "step": 9070 + }, + { + "epoch": 4.2669172932330826, + "grad_norm": 0.1365230232477188, + "learning_rate": 6.163206960055651e-05, + "loss": 0.0229, + "step": 9080 + }, + { + "epoch": 4.271616541353383, + "grad_norm": 0.11772079765796661, + "learning_rate": 6.155164859233012e-05, + "loss": 0.0178, + "step": 9090 + }, + { + "epoch": 4.276315789473684, + "grad_norm": 0.13973119854927063, + "learning_rate": 6.147119600233758e-05, + "loss": 0.0181, + "step": 9100 + }, + { + "epoch": 4.281015037593985, + "grad_norm": 0.15319527685642242, + "learning_rate": 6.13907120505332e-05, + "loss": 0.0148, + "step": 9110 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.12740933895111084, + "learning_rate": 6.131019695695702e-05, + "loss": 0.0173, + "step": 9120 + }, + { + "epoch": 4.290413533834586, + "grad_norm": 0.12393118441104889, + "learning_rate": 6.122965094173424e-05, + "loss": 0.0211, + "step": 9130 + }, + { + "epoch": 4.295112781954887, + "grad_norm": 0.10120144486427307, + "learning_rate": 6.11490742250746e-05, + "loss": 0.019, + "step": 9140 + }, + { + "epoch": 4.299812030075188, + "grad_norm": 0.12977886199951172, + "learning_rate": 6.106846702727172e-05, + "loss": 0.0188, + "step": 9150 + }, + { + "epoch": 4.304511278195489, + "grad_norm": 0.21875134110450745, + "learning_rate": 6.0987829568702656e-05, + "loss": 0.0208, + "step": 9160 + }, + { + "epoch": 4.309210526315789, + "grad_norm": 0.18392659723758698, + "learning_rate": 6.090716206982714e-05, + "loss": 0.0192, + "step": 9170 + }, + { + "epoch": 4.31390977443609, + "grad_norm": 0.1815064251422882, + "learning_rate": 6.0826464751186994e-05, + "loss": 0.017, + "step": 9180 + }, + { + "epoch": 4.318609022556391, + "grad_norm": 0.12864035367965698, + "learning_rate": 6.074573783340562e-05, + "loss": 0.0159, + "step": 9190 + }, + { + "epoch": 4.323308270676692, + "grad_norm": 0.2170429825782776, + "learning_rate": 6.066498153718735e-05, + "loss": 0.0155, + "step": 9200 + }, + { + "epoch": 4.328007518796992, + "grad_norm": 0.179051011800766, + "learning_rate": 6.0584196083316794e-05, + "loss": 0.0299, + "step": 9210 + }, + { + "epoch": 4.332706766917293, + "grad_norm": 0.09811102598905563, + "learning_rate": 6.05033816926583e-05, + "loss": 0.0182, + "step": 9220 + }, + { + "epoch": 4.337406015037594, + "grad_norm": 0.11130985617637634, + "learning_rate": 6.042253858615532e-05, + "loss": 0.017, + "step": 9230 + }, + { + "epoch": 4.342105263157895, + "grad_norm": 0.19028566777706146, + "learning_rate": 6.034166698482984e-05, + "loss": 0.0273, + "step": 9240 + }, + { + "epoch": 4.3468045112781954, + "grad_norm": 0.13545772433280945, + "learning_rate": 6.026076710978171e-05, + "loss": 0.0164, + "step": 9250 + }, + { + "epoch": 4.351503759398496, + "grad_norm": 0.11075558513402939, + "learning_rate": 6.017983918218812e-05, + "loss": 0.02, + "step": 9260 + }, + { + "epoch": 4.356203007518797, + "grad_norm": 0.15003326535224915, + "learning_rate": 6.009888342330292e-05, + "loss": 0.0224, + "step": 9270 + }, + { + "epoch": 4.360902255639098, + "grad_norm": 0.1644802838563919, + "learning_rate": 6.001790005445607e-05, + "loss": 0.0227, + "step": 9280 + }, + { + "epoch": 4.3656015037593985, + "grad_norm": 0.1330413520336151, + "learning_rate": 5.9936889297052986e-05, + "loss": 0.0151, + "step": 9290 + }, + { + "epoch": 4.370300751879699, + "grad_norm": 0.1558411419391632, + "learning_rate": 5.985585137257401e-05, + "loss": 0.0227, + "step": 9300 + }, + { + "epoch": 4.375, + "grad_norm": 0.16447797417640686, + "learning_rate": 5.977478650257374e-05, + "loss": 0.0234, + "step": 9310 + }, + { + "epoch": 4.379699248120301, + "grad_norm": 0.13559196889400482, + "learning_rate": 5.969369490868042e-05, + "loss": 0.0338, + "step": 9320 + }, + { + "epoch": 4.3843984962406015, + "grad_norm": 0.15715987980365753, + "learning_rate": 5.961257681259535e-05, + "loss": 0.0205, + "step": 9330 + }, + { + "epoch": 4.389097744360902, + "grad_norm": 0.11574830114841461, + "learning_rate": 5.953143243609235e-05, + "loss": 0.0166, + "step": 9340 + }, + { + "epoch": 4.393796992481203, + "grad_norm": 0.093317911028862, + "learning_rate": 5.945026200101702e-05, + "loss": 0.015, + "step": 9350 + }, + { + "epoch": 4.398496240601504, + "grad_norm": 0.1592835932970047, + "learning_rate": 5.9369065729286245e-05, + "loss": 0.0164, + "step": 9360 + }, + { + "epoch": 4.4031954887218046, + "grad_norm": 0.1047334372997284, + "learning_rate": 5.92878438428875e-05, + "loss": 0.0276, + "step": 9370 + }, + { + "epoch": 4.407894736842105, + "grad_norm": 0.12860779464244843, + "learning_rate": 5.9206596563878357e-05, + "loss": 0.0132, + "step": 9380 + }, + { + "epoch": 4.412593984962406, + "grad_norm": 0.15609225630760193, + "learning_rate": 5.912532411438576e-05, + "loss": 0.0246, + "step": 9390 + }, + { + "epoch": 4.417293233082707, + "grad_norm": 0.13190561532974243, + "learning_rate": 5.90440267166055e-05, + "loss": 0.0239, + "step": 9400 + }, + { + "epoch": 4.421992481203008, + "grad_norm": 0.15665863454341888, + "learning_rate": 5.896270459280153e-05, + "loss": 0.0188, + "step": 9410 + }, + { + "epoch": 4.426691729323308, + "grad_norm": 0.1300572156906128, + "learning_rate": 5.888135796530544e-05, + "loss": 0.0174, + "step": 9420 + }, + { + "epoch": 4.431390977443609, + "grad_norm": 0.16136862337589264, + "learning_rate": 5.8799987056515804e-05, + "loss": 0.0235, + "step": 9430 + }, + { + "epoch": 4.43609022556391, + "grad_norm": 0.1629219949245453, + "learning_rate": 5.871859208889759e-05, + "loss": 0.0184, + "step": 9440 + }, + { + "epoch": 4.440789473684211, + "grad_norm": 0.18689210712909698, + "learning_rate": 5.8637173284981526e-05, + "loss": 0.022, + "step": 9450 + }, + { + "epoch": 4.445488721804511, + "grad_norm": 0.1344980001449585, + "learning_rate": 5.85557308673635e-05, + "loss": 0.0143, + "step": 9460 + }, + { + "epoch": 4.450187969924812, + "grad_norm": 0.16175200045108795, + "learning_rate": 5.847426505870399e-05, + "loss": 0.0204, + "step": 9470 + }, + { + "epoch": 4.454887218045113, + "grad_norm": 0.15878114104270935, + "learning_rate": 5.8392776081727385e-05, + "loss": 0.0155, + "step": 9480 + }, + { + "epoch": 4.459586466165414, + "grad_norm": 0.15410248935222626, + "learning_rate": 5.831126415922148e-05, + "loss": 0.0176, + "step": 9490 + }, + { + "epoch": 4.464285714285714, + "grad_norm": 0.19770678877830505, + "learning_rate": 5.8229729514036705e-05, + "loss": 0.0183, + "step": 9500 + }, + { + "epoch": 4.468984962406015, + "grad_norm": 0.15174371004104614, + "learning_rate": 5.8148172369085686e-05, + "loss": 0.0254, + "step": 9510 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 0.1716819554567337, + "learning_rate": 5.8066592947342555e-05, + "loss": 0.0205, + "step": 9520 + }, + { + "epoch": 4.478383458646617, + "grad_norm": 0.11503490060567856, + "learning_rate": 5.798499147184233e-05, + "loss": 0.0187, + "step": 9530 + }, + { + "epoch": 4.4830827067669174, + "grad_norm": 0.14293760061264038, + "learning_rate": 5.7903368165680327e-05, + "loss": 0.0234, + "step": 9540 + }, + { + "epoch": 4.487781954887218, + "grad_norm": 0.12031774967908859, + "learning_rate": 5.782172325201155e-05, + "loss": 0.0268, + "step": 9550 + }, + { + "epoch": 4.492481203007519, + "grad_norm": 0.10676831752061844, + "learning_rate": 5.7740056954050084e-05, + "loss": 0.0156, + "step": 9560 + }, + { + "epoch": 4.49718045112782, + "grad_norm": 0.11658485978841782, + "learning_rate": 5.765836949506843e-05, + "loss": 0.0219, + "step": 9570 + }, + { + "epoch": 4.5018796992481205, + "grad_norm": 0.17919903993606567, + "learning_rate": 5.757666109839702e-05, + "loss": 0.012, + "step": 9580 + }, + { + "epoch": 4.506578947368421, + "grad_norm": 0.1580527275800705, + "learning_rate": 5.74949319874235e-05, + "loss": 0.0151, + "step": 9590 + }, + { + "epoch": 4.511278195488722, + "grad_norm": 0.14657042920589447, + "learning_rate": 5.74131823855921e-05, + "loss": 0.0215, + "step": 9600 + }, + { + "epoch": 4.515977443609023, + "grad_norm": 0.1635216772556305, + "learning_rate": 5.733141251640315e-05, + "loss": 0.0145, + "step": 9610 + }, + { + "epoch": 4.5206766917293235, + "grad_norm": 0.15993301570415497, + "learning_rate": 5.72496226034123e-05, + "loss": 0.0214, + "step": 9620 + }, + { + "epoch": 4.525375939849624, + "grad_norm": 0.14244981110095978, + "learning_rate": 5.7167812870230094e-05, + "loss": 0.0194, + "step": 9630 + }, + { + "epoch": 4.530075187969925, + "grad_norm": 0.1011502593755722, + "learning_rate": 5.7085983540521216e-05, + "loss": 0.0131, + "step": 9640 + }, + { + "epoch": 4.534774436090226, + "grad_norm": 0.12874962389469147, + "learning_rate": 5.70041348380039e-05, + "loss": 0.02, + "step": 9650 + }, + { + "epoch": 4.5394736842105265, + "grad_norm": 0.18720367550849915, + "learning_rate": 5.692226698644938e-05, + "loss": 0.0238, + "step": 9660 + }, + { + "epoch": 4.544172932330827, + "grad_norm": 0.17449072003364563, + "learning_rate": 5.6840380209681255e-05, + "loss": 0.0319, + "step": 9670 + }, + { + "epoch": 4.548872180451128, + "grad_norm": 0.15957769751548767, + "learning_rate": 5.675847473157485e-05, + "loss": 0.0234, + "step": 9680 + }, + { + "epoch": 4.553571428571429, + "grad_norm": 0.13500474393367767, + "learning_rate": 5.667655077605659e-05, + "loss": 0.0217, + "step": 9690 + }, + { + "epoch": 4.55827067669173, + "grad_norm": 0.1734580397605896, + "learning_rate": 5.6594608567103456e-05, + "loss": 0.0185, + "step": 9700 + }, + { + "epoch": 4.56296992481203, + "grad_norm": 0.1234162226319313, + "learning_rate": 5.65126483287423e-05, + "loss": 0.0155, + "step": 9710 + }, + { + "epoch": 4.567669172932331, + "grad_norm": 0.13775312900543213, + "learning_rate": 5.6430670285049314e-05, + "loss": 0.0246, + "step": 9720 + }, + { + "epoch": 4.572368421052632, + "grad_norm": 0.13060703873634338, + "learning_rate": 5.634867466014932e-05, + "loss": 0.0206, + "step": 9730 + }, + { + "epoch": 4.577067669172933, + "grad_norm": 0.27983561158180237, + "learning_rate": 5.6266661678215216e-05, + "loss": 0.0196, + "step": 9740 + }, + { + "epoch": 4.581766917293233, + "grad_norm": 0.12983421981334686, + "learning_rate": 5.618463156346739e-05, + "loss": 0.0234, + "step": 9750 + }, + { + "epoch": 4.586466165413534, + "grad_norm": 0.08709783852100372, + "learning_rate": 5.6102584540173006e-05, + "loss": 0.0207, + "step": 9760 + }, + { + "epoch": 4.591165413533835, + "grad_norm": 0.1174778863787651, + "learning_rate": 5.602052083264555e-05, + "loss": 0.0145, + "step": 9770 + }, + { + "epoch": 4.595864661654136, + "grad_norm": 0.1293332278728485, + "learning_rate": 5.5938440665244006e-05, + "loss": 0.0214, + "step": 9780 + }, + { + "epoch": 4.600563909774436, + "grad_norm": 0.17360520362854004, + "learning_rate": 5.585634426237246e-05, + "loss": 0.0238, + "step": 9790 + }, + { + "epoch": 4.605263157894737, + "grad_norm": 0.16103921830654144, + "learning_rate": 5.577423184847932e-05, + "loss": 0.0157, + "step": 9800 + }, + { + "epoch": 4.609962406015038, + "grad_norm": 0.09659445285797119, + "learning_rate": 5.569210364805677e-05, + "loss": 0.0202, + "step": 9810 + }, + { + "epoch": 4.614661654135339, + "grad_norm": 0.2310553640127182, + "learning_rate": 5.560995988564023e-05, + "loss": 0.0208, + "step": 9820 + }, + { + "epoch": 4.6193609022556394, + "grad_norm": 0.11814546585083008, + "learning_rate": 5.552780078580756e-05, + "loss": 0.0151, + "step": 9830 + }, + { + "epoch": 4.62406015037594, + "grad_norm": 0.15608763694763184, + "learning_rate": 5.544562657317863e-05, + "loss": 0.0142, + "step": 9840 + }, + { + "epoch": 4.628759398496241, + "grad_norm": 0.1391037106513977, + "learning_rate": 5.5363437472414595e-05, + "loss": 0.0201, + "step": 9850 + }, + { + "epoch": 4.633458646616542, + "grad_norm": 0.15986937284469604, + "learning_rate": 5.52812337082173e-05, + "loss": 0.0193, + "step": 9860 + }, + { + "epoch": 4.6381578947368425, + "grad_norm": 0.14006660878658295, + "learning_rate": 5.519901550532871e-05, + "loss": 0.0201, + "step": 9870 + }, + { + "epoch": 4.642857142857143, + "grad_norm": 0.19203944504261017, + "learning_rate": 5.511678308853026e-05, + "loss": 0.0156, + "step": 9880 + }, + { + "epoch": 4.647556390977444, + "grad_norm": 0.24364925920963287, + "learning_rate": 5.5034536682642224e-05, + "loss": 0.0223, + "step": 9890 + }, + { + "epoch": 4.652255639097744, + "grad_norm": 0.17941319942474365, + "learning_rate": 5.495227651252315e-05, + "loss": 0.0197, + "step": 9900 + }, + { + "epoch": 4.6569548872180455, + "grad_norm": 0.1661718338727951, + "learning_rate": 5.487000280306917e-05, + "loss": 0.0203, + "step": 9910 + }, + { + "epoch": 4.661654135338345, + "grad_norm": 0.20843440294265747, + "learning_rate": 5.478771577921351e-05, + "loss": 0.0145, + "step": 9920 + }, + { + "epoch": 4.666353383458647, + "grad_norm": 0.15984666347503662, + "learning_rate": 5.470541566592573e-05, + "loss": 0.0218, + "step": 9930 + }, + { + "epoch": 4.671052631578947, + "grad_norm": 0.17885951697826385, + "learning_rate": 5.462310268821118e-05, + "loss": 0.0251, + "step": 9940 + }, + { + "epoch": 4.6757518796992485, + "grad_norm": 0.20301920175552368, + "learning_rate": 5.454077707111042e-05, + "loss": 0.0187, + "step": 9950 + }, + { + "epoch": 4.680451127819548, + "grad_norm": 0.14082729816436768, + "learning_rate": 5.445843903969854e-05, + "loss": 0.024, + "step": 9960 + }, + { + "epoch": 4.68515037593985, + "grad_norm": 0.14042581617832184, + "learning_rate": 5.4376088819084556e-05, + "loss": 0.0174, + "step": 9970 + }, + { + "epoch": 4.68984962406015, + "grad_norm": 0.1587418168783188, + "learning_rate": 5.4293726634410855e-05, + "loss": 0.0192, + "step": 9980 + }, + { + "epoch": 4.694548872180452, + "grad_norm": 0.14780429005622864, + "learning_rate": 5.4211352710852495e-05, + "loss": 0.0198, + "step": 9990 + }, + { + "epoch": 4.6992481203007515, + "grad_norm": 0.1238761693239212, + "learning_rate": 5.4128967273616625e-05, + "loss": 0.0206, + "step": 10000 + }, + { + "epoch": 4.703947368421053, + "grad_norm": 0.14264823496341705, + "learning_rate": 5.404657054794189e-05, + "loss": 0.02, + "step": 10010 + }, + { + "epoch": 4.708646616541353, + "grad_norm": 0.11588063091039658, + "learning_rate": 5.396416275909779e-05, + "loss": 0.0258, + "step": 10020 + }, + { + "epoch": 4.713345864661655, + "grad_norm": 0.11729754507541656, + "learning_rate": 5.3881744132384104e-05, + "loss": 0.0173, + "step": 10030 + }, + { + "epoch": 4.7180451127819545, + "grad_norm": 0.1283014863729477, + "learning_rate": 5.379931489313016e-05, + "loss": 0.0205, + "step": 10040 + }, + { + "epoch": 4.722744360902256, + "grad_norm": 0.11900748312473297, + "learning_rate": 5.371687526669439e-05, + "loss": 0.0204, + "step": 10050 + }, + { + "epoch": 4.727443609022556, + "grad_norm": 0.2039898782968521, + "learning_rate": 5.363442547846356e-05, + "loss": 0.023, + "step": 10060 + }, + { + "epoch": 4.732142857142857, + "grad_norm": 0.16698098182678223, + "learning_rate": 5.355196575385225e-05, + "loss": 0.0149, + "step": 10070 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 0.20464769005775452, + "learning_rate": 5.3469496318302204e-05, + "loss": 0.0198, + "step": 10080 + }, + { + "epoch": 4.741541353383458, + "grad_norm": 0.08524361997842789, + "learning_rate": 5.3387017397281704e-05, + "loss": 0.0185, + "step": 10090 + }, + { + "epoch": 4.746240601503759, + "grad_norm": 0.1856192648410797, + "learning_rate": 5.330452921628497e-05, + "loss": 0.0198, + "step": 10100 + }, + { + "epoch": 4.75093984962406, + "grad_norm": 0.1088978499174118, + "learning_rate": 5.322203200083154e-05, + "loss": 0.0156, + "step": 10110 + }, + { + "epoch": 4.7556390977443606, + "grad_norm": 0.18362818658351898, + "learning_rate": 5.313952597646568e-05, + "loss": 0.0193, + "step": 10120 + }, + { + "epoch": 4.760338345864661, + "grad_norm": 0.09336452186107635, + "learning_rate": 5.305701136875566e-05, + "loss": 0.0127, + "step": 10130 + }, + { + "epoch": 4.765037593984962, + "grad_norm": 0.10276808589696884, + "learning_rate": 5.297448840329329e-05, + "loss": 0.0138, + "step": 10140 + }, + { + "epoch": 4.769736842105263, + "grad_norm": 0.14418187737464905, + "learning_rate": 5.2891957305693205e-05, + "loss": 0.0188, + "step": 10150 + }, + { + "epoch": 4.774436090225564, + "grad_norm": 0.1359757035970688, + "learning_rate": 5.280941830159227e-05, + "loss": 0.0183, + "step": 10160 + }, + { + "epoch": 4.779135338345864, + "grad_norm": 0.21340955793857574, + "learning_rate": 5.2726871616649e-05, + "loss": 0.0195, + "step": 10170 + }, + { + "epoch": 4.783834586466165, + "grad_norm": 0.1824106127023697, + "learning_rate": 5.264431747654284e-05, + "loss": 0.0221, + "step": 10180 + }, + { + "epoch": 4.788533834586466, + "grad_norm": 0.1339014172554016, + "learning_rate": 5.2561756106973656e-05, + "loss": 0.0207, + "step": 10190 + }, + { + "epoch": 4.793233082706767, + "grad_norm": 0.12776830792427063, + "learning_rate": 5.247918773366112e-05, + "loss": 0.0241, + "step": 10200 + }, + { + "epoch": 4.797932330827067, + "grad_norm": 0.1569841355085373, + "learning_rate": 5.2396612582343986e-05, + "loss": 0.0166, + "step": 10210 + }, + { + "epoch": 4.802631578947368, + "grad_norm": 0.1283421814441681, + "learning_rate": 5.231403087877955e-05, + "loss": 0.0177, + "step": 10220 + }, + { + "epoch": 4.807330827067669, + "grad_norm": 0.16856853663921356, + "learning_rate": 5.2231442848743064e-05, + "loss": 0.027, + "step": 10230 + }, + { + "epoch": 4.81203007518797, + "grad_norm": 0.22872963547706604, + "learning_rate": 5.214884871802703e-05, + "loss": 0.0257, + "step": 10240 + }, + { + "epoch": 4.81672932330827, + "grad_norm": 0.14206314086914062, + "learning_rate": 5.2066248712440656e-05, + "loss": 0.0121, + "step": 10250 + }, + { + "epoch": 4.821428571428571, + "grad_norm": 0.10408526659011841, + "learning_rate": 5.198364305780922e-05, + "loss": 0.0181, + "step": 10260 + }, + { + "epoch": 4.826127819548872, + "grad_norm": 0.10545016825199127, + "learning_rate": 5.1901031979973394e-05, + "loss": 0.0169, + "step": 10270 + }, + { + "epoch": 4.830827067669173, + "grad_norm": 0.10499098896980286, + "learning_rate": 5.1818415704788725e-05, + "loss": 0.0173, + "step": 10280 + }, + { + "epoch": 4.8355263157894735, + "grad_norm": 0.10384233295917511, + "learning_rate": 5.1735794458124956e-05, + "loss": 0.0172, + "step": 10290 + }, + { + "epoch": 4.840225563909774, + "grad_norm": 0.13764654099941254, + "learning_rate": 5.165316846586541e-05, + "loss": 0.0167, + "step": 10300 + }, + { + "epoch": 4.844924812030075, + "grad_norm": 0.1662788838148117, + "learning_rate": 5.157053795390642e-05, + "loss": 0.0248, + "step": 10310 + }, + { + "epoch": 4.849624060150376, + "grad_norm": 0.20149970054626465, + "learning_rate": 5.148790314815663e-05, + "loss": 0.0158, + "step": 10320 + }, + { + "epoch": 4.8543233082706765, + "grad_norm": 0.14202655851840973, + "learning_rate": 5.1405264274536445e-05, + "loss": 0.0112, + "step": 10330 + }, + { + "epoch": 4.859022556390977, + "grad_norm": 0.1373424530029297, + "learning_rate": 5.132262155897739e-05, + "loss": 0.0143, + "step": 10340 + }, + { + "epoch": 4.863721804511278, + "grad_norm": 0.15334497392177582, + "learning_rate": 5.123997522742151e-05, + "loss": 0.018, + "step": 10350 + }, + { + "epoch": 4.868421052631579, + "grad_norm": 0.09590376168489456, + "learning_rate": 5.1157325505820694e-05, + "loss": 0.0187, + "step": 10360 + }, + { + "epoch": 4.8731203007518795, + "grad_norm": 0.11532887071371078, + "learning_rate": 5.107467262013614e-05, + "loss": 0.0187, + "step": 10370 + }, + { + "epoch": 4.87781954887218, + "grad_norm": 0.13246522843837738, + "learning_rate": 5.0992016796337686e-05, + "loss": 0.0201, + "step": 10380 + }, + { + "epoch": 4.882518796992481, + "grad_norm": 0.12044485658407211, + "learning_rate": 5.0909358260403186e-05, + "loss": 0.0259, + "step": 10390 + }, + { + "epoch": 4.887218045112782, + "grad_norm": 0.14442338049411774, + "learning_rate": 5.0826697238317935e-05, + "loss": 0.0192, + "step": 10400 + }, + { + "epoch": 4.8919172932330826, + "grad_norm": 0.20286041498184204, + "learning_rate": 5.074403395607399e-05, + "loss": 0.0168, + "step": 10410 + }, + { + "epoch": 4.896616541353383, + "grad_norm": 0.1601938009262085, + "learning_rate": 5.066136863966963e-05, + "loss": 0.0208, + "step": 10420 + }, + { + "epoch": 4.901315789473684, + "grad_norm": 0.12259446829557419, + "learning_rate": 5.057870151510864e-05, + "loss": 0.0208, + "step": 10430 + }, + { + "epoch": 4.906015037593985, + "grad_norm": 0.1430525928735733, + "learning_rate": 5.0496032808399815e-05, + "loss": 0.0216, + "step": 10440 + }, + { + "epoch": 4.910714285714286, + "grad_norm": 0.11188165843486786, + "learning_rate": 5.041336274555625e-05, + "loss": 0.0205, + "step": 10450 + }, + { + "epoch": 4.915413533834586, + "grad_norm": 0.1343916654586792, + "learning_rate": 5.033069155259471e-05, + "loss": 0.0206, + "step": 10460 + }, + { + "epoch": 4.920112781954887, + "grad_norm": 0.15581011772155762, + "learning_rate": 5.02480194555351e-05, + "loss": 0.0145, + "step": 10470 + }, + { + "epoch": 4.924812030075188, + "grad_norm": 0.10869266837835312, + "learning_rate": 5.016534668039976e-05, + "loss": 0.0183, + "step": 10480 + }, + { + "epoch": 4.929511278195489, + "grad_norm": 0.1791427731513977, + "learning_rate": 5.0082673453212914e-05, + "loss": 0.0145, + "step": 10490 + }, + { + "epoch": 4.934210526315789, + "grad_norm": 0.12199488282203674, + "learning_rate": 5e-05, + "loss": 0.0142, + "step": 10500 + }, + { + "epoch": 4.93890977443609, + "grad_norm": 0.1306176632642746, + "learning_rate": 4.991732654678709e-05, + "loss": 0.0187, + "step": 10510 + }, + { + "epoch": 4.943609022556391, + "grad_norm": 0.11600327491760254, + "learning_rate": 4.9834653319600246e-05, + "loss": 0.0206, + "step": 10520 + }, + { + "epoch": 4.948308270676692, + "grad_norm": 0.14572572708129883, + "learning_rate": 4.975198054446492e-05, + "loss": 0.017, + "step": 10530 + }, + { + "epoch": 4.953007518796992, + "grad_norm": 0.1601126790046692, + "learning_rate": 4.96693084474053e-05, + "loss": 0.0219, + "step": 10540 + }, + { + "epoch": 4.957706766917293, + "grad_norm": 0.1626318246126175, + "learning_rate": 4.9586637254443756e-05, + "loss": 0.0209, + "step": 10550 + }, + { + "epoch": 4.962406015037594, + "grad_norm": 0.17098815739154816, + "learning_rate": 4.950396719160018e-05, + "loss": 0.0158, + "step": 10560 + }, + { + "epoch": 4.967105263157895, + "grad_norm": 0.13434873521327972, + "learning_rate": 4.942129848489137e-05, + "loss": 0.0192, + "step": 10570 + }, + { + "epoch": 4.9718045112781954, + "grad_norm": 0.15642866492271423, + "learning_rate": 4.93386313603304e-05, + "loss": 0.0187, + "step": 10580 + }, + { + "epoch": 4.976503759398496, + "grad_norm": 0.15607450902462006, + "learning_rate": 4.925596604392603e-05, + "loss": 0.016, + "step": 10590 + }, + { + "epoch": 4.981203007518797, + "grad_norm": 0.12438451498746872, + "learning_rate": 4.917330276168208e-05, + "loss": 0.0187, + "step": 10600 + }, + { + "epoch": 4.985902255639098, + "grad_norm": 0.14996737241744995, + "learning_rate": 4.909064173959681e-05, + "loss": 0.0171, + "step": 10610 + }, + { + "epoch": 4.9906015037593985, + "grad_norm": 0.11663806438446045, + "learning_rate": 4.9007983203662326e-05, + "loss": 0.0182, + "step": 10620 + }, + { + "epoch": 4.995300751879699, + "grad_norm": 0.11488846689462662, + "learning_rate": 4.892532737986387e-05, + "loss": 0.0174, + "step": 10630 + }, + { + "epoch": 5.0, + "grad_norm": 0.15718309581279755, + "learning_rate": 4.884267449417931e-05, + "loss": 0.0184, + "step": 10640 + }, + { + "epoch": 5.004699248120301, + "grad_norm": 0.11409519612789154, + "learning_rate": 4.87600247725785e-05, + "loss": 0.018, + "step": 10650 + }, + { + "epoch": 5.0093984962406015, + "grad_norm": 0.14960633218288422, + "learning_rate": 4.867737844102261e-05, + "loss": 0.0131, + "step": 10660 + }, + { + "epoch": 5.014097744360902, + "grad_norm": 0.1907559037208557, + "learning_rate": 4.8594735725463567e-05, + "loss": 0.024, + "step": 10670 + }, + { + "epoch": 5.018796992481203, + "grad_norm": 0.12973536550998688, + "learning_rate": 4.851209685184338e-05, + "loss": 0.0149, + "step": 10680 + }, + { + "epoch": 5.023496240601504, + "grad_norm": 0.12642143666744232, + "learning_rate": 4.8429462046093585e-05, + "loss": 0.0218, + "step": 10690 + }, + { + "epoch": 5.0281954887218046, + "grad_norm": 0.19579017162322998, + "learning_rate": 4.834683153413459e-05, + "loss": 0.0162, + "step": 10700 + }, + { + "epoch": 5.032894736842105, + "grad_norm": 0.06657780706882477, + "learning_rate": 4.826420554187506e-05, + "loss": 0.0219, + "step": 10710 + }, + { + "epoch": 5.037593984962406, + "grad_norm": 0.15197539329528809, + "learning_rate": 4.818158429521129e-05, + "loss": 0.0179, + "step": 10720 + }, + { + "epoch": 5.042293233082707, + "grad_norm": 0.13582901656627655, + "learning_rate": 4.809896802002662e-05, + "loss": 0.0222, + "step": 10730 + }, + { + "epoch": 5.046992481203008, + "grad_norm": 0.0987696573138237, + "learning_rate": 4.801635694219079e-05, + "loss": 0.02, + "step": 10740 + }, + { + "epoch": 5.051691729323308, + "grad_norm": 0.1544206142425537, + "learning_rate": 4.7933751287559335e-05, + "loss": 0.0165, + "step": 10750 + }, + { + "epoch": 5.056390977443609, + "grad_norm": 0.19164049625396729, + "learning_rate": 4.785115128197298e-05, + "loss": 0.0162, + "step": 10760 + }, + { + "epoch": 5.06109022556391, + "grad_norm": 0.1684752255678177, + "learning_rate": 4.776855715125694e-05, + "loss": 0.0147, + "step": 10770 + }, + { + "epoch": 5.065789473684211, + "grad_norm": 0.18184302747249603, + "learning_rate": 4.7685969121220456e-05, + "loss": 0.0183, + "step": 10780 + }, + { + "epoch": 5.070488721804511, + "grad_norm": 0.15143953263759613, + "learning_rate": 4.7603387417656026e-05, + "loss": 0.0148, + "step": 10790 + }, + { + "epoch": 5.075187969924812, + "grad_norm": 0.1459171175956726, + "learning_rate": 4.7520812266338885e-05, + "loss": 0.0175, + "step": 10800 + }, + { + "epoch": 5.079887218045113, + "grad_norm": 0.11474397778511047, + "learning_rate": 4.743824389302635e-05, + "loss": 0.0191, + "step": 10810 + }, + { + "epoch": 5.084586466165414, + "grad_norm": 0.10887852311134338, + "learning_rate": 4.735568252345718e-05, + "loss": 0.019, + "step": 10820 + }, + { + "epoch": 5.089285714285714, + "grad_norm": 0.2430884689092636, + "learning_rate": 4.7273128383351015e-05, + "loss": 0.0184, + "step": 10830 + }, + { + "epoch": 5.093984962406015, + "grad_norm": 0.14770224690437317, + "learning_rate": 4.7190581698407725e-05, + "loss": 0.0119, + "step": 10840 + }, + { + "epoch": 5.098684210526316, + "grad_norm": 0.11698685586452484, + "learning_rate": 4.710804269430681e-05, + "loss": 0.0162, + "step": 10850 + }, + { + "epoch": 5.103383458646617, + "grad_norm": 0.10466932505369186, + "learning_rate": 4.702551159670672e-05, + "loss": 0.0133, + "step": 10860 + }, + { + "epoch": 5.1080827067669174, + "grad_norm": 0.21390311419963837, + "learning_rate": 4.694298863124435e-05, + "loss": 0.0184, + "step": 10870 + }, + { + "epoch": 5.112781954887218, + "grad_norm": 0.12267635017633438, + "learning_rate": 4.6860474023534335e-05, + "loss": 0.0201, + "step": 10880 + }, + { + "epoch": 5.117481203007519, + "grad_norm": 0.10549784451723099, + "learning_rate": 4.677796799916845e-05, + "loss": 0.0185, + "step": 10890 + }, + { + "epoch": 5.12218045112782, + "grad_norm": 0.09418050944805145, + "learning_rate": 4.669547078371504e-05, + "loss": 0.0232, + "step": 10900 + }, + { + "epoch": 5.1268796992481205, + "grad_norm": 0.10465802997350693, + "learning_rate": 4.66129826027183e-05, + "loss": 0.016, + "step": 10910 + }, + { + "epoch": 5.131578947368421, + "grad_norm": 0.14196555316448212, + "learning_rate": 4.65305036816978e-05, + "loss": 0.0202, + "step": 10920 + }, + { + "epoch": 5.136278195488722, + "grad_norm": 0.15972945094108582, + "learning_rate": 4.6448034246147754e-05, + "loss": 0.0164, + "step": 10930 + }, + { + "epoch": 5.140977443609023, + "grad_norm": 0.1118629202246666, + "learning_rate": 4.6365574521536445e-05, + "loss": 0.0134, + "step": 10940 + }, + { + "epoch": 5.1456766917293235, + "grad_norm": 0.08104313164949417, + "learning_rate": 4.6283124733305624e-05, + "loss": 0.0102, + "step": 10950 + }, + { + "epoch": 5.150375939849624, + "grad_norm": 0.0983964204788208, + "learning_rate": 4.620068510686985e-05, + "loss": 0.021, + "step": 10960 + }, + { + "epoch": 5.155075187969925, + "grad_norm": 0.13934247195720673, + "learning_rate": 4.611825586761591e-05, + "loss": 0.0123, + "step": 10970 + }, + { + "epoch": 5.159774436090226, + "grad_norm": 0.10796473920345306, + "learning_rate": 4.60358372409022e-05, + "loss": 0.0137, + "step": 10980 + }, + { + "epoch": 5.1644736842105265, + "grad_norm": 0.1407727152109146, + "learning_rate": 4.5953429452058135e-05, + "loss": 0.0138, + "step": 10990 + }, + { + "epoch": 5.169172932330827, + "grad_norm": 0.1869104653596878, + "learning_rate": 4.5871032726383386e-05, + "loss": 0.0178, + "step": 11000 + }, + { + "epoch": 5.173872180451128, + "grad_norm": 0.22772102057933807, + "learning_rate": 4.5788647289147516e-05, + "loss": 0.017, + "step": 11010 + }, + { + "epoch": 5.178571428571429, + "grad_norm": 0.1635831892490387, + "learning_rate": 4.570627336558915e-05, + "loss": 0.0177, + "step": 11020 + }, + { + "epoch": 5.18327067669173, + "grad_norm": 0.10193423926830292, + "learning_rate": 4.562391118091544e-05, + "loss": 0.0135, + "step": 11030 + }, + { + "epoch": 5.18796992481203, + "grad_norm": 0.17310504615306854, + "learning_rate": 4.554156096030149e-05, + "loss": 0.0152, + "step": 11040 + }, + { + "epoch": 5.192669172932331, + "grad_norm": 0.1305575966835022, + "learning_rate": 4.545922292888959e-05, + "loss": 0.0187, + "step": 11050 + }, + { + "epoch": 5.197368421052632, + "grad_norm": 0.17065905034542084, + "learning_rate": 4.537689731178883e-05, + "loss": 0.025, + "step": 11060 + }, + { + "epoch": 5.202067669172933, + "grad_norm": 0.13812799751758575, + "learning_rate": 4.529458433407429e-05, + "loss": 0.02, + "step": 11070 + }, + { + "epoch": 5.206766917293233, + "grad_norm": 0.1573777198791504, + "learning_rate": 4.5212284220786494e-05, + "loss": 0.0163, + "step": 11080 + }, + { + "epoch": 5.211466165413534, + "grad_norm": 0.13692985475063324, + "learning_rate": 4.5129997196930845e-05, + "loss": 0.0125, + "step": 11090 + }, + { + "epoch": 5.216165413533835, + "grad_norm": 0.11966300755739212, + "learning_rate": 4.504772348747687e-05, + "loss": 0.0133, + "step": 11100 + }, + { + "epoch": 5.220864661654136, + "grad_norm": 0.22742719948291779, + "learning_rate": 4.496546331735778e-05, + "loss": 0.0193, + "step": 11110 + }, + { + "epoch": 5.225563909774436, + "grad_norm": 0.17194905877113342, + "learning_rate": 4.488321691146975e-05, + "loss": 0.0163, + "step": 11120 + }, + { + "epoch": 5.230263157894737, + "grad_norm": 0.10848580300807953, + "learning_rate": 4.480098449467132e-05, + "loss": 0.011, + "step": 11130 + }, + { + "epoch": 5.234962406015038, + "grad_norm": 0.12128433585166931, + "learning_rate": 4.471876629178273e-05, + "loss": 0.0153, + "step": 11140 + }, + { + "epoch": 5.239661654135339, + "grad_norm": 0.19572050869464874, + "learning_rate": 4.463656252758542e-05, + "loss": 0.0174, + "step": 11150 + }, + { + "epoch": 5.2443609022556394, + "grad_norm": 0.18351411819458008, + "learning_rate": 4.4554373426821374e-05, + "loss": 0.0197, + "step": 11160 + }, + { + "epoch": 5.24906015037594, + "grad_norm": 0.14616544544696808, + "learning_rate": 4.447219921419244e-05, + "loss": 0.0186, + "step": 11170 + }, + { + "epoch": 5.253759398496241, + "grad_norm": 0.1219051256775856, + "learning_rate": 4.439004011435979e-05, + "loss": 0.0167, + "step": 11180 + }, + { + "epoch": 5.258458646616542, + "grad_norm": 0.1375686526298523, + "learning_rate": 4.430789635194324e-05, + "loss": 0.0223, + "step": 11190 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 0.1277266889810562, + "learning_rate": 4.4225768151520694e-05, + "loss": 0.0132, + "step": 11200 + }, + { + "epoch": 5.267857142857143, + "grad_norm": 0.16868142783641815, + "learning_rate": 4.414365573762755e-05, + "loss": 0.0292, + "step": 11210 + }, + { + "epoch": 5.272556390977444, + "grad_norm": 0.091234490275383, + "learning_rate": 4.406155933475599e-05, + "loss": 0.0165, + "step": 11220 + }, + { + "epoch": 5.277255639097745, + "grad_norm": 0.11484044045209885, + "learning_rate": 4.3979479167354477e-05, + "loss": 0.0167, + "step": 11230 + }, + { + "epoch": 5.2819548872180455, + "grad_norm": 0.14860452711582184, + "learning_rate": 4.3897415459827e-05, + "loss": 0.019, + "step": 11240 + }, + { + "epoch": 5.286654135338346, + "grad_norm": 0.13529516756534576, + "learning_rate": 4.381536843653262e-05, + "loss": 0.0158, + "step": 11250 + }, + { + "epoch": 5.291353383458647, + "grad_norm": 0.09828982502222061, + "learning_rate": 4.373333832178478e-05, + "loss": 0.0129, + "step": 11260 + }, + { + "epoch": 5.296052631578947, + "grad_norm": 0.09586787968873978, + "learning_rate": 4.365132533985071e-05, + "loss": 0.0189, + "step": 11270 + }, + { + "epoch": 5.3007518796992485, + "grad_norm": 0.10568477213382721, + "learning_rate": 4.3569329714950704e-05, + "loss": 0.0194, + "step": 11280 + }, + { + "epoch": 5.305451127819548, + "grad_norm": 0.14483776688575745, + "learning_rate": 4.348735167125771e-05, + "loss": 0.016, + "step": 11290 + }, + { + "epoch": 5.31015037593985, + "grad_norm": 0.07663799077272415, + "learning_rate": 4.3405391432896555e-05, + "loss": 0.0125, + "step": 11300 + }, + { + "epoch": 5.31484962406015, + "grad_norm": 0.18015138804912567, + "learning_rate": 4.3323449223943416e-05, + "loss": 0.0151, + "step": 11310 + }, + { + "epoch": 5.319548872180452, + "grad_norm": 0.12221848219633102, + "learning_rate": 4.324152526842517e-05, + "loss": 0.0195, + "step": 11320 + }, + { + "epoch": 5.3242481203007515, + "grad_norm": 0.10736917704343796, + "learning_rate": 4.315961979031875e-05, + "loss": 0.018, + "step": 11330 + }, + { + "epoch": 5.328947368421053, + "grad_norm": 0.15660639107227325, + "learning_rate": 4.307773301355062e-05, + "loss": 0.0162, + "step": 11340 + }, + { + "epoch": 5.333646616541353, + "grad_norm": 0.16791242361068726, + "learning_rate": 4.2995865161996105e-05, + "loss": 0.0146, + "step": 11350 + }, + { + "epoch": 5.338345864661654, + "grad_norm": 0.08962516486644745, + "learning_rate": 4.291401645947879e-05, + "loss": 0.0169, + "step": 11360 + }, + { + "epoch": 5.3430451127819545, + "grad_norm": 0.14809159934520721, + "learning_rate": 4.283218712976992e-05, + "loss": 0.0148, + "step": 11370 + }, + { + "epoch": 5.347744360902255, + "grad_norm": 0.10135837644338608, + "learning_rate": 4.275037739658771e-05, + "loss": 0.0226, + "step": 11380 + }, + { + "epoch": 5.352443609022556, + "grad_norm": 0.13660521805286407, + "learning_rate": 4.2668587483596864e-05, + "loss": 0.018, + "step": 11390 + }, + { + "epoch": 5.357142857142857, + "grad_norm": 0.08671059459447861, + "learning_rate": 4.2586817614407895e-05, + "loss": 0.0124, + "step": 11400 + }, + { + "epoch": 5.3618421052631575, + "grad_norm": 0.09788667410612106, + "learning_rate": 4.250506801257653e-05, + "loss": 0.0151, + "step": 11410 + }, + { + "epoch": 5.366541353383458, + "grad_norm": 0.15231043100357056, + "learning_rate": 4.2423338901602985e-05, + "loss": 0.0176, + "step": 11420 + }, + { + "epoch": 5.371240601503759, + "grad_norm": 0.13623958826065063, + "learning_rate": 4.234163050493158e-05, + "loss": 0.0238, + "step": 11430 + }, + { + "epoch": 5.37593984962406, + "grad_norm": 0.16451093554496765, + "learning_rate": 4.2259943045949934e-05, + "loss": 0.0159, + "step": 11440 + }, + { + "epoch": 5.3806390977443606, + "grad_norm": 0.1713268756866455, + "learning_rate": 4.2178276747988446e-05, + "loss": 0.0228, + "step": 11450 + }, + { + "epoch": 5.385338345864661, + "grad_norm": 0.1485781967639923, + "learning_rate": 4.209663183431969e-05, + "loss": 0.0239, + "step": 11460 + }, + { + "epoch": 5.390037593984962, + "grad_norm": 0.09829729050397873, + "learning_rate": 4.201500852815768e-05, + "loss": 0.0132, + "step": 11470 + }, + { + "epoch": 5.394736842105263, + "grad_norm": 0.1141219288110733, + "learning_rate": 4.1933407052657456e-05, + "loss": 0.0158, + "step": 11480 + }, + { + "epoch": 5.399436090225564, + "grad_norm": 0.11218131333589554, + "learning_rate": 4.1851827630914305e-05, + "loss": 0.0118, + "step": 11490 + }, + { + "epoch": 5.404135338345864, + "grad_norm": 0.11761670559644699, + "learning_rate": 4.17702704859633e-05, + "loss": 0.0196, + "step": 11500 + }, + { + "epoch": 5.408834586466165, + "grad_norm": 0.11419840902090073, + "learning_rate": 4.1688735840778546e-05, + "loss": 0.0143, + "step": 11510 + }, + { + "epoch": 5.413533834586466, + "grad_norm": 0.10110355168581009, + "learning_rate": 4.160722391827262e-05, + "loss": 0.0177, + "step": 11520 + }, + { + "epoch": 5.418233082706767, + "grad_norm": 0.08888833969831467, + "learning_rate": 4.1525734941296026e-05, + "loss": 0.0214, + "step": 11530 + }, + { + "epoch": 5.422932330827067, + "grad_norm": 0.08826860040426254, + "learning_rate": 4.14442691326365e-05, + "loss": 0.0119, + "step": 11540 + }, + { + "epoch": 5.427631578947368, + "grad_norm": 0.1564245969057083, + "learning_rate": 4.13628267150185e-05, + "loss": 0.0181, + "step": 11550 + }, + { + "epoch": 5.432330827067669, + "grad_norm": 0.18380986154079437, + "learning_rate": 4.1281407911102425e-05, + "loss": 0.021, + "step": 11560 + }, + { + "epoch": 5.43703007518797, + "grad_norm": 0.17931777238845825, + "learning_rate": 4.120001294348421e-05, + "loss": 0.0142, + "step": 11570 + }, + { + "epoch": 5.44172932330827, + "grad_norm": 0.06229656562209129, + "learning_rate": 4.111864203469457e-05, + "loss": 0.0225, + "step": 11580 + }, + { + "epoch": 5.446428571428571, + "grad_norm": 0.12182196229696274, + "learning_rate": 4.103729540719847e-05, + "loss": 0.0153, + "step": 11590 + }, + { + "epoch": 5.451127819548872, + "grad_norm": 0.16493044793605804, + "learning_rate": 4.095597328339452e-05, + "loss": 0.0141, + "step": 11600 + }, + { + "epoch": 5.455827067669173, + "grad_norm": 0.07192741334438324, + "learning_rate": 4.087467588561424e-05, + "loss": 0.0114, + "step": 11610 + }, + { + "epoch": 5.4605263157894735, + "grad_norm": 0.14149798452854156, + "learning_rate": 4.079340343612165e-05, + "loss": 0.0167, + "step": 11620 + }, + { + "epoch": 5.465225563909774, + "grad_norm": 0.14575918018817902, + "learning_rate": 4.07121561571125e-05, + "loss": 0.0162, + "step": 11630 + }, + { + "epoch": 5.469924812030075, + "grad_norm": 0.15547528862953186, + "learning_rate": 4.063093427071376e-05, + "loss": 0.0154, + "step": 11640 + }, + { + "epoch": 5.474624060150376, + "grad_norm": 0.12537823617458344, + "learning_rate": 4.0549737998983e-05, + "loss": 0.0202, + "step": 11650 + }, + { + "epoch": 5.4793233082706765, + "grad_norm": 0.1138703003525734, + "learning_rate": 4.046856756390767e-05, + "loss": 0.02, + "step": 11660 + }, + { + "epoch": 5.484022556390977, + "grad_norm": 0.13138170540332794, + "learning_rate": 4.038742318740465e-05, + "loss": 0.0204, + "step": 11670 + }, + { + "epoch": 5.488721804511278, + "grad_norm": 0.10835835337638855, + "learning_rate": 4.0306305091319595e-05, + "loss": 0.0174, + "step": 11680 + }, + { + "epoch": 5.493421052631579, + "grad_norm": 0.09958133846521378, + "learning_rate": 4.0225213497426276e-05, + "loss": 0.019, + "step": 11690 + }, + { + "epoch": 5.4981203007518795, + "grad_norm": 0.14578290283679962, + "learning_rate": 4.0144148627425993e-05, + "loss": 0.0233, + "step": 11700 + }, + { + "epoch": 5.50281954887218, + "grad_norm": 0.18853400647640228, + "learning_rate": 4.006311070294702e-05, + "loss": 0.0255, + "step": 11710 + }, + { + "epoch": 5.507518796992481, + "grad_norm": 0.1436021775007248, + "learning_rate": 3.9982099945543945e-05, + "loss": 0.0158, + "step": 11720 + }, + { + "epoch": 5.512218045112782, + "grad_norm": 0.10432905703783035, + "learning_rate": 3.9901116576697083e-05, + "loss": 0.0173, + "step": 11730 + }, + { + "epoch": 5.5169172932330826, + "grad_norm": 0.13678883016109467, + "learning_rate": 3.982016081781189e-05, + "loss": 0.0174, + "step": 11740 + }, + { + "epoch": 5.521616541353383, + "grad_norm": 0.15448826551437378, + "learning_rate": 3.973923289021829e-05, + "loss": 0.016, + "step": 11750 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 0.16062240302562714, + "learning_rate": 3.965833301517017e-05, + "loss": 0.0322, + "step": 11760 + }, + { + "epoch": 5.531015037593985, + "grad_norm": 0.11189954727888107, + "learning_rate": 3.9577461413844684e-05, + "loss": 0.02, + "step": 11770 + }, + { + "epoch": 5.535714285714286, + "grad_norm": 0.171811044216156, + "learning_rate": 3.949661830734172e-05, + "loss": 0.0175, + "step": 11780 + }, + { + "epoch": 5.540413533834586, + "grad_norm": 0.12942734360694885, + "learning_rate": 3.9415803916683224e-05, + "loss": 0.0131, + "step": 11790 + }, + { + "epoch": 5.545112781954887, + "grad_norm": 0.11269880831241608, + "learning_rate": 3.933501846281267e-05, + "loss": 0.0221, + "step": 11800 + }, + { + "epoch": 5.549812030075188, + "grad_norm": 0.12639208137989044, + "learning_rate": 3.925426216659438e-05, + "loss": 0.0178, + "step": 11810 + }, + { + "epoch": 5.554511278195489, + "grad_norm": 0.21581538021564484, + "learning_rate": 3.917353524881302e-05, + "loss": 0.0152, + "step": 11820 + }, + { + "epoch": 5.559210526315789, + "grad_norm": 0.14881852269172668, + "learning_rate": 3.9092837930172884e-05, + "loss": 0.0156, + "step": 11830 + }, + { + "epoch": 5.56390977443609, + "grad_norm": 0.18276169896125793, + "learning_rate": 3.901217043129735e-05, + "loss": 0.023, + "step": 11840 + }, + { + "epoch": 5.568609022556391, + "grad_norm": 0.10481563955545425, + "learning_rate": 3.8931532972728285e-05, + "loss": 0.0139, + "step": 11850 + }, + { + "epoch": 5.573308270676692, + "grad_norm": 0.1183846965432167, + "learning_rate": 3.8850925774925425e-05, + "loss": 0.0151, + "step": 11860 + }, + { + "epoch": 5.578007518796992, + "grad_norm": 0.12934796512126923, + "learning_rate": 3.877034905826577e-05, + "loss": 0.0159, + "step": 11870 + }, + { + "epoch": 5.582706766917293, + "grad_norm": 0.12058614939451218, + "learning_rate": 3.8689803043043e-05, + "loss": 0.0164, + "step": 11880 + }, + { + "epoch": 5.587406015037594, + "grad_norm": 0.10819224268198013, + "learning_rate": 3.860928794946682e-05, + "loss": 0.0106, + "step": 11890 + }, + { + "epoch": 5.592105263157895, + "grad_norm": 0.1796448975801468, + "learning_rate": 3.852880399766243e-05, + "loss": 0.0177, + "step": 11900 + }, + { + "epoch": 5.5968045112781954, + "grad_norm": 0.14283166825771332, + "learning_rate": 3.844835140766988e-05, + "loss": 0.0136, + "step": 11910 + }, + { + "epoch": 5.601503759398496, + "grad_norm": 0.14546529948711395, + "learning_rate": 3.836793039944349e-05, + "loss": 0.0117, + "step": 11920 + }, + { + "epoch": 5.606203007518797, + "grad_norm": 0.1672084927558899, + "learning_rate": 3.828754119285123e-05, + "loss": 0.0126, + "step": 11930 + }, + { + "epoch": 5.610902255639098, + "grad_norm": 0.08551529794931412, + "learning_rate": 3.820718400767409e-05, + "loss": 0.0162, + "step": 11940 + }, + { + "epoch": 5.6156015037593985, + "grad_norm": 0.1129482090473175, + "learning_rate": 3.812685906360557e-05, + "loss": 0.0151, + "step": 11950 + }, + { + "epoch": 5.620300751879699, + "grad_norm": 0.17113980650901794, + "learning_rate": 3.8046566580251e-05, + "loss": 0.0136, + "step": 11960 + }, + { + "epoch": 5.625, + "grad_norm": 0.08610133826732635, + "learning_rate": 3.796630677712697e-05, + "loss": 0.0162, + "step": 11970 + }, + { + "epoch": 5.629699248120301, + "grad_norm": 0.16481080651283264, + "learning_rate": 3.788607987366069e-05, + "loss": 0.0158, + "step": 11980 + }, + { + "epoch": 5.6343984962406015, + "grad_norm": 0.1907389611005783, + "learning_rate": 3.780588608918947e-05, + "loss": 0.018, + "step": 11990 + }, + { + "epoch": 5.639097744360902, + "grad_norm": 0.10017523914575577, + "learning_rate": 3.772572564296005e-05, + "loss": 0.0147, + "step": 12000 + }, + { + "epoch": 5.643796992481203, + "grad_norm": 0.2201337069272995, + "learning_rate": 3.764559875412803e-05, + "loss": 0.0159, + "step": 12010 + }, + { + "epoch": 5.648496240601504, + "grad_norm": 0.0887899100780487, + "learning_rate": 3.756550564175727e-05, + "loss": 0.0139, + "step": 12020 + }, + { + "epoch": 5.6531954887218046, + "grad_norm": 0.1127086952328682, + "learning_rate": 3.748544652481927e-05, + "loss": 0.0146, + "step": 12030 + }, + { + "epoch": 5.657894736842105, + "grad_norm": 0.2131458818912506, + "learning_rate": 3.74054216221926e-05, + "loss": 0.0215, + "step": 12040 + }, + { + "epoch": 5.662593984962406, + "grad_norm": 0.10316726565361023, + "learning_rate": 3.73254311526623e-05, + "loss": 0.0129, + "step": 12050 + }, + { + "epoch": 5.667293233082707, + "grad_norm": 0.1518326699733734, + "learning_rate": 3.7245475334919246e-05, + "loss": 0.0114, + "step": 12060 + }, + { + "epoch": 5.671992481203008, + "grad_norm": 0.1073705404996872, + "learning_rate": 3.716555438755961e-05, + "loss": 0.0218, + "step": 12070 + }, + { + "epoch": 5.676691729323308, + "grad_norm": 0.15070666372776031, + "learning_rate": 3.7085668529084184e-05, + "loss": 0.0182, + "step": 12080 + }, + { + "epoch": 5.681390977443609, + "grad_norm": 0.09960098564624786, + "learning_rate": 3.700581797789786e-05, + "loss": 0.0128, + "step": 12090 + }, + { + "epoch": 5.68609022556391, + "grad_norm": 0.13308414816856384, + "learning_rate": 3.6926002952309016e-05, + "loss": 0.0111, + "step": 12100 + }, + { + "epoch": 5.690789473684211, + "grad_norm": 0.1321004182100296, + "learning_rate": 3.684622367052887e-05, + "loss": 0.0157, + "step": 12110 + }, + { + "epoch": 5.695488721804511, + "grad_norm": 0.12728892266750336, + "learning_rate": 3.676648035067093e-05, + "loss": 0.0161, + "step": 12120 + }, + { + "epoch": 5.700187969924812, + "grad_norm": 0.08147571980953217, + "learning_rate": 3.6686773210750385e-05, + "loss": 0.0134, + "step": 12130 + }, + { + "epoch": 5.704887218045113, + "grad_norm": 0.1688104271888733, + "learning_rate": 3.6607102468683526e-05, + "loss": 0.0147, + "step": 12140 + }, + { + "epoch": 5.709586466165414, + "grad_norm": 0.11916866153478622, + "learning_rate": 3.65274683422871e-05, + "loss": 0.0238, + "step": 12150 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.12698572874069214, + "learning_rate": 3.6447871049277796e-05, + "loss": 0.0151, + "step": 12160 + }, + { + "epoch": 5.718984962406015, + "grad_norm": 0.09343686699867249, + "learning_rate": 3.636831080727154e-05, + "loss": 0.0104, + "step": 12170 + }, + { + "epoch": 5.723684210526316, + "grad_norm": 0.13863399624824524, + "learning_rate": 3.628878783378302e-05, + "loss": 0.0251, + "step": 12180 + }, + { + "epoch": 5.728383458646617, + "grad_norm": 0.10305328667163849, + "learning_rate": 3.6209302346225006e-05, + "loss": 0.0127, + "step": 12190 + }, + { + "epoch": 5.7330827067669174, + "grad_norm": 0.13228711485862732, + "learning_rate": 3.612985456190778e-05, + "loss": 0.0106, + "step": 12200 + }, + { + "epoch": 5.737781954887218, + "grad_norm": 0.18315477669239044, + "learning_rate": 3.605044469803854e-05, + "loss": 0.015, + "step": 12210 + }, + { + "epoch": 5.742481203007519, + "grad_norm": 0.09973251819610596, + "learning_rate": 3.597107297172084e-05, + "loss": 0.0154, + "step": 12220 + }, + { + "epoch": 5.74718045112782, + "grad_norm": 0.12596507370471954, + "learning_rate": 3.5891739599953945e-05, + "loss": 0.0165, + "step": 12230 + }, + { + "epoch": 5.7518796992481205, + "grad_norm": 0.12065248191356659, + "learning_rate": 3.581244479963225e-05, + "loss": 0.0165, + "step": 12240 + }, + { + "epoch": 5.756578947368421, + "grad_norm": 0.182419553399086, + "learning_rate": 3.5733188787544745e-05, + "loss": 0.0141, + "step": 12250 + }, + { + "epoch": 5.761278195488722, + "grad_norm": 0.1837804764509201, + "learning_rate": 3.5653971780374295e-05, + "loss": 0.0214, + "step": 12260 + }, + { + "epoch": 5.765977443609023, + "grad_norm": 0.10047575831413269, + "learning_rate": 3.557479399469721e-05, + "loss": 0.012, + "step": 12270 + }, + { + "epoch": 5.7706766917293235, + "grad_norm": 0.10815638303756714, + "learning_rate": 3.5495655646982505e-05, + "loss": 0.0165, + "step": 12280 + }, + { + "epoch": 5.775375939849624, + "grad_norm": 0.1140371561050415, + "learning_rate": 3.541655695359142e-05, + "loss": 0.01, + "step": 12290 + }, + { + "epoch": 5.780075187969925, + "grad_norm": 0.10626989603042603, + "learning_rate": 3.533749813077677e-05, + "loss": 0.0158, + "step": 12300 + }, + { + "epoch": 5.784774436090226, + "grad_norm": 0.15287335216999054, + "learning_rate": 3.525847939468233e-05, + "loss": 0.0152, + "step": 12310 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 0.08385778963565826, + "learning_rate": 3.517950096134232e-05, + "loss": 0.0207, + "step": 12320 + }, + { + "epoch": 5.794172932330827, + "grad_norm": 0.08987794816493988, + "learning_rate": 3.5100563046680764e-05, + "loss": 0.0141, + "step": 12330 + }, + { + "epoch": 5.798872180451128, + "grad_norm": 0.11704299598932266, + "learning_rate": 3.5021665866510925e-05, + "loss": 0.021, + "step": 12340 + }, + { + "epoch": 5.803571428571429, + "grad_norm": 0.15203259885311127, + "learning_rate": 3.494280963653463e-05, + "loss": 0.0169, + "step": 12350 + }, + { + "epoch": 5.80827067669173, + "grad_norm": 0.07202760130167007, + "learning_rate": 3.4863994572341843e-05, + "loss": 0.0171, + "step": 12360 + }, + { + "epoch": 5.81296992481203, + "grad_norm": 0.16119526326656342, + "learning_rate": 3.478522088940993e-05, + "loss": 0.0192, + "step": 12370 + }, + { + "epoch": 5.817669172932331, + "grad_norm": 0.1303485631942749, + "learning_rate": 3.470648880310313e-05, + "loss": 0.0175, + "step": 12380 + }, + { + "epoch": 5.822368421052632, + "grad_norm": 0.10909580439329147, + "learning_rate": 3.462779852867197e-05, + "loss": 0.0255, + "step": 12390 + }, + { + "epoch": 5.827067669172933, + "grad_norm": 0.13761353492736816, + "learning_rate": 3.4549150281252636e-05, + "loss": 0.0156, + "step": 12400 + }, + { + "epoch": 5.831766917293233, + "grad_norm": 0.11712969839572906, + "learning_rate": 3.447054427586644e-05, + "loss": 0.0127, + "step": 12410 + }, + { + "epoch": 5.836466165413534, + "grad_norm": 0.1407334804534912, + "learning_rate": 3.439198072741921e-05, + "loss": 0.0161, + "step": 12420 + }, + { + "epoch": 5.841165413533835, + "grad_norm": 0.1053805947303772, + "learning_rate": 3.431345985070067e-05, + "loss": 0.0176, + "step": 12430 + }, + { + "epoch": 5.845864661654136, + "grad_norm": 0.17800423502922058, + "learning_rate": 3.423498186038393e-05, + "loss": 0.0182, + "step": 12440 + }, + { + "epoch": 5.850563909774436, + "grad_norm": 0.09566630423069, + "learning_rate": 3.4156546971024784e-05, + "loss": 0.016, + "step": 12450 + }, + { + "epoch": 5.855263157894737, + "grad_norm": 0.12941595911979675, + "learning_rate": 3.407815539706124e-05, + "loss": 0.0159, + "step": 12460 + }, + { + "epoch": 5.859962406015038, + "grad_norm": 0.12009347975254059, + "learning_rate": 3.399980735281286e-05, + "loss": 0.0125, + "step": 12470 + }, + { + "epoch": 5.864661654135339, + "grad_norm": 0.06218589097261429, + "learning_rate": 3.392150305248024e-05, + "loss": 0.0189, + "step": 12480 + }, + { + "epoch": 5.8693609022556394, + "grad_norm": 0.1344054788351059, + "learning_rate": 3.384324271014429e-05, + "loss": 0.0176, + "step": 12490 + }, + { + "epoch": 5.87406015037594, + "grad_norm": 0.11987704038619995, + "learning_rate": 3.3765026539765834e-05, + "loss": 0.0185, + "step": 12500 + }, + { + "epoch": 5.878759398496241, + "grad_norm": 0.13597393035888672, + "learning_rate": 3.368685475518488e-05, + "loss": 0.018, + "step": 12510 + }, + { + "epoch": 5.883458646616542, + "grad_norm": 0.10128919035196304, + "learning_rate": 3.360872757012011e-05, + "loss": 0.0177, + "step": 12520 + }, + { + "epoch": 5.8881578947368425, + "grad_norm": 0.1099942997097969, + "learning_rate": 3.3530645198168295e-05, + "loss": 0.0184, + "step": 12530 + }, + { + "epoch": 5.892857142857143, + "grad_norm": 0.149112269282341, + "learning_rate": 3.3452607852803584e-05, + "loss": 0.0123, + "step": 12540 + }, + { + "epoch": 5.897556390977444, + "grad_norm": 0.20705215632915497, + "learning_rate": 3.337461574737716e-05, + "loss": 0.019, + "step": 12550 + }, + { + "epoch": 5.902255639097744, + "grad_norm": 0.14608100056648254, + "learning_rate": 3.329666909511645e-05, + "loss": 0.0168, + "step": 12560 + }, + { + "epoch": 5.9069548872180455, + "grad_norm": 0.10303889214992523, + "learning_rate": 3.321876810912461e-05, + "loss": 0.0217, + "step": 12570 + }, + { + "epoch": 5.911654135338345, + "grad_norm": 0.13910971581935883, + "learning_rate": 3.3140913002379995e-05, + "loss": 0.0163, + "step": 12580 + }, + { + "epoch": 5.916353383458647, + "grad_norm": 0.080747589468956, + "learning_rate": 3.3063103987735433e-05, + "loss": 0.0106, + "step": 12590 + }, + { + "epoch": 5.921052631578947, + "grad_norm": 0.08607535809278488, + "learning_rate": 3.298534127791785e-05, + "loss": 0.0188, + "step": 12600 + }, + { + "epoch": 5.9257518796992485, + "grad_norm": 0.11979581415653229, + "learning_rate": 3.2907625085527503e-05, + "loss": 0.0157, + "step": 12610 + }, + { + "epoch": 5.930451127819548, + "grad_norm": 0.12914730608463287, + "learning_rate": 3.282995562303754e-05, + "loss": 0.0155, + "step": 12620 + }, + { + "epoch": 5.93515037593985, + "grad_norm": 0.09366561472415924, + "learning_rate": 3.275233310279321e-05, + "loss": 0.0102, + "step": 12630 + }, + { + "epoch": 5.93984962406015, + "grad_norm": 0.10397497564554214, + "learning_rate": 3.267475773701161e-05, + "loss": 0.0161, + "step": 12640 + }, + { + "epoch": 5.944548872180452, + "grad_norm": 0.09889456629753113, + "learning_rate": 3.2597229737780774e-05, + "loss": 0.0143, + "step": 12650 + }, + { + "epoch": 5.9492481203007515, + "grad_norm": 0.0786278024315834, + "learning_rate": 3.251974931705933e-05, + "loss": 0.013, + "step": 12660 + }, + { + "epoch": 5.953947368421053, + "grad_norm": 0.1270080953836441, + "learning_rate": 3.244231668667578e-05, + "loss": 0.0204, + "step": 12670 + }, + { + "epoch": 5.958646616541353, + "grad_norm": 0.10432140529155731, + "learning_rate": 3.236493205832795e-05, + "loss": 0.0138, + "step": 12680 + }, + { + "epoch": 5.963345864661655, + "grad_norm": 0.12858955562114716, + "learning_rate": 3.228759564358248e-05, + "loss": 0.0136, + "step": 12690 + }, + { + "epoch": 5.9680451127819545, + "grad_norm": 0.093807153403759, + "learning_rate": 3.221030765387417e-05, + "loss": 0.0101, + "step": 12700 + }, + { + "epoch": 5.972744360902256, + "grad_norm": 0.09169401228427887, + "learning_rate": 3.2133068300505455e-05, + "loss": 0.0107, + "step": 12710 + }, + { + "epoch": 5.977443609022556, + "grad_norm": 0.1795411854982376, + "learning_rate": 3.205587779464576e-05, + "loss": 0.0219, + "step": 12720 + }, + { + "epoch": 5.982142857142857, + "grad_norm": 0.12922433018684387, + "learning_rate": 3.197873634733096e-05, + "loss": 0.0166, + "step": 12730 + }, + { + "epoch": 5.9868421052631575, + "grad_norm": 0.08514426648616791, + "learning_rate": 3.190164416946285e-05, + "loss": 0.0165, + "step": 12740 + }, + { + "epoch": 5.991541353383458, + "grad_norm": 0.09083666652441025, + "learning_rate": 3.18246014718085e-05, + "loss": 0.0123, + "step": 12750 + }, + { + "epoch": 5.996240601503759, + "grad_norm": 0.14577679336071014, + "learning_rate": 3.1747608464999725e-05, + "loss": 0.018, + "step": 12760 + }, + { + "epoch": 6.00093984962406, + "grad_norm": 0.09893287718296051, + "learning_rate": 3.167066535953242e-05, + "loss": 0.0163, + "step": 12770 + }, + { + "epoch": 6.0056390977443606, + "grad_norm": 0.09252132475376129, + "learning_rate": 3.1593772365766105e-05, + "loss": 0.0147, + "step": 12780 + }, + { + "epoch": 6.010338345864661, + "grad_norm": 0.09168685972690582, + "learning_rate": 3.1516929693923315e-05, + "loss": 0.0162, + "step": 12790 + }, + { + "epoch": 6.015037593984962, + "grad_norm": 0.16114775836467743, + "learning_rate": 3.144013755408895e-05, + "loss": 0.0165, + "step": 12800 + }, + { + "epoch": 6.019736842105263, + "grad_norm": 0.15589860081672668, + "learning_rate": 3.136339615620985e-05, + "loss": 0.0176, + "step": 12810 + }, + { + "epoch": 6.024436090225564, + "grad_norm": 0.11967472732067108, + "learning_rate": 3.128670571009399e-05, + "loss": 0.017, + "step": 12820 + }, + { + "epoch": 6.029135338345864, + "grad_norm": 0.10718057304620743, + "learning_rate": 3.121006642541014e-05, + "loss": 0.0134, + "step": 12830 + }, + { + "epoch": 6.033834586466165, + "grad_norm": 0.13733793795108795, + "learning_rate": 3.113347851168721e-05, + "loss": 0.0191, + "step": 12840 + }, + { + "epoch": 6.038533834586466, + "grad_norm": 0.17362140119075775, + "learning_rate": 3.105694217831361e-05, + "loss": 0.0113, + "step": 12850 + }, + { + "epoch": 6.043233082706767, + "grad_norm": 0.09463926404714584, + "learning_rate": 3.098045763453678e-05, + "loss": 0.0138, + "step": 12860 + }, + { + "epoch": 6.047932330827067, + "grad_norm": 0.14590539038181305, + "learning_rate": 3.090402508946249e-05, + "loss": 0.013, + "step": 12870 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 0.06830133497714996, + "learning_rate": 3.082764475205442e-05, + "loss": 0.0116, + "step": 12880 + }, + { + "epoch": 6.057330827067669, + "grad_norm": 0.06772523373365402, + "learning_rate": 3.075131683113352e-05, + "loss": 0.0107, + "step": 12890 + }, + { + "epoch": 6.06203007518797, + "grad_norm": 0.07648127526044846, + "learning_rate": 3.0675041535377405e-05, + "loss": 0.0112, + "step": 12900 + }, + { + "epoch": 6.06672932330827, + "grad_norm": 0.10932531207799911, + "learning_rate": 3.059881907331979e-05, + "loss": 0.0125, + "step": 12910 + }, + { + "epoch": 6.071428571428571, + "grad_norm": 0.18748976290225983, + "learning_rate": 3.052264965335e-05, + "loss": 0.0164, + "step": 12920 + }, + { + "epoch": 6.076127819548872, + "grad_norm": 0.11473233997821808, + "learning_rate": 3.0446533483712304e-05, + "loss": 0.0143, + "step": 12930 + }, + { + "epoch": 6.080827067669173, + "grad_norm": 0.1189013198018074, + "learning_rate": 3.0370470772505433e-05, + "loss": 0.0194, + "step": 12940 + }, + { + "epoch": 6.0855263157894735, + "grad_norm": 0.05811845138669014, + "learning_rate": 3.0294461727681932e-05, + "loss": 0.0126, + "step": 12950 + }, + { + "epoch": 6.090225563909774, + "grad_norm": 0.14157220721244812, + "learning_rate": 3.0218506557047598e-05, + "loss": 0.0127, + "step": 12960 + }, + { + "epoch": 6.094924812030075, + "grad_norm": 0.06045945733785629, + "learning_rate": 3.0142605468260978e-05, + "loss": 0.0134, + "step": 12970 + }, + { + "epoch": 6.099624060150376, + "grad_norm": 0.15809015929698944, + "learning_rate": 3.006675866883275e-05, + "loss": 0.016, + "step": 12980 + }, + { + "epoch": 6.1043233082706765, + "grad_norm": 0.09324190765619278, + "learning_rate": 2.999096636612518e-05, + "loss": 0.0111, + "step": 12990 + }, + { + "epoch": 6.109022556390977, + "grad_norm": 0.08457409590482712, + "learning_rate": 2.991522876735154e-05, + "loss": 0.0131, + "step": 13000 + }, + { + "epoch": 6.113721804511278, + "grad_norm": 0.1202535331249237, + "learning_rate": 2.9839546079575497e-05, + "loss": 0.0169, + "step": 13010 + }, + { + "epoch": 6.118421052631579, + "grad_norm": 0.13873109221458435, + "learning_rate": 2.976391850971065e-05, + "loss": 0.0159, + "step": 13020 + }, + { + "epoch": 6.1231203007518795, + "grad_norm": 0.08397231251001358, + "learning_rate": 2.9688346264519866e-05, + "loss": 0.014, + "step": 13030 + }, + { + "epoch": 6.12781954887218, + "grad_norm": 0.06623484939336777, + "learning_rate": 2.9612829550614836e-05, + "loss": 0.0156, + "step": 13040 + }, + { + "epoch": 6.132518796992481, + "grad_norm": 0.11314690858125687, + "learning_rate": 2.9537368574455304e-05, + "loss": 0.0253, + "step": 13050 + }, + { + "epoch": 6.137218045112782, + "grad_norm": 0.07116863131523132, + "learning_rate": 2.9461963542348737e-05, + "loss": 0.014, + "step": 13060 + }, + { + "epoch": 6.1419172932330826, + "grad_norm": 0.1338961273431778, + "learning_rate": 2.9386614660449596e-05, + "loss": 0.0178, + "step": 13070 + }, + { + "epoch": 6.146616541353383, + "grad_norm": 0.16874520480632782, + "learning_rate": 2.931132213475884e-05, + "loss": 0.0169, + "step": 13080 + }, + { + "epoch": 6.151315789473684, + "grad_norm": 0.09894317388534546, + "learning_rate": 2.9236086171123404e-05, + "loss": 0.0117, + "step": 13090 + }, + { + "epoch": 6.156015037593985, + "grad_norm": 0.14408692717552185, + "learning_rate": 2.916090697523549e-05, + "loss": 0.0174, + "step": 13100 + }, + { + "epoch": 6.160714285714286, + "grad_norm": 0.16142411530017853, + "learning_rate": 2.9085784752632157e-05, + "loss": 0.02, + "step": 13110 + }, + { + "epoch": 6.165413533834586, + "grad_norm": 0.10157699137926102, + "learning_rate": 2.9010719708694722e-05, + "loss": 0.0178, + "step": 13120 + }, + { + "epoch": 6.170112781954887, + "grad_norm": 0.11045944690704346, + "learning_rate": 2.8935712048648112e-05, + "loss": 0.0141, + "step": 13130 + }, + { + "epoch": 6.174812030075188, + "grad_norm": 0.16046114265918732, + "learning_rate": 2.8860761977560436e-05, + "loss": 0.026, + "step": 13140 + }, + { + "epoch": 6.179511278195489, + "grad_norm": 0.10078492015600204, + "learning_rate": 2.878586970034232e-05, + "loss": 0.0205, + "step": 13150 + }, + { + "epoch": 6.184210526315789, + "grad_norm": 0.12906494736671448, + "learning_rate": 2.8711035421746367e-05, + "loss": 0.0267, + "step": 13160 + }, + { + "epoch": 6.18890977443609, + "grad_norm": 0.10753978043794632, + "learning_rate": 2.8636259346366666e-05, + "loss": 0.013, + "step": 13170 + }, + { + "epoch": 6.193609022556391, + "grad_norm": 0.12442290037870407, + "learning_rate": 2.8561541678638142e-05, + "loss": 0.0126, + "step": 13180 + }, + { + "epoch": 6.198308270676692, + "grad_norm": 0.16419479250907898, + "learning_rate": 2.8486882622836026e-05, + "loss": 0.0181, + "step": 13190 + }, + { + "epoch": 6.203007518796992, + "grad_norm": 0.1196175292134285, + "learning_rate": 2.8412282383075363e-05, + "loss": 0.017, + "step": 13200 + }, + { + "epoch": 6.207706766917293, + "grad_norm": 0.09084620326757431, + "learning_rate": 2.8337741163310317e-05, + "loss": 0.0122, + "step": 13210 + }, + { + "epoch": 6.212406015037594, + "grad_norm": 0.08656840771436691, + "learning_rate": 2.8263259167333777e-05, + "loss": 0.0203, + "step": 13220 + }, + { + "epoch": 6.217105263157895, + "grad_norm": 0.1413203328847885, + "learning_rate": 2.8188836598776662e-05, + "loss": 0.0155, + "step": 13230 + }, + { + "epoch": 6.2218045112781954, + "grad_norm": 0.1380605250597, + "learning_rate": 2.811447366110741e-05, + "loss": 0.0154, + "step": 13240 + }, + { + "epoch": 6.226503759398496, + "grad_norm": 0.13681164383888245, + "learning_rate": 2.804017055763149e-05, + "loss": 0.0131, + "step": 13250 + }, + { + "epoch": 6.231203007518797, + "grad_norm": 0.10424366593360901, + "learning_rate": 2.7965927491490705e-05, + "loss": 0.0173, + "step": 13260 + }, + { + "epoch": 6.235902255639098, + "grad_norm": 0.2111925184726715, + "learning_rate": 2.7891744665662823e-05, + "loss": 0.0165, + "step": 13270 + }, + { + "epoch": 6.2406015037593985, + "grad_norm": 0.09070563316345215, + "learning_rate": 2.7817622282960815e-05, + "loss": 0.0136, + "step": 13280 + }, + { + "epoch": 6.245300751879699, + "grad_norm": 0.1343836635351181, + "learning_rate": 2.774356054603243e-05, + "loss": 0.0137, + "step": 13290 + }, + { + "epoch": 6.25, + "grad_norm": 0.17384490370750427, + "learning_rate": 2.766955965735968e-05, + "loss": 0.0141, + "step": 13300 + }, + { + "epoch": 6.254699248120301, + "grad_norm": 0.07739756256341934, + "learning_rate": 2.7595619819258116e-05, + "loss": 0.0144, + "step": 13310 + }, + { + "epoch": 6.2593984962406015, + "grad_norm": 0.14672629535198212, + "learning_rate": 2.7521741233876496e-05, + "loss": 0.0195, + "step": 13320 + }, + { + "epoch": 6.264097744360902, + "grad_norm": 0.20344915986061096, + "learning_rate": 2.7447924103195976e-05, + "loss": 0.0133, + "step": 13330 + }, + { + "epoch": 6.268796992481203, + "grad_norm": 0.20322270691394806, + "learning_rate": 2.7374168629029813e-05, + "loss": 0.0199, + "step": 13340 + }, + { + "epoch": 6.273496240601504, + "grad_norm": 0.1515226662158966, + "learning_rate": 2.7300475013022663e-05, + "loss": 0.0193, + "step": 13350 + }, + { + "epoch": 6.2781954887218046, + "grad_norm": 0.1119297668337822, + "learning_rate": 2.7226843456650037e-05, + "loss": 0.0192, + "step": 13360 + }, + { + "epoch": 6.282894736842105, + "grad_norm": 0.0805499255657196, + "learning_rate": 2.7153274161217846e-05, + "loss": 0.009, + "step": 13370 + }, + { + "epoch": 6.287593984962406, + "grad_norm": 0.07355663180351257, + "learning_rate": 2.707976732786166e-05, + "loss": 0.0172, + "step": 13380 + }, + { + "epoch": 6.292293233082707, + "grad_norm": 0.06784632056951523, + "learning_rate": 2.7006323157546386e-05, + "loss": 0.0132, + "step": 13390 + }, + { + "epoch": 6.296992481203008, + "grad_norm": 0.15547586977481842, + "learning_rate": 2.693294185106562e-05, + "loss": 0.0138, + "step": 13400 + }, + { + "epoch": 6.301691729323308, + "grad_norm": 0.10337734967470169, + "learning_rate": 2.6859623609040984e-05, + "loss": 0.0117, + "step": 13410 + }, + { + "epoch": 6.306390977443609, + "grad_norm": 0.09480666369199753, + "learning_rate": 2.6786368631921836e-05, + "loss": 0.016, + "step": 13420 + }, + { + "epoch": 6.31109022556391, + "grad_norm": 0.16595226526260376, + "learning_rate": 2.67131771199844e-05, + "loss": 0.0192, + "step": 13430 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 0.07386650145053864, + "learning_rate": 2.6640049273331515e-05, + "loss": 0.0119, + "step": 13440 + }, + { + "epoch": 6.320488721804511, + "grad_norm": 0.14563970267772675, + "learning_rate": 2.656698529189193e-05, + "loss": 0.017, + "step": 13450 + }, + { + "epoch": 6.325187969924812, + "grad_norm": 0.08166952431201935, + "learning_rate": 2.6493985375419778e-05, + "loss": 0.0176, + "step": 13460 + }, + { + "epoch": 6.329887218045113, + "grad_norm": 0.123371921479702, + "learning_rate": 2.642104972349403e-05, + "loss": 0.019, + "step": 13470 + }, + { + "epoch": 6.334586466165414, + "grad_norm": 0.08788731694221497, + "learning_rate": 2.6348178535517966e-05, + "loss": 0.0203, + "step": 13480 + }, + { + "epoch": 6.339285714285714, + "grad_norm": 0.07246407121419907, + "learning_rate": 2.6275372010718635e-05, + "loss": 0.0179, + "step": 13490 + }, + { + "epoch": 6.343984962406015, + "grad_norm": 0.13420486450195312, + "learning_rate": 2.6202630348146324e-05, + "loss": 0.0143, + "step": 13500 + }, + { + "epoch": 6.348684210526316, + "grad_norm": 0.10652023553848267, + "learning_rate": 2.612995374667394e-05, + "loss": 0.0084, + "step": 13510 + }, + { + "epoch": 6.353383458646617, + "grad_norm": 0.1149219200015068, + "learning_rate": 2.6057342404996522e-05, + "loss": 0.01, + "step": 13520 + }, + { + "epoch": 6.3580827067669174, + "grad_norm": 0.08880855143070221, + "learning_rate": 2.5984796521630737e-05, + "loss": 0.0138, + "step": 13530 + }, + { + "epoch": 6.362781954887218, + "grad_norm": 0.153299942612648, + "learning_rate": 2.591231629491423e-05, + "loss": 0.0109, + "step": 13540 + }, + { + "epoch": 6.367481203007519, + "grad_norm": 0.10760670900344849, + "learning_rate": 2.5839901923005205e-05, + "loss": 0.0125, + "step": 13550 + }, + { + "epoch": 6.37218045112782, + "grad_norm": 0.15970246493816376, + "learning_rate": 2.5767553603881767e-05, + "loss": 0.0162, + "step": 13560 + }, + { + "epoch": 6.3768796992481205, + "grad_norm": 0.16052334010601044, + "learning_rate": 2.5695271535341443e-05, + "loss": 0.0142, + "step": 13570 + }, + { + "epoch": 6.381578947368421, + "grad_norm": 0.11121491342782974, + "learning_rate": 2.562305591500069e-05, + "loss": 0.0137, + "step": 13580 + }, + { + "epoch": 6.386278195488722, + "grad_norm": 0.06344175338745117, + "learning_rate": 2.555090694029421e-05, + "loss": 0.0143, + "step": 13590 + }, + { + "epoch": 6.390977443609023, + "grad_norm": 0.12826725840568542, + "learning_rate": 2.547882480847461e-05, + "loss": 0.024, + "step": 13600 + }, + { + "epoch": 6.3956766917293235, + "grad_norm": 0.07199674844741821, + "learning_rate": 2.540680971661161e-05, + "loss": 0.0155, + "step": 13610 + }, + { + "epoch": 6.400375939849624, + "grad_norm": 0.1330031007528305, + "learning_rate": 2.5334861861591753e-05, + "loss": 0.0123, + "step": 13620 + }, + { + "epoch": 6.405075187969925, + "grad_norm": 0.09626597911119461, + "learning_rate": 2.526298144011775e-05, + "loss": 0.0184, + "step": 13630 + }, + { + "epoch": 6.409774436090226, + "grad_norm": 0.18249697983264923, + "learning_rate": 2.5191168648707887e-05, + "loss": 0.0091, + "step": 13640 + }, + { + "epoch": 6.4144736842105265, + "grad_norm": 0.08582185208797455, + "learning_rate": 2.511942368369566e-05, + "loss": 0.0097, + "step": 13650 + }, + { + "epoch": 6.419172932330827, + "grad_norm": 0.09408201277256012, + "learning_rate": 2.5047746741228978e-05, + "loss": 0.0147, + "step": 13660 + }, + { + "epoch": 6.423872180451128, + "grad_norm": 0.17775125801563263, + "learning_rate": 2.4976138017269908e-05, + "loss": 0.0078, + "step": 13670 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 0.09252524375915527, + "learning_rate": 2.490459770759398e-05, + "loss": 0.0159, + "step": 13680 + }, + { + "epoch": 6.43327067669173, + "grad_norm": 0.08252805471420288, + "learning_rate": 2.4833126007789653e-05, + "loss": 0.0099, + "step": 13690 + }, + { + "epoch": 6.43796992481203, + "grad_norm": 0.11147051304578781, + "learning_rate": 2.476172311325783e-05, + "loss": 0.0124, + "step": 13700 + }, + { + "epoch": 6.442669172932331, + "grad_norm": 0.12817934155464172, + "learning_rate": 2.4690389219211273e-05, + "loss": 0.0136, + "step": 13710 + }, + { + "epoch": 6.447368421052632, + "grad_norm": 0.15487729012966156, + "learning_rate": 2.4619124520674146e-05, + "loss": 0.0189, + "step": 13720 + }, + { + "epoch": 6.452067669172933, + "grad_norm": 0.12925255298614502, + "learning_rate": 2.4547929212481435e-05, + "loss": 0.0178, + "step": 13730 + }, + { + "epoch": 6.456766917293233, + "grad_norm": 0.12829270958900452, + "learning_rate": 2.447680348927837e-05, + "loss": 0.0208, + "step": 13740 + }, + { + "epoch": 6.461466165413534, + "grad_norm": 0.1329372674226761, + "learning_rate": 2.4405747545519963e-05, + "loss": 0.0146, + "step": 13750 + }, + { + "epoch": 6.466165413533835, + "grad_norm": 0.10055895149707794, + "learning_rate": 2.433476157547044e-05, + "loss": 0.0199, + "step": 13760 + }, + { + "epoch": 6.470864661654136, + "grad_norm": 0.109032541513443, + "learning_rate": 2.4263845773202736e-05, + "loss": 0.0252, + "step": 13770 + }, + { + "epoch": 6.475563909774436, + "grad_norm": 0.1153714656829834, + "learning_rate": 2.419300033259798e-05, + "loss": 0.0138, + "step": 13780 + }, + { + "epoch": 6.480263157894737, + "grad_norm": 0.10681688040494919, + "learning_rate": 2.4122225447344875e-05, + "loss": 0.0155, + "step": 13790 + }, + { + "epoch": 6.484962406015038, + "grad_norm": 0.12183579057455063, + "learning_rate": 2.405152131093926e-05, + "loss": 0.0109, + "step": 13800 + }, + { + "epoch": 6.489661654135339, + "grad_norm": 0.13035376369953156, + "learning_rate": 2.3980888116683515e-05, + "loss": 0.0217, + "step": 13810 + }, + { + "epoch": 6.4943609022556394, + "grad_norm": 0.06607703119516373, + "learning_rate": 2.3910326057686127e-05, + "loss": 0.0117, + "step": 13820 + }, + { + "epoch": 6.49906015037594, + "grad_norm": 0.11934183537960052, + "learning_rate": 2.3839835326861104e-05, + "loss": 0.0165, + "step": 13830 + }, + { + "epoch": 6.503759398496241, + "grad_norm": 0.13289965689182281, + "learning_rate": 2.3769416116927335e-05, + "loss": 0.0285, + "step": 13840 + }, + { + "epoch": 6.508458646616542, + "grad_norm": 0.11643469333648682, + "learning_rate": 2.3699068620408304e-05, + "loss": 0.0134, + "step": 13850 + }, + { + "epoch": 6.5131578947368425, + "grad_norm": 0.11470893025398254, + "learning_rate": 2.362879302963135e-05, + "loss": 0.0112, + "step": 13860 + }, + { + "epoch": 6.517857142857143, + "grad_norm": 0.10102832317352295, + "learning_rate": 2.3558589536727277e-05, + "loss": 0.0125, + "step": 13870 + }, + { + "epoch": 6.522556390977444, + "grad_norm": 0.1053953766822815, + "learning_rate": 2.3488458333629777e-05, + "loss": 0.0123, + "step": 13880 + }, + { + "epoch": 6.527255639097744, + "grad_norm": 0.12388722598552704, + "learning_rate": 2.341839961207482e-05, + "loss": 0.0172, + "step": 13890 + }, + { + "epoch": 6.5319548872180455, + "grad_norm": 0.13171012699604034, + "learning_rate": 2.3348413563600325e-05, + "loss": 0.017, + "step": 13900 + }, + { + "epoch": 6.536654135338345, + "grad_norm": 0.10711341351270676, + "learning_rate": 2.3278500379545436e-05, + "loss": 0.0152, + "step": 13910 + }, + { + "epoch": 6.541353383458647, + "grad_norm": 0.17922531068325043, + "learning_rate": 2.3208660251050158e-05, + "loss": 0.0132, + "step": 13920 + }, + { + "epoch": 6.546052631578947, + "grad_norm": 0.07679407298564911, + "learning_rate": 2.3138893369054766e-05, + "loss": 0.0133, + "step": 13930 + }, + { + "epoch": 6.5507518796992485, + "grad_norm": 0.16956987977027893, + "learning_rate": 2.3069199924299174e-05, + "loss": 0.0115, + "step": 13940 + }, + { + "epoch": 6.555451127819548, + "grad_norm": 0.10801958292722702, + "learning_rate": 2.2999580107322653e-05, + "loss": 0.0212, + "step": 13950 + }, + { + "epoch": 6.56015037593985, + "grad_norm": 0.14689181745052338, + "learning_rate": 2.29300341084631e-05, + "loss": 0.0141, + "step": 13960 + }, + { + "epoch": 6.56484962406015, + "grad_norm": 0.10904575139284134, + "learning_rate": 2.2860562117856647e-05, + "loss": 0.0139, + "step": 13970 + }, + { + "epoch": 6.569548872180452, + "grad_norm": 0.06999867409467697, + "learning_rate": 2.279116432543705e-05, + "loss": 0.0139, + "step": 13980 + }, + { + "epoch": 6.5742481203007515, + "grad_norm": 0.09230833500623703, + "learning_rate": 2.2721840920935196e-05, + "loss": 0.0147, + "step": 13990 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 0.19919702410697937, + "learning_rate": 2.2652592093878666e-05, + "loss": 0.0162, + "step": 14000 + }, + { + "epoch": 6.583646616541353, + "grad_norm": 0.0856180340051651, + "learning_rate": 2.258341803359108e-05, + "loss": 0.0118, + "step": 14010 + }, + { + "epoch": 6.588345864661655, + "grad_norm": 0.11447542905807495, + "learning_rate": 2.251431892919171e-05, + "loss": 0.0176, + "step": 14020 + }, + { + "epoch": 6.5930451127819545, + "grad_norm": 0.11393177509307861, + "learning_rate": 2.2445294969594844e-05, + "loss": 0.0122, + "step": 14030 + }, + { + "epoch": 6.597744360902256, + "grad_norm": 0.06813038140535355, + "learning_rate": 2.237634634350934e-05, + "loss": 0.0112, + "step": 14040 + }, + { + "epoch": 6.602443609022556, + "grad_norm": 0.1496971994638443, + "learning_rate": 2.2307473239438154e-05, + "loss": 0.0137, + "step": 14050 + }, + { + "epoch": 6.607142857142857, + "grad_norm": 0.15726064145565033, + "learning_rate": 2.2238675845677663e-05, + "loss": 0.0148, + "step": 14060 + }, + { + "epoch": 6.6118421052631575, + "grad_norm": 0.06907601654529572, + "learning_rate": 2.2169954350317374e-05, + "loss": 0.0104, + "step": 14070 + }, + { + "epoch": 6.616541353383458, + "grad_norm": 0.10363949090242386, + "learning_rate": 2.2101308941239203e-05, + "loss": 0.0203, + "step": 14080 + }, + { + "epoch": 6.621240601503759, + "grad_norm": 0.1385289579629898, + "learning_rate": 2.2032739806117058e-05, + "loss": 0.0154, + "step": 14090 + }, + { + "epoch": 6.62593984962406, + "grad_norm": 0.10194943100214005, + "learning_rate": 2.196424713241637e-05, + "loss": 0.0198, + "step": 14100 + }, + { + "epoch": 6.6306390977443606, + "grad_norm": 0.06067529693245888, + "learning_rate": 2.1895831107393484e-05, + "loss": 0.0124, + "step": 14110 + }, + { + "epoch": 6.635338345864661, + "grad_norm": 0.0793943703174591, + "learning_rate": 2.182749191809518e-05, + "loss": 0.0121, + "step": 14120 + }, + { + "epoch": 6.640037593984962, + "grad_norm": 0.09581856429576874, + "learning_rate": 2.1759229751358217e-05, + "loss": 0.0164, + "step": 14130 + }, + { + "epoch": 6.644736842105263, + "grad_norm": 0.11547362804412842, + "learning_rate": 2.1691044793808734e-05, + "loss": 0.0156, + "step": 14140 + }, + { + "epoch": 6.649436090225564, + "grad_norm": 0.12122860550880432, + "learning_rate": 2.1622937231861822e-05, + "loss": 0.01, + "step": 14150 + }, + { + "epoch": 6.654135338345864, + "grad_norm": 0.10450417548418045, + "learning_rate": 2.1554907251720945e-05, + "loss": 0.0109, + "step": 14160 + }, + { + "epoch": 6.658834586466165, + "grad_norm": 0.21685545146465302, + "learning_rate": 2.148695503937745e-05, + "loss": 0.0137, + "step": 14170 + }, + { + "epoch": 6.663533834586466, + "grad_norm": 0.07424575835466385, + "learning_rate": 2.1419080780610123e-05, + "loss": 0.0092, + "step": 14180 + }, + { + "epoch": 6.668233082706767, + "grad_norm": 0.10741780698299408, + "learning_rate": 2.1351284660984572e-05, + "loss": 0.0131, + "step": 14190 + }, + { + "epoch": 6.672932330827067, + "grad_norm": 0.162250816822052, + "learning_rate": 2.128356686585282e-05, + "loss": 0.0154, + "step": 14200 + }, + { + "epoch": 6.677631578947368, + "grad_norm": 0.10091499984264374, + "learning_rate": 2.121592758035273e-05, + "loss": 0.015, + "step": 14210 + }, + { + "epoch": 6.682330827067669, + "grad_norm": 0.12936082482337952, + "learning_rate": 2.1148366989407496e-05, + "loss": 0.0115, + "step": 14220 + }, + { + "epoch": 6.68703007518797, + "grad_norm": 0.05973741412162781, + "learning_rate": 2.1080885277725236e-05, + "loss": 0.0097, + "step": 14230 + }, + { + "epoch": 6.69172932330827, + "grad_norm": 0.1663609743118286, + "learning_rate": 2.1013482629798333e-05, + "loss": 0.0204, + "step": 14240 + }, + { + "epoch": 6.696428571428571, + "grad_norm": 0.13079845905303955, + "learning_rate": 2.094615922990309e-05, + "loss": 0.0128, + "step": 14250 + }, + { + "epoch": 6.701127819548872, + "grad_norm": 0.133228600025177, + "learning_rate": 2.0878915262099098e-05, + "loss": 0.0117, + "step": 14260 + }, + { + "epoch": 6.705827067669173, + "grad_norm": 0.13159476220607758, + "learning_rate": 2.0811750910228774e-05, + "loss": 0.0152, + "step": 14270 + }, + { + "epoch": 6.7105263157894735, + "grad_norm": 0.17307148873806, + "learning_rate": 2.0744666357916925e-05, + "loss": 0.0149, + "step": 14280 + }, + { + "epoch": 6.715225563909774, + "grad_norm": 0.14028523862361908, + "learning_rate": 2.067766178857013e-05, + "loss": 0.0249, + "step": 14290 + }, + { + "epoch": 6.719924812030075, + "grad_norm": 0.11767170578241348, + "learning_rate": 2.061073738537635e-05, + "loss": 0.0104, + "step": 14300 + }, + { + "epoch": 6.724624060150376, + "grad_norm": 0.09434834867715836, + "learning_rate": 2.0543893331304333e-05, + "loss": 0.0164, + "step": 14310 + }, + { + "epoch": 6.7293233082706765, + "grad_norm": 0.13524094223976135, + "learning_rate": 2.0477129809103147e-05, + "loss": 0.0144, + "step": 14320 + }, + { + "epoch": 6.734022556390977, + "grad_norm": 0.12902255356311798, + "learning_rate": 2.0410447001301753e-05, + "loss": 0.0126, + "step": 14330 + }, + { + "epoch": 6.738721804511278, + "grad_norm": 0.12430833280086517, + "learning_rate": 2.0343845090208368e-05, + "loss": 0.0136, + "step": 14340 + }, + { + "epoch": 6.743421052631579, + "grad_norm": 0.15644432604312897, + "learning_rate": 2.0277324257910106e-05, + "loss": 0.0175, + "step": 14350 + }, + { + "epoch": 6.7481203007518795, + "grad_norm": 0.10792649537324905, + "learning_rate": 2.0210884686272368e-05, + "loss": 0.0097, + "step": 14360 + }, + { + "epoch": 6.75281954887218, + "grad_norm": 0.11637542396783829, + "learning_rate": 2.0144526556938387e-05, + "loss": 0.0275, + "step": 14370 + }, + { + "epoch": 6.757518796992481, + "grad_norm": 0.1322125643491745, + "learning_rate": 2.0078250051328784e-05, + "loss": 0.018, + "step": 14380 + }, + { + "epoch": 6.762218045112782, + "grad_norm": 0.11292218416929245, + "learning_rate": 2.0012055350640986e-05, + "loss": 0.0098, + "step": 14390 + }, + { + "epoch": 6.7669172932330826, + "grad_norm": 0.15685032308101654, + "learning_rate": 1.9945942635848748e-05, + "loss": 0.016, + "step": 14400 + }, + { + "epoch": 6.771616541353383, + "grad_norm": 0.10153713822364807, + "learning_rate": 1.9879912087701753e-05, + "loss": 0.0094, + "step": 14410 + }, + { + "epoch": 6.776315789473684, + "grad_norm": 0.0829312652349472, + "learning_rate": 1.981396388672496e-05, + "loss": 0.0104, + "step": 14420 + }, + { + "epoch": 6.781015037593985, + "grad_norm": 0.19711576402187347, + "learning_rate": 1.974809821321827e-05, + "loss": 0.0178, + "step": 14430 + }, + { + "epoch": 6.785714285714286, + "grad_norm": 0.07211043685674667, + "learning_rate": 1.9682315247255894e-05, + "loss": 0.017, + "step": 14440 + }, + { + "epoch": 6.790413533834586, + "grad_norm": 0.1333143711090088, + "learning_rate": 1.9616615168685943e-05, + "loss": 0.0136, + "step": 14450 + }, + { + "epoch": 6.795112781954887, + "grad_norm": 0.05626295506954193, + "learning_rate": 1.9550998157129946e-05, + "loss": 0.01, + "step": 14460 + }, + { + "epoch": 6.799812030075188, + "grad_norm": 0.07672274112701416, + "learning_rate": 1.9485464391982284e-05, + "loss": 0.0097, + "step": 14470 + }, + { + "epoch": 6.804511278195489, + "grad_norm": 0.08597870171070099, + "learning_rate": 1.942001405240979e-05, + "loss": 0.0157, + "step": 14480 + }, + { + "epoch": 6.809210526315789, + "grad_norm": 0.0822441503405571, + "learning_rate": 1.9354647317351188e-05, + "loss": 0.0096, + "step": 14490 + }, + { + "epoch": 6.81390977443609, + "grad_norm": 0.13318496942520142, + "learning_rate": 1.928936436551661e-05, + "loss": 0.0122, + "step": 14500 + }, + { + "epoch": 6.818609022556391, + "grad_norm": 0.10445527732372284, + "learning_rate": 1.9224165375387193e-05, + "loss": 0.0107, + "step": 14510 + }, + { + "epoch": 6.823308270676692, + "grad_norm": 0.18820570409297943, + "learning_rate": 1.9159050525214452e-05, + "loss": 0.012, + "step": 14520 + }, + { + "epoch": 6.828007518796992, + "grad_norm": 0.14440257847309113, + "learning_rate": 1.909401999301993e-05, + "loss": 0.0151, + "step": 14530 + }, + { + "epoch": 6.832706766917293, + "grad_norm": 0.1307608187198639, + "learning_rate": 1.9029073956594606e-05, + "loss": 0.0146, + "step": 14540 + }, + { + "epoch": 6.837406015037594, + "grad_norm": 0.12377108633518219, + "learning_rate": 1.8964212593498442e-05, + "loss": 0.0139, + "step": 14550 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 0.11900335550308228, + "learning_rate": 1.8899436081059975e-05, + "loss": 0.0143, + "step": 14560 + }, + { + "epoch": 6.8468045112781954, + "grad_norm": 0.12481661140918732, + "learning_rate": 1.8834744596375666e-05, + "loss": 0.0089, + "step": 14570 + }, + { + "epoch": 6.851503759398496, + "grad_norm": 0.1504499465227127, + "learning_rate": 1.877013831630961e-05, + "loss": 0.0174, + "step": 14580 + }, + { + "epoch": 6.856203007518797, + "grad_norm": 0.18421101570129395, + "learning_rate": 1.8705617417492883e-05, + "loss": 0.0225, + "step": 14590 + }, + { + "epoch": 6.860902255639098, + "grad_norm": 0.09719163924455643, + "learning_rate": 1.8641182076323148e-05, + "loss": 0.0104, + "step": 14600 + }, + { + "epoch": 6.8656015037593985, + "grad_norm": 0.10158328711986542, + "learning_rate": 1.85768324689642e-05, + "loss": 0.0151, + "step": 14610 + }, + { + "epoch": 6.870300751879699, + "grad_norm": 0.08070684224367142, + "learning_rate": 1.851256877134538e-05, + "loss": 0.0183, + "step": 14620 + }, + { + "epoch": 6.875, + "grad_norm": 0.06398982554674149, + "learning_rate": 1.8448391159161204e-05, + "loss": 0.0118, + "step": 14630 + }, + { + "epoch": 6.879699248120301, + "grad_norm": 0.09589491784572601, + "learning_rate": 1.838429980787081e-05, + "loss": 0.0128, + "step": 14640 + }, + { + "epoch": 6.8843984962406015, + "grad_norm": 0.1288967728614807, + "learning_rate": 1.8320294892697478e-05, + "loss": 0.0137, + "step": 14650 + }, + { + "epoch": 6.889097744360902, + "grad_norm": 0.11724042892456055, + "learning_rate": 1.8256376588628238e-05, + "loss": 0.0207, + "step": 14660 + }, + { + "epoch": 6.893796992481203, + "grad_norm": 0.13123568892478943, + "learning_rate": 1.8192545070413282e-05, + "loss": 0.011, + "step": 14670 + }, + { + "epoch": 6.898496240601504, + "grad_norm": 0.09076406806707382, + "learning_rate": 1.8128800512565513e-05, + "loss": 0.013, + "step": 14680 + }, + { + "epoch": 6.9031954887218046, + "grad_norm": 0.09277361631393433, + "learning_rate": 1.8065143089360172e-05, + "loss": 0.0125, + "step": 14690 + }, + { + "epoch": 6.907894736842105, + "grad_norm": 0.10244913399219513, + "learning_rate": 1.800157297483417e-05, + "loss": 0.0111, + "step": 14700 + }, + { + "epoch": 6.912593984962406, + "grad_norm": 0.06498057395219803, + "learning_rate": 1.7938090342785817e-05, + "loss": 0.0111, + "step": 14710 + }, + { + "epoch": 6.917293233082707, + "grad_norm": 0.06595025211572647, + "learning_rate": 1.787469536677419e-05, + "loss": 0.0099, + "step": 14720 + }, + { + "epoch": 6.921992481203008, + "grad_norm": 0.0671527311205864, + "learning_rate": 1.7811388220118707e-05, + "loss": 0.0144, + "step": 14730 + }, + { + "epoch": 6.926691729323308, + "grad_norm": 0.07553205639123917, + "learning_rate": 1.774816907589873e-05, + "loss": 0.0137, + "step": 14740 + }, + { + "epoch": 6.931390977443609, + "grad_norm": 0.13109339773654938, + "learning_rate": 1.768503810695295e-05, + "loss": 0.0123, + "step": 14750 + }, + { + "epoch": 6.93609022556391, + "grad_norm": 0.09198206663131714, + "learning_rate": 1.7621995485879062e-05, + "loss": 0.0119, + "step": 14760 + }, + { + "epoch": 6.940789473684211, + "grad_norm": 0.07400915026664734, + "learning_rate": 1.755904138503316e-05, + "loss": 0.0132, + "step": 14770 + }, + { + "epoch": 6.945488721804511, + "grad_norm": 0.071579709649086, + "learning_rate": 1.749617597652934e-05, + "loss": 0.0175, + "step": 14780 + }, + { + "epoch": 6.950187969924812, + "grad_norm": 0.11160339415073395, + "learning_rate": 1.743339943223926e-05, + "loss": 0.0153, + "step": 14790 + }, + { + "epoch": 6.954887218045113, + "grad_norm": 0.1040780320763588, + "learning_rate": 1.7370711923791567e-05, + "loss": 0.0139, + "step": 14800 + }, + { + "epoch": 6.959586466165414, + "grad_norm": 0.05674157291650772, + "learning_rate": 1.7308113622571544e-05, + "loss": 0.0064, + "step": 14810 + }, + { + "epoch": 6.964285714285714, + "grad_norm": 0.08227528631687164, + "learning_rate": 1.7245604699720535e-05, + "loss": 0.0131, + "step": 14820 + }, + { + "epoch": 6.968984962406015, + "grad_norm": 0.062148675322532654, + "learning_rate": 1.7183185326135543e-05, + "loss": 0.0142, + "step": 14830 + }, + { + "epoch": 6.973684210526316, + "grad_norm": 0.13188673555850983, + "learning_rate": 1.712085567246878e-05, + "loss": 0.0198, + "step": 14840 + }, + { + "epoch": 6.978383458646617, + "grad_norm": 0.09028539806604385, + "learning_rate": 1.70586159091271e-05, + "loss": 0.0131, + "step": 14850 + }, + { + "epoch": 6.9830827067669174, + "grad_norm": 0.09629946202039719, + "learning_rate": 1.699646620627168e-05, + "loss": 0.0158, + "step": 14860 + }, + { + "epoch": 6.987781954887218, + "grad_norm": 0.05366470292210579, + "learning_rate": 1.6934406733817414e-05, + "loss": 0.0186, + "step": 14870 + }, + { + "epoch": 6.992481203007519, + "grad_norm": 0.09256349503993988, + "learning_rate": 1.6872437661432517e-05, + "loss": 0.0124, + "step": 14880 + }, + { + "epoch": 6.99718045112782, + "grad_norm": 0.08437152206897736, + "learning_rate": 1.6810559158538092e-05, + "loss": 0.0125, + "step": 14890 + }, + { + "epoch": 7.0018796992481205, + "grad_norm": 0.13958819210529327, + "learning_rate": 1.6748771394307585e-05, + "loss": 0.0126, + "step": 14900 + }, + { + "epoch": 7.006578947368421, + "grad_norm": 0.10412228107452393, + "learning_rate": 1.6687074537666398e-05, + "loss": 0.0081, + "step": 14910 + }, + { + "epoch": 7.011278195488722, + "grad_norm": 0.16561958193778992, + "learning_rate": 1.662546875729138e-05, + "loss": 0.0236, + "step": 14920 + }, + { + "epoch": 7.015977443609023, + "grad_norm": 0.10665090382099152, + "learning_rate": 1.6563954221610355e-05, + "loss": 0.0126, + "step": 14930 + }, + { + "epoch": 7.0206766917293235, + "grad_norm": 0.07634269446134567, + "learning_rate": 1.6502531098801753e-05, + "loss": 0.0112, + "step": 14940 + }, + { + "epoch": 7.025375939849624, + "grad_norm": 0.0811878889799118, + "learning_rate": 1.6441199556794033e-05, + "loss": 0.0129, + "step": 14950 + }, + { + "epoch": 7.030075187969925, + "grad_norm": 0.09860299527645111, + "learning_rate": 1.637995976326527e-05, + "loss": 0.0175, + "step": 14960 + }, + { + "epoch": 7.034774436090226, + "grad_norm": 0.10491390526294708, + "learning_rate": 1.631881188564275e-05, + "loss": 0.0158, + "step": 14970 + }, + { + "epoch": 7.0394736842105265, + "grad_norm": 0.17026887834072113, + "learning_rate": 1.62577560911024e-05, + "loss": 0.0148, + "step": 14980 + }, + { + "epoch": 7.044172932330827, + "grad_norm": 0.10451708734035492, + "learning_rate": 1.6196792546568472e-05, + "loss": 0.0101, + "step": 14990 + }, + { + "epoch": 7.048872180451128, + "grad_norm": 0.11327500641345978, + "learning_rate": 1.6135921418712956e-05, + "loss": 0.016, + "step": 15000 + }, + { + "epoch": 7.053571428571429, + "grad_norm": 0.08484944701194763, + "learning_rate": 1.6075142873955164e-05, + "loss": 0.0103, + "step": 15010 + }, + { + "epoch": 7.05827067669173, + "grad_norm": 0.07527375966310501, + "learning_rate": 1.6014457078461353e-05, + "loss": 0.0077, + "step": 15020 + }, + { + "epoch": 7.06296992481203, + "grad_norm": 0.10955455899238586, + "learning_rate": 1.5953864198144135e-05, + "loss": 0.0202, + "step": 15030 + }, + { + "epoch": 7.067669172932331, + "grad_norm": 0.10458432137966156, + "learning_rate": 1.5893364398662176e-05, + "loss": 0.0117, + "step": 15040 + }, + { + "epoch": 7.072368421052632, + "grad_norm": 0.0628298744559288, + "learning_rate": 1.583295784541958e-05, + "loss": 0.0084, + "step": 15050 + }, + { + "epoch": 7.077067669172933, + "grad_norm": 0.1378493756055832, + "learning_rate": 1.5772644703565565e-05, + "loss": 0.0204, + "step": 15060 + }, + { + "epoch": 7.081766917293233, + "grad_norm": 0.13175398111343384, + "learning_rate": 1.5712425137993973e-05, + "loss": 0.0137, + "step": 15070 + }, + { + "epoch": 7.086466165413534, + "grad_norm": 0.12167177349328995, + "learning_rate": 1.5652299313342773e-05, + "loss": 0.0102, + "step": 15080 + }, + { + "epoch": 7.091165413533835, + "grad_norm": 0.0790913999080658, + "learning_rate": 1.5592267393993716e-05, + "loss": 0.0127, + "step": 15090 + }, + { + "epoch": 7.095864661654136, + "grad_norm": 0.1287565529346466, + "learning_rate": 1.553232954407171e-05, + "loss": 0.0089, + "step": 15100 + }, + { + "epoch": 7.100563909774436, + "grad_norm": 0.06657912582159042, + "learning_rate": 1.5472485927444597e-05, + "loss": 0.012, + "step": 15110 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 0.05287037044763565, + "learning_rate": 1.5412736707722537e-05, + "loss": 0.0077, + "step": 15120 + }, + { + "epoch": 7.109962406015038, + "grad_norm": 0.1070045679807663, + "learning_rate": 1.5353082048257596e-05, + "loss": 0.0119, + "step": 15130 + }, + { + "epoch": 7.114661654135339, + "grad_norm": 0.07215863466262817, + "learning_rate": 1.5293522112143373e-05, + "loss": 0.0133, + "step": 15140 + }, + { + "epoch": 7.1193609022556394, + "grad_norm": 0.10572236031293869, + "learning_rate": 1.5234057062214402e-05, + "loss": 0.0188, + "step": 15150 + }, + { + "epoch": 7.12406015037594, + "grad_norm": 0.12182480096817017, + "learning_rate": 1.517468706104589e-05, + "loss": 0.0166, + "step": 15160 + }, + { + "epoch": 7.128759398496241, + "grad_norm": 0.1621452122926712, + "learning_rate": 1.5115412270953167e-05, + "loss": 0.0182, + "step": 15170 + }, + { + "epoch": 7.133458646616542, + "grad_norm": 0.09231683611869812, + "learning_rate": 1.5056232853991209e-05, + "loss": 0.0093, + "step": 15180 + }, + { + "epoch": 7.1381578947368425, + "grad_norm": 0.10214181244373322, + "learning_rate": 1.4997148971954344e-05, + "loss": 0.018, + "step": 15190 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.10214631259441376, + "learning_rate": 1.4938160786375572e-05, + "loss": 0.0173, + "step": 15200 + }, + { + "epoch": 7.147556390977444, + "grad_norm": 0.11673356592655182, + "learning_rate": 1.4879268458526379e-05, + "loss": 0.0109, + "step": 15210 + }, + { + "epoch": 7.152255639097745, + "grad_norm": 0.07134755700826645, + "learning_rate": 1.4820472149416154e-05, + "loss": 0.0132, + "step": 15220 + }, + { + "epoch": 7.1569548872180455, + "grad_norm": 0.16357994079589844, + "learning_rate": 1.4761772019791748e-05, + "loss": 0.0142, + "step": 15230 + }, + { + "epoch": 7.161654135338346, + "grad_norm": 0.05381006747484207, + "learning_rate": 1.470316823013707e-05, + "loss": 0.0094, + "step": 15240 + }, + { + "epoch": 7.166353383458647, + "grad_norm": 0.09296070784330368, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.0146, + "step": 15250 + }, + { + "epoch": 7.171052631578948, + "grad_norm": 0.10752391815185547, + "learning_rate": 1.4586250311355132e-05, + "loss": 0.0124, + "step": 15260 + }, + { + "epoch": 7.1757518796992485, + "grad_norm": 0.10608847439289093, + "learning_rate": 1.4527936501877032e-05, + "loss": 0.011, + "step": 15270 + }, + { + "epoch": 7.180451127819548, + "grad_norm": 0.10653159022331238, + "learning_rate": 1.4469719671666043e-05, + "loss": 0.0132, + "step": 15280 + }, + { + "epoch": 7.18515037593985, + "grad_norm": 0.09571955353021622, + "learning_rate": 1.4411599979884744e-05, + "loss": 0.0164, + "step": 15290 + }, + { + "epoch": 7.18984962406015, + "grad_norm": 0.05526496097445488, + "learning_rate": 1.435357758543015e-05, + "loss": 0.0154, + "step": 15300 + }, + { + "epoch": 7.194548872180452, + "grad_norm": 0.08336817473173141, + "learning_rate": 1.4295652646933277e-05, + "loss": 0.0151, + "step": 15310 + }, + { + "epoch": 7.1992481203007515, + "grad_norm": 0.06959939748048782, + "learning_rate": 1.4237825322758736e-05, + "loss": 0.0155, + "step": 15320 + }, + { + "epoch": 7.203947368421052, + "grad_norm": 0.15572324395179749, + "learning_rate": 1.4180095771004154e-05, + "loss": 0.019, + "step": 15330 + }, + { + "epoch": 7.208646616541353, + "grad_norm": 0.14393118023872375, + "learning_rate": 1.412246414949997e-05, + "loss": 0.0085, + "step": 15340 + }, + { + "epoch": 7.213345864661654, + "grad_norm": 0.06008661538362503, + "learning_rate": 1.4064930615808808e-05, + "loss": 0.011, + "step": 15350 + }, + { + "epoch": 7.2180451127819545, + "grad_norm": 0.14957301318645477, + "learning_rate": 1.4007495327225162e-05, + "loss": 0.0129, + "step": 15360 + }, + { + "epoch": 7.222744360902255, + "grad_norm": 0.1129375472664833, + "learning_rate": 1.3950158440774957e-05, + "loss": 0.0165, + "step": 15370 + }, + { + "epoch": 7.227443609022556, + "grad_norm": 0.16532012820243835, + "learning_rate": 1.389292011321498e-05, + "loss": 0.0101, + "step": 15380 + }, + { + "epoch": 7.232142857142857, + "grad_norm": 0.07673346996307373, + "learning_rate": 1.383578050103268e-05, + "loss": 0.0099, + "step": 15390 + }, + { + "epoch": 7.2368421052631575, + "grad_norm": 0.06047337129712105, + "learning_rate": 1.3778739760445552e-05, + "loss": 0.0105, + "step": 15400 + }, + { + "epoch": 7.241541353383458, + "grad_norm": 0.09481897950172424, + "learning_rate": 1.3721798047400813e-05, + "loss": 0.0136, + "step": 15410 + }, + { + "epoch": 7.246240601503759, + "grad_norm": 0.13582715392112732, + "learning_rate": 1.3664955517574968e-05, + "loss": 0.0163, + "step": 15420 + }, + { + "epoch": 7.25093984962406, + "grad_norm": 0.12135443836450577, + "learning_rate": 1.3608212326373249e-05, + "loss": 0.0126, + "step": 15430 + }, + { + "epoch": 7.2556390977443606, + "grad_norm": 0.15014630556106567, + "learning_rate": 1.3551568628929434e-05, + "loss": 0.0138, + "step": 15440 + }, + { + "epoch": 7.260338345864661, + "grad_norm": 0.08308325707912445, + "learning_rate": 1.3495024580105192e-05, + "loss": 0.0186, + "step": 15450 + }, + { + "epoch": 7.265037593984962, + "grad_norm": 0.18257148563861847, + "learning_rate": 1.343858033448982e-05, + "loss": 0.0154, + "step": 15460 + }, + { + "epoch": 7.269736842105263, + "grad_norm": 0.05905809998512268, + "learning_rate": 1.3382236046399722e-05, + "loss": 0.0097, + "step": 15470 + }, + { + "epoch": 7.274436090225564, + "grad_norm": 0.15629065036773682, + "learning_rate": 1.3325991869878013e-05, + "loss": 0.014, + "step": 15480 + }, + { + "epoch": 7.279135338345864, + "grad_norm": 0.10098100453615189, + "learning_rate": 1.3269847958694148e-05, + "loss": 0.015, + "step": 15490 + }, + { + "epoch": 7.283834586466165, + "grad_norm": 0.08035392314195633, + "learning_rate": 1.3213804466343421e-05, + "loss": 0.0134, + "step": 15500 + }, + { + "epoch": 7.288533834586466, + "grad_norm": 0.09680403023958206, + "learning_rate": 1.3157861546046613e-05, + "loss": 0.0132, + "step": 15510 + }, + { + "epoch": 7.293233082706767, + "grad_norm": 0.12656839191913605, + "learning_rate": 1.3102019350749528e-05, + "loss": 0.0106, + "step": 15520 + }, + { + "epoch": 7.297932330827067, + "grad_norm": 0.11206916719675064, + "learning_rate": 1.3046278033122577e-05, + "loss": 0.0092, + "step": 15530 + }, + { + "epoch": 7.302631578947368, + "grad_norm": 0.09982562065124512, + "learning_rate": 1.299063774556042e-05, + "loss": 0.0079, + "step": 15540 + }, + { + "epoch": 7.307330827067669, + "grad_norm": 0.05940884351730347, + "learning_rate": 1.293509864018146e-05, + "loss": 0.015, + "step": 15550 + }, + { + "epoch": 7.31203007518797, + "grad_norm": 0.09302843362092972, + "learning_rate": 1.2879660868827508e-05, + "loss": 0.0133, + "step": 15560 + }, + { + "epoch": 7.31672932330827, + "grad_norm": 0.07972191274166107, + "learning_rate": 1.2824324583063302e-05, + "loss": 0.0152, + "step": 15570 + }, + { + "epoch": 7.321428571428571, + "grad_norm": 0.11651585251092911, + "learning_rate": 1.2769089934176126e-05, + "loss": 0.0085, + "step": 15580 + }, + { + "epoch": 7.326127819548872, + "grad_norm": 0.09426891058683395, + "learning_rate": 1.2713957073175425e-05, + "loss": 0.0163, + "step": 15590 + }, + { + "epoch": 7.330827067669173, + "grad_norm": 0.1127847209572792, + "learning_rate": 1.2658926150792322e-05, + "loss": 0.012, + "step": 15600 + }, + { + "epoch": 7.3355263157894735, + "grad_norm": 0.08010071516036987, + "learning_rate": 1.2603997317479238e-05, + "loss": 0.0099, + "step": 15610 + }, + { + "epoch": 7.340225563909774, + "grad_norm": 0.18087002635002136, + "learning_rate": 1.2549170723409549e-05, + "loss": 0.0179, + "step": 15620 + }, + { + "epoch": 7.344924812030075, + "grad_norm": 0.12995214760303497, + "learning_rate": 1.2494446518477022e-05, + "loss": 0.0139, + "step": 15630 + }, + { + "epoch": 7.349624060150376, + "grad_norm": 0.10053711384534836, + "learning_rate": 1.243982485229559e-05, + "loss": 0.0162, + "step": 15640 + }, + { + "epoch": 7.3543233082706765, + "grad_norm": 0.07871995866298676, + "learning_rate": 1.2385305874198776e-05, + "loss": 0.0165, + "step": 15650 + }, + { + "epoch": 7.359022556390977, + "grad_norm": 0.09473975747823715, + "learning_rate": 1.233088973323937e-05, + "loss": 0.0278, + "step": 15660 + }, + { + "epoch": 7.363721804511278, + "grad_norm": 0.1175215020775795, + "learning_rate": 1.2276576578189064e-05, + "loss": 0.0091, + "step": 15670 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 0.11268990486860275, + "learning_rate": 1.2222366557537911e-05, + "loss": 0.0119, + "step": 15680 + }, + { + "epoch": 7.3731203007518795, + "grad_norm": 0.12361589074134827, + "learning_rate": 1.2168259819494066e-05, + "loss": 0.0189, + "step": 15690 + }, + { + "epoch": 7.37781954887218, + "grad_norm": 0.05146336555480957, + "learning_rate": 1.2114256511983274e-05, + "loss": 0.0087, + "step": 15700 + }, + { + "epoch": 7.382518796992481, + "grad_norm": 0.05579405277967453, + "learning_rate": 1.2060356782648503e-05, + "loss": 0.0094, + "step": 15710 + }, + { + "epoch": 7.387218045112782, + "grad_norm": 0.07007358223199844, + "learning_rate": 1.2006560778849578e-05, + "loss": 0.0097, + "step": 15720 + }, + { + "epoch": 7.3919172932330826, + "grad_norm": 0.09807036072015762, + "learning_rate": 1.1952868647662696e-05, + "loss": 0.0118, + "step": 15730 + }, + { + "epoch": 7.396616541353383, + "grad_norm": 0.1110677719116211, + "learning_rate": 1.1899280535880119e-05, + "loss": 0.0101, + "step": 15740 + }, + { + "epoch": 7.401315789473684, + "grad_norm": 0.1247294470667839, + "learning_rate": 1.1845796590009683e-05, + "loss": 0.016, + "step": 15750 + }, + { + "epoch": 7.406015037593985, + "grad_norm": 0.10053283721208572, + "learning_rate": 1.1792416956274444e-05, + "loss": 0.0115, + "step": 15760 + }, + { + "epoch": 7.410714285714286, + "grad_norm": 0.08065032213926315, + "learning_rate": 1.1739141780612306e-05, + "loss": 0.0152, + "step": 15770 + }, + { + "epoch": 7.415413533834586, + "grad_norm": 0.0791928693652153, + "learning_rate": 1.1685971208675539e-05, + "loss": 0.0087, + "step": 15780 + }, + { + "epoch": 7.420112781954887, + "grad_norm": 0.12295614182949066, + "learning_rate": 1.1632905385830484e-05, + "loss": 0.0152, + "step": 15790 + }, + { + "epoch": 7.424812030075188, + "grad_norm": 0.10082532465457916, + "learning_rate": 1.157994445715706e-05, + "loss": 0.0097, + "step": 15800 + }, + { + "epoch": 7.429511278195489, + "grad_norm": 0.11935935169458389, + "learning_rate": 1.1527088567448407e-05, + "loss": 0.0153, + "step": 15810 + }, + { + "epoch": 7.434210526315789, + "grad_norm": 0.15272879600524902, + "learning_rate": 1.1474337861210543e-05, + "loss": 0.0132, + "step": 15820 + }, + { + "epoch": 7.43890977443609, + "grad_norm": 0.08860108256340027, + "learning_rate": 1.1421692482661856e-05, + "loss": 0.0166, + "step": 15830 + }, + { + "epoch": 7.443609022556391, + "grad_norm": 0.10336878895759583, + "learning_rate": 1.1369152575732822e-05, + "loss": 0.0098, + "step": 15840 + }, + { + "epoch": 7.448308270676692, + "grad_norm": 0.13501764833927155, + "learning_rate": 1.1316718284065537e-05, + "loss": 0.0125, + "step": 15850 + }, + { + "epoch": 7.453007518796992, + "grad_norm": 0.12372566759586334, + "learning_rate": 1.1264389751013326e-05, + "loss": 0.0129, + "step": 15860 + }, + { + "epoch": 7.457706766917293, + "grad_norm": 0.1402164101600647, + "learning_rate": 1.1212167119640438e-05, + "loss": 0.0159, + "step": 15870 + }, + { + "epoch": 7.462406015037594, + "grad_norm": 0.1364455670118332, + "learning_rate": 1.1160050532721528e-05, + "loss": 0.0155, + "step": 15880 + }, + { + "epoch": 7.467105263157895, + "grad_norm": 0.12234701961278915, + "learning_rate": 1.1108040132741354e-05, + "loss": 0.0155, + "step": 15890 + }, + { + "epoch": 7.4718045112781954, + "grad_norm": 0.0902191773056984, + "learning_rate": 1.1056136061894384e-05, + "loss": 0.0209, + "step": 15900 + }, + { + "epoch": 7.476503759398496, + "grad_norm": 0.060509975999593735, + "learning_rate": 1.100433846208434e-05, + "loss": 0.0201, + "step": 15910 + }, + { + "epoch": 7.481203007518797, + "grad_norm": 0.15859703719615936, + "learning_rate": 1.095264747492391e-05, + "loss": 0.0179, + "step": 15920 + }, + { + "epoch": 7.485902255639098, + "grad_norm": 0.05780461058020592, + "learning_rate": 1.090106324173426e-05, + "loss": 0.0136, + "step": 15930 + }, + { + "epoch": 7.4906015037593985, + "grad_norm": 0.14311784505844116, + "learning_rate": 1.0849585903544706e-05, + "loss": 0.0147, + "step": 15940 + }, + { + "epoch": 7.495300751879699, + "grad_norm": 0.08800586313009262, + "learning_rate": 1.0798215601092354e-05, + "loss": 0.0169, + "step": 15950 + }, + { + "epoch": 7.5, + "grad_norm": 0.12583638727664948, + "learning_rate": 1.0746952474821614e-05, + "loss": 0.015, + "step": 15960 + }, + { + "epoch": 7.504699248120301, + "grad_norm": 0.12645405530929565, + "learning_rate": 1.069579666488395e-05, + "loss": 0.0166, + "step": 15970 + }, + { + "epoch": 7.5093984962406015, + "grad_norm": 0.08927604556083679, + "learning_rate": 1.0644748311137376e-05, + "loss": 0.0137, + "step": 15980 + }, + { + "epoch": 7.514097744360902, + "grad_norm": 0.05305986478924751, + "learning_rate": 1.059380755314613e-05, + "loss": 0.016, + "step": 15990 + }, + { + "epoch": 7.518796992481203, + "grad_norm": 0.05655212327837944, + "learning_rate": 1.0542974530180327e-05, + "loss": 0.0146, + "step": 16000 + }, + { + "epoch": 7.523496240601504, + "grad_norm": 0.13975438475608826, + "learning_rate": 1.049224938121548e-05, + "loss": 0.0157, + "step": 16010 + }, + { + "epoch": 7.5281954887218046, + "grad_norm": 0.16795960068702698, + "learning_rate": 1.0441632244932237e-05, + "loss": 0.0168, + "step": 16020 + }, + { + "epoch": 7.532894736842105, + "grad_norm": 0.09216076880693436, + "learning_rate": 1.0391123259715906e-05, + "loss": 0.0162, + "step": 16030 + }, + { + "epoch": 7.537593984962406, + "grad_norm": 0.10978179425001144, + "learning_rate": 1.0340722563656107e-05, + "loss": 0.0147, + "step": 16040 + }, + { + "epoch": 7.542293233082707, + "grad_norm": 0.06818930059671402, + "learning_rate": 1.0290430294546449e-05, + "loss": 0.0138, + "step": 16050 + }, + { + "epoch": 7.546992481203008, + "grad_norm": 0.11898943781852722, + "learning_rate": 1.0240246589884044e-05, + "loss": 0.0119, + "step": 16060 + }, + { + "epoch": 7.551691729323308, + "grad_norm": 0.09943754971027374, + "learning_rate": 1.0190171586869258e-05, + "loss": 0.0129, + "step": 16070 + }, + { + "epoch": 7.556390977443609, + "grad_norm": 0.05211075395345688, + "learning_rate": 1.0140205422405214e-05, + "loss": 0.0083, + "step": 16080 + }, + { + "epoch": 7.56109022556391, + "grad_norm": 0.09384645521640778, + "learning_rate": 1.009034823309749e-05, + "loss": 0.0111, + "step": 16090 + }, + { + "epoch": 7.565789473684211, + "grad_norm": 0.08874189853668213, + "learning_rate": 1.0040600155253765e-05, + "loss": 0.0063, + "step": 16100 + }, + { + "epoch": 7.570488721804511, + "grad_norm": 0.05463829264044762, + "learning_rate": 9.990961324883358e-06, + "loss": 0.0121, + "step": 16110 + }, + { + "epoch": 7.575187969924812, + "grad_norm": 0.16796351969242096, + "learning_rate": 9.941431877696955e-06, + "loss": 0.0151, + "step": 16120 + }, + { + "epoch": 7.579887218045113, + "grad_norm": 0.05948880687355995, + "learning_rate": 9.892011949106172e-06, + "loss": 0.0144, + "step": 16130 + }, + { + "epoch": 7.584586466165414, + "grad_norm": 0.06560337543487549, + "learning_rate": 9.842701674223187e-06, + "loss": 0.0089, + "step": 16140 + }, + { + "epoch": 7.589285714285714, + "grad_norm": 0.07054495811462402, + "learning_rate": 9.793501187860432e-06, + "loss": 0.0107, + "step": 16150 + }, + { + "epoch": 7.593984962406015, + "grad_norm": 0.07890637964010239, + "learning_rate": 9.744410624530148e-06, + "loss": 0.0138, + "step": 16160 + }, + { + "epoch": 7.598684210526316, + "grad_norm": 0.10604366660118103, + "learning_rate": 9.695430118444048e-06, + "loss": 0.0085, + "step": 16170 + }, + { + "epoch": 7.603383458646617, + "grad_norm": 0.07709035277366638, + "learning_rate": 9.646559803512994e-06, + "loss": 0.0105, + "step": 16180 + }, + { + "epoch": 7.6080827067669174, + "grad_norm": 0.10427499562501907, + "learning_rate": 9.597799813346525e-06, + "loss": 0.0097, + "step": 16190 + }, + { + "epoch": 7.612781954887218, + "grad_norm": 0.10048440843820572, + "learning_rate": 9.549150281252633e-06, + "loss": 0.0074, + "step": 16200 + }, + { + "epoch": 7.617481203007519, + "grad_norm": 0.057601477950811386, + "learning_rate": 9.500611340237258e-06, + "loss": 0.011, + "step": 16210 + }, + { + "epoch": 7.62218045112782, + "grad_norm": 0.06594853103160858, + "learning_rate": 9.452183123004e-06, + "loss": 0.0151, + "step": 16220 + }, + { + "epoch": 7.6268796992481205, + "grad_norm": 0.06648577749729156, + "learning_rate": 9.403865761953779e-06, + "loss": 0.0117, + "step": 16230 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 0.08442472666501999, + "learning_rate": 9.355659389184396e-06, + "loss": 0.0181, + "step": 16240 + }, + { + "epoch": 7.636278195488722, + "grad_norm": 0.07537990063428879, + "learning_rate": 9.307564136490254e-06, + "loss": 0.0112, + "step": 16250 + }, + { + "epoch": 7.640977443609023, + "grad_norm": 0.1092437133193016, + "learning_rate": 9.259580135361929e-06, + "loss": 0.0084, + "step": 16260 + }, + { + "epoch": 7.6456766917293235, + "grad_norm": 0.04986412823200226, + "learning_rate": 9.211707516985829e-06, + "loss": 0.0087, + "step": 16270 + }, + { + "epoch": 7.650375939849624, + "grad_norm": 0.06437689810991287, + "learning_rate": 9.163946412243896e-06, + "loss": 0.0177, + "step": 16280 + }, + { + "epoch": 7.655075187969925, + "grad_norm": 0.0936589166522026, + "learning_rate": 9.116296951713133e-06, + "loss": 0.0163, + "step": 16290 + }, + { + "epoch": 7.659774436090226, + "grad_norm": 0.09114740043878555, + "learning_rate": 9.068759265665384e-06, + "loss": 0.0096, + "step": 16300 + }, + { + "epoch": 7.6644736842105265, + "grad_norm": 0.093952976167202, + "learning_rate": 9.02133348406684e-06, + "loss": 0.0161, + "step": 16310 + }, + { + "epoch": 7.669172932330827, + "grad_norm": 0.11028216034173965, + "learning_rate": 8.974019736577777e-06, + "loss": 0.0117, + "step": 16320 + }, + { + "epoch": 7.673872180451128, + "grad_norm": 0.09880071133375168, + "learning_rate": 8.92681815255219e-06, + "loss": 0.0095, + "step": 16330 + }, + { + "epoch": 7.678571428571429, + "grad_norm": 0.10285835713148117, + "learning_rate": 8.879728861037384e-06, + "loss": 0.0117, + "step": 16340 + }, + { + "epoch": 7.68327067669173, + "grad_norm": 0.0487658828496933, + "learning_rate": 8.832751990773714e-06, + "loss": 0.0157, + "step": 16350 + }, + { + "epoch": 7.68796992481203, + "grad_norm": 0.04792909696698189, + "learning_rate": 8.785887670194138e-06, + "loss": 0.0183, + "step": 16360 + }, + { + "epoch": 7.692669172932331, + "grad_norm": 0.09301532804965973, + "learning_rate": 8.739136027423894e-06, + "loss": 0.0138, + "step": 16370 + }, + { + "epoch": 7.697368421052632, + "grad_norm": 0.1253780871629715, + "learning_rate": 8.692497190280224e-06, + "loss": 0.016, + "step": 16380 + }, + { + "epoch": 7.702067669172933, + "grad_norm": 0.14552897214889526, + "learning_rate": 8.645971286271904e-06, + "loss": 0.0128, + "step": 16390 + }, + { + "epoch": 7.706766917293233, + "grad_norm": 0.11963018774986267, + "learning_rate": 8.599558442598998e-06, + "loss": 0.0112, + "step": 16400 + }, + { + "epoch": 7.711466165413534, + "grad_norm": 0.13872599601745605, + "learning_rate": 8.55325878615244e-06, + "loss": 0.0113, + "step": 16410 + }, + { + "epoch": 7.716165413533835, + "grad_norm": 0.06703979521989822, + "learning_rate": 8.507072443513702e-06, + "loss": 0.0141, + "step": 16420 + }, + { + "epoch": 7.720864661654136, + "grad_norm": 0.12049257755279541, + "learning_rate": 8.460999540954517e-06, + "loss": 0.0145, + "step": 16430 + }, + { + "epoch": 7.725563909774436, + "grad_norm": 0.09920763969421387, + "learning_rate": 8.415040204436426e-06, + "loss": 0.0119, + "step": 16440 + }, + { + "epoch": 7.730263157894737, + "grad_norm": 0.20273980498313904, + "learning_rate": 8.369194559610482e-06, + "loss": 0.0131, + "step": 16450 + }, + { + "epoch": 7.734962406015038, + "grad_norm": 0.1277891844511032, + "learning_rate": 8.323462731816961e-06, + "loss": 0.0163, + "step": 16460 + }, + { + "epoch": 7.739661654135339, + "grad_norm": 0.09195142984390259, + "learning_rate": 8.277844846084898e-06, + "loss": 0.0089, + "step": 16470 + }, + { + "epoch": 7.7443609022556394, + "grad_norm": 0.07057520747184753, + "learning_rate": 8.232341027131885e-06, + "loss": 0.0125, + "step": 16480 + }, + { + "epoch": 7.74906015037594, + "grad_norm": 0.12380823493003845, + "learning_rate": 8.186951399363613e-06, + "loss": 0.0099, + "step": 16490 + }, + { + "epoch": 7.753759398496241, + "grad_norm": 0.11303659528493881, + "learning_rate": 8.141676086873572e-06, + "loss": 0.0096, + "step": 16500 + }, + { + "epoch": 7.758458646616542, + "grad_norm": 0.06908733397722244, + "learning_rate": 8.096515213442762e-06, + "loss": 0.014, + "step": 16510 + }, + { + "epoch": 7.7631578947368425, + "grad_norm": 0.12358922511339188, + "learning_rate": 8.051468902539272e-06, + "loss": 0.0106, + "step": 16520 + }, + { + "epoch": 7.767857142857143, + "grad_norm": 0.06847315281629562, + "learning_rate": 8.00653727731801e-06, + "loss": 0.0179, + "step": 16530 + }, + { + "epoch": 7.772556390977444, + "grad_norm": 0.1353650540113449, + "learning_rate": 7.96172046062032e-06, + "loss": 0.0143, + "step": 16540 + }, + { + "epoch": 7.777255639097744, + "grad_norm": 0.13592234253883362, + "learning_rate": 7.917018574973645e-06, + "loss": 0.0214, + "step": 16550 + }, + { + "epoch": 7.7819548872180455, + "grad_norm": 0.1253795623779297, + "learning_rate": 7.872431742591268e-06, + "loss": 0.0095, + "step": 16560 + }, + { + "epoch": 7.786654135338345, + "grad_norm": 0.2027290016412735, + "learning_rate": 7.827960085371855e-06, + "loss": 0.0151, + "step": 16570 + }, + { + "epoch": 7.791353383458647, + "grad_norm": 0.15278691053390503, + "learning_rate": 7.783603724899257e-06, + "loss": 0.0119, + "step": 16580 + }, + { + "epoch": 7.796052631578947, + "grad_norm": 0.04820878058671951, + "learning_rate": 7.739362782442021e-06, + "loss": 0.0102, + "step": 16590 + }, + { + "epoch": 7.8007518796992485, + "grad_norm": 0.12460605800151825, + "learning_rate": 7.695237378953223e-06, + "loss": 0.0119, + "step": 16600 + }, + { + "epoch": 7.805451127819548, + "grad_norm": 0.1490667760372162, + "learning_rate": 7.651227635070041e-06, + "loss": 0.0119, + "step": 16610 + }, + { + "epoch": 7.81015037593985, + "grad_norm": 0.09200643748044968, + "learning_rate": 7.607333671113409e-06, + "loss": 0.0185, + "step": 16620 + }, + { + "epoch": 7.81484962406015, + "grad_norm": 0.10247211903333664, + "learning_rate": 7.56355560708778e-06, + "loss": 0.0164, + "step": 16630 + }, + { + "epoch": 7.819548872180452, + "grad_norm": 0.1183587983250618, + "learning_rate": 7.519893562680663e-06, + "loss": 0.0116, + "step": 16640 + }, + { + "epoch": 7.8242481203007515, + "grad_norm": 0.08483126014471054, + "learning_rate": 7.476347657262456e-06, + "loss": 0.0105, + "step": 16650 + }, + { + "epoch": 7.828947368421053, + "grad_norm": 0.10963563621044159, + "learning_rate": 7.432918009885997e-06, + "loss": 0.0148, + "step": 16660 + }, + { + "epoch": 7.833646616541353, + "grad_norm": 0.03716852888464928, + "learning_rate": 7.389604739286271e-06, + "loss": 0.0125, + "step": 16670 + }, + { + "epoch": 7.838345864661655, + "grad_norm": 0.05933445319533348, + "learning_rate": 7.3464079638801365e-06, + "loss": 0.012, + "step": 16680 + }, + { + "epoch": 7.8430451127819545, + "grad_norm": 0.06644676625728607, + "learning_rate": 7.30332780176588e-06, + "loss": 0.0158, + "step": 16690 + }, + { + "epoch": 7.847744360902256, + "grad_norm": 0.0786653533577919, + "learning_rate": 7.260364370723044e-06, + "loss": 0.0142, + "step": 16700 + }, + { + "epoch": 7.852443609022556, + "grad_norm": 0.16818052530288696, + "learning_rate": 7.217517788212025e-06, + "loss": 0.0095, + "step": 16710 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 0.05747194588184357, + "learning_rate": 7.174788171373731e-06, + "loss": 0.0094, + "step": 16720 + }, + { + "epoch": 7.8618421052631575, + "grad_norm": 0.06959807127714157, + "learning_rate": 7.132175637029293e-06, + "loss": 0.0095, + "step": 16730 + }, + { + "epoch": 7.866541353383458, + "grad_norm": 0.0715508908033371, + "learning_rate": 7.089680301679752e-06, + "loss": 0.0106, + "step": 16740 + }, + { + "epoch": 7.871240601503759, + "grad_norm": 0.11585424840450287, + "learning_rate": 7.047302281505736e-06, + "loss": 0.0104, + "step": 16750 + }, + { + "epoch": 7.87593984962406, + "grad_norm": 0.09123794734477997, + "learning_rate": 7.005041692367154e-06, + "loss": 0.0151, + "step": 16760 + }, + { + "epoch": 7.8806390977443606, + "grad_norm": 0.14770396053791046, + "learning_rate": 6.962898649802823e-06, + "loss": 0.0136, + "step": 16770 + }, + { + "epoch": 7.885338345864661, + "grad_norm": 0.10611529648303986, + "learning_rate": 6.92087326903022e-06, + "loss": 0.0134, + "step": 16780 + }, + { + "epoch": 7.890037593984962, + "grad_norm": 0.07197631895542145, + "learning_rate": 6.878965664945108e-06, + "loss": 0.0148, + "step": 16790 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 0.08622337877750397, + "learning_rate": 6.837175952121306e-06, + "loss": 0.0058, + "step": 16800 + }, + { + "epoch": 7.899436090225564, + "grad_norm": 0.08402518182992935, + "learning_rate": 6.795504244810285e-06, + "loss": 0.0102, + "step": 16810 + }, + { + "epoch": 7.904135338345864, + "grad_norm": 0.07237549871206284, + "learning_rate": 6.753950656940905e-06, + "loss": 0.0126, + "step": 16820 + }, + { + "epoch": 7.908834586466165, + "grad_norm": 0.17599986493587494, + "learning_rate": 6.712515302119077e-06, + "loss": 0.0108, + "step": 16830 + }, + { + "epoch": 7.913533834586466, + "grad_norm": 0.07870358228683472, + "learning_rate": 6.671198293627479e-06, + "loss": 0.012, + "step": 16840 + }, + { + "epoch": 7.918233082706767, + "grad_norm": 0.05515943467617035, + "learning_rate": 6.629999744425236e-06, + "loss": 0.0073, + "step": 16850 + }, + { + "epoch": 7.922932330827067, + "grad_norm": 0.1197136715054512, + "learning_rate": 6.588919767147639e-06, + "loss": 0.0096, + "step": 16860 + }, + { + "epoch": 7.927631578947368, + "grad_norm": 0.055950380861759186, + "learning_rate": 6.5479584741057255e-06, + "loss": 0.0131, + "step": 16870 + }, + { + "epoch": 7.932330827067669, + "grad_norm": 0.08994955569505692, + "learning_rate": 6.5071159772861436e-06, + "loss": 0.0114, + "step": 16880 + }, + { + "epoch": 7.93703007518797, + "grad_norm": 0.059606000781059265, + "learning_rate": 6.466392388350695e-06, + "loss": 0.014, + "step": 16890 + }, + { + "epoch": 7.94172932330827, + "grad_norm": 0.11336628347635269, + "learning_rate": 6.425787818636131e-06, + "loss": 0.0109, + "step": 16900 + }, + { + "epoch": 7.946428571428571, + "grad_norm": 0.08738347887992859, + "learning_rate": 6.385302379153818e-06, + "loss": 0.0099, + "step": 16910 + }, + { + "epoch": 7.951127819548872, + "grad_norm": 0.13885965943336487, + "learning_rate": 6.344936180589351e-06, + "loss": 0.0111, + "step": 16920 + }, + { + "epoch": 7.955827067669173, + "grad_norm": 0.1789887249469757, + "learning_rate": 6.304689333302416e-06, + "loss": 0.0119, + "step": 16930 + }, + { + "epoch": 7.9605263157894735, + "grad_norm": 0.08739109337329865, + "learning_rate": 6.264561947326331e-06, + "loss": 0.0083, + "step": 16940 + }, + { + "epoch": 7.965225563909774, + "grad_norm": 0.07735337316989899, + "learning_rate": 6.22455413236786e-06, + "loss": 0.0112, + "step": 16950 + }, + { + "epoch": 7.969924812030075, + "grad_norm": 0.15434323251247406, + "learning_rate": 6.184665997806832e-06, + "loss": 0.0189, + "step": 16960 + }, + { + "epoch": 7.974624060150376, + "grad_norm": 0.07851307839155197, + "learning_rate": 6.144897652695864e-06, + "loss": 0.0143, + "step": 16970 + }, + { + "epoch": 7.9793233082706765, + "grad_norm": 0.16692528128623962, + "learning_rate": 6.1052492057601275e-06, + "loss": 0.024, + "step": 16980 + }, + { + "epoch": 7.984022556390977, + "grad_norm": 0.04660286381840706, + "learning_rate": 6.0657207653969315e-06, + "loss": 0.006, + "step": 16990 + }, + { + "epoch": 7.988721804511278, + "grad_norm": 0.18530908226966858, + "learning_rate": 6.026312439675552e-06, + "loss": 0.0106, + "step": 17000 + }, + { + "epoch": 7.993421052631579, + "grad_norm": 0.1558287888765335, + "learning_rate": 5.9870243363368275e-06, + "loss": 0.0131, + "step": 17010 + }, + { + "epoch": 7.9981203007518795, + "grad_norm": 0.08138086646795273, + "learning_rate": 5.947856562792925e-06, + "loss": 0.0156, + "step": 17020 + }, + { + "epoch": 8.00281954887218, + "grad_norm": 0.04826957359910011, + "learning_rate": 5.908809226127054e-06, + "loss": 0.0117, + "step": 17030 + }, + { + "epoch": 8.007518796992482, + "grad_norm": 0.07623440772294998, + "learning_rate": 5.869882433093155e-06, + "loss": 0.0137, + "step": 17040 + }, + { + "epoch": 8.012218045112782, + "grad_norm": 0.11140824854373932, + "learning_rate": 5.831076290115573e-06, + "loss": 0.0098, + "step": 17050 + }, + { + "epoch": 8.016917293233083, + "grad_norm": 0.045380230993032455, + "learning_rate": 5.79239090328883e-06, + "loss": 0.0102, + "step": 17060 + }, + { + "epoch": 8.021616541353383, + "grad_norm": 0.056830886751413345, + "learning_rate": 5.753826378377286e-06, + "loss": 0.0089, + "step": 17070 + }, + { + "epoch": 8.026315789473685, + "grad_norm": 0.10038434714078903, + "learning_rate": 5.715382820814885e-06, + "loss": 0.0093, + "step": 17080 + }, + { + "epoch": 8.031015037593985, + "grad_norm": 0.06969669461250305, + "learning_rate": 5.67706033570487e-06, + "loss": 0.0135, + "step": 17090 + }, + { + "epoch": 8.035714285714286, + "grad_norm": 0.16016316413879395, + "learning_rate": 5.6388590278194096e-06, + "loss": 0.0121, + "step": 17100 + }, + { + "epoch": 8.040413533834586, + "grad_norm": 0.08400869369506836, + "learning_rate": 5.600779001599455e-06, + "loss": 0.0114, + "step": 17110 + }, + { + "epoch": 8.045112781954888, + "grad_norm": 0.0524422749876976, + "learning_rate": 5.562820361154314e-06, + "loss": 0.0084, + "step": 17120 + }, + { + "epoch": 8.049812030075188, + "grad_norm": 0.1329491287469864, + "learning_rate": 5.524983210261481e-06, + "loss": 0.0126, + "step": 17130 + }, + { + "epoch": 8.05451127819549, + "grad_norm": 0.04901084676384926, + "learning_rate": 5.48726765236629e-06, + "loss": 0.0172, + "step": 17140 + }, + { + "epoch": 8.05921052631579, + "grad_norm": 0.06812364608049393, + "learning_rate": 5.449673790581611e-06, + "loss": 0.0113, + "step": 17150 + }, + { + "epoch": 8.063909774436091, + "grad_norm": 0.04741634428501129, + "learning_rate": 5.412201727687644e-06, + "loss": 0.0109, + "step": 17160 + }, + { + "epoch": 8.068609022556391, + "grad_norm": 0.0470438152551651, + "learning_rate": 5.374851566131561e-06, + "loss": 0.0092, + "step": 17170 + }, + { + "epoch": 8.073308270676693, + "grad_norm": 0.10451999306678772, + "learning_rate": 5.337623408027293e-06, + "loss": 0.0115, + "step": 17180 + }, + { + "epoch": 8.078007518796992, + "grad_norm": 0.08115876466035843, + "learning_rate": 5.300517355155215e-06, + "loss": 0.0156, + "step": 17190 + }, + { + "epoch": 8.082706766917294, + "grad_norm": 0.0681278258562088, + "learning_rate": 5.263533508961827e-06, + "loss": 0.0151, + "step": 17200 + }, + { + "epoch": 8.087406015037594, + "grad_norm": 0.11319839954376221, + "learning_rate": 5.226671970559577e-06, + "loss": 0.0129, + "step": 17210 + }, + { + "epoch": 8.092105263157896, + "grad_norm": 0.0824664905667305, + "learning_rate": 5.1899328407264855e-06, + "loss": 0.0113, + "step": 17220 + }, + { + "epoch": 8.096804511278195, + "grad_norm": 0.05062666907906532, + "learning_rate": 5.153316219905946e-06, + "loss": 0.0164, + "step": 17230 + }, + { + "epoch": 8.101503759398497, + "grad_norm": 0.10200849920511246, + "learning_rate": 5.116822208206396e-06, + "loss": 0.0158, + "step": 17240 + }, + { + "epoch": 8.106203007518797, + "grad_norm": 0.07733986526727676, + "learning_rate": 5.080450905401057e-06, + "loss": 0.0118, + "step": 17250 + }, + { + "epoch": 8.110902255639099, + "grad_norm": 0.04709453135728836, + "learning_rate": 5.044202410927706e-06, + "loss": 0.0083, + "step": 17260 + }, + { + "epoch": 8.115601503759398, + "grad_norm": 0.06094250828027725, + "learning_rate": 5.008076823888319e-06, + "loss": 0.0154, + "step": 17270 + }, + { + "epoch": 8.1203007518797, + "grad_norm": 0.04622756689786911, + "learning_rate": 4.972074243048897e-06, + "loss": 0.0103, + "step": 17280 + }, + { + "epoch": 8.125, + "grad_norm": 0.07227181643247604, + "learning_rate": 4.936194766839103e-06, + "loss": 0.0127, + "step": 17290 + }, + { + "epoch": 8.1296992481203, + "grad_norm": 0.07325026392936707, + "learning_rate": 4.900438493352055e-06, + "loss": 0.0155, + "step": 17300 + }, + { + "epoch": 8.134398496240602, + "grad_norm": 0.1052834540605545, + "learning_rate": 4.864805520344051e-06, + "loss": 0.0117, + "step": 17310 + }, + { + "epoch": 8.139097744360901, + "grad_norm": 0.09175686538219452, + "learning_rate": 4.829295945234258e-06, + "loss": 0.0082, + "step": 17320 + }, + { + "epoch": 8.143796992481203, + "grad_norm": 0.045797426253557205, + "learning_rate": 4.7939098651045235e-06, + "loss": 0.0142, + "step": 17330 + }, + { + "epoch": 8.148496240601503, + "grad_norm": 0.06085168570280075, + "learning_rate": 4.758647376699032e-06, + "loss": 0.0072, + "step": 17340 + }, + { + "epoch": 8.153195488721805, + "grad_norm": 0.15534856915473938, + "learning_rate": 4.723508576424062e-06, + "loss": 0.0157, + "step": 17350 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 0.1595873087644577, + "learning_rate": 4.688493560347773e-06, + "loss": 0.0167, + "step": 17360 + }, + { + "epoch": 8.162593984962406, + "grad_norm": 0.05131183937191963, + "learning_rate": 4.653602424199876e-06, + "loss": 0.0114, + "step": 17370 + }, + { + "epoch": 8.167293233082706, + "grad_norm": 0.12188933789730072, + "learning_rate": 4.618835263371396e-06, + "loss": 0.0119, + "step": 17380 + }, + { + "epoch": 8.171992481203008, + "grad_norm": 0.12209612131118774, + "learning_rate": 4.5841921729144424e-06, + "loss": 0.0122, + "step": 17390 + }, + { + "epoch": 8.176691729323307, + "grad_norm": 0.0490056648850441, + "learning_rate": 4.549673247541875e-06, + "loss": 0.0112, + "step": 17400 + }, + { + "epoch": 8.181390977443609, + "grad_norm": 0.1899740844964981, + "learning_rate": 4.515278581627141e-06, + "loss": 0.0082, + "step": 17410 + }, + { + "epoch": 8.186090225563909, + "grad_norm": 0.05114210397005081, + "learning_rate": 4.48100826920394e-06, + "loss": 0.0078, + "step": 17420 + }, + { + "epoch": 8.19078947368421, + "grad_norm": 0.1333758533000946, + "learning_rate": 4.446862403965984e-06, + "loss": 0.0127, + "step": 17430 + }, + { + "epoch": 8.19548872180451, + "grad_norm": 0.05620293319225311, + "learning_rate": 4.412841079266777e-06, + "loss": 0.012, + "step": 17440 + }, + { + "epoch": 8.200187969924812, + "grad_norm": 0.08491794764995575, + "learning_rate": 4.378944388119311e-06, + "loss": 0.0162, + "step": 17450 + }, + { + "epoch": 8.204887218045112, + "grad_norm": 0.08482471108436584, + "learning_rate": 4.3451724231958644e-06, + "loss": 0.0098, + "step": 17460 + }, + { + "epoch": 8.209586466165414, + "grad_norm": 0.11990874260663986, + "learning_rate": 4.311525276827682e-06, + "loss": 0.01, + "step": 17470 + }, + { + "epoch": 8.214285714285714, + "grad_norm": 0.07016388326883316, + "learning_rate": 4.27800304100478e-06, + "loss": 0.0142, + "step": 17480 + }, + { + "epoch": 8.218984962406015, + "grad_norm": 0.14324408769607544, + "learning_rate": 4.244605807375679e-06, + "loss": 0.0184, + "step": 17490 + }, + { + "epoch": 8.223684210526315, + "grad_norm": 0.06899172067642212, + "learning_rate": 4.2113336672471245e-06, + "loss": 0.0108, + "step": 17500 + }, + { + "epoch": 8.228383458646617, + "grad_norm": 0.1519225388765335, + "learning_rate": 4.178186711583904e-06, + "loss": 0.015, + "step": 17510 + }, + { + "epoch": 8.233082706766917, + "grad_norm": 0.0976828932762146, + "learning_rate": 4.145165031008508e-06, + "loss": 0.0147, + "step": 17520 + }, + { + "epoch": 8.237781954887218, + "grad_norm": 0.060619693249464035, + "learning_rate": 4.112268715800943e-06, + "loss": 0.0137, + "step": 17530 + }, + { + "epoch": 8.242481203007518, + "grad_norm": 0.05055955424904823, + "learning_rate": 4.079497855898501e-06, + "loss": 0.0148, + "step": 17540 + }, + { + "epoch": 8.24718045112782, + "grad_norm": 0.11303461343050003, + "learning_rate": 4.046852540895446e-06, + "loss": 0.0124, + "step": 17550 + }, + { + "epoch": 8.25187969924812, + "grad_norm": 0.10226847231388092, + "learning_rate": 4.01433286004283e-06, + "loss": 0.0148, + "step": 17560 + }, + { + "epoch": 8.256578947368421, + "grad_norm": 0.13229216635227203, + "learning_rate": 3.981938902248222e-06, + "loss": 0.0131, + "step": 17570 + }, + { + "epoch": 8.261278195488721, + "grad_norm": 0.10800975561141968, + "learning_rate": 3.949670756075447e-06, + "loss": 0.0193, + "step": 17580 + }, + { + "epoch": 8.265977443609023, + "grad_norm": 0.049899160861968994, + "learning_rate": 3.917528509744412e-06, + "loss": 0.0089, + "step": 17590 + }, + { + "epoch": 8.270676691729323, + "grad_norm": 0.052780695259571075, + "learning_rate": 3.885512251130763e-06, + "loss": 0.0118, + "step": 17600 + }, + { + "epoch": 8.275375939849624, + "grad_norm": 0.10531821846961975, + "learning_rate": 3.8536220677657495e-06, + "loss": 0.0231, + "step": 17610 + }, + { + "epoch": 8.280075187969924, + "grad_norm": 0.14322184026241302, + "learning_rate": 3.821858046835913e-06, + "loss": 0.0114, + "step": 17620 + }, + { + "epoch": 8.284774436090226, + "grad_norm": 0.05661779269576073, + "learning_rate": 3.790220275182854e-06, + "loss": 0.0084, + "step": 17630 + }, + { + "epoch": 8.289473684210526, + "grad_norm": 0.052848171442747116, + "learning_rate": 3.75870883930306e-06, + "loss": 0.0094, + "step": 17640 + }, + { + "epoch": 8.294172932330827, + "grad_norm": 0.133419930934906, + "learning_rate": 3.7273238253475785e-06, + "loss": 0.0199, + "step": 17650 + }, + { + "epoch": 8.298872180451127, + "grad_norm": 0.08947378396987915, + "learning_rate": 3.696065319121833e-06, + "loss": 0.0104, + "step": 17660 + }, + { + "epoch": 8.303571428571429, + "grad_norm": 0.03752607852220535, + "learning_rate": 3.664933406085402e-06, + "loss": 0.009, + "step": 17670 + }, + { + "epoch": 8.308270676691729, + "grad_norm": 0.08471754193305969, + "learning_rate": 3.6339281713517303e-06, + "loss": 0.0097, + "step": 17680 + }, + { + "epoch": 8.31296992481203, + "grad_norm": 0.07993436604738235, + "learning_rate": 3.60304969968796e-06, + "loss": 0.0107, + "step": 17690 + }, + { + "epoch": 8.31766917293233, + "grad_norm": 0.034584853798151016, + "learning_rate": 3.5722980755146517e-06, + "loss": 0.0141, + "step": 17700 + }, + { + "epoch": 8.322368421052632, + "grad_norm": 0.13245198130607605, + "learning_rate": 3.541673382905558e-06, + "loss": 0.0179, + "step": 17710 + }, + { + "epoch": 8.327067669172932, + "grad_norm": 0.09505411237478256, + "learning_rate": 3.511175705587433e-06, + "loss": 0.0125, + "step": 17720 + }, + { + "epoch": 8.331766917293233, + "grad_norm": 0.12648499011993408, + "learning_rate": 3.4808051269397512e-06, + "loss": 0.0108, + "step": 17730 + }, + { + "epoch": 8.336466165413533, + "grad_norm": 0.07255811244249344, + "learning_rate": 3.4505617299945336e-06, + "loss": 0.0073, + "step": 17740 + }, + { + "epoch": 8.341165413533835, + "grad_norm": 0.07952384650707245, + "learning_rate": 3.420445597436056e-06, + "loss": 0.0087, + "step": 17750 + }, + { + "epoch": 8.345864661654135, + "grad_norm": 0.061990268528461456, + "learning_rate": 3.390456811600673e-06, + "loss": 0.0118, + "step": 17760 + }, + { + "epoch": 8.350563909774436, + "grad_norm": 0.14404335618019104, + "learning_rate": 3.360595454476595e-06, + "loss": 0.0179, + "step": 17770 + }, + { + "epoch": 8.355263157894736, + "grad_norm": 0.13847926259040833, + "learning_rate": 3.3308616077036115e-06, + "loss": 0.0108, + "step": 17780 + }, + { + "epoch": 8.359962406015038, + "grad_norm": 0.0559711791574955, + "learning_rate": 3.301255352572946e-06, + "loss": 0.0084, + "step": 17790 + }, + { + "epoch": 8.364661654135338, + "grad_norm": 0.05780694633722305, + "learning_rate": 3.271776770026963e-06, + "loss": 0.0141, + "step": 17800 + }, + { + "epoch": 8.36936090225564, + "grad_norm": 0.09520161896944046, + "learning_rate": 3.2424259406589664e-06, + "loss": 0.0138, + "step": 17810 + }, + { + "epoch": 8.37406015037594, + "grad_norm": 0.07881022244691849, + "learning_rate": 3.213202944713023e-06, + "loss": 0.007, + "step": 17820 + }, + { + "epoch": 8.378759398496241, + "grad_norm": 0.12898702919483185, + "learning_rate": 3.1841078620836683e-06, + "loss": 0.012, + "step": 17830 + }, + { + "epoch": 8.38345864661654, + "grad_norm": 0.06967730820178986, + "learning_rate": 3.155140772315773e-06, + "loss": 0.0117, + "step": 17840 + }, + { + "epoch": 8.388157894736842, + "grad_norm": 0.14333881437778473, + "learning_rate": 3.126301754604233e-06, + "loss": 0.0112, + "step": 17850 + }, + { + "epoch": 8.392857142857142, + "grad_norm": 0.05870426073670387, + "learning_rate": 3.0975908877938277e-06, + "loss": 0.0082, + "step": 17860 + }, + { + "epoch": 8.397556390977444, + "grad_norm": 0.04919710382819176, + "learning_rate": 3.0690082503789742e-06, + "loss": 0.0095, + "step": 17870 + }, + { + "epoch": 8.402255639097744, + "grad_norm": 0.10509052872657776, + "learning_rate": 3.040553920503503e-06, + "loss": 0.0116, + "step": 17880 + }, + { + "epoch": 8.406954887218046, + "grad_norm": 0.07657311856746674, + "learning_rate": 3.0122279759604745e-06, + "loss": 0.0137, + "step": 17890 + }, + { + "epoch": 8.411654135338345, + "grad_norm": 0.14278799295425415, + "learning_rate": 2.9840304941919415e-06, + "loss": 0.0147, + "step": 17900 + }, + { + "epoch": 8.416353383458647, + "grad_norm": 0.06424115598201752, + "learning_rate": 2.9559615522887273e-06, + "loss": 0.0125, + "step": 17910 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 0.140712708234787, + "learning_rate": 2.928021226990263e-06, + "loss": 0.0109, + "step": 17920 + }, + { + "epoch": 8.425751879699249, + "grad_norm": 0.04652019590139389, + "learning_rate": 2.9002095946843277e-06, + "loss": 0.0125, + "step": 17930 + }, + { + "epoch": 8.430451127819548, + "grad_norm": 0.07694724202156067, + "learning_rate": 2.8725267314068495e-06, + "loss": 0.0075, + "step": 17940 + }, + { + "epoch": 8.43515037593985, + "grad_norm": 0.08953419327735901, + "learning_rate": 2.844972712841737e-06, + "loss": 0.0124, + "step": 17950 + }, + { + "epoch": 8.43984962406015, + "grad_norm": 0.07722273468971252, + "learning_rate": 2.817547614320615e-06, + "loss": 0.0165, + "step": 17960 + }, + { + "epoch": 8.444548872180452, + "grad_norm": 0.10457627475261688, + "learning_rate": 2.790251510822661e-06, + "loss": 0.0104, + "step": 17970 + }, + { + "epoch": 8.449248120300751, + "grad_norm": 0.07237595319747925, + "learning_rate": 2.7630844769743757e-06, + "loss": 0.0139, + "step": 17980 + }, + { + "epoch": 8.453947368421053, + "grad_norm": 0.07277540117502213, + "learning_rate": 2.73604658704939e-06, + "loss": 0.0141, + "step": 17990 + }, + { + "epoch": 8.458646616541353, + "grad_norm": 0.0630272775888443, + "learning_rate": 2.7091379149682685e-06, + "loss": 0.0126, + "step": 18000 + }, + { + "epoch": 8.463345864661655, + "grad_norm": 0.09985774755477905, + "learning_rate": 2.682358534298285e-06, + "loss": 0.0135, + "step": 18010 + }, + { + "epoch": 8.468045112781954, + "grad_norm": 0.05475354194641113, + "learning_rate": 2.6557085182532582e-06, + "loss": 0.0094, + "step": 18020 + }, + { + "epoch": 8.472744360902256, + "grad_norm": 0.145661398768425, + "learning_rate": 2.6291879396933004e-06, + "loss": 0.0104, + "step": 18030 + }, + { + "epoch": 8.477443609022556, + "grad_norm": 0.04099111631512642, + "learning_rate": 2.602796871124663e-06, + "loss": 0.0068, + "step": 18040 + }, + { + "epoch": 8.482142857142858, + "grad_norm": 0.09392759948968887, + "learning_rate": 2.57653538469953e-06, + "loss": 0.0115, + "step": 18050 + }, + { + "epoch": 8.486842105263158, + "grad_norm": 0.06434937566518784, + "learning_rate": 2.5504035522157854e-06, + "loss": 0.0123, + "step": 18060 + }, + { + "epoch": 8.49154135338346, + "grad_norm": 0.12161760032176971, + "learning_rate": 2.5244014451168863e-06, + "loss": 0.009, + "step": 18070 + }, + { + "epoch": 8.496240601503759, + "grad_norm": 0.06177238002419472, + "learning_rate": 2.4985291344915674e-06, + "loss": 0.0085, + "step": 18080 + }, + { + "epoch": 8.50093984962406, + "grad_norm": 0.10747835785150528, + "learning_rate": 2.4727866910737583e-06, + "loss": 0.0115, + "step": 18090 + }, + { + "epoch": 8.50563909774436, + "grad_norm": 0.09225572645664215, + "learning_rate": 2.4471741852423237e-06, + "loss": 0.0126, + "step": 18100 + }, + { + "epoch": 8.510338345864662, + "grad_norm": 0.11941071599721909, + "learning_rate": 2.421691687020855e-06, + "loss": 0.018, + "step": 18110 + }, + { + "epoch": 8.515037593984962, + "grad_norm": 0.052437394857406616, + "learning_rate": 2.3963392660775575e-06, + "loss": 0.0105, + "step": 18120 + }, + { + "epoch": 8.519736842105264, + "grad_norm": 0.13413766026496887, + "learning_rate": 2.371116991724953e-06, + "loss": 0.0083, + "step": 18130 + }, + { + "epoch": 8.524436090225564, + "grad_norm": 0.0477786548435688, + "learning_rate": 2.3460249329197824e-06, + "loss": 0.0128, + "step": 18140 + }, + { + "epoch": 8.529135338345865, + "grad_norm": 0.05250224471092224, + "learning_rate": 2.321063158262793e-06, + "loss": 0.0166, + "step": 18150 + }, + { + "epoch": 8.533834586466165, + "grad_norm": 0.0531403049826622, + "learning_rate": 2.296231735998511e-06, + "loss": 0.0072, + "step": 18160 + }, + { + "epoch": 8.538533834586467, + "grad_norm": 0.046157341450452805, + "learning_rate": 2.271530734015104e-06, + "loss": 0.0101, + "step": 18170 + }, + { + "epoch": 8.543233082706767, + "grad_norm": 0.16874974966049194, + "learning_rate": 2.2469602198441573e-06, + "loss": 0.0169, + "step": 18180 + }, + { + "epoch": 8.547932330827068, + "grad_norm": 0.08781706541776657, + "learning_rate": 2.222520260660521e-06, + "loss": 0.0087, + "step": 18190 + }, + { + "epoch": 8.552631578947368, + "grad_norm": 0.08364150673151016, + "learning_rate": 2.1982109232821178e-06, + "loss": 0.0154, + "step": 18200 + }, + { + "epoch": 8.55733082706767, + "grad_norm": 0.09896906465291977, + "learning_rate": 2.174032274169746e-06, + "loss": 0.0137, + "step": 18210 + }, + { + "epoch": 8.56203007518797, + "grad_norm": 0.1566537767648697, + "learning_rate": 2.149984379426906e-06, + "loss": 0.0129, + "step": 18220 + }, + { + "epoch": 8.566729323308271, + "grad_norm": 0.08905629068613052, + "learning_rate": 2.1260673047996227e-06, + "loss": 0.0164, + "step": 18230 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.1383201777935028, + "learning_rate": 2.102281115676258e-06, + "loss": 0.0088, + "step": 18240 + }, + { + "epoch": 8.576127819548873, + "grad_norm": 0.054648809134960175, + "learning_rate": 2.0786258770873647e-06, + "loss": 0.0099, + "step": 18250 + }, + { + "epoch": 8.580827067669173, + "grad_norm": 0.0630398541688919, + "learning_rate": 2.0551016537054493e-06, + "loss": 0.0077, + "step": 18260 + }, + { + "epoch": 8.585526315789474, + "grad_norm": 0.12112493067979813, + "learning_rate": 2.0317085098448372e-06, + "loss": 0.0114, + "step": 18270 + }, + { + "epoch": 8.590225563909774, + "grad_norm": 0.12041357904672623, + "learning_rate": 2.008446509461498e-06, + "loss": 0.009, + "step": 18280 + }, + { + "epoch": 8.594924812030076, + "grad_norm": 0.10477473586797714, + "learning_rate": 1.985315716152847e-06, + "loss": 0.0115, + "step": 18290 + }, + { + "epoch": 8.599624060150376, + "grad_norm": 0.05041252076625824, + "learning_rate": 1.962316193157593e-06, + "loss": 0.0218, + "step": 18300 + }, + { + "epoch": 8.604323308270677, + "grad_norm": 0.027872784063220024, + "learning_rate": 1.939448003355554e-06, + "loss": 0.0138, + "step": 18310 + }, + { + "epoch": 8.609022556390977, + "grad_norm": 0.048155125230550766, + "learning_rate": 1.91671120926748e-06, + "loss": 0.0119, + "step": 18320 + }, + { + "epoch": 8.613721804511279, + "grad_norm": 0.10073873400688171, + "learning_rate": 1.8941058730549132e-06, + "loss": 0.0114, + "step": 18330 + }, + { + "epoch": 8.618421052631579, + "grad_norm": 0.07115645706653595, + "learning_rate": 1.8716320565199618e-06, + "loss": 0.0122, + "step": 18340 + }, + { + "epoch": 8.62312030075188, + "grad_norm": 0.16081885993480682, + "learning_rate": 1.849289821105199e-06, + "loss": 0.0192, + "step": 18350 + }, + { + "epoch": 8.62781954887218, + "grad_norm": 0.11983854323625565, + "learning_rate": 1.8270792278934302e-06, + "loss": 0.0118, + "step": 18360 + }, + { + "epoch": 8.632518796992482, + "grad_norm": 0.086446613073349, + "learning_rate": 1.8050003376075707e-06, + "loss": 0.012, + "step": 18370 + }, + { + "epoch": 8.637218045112782, + "grad_norm": 0.10558143258094788, + "learning_rate": 1.7830532106104747e-06, + "loss": 0.0092, + "step": 18380 + }, + { + "epoch": 8.641917293233083, + "grad_norm": 0.07899550348520279, + "learning_rate": 1.7612379069047335e-06, + "loss": 0.0104, + "step": 18390 + }, + { + "epoch": 8.646616541353383, + "grad_norm": 0.07186214625835419, + "learning_rate": 1.7395544861325718e-06, + "loss": 0.0146, + "step": 18400 + }, + { + "epoch": 8.651315789473685, + "grad_norm": 0.06743978708982468, + "learning_rate": 1.7180030075756136e-06, + "loss": 0.0087, + "step": 18410 + }, + { + "epoch": 8.656015037593985, + "grad_norm": 0.06802039593458176, + "learning_rate": 1.696583530154794e-06, + "loss": 0.012, + "step": 18420 + }, + { + "epoch": 8.660714285714286, + "grad_norm": 0.13704046607017517, + "learning_rate": 1.6752961124301415e-06, + "loss": 0.0184, + "step": 18430 + }, + { + "epoch": 8.665413533834586, + "grad_norm": 0.07604125887155533, + "learning_rate": 1.6541408126006463e-06, + "loss": 0.01, + "step": 18440 + }, + { + "epoch": 8.670112781954888, + "grad_norm": 0.09493885189294815, + "learning_rate": 1.6331176885040878e-06, + "loss": 0.0146, + "step": 18450 + }, + { + "epoch": 8.674812030075188, + "grad_norm": 0.0435079000890255, + "learning_rate": 1.6122267976168781e-06, + "loss": 0.0119, + "step": 18460 + }, + { + "epoch": 8.67951127819549, + "grad_norm": 0.06214023381471634, + "learning_rate": 1.5914681970539192e-06, + "loss": 0.0127, + "step": 18470 + }, + { + "epoch": 8.68421052631579, + "grad_norm": 0.11379414051771164, + "learning_rate": 1.5708419435684462e-06, + "loss": 0.0124, + "step": 18480 + }, + { + "epoch": 8.688909774436091, + "grad_norm": 0.08199574053287506, + "learning_rate": 1.550348093551829e-06, + "loss": 0.0173, + "step": 18490 + }, + { + "epoch": 8.693609022556391, + "grad_norm": 0.04369651526212692, + "learning_rate": 1.5299867030334814e-06, + "loss": 0.0115, + "step": 18500 + }, + { + "epoch": 8.698308270676693, + "grad_norm": 0.040135458111763, + "learning_rate": 1.5097578276806633e-06, + "loss": 0.0123, + "step": 18510 + }, + { + "epoch": 8.703007518796992, + "grad_norm": 0.06963648647069931, + "learning_rate": 1.4896615227983468e-06, + "loss": 0.0098, + "step": 18520 + }, + { + "epoch": 8.707706766917294, + "grad_norm": 0.048562925308942795, + "learning_rate": 1.4696978433290653e-06, + "loss": 0.0158, + "step": 18530 + }, + { + "epoch": 8.712406015037594, + "grad_norm": 0.06549891829490662, + "learning_rate": 1.4498668438527597e-06, + "loss": 0.011, + "step": 18540 + }, + { + "epoch": 8.717105263157894, + "grad_norm": 0.07452305406332016, + "learning_rate": 1.4301685785866214e-06, + "loss": 0.0083, + "step": 18550 + }, + { + "epoch": 8.721804511278195, + "grad_norm": 0.1147482693195343, + "learning_rate": 1.4106031013849496e-06, + "loss": 0.0128, + "step": 18560 + }, + { + "epoch": 8.726503759398497, + "grad_norm": 0.14861489832401276, + "learning_rate": 1.3911704657390113e-06, + "loss": 0.0117, + "step": 18570 + }, + { + "epoch": 8.731203007518797, + "grad_norm": 0.1281604766845703, + "learning_rate": 1.3718707247769135e-06, + "loss": 0.0092, + "step": 18580 + }, + { + "epoch": 8.735902255639097, + "grad_norm": 0.06106063723564148, + "learning_rate": 1.3527039312633827e-06, + "loss": 0.0061, + "step": 18590 + }, + { + "epoch": 8.740601503759398, + "grad_norm": 0.1314494013786316, + "learning_rate": 1.333670137599713e-06, + "loss": 0.019, + "step": 18600 + }, + { + "epoch": 8.7453007518797, + "grad_norm": 0.12703455984592438, + "learning_rate": 1.3147693958235618e-06, + "loss": 0.0085, + "step": 18610 + }, + { + "epoch": 8.75, + "grad_norm": 0.05815136432647705, + "learning_rate": 1.2960017576088446e-06, + "loss": 0.0151, + "step": 18620 + }, + { + "epoch": 8.7546992481203, + "grad_norm": 0.08019871264696121, + "learning_rate": 1.2773672742655784e-06, + "loss": 0.0127, + "step": 18630 + }, + { + "epoch": 8.759398496240602, + "grad_norm": 0.12885844707489014, + "learning_rate": 1.2588659967397e-06, + "loss": 0.0094, + "step": 18640 + }, + { + "epoch": 8.764097744360903, + "grad_norm": 0.0936972126364708, + "learning_rate": 1.2404979756130142e-06, + "loss": 0.0141, + "step": 18650 + }, + { + "epoch": 8.768796992481203, + "grad_norm": 0.09744187444448471, + "learning_rate": 1.222263261102985e-06, + "loss": 0.0156, + "step": 18660 + }, + { + "epoch": 8.773496240601503, + "grad_norm": 0.14510540664196014, + "learning_rate": 1.2041619030626284e-06, + "loss": 0.0147, + "step": 18670 + }, + { + "epoch": 8.778195488721805, + "grad_norm": 0.05950562283396721, + "learning_rate": 1.1861939509803687e-06, + "loss": 0.0137, + "step": 18680 + }, + { + "epoch": 8.782894736842106, + "grad_norm": 0.1445922553539276, + "learning_rate": 1.1683594539798893e-06, + "loss": 0.0108, + "step": 18690 + }, + { + "epoch": 8.787593984962406, + "grad_norm": 0.05897856876254082, + "learning_rate": 1.1506584608200367e-06, + "loss": 0.007, + "step": 18700 + }, + { + "epoch": 8.792293233082706, + "grad_norm": 0.06876976042985916, + "learning_rate": 1.1330910198946442e-06, + "loss": 0.0106, + "step": 18710 + }, + { + "epoch": 8.796992481203008, + "grad_norm": 0.1327231377363205, + "learning_rate": 1.1156571792324211e-06, + "loss": 0.0147, + "step": 18720 + }, + { + "epoch": 8.801691729323307, + "grad_norm": 0.11447696387767792, + "learning_rate": 1.0983569864968346e-06, + "loss": 0.0171, + "step": 18730 + }, + { + "epoch": 8.806390977443609, + "grad_norm": 0.07379591464996338, + "learning_rate": 1.0811904889859336e-06, + "loss": 0.0151, + "step": 18740 + }, + { + "epoch": 8.811090225563909, + "grad_norm": 0.11582107096910477, + "learning_rate": 1.064157733632276e-06, + "loss": 0.0119, + "step": 18750 + }, + { + "epoch": 8.81578947368421, + "grad_norm": 0.15303994715213776, + "learning_rate": 1.0472587670027678e-06, + "loss": 0.015, + "step": 18760 + }, + { + "epoch": 8.82048872180451, + "grad_norm": 0.12110389024019241, + "learning_rate": 1.030493635298535e-06, + "loss": 0.013, + "step": 18770 + }, + { + "epoch": 8.825187969924812, + "grad_norm": 0.0908411517739296, + "learning_rate": 1.0138623843548078e-06, + "loss": 0.0098, + "step": 18780 + }, + { + "epoch": 8.829887218045112, + "grad_norm": 0.08263817429542542, + "learning_rate": 9.97365059640787e-07, + "loss": 0.0151, + "step": 18790 + }, + { + "epoch": 8.834586466165414, + "grad_norm": 0.09736278653144836, + "learning_rate": 9.810017062595322e-07, + "loss": 0.01, + "step": 18800 + }, + { + "epoch": 8.839285714285714, + "grad_norm": 0.13054323196411133, + "learning_rate": 9.647723689478305e-07, + "loss": 0.0113, + "step": 18810 + }, + { + "epoch": 8.843984962406015, + "grad_norm": 0.09773892909288406, + "learning_rate": 9.486770920760668e-07, + "loss": 0.0102, + "step": 18820 + }, + { + "epoch": 8.848684210526315, + "grad_norm": 0.15423932671546936, + "learning_rate": 9.327159196481138e-07, + "loss": 0.0133, + "step": 18830 + }, + { + "epoch": 8.853383458646617, + "grad_norm": 0.15199892222881317, + "learning_rate": 9.168888953011989e-07, + "loss": 0.0133, + "step": 18840 + }, + { + "epoch": 8.858082706766917, + "grad_norm": 0.0423540361225605, + "learning_rate": 9.011960623058202e-07, + "loss": 0.0098, + "step": 18850 + }, + { + "epoch": 8.862781954887218, + "grad_norm": 0.120146244764328, + "learning_rate": 8.856374635655695e-07, + "loss": 0.0126, + "step": 18860 + }, + { + "epoch": 8.867481203007518, + "grad_norm": 0.06241992488503456, + "learning_rate": 8.702131416170656e-07, + "loss": 0.0084, + "step": 18870 + }, + { + "epoch": 8.87218045112782, + "grad_norm": 0.137865349650383, + "learning_rate": 8.549231386298151e-07, + "loss": 0.0227, + "step": 18880 + }, + { + "epoch": 8.87687969924812, + "grad_norm": 0.04242929443717003, + "learning_rate": 8.397674964061075e-07, + "loss": 0.0134, + "step": 18890 + }, + { + "epoch": 8.881578947368421, + "grad_norm": 0.05079125985503197, + "learning_rate": 8.247462563808817e-07, + "loss": 0.0117, + "step": 18900 + }, + { + "epoch": 8.886278195488721, + "grad_norm": 0.1343405693769455, + "learning_rate": 8.098594596216424e-07, + "loss": 0.014, + "step": 18910 + }, + { + "epoch": 8.890977443609023, + "grad_norm": 0.05027283728122711, + "learning_rate": 7.951071468283167e-07, + "loss": 0.0117, + "step": 18920 + }, + { + "epoch": 8.895676691729323, + "grad_norm": 0.0534767210483551, + "learning_rate": 7.804893583331696e-07, + "loss": 0.0134, + "step": 18930 + }, + { + "epoch": 8.900375939849624, + "grad_norm": 0.15196926891803741, + "learning_rate": 7.66006134100672e-07, + "loss": 0.0141, + "step": 18940 + }, + { + "epoch": 8.905075187969924, + "grad_norm": 0.05484990403056145, + "learning_rate": 7.516575137274162e-07, + "loss": 0.0105, + "step": 18950 + }, + { + "epoch": 8.909774436090226, + "grad_norm": 0.07123623043298721, + "learning_rate": 7.374435364419674e-07, + "loss": 0.0112, + "step": 18960 + }, + { + "epoch": 8.914473684210526, + "grad_norm": 0.1264994591474533, + "learning_rate": 7.233642411048014e-07, + "loss": 0.0135, + "step": 18970 + }, + { + "epoch": 8.919172932330827, + "grad_norm": 0.07649128884077072, + "learning_rate": 7.094196662081831e-07, + "loss": 0.015, + "step": 18980 + }, + { + "epoch": 8.923872180451127, + "grad_norm": 0.08238343894481659, + "learning_rate": 6.956098498760389e-07, + "loss": 0.0169, + "step": 18990 + }, + { + "epoch": 8.928571428571429, + "grad_norm": 0.06182100623846054, + "learning_rate": 6.819348298638839e-07, + "loss": 0.0082, + "step": 19000 + }, + { + "epoch": 8.933270676691729, + "grad_norm": 0.07743122428655624, + "learning_rate": 6.683946435586952e-07, + "loss": 0.0283, + "step": 19010 + }, + { + "epoch": 8.93796992481203, + "grad_norm": 0.05746229737997055, + "learning_rate": 6.549893279788277e-07, + "loss": 0.0095, + "step": 19020 + }, + { + "epoch": 8.94266917293233, + "grad_norm": 0.1159047782421112, + "learning_rate": 6.417189197739093e-07, + "loss": 0.015, + "step": 19030 + }, + { + "epoch": 8.947368421052632, + "grad_norm": 0.13121691346168518, + "learning_rate": 6.285834552247128e-07, + "loss": 0.0136, + "step": 19040 + }, + { + "epoch": 8.952067669172932, + "grad_norm": 0.10032495856285095, + "learning_rate": 6.15582970243117e-07, + "loss": 0.0108, + "step": 19050 + }, + { + "epoch": 8.956766917293233, + "grad_norm": 0.08989301323890686, + "learning_rate": 6.027175003719354e-07, + "loss": 0.0147, + "step": 19060 + }, + { + "epoch": 8.961466165413533, + "grad_norm": 0.08607086539268494, + "learning_rate": 5.899870807848762e-07, + "loss": 0.0162, + "step": 19070 + }, + { + "epoch": 8.966165413533835, + "grad_norm": 0.07855894416570663, + "learning_rate": 5.773917462864264e-07, + "loss": 0.0154, + "step": 19080 + }, + { + "epoch": 8.970864661654135, + "grad_norm": 0.17435581982135773, + "learning_rate": 5.64931531311741e-07, + "loss": 0.0158, + "step": 19090 + }, + { + "epoch": 8.975563909774436, + "grad_norm": 0.09805602580308914, + "learning_rate": 5.526064699265753e-07, + "loss": 0.0158, + "step": 19100 + }, + { + "epoch": 8.980263157894736, + "grad_norm": 0.0758136659860611, + "learning_rate": 5.404165958271811e-07, + "loss": 0.0129, + "step": 19110 + }, + { + "epoch": 8.984962406015038, + "grad_norm": 0.14450609683990479, + "learning_rate": 5.283619423401998e-07, + "loss": 0.0197, + "step": 19120 + }, + { + "epoch": 8.989661654135338, + "grad_norm": 0.054334018379449844, + "learning_rate": 5.164425424226016e-07, + "loss": 0.0079, + "step": 19130 + }, + { + "epoch": 8.99436090225564, + "grad_norm": 0.1322498619556427, + "learning_rate": 5.046584286615697e-07, + "loss": 0.0155, + "step": 19140 + }, + { + "epoch": 8.99906015037594, + "grad_norm": 0.10678986459970474, + "learning_rate": 4.930096332744105e-07, + "loss": 0.0112, + "step": 19150 + }, + { + "epoch": 9.003759398496241, + "grad_norm": 0.12043260782957077, + "learning_rate": 4.814961881085045e-07, + "loss": 0.0139, + "step": 19160 + }, + { + "epoch": 9.00845864661654, + "grad_norm": 0.14756977558135986, + "learning_rate": 4.701181246411501e-07, + "loss": 0.0164, + "step": 19170 + }, + { + "epoch": 9.013157894736842, + "grad_norm": 0.04383913427591324, + "learning_rate": 4.5887547397955864e-07, + "loss": 0.0075, + "step": 19180 + }, + { + "epoch": 9.017857142857142, + "grad_norm": 0.23295745253562927, + "learning_rate": 4.4776826686069305e-07, + "loss": 0.0109, + "step": 19190 + }, + { + "epoch": 9.022556390977444, + "grad_norm": 0.09766188263893127, + "learning_rate": 4.367965336512403e-07, + "loss": 0.0096, + "step": 19200 + }, + { + "epoch": 9.027255639097744, + "grad_norm": 0.2106829732656479, + "learning_rate": 4.259603043475002e-07, + "loss": 0.0166, + "step": 19210 + }, + { + "epoch": 9.031954887218046, + "grad_norm": 0.04127572849392891, + "learning_rate": 4.1525960857530243e-07, + "loss": 0.0162, + "step": 19220 + }, + { + "epoch": 9.036654135338345, + "grad_norm": 0.07691402733325958, + "learning_rate": 4.0469447558995065e-07, + "loss": 0.0186, + "step": 19230 + }, + { + "epoch": 9.041353383458647, + "grad_norm": 0.18556630611419678, + "learning_rate": 3.9426493427611177e-07, + "loss": 0.0151, + "step": 19240 + }, + { + "epoch": 9.046052631578947, + "grad_norm": 0.10369502753019333, + "learning_rate": 3.839710131477492e-07, + "loss": 0.0174, + "step": 19250 + }, + { + "epoch": 9.050751879699249, + "grad_norm": 0.1650490164756775, + "learning_rate": 3.738127403480507e-07, + "loss": 0.01, + "step": 19260 + }, + { + "epoch": 9.055451127819548, + "grad_norm": 0.12048038840293884, + "learning_rate": 3.637901436493507e-07, + "loss": 0.0108, + "step": 19270 + }, + { + "epoch": 9.06015037593985, + "grad_norm": 0.049617137759923935, + "learning_rate": 3.5390325045304706e-07, + "loss": 0.0094, + "step": 19280 + }, + { + "epoch": 9.06484962406015, + "grad_norm": 0.12197034060955048, + "learning_rate": 3.441520877895288e-07, + "loss": 0.0161, + "step": 19290 + }, + { + "epoch": 9.069548872180452, + "grad_norm": 0.05120290815830231, + "learning_rate": 3.3453668231809286e-07, + "loss": 0.0109, + "step": 19300 + }, + { + "epoch": 9.074248120300751, + "grad_norm": 0.04005982726812363, + "learning_rate": 3.250570603268943e-07, + "loss": 0.0133, + "step": 19310 + }, + { + "epoch": 9.078947368421053, + "grad_norm": 0.10951024293899536, + "learning_rate": 3.157132477328628e-07, + "loss": 0.0111, + "step": 19320 + }, + { + "epoch": 9.083646616541353, + "grad_norm": 0.12475921213626862, + "learning_rate": 3.0650527008162513e-07, + "loss": 0.0113, + "step": 19330 + }, + { + "epoch": 9.088345864661655, + "grad_norm": 0.0481477752327919, + "learning_rate": 2.9743315254743833e-07, + "loss": 0.0079, + "step": 19340 + }, + { + "epoch": 9.093045112781954, + "grad_norm": 0.12329145520925522, + "learning_rate": 2.8849691993311777e-07, + "loss": 0.0146, + "step": 19350 + }, + { + "epoch": 9.097744360902256, + "grad_norm": 0.08423899859189987, + "learning_rate": 2.796965966699927e-07, + "loss": 0.0156, + "step": 19360 + }, + { + "epoch": 9.102443609022556, + "grad_norm": 0.11121213436126709, + "learning_rate": 2.7103220681780615e-07, + "loss": 0.0111, + "step": 19370 + }, + { + "epoch": 9.107142857142858, + "grad_norm": 0.10711877793073654, + "learning_rate": 2.625037740646763e-07, + "loss": 0.0107, + "step": 19380 + }, + { + "epoch": 9.111842105263158, + "grad_norm": 0.05875176563858986, + "learning_rate": 2.5411132172700194e-07, + "loss": 0.0114, + "step": 19390 + }, + { + "epoch": 9.11654135338346, + "grad_norm": 0.06813649088144302, + "learning_rate": 2.458548727494292e-07, + "loss": 0.009, + "step": 19400 + }, + { + "epoch": 9.121240601503759, + "grad_norm": 0.047655753791332245, + "learning_rate": 2.3773444970477955e-07, + "loss": 0.0128, + "step": 19410 + }, + { + "epoch": 9.12593984962406, + "grad_norm": 0.11299572139978409, + "learning_rate": 2.2975007479397738e-07, + "loss": 0.0134, + "step": 19420 + }, + { + "epoch": 9.13063909774436, + "grad_norm": 0.08376818895339966, + "learning_rate": 2.219017698460002e-07, + "loss": 0.0177, + "step": 19430 + }, + { + "epoch": 9.135338345864662, + "grad_norm": 0.15395602583885193, + "learning_rate": 2.1418955631781202e-07, + "loss": 0.0103, + "step": 19440 + }, + { + "epoch": 9.140037593984962, + "grad_norm": 0.09228936582803726, + "learning_rate": 2.0661345529430775e-07, + "loss": 0.0129, + "step": 19450 + }, + { + "epoch": 9.144736842105264, + "grad_norm": 0.1360449194908142, + "learning_rate": 1.9917348748826335e-07, + "loss": 0.0128, + "step": 19460 + }, + { + "epoch": 9.149436090225564, + "grad_norm": 0.10789410024881363, + "learning_rate": 1.918696732402636e-07, + "loss": 0.0124, + "step": 19470 + }, + { + "epoch": 9.154135338345865, + "grad_norm": 0.12555211782455444, + "learning_rate": 1.847020325186577e-07, + "loss": 0.0123, + "step": 19480 + }, + { + "epoch": 9.158834586466165, + "grad_norm": 0.11357904225587845, + "learning_rate": 1.776705849195037e-07, + "loss": 0.0086, + "step": 19490 + }, + { + "epoch": 9.163533834586467, + "grad_norm": 0.09295064955949783, + "learning_rate": 1.7077534966650766e-07, + "loss": 0.0147, + "step": 19500 + }, + { + "epoch": 9.168233082706767, + "grad_norm": 0.1021043211221695, + "learning_rate": 1.6401634561098444e-07, + "loss": 0.0081, + "step": 19510 + }, + { + "epoch": 9.172932330827068, + "grad_norm": 0.07909571379423141, + "learning_rate": 1.5739359123178587e-07, + "loss": 0.011, + "step": 19520 + }, + { + "epoch": 9.177631578947368, + "grad_norm": 0.047228168696165085, + "learning_rate": 1.5090710463527836e-07, + "loss": 0.0141, + "step": 19530 + }, + { + "epoch": 9.18233082706767, + "grad_norm": 0.08932602405548096, + "learning_rate": 1.4455690355525964e-07, + "loss": 0.01, + "step": 19540 + }, + { + "epoch": 9.18703007518797, + "grad_norm": 0.1202157661318779, + "learning_rate": 1.383430053529422e-07, + "loss": 0.0112, + "step": 19550 + }, + { + "epoch": 9.191729323308271, + "grad_norm": 0.027310887351632118, + "learning_rate": 1.3226542701689215e-07, + "loss": 0.0142, + "step": 19560 + }, + { + "epoch": 9.196428571428571, + "grad_norm": 0.08331390470266342, + "learning_rate": 1.2632418516296262e-07, + "loss": 0.0088, + "step": 19570 + }, + { + "epoch": 9.201127819548873, + "grad_norm": 0.12556742131710052, + "learning_rate": 1.2051929603428825e-07, + "loss": 0.0204, + "step": 19580 + }, + { + "epoch": 9.205827067669173, + "grad_norm": 0.05147033929824829, + "learning_rate": 1.1485077550122402e-07, + "loss": 0.0075, + "step": 19590 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 0.061899200081825256, + "learning_rate": 1.0931863906127327e-07, + "loss": 0.0089, + "step": 19600 + }, + { + "epoch": 9.215225563909774, + "grad_norm": 0.05099288746714592, + "learning_rate": 1.0392290183909304e-07, + "loss": 0.0115, + "step": 19610 + }, + { + "epoch": 9.219924812030076, + "grad_norm": 0.04262029007077217, + "learning_rate": 9.866357858642205e-08, + "loss": 0.0097, + "step": 19620 + }, + { + "epoch": 9.224624060150376, + "grad_norm": 0.045728232711553574, + "learning_rate": 9.354068368204739e-08, + "loss": 0.012, + "step": 19630 + }, + { + "epoch": 9.229323308270677, + "grad_norm": 0.09205039590597153, + "learning_rate": 8.855423113177664e-08, + "loss": 0.0112, + "step": 19640 + }, + { + "epoch": 9.234022556390977, + "grad_norm": 0.048479776829481125, + "learning_rate": 8.37042345683714e-08, + "loss": 0.0205, + "step": 19650 + }, + { + "epoch": 9.238721804511279, + "grad_norm": 0.046253811568021774, + "learning_rate": 7.899070725153613e-08, + "loss": 0.0117, + "step": 19660 + }, + { + "epoch": 9.243421052631579, + "grad_norm": 0.10509341955184937, + "learning_rate": 7.44136620678848e-08, + "loss": 0.0113, + "step": 19670 + }, + { + "epoch": 9.24812030075188, + "grad_norm": 0.09187602251768112, + "learning_rate": 6.997311153086883e-08, + "loss": 0.0172, + "step": 19680 + }, + { + "epoch": 9.25281954887218, + "grad_norm": 0.04167873412370682, + "learning_rate": 6.566906778079917e-08, + "loss": 0.0109, + "step": 19690 + }, + { + "epoch": 9.257518796992482, + "grad_norm": 0.1210450828075409, + "learning_rate": 6.150154258476315e-08, + "loss": 0.012, + "step": 19700 + }, + { + "epoch": 9.262218045112782, + "grad_norm": 0.03811037912964821, + "learning_rate": 5.747054733660773e-08, + "loss": 0.0223, + "step": 19710 + }, + { + "epoch": 9.266917293233083, + "grad_norm": 0.10711174458265305, + "learning_rate": 5.3576093056922906e-08, + "loss": 0.0092, + "step": 19720 + }, + { + "epoch": 9.271616541353383, + "grad_norm": 0.07966139167547226, + "learning_rate": 4.981819039300284e-08, + "loss": 0.011, + "step": 19730 + }, + { + "epoch": 9.276315789473685, + "grad_norm": 0.11096978932619095, + "learning_rate": 4.619684961881254e-08, + "loss": 0.0075, + "step": 19740 + }, + { + "epoch": 9.281015037593985, + "grad_norm": 0.08339618891477585, + "learning_rate": 4.2712080634949024e-08, + "loss": 0.0186, + "step": 19750 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 0.11398918181657791, + "learning_rate": 3.936389296864129e-08, + "loss": 0.0088, + "step": 19760 + }, + { + "epoch": 9.290413533834586, + "grad_norm": 0.04430542141199112, + "learning_rate": 3.615229577371149e-08, + "loss": 0.0149, + "step": 19770 + }, + { + "epoch": 9.295112781954888, + "grad_norm": 0.06718280166387558, + "learning_rate": 3.3077297830541584e-08, + "loss": 0.0172, + "step": 19780 + }, + { + "epoch": 9.299812030075188, + "grad_norm": 0.11311540752649307, + "learning_rate": 3.01389075460512e-08, + "loss": 0.0123, + "step": 19790 + }, + { + "epoch": 9.30451127819549, + "grad_norm": 0.06545548141002655, + "learning_rate": 2.7337132953697554e-08, + "loss": 0.0166, + "step": 19800 + }, + { + "epoch": 9.30921052631579, + "grad_norm": 0.07823529839515686, + "learning_rate": 2.467198171342e-08, + "loss": 0.0132, + "step": 19810 + }, + { + "epoch": 9.313909774436091, + "grad_norm": 0.06555328518152237, + "learning_rate": 2.214346111164556e-08, + "loss": 0.0115, + "step": 19820 + }, + { + "epoch": 9.318609022556391, + "grad_norm": 0.04978393763303757, + "learning_rate": 1.9751578061244504e-08, + "loss": 0.0252, + "step": 19830 + }, + { + "epoch": 9.323308270676693, + "grad_norm": 0.04655987396836281, + "learning_rate": 1.749633910153592e-08, + "loss": 0.0098, + "step": 19840 + }, + { + "epoch": 9.328007518796992, + "grad_norm": 0.04883876070380211, + "learning_rate": 1.5377750398265502e-08, + "loss": 0.009, + "step": 19850 + }, + { + "epoch": 9.332706766917294, + "grad_norm": 0.04160952940583229, + "learning_rate": 1.3395817743561134e-08, + "loss": 0.0088, + "step": 19860 + }, + { + "epoch": 9.337406015037594, + "grad_norm": 0.14190863072872162, + "learning_rate": 1.1550546555960662e-08, + "loss": 0.0189, + "step": 19870 + }, + { + "epoch": 9.342105263157896, + "grad_norm": 0.1261986643075943, + "learning_rate": 9.841941880361916e-09, + "loss": 0.0123, + "step": 19880 + }, + { + "epoch": 9.346804511278195, + "grad_norm": 0.13026973605155945, + "learning_rate": 8.270008388022721e-09, + "loss": 0.019, + "step": 19890 + }, + { + "epoch": 9.351503759398497, + "grad_norm": 0.04853995516896248, + "learning_rate": 6.834750376549792e-09, + "loss": 0.0128, + "step": 19900 + }, + { + "epoch": 9.356203007518797, + "grad_norm": 0.06438510119915009, + "learning_rate": 5.536171769887632e-09, + "loss": 0.0175, + "step": 19910 + }, + { + "epoch": 9.360902255639097, + "grad_norm": 0.0485045500099659, + "learning_rate": 4.3742761183018784e-09, + "loss": 0.0112, + "step": 19920 + }, + { + "epoch": 9.365601503759398, + "grad_norm": 0.0634605661034584, + "learning_rate": 3.349066598362649e-09, + "loss": 0.0086, + "step": 19930 + }, + { + "epoch": 9.3703007518797, + "grad_norm": 0.05341292545199394, + "learning_rate": 2.4605460129556445e-09, + "loss": 0.0113, + "step": 19940 + }, + { + "epoch": 9.375, + "grad_norm": 0.10371940582990646, + "learning_rate": 1.7087167912710478e-09, + "loss": 0.0153, + "step": 19950 + }, + { + "epoch": 9.3796992481203, + "grad_norm": 0.04511750862002373, + "learning_rate": 1.0935809887702154e-09, + "loss": 0.0112, + "step": 19960 + }, + { + "epoch": 9.384398496240602, + "grad_norm": 0.0870666578412056, + "learning_rate": 6.151402872134337e-10, + "loss": 0.0169, + "step": 19970 + }, + { + "epoch": 9.389097744360903, + "grad_norm": 0.2327416092157364, + "learning_rate": 2.7339599464326627e-10, + "loss": 0.0134, + "step": 19980 + }, + { + "epoch": 9.393796992481203, + "grad_norm": 0.06986179947853088, + "learning_rate": 6.834904537900144e-11, + "loss": 0.0133, + "step": 19990 + }, + { + "epoch": 9.398496240601503, + "grad_norm": 0.14632254838943481, + "learning_rate": 0.0, + "loss": 0.0165, + "step": 20000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.892274366138223e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}