{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 5370, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.055865921787709494, "grad_norm": 9.474554061889648, "learning_rate": 5.018587360594795e-06, "loss": 0.9647, "step": 10 }, { "epoch": 0.11173184357541899, "grad_norm": 2.904832124710083, "learning_rate": 1.0594795539033457e-05, "loss": 0.3646, "step": 20 }, { "epoch": 0.16759776536312848, "grad_norm": 2.1869466304779053, "learning_rate": 1.617100371747212e-05, "loss": 0.2295, "step": 30 }, { "epoch": 0.22346368715083798, "grad_norm": 1.0514620542526245, "learning_rate": 2.1747211895910778e-05, "loss": 0.1933, "step": 40 }, { "epoch": 0.27932960893854747, "grad_norm": 1.2703338861465454, "learning_rate": 2.732342007434944e-05, "loss": 0.16, "step": 50 }, { "epoch": 0.33519553072625696, "grad_norm": 1.139123797416687, "learning_rate": 3.28996282527881e-05, "loss": 0.1487, "step": 60 }, { "epoch": 0.39106145251396646, "grad_norm": 1.1433963775634766, "learning_rate": 3.8475836431226764e-05, "loss": 0.1484, "step": 70 }, { "epoch": 0.44692737430167595, "grad_norm": 1.1463357210159302, "learning_rate": 4.4052044609665426e-05, "loss": 0.1315, "step": 80 }, { "epoch": 0.5027932960893855, "grad_norm": 1.1672674417495728, "learning_rate": 4.962825278810409e-05, "loss": 0.1267, "step": 90 }, { "epoch": 0.5586592178770949, "grad_norm": 1.0597262382507324, "learning_rate": 5.520446096654275e-05, "loss": 0.1244, "step": 100 }, { "epoch": 0.6145251396648045, "grad_norm": 1.1038429737091064, "learning_rate": 6.0780669144981406e-05, "loss": 0.1147, "step": 110 }, { "epoch": 0.6703910614525139, "grad_norm": 0.6325312852859497, "learning_rate": 6.635687732342006e-05, "loss": 0.1192, "step": 120 }, { "epoch": 0.7262569832402235, "grad_norm": 1.0795389413833618, "learning_rate": 7.193308550185873e-05, "loss": 0.1105, "step": 130 }, { "epoch": 0.7821229050279329, "grad_norm": 0.7824896574020386, "learning_rate": 7.750929368029739e-05, "loss": 0.1035, "step": 140 }, { "epoch": 0.8379888268156425, "grad_norm": 1.3778011798858643, "learning_rate": 8.308550185873605e-05, "loss": 0.1069, "step": 150 }, { "epoch": 0.8938547486033519, "grad_norm": 0.8529579639434814, "learning_rate": 8.866171003717471e-05, "loss": 0.1022, "step": 160 }, { "epoch": 0.9497206703910615, "grad_norm": 1.441953182220459, "learning_rate": 9.423791821561338e-05, "loss": 0.0985, "step": 170 }, { "epoch": 1.005586592178771, "grad_norm": 0.7136903405189514, "learning_rate": 9.981412639405203e-05, "loss": 0.1026, "step": 180 }, { "epoch": 1.0614525139664805, "grad_norm": 1.4089112281799316, "learning_rate": 0.0001053903345724907, "loss": 0.0973, "step": 190 }, { "epoch": 1.1173184357541899, "grad_norm": 0.8666443228721619, "learning_rate": 0.00011096654275092937, "loss": 0.0899, "step": 200 }, { "epoch": 1.1731843575418994, "grad_norm": 0.6051232218742371, "learning_rate": 0.00011654275092936801, "loss": 0.0891, "step": 210 }, { "epoch": 1.229050279329609, "grad_norm": 0.5882899165153503, "learning_rate": 0.00012211895910780668, "loss": 0.0859, "step": 220 }, { "epoch": 1.2849162011173183, "grad_norm": 0.662291407585144, "learning_rate": 0.00012769516728624534, "loss": 0.0835, "step": 230 }, { "epoch": 1.3407821229050279, "grad_norm": 1.036959171295166, "learning_rate": 0.000133271375464684, "loss": 0.0801, "step": 240 }, { "epoch": 1.3966480446927374, "grad_norm": 0.5812122225761414, "learning_rate": 0.00013884758364312265, "loss": 0.0723, "step": 250 }, { "epoch": 1.452513966480447, "grad_norm": 0.799187183380127, "learning_rate": 0.00014442379182156133, "loss": 0.0701, "step": 260 }, { "epoch": 1.5083798882681565, "grad_norm": 1.0647534132003784, "learning_rate": 0.00015, "loss": 0.075, "step": 270 }, { "epoch": 1.564245810055866, "grad_norm": 0.9664549827575684, "learning_rate": 0.0001499985776090078, "loss": 0.0696, "step": 280 }, { "epoch": 1.6201117318435754, "grad_norm": 0.621644139289856, "learning_rate": 0.0001499943104899832, "loss": 0.0657, "step": 290 }, { "epoch": 1.675977653631285, "grad_norm": 0.6787592768669128, "learning_rate": 0.00014998719880477977, "loss": 0.0731, "step": 300 }, { "epoch": 1.7318435754189943, "grad_norm": 0.563049852848053, "learning_rate": 0.00014997724282314682, "loss": 0.0707, "step": 310 }, { "epoch": 1.7877094972067038, "grad_norm": 0.4710414409637451, "learning_rate": 0.00014996444292271892, "loss": 0.0657, "step": 320 }, { "epoch": 1.8435754189944134, "grad_norm": 0.46238359808921814, "learning_rate": 0.0001499487995890018, "loss": 0.0649, "step": 330 }, { "epoch": 1.899441340782123, "grad_norm": 0.4369705021381378, "learning_rate": 0.00014993031341535375, "loss": 0.0562, "step": 340 }, { "epoch": 1.9553072625698324, "grad_norm": 0.5079113841056824, "learning_rate": 0.0001499089851029632, "loss": 0.0553, "step": 350 }, { "epoch": 2.011173184357542, "grad_norm": 0.7058802247047424, "learning_rate": 0.00014988481546082222, "loss": 0.0551, "step": 360 }, { "epoch": 2.0670391061452515, "grad_norm": 0.42536088824272156, "learning_rate": 0.0001498578054056956, "loss": 0.0553, "step": 370 }, { "epoch": 2.122905027932961, "grad_norm": 0.5014165043830872, "learning_rate": 0.00014982795596208618, "loss": 0.0539, "step": 380 }, { "epoch": 2.17877094972067, "grad_norm": 0.6823521852493286, "learning_rate": 0.0001497952682621962, "loss": 0.0507, "step": 390 }, { "epoch": 2.2346368715083798, "grad_norm": 0.4654916822910309, "learning_rate": 0.00014975974354588395, "loss": 0.058, "step": 400 }, { "epoch": 2.2905027932960893, "grad_norm": 0.33408060669898987, "learning_rate": 0.00014972138316061715, "loss": 0.0484, "step": 410 }, { "epoch": 2.346368715083799, "grad_norm": 0.40119659900665283, "learning_rate": 0.00014968018856142154, "loss": 0.051, "step": 420 }, { "epoch": 2.4022346368715084, "grad_norm": 0.3584083616733551, "learning_rate": 0.00014963616131082584, "loss": 0.0494, "step": 430 }, { "epoch": 2.458100558659218, "grad_norm": 0.6725554466247559, "learning_rate": 0.00014958930307880242, "loss": 0.0572, "step": 440 }, { "epoch": 2.5139664804469275, "grad_norm": 0.582353413105011, "learning_rate": 0.00014953961564270404, "loss": 0.0495, "step": 450 }, { "epoch": 2.5698324022346366, "grad_norm": 0.6442360877990723, "learning_rate": 0.00014948710088719632, "loss": 0.0581, "step": 460 }, { "epoch": 2.6256983240223466, "grad_norm": 0.6121304631233215, "learning_rate": 0.00014943176080418633, "loss": 0.0528, "step": 470 }, { "epoch": 2.6815642458100557, "grad_norm": 0.6058341860771179, "learning_rate": 0.00014937359749274705, "loss": 0.0504, "step": 480 }, { "epoch": 2.7374301675977653, "grad_norm": 0.4608060121536255, "learning_rate": 0.00014931261315903765, "loss": 0.0501, "step": 490 }, { "epoch": 2.793296089385475, "grad_norm": 0.33580687642097473, "learning_rate": 0.00014924881011621992, "loss": 0.0441, "step": 500 }, { "epoch": 2.8491620111731844, "grad_norm": 0.5434100031852722, "learning_rate": 0.0001491821907843705, "loss": 0.0469, "step": 510 }, { "epoch": 2.905027932960894, "grad_norm": 0.7853328585624695, "learning_rate": 0.00014911275769038902, "loss": 0.0435, "step": 520 }, { "epoch": 2.9608938547486034, "grad_norm": 0.6744305491447449, "learning_rate": 0.0001490405134679024, "loss": 0.0477, "step": 530 }, { "epoch": 3.016759776536313, "grad_norm": 0.6475287079811096, "learning_rate": 0.00014896546085716475, "loss": 0.0468, "step": 540 }, { "epoch": 3.0726256983240225, "grad_norm": 0.5036594271659851, "learning_rate": 0.00014888760270495365, "loss": 0.044, "step": 550 }, { "epoch": 3.1284916201117317, "grad_norm": 0.4917600452899933, "learning_rate": 0.000148806941964462, "loss": 0.0478, "step": 560 }, { "epoch": 3.184357541899441, "grad_norm": 0.5577938556671143, "learning_rate": 0.0001487234816951861, "loss": 0.0464, "step": 570 }, { "epoch": 3.2402234636871508, "grad_norm": 0.452462375164032, "learning_rate": 0.0001486372250628095, "loss": 0.041, "step": 580 }, { "epoch": 3.2960893854748603, "grad_norm": 0.3954116702079773, "learning_rate": 0.00014854817533908313, "loss": 0.0437, "step": 590 }, { "epoch": 3.35195530726257, "grad_norm": 0.529547393321991, "learning_rate": 0.00014845633590170092, "loss": 0.0433, "step": 600 }, { "epoch": 3.4078212290502794, "grad_norm": 0.3528311848640442, "learning_rate": 0.00014836171023417191, "loss": 0.0365, "step": 610 }, { "epoch": 3.463687150837989, "grad_norm": 0.4728206992149353, "learning_rate": 0.00014826430192568807, "loss": 0.0457, "step": 620 }, { "epoch": 3.5195530726256985, "grad_norm": 0.6676551103591919, "learning_rate": 0.00014816411467098806, "loss": 0.045, "step": 630 }, { "epoch": 3.5754189944134076, "grad_norm": 0.3946695327758789, "learning_rate": 0.00014806115227021714, "loss": 0.0465, "step": 640 }, { "epoch": 3.631284916201117, "grad_norm": 0.6944833993911743, "learning_rate": 0.00014795541862878308, "loss": 0.0481, "step": 650 }, { "epoch": 3.6871508379888267, "grad_norm": 0.6339210271835327, "learning_rate": 0.00014784691775720807, "loss": 0.0394, "step": 660 }, { "epoch": 3.7430167597765363, "grad_norm": 0.47451841831207275, "learning_rate": 0.00014773565377097631, "loss": 0.0392, "step": 670 }, { "epoch": 3.798882681564246, "grad_norm": 0.3181611895561218, "learning_rate": 0.0001476216308903784, "loss": 0.0436, "step": 680 }, { "epoch": 3.8547486033519553, "grad_norm": 0.4814484715461731, "learning_rate": 0.00014750485344035071, "loss": 0.0383, "step": 690 }, { "epoch": 3.910614525139665, "grad_norm": 0.3436708450317383, "learning_rate": 0.00014738532585031178, "loss": 0.044, "step": 700 }, { "epoch": 3.9664804469273744, "grad_norm": 0.367412269115448, "learning_rate": 0.00014726305265399403, "loss": 0.0403, "step": 710 }, { "epoch": 4.022346368715084, "grad_norm": 0.23561592400074005, "learning_rate": 0.00014713803848927196, "loss": 0.0422, "step": 720 }, { "epoch": 4.078212290502793, "grad_norm": 0.33305707573890686, "learning_rate": 0.00014701028809798619, "loss": 0.0431, "step": 730 }, { "epoch": 4.134078212290503, "grad_norm": 0.43914180994033813, "learning_rate": 0.00014687980632576347, "loss": 0.0432, "step": 740 }, { "epoch": 4.189944134078212, "grad_norm": 0.41326114535331726, "learning_rate": 0.0001467465981218331, "loss": 0.0408, "step": 750 }, { "epoch": 4.245810055865922, "grad_norm": 0.3312179148197174, "learning_rate": 0.0001466106685388391, "loss": 0.0437, "step": 760 }, { "epoch": 4.301675977653631, "grad_norm": 0.24370786547660828, "learning_rate": 0.00014647202273264848, "loss": 0.0381, "step": 770 }, { "epoch": 4.35754189944134, "grad_norm": 0.3902948796749115, "learning_rate": 0.00014633066596215577, "loss": 0.039, "step": 780 }, { "epoch": 4.41340782122905, "grad_norm": 0.48275288939476013, "learning_rate": 0.0001461866035890836, "loss": 0.0395, "step": 790 }, { "epoch": 4.4692737430167595, "grad_norm": 0.45421525835990906, "learning_rate": 0.00014603984107777924, "loss": 0.0386, "step": 800 }, { "epoch": 4.5251396648044695, "grad_norm": 0.2560478150844574, "learning_rate": 0.00014589038399500726, "loss": 0.0375, "step": 810 }, { "epoch": 4.581005586592179, "grad_norm": 0.3527555465698242, "learning_rate": 0.0001457382380097386, "loss": 0.046, "step": 820 }, { "epoch": 4.636871508379889, "grad_norm": 0.5826641917228699, "learning_rate": 0.00014558340889293534, "loss": 0.0419, "step": 830 }, { "epoch": 4.692737430167598, "grad_norm": 0.5426205992698669, "learning_rate": 0.00014542590251733192, "loss": 0.0449, "step": 840 }, { "epoch": 4.748603351955307, "grad_norm": 0.4121716618537903, "learning_rate": 0.00014526572485721234, "loss": 0.0404, "step": 850 }, { "epoch": 4.804469273743017, "grad_norm": 0.41459986567497253, "learning_rate": 0.00014510288198818356, "loss": 0.0434, "step": 860 }, { "epoch": 4.860335195530726, "grad_norm": 0.42841342091560364, "learning_rate": 0.00014493738008694503, "loss": 0.0418, "step": 870 }, { "epoch": 4.916201117318436, "grad_norm": 0.3888803720474243, "learning_rate": 0.00014476922543105443, "loss": 0.0391, "step": 880 }, { "epoch": 4.972067039106145, "grad_norm": 0.2680191695690155, "learning_rate": 0.00014459842439868963, "loss": 0.0412, "step": 890 }, { "epoch": 5.027932960893855, "grad_norm": 0.3152766823768616, "learning_rate": 0.00014442498346840658, "loss": 0.0351, "step": 900 }, { "epoch": 5.083798882681564, "grad_norm": 0.3976776897907257, "learning_rate": 0.00014424890921889373, "loss": 0.0369, "step": 910 }, { "epoch": 5.139664804469274, "grad_norm": 0.4556487500667572, "learning_rate": 0.00014407020832872246, "loss": 0.0337, "step": 920 }, { "epoch": 5.195530726256983, "grad_norm": 0.368202269077301, "learning_rate": 0.00014388888757609376, "loss": 0.0396, "step": 930 }, { "epoch": 5.251396648044693, "grad_norm": 0.34850746393203735, "learning_rate": 0.00014370495383858107, "loss": 0.0361, "step": 940 }, { "epoch": 5.307262569832402, "grad_norm": 0.21902212500572205, "learning_rate": 0.00014351841409286954, "loss": 0.0332, "step": 950 }, { "epoch": 5.363128491620111, "grad_norm": 0.37780269980430603, "learning_rate": 0.00014332927541449122, "loss": 0.0359, "step": 960 }, { "epoch": 5.418994413407821, "grad_norm": 0.20398513972759247, "learning_rate": 0.00014313754497755687, "loss": 0.0394, "step": 970 }, { "epoch": 5.4748603351955305, "grad_norm": 0.2890792787075043, "learning_rate": 0.00014294323005448374, "loss": 0.042, "step": 980 }, { "epoch": 5.5307262569832405, "grad_norm": 0.3031609356403351, "learning_rate": 0.0001427463380157197, "loss": 0.0429, "step": 990 }, { "epoch": 5.58659217877095, "grad_norm": 0.5294051766395569, "learning_rate": 0.0001425468763294638, "loss": 0.0424, "step": 1000 }, { "epoch": 5.64245810055866, "grad_norm": 0.31394124031066895, "learning_rate": 0.00014234485256138277, "loss": 0.0455, "step": 1010 }, { "epoch": 5.698324022346369, "grad_norm": 0.5516133904457092, "learning_rate": 0.00014214027437432439, "loss": 0.0375, "step": 1020 }, { "epoch": 5.754189944134078, "grad_norm": 0.45591840147972107, "learning_rate": 0.00014193314952802645, "loss": 0.0361, "step": 1030 }, { "epoch": 5.810055865921788, "grad_norm": 0.41406169533729553, "learning_rate": 0.00014172348587882276, "loss": 0.0347, "step": 1040 }, { "epoch": 5.865921787709497, "grad_norm": 0.3035840094089508, "learning_rate": 0.00014151129137934492, "loss": 0.0354, "step": 1050 }, { "epoch": 5.921787709497207, "grad_norm": 0.3065755367279053, "learning_rate": 0.00014129657407822082, "loss": 0.0364, "step": 1060 }, { "epoch": 5.977653631284916, "grad_norm": 0.3852936029434204, "learning_rate": 0.0001410793421197692, "loss": 0.0352, "step": 1070 }, { "epoch": 6.033519553072626, "grad_norm": 0.3598945140838623, "learning_rate": 0.00014085960374369096, "loss": 0.0404, "step": 1080 }, { "epoch": 6.089385474860335, "grad_norm": 0.3946482539176941, "learning_rate": 0.00014063736728475634, "loss": 0.0312, "step": 1090 }, { "epoch": 6.145251396648045, "grad_norm": 0.4066295027732849, "learning_rate": 0.00014041264117248907, "loss": 0.041, "step": 1100 }, { "epoch": 6.201117318435754, "grad_norm": 0.2542637586593628, "learning_rate": 0.00014018543393084633, "loss": 0.0366, "step": 1110 }, { "epoch": 6.256983240223463, "grad_norm": 0.34908485412597656, "learning_rate": 0.0001399557541778958, "loss": 0.0358, "step": 1120 }, { "epoch": 6.312849162011173, "grad_norm": 0.39812198281288147, "learning_rate": 0.00013972361062548837, "loss": 0.0405, "step": 1130 }, { "epoch": 6.368715083798882, "grad_norm": 0.45712536573410034, "learning_rate": 0.00013948901207892807, "loss": 0.037, "step": 1140 }, { "epoch": 6.424581005586592, "grad_norm": 0.5260054469108582, "learning_rate": 0.0001392519674366377, "loss": 0.0381, "step": 1150 }, { "epoch": 6.4804469273743015, "grad_norm": 0.39201897382736206, "learning_rate": 0.00013901248568982177, "loss": 0.0366, "step": 1160 }, { "epoch": 6.5363128491620115, "grad_norm": 0.22905181348323822, "learning_rate": 0.00013877057592212496, "loss": 0.0397, "step": 1170 }, { "epoch": 6.592178770949721, "grad_norm": 0.4780491590499878, "learning_rate": 0.00013852624730928794, "loss": 0.0414, "step": 1180 }, { "epoch": 6.648044692737431, "grad_norm": 0.3810228705406189, "learning_rate": 0.00013827950911879922, "loss": 0.035, "step": 1190 }, { "epoch": 6.70391061452514, "grad_norm": 0.3878680467605591, "learning_rate": 0.0001380303707095436, "loss": 0.0365, "step": 1200 }, { "epoch": 6.759776536312849, "grad_norm": 0.3587484657764435, "learning_rate": 0.00013777884153144714, "loss": 0.0349, "step": 1210 }, { "epoch": 6.815642458100559, "grad_norm": 0.3499602675437927, "learning_rate": 0.00013752493112511888, "loss": 0.0418, "step": 1220 }, { "epoch": 6.871508379888268, "grad_norm": 0.3068997859954834, "learning_rate": 0.00013726864912148878, "loss": 0.0364, "step": 1230 }, { "epoch": 6.927374301675978, "grad_norm": 0.24322666227817535, "learning_rate": 0.00013701000524144252, "loss": 0.0413, "step": 1240 }, { "epoch": 6.983240223463687, "grad_norm": 0.27041393518447876, "learning_rate": 0.00013674900929545284, "loss": 0.0362, "step": 1250 }, { "epoch": 7.039106145251397, "grad_norm": 0.4791170358657837, "learning_rate": 0.00013648567118320723, "loss": 0.0364, "step": 1260 }, { "epoch": 7.094972067039106, "grad_norm": 0.39187201857566833, "learning_rate": 0.00013622000089323256, "loss": 0.0406, "step": 1270 }, { "epoch": 7.150837988826815, "grad_norm": 0.3253427743911743, "learning_rate": 0.00013595200850251627, "loss": 0.0337, "step": 1280 }, { "epoch": 7.206703910614525, "grad_norm": 0.2595740854740143, "learning_rate": 0.000135681704176124, "loss": 0.0367, "step": 1290 }, { "epoch": 7.262569832402234, "grad_norm": 0.4414283335208893, "learning_rate": 0.00013540909816681416, "loss": 0.0391, "step": 1300 }, { "epoch": 7.318435754189944, "grad_norm": 0.329851895570755, "learning_rate": 0.0001351342008146489, "loss": 0.0306, "step": 1310 }, { "epoch": 7.374301675977653, "grad_norm": 0.3009398877620697, "learning_rate": 0.000134857022546602, "loss": 0.032, "step": 1320 }, { "epoch": 7.430167597765363, "grad_norm": 0.670881986618042, "learning_rate": 0.00013457757387616348, "loss": 0.04, "step": 1330 }, { "epoch": 7.4860335195530725, "grad_norm": 0.38917240500450134, "learning_rate": 0.00013429586540294049, "loss": 0.0337, "step": 1340 }, { "epoch": 7.5418994413407825, "grad_norm": 0.3102369010448456, "learning_rate": 0.00013401190781225553, "loss": 0.031, "step": 1350 }, { "epoch": 7.597765363128492, "grad_norm": 0.37413033843040466, "learning_rate": 0.00013372571187474121, "loss": 0.0334, "step": 1360 }, { "epoch": 7.653631284916202, "grad_norm": 0.28566494584083557, "learning_rate": 0.0001334372884459314, "loss": 0.0284, "step": 1370 }, { "epoch": 7.709497206703911, "grad_norm": 0.35984960198402405, "learning_rate": 0.0001331466484658498, "loss": 0.028, "step": 1380 }, { "epoch": 7.76536312849162, "grad_norm": 0.41864386200904846, "learning_rate": 0.00013285380295859482, "loss": 0.0316, "step": 1390 }, { "epoch": 7.82122905027933, "grad_norm": 0.44935837388038635, "learning_rate": 0.00013255876303192138, "loss": 0.0341, "step": 1400 }, { "epoch": 7.877094972067039, "grad_norm": 0.2935531437397003, "learning_rate": 0.00013226153987681974, "loss": 0.035, "step": 1410 }, { "epoch": 7.932960893854749, "grad_norm": 0.5159788131713867, "learning_rate": 0.00013196214476709096, "loss": 0.0303, "step": 1420 }, { "epoch": 7.988826815642458, "grad_norm": 0.3987308144569397, "learning_rate": 0.00013166058905891918, "loss": 0.0342, "step": 1430 }, { "epoch": 8.044692737430168, "grad_norm": 0.35137230157852173, "learning_rate": 0.00013135688419044109, "loss": 0.0298, "step": 1440 }, { "epoch": 8.100558659217878, "grad_norm": 0.2903079092502594, "learning_rate": 0.00013105104168131178, "loss": 0.0293, "step": 1450 }, { "epoch": 8.156424581005586, "grad_norm": 0.26529935002326965, "learning_rate": 0.00013074307313226807, "loss": 0.0327, "step": 1460 }, { "epoch": 8.212290502793296, "grad_norm": 0.46690046787261963, "learning_rate": 0.0001304329902246884, "loss": 0.0336, "step": 1470 }, { "epoch": 8.268156424581006, "grad_norm": 0.37878528237342834, "learning_rate": 0.00013012080472014968, "loss": 0.0304, "step": 1480 }, { "epoch": 8.324022346368714, "grad_norm": 0.2849811315536499, "learning_rate": 0.00012980652845998123, "loss": 0.032, "step": 1490 }, { "epoch": 8.379888268156424, "grad_norm": 0.2862546741962433, "learning_rate": 0.0001294901733648156, "loss": 0.0286, "step": 1500 }, { "epoch": 8.435754189944134, "grad_norm": 0.21827903389930725, "learning_rate": 0.00012917175143413646, "loss": 0.0373, "step": 1510 }, { "epoch": 8.491620111731844, "grad_norm": 0.16881871223449707, "learning_rate": 0.00012885127474582348, "loss": 0.0334, "step": 1520 }, { "epoch": 8.547486033519553, "grad_norm": 0.5263266563415527, "learning_rate": 0.00012852875545569408, "loss": 0.037, "step": 1530 }, { "epoch": 8.603351955307263, "grad_norm": 0.41702142357826233, "learning_rate": 0.00012820420579704245, "loss": 0.0398, "step": 1540 }, { "epoch": 8.659217877094973, "grad_norm": 0.20871484279632568, "learning_rate": 0.00012787763808017565, "loss": 0.0321, "step": 1550 }, { "epoch": 8.71508379888268, "grad_norm": 0.4315951466560364, "learning_rate": 0.00012754906469194632, "loss": 0.0316, "step": 1560 }, { "epoch": 8.77094972067039, "grad_norm": 0.2898998558521271, "learning_rate": 0.0001272184980952833, "loss": 0.0314, "step": 1570 }, { "epoch": 8.8268156424581, "grad_norm": 0.4114074409008026, "learning_rate": 0.00012688595082871852, "loss": 0.0316, "step": 1580 }, { "epoch": 8.88268156424581, "grad_norm": 0.3064340353012085, "learning_rate": 0.00012655143550591164, "loss": 0.0335, "step": 1590 }, { "epoch": 8.938547486033519, "grad_norm": 0.2396443486213684, "learning_rate": 0.0001262149648151715, "loss": 0.0292, "step": 1600 }, { "epoch": 8.994413407821229, "grad_norm": 0.280146986246109, "learning_rate": 0.00012587655151897488, "loss": 0.031, "step": 1610 }, { "epoch": 9.050279329608939, "grad_norm": 0.2552490234375, "learning_rate": 0.00012553620845348246, "loss": 0.0249, "step": 1620 }, { "epoch": 9.106145251396647, "grad_norm": 0.1910940706729889, "learning_rate": 0.00012519394852805182, "loss": 0.0297, "step": 1630 }, { "epoch": 9.162011173184357, "grad_norm": 0.2914765477180481, "learning_rate": 0.0001248497847247479, "loss": 0.0321, "step": 1640 }, { "epoch": 9.217877094972067, "grad_norm": 0.34026339650154114, "learning_rate": 0.00012450373009785054, "loss": 0.0272, "step": 1650 }, { "epoch": 9.273743016759777, "grad_norm": 0.21475638449192047, "learning_rate": 0.00012415579777335932, "loss": 0.026, "step": 1660 }, { "epoch": 9.329608938547485, "grad_norm": 0.30112239718437195, "learning_rate": 0.00012380600094849566, "loss": 0.0374, "step": 1670 }, { "epoch": 9.385474860335195, "grad_norm": 0.17926353216171265, "learning_rate": 0.00012345435289120234, "loss": 0.026, "step": 1680 }, { "epoch": 9.441340782122905, "grad_norm": 0.16974708437919617, "learning_rate": 0.00012310086693964012, "loss": 0.0308, "step": 1690 }, { "epoch": 9.497206703910614, "grad_norm": 0.3298158347606659, "learning_rate": 0.00012274555650168187, "loss": 0.03, "step": 1700 }, { "epoch": 9.553072625698324, "grad_norm": 0.33263733983039856, "learning_rate": 0.000122388435054404, "loss": 0.03, "step": 1710 }, { "epoch": 9.608938547486034, "grad_norm": 0.5304810404777527, "learning_rate": 0.00012202951614357541, "loss": 0.028, "step": 1720 }, { "epoch": 9.664804469273744, "grad_norm": 0.2756952941417694, "learning_rate": 0.00012166881338314337, "loss": 0.0271, "step": 1730 }, { "epoch": 9.720670391061452, "grad_norm": 0.329500675201416, "learning_rate": 0.0001213063404547174, "loss": 0.0392, "step": 1740 }, { "epoch": 9.776536312849162, "grad_norm": 0.20280927419662476, "learning_rate": 0.0001209421111070503, "loss": 0.0293, "step": 1750 }, { "epoch": 9.832402234636872, "grad_norm": 0.26771217584609985, "learning_rate": 0.00012057613915551652, "loss": 0.0283, "step": 1760 }, { "epoch": 9.888268156424582, "grad_norm": 0.2704836428165436, "learning_rate": 0.00012020843848158826, "loss": 0.0295, "step": 1770 }, { "epoch": 9.94413407821229, "grad_norm": 0.34217214584350586, "learning_rate": 0.00011983902303230892, "loss": 0.0303, "step": 1780 }, { "epoch": 10.0, "grad_norm": 0.27921390533447266, "learning_rate": 0.00011946790681976399, "loss": 0.0292, "step": 1790 }, { "epoch": 10.05586592178771, "grad_norm": 0.2127024084329605, "learning_rate": 0.0001190951039205497, "loss": 0.0256, "step": 1800 }, { "epoch": 10.111731843575418, "grad_norm": 0.2055603712797165, "learning_rate": 0.00011872062847523899, "loss": 0.028, "step": 1810 }, { "epoch": 10.167597765363128, "grad_norm": 0.27974414825439453, "learning_rate": 0.00011834449468784523, "loss": 0.03, "step": 1820 }, { "epoch": 10.223463687150838, "grad_norm": 0.3269798159599304, "learning_rate": 0.00011796671682528334, "loss": 0.0287, "step": 1830 }, { "epoch": 10.279329608938548, "grad_norm": 0.3258807063102722, "learning_rate": 0.00011758730921682876, "loss": 0.0284, "step": 1840 }, { "epoch": 10.335195530726256, "grad_norm": 0.21946227550506592, "learning_rate": 0.00011720628625357392, "loss": 0.0306, "step": 1850 }, { "epoch": 10.391061452513966, "grad_norm": 0.3200410008430481, "learning_rate": 0.00011682366238788225, "loss": 0.0283, "step": 1860 }, { "epoch": 10.446927374301676, "grad_norm": 0.32745105028152466, "learning_rate": 0.0001164394521328402, "loss": 0.0269, "step": 1870 }, { "epoch": 10.502793296089386, "grad_norm": 0.30036798119544983, "learning_rate": 0.00011605367006170654, "loss": 0.0267, "step": 1880 }, { "epoch": 10.558659217877095, "grad_norm": 0.3197910785675049, "learning_rate": 0.0001156663308073598, "loss": 0.0257, "step": 1890 }, { "epoch": 10.614525139664805, "grad_norm": 0.23818261921405792, "learning_rate": 0.00011527744906174306, "loss": 0.0309, "step": 1900 }, { "epoch": 10.670391061452515, "grad_norm": 0.1711827963590622, "learning_rate": 0.00011488703957530675, "loss": 0.0275, "step": 1910 }, { "epoch": 10.726256983240223, "grad_norm": 0.3193713426589966, "learning_rate": 0.00011449511715644922, "loss": 0.0286, "step": 1920 }, { "epoch": 10.782122905027933, "grad_norm": 0.3227252662181854, "learning_rate": 0.00011410169667095494, "loss": 0.0274, "step": 1930 }, { "epoch": 10.837988826815643, "grad_norm": 0.27391719818115234, "learning_rate": 0.0001137067930414307, "loss": 0.0314, "step": 1940 }, { "epoch": 10.893854748603353, "grad_norm": 0.407885879278183, "learning_rate": 0.0001133104212467396, "loss": 0.0272, "step": 1950 }, { "epoch": 10.949720670391061, "grad_norm": 0.2611219584941864, "learning_rate": 0.0001129125963214328, "loss": 0.0266, "step": 1960 }, { "epoch": 11.005586592178771, "grad_norm": 0.335827499628067, "learning_rate": 0.00011251333335517944, "loss": 0.0273, "step": 1970 }, { "epoch": 11.061452513966481, "grad_norm": 0.24541901051998138, "learning_rate": 0.00011211264749219403, "loss": 0.0295, "step": 1980 }, { "epoch": 11.11731843575419, "grad_norm": 0.27788233757019043, "learning_rate": 0.00011171055393066224, "loss": 0.0254, "step": 1990 }, { "epoch": 11.1731843575419, "grad_norm": 0.23598261177539825, "learning_rate": 0.00011130706792216439, "loss": 0.0298, "step": 2000 }, { "epoch": 11.22905027932961, "grad_norm": 0.20312641561031342, "learning_rate": 0.0001109022047710968, "loss": 0.0278, "step": 2010 }, { "epoch": 11.28491620111732, "grad_norm": 0.2139887809753418, "learning_rate": 0.00011049597983409147, "loss": 0.0271, "step": 2020 }, { "epoch": 11.340782122905027, "grad_norm": 0.22644950449466705, "learning_rate": 0.00011008840851943348, "loss": 0.0298, "step": 2030 }, { "epoch": 11.396648044692737, "grad_norm": 0.20427192747592926, "learning_rate": 0.00010967950628647664, "loss": 0.03, "step": 2040 }, { "epoch": 11.452513966480447, "grad_norm": 0.4383834898471832, "learning_rate": 0.00010926928864505698, "loss": 0.0303, "step": 2050 }, { "epoch": 11.508379888268156, "grad_norm": 0.2548943758010864, "learning_rate": 0.00010885777115490463, "loss": 0.0288, "step": 2060 }, { "epoch": 11.564245810055866, "grad_norm": 0.2970632016658783, "learning_rate": 0.00010844496942505342, "loss": 0.0287, "step": 2070 }, { "epoch": 11.620111731843576, "grad_norm": 0.28128278255462646, "learning_rate": 0.00010803089911324907, "loss": 0.0298, "step": 2080 }, { "epoch": 11.675977653631286, "grad_norm": 0.2895059287548065, "learning_rate": 0.00010761557592535509, "loss": 0.0312, "step": 2090 }, { "epoch": 11.731843575418994, "grad_norm": 0.3408230245113373, "learning_rate": 0.00010719901561475706, "loss": 0.031, "step": 2100 }, { "epoch": 11.787709497206704, "grad_norm": 0.2545909881591797, "learning_rate": 0.00010678123398176526, "loss": 0.0274, "step": 2110 }, { "epoch": 11.843575418994414, "grad_norm": 0.2746656537055969, "learning_rate": 0.00010636224687301515, "loss": 0.0214, "step": 2120 }, { "epoch": 11.899441340782122, "grad_norm": 0.35347574949264526, "learning_rate": 0.00010594207018086647, "loss": 0.0306, "step": 2130 }, { "epoch": 11.955307262569832, "grad_norm": 0.2086307555437088, "learning_rate": 0.00010552071984280035, "loss": 0.0253, "step": 2140 }, { "epoch": 12.011173184357542, "grad_norm": 0.2846223711967468, "learning_rate": 0.00010509821184081479, "loss": 0.0267, "step": 2150 }, { "epoch": 12.067039106145252, "grad_norm": 0.26856729388237, "learning_rate": 0.00010467456220081847, "loss": 0.0218, "step": 2160 }, { "epoch": 12.12290502793296, "grad_norm": 0.24002467095851898, "learning_rate": 0.00010424978699202294, "loss": 0.0268, "step": 2170 }, { "epoch": 12.17877094972067, "grad_norm": 0.37470901012420654, "learning_rate": 0.00010382390232633298, "loss": 0.0258, "step": 2180 }, { "epoch": 12.23463687150838, "grad_norm": 0.3723151981830597, "learning_rate": 0.00010339692435773564, "loss": 0.0303, "step": 2190 }, { "epoch": 12.29050279329609, "grad_norm": 0.44261157512664795, "learning_rate": 0.00010296886928168728, "loss": 0.0302, "step": 2200 }, { "epoch": 12.346368715083798, "grad_norm": 0.23005405068397522, "learning_rate": 0.00010253975333449953, "loss": 0.0302, "step": 2210 }, { "epoch": 12.402234636871508, "grad_norm": 0.35475534200668335, "learning_rate": 0.00010210959279272327, "loss": 0.0305, "step": 2220 }, { "epoch": 12.458100558659218, "grad_norm": 0.1821463704109192, "learning_rate": 0.00010167840397253129, "loss": 0.0293, "step": 2230 }, { "epoch": 12.513966480446927, "grad_norm": 0.3717893958091736, "learning_rate": 0.00010124620322909939, "loss": 0.0265, "step": 2240 }, { "epoch": 12.569832402234637, "grad_norm": 0.35059571266174316, "learning_rate": 0.00010081300695598612, "loss": 0.0257, "step": 2250 }, { "epoch": 12.625698324022347, "grad_norm": 0.3039356470108032, "learning_rate": 0.00010037883158451083, "loss": 0.0288, "step": 2260 }, { "epoch": 12.681564245810057, "grad_norm": 0.49444425106048584, "learning_rate": 9.994369358313057e-05, "loss": 0.029, "step": 2270 }, { "epoch": 12.737430167597765, "grad_norm": 0.2746371626853943, "learning_rate": 9.950760945681525e-05, "loss": 0.0265, "step": 2280 }, { "epoch": 12.793296089385475, "grad_norm": 0.21077698469161987, "learning_rate": 9.907059574642177e-05, "loss": 0.0279, "step": 2290 }, { "epoch": 12.849162011173185, "grad_norm": 0.3581600487232208, "learning_rate": 9.863266902806654e-05, "loss": 0.0256, "step": 2300 }, { "epoch": 12.905027932960895, "grad_norm": 0.3539135754108429, "learning_rate": 9.81938459124967e-05, "loss": 0.0244, "step": 2310 }, { "epoch": 12.960893854748603, "grad_norm": 0.24212266504764557, "learning_rate": 9.775414304446024e-05, "loss": 0.0291, "step": 2320 }, { "epoch": 13.016759776536313, "grad_norm": 0.24756573140621185, "learning_rate": 9.731357710207442e-05, "loss": 0.0254, "step": 2330 }, { "epoch": 13.072625698324023, "grad_norm": 0.20405328273773193, "learning_rate": 9.687216479619328e-05, "loss": 0.0274, "step": 2340 }, { "epoch": 13.128491620111731, "grad_norm": 0.3089810013771057, "learning_rate": 9.64299228697739e-05, "loss": 0.0233, "step": 2350 }, { "epoch": 13.184357541899441, "grad_norm": 0.3249969780445099, "learning_rate": 9.598686809724109e-05, "loss": 0.0251, "step": 2360 }, { "epoch": 13.240223463687151, "grad_norm": 0.30293509364128113, "learning_rate": 9.554301728385133e-05, "loss": 0.0257, "step": 2370 }, { "epoch": 13.296089385474861, "grad_norm": 0.27823933959007263, "learning_rate": 9.509838726505527e-05, "loss": 0.0245, "step": 2380 }, { "epoch": 13.35195530726257, "grad_norm": 0.31522759795188904, "learning_rate": 9.465299490585914e-05, "loss": 0.0254, "step": 2390 }, { "epoch": 13.40782122905028, "grad_norm": 0.28887462615966797, "learning_rate": 9.420685710018505e-05, "loss": 0.0237, "step": 2400 }, { "epoch": 13.46368715083799, "grad_norm": 0.23696090281009674, "learning_rate": 9.375999077023024e-05, "loss": 0.0264, "step": 2410 }, { "epoch": 13.519553072625698, "grad_norm": 0.2132495492696762, "learning_rate": 9.331241286582515e-05, "loss": 0.0238, "step": 2420 }, { "epoch": 13.575418994413408, "grad_norm": 0.3045707046985626, "learning_rate": 9.28641403637906e-05, "loss": 0.0292, "step": 2430 }, { "epoch": 13.631284916201118, "grad_norm": 0.2944694459438324, "learning_rate": 9.241519026729374e-05, "loss": 0.0252, "step": 2440 }, { "epoch": 13.687150837988828, "grad_norm": 0.18707014620304108, "learning_rate": 9.19655796052032e-05, "loss": 0.0264, "step": 2450 }, { "epoch": 13.743016759776536, "grad_norm": 0.4140284061431885, "learning_rate": 9.151532543144313e-05, "loss": 0.0284, "step": 2460 }, { "epoch": 13.798882681564246, "grad_norm": 0.24894434213638306, "learning_rate": 9.106444482434638e-05, "loss": 0.0288, "step": 2470 }, { "epoch": 13.854748603351956, "grad_norm": 0.24449239671230316, "learning_rate": 9.061295488600659e-05, "loss": 0.0232, "step": 2480 }, { "epoch": 13.910614525139664, "grad_norm": 0.2831474244594574, "learning_rate": 9.016087274162972e-05, "loss": 0.0238, "step": 2490 }, { "epoch": 13.966480446927374, "grad_norm": 0.27097368240356445, "learning_rate": 8.970821553888428e-05, "loss": 0.0314, "step": 2500 }, { "epoch": 14.022346368715084, "grad_norm": 0.2659156024456024, "learning_rate": 8.925500044725104e-05, "loss": 0.0254, "step": 2510 }, { "epoch": 14.078212290502794, "grad_norm": 0.200448676943779, "learning_rate": 8.880124465737168e-05, "loss": 0.0242, "step": 2520 }, { "epoch": 14.134078212290502, "grad_norm": 0.21925762295722961, "learning_rate": 8.834696538039678e-05, "loss": 0.0238, "step": 2530 }, { "epoch": 14.189944134078212, "grad_norm": 0.2610517144203186, "learning_rate": 8.789217984733312e-05, "loss": 0.0222, "step": 2540 }, { "epoch": 14.245810055865922, "grad_norm": 0.29856961965560913, "learning_rate": 8.743690530838983e-05, "loss": 0.0267, "step": 2550 }, { "epoch": 14.30167597765363, "grad_norm": 0.22454680502414703, "learning_rate": 8.69811590323244e-05, "loss": 0.023, "step": 2560 }, { "epoch": 14.35754189944134, "grad_norm": 0.3834502398967743, "learning_rate": 8.652495830578744e-05, "loss": 0.0248, "step": 2570 }, { "epoch": 14.41340782122905, "grad_norm": 0.3710498511791229, "learning_rate": 8.6068320432667e-05, "loss": 0.0263, "step": 2580 }, { "epoch": 14.46927374301676, "grad_norm": 0.19465550780296326, "learning_rate": 8.561126273343239e-05, "loss": 0.0219, "step": 2590 }, { "epoch": 14.525139664804469, "grad_norm": 0.1909979283809662, "learning_rate": 8.515380254447703e-05, "loss": 0.0255, "step": 2600 }, { "epoch": 14.581005586592179, "grad_norm": 0.3466046452522278, "learning_rate": 8.4695957217461e-05, "loss": 0.0249, "step": 2610 }, { "epoch": 14.636871508379889, "grad_norm": 0.3462992012500763, "learning_rate": 8.42377441186528e-05, "loss": 0.0245, "step": 2620 }, { "epoch": 14.692737430167599, "grad_norm": 0.28605830669403076, "learning_rate": 8.377918062827068e-05, "loss": 0.0254, "step": 2630 }, { "epoch": 14.748603351955307, "grad_norm": 0.2495012879371643, "learning_rate": 8.332028413982336e-05, "loss": 0.0268, "step": 2640 }, { "epoch": 14.804469273743017, "grad_norm": 0.23009264469146729, "learning_rate": 8.286107205945036e-05, "loss": 0.0235, "step": 2650 }, { "epoch": 14.860335195530727, "grad_norm": 0.5348438024520874, "learning_rate": 8.240156180526173e-05, "loss": 0.0282, "step": 2660 }, { "epoch": 14.916201117318435, "grad_norm": 0.23143941164016724, "learning_rate": 8.194177080667739e-05, "loss": 0.0227, "step": 2670 }, { "epoch": 14.972067039106145, "grad_norm": 0.2876596450805664, "learning_rate": 8.148171650376595e-05, "loss": 0.0217, "step": 2680 }, { "epoch": 15.027932960893855, "grad_norm": 0.21062903106212616, "learning_rate": 8.102141634658339e-05, "loss": 0.0226, "step": 2690 }, { "epoch": 15.083798882681565, "grad_norm": 0.22216618061065674, "learning_rate": 8.056088779451088e-05, "loss": 0.0244, "step": 2700 }, { "epoch": 15.139664804469273, "grad_norm": 0.2092180848121643, "learning_rate": 8.010014831559288e-05, "loss": 0.0267, "step": 2710 }, { "epoch": 15.195530726256983, "grad_norm": 0.31914299726486206, "learning_rate": 7.963921538587422e-05, "loss": 0.0186, "step": 2720 }, { "epoch": 15.251396648044693, "grad_norm": 0.28938183188438416, "learning_rate": 7.917810648873758e-05, "loss": 0.0254, "step": 2730 }, { "epoch": 15.307262569832401, "grad_norm": 0.2850409746170044, "learning_rate": 7.871683911424e-05, "loss": 0.0254, "step": 2740 }, { "epoch": 15.363128491620111, "grad_norm": 0.3008069097995758, "learning_rate": 7.82554307584498e-05, "loss": 0.021, "step": 2750 }, { "epoch": 15.418994413407821, "grad_norm": 0.2249889373779297, "learning_rate": 7.779389892278258e-05, "loss": 0.0215, "step": 2760 }, { "epoch": 15.474860335195531, "grad_norm": 0.3198390305042267, "learning_rate": 7.733226111333775e-05, "loss": 0.0214, "step": 2770 }, { "epoch": 15.53072625698324, "grad_norm": 0.2054470032453537, "learning_rate": 7.687053484023428e-05, "loss": 0.0248, "step": 2780 }, { "epoch": 15.58659217877095, "grad_norm": 0.3050447106361389, "learning_rate": 7.640873761694661e-05, "loss": 0.0254, "step": 2790 }, { "epoch": 15.64245810055866, "grad_norm": 0.3348614275455475, "learning_rate": 7.594688695964039e-05, "loss": 0.0228, "step": 2800 }, { "epoch": 15.69832402234637, "grad_norm": 0.2211858332157135, "learning_rate": 7.548500038650796e-05, "loss": 0.0211, "step": 2810 }, { "epoch": 15.754189944134078, "grad_norm": 0.25668853521347046, "learning_rate": 7.502309541710404e-05, "loss": 0.0247, "step": 2820 }, { "epoch": 15.810055865921788, "grad_norm": 0.18035383522510529, "learning_rate": 7.456118957168111e-05, "loss": 0.022, "step": 2830 }, { "epoch": 15.865921787709498, "grad_norm": 0.3626658320426941, "learning_rate": 7.409930037052485e-05, "loss": 0.0239, "step": 2840 }, { "epoch": 15.921787709497206, "grad_norm": 0.21136699616909027, "learning_rate": 7.363744533328964e-05, "loss": 0.0222, "step": 2850 }, { "epoch": 15.977653631284916, "grad_norm": 0.35462382435798645, "learning_rate": 7.317564197833401e-05, "loss": 0.0213, "step": 2860 }, { "epoch": 16.033519553072626, "grad_norm": 0.19702385365962982, "learning_rate": 7.271390782205617e-05, "loss": 0.0245, "step": 2870 }, { "epoch": 16.089385474860336, "grad_norm": 0.2611335813999176, "learning_rate": 7.225226037822956e-05, "loss": 0.0249, "step": 2880 }, { "epoch": 16.145251396648046, "grad_norm": 0.3108862340450287, "learning_rate": 7.179071715733858e-05, "loss": 0.0238, "step": 2890 }, { "epoch": 16.201117318435756, "grad_norm": 0.3305630385875702, "learning_rate": 7.132929566591445e-05, "loss": 0.0254, "step": 2900 }, { "epoch": 16.256983240223462, "grad_norm": 0.29176512360572815, "learning_rate": 7.086801340587114e-05, "loss": 0.0224, "step": 2910 }, { "epoch": 16.312849162011172, "grad_norm": 0.33446812629699707, "learning_rate": 7.040688787384144e-05, "loss": 0.0223, "step": 2920 }, { "epoch": 16.368715083798882, "grad_norm": 0.19562651216983795, "learning_rate": 6.994593656051346e-05, "loss": 0.0254, "step": 2930 }, { "epoch": 16.424581005586592, "grad_norm": 0.21511122584342957, "learning_rate": 6.94851769499671e-05, "loss": 0.0226, "step": 2940 }, { "epoch": 16.480446927374302, "grad_norm": 0.23985400795936584, "learning_rate": 6.902462651901085e-05, "loss": 0.0269, "step": 2950 }, { "epoch": 16.536312849162012, "grad_norm": 0.3013317584991455, "learning_rate": 6.856430273651896e-05, "loss": 0.021, "step": 2960 }, { "epoch": 16.592178770949722, "grad_norm": 0.35089585185050964, "learning_rate": 6.810422306276884e-05, "loss": 0.0232, "step": 2970 }, { "epoch": 16.64804469273743, "grad_norm": 0.17514319717884064, "learning_rate": 6.764440494877874e-05, "loss": 0.0203, "step": 2980 }, { "epoch": 16.70391061452514, "grad_norm": 0.15833304822444916, "learning_rate": 6.718486583564577e-05, "loss": 0.0207, "step": 2990 }, { "epoch": 16.75977653631285, "grad_norm": 0.22394374012947083, "learning_rate": 6.672562315388446e-05, "loss": 0.0219, "step": 3000 }, { "epoch": 16.81564245810056, "grad_norm": 0.33897238969802856, "learning_rate": 6.62666943227656e-05, "loss": 0.0231, "step": 3010 }, { "epoch": 16.87150837988827, "grad_norm": 0.3397151529788971, "learning_rate": 6.580809674965549e-05, "loss": 0.0203, "step": 3020 }, { "epoch": 16.92737430167598, "grad_norm": 0.2227960079908371, "learning_rate": 6.53498478293556e-05, "loss": 0.0197, "step": 3030 }, { "epoch": 16.98324022346369, "grad_norm": 0.29340264201164246, "learning_rate": 6.489196494344294e-05, "loss": 0.0238, "step": 3040 }, { "epoch": 17.039106145251395, "grad_norm": 0.35238850116729736, "learning_rate": 6.44344654596106e-05, "loss": 0.0271, "step": 3050 }, { "epoch": 17.094972067039105, "grad_norm": 0.18793293833732605, "learning_rate": 6.397736673100918e-05, "loss": 0.021, "step": 3060 }, { "epoch": 17.150837988826815, "grad_norm": 0.25130748748779297, "learning_rate": 6.35206860955883e-05, "loss": 0.0249, "step": 3070 }, { "epoch": 17.206703910614525, "grad_norm": 0.2051066905260086, "learning_rate": 6.306444087543927e-05, "loss": 0.019, "step": 3080 }, { "epoch": 17.262569832402235, "grad_norm": 0.16360081732273102, "learning_rate": 6.260864837613788e-05, "loss": 0.0213, "step": 3090 }, { "epoch": 17.318435754189945, "grad_norm": 0.3169659674167633, "learning_rate": 6.215332588608793e-05, "loss": 0.018, "step": 3100 }, { "epoch": 17.374301675977655, "grad_norm": 0.24224449694156647, "learning_rate": 6.16984906758657e-05, "loss": 0.0194, "step": 3110 }, { "epoch": 17.43016759776536, "grad_norm": 0.3294720947742462, "learning_rate": 6.124415999756466e-05, "loss": 0.025, "step": 3120 }, { "epoch": 17.48603351955307, "grad_norm": 0.3010692298412323, "learning_rate": 6.079035108414123e-05, "loss": 0.019, "step": 3130 }, { "epoch": 17.54189944134078, "grad_norm": 0.1720583438873291, "learning_rate": 6.033708114876097e-05, "loss": 0.0197, "step": 3140 }, { "epoch": 17.59776536312849, "grad_norm": 0.13594307005405426, "learning_rate": 5.988436738414584e-05, "loss": 0.0208, "step": 3150 }, { "epoch": 17.6536312849162, "grad_norm": 0.2565588057041168, "learning_rate": 5.943222696192204e-05, "loss": 0.0246, "step": 3160 }, { "epoch": 17.70949720670391, "grad_norm": 0.1882692277431488, "learning_rate": 5.898067703196857e-05, "loss": 0.023, "step": 3170 }, { "epoch": 17.76536312849162, "grad_norm": 0.16729940474033356, "learning_rate": 5.852973472176685e-05, "loss": 0.0188, "step": 3180 }, { "epoch": 17.821229050279328, "grad_norm": 0.2922714948654175, "learning_rate": 5.807941713575098e-05, "loss": 0.0226, "step": 3190 }, { "epoch": 17.877094972067038, "grad_norm": 0.33513203263282776, "learning_rate": 5.762974135465907e-05, "loss": 0.0234, "step": 3200 }, { "epoch": 17.932960893854748, "grad_norm": 0.2364656627178192, "learning_rate": 5.7180724434885294e-05, "loss": 0.0195, "step": 3210 }, { "epoch": 17.988826815642458, "grad_norm": 0.23285451531410217, "learning_rate": 5.673238340783285e-05, "loss": 0.0193, "step": 3220 }, { "epoch": 18.044692737430168, "grad_norm": 0.20390582084655762, "learning_rate": 5.6284735279268134e-05, "loss": 0.0206, "step": 3230 }, { "epoch": 18.100558659217878, "grad_norm": 0.3086446225643158, "learning_rate": 5.583779702867555e-05, "loss": 0.0201, "step": 3240 }, { "epoch": 18.156424581005588, "grad_norm": 0.17210273444652557, "learning_rate": 5.5391585608613514e-05, "loss": 0.0231, "step": 3250 }, { "epoch": 18.212290502793294, "grad_norm": 0.18998980522155762, "learning_rate": 5.4946117944071514e-05, "loss": 0.0203, "step": 3260 }, { "epoch": 18.268156424581004, "grad_norm": 0.2326553910970688, "learning_rate": 5.450141093182803e-05, "loss": 0.0194, "step": 3270 }, { "epoch": 18.324022346368714, "grad_norm": 0.24682560563087463, "learning_rate": 5.405748143980972e-05, "loss": 0.0208, "step": 3280 }, { "epoch": 18.379888268156424, "grad_norm": 0.2583821415901184, "learning_rate": 5.361434630645148e-05, "loss": 0.0204, "step": 3290 }, { "epoch": 18.435754189944134, "grad_norm": 0.23446252942085266, "learning_rate": 5.31720223400579e-05, "loss": 0.0176, "step": 3300 }, { "epoch": 18.491620111731844, "grad_norm": 0.2943982779979706, "learning_rate": 5.2730526318165684e-05, "loss": 0.0192, "step": 3310 }, { "epoch": 18.547486033519554, "grad_norm": 0.2999999225139618, "learning_rate": 5.2289874986907236e-05, "loss": 0.0204, "step": 3320 }, { "epoch": 18.60335195530726, "grad_norm": 0.21415217220783234, "learning_rate": 5.1850085060375417e-05, "loss": 0.0201, "step": 3330 }, { "epoch": 18.65921787709497, "grad_norm": 0.2961329221725464, "learning_rate": 5.1411173219989736e-05, "loss": 0.0199, "step": 3340 }, { "epoch": 18.71508379888268, "grad_norm": 0.25467586517333984, "learning_rate": 5.097315611386345e-05, "loss": 0.021, "step": 3350 }, { "epoch": 18.77094972067039, "grad_norm": 0.2654760777950287, "learning_rate": 5.0536050356172234e-05, "loss": 0.0171, "step": 3360 }, { "epoch": 18.8268156424581, "grad_norm": 0.279527485370636, "learning_rate": 5.0099872526523826e-05, "loss": 0.0205, "step": 3370 }, { "epoch": 18.88268156424581, "grad_norm": 0.41244757175445557, "learning_rate": 4.966463916932934e-05, "loss": 0.0162, "step": 3380 }, { "epoch": 18.93854748603352, "grad_norm": 0.25530514121055603, "learning_rate": 4.9230366793175654e-05, "loss": 0.0181, "step": 3390 }, { "epoch": 18.994413407821227, "grad_norm": 0.18296165764331818, "learning_rate": 4.879707187019914e-05, "loss": 0.0229, "step": 3400 }, { "epoch": 19.050279329608937, "grad_norm": 0.18316328525543213, "learning_rate": 4.8364770835461054e-05, "loss": 0.0195, "step": 3410 }, { "epoch": 19.106145251396647, "grad_norm": 0.1494152843952179, "learning_rate": 4.793348008632396e-05, "loss": 0.0167, "step": 3420 }, { "epoch": 19.162011173184357, "grad_norm": 0.3076527416706085, "learning_rate": 4.750321598182995e-05, "loss": 0.0177, "step": 3430 }, { "epoch": 19.217877094972067, "grad_norm": 0.27527594566345215, "learning_rate": 4.707399484207995e-05, "loss": 0.0216, "step": 3440 }, { "epoch": 19.273743016759777, "grad_norm": 0.16726377606391907, "learning_rate": 4.664583294761485e-05, "loss": 0.0211, "step": 3450 }, { "epoch": 19.329608938547487, "grad_norm": 0.2316291779279709, "learning_rate": 4.6218746538797926e-05, "loss": 0.0197, "step": 3460 }, { "epoch": 19.385474860335197, "grad_norm": 0.2616920471191406, "learning_rate": 4.579275181519879e-05, "loss": 0.0193, "step": 3470 }, { "epoch": 19.441340782122904, "grad_norm": 0.17275288701057434, "learning_rate": 4.536786493497894e-05, "loss": 0.0186, "step": 3480 }, { "epoch": 19.497206703910614, "grad_norm": 0.18898899853229523, "learning_rate": 4.494410201427896e-05, "loss": 0.0198, "step": 3490 }, { "epoch": 19.553072625698324, "grad_norm": 0.3262523412704468, "learning_rate": 4.452147912660715e-05, "loss": 0.0212, "step": 3500 }, { "epoch": 19.608938547486034, "grad_norm": 0.1693217158317566, "learning_rate": 4.4100012302229915e-05, "loss": 0.0203, "step": 3510 }, { "epoch": 19.664804469273744, "grad_norm": 0.1959124356508255, "learning_rate": 4.3679717527563576e-05, "loss": 0.0184, "step": 3520 }, { "epoch": 19.720670391061454, "grad_norm": 0.21138575673103333, "learning_rate": 4.326061074456817e-05, "loss": 0.0172, "step": 3530 }, { "epoch": 19.776536312849164, "grad_norm": 0.6261563301086426, "learning_rate": 4.2842707850142806e-05, "loss": 0.018, "step": 3540 }, { "epoch": 19.83240223463687, "grad_norm": 0.2905479073524475, "learning_rate": 4.242602469552234e-05, "loss": 0.017, "step": 3550 }, { "epoch": 19.88826815642458, "grad_norm": 0.4615681767463684, "learning_rate": 4.201057708567664e-05, "loss": 0.0206, "step": 3560 }, { "epoch": 19.94413407821229, "grad_norm": 0.1858612447977066, "learning_rate": 4.1596380778710686e-05, "loss": 0.0193, "step": 3570 }, { "epoch": 20.0, "grad_norm": 0.21462394297122955, "learning_rate": 4.1183451485267034e-05, "loss": 0.0175, "step": 3580 }, { "epoch": 20.05586592178771, "grad_norm": 0.2793791890144348, "learning_rate": 4.077180486792987e-05, "loss": 0.0166, "step": 3590 }, { "epoch": 20.11173184357542, "grad_norm": 0.22271260619163513, "learning_rate": 4.036145654063093e-05, "loss": 0.0146, "step": 3600 }, { "epoch": 20.16759776536313, "grad_norm": 0.15600596368312836, "learning_rate": 3.995242206805731e-05, "loss": 0.0193, "step": 3610 }, { "epoch": 20.223463687150836, "grad_norm": 0.20925801992416382, "learning_rate": 3.9544716965060996e-05, "loss": 0.0209, "step": 3620 }, { "epoch": 20.279329608938546, "grad_norm": 0.16699257493019104, "learning_rate": 3.9138356696070315e-05, "loss": 0.0136, "step": 3630 }, { "epoch": 20.335195530726256, "grad_norm": 0.22640997171401978, "learning_rate": 3.8733356674503676e-05, "loss": 0.0161, "step": 3640 }, { "epoch": 20.391061452513966, "grad_norm": 0.5254058837890625, "learning_rate": 3.832973226218457e-05, "loss": 0.0176, "step": 3650 }, { "epoch": 20.446927374301676, "grad_norm": 0.148833766579628, "learning_rate": 3.792749876875907e-05, "loss": 0.0147, "step": 3660 }, { "epoch": 20.502793296089386, "grad_norm": 0.26952382922172546, "learning_rate": 3.752667145111514e-05, "loss": 0.0193, "step": 3670 }, { "epoch": 20.558659217877096, "grad_norm": 0.24985520541667938, "learning_rate": 3.712726551280385e-05, "loss": 0.0164, "step": 3680 }, { "epoch": 20.614525139664803, "grad_norm": 0.12173712253570557, "learning_rate": 3.6729296103462835e-05, "loss": 0.016, "step": 3690 }, { "epoch": 20.670391061452513, "grad_norm": 0.24871109426021576, "learning_rate": 3.63327783182414e-05, "loss": 0.0174, "step": 3700 }, { "epoch": 20.726256983240223, "grad_norm": 0.21813338994979858, "learning_rate": 3.593772719722832e-05, "loss": 0.0175, "step": 3710 }, { "epoch": 20.782122905027933, "grad_norm": 0.15502704679965973, "learning_rate": 3.5544157724881045e-05, "loss": 0.0171, "step": 3720 }, { "epoch": 20.837988826815643, "grad_norm": 0.2693827152252197, "learning_rate": 3.515208482945758e-05, "loss": 0.0184, "step": 3730 }, { "epoch": 20.893854748603353, "grad_norm": 0.2545803487300873, "learning_rate": 3.476152338244995e-05, "loss": 0.0211, "step": 3740 }, { "epoch": 20.949720670391063, "grad_norm": 0.14349234104156494, "learning_rate": 3.437248819802042e-05, "loss": 0.016, "step": 3750 }, { "epoch": 21.00558659217877, "grad_norm": 0.42411503195762634, "learning_rate": 3.3984994032439496e-05, "loss": 0.0178, "step": 3760 }, { "epoch": 21.06145251396648, "grad_norm": 0.25866132974624634, "learning_rate": 3.359905558352609e-05, "loss": 0.0165, "step": 3770 }, { "epoch": 21.11731843575419, "grad_norm": 0.22836929559707642, "learning_rate": 3.321468749009017e-05, "loss": 0.0196, "step": 3780 }, { "epoch": 21.1731843575419, "grad_norm": 0.1367328017950058, "learning_rate": 3.283190433137742e-05, "loss": 0.0177, "step": 3790 }, { "epoch": 21.22905027932961, "grad_norm": 0.16710641980171204, "learning_rate": 3.2450720626516294e-05, "loss": 0.0153, "step": 3800 }, { "epoch": 21.28491620111732, "grad_norm": 0.4859740734100342, "learning_rate": 3.207115083396728e-05, "loss": 0.0166, "step": 3810 }, { "epoch": 21.34078212290503, "grad_norm": 0.2656506299972534, "learning_rate": 3.1693209350974466e-05, "loss": 0.0174, "step": 3820 }, { "epoch": 21.39664804469274, "grad_norm": 0.2200634777545929, "learning_rate": 3.131691051301952e-05, "loss": 0.0192, "step": 3830 }, { "epoch": 21.452513966480446, "grad_norm": 0.15455004572868347, "learning_rate": 3.09422685932778e-05, "loss": 0.0185, "step": 3840 }, { "epoch": 21.508379888268156, "grad_norm": 0.32110223174095154, "learning_rate": 3.056929780207711e-05, "loss": 0.0178, "step": 3850 }, { "epoch": 21.564245810055866, "grad_norm": 0.20172889530658722, "learning_rate": 3.0198012286358618e-05, "loss": 0.0154, "step": 3860 }, { "epoch": 21.620111731843576, "grad_norm": 0.1673143357038498, "learning_rate": 2.9828426129140237e-05, "loss": 0.0173, "step": 3870 }, { "epoch": 21.675977653631286, "grad_norm": 0.13557426631450653, "learning_rate": 2.94605533489826e-05, "loss": 0.0146, "step": 3880 }, { "epoch": 21.731843575418996, "grad_norm": 0.11623450368642807, "learning_rate": 2.9094407899457046e-05, "loss": 0.0139, "step": 3890 }, { "epoch": 21.787709497206706, "grad_norm": 0.16807802021503448, "learning_rate": 2.8730003668616698e-05, "loss": 0.0135, "step": 3900 }, { "epoch": 21.843575418994412, "grad_norm": 0.37868157029151917, "learning_rate": 2.8367354478469396e-05, "loss": 0.0151, "step": 3910 }, { "epoch": 21.899441340782122, "grad_norm": 0.29837918281555176, "learning_rate": 2.8006474084453595e-05, "loss": 0.0151, "step": 3920 }, { "epoch": 21.955307262569832, "grad_norm": 0.24290640652179718, "learning_rate": 2.764737617491653e-05, "loss": 0.0141, "step": 3930 }, { "epoch": 22.011173184357542, "grad_norm": 0.1992618888616562, "learning_rate": 2.729007437059502e-05, "loss": 0.0145, "step": 3940 }, { "epoch": 22.067039106145252, "grad_norm": 0.13898395001888275, "learning_rate": 2.6934582224098963e-05, "loss": 0.0145, "step": 3950 }, { "epoch": 22.122905027932962, "grad_norm": 0.257982075214386, "learning_rate": 2.658091321939697e-05, "loss": 0.0144, "step": 3960 }, { "epoch": 22.178770949720672, "grad_norm": 0.1655452847480774, "learning_rate": 2.6229080771305293e-05, "loss": 0.0141, "step": 3970 }, { "epoch": 22.23463687150838, "grad_norm": 0.15178877115249634, "learning_rate": 2.5879098224978706e-05, "loss": 0.0181, "step": 3980 }, { "epoch": 22.29050279329609, "grad_norm": 0.12818969786167145, "learning_rate": 2.5530978855404448e-05, "loss": 0.0112, "step": 3990 }, { "epoch": 22.3463687150838, "grad_norm": 0.18601444363594055, "learning_rate": 2.5184735866898657e-05, "loss": 0.0131, "step": 4000 }, { "epoch": 22.40223463687151, "grad_norm": 0.2364937663078308, "learning_rate": 2.4840382392605515e-05, "loss": 0.0169, "step": 4010 }, { "epoch": 22.45810055865922, "grad_norm": 0.1969766914844513, "learning_rate": 2.449793149399922e-05, "loss": 0.0117, "step": 4020 }, { "epoch": 22.51396648044693, "grad_norm": 0.31158965826034546, "learning_rate": 2.4157396160388358e-05, "loss": 0.0164, "step": 4030 }, { "epoch": 22.56983240223464, "grad_norm": 0.17856691777706146, "learning_rate": 2.381878930842337e-05, "loss": 0.015, "step": 4040 }, { "epoch": 22.625698324022345, "grad_norm": 0.2636869251728058, "learning_rate": 2.3482123781606547e-05, "loss": 0.0185, "step": 4050 }, { "epoch": 22.681564245810055, "grad_norm": 0.12141893804073334, "learning_rate": 2.3147412349804862e-05, "loss": 0.014, "step": 4060 }, { "epoch": 22.737430167597765, "grad_norm": 0.22012671828269958, "learning_rate": 2.2814667708765745e-05, "loss": 0.0162, "step": 4070 }, { "epoch": 22.793296089385475, "grad_norm": 0.16044989228248596, "learning_rate": 2.2483902479635238e-05, "loss": 0.0131, "step": 4080 }, { "epoch": 22.849162011173185, "grad_norm": 0.15903018414974213, "learning_rate": 2.2155129208479606e-05, "loss": 0.0152, "step": 4090 }, { "epoch": 22.905027932960895, "grad_norm": 0.1807982474565506, "learning_rate": 2.182836036580923e-05, "loss": 0.0152, "step": 4100 }, { "epoch": 22.960893854748605, "grad_norm": 0.2643374502658844, "learning_rate": 2.1503608346105598e-05, "loss": 0.0171, "step": 4110 }, { "epoch": 23.01675977653631, "grad_norm": 0.13949626684188843, "learning_rate": 2.118088546735138e-05, "loss": 0.0132, "step": 4120 }, { "epoch": 23.07262569832402, "grad_norm": 0.6042327880859375, "learning_rate": 2.0860203970562954e-05, "loss": 0.0146, "step": 4130 }, { "epoch": 23.12849162011173, "grad_norm": 0.28848835825920105, "learning_rate": 2.054157601932631e-05, "loss": 0.0157, "step": 4140 }, { "epoch": 23.18435754189944, "grad_norm": 0.5369081497192383, "learning_rate": 2.0225013699335435e-05, "loss": 0.0128, "step": 4150 }, { "epoch": 23.24022346368715, "grad_norm": 0.15896014869213104, "learning_rate": 1.9910529017934196e-05, "loss": 0.0171, "step": 4160 }, { "epoch": 23.29608938547486, "grad_norm": 0.22632993757724762, "learning_rate": 1.9598133903660642e-05, "loss": 0.0117, "step": 4170 }, { "epoch": 23.35195530726257, "grad_norm": 0.14386065304279327, "learning_rate": 1.9287840205794693e-05, "loss": 0.0148, "step": 4180 }, { "epoch": 23.407821229050278, "grad_norm": 0.2815168797969818, "learning_rate": 1.8979659693908616e-05, "loss": 0.0143, "step": 4190 }, { "epoch": 23.463687150837988, "grad_norm": 0.17017313838005066, "learning_rate": 1.867360405742066e-05, "loss": 0.0126, "step": 4200 }, { "epoch": 23.519553072625698, "grad_norm": 0.2865433990955353, "learning_rate": 1.8369684905151677e-05, "loss": 0.0129, "step": 4210 }, { "epoch": 23.575418994413408, "grad_norm": 0.2120775431394577, "learning_rate": 1.806791376488467e-05, "loss": 0.0111, "step": 4220 }, { "epoch": 23.631284916201118, "grad_norm": 0.2156466841697693, "learning_rate": 1.7768302082927673e-05, "loss": 0.0134, "step": 4230 }, { "epoch": 23.687150837988828, "grad_norm": 0.13205291330814362, "learning_rate": 1.74708612236796e-05, "loss": 0.0116, "step": 4240 }, { "epoch": 23.743016759776538, "grad_norm": 0.1907736212015152, "learning_rate": 1.7175602469199075e-05, "loss": 0.0107, "step": 4250 }, { "epoch": 23.798882681564244, "grad_norm": 0.19241803884506226, "learning_rate": 1.6882537018776567e-05, "loss": 0.0136, "step": 4260 }, { "epoch": 23.854748603351954, "grad_norm": 0.26086995005607605, "learning_rate": 1.6591675988509588e-05, "loss": 0.0169, "step": 4270 }, { "epoch": 23.910614525139664, "grad_norm": 0.1134791448712349, "learning_rate": 1.6303030410881034e-05, "loss": 0.0126, "step": 4280 }, { "epoch": 23.966480446927374, "grad_norm": 0.19399867951869965, "learning_rate": 1.601661123434082e-05, "loss": 0.0146, "step": 4290 }, { "epoch": 24.022346368715084, "grad_norm": 0.218746617436409, "learning_rate": 1.5732429322890384e-05, "loss": 0.0196, "step": 4300 }, { "epoch": 24.078212290502794, "grad_norm": 0.173094242811203, "learning_rate": 1.5450495455670868e-05, "loss": 0.0131, "step": 4310 }, { "epoch": 24.134078212290504, "grad_norm": 0.28099843859672546, "learning_rate": 1.5170820326554087e-05, "loss": 0.0125, "step": 4320 }, { "epoch": 24.189944134078214, "grad_norm": 0.2801670432090759, "learning_rate": 1.489341454373694e-05, "loss": 0.0129, "step": 4330 }, { "epoch": 24.24581005586592, "grad_norm": 0.21972441673278809, "learning_rate": 1.4618288629339079e-05, "loss": 0.016, "step": 4340 }, { "epoch": 24.30167597765363, "grad_norm": 0.132466122508049, "learning_rate": 1.4345453019003745e-05, "loss": 0.0134, "step": 4350 }, { "epoch": 24.35754189944134, "grad_norm": 0.1710919737815857, "learning_rate": 1.4074918061502045e-05, "loss": 0.0131, "step": 4360 }, { "epoch": 24.41340782122905, "grad_norm": 0.2302296757698059, "learning_rate": 1.3806694018340223e-05, "loss": 0.0138, "step": 4370 }, { "epoch": 24.46927374301676, "grad_norm": 0.34001755714416504, "learning_rate": 1.3540791063370673e-05, "loss": 0.0132, "step": 4380 }, { "epoch": 24.52513966480447, "grad_norm": 0.18118420243263245, "learning_rate": 1.3277219282405858e-05, "loss": 0.0114, "step": 4390 }, { "epoch": 24.58100558659218, "grad_norm": 0.2221328318119049, "learning_rate": 1.3015988672835823e-05, "loss": 0.0135, "step": 4400 }, { "epoch": 24.636871508379887, "grad_norm": 0.2257193773984909, "learning_rate": 1.2757109143249002e-05, "loss": 0.0155, "step": 4410 }, { "epoch": 24.692737430167597, "grad_norm": 0.3096782863140106, "learning_rate": 1.2500590513056333e-05, "loss": 0.0099, "step": 4420 }, { "epoch": 24.748603351955307, "grad_norm": 0.20636042952537537, "learning_rate": 1.2246442512118886e-05, "loss": 0.0124, "step": 4430 }, { "epoch": 24.804469273743017, "grad_norm": 0.24674853682518005, "learning_rate": 1.199467478037871e-05, "loss": 0.0134, "step": 4440 }, { "epoch": 24.860335195530727, "grad_norm": 0.3628787100315094, "learning_rate": 1.1745296867493226e-05, "loss": 0.0168, "step": 4450 }, { "epoch": 24.916201117318437, "grad_norm": 0.17266486585140228, "learning_rate": 1.1498318232473033e-05, "loss": 0.0112, "step": 4460 }, { "epoch": 24.972067039106147, "grad_norm": 0.20120470225811005, "learning_rate": 1.1253748243323055e-05, "loss": 0.0126, "step": 4470 }, { "epoch": 25.027932960893853, "grad_norm": 0.30043306946754456, "learning_rate": 1.1011596176687343e-05, "loss": 0.011, "step": 4480 }, { "epoch": 25.083798882681563, "grad_norm": 0.23731638491153717, "learning_rate": 1.0771871217496971e-05, "loss": 0.0164, "step": 4490 }, { "epoch": 25.139664804469273, "grad_norm": 0.1487792432308197, "learning_rate": 1.0534582458621932e-05, "loss": 0.0171, "step": 4500 }, { "epoch": 25.195530726256983, "grad_norm": 0.16674961149692535, "learning_rate": 1.0299738900526022e-05, "loss": 0.0118, "step": 4510 }, { "epoch": 25.251396648044693, "grad_norm": 0.1953108161687851, "learning_rate": 1.0067349450925548e-05, "loss": 0.0141, "step": 4520 }, { "epoch": 25.307262569832403, "grad_norm": 0.2785939872264862, "learning_rate": 9.837422924451454e-06, "loss": 0.013, "step": 4530 }, { "epoch": 25.363128491620113, "grad_norm": 0.18461591005325317, "learning_rate": 9.609968042314908e-06, "loss": 0.0101, "step": 4540 }, { "epoch": 25.41899441340782, "grad_norm": 0.195012629032135, "learning_rate": 9.384993431976647e-06, "loss": 0.0142, "step": 4550 }, { "epoch": 25.47486033519553, "grad_norm": 0.11832192540168762, "learning_rate": 9.162507626819516e-06, "loss": 0.0118, "step": 4560 }, { "epoch": 25.53072625698324, "grad_norm": 0.24214394390583038, "learning_rate": 8.942519065825041e-06, "loss": 0.0159, "step": 4570 }, { "epoch": 25.58659217877095, "grad_norm": 0.18261413276195526, "learning_rate": 8.725036093253124e-06, "loss": 0.011, "step": 4580 }, { "epoch": 25.64245810055866, "grad_norm": 0.1193467229604721, "learning_rate": 8.510066958325679e-06, "loss": 0.0109, "step": 4590 }, { "epoch": 25.69832402234637, "grad_norm": 0.25043681263923645, "learning_rate": 8.297619814913667e-06, "loss": 0.0124, "step": 4600 }, { "epoch": 25.75418994413408, "grad_norm": 0.17554740607738495, "learning_rate": 8.087702721227812e-06, "loss": 0.017, "step": 4610 }, { "epoch": 25.810055865921786, "grad_norm": 0.13422702252864838, "learning_rate": 7.880323639513029e-06, "loss": 0.0122, "step": 4620 }, { "epoch": 25.865921787709496, "grad_norm": 0.16326576471328735, "learning_rate": 7.675490435746326e-06, "loss": 0.0106, "step": 4630 }, { "epoch": 25.921787709497206, "grad_norm": 0.17265120148658752, "learning_rate": 7.473210879338462e-06, "loss": 0.0123, "step": 4640 }, { "epoch": 25.977653631284916, "grad_norm": 0.18150538206100464, "learning_rate": 7.273492642839273e-06, "loss": 0.0117, "step": 4650 }, { "epoch": 26.033519553072626, "grad_norm": 0.2668885290622711, "learning_rate": 7.0763433016466535e-06, "loss": 0.0125, "step": 4660 }, { "epoch": 26.089385474860336, "grad_norm": 0.20581719279289246, "learning_rate": 6.881770333719177e-06, "loss": 0.0148, "step": 4670 }, { "epoch": 26.145251396648046, "grad_norm": 0.15722183883190155, "learning_rate": 6.689781119292487e-06, "loss": 0.0124, "step": 4680 }, { "epoch": 26.201117318435756, "grad_norm": 0.1610933542251587, "learning_rate": 6.5003829405993885e-06, "loss": 0.011, "step": 4690 }, { "epoch": 26.256983240223462, "grad_norm": 0.2166851907968521, "learning_rate": 6.3135829815935736e-06, "loss": 0.0135, "step": 4700 }, { "epoch": 26.312849162011172, "grad_norm": 0.21534748375415802, "learning_rate": 6.1293883276770985e-06, "loss": 0.0092, "step": 4710 }, { "epoch": 26.368715083798882, "grad_norm": 0.1989695280790329, "learning_rate": 5.947805965431792e-06, "loss": 0.0127, "step": 4720 }, { "epoch": 26.424581005586592, "grad_norm": 0.44271120429039, "learning_rate": 5.768842782354055e-06, "loss": 0.01, "step": 4730 }, { "epoch": 26.480446927374302, "grad_norm": 0.17514117062091827, "learning_rate": 5.592505566593786e-06, "loss": 0.0106, "step": 4740 }, { "epoch": 26.536312849162012, "grad_norm": 0.15244068205356598, "learning_rate": 5.418801006696736e-06, "loss": 0.0124, "step": 4750 }, { "epoch": 26.592178770949722, "grad_norm": 0.3087766468524933, "learning_rate": 5.247735691350941e-06, "loss": 0.0099, "step": 4760 }, { "epoch": 26.64804469273743, "grad_norm": 0.12699635326862335, "learning_rate": 5.079316109136808e-06, "loss": 0.0116, "step": 4770 }, { "epoch": 26.70391061452514, "grad_norm": 0.14864447712898254, "learning_rate": 4.9135486482808515e-06, "loss": 0.0137, "step": 4780 }, { "epoch": 26.75977653631285, "grad_norm": 0.14272750914096832, "learning_rate": 4.750439596413605e-06, "loss": 0.011, "step": 4790 }, { "epoch": 26.81564245810056, "grad_norm": 0.18606601655483246, "learning_rate": 4.589995140330952e-06, "loss": 0.0127, "step": 4800 }, { "epoch": 26.87150837988827, "grad_norm": 0.1599322408437729, "learning_rate": 4.432221365759523e-06, "loss": 0.0102, "step": 4810 }, { "epoch": 26.92737430167598, "grad_norm": 0.2554507553577423, "learning_rate": 4.277124257125872e-06, "loss": 0.0145, "step": 4820 }, { "epoch": 26.98324022346369, "grad_norm": 0.15352533757686615, "learning_rate": 4.124709697329476e-06, "loss": 0.0137, "step": 4830 }, { "epoch": 27.039106145251395, "grad_norm": 0.10090656578540802, "learning_rate": 3.974983467519593e-06, "loss": 0.0099, "step": 4840 }, { "epoch": 27.094972067039105, "grad_norm": 0.203408882021904, "learning_rate": 3.8279512468759684e-06, "loss": 0.0133, "step": 4850 }, { "epoch": 27.150837988826815, "grad_norm": 0.0952206403017044, "learning_rate": 3.683618612393441e-06, "loss": 0.0127, "step": 4860 }, { "epoch": 27.206703910614525, "grad_norm": 0.09654044359922409, "learning_rate": 3.5419910386703694e-06, "loss": 0.0092, "step": 4870 }, { "epoch": 27.262569832402235, "grad_norm": 0.42048877477645874, "learning_rate": 3.403073897701028e-06, "loss": 0.0126, "step": 4880 }, { "epoch": 27.318435754189945, "grad_norm": 0.16950425505638123, "learning_rate": 3.266872458671846e-06, "loss": 0.0112, "step": 4890 }, { "epoch": 27.374301675977655, "grad_norm": 0.2811785042285919, "learning_rate": 3.133391887761441e-06, "loss": 0.0113, "step": 4900 }, { "epoch": 27.43016759776536, "grad_norm": 0.20062536001205444, "learning_rate": 3.002637247944828e-06, "loss": 0.0101, "step": 4910 }, { "epoch": 27.48603351955307, "grad_norm": 0.176531121134758, "learning_rate": 2.8746134988012453e-06, "loss": 0.0122, "step": 4920 }, { "epoch": 27.54189944134078, "grad_norm": 0.2781747877597809, "learning_rate": 2.749325496326099e-06, "loss": 0.0114, "step": 4930 }, { "epoch": 27.59776536312849, "grad_norm": 0.23316988348960876, "learning_rate": 2.6267779927467423e-06, "loss": 0.009, "step": 4940 }, { "epoch": 27.6536312849162, "grad_norm": 0.17585094273090363, "learning_rate": 2.5069756363422454e-06, "loss": 0.0131, "step": 4950 }, { "epoch": 27.70949720670391, "grad_norm": 0.22221805155277252, "learning_rate": 2.3899229712671027e-06, "loss": 0.0129, "step": 4960 }, { "epoch": 27.76536312849162, "grad_norm": 0.18307988345623016, "learning_rate": 2.2756244373787867e-06, "loss": 0.0099, "step": 4970 }, { "epoch": 27.821229050279328, "grad_norm": 0.32926902174949646, "learning_rate": 2.164084370069452e-06, "loss": 0.0106, "step": 4980 }, { "epoch": 27.877094972067038, "grad_norm": 0.1888887584209442, "learning_rate": 2.055307000101414e-06, "loss": 0.0122, "step": 4990 }, { "epoch": 27.932960893854748, "grad_norm": 0.13414178788661957, "learning_rate": 1.9492964534467026e-06, "loss": 0.0109, "step": 5000 }, { "epoch": 27.988826815642458, "grad_norm": 0.27602723240852356, "learning_rate": 1.8460567511305645e-06, "loss": 0.0125, "step": 5010 }, { "epoch": 28.044692737430168, "grad_norm": 0.1933746188879013, "learning_rate": 1.745591809078925e-06, "loss": 0.0105, "step": 5020 }, { "epoch": 28.100558659217878, "grad_norm": 0.09113660454750061, "learning_rate": 1.6479054379698985e-06, "loss": 0.0095, "step": 5030 }, { "epoch": 28.156424581005588, "grad_norm": 0.11389518529176712, "learning_rate": 1.5530013430891886e-06, "loss": 0.0123, "step": 5040 }, { "epoch": 28.212290502793294, "grad_norm": 0.12854307889938354, "learning_rate": 1.4608831241895823e-06, "loss": 0.0111, "step": 5050 }, { "epoch": 28.268156424581004, "grad_norm": 0.1454714685678482, "learning_rate": 1.3715542753544195e-06, "loss": 0.0116, "step": 5060 }, { "epoch": 28.324022346368714, "grad_norm": 0.11753421276807785, "learning_rate": 1.2850181848650304e-06, "loss": 0.0085, "step": 5070 }, { "epoch": 28.379888268156424, "grad_norm": 0.19782139360904694, "learning_rate": 1.2012781350722145e-06, "loss": 0.0099, "step": 5080 }, { "epoch": 28.435754189944134, "grad_norm": 0.1429385244846344, "learning_rate": 1.1203373022717899e-06, "loss": 0.0096, "step": 5090 }, { "epoch": 28.491620111731844, "grad_norm": 0.17323465645313263, "learning_rate": 1.0421987565840562e-06, "loss": 0.0107, "step": 5100 }, { "epoch": 28.547486033519554, "grad_norm": 0.1123204305768013, "learning_rate": 9.66865461837371e-07, "loss": 0.0139, "step": 5110 }, { "epoch": 28.60335195530726, "grad_norm": 0.23444265127182007, "learning_rate": 8.943402754557405e-07, "loss": 0.0148, "step": 5120 }, { "epoch": 28.65921787709497, "grad_norm": 0.10738368332386017, "learning_rate": 8.246259483504047e-07, "loss": 0.0115, "step": 5130 }, { "epoch": 28.71508379888268, "grad_norm": 0.1469849944114685, "learning_rate": 7.577251248155142e-07, "loss": 0.0092, "step": 5140 }, { "epoch": 28.77094972067039, "grad_norm": 0.1442977786064148, "learning_rate": 6.936403424278591e-07, "loss": 0.0126, "step": 5150 }, { "epoch": 28.8268156424581, "grad_norm": 0.11567971855401993, "learning_rate": 6.323740319505388e-07, "loss": 0.0108, "step": 5160 }, { "epoch": 28.88268156424581, "grad_norm": 0.12551957368850708, "learning_rate": 5.739285172408598e-07, "loss": 0.0129, "step": 5170 }, { "epoch": 28.93854748603352, "grad_norm": 0.16800257563591003, "learning_rate": 5.183060151620983e-07, "loss": 0.0124, "step": 5180 }, { "epoch": 28.994413407821227, "grad_norm": 0.17249980568885803, "learning_rate": 4.655086354994925e-07, "loss": 0.0118, "step": 5190 }, { "epoch": 29.050279329608937, "grad_norm": 0.3741263151168823, "learning_rate": 4.155383808801649e-07, "loss": 0.0147, "step": 5200 }, { "epoch": 29.106145251396647, "grad_norm": 0.22194480895996094, "learning_rate": 3.68397146697183e-07, "loss": 0.0138, "step": 5210 }, { "epoch": 29.162011173184357, "grad_norm": 0.2826599180698395, "learning_rate": 3.240867210376752e-07, "loss": 0.0145, "step": 5220 }, { "epoch": 29.217877094972067, "grad_norm": 0.21220660209655762, "learning_rate": 2.8260878461495995e-07, "loss": 0.0153, "step": 5230 }, { "epoch": 29.273743016759777, "grad_norm": 0.2775328755378723, "learning_rate": 2.439649107048719e-07, "loss": 0.0112, "step": 5240 }, { "epoch": 29.329608938547487, "grad_norm": 0.2872767150402069, "learning_rate": 2.0815656508601785e-07, "loss": 0.0114, "step": 5250 }, { "epoch": 29.385474860335197, "grad_norm": 0.2828880250453949, "learning_rate": 1.7518510598422963e-07, "loss": 0.0094, "step": 5260 }, { "epoch": 29.441340782122904, "grad_norm": 0.27305498719215393, "learning_rate": 1.4505178402099692e-07, "loss": 0.0119, "step": 5270 }, { "epoch": 29.497206703910614, "grad_norm": 0.1336456537246704, "learning_rate": 1.1775774216608858e-07, "loss": 0.0082, "step": 5280 }, { "epoch": 29.553072625698324, "grad_norm": 0.12913921475410461, "learning_rate": 9.330401569413725e-08, "loss": 0.0094, "step": 5290 }, { "epoch": 29.608938547486034, "grad_norm": 0.193023219704628, "learning_rate": 7.169153214543754e-08, "loss": 0.0094, "step": 5300 }, { "epoch": 29.664804469273744, "grad_norm": 0.1686532348394394, "learning_rate": 5.292111129068244e-08, "loss": 0.0113, "step": 5310 }, { "epoch": 29.720670391061454, "grad_norm": 0.18468423187732697, "learning_rate": 3.69934650999465e-08, "loss": 0.0115, "step": 5320 }, { "epoch": 29.776536312849164, "grad_norm": 0.21080362796783447, "learning_rate": 2.3909197715657468e-08, "loss": 0.0126, "step": 5330 }, { "epoch": 29.83240223463687, "grad_norm": 0.15631383657455444, "learning_rate": 1.3668805429639662e-08, "loss": 0.0107, "step": 5340 }, { "epoch": 29.88826815642458, "grad_norm": 0.16603289544582367, "learning_rate": 6.27267666434561e-09, "loss": 0.0123, "step": 5350 }, { "epoch": 29.94413407821229, "grad_norm": 0.12879528105258942, "learning_rate": 1.7210919580928906e-09, "loss": 0.0119, "step": 5360 }, { "epoch": 30.0, "grad_norm": 0.17963722348213196, "learning_rate": 1.4223954430958584e-11, "loss": 0.0121, "step": 5370 }, { "epoch": 30.0, "step": 5370, "total_flos": 0.0, "train_loss": 0.03160388583327805, "train_runtime": 6237.9083, "train_samples_per_second": 42.139, "train_steps_per_second": 0.861 } ], "logging_steps": 10, "max_steps": 5370, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 20000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 49, "trial_name": null, "trial_params": null }