{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25599901577555206, "eval_steps": 500, "global_step": 22889, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00011184368726268168, "grad_norm": 0.6328383684158325, "learning_rate": 4.5e-06, "loss": 1.734, "step": 10 }, { "epoch": 0.00022368737452536336, "grad_norm": 0.566952109336853, "learning_rate": 9.5e-06, "loss": 1.6903, "step": 20 }, { "epoch": 0.00033553106178804503, "grad_norm": 0.5359939932823181, "learning_rate": 1.4500000000000002e-05, "loss": 1.6266, "step": 30 }, { "epoch": 0.0004473747490507267, "grad_norm": 0.4729914367198944, "learning_rate": 1.95e-05, "loss": 1.5731, "step": 40 }, { "epoch": 0.0005592184363134084, "grad_norm": 0.42020025849342346, "learning_rate": 2.4500000000000003e-05, "loss": 1.5335, "step": 50 }, { "epoch": 0.0006710621235760901, "grad_norm": 0.4461672604084015, "learning_rate": 2.95e-05, "loss": 1.4851, "step": 60 }, { "epoch": 0.0007829058108387717, "grad_norm": 0.4443751275539398, "learning_rate": 3.4500000000000005e-05, "loss": 1.4431, "step": 70 }, { "epoch": 0.0008947494981014534, "grad_norm": 0.4204632639884949, "learning_rate": 3.95e-05, "loss": 1.4036, "step": 80 }, { "epoch": 0.0010065931853641351, "grad_norm": 0.3985028862953186, "learning_rate": 4.45e-05, "loss": 1.3725, "step": 90 }, { "epoch": 0.0011184368726268167, "grad_norm": 0.4111650586128235, "learning_rate": 4.9500000000000004e-05, "loss": 1.3527, "step": 100 }, { "epoch": 0.0012302805598894985, "grad_norm": 0.4175569713115692, "learning_rate": 5.45e-05, "loss": 1.3431, "step": 110 }, { "epoch": 0.0013421242471521801, "grad_norm": 0.3871678411960602, "learning_rate": 5.9499999999999996e-05, "loss": 1.3322, "step": 120 }, { "epoch": 0.0014539679344148617, "grad_norm": 0.39584827423095703, "learning_rate": 6.450000000000001e-05, "loss": 1.3075, "step": 130 }, { "epoch": 0.0015658116216775435, "grad_norm": 0.4165605902671814, "learning_rate": 6.950000000000001e-05, "loss": 1.286, "step": 140 }, { "epoch": 0.001677655308940225, "grad_norm": 0.3985513150691986, "learning_rate": 7.45e-05, "loss": 1.2567, "step": 150 }, { "epoch": 0.0017894989962029069, "grad_norm": 0.39112743735313416, "learning_rate": 7.950000000000001e-05, "loss": 1.2448, "step": 160 }, { "epoch": 0.0019013426834655885, "grad_norm": 0.3867124915122986, "learning_rate": 8.450000000000001e-05, "loss": 1.2405, "step": 170 }, { "epoch": 0.0020131863707282703, "grad_norm": 0.3955863416194916, "learning_rate": 8.95e-05, "loss": 1.2123, "step": 180 }, { "epoch": 0.002125030057990952, "grad_norm": 0.40293410420417786, "learning_rate": 9.45e-05, "loss": 1.2081, "step": 190 }, { "epoch": 0.0022368737452536334, "grad_norm": 0.3828902542591095, "learning_rate": 9.95e-05, "loss": 1.2049, "step": 200 }, { "epoch": 0.002348717432516315, "grad_norm": 0.3969178795814514, "learning_rate": 0.00010449999999999999, "loss": 1.1892, "step": 210 }, { "epoch": 0.002460561119778997, "grad_norm": 0.4122287929058075, "learning_rate": 0.0001095, "loss": 1.184, "step": 220 }, { "epoch": 0.0025724048070416786, "grad_norm": 0.3793940246105194, "learning_rate": 0.0001145, "loss": 1.1809, "step": 230 }, { "epoch": 0.0026842484943043602, "grad_norm": 0.4132145643234253, "learning_rate": 0.00011949999999999999, "loss": 1.1883, "step": 240 }, { "epoch": 0.002796092181567042, "grad_norm": 0.3900831639766693, "learning_rate": 0.0001245, "loss": 1.1818, "step": 250 }, { "epoch": 0.0029079358688297234, "grad_norm": 0.3898029625415802, "learning_rate": 0.0001295, "loss": 1.1693, "step": 260 }, { "epoch": 0.0030197795560924054, "grad_norm": 0.40828797221183777, "learning_rate": 0.00013450000000000002, "loss": 1.1869, "step": 270 }, { "epoch": 0.003131623243355087, "grad_norm": 0.3976770341396332, "learning_rate": 0.0001395, "loss": 1.1841, "step": 280 }, { "epoch": 0.0032434669306177686, "grad_norm": 0.3902062773704529, "learning_rate": 0.0001445, "loss": 1.1843, "step": 290 }, { "epoch": 0.00335531061788045, "grad_norm": 0.38051125407218933, "learning_rate": 0.0001495, "loss": 1.1662, "step": 300 }, { "epoch": 0.0034671543051431318, "grad_norm": 0.3628483712673187, "learning_rate": 0.00015450000000000001, "loss": 1.1638, "step": 310 }, { "epoch": 0.0035789979924058138, "grad_norm": 0.3693360388278961, "learning_rate": 0.0001595, "loss": 1.1606, "step": 320 }, { "epoch": 0.0036908416796684954, "grad_norm": 0.38896557688713074, "learning_rate": 0.00016450000000000001, "loss": 1.1448, "step": 330 }, { "epoch": 0.003802685366931177, "grad_norm": 0.40257108211517334, "learning_rate": 0.00016950000000000003, "loss": 1.143, "step": 340 }, { "epoch": 0.0039145290541938585, "grad_norm": 0.38656994700431824, "learning_rate": 0.00017449999999999999, "loss": 1.141, "step": 350 }, { "epoch": 0.0040263727414565405, "grad_norm": 0.3700025677680969, "learning_rate": 0.0001795, "loss": 1.136, "step": 360 }, { "epoch": 0.004138216428719222, "grad_norm": 0.37222161889076233, "learning_rate": 0.0001845, "loss": 1.1292, "step": 370 }, { "epoch": 0.004250060115981904, "grad_norm": 0.39386317133903503, "learning_rate": 0.0001895, "loss": 1.1139, "step": 380 }, { "epoch": 0.004361903803244586, "grad_norm": 0.3776305913925171, "learning_rate": 0.0001945, "loss": 1.1125, "step": 390 }, { "epoch": 0.004473747490507267, "grad_norm": 0.40314197540283203, "learning_rate": 0.00019950000000000002, "loss": 1.0962, "step": 400 }, { "epoch": 0.004585591177769949, "grad_norm": 0.37841472029685974, "learning_rate": 0.00020449999999999998, "loss": 1.0987, "step": 410 }, { "epoch": 0.00469743486503263, "grad_norm": 0.3678649365901947, "learning_rate": 0.0002095, "loss": 1.0826, "step": 420 }, { "epoch": 0.004809278552295312, "grad_norm": 0.37902751564979553, "learning_rate": 0.0002145, "loss": 1.0973, "step": 430 }, { "epoch": 0.004921122239557994, "grad_norm": 0.3776302635669708, "learning_rate": 0.0002195, "loss": 1.112, "step": 440 }, { "epoch": 0.005032965926820675, "grad_norm": 0.43771493434906006, "learning_rate": 0.0002245, "loss": 1.1005, "step": 450 }, { "epoch": 0.005144809614083357, "grad_norm": 0.3662595748901367, "learning_rate": 0.00022950000000000002, "loss": 1.0899, "step": 460 }, { "epoch": 0.005256653301346038, "grad_norm": 0.37473002076148987, "learning_rate": 0.00023449999999999998, "loss": 1.0982, "step": 470 }, { "epoch": 0.0053684969886087204, "grad_norm": 0.35591790080070496, "learning_rate": 0.0002395, "loss": 1.1005, "step": 480 }, { "epoch": 0.0054803406758714025, "grad_norm": 0.3825643062591553, "learning_rate": 0.0002445, "loss": 1.0896, "step": 490 }, { "epoch": 0.005592184363134084, "grad_norm": 0.3784261643886566, "learning_rate": 0.0002495, "loss": 1.1039, "step": 500 }, { "epoch": 0.005704028050396766, "grad_norm": 0.35387158393859863, "learning_rate": 0.0002545, "loss": 1.1038, "step": 510 }, { "epoch": 0.005815871737659447, "grad_norm": 0.3992142975330353, "learning_rate": 0.0002595, "loss": 1.088, "step": 520 }, { "epoch": 0.005927715424922129, "grad_norm": 0.36795270442962646, "learning_rate": 0.00026450000000000003, "loss": 1.0888, "step": 530 }, { "epoch": 0.006039559112184811, "grad_norm": 0.4007701575756073, "learning_rate": 0.00026950000000000005, "loss": 1.0838, "step": 540 }, { "epoch": 0.006151402799447492, "grad_norm": 0.34527722001075745, "learning_rate": 0.0002745, "loss": 1.0892, "step": 550 }, { "epoch": 0.006263246486710174, "grad_norm": 0.37232115864753723, "learning_rate": 0.0002795, "loss": 1.0939, "step": 560 }, { "epoch": 0.006375090173972855, "grad_norm": 0.4048405885696411, "learning_rate": 0.0002845, "loss": 1.0863, "step": 570 }, { "epoch": 0.006486933861235537, "grad_norm": 0.37317511439323425, "learning_rate": 0.0002895, "loss": 1.0711, "step": 580 }, { "epoch": 0.006598777548498219, "grad_norm": 0.38564008474349976, "learning_rate": 0.0002945, "loss": 1.091, "step": 590 }, { "epoch": 0.0067106212357609, "grad_norm": 0.3639361262321472, "learning_rate": 0.0002995, "loss": 1.0682, "step": 600 }, { "epoch": 0.006822464923023582, "grad_norm": 0.35907182097435, "learning_rate": 0.0003045, "loss": 1.0755, "step": 610 }, { "epoch": 0.0069343086102862635, "grad_norm": 0.35199785232543945, "learning_rate": 0.0003095, "loss": 1.0581, "step": 620 }, { "epoch": 0.0070461522975489455, "grad_norm": 0.35156381130218506, "learning_rate": 0.0003145, "loss": 1.0651, "step": 630 }, { "epoch": 0.0071579959848116275, "grad_norm": 0.3742520213127136, "learning_rate": 0.0003195, "loss": 1.0555, "step": 640 }, { "epoch": 0.007269839672074309, "grad_norm": 0.3587191700935364, "learning_rate": 0.00032450000000000003, "loss": 1.0548, "step": 650 }, { "epoch": 0.007381683359336991, "grad_norm": 0.37587791681289673, "learning_rate": 0.00032950000000000004, "loss": 1.0437, "step": 660 }, { "epoch": 0.007493527046599672, "grad_norm": 0.3410298526287079, "learning_rate": 0.00033450000000000005, "loss": 1.0426, "step": 670 }, { "epoch": 0.007605370733862354, "grad_norm": 0.3450978696346283, "learning_rate": 0.0003395, "loss": 1.0487, "step": 680 }, { "epoch": 0.007717214421125036, "grad_norm": 0.3445068299770355, "learning_rate": 0.00034449999999999997, "loss": 1.0411, "step": 690 }, { "epoch": 0.007829058108387717, "grad_norm": 0.34611567854881287, "learning_rate": 0.0003495, "loss": 1.0404, "step": 700 }, { "epoch": 0.007940901795650398, "grad_norm": 0.3339330852031708, "learning_rate": 0.0003545, "loss": 1.0361, "step": 710 }, { "epoch": 0.008052745482913081, "grad_norm": 0.33232080936431885, "learning_rate": 0.0003595, "loss": 1.0271, "step": 720 }, { "epoch": 0.008164589170175762, "grad_norm": 0.33050498366355896, "learning_rate": 0.0003645, "loss": 1.0316, "step": 730 }, { "epoch": 0.008276432857438443, "grad_norm": 0.3449972867965698, "learning_rate": 0.0003695, "loss": 1.0426, "step": 740 }, { "epoch": 0.008388276544701126, "grad_norm": 0.3543892502784729, "learning_rate": 0.0003745, "loss": 1.0475, "step": 750 }, { "epoch": 0.008500120231963807, "grad_norm": 0.3447831869125366, "learning_rate": 0.0003795, "loss": 1.0482, "step": 760 }, { "epoch": 0.008611963919226489, "grad_norm": 0.33845630288124084, "learning_rate": 0.0003845, "loss": 1.0533, "step": 770 }, { "epoch": 0.008723807606489171, "grad_norm": 0.3394622802734375, "learning_rate": 0.00038950000000000003, "loss": 1.0803, "step": 780 }, { "epoch": 0.008835651293751853, "grad_norm": 0.33649975061416626, "learning_rate": 0.00039450000000000005, "loss": 1.0461, "step": 790 }, { "epoch": 0.008947494981014534, "grad_norm": 0.3265191912651062, "learning_rate": 0.0003995, "loss": 1.0714, "step": 800 }, { "epoch": 0.009059338668277215, "grad_norm": 0.34960776567459106, "learning_rate": 0.0004045, "loss": 1.0542, "step": 810 }, { "epoch": 0.009171182355539898, "grad_norm": 0.3353814482688904, "learning_rate": 0.0004095, "loss": 1.0625, "step": 820 }, { "epoch": 0.009283026042802579, "grad_norm": 0.3499109148979187, "learning_rate": 0.0004145, "loss": 1.0679, "step": 830 }, { "epoch": 0.00939486973006526, "grad_norm": 0.33906084299087524, "learning_rate": 0.0004195, "loss": 1.0659, "step": 840 }, { "epoch": 0.009506713417327943, "grad_norm": 0.3245256543159485, "learning_rate": 0.0004245, "loss": 1.078, "step": 850 }, { "epoch": 0.009618557104590624, "grad_norm": 0.3364386260509491, "learning_rate": 0.0004295, "loss": 1.0771, "step": 860 }, { "epoch": 0.009730400791853305, "grad_norm": 0.348718523979187, "learning_rate": 0.0004345, "loss": 1.0751, "step": 870 }, { "epoch": 0.009842244479115988, "grad_norm": 0.31124839186668396, "learning_rate": 0.0004395, "loss": 1.0693, "step": 880 }, { "epoch": 0.00995408816637867, "grad_norm": 0.3478352129459381, "learning_rate": 0.0004445, "loss": 1.0682, "step": 890 }, { "epoch": 0.01006593185364135, "grad_norm": 0.31189802289009094, "learning_rate": 0.00044950000000000003, "loss": 1.0608, "step": 900 }, { "epoch": 0.010177775540904033, "grad_norm": 0.34715884923934937, "learning_rate": 0.00045450000000000004, "loss": 1.0698, "step": 910 }, { "epoch": 0.010289619228166715, "grad_norm": 0.3279336988925934, "learning_rate": 0.00045950000000000006, "loss": 1.0728, "step": 920 }, { "epoch": 0.010401462915429396, "grad_norm": 0.32010868191719055, "learning_rate": 0.0004645, "loss": 1.0765, "step": 930 }, { "epoch": 0.010513306602692077, "grad_norm": 0.3618028163909912, "learning_rate": 0.0004695, "loss": 1.0815, "step": 940 }, { "epoch": 0.01062515028995476, "grad_norm": 0.3403186798095703, "learning_rate": 0.0004745, "loss": 1.0713, "step": 950 }, { "epoch": 0.010736993977217441, "grad_norm": 0.347687691450119, "learning_rate": 0.0004795, "loss": 1.0844, "step": 960 }, { "epoch": 0.010848837664480122, "grad_norm": 0.3537987768650055, "learning_rate": 0.0004845, "loss": 1.0762, "step": 970 }, { "epoch": 0.010960681351742805, "grad_norm": 0.42015892267227173, "learning_rate": 0.0004895, "loss": 1.0832, "step": 980 }, { "epoch": 0.011072525039005486, "grad_norm": 0.35781368613243103, "learning_rate": 0.0004945, "loss": 1.0606, "step": 990 }, { "epoch": 0.011184368726268167, "grad_norm": 0.3361358344554901, "learning_rate": 0.0004995, "loss": 1.0717, "step": 1000 }, { "epoch": 0.01129621241353085, "grad_norm": 0.36569204926490784, "learning_rate": 0.0004997944172872219, "loss": 1.0602, "step": 1010 }, { "epoch": 0.011408056100793531, "grad_norm": 0.31979477405548096, "learning_rate": 0.0004995659920508017, "loss": 1.0531, "step": 1020 }, { "epoch": 0.011519899788056212, "grad_norm": 0.3295707404613495, "learning_rate": 0.0004993375668143817, "loss": 1.0346, "step": 1030 }, { "epoch": 0.011631743475318894, "grad_norm": 0.3207838833332062, "learning_rate": 0.0004991091415779616, "loss": 1.059, "step": 1040 }, { "epoch": 0.011743587162581576, "grad_norm": 0.33032119274139404, "learning_rate": 0.0004988807163415415, "loss": 1.0573, "step": 1050 }, { "epoch": 0.011855430849844258, "grad_norm": 0.3566173017024994, "learning_rate": 0.0004986522911051213, "loss": 1.0501, "step": 1060 }, { "epoch": 0.011967274537106939, "grad_norm": 0.31658655405044556, "learning_rate": 0.0004984238658687012, "loss": 1.0706, "step": 1070 }, { "epoch": 0.012079118224369622, "grad_norm": 0.3438680171966553, "learning_rate": 0.0004981954406322811, "loss": 1.0765, "step": 1080 }, { "epoch": 0.012190961911632303, "grad_norm": 0.3130144774913788, "learning_rate": 0.0004979670153958609, "loss": 1.0588, "step": 1090 }, { "epoch": 0.012302805598894984, "grad_norm": 0.31765422224998474, "learning_rate": 0.0004977385901594408, "loss": 1.0703, "step": 1100 }, { "epoch": 0.012414649286157667, "grad_norm": 0.36112868785858154, "learning_rate": 0.0004975101649230207, "loss": 1.0642, "step": 1110 }, { "epoch": 0.012526492973420348, "grad_norm": 0.33418065309524536, "learning_rate": 0.0004972817396866005, "loss": 1.0572, "step": 1120 }, { "epoch": 0.01263833666068303, "grad_norm": 0.34439629316329956, "learning_rate": 0.0004970533144501805, "loss": 1.0473, "step": 1130 }, { "epoch": 0.01275018034794571, "grad_norm": 0.32954639196395874, "learning_rate": 0.0004968248892137603, "loss": 1.054, "step": 1140 }, { "epoch": 0.012862024035208393, "grad_norm": 0.3351511061191559, "learning_rate": 0.0004965964639773402, "loss": 1.0444, "step": 1150 }, { "epoch": 0.012973867722471074, "grad_norm": 0.3065156638622284, "learning_rate": 0.0004963680387409202, "loss": 1.0546, "step": 1160 }, { "epoch": 0.013085711409733755, "grad_norm": 0.36450672149658203, "learning_rate": 0.0004961396135045, "loss": 1.0501, "step": 1170 }, { "epoch": 0.013197555096996438, "grad_norm": 0.3020591735839844, "learning_rate": 0.0004959111882680799, "loss": 1.052, "step": 1180 }, { "epoch": 0.01330939878425912, "grad_norm": 0.3097701966762543, "learning_rate": 0.0004956827630316598, "loss": 1.0695, "step": 1190 }, { "epoch": 0.0134212424715218, "grad_norm": 0.3410932719707489, "learning_rate": 0.0004954543377952396, "loss": 1.0692, "step": 1200 }, { "epoch": 0.013533086158784484, "grad_norm": 0.38478952646255493, "learning_rate": 0.0004952259125588195, "loss": 1.0592, "step": 1210 }, { "epoch": 0.013644929846047165, "grad_norm": 0.3737089931964874, "learning_rate": 0.0004949974873223994, "loss": 1.0808, "step": 1220 }, { "epoch": 0.013756773533309846, "grad_norm": 0.3264448940753937, "learning_rate": 0.0004947690620859793, "loss": 1.0759, "step": 1230 }, { "epoch": 0.013868617220572527, "grad_norm": 0.3922732472419739, "learning_rate": 0.0004945406368495591, "loss": 1.0634, "step": 1240 }, { "epoch": 0.01398046090783521, "grad_norm": 0.36068034172058105, "learning_rate": 0.000494312211613139, "loss": 1.0683, "step": 1250 }, { "epoch": 0.014092304595097891, "grad_norm": 0.3544798791408539, "learning_rate": 0.0004940837863767189, "loss": 1.0687, "step": 1260 }, { "epoch": 0.014204148282360572, "grad_norm": 0.31447795033454895, "learning_rate": 0.0004938553611402987, "loss": 1.0549, "step": 1270 }, { "epoch": 0.014315991969623255, "grad_norm": 0.37639158964157104, "learning_rate": 0.0004936269359038786, "loss": 1.0698, "step": 1280 }, { "epoch": 0.014427835656885936, "grad_norm": 0.32416418194770813, "learning_rate": 0.0004933985106674586, "loss": 1.0617, "step": 1290 }, { "epoch": 0.014539679344148617, "grad_norm": 0.3122979998588562, "learning_rate": 0.0004931700854310385, "loss": 1.0553, "step": 1300 }, { "epoch": 0.0146515230314113, "grad_norm": 0.3574884533882141, "learning_rate": 0.0004929416601946184, "loss": 1.0598, "step": 1310 }, { "epoch": 0.014763366718673981, "grad_norm": 0.30762428045272827, "learning_rate": 0.0004927132349581982, "loss": 1.0642, "step": 1320 }, { "epoch": 0.014875210405936663, "grad_norm": 0.34350454807281494, "learning_rate": 0.0004924848097217781, "loss": 1.0663, "step": 1330 }, { "epoch": 0.014987054093199344, "grad_norm": 0.33486828207969666, "learning_rate": 0.000492256384485358, "loss": 1.0479, "step": 1340 }, { "epoch": 0.015098897780462027, "grad_norm": 0.3025324046611786, "learning_rate": 0.0004920279592489378, "loss": 1.0705, "step": 1350 }, { "epoch": 0.015210741467724708, "grad_norm": 0.35260385274887085, "learning_rate": 0.0004917995340125177, "loss": 1.0762, "step": 1360 }, { "epoch": 0.015322585154987389, "grad_norm": 0.3188925087451935, "learning_rate": 0.0004915711087760976, "loss": 1.069, "step": 1370 }, { "epoch": 0.015434428842250072, "grad_norm": 0.332660436630249, "learning_rate": 0.0004913426835396775, "loss": 1.0749, "step": 1380 }, { "epoch": 0.015546272529512753, "grad_norm": 0.31745171546936035, "learning_rate": 0.0004911142583032573, "loss": 1.0811, "step": 1390 }, { "epoch": 0.015658116216775434, "grad_norm": 0.3237819969654083, "learning_rate": 0.0004908858330668372, "loss": 1.0634, "step": 1400 }, { "epoch": 0.015769959904038115, "grad_norm": 0.3300880789756775, "learning_rate": 0.0004906574078304171, "loss": 1.0554, "step": 1410 }, { "epoch": 0.015881803591300796, "grad_norm": 0.32475635409355164, "learning_rate": 0.0004904289825939969, "loss": 1.0598, "step": 1420 }, { "epoch": 0.01599364727856348, "grad_norm": 0.31278952956199646, "learning_rate": 0.0004902005573575769, "loss": 1.0498, "step": 1430 }, { "epoch": 0.016105490965826162, "grad_norm": 0.308680921792984, "learning_rate": 0.0004899721321211568, "loss": 1.0586, "step": 1440 }, { "epoch": 0.016217334653088843, "grad_norm": 0.34637314081192017, "learning_rate": 0.0004897437068847367, "loss": 1.0535, "step": 1450 }, { "epoch": 0.016329178340351524, "grad_norm": 0.3220643401145935, "learning_rate": 0.0004895152816483165, "loss": 1.0624, "step": 1460 }, { "epoch": 0.016441022027614206, "grad_norm": 0.31472912430763245, "learning_rate": 0.0004892868564118964, "loss": 1.0748, "step": 1470 }, { "epoch": 0.016552865714876887, "grad_norm": 0.3416632115840912, "learning_rate": 0.0004890584311754763, "loss": 1.0715, "step": 1480 }, { "epoch": 0.01666470940213957, "grad_norm": 0.3463667631149292, "learning_rate": 0.0004888300059390561, "loss": 1.0914, "step": 1490 }, { "epoch": 0.016776553089402253, "grad_norm": 0.3322199881076813, "learning_rate": 0.000488601580702636, "loss": 1.0707, "step": 1500 }, { "epoch": 0.016888396776664934, "grad_norm": 0.3899800479412079, "learning_rate": 0.0004883731554662159, "loss": 1.0883, "step": 1510 }, { "epoch": 0.017000240463927615, "grad_norm": 0.3409605324268341, "learning_rate": 0.0004881447302297958, "loss": 1.0982, "step": 1520 }, { "epoch": 0.017112084151190296, "grad_norm": 0.3720357120037079, "learning_rate": 0.0004879163049933757, "loss": 1.0674, "step": 1530 }, { "epoch": 0.017223927838452977, "grad_norm": 0.326050728559494, "learning_rate": 0.00048768787975695554, "loss": 1.0764, "step": 1540 }, { "epoch": 0.01733577152571566, "grad_norm": 0.3238283395767212, "learning_rate": 0.0004874594545205354, "loss": 1.0547, "step": 1550 }, { "epoch": 0.017447615212978343, "grad_norm": 0.3324073553085327, "learning_rate": 0.00048723102928411536, "loss": 1.0608, "step": 1560 }, { "epoch": 0.017559458900241024, "grad_norm": 0.3382217586040497, "learning_rate": 0.0004870026040476952, "loss": 1.0505, "step": 1570 }, { "epoch": 0.017671302587503705, "grad_norm": 0.3409116566181183, "learning_rate": 0.00048677417881127507, "loss": 1.0673, "step": 1580 }, { "epoch": 0.017783146274766386, "grad_norm": 0.3123399019241333, "learning_rate": 0.000486545753574855, "loss": 1.0461, "step": 1590 }, { "epoch": 0.017894989962029068, "grad_norm": 0.3178008198738098, "learning_rate": 0.00048631732833843484, "loss": 1.0526, "step": 1600 }, { "epoch": 0.01800683364929175, "grad_norm": 0.37002459168434143, "learning_rate": 0.0004860889031020147, "loss": 1.0483, "step": 1610 }, { "epoch": 0.01811867733655443, "grad_norm": 0.31036287546157837, "learning_rate": 0.0004858604778655946, "loss": 1.0418, "step": 1620 }, { "epoch": 0.018230521023817114, "grad_norm": 0.3027215600013733, "learning_rate": 0.00048563205262917446, "loss": 1.0467, "step": 1630 }, { "epoch": 0.018342364711079796, "grad_norm": 0.32144612073898315, "learning_rate": 0.00048540362739275437, "loss": 1.0437, "step": 1640 }, { "epoch": 0.018454208398342477, "grad_norm": 0.3156447410583496, "learning_rate": 0.0004851752021563343, "loss": 1.0447, "step": 1650 }, { "epoch": 0.018566052085605158, "grad_norm": 0.3228546380996704, "learning_rate": 0.00048494677691991413, "loss": 1.056, "step": 1660 }, { "epoch": 0.01867789577286784, "grad_norm": 0.3478510081768036, "learning_rate": 0.000484718351683494, "loss": 1.0523, "step": 1670 }, { "epoch": 0.01878973946013052, "grad_norm": 0.3413507342338562, "learning_rate": 0.0004844899264470739, "loss": 1.049, "step": 1680 }, { "epoch": 0.018901583147393205, "grad_norm": 0.3277221918106079, "learning_rate": 0.00048426150121065375, "loss": 1.0403, "step": 1690 }, { "epoch": 0.019013426834655886, "grad_norm": 0.3044646382331848, "learning_rate": 0.0004840330759742336, "loss": 1.0518, "step": 1700 }, { "epoch": 0.019125270521918567, "grad_norm": 0.31599846482276917, "learning_rate": 0.0004838046507378135, "loss": 1.0475, "step": 1710 }, { "epoch": 0.01923711420918125, "grad_norm": 0.346741646528244, "learning_rate": 0.00048357622550139343, "loss": 1.0515, "step": 1720 }, { "epoch": 0.01934895789644393, "grad_norm": 0.32756108045578003, "learning_rate": 0.0004833478002649733, "loss": 1.054, "step": 1730 }, { "epoch": 0.01946080158370661, "grad_norm": 0.3318345546722412, "learning_rate": 0.0004831193750285532, "loss": 1.0575, "step": 1740 }, { "epoch": 0.019572645270969292, "grad_norm": 0.3389560282230377, "learning_rate": 0.00048289094979213305, "loss": 1.0576, "step": 1750 }, { "epoch": 0.019684488958231976, "grad_norm": 0.31532642245292664, "learning_rate": 0.0004826625245557129, "loss": 1.0554, "step": 1760 }, { "epoch": 0.019796332645494658, "grad_norm": 0.3263496160507202, "learning_rate": 0.0004824340993192928, "loss": 1.0697, "step": 1770 }, { "epoch": 0.01990817633275734, "grad_norm": 0.328225314617157, "learning_rate": 0.00048220567408287267, "loss": 1.0584, "step": 1780 }, { "epoch": 0.02002002002002002, "grad_norm": 0.3030998706817627, "learning_rate": 0.00048197724884645253, "loss": 1.0555, "step": 1790 }, { "epoch": 0.0201318637072827, "grad_norm": 0.32594701647758484, "learning_rate": 0.0004817488236100325, "loss": 1.0512, "step": 1800 }, { "epoch": 0.020243707394545382, "grad_norm": 0.2882954776287079, "learning_rate": 0.00048152039837361235, "loss": 1.0441, "step": 1810 }, { "epoch": 0.020355551081808067, "grad_norm": 0.33917129039764404, "learning_rate": 0.0004812919731371922, "loss": 1.048, "step": 1820 }, { "epoch": 0.020467394769070748, "grad_norm": 0.32748523354530334, "learning_rate": 0.0004810635479007721, "loss": 1.042, "step": 1830 }, { "epoch": 0.02057923845633343, "grad_norm": 0.32332462072372437, "learning_rate": 0.00048083512266435197, "loss": 1.0396, "step": 1840 }, { "epoch": 0.02069108214359611, "grad_norm": 0.36977729201316833, "learning_rate": 0.0004806066974279318, "loss": 1.0337, "step": 1850 }, { "epoch": 0.02080292583085879, "grad_norm": 0.33298948407173157, "learning_rate": 0.00048037827219151174, "loss": 1.045, "step": 1860 }, { "epoch": 0.020914769518121473, "grad_norm": 0.328861802816391, "learning_rate": 0.00048014984695509165, "loss": 1.053, "step": 1870 }, { "epoch": 0.021026613205384154, "grad_norm": 0.3438888490200043, "learning_rate": 0.0004799214217186715, "loss": 1.0385, "step": 1880 }, { "epoch": 0.02113845689264684, "grad_norm": 0.3251883387565613, "learning_rate": 0.00047969299648225136, "loss": 1.0436, "step": 1890 }, { "epoch": 0.02125030057990952, "grad_norm": 0.3300330340862274, "learning_rate": 0.00047946457124583127, "loss": 1.0627, "step": 1900 }, { "epoch": 0.0213621442671722, "grad_norm": 0.31774377822875977, "learning_rate": 0.0004792361460094111, "loss": 1.0491, "step": 1910 }, { "epoch": 0.021473987954434882, "grad_norm": 0.36171990633010864, "learning_rate": 0.000479007720772991, "loss": 1.0536, "step": 1920 }, { "epoch": 0.021585831641697563, "grad_norm": 0.33032888174057007, "learning_rate": 0.0004787792955365709, "loss": 1.0327, "step": 1930 }, { "epoch": 0.021697675328960244, "grad_norm": 0.34056538343429565, "learning_rate": 0.00047855087030015074, "loss": 1.0354, "step": 1940 }, { "epoch": 0.021809519016222925, "grad_norm": 0.31768256425857544, "learning_rate": 0.00047832244506373065, "loss": 1.0278, "step": 1950 }, { "epoch": 0.02192136270348561, "grad_norm": 0.33165955543518066, "learning_rate": 0.00047809401982731056, "loss": 1.057, "step": 1960 }, { "epoch": 0.02203320639074829, "grad_norm": 0.34456339478492737, "learning_rate": 0.0004778655945908904, "loss": 1.0465, "step": 1970 }, { "epoch": 0.022145050078010972, "grad_norm": 0.35331544280052185, "learning_rate": 0.0004776371693544703, "loss": 1.0509, "step": 1980 }, { "epoch": 0.022256893765273653, "grad_norm": 0.3497447669506073, "learning_rate": 0.0004774087441180502, "loss": 1.0579, "step": 1990 }, { "epoch": 0.022368737452536334, "grad_norm": 0.31631171703338623, "learning_rate": 0.00047718031888163004, "loss": 1.0747, "step": 2000 }, { "epoch": 0.022480581139799016, "grad_norm": 0.34811535477638245, "learning_rate": 0.0004769518936452099, "loss": 1.0443, "step": 2010 }, { "epoch": 0.0225924248270617, "grad_norm": 0.350975900888443, "learning_rate": 0.0004767234684087898, "loss": 1.0721, "step": 2020 }, { "epoch": 0.02270426851432438, "grad_norm": 0.38026875257492065, "learning_rate": 0.0004764950431723697, "loss": 1.0502, "step": 2030 }, { "epoch": 0.022816112201587063, "grad_norm": 0.3079335391521454, "learning_rate": 0.00047626661793594957, "loss": 1.0325, "step": 2040 }, { "epoch": 0.022927955888849744, "grad_norm": 0.3412174582481384, "learning_rate": 0.0004760381926995295, "loss": 1.026, "step": 2050 }, { "epoch": 0.023039799576112425, "grad_norm": 0.31905752420425415, "learning_rate": 0.00047580976746310934, "loss": 1.033, "step": 2060 }, { "epoch": 0.023151643263375106, "grad_norm": 0.3110033869743347, "learning_rate": 0.0004755813422266892, "loss": 1.026, "step": 2070 }, { "epoch": 0.023263486950637787, "grad_norm": 0.3087383210659027, "learning_rate": 0.0004753529169902691, "loss": 1.0285, "step": 2080 }, { "epoch": 0.023375330637900472, "grad_norm": 0.310497522354126, "learning_rate": 0.00047512449175384896, "loss": 1.012, "step": 2090 }, { "epoch": 0.023487174325163153, "grad_norm": 0.35822993516921997, "learning_rate": 0.0004748960665174288, "loss": 1.0124, "step": 2100 }, { "epoch": 0.023599018012425834, "grad_norm": 0.3355759084224701, "learning_rate": 0.0004746676412810088, "loss": 1.0159, "step": 2110 }, { "epoch": 0.023710861699688515, "grad_norm": 0.29633432626724243, "learning_rate": 0.00047443921604458863, "loss": 1.0068, "step": 2120 }, { "epoch": 0.023822705386951196, "grad_norm": 0.3268597424030304, "learning_rate": 0.0004742107908081685, "loss": 1.0029, "step": 2130 }, { "epoch": 0.023934549074213878, "grad_norm": 0.32010769844055176, "learning_rate": 0.0004739823655717484, "loss": 1.0081, "step": 2140 }, { "epoch": 0.02404639276147656, "grad_norm": 0.30638498067855835, "learning_rate": 0.00047375394033532826, "loss": 0.9955, "step": 2150 }, { "epoch": 0.024158236448739243, "grad_norm": 0.32299259305000305, "learning_rate": 0.0004735255150989081, "loss": 1.0028, "step": 2160 }, { "epoch": 0.024270080136001924, "grad_norm": 0.30714213848114014, "learning_rate": 0.000473297089862488, "loss": 1.0163, "step": 2170 }, { "epoch": 0.024381923823264606, "grad_norm": 0.3207940459251404, "learning_rate": 0.0004730686646260679, "loss": 1.0053, "step": 2180 }, { "epoch": 0.024493767510527287, "grad_norm": 0.3073663115501404, "learning_rate": 0.0004728402393896478, "loss": 1.0007, "step": 2190 }, { "epoch": 0.024605611197789968, "grad_norm": 0.3209913671016693, "learning_rate": 0.0004726118141532277, "loss": 1.0065, "step": 2200 }, { "epoch": 0.02471745488505265, "grad_norm": 0.2987804114818573, "learning_rate": 0.00047238338891680755, "loss": 1.0015, "step": 2210 }, { "epoch": 0.024829298572315334, "grad_norm": 0.31511807441711426, "learning_rate": 0.0004721549636803874, "loss": 0.9892, "step": 2220 }, { "epoch": 0.024941142259578015, "grad_norm": 0.2840864956378937, "learning_rate": 0.0004719265384439673, "loss": 1.0084, "step": 2230 }, { "epoch": 0.025052985946840696, "grad_norm": 0.3094743490219116, "learning_rate": 0.0004716981132075472, "loss": 1.0169, "step": 2240 }, { "epoch": 0.025164829634103377, "grad_norm": 0.2905067205429077, "learning_rate": 0.00047146968797112703, "loss": 0.9991, "step": 2250 }, { "epoch": 0.02527667332136606, "grad_norm": 0.31322264671325684, "learning_rate": 0.00047124126273470694, "loss": 1.0169, "step": 2260 }, { "epoch": 0.02538851700862874, "grad_norm": 0.29053428769111633, "learning_rate": 0.00047101283749828685, "loss": 0.9942, "step": 2270 }, { "epoch": 0.02550036069589142, "grad_norm": 0.2863853871822357, "learning_rate": 0.0004707844122618667, "loss": 1.002, "step": 2280 }, { "epoch": 0.025612204383154105, "grad_norm": 0.3087761104106903, "learning_rate": 0.0004705559870254466, "loss": 1.0025, "step": 2290 }, { "epoch": 0.025724048070416786, "grad_norm": 0.3308629095554352, "learning_rate": 0.00047032756178902647, "loss": 1.0078, "step": 2300 }, { "epoch": 0.025835891757679467, "grad_norm": 0.29703134298324585, "learning_rate": 0.0004700991365526063, "loss": 1.006, "step": 2310 }, { "epoch": 0.02594773544494215, "grad_norm": 0.27238258719444275, "learning_rate": 0.0004698707113161862, "loss": 0.9963, "step": 2320 }, { "epoch": 0.02605957913220483, "grad_norm": 0.2795617878437042, "learning_rate": 0.0004696422860797661, "loss": 0.9876, "step": 2330 }, { "epoch": 0.02617142281946751, "grad_norm": 0.2989327013492584, "learning_rate": 0.000469413860843346, "loss": 0.9864, "step": 2340 }, { "epoch": 0.026283266506730196, "grad_norm": 0.3229614794254303, "learning_rate": 0.00046918543560692586, "loss": 0.9849, "step": 2350 }, { "epoch": 0.026395110193992877, "grad_norm": 0.2921406328678131, "learning_rate": 0.00046895701037050577, "loss": 0.9764, "step": 2360 }, { "epoch": 0.026506953881255558, "grad_norm": 0.2955220639705658, "learning_rate": 0.0004687285851340856, "loss": 0.9883, "step": 2370 }, { "epoch": 0.02661879756851824, "grad_norm": 0.31378960609436035, "learning_rate": 0.0004685001598976655, "loss": 0.9978, "step": 2380 }, { "epoch": 0.02673064125578092, "grad_norm": 0.30504587292671204, "learning_rate": 0.0004682717346612454, "loss": 0.9912, "step": 2390 }, { "epoch": 0.0268424849430436, "grad_norm": 0.3066459000110626, "learning_rate": 0.00046804330942482524, "loss": 0.9877, "step": 2400 }, { "epoch": 0.026954328630306282, "grad_norm": 0.3198714256286621, "learning_rate": 0.0004678148841884051, "loss": 0.98, "step": 2410 }, { "epoch": 0.027066172317568967, "grad_norm": 0.27119094133377075, "learning_rate": 0.00046758645895198506, "loss": 1.001, "step": 2420 }, { "epoch": 0.027178016004831648, "grad_norm": 0.28178098797798157, "learning_rate": 0.0004673580337155649, "loss": 0.9605, "step": 2430 }, { "epoch": 0.02728985969209433, "grad_norm": 0.29373088479042053, "learning_rate": 0.0004671296084791448, "loss": 0.9834, "step": 2440 }, { "epoch": 0.02740170337935701, "grad_norm": 0.2861827313899994, "learning_rate": 0.0004669011832427247, "loss": 0.9797, "step": 2450 }, { "epoch": 0.02751354706661969, "grad_norm": 0.3488409221172333, "learning_rate": 0.00046667275800630454, "loss": 0.9682, "step": 2460 }, { "epoch": 0.027625390753882373, "grad_norm": 0.29631665349006653, "learning_rate": 0.0004664443327698844, "loss": 0.9751, "step": 2470 }, { "epoch": 0.027737234441145054, "grad_norm": 0.27299416065216064, "learning_rate": 0.0004662159075334643, "loss": 0.9571, "step": 2480 }, { "epoch": 0.02784907812840774, "grad_norm": 0.30409684777259827, "learning_rate": 0.00046598748229704416, "loss": 0.968, "step": 2490 }, { "epoch": 0.02796092181567042, "grad_norm": 0.2957991063594818, "learning_rate": 0.00046575905706062407, "loss": 0.9814, "step": 2500 }, { "epoch": 0.0280727655029331, "grad_norm": 0.28328225016593933, "learning_rate": 0.000465530631824204, "loss": 0.9816, "step": 2510 }, { "epoch": 0.028184609190195782, "grad_norm": 0.40670067071914673, "learning_rate": 0.00046530220658778384, "loss": 0.9737, "step": 2520 }, { "epoch": 0.028296452877458463, "grad_norm": 0.2818649411201477, "learning_rate": 0.0004650737813513637, "loss": 0.9891, "step": 2530 }, { "epoch": 0.028408296564721144, "grad_norm": 0.3054118752479553, "learning_rate": 0.0004648453561149436, "loss": 0.9976, "step": 2540 }, { "epoch": 0.02852014025198383, "grad_norm": 0.31439468264579773, "learning_rate": 0.00046461693087852346, "loss": 0.9928, "step": 2550 }, { "epoch": 0.02863198393924651, "grad_norm": 0.3173445761203766, "learning_rate": 0.0004643885056421033, "loss": 1.0002, "step": 2560 }, { "epoch": 0.02874382762650919, "grad_norm": 0.32495757937431335, "learning_rate": 0.0004641600804056832, "loss": 0.9981, "step": 2570 }, { "epoch": 0.028855671313771872, "grad_norm": 0.35957351326942444, "learning_rate": 0.00046393165516926313, "loss": 1.0112, "step": 2580 }, { "epoch": 0.028967515001034554, "grad_norm": 0.3070557713508606, "learning_rate": 0.000463703229932843, "loss": 1.0047, "step": 2590 }, { "epoch": 0.029079358688297235, "grad_norm": 0.3227770924568176, "learning_rate": 0.0004634748046964229, "loss": 1.0115, "step": 2600 }, { "epoch": 0.029191202375559916, "grad_norm": 0.34345880150794983, "learning_rate": 0.00046324637946000276, "loss": 0.9984, "step": 2610 }, { "epoch": 0.0293030460628226, "grad_norm": 0.34459254145622253, "learning_rate": 0.0004630179542235826, "loss": 0.9965, "step": 2620 }, { "epoch": 0.02941488975008528, "grad_norm": 0.3396269679069519, "learning_rate": 0.0004627895289871625, "loss": 0.9986, "step": 2630 }, { "epoch": 0.029526733437347963, "grad_norm": 0.3370846211910248, "learning_rate": 0.0004625611037507424, "loss": 0.9987, "step": 2640 }, { "epoch": 0.029638577124610644, "grad_norm": 0.30689191818237305, "learning_rate": 0.00046233267851432223, "loss": 1.0081, "step": 2650 }, { "epoch": 0.029750420811873325, "grad_norm": 0.35536935925483704, "learning_rate": 0.0004621042532779022, "loss": 0.9948, "step": 2660 }, { "epoch": 0.029862264499136006, "grad_norm": 0.3295105993747711, "learning_rate": 0.00046187582804148205, "loss": 1.0115, "step": 2670 }, { "epoch": 0.029974108186398687, "grad_norm": 0.34881895780563354, "learning_rate": 0.0004616474028050619, "loss": 1.0024, "step": 2680 }, { "epoch": 0.030085951873661372, "grad_norm": 0.379261314868927, "learning_rate": 0.0004614189775686418, "loss": 0.9965, "step": 2690 }, { "epoch": 0.030197795560924053, "grad_norm": 0.34729093313217163, "learning_rate": 0.0004611905523322217, "loss": 1.0026, "step": 2700 }, { "epoch": 0.030309639248186734, "grad_norm": 0.34687525033950806, "learning_rate": 0.00046096212709580153, "loss": 0.9992, "step": 2710 }, { "epoch": 0.030421482935449416, "grad_norm": 0.3564583659172058, "learning_rate": 0.00046073370185938144, "loss": 0.9859, "step": 2720 }, { "epoch": 0.030533326622712097, "grad_norm": 0.3762670159339905, "learning_rate": 0.0004605052766229613, "loss": 1.0059, "step": 2730 }, { "epoch": 0.030645170309974778, "grad_norm": 0.3470481038093567, "learning_rate": 0.0004602768513865412, "loss": 1.0044, "step": 2740 }, { "epoch": 0.030757013997237462, "grad_norm": 0.3322189450263977, "learning_rate": 0.0004600484261501211, "loss": 0.9811, "step": 2750 }, { "epoch": 0.030868857684500144, "grad_norm": 0.3248903751373291, "learning_rate": 0.00045982000091370097, "loss": 0.9721, "step": 2760 }, { "epoch": 0.030980701371762825, "grad_norm": 0.32881951332092285, "learning_rate": 0.0004595915756772808, "loss": 0.9821, "step": 2770 }, { "epoch": 0.031092545059025506, "grad_norm": 0.35410797595977783, "learning_rate": 0.0004593631504408607, "loss": 0.9786, "step": 2780 }, { "epoch": 0.031204388746288187, "grad_norm": 0.3307279050350189, "learning_rate": 0.0004591347252044406, "loss": 0.9759, "step": 2790 }, { "epoch": 0.03131623243355087, "grad_norm": 0.3207128643989563, "learning_rate": 0.00045890629996802045, "loss": 0.9812, "step": 2800 }, { "epoch": 0.03142807612081355, "grad_norm": 0.3065459728240967, "learning_rate": 0.0004586778747316003, "loss": 0.9596, "step": 2810 }, { "epoch": 0.03153991980807623, "grad_norm": 0.3115104138851166, "learning_rate": 0.00045844944949518027, "loss": 0.9732, "step": 2820 }, { "epoch": 0.031651763495338915, "grad_norm": 0.3136879801750183, "learning_rate": 0.0004582210242587601, "loss": 0.9818, "step": 2830 }, { "epoch": 0.03176360718260159, "grad_norm": 0.3240731656551361, "learning_rate": 0.00045799259902234, "loss": 0.9836, "step": 2840 }, { "epoch": 0.03187545086986428, "grad_norm": 0.31390219926834106, "learning_rate": 0.0004577641737859199, "loss": 0.9837, "step": 2850 }, { "epoch": 0.03198729455712696, "grad_norm": 0.3056069612503052, "learning_rate": 0.00045753574854949975, "loss": 0.995, "step": 2860 }, { "epoch": 0.03209913824438964, "grad_norm": 0.29556363821029663, "learning_rate": 0.0004573073233130796, "loss": 1.0018, "step": 2870 }, { "epoch": 0.032210981931652324, "grad_norm": 0.2931666374206543, "learning_rate": 0.0004570788980766595, "loss": 1.0124, "step": 2880 }, { "epoch": 0.032322825618915, "grad_norm": 0.31029924750328064, "learning_rate": 0.0004568504728402394, "loss": 1.0115, "step": 2890 }, { "epoch": 0.03243466930617769, "grad_norm": 0.3164144456386566, "learning_rate": 0.0004566220476038193, "loss": 0.9966, "step": 2900 }, { "epoch": 0.032546512993440364, "grad_norm": 0.31638383865356445, "learning_rate": 0.0004563936223673992, "loss": 0.989, "step": 2910 }, { "epoch": 0.03265835668070305, "grad_norm": 0.28559473156929016, "learning_rate": 0.00045616519713097904, "loss": 1.0038, "step": 2920 }, { "epoch": 0.032770200367965734, "grad_norm": 0.285154789686203, "learning_rate": 0.0004559367718945589, "loss": 1.0009, "step": 2930 }, { "epoch": 0.03288204405522841, "grad_norm": 0.2722555100917816, "learning_rate": 0.0004557083466581388, "loss": 0.9977, "step": 2940 }, { "epoch": 0.032993887742491096, "grad_norm": 0.2854909896850586, "learning_rate": 0.00045547992142171866, "loss": 0.9996, "step": 2950 }, { "epoch": 0.033105731429753774, "grad_norm": 0.2726607620716095, "learning_rate": 0.0004552514961852985, "loss": 0.9925, "step": 2960 }, { "epoch": 0.03321757511701646, "grad_norm": 0.30692654848098755, "learning_rate": 0.0004550230709488785, "loss": 0.9776, "step": 2970 }, { "epoch": 0.03332941880427914, "grad_norm": 0.2921067774295807, "learning_rate": 0.00045479464571245834, "loss": 0.9831, "step": 2980 }, { "epoch": 0.03344126249154182, "grad_norm": 0.30490297079086304, "learning_rate": 0.0004545662204760382, "loss": 0.9835, "step": 2990 }, { "epoch": 0.033553106178804505, "grad_norm": 0.2823980450630188, "learning_rate": 0.0004543377952396181, "loss": 0.9859, "step": 3000 }, { "epoch": 0.03366494986606718, "grad_norm": 0.31844133138656616, "learning_rate": 0.00045410937000319796, "loss": 1.0007, "step": 3010 }, { "epoch": 0.03377679355332987, "grad_norm": 0.30595019459724426, "learning_rate": 0.0004538809447667778, "loss": 1.0069, "step": 3020 }, { "epoch": 0.033888637240592545, "grad_norm": 0.31177419424057007, "learning_rate": 0.0004536525195303577, "loss": 1.0068, "step": 3030 }, { "epoch": 0.03400048092785523, "grad_norm": 0.33921870589256287, "learning_rate": 0.0004534240942939376, "loss": 1.0116, "step": 3040 }, { "epoch": 0.034112324615117914, "grad_norm": 0.29299408197402954, "learning_rate": 0.0004531956690575175, "loss": 1.0014, "step": 3050 }, { "epoch": 0.03422416830238059, "grad_norm": 0.28572002053260803, "learning_rate": 0.0004529672438210974, "loss": 0.9976, "step": 3060 }, { "epoch": 0.03433601198964328, "grad_norm": 0.30842283368110657, "learning_rate": 0.00045273881858467726, "loss": 0.9994, "step": 3070 }, { "epoch": 0.034447855676905954, "grad_norm": 0.29677408933639526, "learning_rate": 0.0004525103933482571, "loss": 1.0055, "step": 3080 }, { "epoch": 0.03455969936416864, "grad_norm": 0.388823926448822, "learning_rate": 0.000452281968111837, "loss": 1.0062, "step": 3090 }, { "epoch": 0.03467154305143132, "grad_norm": 0.2956707775592804, "learning_rate": 0.0004520535428754169, "loss": 0.9794, "step": 3100 }, { "epoch": 0.034783386738694, "grad_norm": 0.3179475665092468, "learning_rate": 0.00045182511763899673, "loss": 0.9831, "step": 3110 }, { "epoch": 0.034895230425956686, "grad_norm": 0.29509803652763367, "learning_rate": 0.00045159669240257664, "loss": 0.9851, "step": 3120 }, { "epoch": 0.035007074113219364, "grad_norm": 0.31095758080482483, "learning_rate": 0.00045136826716615655, "loss": 0.9852, "step": 3130 }, { "epoch": 0.03511891780048205, "grad_norm": 0.27768880128860474, "learning_rate": 0.0004511398419297364, "loss": 0.9741, "step": 3140 }, { "epoch": 0.035230761487744726, "grad_norm": 0.3117106854915619, "learning_rate": 0.0004509114166933163, "loss": 0.9987, "step": 3150 }, { "epoch": 0.03534260517500741, "grad_norm": 0.30113616585731506, "learning_rate": 0.0004506829914568962, "loss": 0.9855, "step": 3160 }, { "epoch": 0.03545444886227009, "grad_norm": 0.2842777967453003, "learning_rate": 0.00045045456622047603, "loss": 0.9793, "step": 3170 }, { "epoch": 0.03556629254953277, "grad_norm": 0.30115559697151184, "learning_rate": 0.00045022614098405594, "loss": 0.9854, "step": 3180 }, { "epoch": 0.03567813623679546, "grad_norm": 0.3350517153739929, "learning_rate": 0.0004499977157476358, "loss": 0.9787, "step": 3190 }, { "epoch": 0.035789979924058135, "grad_norm": 0.2736664414405823, "learning_rate": 0.00044976929051121565, "loss": 1.0067, "step": 3200 }, { "epoch": 0.03590182361132082, "grad_norm": 0.2868112027645111, "learning_rate": 0.0004495408652747956, "loss": 1.0002, "step": 3210 }, { "epoch": 0.0360136672985835, "grad_norm": 0.27296972274780273, "learning_rate": 0.00044931244003837547, "loss": 0.9939, "step": 3220 }, { "epoch": 0.03612551098584618, "grad_norm": 0.2894013226032257, "learning_rate": 0.00044908401480195533, "loss": 1.0017, "step": 3230 }, { "epoch": 0.03623735467310886, "grad_norm": 0.26549386978149414, "learning_rate": 0.0004488555895655352, "loss": 0.9953, "step": 3240 }, { "epoch": 0.036349198360371544, "grad_norm": 0.27381303906440735, "learning_rate": 0.0004486271643291151, "loss": 1.0077, "step": 3250 }, { "epoch": 0.03646104204763423, "grad_norm": 0.2829972505569458, "learning_rate": 0.00044839873909269495, "loss": 1.0008, "step": 3260 }, { "epoch": 0.03657288573489691, "grad_norm": 0.29023584723472595, "learning_rate": 0.0004481703138562748, "loss": 0.9999, "step": 3270 }, { "epoch": 0.03668472942215959, "grad_norm": 0.29526880383491516, "learning_rate": 0.00044794188861985477, "loss": 0.9982, "step": 3280 }, { "epoch": 0.03679657310942227, "grad_norm": 0.27724817395210266, "learning_rate": 0.0004477134633834346, "loss": 1.0109, "step": 3290 }, { "epoch": 0.036908416796684954, "grad_norm": 0.2780180275440216, "learning_rate": 0.0004474850381470145, "loss": 0.997, "step": 3300 }, { "epoch": 0.03702026048394764, "grad_norm": 0.29814234375953674, "learning_rate": 0.0004472566129105944, "loss": 1.0056, "step": 3310 }, { "epoch": 0.037132104171210316, "grad_norm": 0.3131207823753357, "learning_rate": 0.00044702818767417425, "loss": 0.999, "step": 3320 }, { "epoch": 0.037243947858473, "grad_norm": 0.2865641415119171, "learning_rate": 0.0004467997624377541, "loss": 0.9938, "step": 3330 }, { "epoch": 0.03735579154573568, "grad_norm": 0.31247007846832275, "learning_rate": 0.000446571337201334, "loss": 1.0029, "step": 3340 }, { "epoch": 0.03746763523299836, "grad_norm": 0.3432846665382385, "learning_rate": 0.00044634291196491387, "loss": 0.9861, "step": 3350 }, { "epoch": 0.03757947892026104, "grad_norm": 0.3200684189796448, "learning_rate": 0.0004461144867284938, "loss": 0.9958, "step": 3360 }, { "epoch": 0.037691322607523725, "grad_norm": 0.3280775547027588, "learning_rate": 0.0004458860614920737, "loss": 0.9972, "step": 3370 }, { "epoch": 0.03780316629478641, "grad_norm": 0.3129955232143402, "learning_rate": 0.00044565763625565354, "loss": 0.9947, "step": 3380 }, { "epoch": 0.03791500998204909, "grad_norm": 0.27574583888053894, "learning_rate": 0.0004454292110192334, "loss": 1.0004, "step": 3390 }, { "epoch": 0.03802685366931177, "grad_norm": 0.3088320791721344, "learning_rate": 0.0004452007857828133, "loss": 0.9907, "step": 3400 }, { "epoch": 0.03813869735657445, "grad_norm": 0.3232235908508301, "learning_rate": 0.00044497236054639316, "loss": 0.9956, "step": 3410 }, { "epoch": 0.038250541043837134, "grad_norm": 0.3009951114654541, "learning_rate": 0.000444743935309973, "loss": 0.9899, "step": 3420 }, { "epoch": 0.03836238473109981, "grad_norm": 0.2987104058265686, "learning_rate": 0.00044451551007355293, "loss": 0.9852, "step": 3430 }, { "epoch": 0.0384742284183625, "grad_norm": 0.2890870273113251, "learning_rate": 0.00044428708483713284, "loss": 0.9775, "step": 3440 }, { "epoch": 0.03858607210562518, "grad_norm": 0.2704969048500061, "learning_rate": 0.0004440586596007127, "loss": 0.9745, "step": 3450 }, { "epoch": 0.03869791579288786, "grad_norm": 0.3041844964027405, "learning_rate": 0.0004438302343642926, "loss": 0.977, "step": 3460 }, { "epoch": 0.038809759480150544, "grad_norm": 0.2794378995895386, "learning_rate": 0.00044360180912787246, "loss": 0.9818, "step": 3470 }, { "epoch": 0.03892160316741322, "grad_norm": 0.2784910798072815, "learning_rate": 0.0004433733838914523, "loss": 0.9655, "step": 3480 }, { "epoch": 0.039033446854675906, "grad_norm": 0.2610478103160858, "learning_rate": 0.0004431449586550322, "loss": 0.975, "step": 3490 }, { "epoch": 0.039145290541938584, "grad_norm": 0.2646799087524414, "learning_rate": 0.0004429165334186121, "loss": 0.9767, "step": 3500 }, { "epoch": 0.03925713422920127, "grad_norm": 0.2622663676738739, "learning_rate": 0.00044268810818219194, "loss": 0.98, "step": 3510 }, { "epoch": 0.03936897791646395, "grad_norm": 0.26897987723350525, "learning_rate": 0.0004424596829457719, "loss": 0.9718, "step": 3520 }, { "epoch": 0.03948082160372663, "grad_norm": 0.29816752672195435, "learning_rate": 0.00044223125770935176, "loss": 1.0074, "step": 3530 }, { "epoch": 0.039592665290989315, "grad_norm": 0.2652198076248169, "learning_rate": 0.0004420028324729316, "loss": 0.9789, "step": 3540 }, { "epoch": 0.03970450897825199, "grad_norm": 0.2648336887359619, "learning_rate": 0.0004417744072365115, "loss": 0.9794, "step": 3550 }, { "epoch": 0.03981635266551468, "grad_norm": 0.25409677624702454, "learning_rate": 0.0004415459820000914, "loss": 0.9868, "step": 3560 }, { "epoch": 0.039928196352777355, "grad_norm": 0.25675469636917114, "learning_rate": 0.00044131755676367123, "loss": 0.9827, "step": 3570 }, { "epoch": 0.04004004004004004, "grad_norm": 0.2915634214878082, "learning_rate": 0.00044108913152725114, "loss": 0.9833, "step": 3580 }, { "epoch": 0.040151883727302724, "grad_norm": 0.29538393020629883, "learning_rate": 0.000440860706290831, "loss": 0.9848, "step": 3590 }, { "epoch": 0.0402637274145654, "grad_norm": 0.3026215732097626, "learning_rate": 0.0004406322810544109, "loss": 0.9778, "step": 3600 }, { "epoch": 0.04037557110182809, "grad_norm": 0.30865418910980225, "learning_rate": 0.0004404038558179908, "loss": 0.9743, "step": 3610 }, { "epoch": 0.040487414789090764, "grad_norm": 0.28092265129089355, "learning_rate": 0.0004401754305815707, "loss": 0.9795, "step": 3620 }, { "epoch": 0.04059925847635345, "grad_norm": 0.27747923135757446, "learning_rate": 0.00043994700534515053, "loss": 0.9642, "step": 3630 }, { "epoch": 0.040711102163616134, "grad_norm": 0.28192010521888733, "learning_rate": 0.00043971858010873044, "loss": 0.9742, "step": 3640 }, { "epoch": 0.04082294585087881, "grad_norm": 0.2670564651489258, "learning_rate": 0.0004394901548723103, "loss": 0.9544, "step": 3650 }, { "epoch": 0.040934789538141496, "grad_norm": 0.3089617192745209, "learning_rate": 0.00043926172963589015, "loss": 0.9563, "step": 3660 }, { "epoch": 0.041046633225404174, "grad_norm": 0.26768213510513306, "learning_rate": 0.00043903330439947, "loss": 0.9531, "step": 3670 }, { "epoch": 0.04115847691266686, "grad_norm": 0.28865131735801697, "learning_rate": 0.00043880487916305, "loss": 0.9579, "step": 3680 }, { "epoch": 0.041270320599929536, "grad_norm": 0.27369582653045654, "learning_rate": 0.00043857645392662983, "loss": 0.9679, "step": 3690 }, { "epoch": 0.04138216428719222, "grad_norm": 0.2889108955860138, "learning_rate": 0.0004383480286902097, "loss": 0.9561, "step": 3700 }, { "epoch": 0.041494007974454905, "grad_norm": 0.2701929211616516, "learning_rate": 0.0004381196034537896, "loss": 0.9642, "step": 3710 }, { "epoch": 0.04160585166171758, "grad_norm": 0.2817586064338684, "learning_rate": 0.00043789117821736945, "loss": 0.9701, "step": 3720 }, { "epoch": 0.04171769534898027, "grad_norm": 0.2924664318561554, "learning_rate": 0.0004376627529809493, "loss": 0.9617, "step": 3730 }, { "epoch": 0.041829539036242945, "grad_norm": 0.28590497374534607, "learning_rate": 0.0004374343277445292, "loss": 0.9646, "step": 3740 }, { "epoch": 0.04194138272350563, "grad_norm": 0.270046591758728, "learning_rate": 0.0004372059025081091, "loss": 0.95, "step": 3750 }, { "epoch": 0.04205322641076831, "grad_norm": 0.2508755326271057, "learning_rate": 0.000436977477271689, "loss": 0.9525, "step": 3760 }, { "epoch": 0.04216507009803099, "grad_norm": 0.26878127455711365, "learning_rate": 0.0004367490520352689, "loss": 0.9609, "step": 3770 }, { "epoch": 0.04227691378529368, "grad_norm": 0.26882994174957275, "learning_rate": 0.00043652062679884875, "loss": 0.9671, "step": 3780 }, { "epoch": 0.042388757472556354, "grad_norm": 0.28049325942993164, "learning_rate": 0.0004362922015624286, "loss": 0.9492, "step": 3790 }, { "epoch": 0.04250060115981904, "grad_norm": 0.33502647280693054, "learning_rate": 0.0004360637763260085, "loss": 0.9537, "step": 3800 }, { "epoch": 0.04261244484708172, "grad_norm": 0.321997731924057, "learning_rate": 0.00043583535108958837, "loss": 0.9646, "step": 3810 }, { "epoch": 0.0427242885343444, "grad_norm": 0.29477357864379883, "learning_rate": 0.0004356069258531682, "loss": 0.9794, "step": 3820 }, { "epoch": 0.04283613222160708, "grad_norm": 0.2989972233772278, "learning_rate": 0.0004353785006167482, "loss": 0.9645, "step": 3830 }, { "epoch": 0.042947975908869764, "grad_norm": 0.33459851145744324, "learning_rate": 0.00043515007538032804, "loss": 0.9556, "step": 3840 }, { "epoch": 0.04305981959613245, "grad_norm": 0.2941781282424927, "learning_rate": 0.0004349216501439079, "loss": 0.9507, "step": 3850 }, { "epoch": 0.043171663283395126, "grad_norm": 0.27801111340522766, "learning_rate": 0.0004346932249074878, "loss": 0.9623, "step": 3860 }, { "epoch": 0.04328350697065781, "grad_norm": 0.2765832841396332, "learning_rate": 0.00043446479967106767, "loss": 0.9815, "step": 3870 }, { "epoch": 0.04339535065792049, "grad_norm": 0.303786039352417, "learning_rate": 0.0004342363744346475, "loss": 0.9575, "step": 3880 }, { "epoch": 0.04350719434518317, "grad_norm": 0.29517048597335815, "learning_rate": 0.00043400794919822743, "loss": 0.9554, "step": 3890 }, { "epoch": 0.04361903803244585, "grad_norm": 0.28657206892967224, "learning_rate": 0.0004337795239618073, "loss": 0.9631, "step": 3900 }, { "epoch": 0.043730881719708535, "grad_norm": 0.2933245003223419, "learning_rate": 0.0004335510987253872, "loss": 0.987, "step": 3910 }, { "epoch": 0.04384272540697122, "grad_norm": 0.31331002712249756, "learning_rate": 0.0004333226734889671, "loss": 0.971, "step": 3920 }, { "epoch": 0.0439545690942339, "grad_norm": 0.32431700825691223, "learning_rate": 0.00043309424825254696, "loss": 0.9603, "step": 3930 }, { "epoch": 0.04406641278149658, "grad_norm": 0.3346642851829529, "learning_rate": 0.0004328658230161268, "loss": 0.9721, "step": 3940 }, { "epoch": 0.04417825646875926, "grad_norm": 0.33921241760253906, "learning_rate": 0.00043263739777970673, "loss": 0.9639, "step": 3950 }, { "epoch": 0.044290100156021944, "grad_norm": 0.3068247139453888, "learning_rate": 0.0004324089725432866, "loss": 0.9756, "step": 3960 }, { "epoch": 0.04440194384328462, "grad_norm": 0.3049049973487854, "learning_rate": 0.00043218054730686644, "loss": 0.9693, "step": 3970 }, { "epoch": 0.04451378753054731, "grad_norm": 0.30104655027389526, "learning_rate": 0.00043195212207044635, "loss": 0.9704, "step": 3980 }, { "epoch": 0.04462563121780999, "grad_norm": 0.36955609917640686, "learning_rate": 0.00043172369683402626, "loss": 0.9527, "step": 3990 }, { "epoch": 0.04473747490507267, "grad_norm": 0.318854957818985, "learning_rate": 0.0004314952715976061, "loss": 0.9543, "step": 4000 }, { "epoch": 0.044849318592335354, "grad_norm": 0.3166191875934601, "learning_rate": 0.000431266846361186, "loss": 0.968, "step": 4010 }, { "epoch": 0.04496116227959803, "grad_norm": 0.2976950407028198, "learning_rate": 0.0004310384211247659, "loss": 0.9822, "step": 4020 }, { "epoch": 0.045073005966860716, "grad_norm": 0.2912284731864929, "learning_rate": 0.00043080999588834574, "loss": 0.9759, "step": 4030 }, { "epoch": 0.0451848496541234, "grad_norm": 0.31027549505233765, "learning_rate": 0.00043058157065192565, "loss": 0.9794, "step": 4040 }, { "epoch": 0.04529669334138608, "grad_norm": 0.3182738721370697, "learning_rate": 0.0004303531454155055, "loss": 0.9654, "step": 4050 }, { "epoch": 0.04540853702864876, "grad_norm": 0.3006060719490051, "learning_rate": 0.00043012472017908536, "loss": 0.9548, "step": 4060 }, { "epoch": 0.04552038071591144, "grad_norm": 0.2828291654586792, "learning_rate": 0.0004298962949426653, "loss": 0.9611, "step": 4070 }, { "epoch": 0.045632224403174125, "grad_norm": 0.30988603830337524, "learning_rate": 0.0004296678697062452, "loss": 0.9614, "step": 4080 }, { "epoch": 0.0457440680904368, "grad_norm": 0.29344943165779114, "learning_rate": 0.00042943944446982503, "loss": 0.9522, "step": 4090 }, { "epoch": 0.04585591177769949, "grad_norm": 0.29713529348373413, "learning_rate": 0.00042921101923340494, "loss": 0.9468, "step": 4100 }, { "epoch": 0.04596775546496217, "grad_norm": 0.2815961539745331, "learning_rate": 0.0004289825939969848, "loss": 0.9546, "step": 4110 }, { "epoch": 0.04607959915222485, "grad_norm": 0.25218480825424194, "learning_rate": 0.00042875416876056465, "loss": 0.9372, "step": 4120 }, { "epoch": 0.046191442839487534, "grad_norm": 0.2735552191734314, "learning_rate": 0.0004285257435241445, "loss": 0.942, "step": 4130 }, { "epoch": 0.04630328652675021, "grad_norm": 0.27451473474502563, "learning_rate": 0.0004282973182877245, "loss": 0.931, "step": 4140 }, { "epoch": 0.0464151302140129, "grad_norm": 0.24361196160316467, "learning_rate": 0.00042806889305130433, "loss": 0.924, "step": 4150 }, { "epoch": 0.046526973901275574, "grad_norm": 0.25817179679870605, "learning_rate": 0.0004278404678148842, "loss": 0.9373, "step": 4160 }, { "epoch": 0.04663881758853826, "grad_norm": 0.28722450137138367, "learning_rate": 0.0004276120425784641, "loss": 0.9271, "step": 4170 }, { "epoch": 0.046750661275800943, "grad_norm": 0.25202882289886475, "learning_rate": 0.00042738361734204395, "loss": 0.9187, "step": 4180 }, { "epoch": 0.04686250496306362, "grad_norm": 0.2637481391429901, "learning_rate": 0.0004271551921056238, "loss": 0.9402, "step": 4190 }, { "epoch": 0.046974348650326306, "grad_norm": 0.2684090733528137, "learning_rate": 0.0004269267668692037, "loss": 0.9574, "step": 4200 }, { "epoch": 0.047086192337588983, "grad_norm": 0.28711873292922974, "learning_rate": 0.00042669834163278357, "loss": 0.9551, "step": 4210 }, { "epoch": 0.04719803602485167, "grad_norm": 0.2933102250099182, "learning_rate": 0.0004264699163963635, "loss": 0.9457, "step": 4220 }, { "epoch": 0.047309879712114346, "grad_norm": 0.2875578701496124, "learning_rate": 0.0004262414911599434, "loss": 0.9667, "step": 4230 }, { "epoch": 0.04742172339937703, "grad_norm": 0.3007104694843292, "learning_rate": 0.00042601306592352325, "loss": 0.9672, "step": 4240 }, { "epoch": 0.047533567086639715, "grad_norm": 0.30211201310157776, "learning_rate": 0.0004257846406871031, "loss": 0.9781, "step": 4250 }, { "epoch": 0.04764541077390239, "grad_norm": 0.29263827204704285, "learning_rate": 0.000425556215450683, "loss": 0.9923, "step": 4260 }, { "epoch": 0.04775725446116508, "grad_norm": 0.29569676518440247, "learning_rate": 0.00042532779021426287, "loss": 0.9913, "step": 4270 }, { "epoch": 0.047869098148427755, "grad_norm": 0.28223690390586853, "learning_rate": 0.0004250993649778427, "loss": 0.9817, "step": 4280 }, { "epoch": 0.04798094183569044, "grad_norm": 0.271419882774353, "learning_rate": 0.00042487093974142263, "loss": 0.9977, "step": 4290 }, { "epoch": 0.04809278552295312, "grad_norm": 0.26362791657447815, "learning_rate": 0.00042464251450500254, "loss": 0.9859, "step": 4300 }, { "epoch": 0.0482046292102158, "grad_norm": 0.31365934014320374, "learning_rate": 0.0004244140892685824, "loss": 0.9862, "step": 4310 }, { "epoch": 0.04831647289747849, "grad_norm": 0.26915237307548523, "learning_rate": 0.0004241856640321623, "loss": 0.9693, "step": 4320 }, { "epoch": 0.048428316584741164, "grad_norm": 0.2639203369617462, "learning_rate": 0.00042395723879574217, "loss": 0.9691, "step": 4330 }, { "epoch": 0.04854016027200385, "grad_norm": 0.30106601119041443, "learning_rate": 0.000423728813559322, "loss": 0.9521, "step": 4340 }, { "epoch": 0.04865200395926653, "grad_norm": 0.2807524800300598, "learning_rate": 0.00042350038832290193, "loss": 0.9616, "step": 4350 }, { "epoch": 0.04876384764652921, "grad_norm": 0.27363407611846924, "learning_rate": 0.0004232719630864818, "loss": 0.9538, "step": 4360 }, { "epoch": 0.048875691333791896, "grad_norm": 0.29041701555252075, "learning_rate": 0.00042304353785006164, "loss": 0.9455, "step": 4370 }, { "epoch": 0.048987535021054573, "grad_norm": 0.28237226605415344, "learning_rate": 0.0004228151126136416, "loss": 0.9615, "step": 4380 }, { "epoch": 0.04909937870831726, "grad_norm": 0.30885329842567444, "learning_rate": 0.00042258668737722146, "loss": 0.9691, "step": 4390 }, { "epoch": 0.049211222395579936, "grad_norm": 0.2734643220901489, "learning_rate": 0.0004223582621408013, "loss": 0.9663, "step": 4400 }, { "epoch": 0.04932306608284262, "grad_norm": 0.2652278244495392, "learning_rate": 0.00042212983690438123, "loss": 0.9439, "step": 4410 }, { "epoch": 0.0494349097701053, "grad_norm": 0.27749761939048767, "learning_rate": 0.0004219014116679611, "loss": 0.9623, "step": 4420 }, { "epoch": 0.04954675345736798, "grad_norm": 0.2812553942203522, "learning_rate": 0.00042167298643154094, "loss": 0.9557, "step": 4430 }, { "epoch": 0.04965859714463067, "grad_norm": 0.2762252688407898, "learning_rate": 0.00042144456119512085, "loss": 0.945, "step": 4440 }, { "epoch": 0.049770440831893345, "grad_norm": 0.277118980884552, "learning_rate": 0.0004212161359587007, "loss": 0.93, "step": 4450 }, { "epoch": 0.04988228451915603, "grad_norm": 0.2723037004470825, "learning_rate": 0.0004209877107222806, "loss": 0.963, "step": 4460 }, { "epoch": 0.04999412820641871, "grad_norm": 0.29789137840270996, "learning_rate": 0.0004207592854858605, "loss": 0.954, "step": 4470 }, { "epoch": 0.05010597189368139, "grad_norm": 0.26940014958381653, "learning_rate": 0.0004205308602494404, "loss": 0.9443, "step": 4480 }, { "epoch": 0.05021781558094407, "grad_norm": 0.263300359249115, "learning_rate": 0.00042030243501302024, "loss": 0.9403, "step": 4490 }, { "epoch": 0.050329659268206754, "grad_norm": 0.27823972702026367, "learning_rate": 0.00042007400977660015, "loss": 0.95, "step": 4500 }, { "epoch": 0.05044150295546944, "grad_norm": 0.2782444357872009, "learning_rate": 0.00041984558454018, "loss": 0.953, "step": 4510 }, { "epoch": 0.05055334664273212, "grad_norm": 0.277182936668396, "learning_rate": 0.00041961715930375986, "loss": 0.9498, "step": 4520 }, { "epoch": 0.0506651903299948, "grad_norm": 0.2942575514316559, "learning_rate": 0.00041938873406733977, "loss": 0.957, "step": 4530 }, { "epoch": 0.05077703401725748, "grad_norm": 0.3258327543735504, "learning_rate": 0.0004191603088309197, "loss": 0.9626, "step": 4540 }, { "epoch": 0.05088887770452016, "grad_norm": 0.27874353528022766, "learning_rate": 0.00041893188359449953, "loss": 0.971, "step": 4550 }, { "epoch": 0.05100072139178284, "grad_norm": 0.2981313169002533, "learning_rate": 0.00041870345835807944, "loss": 0.965, "step": 4560 }, { "epoch": 0.051112565079045526, "grad_norm": 0.30568984150886536, "learning_rate": 0.0004184750331216593, "loss": 0.9566, "step": 4570 }, { "epoch": 0.05122440876630821, "grad_norm": 0.27867600321769714, "learning_rate": 0.00041824660788523915, "loss": 0.94, "step": 4580 }, { "epoch": 0.05133625245357089, "grad_norm": 0.30877605080604553, "learning_rate": 0.000418018182648819, "loss": 0.9453, "step": 4590 }, { "epoch": 0.05144809614083357, "grad_norm": 0.3018844425678253, "learning_rate": 0.0004177897574123989, "loss": 0.9511, "step": 4600 }, { "epoch": 0.05155993982809625, "grad_norm": 0.27943944931030273, "learning_rate": 0.0004175613321759788, "loss": 0.9371, "step": 4610 }, { "epoch": 0.051671783515358935, "grad_norm": 0.2654775381088257, "learning_rate": 0.0004173329069395587, "loss": 0.9366, "step": 4620 }, { "epoch": 0.05178362720262161, "grad_norm": 0.27594050765037537, "learning_rate": 0.0004171044817031386, "loss": 0.9229, "step": 4630 }, { "epoch": 0.0518954708898843, "grad_norm": 0.26856914162635803, "learning_rate": 0.00041687605646671845, "loss": 0.9357, "step": 4640 }, { "epoch": 0.05200731457714698, "grad_norm": 0.2956237494945526, "learning_rate": 0.0004166476312302983, "loss": 0.9023, "step": 4650 }, { "epoch": 0.05211915826440966, "grad_norm": 0.30004164576530457, "learning_rate": 0.0004164192059938782, "loss": 0.9273, "step": 4660 }, { "epoch": 0.052231001951672344, "grad_norm": 0.2691096365451813, "learning_rate": 0.0004161907807574581, "loss": 0.9332, "step": 4670 }, { "epoch": 0.05234284563893502, "grad_norm": 0.2551780641078949, "learning_rate": 0.00041596235552103793, "loss": 0.9327, "step": 4680 }, { "epoch": 0.052454689326197707, "grad_norm": 0.2806546092033386, "learning_rate": 0.0004157339302846179, "loss": 0.9355, "step": 4690 }, { "epoch": 0.05256653301346039, "grad_norm": 0.27648645639419556, "learning_rate": 0.00041550550504819775, "loss": 0.9348, "step": 4700 }, { "epoch": 0.05267837670072307, "grad_norm": 0.2816336750984192, "learning_rate": 0.0004152770798117776, "loss": 0.9294, "step": 4710 }, { "epoch": 0.05279022038798575, "grad_norm": 0.29570698738098145, "learning_rate": 0.0004150486545753575, "loss": 0.9317, "step": 4720 }, { "epoch": 0.05290206407524843, "grad_norm": 0.26981687545776367, "learning_rate": 0.00041482022933893737, "loss": 0.9317, "step": 4730 }, { "epoch": 0.053013907762511116, "grad_norm": 0.2586159110069275, "learning_rate": 0.0004145918041025172, "loss": 0.9162, "step": 4740 }, { "epoch": 0.05312575144977379, "grad_norm": 0.24129503965377808, "learning_rate": 0.00041436337886609714, "loss": 0.934, "step": 4750 }, { "epoch": 0.05323759513703648, "grad_norm": 0.28072717785835266, "learning_rate": 0.000414134953629677, "loss": 0.9089, "step": 4760 }, { "epoch": 0.05334943882429916, "grad_norm": 0.2760024964809418, "learning_rate": 0.0004139065283932569, "loss": 0.9115, "step": 4770 }, { "epoch": 0.05346128251156184, "grad_norm": 0.28894710540771484, "learning_rate": 0.0004136781031568368, "loss": 0.9108, "step": 4780 }, { "epoch": 0.053573126198824525, "grad_norm": 0.27882319688796997, "learning_rate": 0.00041344967792041667, "loss": 0.9184, "step": 4790 }, { "epoch": 0.0536849698860872, "grad_norm": 0.27242934703826904, "learning_rate": 0.0004132212526839965, "loss": 0.9498, "step": 4800 }, { "epoch": 0.05379681357334989, "grad_norm": 0.2809596359729767, "learning_rate": 0.00041299282744757643, "loss": 0.9365, "step": 4810 }, { "epoch": 0.053908657260612565, "grad_norm": 0.3026556074619293, "learning_rate": 0.0004127644022111563, "loss": 0.9433, "step": 4820 }, { "epoch": 0.05402050094787525, "grad_norm": 0.2933846116065979, "learning_rate": 0.00041253597697473614, "loss": 0.9351, "step": 4830 }, { "epoch": 0.054132344635137934, "grad_norm": 0.2774868309497833, "learning_rate": 0.00041230755173831605, "loss": 0.9285, "step": 4840 }, { "epoch": 0.05424418832240061, "grad_norm": 0.2859903573989868, "learning_rate": 0.00041207912650189596, "loss": 0.9344, "step": 4850 }, { "epoch": 0.054356032009663297, "grad_norm": 0.26687270402908325, "learning_rate": 0.0004118507012654758, "loss": 0.9281, "step": 4860 }, { "epoch": 0.054467875696925974, "grad_norm": 0.31075340509414673, "learning_rate": 0.00041162227602905573, "loss": 0.9418, "step": 4870 }, { "epoch": 0.05457971938418866, "grad_norm": 0.2569184899330139, "learning_rate": 0.0004113938507926356, "loss": 0.9394, "step": 4880 }, { "epoch": 0.054691563071451336, "grad_norm": 0.26250478625297546, "learning_rate": 0.00041116542555621544, "loss": 0.9499, "step": 4890 }, { "epoch": 0.05480340675871402, "grad_norm": 0.27604004740715027, "learning_rate": 0.00041093700031979535, "loss": 0.9268, "step": 4900 }, { "epoch": 0.054915250445976706, "grad_norm": 0.26279163360595703, "learning_rate": 0.0004107085750833752, "loss": 0.9313, "step": 4910 }, { "epoch": 0.05502709413323938, "grad_norm": 0.29265978932380676, "learning_rate": 0.00041048014984695506, "loss": 0.9498, "step": 4920 }, { "epoch": 0.05513893782050207, "grad_norm": 0.32107868790626526, "learning_rate": 0.000410251724610535, "loss": 0.9708, "step": 4930 }, { "epoch": 0.055250781507764746, "grad_norm": 0.32804161310195923, "learning_rate": 0.0004100232993741149, "loss": 0.9624, "step": 4940 }, { "epoch": 0.05536262519502743, "grad_norm": 0.3207037150859833, "learning_rate": 0.00040979487413769474, "loss": 0.9538, "step": 4950 }, { "epoch": 0.05547446888229011, "grad_norm": 0.29660555720329285, "learning_rate": 0.00040956644890127465, "loss": 0.9677, "step": 4960 }, { "epoch": 0.05558631256955279, "grad_norm": 0.34930771589279175, "learning_rate": 0.0004093380236648545, "loss": 0.9777, "step": 4970 }, { "epoch": 0.05569815625681548, "grad_norm": 0.3037464916706085, "learning_rate": 0.00040910959842843436, "loss": 0.9826, "step": 4980 }, { "epoch": 0.055809999944078155, "grad_norm": 0.31435292959213257, "learning_rate": 0.00040888117319201427, "loss": 0.9677, "step": 4990 }, { "epoch": 0.05592184363134084, "grad_norm": 0.29182785749435425, "learning_rate": 0.0004086527479555941, "loss": 0.9563, "step": 5000 }, { "epoch": 0.05603368731860352, "grad_norm": 0.34796231985092163, "learning_rate": 0.00040842432271917403, "loss": 0.957, "step": 5010 }, { "epoch": 0.0561455310058662, "grad_norm": 0.3027050495147705, "learning_rate": 0.00040819589748275394, "loss": 0.967, "step": 5020 }, { "epoch": 0.056257374693128887, "grad_norm": 0.3419332802295685, "learning_rate": 0.0004079674722463338, "loss": 0.9654, "step": 5030 }, { "epoch": 0.056369218380391564, "grad_norm": 0.29381224513053894, "learning_rate": 0.00040773904700991366, "loss": 0.9647, "step": 5040 }, { "epoch": 0.05648106206765425, "grad_norm": 0.29206860065460205, "learning_rate": 0.0004075106217734935, "loss": 0.9637, "step": 5050 }, { "epoch": 0.056592905754916926, "grad_norm": 0.3169795274734497, "learning_rate": 0.0004072821965370734, "loss": 0.963, "step": 5060 }, { "epoch": 0.05670474944217961, "grad_norm": 0.30713772773742676, "learning_rate": 0.0004070537713006533, "loss": 0.9766, "step": 5070 }, { "epoch": 0.05681659312944229, "grad_norm": 0.29805994033813477, "learning_rate": 0.00040682534606423313, "loss": 0.9597, "step": 5080 }, { "epoch": 0.05692843681670497, "grad_norm": 0.33419644832611084, "learning_rate": 0.0004065969208278131, "loss": 0.9598, "step": 5090 }, { "epoch": 0.05704028050396766, "grad_norm": 0.31769025325775146, "learning_rate": 0.00040636849559139295, "loss": 0.942, "step": 5100 }, { "epoch": 0.057152124191230336, "grad_norm": 0.3017726242542267, "learning_rate": 0.0004061400703549728, "loss": 0.9627, "step": 5110 }, { "epoch": 0.05726396787849302, "grad_norm": 0.32213470339775085, "learning_rate": 0.0004059116451185527, "loss": 0.9518, "step": 5120 }, { "epoch": 0.0573758115657557, "grad_norm": 0.29069948196411133, "learning_rate": 0.0004056832198821326, "loss": 0.9337, "step": 5130 }, { "epoch": 0.05748765525301838, "grad_norm": 0.32283100485801697, "learning_rate": 0.00040545479464571243, "loss": 0.959, "step": 5140 }, { "epoch": 0.05759949894028106, "grad_norm": 0.3191847801208496, "learning_rate": 0.00040522636940929234, "loss": 0.9439, "step": 5150 }, { "epoch": 0.057711342627543745, "grad_norm": 0.565864622592926, "learning_rate": 0.00040499794417287225, "loss": 0.9587, "step": 5160 }, { "epoch": 0.05782318631480643, "grad_norm": 0.3419003188610077, "learning_rate": 0.0004047695189364521, "loss": 0.9466, "step": 5170 }, { "epoch": 0.05793503000206911, "grad_norm": 0.28331097960472107, "learning_rate": 0.000404541093700032, "loss": 0.9472, "step": 5180 }, { "epoch": 0.05804687368933179, "grad_norm": 0.2994554042816162, "learning_rate": 0.00040431266846361187, "loss": 0.9434, "step": 5190 }, { "epoch": 0.05815871737659447, "grad_norm": 0.30070000886917114, "learning_rate": 0.0004040842432271917, "loss": 0.9408, "step": 5200 }, { "epoch": 0.058270561063857154, "grad_norm": 0.29924333095550537, "learning_rate": 0.00040385581799077164, "loss": 0.9484, "step": 5210 }, { "epoch": 0.05838240475111983, "grad_norm": 0.2905283272266388, "learning_rate": 0.0004036273927543515, "loss": 0.9636, "step": 5220 }, { "epoch": 0.058494248438382516, "grad_norm": 0.3290540874004364, "learning_rate": 0.00040339896751793135, "loss": 0.9396, "step": 5230 }, { "epoch": 0.0586060921256452, "grad_norm": 0.29686272144317627, "learning_rate": 0.0004031705422815113, "loss": 0.9408, "step": 5240 }, { "epoch": 0.05871793581290788, "grad_norm": 0.2768057882785797, "learning_rate": 0.00040294211704509117, "loss": 0.9328, "step": 5250 }, { "epoch": 0.05882977950017056, "grad_norm": 0.2614899277687073, "learning_rate": 0.000402713691808671, "loss": 0.9483, "step": 5260 }, { "epoch": 0.05894162318743324, "grad_norm": 0.2692766487598419, "learning_rate": 0.00040248526657225093, "loss": 0.9479, "step": 5270 }, { "epoch": 0.059053466874695926, "grad_norm": 0.3009514808654785, "learning_rate": 0.0004022568413358308, "loss": 0.9681, "step": 5280 }, { "epoch": 0.0591653105619586, "grad_norm": 0.27767086029052734, "learning_rate": 0.00040202841609941064, "loss": 0.9685, "step": 5290 }, { "epoch": 0.05927715424922129, "grad_norm": 0.2956901788711548, "learning_rate": 0.00040179999086299055, "loss": 0.9609, "step": 5300 }, { "epoch": 0.05938899793648397, "grad_norm": 0.3046570420265198, "learning_rate": 0.0004015715656265704, "loss": 0.961, "step": 5310 }, { "epoch": 0.05950084162374665, "grad_norm": 0.24477365612983704, "learning_rate": 0.0004013431403901503, "loss": 0.9501, "step": 5320 }, { "epoch": 0.059612685311009335, "grad_norm": 0.25505194067955017, "learning_rate": 0.00040111471515373023, "loss": 0.946, "step": 5330 }, { "epoch": 0.05972452899827201, "grad_norm": 0.26015251874923706, "learning_rate": 0.0004008862899173101, "loss": 0.9372, "step": 5340 }, { "epoch": 0.0598363726855347, "grad_norm": 0.24911250174045563, "learning_rate": 0.00040065786468088994, "loss": 0.9487, "step": 5350 }, { "epoch": 0.059948216372797375, "grad_norm": 0.2779735028743744, "learning_rate": 0.00040042943944446985, "loss": 0.9316, "step": 5360 }, { "epoch": 0.06006006006006006, "grad_norm": 0.30663251876831055, "learning_rate": 0.0004002010142080497, "loss": 0.9461, "step": 5370 }, { "epoch": 0.060171903747322744, "grad_norm": 0.2724740505218506, "learning_rate": 0.00039997258897162956, "loss": 0.9214, "step": 5380 }, { "epoch": 0.06028374743458542, "grad_norm": 0.26819276809692383, "learning_rate": 0.00039974416373520947, "loss": 0.9368, "step": 5390 }, { "epoch": 0.060395591121848106, "grad_norm": 0.26342320442199707, "learning_rate": 0.0003995157384987894, "loss": 0.9332, "step": 5400 }, { "epoch": 0.060507434809110784, "grad_norm": 0.32590556144714355, "learning_rate": 0.00039928731326236924, "loss": 0.9286, "step": 5410 }, { "epoch": 0.06061927849637347, "grad_norm": 0.2747272849082947, "learning_rate": 0.00039905888802594915, "loss": 0.932, "step": 5420 }, { "epoch": 0.06073112218363615, "grad_norm": 0.23089702427387238, "learning_rate": 0.000398830462789529, "loss": 0.9216, "step": 5430 }, { "epoch": 0.06084296587089883, "grad_norm": 0.24383346736431122, "learning_rate": 0.00039860203755310886, "loss": 0.9333, "step": 5440 }, { "epoch": 0.060954809558161516, "grad_norm": 0.23999489843845367, "learning_rate": 0.00039837361231668877, "loss": 0.9134, "step": 5450 }, { "epoch": 0.06106665324542419, "grad_norm": 0.3041435480117798, "learning_rate": 0.0003981451870802686, "loss": 0.9226, "step": 5460 }, { "epoch": 0.06117849693268688, "grad_norm": 0.2667579650878906, "learning_rate": 0.0003979167618438485, "loss": 0.9148, "step": 5470 }, { "epoch": 0.061290340619949556, "grad_norm": 0.2730364203453064, "learning_rate": 0.0003976883366074284, "loss": 0.9073, "step": 5480 }, { "epoch": 0.06140218430721224, "grad_norm": 0.28175118565559387, "learning_rate": 0.0003974599113710083, "loss": 0.9097, "step": 5490 }, { "epoch": 0.061514027994474925, "grad_norm": 0.2826266288757324, "learning_rate": 0.00039723148613458816, "loss": 0.8972, "step": 5500 }, { "epoch": 0.0616258716817376, "grad_norm": 0.25821810960769653, "learning_rate": 0.000397003060898168, "loss": 0.8898, "step": 5510 }, { "epoch": 0.06173771536900029, "grad_norm": 0.31401073932647705, "learning_rate": 0.0003967746356617479, "loss": 0.8986, "step": 5520 }, { "epoch": 0.061849559056262965, "grad_norm": 0.2664715349674225, "learning_rate": 0.0003965462104253278, "loss": 0.9178, "step": 5530 }, { "epoch": 0.06196140274352565, "grad_norm": 0.2725924253463745, "learning_rate": 0.00039631778518890763, "loss": 0.8941, "step": 5540 }, { "epoch": 0.06207324643078833, "grad_norm": 0.2991993725299835, "learning_rate": 0.0003960893599524876, "loss": 0.899, "step": 5550 }, { "epoch": 0.06218509011805101, "grad_norm": 0.2683865427970886, "learning_rate": 0.00039586093471606745, "loss": 0.9105, "step": 5560 }, { "epoch": 0.062296933805313696, "grad_norm": 0.29127469658851624, "learning_rate": 0.0003956325094796473, "loss": 0.9091, "step": 5570 }, { "epoch": 0.062408777492576374, "grad_norm": 0.28191229701042175, "learning_rate": 0.0003954040842432272, "loss": 0.9078, "step": 5580 }, { "epoch": 0.06252062117983906, "grad_norm": 0.28319644927978516, "learning_rate": 0.0003951756590068071, "loss": 0.9134, "step": 5590 }, { "epoch": 0.06263246486710174, "grad_norm": 0.2563108205795288, "learning_rate": 0.00039494723377038693, "loss": 0.9166, "step": 5600 }, { "epoch": 0.06274430855436441, "grad_norm": 0.29730817675590515, "learning_rate": 0.00039471880853396684, "loss": 0.9101, "step": 5610 }, { "epoch": 0.0628561522416271, "grad_norm": 0.25925830006599426, "learning_rate": 0.0003944903832975467, "loss": 0.9131, "step": 5620 }, { "epoch": 0.06296799592888978, "grad_norm": 0.2645208537578583, "learning_rate": 0.0003942619580611266, "loss": 0.9203, "step": 5630 }, { "epoch": 0.06307983961615246, "grad_norm": 0.2844574749469757, "learning_rate": 0.0003940335328247065, "loss": 0.914, "step": 5640 }, { "epoch": 0.06319168330341515, "grad_norm": 0.2687402367591858, "learning_rate": 0.00039380510758828637, "loss": 0.9095, "step": 5650 }, { "epoch": 0.06330352699067783, "grad_norm": 0.22893477976322174, "learning_rate": 0.00039357668235186623, "loss": 0.8993, "step": 5660 }, { "epoch": 0.06341537067794051, "grad_norm": 0.27271768450737, "learning_rate": 0.00039334825711544614, "loss": 0.8989, "step": 5670 }, { "epoch": 0.06352721436520319, "grad_norm": 0.27709853649139404, "learning_rate": 0.000393119831879026, "loss": 0.8998, "step": 5680 }, { "epoch": 0.06363905805246588, "grad_norm": 0.24321520328521729, "learning_rate": 0.00039289140664260585, "loss": 0.887, "step": 5690 }, { "epoch": 0.06375090173972855, "grad_norm": 0.26779887080192566, "learning_rate": 0.00039266298140618576, "loss": 0.9091, "step": 5700 }, { "epoch": 0.06386274542699123, "grad_norm": 0.2612350881099701, "learning_rate": 0.00039243455616976567, "loss": 0.9043, "step": 5710 }, { "epoch": 0.06397458911425392, "grad_norm": 0.26247987151145935, "learning_rate": 0.0003922061309333455, "loss": 0.9024, "step": 5720 }, { "epoch": 0.0640864328015166, "grad_norm": 0.2605653703212738, "learning_rate": 0.00039197770569692543, "loss": 0.9311, "step": 5730 }, { "epoch": 0.06419827648877928, "grad_norm": 0.28249841928482056, "learning_rate": 0.0003917492804605053, "loss": 0.9265, "step": 5740 }, { "epoch": 0.06431012017604196, "grad_norm": 0.2880108654499054, "learning_rate": 0.00039152085522408515, "loss": 0.9331, "step": 5750 }, { "epoch": 0.06442196386330465, "grad_norm": 0.31626009941101074, "learning_rate": 0.00039129242998766506, "loss": 0.9483, "step": 5760 }, { "epoch": 0.06453380755056733, "grad_norm": 0.28972744941711426, "learning_rate": 0.0003910640047512449, "loss": 0.9239, "step": 5770 }, { "epoch": 0.06464565123783, "grad_norm": 0.27140864729881287, "learning_rate": 0.00039083557951482477, "loss": 0.9259, "step": 5780 }, { "epoch": 0.0647574949250927, "grad_norm": 0.26331818103790283, "learning_rate": 0.00039060715427840473, "loss": 0.9383, "step": 5790 }, { "epoch": 0.06486933861235537, "grad_norm": 0.26927000284194946, "learning_rate": 0.0003903787290419846, "loss": 0.9236, "step": 5800 }, { "epoch": 0.06498118229961805, "grad_norm": 0.2833601236343384, "learning_rate": 0.00039015030380556444, "loss": 0.9257, "step": 5810 }, { "epoch": 0.06509302598688073, "grad_norm": 0.2970174551010132, "learning_rate": 0.00038992187856914435, "loss": 0.9164, "step": 5820 }, { "epoch": 0.06520486967414342, "grad_norm": 0.27904263138771057, "learning_rate": 0.0003896934533327242, "loss": 0.9045, "step": 5830 }, { "epoch": 0.0653167133614061, "grad_norm": 0.24879537522792816, "learning_rate": 0.00038946502809630406, "loss": 0.9, "step": 5840 }, { "epoch": 0.06542855704866878, "grad_norm": 0.2897798717021942, "learning_rate": 0.000389236602859884, "loss": 0.919, "step": 5850 }, { "epoch": 0.06554040073593147, "grad_norm": 0.26522865891456604, "learning_rate": 0.00038900817762346383, "loss": 0.9168, "step": 5860 }, { "epoch": 0.06565224442319414, "grad_norm": 0.26405441761016846, "learning_rate": 0.00038877975238704374, "loss": 0.9169, "step": 5870 }, { "epoch": 0.06576408811045682, "grad_norm": 0.2543514370918274, "learning_rate": 0.00038855132715062365, "loss": 0.917, "step": 5880 }, { "epoch": 0.06587593179771951, "grad_norm": 0.2683538794517517, "learning_rate": 0.0003883229019142035, "loss": 0.9179, "step": 5890 }, { "epoch": 0.06598777548498219, "grad_norm": 0.24559274315834045, "learning_rate": 0.00038809447667778336, "loss": 0.907, "step": 5900 }, { "epoch": 0.06609961917224487, "grad_norm": 0.2604455351829529, "learning_rate": 0.00038786605144136327, "loss": 0.9172, "step": 5910 }, { "epoch": 0.06621146285950755, "grad_norm": 0.24329319596290588, "learning_rate": 0.0003876376262049431, "loss": 0.9171, "step": 5920 }, { "epoch": 0.06632330654677024, "grad_norm": 0.237509623169899, "learning_rate": 0.000387409200968523, "loss": 0.9272, "step": 5930 }, { "epoch": 0.06643515023403292, "grad_norm": 0.2569025754928589, "learning_rate": 0.00038718077573210284, "loss": 0.9327, "step": 5940 }, { "epoch": 0.0665469939212956, "grad_norm": 0.2908497750759125, "learning_rate": 0.0003869523504956828, "loss": 0.9299, "step": 5950 }, { "epoch": 0.06665883760855829, "grad_norm": 0.24669544398784637, "learning_rate": 0.00038672392525926266, "loss": 0.9036, "step": 5960 }, { "epoch": 0.06677068129582096, "grad_norm": 0.23906981945037842, "learning_rate": 0.0003864955000228425, "loss": 0.9266, "step": 5970 }, { "epoch": 0.06688252498308364, "grad_norm": 0.2822079658508301, "learning_rate": 0.0003862670747864224, "loss": 0.9209, "step": 5980 }, { "epoch": 0.06699436867034632, "grad_norm": 0.27469775080680847, "learning_rate": 0.0003860386495500023, "loss": 0.9385, "step": 5990 }, { "epoch": 0.06710621235760901, "grad_norm": 0.24559862911701202, "learning_rate": 0.00038581022431358213, "loss": 0.9248, "step": 6000 }, { "epoch": 0.06721805604487169, "grad_norm": 0.24427008628845215, "learning_rate": 0.00038558179907716204, "loss": 0.9358, "step": 6010 }, { "epoch": 0.06732989973213437, "grad_norm": 0.2626965641975403, "learning_rate": 0.00038535337384074195, "loss": 0.9211, "step": 6020 }, { "epoch": 0.06744174341939706, "grad_norm": 0.226990208029747, "learning_rate": 0.0003851249486043218, "loss": 0.9292, "step": 6030 }, { "epoch": 0.06755358710665973, "grad_norm": 0.2762834131717682, "learning_rate": 0.0003848965233679017, "loss": 0.932, "step": 6040 }, { "epoch": 0.06766543079392241, "grad_norm": 0.2799958884716034, "learning_rate": 0.0003846680981314816, "loss": 0.943, "step": 6050 }, { "epoch": 0.06777727448118509, "grad_norm": 0.26224029064178467, "learning_rate": 0.00038443967289506143, "loss": 0.9236, "step": 6060 }, { "epoch": 0.06788911816844778, "grad_norm": 0.2897866368293762, "learning_rate": 0.00038421124765864134, "loss": 0.95, "step": 6070 }, { "epoch": 0.06800096185571046, "grad_norm": 0.2899113893508911, "learning_rate": 0.0003839828224222212, "loss": 0.9403, "step": 6080 }, { "epoch": 0.06811280554297314, "grad_norm": 0.27765581011772156, "learning_rate": 0.00038375439718580105, "loss": 0.9447, "step": 6090 }, { "epoch": 0.06822464923023583, "grad_norm": 0.27683207392692566, "learning_rate": 0.000383525971949381, "loss": 0.949, "step": 6100 }, { "epoch": 0.0683364929174985, "grad_norm": 0.2815559506416321, "learning_rate": 0.00038329754671296087, "loss": 0.9627, "step": 6110 }, { "epoch": 0.06844833660476118, "grad_norm": 0.2741657793521881, "learning_rate": 0.00038306912147654073, "loss": 0.9659, "step": 6120 }, { "epoch": 0.06856018029202386, "grad_norm": 0.4103181064128876, "learning_rate": 0.00038284069624012064, "loss": 0.9612, "step": 6130 }, { "epoch": 0.06867202397928655, "grad_norm": 0.2862701416015625, "learning_rate": 0.0003826122710037005, "loss": 0.9393, "step": 6140 }, { "epoch": 0.06878386766654923, "grad_norm": 0.2789844274520874, "learning_rate": 0.00038238384576728035, "loss": 0.9447, "step": 6150 }, { "epoch": 0.06889571135381191, "grad_norm": 0.590391993522644, "learning_rate": 0.00038215542053086026, "loss": 0.9525, "step": 6160 }, { "epoch": 0.0690075550410746, "grad_norm": 0.2721211016178131, "learning_rate": 0.0003819269952944401, "loss": 0.9467, "step": 6170 }, { "epoch": 0.06911939872833728, "grad_norm": 0.27576929330825806, "learning_rate": 0.00038169857005802, "loss": 0.9428, "step": 6180 }, { "epoch": 0.06923124241559996, "grad_norm": 0.28229111433029175, "learning_rate": 0.00038147014482159993, "loss": 0.9418, "step": 6190 }, { "epoch": 0.06934308610286263, "grad_norm": 0.29595518112182617, "learning_rate": 0.0003812417195851798, "loss": 0.9178, "step": 6200 }, { "epoch": 0.06945492979012532, "grad_norm": 0.3055596351623535, "learning_rate": 0.00038101329434875965, "loss": 0.9464, "step": 6210 }, { "epoch": 0.069566773477388, "grad_norm": 0.29212549328804016, "learning_rate": 0.00038078486911233956, "loss": 0.9491, "step": 6220 }, { "epoch": 0.06967861716465068, "grad_norm": 0.288928359746933, "learning_rate": 0.0003805564438759194, "loss": 0.9285, "step": 6230 }, { "epoch": 0.06979046085191337, "grad_norm": 0.2759207487106323, "learning_rate": 0.00038032801863949927, "loss": 0.9336, "step": 6240 }, { "epoch": 0.06990230453917605, "grad_norm": 0.31041648983955383, "learning_rate": 0.0003800995934030792, "loss": 0.9317, "step": 6250 }, { "epoch": 0.07001414822643873, "grad_norm": 0.29425299167633057, "learning_rate": 0.0003798711681666591, "loss": 0.9212, "step": 6260 }, { "epoch": 0.0701259919137014, "grad_norm": 0.278062105178833, "learning_rate": 0.00037964274293023894, "loss": 0.9291, "step": 6270 }, { "epoch": 0.0702378356009641, "grad_norm": 0.2983698546886444, "learning_rate": 0.00037941431769381885, "loss": 0.9169, "step": 6280 }, { "epoch": 0.07034967928822677, "grad_norm": 0.29595527052879333, "learning_rate": 0.0003791858924573987, "loss": 0.9286, "step": 6290 }, { "epoch": 0.07046152297548945, "grad_norm": 0.26365020871162415, "learning_rate": 0.00037895746722097856, "loss": 0.9312, "step": 6300 }, { "epoch": 0.07057336666275214, "grad_norm": 0.27807778120040894, "learning_rate": 0.0003787290419845585, "loss": 0.9274, "step": 6310 }, { "epoch": 0.07068521035001482, "grad_norm": 0.2585415840148926, "learning_rate": 0.00037850061674813833, "loss": 0.9513, "step": 6320 }, { "epoch": 0.0707970540372775, "grad_norm": 0.2740543484687805, "learning_rate": 0.0003782721915117182, "loss": 0.922, "step": 6330 }, { "epoch": 0.07090889772454018, "grad_norm": 0.28271788358688354, "learning_rate": 0.00037804376627529815, "loss": 0.94, "step": 6340 }, { "epoch": 0.07102074141180287, "grad_norm": 0.28767603635787964, "learning_rate": 0.000377815341038878, "loss": 0.9295, "step": 6350 }, { "epoch": 0.07113258509906555, "grad_norm": 0.25200092792510986, "learning_rate": 0.00037758691580245786, "loss": 0.9219, "step": 6360 }, { "epoch": 0.07124442878632822, "grad_norm": 0.27449852228164673, "learning_rate": 0.00037735849056603777, "loss": 0.9227, "step": 6370 }, { "epoch": 0.07135627247359091, "grad_norm": 0.27951040863990784, "learning_rate": 0.0003771300653296176, "loss": 0.9256, "step": 6380 }, { "epoch": 0.07146811616085359, "grad_norm": 0.27883175015449524, "learning_rate": 0.0003769016400931975, "loss": 0.9244, "step": 6390 }, { "epoch": 0.07157995984811627, "grad_norm": 0.27942216396331787, "learning_rate": 0.00037667321485677734, "loss": 0.9287, "step": 6400 }, { "epoch": 0.07169180353537895, "grad_norm": 0.2605076730251312, "learning_rate": 0.00037644478962035725, "loss": 0.9213, "step": 6410 }, { "epoch": 0.07180364722264164, "grad_norm": 0.25812190771102905, "learning_rate": 0.00037621636438393716, "loss": 0.9268, "step": 6420 }, { "epoch": 0.07191549090990432, "grad_norm": 0.27478551864624023, "learning_rate": 0.000375987939147517, "loss": 0.9341, "step": 6430 }, { "epoch": 0.072027334597167, "grad_norm": 0.2799810469150543, "learning_rate": 0.0003757595139110969, "loss": 0.9308, "step": 6440 }, { "epoch": 0.07213917828442969, "grad_norm": 0.2494313269853592, "learning_rate": 0.0003755310886746768, "loss": 0.9389, "step": 6450 }, { "epoch": 0.07225102197169236, "grad_norm": 0.3362772762775421, "learning_rate": 0.00037530266343825664, "loss": 0.9362, "step": 6460 }, { "epoch": 0.07236286565895504, "grad_norm": 0.28501999378204346, "learning_rate": 0.00037507423820183655, "loss": 0.9262, "step": 6470 }, { "epoch": 0.07247470934621772, "grad_norm": 0.24787545204162598, "learning_rate": 0.0003748458129654164, "loss": 0.9409, "step": 6480 }, { "epoch": 0.07258655303348041, "grad_norm": 0.277665913105011, "learning_rate": 0.0003746173877289963, "loss": 0.9244, "step": 6490 }, { "epoch": 0.07269839672074309, "grad_norm": 0.2613317370414734, "learning_rate": 0.0003743889624925762, "loss": 0.9429, "step": 6500 }, { "epoch": 0.07281024040800577, "grad_norm": 0.2740306258201599, "learning_rate": 0.0003741605372561561, "loss": 0.9422, "step": 6510 }, { "epoch": 0.07292208409526846, "grad_norm": 0.3052440881729126, "learning_rate": 0.00037393211201973593, "loss": 0.9346, "step": 6520 }, { "epoch": 0.07303392778253114, "grad_norm": 0.27979132533073425, "learning_rate": 0.00037370368678331584, "loss": 0.9305, "step": 6530 }, { "epoch": 0.07314577146979381, "grad_norm": 0.2834227979183197, "learning_rate": 0.0003734752615468957, "loss": 0.9305, "step": 6540 }, { "epoch": 0.07325761515705649, "grad_norm": 0.28621387481689453, "learning_rate": 0.00037324683631047555, "loss": 0.9505, "step": 6550 }, { "epoch": 0.07336945884431918, "grad_norm": 0.2539358139038086, "learning_rate": 0.00037301841107405546, "loss": 0.9491, "step": 6560 }, { "epoch": 0.07348130253158186, "grad_norm": 0.29257437586784363, "learning_rate": 0.0003727899858376354, "loss": 0.9428, "step": 6570 }, { "epoch": 0.07359314621884454, "grad_norm": 0.25158485770225525, "learning_rate": 0.00037256156060121523, "loss": 0.9471, "step": 6580 }, { "epoch": 0.07370498990610723, "grad_norm": 0.26301345229148865, "learning_rate": 0.00037233313536479514, "loss": 0.928, "step": 6590 }, { "epoch": 0.07381683359336991, "grad_norm": 0.2519192397594452, "learning_rate": 0.000372104710128375, "loss": 0.9189, "step": 6600 }, { "epoch": 0.07392867728063258, "grad_norm": 0.29801836609840393, "learning_rate": 0.00037187628489195485, "loss": 0.9218, "step": 6610 }, { "epoch": 0.07404052096789528, "grad_norm": 0.30779263377189636, "learning_rate": 0.00037164785965553476, "loss": 0.9263, "step": 6620 }, { "epoch": 0.07415236465515795, "grad_norm": 0.2758638262748718, "learning_rate": 0.0003714194344191146, "loss": 0.904, "step": 6630 }, { "epoch": 0.07426420834242063, "grad_norm": 0.26482871174812317, "learning_rate": 0.00037119100918269447, "loss": 0.9024, "step": 6640 }, { "epoch": 0.07437605202968331, "grad_norm": 0.24001047015190125, "learning_rate": 0.00037096258394627444, "loss": 0.914, "step": 6650 }, { "epoch": 0.074487895716946, "grad_norm": 0.2694549560546875, "learning_rate": 0.0003707341587098543, "loss": 0.921, "step": 6660 }, { "epoch": 0.07459973940420868, "grad_norm": 0.25042393803596497, "learning_rate": 0.00037050573347343415, "loss": 0.9108, "step": 6670 }, { "epoch": 0.07471158309147136, "grad_norm": 0.25945019721984863, "learning_rate": 0.00037027730823701406, "loss": 0.912, "step": 6680 }, { "epoch": 0.07482342677873405, "grad_norm": 0.2624742090702057, "learning_rate": 0.0003700488830005939, "loss": 0.9108, "step": 6690 }, { "epoch": 0.07493527046599673, "grad_norm": 0.27438145875930786, "learning_rate": 0.00036982045776417377, "loss": 0.9215, "step": 6700 }, { "epoch": 0.0750471141532594, "grad_norm": 0.27610865235328674, "learning_rate": 0.0003695920325277537, "loss": 0.9053, "step": 6710 }, { "epoch": 0.07515895784052208, "grad_norm": 0.2616426944732666, "learning_rate": 0.00036936360729133353, "loss": 0.9255, "step": 6720 }, { "epoch": 0.07527080152778477, "grad_norm": 0.3146522641181946, "learning_rate": 0.00036913518205491344, "loss": 0.9105, "step": 6730 }, { "epoch": 0.07538264521504745, "grad_norm": 0.29139819741249084, "learning_rate": 0.00036890675681849335, "loss": 0.9324, "step": 6740 }, { "epoch": 0.07549448890231013, "grad_norm": 0.3176229000091553, "learning_rate": 0.0003686783315820732, "loss": 0.9434, "step": 6750 }, { "epoch": 0.07560633258957282, "grad_norm": 0.2786601781845093, "learning_rate": 0.00036844990634565307, "loss": 0.9405, "step": 6760 }, { "epoch": 0.0757181762768355, "grad_norm": 0.2988050580024719, "learning_rate": 0.000368221481109233, "loss": 0.9477, "step": 6770 }, { "epoch": 0.07583001996409817, "grad_norm": 0.28120875358581543, "learning_rate": 0.00036799305587281283, "loss": 0.9521, "step": 6780 }, { "epoch": 0.07594186365136085, "grad_norm": 0.27376359701156616, "learning_rate": 0.0003677646306363927, "loss": 0.9405, "step": 6790 }, { "epoch": 0.07605370733862354, "grad_norm": 0.2721284329891205, "learning_rate": 0.0003675362053999726, "loss": 0.9392, "step": 6800 }, { "epoch": 0.07616555102588622, "grad_norm": 0.31443721055984497, "learning_rate": 0.0003673077801635525, "loss": 0.939, "step": 6810 }, { "epoch": 0.0762773947131489, "grad_norm": 0.27175766229629517, "learning_rate": 0.00036707935492713236, "loss": 0.9262, "step": 6820 }, { "epoch": 0.07638923840041159, "grad_norm": 0.2984711527824402, "learning_rate": 0.00036685092969071227, "loss": 0.9381, "step": 6830 }, { "epoch": 0.07650108208767427, "grad_norm": 0.2773591876029968, "learning_rate": 0.00036662250445429213, "loss": 0.9217, "step": 6840 }, { "epoch": 0.07661292577493695, "grad_norm": 0.29338565468788147, "learning_rate": 0.000366394079217872, "loss": 0.9197, "step": 6850 }, { "epoch": 0.07672476946219962, "grad_norm": 0.2456415593624115, "learning_rate": 0.00036616565398145184, "loss": 0.9191, "step": 6860 }, { "epoch": 0.07683661314946232, "grad_norm": 0.324935644865036, "learning_rate": 0.00036593722874503175, "loss": 0.8975, "step": 6870 }, { "epoch": 0.076948456836725, "grad_norm": 0.6967706680297852, "learning_rate": 0.0003657088035086116, "loss": 0.9053, "step": 6880 }, { "epoch": 0.07706030052398767, "grad_norm": 0.8192552328109741, "learning_rate": 0.0003654803782721915, "loss": 0.9129, "step": 6890 }, { "epoch": 0.07717214421125036, "grad_norm": 0.4698907136917114, "learning_rate": 0.0003652519530357714, "loss": 0.9128, "step": 6900 }, { "epoch": 0.07728398789851304, "grad_norm": 0.3055092990398407, "learning_rate": 0.0003650235277993513, "loss": 0.9207, "step": 6910 }, { "epoch": 0.07739583158577572, "grad_norm": 0.31879591941833496, "learning_rate": 0.00036479510256293114, "loss": 0.9101, "step": 6920 }, { "epoch": 0.0775076752730384, "grad_norm": 0.2708083987236023, "learning_rate": 0.00036456667732651105, "loss": 0.9036, "step": 6930 }, { "epoch": 0.07761951896030109, "grad_norm": 0.2801443040370941, "learning_rate": 0.0003643382520900909, "loss": 0.9031, "step": 6940 }, { "epoch": 0.07773136264756376, "grad_norm": 0.2481400966644287, "learning_rate": 0.00036410982685367076, "loss": 0.8952, "step": 6950 }, { "epoch": 0.07784320633482644, "grad_norm": 0.25424808263778687, "learning_rate": 0.0003638814016172507, "loss": 0.8846, "step": 6960 }, { "epoch": 0.07795505002208913, "grad_norm": 0.2655096650123596, "learning_rate": 0.0003636529763808306, "loss": 0.8922, "step": 6970 }, { "epoch": 0.07806689370935181, "grad_norm": 0.281180202960968, "learning_rate": 0.00036342455114441043, "loss": 0.8934, "step": 6980 }, { "epoch": 0.07817873739661449, "grad_norm": 0.2850550413131714, "learning_rate": 0.00036319612590799034, "loss": 0.8856, "step": 6990 }, { "epoch": 0.07829058108387717, "grad_norm": 0.24838604032993317, "learning_rate": 0.0003629677006715702, "loss": 0.905, "step": 7000 }, { "epoch": 0.07840242477113986, "grad_norm": 0.2703045606613159, "learning_rate": 0.00036273927543515005, "loss": 0.8816, "step": 7010 }, { "epoch": 0.07851426845840254, "grad_norm": 0.2786656320095062, "learning_rate": 0.00036251085019872996, "loss": 0.8997, "step": 7020 }, { "epoch": 0.07862611214566521, "grad_norm": 0.2771463692188263, "learning_rate": 0.0003622824249623098, "loss": 0.9033, "step": 7030 }, { "epoch": 0.0787379558329279, "grad_norm": 0.2721976339817047, "learning_rate": 0.00036205399972588973, "loss": 0.9109, "step": 7040 }, { "epoch": 0.07884979952019058, "grad_norm": 0.2606031596660614, "learning_rate": 0.00036182557448946964, "loss": 0.9221, "step": 7050 }, { "epoch": 0.07896164320745326, "grad_norm": 0.45895281434059143, "learning_rate": 0.0003615971492530495, "loss": 0.908, "step": 7060 }, { "epoch": 0.07907348689471594, "grad_norm": 0.30524522066116333, "learning_rate": 0.00036136872401662935, "loss": 0.9234, "step": 7070 }, { "epoch": 0.07918533058197863, "grad_norm": 0.2704319953918457, "learning_rate": 0.00036114029878020926, "loss": 0.9003, "step": 7080 }, { "epoch": 0.07929717426924131, "grad_norm": 0.2770727872848511, "learning_rate": 0.0003609118735437891, "loss": 0.9253, "step": 7090 }, { "epoch": 0.07940901795650399, "grad_norm": 0.25288262963294983, "learning_rate": 0.00036068344830736897, "loss": 0.9089, "step": 7100 }, { "epoch": 0.07952086164376668, "grad_norm": 0.27105236053466797, "learning_rate": 0.0003604550230709489, "loss": 0.9138, "step": 7110 }, { "epoch": 0.07963270533102935, "grad_norm": 0.2631518840789795, "learning_rate": 0.0003602265978345288, "loss": 0.9226, "step": 7120 }, { "epoch": 0.07974454901829203, "grad_norm": 0.25269970297813416, "learning_rate": 0.00035999817259810865, "loss": 0.9102, "step": 7130 }, { "epoch": 0.07985639270555471, "grad_norm": 0.2576468586921692, "learning_rate": 0.00035976974736168856, "loss": 0.9075, "step": 7140 }, { "epoch": 0.0799682363928174, "grad_norm": 0.26297688484191895, "learning_rate": 0.0003595413221252684, "loss": 0.9004, "step": 7150 }, { "epoch": 0.08008008008008008, "grad_norm": 0.3029099702835083, "learning_rate": 0.00035931289688884827, "loss": 0.9165, "step": 7160 }, { "epoch": 0.08019192376734276, "grad_norm": 0.2699684798717499, "learning_rate": 0.0003590844716524282, "loss": 0.9232, "step": 7170 }, { "epoch": 0.08030376745460545, "grad_norm": 0.26480093598365784, "learning_rate": 0.00035885604641600804, "loss": 0.9319, "step": 7180 }, { "epoch": 0.08041561114186813, "grad_norm": 0.27503007650375366, "learning_rate": 0.0003586276211795879, "loss": 0.9398, "step": 7190 }, { "epoch": 0.0805274548291308, "grad_norm": 0.2715147137641907, "learning_rate": 0.00035839919594316785, "loss": 0.9307, "step": 7200 }, { "epoch": 0.08063929851639348, "grad_norm": 0.2697315812110901, "learning_rate": 0.0003581707707067477, "loss": 0.9342, "step": 7210 }, { "epoch": 0.08075114220365617, "grad_norm": 0.2833189070224762, "learning_rate": 0.00035794234547032757, "loss": 0.9604, "step": 7220 }, { "epoch": 0.08086298589091885, "grad_norm": 0.3069300353527069, "learning_rate": 0.0003577139202339075, "loss": 0.9397, "step": 7230 }, { "epoch": 0.08097482957818153, "grad_norm": 0.28459593653678894, "learning_rate": 0.00035748549499748733, "loss": 0.925, "step": 7240 }, { "epoch": 0.08108667326544422, "grad_norm": 0.28896769881248474, "learning_rate": 0.0003572570697610672, "loss": 0.9245, "step": 7250 }, { "epoch": 0.0811985169527069, "grad_norm": 0.2574586272239685, "learning_rate": 0.0003570286445246471, "loss": 0.9326, "step": 7260 }, { "epoch": 0.08131036063996958, "grad_norm": 0.2965002954006195, "learning_rate": 0.00035680021928822695, "loss": 0.9221, "step": 7270 }, { "epoch": 0.08142220432723227, "grad_norm": 0.2657724618911743, "learning_rate": 0.00035657179405180686, "loss": 0.9143, "step": 7280 }, { "epoch": 0.08153404801449494, "grad_norm": 0.2973329424858093, "learning_rate": 0.0003563433688153867, "loss": 0.9164, "step": 7290 }, { "epoch": 0.08164589170175762, "grad_norm": 0.3032989501953125, "learning_rate": 0.00035611494357896663, "loss": 0.9254, "step": 7300 }, { "epoch": 0.0817577353890203, "grad_norm": 0.28107839822769165, "learning_rate": 0.0003558865183425465, "loss": 0.9155, "step": 7310 }, { "epoch": 0.08186957907628299, "grad_norm": 0.30296218395233154, "learning_rate": 0.00035565809310612634, "loss": 0.9218, "step": 7320 }, { "epoch": 0.08198142276354567, "grad_norm": 0.28191155195236206, "learning_rate": 0.00035542966786970625, "loss": 0.9197, "step": 7330 }, { "epoch": 0.08209326645080835, "grad_norm": 0.3113023638725281, "learning_rate": 0.0003552012426332861, "loss": 0.9228, "step": 7340 }, { "epoch": 0.08220511013807104, "grad_norm": 0.3066212832927704, "learning_rate": 0.00035497281739686596, "loss": 0.9191, "step": 7350 }, { "epoch": 0.08231695382533372, "grad_norm": 0.2658233940601349, "learning_rate": 0.0003547443921604459, "loss": 0.918, "step": 7360 }, { "epoch": 0.0824287975125964, "grad_norm": 0.28222033381462097, "learning_rate": 0.0003545159669240258, "loss": 0.9253, "step": 7370 }, { "epoch": 0.08254064119985907, "grad_norm": 0.2917843461036682, "learning_rate": 0.00035428754168760564, "loss": 0.9059, "step": 7380 }, { "epoch": 0.08265248488712176, "grad_norm": 0.290404349565506, "learning_rate": 0.00035405911645118555, "loss": 0.9044, "step": 7390 }, { "epoch": 0.08276432857438444, "grad_norm": 0.28990834951400757, "learning_rate": 0.0003538306912147654, "loss": 0.9078, "step": 7400 }, { "epoch": 0.08287617226164712, "grad_norm": 0.27296292781829834, "learning_rate": 0.00035360226597834526, "loss": 0.9081, "step": 7410 }, { "epoch": 0.08298801594890981, "grad_norm": 0.25443321466445923, "learning_rate": 0.00035337384074192517, "loss": 0.9019, "step": 7420 }, { "epoch": 0.08309985963617249, "grad_norm": 0.25014832615852356, "learning_rate": 0.0003531454155055051, "loss": 0.8976, "step": 7430 }, { "epoch": 0.08321170332343517, "grad_norm": 0.2844237983226776, "learning_rate": 0.00035291699026908493, "loss": 0.9039, "step": 7440 }, { "epoch": 0.08332354701069784, "grad_norm": 0.26745542883872986, "learning_rate": 0.00035268856503266484, "loss": 0.8813, "step": 7450 }, { "epoch": 0.08343539069796053, "grad_norm": 0.30750566720962524, "learning_rate": 0.0003524601397962447, "loss": 0.8988, "step": 7460 }, { "epoch": 0.08354723438522321, "grad_norm": 0.2960536777973175, "learning_rate": 0.00035223171455982456, "loss": 0.8966, "step": 7470 }, { "epoch": 0.08365907807248589, "grad_norm": 0.28923213481903076, "learning_rate": 0.00035200328932340447, "loss": 0.8872, "step": 7480 }, { "epoch": 0.08377092175974858, "grad_norm": 0.2762465476989746, "learning_rate": 0.0003517748640869843, "loss": 0.8655, "step": 7490 }, { "epoch": 0.08388276544701126, "grad_norm": 0.2870965301990509, "learning_rate": 0.0003515464388505642, "loss": 0.889, "step": 7500 }, { "epoch": 0.08399460913427394, "grad_norm": 0.3135611116886139, "learning_rate": 0.00035131801361414414, "loss": 0.8898, "step": 7510 }, { "epoch": 0.08410645282153661, "grad_norm": 0.29541128873825073, "learning_rate": 0.000351089588377724, "loss": 0.8884, "step": 7520 }, { "epoch": 0.0842182965087993, "grad_norm": 0.2667001485824585, "learning_rate": 0.00035086116314130385, "loss": 0.8923, "step": 7530 }, { "epoch": 0.08433014019606198, "grad_norm": 0.28677645325660706, "learning_rate": 0.00035063273790488376, "loss": 0.8862, "step": 7540 }, { "epoch": 0.08444198388332466, "grad_norm": 0.26973757147789, "learning_rate": 0.0003504043126684636, "loss": 0.8739, "step": 7550 }, { "epoch": 0.08455382757058735, "grad_norm": 0.2670735716819763, "learning_rate": 0.0003501758874320435, "loss": 0.8843, "step": 7560 }, { "epoch": 0.08466567125785003, "grad_norm": 0.2678844928741455, "learning_rate": 0.0003499474621956234, "loss": 0.8855, "step": 7570 }, { "epoch": 0.08477751494511271, "grad_norm": 0.26894411444664, "learning_rate": 0.00034971903695920324, "loss": 0.8828, "step": 7580 }, { "epoch": 0.08488935863237539, "grad_norm": 0.28703927993774414, "learning_rate": 0.00034949061172278315, "loss": 0.885, "step": 7590 }, { "epoch": 0.08500120231963808, "grad_norm": 0.2618086636066437, "learning_rate": 0.00034926218648636306, "loss": 0.8777, "step": 7600 }, { "epoch": 0.08511304600690076, "grad_norm": 0.28816747665405273, "learning_rate": 0.0003490337612499429, "loss": 0.8836, "step": 7610 }, { "epoch": 0.08522488969416343, "grad_norm": 0.29172763228416443, "learning_rate": 0.00034880533601352277, "loss": 0.8835, "step": 7620 }, { "epoch": 0.08533673338142612, "grad_norm": 0.2613106667995453, "learning_rate": 0.0003485769107771027, "loss": 0.8736, "step": 7630 }, { "epoch": 0.0854485770686888, "grad_norm": 0.2737283408641815, "learning_rate": 0.00034834848554068254, "loss": 0.8589, "step": 7640 }, { "epoch": 0.08556042075595148, "grad_norm": 0.2709786295890808, "learning_rate": 0.0003481200603042624, "loss": 0.8675, "step": 7650 }, { "epoch": 0.08567226444321416, "grad_norm": 0.2982759177684784, "learning_rate": 0.0003478916350678423, "loss": 0.8827, "step": 7660 }, { "epoch": 0.08578410813047685, "grad_norm": 0.21551093459129333, "learning_rate": 0.0003476632098314222, "loss": 0.8663, "step": 7670 }, { "epoch": 0.08589595181773953, "grad_norm": 0.26418018341064453, "learning_rate": 0.00034743478459500207, "loss": 0.8845, "step": 7680 }, { "epoch": 0.0860077955050022, "grad_norm": 0.2310175597667694, "learning_rate": 0.000347206359358582, "loss": 0.8874, "step": 7690 }, { "epoch": 0.0861196391922649, "grad_norm": 0.25112512707710266, "learning_rate": 0.00034697793412216183, "loss": 0.8896, "step": 7700 }, { "epoch": 0.08623148287952757, "grad_norm": 0.33391082286834717, "learning_rate": 0.0003467495088857417, "loss": 0.8765, "step": 7710 }, { "epoch": 0.08634332656679025, "grad_norm": 0.24641484022140503, "learning_rate": 0.0003465210836493216, "loss": 0.8572, "step": 7720 }, { "epoch": 0.08645517025405293, "grad_norm": 0.26017534732818604, "learning_rate": 0.00034629265841290145, "loss": 0.8585, "step": 7730 }, { "epoch": 0.08656701394131562, "grad_norm": 0.23500847816467285, "learning_rate": 0.0003460642331764813, "loss": 0.8797, "step": 7740 }, { "epoch": 0.0866788576285783, "grad_norm": 0.25485488772392273, "learning_rate": 0.0003458358079400612, "loss": 0.8796, "step": 7750 }, { "epoch": 0.08679070131584098, "grad_norm": 0.27644404768943787, "learning_rate": 0.00034560738270364113, "loss": 0.8708, "step": 7760 }, { "epoch": 0.08690254500310367, "grad_norm": 0.233077734708786, "learning_rate": 0.000345378957467221, "loss": 0.8652, "step": 7770 }, { "epoch": 0.08701438869036635, "grad_norm": 0.24039144814014435, "learning_rate": 0.00034515053223080084, "loss": 0.8723, "step": 7780 }, { "epoch": 0.08712623237762902, "grad_norm": 0.23007874190807343, "learning_rate": 0.00034492210699438075, "loss": 0.8644, "step": 7790 }, { "epoch": 0.0872380760648917, "grad_norm": 0.27570798993110657, "learning_rate": 0.0003446936817579606, "loss": 0.872, "step": 7800 }, { "epoch": 0.08734991975215439, "grad_norm": 0.24157382547855377, "learning_rate": 0.00034446525652154046, "loss": 0.8846, "step": 7810 }, { "epoch": 0.08746176343941707, "grad_norm": 0.2703733742237091, "learning_rate": 0.0003442368312851204, "loss": 0.889, "step": 7820 }, { "epoch": 0.08757360712667975, "grad_norm": 0.26786255836486816, "learning_rate": 0.0003440084060487003, "loss": 0.8933, "step": 7830 }, { "epoch": 0.08768545081394244, "grad_norm": 0.2595812976360321, "learning_rate": 0.00034377998081228014, "loss": 0.9156, "step": 7840 }, { "epoch": 0.08779729450120512, "grad_norm": 0.24396800994873047, "learning_rate": 0.00034355155557586005, "loss": 0.8849, "step": 7850 }, { "epoch": 0.0879091381884678, "grad_norm": 0.24363452196121216, "learning_rate": 0.0003433231303394399, "loss": 0.9011, "step": 7860 }, { "epoch": 0.08802098187573047, "grad_norm": 0.2666647434234619, "learning_rate": 0.00034309470510301976, "loss": 0.8952, "step": 7870 }, { "epoch": 0.08813282556299316, "grad_norm": 0.267863005399704, "learning_rate": 0.00034286627986659967, "loss": 0.9113, "step": 7880 }, { "epoch": 0.08824466925025584, "grad_norm": 0.24397262930870056, "learning_rate": 0.0003426378546301795, "loss": 0.8762, "step": 7890 }, { "epoch": 0.08835651293751852, "grad_norm": 0.23912496864795685, "learning_rate": 0.00034240942939375943, "loss": 0.8865, "step": 7900 }, { "epoch": 0.08846835662478121, "grad_norm": 0.2737523913383484, "learning_rate": 0.00034218100415733934, "loss": 0.8732, "step": 7910 }, { "epoch": 0.08858020031204389, "grad_norm": 0.24978673458099365, "learning_rate": 0.0003419525789209192, "loss": 0.8832, "step": 7920 }, { "epoch": 0.08869204399930657, "grad_norm": 0.25200751423835754, "learning_rate": 0.00034172415368449906, "loss": 0.8952, "step": 7930 }, { "epoch": 0.08880388768656924, "grad_norm": 0.7863819003105164, "learning_rate": 0.00034149572844807897, "loss": 0.8708, "step": 7940 }, { "epoch": 0.08891573137383194, "grad_norm": 0.2560253441333771, "learning_rate": 0.0003412673032116588, "loss": 0.8681, "step": 7950 }, { "epoch": 0.08902757506109461, "grad_norm": 0.2669181823730469, "learning_rate": 0.0003410388779752387, "loss": 0.9007, "step": 7960 }, { "epoch": 0.08913941874835729, "grad_norm": 0.27906209230422974, "learning_rate": 0.0003408104527388186, "loss": 0.8988, "step": 7970 }, { "epoch": 0.08925126243561998, "grad_norm": 0.2506297826766968, "learning_rate": 0.0003405820275023985, "loss": 0.8997, "step": 7980 }, { "epoch": 0.08936310612288266, "grad_norm": 0.2513269782066345, "learning_rate": 0.00034035360226597835, "loss": 0.9215, "step": 7990 }, { "epoch": 0.08947494981014534, "grad_norm": 0.2672421634197235, "learning_rate": 0.00034012517702955826, "loss": 0.9112, "step": 8000 }, { "epoch": 0.08958679349740803, "grad_norm": 0.2553747296333313, "learning_rate": 0.0003398967517931381, "loss": 0.9255, "step": 8010 }, { "epoch": 0.08969863718467071, "grad_norm": 0.2325398176908493, "learning_rate": 0.000339668326556718, "loss": 0.9173, "step": 8020 }, { "epoch": 0.08981048087193338, "grad_norm": 0.23461295664310455, "learning_rate": 0.0003394399013202979, "loss": 0.9183, "step": 8030 }, { "epoch": 0.08992232455919606, "grad_norm": 0.26092031598091125, "learning_rate": 0.00033921147608387774, "loss": 0.9106, "step": 8040 }, { "epoch": 0.09003416824645875, "grad_norm": 0.26250872015953064, "learning_rate": 0.0003389830508474576, "loss": 0.8893, "step": 8050 }, { "epoch": 0.09014601193372143, "grad_norm": 0.2501981556415558, "learning_rate": 0.00033875462561103756, "loss": 0.8934, "step": 8060 }, { "epoch": 0.09025785562098411, "grad_norm": 0.26185476779937744, "learning_rate": 0.0003385262003746174, "loss": 0.8855, "step": 8070 }, { "epoch": 0.0903696993082468, "grad_norm": 0.26889827847480774, "learning_rate": 0.00033829777513819727, "loss": 0.8944, "step": 8080 }, { "epoch": 0.09048154299550948, "grad_norm": 0.2473451793193817, "learning_rate": 0.0003380693499017772, "loss": 0.8937, "step": 8090 }, { "epoch": 0.09059338668277216, "grad_norm": 0.24157559871673584, "learning_rate": 0.00033784092466535704, "loss": 0.8903, "step": 8100 }, { "epoch": 0.09070523037003483, "grad_norm": 0.2701563239097595, "learning_rate": 0.0003376124994289369, "loss": 0.9109, "step": 8110 }, { "epoch": 0.09081707405729753, "grad_norm": 0.28706929087638855, "learning_rate": 0.0003373840741925168, "loss": 0.8956, "step": 8120 }, { "epoch": 0.0909289177445602, "grad_norm": 0.27120909094810486, "learning_rate": 0.00033715564895609666, "loss": 0.8947, "step": 8130 }, { "epoch": 0.09104076143182288, "grad_norm": 0.2504216432571411, "learning_rate": 0.00033692722371967657, "loss": 0.8814, "step": 8140 }, { "epoch": 0.09115260511908557, "grad_norm": 0.2921849489212036, "learning_rate": 0.0003366987984832565, "loss": 0.8856, "step": 8150 }, { "epoch": 0.09126444880634825, "grad_norm": 0.2587922513484955, "learning_rate": 0.00033647037324683633, "loss": 0.8778, "step": 8160 }, { "epoch": 0.09137629249361093, "grad_norm": 0.2399989813566208, "learning_rate": 0.0003362419480104162, "loss": 0.883, "step": 8170 }, { "epoch": 0.0914881361808736, "grad_norm": 0.24794407188892365, "learning_rate": 0.0003360135227739961, "loss": 0.8935, "step": 8180 }, { "epoch": 0.0915999798681363, "grad_norm": 0.26669082045555115, "learning_rate": 0.00033578509753757595, "loss": 0.863, "step": 8190 }, { "epoch": 0.09171182355539897, "grad_norm": 0.25162795186042786, "learning_rate": 0.0003355566723011558, "loss": 0.8887, "step": 8200 }, { "epoch": 0.09182366724266165, "grad_norm": 0.28969621658325195, "learning_rate": 0.00033532824706473567, "loss": 0.9066, "step": 8210 }, { "epoch": 0.09193551092992434, "grad_norm": 0.25944870710372925, "learning_rate": 0.00033509982182831563, "loss": 0.8875, "step": 8220 }, { "epoch": 0.09204735461718702, "grad_norm": 0.27627986669540405, "learning_rate": 0.0003348713965918955, "loss": 0.8895, "step": 8230 }, { "epoch": 0.0921591983044497, "grad_norm": 0.2673914134502411, "learning_rate": 0.00033464297135547534, "loss": 0.8937, "step": 8240 }, { "epoch": 0.09227104199171238, "grad_norm": 0.2810732126235962, "learning_rate": 0.00033441454611905525, "loss": 0.9007, "step": 8250 }, { "epoch": 0.09238288567897507, "grad_norm": 0.2671091556549072, "learning_rate": 0.0003341861208826351, "loss": 0.905, "step": 8260 }, { "epoch": 0.09249472936623775, "grad_norm": 0.25006943941116333, "learning_rate": 0.00033395769564621496, "loss": 0.8981, "step": 8270 }, { "epoch": 0.09260657305350042, "grad_norm": 0.2891542613506317, "learning_rate": 0.0003337292704097949, "loss": 0.8978, "step": 8280 }, { "epoch": 0.09271841674076312, "grad_norm": 0.29497236013412476, "learning_rate": 0.0003335008451733748, "loss": 0.9044, "step": 8290 }, { "epoch": 0.0928302604280258, "grad_norm": 0.29290974140167236, "learning_rate": 0.00033327241993695464, "loss": 0.9081, "step": 8300 }, { "epoch": 0.09294210411528847, "grad_norm": 0.27077415585517883, "learning_rate": 0.00033304399470053455, "loss": 0.9184, "step": 8310 }, { "epoch": 0.09305394780255115, "grad_norm": 0.26410186290740967, "learning_rate": 0.0003328155694641144, "loss": 0.8912, "step": 8320 }, { "epoch": 0.09316579148981384, "grad_norm": 0.2818413972854614, "learning_rate": 0.00033258714422769426, "loss": 0.9096, "step": 8330 }, { "epoch": 0.09327763517707652, "grad_norm": 0.265286386013031, "learning_rate": 0.00033235871899127417, "loss": 0.9192, "step": 8340 }, { "epoch": 0.0933894788643392, "grad_norm": 0.2714836597442627, "learning_rate": 0.000332130293754854, "loss": 0.9122, "step": 8350 }, { "epoch": 0.09350132255160189, "grad_norm": 0.2858263850212097, "learning_rate": 0.0003319018685184339, "loss": 0.9143, "step": 8360 }, { "epoch": 0.09361316623886456, "grad_norm": 0.27788257598876953, "learning_rate": 0.00033167344328201385, "loss": 0.9116, "step": 8370 }, { "epoch": 0.09372500992612724, "grad_norm": 0.27748674154281616, "learning_rate": 0.0003314450180455937, "loss": 0.8934, "step": 8380 }, { "epoch": 0.09383685361338992, "grad_norm": 0.4757048785686493, "learning_rate": 0.00033121659280917356, "loss": 0.9097, "step": 8390 }, { "epoch": 0.09394869730065261, "grad_norm": 0.3016970157623291, "learning_rate": 0.00033098816757275347, "loss": 0.8973, "step": 8400 }, { "epoch": 0.09406054098791529, "grad_norm": 0.2640211880207062, "learning_rate": 0.0003307597423363333, "loss": 0.8914, "step": 8410 }, { "epoch": 0.09417238467517797, "grad_norm": 0.2608022391796112, "learning_rate": 0.0003305313170999132, "loss": 0.9138, "step": 8420 }, { "epoch": 0.09428422836244066, "grad_norm": 0.23691967129707336, "learning_rate": 0.0003303028918634931, "loss": 0.9149, "step": 8430 }, { "epoch": 0.09439607204970334, "grad_norm": 0.28734761476516724, "learning_rate": 0.00033007446662707294, "loss": 0.9056, "step": 8440 }, { "epoch": 0.09450791573696601, "grad_norm": 0.2846873700618744, "learning_rate": 0.00032984604139065285, "loss": 0.9052, "step": 8450 }, { "epoch": 0.09461975942422869, "grad_norm": 0.2613682448863983, "learning_rate": 0.00032961761615423276, "loss": 0.9129, "step": 8460 }, { "epoch": 0.09473160311149138, "grad_norm": 0.25336501002311707, "learning_rate": 0.0003293891909178126, "loss": 0.9048, "step": 8470 }, { "epoch": 0.09484344679875406, "grad_norm": 0.2662324905395508, "learning_rate": 0.0003291607656813925, "loss": 0.9181, "step": 8480 }, { "epoch": 0.09495529048601674, "grad_norm": 0.2482605278491974, "learning_rate": 0.0003289323404449724, "loss": 0.8978, "step": 8490 }, { "epoch": 0.09506713417327943, "grad_norm": 0.24181032180786133, "learning_rate": 0.00032870391520855224, "loss": 0.9121, "step": 8500 }, { "epoch": 0.09517897786054211, "grad_norm": 0.276621013879776, "learning_rate": 0.0003284754899721321, "loss": 0.9106, "step": 8510 }, { "epoch": 0.09529082154780479, "grad_norm": 0.2788410186767578, "learning_rate": 0.000328247064735712, "loss": 0.9062, "step": 8520 }, { "epoch": 0.09540266523506746, "grad_norm": 0.28387385606765747, "learning_rate": 0.0003280186394992919, "loss": 0.9309, "step": 8530 }, { "epoch": 0.09551450892233015, "grad_norm": 0.2923261523246765, "learning_rate": 0.00032779021426287177, "loss": 0.9278, "step": 8540 }, { "epoch": 0.09562635260959283, "grad_norm": 0.3008005917072296, "learning_rate": 0.0003275617890264517, "loss": 0.9196, "step": 8550 }, { "epoch": 0.09573819629685551, "grad_norm": 0.2849402129650116, "learning_rate": 0.00032733336379003154, "loss": 0.9243, "step": 8560 }, { "epoch": 0.0958500399841182, "grad_norm": 0.262134313583374, "learning_rate": 0.0003271049385536114, "loss": 0.9346, "step": 8570 }, { "epoch": 0.09596188367138088, "grad_norm": 0.2891925573348999, "learning_rate": 0.0003268765133171913, "loss": 0.9176, "step": 8580 }, { "epoch": 0.09607372735864356, "grad_norm": 0.26165837049484253, "learning_rate": 0.00032664808808077116, "loss": 0.9229, "step": 8590 }, { "epoch": 0.09618557104590623, "grad_norm": 0.2683985233306885, "learning_rate": 0.000326419662844351, "loss": 0.9067, "step": 8600 }, { "epoch": 0.09629741473316893, "grad_norm": 0.25300973653793335, "learning_rate": 0.000326191237607931, "loss": 0.9037, "step": 8610 }, { "epoch": 0.0964092584204316, "grad_norm": 0.30520153045654297, "learning_rate": 0.00032596281237151083, "loss": 0.9038, "step": 8620 }, { "epoch": 0.09652110210769428, "grad_norm": 0.2573854327201843, "learning_rate": 0.0003257343871350907, "loss": 0.9062, "step": 8630 }, { "epoch": 0.09663294579495697, "grad_norm": 0.2664088308811188, "learning_rate": 0.0003255059618986706, "loss": 0.8864, "step": 8640 }, { "epoch": 0.09674478948221965, "grad_norm": 0.26375049352645874, "learning_rate": 0.00032527753666225046, "loss": 0.8804, "step": 8650 }, { "epoch": 0.09685663316948233, "grad_norm": 0.25367647409439087, "learning_rate": 0.0003250491114258303, "loss": 0.8987, "step": 8660 }, { "epoch": 0.09696847685674502, "grad_norm": 0.2764420807361603, "learning_rate": 0.00032482068618941017, "loss": 0.9078, "step": 8670 }, { "epoch": 0.0970803205440077, "grad_norm": 0.2663860023021698, "learning_rate": 0.0003245922609529901, "loss": 0.8838, "step": 8680 }, { "epoch": 0.09719216423127038, "grad_norm": 0.25380998849868774, "learning_rate": 0.00032436383571657, "loss": 0.8949, "step": 8690 }, { "epoch": 0.09730400791853305, "grad_norm": 0.29428210854530334, "learning_rate": 0.00032413541048014984, "loss": 0.883, "step": 8700 }, { "epoch": 0.09741585160579574, "grad_norm": 0.25604331493377686, "learning_rate": 0.00032390698524372975, "loss": 0.8891, "step": 8710 }, { "epoch": 0.09752769529305842, "grad_norm": 0.26663005352020264, "learning_rate": 0.0003236785600073096, "loss": 0.8763, "step": 8720 }, { "epoch": 0.0976395389803211, "grad_norm": 0.27305158972740173, "learning_rate": 0.00032345013477088946, "loss": 0.8877, "step": 8730 }, { "epoch": 0.09775138266758379, "grad_norm": 0.27395525574684143, "learning_rate": 0.0003232217095344694, "loss": 0.871, "step": 8740 }, { "epoch": 0.09786322635484647, "grad_norm": 0.26152902841567993, "learning_rate": 0.00032299328429804923, "loss": 0.8714, "step": 8750 }, { "epoch": 0.09797507004210915, "grad_norm": 0.2872631847858429, "learning_rate": 0.0003227648590616291, "loss": 0.8754, "step": 8760 }, { "epoch": 0.09808691372937182, "grad_norm": 0.2681150436401367, "learning_rate": 0.00032253643382520905, "loss": 0.8699, "step": 8770 }, { "epoch": 0.09819875741663452, "grad_norm": 0.27205002307891846, "learning_rate": 0.0003223080085887889, "loss": 0.8743, "step": 8780 }, { "epoch": 0.0983106011038972, "grad_norm": 0.27747979760169983, "learning_rate": 0.00032207958335236876, "loss": 0.8607, "step": 8790 }, { "epoch": 0.09842244479115987, "grad_norm": 0.2963927984237671, "learning_rate": 0.00032185115811594867, "loss": 0.8676, "step": 8800 }, { "epoch": 0.09853428847842256, "grad_norm": 0.26414602994918823, "learning_rate": 0.0003216227328795285, "loss": 0.8556, "step": 8810 }, { "epoch": 0.09864613216568524, "grad_norm": 0.3005480170249939, "learning_rate": 0.0003213943076431084, "loss": 0.8816, "step": 8820 }, { "epoch": 0.09875797585294792, "grad_norm": 0.29625314474105835, "learning_rate": 0.0003211658824066883, "loss": 0.8747, "step": 8830 }, { "epoch": 0.0988698195402106, "grad_norm": 0.2900589108467102, "learning_rate": 0.0003209374571702682, "loss": 0.8697, "step": 8840 }, { "epoch": 0.09898166322747329, "grad_norm": 0.2951551675796509, "learning_rate": 0.00032070903193384806, "loss": 0.8756, "step": 8850 }, { "epoch": 0.09909350691473597, "grad_norm": 0.3049459159374237, "learning_rate": 0.00032048060669742797, "loss": 0.8767, "step": 8860 }, { "epoch": 0.09920535060199864, "grad_norm": 0.30216872692108154, "learning_rate": 0.0003202521814610078, "loss": 0.8687, "step": 8870 }, { "epoch": 0.09931719428926133, "grad_norm": 0.2913934290409088, "learning_rate": 0.0003200237562245877, "loss": 0.8616, "step": 8880 }, { "epoch": 0.09942903797652401, "grad_norm": 0.26879578828811646, "learning_rate": 0.0003197953309881676, "loss": 0.8681, "step": 8890 }, { "epoch": 0.09954088166378669, "grad_norm": 0.28092971444129944, "learning_rate": 0.00031956690575174744, "loss": 0.8765, "step": 8900 }, { "epoch": 0.09965272535104937, "grad_norm": 0.3074035048484802, "learning_rate": 0.0003193384805153273, "loss": 0.881, "step": 8910 }, { "epoch": 0.09976456903831206, "grad_norm": 0.2945140600204468, "learning_rate": 0.00031911005527890726, "loss": 0.8913, "step": 8920 }, { "epoch": 0.09987641272557474, "grad_norm": 0.2707176208496094, "learning_rate": 0.0003188816300424871, "loss": 0.8822, "step": 8930 }, { "epoch": 0.09998825641283741, "grad_norm": 0.2639947235584259, "learning_rate": 0.000318653204806067, "loss": 0.8892, "step": 8940 }, { "epoch": 0.1001001001001001, "grad_norm": 0.2709505558013916, "learning_rate": 0.0003184247795696469, "loss": 0.8654, "step": 8950 }, { "epoch": 0.10021194378736278, "grad_norm": 0.27803289890289307, "learning_rate": 0.00031819635433322674, "loss": 0.8887, "step": 8960 }, { "epoch": 0.10032378747462546, "grad_norm": 0.25851163268089294, "learning_rate": 0.0003179679290968066, "loss": 0.8662, "step": 8970 }, { "epoch": 0.10043563116188814, "grad_norm": 0.261068731546402, "learning_rate": 0.0003177395038603865, "loss": 0.8641, "step": 8980 }, { "epoch": 0.10054747484915083, "grad_norm": 0.25510483980178833, "learning_rate": 0.00031751107862396636, "loss": 0.8762, "step": 8990 }, { "epoch": 0.10065931853641351, "grad_norm": 0.25765854120254517, "learning_rate": 0.00031728265338754627, "loss": 0.8837, "step": 9000 }, { "epoch": 0.10077116222367619, "grad_norm": 0.24198535084724426, "learning_rate": 0.0003170542281511262, "loss": 0.8791, "step": 9010 }, { "epoch": 0.10088300591093888, "grad_norm": 0.2673517167568207, "learning_rate": 0.00031682580291470604, "loss": 0.8795, "step": 9020 }, { "epoch": 0.10099484959820156, "grad_norm": 0.26392221450805664, "learning_rate": 0.0003165973776782859, "loss": 0.8788, "step": 9030 }, { "epoch": 0.10110669328546423, "grad_norm": 0.2698739171028137, "learning_rate": 0.0003163689524418658, "loss": 0.8959, "step": 9040 }, { "epoch": 0.10121853697272691, "grad_norm": 0.2800233066082001, "learning_rate": 0.00031614052720544566, "loss": 0.8945, "step": 9050 }, { "epoch": 0.1013303806599896, "grad_norm": 0.29603493213653564, "learning_rate": 0.0003159121019690255, "loss": 0.892, "step": 9060 }, { "epoch": 0.10144222434725228, "grad_norm": 0.26462167501449585, "learning_rate": 0.0003156836767326054, "loss": 0.8849, "step": 9070 }, { "epoch": 0.10155406803451496, "grad_norm": 0.27941739559173584, "learning_rate": 0.00031545525149618534, "loss": 0.8782, "step": 9080 }, { "epoch": 0.10166591172177765, "grad_norm": 0.2777186334133148, "learning_rate": 0.0003152268262597652, "loss": 0.8787, "step": 9090 }, { "epoch": 0.10177775540904033, "grad_norm": 0.25893428921699524, "learning_rate": 0.00031499840102334505, "loss": 0.8629, "step": 9100 }, { "epoch": 0.101889599096303, "grad_norm": 0.27407601475715637, "learning_rate": 0.00031476997578692496, "loss": 0.8619, "step": 9110 }, { "epoch": 0.10200144278356568, "grad_norm": 0.2663459777832031, "learning_rate": 0.0003145415505505048, "loss": 0.8474, "step": 9120 }, { "epoch": 0.10211328647082837, "grad_norm": 0.2621177136898041, "learning_rate": 0.00031431312531408467, "loss": 0.8565, "step": 9130 }, { "epoch": 0.10222513015809105, "grad_norm": 0.26687386631965637, "learning_rate": 0.0003140847000776646, "loss": 0.8438, "step": 9140 }, { "epoch": 0.10233697384535373, "grad_norm": 0.24772432446479797, "learning_rate": 0.00031385627484124443, "loss": 0.8511, "step": 9150 }, { "epoch": 0.10244881753261642, "grad_norm": 0.278730183839798, "learning_rate": 0.00031362784960482434, "loss": 0.8499, "step": 9160 }, { "epoch": 0.1025606612198791, "grad_norm": 0.28657999634742737, "learning_rate": 0.00031339942436840425, "loss": 0.85, "step": 9170 }, { "epoch": 0.10267250490714178, "grad_norm": 0.2848927676677704, "learning_rate": 0.0003131709991319841, "loss": 0.8411, "step": 9180 }, { "epoch": 0.10278434859440445, "grad_norm": 0.28381872177124023, "learning_rate": 0.00031294257389556396, "loss": 0.8508, "step": 9190 }, { "epoch": 0.10289619228166715, "grad_norm": 0.26624616980552673, "learning_rate": 0.0003127141486591439, "loss": 0.8658, "step": 9200 }, { "epoch": 0.10300803596892982, "grad_norm": 0.2605401277542114, "learning_rate": 0.00031248572342272373, "loss": 0.8602, "step": 9210 }, { "epoch": 0.1031198796561925, "grad_norm": 0.2819276750087738, "learning_rate": 0.0003122572981863036, "loss": 0.8614, "step": 9220 }, { "epoch": 0.10323172334345519, "grad_norm": 0.27677878737449646, "learning_rate": 0.00031202887294988355, "loss": 0.8556, "step": 9230 }, { "epoch": 0.10334356703071787, "grad_norm": 0.25589799880981445, "learning_rate": 0.0003118004477134634, "loss": 0.8704, "step": 9240 }, { "epoch": 0.10345541071798055, "grad_norm": 0.2731853425502777, "learning_rate": 0.00031157202247704326, "loss": 0.8428, "step": 9250 }, { "epoch": 0.10356725440524323, "grad_norm": 0.3047199547290802, "learning_rate": 0.00031134359724062317, "loss": 0.8508, "step": 9260 }, { "epoch": 0.10367909809250592, "grad_norm": 0.28696686029434204, "learning_rate": 0.00031111517200420303, "loss": 0.8571, "step": 9270 }, { "epoch": 0.1037909417797686, "grad_norm": 0.23354049026966095, "learning_rate": 0.0003108867467677829, "loss": 0.8518, "step": 9280 }, { "epoch": 0.10390278546703127, "grad_norm": 0.27123787999153137, "learning_rate": 0.0003106583215313628, "loss": 0.8621, "step": 9290 }, { "epoch": 0.10401462915429396, "grad_norm": 0.2509523332118988, "learning_rate": 0.00031042989629494265, "loss": 0.8568, "step": 9300 }, { "epoch": 0.10412647284155664, "grad_norm": 0.2359481155872345, "learning_rate": 0.00031020147105852256, "loss": 0.8598, "step": 9310 }, { "epoch": 0.10423831652881932, "grad_norm": 0.27097463607788086, "learning_rate": 0.00030997304582210247, "loss": 0.8615, "step": 9320 }, { "epoch": 0.104350160216082, "grad_norm": 0.2616114020347595, "learning_rate": 0.0003097446205856823, "loss": 0.8462, "step": 9330 }, { "epoch": 0.10446200390334469, "grad_norm": 0.30027398467063904, "learning_rate": 0.0003095161953492622, "loss": 0.8683, "step": 9340 }, { "epoch": 0.10457384759060737, "grad_norm": 0.28468623757362366, "learning_rate": 0.0003092877701128421, "loss": 0.856, "step": 9350 }, { "epoch": 0.10468569127787004, "grad_norm": 0.318521112203598, "learning_rate": 0.00030905934487642195, "loss": 0.8532, "step": 9360 }, { "epoch": 0.10479753496513274, "grad_norm": 0.3118298351764679, "learning_rate": 0.0003088309196400018, "loss": 0.8546, "step": 9370 }, { "epoch": 0.10490937865239541, "grad_norm": 0.28549399971961975, "learning_rate": 0.0003086024944035817, "loss": 0.8718, "step": 9380 }, { "epoch": 0.10502122233965809, "grad_norm": 0.24803526699543, "learning_rate": 0.0003083740691671616, "loss": 0.8489, "step": 9390 }, { "epoch": 0.10513306602692078, "grad_norm": 0.26765918731689453, "learning_rate": 0.0003081456439307415, "loss": 0.8617, "step": 9400 }, { "epoch": 0.10524490971418346, "grad_norm": 0.26363757252693176, "learning_rate": 0.0003079172186943214, "loss": 0.8648, "step": 9410 }, { "epoch": 0.10535675340144614, "grad_norm": 0.2734963595867157, "learning_rate": 0.00030768879345790124, "loss": 0.8556, "step": 9420 }, { "epoch": 0.10546859708870882, "grad_norm": 0.2773530185222626, "learning_rate": 0.0003074603682214811, "loss": 0.8737, "step": 9430 }, { "epoch": 0.1055804407759715, "grad_norm": 0.2684498429298401, "learning_rate": 0.000307231942985061, "loss": 0.8657, "step": 9440 }, { "epoch": 0.10569228446323418, "grad_norm": 0.26110732555389404, "learning_rate": 0.00030700351774864086, "loss": 0.8618, "step": 9450 }, { "epoch": 0.10580412815049686, "grad_norm": 0.27595090866088867, "learning_rate": 0.0003067750925122207, "loss": 0.8654, "step": 9460 }, { "epoch": 0.10591597183775955, "grad_norm": 0.2799736559391022, "learning_rate": 0.0003065466672758007, "loss": 0.8583, "step": 9470 }, { "epoch": 0.10602781552502223, "grad_norm": 0.2729387879371643, "learning_rate": 0.00030631824203938054, "loss": 0.8628, "step": 9480 }, { "epoch": 0.10613965921228491, "grad_norm": 0.30332332849502563, "learning_rate": 0.0003060898168029604, "loss": 0.8512, "step": 9490 }, { "epoch": 0.10625150289954759, "grad_norm": 0.276753306388855, "learning_rate": 0.0003058613915665403, "loss": 0.85, "step": 9500 }, { "epoch": 0.10636334658681028, "grad_norm": 0.3190478980541229, "learning_rate": 0.00030563296633012016, "loss": 0.8534, "step": 9510 }, { "epoch": 0.10647519027407296, "grad_norm": 0.2926968038082123, "learning_rate": 0.0003054045410937, "loss": 0.8309, "step": 9520 }, { "epoch": 0.10658703396133563, "grad_norm": 0.29631507396698, "learning_rate": 0.0003051761158572799, "loss": 0.8406, "step": 9530 }, { "epoch": 0.10669887764859833, "grad_norm": 0.2881840765476227, "learning_rate": 0.0003049476906208598, "loss": 0.8274, "step": 9540 }, { "epoch": 0.106810721335861, "grad_norm": 0.2623940408229828, "learning_rate": 0.0003047192653844397, "loss": 0.8346, "step": 9550 }, { "epoch": 0.10692256502312368, "grad_norm": 0.29798468947410583, "learning_rate": 0.00030449084014801955, "loss": 0.8362, "step": 9560 }, { "epoch": 0.10703440871038636, "grad_norm": 0.2976382076740265, "learning_rate": 0.00030426241491159946, "loss": 0.8179, "step": 9570 }, { "epoch": 0.10714625239764905, "grad_norm": 0.28637486696243286, "learning_rate": 0.0003040339896751793, "loss": 0.8363, "step": 9580 }, { "epoch": 0.10725809608491173, "grad_norm": 0.3023325204849243, "learning_rate": 0.00030380556443875917, "loss": 0.8382, "step": 9590 }, { "epoch": 0.1073699397721744, "grad_norm": 0.2889160215854645, "learning_rate": 0.0003035771392023391, "loss": 0.8476, "step": 9600 }, { "epoch": 0.1074817834594371, "grad_norm": 0.2868768572807312, "learning_rate": 0.00030334871396591893, "loss": 0.8482, "step": 9610 }, { "epoch": 0.10759362714669977, "grad_norm": 0.2773813307285309, "learning_rate": 0.0003031202887294988, "loss": 0.8577, "step": 9620 }, { "epoch": 0.10770547083396245, "grad_norm": 0.28698423504829407, "learning_rate": 0.00030289186349307875, "loss": 0.8663, "step": 9630 }, { "epoch": 0.10781731452122513, "grad_norm": 0.26839759945869446, "learning_rate": 0.0003026634382566586, "loss": 0.8649, "step": 9640 }, { "epoch": 0.10792915820848782, "grad_norm": 0.2686857283115387, "learning_rate": 0.00030243501302023847, "loss": 0.8563, "step": 9650 }, { "epoch": 0.1080410018957505, "grad_norm": 0.2815250754356384, "learning_rate": 0.0003022065877838184, "loss": 0.8538, "step": 9660 }, { "epoch": 0.10815284558301318, "grad_norm": 0.24625800549983978, "learning_rate": 0.00030197816254739823, "loss": 0.87, "step": 9670 }, { "epoch": 0.10826468927027587, "grad_norm": 0.27051877975463867, "learning_rate": 0.0003017497373109781, "loss": 0.8692, "step": 9680 }, { "epoch": 0.10837653295753855, "grad_norm": 0.253892183303833, "learning_rate": 0.000301521312074558, "loss": 0.8583, "step": 9690 }, { "epoch": 0.10848837664480122, "grad_norm": 0.26951879262924194, "learning_rate": 0.0003012928868381379, "loss": 0.8699, "step": 9700 }, { "epoch": 0.1086002203320639, "grad_norm": 0.27741488814353943, "learning_rate": 0.00030106446160171776, "loss": 0.8673, "step": 9710 }, { "epoch": 0.10871206401932659, "grad_norm": 0.2655075788497925, "learning_rate": 0.00030083603636529767, "loss": 0.8628, "step": 9720 }, { "epoch": 0.10882390770658927, "grad_norm": 0.298532098531723, "learning_rate": 0.00030060761112887753, "loss": 0.8707, "step": 9730 }, { "epoch": 0.10893575139385195, "grad_norm": 0.3105684816837311, "learning_rate": 0.0003003791858924574, "loss": 0.8661, "step": 9740 }, { "epoch": 0.10904759508111464, "grad_norm": 0.27781355381011963, "learning_rate": 0.0003001507606560373, "loss": 0.8871, "step": 9750 }, { "epoch": 0.10915943876837732, "grad_norm": 0.2966761589050293, "learning_rate": 0.00029992233541961715, "loss": 0.875, "step": 9760 }, { "epoch": 0.10927128245564, "grad_norm": 0.3010736405849457, "learning_rate": 0.000299693910183197, "loss": 0.8746, "step": 9770 }, { "epoch": 0.10938312614290267, "grad_norm": 0.31352171301841736, "learning_rate": 0.00029946548494677697, "loss": 0.8733, "step": 9780 }, { "epoch": 0.10949496983016536, "grad_norm": 0.30627313256263733, "learning_rate": 0.0002992370597103568, "loss": 0.8675, "step": 9790 }, { "epoch": 0.10960681351742804, "grad_norm": 0.23990577459335327, "learning_rate": 0.0002990086344739367, "loss": 0.8614, "step": 9800 }, { "epoch": 0.10971865720469072, "grad_norm": 0.2856599688529968, "learning_rate": 0.0002987802092375166, "loss": 0.8454, "step": 9810 }, { "epoch": 0.10983050089195341, "grad_norm": 0.26476389169692993, "learning_rate": 0.00029855178400109645, "loss": 0.8616, "step": 9820 }, { "epoch": 0.10994234457921609, "grad_norm": 0.2871752381324768, "learning_rate": 0.0002983233587646763, "loss": 0.8444, "step": 9830 }, { "epoch": 0.11005418826647877, "grad_norm": 0.27318039536476135, "learning_rate": 0.0002980949335282562, "loss": 0.8487, "step": 9840 }, { "epoch": 0.11016603195374144, "grad_norm": 0.25630125403404236, "learning_rate": 0.00029786650829183607, "loss": 0.846, "step": 9850 }, { "epoch": 0.11027787564100414, "grad_norm": 0.23908184468746185, "learning_rate": 0.000297638083055416, "loss": 0.8403, "step": 9860 }, { "epoch": 0.11038971932826681, "grad_norm": 0.2978418469429016, "learning_rate": 0.0002974096578189959, "loss": 0.8652, "step": 9870 }, { "epoch": 0.11050156301552949, "grad_norm": 0.2503781318664551, "learning_rate": 0.00029718123258257574, "loss": 0.8657, "step": 9880 }, { "epoch": 0.11061340670279218, "grad_norm": 0.28556469082832336, "learning_rate": 0.0002969528073461556, "loss": 0.8501, "step": 9890 }, { "epoch": 0.11072525039005486, "grad_norm": 0.2643977701663971, "learning_rate": 0.0002967243821097355, "loss": 0.8742, "step": 9900 }, { "epoch": 0.11083709407731754, "grad_norm": 0.2757241725921631, "learning_rate": 0.00029649595687331536, "loss": 0.8837, "step": 9910 }, { "epoch": 0.11094893776458022, "grad_norm": 0.28263452649116516, "learning_rate": 0.0002962675316368952, "loss": 0.8793, "step": 9920 }, { "epoch": 0.11106078145184291, "grad_norm": 0.27624276280403137, "learning_rate": 0.00029603910640047513, "loss": 0.8669, "step": 9930 }, { "epoch": 0.11117262513910559, "grad_norm": 0.2814600467681885, "learning_rate": 0.00029581068116405504, "loss": 0.8858, "step": 9940 }, { "epoch": 0.11128446882636826, "grad_norm": 0.2871972918510437, "learning_rate": 0.0002955822559276349, "loss": 0.8714, "step": 9950 }, { "epoch": 0.11139631251363095, "grad_norm": 0.2885976731777191, "learning_rate": 0.0002953538306912148, "loss": 0.8675, "step": 9960 }, { "epoch": 0.11150815620089363, "grad_norm": 0.281021386384964, "learning_rate": 0.00029512540545479466, "loss": 0.8762, "step": 9970 }, { "epoch": 0.11161999988815631, "grad_norm": 0.2923888862133026, "learning_rate": 0.0002948969802183745, "loss": 0.87, "step": 9980 }, { "epoch": 0.11173184357541899, "grad_norm": 0.2596036195755005, "learning_rate": 0.00029466855498195443, "loss": 0.8696, "step": 9990 }, { "epoch": 0.11184368726268168, "grad_norm": 0.2749873697757721, "learning_rate": 0.0002944401297455343, "loss": 0.8604, "step": 10000 }, { "epoch": 0.11195553094994436, "grad_norm": 0.2696766257286072, "learning_rate": 0.00029421170450911414, "loss": 0.8743, "step": 10010 }, { "epoch": 0.11206737463720703, "grad_norm": 0.2824450731277466, "learning_rate": 0.00029398327927269405, "loss": 0.8734, "step": 10020 }, { "epoch": 0.11217921832446973, "grad_norm": 0.2795054614543915, "learning_rate": 0.00029375485403627396, "loss": 0.865, "step": 10030 }, { "epoch": 0.1122910620117324, "grad_norm": 0.2974453866481781, "learning_rate": 0.0002935264287998538, "loss": 0.8762, "step": 10040 }, { "epoch": 0.11240290569899508, "grad_norm": 0.27134743332862854, "learning_rate": 0.00029329800356343367, "loss": 0.8616, "step": 10050 }, { "epoch": 0.11251474938625777, "grad_norm": 0.2651810348033905, "learning_rate": 0.0002930695783270136, "loss": 0.8653, "step": 10060 }, { "epoch": 0.11262659307352045, "grad_norm": 0.29161420464515686, "learning_rate": 0.00029284115309059344, "loss": 0.8583, "step": 10070 }, { "epoch": 0.11273843676078313, "grad_norm": 0.27624139189720154, "learning_rate": 0.0002926127278541733, "loss": 0.8447, "step": 10080 }, { "epoch": 0.1128502804480458, "grad_norm": 0.290632039308548, "learning_rate": 0.00029238430261775326, "loss": 0.8568, "step": 10090 }, { "epoch": 0.1129621241353085, "grad_norm": 0.2906644940376282, "learning_rate": 0.0002921558773813331, "loss": 0.8566, "step": 10100 }, { "epoch": 0.11307396782257118, "grad_norm": 0.29284584522247314, "learning_rate": 0.00029192745214491297, "loss": 0.8679, "step": 10110 }, { "epoch": 0.11318581150983385, "grad_norm": 0.29635393619537354, "learning_rate": 0.0002916990269084929, "loss": 0.8648, "step": 10120 }, { "epoch": 0.11329765519709654, "grad_norm": 0.2560585141181946, "learning_rate": 0.00029147060167207273, "loss": 0.8565, "step": 10130 }, { "epoch": 0.11340949888435922, "grad_norm": 0.2480679154396057, "learning_rate": 0.0002912421764356526, "loss": 0.8574, "step": 10140 }, { "epoch": 0.1135213425716219, "grad_norm": 0.28708118200302124, "learning_rate": 0.0002910137511992325, "loss": 0.8658, "step": 10150 }, { "epoch": 0.11363318625888458, "grad_norm": 0.2553873062133789, "learning_rate": 0.00029078532596281235, "loss": 0.8721, "step": 10160 }, { "epoch": 0.11374502994614727, "grad_norm": 0.26742488145828247, "learning_rate": 0.00029055690072639226, "loss": 0.8608, "step": 10170 }, { "epoch": 0.11385687363340995, "grad_norm": 0.2674279510974884, "learning_rate": 0.0002903284754899722, "loss": 0.8763, "step": 10180 }, { "epoch": 0.11396871732067262, "grad_norm": 0.2484348863363266, "learning_rate": 0.00029010005025355203, "loss": 0.8799, "step": 10190 }, { "epoch": 0.11408056100793532, "grad_norm": 0.2603932321071625, "learning_rate": 0.0002898716250171319, "loss": 0.8922, "step": 10200 }, { "epoch": 0.114192404695198, "grad_norm": 0.2510204613208771, "learning_rate": 0.0002896431997807118, "loss": 0.8851, "step": 10210 }, { "epoch": 0.11430424838246067, "grad_norm": 0.26795732975006104, "learning_rate": 0.00028941477454429165, "loss": 0.8917, "step": 10220 }, { "epoch": 0.11441609206972335, "grad_norm": 0.2880701422691345, "learning_rate": 0.0002891863493078715, "loss": 0.8903, "step": 10230 }, { "epoch": 0.11452793575698604, "grad_norm": 0.23970642685890198, "learning_rate": 0.0002889579240714514, "loss": 0.8882, "step": 10240 }, { "epoch": 0.11463977944424872, "grad_norm": 0.2786742150783539, "learning_rate": 0.0002887294988350313, "loss": 0.8827, "step": 10250 }, { "epoch": 0.1147516231315114, "grad_norm": 0.2780776619911194, "learning_rate": 0.0002885010735986112, "loss": 0.8879, "step": 10260 }, { "epoch": 0.11486346681877409, "grad_norm": 0.26984742283821106, "learning_rate": 0.0002882726483621911, "loss": 0.8732, "step": 10270 }, { "epoch": 0.11497531050603677, "grad_norm": 0.26902884244918823, "learning_rate": 0.00028804422312577095, "loss": 0.878, "step": 10280 }, { "epoch": 0.11508715419329944, "grad_norm": 0.24787285923957825, "learning_rate": 0.0002878157978893508, "loss": 0.8573, "step": 10290 }, { "epoch": 0.11519899788056212, "grad_norm": 0.22702965140342712, "learning_rate": 0.0002875873726529307, "loss": 0.8621, "step": 10300 }, { "epoch": 0.11531084156782481, "grad_norm": 0.27474096417427063, "learning_rate": 0.00028735894741651057, "loss": 0.8763, "step": 10310 }, { "epoch": 0.11542268525508749, "grad_norm": 0.2605912983417511, "learning_rate": 0.0002871305221800904, "loss": 0.8706, "step": 10320 }, { "epoch": 0.11553452894235017, "grad_norm": 0.25281742215156555, "learning_rate": 0.0002869020969436704, "loss": 0.855, "step": 10330 }, { "epoch": 0.11564637262961286, "grad_norm": 0.2559000849723816, "learning_rate": 0.00028667367170725024, "loss": 0.8549, "step": 10340 }, { "epoch": 0.11575821631687554, "grad_norm": 0.2439345121383667, "learning_rate": 0.0002864452464708301, "loss": 0.8639, "step": 10350 }, { "epoch": 0.11587006000413821, "grad_norm": 0.2690776288509369, "learning_rate": 0.00028621682123441, "loss": 0.8487, "step": 10360 }, { "epoch": 0.11598190369140089, "grad_norm": 0.25111067295074463, "learning_rate": 0.00028598839599798987, "loss": 0.8558, "step": 10370 }, { "epoch": 0.11609374737866358, "grad_norm": 0.26838451623916626, "learning_rate": 0.0002857599707615697, "loss": 0.8603, "step": 10380 }, { "epoch": 0.11620559106592626, "grad_norm": 0.2401856780052185, "learning_rate": 0.00028553154552514963, "loss": 0.8286, "step": 10390 }, { "epoch": 0.11631743475318894, "grad_norm": 0.26284924149513245, "learning_rate": 0.0002853031202887295, "loss": 0.8402, "step": 10400 }, { "epoch": 0.11642927844045163, "grad_norm": 0.28734955191612244, "learning_rate": 0.0002850746950523094, "loss": 0.8358, "step": 10410 }, { "epoch": 0.11654112212771431, "grad_norm": 0.2564549446105957, "learning_rate": 0.0002848462698158893, "loss": 0.8458, "step": 10420 }, { "epoch": 0.11665296581497699, "grad_norm": 0.2507050633430481, "learning_rate": 0.00028461784457946916, "loss": 0.8371, "step": 10430 }, { "epoch": 0.11676480950223966, "grad_norm": 0.25748834013938904, "learning_rate": 0.000284389419343049, "loss": 0.8527, "step": 10440 }, { "epoch": 0.11687665318950236, "grad_norm": 0.24484454095363617, "learning_rate": 0.00028416099410662893, "loss": 0.8372, "step": 10450 }, { "epoch": 0.11698849687676503, "grad_norm": 0.24171967804431915, "learning_rate": 0.0002839325688702088, "loss": 0.8327, "step": 10460 }, { "epoch": 0.11710034056402771, "grad_norm": 0.30423420667648315, "learning_rate": 0.00028370414363378864, "loss": 0.8271, "step": 10470 }, { "epoch": 0.1172121842512904, "grad_norm": 0.2598424553871155, "learning_rate": 0.0002834757183973685, "loss": 0.8169, "step": 10480 }, { "epoch": 0.11732402793855308, "grad_norm": 0.2608656585216522, "learning_rate": 0.00028324729316094846, "loss": 0.8261, "step": 10490 }, { "epoch": 0.11743587162581576, "grad_norm": 0.25370126962661743, "learning_rate": 0.0002830188679245283, "loss": 0.8227, "step": 10500 }, { "epoch": 0.11754771531307844, "grad_norm": 0.2760542333126068, "learning_rate": 0.00028279044268810817, "loss": 0.8413, "step": 10510 }, { "epoch": 0.11765955900034113, "grad_norm": 0.24994856119155884, "learning_rate": 0.0002825620174516881, "loss": 0.8288, "step": 10520 }, { "epoch": 0.1177714026876038, "grad_norm": 0.25439032912254333, "learning_rate": 0.00028233359221526794, "loss": 0.8318, "step": 10530 }, { "epoch": 0.11788324637486648, "grad_norm": 0.28182244300842285, "learning_rate": 0.0002821051669788478, "loss": 0.8437, "step": 10540 }, { "epoch": 0.11799509006212917, "grad_norm": 0.2419012039899826, "learning_rate": 0.0002818767417424277, "loss": 0.8446, "step": 10550 }, { "epoch": 0.11810693374939185, "grad_norm": 0.2598857581615448, "learning_rate": 0.0002816483165060076, "loss": 0.8428, "step": 10560 }, { "epoch": 0.11821877743665453, "grad_norm": 0.25206229090690613, "learning_rate": 0.00028141989126958747, "loss": 0.8533, "step": 10570 }, { "epoch": 0.1183306211239172, "grad_norm": 0.25155991315841675, "learning_rate": 0.0002811914660331674, "loss": 0.8538, "step": 10580 }, { "epoch": 0.1184424648111799, "grad_norm": 0.2342199832201004, "learning_rate": 0.00028096304079674723, "loss": 0.8519, "step": 10590 }, { "epoch": 0.11855430849844258, "grad_norm": 0.25823327898979187, "learning_rate": 0.0002807346155603271, "loss": 0.8483, "step": 10600 }, { "epoch": 0.11866615218570525, "grad_norm": 0.26428598165512085, "learning_rate": 0.000280506190323907, "loss": 0.86, "step": 10610 }, { "epoch": 0.11877799587296795, "grad_norm": 0.25176918506622314, "learning_rate": 0.00028027776508748685, "loss": 0.8589, "step": 10620 }, { "epoch": 0.11888983956023062, "grad_norm": 0.28826919198036194, "learning_rate": 0.0002800493398510667, "loss": 0.8627, "step": 10630 }, { "epoch": 0.1190016832474933, "grad_norm": 0.24679958820343018, "learning_rate": 0.0002798209146146467, "loss": 0.8563, "step": 10640 }, { "epoch": 0.11911352693475598, "grad_norm": 0.2550687789916992, "learning_rate": 0.00027959248937822653, "loss": 0.8535, "step": 10650 }, { "epoch": 0.11922537062201867, "grad_norm": 0.2506476640701294, "learning_rate": 0.0002793640641418064, "loss": 0.8553, "step": 10660 }, { "epoch": 0.11933721430928135, "grad_norm": 0.24980700016021729, "learning_rate": 0.0002791356389053863, "loss": 0.854, "step": 10670 }, { "epoch": 0.11944905799654403, "grad_norm": 0.2280970811843872, "learning_rate": 0.00027890721366896615, "loss": 0.8569, "step": 10680 }, { "epoch": 0.11956090168380672, "grad_norm": 0.25191232562065125, "learning_rate": 0.000278678788432546, "loss": 0.8566, "step": 10690 }, { "epoch": 0.1196727453710694, "grad_norm": 0.2748493552207947, "learning_rate": 0.0002784503631961259, "loss": 0.8573, "step": 10700 }, { "epoch": 0.11978458905833207, "grad_norm": 0.25123515725135803, "learning_rate": 0.00027822193795970577, "loss": 0.8473, "step": 10710 }, { "epoch": 0.11989643274559475, "grad_norm": 0.25573378801345825, "learning_rate": 0.0002779935127232857, "loss": 0.8469, "step": 10720 }, { "epoch": 0.12000827643285744, "grad_norm": 0.23367713391780853, "learning_rate": 0.0002777650874868656, "loss": 0.8452, "step": 10730 }, { "epoch": 0.12012012012012012, "grad_norm": 0.24593010544776917, "learning_rate": 0.00027753666225044545, "loss": 0.838, "step": 10740 }, { "epoch": 0.1202319638073828, "grad_norm": 0.2422724962234497, "learning_rate": 0.0002773082370140253, "loss": 0.8398, "step": 10750 }, { "epoch": 0.12034380749464549, "grad_norm": 0.24471783638000488, "learning_rate": 0.0002770798117776052, "loss": 0.8409, "step": 10760 }, { "epoch": 0.12045565118190817, "grad_norm": 0.25523480772972107, "learning_rate": 0.00027685138654118507, "loss": 0.835, "step": 10770 }, { "epoch": 0.12056749486917084, "grad_norm": 0.24846532940864563, "learning_rate": 0.0002766229613047649, "loss": 0.842, "step": 10780 }, { "epoch": 0.12067933855643354, "grad_norm": 0.26955240964889526, "learning_rate": 0.00027639453606834484, "loss": 0.8525, "step": 10790 }, { "epoch": 0.12079118224369621, "grad_norm": 0.2711884081363678, "learning_rate": 0.00027616611083192475, "loss": 0.8352, "step": 10800 }, { "epoch": 0.12090302593095889, "grad_norm": 0.24954953789710999, "learning_rate": 0.0002759376855955046, "loss": 0.8257, "step": 10810 }, { "epoch": 0.12101486961822157, "grad_norm": 0.27029111981391907, "learning_rate": 0.0002757092603590845, "loss": 0.8147, "step": 10820 }, { "epoch": 0.12112671330548426, "grad_norm": 0.2440258413553238, "learning_rate": 0.00027548083512266437, "loss": 0.8239, "step": 10830 }, { "epoch": 0.12123855699274694, "grad_norm": 0.27082934975624084, "learning_rate": 0.0002752524098862442, "loss": 0.8391, "step": 10840 }, { "epoch": 0.12135040068000962, "grad_norm": 0.27641886472702026, "learning_rate": 0.00027502398464982413, "loss": 0.8276, "step": 10850 }, { "epoch": 0.1214622443672723, "grad_norm": 0.24772177636623383, "learning_rate": 0.000274795559413404, "loss": 0.8226, "step": 10860 }, { "epoch": 0.12157408805453498, "grad_norm": 0.2585364580154419, "learning_rate": 0.00027456713417698384, "loss": 0.8096, "step": 10870 }, { "epoch": 0.12168593174179766, "grad_norm": 0.2730146050453186, "learning_rate": 0.0002743387089405638, "loss": 0.8156, "step": 10880 }, { "epoch": 0.12179777542906034, "grad_norm": 0.2693599760532379, "learning_rate": 0.00027411028370414366, "loss": 0.8125, "step": 10890 }, { "epoch": 0.12190961911632303, "grad_norm": 0.26071295142173767, "learning_rate": 0.0002738818584677235, "loss": 0.8106, "step": 10900 }, { "epoch": 0.12202146280358571, "grad_norm": 0.2560258209705353, "learning_rate": 0.0002736534332313034, "loss": 0.8195, "step": 10910 }, { "epoch": 0.12213330649084839, "grad_norm": 0.27529552578926086, "learning_rate": 0.0002734250079948833, "loss": 0.8104, "step": 10920 }, { "epoch": 0.12224515017811108, "grad_norm": 0.2782133221626282, "learning_rate": 0.00027319658275846314, "loss": 0.8105, "step": 10930 }, { "epoch": 0.12235699386537376, "grad_norm": 0.27981024980545044, "learning_rate": 0.000272968157522043, "loss": 0.8085, "step": 10940 }, { "epoch": 0.12246883755263643, "grad_norm": 0.2741667926311493, "learning_rate": 0.0002727397322856229, "loss": 0.8042, "step": 10950 }, { "epoch": 0.12258068123989911, "grad_norm": 0.2468159943819046, "learning_rate": 0.0002725113070492028, "loss": 0.8198, "step": 10960 }, { "epoch": 0.1226925249271618, "grad_norm": 0.26167941093444824, "learning_rate": 0.00027228288181278267, "loss": 0.8176, "step": 10970 }, { "epoch": 0.12280436861442448, "grad_norm": 0.26660802960395813, "learning_rate": 0.0002720544565763626, "loss": 0.8036, "step": 10980 }, { "epoch": 0.12291621230168716, "grad_norm": 0.301575243473053, "learning_rate": 0.00027182603133994244, "loss": 0.8049, "step": 10990 }, { "epoch": 0.12302805598894985, "grad_norm": 0.2759682834148407, "learning_rate": 0.0002715976061035223, "loss": 0.8024, "step": 11000 }, { "epoch": 0.12313989967621253, "grad_norm": 0.25659626722335815, "learning_rate": 0.0002713691808671022, "loss": 0.8229, "step": 11010 }, { "epoch": 0.1232517433634752, "grad_norm": 0.2672923505306244, "learning_rate": 0.00027114075563068206, "loss": 0.8018, "step": 11020 }, { "epoch": 0.12336358705073788, "grad_norm": 0.25423988699913025, "learning_rate": 0.0002709123303942619, "loss": 0.836, "step": 11030 }, { "epoch": 0.12347543073800057, "grad_norm": 0.28428804874420166, "learning_rate": 0.0002706839051578419, "loss": 0.8299, "step": 11040 }, { "epoch": 0.12358727442526325, "grad_norm": 0.2924467921257019, "learning_rate": 0.00027045547992142173, "loss": 0.8236, "step": 11050 }, { "epoch": 0.12369911811252593, "grad_norm": 0.25230658054351807, "learning_rate": 0.0002702270546850016, "loss": 0.8274, "step": 11060 }, { "epoch": 0.12381096179978862, "grad_norm": 0.27876734733581543, "learning_rate": 0.0002699986294485815, "loss": 0.8244, "step": 11070 }, { "epoch": 0.1239228054870513, "grad_norm": 0.29841694235801697, "learning_rate": 0.00026977020421216136, "loss": 0.8327, "step": 11080 }, { "epoch": 0.12403464917431398, "grad_norm": 0.3055926263332367, "learning_rate": 0.0002695417789757412, "loss": 0.8247, "step": 11090 }, { "epoch": 0.12414649286157665, "grad_norm": 0.275919109582901, "learning_rate": 0.0002693133537393211, "loss": 0.8263, "step": 11100 }, { "epoch": 0.12425833654883935, "grad_norm": 0.3069559931755066, "learning_rate": 0.00026908492850290103, "loss": 0.8242, "step": 11110 }, { "epoch": 0.12437018023610202, "grad_norm": 0.2574029564857483, "learning_rate": 0.0002688565032664809, "loss": 0.819, "step": 11120 }, { "epoch": 0.1244820239233647, "grad_norm": 0.25053170323371887, "learning_rate": 0.0002686280780300608, "loss": 0.8022, "step": 11130 }, { "epoch": 0.12459386761062739, "grad_norm": 0.27337634563446045, "learning_rate": 0.00026839965279364065, "loss": 0.8127, "step": 11140 }, { "epoch": 0.12470571129789007, "grad_norm": 0.2531510889530182, "learning_rate": 0.0002681712275572205, "loss": 0.8138, "step": 11150 }, { "epoch": 0.12481755498515275, "grad_norm": 0.27455076575279236, "learning_rate": 0.0002679428023208004, "loss": 0.7974, "step": 11160 }, { "epoch": 0.12492939867241543, "grad_norm": 0.2515604496002197, "learning_rate": 0.0002677143770843803, "loss": 0.8077, "step": 11170 }, { "epoch": 0.12504124235967812, "grad_norm": 0.27941974997520447, "learning_rate": 0.00026748595184796013, "loss": 0.8099, "step": 11180 }, { "epoch": 0.1251530860469408, "grad_norm": 0.2508449852466583, "learning_rate": 0.0002672575266115401, "loss": 0.8077, "step": 11190 }, { "epoch": 0.12526492973420347, "grad_norm": 0.24805410206317902, "learning_rate": 0.00026702910137511995, "loss": 0.8029, "step": 11200 }, { "epoch": 0.12537677342146616, "grad_norm": 0.2730201184749603, "learning_rate": 0.0002668006761386998, "loss": 0.8383, "step": 11210 }, { "epoch": 0.12548861710872883, "grad_norm": 0.24301932752132416, "learning_rate": 0.0002665722509022797, "loss": 0.8245, "step": 11220 }, { "epoch": 0.12560046079599152, "grad_norm": 0.270059734582901, "learning_rate": 0.00026634382566585957, "loss": 0.8287, "step": 11230 }, { "epoch": 0.1257123044832542, "grad_norm": 0.24491746723651886, "learning_rate": 0.0002661154004294394, "loss": 0.8283, "step": 11240 }, { "epoch": 0.12582414817051688, "grad_norm": 0.2461182177066803, "learning_rate": 0.00026588697519301934, "loss": 0.8285, "step": 11250 }, { "epoch": 0.12593599185777957, "grad_norm": 0.26306700706481934, "learning_rate": 0.0002656585499565992, "loss": 0.8366, "step": 11260 }, { "epoch": 0.12604783554504226, "grad_norm": 0.2317613661289215, "learning_rate": 0.0002654301247201791, "loss": 0.8373, "step": 11270 }, { "epoch": 0.12615967923230492, "grad_norm": 0.25218284130096436, "learning_rate": 0.000265201699483759, "loss": 0.8163, "step": 11280 }, { "epoch": 0.1262715229195676, "grad_norm": 0.2527898848056793, "learning_rate": 0.00026497327424733887, "loss": 0.819, "step": 11290 }, { "epoch": 0.1263833666068303, "grad_norm": 0.2344309389591217, "learning_rate": 0.0002647448490109187, "loss": 0.8335, "step": 11300 }, { "epoch": 0.12649521029409297, "grad_norm": 0.23913320899009705, "learning_rate": 0.00026451642377449863, "loss": 0.8289, "step": 11310 }, { "epoch": 0.12660705398135566, "grad_norm": 0.24901095032691956, "learning_rate": 0.0002642879985380785, "loss": 0.8159, "step": 11320 }, { "epoch": 0.12671889766861835, "grad_norm": 0.2503173351287842, "learning_rate": 0.00026405957330165834, "loss": 0.8372, "step": 11330 }, { "epoch": 0.12683074135588102, "grad_norm": 0.2341470569372177, "learning_rate": 0.00026383114806523825, "loss": 0.8264, "step": 11340 }, { "epoch": 0.1269425850431437, "grad_norm": 0.23143555223941803, "learning_rate": 0.00026360272282881816, "loss": 0.824, "step": 11350 }, { "epoch": 0.12705442873040637, "grad_norm": 0.24911652505397797, "learning_rate": 0.000263374297592398, "loss": 0.82, "step": 11360 }, { "epoch": 0.12716627241766906, "grad_norm": 0.21931353211402893, "learning_rate": 0.0002631458723559779, "loss": 0.8194, "step": 11370 }, { "epoch": 0.12727811610493175, "grad_norm": 0.2432345151901245, "learning_rate": 0.0002629174471195578, "loss": 0.8371, "step": 11380 }, { "epoch": 0.12738995979219442, "grad_norm": 0.24188277125358582, "learning_rate": 0.00026268902188313764, "loss": 0.8096, "step": 11390 }, { "epoch": 0.1275018034794571, "grad_norm": 0.2522214651107788, "learning_rate": 0.0002624605966467175, "loss": 0.8187, "step": 11400 }, { "epoch": 0.1276136471667198, "grad_norm": 0.2596495449542999, "learning_rate": 0.0002622321714102974, "loss": 0.8138, "step": 11410 }, { "epoch": 0.12772549085398247, "grad_norm": 0.2708049714565277, "learning_rate": 0.00026200374617387726, "loss": 0.8066, "step": 11420 }, { "epoch": 0.12783733454124516, "grad_norm": 0.27820831537246704, "learning_rate": 0.00026177532093745717, "loss": 0.8112, "step": 11430 }, { "epoch": 0.12794917822850785, "grad_norm": 0.23918400704860687, "learning_rate": 0.0002615468957010371, "loss": 0.8148, "step": 11440 }, { "epoch": 0.1280610219157705, "grad_norm": 0.22054031491279602, "learning_rate": 0.00026131847046461694, "loss": 0.8183, "step": 11450 }, { "epoch": 0.1281728656030332, "grad_norm": 0.25998455286026, "learning_rate": 0.0002610900452281968, "loss": 0.8242, "step": 11460 }, { "epoch": 0.1282847092902959, "grad_norm": 0.26852914690971375, "learning_rate": 0.0002608616199917767, "loss": 0.8161, "step": 11470 }, { "epoch": 0.12839655297755856, "grad_norm": 0.24028563499450684, "learning_rate": 0.00026063319475535656, "loss": 0.8083, "step": 11480 }, { "epoch": 0.12850839666482125, "grad_norm": 0.24944745004177094, "learning_rate": 0.0002604047695189364, "loss": 0.8168, "step": 11490 }, { "epoch": 0.12862024035208391, "grad_norm": 0.26595303416252136, "learning_rate": 0.0002601763442825164, "loss": 0.8178, "step": 11500 }, { "epoch": 0.1287320840393466, "grad_norm": 0.24556541442871094, "learning_rate": 0.00025994791904609623, "loss": 0.8229, "step": 11510 }, { "epoch": 0.1288439277266093, "grad_norm": 0.24716900289058685, "learning_rate": 0.0002597194938096761, "loss": 0.809, "step": 11520 }, { "epoch": 0.12895577141387196, "grad_norm": 0.24745820462703705, "learning_rate": 0.000259491068573256, "loss": 0.8293, "step": 11530 }, { "epoch": 0.12906761510113465, "grad_norm": 0.2732492983341217, "learning_rate": 0.00025926264333683586, "loss": 0.8, "step": 11540 }, { "epoch": 0.12917945878839734, "grad_norm": 0.23239663243293762, "learning_rate": 0.0002590342181004157, "loss": 0.8175, "step": 11550 }, { "epoch": 0.12929130247566, "grad_norm": 0.24953389167785645, "learning_rate": 0.0002588057928639956, "loss": 0.8152, "step": 11560 }, { "epoch": 0.1294031461629227, "grad_norm": 0.25258156657218933, "learning_rate": 0.0002585773676275755, "loss": 0.8301, "step": 11570 }, { "epoch": 0.1295149898501854, "grad_norm": 0.2609168291091919, "learning_rate": 0.0002583489423911554, "loss": 0.8197, "step": 11580 }, { "epoch": 0.12962683353744806, "grad_norm": 0.2484872192144394, "learning_rate": 0.0002581205171547353, "loss": 0.8362, "step": 11590 }, { "epoch": 0.12973867722471075, "grad_norm": 0.2833307385444641, "learning_rate": 0.00025789209191831515, "loss": 0.8338, "step": 11600 }, { "epoch": 0.12985052091197344, "grad_norm": 0.24657459557056427, "learning_rate": 0.000257663666681895, "loss": 0.8205, "step": 11610 }, { "epoch": 0.1299623645992361, "grad_norm": 0.2499598115682602, "learning_rate": 0.0002574352414454749, "loss": 0.8406, "step": 11620 }, { "epoch": 0.1300742082864988, "grad_norm": 0.2757512629032135, "learning_rate": 0.0002572068162090548, "loss": 0.8247, "step": 11630 }, { "epoch": 0.13018605197376146, "grad_norm": 0.25661805272102356, "learning_rate": 0.00025697839097263463, "loss": 0.8384, "step": 11640 }, { "epoch": 0.13029789566102415, "grad_norm": 0.27651283144950867, "learning_rate": 0.00025674996573621454, "loss": 0.818, "step": 11650 }, { "epoch": 0.13040973934828684, "grad_norm": 0.247050940990448, "learning_rate": 0.00025652154049979445, "loss": 0.8261, "step": 11660 }, { "epoch": 0.1305215830355495, "grad_norm": 0.23124581575393677, "learning_rate": 0.0002562931152633743, "loss": 0.8259, "step": 11670 }, { "epoch": 0.1306334267228122, "grad_norm": 0.2694045603275299, "learning_rate": 0.0002560646900269542, "loss": 0.8304, "step": 11680 }, { "epoch": 0.1307452704100749, "grad_norm": 0.26821568608283997, "learning_rate": 0.00025583626479053407, "loss": 0.8441, "step": 11690 }, { "epoch": 0.13085711409733755, "grad_norm": 0.2747989892959595, "learning_rate": 0.0002556078395541139, "loss": 0.841, "step": 11700 }, { "epoch": 0.13096895778460024, "grad_norm": 0.28248855471611023, "learning_rate": 0.00025537941431769384, "loss": 0.857, "step": 11710 }, { "epoch": 0.13108080147186293, "grad_norm": 0.25378182530403137, "learning_rate": 0.0002551509890812737, "loss": 0.8437, "step": 11720 }, { "epoch": 0.1311926451591256, "grad_norm": 0.25950944423675537, "learning_rate": 0.00025492256384485355, "loss": 0.8497, "step": 11730 }, { "epoch": 0.1313044888463883, "grad_norm": 0.26261699199676514, "learning_rate": 0.0002546941386084335, "loss": 0.8477, "step": 11740 }, { "epoch": 0.13141633253365098, "grad_norm": 0.30151599645614624, "learning_rate": 0.00025446571337201337, "loss": 0.8405, "step": 11750 }, { "epoch": 0.13152817622091365, "grad_norm": 0.2556060254573822, "learning_rate": 0.0002542372881355932, "loss": 0.831, "step": 11760 }, { "epoch": 0.13164001990817634, "grad_norm": 0.26560309529304504, "learning_rate": 0.00025400886289917313, "loss": 0.8445, "step": 11770 }, { "epoch": 0.13175186359543903, "grad_norm": 0.28504636883735657, "learning_rate": 0.000253780437662753, "loss": 0.8432, "step": 11780 }, { "epoch": 0.1318637072827017, "grad_norm": 0.2985188663005829, "learning_rate": 0.00025355201242633285, "loss": 0.8584, "step": 11790 }, { "epoch": 0.13197555096996438, "grad_norm": 0.28022414445877075, "learning_rate": 0.00025332358718991276, "loss": 0.8393, "step": 11800 }, { "epoch": 0.13208739465722705, "grad_norm": 0.28535568714141846, "learning_rate": 0.0002530951619534926, "loss": 0.8369, "step": 11810 }, { "epoch": 0.13219923834448974, "grad_norm": 0.27764952182769775, "learning_rate": 0.0002528667367170725, "loss": 0.8435, "step": 11820 }, { "epoch": 0.13231108203175243, "grad_norm": 0.28943151235580444, "learning_rate": 0.0002526383114806524, "loss": 0.8334, "step": 11830 }, { "epoch": 0.1324229257190151, "grad_norm": 0.28240668773651123, "learning_rate": 0.0002524098862442323, "loss": 0.8338, "step": 11840 }, { "epoch": 0.13253476940627779, "grad_norm": 0.27650541067123413, "learning_rate": 0.00025218146100781214, "loss": 0.8275, "step": 11850 }, { "epoch": 0.13264661309354048, "grad_norm": 0.27569788694381714, "learning_rate": 0.000251953035771392, "loss": 0.8323, "step": 11860 }, { "epoch": 0.13275845678080314, "grad_norm": 0.29103782773017883, "learning_rate": 0.0002517246105349719, "loss": 0.8401, "step": 11870 }, { "epoch": 0.13287030046806583, "grad_norm": 0.28769806027412415, "learning_rate": 0.00025149618529855176, "loss": 0.8369, "step": 11880 }, { "epoch": 0.13298214415532852, "grad_norm": 0.2803378701210022, "learning_rate": 0.0002512677600621316, "loss": 0.8308, "step": 11890 }, { "epoch": 0.1330939878425912, "grad_norm": 0.29264572262763977, "learning_rate": 0.0002510393348257116, "loss": 0.8314, "step": 11900 }, { "epoch": 0.13320583152985388, "grad_norm": 0.27434802055358887, "learning_rate": 0.00025081090958929144, "loss": 0.8337, "step": 11910 }, { "epoch": 0.13331767521711657, "grad_norm": 0.270589143037796, "learning_rate": 0.0002505824843528713, "loss": 0.8503, "step": 11920 }, { "epoch": 0.13342951890437924, "grad_norm": 0.27260124683380127, "learning_rate": 0.0002503540591164512, "loss": 0.8293, "step": 11930 }, { "epoch": 0.13354136259164193, "grad_norm": 0.2684808075428009, "learning_rate": 0.00025012563388003106, "loss": 0.8339, "step": 11940 }, { "epoch": 0.1336532062789046, "grad_norm": 0.2510156035423279, "learning_rate": 0.00024989720864361097, "loss": 0.8464, "step": 11950 }, { "epoch": 0.13376504996616728, "grad_norm": 0.24331960082054138, "learning_rate": 0.0002496687834071908, "loss": 0.8443, "step": 11960 }, { "epoch": 0.13387689365342997, "grad_norm": 0.2688249349594116, "learning_rate": 0.00024944035817077074, "loss": 0.8483, "step": 11970 }, { "epoch": 0.13398873734069264, "grad_norm": 0.2608729898929596, "learning_rate": 0.0002492119329343506, "loss": 0.852, "step": 11980 }, { "epoch": 0.13410058102795533, "grad_norm": 0.28415507078170776, "learning_rate": 0.00024898350769793045, "loss": 0.8449, "step": 11990 }, { "epoch": 0.13421242471521802, "grad_norm": 0.2920886278152466, "learning_rate": 0.00024875508246151036, "loss": 0.8281, "step": 12000 }, { "epoch": 0.13432426840248068, "grad_norm": 0.2763430178165436, "learning_rate": 0.00024852665722509027, "loss": 0.8492, "step": 12010 }, { "epoch": 0.13443611208974338, "grad_norm": 0.26460400223731995, "learning_rate": 0.0002482982319886701, "loss": 0.8409, "step": 12020 }, { "epoch": 0.13454795577700607, "grad_norm": 0.2698183059692383, "learning_rate": 0.00024806980675225, "loss": 0.8295, "step": 12030 }, { "epoch": 0.13465979946426873, "grad_norm": 0.2728478014469147, "learning_rate": 0.0002478413815158299, "loss": 0.837, "step": 12040 }, { "epoch": 0.13477164315153142, "grad_norm": 0.282924085855484, "learning_rate": 0.00024761295627940974, "loss": 0.8482, "step": 12050 }, { "epoch": 0.13488348683879411, "grad_norm": 0.264614999294281, "learning_rate": 0.00024738453104298965, "loss": 0.8432, "step": 12060 }, { "epoch": 0.13499533052605678, "grad_norm": 0.2475707232952118, "learning_rate": 0.0002471561058065695, "loss": 0.8387, "step": 12070 }, { "epoch": 0.13510717421331947, "grad_norm": 0.2620779573917389, "learning_rate": 0.00024692768057014937, "loss": 0.8559, "step": 12080 }, { "epoch": 0.13521901790058213, "grad_norm": 0.2645311951637268, "learning_rate": 0.0002466992553337293, "loss": 0.8363, "step": 12090 }, { "epoch": 0.13533086158784483, "grad_norm": 0.27586236596107483, "learning_rate": 0.0002464708300973092, "loss": 0.8365, "step": 12100 }, { "epoch": 0.13544270527510752, "grad_norm": 0.2695125341415405, "learning_rate": 0.00024624240486088904, "loss": 0.8412, "step": 12110 }, { "epoch": 0.13555454896237018, "grad_norm": 0.2473846971988678, "learning_rate": 0.0002460139796244689, "loss": 0.8362, "step": 12120 }, { "epoch": 0.13566639264963287, "grad_norm": 0.28001588582992554, "learning_rate": 0.0002457855543880488, "loss": 0.8462, "step": 12130 }, { "epoch": 0.13577823633689556, "grad_norm": 0.29486599564552307, "learning_rate": 0.00024555712915162866, "loss": 0.8607, "step": 12140 }, { "epoch": 0.13589008002415823, "grad_norm": 0.2761843204498291, "learning_rate": 0.00024532870391520857, "loss": 0.8668, "step": 12150 }, { "epoch": 0.13600192371142092, "grad_norm": 0.25779953598976135, "learning_rate": 0.00024510027867878843, "loss": 0.853, "step": 12160 }, { "epoch": 0.1361137673986836, "grad_norm": 0.27593857049942017, "learning_rate": 0.00024487185344236834, "loss": 0.8506, "step": 12170 }, { "epoch": 0.13622561108594627, "grad_norm": 0.24426791071891785, "learning_rate": 0.0002446434282059482, "loss": 0.8623, "step": 12180 }, { "epoch": 0.13633745477320897, "grad_norm": 0.25555628538131714, "learning_rate": 0.00024441500296952805, "loss": 0.8493, "step": 12190 }, { "epoch": 0.13644929846047166, "grad_norm": 0.2234913557767868, "learning_rate": 0.00024418657773310796, "loss": 0.8644, "step": 12200 }, { "epoch": 0.13656114214773432, "grad_norm": 0.27130651473999023, "learning_rate": 0.00024395815249668784, "loss": 0.8791, "step": 12210 }, { "epoch": 0.136672985834997, "grad_norm": 0.24734824895858765, "learning_rate": 0.0002437297272602677, "loss": 0.8719, "step": 12220 }, { "epoch": 0.13678482952225968, "grad_norm": 0.24316945672035217, "learning_rate": 0.0002435013020238476, "loss": 0.8546, "step": 12230 }, { "epoch": 0.13689667320952237, "grad_norm": 0.2349976748228073, "learning_rate": 0.0002432728767874275, "loss": 0.8458, "step": 12240 }, { "epoch": 0.13700851689678506, "grad_norm": 0.26791033148765564, "learning_rate": 0.00024304445155100735, "loss": 0.8485, "step": 12250 }, { "epoch": 0.13712036058404772, "grad_norm": 0.23598451912403107, "learning_rate": 0.00024281602631458723, "loss": 0.8451, "step": 12260 }, { "epoch": 0.13723220427131042, "grad_norm": 0.23012129962444305, "learning_rate": 0.00024258760107816714, "loss": 0.8332, "step": 12270 }, { "epoch": 0.1373440479585731, "grad_norm": 0.22834524512290955, "learning_rate": 0.000242359175841747, "loss": 0.8203, "step": 12280 }, { "epoch": 0.13745589164583577, "grad_norm": 0.2247861921787262, "learning_rate": 0.00024213075060532688, "loss": 0.8303, "step": 12290 }, { "epoch": 0.13756773533309846, "grad_norm": 0.2438284307718277, "learning_rate": 0.00024190232536890676, "loss": 0.8216, "step": 12300 }, { "epoch": 0.13767957902036115, "grad_norm": 0.24075888097286224, "learning_rate": 0.00024167390013248664, "loss": 0.7964, "step": 12310 }, { "epoch": 0.13779142270762382, "grad_norm": 0.24668976664543152, "learning_rate": 0.00024144547489606653, "loss": 0.8028, "step": 12320 }, { "epoch": 0.1379032663948865, "grad_norm": 0.26727405190467834, "learning_rate": 0.0002412170496596464, "loss": 0.8081, "step": 12330 }, { "epoch": 0.1380151100821492, "grad_norm": 0.2645564377307892, "learning_rate": 0.00024098862442322626, "loss": 0.8116, "step": 12340 }, { "epoch": 0.13812695376941186, "grad_norm": 0.25368645787239075, "learning_rate": 0.00024076019918680617, "loss": 0.8105, "step": 12350 }, { "epoch": 0.13823879745667456, "grad_norm": 0.26823967695236206, "learning_rate": 0.00024053177395038606, "loss": 0.8249, "step": 12360 }, { "epoch": 0.13835064114393722, "grad_norm": 0.2827225625514984, "learning_rate": 0.0002403033487139659, "loss": 0.8191, "step": 12370 }, { "epoch": 0.1384624848311999, "grad_norm": 0.23261433839797974, "learning_rate": 0.00024007492347754582, "loss": 0.8215, "step": 12380 }, { "epoch": 0.1385743285184626, "grad_norm": 0.27331966161727905, "learning_rate": 0.00023984649824112568, "loss": 0.8232, "step": 12390 }, { "epoch": 0.13868617220572527, "grad_norm": 0.2801966369152069, "learning_rate": 0.00023961807300470556, "loss": 0.8074, "step": 12400 }, { "epoch": 0.13879801589298796, "grad_norm": 0.2379591315984726, "learning_rate": 0.00023938964776828544, "loss": 0.8209, "step": 12410 }, { "epoch": 0.13890985958025065, "grad_norm": 0.27151694893836975, "learning_rate": 0.00023916122253186533, "loss": 0.8258, "step": 12420 }, { "epoch": 0.1390217032675133, "grad_norm": 0.21429865062236786, "learning_rate": 0.0002389327972954452, "loss": 0.8178, "step": 12430 }, { "epoch": 0.139133546954776, "grad_norm": 0.2777722477912903, "learning_rate": 0.0002387043720590251, "loss": 0.826, "step": 12440 }, { "epoch": 0.1392453906420387, "grad_norm": 0.2514742910861969, "learning_rate": 0.00023847594682260495, "loss": 0.8362, "step": 12450 }, { "epoch": 0.13935723432930136, "grad_norm": 0.23247656226158142, "learning_rate": 0.00023824752158618486, "loss": 0.8049, "step": 12460 }, { "epoch": 0.13946907801656405, "grad_norm": 0.2391313910484314, "learning_rate": 0.00023801909634976474, "loss": 0.8082, "step": 12470 }, { "epoch": 0.13958092170382674, "grad_norm": 0.2366340011358261, "learning_rate": 0.0002377906711133446, "loss": 0.8214, "step": 12480 }, { "epoch": 0.1396927653910894, "grad_norm": 0.2570713758468628, "learning_rate": 0.00023756224587692448, "loss": 0.827, "step": 12490 }, { "epoch": 0.1398046090783521, "grad_norm": 0.22823789715766907, "learning_rate": 0.0002373338206405044, "loss": 0.8314, "step": 12500 }, { "epoch": 0.1399164527656148, "grad_norm": 0.24660278856754303, "learning_rate": 0.00023710539540408424, "loss": 0.838, "step": 12510 }, { "epoch": 0.14002829645287745, "grad_norm": 0.25041723251342773, "learning_rate": 0.00023687697016766413, "loss": 0.8371, "step": 12520 }, { "epoch": 0.14014014014014015, "grad_norm": 0.23942531645298004, "learning_rate": 0.000236648544931244, "loss": 0.8282, "step": 12530 }, { "epoch": 0.1402519838274028, "grad_norm": 0.2445865273475647, "learning_rate": 0.0002364201196948239, "loss": 0.8307, "step": 12540 }, { "epoch": 0.1403638275146655, "grad_norm": 0.25278452038764954, "learning_rate": 0.00023619169445840378, "loss": 0.8483, "step": 12550 }, { "epoch": 0.1404756712019282, "grad_norm": 0.22890037298202515, "learning_rate": 0.00023596326922198366, "loss": 0.8328, "step": 12560 }, { "epoch": 0.14058751488919086, "grad_norm": 0.2360977679491043, "learning_rate": 0.00023573484398556351, "loss": 0.8373, "step": 12570 }, { "epoch": 0.14069935857645355, "grad_norm": 0.22873692214488983, "learning_rate": 0.00023550641874914342, "loss": 0.8399, "step": 12580 }, { "epoch": 0.14081120226371624, "grad_norm": 0.228402242064476, "learning_rate": 0.0002352779935127233, "loss": 0.8272, "step": 12590 }, { "epoch": 0.1409230459509789, "grad_norm": 0.2625369131565094, "learning_rate": 0.00023504956827630316, "loss": 0.8413, "step": 12600 }, { "epoch": 0.1410348896382416, "grad_norm": 0.2744843363761902, "learning_rate": 0.00023482114303988305, "loss": 0.823, "step": 12610 }, { "epoch": 0.1411467333255043, "grad_norm": 0.24845914542675018, "learning_rate": 0.00023459271780346293, "loss": 0.8089, "step": 12620 }, { "epoch": 0.14125857701276695, "grad_norm": 0.2431713193655014, "learning_rate": 0.0002343642925670428, "loss": 0.8204, "step": 12630 }, { "epoch": 0.14137042070002964, "grad_norm": 0.2636731266975403, "learning_rate": 0.0002341358673306227, "loss": 0.8241, "step": 12640 }, { "epoch": 0.14148226438729233, "grad_norm": 0.24605631828308105, "learning_rate": 0.00023390744209420255, "loss": 0.837, "step": 12650 }, { "epoch": 0.141594108074555, "grad_norm": 0.25722581148147583, "learning_rate": 0.00023367901685778246, "loss": 0.8338, "step": 12660 }, { "epoch": 0.1417059517618177, "grad_norm": 0.2628157138824463, "learning_rate": 0.00023345059162136234, "loss": 0.8271, "step": 12670 }, { "epoch": 0.14181779544908035, "grad_norm": 0.24534687399864197, "learning_rate": 0.0002332221663849422, "loss": 0.8281, "step": 12680 }, { "epoch": 0.14192963913634304, "grad_norm": 0.24370639026165009, "learning_rate": 0.00023299374114852208, "loss": 0.8243, "step": 12690 }, { "epoch": 0.14204148282360574, "grad_norm": 0.2993674576282501, "learning_rate": 0.000232765315912102, "loss": 0.8191, "step": 12700 }, { "epoch": 0.1421533265108684, "grad_norm": 0.2372383326292038, "learning_rate": 0.00023253689067568185, "loss": 0.8115, "step": 12710 }, { "epoch": 0.1422651701981311, "grad_norm": 0.2405237853527069, "learning_rate": 0.00023230846543926173, "loss": 0.8012, "step": 12720 }, { "epoch": 0.14237701388539378, "grad_norm": 0.23501497507095337, "learning_rate": 0.0002320800402028416, "loss": 0.8272, "step": 12730 }, { "epoch": 0.14248885757265645, "grad_norm": 0.2573966085910797, "learning_rate": 0.0002318516149664215, "loss": 0.8231, "step": 12740 }, { "epoch": 0.14260070125991914, "grad_norm": 0.25884565711021423, "learning_rate": 0.00023162318973000138, "loss": 0.8293, "step": 12750 }, { "epoch": 0.14271254494718183, "grad_norm": 0.24788953363895416, "learning_rate": 0.00023139476449358126, "loss": 0.8338, "step": 12760 }, { "epoch": 0.1428243886344445, "grad_norm": 0.23874413967132568, "learning_rate": 0.00023116633925716112, "loss": 0.8184, "step": 12770 }, { "epoch": 0.14293623232170719, "grad_norm": 0.2358027547597885, "learning_rate": 0.00023093791402074103, "loss": 0.8143, "step": 12780 }, { "epoch": 0.14304807600896988, "grad_norm": 0.22447925806045532, "learning_rate": 0.0002307094887843209, "loss": 0.8093, "step": 12790 }, { "epoch": 0.14315991969623254, "grad_norm": 0.25550246238708496, "learning_rate": 0.00023048106354790077, "loss": 0.8178, "step": 12800 }, { "epoch": 0.14327176338349523, "grad_norm": 0.2370327264070511, "learning_rate": 0.00023025263831148065, "loss": 0.8035, "step": 12810 }, { "epoch": 0.1433836070707579, "grad_norm": 0.24910229444503784, "learning_rate": 0.00023002421307506056, "loss": 0.7965, "step": 12820 }, { "epoch": 0.1434954507580206, "grad_norm": 0.23592302203178406, "learning_rate": 0.0002297957878386404, "loss": 0.808, "step": 12830 }, { "epoch": 0.14360729444528328, "grad_norm": 0.24010522663593292, "learning_rate": 0.0002295673626022203, "loss": 0.8047, "step": 12840 }, { "epoch": 0.14371913813254594, "grad_norm": 0.26334619522094727, "learning_rate": 0.00022933893736580015, "loss": 0.8011, "step": 12850 }, { "epoch": 0.14383098181980863, "grad_norm": 0.23162928223609924, "learning_rate": 0.00022911051212938006, "loss": 0.811, "step": 12860 }, { "epoch": 0.14394282550707133, "grad_norm": 0.24273565411567688, "learning_rate": 0.00022888208689295994, "loss": 0.8249, "step": 12870 }, { "epoch": 0.144054669194334, "grad_norm": 0.239716574549675, "learning_rate": 0.0002286536616565398, "loss": 0.8146, "step": 12880 }, { "epoch": 0.14416651288159668, "grad_norm": 0.22947145998477936, "learning_rate": 0.0002284252364201197, "loss": 0.8037, "step": 12890 }, { "epoch": 0.14427835656885937, "grad_norm": 0.2369975745677948, "learning_rate": 0.0002281968111836996, "loss": 0.7938, "step": 12900 }, { "epoch": 0.14439020025612204, "grad_norm": 0.23150302469730377, "learning_rate": 0.00022796838594727945, "loss": 0.7971, "step": 12910 }, { "epoch": 0.14450204394338473, "grad_norm": 0.25659120082855225, "learning_rate": 0.00022773996071085933, "loss": 0.7897, "step": 12920 }, { "epoch": 0.14461388763064742, "grad_norm": 0.26838308572769165, "learning_rate": 0.00022751153547443924, "loss": 0.8025, "step": 12930 }, { "epoch": 0.14472573131791008, "grad_norm": 0.2421617954969406, "learning_rate": 0.0002272831102380191, "loss": 0.7937, "step": 12940 }, { "epoch": 0.14483757500517278, "grad_norm": 0.22780479490756989, "learning_rate": 0.00022705468500159898, "loss": 0.7861, "step": 12950 }, { "epoch": 0.14494941869243544, "grad_norm": 0.2561044692993164, "learning_rate": 0.00022682625976517886, "loss": 0.7817, "step": 12960 }, { "epoch": 0.14506126237969813, "grad_norm": 0.24073092639446259, "learning_rate": 0.00022659783452875875, "loss": 0.8024, "step": 12970 }, { "epoch": 0.14517310606696082, "grad_norm": 0.24959658086299896, "learning_rate": 0.00022636940929233863, "loss": 0.7994, "step": 12980 }, { "epoch": 0.14528494975422349, "grad_norm": 0.2711149752140045, "learning_rate": 0.0002261409840559185, "loss": 0.8011, "step": 12990 }, { "epoch": 0.14539679344148618, "grad_norm": 0.2447725236415863, "learning_rate": 0.00022591255881949837, "loss": 0.7957, "step": 13000 }, { "epoch": 0.14550863712874887, "grad_norm": 0.26505330204963684, "learning_rate": 0.00022568413358307828, "loss": 0.7932, "step": 13010 }, { "epoch": 0.14562048081601153, "grad_norm": 0.256712943315506, "learning_rate": 0.00022545570834665816, "loss": 0.7919, "step": 13020 }, { "epoch": 0.14573232450327422, "grad_norm": 0.23816627264022827, "learning_rate": 0.00022522728311023802, "loss": 0.7942, "step": 13030 }, { "epoch": 0.14584416819053692, "grad_norm": 0.25607794523239136, "learning_rate": 0.0002249988578738179, "loss": 0.8058, "step": 13040 }, { "epoch": 0.14595601187779958, "grad_norm": 0.2644692361354828, "learning_rate": 0.0002247704326373978, "loss": 0.8026, "step": 13050 }, { "epoch": 0.14606785556506227, "grad_norm": 0.24160505831241608, "learning_rate": 0.00022454200740097766, "loss": 0.8013, "step": 13060 }, { "epoch": 0.14617969925232496, "grad_norm": 0.25321200489997864, "learning_rate": 0.00022431358216455755, "loss": 0.802, "step": 13070 }, { "epoch": 0.14629154293958763, "grad_norm": 0.38834208250045776, "learning_rate": 0.0002240851569281374, "loss": 0.8053, "step": 13080 }, { "epoch": 0.14640338662685032, "grad_norm": 0.2638767957687378, "learning_rate": 0.0002238567316917173, "loss": 0.803, "step": 13090 }, { "epoch": 0.14651523031411298, "grad_norm": 0.33412685990333557, "learning_rate": 0.0002236283064552972, "loss": 0.8091, "step": 13100 }, { "epoch": 0.14662707400137567, "grad_norm": 0.27539852261543274, "learning_rate": 0.00022339988121887705, "loss": 0.8019, "step": 13110 }, { "epoch": 0.14673891768863837, "grad_norm": 0.25128626823425293, "learning_rate": 0.00022317145598245693, "loss": 0.7961, "step": 13120 }, { "epoch": 0.14685076137590103, "grad_norm": 0.27428579330444336, "learning_rate": 0.00022294303074603684, "loss": 0.792, "step": 13130 }, { "epoch": 0.14696260506316372, "grad_norm": 0.25421425700187683, "learning_rate": 0.0002227146055096167, "loss": 0.8139, "step": 13140 }, { "epoch": 0.1470744487504264, "grad_norm": 0.23709440231323242, "learning_rate": 0.00022248618027319658, "loss": 0.8147, "step": 13150 }, { "epoch": 0.14718629243768908, "grad_norm": 0.2693617641925812, "learning_rate": 0.00022225775503677646, "loss": 0.8174, "step": 13160 }, { "epoch": 0.14729813612495177, "grad_norm": 0.26674261689186096, "learning_rate": 0.00022202932980035635, "loss": 0.8105, "step": 13170 }, { "epoch": 0.14740997981221446, "grad_norm": 0.2656268775463104, "learning_rate": 0.00022180090456393623, "loss": 0.8355, "step": 13180 }, { "epoch": 0.14752182349947712, "grad_norm": 0.2587822377681732, "learning_rate": 0.0002215724793275161, "loss": 0.8311, "step": 13190 }, { "epoch": 0.14763366718673981, "grad_norm": 0.29723209142684937, "learning_rate": 0.00022134405409109597, "loss": 0.8664, "step": 13200 }, { "epoch": 0.1477455108740025, "grad_norm": 0.2579325735569, "learning_rate": 0.00022111562885467588, "loss": 0.8515, "step": 13210 }, { "epoch": 0.14785735456126517, "grad_norm": 0.28357258439064026, "learning_rate": 0.00022088720361825576, "loss": 0.8562, "step": 13220 }, { "epoch": 0.14796919824852786, "grad_norm": 0.26742318272590637, "learning_rate": 0.00022065877838183562, "loss": 0.8571, "step": 13230 }, { "epoch": 0.14808104193579055, "grad_norm": 0.2750874161720276, "learning_rate": 0.0002204303531454155, "loss": 0.8449, "step": 13240 }, { "epoch": 0.14819288562305322, "grad_norm": 0.3043031692504883, "learning_rate": 0.0002202019279089954, "loss": 0.8472, "step": 13250 }, { "epoch": 0.1483047293103159, "grad_norm": 0.27216988801956177, "learning_rate": 0.00021997350267257527, "loss": 0.8732, "step": 13260 }, { "epoch": 0.14841657299757857, "grad_norm": 0.2818603515625, "learning_rate": 0.00021974507743615515, "loss": 0.8333, "step": 13270 }, { "epoch": 0.14852841668484126, "grad_norm": 0.2604407072067261, "learning_rate": 0.000219516652199735, "loss": 0.8467, "step": 13280 }, { "epoch": 0.14864026037210396, "grad_norm": 0.28342294692993164, "learning_rate": 0.00021928822696331491, "loss": 0.8292, "step": 13290 }, { "epoch": 0.14875210405936662, "grad_norm": 0.2564396262168884, "learning_rate": 0.0002190598017268948, "loss": 0.8355, "step": 13300 }, { "epoch": 0.1488639477466293, "grad_norm": 0.2528108060359955, "learning_rate": 0.00021883137649047465, "loss": 0.8269, "step": 13310 }, { "epoch": 0.148975791433892, "grad_norm": 0.26454785466194153, "learning_rate": 0.00021860295125405456, "loss": 0.8425, "step": 13320 }, { "epoch": 0.14908763512115467, "grad_norm": 0.25204601883888245, "learning_rate": 0.00021837452601763445, "loss": 0.8251, "step": 13330 }, { "epoch": 0.14919947880841736, "grad_norm": 0.24680152535438538, "learning_rate": 0.0002181461007812143, "loss": 0.8247, "step": 13340 }, { "epoch": 0.14931132249568005, "grad_norm": 0.27356913685798645, "learning_rate": 0.00021791767554479418, "loss": 0.811, "step": 13350 }, { "epoch": 0.1494231661829427, "grad_norm": 0.24703428149223328, "learning_rate": 0.0002176892503083741, "loss": 0.8145, "step": 13360 }, { "epoch": 0.1495350098702054, "grad_norm": 0.27793166041374207, "learning_rate": 0.00021746082507195395, "loss": 0.8162, "step": 13370 }, { "epoch": 0.1496468535574681, "grad_norm": 0.28826582431793213, "learning_rate": 0.00021723239983553383, "loss": 0.8258, "step": 13380 }, { "epoch": 0.14975869724473076, "grad_norm": 0.24826544523239136, "learning_rate": 0.00021700397459911372, "loss": 0.8131, "step": 13390 }, { "epoch": 0.14987054093199345, "grad_norm": 0.29015326499938965, "learning_rate": 0.0002167755493626936, "loss": 0.8241, "step": 13400 }, { "epoch": 0.14998238461925611, "grad_norm": 0.2692265510559082, "learning_rate": 0.00021654712412627348, "loss": 0.8046, "step": 13410 }, { "epoch": 0.1500942283065188, "grad_norm": 0.28277263045310974, "learning_rate": 0.00021631869888985336, "loss": 0.8075, "step": 13420 }, { "epoch": 0.1502060719937815, "grad_norm": 0.25920721888542175, "learning_rate": 0.00021609027365343322, "loss": 0.8146, "step": 13430 }, { "epoch": 0.15031791568104416, "grad_norm": 0.2548248767852783, "learning_rate": 0.00021586184841701313, "loss": 0.82, "step": 13440 }, { "epoch": 0.15042975936830685, "grad_norm": 0.3121783435344696, "learning_rate": 0.000215633423180593, "loss": 0.796, "step": 13450 }, { "epoch": 0.15054160305556955, "grad_norm": 0.2799825370311737, "learning_rate": 0.00021540499794417287, "loss": 0.8073, "step": 13460 }, { "epoch": 0.1506534467428322, "grad_norm": 0.24525675177574158, "learning_rate": 0.00021517657270775275, "loss": 0.804, "step": 13470 }, { "epoch": 0.1507652904300949, "grad_norm": 0.26799294352531433, "learning_rate": 0.00021494814747133266, "loss": 0.8086, "step": 13480 }, { "epoch": 0.1508771341173576, "grad_norm": 0.24744056165218353, "learning_rate": 0.00021471972223491252, "loss": 0.7972, "step": 13490 }, { "epoch": 0.15098897780462026, "grad_norm": 0.27284878492355347, "learning_rate": 0.0002144912969984924, "loss": 0.8048, "step": 13500 }, { "epoch": 0.15110082149188295, "grad_norm": 0.2427281141281128, "learning_rate": 0.00021426287176207225, "loss": 0.8043, "step": 13510 }, { "epoch": 0.15121266517914564, "grad_norm": 0.27432921528816223, "learning_rate": 0.00021403444652565216, "loss": 0.8198, "step": 13520 }, { "epoch": 0.1513245088664083, "grad_norm": 0.26843661069869995, "learning_rate": 0.00021380602128923205, "loss": 0.8156, "step": 13530 }, { "epoch": 0.151436352553671, "grad_norm": 0.2460176795721054, "learning_rate": 0.0002135775960528119, "loss": 0.806, "step": 13540 }, { "epoch": 0.15154819624093366, "grad_norm": 0.24147658050060272, "learning_rate": 0.00021334917081639179, "loss": 0.8146, "step": 13550 }, { "epoch": 0.15166003992819635, "grad_norm": 0.2715270221233368, "learning_rate": 0.0002131207455799717, "loss": 0.8065, "step": 13560 }, { "epoch": 0.15177188361545904, "grad_norm": 0.2851991653442383, "learning_rate": 0.00021289232034355155, "loss": 0.8042, "step": 13570 }, { "epoch": 0.1518837273027217, "grad_norm": 0.2779170870780945, "learning_rate": 0.00021266389510713143, "loss": 0.8163, "step": 13580 }, { "epoch": 0.1519955709899844, "grad_norm": 0.2853197455406189, "learning_rate": 0.00021243546987071132, "loss": 0.8025, "step": 13590 }, { "epoch": 0.1521074146772471, "grad_norm": 0.2753603160381317, "learning_rate": 0.0002122070446342912, "loss": 0.8187, "step": 13600 }, { "epoch": 0.15221925836450975, "grad_norm": 0.29546552896499634, "learning_rate": 0.00021197861939787108, "loss": 0.8189, "step": 13610 }, { "epoch": 0.15233110205177244, "grad_norm": 0.2799798250198364, "learning_rate": 0.00021175019416145097, "loss": 0.8098, "step": 13620 }, { "epoch": 0.15244294573903514, "grad_norm": 0.23527085781097412, "learning_rate": 0.00021152176892503082, "loss": 0.8212, "step": 13630 }, { "epoch": 0.1525547894262978, "grad_norm": 0.27207401394844055, "learning_rate": 0.00021129334368861073, "loss": 0.808, "step": 13640 }, { "epoch": 0.1526666331135605, "grad_norm": 0.26520609855651855, "learning_rate": 0.00021106491845219061, "loss": 0.8133, "step": 13650 }, { "epoch": 0.15277847680082318, "grad_norm": 0.2750151455402374, "learning_rate": 0.00021083649321577047, "loss": 0.8248, "step": 13660 }, { "epoch": 0.15289032048808585, "grad_norm": 0.28339120745658875, "learning_rate": 0.00021060806797935035, "loss": 0.8175, "step": 13670 }, { "epoch": 0.15300216417534854, "grad_norm": 0.27611440420150757, "learning_rate": 0.00021037964274293026, "loss": 0.8232, "step": 13680 }, { "epoch": 0.1531140078626112, "grad_norm": 0.264113187789917, "learning_rate": 0.00021015121750651012, "loss": 0.8217, "step": 13690 }, { "epoch": 0.1532258515498739, "grad_norm": 0.27031853795051575, "learning_rate": 0.00020992279227009, "loss": 0.8242, "step": 13700 }, { "epoch": 0.15333769523713658, "grad_norm": 0.2753359079360962, "learning_rate": 0.00020969436703366988, "loss": 0.8311, "step": 13710 }, { "epoch": 0.15344953892439925, "grad_norm": 0.24859648942947388, "learning_rate": 0.00020946594179724977, "loss": 0.8285, "step": 13720 }, { "epoch": 0.15356138261166194, "grad_norm": 0.2773294448852539, "learning_rate": 0.00020923751656082965, "loss": 0.8201, "step": 13730 }, { "epoch": 0.15367322629892463, "grad_norm": 0.23855488002300262, "learning_rate": 0.0002090090913244095, "loss": 0.8145, "step": 13740 }, { "epoch": 0.1537850699861873, "grad_norm": 0.27641457319259644, "learning_rate": 0.0002087806660879894, "loss": 0.8233, "step": 13750 }, { "epoch": 0.15389691367345, "grad_norm": 0.26556023955345154, "learning_rate": 0.0002085522408515693, "loss": 0.8309, "step": 13760 }, { "epoch": 0.15400875736071268, "grad_norm": 0.2980164885520935, "learning_rate": 0.00020832381561514915, "loss": 0.8585, "step": 13770 }, { "epoch": 0.15412060104797534, "grad_norm": 0.21802592277526855, "learning_rate": 0.00020809539037872904, "loss": 0.8385, "step": 13780 }, { "epoch": 0.15423244473523803, "grad_norm": 0.3153620958328247, "learning_rate": 0.00020786696514230895, "loss": 0.8423, "step": 13790 }, { "epoch": 0.15434428842250072, "grad_norm": 0.2928372621536255, "learning_rate": 0.0002076385399058888, "loss": 0.8399, "step": 13800 }, { "epoch": 0.1544561321097634, "grad_norm": 0.3015557527542114, "learning_rate": 0.00020741011466946868, "loss": 0.843, "step": 13810 }, { "epoch": 0.15456797579702608, "grad_norm": 0.2243575006723404, "learning_rate": 0.00020718168943304857, "loss": 0.8302, "step": 13820 }, { "epoch": 0.15467981948428874, "grad_norm": 0.23281534016132355, "learning_rate": 0.00020695326419662845, "loss": 0.8268, "step": 13830 }, { "epoch": 0.15479166317155144, "grad_norm": 0.2412877380847931, "learning_rate": 0.00020672483896020833, "loss": 0.849, "step": 13840 }, { "epoch": 0.15490350685881413, "grad_norm": 0.2762492001056671, "learning_rate": 0.00020649641372378822, "loss": 0.8324, "step": 13850 }, { "epoch": 0.1550153505460768, "grad_norm": 0.27976560592651367, "learning_rate": 0.00020626798848736807, "loss": 0.843, "step": 13860 }, { "epoch": 0.15512719423333948, "grad_norm": 0.29076194763183594, "learning_rate": 0.00020603956325094798, "loss": 0.8575, "step": 13870 }, { "epoch": 0.15523903792060217, "grad_norm": 0.2367868423461914, "learning_rate": 0.00020581113801452786, "loss": 0.8465, "step": 13880 }, { "epoch": 0.15535088160786484, "grad_norm": 0.26191186904907227, "learning_rate": 0.00020558271277810772, "loss": 0.8291, "step": 13890 }, { "epoch": 0.15546272529512753, "grad_norm": 0.27254414558410645, "learning_rate": 0.0002053542875416876, "loss": 0.8347, "step": 13900 }, { "epoch": 0.15557456898239022, "grad_norm": 0.2718988060951233, "learning_rate": 0.0002051258623052675, "loss": 0.8319, "step": 13910 }, { "epoch": 0.15568641266965288, "grad_norm": 0.24478264153003693, "learning_rate": 0.00020489743706884737, "loss": 0.8369, "step": 13920 }, { "epoch": 0.15579825635691558, "grad_norm": 0.27791038155555725, "learning_rate": 0.00020466901183242725, "loss": 0.8486, "step": 13930 }, { "epoch": 0.15591010004417827, "grad_norm": 0.27220630645751953, "learning_rate": 0.00020444058659600713, "loss": 0.8335, "step": 13940 }, { "epoch": 0.15602194373144093, "grad_norm": 0.2945479154586792, "learning_rate": 0.00020421216135958702, "loss": 0.8234, "step": 13950 }, { "epoch": 0.15613378741870362, "grad_norm": 0.2911258041858673, "learning_rate": 0.0002039837361231669, "loss": 0.8279, "step": 13960 }, { "epoch": 0.15624563110596631, "grad_norm": 0.3039700984954834, "learning_rate": 0.00020375531088674676, "loss": 0.8409, "step": 13970 }, { "epoch": 0.15635747479322898, "grad_norm": 0.27290788292884827, "learning_rate": 0.00020352688565032664, "loss": 0.8394, "step": 13980 }, { "epoch": 0.15646931848049167, "grad_norm": 0.28534916043281555, "learning_rate": 0.00020329846041390655, "loss": 0.8431, "step": 13990 }, { "epoch": 0.15658116216775433, "grad_norm": 0.304221510887146, "learning_rate": 0.0002030700351774864, "loss": 0.8476, "step": 14000 }, { "epoch": 0.15669300585501703, "grad_norm": 0.3151461184024811, "learning_rate": 0.0002028416099410663, "loss": 0.852, "step": 14010 }, { "epoch": 0.15680484954227972, "grad_norm": 0.2947019040584564, "learning_rate": 0.00020261318470464617, "loss": 0.8396, "step": 14020 }, { "epoch": 0.15691669322954238, "grad_norm": 0.2737627625465393, "learning_rate": 0.00020238475946822605, "loss": 0.8337, "step": 14030 }, { "epoch": 0.15702853691680507, "grad_norm": 0.28257089853286743, "learning_rate": 0.00020215633423180594, "loss": 0.8475, "step": 14040 }, { "epoch": 0.15714038060406776, "grad_norm": 0.3102625608444214, "learning_rate": 0.00020192790899538582, "loss": 0.8451, "step": 14050 }, { "epoch": 0.15725222429133043, "grad_norm": 0.2839931845664978, "learning_rate": 0.00020169948375896567, "loss": 0.8365, "step": 14060 }, { "epoch": 0.15736406797859312, "grad_norm": 0.25566980242729187, "learning_rate": 0.00020147105852254558, "loss": 0.8287, "step": 14070 }, { "epoch": 0.1574759116658558, "grad_norm": 0.267791211605072, "learning_rate": 0.00020124263328612547, "loss": 0.8289, "step": 14080 }, { "epoch": 0.15758775535311847, "grad_norm": 0.267635703086853, "learning_rate": 0.00020101420804970532, "loss": 0.8357, "step": 14090 }, { "epoch": 0.15769959904038117, "grad_norm": 0.28065699338912964, "learning_rate": 0.0002007857828132852, "loss": 0.8363, "step": 14100 }, { "epoch": 0.15781144272764386, "grad_norm": 0.26585736870765686, "learning_rate": 0.00020055735757686512, "loss": 0.8409, "step": 14110 }, { "epoch": 0.15792328641490652, "grad_norm": 0.2562732398509979, "learning_rate": 0.00020032893234044497, "loss": 0.8374, "step": 14120 }, { "epoch": 0.1580351301021692, "grad_norm": 0.2572222650051117, "learning_rate": 0.00020010050710402485, "loss": 0.8405, "step": 14130 }, { "epoch": 0.15814697378943188, "grad_norm": 0.3075050413608551, "learning_rate": 0.00019987208186760474, "loss": 0.825, "step": 14140 }, { "epoch": 0.15825881747669457, "grad_norm": 0.2630293071269989, "learning_rate": 0.00019964365663118462, "loss": 0.8326, "step": 14150 }, { "epoch": 0.15837066116395726, "grad_norm": 0.255015105009079, "learning_rate": 0.0001994152313947645, "loss": 0.8181, "step": 14160 }, { "epoch": 0.15848250485121992, "grad_norm": 0.25929179787635803, "learning_rate": 0.00019918680615834438, "loss": 0.8067, "step": 14170 }, { "epoch": 0.15859434853848262, "grad_norm": 0.27078965306282043, "learning_rate": 0.00019895838092192424, "loss": 0.8043, "step": 14180 }, { "epoch": 0.1587061922257453, "grad_norm": 0.2618376612663269, "learning_rate": 0.00019872995568550415, "loss": 0.8191, "step": 14190 }, { "epoch": 0.15881803591300797, "grad_norm": 0.246153324842453, "learning_rate": 0.000198501530449084, "loss": 0.8251, "step": 14200 }, { "epoch": 0.15892987960027066, "grad_norm": 0.25498026609420776, "learning_rate": 0.0001982731052126639, "loss": 0.8319, "step": 14210 }, { "epoch": 0.15904172328753335, "grad_norm": 0.2517942190170288, "learning_rate": 0.0001980446799762438, "loss": 0.8106, "step": 14220 }, { "epoch": 0.15915356697479602, "grad_norm": 0.2659161388874054, "learning_rate": 0.00019781625473982365, "loss": 0.8163, "step": 14230 }, { "epoch": 0.1592654106620587, "grad_norm": 0.24527288973331451, "learning_rate": 0.00019758782950340354, "loss": 0.8359, "step": 14240 }, { "epoch": 0.1593772543493214, "grad_norm": 0.23943792283535004, "learning_rate": 0.00019735940426698342, "loss": 0.8253, "step": 14250 }, { "epoch": 0.15948909803658406, "grad_norm": 0.30401650071144104, "learning_rate": 0.0001971309790305633, "loss": 0.8369, "step": 14260 }, { "epoch": 0.15960094172384676, "grad_norm": 0.25001001358032227, "learning_rate": 0.00019690255379414319, "loss": 0.8354, "step": 14270 }, { "epoch": 0.15971278541110942, "grad_norm": 0.2378586083650589, "learning_rate": 0.00019667412855772307, "loss": 0.8324, "step": 14280 }, { "epoch": 0.1598246290983721, "grad_norm": 0.26216059923171997, "learning_rate": 0.00019644570332130292, "loss": 0.8227, "step": 14290 }, { "epoch": 0.1599364727856348, "grad_norm": 0.24156969785690308, "learning_rate": 0.00019621727808488283, "loss": 0.8362, "step": 14300 }, { "epoch": 0.16004831647289747, "grad_norm": 0.24192091822624207, "learning_rate": 0.00019598885284846272, "loss": 0.835, "step": 14310 }, { "epoch": 0.16016016016016016, "grad_norm": 0.24861887097358704, "learning_rate": 0.00019576042761204257, "loss": 0.8232, "step": 14320 }, { "epoch": 0.16027200384742285, "grad_norm": 0.27175864577293396, "learning_rate": 0.00019553200237562246, "loss": 0.8303, "step": 14330 }, { "epoch": 0.16038384753468551, "grad_norm": 0.272334486246109, "learning_rate": 0.00019530357713920237, "loss": 0.8217, "step": 14340 }, { "epoch": 0.1604956912219482, "grad_norm": 0.28357213735580444, "learning_rate": 0.00019507515190278222, "loss": 0.8343, "step": 14350 }, { "epoch": 0.1606075349092109, "grad_norm": 0.272276371717453, "learning_rate": 0.0001948467266663621, "loss": 0.8235, "step": 14360 }, { "epoch": 0.16071937859647356, "grad_norm": 0.26771044731140137, "learning_rate": 0.000194618301429942, "loss": 0.8292, "step": 14370 }, { "epoch": 0.16083122228373625, "grad_norm": 0.27449774742126465, "learning_rate": 0.00019438987619352187, "loss": 0.8485, "step": 14380 }, { "epoch": 0.16094306597099894, "grad_norm": 0.26026156544685364, "learning_rate": 0.00019416145095710175, "loss": 0.8458, "step": 14390 }, { "epoch": 0.1610549096582616, "grad_norm": 0.2667345404624939, "learning_rate": 0.00019393302572068164, "loss": 0.8519, "step": 14400 }, { "epoch": 0.1611667533455243, "grad_norm": 0.26302048563957214, "learning_rate": 0.0001937046004842615, "loss": 0.8353, "step": 14410 }, { "epoch": 0.16127859703278696, "grad_norm": 0.24420003592967987, "learning_rate": 0.0001934761752478414, "loss": 0.8464, "step": 14420 }, { "epoch": 0.16139044072004965, "grad_norm": 0.2739315629005432, "learning_rate": 0.00019324775001142126, "loss": 0.8257, "step": 14430 }, { "epoch": 0.16150228440731235, "grad_norm": 0.2370629757642746, "learning_rate": 0.00019301932477500114, "loss": 0.8324, "step": 14440 }, { "epoch": 0.161614128094575, "grad_norm": 0.2616153955459595, "learning_rate": 0.00019279089953858102, "loss": 0.8513, "step": 14450 }, { "epoch": 0.1617259717818377, "grad_norm": 0.2527558207511902, "learning_rate": 0.0001925624743021609, "loss": 0.8435, "step": 14460 }, { "epoch": 0.1618378154691004, "grad_norm": 0.28255122900009155, "learning_rate": 0.0001923340490657408, "loss": 0.8497, "step": 14470 }, { "epoch": 0.16194965915636306, "grad_norm": 0.23198026418685913, "learning_rate": 0.00019210562382932067, "loss": 0.8357, "step": 14480 }, { "epoch": 0.16206150284362575, "grad_norm": 0.2534460127353668, "learning_rate": 0.00019187719859290053, "loss": 0.8396, "step": 14490 }, { "epoch": 0.16217334653088844, "grad_norm": 0.2693686783313751, "learning_rate": 0.00019164877335648044, "loss": 0.8438, "step": 14500 }, { "epoch": 0.1622851902181511, "grad_norm": 0.26181599497795105, "learning_rate": 0.00019142034812006032, "loss": 0.8452, "step": 14510 }, { "epoch": 0.1623970339054138, "grad_norm": 0.2268761545419693, "learning_rate": 0.00019119192288364017, "loss": 0.8496, "step": 14520 }, { "epoch": 0.1625088775926765, "grad_norm": 0.27698907256126404, "learning_rate": 0.00019096349764722006, "loss": 0.8265, "step": 14530 }, { "epoch": 0.16262072127993915, "grad_norm": 0.30570700764656067, "learning_rate": 0.00019073507241079997, "loss": 0.8399, "step": 14540 }, { "epoch": 0.16273256496720184, "grad_norm": 0.2894477844238281, "learning_rate": 0.00019050664717437982, "loss": 0.8488, "step": 14550 }, { "epoch": 0.16284440865446453, "grad_norm": 0.3094457685947418, "learning_rate": 0.0001902782219379597, "loss": 0.8243, "step": 14560 }, { "epoch": 0.1629562523417272, "grad_norm": 0.2908037602901459, "learning_rate": 0.0001900497967015396, "loss": 0.835, "step": 14570 }, { "epoch": 0.1630680960289899, "grad_norm": 0.27222102880477905, "learning_rate": 0.00018982137146511947, "loss": 0.8306, "step": 14580 }, { "epoch": 0.16317993971625255, "grad_norm": 0.2542339563369751, "learning_rate": 0.00018959294622869935, "loss": 0.8259, "step": 14590 }, { "epoch": 0.16329178340351524, "grad_norm": 0.28288012742996216, "learning_rate": 0.00018936452099227924, "loss": 0.8243, "step": 14600 }, { "epoch": 0.16340362709077794, "grad_norm": 0.2584143877029419, "learning_rate": 0.0001891360957558591, "loss": 0.8224, "step": 14610 }, { "epoch": 0.1635154707780406, "grad_norm": 0.26679450273513794, "learning_rate": 0.000188907670519439, "loss": 0.8142, "step": 14620 }, { "epoch": 0.1636273144653033, "grad_norm": 0.24589306116104126, "learning_rate": 0.00018867924528301889, "loss": 0.81, "step": 14630 }, { "epoch": 0.16373915815256598, "grad_norm": 0.28474611043930054, "learning_rate": 0.00018845082004659874, "loss": 0.7989, "step": 14640 }, { "epoch": 0.16385100183982865, "grad_norm": 0.27567991614341736, "learning_rate": 0.00018822239481017862, "loss": 0.8049, "step": 14650 }, { "epoch": 0.16396284552709134, "grad_norm": 0.2509905695915222, "learning_rate": 0.0001879939695737585, "loss": 0.8168, "step": 14660 }, { "epoch": 0.16407468921435403, "grad_norm": 0.30284953117370605, "learning_rate": 0.0001877655443373384, "loss": 0.8055, "step": 14670 }, { "epoch": 0.1641865329016167, "grad_norm": 0.27638325095176697, "learning_rate": 0.00018753711910091827, "loss": 0.8368, "step": 14680 }, { "epoch": 0.16429837658887939, "grad_norm": 0.29546642303466797, "learning_rate": 0.00018730869386449816, "loss": 0.8161, "step": 14690 }, { "epoch": 0.16441022027614208, "grad_norm": 0.2483370304107666, "learning_rate": 0.00018708026862807804, "loss": 0.8136, "step": 14700 }, { "epoch": 0.16452206396340474, "grad_norm": 0.2862898111343384, "learning_rate": 0.00018685184339165792, "loss": 0.836, "step": 14710 }, { "epoch": 0.16463390765066743, "grad_norm": 0.2730434238910675, "learning_rate": 0.00018662341815523778, "loss": 0.8279, "step": 14720 }, { "epoch": 0.1647457513379301, "grad_norm": 0.2846275269985199, "learning_rate": 0.0001863949929188177, "loss": 0.7991, "step": 14730 }, { "epoch": 0.1648575950251928, "grad_norm": 0.2455524355173111, "learning_rate": 0.00018616656768239757, "loss": 0.7931, "step": 14740 }, { "epoch": 0.16496943871245548, "grad_norm": 0.25060829520225525, "learning_rate": 0.00018593814244597743, "loss": 0.8009, "step": 14750 }, { "epoch": 0.16508128239971814, "grad_norm": 0.2687000334262848, "learning_rate": 0.0001857097172095573, "loss": 0.7968, "step": 14760 }, { "epoch": 0.16519312608698083, "grad_norm": 0.28619691729545593, "learning_rate": 0.00018548129197313722, "loss": 0.7818, "step": 14770 }, { "epoch": 0.16530496977424353, "grad_norm": 0.2549494206905365, "learning_rate": 0.00018525286673671707, "loss": 0.7877, "step": 14780 }, { "epoch": 0.1654168134615062, "grad_norm": 0.2419700175523758, "learning_rate": 0.00018502444150029696, "loss": 0.7899, "step": 14790 }, { "epoch": 0.16552865714876888, "grad_norm": 0.2636066675186157, "learning_rate": 0.00018479601626387684, "loss": 0.7893, "step": 14800 }, { "epoch": 0.16564050083603157, "grad_norm": 0.264072984457016, "learning_rate": 0.00018456759102745672, "loss": 0.7984, "step": 14810 }, { "epoch": 0.16575234452329424, "grad_norm": 0.2661677598953247, "learning_rate": 0.0001843391657910366, "loss": 0.8085, "step": 14820 }, { "epoch": 0.16586418821055693, "grad_norm": 0.28324052691459656, "learning_rate": 0.0001841107405546165, "loss": 0.8066, "step": 14830 }, { "epoch": 0.16597603189781962, "grad_norm": 0.277761310338974, "learning_rate": 0.00018388231531819634, "loss": 0.8008, "step": 14840 }, { "epoch": 0.16608787558508228, "grad_norm": 0.2669602036476135, "learning_rate": 0.00018365389008177625, "loss": 0.8285, "step": 14850 }, { "epoch": 0.16619971927234498, "grad_norm": 0.28757140040397644, "learning_rate": 0.00018342546484535614, "loss": 0.8121, "step": 14860 }, { "epoch": 0.16631156295960764, "grad_norm": 0.2616439163684845, "learning_rate": 0.000183197039608936, "loss": 0.8185, "step": 14870 }, { "epoch": 0.16642340664687033, "grad_norm": 0.28334370255470276, "learning_rate": 0.00018296861437251587, "loss": 0.8229, "step": 14880 }, { "epoch": 0.16653525033413302, "grad_norm": 0.2659022808074951, "learning_rate": 0.00018274018913609576, "loss": 0.82, "step": 14890 }, { "epoch": 0.1666470940213957, "grad_norm": 0.2544262111186981, "learning_rate": 0.00018251176389967564, "loss": 0.84, "step": 14900 }, { "epoch": 0.16675893770865838, "grad_norm": 0.27492937445640564, "learning_rate": 0.00018228333866325552, "loss": 0.8411, "step": 14910 }, { "epoch": 0.16687078139592107, "grad_norm": 0.2961216866970062, "learning_rate": 0.00018205491342683538, "loss": 0.8178, "step": 14920 }, { "epoch": 0.16698262508318373, "grad_norm": 0.2704416811466217, "learning_rate": 0.0001818264881904153, "loss": 0.8264, "step": 14930 }, { "epoch": 0.16709446877044642, "grad_norm": 0.261704683303833, "learning_rate": 0.00018159806295399517, "loss": 0.8307, "step": 14940 }, { "epoch": 0.16720631245770912, "grad_norm": 0.26157405972480774, "learning_rate": 0.00018136963771757503, "loss": 0.8064, "step": 14950 }, { "epoch": 0.16731815614497178, "grad_norm": 0.2589896023273468, "learning_rate": 0.0001811412124811549, "loss": 0.8195, "step": 14960 }, { "epoch": 0.16742999983223447, "grad_norm": 0.24691319465637207, "learning_rate": 0.00018091278724473482, "loss": 0.8283, "step": 14970 }, { "epoch": 0.16754184351949716, "grad_norm": 0.2527819871902466, "learning_rate": 0.00018068436200831468, "loss": 0.8229, "step": 14980 }, { "epoch": 0.16765368720675983, "grad_norm": 0.2639094293117523, "learning_rate": 0.00018045593677189456, "loss": 0.8393, "step": 14990 }, { "epoch": 0.16776553089402252, "grad_norm": 0.24417634308338165, "learning_rate": 0.00018022751153547444, "loss": 0.8204, "step": 15000 }, { "epoch": 0.16787737458128518, "grad_norm": 0.25673115253448486, "learning_rate": 0.00017999908629905432, "loss": 0.8184, "step": 15010 }, { "epoch": 0.16798921826854787, "grad_norm": 0.254077285528183, "learning_rate": 0.0001797706610626342, "loss": 0.8195, "step": 15020 }, { "epoch": 0.16810106195581057, "grad_norm": 0.2455417662858963, "learning_rate": 0.0001795422358262141, "loss": 0.8255, "step": 15030 }, { "epoch": 0.16821290564307323, "grad_norm": 0.27918189764022827, "learning_rate": 0.00017931381058979395, "loss": 0.8345, "step": 15040 }, { "epoch": 0.16832474933033592, "grad_norm": 0.2272186279296875, "learning_rate": 0.00017908538535337386, "loss": 0.8178, "step": 15050 }, { "epoch": 0.1684365930175986, "grad_norm": 0.269189715385437, "learning_rate": 0.00017885696011695374, "loss": 0.8343, "step": 15060 }, { "epoch": 0.16854843670486128, "grad_norm": 0.2805529832839966, "learning_rate": 0.0001786285348805336, "loss": 0.8126, "step": 15070 }, { "epoch": 0.16866028039212397, "grad_norm": 0.28788769245147705, "learning_rate": 0.00017840010964411348, "loss": 0.8278, "step": 15080 }, { "epoch": 0.16877212407938666, "grad_norm": 0.2439277619123459, "learning_rate": 0.00017817168440769336, "loss": 0.8272, "step": 15090 }, { "epoch": 0.16888396776664932, "grad_norm": 0.3151440918445587, "learning_rate": 0.00017794325917127324, "loss": 0.8201, "step": 15100 }, { "epoch": 0.16899581145391201, "grad_norm": 0.2562885880470276, "learning_rate": 0.00017771483393485313, "loss": 0.8275, "step": 15110 }, { "epoch": 0.1691076551411747, "grad_norm": 0.2718476355075836, "learning_rate": 0.00017748640869843298, "loss": 0.821, "step": 15120 }, { "epoch": 0.16921949882843737, "grad_norm": 0.2699459493160248, "learning_rate": 0.0001772579834620129, "loss": 0.8352, "step": 15130 }, { "epoch": 0.16933134251570006, "grad_norm": 0.29737600684165955, "learning_rate": 0.00017702955822559277, "loss": 0.8279, "step": 15140 }, { "epoch": 0.16944318620296273, "grad_norm": 0.3075369894504547, "learning_rate": 0.00017680113298917263, "loss": 0.8037, "step": 15150 }, { "epoch": 0.16955502989022542, "grad_norm": 0.27061593532562256, "learning_rate": 0.00017657270775275254, "loss": 0.8149, "step": 15160 }, { "epoch": 0.1696668735774881, "grad_norm": 0.26719844341278076, "learning_rate": 0.00017634428251633242, "loss": 0.7896, "step": 15170 }, { "epoch": 0.16977871726475077, "grad_norm": 0.2871409058570862, "learning_rate": 0.00017611585727991228, "loss": 0.7863, "step": 15180 }, { "epoch": 0.16989056095201346, "grad_norm": 0.2502906620502472, "learning_rate": 0.00017588743204349216, "loss": 0.7817, "step": 15190 }, { "epoch": 0.17000240463927616, "grad_norm": 0.2579248547554016, "learning_rate": 0.00017565900680707207, "loss": 0.796, "step": 15200 }, { "epoch": 0.17011424832653882, "grad_norm": 0.2537415325641632, "learning_rate": 0.00017543058157065193, "loss": 0.78, "step": 15210 }, { "epoch": 0.1702260920138015, "grad_norm": 0.2420157790184021, "learning_rate": 0.0001752021563342318, "loss": 0.7946, "step": 15220 }, { "epoch": 0.1703379357010642, "grad_norm": 0.2423790544271469, "learning_rate": 0.0001749737310978117, "loss": 0.797, "step": 15230 }, { "epoch": 0.17044977938832687, "grad_norm": 0.2521071434020996, "learning_rate": 0.00017474530586139157, "loss": 0.8073, "step": 15240 }, { "epoch": 0.17056162307558956, "grad_norm": 0.22921273112297058, "learning_rate": 0.00017451688062497146, "loss": 0.7916, "step": 15250 }, { "epoch": 0.17067346676285225, "grad_norm": 0.35150206089019775, "learning_rate": 0.00017428845538855134, "loss": 0.8001, "step": 15260 }, { "epoch": 0.1707853104501149, "grad_norm": 0.27637869119644165, "learning_rate": 0.0001740600301521312, "loss": 0.7948, "step": 15270 }, { "epoch": 0.1708971541373776, "grad_norm": 0.22480230033397675, "learning_rate": 0.0001738316049157111, "loss": 0.7932, "step": 15280 }, { "epoch": 0.1710089978246403, "grad_norm": 0.27264508605003357, "learning_rate": 0.000173603179679291, "loss": 0.8083, "step": 15290 }, { "epoch": 0.17112084151190296, "grad_norm": 0.2647417485713959, "learning_rate": 0.00017337475444287084, "loss": 0.8177, "step": 15300 }, { "epoch": 0.17123268519916565, "grad_norm": 0.23619987070560455, "learning_rate": 0.00017314632920645073, "loss": 0.8068, "step": 15310 }, { "epoch": 0.17134452888642832, "grad_norm": 0.22450131177902222, "learning_rate": 0.0001729179039700306, "loss": 0.8004, "step": 15320 }, { "epoch": 0.171456372573691, "grad_norm": 0.2784859240055084, "learning_rate": 0.0001726894787336105, "loss": 0.7938, "step": 15330 }, { "epoch": 0.1715682162609537, "grad_norm": 0.25513574481010437, "learning_rate": 0.00017246105349719038, "loss": 0.7844, "step": 15340 }, { "epoch": 0.17168005994821636, "grad_norm": 0.27425146102905273, "learning_rate": 0.00017223262826077023, "loss": 0.7906, "step": 15350 }, { "epoch": 0.17179190363547905, "grad_norm": 0.2500791847705841, "learning_rate": 0.00017200420302435014, "loss": 0.7834, "step": 15360 }, { "epoch": 0.17190374732274175, "grad_norm": 0.2550630271434784, "learning_rate": 0.00017177577778793002, "loss": 0.7736, "step": 15370 }, { "epoch": 0.1720155910100044, "grad_norm": 0.25209444761276245, "learning_rate": 0.00017154735255150988, "loss": 0.773, "step": 15380 }, { "epoch": 0.1721274346972671, "grad_norm": 0.2347812056541443, "learning_rate": 0.00017131892731508976, "loss": 0.7745, "step": 15390 }, { "epoch": 0.1722392783845298, "grad_norm": 0.2858305871486664, "learning_rate": 0.00017109050207866967, "loss": 0.7776, "step": 15400 }, { "epoch": 0.17235112207179246, "grad_norm": 0.30414941906929016, "learning_rate": 0.00017086207684224953, "loss": 0.7701, "step": 15410 }, { "epoch": 0.17246296575905515, "grad_norm": 0.2645011842250824, "learning_rate": 0.0001706336516058294, "loss": 0.7746, "step": 15420 }, { "epoch": 0.17257480944631784, "grad_norm": 0.2984048128128052, "learning_rate": 0.0001704052263694093, "loss": 0.771, "step": 15430 }, { "epoch": 0.1726866531335805, "grad_norm": 0.2734147906303406, "learning_rate": 0.00017017680113298918, "loss": 0.7769, "step": 15440 }, { "epoch": 0.1727984968208432, "grad_norm": 0.2632124125957489, "learning_rate": 0.00016994837589656906, "loss": 0.7754, "step": 15450 }, { "epoch": 0.17291034050810586, "grad_norm": 0.29384443163871765, "learning_rate": 0.00016971995066014894, "loss": 0.7833, "step": 15460 }, { "epoch": 0.17302218419536855, "grad_norm": 0.3194182813167572, "learning_rate": 0.0001694915254237288, "loss": 0.7813, "step": 15470 }, { "epoch": 0.17313402788263124, "grad_norm": 0.25995251536369324, "learning_rate": 0.0001692631001873087, "loss": 0.7796, "step": 15480 }, { "epoch": 0.1732458715698939, "grad_norm": 0.272419810295105, "learning_rate": 0.0001690346749508886, "loss": 0.7839, "step": 15490 }, { "epoch": 0.1733577152571566, "grad_norm": 0.26239413022994995, "learning_rate": 0.00016880624971446845, "loss": 0.7807, "step": 15500 }, { "epoch": 0.1734695589444193, "grad_norm": 0.29991698265075684, "learning_rate": 0.00016857782447804833, "loss": 0.7941, "step": 15510 }, { "epoch": 0.17358140263168195, "grad_norm": 0.2812528908252716, "learning_rate": 0.00016834939924162824, "loss": 0.7863, "step": 15520 }, { "epoch": 0.17369324631894464, "grad_norm": 0.2557685077190399, "learning_rate": 0.0001681209740052081, "loss": 0.7953, "step": 15530 }, { "epoch": 0.17380509000620734, "grad_norm": 0.28565913438796997, "learning_rate": 0.00016789254876878798, "loss": 0.7934, "step": 15540 }, { "epoch": 0.17391693369347, "grad_norm": 0.25316086411476135, "learning_rate": 0.00016766412353236783, "loss": 0.7969, "step": 15550 }, { "epoch": 0.1740287773807327, "grad_norm": 0.2636478543281555, "learning_rate": 0.00016743569829594774, "loss": 0.8021, "step": 15560 }, { "epoch": 0.17414062106799538, "grad_norm": 0.28839442133903503, "learning_rate": 0.00016720727305952763, "loss": 0.8108, "step": 15570 }, { "epoch": 0.17425246475525805, "grad_norm": 0.2453639954328537, "learning_rate": 0.00016697884782310748, "loss": 0.8034, "step": 15580 }, { "epoch": 0.17436430844252074, "grad_norm": 0.2550848424434662, "learning_rate": 0.0001667504225866874, "loss": 0.8169, "step": 15590 }, { "epoch": 0.1744761521297834, "grad_norm": 0.24949923157691956, "learning_rate": 0.00016652199735026727, "loss": 0.8167, "step": 15600 }, { "epoch": 0.1745879958170461, "grad_norm": 0.24357125163078308, "learning_rate": 0.00016629357211384713, "loss": 0.821, "step": 15610 }, { "epoch": 0.17469983950430878, "grad_norm": 0.2246461659669876, "learning_rate": 0.000166065146877427, "loss": 0.82, "step": 15620 }, { "epoch": 0.17481168319157145, "grad_norm": 0.26160740852355957, "learning_rate": 0.00016583672164100692, "loss": 0.8167, "step": 15630 }, { "epoch": 0.17492352687883414, "grad_norm": 0.25773337483406067, "learning_rate": 0.00016560829640458678, "loss": 0.8305, "step": 15640 }, { "epoch": 0.17503537056609683, "grad_norm": 0.24051527678966522, "learning_rate": 0.00016537987116816666, "loss": 0.8201, "step": 15650 }, { "epoch": 0.1751472142533595, "grad_norm": 0.2507860064506531, "learning_rate": 0.00016515144593174654, "loss": 0.8444, "step": 15660 }, { "epoch": 0.1752590579406222, "grad_norm": 0.24071821570396423, "learning_rate": 0.00016492302069532643, "loss": 0.8071, "step": 15670 }, { "epoch": 0.17537090162788488, "grad_norm": 0.2533905506134033, "learning_rate": 0.0001646945954589063, "loss": 0.8164, "step": 15680 }, { "epoch": 0.17548274531514754, "grad_norm": 0.2546316683292389, "learning_rate": 0.0001644661702224862, "loss": 0.8237, "step": 15690 }, { "epoch": 0.17559458900241023, "grad_norm": 0.25692155957221985, "learning_rate": 0.00016423774498606605, "loss": 0.8198, "step": 15700 }, { "epoch": 0.17570643268967293, "grad_norm": 0.254535436630249, "learning_rate": 0.00016400931974964596, "loss": 0.8061, "step": 15710 }, { "epoch": 0.1758182763769356, "grad_norm": 0.2557326555252075, "learning_rate": 0.00016378089451322584, "loss": 0.8194, "step": 15720 }, { "epoch": 0.17593012006419828, "grad_norm": 0.24234241247177124, "learning_rate": 0.0001635524692768057, "loss": 0.8183, "step": 15730 }, { "epoch": 0.17604196375146094, "grad_norm": 0.2597709596157074, "learning_rate": 0.00016332404404038558, "loss": 0.7957, "step": 15740 }, { "epoch": 0.17615380743872364, "grad_norm": 0.2896418273448944, "learning_rate": 0.0001630956188039655, "loss": 0.8146, "step": 15750 }, { "epoch": 0.17626565112598633, "grad_norm": 0.2686966061592102, "learning_rate": 0.00016286719356754535, "loss": 0.7988, "step": 15760 }, { "epoch": 0.176377494813249, "grad_norm": 0.26220840215682983, "learning_rate": 0.00016263876833112523, "loss": 0.7936, "step": 15770 }, { "epoch": 0.17648933850051168, "grad_norm": 0.260547012090683, "learning_rate": 0.00016241034309470508, "loss": 0.8002, "step": 15780 }, { "epoch": 0.17660118218777437, "grad_norm": 0.22341471910476685, "learning_rate": 0.000162181917858285, "loss": 0.7935, "step": 15790 }, { "epoch": 0.17671302587503704, "grad_norm": 0.24994009733200073, "learning_rate": 0.00016195349262186488, "loss": 0.7971, "step": 15800 }, { "epoch": 0.17682486956229973, "grad_norm": 0.24070651829242706, "learning_rate": 0.00016172506738544473, "loss": 0.7844, "step": 15810 }, { "epoch": 0.17693671324956242, "grad_norm": 0.23858696222305298, "learning_rate": 0.00016149664214902461, "loss": 0.7687, "step": 15820 }, { "epoch": 0.17704855693682509, "grad_norm": 0.24684946238994598, "learning_rate": 0.00016126821691260452, "loss": 0.7848, "step": 15830 }, { "epoch": 0.17716040062408778, "grad_norm": 0.2525545656681061, "learning_rate": 0.00016103979167618438, "loss": 0.773, "step": 15840 }, { "epoch": 0.17727224431135047, "grad_norm": 0.2485392689704895, "learning_rate": 0.00016081136643976426, "loss": 0.7787, "step": 15850 }, { "epoch": 0.17738408799861313, "grad_norm": 0.2384241223335266, "learning_rate": 0.00016058294120334415, "loss": 0.7732, "step": 15860 }, { "epoch": 0.17749593168587582, "grad_norm": 0.25029659271240234, "learning_rate": 0.00016035451596692403, "loss": 0.7819, "step": 15870 }, { "epoch": 0.1776077753731385, "grad_norm": 0.2988499701023102, "learning_rate": 0.0001601260907305039, "loss": 0.7815, "step": 15880 }, { "epoch": 0.17771961906040118, "grad_norm": 0.25840380787849426, "learning_rate": 0.0001598976654940838, "loss": 0.7899, "step": 15890 }, { "epoch": 0.17783146274766387, "grad_norm": 0.2870889902114868, "learning_rate": 0.00015966924025766365, "loss": 0.7964, "step": 15900 }, { "epoch": 0.17794330643492653, "grad_norm": 0.270702987909317, "learning_rate": 0.00015944081502124356, "loss": 0.7907, "step": 15910 }, { "epoch": 0.17805515012218923, "grad_norm": 0.24939289689064026, "learning_rate": 0.00015921238978482344, "loss": 0.7909, "step": 15920 }, { "epoch": 0.17816699380945192, "grad_norm": 0.25692620873451233, "learning_rate": 0.0001589839645484033, "loss": 0.7864, "step": 15930 }, { "epoch": 0.17827883749671458, "grad_norm": 0.25667235255241394, "learning_rate": 0.00015875553931198318, "loss": 0.7792, "step": 15940 }, { "epoch": 0.17839068118397727, "grad_norm": 0.27988189458847046, "learning_rate": 0.0001585271140755631, "loss": 0.78, "step": 15950 }, { "epoch": 0.17850252487123996, "grad_norm": 0.26706936955451965, "learning_rate": 0.00015829868883914295, "loss": 0.7764, "step": 15960 }, { "epoch": 0.17861436855850263, "grad_norm": 0.25825801491737366, "learning_rate": 0.00015807026360272283, "loss": 0.7798, "step": 15970 }, { "epoch": 0.17872621224576532, "grad_norm": 0.26630404591560364, "learning_rate": 0.0001578418383663027, "loss": 0.7877, "step": 15980 }, { "epoch": 0.178838055933028, "grad_norm": 0.24562442302703857, "learning_rate": 0.0001576134131298826, "loss": 0.7761, "step": 15990 }, { "epoch": 0.17894989962029068, "grad_norm": 0.2607520818710327, "learning_rate": 0.00015738498789346248, "loss": 0.7844, "step": 16000 }, { "epoch": 0.17906174330755337, "grad_norm": 0.25256794691085815, "learning_rate": 0.00015715656265704233, "loss": 0.7712, "step": 16010 }, { "epoch": 0.17917358699481606, "grad_norm": 0.24657808244228363, "learning_rate": 0.00015692813742062222, "loss": 0.7766, "step": 16020 }, { "epoch": 0.17928543068207872, "grad_norm": 0.2546744644641876, "learning_rate": 0.00015669971218420213, "loss": 0.781, "step": 16030 }, { "epoch": 0.17939727436934141, "grad_norm": 0.24849241971969604, "learning_rate": 0.00015647128694778198, "loss": 0.786, "step": 16040 }, { "epoch": 0.17950911805660408, "grad_norm": 0.2447352409362793, "learning_rate": 0.00015624286171136187, "loss": 0.7805, "step": 16050 }, { "epoch": 0.17962096174386677, "grad_norm": 0.3004114031791687, "learning_rate": 0.00015601443647494178, "loss": 0.7748, "step": 16060 }, { "epoch": 0.17973280543112946, "grad_norm": 0.24974007904529572, "learning_rate": 0.00015578601123852163, "loss": 0.7823, "step": 16070 }, { "epoch": 0.17984464911839212, "grad_norm": 0.2995624542236328, "learning_rate": 0.00015555758600210151, "loss": 0.7894, "step": 16080 }, { "epoch": 0.17995649280565482, "grad_norm": 0.2560220956802368, "learning_rate": 0.0001553291607656814, "loss": 0.7849, "step": 16090 }, { "epoch": 0.1800683364929175, "grad_norm": 0.24940122663974762, "learning_rate": 0.00015510073552926128, "loss": 0.7903, "step": 16100 }, { "epoch": 0.18018018018018017, "grad_norm": 0.22082312405109406, "learning_rate": 0.00015487231029284116, "loss": 0.783, "step": 16110 }, { "epoch": 0.18029202386744286, "grad_norm": 0.2670224606990814, "learning_rate": 0.00015464388505642104, "loss": 0.7919, "step": 16120 }, { "epoch": 0.18040386755470555, "grad_norm": 0.2533135414123535, "learning_rate": 0.0001544154598200009, "loss": 0.8007, "step": 16130 }, { "epoch": 0.18051571124196822, "grad_norm": 0.2660861909389496, "learning_rate": 0.0001541870345835808, "loss": 0.7913, "step": 16140 }, { "epoch": 0.1806275549292309, "grad_norm": 0.2556677460670471, "learning_rate": 0.0001539586093471607, "loss": 0.7826, "step": 16150 }, { "epoch": 0.1807393986164936, "grad_norm": 0.275900661945343, "learning_rate": 0.00015373018411074055, "loss": 0.8048, "step": 16160 }, { "epoch": 0.18085124230375627, "grad_norm": 0.29176998138427734, "learning_rate": 0.00015350175887432043, "loss": 0.8241, "step": 16170 }, { "epoch": 0.18096308599101896, "grad_norm": 0.2635776996612549, "learning_rate": 0.00015327333363790034, "loss": 0.8211, "step": 16180 }, { "epoch": 0.18107492967828162, "grad_norm": 0.27744734287261963, "learning_rate": 0.0001530449084014802, "loss": 0.8254, "step": 16190 }, { "epoch": 0.1811867733655443, "grad_norm": 0.28162074089050293, "learning_rate": 0.00015281648316506008, "loss": 0.8182, "step": 16200 }, { "epoch": 0.181298617052807, "grad_norm": 0.29347339272499084, "learning_rate": 0.00015258805792863996, "loss": 0.812, "step": 16210 }, { "epoch": 0.18141046074006967, "grad_norm": 0.26170992851257324, "learning_rate": 0.00015235963269221985, "loss": 0.8221, "step": 16220 }, { "epoch": 0.18152230442733236, "grad_norm": 0.27848196029663086, "learning_rate": 0.00015213120745579973, "loss": 0.825, "step": 16230 }, { "epoch": 0.18163414811459505, "grad_norm": 0.2994973659515381, "learning_rate": 0.00015190278221937958, "loss": 0.8158, "step": 16240 }, { "epoch": 0.18174599180185771, "grad_norm": 0.27873843908309937, "learning_rate": 0.00015167435698295947, "loss": 0.816, "step": 16250 }, { "epoch": 0.1818578354891204, "grad_norm": 0.3014775812625885, "learning_rate": 0.00015144593174653938, "loss": 0.8174, "step": 16260 }, { "epoch": 0.1819696791763831, "grad_norm": 0.29963594675064087, "learning_rate": 0.00015121750651011923, "loss": 0.8104, "step": 16270 }, { "epoch": 0.18208152286364576, "grad_norm": 0.3388141393661499, "learning_rate": 0.00015098908127369912, "loss": 0.826, "step": 16280 }, { "epoch": 0.18219336655090845, "grad_norm": 0.29143062233924866, "learning_rate": 0.000150760656037279, "loss": 0.8222, "step": 16290 }, { "epoch": 0.18230521023817114, "grad_norm": 0.327824205160141, "learning_rate": 0.00015053223080085888, "loss": 0.8186, "step": 16300 }, { "epoch": 0.1824170539254338, "grad_norm": 0.3053797483444214, "learning_rate": 0.00015030380556443876, "loss": 0.8214, "step": 16310 }, { "epoch": 0.1825288976126965, "grad_norm": 0.3030015230178833, "learning_rate": 0.00015007538032801865, "loss": 0.8198, "step": 16320 }, { "epoch": 0.18264074129995916, "grad_norm": 0.3147192597389221, "learning_rate": 0.0001498469550915985, "loss": 0.8224, "step": 16330 }, { "epoch": 0.18275258498722186, "grad_norm": 0.2838999927043915, "learning_rate": 0.0001496185298551784, "loss": 0.8142, "step": 16340 }, { "epoch": 0.18286442867448455, "grad_norm": 0.27273476123809814, "learning_rate": 0.0001493901046187583, "loss": 0.8054, "step": 16350 }, { "epoch": 0.1829762723617472, "grad_norm": 0.2754770517349243, "learning_rate": 0.00014916167938233815, "loss": 0.8131, "step": 16360 }, { "epoch": 0.1830881160490099, "grad_norm": 0.29061514139175415, "learning_rate": 0.00014893325414591803, "loss": 0.7988, "step": 16370 }, { "epoch": 0.1831999597362726, "grad_norm": 0.2525017559528351, "learning_rate": 0.00014870482890949794, "loss": 0.8023, "step": 16380 }, { "epoch": 0.18331180342353526, "grad_norm": 0.3019058108329773, "learning_rate": 0.0001484764036730778, "loss": 0.8077, "step": 16390 }, { "epoch": 0.18342364711079795, "grad_norm": 0.302090048789978, "learning_rate": 0.00014824797843665768, "loss": 0.812, "step": 16400 }, { "epoch": 0.18353549079806064, "grad_norm": 0.29742154479026794, "learning_rate": 0.00014801955320023757, "loss": 0.7911, "step": 16410 }, { "epoch": 0.1836473344853233, "grad_norm": 0.31950804591178894, "learning_rate": 0.00014779112796381745, "loss": 0.7875, "step": 16420 }, { "epoch": 0.183759178172586, "grad_norm": 0.32971978187561035, "learning_rate": 0.00014756270272739733, "loss": 0.7788, "step": 16430 }, { "epoch": 0.1838710218598487, "grad_norm": 0.2941220700740814, "learning_rate": 0.00014733427749097721, "loss": 0.7772, "step": 16440 }, { "epoch": 0.18398286554711135, "grad_norm": 0.2639923393726349, "learning_rate": 0.00014710585225455707, "loss": 0.7708, "step": 16450 }, { "epoch": 0.18409470923437404, "grad_norm": 0.2483467161655426, "learning_rate": 0.00014687742701813698, "loss": 0.7846, "step": 16460 }, { "epoch": 0.1842065529216367, "grad_norm": 0.31150713562965393, "learning_rate": 0.00014664900178171683, "loss": 0.7853, "step": 16470 }, { "epoch": 0.1843183966088994, "grad_norm": 0.30439406633377075, "learning_rate": 0.00014642057654529672, "loss": 0.7779, "step": 16480 }, { "epoch": 0.1844302402961621, "grad_norm": 0.29318898916244507, "learning_rate": 0.00014619215130887663, "loss": 0.7911, "step": 16490 }, { "epoch": 0.18454208398342475, "grad_norm": 0.2726874053478241, "learning_rate": 0.00014596372607245648, "loss": 0.7869, "step": 16500 }, { "epoch": 0.18465392767068745, "grad_norm": 0.2978016436100006, "learning_rate": 0.00014573530083603637, "loss": 0.783, "step": 16510 }, { "epoch": 0.18476577135795014, "grad_norm": 0.3107501268386841, "learning_rate": 0.00014550687559961625, "loss": 0.801, "step": 16520 }, { "epoch": 0.1848776150452128, "grad_norm": 0.2848517894744873, "learning_rate": 0.00014527845036319613, "loss": 0.8063, "step": 16530 }, { "epoch": 0.1849894587324755, "grad_norm": 0.2625429332256317, "learning_rate": 0.00014505002512677601, "loss": 0.8074, "step": 16540 }, { "epoch": 0.18510130241973818, "grad_norm": 0.2805044949054718, "learning_rate": 0.0001448215998903559, "loss": 0.8013, "step": 16550 }, { "epoch": 0.18521314610700085, "grad_norm": 0.27657589316368103, "learning_rate": 0.00014459317465393575, "loss": 0.8012, "step": 16560 }, { "epoch": 0.18532498979426354, "grad_norm": 0.2780141532421112, "learning_rate": 0.00014436474941751566, "loss": 0.8161, "step": 16570 }, { "epoch": 0.18543683348152623, "grad_norm": 0.2871207892894745, "learning_rate": 0.00014413632418109555, "loss": 0.7899, "step": 16580 }, { "epoch": 0.1855486771687889, "grad_norm": 0.2656658887863159, "learning_rate": 0.0001439078989446754, "loss": 0.7985, "step": 16590 }, { "epoch": 0.1856605208560516, "grad_norm": 0.2766350209712982, "learning_rate": 0.00014367947370825528, "loss": 0.7999, "step": 16600 }, { "epoch": 0.18577236454331428, "grad_norm": 0.2616749107837677, "learning_rate": 0.0001434510484718352, "loss": 0.8002, "step": 16610 }, { "epoch": 0.18588420823057694, "grad_norm": 0.25887414813041687, "learning_rate": 0.00014322262323541505, "loss": 0.8112, "step": 16620 }, { "epoch": 0.18599605191783963, "grad_norm": 0.2594297528266907, "learning_rate": 0.00014299419799899493, "loss": 0.802, "step": 16630 }, { "epoch": 0.1861078956051023, "grad_norm": 0.2535499036312103, "learning_rate": 0.00014276577276257482, "loss": 0.7867, "step": 16640 }, { "epoch": 0.186219739292365, "grad_norm": 0.25161436200141907, "learning_rate": 0.0001425373475261547, "loss": 0.8059, "step": 16650 }, { "epoch": 0.18633158297962768, "grad_norm": 0.22897444665431976, "learning_rate": 0.00014230892228973458, "loss": 0.7864, "step": 16660 }, { "epoch": 0.18644342666689034, "grad_norm": 0.27164047956466675, "learning_rate": 0.00014208049705331446, "loss": 0.796, "step": 16670 }, { "epoch": 0.18655527035415304, "grad_norm": 0.2717941701412201, "learning_rate": 0.00014185207181689432, "loss": 0.7801, "step": 16680 }, { "epoch": 0.18666711404141573, "grad_norm": 0.27144837379455566, "learning_rate": 0.00014162364658047423, "loss": 0.7758, "step": 16690 }, { "epoch": 0.1867789577286784, "grad_norm": 0.2357831746339798, "learning_rate": 0.00014139522134405409, "loss": 0.7674, "step": 16700 }, { "epoch": 0.18689080141594108, "grad_norm": 0.23233544826507568, "learning_rate": 0.00014116679610763397, "loss": 0.7827, "step": 16710 }, { "epoch": 0.18700264510320377, "grad_norm": 0.2399321347475052, "learning_rate": 0.00014093837087121385, "loss": 0.7811, "step": 16720 }, { "epoch": 0.18711448879046644, "grad_norm": 0.2493642419576645, "learning_rate": 0.00014070994563479373, "loss": 0.7762, "step": 16730 }, { "epoch": 0.18722633247772913, "grad_norm": 0.23383350670337677, "learning_rate": 0.00014048152039837362, "loss": 0.7754, "step": 16740 }, { "epoch": 0.18733817616499182, "grad_norm": 0.2624364197254181, "learning_rate": 0.0001402530951619535, "loss": 0.7766, "step": 16750 }, { "epoch": 0.18745001985225448, "grad_norm": 0.24138151109218597, "learning_rate": 0.00014002466992553336, "loss": 0.7869, "step": 16760 }, { "epoch": 0.18756186353951718, "grad_norm": 0.2397204041481018, "learning_rate": 0.00013979624468911326, "loss": 0.7974, "step": 16770 }, { "epoch": 0.18767370722677984, "grad_norm": 0.27491655945777893, "learning_rate": 0.00013956781945269315, "loss": 0.8011, "step": 16780 }, { "epoch": 0.18778555091404253, "grad_norm": 0.2321402132511139, "learning_rate": 0.000139339394216273, "loss": 0.803, "step": 16790 }, { "epoch": 0.18789739460130522, "grad_norm": 0.24487042427062988, "learning_rate": 0.00013911096897985289, "loss": 0.7975, "step": 16800 }, { "epoch": 0.1880092382885679, "grad_norm": 0.23328396677970886, "learning_rate": 0.0001388825437434328, "loss": 0.795, "step": 16810 }, { "epoch": 0.18812108197583058, "grad_norm": 0.22705566883087158, "learning_rate": 0.00013865411850701265, "loss": 0.7895, "step": 16820 }, { "epoch": 0.18823292566309327, "grad_norm": 0.24339929223060608, "learning_rate": 0.00013842569327059253, "loss": 0.7931, "step": 16830 }, { "epoch": 0.18834476935035593, "grad_norm": 0.2613057494163513, "learning_rate": 0.00013819726803417242, "loss": 0.7785, "step": 16840 }, { "epoch": 0.18845661303761863, "grad_norm": 0.27011603116989136, "learning_rate": 0.0001379688427977523, "loss": 0.7853, "step": 16850 }, { "epoch": 0.18856845672488132, "grad_norm": 0.26589342951774597, "learning_rate": 0.00013774041756133218, "loss": 0.7893, "step": 16860 }, { "epoch": 0.18868030041214398, "grad_norm": 0.26286208629608154, "learning_rate": 0.00013751199232491207, "loss": 0.7707, "step": 16870 }, { "epoch": 0.18879214409940667, "grad_norm": 0.3021993637084961, "learning_rate": 0.00013728356708849192, "loss": 0.7896, "step": 16880 }, { "epoch": 0.18890398778666936, "grad_norm": 0.30742523074150085, "learning_rate": 0.00013705514185207183, "loss": 0.7895, "step": 16890 }, { "epoch": 0.18901583147393203, "grad_norm": 0.3027999699115753, "learning_rate": 0.0001368267166156517, "loss": 0.7839, "step": 16900 }, { "epoch": 0.18912767516119472, "grad_norm": 0.29199281334877014, "learning_rate": 0.00013659829137923157, "loss": 0.7771, "step": 16910 }, { "epoch": 0.18923951884845738, "grad_norm": 0.2460477203130722, "learning_rate": 0.00013636986614281145, "loss": 0.7823, "step": 16920 }, { "epoch": 0.18935136253572007, "grad_norm": 0.2608555853366852, "learning_rate": 0.00013614144090639134, "loss": 0.7664, "step": 16930 }, { "epoch": 0.18946320622298277, "grad_norm": 0.2723162770271301, "learning_rate": 0.00013591301566997122, "loss": 0.7768, "step": 16940 }, { "epoch": 0.18957504991024543, "grad_norm": 0.2690962255001068, "learning_rate": 0.0001356845904335511, "loss": 0.7697, "step": 16950 }, { "epoch": 0.18968689359750812, "grad_norm": 0.2892717719078064, "learning_rate": 0.00013545616519713096, "loss": 0.769, "step": 16960 }, { "epoch": 0.1897987372847708, "grad_norm": 0.2581406533718109, "learning_rate": 0.00013522773996071087, "loss": 0.7766, "step": 16970 }, { "epoch": 0.18991058097203348, "grad_norm": 0.2944723963737488, "learning_rate": 0.00013499931472429075, "loss": 0.7638, "step": 16980 }, { "epoch": 0.19002242465929617, "grad_norm": 0.2776504158973694, "learning_rate": 0.0001347708894878706, "loss": 0.7731, "step": 16990 }, { "epoch": 0.19013426834655886, "grad_norm": 0.267098993062973, "learning_rate": 0.00013454246425145052, "loss": 0.7772, "step": 17000 }, { "epoch": 0.19024611203382152, "grad_norm": 0.2806127071380615, "learning_rate": 0.0001343140390150304, "loss": 0.772, "step": 17010 }, { "epoch": 0.19035795572108422, "grad_norm": 0.2872319519519806, "learning_rate": 0.00013408561377861025, "loss": 0.7695, "step": 17020 }, { "epoch": 0.1904697994083469, "grad_norm": 0.24477818608283997, "learning_rate": 0.00013385718854219014, "loss": 0.7764, "step": 17030 }, { "epoch": 0.19058164309560957, "grad_norm": 0.2637476623058319, "learning_rate": 0.00013362876330577005, "loss": 0.7712, "step": 17040 }, { "epoch": 0.19069348678287226, "grad_norm": 0.2676442861557007, "learning_rate": 0.0001334003380693499, "loss": 0.7707, "step": 17050 }, { "epoch": 0.19080533047013493, "grad_norm": 0.2592306435108185, "learning_rate": 0.00013317191283292979, "loss": 0.7808, "step": 17060 }, { "epoch": 0.19091717415739762, "grad_norm": 0.3543199896812439, "learning_rate": 0.00013294348759650967, "loss": 0.7928, "step": 17070 }, { "epoch": 0.1910290178446603, "grad_norm": 0.26262548565864563, "learning_rate": 0.00013271506236008955, "loss": 0.7677, "step": 17080 }, { "epoch": 0.19114086153192297, "grad_norm": 0.2845424711704254, "learning_rate": 0.00013248663712366943, "loss": 0.7758, "step": 17090 }, { "epoch": 0.19125270521918566, "grad_norm": 0.2694297730922699, "learning_rate": 0.00013225821188724932, "loss": 0.7857, "step": 17100 }, { "epoch": 0.19136454890644836, "grad_norm": 0.2682325839996338, "learning_rate": 0.00013202978665082917, "loss": 0.782, "step": 17110 }, { "epoch": 0.19147639259371102, "grad_norm": 0.26535049080848694, "learning_rate": 0.00013180136141440908, "loss": 0.7796, "step": 17120 }, { "epoch": 0.1915882362809737, "grad_norm": 0.2759861946105957, "learning_rate": 0.00013157293617798894, "loss": 0.7732, "step": 17130 }, { "epoch": 0.1917000799682364, "grad_norm": 0.24873244762420654, "learning_rate": 0.00013134451094156882, "loss": 0.7763, "step": 17140 }, { "epoch": 0.19181192365549907, "grad_norm": 0.2826152443885803, "learning_rate": 0.0001311160857051487, "loss": 0.7748, "step": 17150 }, { "epoch": 0.19192376734276176, "grad_norm": 0.2823798358440399, "learning_rate": 0.00013088766046872859, "loss": 0.768, "step": 17160 }, { "epoch": 0.19203561103002445, "grad_norm": 0.2591745853424072, "learning_rate": 0.00013065923523230847, "loss": 0.7831, "step": 17170 }, { "epoch": 0.19214745471728711, "grad_norm": 0.24773742258548737, "learning_rate": 0.00013043080999588835, "loss": 0.7799, "step": 17180 }, { "epoch": 0.1922592984045498, "grad_norm": 0.28184765577316284, "learning_rate": 0.0001302023847594682, "loss": 0.787, "step": 17190 }, { "epoch": 0.19237114209181247, "grad_norm": 0.24396668374538422, "learning_rate": 0.00012997395952304812, "loss": 0.7777, "step": 17200 }, { "epoch": 0.19248298577907516, "grad_norm": 0.25493332743644714, "learning_rate": 0.000129745534286628, "loss": 0.7842, "step": 17210 }, { "epoch": 0.19259482946633785, "grad_norm": 0.2615022361278534, "learning_rate": 0.00012951710905020786, "loss": 0.788, "step": 17220 }, { "epoch": 0.19270667315360052, "grad_norm": 0.28270524740219116, "learning_rate": 0.00012928868381378774, "loss": 0.7788, "step": 17230 }, { "epoch": 0.1928185168408632, "grad_norm": 0.24917210638523102, "learning_rate": 0.00012906025857736765, "loss": 0.7731, "step": 17240 }, { "epoch": 0.1929303605281259, "grad_norm": 0.2589946985244751, "learning_rate": 0.0001288318333409475, "loss": 0.7781, "step": 17250 }, { "epoch": 0.19304220421538856, "grad_norm": 0.23770585656166077, "learning_rate": 0.0001286034081045274, "loss": 0.7902, "step": 17260 }, { "epoch": 0.19315404790265125, "grad_norm": 0.22782771289348602, "learning_rate": 0.00012837498286810727, "loss": 0.7875, "step": 17270 }, { "epoch": 0.19326589158991395, "grad_norm": 0.2611001431941986, "learning_rate": 0.00012814655763168715, "loss": 0.794, "step": 17280 }, { "epoch": 0.1933777352771766, "grad_norm": 0.2642746865749359, "learning_rate": 0.00012791813239526704, "loss": 0.8005, "step": 17290 }, { "epoch": 0.1934895789644393, "grad_norm": 0.2470688372850418, "learning_rate": 0.00012768970715884692, "loss": 0.7854, "step": 17300 }, { "epoch": 0.193601422651702, "grad_norm": 0.24735964834690094, "learning_rate": 0.00012746128192242677, "loss": 0.7918, "step": 17310 }, { "epoch": 0.19371326633896466, "grad_norm": 0.2734208405017853, "learning_rate": 0.00012723285668600668, "loss": 0.7719, "step": 17320 }, { "epoch": 0.19382511002622735, "grad_norm": 0.28373652696609497, "learning_rate": 0.00012700443144958657, "loss": 0.7743, "step": 17330 }, { "epoch": 0.19393695371349004, "grad_norm": 0.25755295157432556, "learning_rate": 0.00012677600621316642, "loss": 0.7761, "step": 17340 }, { "epoch": 0.1940487974007527, "grad_norm": 0.2918241322040558, "learning_rate": 0.0001265475809767463, "loss": 0.7885, "step": 17350 }, { "epoch": 0.1941606410880154, "grad_norm": 0.2589518427848816, "learning_rate": 0.0001263191557403262, "loss": 0.7781, "step": 17360 }, { "epoch": 0.19427248477527806, "grad_norm": 0.2941739857196808, "learning_rate": 0.00012609073050390607, "loss": 0.7896, "step": 17370 }, { "epoch": 0.19438432846254075, "grad_norm": 0.2625831663608551, "learning_rate": 0.00012586230526748595, "loss": 0.7797, "step": 17380 }, { "epoch": 0.19449617214980344, "grad_norm": 0.2731517255306244, "learning_rate": 0.0001256338800310658, "loss": 0.7861, "step": 17390 }, { "epoch": 0.1946080158370661, "grad_norm": 0.2802453637123108, "learning_rate": 0.00012540545479464572, "loss": 0.8066, "step": 17400 }, { "epoch": 0.1947198595243288, "grad_norm": 0.24151596426963806, "learning_rate": 0.0001251770295582256, "loss": 0.7746, "step": 17410 }, { "epoch": 0.1948317032115915, "grad_norm": 0.27006617188453674, "learning_rate": 0.00012494860432180549, "loss": 0.7796, "step": 17420 }, { "epoch": 0.19494354689885415, "grad_norm": 0.2574283480644226, "learning_rate": 0.00012472017908538537, "loss": 0.7809, "step": 17430 }, { "epoch": 0.19505539058611684, "grad_norm": 0.25741514563560486, "learning_rate": 0.00012449175384896522, "loss": 0.7792, "step": 17440 }, { "epoch": 0.19516723427337954, "grad_norm": 0.2619360685348511, "learning_rate": 0.00012426332861254513, "loss": 0.7768, "step": 17450 }, { "epoch": 0.1952790779606422, "grad_norm": 0.28053224086761475, "learning_rate": 0.000124034903376125, "loss": 0.7841, "step": 17460 }, { "epoch": 0.1953909216479049, "grad_norm": 0.24019859731197357, "learning_rate": 0.00012380647813970487, "loss": 0.783, "step": 17470 }, { "epoch": 0.19550276533516758, "grad_norm": 0.2747540771961212, "learning_rate": 0.00012357805290328475, "loss": 0.7911, "step": 17480 }, { "epoch": 0.19561460902243025, "grad_norm": 0.28044483065605164, "learning_rate": 0.00012334962766686464, "loss": 0.7986, "step": 17490 }, { "epoch": 0.19572645270969294, "grad_norm": 0.24908137321472168, "learning_rate": 0.00012312120243044452, "loss": 0.8087, "step": 17500 }, { "epoch": 0.1958382963969556, "grad_norm": 0.29041793942451477, "learning_rate": 0.0001228927771940244, "loss": 0.8063, "step": 17510 }, { "epoch": 0.1959501400842183, "grad_norm": 0.3020537495613098, "learning_rate": 0.00012266435195760429, "loss": 0.8004, "step": 17520 }, { "epoch": 0.19606198377148099, "grad_norm": 0.29414400458335876, "learning_rate": 0.00012243592672118417, "loss": 0.7846, "step": 17530 }, { "epoch": 0.19617382745874365, "grad_norm": 0.2648397386074066, "learning_rate": 0.00012220750148476402, "loss": 0.7708, "step": 17540 }, { "epoch": 0.19628567114600634, "grad_norm": 0.2834302484989166, "learning_rate": 0.00012197907624834392, "loss": 0.7818, "step": 17550 }, { "epoch": 0.19639751483326903, "grad_norm": 0.2748505175113678, "learning_rate": 0.0001217506510119238, "loss": 0.7642, "step": 17560 }, { "epoch": 0.1965093585205317, "grad_norm": 0.32425326108932495, "learning_rate": 0.00012152222577550367, "loss": 0.7765, "step": 17570 }, { "epoch": 0.1966212022077944, "grad_norm": 0.27183324098587036, "learning_rate": 0.00012129380053908357, "loss": 0.7572, "step": 17580 }, { "epoch": 0.19673304589505708, "grad_norm": 0.28190943598747253, "learning_rate": 0.00012106537530266344, "loss": 0.7571, "step": 17590 }, { "epoch": 0.19684488958231974, "grad_norm": 0.5151196718215942, "learning_rate": 0.00012083695006624332, "loss": 0.7565, "step": 17600 }, { "epoch": 0.19695673326958243, "grad_norm": 0.2523132264614105, "learning_rate": 0.0001206085248298232, "loss": 0.7597, "step": 17610 }, { "epoch": 0.19706857695684513, "grad_norm": 0.27336063981056213, "learning_rate": 0.00012038009959340309, "loss": 0.7546, "step": 17620 }, { "epoch": 0.1971804206441078, "grad_norm": 0.25119057297706604, "learning_rate": 0.00012015167435698296, "loss": 0.7519, "step": 17630 }, { "epoch": 0.19729226433137048, "grad_norm": 0.281147301197052, "learning_rate": 0.00011992324912056284, "loss": 0.7623, "step": 17640 }, { "epoch": 0.19740410801863315, "grad_norm": 0.2463361769914627, "learning_rate": 0.00011969482388414272, "loss": 0.754, "step": 17650 }, { "epoch": 0.19751595170589584, "grad_norm": 0.2902059853076935, "learning_rate": 0.0001194663986477226, "loss": 0.7578, "step": 17660 }, { "epoch": 0.19762779539315853, "grad_norm": 0.2590588629245758, "learning_rate": 0.00011923797341130247, "loss": 0.7427, "step": 17670 }, { "epoch": 0.1977396390804212, "grad_norm": 0.24349506199359894, "learning_rate": 0.00011900954817488237, "loss": 0.7599, "step": 17680 }, { "epoch": 0.19785148276768388, "grad_norm": 0.2568139135837555, "learning_rate": 0.00011878112293846224, "loss": 0.7673, "step": 17690 }, { "epoch": 0.19796332645494658, "grad_norm": 0.2617419958114624, "learning_rate": 0.00011855269770204212, "loss": 0.7637, "step": 17700 }, { "epoch": 0.19807517014220924, "grad_norm": 0.24309082329273224, "learning_rate": 0.000118324272465622, "loss": 0.7583, "step": 17710 }, { "epoch": 0.19818701382947193, "grad_norm": 0.22027656435966492, "learning_rate": 0.00011809584722920189, "loss": 0.7479, "step": 17720 }, { "epoch": 0.19829885751673462, "grad_norm": 0.27296265959739685, "learning_rate": 0.00011786742199278176, "loss": 0.765, "step": 17730 }, { "epoch": 0.1984107012039973, "grad_norm": 0.2589128613471985, "learning_rate": 0.00011763899675636165, "loss": 0.777, "step": 17740 }, { "epoch": 0.19852254489125998, "grad_norm": 0.27665242552757263, "learning_rate": 0.00011741057151994152, "loss": 0.7656, "step": 17750 }, { "epoch": 0.19863438857852267, "grad_norm": 0.27103251218795776, "learning_rate": 0.0001171821462835214, "loss": 0.7716, "step": 17760 }, { "epoch": 0.19874623226578533, "grad_norm": 0.2768172025680542, "learning_rate": 0.00011695372104710127, "loss": 0.7738, "step": 17770 }, { "epoch": 0.19885807595304802, "grad_norm": 0.2424757182598114, "learning_rate": 0.00011672529581068117, "loss": 0.7793, "step": 17780 }, { "epoch": 0.1989699196403107, "grad_norm": 0.2821860909461975, "learning_rate": 0.00011649687057426104, "loss": 0.7771, "step": 17790 }, { "epoch": 0.19908176332757338, "grad_norm": 0.28263264894485474, "learning_rate": 0.00011626844533784092, "loss": 0.7812, "step": 17800 }, { "epoch": 0.19919360701483607, "grad_norm": 0.24835869669914246, "learning_rate": 0.0001160400201014208, "loss": 0.7753, "step": 17810 }, { "epoch": 0.19930545070209874, "grad_norm": 0.23325562477111816, "learning_rate": 0.00011581159486500069, "loss": 0.7763, "step": 17820 }, { "epoch": 0.19941729438936143, "grad_norm": 0.2520182132720947, "learning_rate": 0.00011558316962858056, "loss": 0.791, "step": 17830 }, { "epoch": 0.19952913807662412, "grad_norm": 0.2478768676519394, "learning_rate": 0.00011535474439216045, "loss": 0.7819, "step": 17840 }, { "epoch": 0.19964098176388678, "grad_norm": 0.2749478220939636, "learning_rate": 0.00011512631915574032, "loss": 0.7805, "step": 17850 }, { "epoch": 0.19975282545114947, "grad_norm": 0.2417723685503006, "learning_rate": 0.0001148978939193202, "loss": 0.766, "step": 17860 }, { "epoch": 0.19986466913841217, "grad_norm": 0.25219354033470154, "learning_rate": 0.00011466946868290008, "loss": 0.758, "step": 17870 }, { "epoch": 0.19997651282567483, "grad_norm": 0.24644000828266144, "learning_rate": 0.00011444104344647997, "loss": 0.7569, "step": 17880 }, { "epoch": 0.20008835651293752, "grad_norm": 0.2683338224887848, "learning_rate": 0.00011421261821005986, "loss": 0.7509, "step": 17890 }, { "epoch": 0.2002002002002002, "grad_norm": 0.29149681329727173, "learning_rate": 0.00011398419297363972, "loss": 0.7611, "step": 17900 }, { "epoch": 0.20031204388746288, "grad_norm": 0.2651118338108063, "learning_rate": 0.00011375576773721962, "loss": 0.756, "step": 17910 }, { "epoch": 0.20042388757472557, "grad_norm": 0.26990607380867004, "learning_rate": 0.00011352734250079949, "loss": 0.7726, "step": 17920 }, { "epoch": 0.20053573126198823, "grad_norm": 0.23897935450077057, "learning_rate": 0.00011329891726437937, "loss": 0.7875, "step": 17930 }, { "epoch": 0.20064757494925092, "grad_norm": 0.2300727218389511, "learning_rate": 0.00011307049202795926, "loss": 0.7697, "step": 17940 }, { "epoch": 0.20075941863651361, "grad_norm": 0.2873596251010895, "learning_rate": 0.00011284206679153914, "loss": 0.7776, "step": 17950 }, { "epoch": 0.20087126232377628, "grad_norm": 0.29036712646484375, "learning_rate": 0.00011261364155511901, "loss": 0.7794, "step": 17960 }, { "epoch": 0.20098310601103897, "grad_norm": 0.2837420701980591, "learning_rate": 0.0001123852163186989, "loss": 0.7818, "step": 17970 }, { "epoch": 0.20109494969830166, "grad_norm": 0.2920686602592468, "learning_rate": 0.00011215679108227877, "loss": 0.7851, "step": 17980 }, { "epoch": 0.20120679338556433, "grad_norm": 0.27664583921432495, "learning_rate": 0.00011192836584585866, "loss": 0.7601, "step": 17990 }, { "epoch": 0.20131863707282702, "grad_norm": 0.26870399713516235, "learning_rate": 0.00011169994060943853, "loss": 0.7961, "step": 18000 }, { "epoch": 0.2014304807600897, "grad_norm": 0.2502228021621704, "learning_rate": 0.00011147151537301842, "loss": 0.7827, "step": 18010 }, { "epoch": 0.20154232444735237, "grad_norm": 0.2473440319299698, "learning_rate": 0.00011124309013659829, "loss": 0.7815, "step": 18020 }, { "epoch": 0.20165416813461506, "grad_norm": 0.2513076663017273, "learning_rate": 0.00011101466490017817, "loss": 0.7675, "step": 18030 }, { "epoch": 0.20176601182187776, "grad_norm": 0.2829226851463318, "learning_rate": 0.00011078623966375806, "loss": 0.7669, "step": 18040 }, { "epoch": 0.20187785550914042, "grad_norm": 0.25758418440818787, "learning_rate": 0.00011055781442733794, "loss": 0.7707, "step": 18050 }, { "epoch": 0.2019896991964031, "grad_norm": 0.27185285091400146, "learning_rate": 0.00011032938919091781, "loss": 0.7742, "step": 18060 }, { "epoch": 0.2021015428836658, "grad_norm": 0.2802230417728424, "learning_rate": 0.0001101009639544977, "loss": 0.7821, "step": 18070 }, { "epoch": 0.20221338657092847, "grad_norm": 0.2882921099662781, "learning_rate": 0.00010987253871807757, "loss": 0.779, "step": 18080 }, { "epoch": 0.20232523025819116, "grad_norm": 0.2569839358329773, "learning_rate": 0.00010964411348165746, "loss": 0.7694, "step": 18090 }, { "epoch": 0.20243707394545382, "grad_norm": 0.2600938379764557, "learning_rate": 0.00010941568824523733, "loss": 0.7781, "step": 18100 }, { "epoch": 0.2025489176327165, "grad_norm": 0.28083154559135437, "learning_rate": 0.00010918726300881722, "loss": 0.7799, "step": 18110 }, { "epoch": 0.2026607613199792, "grad_norm": 0.22990182042121887, "learning_rate": 0.00010895883777239709, "loss": 0.7883, "step": 18120 }, { "epoch": 0.20277260500724187, "grad_norm": 0.27432581782341003, "learning_rate": 0.00010873041253597697, "loss": 0.7942, "step": 18130 }, { "epoch": 0.20288444869450456, "grad_norm": 0.2607738971710205, "learning_rate": 0.00010850198729955686, "loss": 0.7877, "step": 18140 }, { "epoch": 0.20299629238176725, "grad_norm": 0.2818219065666199, "learning_rate": 0.00010827356206313674, "loss": 0.7948, "step": 18150 }, { "epoch": 0.20310813606902992, "grad_norm": 0.2751563489437103, "learning_rate": 0.00010804513682671661, "loss": 0.7836, "step": 18160 }, { "epoch": 0.2032199797562926, "grad_norm": 0.2746957242488861, "learning_rate": 0.0001078167115902965, "loss": 0.7693, "step": 18170 }, { "epoch": 0.2033318234435553, "grad_norm": 0.24990054965019226, "learning_rate": 0.00010758828635387638, "loss": 0.7869, "step": 18180 }, { "epoch": 0.20344366713081796, "grad_norm": 0.24581623077392578, "learning_rate": 0.00010735986111745626, "loss": 0.768, "step": 18190 }, { "epoch": 0.20355551081808065, "grad_norm": 0.26637768745422363, "learning_rate": 0.00010713143588103613, "loss": 0.7711, "step": 18200 }, { "epoch": 0.20366735450534335, "grad_norm": 0.2510250508785248, "learning_rate": 0.00010690301064461602, "loss": 0.7748, "step": 18210 }, { "epoch": 0.203779198192606, "grad_norm": 0.2378496378660202, "learning_rate": 0.00010667458540819589, "loss": 0.7622, "step": 18220 }, { "epoch": 0.2038910418798687, "grad_norm": 0.2507869601249695, "learning_rate": 0.00010644616017177578, "loss": 0.7739, "step": 18230 }, { "epoch": 0.20400288556713136, "grad_norm": 0.24733096361160278, "learning_rate": 0.00010621773493535566, "loss": 0.7508, "step": 18240 }, { "epoch": 0.20411472925439406, "grad_norm": 0.23383109271526337, "learning_rate": 0.00010598930969893554, "loss": 0.7507, "step": 18250 }, { "epoch": 0.20422657294165675, "grad_norm": 0.2543237805366516, "learning_rate": 0.00010576088446251541, "loss": 0.7578, "step": 18260 }, { "epoch": 0.2043384166289194, "grad_norm": 0.25807520747184753, "learning_rate": 0.00010553245922609531, "loss": 0.7513, "step": 18270 }, { "epoch": 0.2044502603161821, "grad_norm": 0.23354406654834747, "learning_rate": 0.00010530403398967518, "loss": 0.7566, "step": 18280 }, { "epoch": 0.2045621040034448, "grad_norm": 0.2685154676437378, "learning_rate": 0.00010507560875325506, "loss": 0.758, "step": 18290 }, { "epoch": 0.20467394769070746, "grad_norm": 0.24349918961524963, "learning_rate": 0.00010484718351683494, "loss": 0.7686, "step": 18300 }, { "epoch": 0.20478579137797015, "grad_norm": 0.24823498725891113, "learning_rate": 0.00010461875828041482, "loss": 0.7659, "step": 18310 }, { "epoch": 0.20489763506523284, "grad_norm": 0.2511804401874542, "learning_rate": 0.0001043903330439947, "loss": 0.77, "step": 18320 }, { "epoch": 0.2050094787524955, "grad_norm": 0.24065516889095306, "learning_rate": 0.00010416190780757458, "loss": 0.7677, "step": 18330 }, { "epoch": 0.2051213224397582, "grad_norm": 0.2819323241710663, "learning_rate": 0.00010393348257115447, "loss": 0.753, "step": 18340 }, { "epoch": 0.2052331661270209, "grad_norm": 0.26467952132225037, "learning_rate": 0.00010370505733473434, "loss": 0.7826, "step": 18350 }, { "epoch": 0.20534500981428355, "grad_norm": 0.22962163388729095, "learning_rate": 0.00010347663209831423, "loss": 0.7683, "step": 18360 }, { "epoch": 0.20545685350154624, "grad_norm": 0.2582736611366272, "learning_rate": 0.00010324820686189411, "loss": 0.7951, "step": 18370 }, { "epoch": 0.2055686971888089, "grad_norm": 0.2352149486541748, "learning_rate": 0.00010301978162547399, "loss": 0.7577, "step": 18380 }, { "epoch": 0.2056805408760716, "grad_norm": 0.25687554478645325, "learning_rate": 0.00010279135638905386, "loss": 0.7696, "step": 18390 }, { "epoch": 0.2057923845633343, "grad_norm": 0.2579772472381592, "learning_rate": 0.00010256293115263376, "loss": 0.7837, "step": 18400 }, { "epoch": 0.20590422825059695, "grad_norm": 0.24537009000778198, "learning_rate": 0.00010233450591621363, "loss": 0.7799, "step": 18410 }, { "epoch": 0.20601607193785965, "grad_norm": 0.2636966109275818, "learning_rate": 0.00010210608067979351, "loss": 0.7588, "step": 18420 }, { "epoch": 0.20612791562512234, "grad_norm": 0.30670562386512756, "learning_rate": 0.00010187765544337338, "loss": 0.771, "step": 18430 }, { "epoch": 0.206239759312385, "grad_norm": 0.28400668501853943, "learning_rate": 0.00010164923020695327, "loss": 0.7686, "step": 18440 }, { "epoch": 0.2063516029996477, "grad_norm": 0.27395951747894287, "learning_rate": 0.00010142080497053314, "loss": 0.776, "step": 18450 }, { "epoch": 0.20646344668691038, "grad_norm": 0.284868061542511, "learning_rate": 0.00010119237973411303, "loss": 0.7864, "step": 18460 }, { "epoch": 0.20657529037417305, "grad_norm": 0.2859087586402893, "learning_rate": 0.00010096395449769291, "loss": 0.7749, "step": 18470 }, { "epoch": 0.20668713406143574, "grad_norm": 0.28758034110069275, "learning_rate": 0.00010073552926127279, "loss": 0.7919, "step": 18480 }, { "epoch": 0.20679897774869843, "grad_norm": 0.2752404510974884, "learning_rate": 0.00010050710402485266, "loss": 0.7808, "step": 18490 }, { "epoch": 0.2069108214359611, "grad_norm": 0.30756843090057373, "learning_rate": 0.00010027867878843256, "loss": 0.7734, "step": 18500 }, { "epoch": 0.2070226651232238, "grad_norm": 0.2694368064403534, "learning_rate": 0.00010005025355201243, "loss": 0.7751, "step": 18510 }, { "epoch": 0.20713450881048645, "grad_norm": 0.25838834047317505, "learning_rate": 9.982182831559231e-05, "loss": 0.7686, "step": 18520 }, { "epoch": 0.20724635249774914, "grad_norm": 0.257729709148407, "learning_rate": 9.959340307917219e-05, "loss": 0.7827, "step": 18530 }, { "epoch": 0.20735819618501183, "grad_norm": 0.2938844859600067, "learning_rate": 9.936497784275208e-05, "loss": 0.7685, "step": 18540 }, { "epoch": 0.2074700398722745, "grad_norm": 0.25894027948379517, "learning_rate": 9.913655260633194e-05, "loss": 0.7738, "step": 18550 }, { "epoch": 0.2075818835595372, "grad_norm": 0.2751148045063019, "learning_rate": 9.890812736991183e-05, "loss": 0.7594, "step": 18560 }, { "epoch": 0.20769372724679988, "grad_norm": 0.28643253445625305, "learning_rate": 9.867970213349171e-05, "loss": 0.7737, "step": 18570 }, { "epoch": 0.20780557093406254, "grad_norm": 0.2575749158859253, "learning_rate": 9.845127689707159e-05, "loss": 0.7778, "step": 18580 }, { "epoch": 0.20791741462132524, "grad_norm": 0.27625295519828796, "learning_rate": 9.822285166065146e-05, "loss": 0.7716, "step": 18590 }, { "epoch": 0.20802925830858793, "grad_norm": 0.2803322672843933, "learning_rate": 9.799442642423136e-05, "loss": 0.7805, "step": 18600 }, { "epoch": 0.2081411019958506, "grad_norm": 0.2567484676837921, "learning_rate": 9.776600118781123e-05, "loss": 0.7633, "step": 18610 }, { "epoch": 0.20825294568311328, "grad_norm": 0.28193768858909607, "learning_rate": 9.753757595139111e-05, "loss": 0.7895, "step": 18620 }, { "epoch": 0.20836478937037597, "grad_norm": 0.28459542989730835, "learning_rate": 9.7309150714971e-05, "loss": 0.7741, "step": 18630 }, { "epoch": 0.20847663305763864, "grad_norm": 0.28346261382102966, "learning_rate": 9.708072547855088e-05, "loss": 0.7813, "step": 18640 }, { "epoch": 0.20858847674490133, "grad_norm": 0.2818828523159027, "learning_rate": 9.685230024213075e-05, "loss": 0.7755, "step": 18650 }, { "epoch": 0.208700320432164, "grad_norm": 0.28914326429367065, "learning_rate": 9.662387500571063e-05, "loss": 0.7798, "step": 18660 }, { "epoch": 0.20881216411942669, "grad_norm": 0.2600755989551544, "learning_rate": 9.639544976929051e-05, "loss": 0.7758, "step": 18670 }, { "epoch": 0.20892400780668938, "grad_norm": 0.2726733088493347, "learning_rate": 9.61670245328704e-05, "loss": 0.7769, "step": 18680 }, { "epoch": 0.20903585149395204, "grad_norm": 0.23421594500541687, "learning_rate": 9.593859929645026e-05, "loss": 0.758, "step": 18690 }, { "epoch": 0.20914769518121473, "grad_norm": 0.29468339681625366, "learning_rate": 9.571017406003016e-05, "loss": 0.7746, "step": 18700 }, { "epoch": 0.20925953886847742, "grad_norm": 0.29477235674858093, "learning_rate": 9.548174882361003e-05, "loss": 0.7633, "step": 18710 }, { "epoch": 0.2093713825557401, "grad_norm": 0.2564197778701782, "learning_rate": 9.525332358718991e-05, "loss": 0.7541, "step": 18720 }, { "epoch": 0.20948322624300278, "grad_norm": 0.2745250165462494, "learning_rate": 9.50248983507698e-05, "loss": 0.7887, "step": 18730 }, { "epoch": 0.20959506993026547, "grad_norm": 0.2572060525417328, "learning_rate": 9.479647311434968e-05, "loss": 0.774, "step": 18740 }, { "epoch": 0.20970691361752813, "grad_norm": 0.28513193130493164, "learning_rate": 9.456804787792955e-05, "loss": 0.7871, "step": 18750 }, { "epoch": 0.20981875730479083, "grad_norm": 0.2643887400627136, "learning_rate": 9.433962264150944e-05, "loss": 0.77, "step": 18760 }, { "epoch": 0.20993060099205352, "grad_norm": 0.27534207701683044, "learning_rate": 9.411119740508931e-05, "loss": 0.7775, "step": 18770 }, { "epoch": 0.21004244467931618, "grad_norm": 0.2620585858821869, "learning_rate": 9.38827721686692e-05, "loss": 0.7808, "step": 18780 }, { "epoch": 0.21015428836657887, "grad_norm": 0.2759549915790558, "learning_rate": 9.365434693224908e-05, "loss": 0.7642, "step": 18790 }, { "epoch": 0.21026613205384156, "grad_norm": 0.2919774353504181, "learning_rate": 9.342592169582896e-05, "loss": 0.7828, "step": 18800 }, { "epoch": 0.21037797574110423, "grad_norm": 0.2717173099517822, "learning_rate": 9.319749645940884e-05, "loss": 0.7513, "step": 18810 }, { "epoch": 0.21048981942836692, "grad_norm": 0.2662122845649719, "learning_rate": 9.296907122298871e-05, "loss": 0.7668, "step": 18820 }, { "epoch": 0.21060166311562958, "grad_norm": 0.26051005721092224, "learning_rate": 9.274064598656861e-05, "loss": 0.7676, "step": 18830 }, { "epoch": 0.21071350680289228, "grad_norm": 0.27510005235671997, "learning_rate": 9.251222075014848e-05, "loss": 0.7507, "step": 18840 }, { "epoch": 0.21082535049015497, "grad_norm": 0.23877868056297302, "learning_rate": 9.228379551372836e-05, "loss": 0.7535, "step": 18850 }, { "epoch": 0.21093719417741763, "grad_norm": 0.256104439496994, "learning_rate": 9.205537027730824e-05, "loss": 0.7546, "step": 18860 }, { "epoch": 0.21104903786468032, "grad_norm": 0.2829015552997589, "learning_rate": 9.182694504088813e-05, "loss": 0.7588, "step": 18870 }, { "epoch": 0.211160881551943, "grad_norm": 0.22898368537425995, "learning_rate": 9.1598519804468e-05, "loss": 0.7551, "step": 18880 }, { "epoch": 0.21127272523920568, "grad_norm": 0.23679418861865997, "learning_rate": 9.137009456804788e-05, "loss": 0.7718, "step": 18890 }, { "epoch": 0.21138456892646837, "grad_norm": 0.2878457009792328, "learning_rate": 9.114166933162776e-05, "loss": 0.7593, "step": 18900 }, { "epoch": 0.21149641261373106, "grad_norm": 0.2936013638973236, "learning_rate": 9.091324409520764e-05, "loss": 0.7713, "step": 18910 }, { "epoch": 0.21160825630099372, "grad_norm": 0.26062774658203125, "learning_rate": 9.068481885878751e-05, "loss": 0.7763, "step": 18920 }, { "epoch": 0.21172009998825642, "grad_norm": 0.3092271685600281, "learning_rate": 9.045639362236741e-05, "loss": 0.7807, "step": 18930 }, { "epoch": 0.2118319436755191, "grad_norm": 0.23566113412380219, "learning_rate": 9.022796838594728e-05, "loss": 0.7779, "step": 18940 }, { "epoch": 0.21194378736278177, "grad_norm": 0.27366477251052856, "learning_rate": 8.999954314952716e-05, "loss": 0.77, "step": 18950 }, { "epoch": 0.21205563105004446, "grad_norm": 0.23270778357982635, "learning_rate": 8.977111791310704e-05, "loss": 0.7549, "step": 18960 }, { "epoch": 0.21216747473730713, "grad_norm": 0.28785306215286255, "learning_rate": 8.954269267668693e-05, "loss": 0.7677, "step": 18970 }, { "epoch": 0.21227931842456982, "grad_norm": 0.2588510811328888, "learning_rate": 8.93142674402668e-05, "loss": 0.7715, "step": 18980 }, { "epoch": 0.2123911621118325, "grad_norm": 0.248029887676239, "learning_rate": 8.908584220384668e-05, "loss": 0.7749, "step": 18990 }, { "epoch": 0.21250300579909517, "grad_norm": 0.2579936981201172, "learning_rate": 8.885741696742656e-05, "loss": 0.7552, "step": 19000 }, { "epoch": 0.21261484948635787, "grad_norm": 0.26293206214904785, "learning_rate": 8.862899173100645e-05, "loss": 0.7657, "step": 19010 }, { "epoch": 0.21272669317362056, "grad_norm": 0.24589793384075165, "learning_rate": 8.840056649458631e-05, "loss": 0.7598, "step": 19020 }, { "epoch": 0.21283853686088322, "grad_norm": 0.2315252274274826, "learning_rate": 8.817214125816621e-05, "loss": 0.7637, "step": 19030 }, { "epoch": 0.2129503805481459, "grad_norm": 0.2538358271121979, "learning_rate": 8.794371602174608e-05, "loss": 0.7587, "step": 19040 }, { "epoch": 0.2130622242354086, "grad_norm": 0.2626616060733795, "learning_rate": 8.771529078532596e-05, "loss": 0.7597, "step": 19050 }, { "epoch": 0.21317406792267127, "grad_norm": 0.2557279169559479, "learning_rate": 8.748686554890585e-05, "loss": 0.7499, "step": 19060 }, { "epoch": 0.21328591160993396, "grad_norm": 0.25008153915405273, "learning_rate": 8.725844031248573e-05, "loss": 0.7466, "step": 19070 }, { "epoch": 0.21339775529719665, "grad_norm": 0.2647120952606201, "learning_rate": 8.70300150760656e-05, "loss": 0.7574, "step": 19080 }, { "epoch": 0.21350959898445931, "grad_norm": 0.2535738945007324, "learning_rate": 8.68015898396455e-05, "loss": 0.7672, "step": 19090 }, { "epoch": 0.213621442671722, "grad_norm": 0.28925755620002747, "learning_rate": 8.657316460322536e-05, "loss": 0.7692, "step": 19100 }, { "epoch": 0.21373328635898467, "grad_norm": 0.26770591735839844, "learning_rate": 8.634473936680525e-05, "loss": 0.7511, "step": 19110 }, { "epoch": 0.21384513004624736, "grad_norm": 0.25162947177886963, "learning_rate": 8.611631413038512e-05, "loss": 0.7573, "step": 19120 }, { "epoch": 0.21395697373351005, "grad_norm": 0.253324031829834, "learning_rate": 8.588788889396501e-05, "loss": 0.7516, "step": 19130 }, { "epoch": 0.21406881742077272, "grad_norm": 0.2784843146800995, "learning_rate": 8.565946365754488e-05, "loss": 0.7522, "step": 19140 }, { "epoch": 0.2141806611080354, "grad_norm": 0.2869722247123718, "learning_rate": 8.543103842112476e-05, "loss": 0.7525, "step": 19150 }, { "epoch": 0.2142925047952981, "grad_norm": 0.2467101663351059, "learning_rate": 8.520261318470465e-05, "loss": 0.7336, "step": 19160 }, { "epoch": 0.21440434848256076, "grad_norm": 0.26108691096305847, "learning_rate": 8.497418794828453e-05, "loss": 0.751, "step": 19170 }, { "epoch": 0.21451619216982346, "grad_norm": 0.2992580533027649, "learning_rate": 8.47457627118644e-05, "loss": 0.7599, "step": 19180 }, { "epoch": 0.21462803585708615, "grad_norm": 0.2573351562023163, "learning_rate": 8.45173374754443e-05, "loss": 0.752, "step": 19190 }, { "epoch": 0.2147398795443488, "grad_norm": 0.30148234963417053, "learning_rate": 8.428891223902416e-05, "loss": 0.7536, "step": 19200 }, { "epoch": 0.2148517232316115, "grad_norm": 0.2811321020126343, "learning_rate": 8.406048700260405e-05, "loss": 0.761, "step": 19210 }, { "epoch": 0.2149635669188742, "grad_norm": 0.2792038321495056, "learning_rate": 8.383206176618392e-05, "loss": 0.7558, "step": 19220 }, { "epoch": 0.21507541060613686, "grad_norm": 0.30432426929473877, "learning_rate": 8.360363652976381e-05, "loss": 0.7541, "step": 19230 }, { "epoch": 0.21518725429339955, "grad_norm": 0.28335481882095337, "learning_rate": 8.33752112933437e-05, "loss": 0.7628, "step": 19240 }, { "epoch": 0.2152990979806622, "grad_norm": 0.28402864933013916, "learning_rate": 8.314678605692357e-05, "loss": 0.7835, "step": 19250 }, { "epoch": 0.2154109416679249, "grad_norm": 0.2914164662361145, "learning_rate": 8.291836082050346e-05, "loss": 0.7705, "step": 19260 }, { "epoch": 0.2155227853551876, "grad_norm": 0.27296769618988037, "learning_rate": 8.268993558408333e-05, "loss": 0.7791, "step": 19270 }, { "epoch": 0.21563462904245026, "grad_norm": 0.2987435460090637, "learning_rate": 8.246151034766321e-05, "loss": 0.7918, "step": 19280 }, { "epoch": 0.21574647272971295, "grad_norm": 0.2743736207485199, "learning_rate": 8.22330851112431e-05, "loss": 0.7777, "step": 19290 }, { "epoch": 0.21585831641697564, "grad_norm": 0.2775188982486725, "learning_rate": 8.200465987482298e-05, "loss": 0.7811, "step": 19300 }, { "epoch": 0.2159701601042383, "grad_norm": 0.2942585349082947, "learning_rate": 8.177623463840285e-05, "loss": 0.7748, "step": 19310 }, { "epoch": 0.216082003791501, "grad_norm": 0.2545025050640106, "learning_rate": 8.154780940198274e-05, "loss": 0.77, "step": 19320 }, { "epoch": 0.2161938474787637, "grad_norm": 0.2571526765823364, "learning_rate": 8.131938416556261e-05, "loss": 0.7735, "step": 19330 }, { "epoch": 0.21630569116602635, "grad_norm": 0.2687735855579376, "learning_rate": 8.10909589291425e-05, "loss": 0.7703, "step": 19340 }, { "epoch": 0.21641753485328905, "grad_norm": 0.27332374453544617, "learning_rate": 8.086253369272237e-05, "loss": 0.7645, "step": 19350 }, { "epoch": 0.21652937854055174, "grad_norm": 0.25585636496543884, "learning_rate": 8.063410845630226e-05, "loss": 0.7651, "step": 19360 }, { "epoch": 0.2166412222278144, "grad_norm": 0.25861334800720215, "learning_rate": 8.040568321988213e-05, "loss": 0.7788, "step": 19370 }, { "epoch": 0.2167530659150771, "grad_norm": 0.26126453280448914, "learning_rate": 8.017725798346201e-05, "loss": 0.7631, "step": 19380 }, { "epoch": 0.21686490960233978, "grad_norm": 0.27623289823532104, "learning_rate": 7.99488327470419e-05, "loss": 0.7555, "step": 19390 }, { "epoch": 0.21697675328960245, "grad_norm": 0.256489634513855, "learning_rate": 7.972040751062178e-05, "loss": 0.7565, "step": 19400 }, { "epoch": 0.21708859697686514, "grad_norm": 0.26825475692749023, "learning_rate": 7.949198227420165e-05, "loss": 0.7619, "step": 19410 }, { "epoch": 0.2172004406641278, "grad_norm": 0.2633214294910431, "learning_rate": 7.926355703778155e-05, "loss": 0.7576, "step": 19420 }, { "epoch": 0.2173122843513905, "grad_norm": 0.24602185189723969, "learning_rate": 7.903513180136141e-05, "loss": 0.748, "step": 19430 }, { "epoch": 0.21742412803865319, "grad_norm": 0.24769659340381622, "learning_rate": 7.88067065649413e-05, "loss": 0.749, "step": 19440 }, { "epoch": 0.21753597172591585, "grad_norm": 0.22824670374393463, "learning_rate": 7.857828132852117e-05, "loss": 0.7439, "step": 19450 }, { "epoch": 0.21764781541317854, "grad_norm": 0.24848710000514984, "learning_rate": 7.834985609210106e-05, "loss": 0.7422, "step": 19460 }, { "epoch": 0.21775965910044123, "grad_norm": 0.25875037908554077, "learning_rate": 7.812143085568093e-05, "loss": 0.7411, "step": 19470 }, { "epoch": 0.2178715027877039, "grad_norm": 0.24616488814353943, "learning_rate": 7.789300561926082e-05, "loss": 0.723, "step": 19480 }, { "epoch": 0.2179833464749666, "grad_norm": 0.26018476486206055, "learning_rate": 7.76645803828407e-05, "loss": 0.7388, "step": 19490 }, { "epoch": 0.21809519016222928, "grad_norm": 0.24355724453926086, "learning_rate": 7.743615514642058e-05, "loss": 0.7337, "step": 19500 }, { "epoch": 0.21820703384949194, "grad_norm": 0.24908235669136047, "learning_rate": 7.720772991000045e-05, "loss": 0.7378, "step": 19510 }, { "epoch": 0.21831887753675464, "grad_norm": 0.2710162401199341, "learning_rate": 7.697930467358035e-05, "loss": 0.7336, "step": 19520 }, { "epoch": 0.21843072122401733, "grad_norm": 0.24222905933856964, "learning_rate": 7.675087943716022e-05, "loss": 0.7386, "step": 19530 }, { "epoch": 0.21854256491128, "grad_norm": 0.23762881755828857, "learning_rate": 7.65224542007401e-05, "loss": 0.7354, "step": 19540 }, { "epoch": 0.21865440859854268, "grad_norm": 0.25905948877334595, "learning_rate": 7.629402896431998e-05, "loss": 0.7453, "step": 19550 }, { "epoch": 0.21876625228580535, "grad_norm": 0.24563716351985931, "learning_rate": 7.606560372789986e-05, "loss": 0.7422, "step": 19560 }, { "epoch": 0.21887809597306804, "grad_norm": 0.2649664878845215, "learning_rate": 7.583717849147973e-05, "loss": 0.7301, "step": 19570 }, { "epoch": 0.21898993966033073, "grad_norm": 0.24720273911952972, "learning_rate": 7.560875325505962e-05, "loss": 0.7321, "step": 19580 }, { "epoch": 0.2191017833475934, "grad_norm": 0.23652884364128113, "learning_rate": 7.53803280186395e-05, "loss": 0.7296, "step": 19590 }, { "epoch": 0.21921362703485608, "grad_norm": 0.23715312778949738, "learning_rate": 7.515190278221938e-05, "loss": 0.7237, "step": 19600 }, { "epoch": 0.21932547072211878, "grad_norm": 0.2500048577785492, "learning_rate": 7.492347754579925e-05, "loss": 0.7372, "step": 19610 }, { "epoch": 0.21943731440938144, "grad_norm": 0.2575337886810303, "learning_rate": 7.469505230937915e-05, "loss": 0.7393, "step": 19620 }, { "epoch": 0.21954915809664413, "grad_norm": 0.255375474691391, "learning_rate": 7.446662707295902e-05, "loss": 0.75, "step": 19630 }, { "epoch": 0.21966100178390682, "grad_norm": 0.2793714106082916, "learning_rate": 7.42382018365389e-05, "loss": 0.7585, "step": 19640 }, { "epoch": 0.2197728454711695, "grad_norm": 0.2588786482810974, "learning_rate": 7.400977660011878e-05, "loss": 0.7661, "step": 19650 }, { "epoch": 0.21988468915843218, "grad_norm": 0.27130866050720215, "learning_rate": 7.378135136369867e-05, "loss": 0.7579, "step": 19660 }, { "epoch": 0.21999653284569487, "grad_norm": 0.2730309069156647, "learning_rate": 7.355292612727853e-05, "loss": 0.7463, "step": 19670 }, { "epoch": 0.22010837653295753, "grad_norm": 0.24330918490886688, "learning_rate": 7.332450089085842e-05, "loss": 0.7388, "step": 19680 }, { "epoch": 0.22022022022022023, "grad_norm": 0.30004703998565674, "learning_rate": 7.309607565443831e-05, "loss": 0.7633, "step": 19690 }, { "epoch": 0.2203320639074829, "grad_norm": 0.2754705548286438, "learning_rate": 7.286765041801818e-05, "loss": 0.7587, "step": 19700 }, { "epoch": 0.22044390759474558, "grad_norm": 0.27601394057273865, "learning_rate": 7.263922518159807e-05, "loss": 0.7468, "step": 19710 }, { "epoch": 0.22055575128200827, "grad_norm": 0.2328653633594513, "learning_rate": 7.241079994517795e-05, "loss": 0.7432, "step": 19720 }, { "epoch": 0.22066759496927094, "grad_norm": 0.23960436880588531, "learning_rate": 7.218237470875783e-05, "loss": 0.7384, "step": 19730 }, { "epoch": 0.22077943865653363, "grad_norm": 0.2687484323978424, "learning_rate": 7.19539494723377e-05, "loss": 0.738, "step": 19740 }, { "epoch": 0.22089128234379632, "grad_norm": 0.2243189811706543, "learning_rate": 7.17255242359176e-05, "loss": 0.7467, "step": 19750 }, { "epoch": 0.22100312603105898, "grad_norm": 0.26094529032707214, "learning_rate": 7.149709899949747e-05, "loss": 0.7579, "step": 19760 }, { "epoch": 0.22111496971832167, "grad_norm": 0.2761390507221222, "learning_rate": 7.126867376307735e-05, "loss": 0.7491, "step": 19770 }, { "epoch": 0.22122681340558437, "grad_norm": 0.2523578405380249, "learning_rate": 7.104024852665723e-05, "loss": 0.7358, "step": 19780 }, { "epoch": 0.22133865709284703, "grad_norm": 0.25612056255340576, "learning_rate": 7.081182329023711e-05, "loss": 0.7322, "step": 19790 }, { "epoch": 0.22145050078010972, "grad_norm": 0.24379362165927887, "learning_rate": 7.058339805381698e-05, "loss": 0.7438, "step": 19800 }, { "epoch": 0.2215623444673724, "grad_norm": 0.2315502017736435, "learning_rate": 7.035497281739687e-05, "loss": 0.7349, "step": 19810 }, { "epoch": 0.22167418815463508, "grad_norm": 0.41941365599632263, "learning_rate": 7.012654758097675e-05, "loss": 0.743, "step": 19820 }, { "epoch": 0.22178603184189777, "grad_norm": 0.23147599399089813, "learning_rate": 6.989812234455663e-05, "loss": 0.7381, "step": 19830 }, { "epoch": 0.22189787552916043, "grad_norm": 0.25920864939689636, "learning_rate": 6.96696971081365e-05, "loss": 0.7469, "step": 19840 }, { "epoch": 0.22200971921642312, "grad_norm": 0.23870904743671417, "learning_rate": 6.94412718717164e-05, "loss": 0.7476, "step": 19850 }, { "epoch": 0.22212156290368582, "grad_norm": 0.2372673749923706, "learning_rate": 6.921284663529627e-05, "loss": 0.7468, "step": 19860 }, { "epoch": 0.22223340659094848, "grad_norm": 0.2703365683555603, "learning_rate": 6.898442139887615e-05, "loss": 0.742, "step": 19870 }, { "epoch": 0.22234525027821117, "grad_norm": 0.24437329173088074, "learning_rate": 6.875599616245603e-05, "loss": 0.7217, "step": 19880 }, { "epoch": 0.22245709396547386, "grad_norm": 0.21680840849876404, "learning_rate": 6.852757092603592e-05, "loss": 0.7547, "step": 19890 }, { "epoch": 0.22256893765273653, "grad_norm": 0.29101526737213135, "learning_rate": 6.829914568961579e-05, "loss": 0.7389, "step": 19900 }, { "epoch": 0.22268078133999922, "grad_norm": 0.2821531891822815, "learning_rate": 6.807072045319567e-05, "loss": 0.731, "step": 19910 }, { "epoch": 0.2227926250272619, "grad_norm": 0.2773050367832184, "learning_rate": 6.784229521677555e-05, "loss": 0.7369, "step": 19920 }, { "epoch": 0.22290446871452457, "grad_norm": 0.2531367838382721, "learning_rate": 6.761386998035543e-05, "loss": 0.7399, "step": 19930 }, { "epoch": 0.22301631240178726, "grad_norm": 0.28158465027809143, "learning_rate": 6.73854447439353e-05, "loss": 0.7523, "step": 19940 }, { "epoch": 0.22312815608904996, "grad_norm": 0.25612935423851013, "learning_rate": 6.71570195075152e-05, "loss": 0.7725, "step": 19950 }, { "epoch": 0.22323999977631262, "grad_norm": 0.26996153593063354, "learning_rate": 6.692859427109507e-05, "loss": 0.7823, "step": 19960 }, { "epoch": 0.2233518434635753, "grad_norm": 0.28008782863616943, "learning_rate": 6.670016903467495e-05, "loss": 0.7679, "step": 19970 }, { "epoch": 0.22346368715083798, "grad_norm": 0.27016493678092957, "learning_rate": 6.647174379825483e-05, "loss": 0.7617, "step": 19980 }, { "epoch": 0.22357553083810067, "grad_norm": 0.2679850459098816, "learning_rate": 6.624331856183472e-05, "loss": 0.7737, "step": 19990 }, { "epoch": 0.22368737452536336, "grad_norm": 0.2570480406284332, "learning_rate": 6.601489332541459e-05, "loss": 0.758, "step": 20000 }, { "epoch": 0.22379921821262602, "grad_norm": 0.2503785490989685, "learning_rate": 6.578646808899447e-05, "loss": 0.761, "step": 20010 }, { "epoch": 0.2239110618998887, "grad_norm": 0.2648092210292816, "learning_rate": 6.555804285257435e-05, "loss": 0.7532, "step": 20020 }, { "epoch": 0.2240229055871514, "grad_norm": 0.26829221844673157, "learning_rate": 6.532961761615423e-05, "loss": 0.7542, "step": 20030 }, { "epoch": 0.22413474927441407, "grad_norm": 0.27535539865493774, "learning_rate": 6.51011923797341e-05, "loss": 0.7578, "step": 20040 }, { "epoch": 0.22424659296167676, "grad_norm": 0.28674209117889404, "learning_rate": 6.4872767143314e-05, "loss": 0.756, "step": 20050 }, { "epoch": 0.22435843664893945, "grad_norm": 0.2523026466369629, "learning_rate": 6.464434190689387e-05, "loss": 0.7514, "step": 20060 }, { "epoch": 0.22447028033620212, "grad_norm": 0.24213305115699768, "learning_rate": 6.441591667047375e-05, "loss": 0.7546, "step": 20070 }, { "epoch": 0.2245821240234648, "grad_norm": 0.2779023349285126, "learning_rate": 6.418749143405363e-05, "loss": 0.7654, "step": 20080 }, { "epoch": 0.2246939677107275, "grad_norm": 0.28806111216545105, "learning_rate": 6.395906619763352e-05, "loss": 0.7612, "step": 20090 }, { "epoch": 0.22480581139799016, "grad_norm": 0.2637580931186676, "learning_rate": 6.373064096121339e-05, "loss": 0.7659, "step": 20100 }, { "epoch": 0.22491765508525285, "grad_norm": 0.2683275043964386, "learning_rate": 6.350221572479328e-05, "loss": 0.753, "step": 20110 }, { "epoch": 0.22502949877251555, "grad_norm": 0.2693597078323364, "learning_rate": 6.327379048837315e-05, "loss": 0.7697, "step": 20120 }, { "epoch": 0.2251413424597782, "grad_norm": 0.26335635781288147, "learning_rate": 6.304536525195304e-05, "loss": 0.7644, "step": 20130 }, { "epoch": 0.2252531861470409, "grad_norm": 0.29237446188926697, "learning_rate": 6.28169400155329e-05, "loss": 0.7721, "step": 20140 }, { "epoch": 0.22536502983430357, "grad_norm": 0.3080182373523712, "learning_rate": 6.25885147791128e-05, "loss": 0.7666, "step": 20150 }, { "epoch": 0.22547687352156626, "grad_norm": 0.2831542193889618, "learning_rate": 6.236008954269268e-05, "loss": 0.7805, "step": 20160 }, { "epoch": 0.22558871720882895, "grad_norm": 0.2860835790634155, "learning_rate": 6.213166430627257e-05, "loss": 0.7816, "step": 20170 }, { "epoch": 0.2257005608960916, "grad_norm": 0.28273066878318787, "learning_rate": 6.190323906985244e-05, "loss": 0.7812, "step": 20180 }, { "epoch": 0.2258124045833543, "grad_norm": 0.29203614592552185, "learning_rate": 6.167481383343232e-05, "loss": 0.7699, "step": 20190 }, { "epoch": 0.225924248270617, "grad_norm": 0.2811570167541504, "learning_rate": 6.14463885970122e-05, "loss": 0.7833, "step": 20200 }, { "epoch": 0.22603609195787966, "grad_norm": 0.30047500133514404, "learning_rate": 6.121796336059208e-05, "loss": 0.7594, "step": 20210 }, { "epoch": 0.22614793564514235, "grad_norm": 0.2838903069496155, "learning_rate": 6.098953812417196e-05, "loss": 0.7678, "step": 20220 }, { "epoch": 0.22625977933240504, "grad_norm": 0.2840651273727417, "learning_rate": 6.0761112887751836e-05, "loss": 0.7546, "step": 20230 }, { "epoch": 0.2263716230196677, "grad_norm": 0.31575652956962585, "learning_rate": 6.053268765133172e-05, "loss": 0.7533, "step": 20240 }, { "epoch": 0.2264834667069304, "grad_norm": 0.2692145109176636, "learning_rate": 6.03042624149116e-05, "loss": 0.744, "step": 20250 }, { "epoch": 0.2265953103941931, "grad_norm": 0.3094116449356079, "learning_rate": 6.007583717849148e-05, "loss": 0.7708, "step": 20260 }, { "epoch": 0.22670715408145575, "grad_norm": 0.3123047947883606, "learning_rate": 5.984741194207136e-05, "loss": 0.7431, "step": 20270 }, { "epoch": 0.22681899776871844, "grad_norm": 0.2733646631240845, "learning_rate": 5.961898670565124e-05, "loss": 0.762, "step": 20280 }, { "epoch": 0.2269308414559811, "grad_norm": 0.23944342136383057, "learning_rate": 5.939056146923112e-05, "loss": 0.7488, "step": 20290 }, { "epoch": 0.2270426851432438, "grad_norm": 0.2459600865840912, "learning_rate": 5.9162136232811e-05, "loss": 0.7443, "step": 20300 }, { "epoch": 0.2271545288305065, "grad_norm": 0.2502724826335907, "learning_rate": 5.893371099639088e-05, "loss": 0.7417, "step": 20310 }, { "epoch": 0.22726637251776916, "grad_norm": 0.23721522092819214, "learning_rate": 5.870528575997076e-05, "loss": 0.7393, "step": 20320 }, { "epoch": 0.22737821620503185, "grad_norm": 0.2526785135269165, "learning_rate": 5.847686052355064e-05, "loss": 0.7346, "step": 20330 }, { "epoch": 0.22749005989229454, "grad_norm": 0.2573647201061249, "learning_rate": 5.824843528713052e-05, "loss": 0.7192, "step": 20340 }, { "epoch": 0.2276019035795572, "grad_norm": 0.2632768750190735, "learning_rate": 5.80200100507104e-05, "loss": 0.7234, "step": 20350 }, { "epoch": 0.2277137472668199, "grad_norm": 0.2589345872402191, "learning_rate": 5.779158481429028e-05, "loss": 0.7165, "step": 20360 }, { "epoch": 0.22782559095408259, "grad_norm": 0.2480648308992386, "learning_rate": 5.756315957787016e-05, "loss": 0.7099, "step": 20370 }, { "epoch": 0.22793743464134525, "grad_norm": 0.24949654936790466, "learning_rate": 5.733473434145004e-05, "loss": 0.7187, "step": 20380 }, { "epoch": 0.22804927832860794, "grad_norm": 0.25637611746788025, "learning_rate": 5.710630910502993e-05, "loss": 0.7098, "step": 20390 }, { "epoch": 0.22816112201587063, "grad_norm": 0.28809231519699097, "learning_rate": 5.687788386860981e-05, "loss": 0.7315, "step": 20400 }, { "epoch": 0.2282729657031333, "grad_norm": 0.25564566254615784, "learning_rate": 5.6649458632189686e-05, "loss": 0.7319, "step": 20410 }, { "epoch": 0.228384809390396, "grad_norm": 0.2693794369697571, "learning_rate": 5.642103339576957e-05, "loss": 0.7173, "step": 20420 }, { "epoch": 0.22849665307765865, "grad_norm": 0.24680989980697632, "learning_rate": 5.619260815934945e-05, "loss": 0.708, "step": 20430 }, { "epoch": 0.22860849676492134, "grad_norm": 0.2790026068687439, "learning_rate": 5.596418292292933e-05, "loss": 0.7023, "step": 20440 }, { "epoch": 0.22872034045218403, "grad_norm": 0.2656199038028717, "learning_rate": 5.573575768650921e-05, "loss": 0.7113, "step": 20450 }, { "epoch": 0.2288321841394467, "grad_norm": 0.30832743644714355, "learning_rate": 5.550733245008909e-05, "loss": 0.7161, "step": 20460 }, { "epoch": 0.2289440278267094, "grad_norm": 0.27060794830322266, "learning_rate": 5.527890721366897e-05, "loss": 0.7208, "step": 20470 }, { "epoch": 0.22905587151397208, "grad_norm": 0.26036307215690613, "learning_rate": 5.505048197724885e-05, "loss": 0.7004, "step": 20480 }, { "epoch": 0.22916771520123475, "grad_norm": 0.2758086919784546, "learning_rate": 5.482205674082873e-05, "loss": 0.7179, "step": 20490 }, { "epoch": 0.22927955888849744, "grad_norm": 0.2821243107318878, "learning_rate": 5.459363150440861e-05, "loss": 0.7255, "step": 20500 }, { "epoch": 0.22939140257576013, "grad_norm": 0.2782810628414154, "learning_rate": 5.436520626798849e-05, "loss": 0.7149, "step": 20510 }, { "epoch": 0.2295032462630228, "grad_norm": 0.2755940854549408, "learning_rate": 5.413678103156837e-05, "loss": 0.7117, "step": 20520 }, { "epoch": 0.22961508995028548, "grad_norm": 0.29176777601242065, "learning_rate": 5.390835579514825e-05, "loss": 0.7188, "step": 20530 }, { "epoch": 0.22972693363754818, "grad_norm": 0.27739444375038147, "learning_rate": 5.367993055872813e-05, "loss": 0.7196, "step": 20540 }, { "epoch": 0.22983877732481084, "grad_norm": 0.27187204360961914, "learning_rate": 5.345150532230801e-05, "loss": 0.722, "step": 20550 }, { "epoch": 0.22995062101207353, "grad_norm": 0.2951996624469757, "learning_rate": 5.322308008588789e-05, "loss": 0.7325, "step": 20560 }, { "epoch": 0.2300624646993362, "grad_norm": 0.2677932381629944, "learning_rate": 5.299465484946777e-05, "loss": 0.7263, "step": 20570 }, { "epoch": 0.23017430838659889, "grad_norm": 0.29231807589530945, "learning_rate": 5.2766229613047654e-05, "loss": 0.7284, "step": 20580 }, { "epoch": 0.23028615207386158, "grad_norm": 0.30211326479911804, "learning_rate": 5.253780437662753e-05, "loss": 0.7222, "step": 20590 }, { "epoch": 0.23039799576112424, "grad_norm": 0.29821720719337463, "learning_rate": 5.230937914020741e-05, "loss": 0.7316, "step": 20600 }, { "epoch": 0.23050983944838693, "grad_norm": 0.3019379675388336, "learning_rate": 5.208095390378729e-05, "loss": 0.7328, "step": 20610 }, { "epoch": 0.23062168313564962, "grad_norm": 0.2569403052330017, "learning_rate": 5.185252866736717e-05, "loss": 0.7215, "step": 20620 }, { "epoch": 0.2307335268229123, "grad_norm": 0.3151782155036926, "learning_rate": 5.1624103430947054e-05, "loss": 0.7326, "step": 20630 }, { "epoch": 0.23084537051017498, "grad_norm": 0.2748591899871826, "learning_rate": 5.139567819452693e-05, "loss": 0.7359, "step": 20640 }, { "epoch": 0.23095721419743767, "grad_norm": 0.27494433522224426, "learning_rate": 5.116725295810681e-05, "loss": 0.7351, "step": 20650 }, { "epoch": 0.23106905788470034, "grad_norm": 0.29428452253341675, "learning_rate": 5.093882772168669e-05, "loss": 0.7361, "step": 20660 }, { "epoch": 0.23118090157196303, "grad_norm": 0.2924981117248535, "learning_rate": 5.071040248526657e-05, "loss": 0.7539, "step": 20670 }, { "epoch": 0.23129274525922572, "grad_norm": 0.28647035360336304, "learning_rate": 5.0481977248846455e-05, "loss": 0.7576, "step": 20680 }, { "epoch": 0.23140458894648838, "grad_norm": 0.3107542097568512, "learning_rate": 5.025355201242633e-05, "loss": 0.7615, "step": 20690 }, { "epoch": 0.23151643263375107, "grad_norm": 0.27186501026153564, "learning_rate": 5.0025126776006213e-05, "loss": 0.7641, "step": 20700 }, { "epoch": 0.23162827632101374, "grad_norm": 0.2838156819343567, "learning_rate": 4.9796701539586096e-05, "loss": 0.7695, "step": 20710 }, { "epoch": 0.23174012000827643, "grad_norm": 0.3377101421356201, "learning_rate": 4.956827630316597e-05, "loss": 0.7696, "step": 20720 }, { "epoch": 0.23185196369553912, "grad_norm": 0.3177778422832489, "learning_rate": 4.9339851066745855e-05, "loss": 0.7677, "step": 20730 }, { "epoch": 0.23196380738280178, "grad_norm": 0.3157583773136139, "learning_rate": 4.911142583032573e-05, "loss": 0.7653, "step": 20740 }, { "epoch": 0.23207565107006448, "grad_norm": 0.3123907148838043, "learning_rate": 4.8883000593905614e-05, "loss": 0.7677, "step": 20750 }, { "epoch": 0.23218749475732717, "grad_norm": 0.30460426211357117, "learning_rate": 4.86545753574855e-05, "loss": 0.7743, "step": 20760 }, { "epoch": 0.23229933844458983, "grad_norm": 0.27507251501083374, "learning_rate": 4.842615012106537e-05, "loss": 0.767, "step": 20770 }, { "epoch": 0.23241118213185252, "grad_norm": 0.3233499228954315, "learning_rate": 4.8197724884645256e-05, "loss": 0.7717, "step": 20780 }, { "epoch": 0.23252302581911521, "grad_norm": 0.30144819617271423, "learning_rate": 4.796929964822513e-05, "loss": 0.7609, "step": 20790 }, { "epoch": 0.23263486950637788, "grad_norm": 0.29588454961776733, "learning_rate": 4.7740874411805014e-05, "loss": 0.7682, "step": 20800 }, { "epoch": 0.23274671319364057, "grad_norm": 0.3111203610897064, "learning_rate": 4.75124491753849e-05, "loss": 0.7652, "step": 20810 }, { "epoch": 0.23285855688090326, "grad_norm": 0.28917646408081055, "learning_rate": 4.728402393896477e-05, "loss": 0.7584, "step": 20820 }, { "epoch": 0.23297040056816593, "grad_norm": 0.3156343698501587, "learning_rate": 4.7055598702544656e-05, "loss": 0.7643, "step": 20830 }, { "epoch": 0.23308224425542862, "grad_norm": 0.2909680902957916, "learning_rate": 4.682717346612454e-05, "loss": 0.7613, "step": 20840 }, { "epoch": 0.2331940879426913, "grad_norm": 0.3006870746612549, "learning_rate": 4.659874822970442e-05, "loss": 0.7603, "step": 20850 }, { "epoch": 0.23330593162995397, "grad_norm": 0.2844945192337036, "learning_rate": 4.6370322993284304e-05, "loss": 0.7589, "step": 20860 }, { "epoch": 0.23341777531721666, "grad_norm": 0.26857924461364746, "learning_rate": 4.614189775686418e-05, "loss": 0.7401, "step": 20870 }, { "epoch": 0.23352961900447933, "grad_norm": 0.31332314014434814, "learning_rate": 4.591347252044406e-05, "loss": 0.7468, "step": 20880 }, { "epoch": 0.23364146269174202, "grad_norm": 0.28083765506744385, "learning_rate": 4.568504728402394e-05, "loss": 0.7451, "step": 20890 }, { "epoch": 0.2337533063790047, "grad_norm": 0.29185009002685547, "learning_rate": 4.545662204760382e-05, "loss": 0.7478, "step": 20900 }, { "epoch": 0.23386515006626737, "grad_norm": 0.30532801151275635, "learning_rate": 4.5228196811183705e-05, "loss": 0.7404, "step": 20910 }, { "epoch": 0.23397699375353007, "grad_norm": 0.2724134922027588, "learning_rate": 4.499977157476358e-05, "loss": 0.732, "step": 20920 }, { "epoch": 0.23408883744079276, "grad_norm": 0.29753822088241577, "learning_rate": 4.4771346338343464e-05, "loss": 0.7236, "step": 20930 }, { "epoch": 0.23420068112805542, "grad_norm": 0.31980055570602417, "learning_rate": 4.454292110192334e-05, "loss": 0.7407, "step": 20940 }, { "epoch": 0.2343125248153181, "grad_norm": 0.29578351974487305, "learning_rate": 4.431449586550322e-05, "loss": 0.7166, "step": 20950 }, { "epoch": 0.2344243685025808, "grad_norm": 0.25261184573173523, "learning_rate": 4.4086070629083105e-05, "loss": 0.7195, "step": 20960 }, { "epoch": 0.23453621218984347, "grad_norm": 0.2669534385204315, "learning_rate": 4.385764539266298e-05, "loss": 0.7224, "step": 20970 }, { "epoch": 0.23464805587710616, "grad_norm": 0.2817215919494629, "learning_rate": 4.3629220156242864e-05, "loss": 0.7405, "step": 20980 }, { "epoch": 0.23475989956436885, "grad_norm": 0.27033400535583496, "learning_rate": 4.340079491982275e-05, "loss": 0.7292, "step": 20990 }, { "epoch": 0.23487174325163152, "grad_norm": 0.3083013594150543, "learning_rate": 4.317236968340262e-05, "loss": 0.7271, "step": 21000 }, { "epoch": 0.2349835869388942, "grad_norm": 0.27074989676475525, "learning_rate": 4.2943944446982506e-05, "loss": 0.7346, "step": 21010 }, { "epoch": 0.23509543062615687, "grad_norm": 0.31609755754470825, "learning_rate": 4.271551921056238e-05, "loss": 0.7285, "step": 21020 }, { "epoch": 0.23520727431341956, "grad_norm": 0.27084672451019287, "learning_rate": 4.2487093974142265e-05, "loss": 0.7411, "step": 21030 }, { "epoch": 0.23531911800068225, "grad_norm": 0.26669842004776, "learning_rate": 4.225866873772215e-05, "loss": 0.7423, "step": 21040 }, { "epoch": 0.23543096168794492, "grad_norm": 0.2873358428478241, "learning_rate": 4.2030243501302024e-05, "loss": 0.7345, "step": 21050 }, { "epoch": 0.2355428053752076, "grad_norm": 0.2831687033176422, "learning_rate": 4.1801818264881906e-05, "loss": 0.7537, "step": 21060 }, { "epoch": 0.2356546490624703, "grad_norm": 0.2781788110733032, "learning_rate": 4.157339302846178e-05, "loss": 0.7494, "step": 21070 }, { "epoch": 0.23576649274973296, "grad_norm": 0.27109071612358093, "learning_rate": 4.1344967792041665e-05, "loss": 0.7493, "step": 21080 }, { "epoch": 0.23587833643699566, "grad_norm": 0.25398164987564087, "learning_rate": 4.111654255562155e-05, "loss": 0.7369, "step": 21090 }, { "epoch": 0.23599018012425835, "grad_norm": 0.3150353729724884, "learning_rate": 4.0888117319201424e-05, "loss": 0.754, "step": 21100 }, { "epoch": 0.236102023811521, "grad_norm": 0.27384257316589355, "learning_rate": 4.065969208278131e-05, "loss": 0.7439, "step": 21110 }, { "epoch": 0.2362138674987837, "grad_norm": 0.2770559787750244, "learning_rate": 4.043126684636118e-05, "loss": 0.7391, "step": 21120 }, { "epoch": 0.2363257111860464, "grad_norm": 0.29367002844810486, "learning_rate": 4.0202841609941066e-05, "loss": 0.746, "step": 21130 }, { "epoch": 0.23643755487330906, "grad_norm": 0.2554051876068115, "learning_rate": 3.997441637352095e-05, "loss": 0.7386, "step": 21140 }, { "epoch": 0.23654939856057175, "grad_norm": 0.2943428158760071, "learning_rate": 3.9745991137100825e-05, "loss": 0.7437, "step": 21150 }, { "epoch": 0.2366612422478344, "grad_norm": 0.24465301632881165, "learning_rate": 3.951756590068071e-05, "loss": 0.7331, "step": 21160 }, { "epoch": 0.2367730859350971, "grad_norm": 0.2545934021472931, "learning_rate": 3.9289140664260584e-05, "loss": 0.7361, "step": 21170 }, { "epoch": 0.2368849296223598, "grad_norm": 0.2792121469974518, "learning_rate": 3.9060715427840466e-05, "loss": 0.7238, "step": 21180 }, { "epoch": 0.23699677330962246, "grad_norm": 0.27943745255470276, "learning_rate": 3.883229019142035e-05, "loss": 0.726, "step": 21190 }, { "epoch": 0.23710861699688515, "grad_norm": 0.2514471411705017, "learning_rate": 3.8603864955000225e-05, "loss": 0.7214, "step": 21200 }, { "epoch": 0.23722046068414784, "grad_norm": 0.2698551416397095, "learning_rate": 3.837543971858011e-05, "loss": 0.7318, "step": 21210 }, { "epoch": 0.2373323043714105, "grad_norm": 0.29603877663612366, "learning_rate": 3.814701448215999e-05, "loss": 0.742, "step": 21220 }, { "epoch": 0.2374441480586732, "grad_norm": 0.26655495166778564, "learning_rate": 3.791858924573987e-05, "loss": 0.7331, "step": 21230 }, { "epoch": 0.2375559917459359, "grad_norm": 0.29367104172706604, "learning_rate": 3.769016400931975e-05, "loss": 0.7233, "step": 21240 }, { "epoch": 0.23766783543319855, "grad_norm": 0.2680334746837616, "learning_rate": 3.7461738772899626e-05, "loss": 0.732, "step": 21250 }, { "epoch": 0.23777967912046125, "grad_norm": 0.2748298943042755, "learning_rate": 3.723331353647951e-05, "loss": 0.7453, "step": 21260 }, { "epoch": 0.23789152280772394, "grad_norm": 0.28276947140693665, "learning_rate": 3.700488830005939e-05, "loss": 0.7524, "step": 21270 }, { "epoch": 0.2380033664949866, "grad_norm": 0.2645372450351715, "learning_rate": 3.677646306363927e-05, "loss": 0.7542, "step": 21280 }, { "epoch": 0.2381152101822493, "grad_norm": 0.2866505980491638, "learning_rate": 3.654803782721916e-05, "loss": 0.7447, "step": 21290 }, { "epoch": 0.23822705386951196, "grad_norm": 0.29611489176750183, "learning_rate": 3.631961259079903e-05, "loss": 0.7662, "step": 21300 }, { "epoch": 0.23833889755677465, "grad_norm": 0.29184749722480774, "learning_rate": 3.6091187354378916e-05, "loss": 0.7558, "step": 21310 }, { "epoch": 0.23845074124403734, "grad_norm": 0.27304571866989136, "learning_rate": 3.58627621179588e-05, "loss": 0.7578, "step": 21320 }, { "epoch": 0.2385625849313, "grad_norm": 0.2700962424278259, "learning_rate": 3.5634336881538675e-05, "loss": 0.7411, "step": 21330 }, { "epoch": 0.2386744286185627, "grad_norm": 0.2845793664455414, "learning_rate": 3.540591164511856e-05, "loss": 0.7392, "step": 21340 }, { "epoch": 0.2387862723058254, "grad_norm": 0.32136180996894836, "learning_rate": 3.5177486408698433e-05, "loss": 0.7431, "step": 21350 }, { "epoch": 0.23889811599308805, "grad_norm": 0.26846998929977417, "learning_rate": 3.4949061172278316e-05, "loss": 0.737, "step": 21360 }, { "epoch": 0.23900995968035074, "grad_norm": 0.26363828778266907, "learning_rate": 3.47206359358582e-05, "loss": 0.7416, "step": 21370 }, { "epoch": 0.23912180336761343, "grad_norm": 0.2900106906890869, "learning_rate": 3.4492210699438075e-05, "loss": 0.7373, "step": 21380 }, { "epoch": 0.2392336470548761, "grad_norm": 0.2762589156627655, "learning_rate": 3.426378546301796e-05, "loss": 0.7379, "step": 21390 }, { "epoch": 0.2393454907421388, "grad_norm": 0.2697104513645172, "learning_rate": 3.4035360226597834e-05, "loss": 0.7448, "step": 21400 }, { "epoch": 0.23945733442940148, "grad_norm": 0.2901761829853058, "learning_rate": 3.380693499017772e-05, "loss": 0.7394, "step": 21410 }, { "epoch": 0.23956917811666414, "grad_norm": 0.245674267411232, "learning_rate": 3.35785097537576e-05, "loss": 0.7387, "step": 21420 }, { "epoch": 0.23968102180392684, "grad_norm": 0.2713403105735779, "learning_rate": 3.3350084517337476e-05, "loss": 0.7604, "step": 21430 }, { "epoch": 0.2397928654911895, "grad_norm": 0.27368244528770447, "learning_rate": 3.312165928091736e-05, "loss": 0.7489, "step": 21440 }, { "epoch": 0.2399047091784522, "grad_norm": 0.3079991340637207, "learning_rate": 3.2893234044497234e-05, "loss": 0.7653, "step": 21450 }, { "epoch": 0.24001655286571488, "grad_norm": 0.2920658588409424, "learning_rate": 3.266480880807712e-05, "loss": 0.7588, "step": 21460 }, { "epoch": 0.24012839655297755, "grad_norm": 0.27589842677116394, "learning_rate": 3.2436383571657e-05, "loss": 0.7607, "step": 21470 }, { "epoch": 0.24024024024024024, "grad_norm": 0.2592112720012665, "learning_rate": 3.2207958335236876e-05, "loss": 0.745, "step": 21480 }, { "epoch": 0.24035208392750293, "grad_norm": 0.27625855803489685, "learning_rate": 3.197953309881676e-05, "loss": 0.7488, "step": 21490 }, { "epoch": 0.2404639276147656, "grad_norm": 0.2769569456577301, "learning_rate": 3.175110786239664e-05, "loss": 0.7326, "step": 21500 }, { "epoch": 0.24057577130202829, "grad_norm": 0.2705914080142975, "learning_rate": 3.152268262597652e-05, "loss": 0.7512, "step": 21510 }, { "epoch": 0.24068761498929098, "grad_norm": 0.2655676603317261, "learning_rate": 3.12942573895564e-05, "loss": 0.7366, "step": 21520 }, { "epoch": 0.24079945867655364, "grad_norm": 0.2606657147407532, "learning_rate": 3.106583215313628e-05, "loss": 0.7436, "step": 21530 }, { "epoch": 0.24091130236381633, "grad_norm": 0.27843552827835083, "learning_rate": 3.083740691671616e-05, "loss": 0.7342, "step": 21540 }, { "epoch": 0.24102314605107902, "grad_norm": 0.27866050601005554, "learning_rate": 3.060898168029604e-05, "loss": 0.7305, "step": 21550 }, { "epoch": 0.2411349897383417, "grad_norm": 0.2803070545196533, "learning_rate": 3.0380556443875918e-05, "loss": 0.727, "step": 21560 }, { "epoch": 0.24124683342560438, "grad_norm": 0.27220121026039124, "learning_rate": 3.01521312074558e-05, "loss": 0.7195, "step": 21570 }, { "epoch": 0.24135867711286707, "grad_norm": 0.26060426235198975, "learning_rate": 2.992370597103568e-05, "loss": 0.7013, "step": 21580 }, { "epoch": 0.24147052080012973, "grad_norm": 0.24253526329994202, "learning_rate": 2.969528073461556e-05, "loss": 0.6925, "step": 21590 }, { "epoch": 0.24158236448739243, "grad_norm": 0.26293566823005676, "learning_rate": 2.946685549819544e-05, "loss": 0.7028, "step": 21600 }, { "epoch": 0.2416942081746551, "grad_norm": 0.26427412033081055, "learning_rate": 2.923843026177532e-05, "loss": 0.6993, "step": 21610 }, { "epoch": 0.24180605186191778, "grad_norm": 0.26823869347572327, "learning_rate": 2.90100050253552e-05, "loss": 0.6999, "step": 21620 }, { "epoch": 0.24191789554918047, "grad_norm": 0.24203690886497498, "learning_rate": 2.878157978893508e-05, "loss": 0.6906, "step": 21630 }, { "epoch": 0.24202973923644314, "grad_norm": 0.2612786889076233, "learning_rate": 2.8553154552514964e-05, "loss": 0.6952, "step": 21640 }, { "epoch": 0.24214158292370583, "grad_norm": 0.27152737975120544, "learning_rate": 2.8324729316094843e-05, "loss": 0.692, "step": 21650 }, { "epoch": 0.24225342661096852, "grad_norm": 0.2592925727367401, "learning_rate": 2.8096304079674726e-05, "loss": 0.6995, "step": 21660 }, { "epoch": 0.24236527029823118, "grad_norm": 0.2419063299894333, "learning_rate": 2.7867878843254605e-05, "loss": 0.7067, "step": 21670 }, { "epoch": 0.24247711398549388, "grad_norm": 0.24731135368347168, "learning_rate": 2.7639453606834485e-05, "loss": 0.734, "step": 21680 }, { "epoch": 0.24258895767275657, "grad_norm": 0.25746017694473267, "learning_rate": 2.7411028370414364e-05, "loss": 0.7075, "step": 21690 }, { "epoch": 0.24270080136001923, "grad_norm": 0.2521972060203552, "learning_rate": 2.7182603133994244e-05, "loss": 0.7137, "step": 21700 }, { "epoch": 0.24281264504728192, "grad_norm": 0.26796218752861023, "learning_rate": 2.6954177897574127e-05, "loss": 0.7227, "step": 21710 }, { "epoch": 0.2429244887345446, "grad_norm": 0.30404597520828247, "learning_rate": 2.6725752661154006e-05, "loss": 0.7243, "step": 21720 }, { "epoch": 0.24303633242180728, "grad_norm": 0.29561156034469604, "learning_rate": 2.6497327424733885e-05, "loss": 0.7357, "step": 21730 }, { "epoch": 0.24314817610906997, "grad_norm": 0.28066596388816833, "learning_rate": 2.6268902188313765e-05, "loss": 0.7224, "step": 21740 }, { "epoch": 0.24326001979633263, "grad_norm": 0.29235216975212097, "learning_rate": 2.6040476951893644e-05, "loss": 0.7288, "step": 21750 }, { "epoch": 0.24337186348359532, "grad_norm": 0.26750460267066956, "learning_rate": 2.5812051715473527e-05, "loss": 0.7414, "step": 21760 }, { "epoch": 0.24348370717085802, "grad_norm": 0.2707473039627075, "learning_rate": 2.5583626479053406e-05, "loss": 0.7478, "step": 21770 }, { "epoch": 0.24359555085812068, "grad_norm": 0.26526397466659546, "learning_rate": 2.5355201242633286e-05, "loss": 0.7513, "step": 21780 }, { "epoch": 0.24370739454538337, "grad_norm": 0.2362915724515915, "learning_rate": 2.5126776006213165e-05, "loss": 0.7507, "step": 21790 }, { "epoch": 0.24381923823264606, "grad_norm": 0.2512950599193573, "learning_rate": 2.4898350769793048e-05, "loss": 0.7417, "step": 21800 }, { "epoch": 0.24393108191990873, "grad_norm": 0.2366458922624588, "learning_rate": 2.4669925533372928e-05, "loss": 0.7402, "step": 21810 }, { "epoch": 0.24404292560717142, "grad_norm": 0.24888353049755096, "learning_rate": 2.4441500296952807e-05, "loss": 0.7456, "step": 21820 }, { "epoch": 0.2441547692944341, "grad_norm": 0.24143491685390472, "learning_rate": 2.4213075060532686e-05, "loss": 0.7405, "step": 21830 }, { "epoch": 0.24426661298169677, "grad_norm": 0.2669823169708252, "learning_rate": 2.3984649824112566e-05, "loss": 0.7544, "step": 21840 }, { "epoch": 0.24437845666895947, "grad_norm": 0.24328452348709106, "learning_rate": 2.375622458769245e-05, "loss": 0.7347, "step": 21850 }, { "epoch": 0.24449030035622216, "grad_norm": 0.26204219460487366, "learning_rate": 2.3527799351272328e-05, "loss": 0.7397, "step": 21860 }, { "epoch": 0.24460214404348482, "grad_norm": 0.2631550431251526, "learning_rate": 2.329937411485221e-05, "loss": 0.7413, "step": 21870 }, { "epoch": 0.2447139877307475, "grad_norm": 0.2729988694190979, "learning_rate": 2.307094887843209e-05, "loss": 0.7336, "step": 21880 }, { "epoch": 0.24482583141801018, "grad_norm": 0.2702917754650116, "learning_rate": 2.284252364201197e-05, "loss": 0.7294, "step": 21890 }, { "epoch": 0.24493767510527287, "grad_norm": 0.22882196307182312, "learning_rate": 2.2614098405591852e-05, "loss": 0.7164, "step": 21900 }, { "epoch": 0.24504951879253556, "grad_norm": 0.2660382390022278, "learning_rate": 2.2385673169171732e-05, "loss": 0.7231, "step": 21910 }, { "epoch": 0.24516136247979822, "grad_norm": 0.2580036222934723, "learning_rate": 2.215724793275161e-05, "loss": 0.7243, "step": 21920 }, { "epoch": 0.24527320616706091, "grad_norm": 0.25490158796310425, "learning_rate": 2.192882269633149e-05, "loss": 0.7129, "step": 21930 }, { "epoch": 0.2453850498543236, "grad_norm": 0.2626509368419647, "learning_rate": 2.1700397459911374e-05, "loss": 0.7177, "step": 21940 }, { "epoch": 0.24549689354158627, "grad_norm": 0.2642146646976471, "learning_rate": 2.1471972223491253e-05, "loss": 0.7119, "step": 21950 }, { "epoch": 0.24560873722884896, "grad_norm": 0.2683079242706299, "learning_rate": 2.1243546987071132e-05, "loss": 0.7226, "step": 21960 }, { "epoch": 0.24572058091611165, "grad_norm": 0.26513761281967163, "learning_rate": 2.1015121750651012e-05, "loss": 0.7276, "step": 21970 }, { "epoch": 0.24583242460337432, "grad_norm": 0.25856319069862366, "learning_rate": 2.078669651423089e-05, "loss": 0.7168, "step": 21980 }, { "epoch": 0.245944268290637, "grad_norm": 0.29048866033554077, "learning_rate": 2.0558271277810774e-05, "loss": 0.7189, "step": 21990 }, { "epoch": 0.2460561119778997, "grad_norm": 0.2775687575340271, "learning_rate": 2.0329846041390653e-05, "loss": 0.7276, "step": 22000 }, { "epoch": 0.24616795566516236, "grad_norm": 0.30157843232154846, "learning_rate": 2.0101420804970533e-05, "loss": 0.7435, "step": 22010 }, { "epoch": 0.24627979935242506, "grad_norm": 0.2602044939994812, "learning_rate": 1.9872995568550412e-05, "loss": 0.7365, "step": 22020 }, { "epoch": 0.24639164303968772, "grad_norm": 0.29975757002830505, "learning_rate": 1.9644570332130292e-05, "loss": 0.7484, "step": 22030 }, { "epoch": 0.2465034867269504, "grad_norm": 0.26586923003196716, "learning_rate": 1.9416145095710175e-05, "loss": 0.7499, "step": 22040 }, { "epoch": 0.2466153304142131, "grad_norm": 0.25447341799736023, "learning_rate": 1.9187719859290054e-05, "loss": 0.7523, "step": 22050 }, { "epoch": 0.24672717410147577, "grad_norm": 0.2876524031162262, "learning_rate": 1.8959294622869933e-05, "loss": 0.7532, "step": 22060 }, { "epoch": 0.24683901778873846, "grad_norm": 0.29897189140319824, "learning_rate": 1.8730869386449813e-05, "loss": 0.7339, "step": 22070 }, { "epoch": 0.24695086147600115, "grad_norm": 0.24629873037338257, "learning_rate": 1.8502444150029696e-05, "loss": 0.7253, "step": 22080 }, { "epoch": 0.2470627051632638, "grad_norm": 0.2844459116458893, "learning_rate": 1.827401891360958e-05, "loss": 0.7247, "step": 22090 }, { "epoch": 0.2471745488505265, "grad_norm": 0.2798469662666321, "learning_rate": 1.8045593677189458e-05, "loss": 0.7334, "step": 22100 }, { "epoch": 0.2472863925377892, "grad_norm": 0.26282501220703125, "learning_rate": 1.7817168440769337e-05, "loss": 0.735, "step": 22110 }, { "epoch": 0.24739823622505186, "grad_norm": 0.25192755460739136, "learning_rate": 1.7588743204349217e-05, "loss": 0.733, "step": 22120 }, { "epoch": 0.24751007991231455, "grad_norm": 0.2808292508125305, "learning_rate": 1.73603179679291e-05, "loss": 0.7403, "step": 22130 }, { "epoch": 0.24762192359957724, "grad_norm": 0.28252866864204407, "learning_rate": 1.713189273150898e-05, "loss": 0.7296, "step": 22140 }, { "epoch": 0.2477337672868399, "grad_norm": 0.2730456590652466, "learning_rate": 1.690346749508886e-05, "loss": 0.7321, "step": 22150 }, { "epoch": 0.2478456109741026, "grad_norm": 0.2562378942966461, "learning_rate": 1.6675042258668738e-05, "loss": 0.7195, "step": 22160 }, { "epoch": 0.2479574546613653, "grad_norm": 0.2450082004070282, "learning_rate": 1.6446617022248617e-05, "loss": 0.7277, "step": 22170 }, { "epoch": 0.24806929834862795, "grad_norm": 0.25871893763542175, "learning_rate": 1.62181917858285e-05, "loss": 0.7143, "step": 22180 }, { "epoch": 0.24818114203589065, "grad_norm": 0.2587449848651886, "learning_rate": 1.598976654940838e-05, "loss": 0.708, "step": 22190 }, { "epoch": 0.2482929857231533, "grad_norm": 0.25496092438697815, "learning_rate": 1.576134131298826e-05, "loss": 0.7123, "step": 22200 }, { "epoch": 0.248404829410416, "grad_norm": 0.2394058257341385, "learning_rate": 1.553291607656814e-05, "loss": 0.714, "step": 22210 }, { "epoch": 0.2485166730976787, "grad_norm": 0.2560165524482727, "learning_rate": 1.530449084014802e-05, "loss": 0.7162, "step": 22220 }, { "epoch": 0.24862851678494136, "grad_norm": 0.24602052569389343, "learning_rate": 1.50760656037279e-05, "loss": 0.7408, "step": 22230 }, { "epoch": 0.24874036047220405, "grad_norm": 0.27800559997558594, "learning_rate": 1.484764036730778e-05, "loss": 0.7247, "step": 22240 }, { "epoch": 0.24885220415946674, "grad_norm": 0.24703536927700043, "learning_rate": 1.461921513088766e-05, "loss": 0.7352, "step": 22250 }, { "epoch": 0.2489640478467294, "grad_norm": 0.27936097979545593, "learning_rate": 1.439078989446754e-05, "loss": 0.7421, "step": 22260 }, { "epoch": 0.2490758915339921, "grad_norm": 0.265828400850296, "learning_rate": 1.4162364658047422e-05, "loss": 0.7234, "step": 22270 }, { "epoch": 0.24918773522125479, "grad_norm": 0.26921194791793823, "learning_rate": 1.3933939421627303e-05, "loss": 0.7414, "step": 22280 }, { "epoch": 0.24929957890851745, "grad_norm": 0.2829255163669586, "learning_rate": 1.3705514185207182e-05, "loss": 0.7378, "step": 22290 }, { "epoch": 0.24941142259578014, "grad_norm": 0.25702667236328125, "learning_rate": 1.3477088948787063e-05, "loss": 0.7475, "step": 22300 }, { "epoch": 0.24952326628304283, "grad_norm": 0.28925350308418274, "learning_rate": 1.3248663712366943e-05, "loss": 0.738, "step": 22310 }, { "epoch": 0.2496351099703055, "grad_norm": 0.2792825698852539, "learning_rate": 1.3020238475946822e-05, "loss": 0.7315, "step": 22320 }, { "epoch": 0.2497469536575682, "grad_norm": 0.246215358376503, "learning_rate": 1.2791813239526703e-05, "loss": 0.7391, "step": 22330 }, { "epoch": 0.24985879734483085, "grad_norm": 0.26492443680763245, "learning_rate": 1.2563388003106583e-05, "loss": 0.7478, "step": 22340 }, { "epoch": 0.24997064103209354, "grad_norm": 0.27402445673942566, "learning_rate": 1.2334962766686464e-05, "loss": 0.7528, "step": 22350 }, { "epoch": 0.25008248471935624, "grad_norm": 0.2757234573364258, "learning_rate": 1.2106537530266343e-05, "loss": 0.7306, "step": 22360 }, { "epoch": 0.2501943284066189, "grad_norm": 0.2723679840564728, "learning_rate": 1.1878112293846224e-05, "loss": 0.7472, "step": 22370 }, { "epoch": 0.2503061720938816, "grad_norm": 0.22666431963443756, "learning_rate": 1.1649687057426105e-05, "loss": 0.7443, "step": 22380 }, { "epoch": 0.25041801578114425, "grad_norm": 0.24548636376857758, "learning_rate": 1.1421261821005985e-05, "loss": 0.7525, "step": 22390 }, { "epoch": 0.25052985946840695, "grad_norm": 0.26941460371017456, "learning_rate": 1.1192836584585866e-05, "loss": 0.7482, "step": 22400 }, { "epoch": 0.25064170315566964, "grad_norm": 0.2741219997406006, "learning_rate": 1.0964411348165745e-05, "loss": 0.7404, "step": 22410 }, { "epoch": 0.25075354684293233, "grad_norm": 0.2622029483318329, "learning_rate": 1.0735986111745626e-05, "loss": 0.7463, "step": 22420 }, { "epoch": 0.250865390530195, "grad_norm": 0.25730788707733154, "learning_rate": 1.0507560875325506e-05, "loss": 0.7596, "step": 22430 }, { "epoch": 0.25097723421745766, "grad_norm": 0.24054691195487976, "learning_rate": 1.0279135638905387e-05, "loss": 0.7397, "step": 22440 }, { "epoch": 0.25108907790472035, "grad_norm": 0.23557224869728088, "learning_rate": 1.0050710402485266e-05, "loss": 0.7426, "step": 22450 }, { "epoch": 0.25120092159198304, "grad_norm": 0.25929298996925354, "learning_rate": 9.822285166065146e-06, "loss": 0.7402, "step": 22460 }, { "epoch": 0.25131276527924573, "grad_norm": 0.26300865411758423, "learning_rate": 9.593859929645027e-06, "loss": 0.755, "step": 22470 }, { "epoch": 0.2514246089665084, "grad_norm": 0.25753623247146606, "learning_rate": 9.365434693224906e-06, "loss": 0.7536, "step": 22480 }, { "epoch": 0.2515364526537711, "grad_norm": 0.2438272088766098, "learning_rate": 9.13700945680479e-06, "loss": 0.7528, "step": 22490 }, { "epoch": 0.25164829634103375, "grad_norm": 0.2870919406414032, "learning_rate": 8.908584220384669e-06, "loss": 0.772, "step": 22500 }, { "epoch": 0.25176014002829644, "grad_norm": 0.2551197111606598, "learning_rate": 8.68015898396455e-06, "loss": 0.7571, "step": 22510 }, { "epoch": 0.25187198371555913, "grad_norm": 0.24423009157180786, "learning_rate": 8.45173374754443e-06, "loss": 0.7548, "step": 22520 }, { "epoch": 0.2519838274028218, "grad_norm": 0.2683405578136444, "learning_rate": 8.223308511124309e-06, "loss": 0.7631, "step": 22530 }, { "epoch": 0.2520956710900845, "grad_norm": 0.25919967889785767, "learning_rate": 7.99488327470419e-06, "loss": 0.7556, "step": 22540 }, { "epoch": 0.25220751477734715, "grad_norm": 0.25076591968536377, "learning_rate": 7.76645803828407e-06, "loss": 0.7528, "step": 22550 }, { "epoch": 0.25231935846460984, "grad_norm": 0.2598860561847687, "learning_rate": 7.53803280186395e-06, "loss": 0.7565, "step": 22560 }, { "epoch": 0.25243120215187254, "grad_norm": 0.30933788418769836, "learning_rate": 7.30960756544383e-06, "loss": 0.7645, "step": 22570 }, { "epoch": 0.2525430458391352, "grad_norm": 0.26472121477127075, "learning_rate": 7.081182329023711e-06, "loss": 0.7559, "step": 22580 }, { "epoch": 0.2526548895263979, "grad_norm": 0.28362420201301575, "learning_rate": 6.852757092603591e-06, "loss": 0.7618, "step": 22590 }, { "epoch": 0.2527667332136606, "grad_norm": 0.27758538722991943, "learning_rate": 6.624331856183471e-06, "loss": 0.7656, "step": 22600 }, { "epoch": 0.25287857690092325, "grad_norm": 0.28303948044776917, "learning_rate": 6.395906619763352e-06, "loss": 0.7672, "step": 22610 }, { "epoch": 0.25299042058818594, "grad_norm": 0.2938460409641266, "learning_rate": 6.167481383343232e-06, "loss": 0.7662, "step": 22620 }, { "epoch": 0.25310226427544863, "grad_norm": 0.25707969069480896, "learning_rate": 5.939056146923112e-06, "loss": 0.7667, "step": 22630 }, { "epoch": 0.2532141079627113, "grad_norm": 0.2813314199447632, "learning_rate": 5.710630910502992e-06, "loss": 0.7645, "step": 22640 }, { "epoch": 0.253325951649974, "grad_norm": 0.2911704480648041, "learning_rate": 5.482205674082873e-06, "loss": 0.763, "step": 22650 }, { "epoch": 0.2534377953372367, "grad_norm": 0.2982921600341797, "learning_rate": 5.253780437662753e-06, "loss": 0.7606, "step": 22660 }, { "epoch": 0.25354963902449934, "grad_norm": 0.2803521156311035, "learning_rate": 5.025355201242633e-06, "loss": 0.7617, "step": 22670 }, { "epoch": 0.25366148271176203, "grad_norm": 0.26502448320388794, "learning_rate": 4.7969299648225135e-06, "loss": 0.7802, "step": 22680 }, { "epoch": 0.2537733263990247, "grad_norm": 0.27778494358062744, "learning_rate": 4.568504728402395e-06, "loss": 0.7776, "step": 22690 }, { "epoch": 0.2538851700862874, "grad_norm": 0.27522069215774536, "learning_rate": 4.340079491982275e-06, "loss": 0.7712, "step": 22700 }, { "epoch": 0.2539970137735501, "grad_norm": 0.2718433141708374, "learning_rate": 4.111654255562154e-06, "loss": 0.7696, "step": 22710 }, { "epoch": 0.25410885746081274, "grad_norm": 0.35057663917541504, "learning_rate": 3.883229019142035e-06, "loss": 0.7648, "step": 22720 }, { "epoch": 0.25422070114807543, "grad_norm": 0.274494469165802, "learning_rate": 3.654803782721915e-06, "loss": 0.7578, "step": 22730 }, { "epoch": 0.2543325448353381, "grad_norm": 0.2570250928401947, "learning_rate": 3.4263785463017955e-06, "loss": 0.7502, "step": 22740 }, { "epoch": 0.2544443885226008, "grad_norm": 0.290217787027359, "learning_rate": 3.197953309881676e-06, "loss": 0.7607, "step": 22750 }, { "epoch": 0.2545562322098635, "grad_norm": 0.25752514600753784, "learning_rate": 2.969528073461556e-06, "loss": 0.7612, "step": 22760 }, { "epoch": 0.2546680758971262, "grad_norm": 0.23857931792736053, "learning_rate": 2.7411028370414363e-06, "loss": 0.7495, "step": 22770 }, { "epoch": 0.25477991958438884, "grad_norm": 0.26004472374916077, "learning_rate": 2.5126776006213166e-06, "loss": 0.7477, "step": 22780 }, { "epoch": 0.25489176327165153, "grad_norm": 0.25449565052986145, "learning_rate": 2.2842523642011973e-06, "loss": 0.7379, "step": 22790 }, { "epoch": 0.2550036069589142, "grad_norm": 0.2568104565143585, "learning_rate": 2.055827127781077e-06, "loss": 0.7407, "step": 22800 }, { "epoch": 0.2551154506461769, "grad_norm": 0.253451406955719, "learning_rate": 1.8274018913609574e-06, "loss": 0.7241, "step": 22810 }, { "epoch": 0.2552272943334396, "grad_norm": 0.25928062200546265, "learning_rate": 1.598976654940838e-06, "loss": 0.7502, "step": 22820 }, { "epoch": 0.2553391380207023, "grad_norm": 0.24965140223503113, "learning_rate": 1.3705514185207182e-06, "loss": 0.7417, "step": 22830 }, { "epoch": 0.25545098170796493, "grad_norm": 0.2660306394100189, "learning_rate": 1.1421261821005987e-06, "loss": 0.7463, "step": 22840 }, { "epoch": 0.2555628253952276, "grad_norm": 0.25784334540367126, "learning_rate": 9.137009456804787e-07, "loss": 0.7379, "step": 22850 }, { "epoch": 0.2556746690824903, "grad_norm": 0.27776214480400085, "learning_rate": 6.852757092603591e-07, "loss": 0.7562, "step": 22860 }, { "epoch": 0.255786512769753, "grad_norm": 0.24403463304042816, "learning_rate": 4.5685047284023936e-07, "loss": 0.7427, "step": 22870 }, { "epoch": 0.2558983564570157, "grad_norm": 0.24544622004032135, "learning_rate": 2.2842523642011968e-07, "loss": 0.748, "step": 22880 } ], "logging_steps": 10, "max_steps": 22889, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.946484739580887e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }