{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 32844, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009134088417975886, "grad_norm": 21.00069236755371, "learning_rate": 3.0441400304414e-08, "loss": 0.6507, "step": 10 }, { "epoch": 0.001826817683595177, "grad_norm": 6.837961196899414, "learning_rate": 6.0882800608828e-08, "loss": 0.6481, "step": 20 }, { "epoch": 0.002740226525392766, "grad_norm": 4.603323459625244, "learning_rate": 9.132420091324201e-08, "loss": 0.5895, "step": 30 }, { "epoch": 0.003653635367190354, "grad_norm": 9.59748649597168, "learning_rate": 1.21765601217656e-07, "loss": 0.7129, "step": 40 }, { "epoch": 0.004567044208987943, "grad_norm": 9.313572883605957, "learning_rate": 1.5220700152207e-07, "loss": 0.6207, "step": 50 }, { "epoch": 0.005480453050785532, "grad_norm": 13.349953651428223, "learning_rate": 1.8264840182648401e-07, "loss": 0.667, "step": 60 }, { "epoch": 0.00639386189258312, "grad_norm": 9.330903053283691, "learning_rate": 2.1308980213089802e-07, "loss": 0.5354, "step": 70 }, { "epoch": 0.007307270734380708, "grad_norm": 11.617569923400879, "learning_rate": 2.43531202435312e-07, "loss": 0.5729, "step": 80 }, { "epoch": 0.008220679576178298, "grad_norm": 10.405041694641113, "learning_rate": 2.73972602739726e-07, "loss": 0.6503, "step": 90 }, { "epoch": 0.009134088417975887, "grad_norm": 8.202179908752441, "learning_rate": 3.0441400304414e-07, "loss": 0.5959, "step": 100 }, { "epoch": 0.010047497259773474, "grad_norm": 12.210713386535645, "learning_rate": 3.3485540334855403e-07, "loss": 0.616, "step": 110 }, { "epoch": 0.010960906101571063, "grad_norm": 4.015219211578369, "learning_rate": 3.6529680365296803e-07, "loss": 0.5604, "step": 120 }, { "epoch": 0.011874314943368651, "grad_norm": 12.470812797546387, "learning_rate": 3.9573820395738203e-07, "loss": 0.6016, "step": 130 }, { "epoch": 0.01278772378516624, "grad_norm": 13.140501976013184, "learning_rate": 4.2617960426179603e-07, "loss": 0.5499, "step": 140 }, { "epoch": 0.01370113262696383, "grad_norm": 12.370471954345703, "learning_rate": 4.5662100456621004e-07, "loss": 0.5448, "step": 150 }, { "epoch": 0.014614541468761417, "grad_norm": 3.008544921875, "learning_rate": 4.87062404870624e-07, "loss": 0.479, "step": 160 }, { "epoch": 0.015527950310559006, "grad_norm": 5.42787504196167, "learning_rate": 5.17503805175038e-07, "loss": 0.4673, "step": 170 }, { "epoch": 0.016441359152356595, "grad_norm": 4.146634101867676, "learning_rate": 5.47945205479452e-07, "loss": 0.4077, "step": 180 }, { "epoch": 0.017354767994154183, "grad_norm": 5.374739646911621, "learning_rate": 5.78386605783866e-07, "loss": 0.4748, "step": 190 }, { "epoch": 0.018268176835951774, "grad_norm": 4.852890968322754, "learning_rate": 6.0882800608828e-07, "loss": 0.4691, "step": 200 }, { "epoch": 0.01918158567774936, "grad_norm": 7.511833190917969, "learning_rate": 6.39269406392694e-07, "loss": 0.429, "step": 210 }, { "epoch": 0.02009499451954695, "grad_norm": 7.170776844024658, "learning_rate": 6.697108066971081e-07, "loss": 0.4556, "step": 220 }, { "epoch": 0.02100840336134454, "grad_norm": 5.283788204193115, "learning_rate": 7.001522070015221e-07, "loss": 0.3925, "step": 230 }, { "epoch": 0.021921812203142127, "grad_norm": 6.619223117828369, "learning_rate": 7.305936073059361e-07, "loss": 0.409, "step": 240 }, { "epoch": 0.022835221044939714, "grad_norm": 3.8212814331054688, "learning_rate": 7.610350076103501e-07, "loss": 0.3845, "step": 250 }, { "epoch": 0.023748629886737302, "grad_norm": 3.5113377571105957, "learning_rate": 7.914764079147641e-07, "loss": 0.3622, "step": 260 }, { "epoch": 0.024662038728534893, "grad_norm": 6.521552085876465, "learning_rate": 8.219178082191781e-07, "loss": 0.4738, "step": 270 }, { "epoch": 0.02557544757033248, "grad_norm": 3.658064126968384, "learning_rate": 8.523592085235921e-07, "loss": 0.4758, "step": 280 }, { "epoch": 0.026488856412130068, "grad_norm": 4.014369964599609, "learning_rate": 8.828006088280061e-07, "loss": 0.4186, "step": 290 }, { "epoch": 0.02740226525392766, "grad_norm": 5.266791820526123, "learning_rate": 9.132420091324201e-07, "loss": 0.3436, "step": 300 }, { "epoch": 0.028315674095725246, "grad_norm": 3.866048812866211, "learning_rate": 9.436834094368341e-07, "loss": 0.3625, "step": 310 }, { "epoch": 0.029229082937522834, "grad_norm": 2.820458173751831, "learning_rate": 9.74124809741248e-07, "loss": 0.3596, "step": 320 }, { "epoch": 0.030142491779320425, "grad_norm": 4.385409355163574, "learning_rate": 1.0045662100456622e-06, "loss": 0.4366, "step": 330 }, { "epoch": 0.031055900621118012, "grad_norm": 6.302259922027588, "learning_rate": 1.035007610350076e-06, "loss": 0.4441, "step": 340 }, { "epoch": 0.0319693094629156, "grad_norm": 5.2402167320251465, "learning_rate": 1.0654490106544902e-06, "loss": 0.3982, "step": 350 }, { "epoch": 0.03288271830471319, "grad_norm": 5.894989967346191, "learning_rate": 1.095890410958904e-06, "loss": 0.3496, "step": 360 }, { "epoch": 0.03379612714651078, "grad_norm": 4.226273536682129, "learning_rate": 1.1263318112633182e-06, "loss": 0.3429, "step": 370 }, { "epoch": 0.034709535988308365, "grad_norm": 3.223613977432251, "learning_rate": 1.156773211567732e-06, "loss": 0.3717, "step": 380 }, { "epoch": 0.03562294483010595, "grad_norm": 7.992124557495117, "learning_rate": 1.1872146118721462e-06, "loss": 0.3956, "step": 390 }, { "epoch": 0.03653635367190355, "grad_norm": 4.104525566101074, "learning_rate": 1.21765601217656e-06, "loss": 0.3628, "step": 400 }, { "epoch": 0.037449762513701135, "grad_norm": 4.614723205566406, "learning_rate": 1.2480974124809742e-06, "loss": 0.3566, "step": 410 }, { "epoch": 0.03836317135549872, "grad_norm": 4.603489398956299, "learning_rate": 1.278538812785388e-06, "loss": 0.3579, "step": 420 }, { "epoch": 0.03927658019729631, "grad_norm": 5.456618309020996, "learning_rate": 1.3089802130898022e-06, "loss": 0.3307, "step": 430 }, { "epoch": 0.0401899890390939, "grad_norm": 2.884916305541992, "learning_rate": 1.3394216133942161e-06, "loss": 0.3664, "step": 440 }, { "epoch": 0.041103397880891485, "grad_norm": 4.94288969039917, "learning_rate": 1.3698630136986302e-06, "loss": 0.3914, "step": 450 }, { "epoch": 0.04201680672268908, "grad_norm": 10.238210678100586, "learning_rate": 1.4003044140030441e-06, "loss": 0.3723, "step": 460 }, { "epoch": 0.042930215564486666, "grad_norm": 3.802596092224121, "learning_rate": 1.4307458143074582e-06, "loss": 0.39, "step": 470 }, { "epoch": 0.043843624406284254, "grad_norm": 4.166705131530762, "learning_rate": 1.4611872146118721e-06, "loss": 0.3645, "step": 480 }, { "epoch": 0.04475703324808184, "grad_norm": 4.38026237487793, "learning_rate": 1.4916286149162862e-06, "loss": 0.3668, "step": 490 }, { "epoch": 0.04567044208987943, "grad_norm": 3.1928367614746094, "learning_rate": 1.5220700152207001e-06, "loss": 0.3379, "step": 500 }, { "epoch": 0.046583850931677016, "grad_norm": 5.309569358825684, "learning_rate": 1.5525114155251142e-06, "loss": 0.4046, "step": 510 }, { "epoch": 0.047497259773474604, "grad_norm": 3.017387866973877, "learning_rate": 1.5829528158295281e-06, "loss": 0.4291, "step": 520 }, { "epoch": 0.0484106686152722, "grad_norm": 5.370119571685791, "learning_rate": 1.6133942161339422e-06, "loss": 0.3435, "step": 530 }, { "epoch": 0.049324077457069786, "grad_norm": 5.038543701171875, "learning_rate": 1.6438356164383561e-06, "loss": 0.4034, "step": 540 }, { "epoch": 0.05023748629886737, "grad_norm": 4.526076316833496, "learning_rate": 1.6742770167427702e-06, "loss": 0.3742, "step": 550 }, { "epoch": 0.05115089514066496, "grad_norm": 3.281928539276123, "learning_rate": 1.7047184170471841e-06, "loss": 0.4166, "step": 560 }, { "epoch": 0.05206430398246255, "grad_norm": 4.673582077026367, "learning_rate": 1.7351598173515982e-06, "loss": 0.3779, "step": 570 }, { "epoch": 0.052977712824260136, "grad_norm": 3.6332755088806152, "learning_rate": 1.7656012176560121e-06, "loss": 0.2771, "step": 580 }, { "epoch": 0.05389112166605773, "grad_norm": 3.0287764072418213, "learning_rate": 1.7960426179604263e-06, "loss": 0.3741, "step": 590 }, { "epoch": 0.05480453050785532, "grad_norm": 3.0409655570983887, "learning_rate": 1.8264840182648401e-06, "loss": 0.3941, "step": 600 }, { "epoch": 0.055717939349652905, "grad_norm": 5.109714508056641, "learning_rate": 1.8569254185692543e-06, "loss": 0.4196, "step": 610 }, { "epoch": 0.05663134819145049, "grad_norm": 3.3487319946289062, "learning_rate": 1.8873668188736682e-06, "loss": 0.351, "step": 620 }, { "epoch": 0.05754475703324808, "grad_norm": 2.760874032974243, "learning_rate": 1.9178082191780823e-06, "loss": 0.3784, "step": 630 }, { "epoch": 0.05845816587504567, "grad_norm": 2.8917722702026367, "learning_rate": 1.948249619482496e-06, "loss": 0.328, "step": 640 }, { "epoch": 0.05937157471684326, "grad_norm": 4.323854923248291, "learning_rate": 1.97869101978691e-06, "loss": 0.3533, "step": 650 }, { "epoch": 0.06028498355864085, "grad_norm": 5.832457542419434, "learning_rate": 2.0091324200913244e-06, "loss": 0.4051, "step": 660 }, { "epoch": 0.06119839240043844, "grad_norm": 3.585838794708252, "learning_rate": 2.0395738203957383e-06, "loss": 0.3354, "step": 670 }, { "epoch": 0.062111801242236024, "grad_norm": 4.0990447998046875, "learning_rate": 2.070015220700152e-06, "loss": 0.3969, "step": 680 }, { "epoch": 0.06302521008403361, "grad_norm": 3.286261796951294, "learning_rate": 2.100456621004566e-06, "loss": 0.3469, "step": 690 }, { "epoch": 0.0639386189258312, "grad_norm": 4.076005935668945, "learning_rate": 2.1308980213089804e-06, "loss": 0.334, "step": 700 }, { "epoch": 0.06485202776762879, "grad_norm": 4.9160380363464355, "learning_rate": 2.1613394216133943e-06, "loss": 0.3072, "step": 710 }, { "epoch": 0.06576543660942638, "grad_norm": 3.1665396690368652, "learning_rate": 2.191780821917808e-06, "loss": 0.3937, "step": 720 }, { "epoch": 0.06667884545122396, "grad_norm": 3.435957193374634, "learning_rate": 2.222222222222222e-06, "loss": 0.3946, "step": 730 }, { "epoch": 0.06759225429302156, "grad_norm": 5.128020763397217, "learning_rate": 2.2526636225266364e-06, "loss": 0.3615, "step": 740 }, { "epoch": 0.06850566313481915, "grad_norm": 2.1916685104370117, "learning_rate": 2.2831050228310503e-06, "loss": 0.4105, "step": 750 }, { "epoch": 0.06941907197661673, "grad_norm": 6.325859069824219, "learning_rate": 2.313546423135464e-06, "loss": 0.3621, "step": 760 }, { "epoch": 0.07033248081841433, "grad_norm": 3.1526148319244385, "learning_rate": 2.343987823439878e-06, "loss": 0.3404, "step": 770 }, { "epoch": 0.0712458896602119, "grad_norm": 5.251583099365234, "learning_rate": 2.3744292237442924e-06, "loss": 0.3204, "step": 780 }, { "epoch": 0.0721592985020095, "grad_norm": 3.0475053787231445, "learning_rate": 2.4048706240487063e-06, "loss": 0.39, "step": 790 }, { "epoch": 0.0730727073438071, "grad_norm": 3.3210880756378174, "learning_rate": 2.43531202435312e-06, "loss": 0.3157, "step": 800 }, { "epoch": 0.07398611618560468, "grad_norm": 3.724000930786133, "learning_rate": 2.4657534246575345e-06, "loss": 0.3921, "step": 810 }, { "epoch": 0.07489952502740227, "grad_norm": 2.8855550289154053, "learning_rate": 2.4961948249619484e-06, "loss": 0.3213, "step": 820 }, { "epoch": 0.07581293386919985, "grad_norm": 3.136337995529175, "learning_rate": 2.5266362252663623e-06, "loss": 0.2551, "step": 830 }, { "epoch": 0.07672634271099744, "grad_norm": 3.2091124057769775, "learning_rate": 2.557077625570776e-06, "loss": 0.3399, "step": 840 }, { "epoch": 0.07763975155279502, "grad_norm": 6.208425998687744, "learning_rate": 2.5875190258751905e-06, "loss": 0.3303, "step": 850 }, { "epoch": 0.07855316039459262, "grad_norm": 3.4714815616607666, "learning_rate": 2.6179604261796044e-06, "loss": 0.2991, "step": 860 }, { "epoch": 0.07946656923639021, "grad_norm": 3.4115118980407715, "learning_rate": 2.6484018264840183e-06, "loss": 0.3414, "step": 870 }, { "epoch": 0.0803799780781878, "grad_norm": 4.860008239746094, "learning_rate": 2.6788432267884322e-06, "loss": 0.4324, "step": 880 }, { "epoch": 0.08129338691998539, "grad_norm": 5.334558010101318, "learning_rate": 2.7092846270928465e-06, "loss": 0.3387, "step": 890 }, { "epoch": 0.08220679576178297, "grad_norm": 4.784554958343506, "learning_rate": 2.7397260273972604e-06, "loss": 0.4076, "step": 900 }, { "epoch": 0.08312020460358056, "grad_norm": 3.209930419921875, "learning_rate": 2.7701674277016743e-06, "loss": 0.4025, "step": 910 }, { "epoch": 0.08403361344537816, "grad_norm": 7.056468963623047, "learning_rate": 2.8006088280060882e-06, "loss": 0.3783, "step": 920 }, { "epoch": 0.08494702228717574, "grad_norm": 5.736630916595459, "learning_rate": 2.8310502283105025e-06, "loss": 0.425, "step": 930 }, { "epoch": 0.08586043112897333, "grad_norm": 4.473703384399414, "learning_rate": 2.8614916286149164e-06, "loss": 0.3191, "step": 940 }, { "epoch": 0.08677383997077091, "grad_norm": 7.93830680847168, "learning_rate": 2.8919330289193303e-06, "loss": 0.3618, "step": 950 }, { "epoch": 0.08768724881256851, "grad_norm": 2.605342149734497, "learning_rate": 2.9223744292237442e-06, "loss": 0.265, "step": 960 }, { "epoch": 0.08860065765436609, "grad_norm": 5.600216865539551, "learning_rate": 2.9528158295281586e-06, "loss": 0.2861, "step": 970 }, { "epoch": 0.08951406649616368, "grad_norm": 6.595243453979492, "learning_rate": 2.9832572298325725e-06, "loss": 0.3062, "step": 980 }, { "epoch": 0.09042747533796128, "grad_norm": 5.1137776374816895, "learning_rate": 3.0136986301369864e-06, "loss": 0.4026, "step": 990 }, { "epoch": 0.09134088417975886, "grad_norm": 6.8435750007629395, "learning_rate": 3.0441400304414002e-06, "loss": 0.2832, "step": 1000 }, { "epoch": 0.09225429302155645, "grad_norm": 3.3159053325653076, "learning_rate": 3.0745814307458146e-06, "loss": 0.3036, "step": 1010 }, { "epoch": 0.09316770186335403, "grad_norm": 3.3568296432495117, "learning_rate": 3.1050228310502285e-06, "loss": 0.3575, "step": 1020 }, { "epoch": 0.09408111070515163, "grad_norm": 6.4950385093688965, "learning_rate": 3.1354642313546424e-06, "loss": 0.3952, "step": 1030 }, { "epoch": 0.09499451954694921, "grad_norm": 3.9376261234283447, "learning_rate": 3.1659056316590563e-06, "loss": 0.3425, "step": 1040 }, { "epoch": 0.0959079283887468, "grad_norm": 10.152996063232422, "learning_rate": 3.1963470319634706e-06, "loss": 0.3498, "step": 1050 }, { "epoch": 0.0968213372305444, "grad_norm": 3.802440643310547, "learning_rate": 3.2267884322678845e-06, "loss": 0.3647, "step": 1060 }, { "epoch": 0.09773474607234198, "grad_norm": 10.519351959228516, "learning_rate": 3.2572298325722984e-06, "loss": 0.3131, "step": 1070 }, { "epoch": 0.09864815491413957, "grad_norm": 4.218064308166504, "learning_rate": 3.2876712328767123e-06, "loss": 0.3377, "step": 1080 }, { "epoch": 0.09956156375593715, "grad_norm": 2.4957351684570312, "learning_rate": 3.3181126331811266e-06, "loss": 0.4005, "step": 1090 }, { "epoch": 0.10047497259773475, "grad_norm": 2.914581537246704, "learning_rate": 3.3485540334855405e-06, "loss": 0.3708, "step": 1100 }, { "epoch": 0.10138838143953234, "grad_norm": 2.552434206008911, "learning_rate": 3.3789954337899544e-06, "loss": 0.362, "step": 1110 }, { "epoch": 0.10230179028132992, "grad_norm": 6.404033660888672, "learning_rate": 3.4094368340943683e-06, "loss": 0.361, "step": 1120 }, { "epoch": 0.10321519912312752, "grad_norm": 8.508428573608398, "learning_rate": 3.4398782343987826e-06, "loss": 0.3632, "step": 1130 }, { "epoch": 0.1041286079649251, "grad_norm": 4.746245384216309, "learning_rate": 3.4703196347031965e-06, "loss": 0.3127, "step": 1140 }, { "epoch": 0.10504201680672269, "grad_norm": 5.235960960388184, "learning_rate": 3.5007610350076104e-06, "loss": 0.3052, "step": 1150 }, { "epoch": 0.10595542564852027, "grad_norm": 4.14143180847168, "learning_rate": 3.5312024353120243e-06, "loss": 0.3172, "step": 1160 }, { "epoch": 0.10686883449031787, "grad_norm": 4.019923210144043, "learning_rate": 3.5616438356164386e-06, "loss": 0.3221, "step": 1170 }, { "epoch": 0.10778224333211546, "grad_norm": 3.331969738006592, "learning_rate": 3.5920852359208525e-06, "loss": 0.3708, "step": 1180 }, { "epoch": 0.10869565217391304, "grad_norm": 3.9821300506591797, "learning_rate": 3.6225266362252664e-06, "loss": 0.4094, "step": 1190 }, { "epoch": 0.10960906101571063, "grad_norm": 3.322673797607422, "learning_rate": 3.6529680365296803e-06, "loss": 0.3012, "step": 1200 }, { "epoch": 0.11052246985750822, "grad_norm": 2.5091915130615234, "learning_rate": 3.6834094368340946e-06, "loss": 0.323, "step": 1210 }, { "epoch": 0.11143587869930581, "grad_norm": 3.2331368923187256, "learning_rate": 3.7138508371385085e-06, "loss": 0.3201, "step": 1220 }, { "epoch": 0.1123492875411034, "grad_norm": 2.9532523155212402, "learning_rate": 3.7442922374429224e-06, "loss": 0.3043, "step": 1230 }, { "epoch": 0.11326269638290098, "grad_norm": 1.8346952199935913, "learning_rate": 3.7747336377473363e-06, "loss": 0.3219, "step": 1240 }, { "epoch": 0.11417610522469858, "grad_norm": 3.4957823753356934, "learning_rate": 3.8051750380517506e-06, "loss": 0.3701, "step": 1250 }, { "epoch": 0.11508951406649616, "grad_norm": 5.400407791137695, "learning_rate": 3.8356164383561645e-06, "loss": 0.3423, "step": 1260 }, { "epoch": 0.11600292290829375, "grad_norm": 5.074181079864502, "learning_rate": 3.866057838660579e-06, "loss": 0.2945, "step": 1270 }, { "epoch": 0.11691633175009133, "grad_norm": 6.368045806884766, "learning_rate": 3.896499238964992e-06, "loss": 0.4465, "step": 1280 }, { "epoch": 0.11782974059188893, "grad_norm": 4.613990306854248, "learning_rate": 3.926940639269407e-06, "loss": 0.3608, "step": 1290 }, { "epoch": 0.11874314943368652, "grad_norm": 3.2069010734558105, "learning_rate": 3.95738203957382e-06, "loss": 0.3354, "step": 1300 }, { "epoch": 0.1196565582754841, "grad_norm": 4.676751136779785, "learning_rate": 3.9878234398782344e-06, "loss": 0.3264, "step": 1310 }, { "epoch": 0.1205699671172817, "grad_norm": 4.901774883270264, "learning_rate": 4.018264840182649e-06, "loss": 0.2965, "step": 1320 }, { "epoch": 0.12148337595907928, "grad_norm": 3.4611661434173584, "learning_rate": 4.048706240487062e-06, "loss": 0.3501, "step": 1330 }, { "epoch": 0.12239678480087687, "grad_norm": 4.863470077514648, "learning_rate": 4.0791476407914765e-06, "loss": 0.3573, "step": 1340 }, { "epoch": 0.12331019364267447, "grad_norm": 4.200720310211182, "learning_rate": 4.109589041095891e-06, "loss": 0.2862, "step": 1350 }, { "epoch": 0.12422360248447205, "grad_norm": 3.7996675968170166, "learning_rate": 4.140030441400304e-06, "loss": 0.3424, "step": 1360 }, { "epoch": 0.12513701132626964, "grad_norm": 3.232576847076416, "learning_rate": 4.170471841704719e-06, "loss": 0.3137, "step": 1370 }, { "epoch": 0.12605042016806722, "grad_norm": 4.050148010253906, "learning_rate": 4.200913242009132e-06, "loss": 0.3595, "step": 1380 }, { "epoch": 0.1269638290098648, "grad_norm": 4.28303337097168, "learning_rate": 4.2313546423135464e-06, "loss": 0.38, "step": 1390 }, { "epoch": 0.1278772378516624, "grad_norm": 7.246886730194092, "learning_rate": 4.261796042617961e-06, "loss": 0.3172, "step": 1400 }, { "epoch": 0.12879064669346, "grad_norm": 2.5083565711975098, "learning_rate": 4.292237442922374e-06, "loss": 0.3496, "step": 1410 }, { "epoch": 0.12970405553525757, "grad_norm": 2.5495970249176025, "learning_rate": 4.3226788432267886e-06, "loss": 0.3399, "step": 1420 }, { "epoch": 0.13061746437705518, "grad_norm": 5.846104621887207, "learning_rate": 4.353120243531203e-06, "loss": 0.313, "step": 1430 }, { "epoch": 0.13153087321885276, "grad_norm": 4.331517696380615, "learning_rate": 4.383561643835616e-06, "loss": 0.2662, "step": 1440 }, { "epoch": 0.13244428206065034, "grad_norm": 2.560365676879883, "learning_rate": 4.414003044140031e-06, "loss": 0.3692, "step": 1450 }, { "epoch": 0.13335769090244792, "grad_norm": 3.7095258235931396, "learning_rate": 4.444444444444444e-06, "loss": 0.2963, "step": 1460 }, { "epoch": 0.13427109974424553, "grad_norm": 4.856600284576416, "learning_rate": 4.4748858447488585e-06, "loss": 0.2713, "step": 1470 }, { "epoch": 0.1351845085860431, "grad_norm": 10.327225685119629, "learning_rate": 4.505327245053273e-06, "loss": 0.3531, "step": 1480 }, { "epoch": 0.1360979174278407, "grad_norm": 2.399681568145752, "learning_rate": 4.535768645357686e-06, "loss": 0.3119, "step": 1490 }, { "epoch": 0.1370113262696383, "grad_norm": 5.375316143035889, "learning_rate": 4.566210045662101e-06, "loss": 0.2987, "step": 1500 }, { "epoch": 0.13792473511143588, "grad_norm": 5.824618816375732, "learning_rate": 4.596651445966515e-06, "loss": 0.3312, "step": 1510 }, { "epoch": 0.13883814395323346, "grad_norm": 4.408431529998779, "learning_rate": 4.627092846270928e-06, "loss": 0.3371, "step": 1520 }, { "epoch": 0.13975155279503104, "grad_norm": 6.270742416381836, "learning_rate": 4.657534246575343e-06, "loss": 0.325, "step": 1530 }, { "epoch": 0.14066496163682865, "grad_norm": 3.8607237339019775, "learning_rate": 4.687975646879756e-06, "loss": 0.4032, "step": 1540 }, { "epoch": 0.14157837047862623, "grad_norm": 3.133507013320923, "learning_rate": 4.7184170471841705e-06, "loss": 0.3156, "step": 1550 }, { "epoch": 0.1424917793204238, "grad_norm": 4.044239044189453, "learning_rate": 4.748858447488585e-06, "loss": 0.4146, "step": 1560 }, { "epoch": 0.14340518816222142, "grad_norm": 4.199702262878418, "learning_rate": 4.779299847792998e-06, "loss": 0.3372, "step": 1570 }, { "epoch": 0.144318597004019, "grad_norm": 5.7882280349731445, "learning_rate": 4.809741248097413e-06, "loss": 0.3388, "step": 1580 }, { "epoch": 0.14523200584581658, "grad_norm": 3.1476380825042725, "learning_rate": 4.840182648401827e-06, "loss": 0.3482, "step": 1590 }, { "epoch": 0.1461454146876142, "grad_norm": 3.493863344192505, "learning_rate": 4.87062404870624e-06, "loss": 0.3717, "step": 1600 }, { "epoch": 0.14705882352941177, "grad_norm": 4.103028297424316, "learning_rate": 4.901065449010655e-06, "loss": 0.3307, "step": 1610 }, { "epoch": 0.14797223237120935, "grad_norm": 4.591942310333252, "learning_rate": 4.931506849315069e-06, "loss": 0.3487, "step": 1620 }, { "epoch": 0.14888564121300693, "grad_norm": 2.425877332687378, "learning_rate": 4.9619482496194825e-06, "loss": 0.3165, "step": 1630 }, { "epoch": 0.14979905005480454, "grad_norm": 2.882711172103882, "learning_rate": 4.992389649923897e-06, "loss": 0.392, "step": 1640 }, { "epoch": 0.15071245889660212, "grad_norm": 3.218334674835205, "learning_rate": 5.02283105022831e-06, "loss": 0.2808, "step": 1650 }, { "epoch": 0.1516258677383997, "grad_norm": 4.905381679534912, "learning_rate": 5.053272450532725e-06, "loss": 0.3079, "step": 1660 }, { "epoch": 0.1525392765801973, "grad_norm": 6.432162761688232, "learning_rate": 5.083713850837139e-06, "loss": 0.3295, "step": 1670 }, { "epoch": 0.1534526854219949, "grad_norm": 2.4773406982421875, "learning_rate": 5.114155251141552e-06, "loss": 0.3414, "step": 1680 }, { "epoch": 0.15436609426379247, "grad_norm": 2.9371869564056396, "learning_rate": 5.144596651445967e-06, "loss": 0.2963, "step": 1690 }, { "epoch": 0.15527950310559005, "grad_norm": 3.950488328933716, "learning_rate": 5.175038051750381e-06, "loss": 0.3663, "step": 1700 }, { "epoch": 0.15619291194738766, "grad_norm": 3.174051284790039, "learning_rate": 5.2054794520547945e-06, "loss": 0.3868, "step": 1710 }, { "epoch": 0.15710632078918524, "grad_norm": 7.768604278564453, "learning_rate": 5.235920852359209e-06, "loss": 0.3108, "step": 1720 }, { "epoch": 0.15801972963098282, "grad_norm": 2.8723599910736084, "learning_rate": 5.266362252663622e-06, "loss": 0.2963, "step": 1730 }, { "epoch": 0.15893313847278043, "grad_norm": 5.128979682922363, "learning_rate": 5.296803652968037e-06, "loss": 0.348, "step": 1740 }, { "epoch": 0.159846547314578, "grad_norm": 2.854691743850708, "learning_rate": 5.327245053272451e-06, "loss": 0.3033, "step": 1750 }, { "epoch": 0.1607599561563756, "grad_norm": 4.166466236114502, "learning_rate": 5.3576864535768644e-06, "loss": 0.3733, "step": 1760 }, { "epoch": 0.16167336499817317, "grad_norm": 3.1518819332122803, "learning_rate": 5.388127853881279e-06, "loss": 0.3313, "step": 1770 }, { "epoch": 0.16258677383997078, "grad_norm": 4.064481258392334, "learning_rate": 5.418569254185693e-06, "loss": 0.3027, "step": 1780 }, { "epoch": 0.16350018268176836, "grad_norm": 4.0051374435424805, "learning_rate": 5.4490106544901065e-06, "loss": 0.3592, "step": 1790 }, { "epoch": 0.16441359152356594, "grad_norm": 3.431480646133423, "learning_rate": 5.479452054794521e-06, "loss": 0.3691, "step": 1800 }, { "epoch": 0.16532700036536355, "grad_norm": 5.638591766357422, "learning_rate": 5.509893455098934e-06, "loss": 0.402, "step": 1810 }, { "epoch": 0.16624040920716113, "grad_norm": 3.3947017192840576, "learning_rate": 5.540334855403349e-06, "loss": 0.2855, "step": 1820 }, { "epoch": 0.1671538180489587, "grad_norm": 3.263679265975952, "learning_rate": 5.570776255707763e-06, "loss": 0.3421, "step": 1830 }, { "epoch": 0.16806722689075632, "grad_norm": 6.514377593994141, "learning_rate": 5.6012176560121765e-06, "loss": 0.3321, "step": 1840 }, { "epoch": 0.1689806357325539, "grad_norm": 2.5299298763275146, "learning_rate": 5.631659056316591e-06, "loss": 0.2962, "step": 1850 }, { "epoch": 0.16989404457435148, "grad_norm": 4.899336338043213, "learning_rate": 5.662100456621005e-06, "loss": 0.4244, "step": 1860 }, { "epoch": 0.17080745341614906, "grad_norm": 2.807912826538086, "learning_rate": 5.6925418569254186e-06, "loss": 0.3593, "step": 1870 }, { "epoch": 0.17172086225794667, "grad_norm": 4.069912910461426, "learning_rate": 5.722983257229833e-06, "loss": 0.3151, "step": 1880 }, { "epoch": 0.17263427109974425, "grad_norm": 5.818482398986816, "learning_rate": 5.753424657534246e-06, "loss": 0.2809, "step": 1890 }, { "epoch": 0.17354767994154183, "grad_norm": 3.027714729309082, "learning_rate": 5.783866057838661e-06, "loss": 0.3403, "step": 1900 }, { "epoch": 0.17446108878333944, "grad_norm": 2.6627819538116455, "learning_rate": 5.814307458143075e-06, "loss": 0.3444, "step": 1910 }, { "epoch": 0.17537449762513702, "grad_norm": 5.642514705657959, "learning_rate": 5.8447488584474885e-06, "loss": 0.3505, "step": 1920 }, { "epoch": 0.1762879064669346, "grad_norm": 9.912672996520996, "learning_rate": 5.875190258751903e-06, "loss": 0.3807, "step": 1930 }, { "epoch": 0.17720131530873218, "grad_norm": 4.516170501708984, "learning_rate": 5.905631659056317e-06, "loss": 0.3581, "step": 1940 }, { "epoch": 0.17811472415052979, "grad_norm": 3.5582780838012695, "learning_rate": 5.936073059360731e-06, "loss": 0.2867, "step": 1950 }, { "epoch": 0.17902813299232737, "grad_norm": 3.9866414070129395, "learning_rate": 5.966514459665145e-06, "loss": 0.407, "step": 1960 }, { "epoch": 0.17994154183412495, "grad_norm": 3.0525522232055664, "learning_rate": 5.996955859969558e-06, "loss": 0.3786, "step": 1970 }, { "epoch": 0.18085495067592255, "grad_norm": 3.5543313026428223, "learning_rate": 6.027397260273973e-06, "loss": 0.3421, "step": 1980 }, { "epoch": 0.18176835951772013, "grad_norm": 2.1008899211883545, "learning_rate": 6.057838660578387e-06, "loss": 0.3196, "step": 1990 }, { "epoch": 0.18268176835951772, "grad_norm": 3.989112377166748, "learning_rate": 6.0882800608828005e-06, "loss": 0.3531, "step": 2000 }, { "epoch": 0.1835951772013153, "grad_norm": 4.183403491973877, "learning_rate": 6.118721461187215e-06, "loss": 0.36, "step": 2010 }, { "epoch": 0.1845085860431129, "grad_norm": 3.1842029094696045, "learning_rate": 6.149162861491629e-06, "loss": 0.3816, "step": 2020 }, { "epoch": 0.18542199488491048, "grad_norm": 2.2626726627349854, "learning_rate": 6.179604261796043e-06, "loss": 0.3266, "step": 2030 }, { "epoch": 0.18633540372670807, "grad_norm": 2.4915964603424072, "learning_rate": 6.210045662100457e-06, "loss": 0.3336, "step": 2040 }, { "epoch": 0.18724881256850567, "grad_norm": 3.5499637126922607, "learning_rate": 6.24048706240487e-06, "loss": 0.3312, "step": 2050 }, { "epoch": 0.18816222141030325, "grad_norm": 3.54205322265625, "learning_rate": 6.270928462709285e-06, "loss": 0.3353, "step": 2060 }, { "epoch": 0.18907563025210083, "grad_norm": 5.065211772918701, "learning_rate": 6.301369863013699e-06, "loss": 0.3192, "step": 2070 }, { "epoch": 0.18998903909389842, "grad_norm": 2.47358775138855, "learning_rate": 6.3318112633181125e-06, "loss": 0.3312, "step": 2080 }, { "epoch": 0.19090244793569602, "grad_norm": 3.1496968269348145, "learning_rate": 6.362252663622527e-06, "loss": 0.313, "step": 2090 }, { "epoch": 0.1918158567774936, "grad_norm": 4.588258266448975, "learning_rate": 6.392694063926941e-06, "loss": 0.3241, "step": 2100 }, { "epoch": 0.19272926561929118, "grad_norm": 6.039133071899414, "learning_rate": 6.423135464231355e-06, "loss": 0.3494, "step": 2110 }, { "epoch": 0.1936426744610888, "grad_norm": 2.677236795425415, "learning_rate": 6.453576864535769e-06, "loss": 0.3184, "step": 2120 }, { "epoch": 0.19455608330288637, "grad_norm": 3.1094653606414795, "learning_rate": 6.484018264840182e-06, "loss": 0.3533, "step": 2130 }, { "epoch": 0.19546949214468395, "grad_norm": 5.155694961547852, "learning_rate": 6.514459665144597e-06, "loss": 0.3588, "step": 2140 }, { "epoch": 0.19638290098648156, "grad_norm": 2.552335262298584, "learning_rate": 6.544901065449011e-06, "loss": 0.301, "step": 2150 }, { "epoch": 0.19729630982827914, "grad_norm": 4.130258560180664, "learning_rate": 6.5753424657534245e-06, "loss": 0.3047, "step": 2160 }, { "epoch": 0.19820971867007672, "grad_norm": 5.820996284484863, "learning_rate": 6.605783866057839e-06, "loss": 0.3199, "step": 2170 }, { "epoch": 0.1991231275118743, "grad_norm": 5.603969097137451, "learning_rate": 6.636225266362253e-06, "loss": 0.3344, "step": 2180 }, { "epoch": 0.2000365363536719, "grad_norm": 4.620244026184082, "learning_rate": 6.666666666666667e-06, "loss": 0.2543, "step": 2190 }, { "epoch": 0.2009499451954695, "grad_norm": 3.6346309185028076, "learning_rate": 6.697108066971081e-06, "loss": 0.268, "step": 2200 }, { "epoch": 0.20186335403726707, "grad_norm": 4.49297571182251, "learning_rate": 6.7275494672754944e-06, "loss": 0.3667, "step": 2210 }, { "epoch": 0.20277676287906468, "grad_norm": 2.709969997406006, "learning_rate": 6.757990867579909e-06, "loss": 0.3415, "step": 2220 }, { "epoch": 0.20369017172086226, "grad_norm": 3.651362657546997, "learning_rate": 6.788432267884323e-06, "loss": 0.2993, "step": 2230 }, { "epoch": 0.20460358056265984, "grad_norm": 3.0914711952209473, "learning_rate": 6.8188736681887366e-06, "loss": 0.3102, "step": 2240 }, { "epoch": 0.20551698940445742, "grad_norm": 4.138451099395752, "learning_rate": 6.849315068493151e-06, "loss": 0.3579, "step": 2250 }, { "epoch": 0.20643039824625503, "grad_norm": 2.4869184494018555, "learning_rate": 6.879756468797565e-06, "loss": 0.2936, "step": 2260 }, { "epoch": 0.2073438070880526, "grad_norm": 4.096683025360107, "learning_rate": 6.910197869101979e-06, "loss": 0.3613, "step": 2270 }, { "epoch": 0.2082572159298502, "grad_norm": 6.047483921051025, "learning_rate": 6.940639269406393e-06, "loss": 0.3489, "step": 2280 }, { "epoch": 0.2091706247716478, "grad_norm": 5.147757530212402, "learning_rate": 6.9710806697108065e-06, "loss": 0.3219, "step": 2290 }, { "epoch": 0.21008403361344538, "grad_norm": 3.3349220752716064, "learning_rate": 7.001522070015221e-06, "loss": 0.358, "step": 2300 }, { "epoch": 0.21099744245524296, "grad_norm": 4.660184383392334, "learning_rate": 7.031963470319635e-06, "loss": 0.2864, "step": 2310 }, { "epoch": 0.21191085129704054, "grad_norm": 4.1252923011779785, "learning_rate": 7.0624048706240486e-06, "loss": 0.2993, "step": 2320 }, { "epoch": 0.21282426013883815, "grad_norm": 4.576345443725586, "learning_rate": 7.092846270928463e-06, "loss": 0.3487, "step": 2330 }, { "epoch": 0.21373766898063573, "grad_norm": 3.2598509788513184, "learning_rate": 7.123287671232877e-06, "loss": 0.2867, "step": 2340 }, { "epoch": 0.2146510778224333, "grad_norm": 2.8939616680145264, "learning_rate": 7.153729071537291e-06, "loss": 0.3334, "step": 2350 }, { "epoch": 0.21556448666423092, "grad_norm": 4.237586975097656, "learning_rate": 7.184170471841705e-06, "loss": 0.3312, "step": 2360 }, { "epoch": 0.2164778955060285, "grad_norm": 5.3941874504089355, "learning_rate": 7.214611872146119e-06, "loss": 0.3449, "step": 2370 }, { "epoch": 0.21739130434782608, "grad_norm": 4.9369659423828125, "learning_rate": 7.245053272450533e-06, "loss": 0.2554, "step": 2380 }, { "epoch": 0.2183047131896237, "grad_norm": 2.9784328937530518, "learning_rate": 7.275494672754947e-06, "loss": 0.3325, "step": 2390 }, { "epoch": 0.21921812203142127, "grad_norm": 3.935805559158325, "learning_rate": 7.305936073059361e-06, "loss": 0.3042, "step": 2400 }, { "epoch": 0.22013153087321885, "grad_norm": 4.336616039276123, "learning_rate": 7.336377473363775e-06, "loss": 0.2963, "step": 2410 }, { "epoch": 0.22104493971501643, "grad_norm": 4.282394886016846, "learning_rate": 7.366818873668189e-06, "loss": 0.328, "step": 2420 }, { "epoch": 0.22195834855681404, "grad_norm": 2.5108118057250977, "learning_rate": 7.397260273972603e-06, "loss": 0.3272, "step": 2430 }, { "epoch": 0.22287175739861162, "grad_norm": 7.33221435546875, "learning_rate": 7.427701674277017e-06, "loss": 0.3659, "step": 2440 }, { "epoch": 0.2237851662404092, "grad_norm": 5.719897747039795, "learning_rate": 7.458143074581431e-06, "loss": 0.3567, "step": 2450 }, { "epoch": 0.2246985750822068, "grad_norm": 2.9148504734039307, "learning_rate": 7.488584474885845e-06, "loss": 0.3291, "step": 2460 }, { "epoch": 0.2256119839240044, "grad_norm": 7.22426700592041, "learning_rate": 7.519025875190259e-06, "loss": 0.3401, "step": 2470 }, { "epoch": 0.22652539276580197, "grad_norm": 4.424638748168945, "learning_rate": 7.549467275494673e-06, "loss": 0.2993, "step": 2480 }, { "epoch": 0.22743880160759955, "grad_norm": 4.101001262664795, "learning_rate": 7.579908675799087e-06, "loss": 0.3233, "step": 2490 }, { "epoch": 0.22835221044939716, "grad_norm": 3.0185065269470215, "learning_rate": 7.610350076103501e-06, "loss": 0.3119, "step": 2500 }, { "epoch": 0.22926561929119474, "grad_norm": 4.087958335876465, "learning_rate": 7.640791476407915e-06, "loss": 0.3208, "step": 2510 }, { "epoch": 0.23017902813299232, "grad_norm": 3.0541634559631348, "learning_rate": 7.671232876712329e-06, "loss": 0.3008, "step": 2520 }, { "epoch": 0.23109243697478993, "grad_norm": 2.066087245941162, "learning_rate": 7.701674277016743e-06, "loss": 0.333, "step": 2530 }, { "epoch": 0.2320058458165875, "grad_norm": 3.032447099685669, "learning_rate": 7.732115677321158e-06, "loss": 0.3116, "step": 2540 }, { "epoch": 0.2329192546583851, "grad_norm": 2.701009750366211, "learning_rate": 7.76255707762557e-06, "loss": 0.3441, "step": 2550 }, { "epoch": 0.23383266350018267, "grad_norm": 2.1937968730926514, "learning_rate": 7.792998477929985e-06, "loss": 0.2292, "step": 2560 }, { "epoch": 0.23474607234198028, "grad_norm": 3.3082008361816406, "learning_rate": 7.823439878234399e-06, "loss": 0.2894, "step": 2570 }, { "epoch": 0.23565948118377786, "grad_norm": 3.96578049659729, "learning_rate": 7.853881278538813e-06, "loss": 0.3316, "step": 2580 }, { "epoch": 0.23657289002557544, "grad_norm": 1.875034213066101, "learning_rate": 7.884322678843228e-06, "loss": 0.321, "step": 2590 }, { "epoch": 0.23748629886737305, "grad_norm": 2.9710276126861572, "learning_rate": 7.91476407914764e-06, "loss": 0.3007, "step": 2600 }, { "epoch": 0.23839970770917063, "grad_norm": 2.334566593170166, "learning_rate": 7.945205479452055e-06, "loss": 0.3468, "step": 2610 }, { "epoch": 0.2393131165509682, "grad_norm": 3.1262054443359375, "learning_rate": 7.975646879756469e-06, "loss": 0.2761, "step": 2620 }, { "epoch": 0.2402265253927658, "grad_norm": 4.068947792053223, "learning_rate": 8.006088280060883e-06, "loss": 0.2817, "step": 2630 }, { "epoch": 0.2411399342345634, "grad_norm": 4.457764625549316, "learning_rate": 8.036529680365297e-06, "loss": 0.3006, "step": 2640 }, { "epoch": 0.24205334307636098, "grad_norm": 7.219207763671875, "learning_rate": 8.066971080669712e-06, "loss": 0.3392, "step": 2650 }, { "epoch": 0.24296675191815856, "grad_norm": 5.541028022766113, "learning_rate": 8.097412480974124e-06, "loss": 0.2943, "step": 2660 }, { "epoch": 0.24388016075995617, "grad_norm": 4.748129367828369, "learning_rate": 8.127853881278539e-06, "loss": 0.3036, "step": 2670 }, { "epoch": 0.24479356960175375, "grad_norm": 3.0326344966888428, "learning_rate": 8.158295281582953e-06, "loss": 0.3318, "step": 2680 }, { "epoch": 0.24570697844355133, "grad_norm": 4.067484378814697, "learning_rate": 8.188736681887367e-06, "loss": 0.2404, "step": 2690 }, { "epoch": 0.24662038728534894, "grad_norm": 4.232189178466797, "learning_rate": 8.219178082191782e-06, "loss": 0.3818, "step": 2700 }, { "epoch": 0.24753379612714652, "grad_norm": 4.644088268280029, "learning_rate": 8.249619482496194e-06, "loss": 0.3473, "step": 2710 }, { "epoch": 0.2484472049689441, "grad_norm": 4.710209846496582, "learning_rate": 8.280060882800609e-06, "loss": 0.3168, "step": 2720 }, { "epoch": 0.24936061381074168, "grad_norm": 2.6727077960968018, "learning_rate": 8.310502283105023e-06, "loss": 0.3548, "step": 2730 }, { "epoch": 0.2502740226525393, "grad_norm": 3.632392644882202, "learning_rate": 8.340943683409437e-06, "loss": 0.2855, "step": 2740 }, { "epoch": 0.25118743149433687, "grad_norm": 3.075626850128174, "learning_rate": 8.371385083713852e-06, "loss": 0.2707, "step": 2750 }, { "epoch": 0.25210084033613445, "grad_norm": 3.5460011959075928, "learning_rate": 8.401826484018264e-06, "loss": 0.2882, "step": 2760 }, { "epoch": 0.253014249177932, "grad_norm": 4.3462724685668945, "learning_rate": 8.432267884322679e-06, "loss": 0.2855, "step": 2770 }, { "epoch": 0.2539276580197296, "grad_norm": 3.0400354862213135, "learning_rate": 8.462709284627093e-06, "loss": 0.3336, "step": 2780 }, { "epoch": 0.25484106686152724, "grad_norm": 6.81427526473999, "learning_rate": 8.493150684931507e-06, "loss": 0.3787, "step": 2790 }, { "epoch": 0.2557544757033248, "grad_norm": 10.290339469909668, "learning_rate": 8.523592085235922e-06, "loss": 0.3445, "step": 2800 }, { "epoch": 0.2566678845451224, "grad_norm": 3.8888938426971436, "learning_rate": 8.554033485540336e-06, "loss": 0.2785, "step": 2810 }, { "epoch": 0.25758129338692, "grad_norm": 2.3394668102264404, "learning_rate": 8.584474885844748e-06, "loss": 0.3253, "step": 2820 }, { "epoch": 0.25849470222871757, "grad_norm": 4.032678127288818, "learning_rate": 8.614916286149163e-06, "loss": 0.27, "step": 2830 }, { "epoch": 0.25940811107051515, "grad_norm": 5.492187976837158, "learning_rate": 8.645357686453577e-06, "loss": 0.3171, "step": 2840 }, { "epoch": 0.2603215199123127, "grad_norm": 3.4714624881744385, "learning_rate": 8.675799086757991e-06, "loss": 0.3059, "step": 2850 }, { "epoch": 0.26123492875411036, "grad_norm": 2.3153045177459717, "learning_rate": 8.706240487062406e-06, "loss": 0.3762, "step": 2860 }, { "epoch": 0.26214833759590794, "grad_norm": 2.854361057281494, "learning_rate": 8.736681887366818e-06, "loss": 0.3385, "step": 2870 }, { "epoch": 0.2630617464377055, "grad_norm": 2.539346218109131, "learning_rate": 8.767123287671233e-06, "loss": 0.2662, "step": 2880 }, { "epoch": 0.2639751552795031, "grad_norm": 5.426159858703613, "learning_rate": 8.797564687975647e-06, "loss": 0.2756, "step": 2890 }, { "epoch": 0.2648885641213007, "grad_norm": 2.9566261768341064, "learning_rate": 8.828006088280061e-06, "loss": 0.3306, "step": 2900 }, { "epoch": 0.26580197296309827, "grad_norm": 4.693178653717041, "learning_rate": 8.858447488584476e-06, "loss": 0.2865, "step": 2910 }, { "epoch": 0.26671538180489585, "grad_norm": 6.000566482543945, "learning_rate": 8.888888888888888e-06, "loss": 0.3324, "step": 2920 }, { "epoch": 0.2676287906466935, "grad_norm": 7.74446964263916, "learning_rate": 8.919330289193303e-06, "loss": 0.2588, "step": 2930 }, { "epoch": 0.26854219948849106, "grad_norm": 3.943545341491699, "learning_rate": 8.949771689497717e-06, "loss": 0.303, "step": 2940 }, { "epoch": 0.26945560833028864, "grad_norm": 2.9924373626708984, "learning_rate": 8.980213089802131e-06, "loss": 0.2766, "step": 2950 }, { "epoch": 0.2703690171720862, "grad_norm": 3.10162353515625, "learning_rate": 9.010654490106546e-06, "loss": 0.3014, "step": 2960 }, { "epoch": 0.2712824260138838, "grad_norm": 3.5185654163360596, "learning_rate": 9.04109589041096e-06, "loss": 0.3344, "step": 2970 }, { "epoch": 0.2721958348556814, "grad_norm": 3.016775369644165, "learning_rate": 9.071537290715373e-06, "loss": 0.2765, "step": 2980 }, { "epoch": 0.27310924369747897, "grad_norm": 2.112630605697632, "learning_rate": 9.101978691019787e-06, "loss": 0.3287, "step": 2990 }, { "epoch": 0.2740226525392766, "grad_norm": 2.646965265274048, "learning_rate": 9.132420091324201e-06, "loss": 0.3331, "step": 3000 }, { "epoch": 0.2749360613810742, "grad_norm": 3.167342185974121, "learning_rate": 9.162861491628615e-06, "loss": 0.3742, "step": 3010 }, { "epoch": 0.27584947022287176, "grad_norm": 3.5542829036712646, "learning_rate": 9.19330289193303e-06, "loss": 0.2924, "step": 3020 }, { "epoch": 0.27676287906466934, "grad_norm": 4.511105060577393, "learning_rate": 9.223744292237442e-06, "loss": 0.3627, "step": 3030 }, { "epoch": 0.2776762879064669, "grad_norm": 2.8609366416931152, "learning_rate": 9.254185692541857e-06, "loss": 0.3504, "step": 3040 }, { "epoch": 0.2785896967482645, "grad_norm": 4.288589954376221, "learning_rate": 9.284627092846271e-06, "loss": 0.2703, "step": 3050 }, { "epoch": 0.2795031055900621, "grad_norm": 3.5257577896118164, "learning_rate": 9.315068493150685e-06, "loss": 0.297, "step": 3060 }, { "epoch": 0.2804165144318597, "grad_norm": 3.5090677738189697, "learning_rate": 9.3455098934551e-06, "loss": 0.3449, "step": 3070 }, { "epoch": 0.2813299232736573, "grad_norm": 3.8643510341644287, "learning_rate": 9.375951293759512e-06, "loss": 0.3263, "step": 3080 }, { "epoch": 0.2822433321154549, "grad_norm": 4.577029228210449, "learning_rate": 9.406392694063927e-06, "loss": 0.3807, "step": 3090 }, { "epoch": 0.28315674095725246, "grad_norm": 4.3608808517456055, "learning_rate": 9.436834094368341e-06, "loss": 0.3327, "step": 3100 }, { "epoch": 0.28407014979905004, "grad_norm": 3.8819568157196045, "learning_rate": 9.467275494672755e-06, "loss": 0.3218, "step": 3110 }, { "epoch": 0.2849835586408476, "grad_norm": 3.289482593536377, "learning_rate": 9.49771689497717e-06, "loss": 0.3302, "step": 3120 }, { "epoch": 0.28589696748264526, "grad_norm": 3.772998571395874, "learning_rate": 9.528158295281584e-06, "loss": 0.2975, "step": 3130 }, { "epoch": 0.28681037632444284, "grad_norm": 3.4591171741485596, "learning_rate": 9.558599695585997e-06, "loss": 0.3109, "step": 3140 }, { "epoch": 0.2877237851662404, "grad_norm": 4.63095760345459, "learning_rate": 9.589041095890411e-06, "loss": 0.3239, "step": 3150 }, { "epoch": 0.288637194008038, "grad_norm": 5.049643516540527, "learning_rate": 9.619482496194825e-06, "loss": 0.3709, "step": 3160 }, { "epoch": 0.2895506028498356, "grad_norm": 3.6874334812164307, "learning_rate": 9.64992389649924e-06, "loss": 0.3281, "step": 3170 }, { "epoch": 0.29046401169163316, "grad_norm": 4.050210952758789, "learning_rate": 9.680365296803654e-06, "loss": 0.3915, "step": 3180 }, { "epoch": 0.29137742053343074, "grad_norm": 4.9166436195373535, "learning_rate": 9.710806697108066e-06, "loss": 0.3406, "step": 3190 }, { "epoch": 0.2922908293752284, "grad_norm": 3.717087507247925, "learning_rate": 9.74124809741248e-06, "loss": 0.2543, "step": 3200 }, { "epoch": 0.29320423821702596, "grad_norm": 3.7182207107543945, "learning_rate": 9.771689497716895e-06, "loss": 0.2799, "step": 3210 }, { "epoch": 0.29411764705882354, "grad_norm": 3.5827107429504395, "learning_rate": 9.80213089802131e-06, "loss": 0.3556, "step": 3220 }, { "epoch": 0.2950310559006211, "grad_norm": 4.453437328338623, "learning_rate": 9.832572298325724e-06, "loss": 0.3063, "step": 3230 }, { "epoch": 0.2959444647424187, "grad_norm": 5.342871189117432, "learning_rate": 9.863013698630138e-06, "loss": 0.2995, "step": 3240 }, { "epoch": 0.2968578735842163, "grad_norm": 6.691931247711182, "learning_rate": 9.89345509893455e-06, "loss": 0.272, "step": 3250 }, { "epoch": 0.29777128242601386, "grad_norm": 3.3311049938201904, "learning_rate": 9.923896499238965e-06, "loss": 0.3164, "step": 3260 }, { "epoch": 0.2986846912678115, "grad_norm": 5.458863258361816, "learning_rate": 9.95433789954338e-06, "loss": 0.3634, "step": 3270 }, { "epoch": 0.2995981001096091, "grad_norm": 2.2188284397125244, "learning_rate": 9.984779299847794e-06, "loss": 0.2767, "step": 3280 }, { "epoch": 0.30051150895140666, "grad_norm": 2.0744550228118896, "learning_rate": 9.999999294007192e-06, "loss": 0.2568, "step": 3290 }, { "epoch": 0.30142491779320424, "grad_norm": 2.650789499282837, "learning_rate": 9.999993646065917e-06, "loss": 0.2816, "step": 3300 }, { "epoch": 0.3023383266350018, "grad_norm": 3.506985902786255, "learning_rate": 9.999982350189746e-06, "loss": 0.3142, "step": 3310 }, { "epoch": 0.3032517354767994, "grad_norm": 2.192408800125122, "learning_rate": 9.999965406391436e-06, "loss": 0.3042, "step": 3320 }, { "epoch": 0.304165144318597, "grad_norm": 3.0572116374969482, "learning_rate": 9.999942814690133e-06, "loss": 0.2652, "step": 3330 }, { "epoch": 0.3050785531603946, "grad_norm": 5.356078147888184, "learning_rate": 9.999914575111352e-06, "loss": 0.3593, "step": 3340 }, { "epoch": 0.3059919620021922, "grad_norm": 3.240548849105835, "learning_rate": 9.99988068768699e-06, "loss": 0.2708, "step": 3350 }, { "epoch": 0.3069053708439898, "grad_norm": 4.936774730682373, "learning_rate": 9.999841152455332e-06, "loss": 0.3175, "step": 3360 }, { "epoch": 0.30781877968578736, "grad_norm": 2.8120999336242676, "learning_rate": 9.999795969461033e-06, "loss": 0.2873, "step": 3370 }, { "epoch": 0.30873218852758494, "grad_norm": 5.568870544433594, "learning_rate": 9.999745138755128e-06, "loss": 0.3526, "step": 3380 }, { "epoch": 0.3096455973693825, "grad_norm": 6.865026950836182, "learning_rate": 9.999688660395042e-06, "loss": 0.2692, "step": 3390 }, { "epoch": 0.3105590062111801, "grad_norm": 3.0485095977783203, "learning_rate": 9.999626534444567e-06, "loss": 0.3059, "step": 3400 }, { "epoch": 0.31147241505297774, "grad_norm": 2.4989874362945557, "learning_rate": 9.999558760973881e-06, "loss": 0.2792, "step": 3410 }, { "epoch": 0.3123858238947753, "grad_norm": 3.4336562156677246, "learning_rate": 9.99948534005954e-06, "loss": 0.3394, "step": 3420 }, { "epoch": 0.3132992327365729, "grad_norm": 2.611142158508301, "learning_rate": 9.99940627178448e-06, "loss": 0.3417, "step": 3430 }, { "epoch": 0.3142126415783705, "grad_norm": 2.2919387817382812, "learning_rate": 9.999321556238015e-06, "loss": 0.2309, "step": 3440 }, { "epoch": 0.31512605042016806, "grad_norm": 3.4934682846069336, "learning_rate": 9.99923119351584e-06, "loss": 0.2974, "step": 3450 }, { "epoch": 0.31603945926196564, "grad_norm": 7.643258094787598, "learning_rate": 9.999135183720026e-06, "loss": 0.2555, "step": 3460 }, { "epoch": 0.3169528681037632, "grad_norm": 2.1463754177093506, "learning_rate": 9.999033526959026e-06, "loss": 0.2913, "step": 3470 }, { "epoch": 0.31786627694556085, "grad_norm": 6.143978595733643, "learning_rate": 9.99892622334767e-06, "loss": 0.2252, "step": 3480 }, { "epoch": 0.31877968578735844, "grad_norm": 5.163086414337158, "learning_rate": 9.998813273007166e-06, "loss": 0.2686, "step": 3490 }, { "epoch": 0.319693094629156, "grad_norm": 2.747480630874634, "learning_rate": 9.9986946760651e-06, "loss": 0.2919, "step": 3500 }, { "epoch": 0.3206065034709536, "grad_norm": 3.772064208984375, "learning_rate": 9.998570432655441e-06, "loss": 0.2844, "step": 3510 }, { "epoch": 0.3215199123127512, "grad_norm": 3.046865701675415, "learning_rate": 9.998440542918532e-06, "loss": 0.3071, "step": 3520 }, { "epoch": 0.32243332115454876, "grad_norm": 2.906670331954956, "learning_rate": 9.998305007001094e-06, "loss": 0.2847, "step": 3530 }, { "epoch": 0.32334672999634634, "grad_norm": 3.8082165718078613, "learning_rate": 9.99816382505623e-06, "loss": 0.349, "step": 3540 }, { "epoch": 0.324260138838144, "grad_norm": 2.737300395965576, "learning_rate": 9.99801699724341e-06, "loss": 0.3336, "step": 3550 }, { "epoch": 0.32517354767994155, "grad_norm": 3.111027956008911, "learning_rate": 9.997864523728497e-06, "loss": 0.2756, "step": 3560 }, { "epoch": 0.32608695652173914, "grad_norm": 2.9324681758880615, "learning_rate": 9.997706404683718e-06, "loss": 0.2809, "step": 3570 }, { "epoch": 0.3270003653635367, "grad_norm": 2.2455217838287354, "learning_rate": 9.997542640287686e-06, "loss": 0.3345, "step": 3580 }, { "epoch": 0.3279137742053343, "grad_norm": 2.7102346420288086, "learning_rate": 9.997373230725386e-06, "loss": 0.3341, "step": 3590 }, { "epoch": 0.3288271830471319, "grad_norm": 4.779205322265625, "learning_rate": 9.99719817618818e-06, "loss": 0.2758, "step": 3600 }, { "epoch": 0.32974059188892946, "grad_norm": 3.7391929626464844, "learning_rate": 9.997017476873809e-06, "loss": 0.3043, "step": 3610 }, { "epoch": 0.3306540007307271, "grad_norm": 3.0744121074676514, "learning_rate": 9.996831132986389e-06, "loss": 0.3074, "step": 3620 }, { "epoch": 0.3315674095725247, "grad_norm": 8.49936294555664, "learning_rate": 9.996639144736411e-06, "loss": 0.3299, "step": 3630 }, { "epoch": 0.33248081841432225, "grad_norm": 8.764328002929688, "learning_rate": 9.996441512340741e-06, "loss": 0.3058, "step": 3640 }, { "epoch": 0.33339422725611984, "grad_norm": 5.276071548461914, "learning_rate": 9.996238236022625e-06, "loss": 0.3214, "step": 3650 }, { "epoch": 0.3343076360979174, "grad_norm": 2.6458494663238525, "learning_rate": 9.996029316011684e-06, "loss": 0.3401, "step": 3660 }, { "epoch": 0.335221044939715, "grad_norm": 5.720144748687744, "learning_rate": 9.995814752543906e-06, "loss": 0.3201, "step": 3670 }, { "epoch": 0.33613445378151263, "grad_norm": 5.153029441833496, "learning_rate": 9.995594545861662e-06, "loss": 0.3194, "step": 3680 }, { "epoch": 0.3370478626233102, "grad_norm": 2.3426713943481445, "learning_rate": 9.995368696213695e-06, "loss": 0.2339, "step": 3690 }, { "epoch": 0.3379612714651078, "grad_norm": 3.9770078659057617, "learning_rate": 9.995137203855123e-06, "loss": 0.3177, "step": 3700 }, { "epoch": 0.3388746803069054, "grad_norm": 2.1912238597869873, "learning_rate": 9.994900069047434e-06, "loss": 0.2948, "step": 3710 }, { "epoch": 0.33978808914870295, "grad_norm": 2.3202672004699707, "learning_rate": 9.994657292058497e-06, "loss": 0.2983, "step": 3720 }, { "epoch": 0.34070149799050053, "grad_norm": 3.0842182636260986, "learning_rate": 9.994408873162544e-06, "loss": 0.2569, "step": 3730 }, { "epoch": 0.3416149068322981, "grad_norm": 3.3993663787841797, "learning_rate": 9.994154812640193e-06, "loss": 0.3351, "step": 3740 }, { "epoch": 0.34252831567409575, "grad_norm": 2.155442714691162, "learning_rate": 9.993895110778424e-06, "loss": 0.3296, "step": 3750 }, { "epoch": 0.34344172451589333, "grad_norm": 3.68332576751709, "learning_rate": 9.993629767870594e-06, "loss": 0.3081, "step": 3760 }, { "epoch": 0.3443551333576909, "grad_norm": 2.6681559085845947, "learning_rate": 9.993358784216432e-06, "loss": 0.2229, "step": 3770 }, { "epoch": 0.3452685421994885, "grad_norm": 2.573707103729248, "learning_rate": 9.993082160122036e-06, "loss": 0.2694, "step": 3780 }, { "epoch": 0.3461819510412861, "grad_norm": 2.328291893005371, "learning_rate": 9.99279989589988e-06, "loss": 0.4169, "step": 3790 }, { "epoch": 0.34709535988308365, "grad_norm": 3.8134191036224365, "learning_rate": 9.992511991868805e-06, "loss": 0.3386, "step": 3800 }, { "epoch": 0.34800876872488123, "grad_norm": 3.7275428771972656, "learning_rate": 9.992218448354022e-06, "loss": 0.2855, "step": 3810 }, { "epoch": 0.34892217756667887, "grad_norm": 4.839414596557617, "learning_rate": 9.991919265687117e-06, "loss": 0.2991, "step": 3820 }, { "epoch": 0.34983558640847645, "grad_norm": 4.342672348022461, "learning_rate": 9.991614444206044e-06, "loss": 0.2859, "step": 3830 }, { "epoch": 0.35074899525027403, "grad_norm": 4.8496012687683105, "learning_rate": 9.991303984255124e-06, "loss": 0.2837, "step": 3840 }, { "epoch": 0.3516624040920716, "grad_norm": 3.3361377716064453, "learning_rate": 9.990987886185049e-06, "loss": 0.2926, "step": 3850 }, { "epoch": 0.3525758129338692, "grad_norm": 3.9906442165374756, "learning_rate": 9.990666150352881e-06, "loss": 0.2887, "step": 3860 }, { "epoch": 0.3534892217756668, "grad_norm": 3.4559247493743896, "learning_rate": 9.990338777122048e-06, "loss": 0.27, "step": 3870 }, { "epoch": 0.35440263061746435, "grad_norm": 1.9956576824188232, "learning_rate": 9.990005766862347e-06, "loss": 0.2712, "step": 3880 }, { "epoch": 0.355316039459262, "grad_norm": 2.453434705734253, "learning_rate": 9.989667119949942e-06, "loss": 0.3042, "step": 3890 }, { "epoch": 0.35622944830105957, "grad_norm": 4.895848274230957, "learning_rate": 9.989322836767367e-06, "loss": 0.3278, "step": 3900 }, { "epoch": 0.35714285714285715, "grad_norm": 3.363934278488159, "learning_rate": 9.988972917703517e-06, "loss": 0.3424, "step": 3910 }, { "epoch": 0.35805626598465473, "grad_norm": 3.9698212146759033, "learning_rate": 9.98861736315366e-06, "loss": 0.3209, "step": 3920 }, { "epoch": 0.3589696748264523, "grad_norm": 4.3630266189575195, "learning_rate": 9.988256173519424e-06, "loss": 0.2943, "step": 3930 }, { "epoch": 0.3598830836682499, "grad_norm": 1.7552975416183472, "learning_rate": 9.987889349208805e-06, "loss": 0.3084, "step": 3940 }, { "epoch": 0.3607964925100475, "grad_norm": 3.433133602142334, "learning_rate": 9.987516890636163e-06, "loss": 0.3188, "step": 3950 }, { "epoch": 0.3617099013518451, "grad_norm": 2.538970947265625, "learning_rate": 9.987138798222225e-06, "loss": 0.3137, "step": 3960 }, { "epoch": 0.3626233101936427, "grad_norm": 3.130812168121338, "learning_rate": 9.986755072394077e-06, "loss": 0.2671, "step": 3970 }, { "epoch": 0.36353671903544027, "grad_norm": 2.540316104888916, "learning_rate": 9.986365713585174e-06, "loss": 0.2749, "step": 3980 }, { "epoch": 0.36445012787723785, "grad_norm": 2.9984078407287598, "learning_rate": 9.985970722235328e-06, "loss": 0.2311, "step": 3990 }, { "epoch": 0.36536353671903543, "grad_norm": 6.326687335968018, "learning_rate": 9.985570098790719e-06, "loss": 0.294, "step": 4000 }, { "epoch": 0.366276945560833, "grad_norm": 5.427982807159424, "learning_rate": 9.985163843703887e-06, "loss": 0.3121, "step": 4010 }, { "epoch": 0.3671903544026306, "grad_norm": 4.9534502029418945, "learning_rate": 9.984751957433731e-06, "loss": 0.2855, "step": 4020 }, { "epoch": 0.36810376324442823, "grad_norm": 2.758335828781128, "learning_rate": 9.984334440445515e-06, "loss": 0.2512, "step": 4030 }, { "epoch": 0.3690171720862258, "grad_norm": 4.328779697418213, "learning_rate": 9.983911293210859e-06, "loss": 0.2491, "step": 4040 }, { "epoch": 0.3699305809280234, "grad_norm": 4.395755767822266, "learning_rate": 9.983482516207747e-06, "loss": 0.2957, "step": 4050 }, { "epoch": 0.37084398976982097, "grad_norm": 2.212921619415283, "learning_rate": 9.983048109920521e-06, "loss": 0.2811, "step": 4060 }, { "epoch": 0.37175739861161855, "grad_norm": 2.904526948928833, "learning_rate": 9.98260807483988e-06, "loss": 0.2505, "step": 4070 }, { "epoch": 0.37267080745341613, "grad_norm": 2.2775251865386963, "learning_rate": 9.982162411462883e-06, "loss": 0.259, "step": 4080 }, { "epoch": 0.3735842162952137, "grad_norm": 4.035882472991943, "learning_rate": 9.981711120292947e-06, "loss": 0.326, "step": 4090 }, { "epoch": 0.37449762513701135, "grad_norm": 2.418787717819214, "learning_rate": 9.981254201839845e-06, "loss": 0.2902, "step": 4100 }, { "epoch": 0.37541103397880893, "grad_norm": 9.288747787475586, "learning_rate": 9.980791656619703e-06, "loss": 0.1851, "step": 4110 }, { "epoch": 0.3763244428206065, "grad_norm": 4.890224456787109, "learning_rate": 9.980323485155013e-06, "loss": 0.248, "step": 4120 }, { "epoch": 0.3772378516624041, "grad_norm": 4.310764312744141, "learning_rate": 9.979849687974611e-06, "loss": 0.2746, "step": 4130 }, { "epoch": 0.37815126050420167, "grad_norm": 2.519655704498291, "learning_rate": 9.979370265613697e-06, "loss": 0.2522, "step": 4140 }, { "epoch": 0.37906466934599925, "grad_norm": 4.283063888549805, "learning_rate": 9.97888521861382e-06, "loss": 0.2901, "step": 4150 }, { "epoch": 0.37997807818779683, "grad_norm": 5.207154273986816, "learning_rate": 9.978394547522878e-06, "loss": 0.2992, "step": 4160 }, { "epoch": 0.38089148702959447, "grad_norm": 2.2380497455596924, "learning_rate": 9.977898252895133e-06, "loss": 0.2855, "step": 4170 }, { "epoch": 0.38180489587139205, "grad_norm": 3.232343912124634, "learning_rate": 9.977396335291193e-06, "loss": 0.2946, "step": 4180 }, { "epoch": 0.3827183047131896, "grad_norm": 4.609151363372803, "learning_rate": 9.97688879527802e-06, "loss": 0.2926, "step": 4190 }, { "epoch": 0.3836317135549872, "grad_norm": 3.641526937484741, "learning_rate": 9.976375633428919e-06, "loss": 0.2989, "step": 4200 }, { "epoch": 0.3845451223967848, "grad_norm": 2.6962080001831055, "learning_rate": 9.975856850323557e-06, "loss": 0.2841, "step": 4210 }, { "epoch": 0.38545853123858237, "grad_norm": 3.007307767868042, "learning_rate": 9.975332446547945e-06, "loss": 0.3442, "step": 4220 }, { "epoch": 0.38637194008038, "grad_norm": 6.159631252288818, "learning_rate": 9.97480242269444e-06, "loss": 0.3154, "step": 4230 }, { "epoch": 0.3872853489221776, "grad_norm": 2.4130070209503174, "learning_rate": 9.974266779361756e-06, "loss": 0.244, "step": 4240 }, { "epoch": 0.38819875776397517, "grad_norm": 4.543765068054199, "learning_rate": 9.973725517154945e-06, "loss": 0.3187, "step": 4250 }, { "epoch": 0.38911216660577275, "grad_norm": 3.471911907196045, "learning_rate": 9.973178636685413e-06, "loss": 0.3208, "step": 4260 }, { "epoch": 0.3900255754475703, "grad_norm": 3.22178316116333, "learning_rate": 9.972626138570908e-06, "loss": 0.3038, "step": 4270 }, { "epoch": 0.3909389842893679, "grad_norm": 2.9535670280456543, "learning_rate": 9.972068023435528e-06, "loss": 0.3269, "step": 4280 }, { "epoch": 0.3918523931311655, "grad_norm": 1.9970108270645142, "learning_rate": 9.971504291909712e-06, "loss": 0.2986, "step": 4290 }, { "epoch": 0.3927658019729631, "grad_norm": 3.527057409286499, "learning_rate": 9.970934944630244e-06, "loss": 0.2768, "step": 4300 }, { "epoch": 0.3936792108147607, "grad_norm": 5.1474223136901855, "learning_rate": 9.970359982240253e-06, "loss": 0.2851, "step": 4310 }, { "epoch": 0.3945926196565583, "grad_norm": 1.8962702751159668, "learning_rate": 9.969779405389208e-06, "loss": 0.2751, "step": 4320 }, { "epoch": 0.39550602849835587, "grad_norm": 3.539865493774414, "learning_rate": 9.969193214732924e-06, "loss": 0.2489, "step": 4330 }, { "epoch": 0.39641943734015345, "grad_norm": 2.9099175930023193, "learning_rate": 9.968601410933556e-06, "loss": 0.3212, "step": 4340 }, { "epoch": 0.397332846181951, "grad_norm": 4.9447832107543945, "learning_rate": 9.968003994659596e-06, "loss": 0.3246, "step": 4350 }, { "epoch": 0.3982462550237486, "grad_norm": 3.974153518676758, "learning_rate": 9.96740096658588e-06, "loss": 0.3591, "step": 4360 }, { "epoch": 0.39915966386554624, "grad_norm": 4.388247489929199, "learning_rate": 9.966792327393581e-06, "loss": 0.3336, "step": 4370 }, { "epoch": 0.4000730727073438, "grad_norm": 2.876927375793457, "learning_rate": 9.966178077770213e-06, "loss": 0.259, "step": 4380 }, { "epoch": 0.4009864815491414, "grad_norm": 2.0727202892303467, "learning_rate": 9.96555821840962e-06, "loss": 0.245, "step": 4390 }, { "epoch": 0.401899890390939, "grad_norm": 3.4795467853546143, "learning_rate": 9.964932750011994e-06, "loss": 0.3482, "step": 4400 }, { "epoch": 0.40281329923273657, "grad_norm": 2.091095447540283, "learning_rate": 9.964301673283854e-06, "loss": 0.285, "step": 4410 }, { "epoch": 0.40372670807453415, "grad_norm": 3.753514289855957, "learning_rate": 9.963664988938055e-06, "loss": 0.2417, "step": 4420 }, { "epoch": 0.4046401169163317, "grad_norm": 5.334874629974365, "learning_rate": 9.963022697693791e-06, "loss": 0.3849, "step": 4430 }, { "epoch": 0.40555352575812936, "grad_norm": 3.769843101501465, "learning_rate": 9.962374800276588e-06, "loss": 0.2332, "step": 4440 }, { "epoch": 0.40646693459992694, "grad_norm": 3.113779067993164, "learning_rate": 9.961721297418302e-06, "loss": 0.2804, "step": 4450 }, { "epoch": 0.4073803434417245, "grad_norm": 4.857155799865723, "learning_rate": 9.961062189857119e-06, "loss": 0.3201, "step": 4460 }, { "epoch": 0.4082937522835221, "grad_norm": 3.545935869216919, "learning_rate": 9.960397478337563e-06, "loss": 0.2567, "step": 4470 }, { "epoch": 0.4092071611253197, "grad_norm": 3.0659611225128174, "learning_rate": 9.959727163610483e-06, "loss": 0.2476, "step": 4480 }, { "epoch": 0.41012056996711727, "grad_norm": 3.7083849906921387, "learning_rate": 9.95905124643306e-06, "loss": 0.2827, "step": 4490 }, { "epoch": 0.41103397880891485, "grad_norm": 3.5766284465789795, "learning_rate": 9.958369727568802e-06, "loss": 0.2601, "step": 4500 }, { "epoch": 0.4119473876507125, "grad_norm": 2.9011943340301514, "learning_rate": 9.957682607787543e-06, "loss": 0.3276, "step": 4510 }, { "epoch": 0.41286079649251006, "grad_norm": 7.2158918380737305, "learning_rate": 9.956989887865447e-06, "loss": 0.2924, "step": 4520 }, { "epoch": 0.41377420533430764, "grad_norm": 2.12216854095459, "learning_rate": 9.956291568585003e-06, "loss": 0.3089, "step": 4530 }, { "epoch": 0.4146876141761052, "grad_norm": 3.211440086364746, "learning_rate": 9.955587650735023e-06, "loss": 0.2527, "step": 4540 }, { "epoch": 0.4156010230179028, "grad_norm": 3.5014595985412598, "learning_rate": 9.954878135110646e-06, "loss": 0.2851, "step": 4550 }, { "epoch": 0.4165144318597004, "grad_norm": 2.8051834106445312, "learning_rate": 9.954163022513332e-06, "loss": 0.3175, "step": 4560 }, { "epoch": 0.41742784070149797, "grad_norm": 2.8681159019470215, "learning_rate": 9.95344231375086e-06, "loss": 0.2791, "step": 4570 }, { "epoch": 0.4183412495432956, "grad_norm": 2.1720151901245117, "learning_rate": 9.952716009637342e-06, "loss": 0.2937, "step": 4580 }, { "epoch": 0.4192546583850932, "grad_norm": 3.0650601387023926, "learning_rate": 9.951984110993195e-06, "loss": 0.331, "step": 4590 }, { "epoch": 0.42016806722689076, "grad_norm": 2.6805593967437744, "learning_rate": 9.95124661864517e-06, "loss": 0.2493, "step": 4600 }, { "epoch": 0.42108147606868834, "grad_norm": 2.133955717086792, "learning_rate": 9.950503533426326e-06, "loss": 0.3123, "step": 4610 }, { "epoch": 0.4219948849104859, "grad_norm": 3.700582265853882, "learning_rate": 9.949754856176043e-06, "loss": 0.3463, "step": 4620 }, { "epoch": 0.4229082937522835, "grad_norm": 2.7414562702178955, "learning_rate": 9.949000587740018e-06, "loss": 0.3168, "step": 4630 }, { "epoch": 0.4238217025940811, "grad_norm": 7.181637287139893, "learning_rate": 9.948240728970268e-06, "loss": 0.3423, "step": 4640 }, { "epoch": 0.4247351114358787, "grad_norm": 3.5562567710876465, "learning_rate": 9.947475280725115e-06, "loss": 0.3187, "step": 4650 }, { "epoch": 0.4256485202776763, "grad_norm": 4.376706600189209, "learning_rate": 9.946704243869205e-06, "loss": 0.3032, "step": 4660 }, { "epoch": 0.4265619291194739, "grad_norm": 2.964596748352051, "learning_rate": 9.94592761927349e-06, "loss": 0.3837, "step": 4670 }, { "epoch": 0.42747533796127146, "grad_norm": 6.199699401855469, "learning_rate": 9.945145407815238e-06, "loss": 0.2514, "step": 4680 }, { "epoch": 0.42838874680306904, "grad_norm": 2.4550631046295166, "learning_rate": 9.944357610378022e-06, "loss": 0.2799, "step": 4690 }, { "epoch": 0.4293021556448666, "grad_norm": 3.110152006149292, "learning_rate": 9.943564227851733e-06, "loss": 0.2999, "step": 4700 }, { "epoch": 0.4302155644866642, "grad_norm": 4.413555145263672, "learning_rate": 9.942765261132565e-06, "loss": 0.3225, "step": 4710 }, { "epoch": 0.43112897332846184, "grad_norm": 2.2136051654815674, "learning_rate": 9.941960711123022e-06, "loss": 0.3073, "step": 4720 }, { "epoch": 0.4320423821702594, "grad_norm": 8.003153800964355, "learning_rate": 9.941150578731913e-06, "loss": 0.3455, "step": 4730 }, { "epoch": 0.432955791012057, "grad_norm": 2.727717638015747, "learning_rate": 9.940334864874356e-06, "loss": 0.283, "step": 4740 }, { "epoch": 0.4338691998538546, "grad_norm": 2.017005205154419, "learning_rate": 9.939513570471773e-06, "loss": 0.2805, "step": 4750 }, { "epoch": 0.43478260869565216, "grad_norm": 4.628815650939941, "learning_rate": 9.938686696451884e-06, "loss": 0.3396, "step": 4760 }, { "epoch": 0.43569601753744974, "grad_norm": 3.6824402809143066, "learning_rate": 9.93785424374872e-06, "loss": 0.2937, "step": 4770 }, { "epoch": 0.4366094263792474, "grad_norm": 2.473849296569824, "learning_rate": 9.93701621330261e-06, "loss": 0.2747, "step": 4780 }, { "epoch": 0.43752283522104496, "grad_norm": 5.785724639892578, "learning_rate": 9.936172606060183e-06, "loss": 0.2872, "step": 4790 }, { "epoch": 0.43843624406284254, "grad_norm": 4.131128787994385, "learning_rate": 9.935323422974365e-06, "loss": 0.2532, "step": 4800 }, { "epoch": 0.4393496529046401, "grad_norm": 3.7533416748046875, "learning_rate": 9.934468665004388e-06, "loss": 0.345, "step": 4810 }, { "epoch": 0.4402630617464377, "grad_norm": 2.6360630989074707, "learning_rate": 9.933608333115776e-06, "loss": 0.2621, "step": 4820 }, { "epoch": 0.4411764705882353, "grad_norm": 4.078582763671875, "learning_rate": 9.932742428280347e-06, "loss": 0.2518, "step": 4830 }, { "epoch": 0.44208987943003286, "grad_norm": 3.9996273517608643, "learning_rate": 9.931870951476216e-06, "loss": 0.2506, "step": 4840 }, { "epoch": 0.4430032882718305, "grad_norm": 3.8597726821899414, "learning_rate": 9.9309939036878e-06, "loss": 0.2877, "step": 4850 }, { "epoch": 0.4439166971136281, "grad_norm": 5.812499523162842, "learning_rate": 9.930111285905795e-06, "loss": 0.2718, "step": 4860 }, { "epoch": 0.44483010595542566, "grad_norm": 3.6305668354034424, "learning_rate": 9.929223099127201e-06, "loss": 0.2561, "step": 4870 }, { "epoch": 0.44574351479722324, "grad_norm": 2.8501129150390625, "learning_rate": 9.928329344355299e-06, "loss": 0.2598, "step": 4880 }, { "epoch": 0.4466569236390208, "grad_norm": 2.8874573707580566, "learning_rate": 9.927430022599667e-06, "loss": 0.3218, "step": 4890 }, { "epoch": 0.4475703324808184, "grad_norm": 2.5470714569091797, "learning_rate": 9.926525134876167e-06, "loss": 0.327, "step": 4900 }, { "epoch": 0.448483741322616, "grad_norm": 1.9438371658325195, "learning_rate": 9.925614682206951e-06, "loss": 0.2289, "step": 4910 }, { "epoch": 0.4493971501644136, "grad_norm": 11.375317573547363, "learning_rate": 9.924698665620453e-06, "loss": 0.3509, "step": 4920 }, { "epoch": 0.4503105590062112, "grad_norm": 3.0730814933776855, "learning_rate": 9.923777086151401e-06, "loss": 0.2843, "step": 4930 }, { "epoch": 0.4512239678480088, "grad_norm": 4.70624303817749, "learning_rate": 9.922849944840792e-06, "loss": 0.2649, "step": 4940 }, { "epoch": 0.45213737668980636, "grad_norm": 2.3170156478881836, "learning_rate": 9.92191724273592e-06, "loss": 0.2645, "step": 4950 }, { "epoch": 0.45305078553160394, "grad_norm": 4.273876190185547, "learning_rate": 9.920978980890352e-06, "loss": 0.2522, "step": 4960 }, { "epoch": 0.4539641943734015, "grad_norm": 3.0586624145507812, "learning_rate": 9.92003516036394e-06, "loss": 0.2708, "step": 4970 }, { "epoch": 0.4548776032151991, "grad_norm": 3.208183765411377, "learning_rate": 9.919085782222811e-06, "loss": 0.2915, "step": 4980 }, { "epoch": 0.45579101205699674, "grad_norm": 1.989342212677002, "learning_rate": 9.91813084753937e-06, "loss": 0.2865, "step": 4990 }, { "epoch": 0.4567044208987943, "grad_norm": 2.438453435897827, "learning_rate": 9.917170357392305e-06, "loss": 0.2037, "step": 5000 }, { "epoch": 0.4576178297405919, "grad_norm": 3.0468664169311523, "learning_rate": 9.916204312866568e-06, "loss": 0.3182, "step": 5010 }, { "epoch": 0.4585312385823895, "grad_norm": 2.2859349250793457, "learning_rate": 9.915232715053398e-06, "loss": 0.2695, "step": 5020 }, { "epoch": 0.45944464742418706, "grad_norm": 3.900787591934204, "learning_rate": 9.914255565050297e-06, "loss": 0.332, "step": 5030 }, { "epoch": 0.46035805626598464, "grad_norm": 3.9059789180755615, "learning_rate": 9.913272863961044e-06, "loss": 0.2899, "step": 5040 }, { "epoch": 0.4612714651077822, "grad_norm": 3.471268653869629, "learning_rate": 9.912284612895684e-06, "loss": 0.296, "step": 5050 }, { "epoch": 0.46218487394957986, "grad_norm": 4.207501411437988, "learning_rate": 9.911290812970538e-06, "loss": 0.2766, "step": 5060 }, { "epoch": 0.46309828279137744, "grad_norm": 5.3543829917907715, "learning_rate": 9.910291465308186e-06, "loss": 0.2851, "step": 5070 }, { "epoch": 0.464011691633175, "grad_norm": 3.399937152862549, "learning_rate": 9.909286571037484e-06, "loss": 0.2848, "step": 5080 }, { "epoch": 0.4649251004749726, "grad_norm": 2.9318981170654297, "learning_rate": 9.908276131293548e-06, "loss": 0.2589, "step": 5090 }, { "epoch": 0.4658385093167702, "grad_norm": 3.176783323287964, "learning_rate": 9.907260147217756e-06, "loss": 0.2812, "step": 5100 }, { "epoch": 0.46675191815856776, "grad_norm": 1.7639135122299194, "learning_rate": 9.906238619957757e-06, "loss": 0.2217, "step": 5110 }, { "epoch": 0.46766532700036534, "grad_norm": 4.44024658203125, "learning_rate": 9.905211550667452e-06, "loss": 0.2638, "step": 5120 }, { "epoch": 0.468578735842163, "grad_norm": 2.9012842178344727, "learning_rate": 9.904178940507008e-06, "loss": 0.2663, "step": 5130 }, { "epoch": 0.46949214468396055, "grad_norm": 3.9532392024993896, "learning_rate": 9.90314079064285e-06, "loss": 0.2828, "step": 5140 }, { "epoch": 0.47040555352575814, "grad_norm": 3.8494935035705566, "learning_rate": 9.90209710224766e-06, "loss": 0.298, "step": 5150 }, { "epoch": 0.4713189623675557, "grad_norm": 3.4110934734344482, "learning_rate": 9.901047876500374e-06, "loss": 0.2916, "step": 5160 }, { "epoch": 0.4722323712093533, "grad_norm": 7.333596229553223, "learning_rate": 9.899993114586188e-06, "loss": 0.3071, "step": 5170 }, { "epoch": 0.4731457800511509, "grad_norm": 5.094028949737549, "learning_rate": 9.898932817696548e-06, "loss": 0.29, "step": 5180 }, { "epoch": 0.47405918889294846, "grad_norm": 4.132980823516846, "learning_rate": 9.897866987029153e-06, "loss": 0.2533, "step": 5190 }, { "epoch": 0.4749725977347461, "grad_norm": 9.229303359985352, "learning_rate": 9.896795623787954e-06, "loss": 0.2991, "step": 5200 }, { "epoch": 0.4758860065765437, "grad_norm": 9.258539199829102, "learning_rate": 9.895718729183148e-06, "loss": 0.2937, "step": 5210 }, { "epoch": 0.47679941541834125, "grad_norm": 2.78472900390625, "learning_rate": 9.894636304431185e-06, "loss": 0.2867, "step": 5220 }, { "epoch": 0.47771282426013884, "grad_norm": 4.150284767150879, "learning_rate": 9.893548350754759e-06, "loss": 0.2678, "step": 5230 }, { "epoch": 0.4786262331019364, "grad_norm": 2.9255781173706055, "learning_rate": 9.892454869382807e-06, "loss": 0.2616, "step": 5240 }, { "epoch": 0.479539641943734, "grad_norm": 3.0230817794799805, "learning_rate": 9.89135586155052e-06, "loss": 0.2685, "step": 5250 }, { "epoch": 0.4804530507855316, "grad_norm": 4.718057632446289, "learning_rate": 9.890251328499316e-06, "loss": 0.2399, "step": 5260 }, { "epoch": 0.4813664596273292, "grad_norm": 1.7854446172714233, "learning_rate": 9.889141271476869e-06, "loss": 0.3391, "step": 5270 }, { "epoch": 0.4822798684691268, "grad_norm": 2.693803071975708, "learning_rate": 9.888025691737084e-06, "loss": 0.3384, "step": 5280 }, { "epoch": 0.4831932773109244, "grad_norm": 2.257547378540039, "learning_rate": 9.886904590540105e-06, "loss": 0.2799, "step": 5290 }, { "epoch": 0.48410668615272195, "grad_norm": 2.2659525871276855, "learning_rate": 9.88577796915232e-06, "loss": 0.2752, "step": 5300 }, { "epoch": 0.48502009499451954, "grad_norm": 2.152337074279785, "learning_rate": 9.884645828846342e-06, "loss": 0.2568, "step": 5310 }, { "epoch": 0.4859335038363171, "grad_norm": 2.288828134536743, "learning_rate": 9.883508170901028e-06, "loss": 0.2901, "step": 5320 }, { "epoch": 0.48684691267811475, "grad_norm": 2.9563052654266357, "learning_rate": 9.882364996601461e-06, "loss": 0.2997, "step": 5330 }, { "epoch": 0.48776032151991233, "grad_norm": 3.7962424755096436, "learning_rate": 9.881216307238958e-06, "loss": 0.3025, "step": 5340 }, { "epoch": 0.4886737303617099, "grad_norm": 2.172938585281372, "learning_rate": 9.880062104111064e-06, "loss": 0.2918, "step": 5350 }, { "epoch": 0.4895871392035075, "grad_norm": 2.480114698410034, "learning_rate": 9.878902388521555e-06, "loss": 0.2701, "step": 5360 }, { "epoch": 0.4905005480453051, "grad_norm": 2.4503819942474365, "learning_rate": 9.877737161780431e-06, "loss": 0.2327, "step": 5370 }, { "epoch": 0.49141395688710265, "grad_norm": 2.367793321609497, "learning_rate": 9.876566425203922e-06, "loss": 0.2568, "step": 5380 }, { "epoch": 0.49232736572890023, "grad_norm": 2.9530556201934814, "learning_rate": 9.875390180114475e-06, "loss": 0.2954, "step": 5390 }, { "epoch": 0.49324077457069787, "grad_norm": 3.8136823177337646, "learning_rate": 9.874208427840761e-06, "loss": 0.3249, "step": 5400 }, { "epoch": 0.49415418341249545, "grad_norm": 3.2845537662506104, "learning_rate": 9.873021169717679e-06, "loss": 0.2985, "step": 5410 }, { "epoch": 0.49506759225429303, "grad_norm": 3.679445743560791, "learning_rate": 9.87182840708634e-06, "loss": 0.3456, "step": 5420 }, { "epoch": 0.4959810010960906, "grad_norm": 2.672470808029175, "learning_rate": 9.870630141294071e-06, "loss": 0.3499, "step": 5430 }, { "epoch": 0.4968944099378882, "grad_norm": 1.559909701347351, "learning_rate": 9.869426373694426e-06, "loss": 0.2664, "step": 5440 }, { "epoch": 0.4978078187796858, "grad_norm": 3.616621732711792, "learning_rate": 9.868217105647161e-06, "loss": 0.2689, "step": 5450 }, { "epoch": 0.49872122762148335, "grad_norm": 4.38258171081543, "learning_rate": 9.867002338518255e-06, "loss": 0.2912, "step": 5460 }, { "epoch": 0.499634636463281, "grad_norm": 3.8265810012817383, "learning_rate": 9.865782073679893e-06, "loss": 0.237, "step": 5470 }, { "epoch": 0.5005480453050786, "grad_norm": 3.513063907623291, "learning_rate": 9.864556312510471e-06, "loss": 0.316, "step": 5480 }, { "epoch": 0.5014614541468762, "grad_norm": 5.5478949546813965, "learning_rate": 9.863325056394597e-06, "loss": 0.2984, "step": 5490 }, { "epoch": 0.5023748629886737, "grad_norm": 4.13325834274292, "learning_rate": 9.862088306723084e-06, "loss": 0.2742, "step": 5500 }, { "epoch": 0.5032882718304713, "grad_norm": 2.748626708984375, "learning_rate": 9.860846064892949e-06, "loss": 0.2548, "step": 5510 }, { "epoch": 0.5042016806722689, "grad_norm": 3.656006336212158, "learning_rate": 9.859598332307411e-06, "loss": 0.3319, "step": 5520 }, { "epoch": 0.5051150895140665, "grad_norm": 4.149877071380615, "learning_rate": 9.8583451103759e-06, "loss": 0.3268, "step": 5530 }, { "epoch": 0.506028498355864, "grad_norm": 3.3387629985809326, "learning_rate": 9.857086400514036e-06, "loss": 0.2788, "step": 5540 }, { "epoch": 0.5069419071976616, "grad_norm": 5.070505619049072, "learning_rate": 9.855822204143646e-06, "loss": 0.3144, "step": 5550 }, { "epoch": 0.5078553160394592, "grad_norm": 2.870758533477783, "learning_rate": 9.85455252269275e-06, "loss": 0.3394, "step": 5560 }, { "epoch": 0.5087687248812568, "grad_norm": 4.068554401397705, "learning_rate": 9.853277357595567e-06, "loss": 0.284, "step": 5570 }, { "epoch": 0.5096821337230545, "grad_norm": 2.2855632305145264, "learning_rate": 9.851996710292506e-06, "loss": 0.3246, "step": 5580 }, { "epoch": 0.5105955425648521, "grad_norm": 3.3873233795166016, "learning_rate": 9.850710582230176e-06, "loss": 0.2363, "step": 5590 }, { "epoch": 0.5115089514066496, "grad_norm": 2.9205915927886963, "learning_rate": 9.849418974861368e-06, "loss": 0.2653, "step": 5600 }, { "epoch": 0.5124223602484472, "grad_norm": 1.7957851886749268, "learning_rate": 9.848121889645068e-06, "loss": 0.2519, "step": 5610 }, { "epoch": 0.5133357690902448, "grad_norm": 3.5013558864593506, "learning_rate": 9.846819328046449e-06, "loss": 0.3063, "step": 5620 }, { "epoch": 0.5142491779320424, "grad_norm": 2.253748655319214, "learning_rate": 9.84551129153687e-06, "loss": 0.2354, "step": 5630 }, { "epoch": 0.51516258677384, "grad_norm": 3.2317233085632324, "learning_rate": 9.844197781593872e-06, "loss": 0.2635, "step": 5640 }, { "epoch": 0.5160759956156376, "grad_norm": 1.7452400922775269, "learning_rate": 9.842878799701183e-06, "loss": 0.2454, "step": 5650 }, { "epoch": 0.5169894044574351, "grad_norm": 7.061709403991699, "learning_rate": 9.841554347348708e-06, "loss": 0.2892, "step": 5660 }, { "epoch": 0.5179028132992327, "grad_norm": 4.365959167480469, "learning_rate": 9.840224426032533e-06, "loss": 0.3022, "step": 5670 }, { "epoch": 0.5188162221410303, "grad_norm": 1.9185420274734497, "learning_rate": 9.838889037254921e-06, "loss": 0.3625, "step": 5680 }, { "epoch": 0.5197296309828279, "grad_norm": 1.7959686517715454, "learning_rate": 9.837548182524314e-06, "loss": 0.2803, "step": 5690 }, { "epoch": 0.5206430398246255, "grad_norm": 2.7780721187591553, "learning_rate": 9.836201863355327e-06, "loss": 0.2707, "step": 5700 }, { "epoch": 0.5215564486664231, "grad_norm": 3.2977302074432373, "learning_rate": 9.834850081268742e-06, "loss": 0.3057, "step": 5710 }, { "epoch": 0.5224698575082207, "grad_norm": 4.0345354080200195, "learning_rate": 9.833492837791516e-06, "loss": 0.2795, "step": 5720 }, { "epoch": 0.5233832663500183, "grad_norm": 6.917937278747559, "learning_rate": 9.832130134456783e-06, "loss": 0.3015, "step": 5730 }, { "epoch": 0.5242966751918159, "grad_norm": 3.0035288333892822, "learning_rate": 9.830761972803829e-06, "loss": 0.2814, "step": 5740 }, { "epoch": 0.5252100840336135, "grad_norm": 3.6756339073181152, "learning_rate": 9.829388354378117e-06, "loss": 0.318, "step": 5750 }, { "epoch": 0.526123492875411, "grad_norm": 3.239065408706665, "learning_rate": 9.828009280731272e-06, "loss": 0.3056, "step": 5760 }, { "epoch": 0.5270369017172086, "grad_norm": 2.97517466545105, "learning_rate": 9.826624753421077e-06, "loss": 0.2888, "step": 5770 }, { "epoch": 0.5279503105590062, "grad_norm": 2.6300947666168213, "learning_rate": 9.825234774011479e-06, "loss": 0.2824, "step": 5780 }, { "epoch": 0.5288637194008038, "grad_norm": 3.2967381477355957, "learning_rate": 9.823839344072582e-06, "loss": 0.3043, "step": 5790 }, { "epoch": 0.5297771282426014, "grad_norm": 2.204991102218628, "learning_rate": 9.822438465180645e-06, "loss": 0.2519, "step": 5800 }, { "epoch": 0.530690537084399, "grad_norm": 3.0927531719207764, "learning_rate": 9.821032138918091e-06, "loss": 0.3377, "step": 5810 }, { "epoch": 0.5316039459261965, "grad_norm": 3.746189832687378, "learning_rate": 9.819620366873486e-06, "loss": 0.2715, "step": 5820 }, { "epoch": 0.5325173547679941, "grad_norm": 4.325469970703125, "learning_rate": 9.81820315064155e-06, "loss": 0.2371, "step": 5830 }, { "epoch": 0.5334307636097917, "grad_norm": 4.1674323081970215, "learning_rate": 9.816780491823156e-06, "loss": 0.2888, "step": 5840 }, { "epoch": 0.5343441724515894, "grad_norm": 3.706885576248169, "learning_rate": 9.815352392025325e-06, "loss": 0.2572, "step": 5850 }, { "epoch": 0.535257581293387, "grad_norm": 3.1083126068115234, "learning_rate": 9.813918852861217e-06, "loss": 0.2772, "step": 5860 }, { "epoch": 0.5361709901351845, "grad_norm": 2.2558939456939697, "learning_rate": 9.812479875950144e-06, "loss": 0.3021, "step": 5870 }, { "epoch": 0.5370843989769821, "grad_norm": 2.0923516750335693, "learning_rate": 9.81103546291756e-06, "loss": 0.2892, "step": 5880 }, { "epoch": 0.5379978078187797, "grad_norm": 2.7652342319488525, "learning_rate": 9.809585615395051e-06, "loss": 0.2285, "step": 5890 }, { "epoch": 0.5389112166605773, "grad_norm": 3.7592389583587646, "learning_rate": 9.808130335020356e-06, "loss": 0.2724, "step": 5900 }, { "epoch": 0.5398246255023749, "grad_norm": 2.8695075511932373, "learning_rate": 9.806669623437334e-06, "loss": 0.2801, "step": 5910 }, { "epoch": 0.5407380343441724, "grad_norm": 9.835343360900879, "learning_rate": 9.805203482295995e-06, "loss": 0.2767, "step": 5920 }, { "epoch": 0.54165144318597, "grad_norm": 3.37471342086792, "learning_rate": 9.803731913252472e-06, "loss": 0.3051, "step": 5930 }, { "epoch": 0.5425648520277676, "grad_norm": 3.4606735706329346, "learning_rate": 9.802254917969033e-06, "loss": 0.2552, "step": 5940 }, { "epoch": 0.5434782608695652, "grad_norm": 3.116419553756714, "learning_rate": 9.800772498114075e-06, "loss": 0.261, "step": 5950 }, { "epoch": 0.5443916697113628, "grad_norm": 1.690928339958191, "learning_rate": 9.79928465536212e-06, "loss": 0.2534, "step": 5960 }, { "epoch": 0.5453050785531603, "grad_norm": 3.111494779586792, "learning_rate": 9.79779139139382e-06, "loss": 0.2885, "step": 5970 }, { "epoch": 0.5462184873949579, "grad_norm": 3.539271116256714, "learning_rate": 9.79629270789595e-06, "loss": 0.2709, "step": 5980 }, { "epoch": 0.5471318962367556, "grad_norm": 3.658198356628418, "learning_rate": 9.794788606561403e-06, "loss": 0.2436, "step": 5990 }, { "epoch": 0.5480453050785532, "grad_norm": 2.670255184173584, "learning_rate": 9.793279089089197e-06, "loss": 0.2262, "step": 6000 }, { "epoch": 0.5489587139203508, "grad_norm": 6.936058044433594, "learning_rate": 9.791764157184463e-06, "loss": 0.3058, "step": 6010 }, { "epoch": 0.5498721227621484, "grad_norm": 3.5721442699432373, "learning_rate": 9.79024381255845e-06, "loss": 0.2669, "step": 6020 }, { "epoch": 0.5507855316039459, "grad_norm": 2.4868996143341064, "learning_rate": 9.788718056928523e-06, "loss": 0.2553, "step": 6030 }, { "epoch": 0.5516989404457435, "grad_norm": 5.480841159820557, "learning_rate": 9.787186892018159e-06, "loss": 0.2318, "step": 6040 }, { "epoch": 0.5526123492875411, "grad_norm": 5.0861287117004395, "learning_rate": 9.785650319556944e-06, "loss": 0.3091, "step": 6050 }, { "epoch": 0.5535257581293387, "grad_norm": 2.94234299659729, "learning_rate": 9.78410834128057e-06, "loss": 0.2476, "step": 6060 }, { "epoch": 0.5544391669711363, "grad_norm": 2.9691739082336426, "learning_rate": 9.78256095893084e-06, "loss": 0.2739, "step": 6070 }, { "epoch": 0.5553525758129338, "grad_norm": 3.6630666255950928, "learning_rate": 9.781008174255659e-06, "loss": 0.2223, "step": 6080 }, { "epoch": 0.5562659846547314, "grad_norm": 3.990704298019409, "learning_rate": 9.779449989009033e-06, "loss": 0.2219, "step": 6090 }, { "epoch": 0.557179393496529, "grad_norm": 4.8168625831604, "learning_rate": 9.777886404951072e-06, "loss": 0.2648, "step": 6100 }, { "epoch": 0.5580928023383266, "grad_norm": 3.042574167251587, "learning_rate": 9.77631742384798e-06, "loss": 0.2581, "step": 6110 }, { "epoch": 0.5590062111801242, "grad_norm": 3.546417474746704, "learning_rate": 9.774743047472062e-06, "loss": 0.2707, "step": 6120 }, { "epoch": 0.5599196200219219, "grad_norm": 3.9305660724639893, "learning_rate": 9.773163277601716e-06, "loss": 0.2358, "step": 6130 }, { "epoch": 0.5608330288637194, "grad_norm": 2.2789790630340576, "learning_rate": 9.77157811602143e-06, "loss": 0.2434, "step": 6140 }, { "epoch": 0.561746437705517, "grad_norm": 3.9269986152648926, "learning_rate": 9.769987564521783e-06, "loss": 0.2946, "step": 6150 }, { "epoch": 0.5626598465473146, "grad_norm": 2.7504191398620605, "learning_rate": 9.768391624899448e-06, "loss": 0.3161, "step": 6160 }, { "epoch": 0.5635732553891122, "grad_norm": 2.651276111602783, "learning_rate": 9.766790298957176e-06, "loss": 0.2165, "step": 6170 }, { "epoch": 0.5644866642309098, "grad_norm": 2.107430934906006, "learning_rate": 9.765183588503805e-06, "loss": 0.2531, "step": 6180 }, { "epoch": 0.5654000730727073, "grad_norm": 2.964421272277832, "learning_rate": 9.763571495354261e-06, "loss": 0.3077, "step": 6190 }, { "epoch": 0.5663134819145049, "grad_norm": 3.345609664916992, "learning_rate": 9.761954021329543e-06, "loss": 0.2409, "step": 6200 }, { "epoch": 0.5672268907563025, "grad_norm": 3.3803491592407227, "learning_rate": 9.760331168256732e-06, "loss": 0.2686, "step": 6210 }, { "epoch": 0.5681402995981001, "grad_norm": 3.0725042819976807, "learning_rate": 9.758702937968984e-06, "loss": 0.3025, "step": 6220 }, { "epoch": 0.5690537084398977, "grad_norm": 2.5636212825775146, "learning_rate": 9.757069332305527e-06, "loss": 0.2373, "step": 6230 }, { "epoch": 0.5699671172816952, "grad_norm": 3.103055238723755, "learning_rate": 9.755430353111666e-06, "loss": 0.3392, "step": 6240 }, { "epoch": 0.5708805261234928, "grad_norm": 2.9263079166412354, "learning_rate": 9.75378600223877e-06, "loss": 0.3317, "step": 6250 }, { "epoch": 0.5717939349652905, "grad_norm": 2.3819878101348877, "learning_rate": 9.752136281544281e-06, "loss": 0.2731, "step": 6260 }, { "epoch": 0.5727073438070881, "grad_norm": 3.6989071369171143, "learning_rate": 9.750481192891705e-06, "loss": 0.2796, "step": 6270 }, { "epoch": 0.5736207526488857, "grad_norm": 3.1856367588043213, "learning_rate": 9.748820738150608e-06, "loss": 0.2861, "step": 6280 }, { "epoch": 0.5745341614906833, "grad_norm": 2.7063374519348145, "learning_rate": 9.747154919196622e-06, "loss": 0.2767, "step": 6290 }, { "epoch": 0.5754475703324808, "grad_norm": 2.7607364654541016, "learning_rate": 9.745483737911438e-06, "loss": 0.2652, "step": 6300 }, { "epoch": 0.5763609791742784, "grad_norm": 4.116032600402832, "learning_rate": 9.743807196182799e-06, "loss": 0.2212, "step": 6310 }, { "epoch": 0.577274388016076, "grad_norm": 3.095385789871216, "learning_rate": 9.742125295904514e-06, "loss": 0.2527, "step": 6320 }, { "epoch": 0.5781877968578736, "grad_norm": 3.220276355743408, "learning_rate": 9.740438038976433e-06, "loss": 0.2488, "step": 6330 }, { "epoch": 0.5791012056996712, "grad_norm": 2.8491690158843994, "learning_rate": 9.73874542730446e-06, "loss": 0.2137, "step": 6340 }, { "epoch": 0.5800146145414687, "grad_norm": 4.125897407531738, "learning_rate": 9.737047462800554e-06, "loss": 0.2976, "step": 6350 }, { "epoch": 0.5809280233832663, "grad_norm": 4.584946632385254, "learning_rate": 9.735344147382715e-06, "loss": 0.2867, "step": 6360 }, { "epoch": 0.5818414322250639, "grad_norm": 2.0963475704193115, "learning_rate": 9.733635482974988e-06, "loss": 0.2655, "step": 6370 }, { "epoch": 0.5827548410668615, "grad_norm": 5.923211574554443, "learning_rate": 9.73192147150746e-06, "loss": 0.3446, "step": 6380 }, { "epoch": 0.5836682499086591, "grad_norm": 4.677428722381592, "learning_rate": 9.730202114916258e-06, "loss": 0.2819, "step": 6390 }, { "epoch": 0.5845816587504568, "grad_norm": 2.791370391845703, "learning_rate": 9.728477415143548e-06, "loss": 0.2449, "step": 6400 }, { "epoch": 0.5854950675922543, "grad_norm": 3.4630367755889893, "learning_rate": 9.726747374137532e-06, "loss": 0.2482, "step": 6410 }, { "epoch": 0.5864084764340519, "grad_norm": 3.82558274269104, "learning_rate": 9.725011993852442e-06, "loss": 0.284, "step": 6420 }, { "epoch": 0.5873218852758495, "grad_norm": 2.790816307067871, "learning_rate": 9.723271276248547e-06, "loss": 0.2746, "step": 6430 }, { "epoch": 0.5882352941176471, "grad_norm": 3.023796558380127, "learning_rate": 9.721525223292137e-06, "loss": 0.2951, "step": 6440 }, { "epoch": 0.5891487029594447, "grad_norm": 13.655096054077148, "learning_rate": 9.719773836955538e-06, "loss": 0.22, "step": 6450 }, { "epoch": 0.5900621118012422, "grad_norm": 2.9080352783203125, "learning_rate": 9.718017119217091e-06, "loss": 0.2794, "step": 6460 }, { "epoch": 0.5909755206430398, "grad_norm": 3.5258448123931885, "learning_rate": 9.716255072061166e-06, "loss": 0.3142, "step": 6470 }, { "epoch": 0.5918889294848374, "grad_norm": 2.141153573989868, "learning_rate": 9.714487697478151e-06, "loss": 0.2431, "step": 6480 }, { "epoch": 0.592802338326635, "grad_norm": 3.0992538928985596, "learning_rate": 9.712714997464453e-06, "loss": 0.2602, "step": 6490 }, { "epoch": 0.5937157471684326, "grad_norm": 3.861177444458008, "learning_rate": 9.71093697402249e-06, "loss": 0.2692, "step": 6500 }, { "epoch": 0.5946291560102301, "grad_norm": 3.2572734355926514, "learning_rate": 9.709153629160703e-06, "loss": 0.2695, "step": 6510 }, { "epoch": 0.5955425648520277, "grad_norm": 3.0455739498138428, "learning_rate": 9.707364964893532e-06, "loss": 0.2707, "step": 6520 }, { "epoch": 0.5964559736938253, "grad_norm": 2.699913263320923, "learning_rate": 9.705570983241433e-06, "loss": 0.2419, "step": 6530 }, { "epoch": 0.597369382535623, "grad_norm": 3.8658323287963867, "learning_rate": 9.703771686230867e-06, "loss": 0.2312, "step": 6540 }, { "epoch": 0.5982827913774206, "grad_norm": 2.220615863800049, "learning_rate": 9.701967075894299e-06, "loss": 0.2697, "step": 6550 }, { "epoch": 0.5991962002192182, "grad_norm": 3.879291296005249, "learning_rate": 9.700157154270197e-06, "loss": 0.2847, "step": 6560 }, { "epoch": 0.6001096090610157, "grad_norm": 2.050427198410034, "learning_rate": 9.698341923403026e-06, "loss": 0.2491, "step": 6570 }, { "epoch": 0.6010230179028133, "grad_norm": 2.6139674186706543, "learning_rate": 9.69652138534325e-06, "loss": 0.244, "step": 6580 }, { "epoch": 0.6019364267446109, "grad_norm": 2.968921422958374, "learning_rate": 9.694695542147328e-06, "loss": 0.261, "step": 6590 }, { "epoch": 0.6028498355864085, "grad_norm": 2.141153573989868, "learning_rate": 9.692864395877712e-06, "loss": 0.2374, "step": 6600 }, { "epoch": 0.6037632444282061, "grad_norm": 2.522078275680542, "learning_rate": 9.691027948602843e-06, "loss": 0.3436, "step": 6610 }, { "epoch": 0.6046766532700036, "grad_norm": 4.399686813354492, "learning_rate": 9.68918620239715e-06, "loss": 0.2634, "step": 6620 }, { "epoch": 0.6055900621118012, "grad_norm": 3.11279034614563, "learning_rate": 9.687339159341047e-06, "loss": 0.2458, "step": 6630 }, { "epoch": 0.6065034709535988, "grad_norm": 2.705336570739746, "learning_rate": 9.685486821520937e-06, "loss": 0.2356, "step": 6640 }, { "epoch": 0.6074168797953964, "grad_norm": 3.1967360973358154, "learning_rate": 9.683629191029195e-06, "loss": 0.2343, "step": 6650 }, { "epoch": 0.608330288637194, "grad_norm": 6.254840850830078, "learning_rate": 9.68176626996418e-06, "loss": 0.2796, "step": 6660 }, { "epoch": 0.6092436974789915, "grad_norm": 6.961836814880371, "learning_rate": 9.679898060430229e-06, "loss": 0.2711, "step": 6670 }, { "epoch": 0.6101571063207892, "grad_norm": 3.3606040477752686, "learning_rate": 9.678024564537644e-06, "loss": 0.259, "step": 6680 }, { "epoch": 0.6110705151625868, "grad_norm": 1.9806880950927734, "learning_rate": 9.676145784402711e-06, "loss": 0.2787, "step": 6690 }, { "epoch": 0.6119839240043844, "grad_norm": 4.4786458015441895, "learning_rate": 9.674261722147673e-06, "loss": 0.2403, "step": 6700 }, { "epoch": 0.612897332846182, "grad_norm": 3.7395308017730713, "learning_rate": 9.672372379900746e-06, "loss": 0.2658, "step": 6710 }, { "epoch": 0.6138107416879796, "grad_norm": 3.523909091949463, "learning_rate": 9.670477759796111e-06, "loss": 0.3265, "step": 6720 }, { "epoch": 0.6147241505297771, "grad_norm": 2.6696527004241943, "learning_rate": 9.668577863973907e-06, "loss": 0.2649, "step": 6730 }, { "epoch": 0.6156375593715747, "grad_norm": 1.602361798286438, "learning_rate": 9.666672694580234e-06, "loss": 0.298, "step": 6740 }, { "epoch": 0.6165509682133723, "grad_norm": 3.1162257194519043, "learning_rate": 9.664762253767153e-06, "loss": 0.2596, "step": 6750 }, { "epoch": 0.6174643770551699, "grad_norm": 3.4244751930236816, "learning_rate": 9.66284654369267e-06, "loss": 0.2404, "step": 6760 }, { "epoch": 0.6183777858969675, "grad_norm": 4.400494575500488, "learning_rate": 9.660925566520754e-06, "loss": 0.2273, "step": 6770 }, { "epoch": 0.619291194738765, "grad_norm": 1.705370545387268, "learning_rate": 9.658999324421316e-06, "loss": 0.2635, "step": 6780 }, { "epoch": 0.6202046035805626, "grad_norm": 3.187904119491577, "learning_rate": 9.657067819570219e-06, "loss": 0.2532, "step": 6790 }, { "epoch": 0.6211180124223602, "grad_norm": 5.749989986419678, "learning_rate": 9.655131054149264e-06, "loss": 0.2856, "step": 6800 }, { "epoch": 0.6220314212641579, "grad_norm": 4.291287422180176, "learning_rate": 9.653189030346203e-06, "loss": 0.2982, "step": 6810 }, { "epoch": 0.6229448301059555, "grad_norm": 2.6348507404327393, "learning_rate": 9.651241750354723e-06, "loss": 0.246, "step": 6820 }, { "epoch": 0.623858238947753, "grad_norm": 2.1266493797302246, "learning_rate": 9.649289216374447e-06, "loss": 0.2838, "step": 6830 }, { "epoch": 0.6247716477895506, "grad_norm": 3.872530460357666, "learning_rate": 9.647331430610935e-06, "loss": 0.2223, "step": 6840 }, { "epoch": 0.6256850566313482, "grad_norm": 4.296280384063721, "learning_rate": 9.645368395275682e-06, "loss": 0.2325, "step": 6850 }, { "epoch": 0.6265984654731458, "grad_norm": 3.9008805751800537, "learning_rate": 9.643400112586108e-06, "loss": 0.2395, "step": 6860 }, { "epoch": 0.6275118743149434, "grad_norm": 2.6780407428741455, "learning_rate": 9.641426584765562e-06, "loss": 0.3187, "step": 6870 }, { "epoch": 0.628425283156741, "grad_norm": 2.636173963546753, "learning_rate": 9.639447814043316e-06, "loss": 0.2731, "step": 6880 }, { "epoch": 0.6293386919985385, "grad_norm": 3.5724072456359863, "learning_rate": 9.637463802654571e-06, "loss": 0.2144, "step": 6890 }, { "epoch": 0.6302521008403361, "grad_norm": 3.251986503601074, "learning_rate": 9.635474552840443e-06, "loss": 0.2548, "step": 6900 }, { "epoch": 0.6311655096821337, "grad_norm": 2.8590807914733887, "learning_rate": 9.633480066847961e-06, "loss": 0.2954, "step": 6910 }, { "epoch": 0.6320789185239313, "grad_norm": 1.9871172904968262, "learning_rate": 9.631480346930078e-06, "loss": 0.2626, "step": 6920 }, { "epoch": 0.6329923273657289, "grad_norm": 2.599102735519409, "learning_rate": 9.629475395345652e-06, "loss": 0.3052, "step": 6930 }, { "epoch": 0.6339057362075264, "grad_norm": 2.9098923206329346, "learning_rate": 9.627465214359453e-06, "loss": 0.2737, "step": 6940 }, { "epoch": 0.6348191450493241, "grad_norm": 3.3348658084869385, "learning_rate": 9.625449806242158e-06, "loss": 0.2705, "step": 6950 }, { "epoch": 0.6357325538911217, "grad_norm": 2.2904629707336426, "learning_rate": 9.623429173270352e-06, "loss": 0.2796, "step": 6960 }, { "epoch": 0.6366459627329193, "grad_norm": 1.974174976348877, "learning_rate": 9.621403317726513e-06, "loss": 0.2494, "step": 6970 }, { "epoch": 0.6375593715747169, "grad_norm": 3.516902208328247, "learning_rate": 9.619372241899028e-06, "loss": 0.2433, "step": 6980 }, { "epoch": 0.6384727804165145, "grad_norm": 2.1735682487487793, "learning_rate": 9.617335948082176e-06, "loss": 0.2431, "step": 6990 }, { "epoch": 0.639386189258312, "grad_norm": 3.8236491680145264, "learning_rate": 9.615294438576131e-06, "loss": 0.2832, "step": 7000 }, { "epoch": 0.6402995981001096, "grad_norm": 4.206499099731445, "learning_rate": 9.613247715686958e-06, "loss": 0.2772, "step": 7010 }, { "epoch": 0.6412130069419072, "grad_norm": 3.433164596557617, "learning_rate": 9.61119578172661e-06, "loss": 0.2722, "step": 7020 }, { "epoch": 0.6421264157837048, "grad_norm": 2.7073190212249756, "learning_rate": 9.60913863901293e-06, "loss": 0.2252, "step": 7030 }, { "epoch": 0.6430398246255024, "grad_norm": 2.316575527191162, "learning_rate": 9.607076289869643e-06, "loss": 0.2235, "step": 7040 }, { "epoch": 0.6439532334672999, "grad_norm": 6.629827976226807, "learning_rate": 9.605008736626352e-06, "loss": 0.2765, "step": 7050 }, { "epoch": 0.6448666423090975, "grad_norm": 2.6827988624572754, "learning_rate": 9.602935981618544e-06, "loss": 0.2421, "step": 7060 }, { "epoch": 0.6457800511508951, "grad_norm": 4.600544452667236, "learning_rate": 9.600858027187579e-06, "loss": 0.255, "step": 7070 }, { "epoch": 0.6466934599926927, "grad_norm": 2.314401865005493, "learning_rate": 9.598774875680687e-06, "loss": 0.2656, "step": 7080 }, { "epoch": 0.6476068688344904, "grad_norm": 3.0470688343048096, "learning_rate": 9.596686529450974e-06, "loss": 0.2226, "step": 7090 }, { "epoch": 0.648520277676288, "grad_norm": 5.843374729156494, "learning_rate": 9.594592990857411e-06, "loss": 0.2811, "step": 7100 }, { "epoch": 0.6494336865180855, "grad_norm": 1.6065675020217896, "learning_rate": 9.592494262264838e-06, "loss": 0.2898, "step": 7110 }, { "epoch": 0.6503470953598831, "grad_norm": 2.605135440826416, "learning_rate": 9.590390346043952e-06, "loss": 0.2845, "step": 7120 }, { "epoch": 0.6512605042016807, "grad_norm": 2.455810308456421, "learning_rate": 9.588281244571311e-06, "loss": 0.2272, "step": 7130 }, { "epoch": 0.6521739130434783, "grad_norm": 2.8199284076690674, "learning_rate": 9.586166960229333e-06, "loss": 0.2014, "step": 7140 }, { "epoch": 0.6530873218852759, "grad_norm": 3.0628132820129395, "learning_rate": 9.58404749540629e-06, "loss": 0.2282, "step": 7150 }, { "epoch": 0.6540007307270734, "grad_norm": 3.2731451988220215, "learning_rate": 9.581922852496304e-06, "loss": 0.3061, "step": 7160 }, { "epoch": 0.654914139568871, "grad_norm": 4.294134616851807, "learning_rate": 9.579793033899346e-06, "loss": 0.28, "step": 7170 }, { "epoch": 0.6558275484106686, "grad_norm": 3.556389331817627, "learning_rate": 9.577658042021237e-06, "loss": 0.2756, "step": 7180 }, { "epoch": 0.6567409572524662, "grad_norm": 4.258586406707764, "learning_rate": 9.575517879273637e-06, "loss": 0.2793, "step": 7190 }, { "epoch": 0.6576543660942638, "grad_norm": 2.7015058994293213, "learning_rate": 9.57337254807405e-06, "loss": 0.2866, "step": 7200 }, { "epoch": 0.6585677749360613, "grad_norm": 3.428126335144043, "learning_rate": 9.571222050845815e-06, "loss": 0.2709, "step": 7210 }, { "epoch": 0.6594811837778589, "grad_norm": 2.6045033931732178, "learning_rate": 9.569066390018111e-06, "loss": 0.244, "step": 7220 }, { "epoch": 0.6603945926196566, "grad_norm": 2.3509225845336914, "learning_rate": 9.566905568025948e-06, "loss": 0.2764, "step": 7230 }, { "epoch": 0.6613080014614542, "grad_norm": 2.4832935333251953, "learning_rate": 9.564739587310165e-06, "loss": 0.2234, "step": 7240 }, { "epoch": 0.6622214103032518, "grad_norm": 3.4874157905578613, "learning_rate": 9.56256845031743e-06, "loss": 0.2987, "step": 7250 }, { "epoch": 0.6631348191450493, "grad_norm": 3.129835844039917, "learning_rate": 9.56039215950023e-06, "loss": 0.2552, "step": 7260 }, { "epoch": 0.6640482279868469, "grad_norm": 4.440527439117432, "learning_rate": 9.55821071731688e-06, "loss": 0.2403, "step": 7270 }, { "epoch": 0.6649616368286445, "grad_norm": 3.0708205699920654, "learning_rate": 9.556024126231513e-06, "loss": 0.2172, "step": 7280 }, { "epoch": 0.6658750456704421, "grad_norm": 2.812324285507202, "learning_rate": 9.553832388714078e-06, "loss": 0.2286, "step": 7290 }, { "epoch": 0.6667884545122397, "grad_norm": 5.3298234939575195, "learning_rate": 9.551635507240332e-06, "loss": 0.2501, "step": 7300 }, { "epoch": 0.6677018633540373, "grad_norm": 2.2287728786468506, "learning_rate": 9.54943348429185e-06, "loss": 0.2185, "step": 7310 }, { "epoch": 0.6686152721958348, "grad_norm": 4.497725009918213, "learning_rate": 9.547226322356012e-06, "loss": 0.2484, "step": 7320 }, { "epoch": 0.6695286810376324, "grad_norm": 4.1438679695129395, "learning_rate": 9.545014023926e-06, "loss": 0.2049, "step": 7330 }, { "epoch": 0.67044208987943, "grad_norm": 3.0879323482513428, "learning_rate": 9.542796591500805e-06, "loss": 0.2746, "step": 7340 }, { "epoch": 0.6713554987212276, "grad_norm": 3.282824993133545, "learning_rate": 9.540574027585207e-06, "loss": 0.3046, "step": 7350 }, { "epoch": 0.6722689075630253, "grad_norm": 2.2861368656158447, "learning_rate": 9.538346334689794e-06, "loss": 0.3076, "step": 7360 }, { "epoch": 0.6731823164048228, "grad_norm": 1.7921628952026367, "learning_rate": 9.536113515330938e-06, "loss": 0.2409, "step": 7370 }, { "epoch": 0.6740957252466204, "grad_norm": 4.977587699890137, "learning_rate": 9.533875572030808e-06, "loss": 0.3193, "step": 7380 }, { "epoch": 0.675009134088418, "grad_norm": 4.446091175079346, "learning_rate": 9.53163250731736e-06, "loss": 0.2388, "step": 7390 }, { "epoch": 0.6759225429302156, "grad_norm": 1.815018653869629, "learning_rate": 9.52938432372433e-06, "loss": 0.2687, "step": 7400 }, { "epoch": 0.6768359517720132, "grad_norm": 2.618931531906128, "learning_rate": 9.527131023791245e-06, "loss": 0.2878, "step": 7410 }, { "epoch": 0.6777493606138107, "grad_norm": 8.465574264526367, "learning_rate": 9.524872610063403e-06, "loss": 0.2691, "step": 7420 }, { "epoch": 0.6786627694556083, "grad_norm": 4.465149402618408, "learning_rate": 9.522609085091882e-06, "loss": 0.2557, "step": 7430 }, { "epoch": 0.6795761782974059, "grad_norm": 3.722456932067871, "learning_rate": 9.520340451433535e-06, "loss": 0.2606, "step": 7440 }, { "epoch": 0.6804895871392035, "grad_norm": 3.374316453933716, "learning_rate": 9.518066711650983e-06, "loss": 0.2246, "step": 7450 }, { "epoch": 0.6814029959810011, "grad_norm": 5.39083194732666, "learning_rate": 9.51578786831262e-06, "loss": 0.3046, "step": 7460 }, { "epoch": 0.6823164048227987, "grad_norm": 3.2873692512512207, "learning_rate": 9.513503923992595e-06, "loss": 0.2582, "step": 7470 }, { "epoch": 0.6832298136645962, "grad_norm": 2.2245473861694336, "learning_rate": 9.51121488127083e-06, "loss": 0.2054, "step": 7480 }, { "epoch": 0.6841432225063938, "grad_norm": 3.187519073486328, "learning_rate": 9.508920742732998e-06, "loss": 0.2507, "step": 7490 }, { "epoch": 0.6850566313481915, "grad_norm": 1.595237374305725, "learning_rate": 9.506621510970532e-06, "loss": 0.2411, "step": 7500 }, { "epoch": 0.6859700401899891, "grad_norm": 3.7285914421081543, "learning_rate": 9.504317188580618e-06, "loss": 0.2644, "step": 7510 }, { "epoch": 0.6868834490317867, "grad_norm": 2.7030630111694336, "learning_rate": 9.502007778166193e-06, "loss": 0.2378, "step": 7520 }, { "epoch": 0.6877968578735842, "grad_norm": 2.5020875930786133, "learning_rate": 9.49969328233594e-06, "loss": 0.3079, "step": 7530 }, { "epoch": 0.6887102667153818, "grad_norm": 2.9254977703094482, "learning_rate": 9.497373703704284e-06, "loss": 0.2319, "step": 7540 }, { "epoch": 0.6896236755571794, "grad_norm": 5.802700996398926, "learning_rate": 9.495049044891398e-06, "loss": 0.218, "step": 7550 }, { "epoch": 0.690537084398977, "grad_norm": 2.1314198970794678, "learning_rate": 9.492719308523186e-06, "loss": 0.2801, "step": 7560 }, { "epoch": 0.6914504932407746, "grad_norm": 2.9095239639282227, "learning_rate": 9.490384497231292e-06, "loss": 0.2291, "step": 7570 }, { "epoch": 0.6923639020825721, "grad_norm": 2.148696184158325, "learning_rate": 9.488044613653095e-06, "loss": 0.2108, "step": 7580 }, { "epoch": 0.6932773109243697, "grad_norm": 5.434717655181885, "learning_rate": 9.485699660431698e-06, "loss": 0.3044, "step": 7590 }, { "epoch": 0.6941907197661673, "grad_norm": 3.1283085346221924, "learning_rate": 9.483349640215933e-06, "loss": 0.2547, "step": 7600 }, { "epoch": 0.6951041286079649, "grad_norm": 3.668691635131836, "learning_rate": 9.480994555660354e-06, "loss": 0.2176, "step": 7610 }, { "epoch": 0.6960175374497625, "grad_norm": 5.080607891082764, "learning_rate": 9.478634409425241e-06, "loss": 0.3217, "step": 7620 }, { "epoch": 0.69693094629156, "grad_norm": 3.371673345565796, "learning_rate": 9.476269204176583e-06, "loss": 0.2312, "step": 7630 }, { "epoch": 0.6978443551333577, "grad_norm": 4.669818878173828, "learning_rate": 9.473898942586092e-06, "loss": 0.2818, "step": 7640 }, { "epoch": 0.6987577639751553, "grad_norm": 2.66201114654541, "learning_rate": 9.47152362733119e-06, "loss": 0.2572, "step": 7650 }, { "epoch": 0.6996711728169529, "grad_norm": 2.989013195037842, "learning_rate": 9.469143261094999e-06, "loss": 0.2662, "step": 7660 }, { "epoch": 0.7005845816587505, "grad_norm": 3.307216167449951, "learning_rate": 9.466757846566359e-06, "loss": 0.2782, "step": 7670 }, { "epoch": 0.7014979905005481, "grad_norm": 4.41278076171875, "learning_rate": 9.464367386439804e-06, "loss": 0.3263, "step": 7680 }, { "epoch": 0.7024113993423456, "grad_norm": 2.4941537380218506, "learning_rate": 9.461971883415571e-06, "loss": 0.2689, "step": 7690 }, { "epoch": 0.7033248081841432, "grad_norm": 3.3912482261657715, "learning_rate": 9.459571340199591e-06, "loss": 0.3646, "step": 7700 }, { "epoch": 0.7042382170259408, "grad_norm": 5.2638936042785645, "learning_rate": 9.457165759503492e-06, "loss": 0.3311, "step": 7710 }, { "epoch": 0.7051516258677384, "grad_norm": 3.4016637802124023, "learning_rate": 9.45475514404459e-06, "loss": 0.2828, "step": 7720 }, { "epoch": 0.706065034709536, "grad_norm": 2.36118221282959, "learning_rate": 9.452339496545887e-06, "loss": 0.2503, "step": 7730 }, { "epoch": 0.7069784435513335, "grad_norm": 4.024897575378418, "learning_rate": 9.449918819736069e-06, "loss": 0.2656, "step": 7740 }, { "epoch": 0.7078918523931311, "grad_norm": 3.4626922607421875, "learning_rate": 9.447493116349506e-06, "loss": 0.2327, "step": 7750 }, { "epoch": 0.7088052612349287, "grad_norm": 2.613381862640381, "learning_rate": 9.445062389126245e-06, "loss": 0.2909, "step": 7760 }, { "epoch": 0.7097186700767263, "grad_norm": 2.6797566413879395, "learning_rate": 9.442626640812007e-06, "loss": 0.2448, "step": 7770 }, { "epoch": 0.710632078918524, "grad_norm": 3.3342456817626953, "learning_rate": 9.440185874158185e-06, "loss": 0.3402, "step": 7780 }, { "epoch": 0.7115454877603216, "grad_norm": 1.8249295949935913, "learning_rate": 9.437740091921842e-06, "loss": 0.2605, "step": 7790 }, { "epoch": 0.7124588966021191, "grad_norm": 2.6762423515319824, "learning_rate": 9.435289296865703e-06, "loss": 0.2445, "step": 7800 }, { "epoch": 0.7133723054439167, "grad_norm": 3.3107216358184814, "learning_rate": 9.432833491758158e-06, "loss": 0.2742, "step": 7810 }, { "epoch": 0.7142857142857143, "grad_norm": 3.581094264984131, "learning_rate": 9.430372679373259e-06, "loss": 0.2832, "step": 7820 }, { "epoch": 0.7151991231275119, "grad_norm": 4.019885540008545, "learning_rate": 9.427906862490707e-06, "loss": 0.263, "step": 7830 }, { "epoch": 0.7161125319693095, "grad_norm": 4.593732833862305, "learning_rate": 9.425436043895861e-06, "loss": 0.2401, "step": 7840 }, { "epoch": 0.717025940811107, "grad_norm": 2.634809970855713, "learning_rate": 9.422960226379733e-06, "loss": 0.3191, "step": 7850 }, { "epoch": 0.7179393496529046, "grad_norm": 2.12272047996521, "learning_rate": 9.420479412738972e-06, "loss": 0.3028, "step": 7860 }, { "epoch": 0.7188527584947022, "grad_norm": 2.5467209815979004, "learning_rate": 9.417993605775881e-06, "loss": 0.214, "step": 7870 }, { "epoch": 0.7197661673364998, "grad_norm": 3.7832775115966797, "learning_rate": 9.415502808298397e-06, "loss": 0.2831, "step": 7880 }, { "epoch": 0.7206795761782974, "grad_norm": 3.5471131801605225, "learning_rate": 9.413007023120095e-06, "loss": 0.304, "step": 7890 }, { "epoch": 0.721592985020095, "grad_norm": 4.563578128814697, "learning_rate": 9.410506253060187e-06, "loss": 0.2258, "step": 7900 }, { "epoch": 0.7225063938618926, "grad_norm": 2.9378485679626465, "learning_rate": 9.408000500943513e-06, "loss": 0.282, "step": 7910 }, { "epoch": 0.7234198027036902, "grad_norm": 8.241348266601562, "learning_rate": 9.40548976960054e-06, "loss": 0.239, "step": 7920 }, { "epoch": 0.7243332115454878, "grad_norm": 3.629884719848633, "learning_rate": 9.402974061867365e-06, "loss": 0.2366, "step": 7930 }, { "epoch": 0.7252466203872854, "grad_norm": 3.6054484844207764, "learning_rate": 9.400453380585697e-06, "loss": 0.2395, "step": 7940 }, { "epoch": 0.726160029229083, "grad_norm": 2.320873498916626, "learning_rate": 9.397927728602872e-06, "loss": 0.2588, "step": 7950 }, { "epoch": 0.7270734380708805, "grad_norm": 9.077363967895508, "learning_rate": 9.395397108771836e-06, "loss": 0.2457, "step": 7960 }, { "epoch": 0.7279868469126781, "grad_norm": 2.6335196495056152, "learning_rate": 9.392861523951148e-06, "loss": 0.2756, "step": 7970 }, { "epoch": 0.7289002557544757, "grad_norm": 3.1989002227783203, "learning_rate": 9.390320977004978e-06, "loss": 0.2743, "step": 7980 }, { "epoch": 0.7298136645962733, "grad_norm": 3.849510669708252, "learning_rate": 9.387775470803094e-06, "loss": 0.2306, "step": 7990 }, { "epoch": 0.7307270734380709, "grad_norm": 3.25121808052063, "learning_rate": 9.385225008220873e-06, "loss": 0.284, "step": 8000 }, { "epoch": 0.7316404822798684, "grad_norm": 2.826185464859009, "learning_rate": 9.382669592139286e-06, "loss": 0.2487, "step": 8010 }, { "epoch": 0.732553891121666, "grad_norm": 2.5829718112945557, "learning_rate": 9.380109225444902e-06, "loss": 0.2877, "step": 8020 }, { "epoch": 0.7334672999634636, "grad_norm": 2.6849732398986816, "learning_rate": 9.377543911029883e-06, "loss": 0.2984, "step": 8030 }, { "epoch": 0.7343807088052612, "grad_norm": 2.421787738800049, "learning_rate": 9.374973651791979e-06, "loss": 0.2235, "step": 8040 }, { "epoch": 0.7352941176470589, "grad_norm": 3.4947142601013184, "learning_rate": 9.372398450634521e-06, "loss": 0.2647, "step": 8050 }, { "epoch": 0.7362075264888565, "grad_norm": 2.9164063930511475, "learning_rate": 9.369818310466432e-06, "loss": 0.3071, "step": 8060 }, { "epoch": 0.737120935330654, "grad_norm": 2.4298055171966553, "learning_rate": 9.367233234202202e-06, "loss": 0.2712, "step": 8070 }, { "epoch": 0.7380343441724516, "grad_norm": 2.5011441707611084, "learning_rate": 9.36464322476191e-06, "loss": 0.3188, "step": 8080 }, { "epoch": 0.7389477530142492, "grad_norm": 3.709627628326416, "learning_rate": 9.362048285071194e-06, "loss": 0.2769, "step": 8090 }, { "epoch": 0.7398611618560468, "grad_norm": 2.6126418113708496, "learning_rate": 9.359448418061275e-06, "loss": 0.2322, "step": 8100 }, { "epoch": 0.7407745706978444, "grad_norm": 2.5875611305236816, "learning_rate": 9.356843626668926e-06, "loss": 0.2424, "step": 8110 }, { "epoch": 0.7416879795396419, "grad_norm": 2.920117139816284, "learning_rate": 9.354233913836492e-06, "loss": 0.2259, "step": 8120 }, { "epoch": 0.7426013883814395, "grad_norm": 4.307920455932617, "learning_rate": 9.351619282511875e-06, "loss": 0.2713, "step": 8130 }, { "epoch": 0.7435147972232371, "grad_norm": 8.607027053833008, "learning_rate": 9.34899973564853e-06, "loss": 0.2609, "step": 8140 }, { "epoch": 0.7444282060650347, "grad_norm": 1.4866833686828613, "learning_rate": 9.346375276205471e-06, "loss": 0.1968, "step": 8150 }, { "epoch": 0.7453416149068323, "grad_norm": 3.0430614948272705, "learning_rate": 9.343745907147253e-06, "loss": 0.2719, "step": 8160 }, { "epoch": 0.7462550237486298, "grad_norm": 5.265416622161865, "learning_rate": 9.341111631443981e-06, "loss": 0.2515, "step": 8170 }, { "epoch": 0.7471684325904274, "grad_norm": 4.513857364654541, "learning_rate": 9.338472452071305e-06, "loss": 0.2889, "step": 8180 }, { "epoch": 0.7480818414322251, "grad_norm": 3.561439037322998, "learning_rate": 9.33582837201041e-06, "loss": 0.2726, "step": 8190 }, { "epoch": 0.7489952502740227, "grad_norm": 3.4749279022216797, "learning_rate": 9.333179394248017e-06, "loss": 0.2984, "step": 8200 }, { "epoch": 0.7499086591158203, "grad_norm": 4.101763725280762, "learning_rate": 9.330525521776383e-06, "loss": 0.252, "step": 8210 }, { "epoch": 0.7508220679576179, "grad_norm": 4.014924049377441, "learning_rate": 9.32786675759329e-06, "loss": 0.2356, "step": 8220 }, { "epoch": 0.7517354767994154, "grad_norm": 2.355116128921509, "learning_rate": 9.325203104702048e-06, "loss": 0.3196, "step": 8230 }, { "epoch": 0.752648885641213, "grad_norm": 3.4083380699157715, "learning_rate": 9.322534566111488e-06, "loss": 0.2781, "step": 8240 }, { "epoch": 0.7535622944830106, "grad_norm": 4.450926303863525, "learning_rate": 9.319861144835958e-06, "loss": 0.2663, "step": 8250 }, { "epoch": 0.7544757033248082, "grad_norm": 6.058822154998779, "learning_rate": 9.317182843895327e-06, "loss": 0.2261, "step": 8260 }, { "epoch": 0.7553891121666058, "grad_norm": 8.411591529846191, "learning_rate": 9.314499666314974e-06, "loss": 0.2223, "step": 8270 }, { "epoch": 0.7563025210084033, "grad_norm": 3.532818555831909, "learning_rate": 9.311811615125782e-06, "loss": 0.2909, "step": 8280 }, { "epoch": 0.7572159298502009, "grad_norm": 4.119094371795654, "learning_rate": 9.309118693364141e-06, "loss": 0.2807, "step": 8290 }, { "epoch": 0.7581293386919985, "grad_norm": 2.60502028465271, "learning_rate": 9.306420904071949e-06, "loss": 0.3002, "step": 8300 }, { "epoch": 0.7590427475337961, "grad_norm": 4.625072956085205, "learning_rate": 9.303718250296596e-06, "loss": 0.3018, "step": 8310 }, { "epoch": 0.7599561563755937, "grad_norm": 2.185211420059204, "learning_rate": 9.301010735090965e-06, "loss": 0.2848, "step": 8320 }, { "epoch": 0.7608695652173914, "grad_norm": 2.5400147438049316, "learning_rate": 9.298298361513438e-06, "loss": 0.2461, "step": 8330 }, { "epoch": 0.7617829740591889, "grad_norm": 3.3778038024902344, "learning_rate": 9.29558113262788e-06, "loss": 0.225, "step": 8340 }, { "epoch": 0.7626963829009865, "grad_norm": 2.125464677810669, "learning_rate": 9.292859051503638e-06, "loss": 0.2224, "step": 8350 }, { "epoch": 0.7636097917427841, "grad_norm": 2.717653512954712, "learning_rate": 9.290132121215546e-06, "loss": 0.2979, "step": 8360 }, { "epoch": 0.7645232005845817, "grad_norm": 4.01572847366333, "learning_rate": 9.287400344843915e-06, "loss": 0.2492, "step": 8370 }, { "epoch": 0.7654366094263793, "grad_norm": 3.0839383602142334, "learning_rate": 9.284663725474524e-06, "loss": 0.3136, "step": 8380 }, { "epoch": 0.7663500182681768, "grad_norm": 3.955944776535034, "learning_rate": 9.281922266198628e-06, "loss": 0.2727, "step": 8390 }, { "epoch": 0.7672634271099744, "grad_norm": 3.1806459426879883, "learning_rate": 9.279175970112948e-06, "loss": 0.241, "step": 8400 }, { "epoch": 0.768176835951772, "grad_norm": 3.1157124042510986, "learning_rate": 9.27642484031967e-06, "loss": 0.2514, "step": 8410 }, { "epoch": 0.7690902447935696, "grad_norm": 5.9353485107421875, "learning_rate": 9.273668879926433e-06, "loss": 0.2517, "step": 8420 }, { "epoch": 0.7700036536353672, "grad_norm": 2.2843568325042725, "learning_rate": 9.270908092046342e-06, "loss": 0.3231, "step": 8430 }, { "epoch": 0.7709170624771647, "grad_norm": 3.2940640449523926, "learning_rate": 9.268142479797952e-06, "loss": 0.2629, "step": 8440 }, { "epoch": 0.7718304713189623, "grad_norm": 3.236586570739746, "learning_rate": 9.265372046305263e-06, "loss": 0.2557, "step": 8450 }, { "epoch": 0.77274388016076, "grad_norm": 4.368768215179443, "learning_rate": 9.262596794697728e-06, "loss": 0.2574, "step": 8460 }, { "epoch": 0.7736572890025576, "grad_norm": 2.1249454021453857, "learning_rate": 9.259816728110236e-06, "loss": 0.2348, "step": 8470 }, { "epoch": 0.7745706978443552, "grad_norm": 5.401440143585205, "learning_rate": 9.25703184968312e-06, "loss": 0.2582, "step": 8480 }, { "epoch": 0.7754841066861528, "grad_norm": 4.54609489440918, "learning_rate": 9.254242162562147e-06, "loss": 0.2427, "step": 8490 }, { "epoch": 0.7763975155279503, "grad_norm": 2.937267780303955, "learning_rate": 9.251447669898511e-06, "loss": 0.2638, "step": 8500 }, { "epoch": 0.7773109243697479, "grad_norm": 1.82789945602417, "learning_rate": 9.248648374848845e-06, "loss": 0.2529, "step": 8510 }, { "epoch": 0.7782243332115455, "grad_norm": 3.499145984649658, "learning_rate": 9.245844280575196e-06, "loss": 0.2389, "step": 8520 }, { "epoch": 0.7791377420533431, "grad_norm": 2.879544258117676, "learning_rate": 9.243035390245037e-06, "loss": 0.2434, "step": 8530 }, { "epoch": 0.7800511508951407, "grad_norm": 5.4327712059021, "learning_rate": 9.240221707031258e-06, "loss": 0.229, "step": 8540 }, { "epoch": 0.7809645597369382, "grad_norm": 7.511201858520508, "learning_rate": 9.237403234112164e-06, "loss": 0.2806, "step": 8550 }, { "epoch": 0.7818779685787358, "grad_norm": 3.7943480014801025, "learning_rate": 9.234579974671468e-06, "loss": 0.2896, "step": 8560 }, { "epoch": 0.7827913774205334, "grad_norm": 2.5843167304992676, "learning_rate": 9.231751931898292e-06, "loss": 0.3215, "step": 8570 }, { "epoch": 0.783704786262331, "grad_norm": 2.569793939590454, "learning_rate": 9.22891910898716e-06, "loss": 0.2378, "step": 8580 }, { "epoch": 0.7846181951041286, "grad_norm": 2.940124988555908, "learning_rate": 9.226081509137995e-06, "loss": 0.271, "step": 8590 }, { "epoch": 0.7855316039459262, "grad_norm": 3.283169746398926, "learning_rate": 9.223239135556119e-06, "loss": 0.229, "step": 8600 }, { "epoch": 0.7864450127877238, "grad_norm": 4.4125590324401855, "learning_rate": 9.220391991452244e-06, "loss": 0.2304, "step": 8610 }, { "epoch": 0.7873584216295214, "grad_norm": 3.30393385887146, "learning_rate": 9.21754008004247e-06, "loss": 0.2963, "step": 8620 }, { "epoch": 0.788271830471319, "grad_norm": 3.517691135406494, "learning_rate": 9.214683404548282e-06, "loss": 0.2447, "step": 8630 }, { "epoch": 0.7891852393131166, "grad_norm": 3.5298359394073486, "learning_rate": 9.211821968196551e-06, "loss": 0.2018, "step": 8640 }, { "epoch": 0.7900986481549142, "grad_norm": 2.976083517074585, "learning_rate": 9.20895577421952e-06, "loss": 0.2785, "step": 8650 }, { "epoch": 0.7910120569967117, "grad_norm": 2.368382215499878, "learning_rate": 9.206084825854807e-06, "loss": 0.2688, "step": 8660 }, { "epoch": 0.7919254658385093, "grad_norm": 2.858123779296875, "learning_rate": 9.203209126345403e-06, "loss": 0.2722, "step": 8670 }, { "epoch": 0.7928388746803069, "grad_norm": 3.5288355350494385, "learning_rate": 9.200328678939668e-06, "loss": 0.2742, "step": 8680 }, { "epoch": 0.7937522835221045, "grad_norm": 2.280726432800293, "learning_rate": 9.197443486891318e-06, "loss": 0.268, "step": 8690 }, { "epoch": 0.794665692363902, "grad_norm": 2.6036622524261475, "learning_rate": 9.194553553459435e-06, "loss": 0.3126, "step": 8700 }, { "epoch": 0.7955791012056996, "grad_norm": 3.121222496032715, "learning_rate": 9.191658881908454e-06, "loss": 0.2864, "step": 8710 }, { "epoch": 0.7964925100474972, "grad_norm": 2.4715633392333984, "learning_rate": 9.18875947550816e-06, "loss": 0.226, "step": 8720 }, { "epoch": 0.7974059188892948, "grad_norm": 3.2132253646850586, "learning_rate": 9.185855337533693e-06, "loss": 0.2158, "step": 8730 }, { "epoch": 0.7983193277310925, "grad_norm": 3.0138769149780273, "learning_rate": 9.18294647126553e-06, "loss": 0.2406, "step": 8740 }, { "epoch": 0.7992327365728901, "grad_norm": 5.332334995269775, "learning_rate": 9.180032879989493e-06, "loss": 0.2445, "step": 8750 }, { "epoch": 0.8001461454146876, "grad_norm": 4.50657844543457, "learning_rate": 9.177114566996744e-06, "loss": 0.2124, "step": 8760 }, { "epoch": 0.8010595542564852, "grad_norm": 3.464733362197876, "learning_rate": 9.174191535583772e-06, "loss": 0.2759, "step": 8770 }, { "epoch": 0.8019729630982828, "grad_norm": 3.3400285243988037, "learning_rate": 9.171263789052402e-06, "loss": 0.2899, "step": 8780 }, { "epoch": 0.8028863719400804, "grad_norm": 3.3917715549468994, "learning_rate": 9.168331330709782e-06, "loss": 0.2381, "step": 8790 }, { "epoch": 0.803799780781878, "grad_norm": 2.8321774005889893, "learning_rate": 9.165394163868382e-06, "loss": 0.2684, "step": 8800 }, { "epoch": 0.8047131896236756, "grad_norm": 1.877874732017517, "learning_rate": 9.162452291845991e-06, "loss": 0.2584, "step": 8810 }, { "epoch": 0.8056265984654731, "grad_norm": 2.1152772903442383, "learning_rate": 9.159505717965715e-06, "loss": 0.1975, "step": 8820 }, { "epoch": 0.8065400073072707, "grad_norm": 4.165322780609131, "learning_rate": 9.15655444555597e-06, "loss": 0.2935, "step": 8830 }, { "epoch": 0.8074534161490683, "grad_norm": 1.910017490386963, "learning_rate": 9.15359847795048e-06, "loss": 0.2995, "step": 8840 }, { "epoch": 0.8083668249908659, "grad_norm": 2.784256935119629, "learning_rate": 9.150637818488269e-06, "loss": 0.2197, "step": 8850 }, { "epoch": 0.8092802338326635, "grad_norm": 10.5337553024292, "learning_rate": 9.147672470513664e-06, "loss": 0.2632, "step": 8860 }, { "epoch": 0.810193642674461, "grad_norm": 2.324427604675293, "learning_rate": 9.144702437376289e-06, "loss": 0.2441, "step": 8870 }, { "epoch": 0.8111070515162587, "grad_norm": 2.0047872066497803, "learning_rate": 9.141727722431058e-06, "loss": 0.2641, "step": 8880 }, { "epoch": 0.8120204603580563, "grad_norm": 3.9362871646881104, "learning_rate": 9.138748329038175e-06, "loss": 0.2421, "step": 8890 }, { "epoch": 0.8129338691998539, "grad_norm": 3.354377508163452, "learning_rate": 9.135764260563131e-06, "loss": 0.2534, "step": 8900 }, { "epoch": 0.8138472780416515, "grad_norm": 8.210514068603516, "learning_rate": 9.132775520376692e-06, "loss": 0.2338, "step": 8910 }, { "epoch": 0.814760686883449, "grad_norm": 2.403374195098877, "learning_rate": 9.129782111854903e-06, "loss": 0.2581, "step": 8920 }, { "epoch": 0.8156740957252466, "grad_norm": 2.41560697555542, "learning_rate": 9.126784038379087e-06, "loss": 0.2207, "step": 8930 }, { "epoch": 0.8165875045670442, "grad_norm": 2.166792154312134, "learning_rate": 9.123781303335831e-06, "loss": 0.2326, "step": 8940 }, { "epoch": 0.8175009134088418, "grad_norm": 9.84984302520752, "learning_rate": 9.120773910116988e-06, "loss": 0.2303, "step": 8950 }, { "epoch": 0.8184143222506394, "grad_norm": 2.768390417098999, "learning_rate": 9.11776186211968e-06, "loss": 0.2536, "step": 8960 }, { "epoch": 0.819327731092437, "grad_norm": 2.607318639755249, "learning_rate": 9.114745162746277e-06, "loss": 0.2504, "step": 8970 }, { "epoch": 0.8202411399342345, "grad_norm": 2.3159570693969727, "learning_rate": 9.111723815404409e-06, "loss": 0.2784, "step": 8980 }, { "epoch": 0.8211545487760321, "grad_norm": 2.9916293621063232, "learning_rate": 9.108697823506956e-06, "loss": 0.2062, "step": 8990 }, { "epoch": 0.8220679576178297, "grad_norm": 8.775740623474121, "learning_rate": 9.105667190472041e-06, "loss": 0.3239, "step": 9000 }, { "epoch": 0.8229813664596274, "grad_norm": 4.478304862976074, "learning_rate": 9.102631919723034e-06, "loss": 0.2801, "step": 9010 }, { "epoch": 0.823894775301425, "grad_norm": 4.523096561431885, "learning_rate": 9.09959201468854e-06, "loss": 0.2771, "step": 9020 }, { "epoch": 0.8248081841432225, "grad_norm": 3.1738572120666504, "learning_rate": 9.096547478802404e-06, "loss": 0.2264, "step": 9030 }, { "epoch": 0.8257215929850201, "grad_norm": 5.142342567443848, "learning_rate": 9.093498315503694e-06, "loss": 0.3014, "step": 9040 }, { "epoch": 0.8266350018268177, "grad_norm": 4.913907051086426, "learning_rate": 9.090444528236713e-06, "loss": 0.2312, "step": 9050 }, { "epoch": 0.8275484106686153, "grad_norm": 2.508955478668213, "learning_rate": 9.08738612045098e-06, "loss": 0.2106, "step": 9060 }, { "epoch": 0.8284618195104129, "grad_norm": 2.518767833709717, "learning_rate": 9.08432309560124e-06, "loss": 0.246, "step": 9070 }, { "epoch": 0.8293752283522104, "grad_norm": 3.5293400287628174, "learning_rate": 9.08125545714745e-06, "loss": 0.2963, "step": 9080 }, { "epoch": 0.830288637194008, "grad_norm": 3.3181793689727783, "learning_rate": 9.078183208554778e-06, "loss": 0.3379, "step": 9090 }, { "epoch": 0.8312020460358056, "grad_norm": 3.747316837310791, "learning_rate": 9.0751063532936e-06, "loss": 0.2202, "step": 9100 }, { "epoch": 0.8321154548776032, "grad_norm": 3.4723596572875977, "learning_rate": 9.0720248948395e-06, "loss": 0.2454, "step": 9110 }, { "epoch": 0.8330288637194008, "grad_norm": 3.627567768096924, "learning_rate": 9.068938836673251e-06, "loss": 0.2719, "step": 9120 }, { "epoch": 0.8339422725611984, "grad_norm": 4.03235387802124, "learning_rate": 9.065848182280835e-06, "loss": 0.2563, "step": 9130 }, { "epoch": 0.8348556814029959, "grad_norm": 3.554941177368164, "learning_rate": 9.062752935153416e-06, "loss": 0.2991, "step": 9140 }, { "epoch": 0.8357690902447936, "grad_norm": 2.2698261737823486, "learning_rate": 9.059653098787349e-06, "loss": 0.2124, "step": 9150 }, { "epoch": 0.8366824990865912, "grad_norm": 2.995501756668091, "learning_rate": 9.056548676684174e-06, "loss": 0.2238, "step": 9160 }, { "epoch": 0.8375959079283888, "grad_norm": 2.928321599960327, "learning_rate": 9.053439672350612e-06, "loss": 0.2393, "step": 9170 }, { "epoch": 0.8385093167701864, "grad_norm": 3.533752202987671, "learning_rate": 9.050326089298555e-06, "loss": 0.2285, "step": 9180 }, { "epoch": 0.8394227256119839, "grad_norm": 2.3301727771759033, "learning_rate": 9.047207931045073e-06, "loss": 0.2744, "step": 9190 }, { "epoch": 0.8403361344537815, "grad_norm": 2.464972972869873, "learning_rate": 9.0440852011124e-06, "loss": 0.2306, "step": 9200 }, { "epoch": 0.8412495432955791, "grad_norm": 3.468233585357666, "learning_rate": 9.040957903027937e-06, "loss": 0.2357, "step": 9210 }, { "epoch": 0.8421629521373767, "grad_norm": 3.2693750858306885, "learning_rate": 9.037826040324243e-06, "loss": 0.2936, "step": 9220 }, { "epoch": 0.8430763609791743, "grad_norm": 2.9886083602905273, "learning_rate": 9.034689616539031e-06, "loss": 0.295, "step": 9230 }, { "epoch": 0.8439897698209718, "grad_norm": 3.6031527519226074, "learning_rate": 9.031548635215174e-06, "loss": 0.2348, "step": 9240 }, { "epoch": 0.8449031786627694, "grad_norm": 3.4606499671936035, "learning_rate": 9.028403099900685e-06, "loss": 0.199, "step": 9250 }, { "epoch": 0.845816587504567, "grad_norm": 3.1294021606445312, "learning_rate": 9.025253014148724e-06, "loss": 0.257, "step": 9260 }, { "epoch": 0.8467299963463646, "grad_norm": 6.265628814697266, "learning_rate": 9.022098381517593e-06, "loss": 0.2773, "step": 9270 }, { "epoch": 0.8476434051881622, "grad_norm": 1.7644050121307373, "learning_rate": 9.018939205570727e-06, "loss": 0.244, "step": 9280 }, { "epoch": 0.8485568140299599, "grad_norm": 2.713165521621704, "learning_rate": 9.015775489876697e-06, "loss": 0.2091, "step": 9290 }, { "epoch": 0.8494702228717574, "grad_norm": 3.9214985370635986, "learning_rate": 9.012607238009196e-06, "loss": 0.2113, "step": 9300 }, { "epoch": 0.850383631713555, "grad_norm": 2.272008180618286, "learning_rate": 9.009434453547046e-06, "loss": 0.2598, "step": 9310 }, { "epoch": 0.8512970405553526, "grad_norm": 3.7626376152038574, "learning_rate": 9.00625714007419e-06, "loss": 0.3222, "step": 9320 }, { "epoch": 0.8522104493971502, "grad_norm": 2.774144172668457, "learning_rate": 9.003075301179683e-06, "loss": 0.2909, "step": 9330 }, { "epoch": 0.8531238582389478, "grad_norm": 9.038482666015625, "learning_rate": 8.99988894045769e-06, "loss": 0.2439, "step": 9340 }, { "epoch": 0.8540372670807453, "grad_norm": 3.35915207862854, "learning_rate": 8.996698061507492e-06, "loss": 0.215, "step": 9350 }, { "epoch": 0.8549506759225429, "grad_norm": 5.083891868591309, "learning_rate": 8.993502667933467e-06, "loss": 0.2633, "step": 9360 }, { "epoch": 0.8558640847643405, "grad_norm": 2.144519805908203, "learning_rate": 8.990302763345094e-06, "loss": 0.2297, "step": 9370 }, { "epoch": 0.8567774936061381, "grad_norm": 3.4722886085510254, "learning_rate": 8.98709835135695e-06, "loss": 0.292, "step": 9380 }, { "epoch": 0.8576909024479357, "grad_norm": 3.9279263019561768, "learning_rate": 8.983889435588698e-06, "loss": 0.2383, "step": 9390 }, { "epoch": 0.8586043112897332, "grad_norm": 5.518746376037598, "learning_rate": 8.980676019665096e-06, "loss": 0.3363, "step": 9400 }, { "epoch": 0.8595177201315308, "grad_norm": 1.8033549785614014, "learning_rate": 8.977458107215977e-06, "loss": 0.2685, "step": 9410 }, { "epoch": 0.8604311289733284, "grad_norm": 4.415343284606934, "learning_rate": 8.974235701876264e-06, "loss": 0.2219, "step": 9420 }, { "epoch": 0.8613445378151261, "grad_norm": 3.8517894744873047, "learning_rate": 8.971008807285944e-06, "loss": 0.3026, "step": 9430 }, { "epoch": 0.8622579466569237, "grad_norm": 4.49314022064209, "learning_rate": 8.96777742709008e-06, "loss": 0.2292, "step": 9440 }, { "epoch": 0.8631713554987213, "grad_norm": 4.355213165283203, "learning_rate": 8.964541564938803e-06, "loss": 0.2957, "step": 9450 }, { "epoch": 0.8640847643405188, "grad_norm": 3.820964813232422, "learning_rate": 8.961301224487305e-06, "loss": 0.2831, "step": 9460 }, { "epoch": 0.8649981731823164, "grad_norm": 2.3454527854919434, "learning_rate": 8.958056409395836e-06, "loss": 0.2517, "step": 9470 }, { "epoch": 0.865911582024114, "grad_norm": 3.505007266998291, "learning_rate": 8.954807123329703e-06, "loss": 0.2569, "step": 9480 }, { "epoch": 0.8668249908659116, "grad_norm": 4.667683124542236, "learning_rate": 8.951553369959262e-06, "loss": 0.2358, "step": 9490 }, { "epoch": 0.8677383997077092, "grad_norm": 19.075889587402344, "learning_rate": 8.948295152959915e-06, "loss": 0.2486, "step": 9500 }, { "epoch": 0.8686518085495067, "grad_norm": 3.501936197280884, "learning_rate": 8.945032476012105e-06, "loss": 0.2753, "step": 9510 }, { "epoch": 0.8695652173913043, "grad_norm": 5.849125862121582, "learning_rate": 8.941765342801314e-06, "loss": 0.2342, "step": 9520 }, { "epoch": 0.8704786262331019, "grad_norm": 4.45645809173584, "learning_rate": 8.93849375701806e-06, "loss": 0.2275, "step": 9530 }, { "epoch": 0.8713920350748995, "grad_norm": 1.8996561765670776, "learning_rate": 8.935217722357887e-06, "loss": 0.2349, "step": 9540 }, { "epoch": 0.8723054439166971, "grad_norm": 2.4494168758392334, "learning_rate": 8.931937242521365e-06, "loss": 0.2628, "step": 9550 }, { "epoch": 0.8732188527584948, "grad_norm": 8.486907958984375, "learning_rate": 8.928652321214086e-06, "loss": 0.21, "step": 9560 }, { "epoch": 0.8741322616002923, "grad_norm": 3.114189386367798, "learning_rate": 8.92536296214666e-06, "loss": 0.2858, "step": 9570 }, { "epoch": 0.8750456704420899, "grad_norm": 1.5911468267440796, "learning_rate": 8.922069169034709e-06, "loss": 0.2466, "step": 9580 }, { "epoch": 0.8759590792838875, "grad_norm": 3.6805219650268555, "learning_rate": 8.918770945598864e-06, "loss": 0.2674, "step": 9590 }, { "epoch": 0.8768724881256851, "grad_norm": 3.5699830055236816, "learning_rate": 8.915468295564758e-06, "loss": 0.2208, "step": 9600 }, { "epoch": 0.8777858969674827, "grad_norm": 3.003600835800171, "learning_rate": 8.91216122266303e-06, "loss": 0.2308, "step": 9610 }, { "epoch": 0.8786993058092802, "grad_norm": 2.6410698890686035, "learning_rate": 8.908849730629305e-06, "loss": 0.3311, "step": 9620 }, { "epoch": 0.8796127146510778, "grad_norm": 4.3253865242004395, "learning_rate": 8.905533823204213e-06, "loss": 0.2316, "step": 9630 }, { "epoch": 0.8805261234928754, "grad_norm": 5.423803329467773, "learning_rate": 8.90221350413336e-06, "loss": 0.2877, "step": 9640 }, { "epoch": 0.881439532334673, "grad_norm": 4.704559803009033, "learning_rate": 8.89888877716734e-06, "loss": 0.2702, "step": 9650 }, { "epoch": 0.8823529411764706, "grad_norm": 4.937850475311279, "learning_rate": 8.89555964606173e-06, "loss": 0.2204, "step": 9660 }, { "epoch": 0.8832663500182681, "grad_norm": 2.3993799686431885, "learning_rate": 8.892226114577073e-06, "loss": 0.2148, "step": 9670 }, { "epoch": 0.8841797588600657, "grad_norm": 3.421501398086548, "learning_rate": 8.888888186478892e-06, "loss": 0.2679, "step": 9680 }, { "epoch": 0.8850931677018633, "grad_norm": 5.602266311645508, "learning_rate": 8.885545865537667e-06, "loss": 0.2382, "step": 9690 }, { "epoch": 0.886006576543661, "grad_norm": 5.682740688323975, "learning_rate": 8.88219915552885e-06, "loss": 0.2616, "step": 9700 }, { "epoch": 0.8869199853854586, "grad_norm": 3.207764148712158, "learning_rate": 8.878848060232843e-06, "loss": 0.2844, "step": 9710 }, { "epoch": 0.8878333942272562, "grad_norm": 2.4764068126678467, "learning_rate": 8.875492583435006e-06, "loss": 0.2156, "step": 9720 }, { "epoch": 0.8887468030690537, "grad_norm": 3.576754570007324, "learning_rate": 8.872132728925645e-06, "loss": 0.2568, "step": 9730 }, { "epoch": 0.8896602119108513, "grad_norm": 2.8046793937683105, "learning_rate": 8.868768500500013e-06, "loss": 0.2468, "step": 9740 }, { "epoch": 0.8905736207526489, "grad_norm": 2.599506378173828, "learning_rate": 8.865399901958305e-06, "loss": 0.245, "step": 9750 }, { "epoch": 0.8914870295944465, "grad_norm": 1.6671375036239624, "learning_rate": 8.862026937105649e-06, "loss": 0.2128, "step": 9760 }, { "epoch": 0.8924004384362441, "grad_norm": 2.3490946292877197, "learning_rate": 8.858649609752106e-06, "loss": 0.2322, "step": 9770 }, { "epoch": 0.8933138472780416, "grad_norm": 3.171074151992798, "learning_rate": 8.85526792371267e-06, "loss": 0.2512, "step": 9780 }, { "epoch": 0.8942272561198392, "grad_norm": 3.9374029636383057, "learning_rate": 8.851881882807252e-06, "loss": 0.2577, "step": 9790 }, { "epoch": 0.8951406649616368, "grad_norm": 2.0398175716400146, "learning_rate": 8.848491490860684e-06, "loss": 0.272, "step": 9800 }, { "epoch": 0.8960540738034344, "grad_norm": 3.7633488178253174, "learning_rate": 8.845096751702715e-06, "loss": 0.2417, "step": 9810 }, { "epoch": 0.896967482645232, "grad_norm": 3.5872418880462646, "learning_rate": 8.841697669168e-06, "loss": 0.2872, "step": 9820 }, { "epoch": 0.8978808914870295, "grad_norm": 2.93212628364563, "learning_rate": 8.838294247096106e-06, "loss": 0.2474, "step": 9830 }, { "epoch": 0.8987943003288272, "grad_norm": 1.9602431058883667, "learning_rate": 8.8348864893315e-06, "loss": 0.2586, "step": 9840 }, { "epoch": 0.8997077091706248, "grad_norm": 3.998948812484741, "learning_rate": 8.831474399723544e-06, "loss": 0.2776, "step": 9850 }, { "epoch": 0.9006211180124224, "grad_norm": 2.9520766735076904, "learning_rate": 8.828057982126495e-06, "loss": 0.2103, "step": 9860 }, { "epoch": 0.90153452685422, "grad_norm": 4.894905090332031, "learning_rate": 8.824637240399499e-06, "loss": 0.2079, "step": 9870 }, { "epoch": 0.9024479356960176, "grad_norm": 3.7362382411956787, "learning_rate": 8.821212178406585e-06, "loss": 0.2728, "step": 9880 }, { "epoch": 0.9033613445378151, "grad_norm": 3.2286477088928223, "learning_rate": 8.817782800016665e-06, "loss": 0.2509, "step": 9890 }, { "epoch": 0.9042747533796127, "grad_norm": 4.5851216316223145, "learning_rate": 8.814349109103524e-06, "loss": 0.2449, "step": 9900 }, { "epoch": 0.9051881622214103, "grad_norm": 7.989936828613281, "learning_rate": 8.810911109545821e-06, "loss": 0.223, "step": 9910 }, { "epoch": 0.9061015710632079, "grad_norm": 3.2256860733032227, "learning_rate": 8.807468805227078e-06, "loss": 0.2026, "step": 9920 }, { "epoch": 0.9070149799050055, "grad_norm": 4.033304214477539, "learning_rate": 8.804022200035685e-06, "loss": 0.246, "step": 9930 }, { "epoch": 0.907928388746803, "grad_norm": 3.527801752090454, "learning_rate": 8.800571297864886e-06, "loss": 0.2501, "step": 9940 }, { "epoch": 0.9088417975886006, "grad_norm": 8.267197608947754, "learning_rate": 8.79711610261278e-06, "loss": 0.2363, "step": 9950 }, { "epoch": 0.9097552064303982, "grad_norm": 3.0020992755889893, "learning_rate": 8.793656618182314e-06, "loss": 0.2043, "step": 9960 }, { "epoch": 0.9106686152721958, "grad_norm": 2.9169986248016357, "learning_rate": 8.790192848481283e-06, "loss": 0.2361, "step": 9970 }, { "epoch": 0.9115820241139935, "grad_norm": 3.786872625350952, "learning_rate": 8.786724797422323e-06, "loss": 0.2798, "step": 9980 }, { "epoch": 0.912495432955791, "grad_norm": 2.5836563110351562, "learning_rate": 8.783252468922901e-06, "loss": 0.2728, "step": 9990 }, { "epoch": 0.9134088417975886, "grad_norm": 5.215598106384277, "learning_rate": 8.77977586690532e-06, "loss": 0.2776, "step": 10000 }, { "epoch": 0.9143222506393862, "grad_norm": 1.6671700477600098, "learning_rate": 8.77629499529671e-06, "loss": 0.2971, "step": 10010 }, { "epoch": 0.9152356594811838, "grad_norm": 4.225934028625488, "learning_rate": 8.772809858029024e-06, "loss": 0.2644, "step": 10020 }, { "epoch": 0.9161490683229814, "grad_norm": 5.902198314666748, "learning_rate": 8.769320459039032e-06, "loss": 0.3013, "step": 10030 }, { "epoch": 0.917062477164779, "grad_norm": 2.2894344329833984, "learning_rate": 8.765826802268318e-06, "loss": 0.2256, "step": 10040 }, { "epoch": 0.9179758860065765, "grad_norm": 5.706624507904053, "learning_rate": 8.762328891663277e-06, "loss": 0.2175, "step": 10050 }, { "epoch": 0.9188892948483741, "grad_norm": 3.4480068683624268, "learning_rate": 8.758826731175106e-06, "loss": 0.2278, "step": 10060 }, { "epoch": 0.9198027036901717, "grad_norm": 4.389881610870361, "learning_rate": 8.755320324759808e-06, "loss": 0.3055, "step": 10070 }, { "epoch": 0.9207161125319693, "grad_norm": 8.588825225830078, "learning_rate": 8.751809676378178e-06, "loss": 0.3072, "step": 10080 }, { "epoch": 0.9216295213737669, "grad_norm": 3.601212739944458, "learning_rate": 8.748294789995804e-06, "loss": 0.2173, "step": 10090 }, { "epoch": 0.9225429302155644, "grad_norm": 6.84972620010376, "learning_rate": 8.744775669583061e-06, "loss": 0.2524, "step": 10100 }, { "epoch": 0.9234563390573621, "grad_norm": 3.351142406463623, "learning_rate": 8.741252319115105e-06, "loss": 0.2517, "step": 10110 }, { "epoch": 0.9243697478991597, "grad_norm": 4.5768961906433105, "learning_rate": 8.737724742571874e-06, "loss": 0.2584, "step": 10120 }, { "epoch": 0.9252831567409573, "grad_norm": 7.116927146911621, "learning_rate": 8.734192943938075e-06, "loss": 0.283, "step": 10130 }, { "epoch": 0.9261965655827549, "grad_norm": 6.280237674713135, "learning_rate": 8.730656927203192e-06, "loss": 0.2362, "step": 10140 }, { "epoch": 0.9271099744245525, "grad_norm": 12.419675827026367, "learning_rate": 8.727116696361461e-06, "loss": 0.3036, "step": 10150 }, { "epoch": 0.92802338326635, "grad_norm": 3.5280561447143555, "learning_rate": 8.723572255411895e-06, "loss": 0.2384, "step": 10160 }, { "epoch": 0.9289367921081476, "grad_norm": 3.738837242126465, "learning_rate": 8.720023608358244e-06, "loss": 0.2677, "step": 10170 }, { "epoch": 0.9298502009499452, "grad_norm": 3.2599565982818604, "learning_rate": 8.716470759209024e-06, "loss": 0.2896, "step": 10180 }, { "epoch": 0.9307636097917428, "grad_norm": 2.3416082859039307, "learning_rate": 8.71291371197749e-06, "loss": 0.2407, "step": 10190 }, { "epoch": 0.9316770186335404, "grad_norm": 2.3987927436828613, "learning_rate": 8.709352470681644e-06, "loss": 0.2217, "step": 10200 }, { "epoch": 0.9325904274753379, "grad_norm": 3.288790464401245, "learning_rate": 8.70578703934422e-06, "loss": 0.2335, "step": 10210 }, { "epoch": 0.9335038363171355, "grad_norm": 2.347038507461548, "learning_rate": 8.70221742199269e-06, "loss": 0.2686, "step": 10220 }, { "epoch": 0.9344172451589331, "grad_norm": 2.759164571762085, "learning_rate": 8.698643622659252e-06, "loss": 0.2866, "step": 10230 }, { "epoch": 0.9353306540007307, "grad_norm": 1.9942917823791504, "learning_rate": 8.695065645380827e-06, "loss": 0.2436, "step": 10240 }, { "epoch": 0.9362440628425284, "grad_norm": 2.9251623153686523, "learning_rate": 8.691483494199057e-06, "loss": 0.2259, "step": 10250 }, { "epoch": 0.937157471684326, "grad_norm": 2.740077495574951, "learning_rate": 8.687897173160303e-06, "loss": 0.269, "step": 10260 }, { "epoch": 0.9380708805261235, "grad_norm": 3.655870199203491, "learning_rate": 8.684306686315624e-06, "loss": 0.2656, "step": 10270 }, { "epoch": 0.9389842893679211, "grad_norm": 3.0772957801818848, "learning_rate": 8.680712037720797e-06, "loss": 0.2847, "step": 10280 }, { "epoch": 0.9398976982097187, "grad_norm": 3.714373826980591, "learning_rate": 8.677113231436295e-06, "loss": 0.2931, "step": 10290 }, { "epoch": 0.9408111070515163, "grad_norm": 2.373314142227173, "learning_rate": 8.673510271527287e-06, "loss": 0.3015, "step": 10300 }, { "epoch": 0.9417245158933139, "grad_norm": 1.4742329120635986, "learning_rate": 8.669903162063637e-06, "loss": 0.2578, "step": 10310 }, { "epoch": 0.9426379247351114, "grad_norm": 3.610978603363037, "learning_rate": 8.66629190711989e-06, "loss": 0.2038, "step": 10320 }, { "epoch": 0.943551333576909, "grad_norm": 3.181845188140869, "learning_rate": 8.662676510775282e-06, "loss": 0.274, "step": 10330 }, { "epoch": 0.9444647424187066, "grad_norm": 4.567254066467285, "learning_rate": 8.659056977113721e-06, "loss": 0.3059, "step": 10340 }, { "epoch": 0.9453781512605042, "grad_norm": 6.751839637756348, "learning_rate": 8.655433310223788e-06, "loss": 0.2029, "step": 10350 }, { "epoch": 0.9462915601023018, "grad_norm": 4.812902450561523, "learning_rate": 8.651805514198738e-06, "loss": 0.2873, "step": 10360 }, { "epoch": 0.9472049689440993, "grad_norm": 2.490232229232788, "learning_rate": 8.648173593136486e-06, "loss": 0.3362, "step": 10370 }, { "epoch": 0.9481183777858969, "grad_norm": 10.693278312683105, "learning_rate": 8.644537551139612e-06, "loss": 0.2163, "step": 10380 }, { "epoch": 0.9490317866276946, "grad_norm": 1.7041294574737549, "learning_rate": 8.64089739231534e-06, "loss": 0.2296, "step": 10390 }, { "epoch": 0.9499451954694922, "grad_norm": 2.899956464767456, "learning_rate": 8.637253120775556e-06, "loss": 0.2707, "step": 10400 }, { "epoch": 0.9508586043112898, "grad_norm": 4.891416072845459, "learning_rate": 8.633604740636785e-06, "loss": 0.2459, "step": 10410 }, { "epoch": 0.9517720131530873, "grad_norm": 3.4227023124694824, "learning_rate": 8.629952256020198e-06, "loss": 0.2696, "step": 10420 }, { "epoch": 0.9526854219948849, "grad_norm": 2.8656466007232666, "learning_rate": 8.626295671051594e-06, "loss": 0.2449, "step": 10430 }, { "epoch": 0.9535988308366825, "grad_norm": 4.831636905670166, "learning_rate": 8.622634989861414e-06, "loss": 0.2255, "step": 10440 }, { "epoch": 0.9545122396784801, "grad_norm": 3.1923937797546387, "learning_rate": 8.618970216584717e-06, "loss": 0.2134, "step": 10450 }, { "epoch": 0.9554256485202777, "grad_norm": 3.322455406188965, "learning_rate": 8.615301355361192e-06, "loss": 0.2755, "step": 10460 }, { "epoch": 0.9563390573620753, "grad_norm": 3.047055959701538, "learning_rate": 8.61162841033514e-06, "loss": 0.2374, "step": 10470 }, { "epoch": 0.9572524662038728, "grad_norm": 2.922208309173584, "learning_rate": 8.607951385655477e-06, "loss": 0.2654, "step": 10480 }, { "epoch": 0.9581658750456704, "grad_norm": 3.063455581665039, "learning_rate": 8.60427028547573e-06, "loss": 0.2455, "step": 10490 }, { "epoch": 0.959079283887468, "grad_norm": 4.8602190017700195, "learning_rate": 8.600585113954023e-06, "loss": 0.27, "step": 10500 }, { "epoch": 0.9599926927292656, "grad_norm": 8.294038772583008, "learning_rate": 8.596895875253089e-06, "loss": 0.2482, "step": 10510 }, { "epoch": 0.9609061015710632, "grad_norm": 2.989853620529175, "learning_rate": 8.593202573540242e-06, "loss": 0.2602, "step": 10520 }, { "epoch": 0.9618195104128608, "grad_norm": 4.239576816558838, "learning_rate": 8.589505212987399e-06, "loss": 0.2207, "step": 10530 }, { "epoch": 0.9627329192546584, "grad_norm": 4.637528419494629, "learning_rate": 8.585803797771053e-06, "loss": 0.2571, "step": 10540 }, { "epoch": 0.963646328096456, "grad_norm": 4.22274923324585, "learning_rate": 8.582098332072277e-06, "loss": 0.2923, "step": 10550 }, { "epoch": 0.9645597369382536, "grad_norm": 5.016778469085693, "learning_rate": 8.578388820076728e-06, "loss": 0.2361, "step": 10560 }, { "epoch": 0.9654731457800512, "grad_norm": 7.159799098968506, "learning_rate": 8.574675265974623e-06, "loss": 0.2053, "step": 10570 }, { "epoch": 0.9663865546218487, "grad_norm": 4.722222328186035, "learning_rate": 8.570957673960752e-06, "loss": 0.2553, "step": 10580 }, { "epoch": 0.9672999634636463, "grad_norm": 2.7758402824401855, "learning_rate": 8.567236048234464e-06, "loss": 0.285, "step": 10590 }, { "epoch": 0.9682133723054439, "grad_norm": 2.7535817623138428, "learning_rate": 8.563510392999663e-06, "loss": 0.2447, "step": 10600 }, { "epoch": 0.9691267811472415, "grad_norm": 2.435124158859253, "learning_rate": 8.559780712464807e-06, "loss": 0.2624, "step": 10610 }, { "epoch": 0.9700401899890391, "grad_norm": 2.4270756244659424, "learning_rate": 8.5560470108429e-06, "loss": 0.2816, "step": 10620 }, { "epoch": 0.9709535988308367, "grad_norm": 9.29770278930664, "learning_rate": 8.552309292351486e-06, "loss": 0.2295, "step": 10630 }, { "epoch": 0.9718670076726342, "grad_norm": 3.437800168991089, "learning_rate": 8.548567561212651e-06, "loss": 0.26, "step": 10640 }, { "epoch": 0.9727804165144318, "grad_norm": 2.5515129566192627, "learning_rate": 8.544821821653012e-06, "loss": 0.2505, "step": 10650 }, { "epoch": 0.9736938253562295, "grad_norm": 3.1831536293029785, "learning_rate": 8.54107207790371e-06, "loss": 0.263, "step": 10660 }, { "epoch": 0.9746072341980271, "grad_norm": 1.8921102285385132, "learning_rate": 8.537318334200413e-06, "loss": 0.2359, "step": 10670 }, { "epoch": 0.9755206430398247, "grad_norm": 3.063145399093628, "learning_rate": 8.533560594783306e-06, "loss": 0.2105, "step": 10680 }, { "epoch": 0.9764340518816222, "grad_norm": 3.7070565223693848, "learning_rate": 8.529798863897089e-06, "loss": 0.2091, "step": 10690 }, { "epoch": 0.9773474607234198, "grad_norm": 2.8981854915618896, "learning_rate": 8.526033145790972e-06, "loss": 0.2726, "step": 10700 }, { "epoch": 0.9782608695652174, "grad_norm": 2.5896973609924316, "learning_rate": 8.522263444718662e-06, "loss": 0.2524, "step": 10710 }, { "epoch": 0.979174278407015, "grad_norm": 4.908833980560303, "learning_rate": 8.518489764938371e-06, "loss": 0.2644, "step": 10720 }, { "epoch": 0.9800876872488126, "grad_norm": 2.4249210357666016, "learning_rate": 8.514712110712805e-06, "loss": 0.2609, "step": 10730 }, { "epoch": 0.9810010960906101, "grad_norm": 3.5380516052246094, "learning_rate": 8.51093048630916e-06, "loss": 0.2808, "step": 10740 }, { "epoch": 0.9819145049324077, "grad_norm": 3.2396981716156006, "learning_rate": 8.50714489599911e-06, "loss": 0.219, "step": 10750 }, { "epoch": 0.9828279137742053, "grad_norm": 3.6200978755950928, "learning_rate": 8.50335534405882e-06, "loss": 0.249, "step": 10760 }, { "epoch": 0.9837413226160029, "grad_norm": 3.839623212814331, "learning_rate": 8.499561834768917e-06, "loss": 0.2696, "step": 10770 }, { "epoch": 0.9846547314578005, "grad_norm": 3.3223836421966553, "learning_rate": 8.495764372414512e-06, "loss": 0.2756, "step": 10780 }, { "epoch": 0.985568140299598, "grad_norm": 2.2228360176086426, "learning_rate": 8.491962961285171e-06, "loss": 0.206, "step": 10790 }, { "epoch": 0.9864815491413957, "grad_norm": 2.8554935455322266, "learning_rate": 8.488157605674924e-06, "loss": 0.3004, "step": 10800 }, { "epoch": 0.9873949579831933, "grad_norm": 2.7505881786346436, "learning_rate": 8.484348309882258e-06, "loss": 0.2282, "step": 10810 }, { "epoch": 0.9883083668249909, "grad_norm": 2.3780407905578613, "learning_rate": 8.480535078210108e-06, "loss": 0.2758, "step": 10820 }, { "epoch": 0.9892217756667885, "grad_norm": 3.6750948429107666, "learning_rate": 8.476717914965858e-06, "loss": 0.2162, "step": 10830 }, { "epoch": 0.9901351845085861, "grad_norm": 3.337299346923828, "learning_rate": 8.472896824461329e-06, "loss": 0.2476, "step": 10840 }, { "epoch": 0.9910485933503836, "grad_norm": 3.271440029144287, "learning_rate": 8.469071811012784e-06, "loss": 0.2646, "step": 10850 }, { "epoch": 0.9919620021921812, "grad_norm": 2.6587891578674316, "learning_rate": 8.465242878940909e-06, "loss": 0.2746, "step": 10860 }, { "epoch": 0.9928754110339788, "grad_norm": 2.953670024871826, "learning_rate": 8.461410032570825e-06, "loss": 0.2546, "step": 10870 }, { "epoch": 0.9937888198757764, "grad_norm": 2.9988551139831543, "learning_rate": 8.45757327623207e-06, "loss": 0.2585, "step": 10880 }, { "epoch": 0.994702228717574, "grad_norm": 6.427531719207764, "learning_rate": 8.4537326142586e-06, "loss": 0.2542, "step": 10890 }, { "epoch": 0.9956156375593715, "grad_norm": 2.015763759613037, "learning_rate": 8.449888050988782e-06, "loss": 0.2767, "step": 10900 }, { "epoch": 0.9965290464011691, "grad_norm": 2.418591260910034, "learning_rate": 8.446039590765389e-06, "loss": 0.2615, "step": 10910 }, { "epoch": 0.9974424552429667, "grad_norm": 5.265003204345703, "learning_rate": 8.442187237935597e-06, "loss": 0.2965, "step": 10920 }, { "epoch": 0.9983558640847643, "grad_norm": 2.0795445442199707, "learning_rate": 8.438330996850982e-06, "loss": 0.2127, "step": 10930 }, { "epoch": 0.999269272926562, "grad_norm": 3.719327688217163, "learning_rate": 8.434470871867506e-06, "loss": 0.2691, "step": 10940 }, { "epoch": 1.0001826817683594, "grad_norm": 2.319429636001587, "learning_rate": 8.430606867345524e-06, "loss": 0.2439, "step": 10950 }, { "epoch": 1.0010960906101571, "grad_norm": 3.4749581813812256, "learning_rate": 8.426738987649769e-06, "loss": 0.2646, "step": 10960 }, { "epoch": 1.0020094994519546, "grad_norm": 3.0940964221954346, "learning_rate": 8.422867237149352e-06, "loss": 0.2254, "step": 10970 }, { "epoch": 1.0029229082937523, "grad_norm": 4.05711030960083, "learning_rate": 8.41899162021776e-06, "loss": 0.2956, "step": 10980 }, { "epoch": 1.0038363171355498, "grad_norm": 2.4431395530700684, "learning_rate": 8.415112141232845e-06, "loss": 0.2451, "step": 10990 }, { "epoch": 1.0047497259773475, "grad_norm": 1.9695569276809692, "learning_rate": 8.411228804576822e-06, "loss": 0.2066, "step": 11000 }, { "epoch": 1.0056631348191452, "grad_norm": 2.4663586616516113, "learning_rate": 8.407341614636257e-06, "loss": 0.2266, "step": 11010 }, { "epoch": 1.0065765436609426, "grad_norm": 3.0054101943969727, "learning_rate": 8.403450575802083e-06, "loss": 0.2215, "step": 11020 }, { "epoch": 1.0074899525027403, "grad_norm": 2.9213321208953857, "learning_rate": 8.399555692469567e-06, "loss": 0.2308, "step": 11030 }, { "epoch": 1.0084033613445378, "grad_norm": 5.762731552124023, "learning_rate": 8.395656969038327e-06, "loss": 0.2295, "step": 11040 }, { "epoch": 1.0093167701863355, "grad_norm": 3.758594512939453, "learning_rate": 8.391754409912315e-06, "loss": 0.2017, "step": 11050 }, { "epoch": 1.010230179028133, "grad_norm": 2.5370571613311768, "learning_rate": 8.387848019499814e-06, "loss": 0.2684, "step": 11060 }, { "epoch": 1.0111435878699306, "grad_norm": 2.4430034160614014, "learning_rate": 8.383937802213441e-06, "loss": 0.2979, "step": 11070 }, { "epoch": 1.012056996711728, "grad_norm": 5.229272842407227, "learning_rate": 8.380023762470134e-06, "loss": 0.1855, "step": 11080 }, { "epoch": 1.0129704055535258, "grad_norm": 2.2412877082824707, "learning_rate": 8.376105904691139e-06, "loss": 0.2365, "step": 11090 }, { "epoch": 1.0138838143953233, "grad_norm": 2.6487627029418945, "learning_rate": 8.372184233302029e-06, "loss": 0.2863, "step": 11100 }, { "epoch": 1.014797223237121, "grad_norm": 2.9262161254882812, "learning_rate": 8.368258752732677e-06, "loss": 0.3271, "step": 11110 }, { "epoch": 1.0157106320789184, "grad_norm": 5.3471174240112305, "learning_rate": 8.364329467417261e-06, "loss": 0.2111, "step": 11120 }, { "epoch": 1.0166240409207161, "grad_norm": 3.2794010639190674, "learning_rate": 8.360396381794256e-06, "loss": 0.2635, "step": 11130 }, { "epoch": 1.0175374497625138, "grad_norm": 2.9278950691223145, "learning_rate": 8.356459500306429e-06, "loss": 0.2241, "step": 11140 }, { "epoch": 1.0184508586043113, "grad_norm": 2.390176296234131, "learning_rate": 8.352518827400835e-06, "loss": 0.2227, "step": 11150 }, { "epoch": 1.019364267446109, "grad_norm": 4.992607593536377, "learning_rate": 8.348574367528816e-06, "loss": 0.2434, "step": 11160 }, { "epoch": 1.0202776762879064, "grad_norm": 4.633663654327393, "learning_rate": 8.344626125145985e-06, "loss": 0.2567, "step": 11170 }, { "epoch": 1.0211910851297041, "grad_norm": 2.9157097339630127, "learning_rate": 8.340674104712232e-06, "loss": 0.1984, "step": 11180 }, { "epoch": 1.0221044939715016, "grad_norm": 6.563963890075684, "learning_rate": 8.336718310691712e-06, "loss": 0.2582, "step": 11190 }, { "epoch": 1.0230179028132993, "grad_norm": 2.843829393386841, "learning_rate": 8.332758747552845e-06, "loss": 0.2145, "step": 11200 }, { "epoch": 1.0239313116550968, "grad_norm": 2.2623257637023926, "learning_rate": 8.328795419768309e-06, "loss": 0.2093, "step": 11210 }, { "epoch": 1.0248447204968945, "grad_norm": 3.753451108932495, "learning_rate": 8.324828331815034e-06, "loss": 0.2074, "step": 11220 }, { "epoch": 1.025758129338692, "grad_norm": 2.7845118045806885, "learning_rate": 8.320857488174193e-06, "loss": 0.2212, "step": 11230 }, { "epoch": 1.0266715381804896, "grad_norm": 2.336176872253418, "learning_rate": 8.316882893331206e-06, "loss": 0.1824, "step": 11240 }, { "epoch": 1.027584947022287, "grad_norm": 2.468842029571533, "learning_rate": 8.312904551775731e-06, "loss": 0.2392, "step": 11250 }, { "epoch": 1.0284983558640848, "grad_norm": 2.973757743835449, "learning_rate": 8.308922468001654e-06, "loss": 0.2512, "step": 11260 }, { "epoch": 1.0294117647058822, "grad_norm": 2.5363759994506836, "learning_rate": 8.304936646507095e-06, "loss": 0.2002, "step": 11270 }, { "epoch": 1.03032517354768, "grad_norm": 2.0081799030303955, "learning_rate": 8.300947091794388e-06, "loss": 0.2126, "step": 11280 }, { "epoch": 1.0312385823894776, "grad_norm": 2.9246490001678467, "learning_rate": 8.296953808370088e-06, "loss": 0.2624, "step": 11290 }, { "epoch": 1.032151991231275, "grad_norm": 13.455506324768066, "learning_rate": 8.292956800744963e-06, "loss": 0.2207, "step": 11300 }, { "epoch": 1.0330654000730728, "grad_norm": 2.552647829055786, "learning_rate": 8.288956073433985e-06, "loss": 0.2465, "step": 11310 }, { "epoch": 1.0339788089148703, "grad_norm": 3.0011720657348633, "learning_rate": 8.28495163095633e-06, "loss": 0.2404, "step": 11320 }, { "epoch": 1.034892217756668, "grad_norm": 2.505624532699585, "learning_rate": 8.280943477835372e-06, "loss": 0.2116, "step": 11330 }, { "epoch": 1.0358056265984654, "grad_norm": 2.4630961418151855, "learning_rate": 8.27693161859867e-06, "loss": 0.1803, "step": 11340 }, { "epoch": 1.0367190354402631, "grad_norm": 4.072052478790283, "learning_rate": 8.272916057777978e-06, "loss": 0.2681, "step": 11350 }, { "epoch": 1.0376324442820606, "grad_norm": 2.1791632175445557, "learning_rate": 8.268896799909225e-06, "loss": 0.2281, "step": 11360 }, { "epoch": 1.0385458531238583, "grad_norm": 2.786611557006836, "learning_rate": 8.264873849532518e-06, "loss": 0.2367, "step": 11370 }, { "epoch": 1.0394592619656557, "grad_norm": 2.7024617195129395, "learning_rate": 8.260847211192135e-06, "loss": 0.2019, "step": 11380 }, { "epoch": 1.0403726708074534, "grad_norm": 3.6548032760620117, "learning_rate": 8.256816889436518e-06, "loss": 0.2179, "step": 11390 }, { "epoch": 1.041286079649251, "grad_norm": 2.713104724884033, "learning_rate": 8.252782888818278e-06, "loss": 0.1961, "step": 11400 }, { "epoch": 1.0421994884910486, "grad_norm": 3.4376039505004883, "learning_rate": 8.248745213894169e-06, "loss": 0.1931, "step": 11410 }, { "epoch": 1.0431128973328463, "grad_norm": 3.424934148788452, "learning_rate": 8.244703869225106e-06, "loss": 0.2469, "step": 11420 }, { "epoch": 1.0440263061746438, "grad_norm": 3.829969882965088, "learning_rate": 8.240658859376142e-06, "loss": 0.2234, "step": 11430 }, { "epoch": 1.0449397150164415, "grad_norm": 1.5478086471557617, "learning_rate": 8.236610188916476e-06, "loss": 0.2237, "step": 11440 }, { "epoch": 1.045853123858239, "grad_norm": 2.3124547004699707, "learning_rate": 8.232557862419437e-06, "loss": 0.2327, "step": 11450 }, { "epoch": 1.0467665327000366, "grad_norm": 2.308912515640259, "learning_rate": 8.228501884462488e-06, "loss": 0.2725, "step": 11460 }, { "epoch": 1.047679941541834, "grad_norm": 5.61586856842041, "learning_rate": 8.224442259627211e-06, "loss": 0.2459, "step": 11470 }, { "epoch": 1.0485933503836318, "grad_norm": 2.6386773586273193, "learning_rate": 8.220378992499316e-06, "loss": 0.2507, "step": 11480 }, { "epoch": 1.0495067592254292, "grad_norm": 2.02817964553833, "learning_rate": 8.21631208766862e-06, "loss": 0.2011, "step": 11490 }, { "epoch": 1.050420168067227, "grad_norm": 7.027730941772461, "learning_rate": 8.212241549729051e-06, "loss": 0.2689, "step": 11500 }, { "epoch": 1.0513335769090244, "grad_norm": 5.010609149932861, "learning_rate": 8.208167383278645e-06, "loss": 0.2362, "step": 11510 }, { "epoch": 1.052246985750822, "grad_norm": 2.7481210231781006, "learning_rate": 8.204089592919529e-06, "loss": 0.2059, "step": 11520 }, { "epoch": 1.0531603945926196, "grad_norm": 3.2311344146728516, "learning_rate": 8.200008183257929e-06, "loss": 0.204, "step": 11530 }, { "epoch": 1.0540738034344173, "grad_norm": 3.094291925430298, "learning_rate": 8.19592315890416e-06, "loss": 0.2423, "step": 11540 }, { "epoch": 1.054987212276215, "grad_norm": 2.4556524753570557, "learning_rate": 8.191834524472614e-06, "loss": 0.2431, "step": 11550 }, { "epoch": 1.0559006211180124, "grad_norm": 2.826571464538574, "learning_rate": 8.18774228458177e-06, "loss": 0.229, "step": 11560 }, { "epoch": 1.05681402995981, "grad_norm": 4.3796515464782715, "learning_rate": 8.183646443854172e-06, "loss": 0.2101, "step": 11570 }, { "epoch": 1.0577274388016076, "grad_norm": 3.687520742416382, "learning_rate": 8.179547006916434e-06, "loss": 0.2296, "step": 11580 }, { "epoch": 1.0586408476434053, "grad_norm": 2.967210531234741, "learning_rate": 8.175443978399233e-06, "loss": 0.2346, "step": 11590 }, { "epoch": 1.0595542564852027, "grad_norm": 2.516808271408081, "learning_rate": 8.171337362937304e-06, "loss": 0.1413, "step": 11600 }, { "epoch": 1.0604676653270004, "grad_norm": 2.4505486488342285, "learning_rate": 8.167227165169432e-06, "loss": 0.2942, "step": 11610 }, { "epoch": 1.061381074168798, "grad_norm": 3.564152479171753, "learning_rate": 8.163113389738443e-06, "loss": 0.2116, "step": 11620 }, { "epoch": 1.0622944830105956, "grad_norm": 2.9156224727630615, "learning_rate": 8.15899604129122e-06, "loss": 0.1737, "step": 11630 }, { "epoch": 1.063207891852393, "grad_norm": 6.590764999389648, "learning_rate": 8.154875124478665e-06, "loss": 0.2156, "step": 11640 }, { "epoch": 1.0641213006941908, "grad_norm": 1.976444959640503, "learning_rate": 8.150750643955719e-06, "loss": 0.2221, "step": 11650 }, { "epoch": 1.0650347095359882, "grad_norm": 1.987165927886963, "learning_rate": 8.14662260438135e-06, "loss": 0.1713, "step": 11660 }, { "epoch": 1.065948118377786, "grad_norm": 2.6841633319854736, "learning_rate": 8.142491010418538e-06, "loss": 0.2037, "step": 11670 }, { "epoch": 1.0668615272195834, "grad_norm": 2.641129493713379, "learning_rate": 8.138355866734292e-06, "loss": 0.225, "step": 11680 }, { "epoch": 1.067774936061381, "grad_norm": 5.532380104064941, "learning_rate": 8.134217177999616e-06, "loss": 0.2456, "step": 11690 }, { "epoch": 1.0686883449031788, "grad_norm": 3.0943596363067627, "learning_rate": 8.130074948889526e-06, "loss": 0.2142, "step": 11700 }, { "epoch": 1.0696017537449762, "grad_norm": 2.6277964115142822, "learning_rate": 8.125929184083037e-06, "loss": 0.2074, "step": 11710 }, { "epoch": 1.070515162586774, "grad_norm": 1.7444112300872803, "learning_rate": 8.121779888263157e-06, "loss": 0.2186, "step": 11720 }, { "epoch": 1.0714285714285714, "grad_norm": 1.5838054418563843, "learning_rate": 8.117627066116882e-06, "loss": 0.1908, "step": 11730 }, { "epoch": 1.072341980270369, "grad_norm": 3.860598087310791, "learning_rate": 8.113470722335192e-06, "loss": 0.2607, "step": 11740 }, { "epoch": 1.0732553891121666, "grad_norm": 2.121791362762451, "learning_rate": 8.109310861613047e-06, "loss": 0.2157, "step": 11750 }, { "epoch": 1.0741687979539642, "grad_norm": 3.9770524501800537, "learning_rate": 8.105147488649372e-06, "loss": 0.2448, "step": 11760 }, { "epoch": 1.0750822067957617, "grad_norm": 2.7025606632232666, "learning_rate": 8.100980608147069e-06, "loss": 0.1828, "step": 11770 }, { "epoch": 1.0759956156375594, "grad_norm": 2.368682861328125, "learning_rate": 8.096810224812999e-06, "loss": 0.2486, "step": 11780 }, { "epoch": 1.0769090244793569, "grad_norm": 3.03540301322937, "learning_rate": 8.092636343357974e-06, "loss": 0.206, "step": 11790 }, { "epoch": 1.0778224333211546, "grad_norm": 2.817044734954834, "learning_rate": 8.088458968496766e-06, "loss": 0.2126, "step": 11800 }, { "epoch": 1.078735842162952, "grad_norm": 3.3897058963775635, "learning_rate": 8.084278104948088e-06, "loss": 0.1889, "step": 11810 }, { "epoch": 1.0796492510047497, "grad_norm": 2.0944619178771973, "learning_rate": 8.080093757434595e-06, "loss": 0.2308, "step": 11820 }, { "epoch": 1.0805626598465472, "grad_norm": 3.0904016494750977, "learning_rate": 8.075905930682879e-06, "loss": 0.2685, "step": 11830 }, { "epoch": 1.081476068688345, "grad_norm": 6.59952449798584, "learning_rate": 8.071714629423459e-06, "loss": 0.2217, "step": 11840 }, { "epoch": 1.0823894775301426, "grad_norm": 4.115360736846924, "learning_rate": 8.06751985839078e-06, "loss": 0.1985, "step": 11850 }, { "epoch": 1.08330288637194, "grad_norm": 2.3992693424224854, "learning_rate": 8.063321622323206e-06, "loss": 0.2231, "step": 11860 }, { "epoch": 1.0842162952137377, "grad_norm": 4.163512706756592, "learning_rate": 8.059119925963017e-06, "loss": 0.1977, "step": 11870 }, { "epoch": 1.0851297040555352, "grad_norm": 2.8129842281341553, "learning_rate": 8.054914774056403e-06, "loss": 0.2164, "step": 11880 }, { "epoch": 1.086043112897333, "grad_norm": 3.9529519081115723, "learning_rate": 8.050706171353451e-06, "loss": 0.2193, "step": 11890 }, { "epoch": 1.0869565217391304, "grad_norm": 3.535243511199951, "learning_rate": 8.04649412260815e-06, "loss": 0.2376, "step": 11900 }, { "epoch": 1.087869930580928, "grad_norm": 2.7754898071289062, "learning_rate": 8.042278632578387e-06, "loss": 0.1986, "step": 11910 }, { "epoch": 1.0887833394227255, "grad_norm": 6.415796756744385, "learning_rate": 8.038059706025922e-06, "loss": 0.1763, "step": 11920 }, { "epoch": 1.0896967482645232, "grad_norm": 2.8565752506256104, "learning_rate": 8.033837347716414e-06, "loss": 0.2841, "step": 11930 }, { "epoch": 1.0906101571063207, "grad_norm": 2.5384397506713867, "learning_rate": 8.029611562419384e-06, "loss": 0.2539, "step": 11940 }, { "epoch": 1.0915235659481184, "grad_norm": 4.940293312072754, "learning_rate": 8.025382354908232e-06, "loss": 0.2265, "step": 11950 }, { "epoch": 1.092436974789916, "grad_norm": 2.1185555458068848, "learning_rate": 8.021149729960223e-06, "loss": 0.2339, "step": 11960 }, { "epoch": 1.0933503836317136, "grad_norm": 5.624914169311523, "learning_rate": 8.01691369235648e-06, "loss": 0.255, "step": 11970 }, { "epoch": 1.0942637924735112, "grad_norm": 2.542921304702759, "learning_rate": 8.012674246881982e-06, "loss": 0.25, "step": 11980 }, { "epoch": 1.0951772013153087, "grad_norm": 2.825529098510742, "learning_rate": 8.008431398325557e-06, "loss": 0.1956, "step": 11990 }, { "epoch": 1.0960906101571064, "grad_norm": 3.001047372817993, "learning_rate": 8.004185151479878e-06, "loss": 0.2343, "step": 12000 }, { "epoch": 1.0970040189989039, "grad_norm": 3.061312437057495, "learning_rate": 7.999935511141458e-06, "loss": 0.2533, "step": 12010 }, { "epoch": 1.0979174278407016, "grad_norm": 3.8080854415893555, "learning_rate": 7.99568248211064e-06, "loss": 0.2089, "step": 12020 }, { "epoch": 1.098830836682499, "grad_norm": 4.378378868103027, "learning_rate": 7.991426069191594e-06, "loss": 0.2145, "step": 12030 }, { "epoch": 1.0997442455242967, "grad_norm": 4.468144416809082, "learning_rate": 7.98716627719232e-06, "loss": 0.2279, "step": 12040 }, { "epoch": 1.1006576543660942, "grad_norm": 3.515069007873535, "learning_rate": 7.982903110924625e-06, "loss": 0.2347, "step": 12050 }, { "epoch": 1.1015710632078919, "grad_norm": 2.4203429222106934, "learning_rate": 7.978636575204134e-06, "loss": 0.2713, "step": 12060 }, { "epoch": 1.1024844720496894, "grad_norm": 2.519684076309204, "learning_rate": 7.974366674850277e-06, "loss": 0.2798, "step": 12070 }, { "epoch": 1.103397880891487, "grad_norm": 2.815709352493286, "learning_rate": 7.970093414686282e-06, "loss": 0.2243, "step": 12080 }, { "epoch": 1.1043112897332845, "grad_norm": 3.1650004386901855, "learning_rate": 7.965816799539176e-06, "loss": 0.2585, "step": 12090 }, { "epoch": 1.1052246985750822, "grad_norm": 2.7760398387908936, "learning_rate": 7.961536834239773e-06, "loss": 0.2159, "step": 12100 }, { "epoch": 1.10613810741688, "grad_norm": 1.5883102416992188, "learning_rate": 7.957253523622674e-06, "loss": 0.2373, "step": 12110 }, { "epoch": 1.1070515162586774, "grad_norm": 1.5728622674942017, "learning_rate": 7.952966872526254e-06, "loss": 0.2456, "step": 12120 }, { "epoch": 1.107964925100475, "grad_norm": 2.691993236541748, "learning_rate": 7.948676885792667e-06, "loss": 0.2293, "step": 12130 }, { "epoch": 1.1088783339422725, "grad_norm": 3.8589305877685547, "learning_rate": 7.94438356826783e-06, "loss": 0.2275, "step": 12140 }, { "epoch": 1.1097917427840702, "grad_norm": 6.6490607261657715, "learning_rate": 7.940086924801426e-06, "loss": 0.2456, "step": 12150 }, { "epoch": 1.1107051516258677, "grad_norm": 3.078117609024048, "learning_rate": 7.935786960246894e-06, "loss": 0.2371, "step": 12160 }, { "epoch": 1.1116185604676654, "grad_norm": 4.028228759765625, "learning_rate": 7.931483679461426e-06, "loss": 0.1884, "step": 12170 }, { "epoch": 1.1125319693094629, "grad_norm": 4.568431377410889, "learning_rate": 7.927177087305956e-06, "loss": 0.2549, "step": 12180 }, { "epoch": 1.1134453781512605, "grad_norm": 2.383380174636841, "learning_rate": 7.922867188645157e-06, "loss": 0.2487, "step": 12190 }, { "epoch": 1.114358786993058, "grad_norm": 1.8064721822738647, "learning_rate": 7.918553988347446e-06, "loss": 0.2646, "step": 12200 }, { "epoch": 1.1152721958348557, "grad_norm": 5.575394630432129, "learning_rate": 7.914237491284963e-06, "loss": 0.2241, "step": 12210 }, { "epoch": 1.1161856046766532, "grad_norm": 3.855232000350952, "learning_rate": 7.909917702333572e-06, "loss": 0.2803, "step": 12220 }, { "epoch": 1.1170990135184509, "grad_norm": 6.798408508300781, "learning_rate": 7.905594626372856e-06, "loss": 0.2231, "step": 12230 }, { "epoch": 1.1180124223602483, "grad_norm": 3.1917357444763184, "learning_rate": 7.901268268286113e-06, "loss": 0.1874, "step": 12240 }, { "epoch": 1.118925831202046, "grad_norm": 4.728837966918945, "learning_rate": 7.896938632960347e-06, "loss": 0.2244, "step": 12250 }, { "epoch": 1.1198392400438437, "grad_norm": 3.177349090576172, "learning_rate": 7.892605725286262e-06, "loss": 0.2034, "step": 12260 }, { "epoch": 1.1207526488856412, "grad_norm": 1.5397074222564697, "learning_rate": 7.888269550158262e-06, "loss": 0.1845, "step": 12270 }, { "epoch": 1.1216660577274389, "grad_norm": 5.163643836975098, "learning_rate": 7.88393011247444e-06, "loss": 0.2326, "step": 12280 }, { "epoch": 1.1225794665692364, "grad_norm": 2.2451419830322266, "learning_rate": 7.879587417136577e-06, "loss": 0.2236, "step": 12290 }, { "epoch": 1.123492875411034, "grad_norm": 1.8246958255767822, "learning_rate": 7.875241469050125e-06, "loss": 0.2139, "step": 12300 }, { "epoch": 1.1244062842528315, "grad_norm": 2.9746761322021484, "learning_rate": 7.870892273124221e-06, "loss": 0.2101, "step": 12310 }, { "epoch": 1.1253196930946292, "grad_norm": 2.3560760021209717, "learning_rate": 7.866539834271667e-06, "loss": 0.2385, "step": 12320 }, { "epoch": 1.1262331019364267, "grad_norm": 5.227660655975342, "learning_rate": 7.862184157408925e-06, "loss": 0.2149, "step": 12330 }, { "epoch": 1.1271465107782244, "grad_norm": 4.127508640289307, "learning_rate": 7.85782524745612e-06, "loss": 0.2561, "step": 12340 }, { "epoch": 1.1280599196200218, "grad_norm": 2.903167486190796, "learning_rate": 7.853463109337021e-06, "loss": 0.2083, "step": 12350 }, { "epoch": 1.1289733284618195, "grad_norm": 3.182908773422241, "learning_rate": 7.849097747979055e-06, "loss": 0.2495, "step": 12360 }, { "epoch": 1.1298867373036172, "grad_norm": 1.5424479246139526, "learning_rate": 7.84472916831328e-06, "loss": 0.1947, "step": 12370 }, { "epoch": 1.1308001461454147, "grad_norm": 1.6349135637283325, "learning_rate": 7.840357375274393e-06, "loss": 0.2038, "step": 12380 }, { "epoch": 1.1317135549872122, "grad_norm": 2.5561912059783936, "learning_rate": 7.835982373800722e-06, "loss": 0.2589, "step": 12390 }, { "epoch": 1.1326269638290098, "grad_norm": 2.583251476287842, "learning_rate": 7.831604168834218e-06, "loss": 0.2436, "step": 12400 }, { "epoch": 1.1335403726708075, "grad_norm": 3.101177453994751, "learning_rate": 7.827222765320448e-06, "loss": 0.208, "step": 12410 }, { "epoch": 1.134453781512605, "grad_norm": 4.210922718048096, "learning_rate": 7.822838168208598e-06, "loss": 0.2017, "step": 12420 }, { "epoch": 1.1353671903544027, "grad_norm": 3.4026565551757812, "learning_rate": 7.818450382451457e-06, "loss": 0.2472, "step": 12430 }, { "epoch": 1.1362805991962002, "grad_norm": 2.2912096977233887, "learning_rate": 7.814059413005416e-06, "loss": 0.2157, "step": 12440 }, { "epoch": 1.1371940080379979, "grad_norm": 10.066202163696289, "learning_rate": 7.809665264830467e-06, "loss": 0.2374, "step": 12450 }, { "epoch": 1.1381074168797953, "grad_norm": 2.6990270614624023, "learning_rate": 7.805267942890183e-06, "loss": 0.2434, "step": 12460 }, { "epoch": 1.139020825721593, "grad_norm": 3.156902313232422, "learning_rate": 7.80086745215173e-06, "loss": 0.2362, "step": 12470 }, { "epoch": 1.1399342345633905, "grad_norm": 5.263601779937744, "learning_rate": 7.796463797585853e-06, "loss": 0.2306, "step": 12480 }, { "epoch": 1.1408476434051882, "grad_norm": 2.6506149768829346, "learning_rate": 7.79205698416687e-06, "loss": 0.2164, "step": 12490 }, { "epoch": 1.1417610522469857, "grad_norm": 23.775251388549805, "learning_rate": 7.787647016872664e-06, "loss": 0.2105, "step": 12500 }, { "epoch": 1.1426744610887833, "grad_norm": 7.64209508895874, "learning_rate": 7.783233900684684e-06, "loss": 0.2148, "step": 12510 }, { "epoch": 1.143587869930581, "grad_norm": 4.524003505706787, "learning_rate": 7.778817640587932e-06, "loss": 0.1865, "step": 12520 }, { "epoch": 1.1445012787723785, "grad_norm": 3.714076519012451, "learning_rate": 7.774398241570968e-06, "loss": 0.244, "step": 12530 }, { "epoch": 1.1454146876141762, "grad_norm": 4.233214855194092, "learning_rate": 7.769975708625894e-06, "loss": 0.1711, "step": 12540 }, { "epoch": 1.1463280964559737, "grad_norm": 2.6914215087890625, "learning_rate": 7.76555004674835e-06, "loss": 0.2392, "step": 12550 }, { "epoch": 1.1472415052977714, "grad_norm": 4.124783515930176, "learning_rate": 7.761121260937514e-06, "loss": 0.2695, "step": 12560 }, { "epoch": 1.1481549141395688, "grad_norm": 2.862318754196167, "learning_rate": 7.756689356196087e-06, "loss": 0.207, "step": 12570 }, { "epoch": 1.1490683229813665, "grad_norm": 2.699005365371704, "learning_rate": 7.752254337530305e-06, "loss": 0.1979, "step": 12580 }, { "epoch": 1.149981731823164, "grad_norm": 2.913879871368408, "learning_rate": 7.747816209949906e-06, "loss": 0.2648, "step": 12590 }, { "epoch": 1.1508951406649617, "grad_norm": 2.851247787475586, "learning_rate": 7.743374978468152e-06, "loss": 0.205, "step": 12600 }, { "epoch": 1.1518085495067591, "grad_norm": 2.758802652359009, "learning_rate": 7.738930648101805e-06, "loss": 0.2553, "step": 12610 }, { "epoch": 1.1527219583485568, "grad_norm": 2.686652660369873, "learning_rate": 7.734483223871129e-06, "loss": 0.235, "step": 12620 }, { "epoch": 1.1536353671903543, "grad_norm": 3.948702096939087, "learning_rate": 7.730032710799885e-06, "loss": 0.2405, "step": 12630 }, { "epoch": 1.154548776032152, "grad_norm": 3.5802836418151855, "learning_rate": 7.725579113915317e-06, "loss": 0.2141, "step": 12640 }, { "epoch": 1.1554621848739495, "grad_norm": 4.698762893676758, "learning_rate": 7.72112243824816e-06, "loss": 0.1913, "step": 12650 }, { "epoch": 1.1563755937157472, "grad_norm": 2.3087334632873535, "learning_rate": 7.716662688832622e-06, "loss": 0.2719, "step": 12660 }, { "epoch": 1.1572890025575449, "grad_norm": 5.319257736206055, "learning_rate": 7.712199870706382e-06, "loss": 0.1945, "step": 12670 }, { "epoch": 1.1582024113993423, "grad_norm": 3.0589919090270996, "learning_rate": 7.707733988910593e-06, "loss": 0.2738, "step": 12680 }, { "epoch": 1.15911582024114, "grad_norm": 6.49130916595459, "learning_rate": 7.703265048489858e-06, "loss": 0.2083, "step": 12690 }, { "epoch": 1.1600292290829375, "grad_norm": 2.567017078399658, "learning_rate": 7.698793054492244e-06, "loss": 0.2091, "step": 12700 }, { "epoch": 1.1609426379247352, "grad_norm": 4.079649925231934, "learning_rate": 7.694318011969261e-06, "loss": 0.2182, "step": 12710 }, { "epoch": 1.1618560467665326, "grad_norm": 4.846829414367676, "learning_rate": 7.689839925975868e-06, "loss": 0.2464, "step": 12720 }, { "epoch": 1.1627694556083303, "grad_norm": 2.9824211597442627, "learning_rate": 7.685358801570457e-06, "loss": 0.2321, "step": 12730 }, { "epoch": 1.1636828644501278, "grad_norm": 5.184215068817139, "learning_rate": 7.680874643814853e-06, "loss": 0.197, "step": 12740 }, { "epoch": 1.1645962732919255, "grad_norm": 11.544676780700684, "learning_rate": 7.676387457774312e-06, "loss": 0.2302, "step": 12750 }, { "epoch": 1.165509682133723, "grad_norm": 3.937159776687622, "learning_rate": 7.671897248517505e-06, "loss": 0.2422, "step": 12760 }, { "epoch": 1.1664230909755207, "grad_norm": 3.783395290374756, "learning_rate": 7.667404021116521e-06, "loss": 0.249, "step": 12770 }, { "epoch": 1.1673364998173184, "grad_norm": 2.685478687286377, "learning_rate": 7.662907780646859e-06, "loss": 0.1703, "step": 12780 }, { "epoch": 1.1682499086591158, "grad_norm": 3.2275965213775635, "learning_rate": 7.658408532187416e-06, "loss": 0.2151, "step": 12790 }, { "epoch": 1.1691633175009133, "grad_norm": 5.522408962249756, "learning_rate": 7.653906280820495e-06, "loss": 0.2327, "step": 12800 }, { "epoch": 1.170076726342711, "grad_norm": 2.868124008178711, "learning_rate": 7.649401031631784e-06, "loss": 0.1919, "step": 12810 }, { "epoch": 1.1709901351845087, "grad_norm": 2.896698474884033, "learning_rate": 7.644892789710365e-06, "loss": 0.2049, "step": 12820 }, { "epoch": 1.1719035440263061, "grad_norm": 4.813317775726318, "learning_rate": 7.64038156014869e-06, "loss": 0.1898, "step": 12830 }, { "epoch": 1.1728169528681038, "grad_norm": 6.502059459686279, "learning_rate": 7.635867348042598e-06, "loss": 0.2397, "step": 12840 }, { "epoch": 1.1737303617099013, "grad_norm": 3.1966681480407715, "learning_rate": 7.631350158491283e-06, "loss": 0.2022, "step": 12850 }, { "epoch": 1.174643770551699, "grad_norm": 3.0154378414154053, "learning_rate": 7.626829996597318e-06, "loss": 0.2095, "step": 12860 }, { "epoch": 1.1755571793934965, "grad_norm": 2.566204309463501, "learning_rate": 7.622306867466619e-06, "loss": 0.1938, "step": 12870 }, { "epoch": 1.1764705882352942, "grad_norm": 3.886803150177002, "learning_rate": 7.617780776208463e-06, "loss": 0.207, "step": 12880 }, { "epoch": 1.1773839970770916, "grad_norm": 2.292982578277588, "learning_rate": 7.613251727935471e-06, "loss": 0.2692, "step": 12890 }, { "epoch": 1.1782974059188893, "grad_norm": 2.6340770721435547, "learning_rate": 7.6087197277636025e-06, "loss": 0.1953, "step": 12900 }, { "epoch": 1.1792108147606868, "grad_norm": 2.653306484222412, "learning_rate": 7.6041847808121515e-06, "loss": 0.2502, "step": 12910 }, { "epoch": 1.1801242236024845, "grad_norm": 4.265839099884033, "learning_rate": 7.599646892203744e-06, "loss": 0.2399, "step": 12920 }, { "epoch": 1.1810376324442822, "grad_norm": 2.366071939468384, "learning_rate": 7.595106067064325e-06, "loss": 0.1604, "step": 12930 }, { "epoch": 1.1819510412860796, "grad_norm": 1.5120054483413696, "learning_rate": 7.590562310523158e-06, "loss": 0.2091, "step": 12940 }, { "epoch": 1.1828644501278773, "grad_norm": 4.656861782073975, "learning_rate": 7.5860156277128146e-06, "loss": 0.1948, "step": 12950 }, { "epoch": 1.1837778589696748, "grad_norm": 4.426368713378906, "learning_rate": 7.5814660237691816e-06, "loss": 0.2287, "step": 12960 }, { "epoch": 1.1846912678114725, "grad_norm": 3.0129761695861816, "learning_rate": 7.576913503831434e-06, "loss": 0.2195, "step": 12970 }, { "epoch": 1.18560467665327, "grad_norm": 1.9197585582733154, "learning_rate": 7.572358073042046e-06, "loss": 0.2139, "step": 12980 }, { "epoch": 1.1865180854950677, "grad_norm": 5.36685037612915, "learning_rate": 7.5677997365467815e-06, "loss": 0.2102, "step": 12990 }, { "epoch": 1.1874314943368651, "grad_norm": 5.3765869140625, "learning_rate": 7.563238499494685e-06, "loss": 0.2005, "step": 13000 }, { "epoch": 1.1883449031786628, "grad_norm": 2.4093456268310547, "learning_rate": 7.5586743670380725e-06, "loss": 0.1948, "step": 13010 }, { "epoch": 1.1892583120204603, "grad_norm": 3.321146011352539, "learning_rate": 7.55410734433254e-06, "loss": 0.188, "step": 13020 }, { "epoch": 1.190171720862258, "grad_norm": 3.3887922763824463, "learning_rate": 7.549537436536942e-06, "loss": 0.2268, "step": 13030 }, { "epoch": 1.1910851297040554, "grad_norm": 11.447179794311523, "learning_rate": 7.544964648813391e-06, "loss": 0.2196, "step": 13040 }, { "epoch": 1.1919985385458531, "grad_norm": 3.223475217819214, "learning_rate": 7.540388986327258e-06, "loss": 0.2598, "step": 13050 }, { "epoch": 1.1929119473876506, "grad_norm": 2.549387216567993, "learning_rate": 7.535810454247159e-06, "loss": 0.2125, "step": 13060 }, { "epoch": 1.1938253562294483, "grad_norm": 7.814163684844971, "learning_rate": 7.5312290577449475e-06, "loss": 0.1927, "step": 13070 }, { "epoch": 1.194738765071246, "grad_norm": 3.436769962310791, "learning_rate": 7.526644801995717e-06, "loss": 0.2627, "step": 13080 }, { "epoch": 1.1956521739130435, "grad_norm": 7.8350419998168945, "learning_rate": 7.522057692177791e-06, "loss": 0.221, "step": 13090 }, { "epoch": 1.1965655827548412, "grad_norm": 2.5071141719818115, "learning_rate": 7.5174677334727166e-06, "loss": 0.2279, "step": 13100 }, { "epoch": 1.1974789915966386, "grad_norm": 7.78272819519043, "learning_rate": 7.512874931065255e-06, "loss": 0.1994, "step": 13110 }, { "epoch": 1.1983924004384363, "grad_norm": 2.4974026679992676, "learning_rate": 7.508279290143383e-06, "loss": 0.2131, "step": 13120 }, { "epoch": 1.1993058092802338, "grad_norm": 4.9624409675598145, "learning_rate": 7.503680815898285e-06, "loss": 0.194, "step": 13130 }, { "epoch": 1.2002192181220315, "grad_norm": 4.272526741027832, "learning_rate": 7.4990795135243435e-06, "loss": 0.1941, "step": 13140 }, { "epoch": 1.201132626963829, "grad_norm": 3.6198625564575195, "learning_rate": 7.494475388219136e-06, "loss": 0.166, "step": 13150 }, { "epoch": 1.2020460358056266, "grad_norm": 3.5009379386901855, "learning_rate": 7.489868445183428e-06, "loss": 0.2357, "step": 13160 }, { "epoch": 1.202959444647424, "grad_norm": 3.027489185333252, "learning_rate": 7.485258689621173e-06, "loss": 0.2667, "step": 13170 }, { "epoch": 1.2038728534892218, "grad_norm": 3.0298333168029785, "learning_rate": 7.480646126739493e-06, "loss": 0.2275, "step": 13180 }, { "epoch": 1.2047862623310195, "grad_norm": 2.8863472938537598, "learning_rate": 7.4760307617486875e-06, "loss": 0.2522, "step": 13190 }, { "epoch": 1.205699671172817, "grad_norm": 2.7381670475006104, "learning_rate": 7.47141259986222e-06, "loss": 0.1418, "step": 13200 }, { "epoch": 1.2066130800146144, "grad_norm": 3.980614185333252, "learning_rate": 7.466791646296711e-06, "loss": 0.2019, "step": 13210 }, { "epoch": 1.2075264888564121, "grad_norm": 3.0782933235168457, "learning_rate": 7.462167906271938e-06, "loss": 0.2223, "step": 13220 }, { "epoch": 1.2084398976982098, "grad_norm": 3.0789904594421387, "learning_rate": 7.457541385010822e-06, "loss": 0.2033, "step": 13230 }, { "epoch": 1.2093533065400073, "grad_norm": 3.9264533519744873, "learning_rate": 7.4529120877394315e-06, "loss": 0.2202, "step": 13240 }, { "epoch": 1.210266715381805, "grad_norm": 3.3716812133789062, "learning_rate": 7.448280019686962e-06, "loss": 0.2002, "step": 13250 }, { "epoch": 1.2111801242236024, "grad_norm": 4.116954803466797, "learning_rate": 7.443645186085747e-06, "loss": 0.1744, "step": 13260 }, { "epoch": 1.2120935330654001, "grad_norm": 1.9357284307479858, "learning_rate": 7.439007592171241e-06, "loss": 0.206, "step": 13270 }, { "epoch": 1.2130069419071976, "grad_norm": 3.0712132453918457, "learning_rate": 7.434367243182016e-06, "loss": 0.1947, "step": 13280 }, { "epoch": 1.2139203507489953, "grad_norm": 2.297762870788574, "learning_rate": 7.429724144359756e-06, "loss": 0.2016, "step": 13290 }, { "epoch": 1.2148337595907928, "grad_norm": 3.1789445877075195, "learning_rate": 7.42507830094925e-06, "loss": 0.2602, "step": 13300 }, { "epoch": 1.2157471684325905, "grad_norm": 2.827148199081421, "learning_rate": 7.4204297181983935e-06, "loss": 0.1976, "step": 13310 }, { "epoch": 1.216660577274388, "grad_norm": 3.0362939834594727, "learning_rate": 7.415778401358167e-06, "loss": 0.236, "step": 13320 }, { "epoch": 1.2175739861161856, "grad_norm": 2.6130802631378174, "learning_rate": 7.411124355682646e-06, "loss": 0.1548, "step": 13330 }, { "epoch": 1.2184873949579833, "grad_norm": 6.486108303070068, "learning_rate": 7.406467586428986e-06, "loss": 0.2167, "step": 13340 }, { "epoch": 1.2194008037997808, "grad_norm": 2.3909926414489746, "learning_rate": 7.401808098857421e-06, "loss": 0.22, "step": 13350 }, { "epoch": 1.2203142126415785, "grad_norm": 3.2931480407714844, "learning_rate": 7.3971458982312545e-06, "loss": 0.1887, "step": 13360 }, { "epoch": 1.221227621483376, "grad_norm": 2.709104299545288, "learning_rate": 7.392480989816852e-06, "loss": 0.2302, "step": 13370 }, { "epoch": 1.2221410303251736, "grad_norm": 3.3522908687591553, "learning_rate": 7.3878133788836416e-06, "loss": 0.2087, "step": 13380 }, { "epoch": 1.223054439166971, "grad_norm": 13.868606567382812, "learning_rate": 7.3831430707041015e-06, "loss": 0.2311, "step": 13390 }, { "epoch": 1.2239678480087688, "grad_norm": 3.962938070297241, "learning_rate": 7.378470070553759e-06, "loss": 0.1893, "step": 13400 }, { "epoch": 1.2248812568505663, "grad_norm": 2.6848015785217285, "learning_rate": 7.37379438371118e-06, "loss": 0.2317, "step": 13410 }, { "epoch": 1.225794665692364, "grad_norm": 3.0147922039031982, "learning_rate": 7.369116015457967e-06, "loss": 0.2028, "step": 13420 }, { "epoch": 1.2267080745341614, "grad_norm": 2.5253610610961914, "learning_rate": 7.364434971078749e-06, "loss": 0.2115, "step": 13430 }, { "epoch": 1.227621483375959, "grad_norm": 2.0693719387054443, "learning_rate": 7.359751255861179e-06, "loss": 0.236, "step": 13440 }, { "epoch": 1.2285348922177566, "grad_norm": 2.6874945163726807, "learning_rate": 7.355064875095932e-06, "loss": 0.221, "step": 13450 }, { "epoch": 1.2294483010595543, "grad_norm": 3.9285597801208496, "learning_rate": 7.3503758340766855e-06, "loss": 0.1825, "step": 13460 }, { "epoch": 1.2303617099013517, "grad_norm": 2.8798842430114746, "learning_rate": 7.345684138100127e-06, "loss": 0.2671, "step": 13470 }, { "epoch": 1.2312751187431494, "grad_norm": 3.2054505348205566, "learning_rate": 7.34098979246594e-06, "loss": 0.1874, "step": 13480 }, { "epoch": 1.2321885275849471, "grad_norm": 6.443300724029541, "learning_rate": 7.336292802476806e-06, "loss": 0.2355, "step": 13490 }, { "epoch": 1.2331019364267446, "grad_norm": 2.7431986331939697, "learning_rate": 7.3315931734383895e-06, "loss": 0.2178, "step": 13500 }, { "epoch": 1.2340153452685423, "grad_norm": 4.389045238494873, "learning_rate": 7.326890910659336e-06, "loss": 0.2193, "step": 13510 }, { "epoch": 1.2349287541103398, "grad_norm": 2.902418375015259, "learning_rate": 7.322186019451267e-06, "loss": 0.2128, "step": 13520 }, { "epoch": 1.2358421629521374, "grad_norm": 3.6324541568756104, "learning_rate": 7.317478505128775e-06, "loss": 0.167, "step": 13530 }, { "epoch": 1.236755571793935, "grad_norm": 13.760610580444336, "learning_rate": 7.312768373009412e-06, "loss": 0.202, "step": 13540 }, { "epoch": 1.2376689806357326, "grad_norm": 1.6868840456008911, "learning_rate": 7.308055628413691e-06, "loss": 0.217, "step": 13550 }, { "epoch": 1.23858238947753, "grad_norm": 8.718588829040527, "learning_rate": 7.303340276665068e-06, "loss": 0.2041, "step": 13560 }, { "epoch": 1.2394957983193278, "grad_norm": 6.034531116485596, "learning_rate": 7.298622323089955e-06, "loss": 0.268, "step": 13570 }, { "epoch": 1.2404092071611252, "grad_norm": 3.945823907852173, "learning_rate": 7.293901773017697e-06, "loss": 0.2063, "step": 13580 }, { "epoch": 1.241322616002923, "grad_norm": 5.063836097717285, "learning_rate": 7.289178631780573e-06, "loss": 0.258, "step": 13590 }, { "epoch": 1.2422360248447206, "grad_norm": 4.573521614074707, "learning_rate": 7.284452904713786e-06, "loss": 0.233, "step": 13600 }, { "epoch": 1.243149433686518, "grad_norm": 4.559846878051758, "learning_rate": 7.279724597155463e-06, "loss": 0.1792, "step": 13610 }, { "epoch": 1.2440628425283156, "grad_norm": 3.411479949951172, "learning_rate": 7.274993714446648e-06, "loss": 0.2408, "step": 13620 }, { "epoch": 1.2449762513701133, "grad_norm": 3.2486252784729004, "learning_rate": 7.27026026193129e-06, "loss": 0.2337, "step": 13630 }, { "epoch": 1.245889660211911, "grad_norm": 3.17232346534729, "learning_rate": 7.265524244956241e-06, "loss": 0.1933, "step": 13640 }, { "epoch": 1.2468030690537084, "grad_norm": 4.028630256652832, "learning_rate": 7.260785668871252e-06, "loss": 0.2393, "step": 13650 }, { "epoch": 1.247716477895506, "grad_norm": 3.9599366188049316, "learning_rate": 7.256044539028965e-06, "loss": 0.171, "step": 13660 }, { "epoch": 1.2486298867373036, "grad_norm": 8.89290714263916, "learning_rate": 7.251300860784902e-06, "loss": 0.1985, "step": 13670 }, { "epoch": 1.2495432955791013, "grad_norm": 3.1376514434814453, "learning_rate": 7.246554639497468e-06, "loss": 0.225, "step": 13680 }, { "epoch": 1.2504567044208987, "grad_norm": 4.694840431213379, "learning_rate": 7.241805880527944e-06, "loss": 0.201, "step": 13690 }, { "epoch": 1.2513701132626964, "grad_norm": 6.321514129638672, "learning_rate": 7.237054589240467e-06, "loss": 0.2524, "step": 13700 }, { "epoch": 1.252283522104494, "grad_norm": 1.5119209289550781, "learning_rate": 7.232300771002042e-06, "loss": 0.1945, "step": 13710 }, { "epoch": 1.2531969309462916, "grad_norm": 4.053893089294434, "learning_rate": 7.2275444311825294e-06, "loss": 0.2618, "step": 13720 }, { "epoch": 1.254110339788089, "grad_norm": 3.0499062538146973, "learning_rate": 7.222785575154635e-06, "loss": 0.1963, "step": 13730 }, { "epoch": 1.2550237486298867, "grad_norm": 4.28633451461792, "learning_rate": 7.218024208293906e-06, "loss": 0.1856, "step": 13740 }, { "epoch": 1.2559371574716844, "grad_norm": 3.8855843544006348, "learning_rate": 7.213260335978727e-06, "loss": 0.1785, "step": 13750 }, { "epoch": 1.256850566313482, "grad_norm": 3.8813748359680176, "learning_rate": 7.208493963590314e-06, "loss": 0.2384, "step": 13760 }, { "epoch": 1.2577639751552794, "grad_norm": 4.367703914642334, "learning_rate": 7.203725096512708e-06, "loss": 0.2355, "step": 13770 }, { "epoch": 1.258677383997077, "grad_norm": 2.931528329849243, "learning_rate": 7.198953740132762e-06, "loss": 0.2057, "step": 13780 }, { "epoch": 1.2595907928388748, "grad_norm": 2.300774335861206, "learning_rate": 7.194179899840147e-06, "loss": 0.1538, "step": 13790 }, { "epoch": 1.2605042016806722, "grad_norm": 2.6730093955993652, "learning_rate": 7.189403581027339e-06, "loss": 0.2457, "step": 13800 }, { "epoch": 1.26141761052247, "grad_norm": 4.450044631958008, "learning_rate": 7.184624789089609e-06, "loss": 0.244, "step": 13810 }, { "epoch": 1.2623310193642674, "grad_norm": 3.8127660751342773, "learning_rate": 7.179843529425028e-06, "loss": 0.1777, "step": 13820 }, { "epoch": 1.263244428206065, "grad_norm": 3.492579936981201, "learning_rate": 7.175059807434451e-06, "loss": 0.2951, "step": 13830 }, { "epoch": 1.2641578370478626, "grad_norm": 2.4479222297668457, "learning_rate": 7.170273628521513e-06, "loss": 0.2071, "step": 13840 }, { "epoch": 1.2650712458896602, "grad_norm": 2.757533311843872, "learning_rate": 7.165484998092629e-06, "loss": 0.1764, "step": 13850 }, { "epoch": 1.265984654731458, "grad_norm": 8.118880271911621, "learning_rate": 7.1606939215569786e-06, "loss": 0.2129, "step": 13860 }, { "epoch": 1.2668980635732554, "grad_norm": 2.8995602130889893, "learning_rate": 7.1559004043265055e-06, "loss": 0.2304, "step": 13870 }, { "epoch": 1.2678114724150529, "grad_norm": 4.246128559112549, "learning_rate": 7.151104451815915e-06, "loss": 0.2374, "step": 13880 }, { "epoch": 1.2687248812568506, "grad_norm": 8.934340476989746, "learning_rate": 7.146306069442655e-06, "loss": 0.2346, "step": 13890 }, { "epoch": 1.2696382900986483, "grad_norm": 2.190981388092041, "learning_rate": 7.141505262626925e-06, "loss": 0.2639, "step": 13900 }, { "epoch": 1.2705516989404457, "grad_norm": 2.3636136054992676, "learning_rate": 7.1367020367916615e-06, "loss": 0.2529, "step": 13910 }, { "epoch": 1.2714651077822432, "grad_norm": 3.505985736846924, "learning_rate": 7.131896397362531e-06, "loss": 0.18, "step": 13920 }, { "epoch": 1.272378516624041, "grad_norm": 2.6809353828430176, "learning_rate": 7.1270883497679284e-06, "loss": 0.1795, "step": 13930 }, { "epoch": 1.2732919254658386, "grad_norm": 3.7278730869293213, "learning_rate": 7.12227789943897e-06, "loss": 0.1753, "step": 13940 }, { "epoch": 1.274205334307636, "grad_norm": 2.5992987155914307, "learning_rate": 7.1174650518094836e-06, "loss": 0.2087, "step": 13950 }, { "epoch": 1.2751187431494337, "grad_norm": 2.5378525257110596, "learning_rate": 7.112649812316006e-06, "loss": 0.169, "step": 13960 }, { "epoch": 1.2760321519912312, "grad_norm": 4.127226829528809, "learning_rate": 7.1078321863977764e-06, "loss": 0.2388, "step": 13970 }, { "epoch": 1.276945560833029, "grad_norm": 2.974475383758545, "learning_rate": 7.103012179496729e-06, "loss": 0.2675, "step": 13980 }, { "epoch": 1.2778589696748264, "grad_norm": 2.0686094760894775, "learning_rate": 7.098189797057487e-06, "loss": 0.2198, "step": 13990 }, { "epoch": 1.278772378516624, "grad_norm": 2.8370919227600098, "learning_rate": 7.093365044527358e-06, "loss": 0.2244, "step": 14000 }, { "epoch": 1.2796857873584218, "grad_norm": 3.765165328979492, "learning_rate": 7.088537927356331e-06, "loss": 0.1793, "step": 14010 }, { "epoch": 1.2805991962002192, "grad_norm": 3.0816590785980225, "learning_rate": 7.083708450997053e-06, "loss": 0.2317, "step": 14020 }, { "epoch": 1.2815126050420167, "grad_norm": 4.289952278137207, "learning_rate": 7.078876620904852e-06, "loss": 0.2254, "step": 14030 }, { "epoch": 1.2824260138838144, "grad_norm": 3.567467451095581, "learning_rate": 7.074042442537703e-06, "loss": 0.2527, "step": 14040 }, { "epoch": 1.283339422725612, "grad_norm": 3.0619828701019287, "learning_rate": 7.0692059213562415e-06, "loss": 0.2311, "step": 14050 }, { "epoch": 1.2842528315674095, "grad_norm": 3.399841785430908, "learning_rate": 7.064367062823743e-06, "loss": 0.2054, "step": 14060 }, { "epoch": 1.2851662404092072, "grad_norm": 2.1284537315368652, "learning_rate": 7.059525872406127e-06, "loss": 0.2425, "step": 14070 }, { "epoch": 1.2860796492510047, "grad_norm": 2.518913507461548, "learning_rate": 7.054682355571944e-06, "loss": 0.2384, "step": 14080 }, { "epoch": 1.2869930580928024, "grad_norm": 5.701150417327881, "learning_rate": 7.049836517792377e-06, "loss": 0.2639, "step": 14090 }, { "epoch": 1.2879064669345999, "grad_norm": 2.569122314453125, "learning_rate": 7.044988364541229e-06, "loss": 0.2118, "step": 14100 }, { "epoch": 1.2888198757763976, "grad_norm": 2.7281150817871094, "learning_rate": 7.0401379012949164e-06, "loss": 0.1854, "step": 14110 }, { "epoch": 1.289733284618195, "grad_norm": 2.103210926055908, "learning_rate": 7.035285133532466e-06, "loss": 0.2077, "step": 14120 }, { "epoch": 1.2906466934599927, "grad_norm": 3.232004165649414, "learning_rate": 7.030430066735507e-06, "loss": 0.2356, "step": 14130 }, { "epoch": 1.2915601023017902, "grad_norm": 7.667621612548828, "learning_rate": 7.025572706388268e-06, "loss": 0.1828, "step": 14140 }, { "epoch": 1.2924735111435879, "grad_norm": 4.670990943908691, "learning_rate": 7.020713057977568e-06, "loss": 0.2473, "step": 14150 }, { "epoch": 1.2933869199853856, "grad_norm": 3.0030887126922607, "learning_rate": 7.015851126992807e-06, "loss": 0.2048, "step": 14160 }, { "epoch": 1.294300328827183, "grad_norm": 3.3566417694091797, "learning_rate": 7.0109869189259684e-06, "loss": 0.2179, "step": 14170 }, { "epoch": 1.2952137376689805, "grad_norm": 4.113404273986816, "learning_rate": 7.006120439271604e-06, "loss": 0.1852, "step": 14180 }, { "epoch": 1.2961271465107782, "grad_norm": 3.4974162578582764, "learning_rate": 7.0012516935268335e-06, "loss": 0.2121, "step": 14190 }, { "epoch": 1.297040555352576, "grad_norm": 2.0229382514953613, "learning_rate": 6.996380687191335e-06, "loss": 0.25, "step": 14200 }, { "epoch": 1.2979539641943734, "grad_norm": 3.0723018646240234, "learning_rate": 6.991507425767341e-06, "loss": 0.2423, "step": 14210 }, { "epoch": 1.298867373036171, "grad_norm": 10.299408912658691, "learning_rate": 6.98663191475963e-06, "loss": 0.2057, "step": 14220 }, { "epoch": 1.2997807818779685, "grad_norm": 3.2371249198913574, "learning_rate": 6.981754159675526e-06, "loss": 0.197, "step": 14230 }, { "epoch": 1.3006941907197662, "grad_norm": 3.7656939029693604, "learning_rate": 6.9768741660248805e-06, "loss": 0.2238, "step": 14240 }, { "epoch": 1.3016075995615637, "grad_norm": 6.370077610015869, "learning_rate": 6.971991939320082e-06, "loss": 0.1887, "step": 14250 }, { "epoch": 1.3025210084033614, "grad_norm": 4.595245361328125, "learning_rate": 6.9671074850760345e-06, "loss": 0.2627, "step": 14260 }, { "epoch": 1.303434417245159, "grad_norm": 5.0425591468811035, "learning_rate": 6.962220808810161e-06, "loss": 0.2338, "step": 14270 }, { "epoch": 1.3043478260869565, "grad_norm": 3.9011528491973877, "learning_rate": 6.957331916042394e-06, "loss": 0.1969, "step": 14280 }, { "epoch": 1.305261234928754, "grad_norm": 2.550671100616455, "learning_rate": 6.952440812295172e-06, "loss": 0.2149, "step": 14290 }, { "epoch": 1.3061746437705517, "grad_norm": 5.720028400421143, "learning_rate": 6.947547503093428e-06, "loss": 0.2643, "step": 14300 }, { "epoch": 1.3070880526123494, "grad_norm": 3.622026205062866, "learning_rate": 6.942651993964589e-06, "loss": 0.2413, "step": 14310 }, { "epoch": 1.3080014614541469, "grad_norm": 4.708377838134766, "learning_rate": 6.9377542904385605e-06, "loss": 0.2451, "step": 14320 }, { "epoch": 1.3089148702959443, "grad_norm": 2.4199397563934326, "learning_rate": 6.932854398047739e-06, "loss": 0.1829, "step": 14330 }, { "epoch": 1.309828279137742, "grad_norm": 4.7570109367370605, "learning_rate": 6.927952322326981e-06, "loss": 0.2786, "step": 14340 }, { "epoch": 1.3107416879795397, "grad_norm": 2.2402210235595703, "learning_rate": 6.923048068813616e-06, "loss": 0.2582, "step": 14350 }, { "epoch": 1.3116550968213372, "grad_norm": 3.7254562377929688, "learning_rate": 6.918141643047429e-06, "loss": 0.1944, "step": 14360 }, { "epoch": 1.3125685056631349, "grad_norm": 3.3206560611724854, "learning_rate": 6.913233050570667e-06, "loss": 0.195, "step": 14370 }, { "epoch": 1.3134819145049323, "grad_norm": 3.3732197284698486, "learning_rate": 6.908322296928016e-06, "loss": 0.2318, "step": 14380 }, { "epoch": 1.31439532334673, "grad_norm": 2.1504085063934326, "learning_rate": 6.903409387666607e-06, "loss": 0.2202, "step": 14390 }, { "epoch": 1.3153087321885275, "grad_norm": 3.1763510704040527, "learning_rate": 6.898494328336005e-06, "loss": 0.2064, "step": 14400 }, { "epoch": 1.3162221410303252, "grad_norm": 2.828474760055542, "learning_rate": 6.893577124488205e-06, "loss": 0.2223, "step": 14410 }, { "epoch": 1.317135549872123, "grad_norm": 3.371454954147339, "learning_rate": 6.888657781677624e-06, "loss": 0.2429, "step": 14420 }, { "epoch": 1.3180489587139204, "grad_norm": 2.5978755950927734, "learning_rate": 6.883736305461091e-06, "loss": 0.2119, "step": 14430 }, { "epoch": 1.3189623675557178, "grad_norm": 3.298352003097534, "learning_rate": 6.878812701397853e-06, "loss": 0.2468, "step": 14440 }, { "epoch": 1.3198757763975155, "grad_norm": 4.166585922241211, "learning_rate": 6.8738869750495525e-06, "loss": 0.2599, "step": 14450 }, { "epoch": 1.3207891852393132, "grad_norm": 2.726292133331299, "learning_rate": 6.868959131980237e-06, "loss": 0.2488, "step": 14460 }, { "epoch": 1.3217025940811107, "grad_norm": 2.167741060256958, "learning_rate": 6.864029177756337e-06, "loss": 0.1795, "step": 14470 }, { "epoch": 1.3226160029229084, "grad_norm": 3.080325126647949, "learning_rate": 6.859097117946675e-06, "loss": 0.2056, "step": 14480 }, { "epoch": 1.3235294117647058, "grad_norm": 3.2582783699035645, "learning_rate": 6.8541629581224454e-06, "loss": 0.2269, "step": 14490 }, { "epoch": 1.3244428206065035, "grad_norm": 3.3082077503204346, "learning_rate": 6.849226703857219e-06, "loss": 0.2344, "step": 14500 }, { "epoch": 1.325356229448301, "grad_norm": 4.291395664215088, "learning_rate": 6.844288360726933e-06, "loss": 0.2641, "step": 14510 }, { "epoch": 1.3262696382900987, "grad_norm": 3.697282552719116, "learning_rate": 6.839347934309881e-06, "loss": 0.2525, "step": 14520 }, { "epoch": 1.3271830471318962, "grad_norm": 3.5624663829803467, "learning_rate": 6.834405430186712e-06, "loss": 0.2437, "step": 14530 }, { "epoch": 1.3280964559736939, "grad_norm": 3.703889846801758, "learning_rate": 6.8294608539404215e-06, "loss": 0.1859, "step": 14540 }, { "epoch": 1.3290098648154913, "grad_norm": 3.1185128688812256, "learning_rate": 6.824514211156346e-06, "loss": 0.2412, "step": 14550 }, { "epoch": 1.329923273657289, "grad_norm": 3.612128496170044, "learning_rate": 6.819565507422153e-06, "loss": 0.2309, "step": 14560 }, { "epoch": 1.3308366824990867, "grad_norm": 5.6860809326171875, "learning_rate": 6.8146147483278455e-06, "loss": 0.2155, "step": 14570 }, { "epoch": 1.3317500913408842, "grad_norm": 3.749669313430786, "learning_rate": 6.8096619394657405e-06, "loss": 0.2804, "step": 14580 }, { "epoch": 1.3326635001826816, "grad_norm": 3.36327862739563, "learning_rate": 6.804707086430474e-06, "loss": 0.2072, "step": 14590 }, { "epoch": 1.3335769090244793, "grad_norm": 2.975567579269409, "learning_rate": 6.79975019481899e-06, "loss": 0.1816, "step": 14600 }, { "epoch": 1.334490317866277, "grad_norm": 2.1733436584472656, "learning_rate": 6.794791270230538e-06, "loss": 0.2638, "step": 14610 }, { "epoch": 1.3354037267080745, "grad_norm": 2.7475202083587646, "learning_rate": 6.78983031826666e-06, "loss": 0.2339, "step": 14620 }, { "epoch": 1.3363171355498722, "grad_norm": 1.1930971145629883, "learning_rate": 6.7848673445311915e-06, "loss": 0.2252, "step": 14630 }, { "epoch": 1.3372305443916697, "grad_norm": 2.9837515354156494, "learning_rate": 6.779902354630247e-06, "loss": 0.1568, "step": 14640 }, { "epoch": 1.3381439532334674, "grad_norm": 3.5305843353271484, "learning_rate": 6.774935354172225e-06, "loss": 0.289, "step": 14650 }, { "epoch": 1.3390573620752648, "grad_norm": 3.3825724124908447, "learning_rate": 6.7699663487677895e-06, "loss": 0.2841, "step": 14660 }, { "epoch": 1.3399707709170625, "grad_norm": 7.54306173324585, "learning_rate": 6.764995344029872e-06, "loss": 0.2254, "step": 14670 }, { "epoch": 1.34088417975886, "grad_norm": 2.9698660373687744, "learning_rate": 6.760022345573662e-06, "loss": 0.1799, "step": 14680 }, { "epoch": 1.3417975886006577, "grad_norm": 3.160379648208618, "learning_rate": 6.755047359016602e-06, "loss": 0.2237, "step": 14690 }, { "epoch": 1.3427109974424551, "grad_norm": 2.201188802719116, "learning_rate": 6.750070389978377e-06, "loss": 0.2001, "step": 14700 }, { "epoch": 1.3436244062842528, "grad_norm": 2.616018533706665, "learning_rate": 6.745091444080915e-06, "loss": 0.2283, "step": 14710 }, { "epoch": 1.3445378151260505, "grad_norm": 3.736201047897339, "learning_rate": 6.740110526948376e-06, "loss": 0.1815, "step": 14720 }, { "epoch": 1.345451223967848, "grad_norm": 3.6584973335266113, "learning_rate": 6.7351276442071445e-06, "loss": 0.2069, "step": 14730 }, { "epoch": 1.3463646328096455, "grad_norm": 2.80116605758667, "learning_rate": 6.730142801485828e-06, "loss": 0.1868, "step": 14740 }, { "epoch": 1.3472780416514432, "grad_norm": 4.192399501800537, "learning_rate": 6.725156004415248e-06, "loss": 0.2364, "step": 14750 }, { "epoch": 1.3481914504932409, "grad_norm": 2.346083164215088, "learning_rate": 6.720167258628432e-06, "loss": 0.2347, "step": 14760 }, { "epoch": 1.3491048593350383, "grad_norm": 3.091513156890869, "learning_rate": 6.71517656976061e-06, "loss": 0.1879, "step": 14770 }, { "epoch": 1.350018268176836, "grad_norm": 2.7897732257843018, "learning_rate": 6.710183943449204e-06, "loss": 0.2068, "step": 14780 }, { "epoch": 1.3509316770186335, "grad_norm": 5.9270243644714355, "learning_rate": 6.70518938533383e-06, "loss": 0.248, "step": 14790 }, { "epoch": 1.3518450858604312, "grad_norm": 3.0038657188415527, "learning_rate": 6.700192901056281e-06, "loss": 0.2215, "step": 14800 }, { "epoch": 1.3527584947022286, "grad_norm": 3.9401769638061523, "learning_rate": 6.695194496260529e-06, "loss": 0.2079, "step": 14810 }, { "epoch": 1.3536719035440263, "grad_norm": 1.8506747484207153, "learning_rate": 6.690194176592712e-06, "loss": 0.2029, "step": 14820 }, { "epoch": 1.354585312385824, "grad_norm": 2.188506841659546, "learning_rate": 6.685191947701136e-06, "loss": 0.2235, "step": 14830 }, { "epoch": 1.3554987212276215, "grad_norm": 2.4241461753845215, "learning_rate": 6.680187815236259e-06, "loss": 0.2074, "step": 14840 }, { "epoch": 1.356412130069419, "grad_norm": 4.2991719245910645, "learning_rate": 6.675181784850693e-06, "loss": 0.2443, "step": 14850 }, { "epoch": 1.3573255389112167, "grad_norm": 8.489387512207031, "learning_rate": 6.670173862199188e-06, "loss": 0.2301, "step": 14860 }, { "epoch": 1.3582389477530143, "grad_norm": 3.413133382797241, "learning_rate": 6.665164052938639e-06, "loss": 0.1553, "step": 14870 }, { "epoch": 1.3591523565948118, "grad_norm": 5.71651029586792, "learning_rate": 6.660152362728066e-06, "loss": 0.2224, "step": 14880 }, { "epoch": 1.3600657654366095, "grad_norm": 3.1579153537750244, "learning_rate": 6.655138797228619e-06, "loss": 0.2348, "step": 14890 }, { "epoch": 1.360979174278407, "grad_norm": 2.5531671047210693, "learning_rate": 6.650123362103562e-06, "loss": 0.2221, "step": 14900 }, { "epoch": 1.3618925831202047, "grad_norm": 3.4404311180114746, "learning_rate": 6.645106063018273e-06, "loss": 0.2214, "step": 14910 }, { "epoch": 1.3628059919620021, "grad_norm": 2.9912664890289307, "learning_rate": 6.640086905640233e-06, "loss": 0.1966, "step": 14920 }, { "epoch": 1.3637194008037998, "grad_norm": 10.122900009155273, "learning_rate": 6.635065895639026e-06, "loss": 0.2061, "step": 14930 }, { "epoch": 1.3646328096455973, "grad_norm": 3.6725287437438965, "learning_rate": 6.630043038686326e-06, "loss": 0.2162, "step": 14940 }, { "epoch": 1.365546218487395, "grad_norm": 3.3008172512054443, "learning_rate": 6.6250183404558944e-06, "loss": 0.257, "step": 14950 }, { "epoch": 1.3664596273291925, "grad_norm": 3.9202938079833984, "learning_rate": 6.6199918066235715e-06, "loss": 0.3079, "step": 14960 }, { "epoch": 1.3673730361709902, "grad_norm": 4.932648658752441, "learning_rate": 6.614963442867272e-06, "loss": 0.196, "step": 14970 }, { "epoch": 1.3682864450127878, "grad_norm": 3.1213760375976562, "learning_rate": 6.609933254866977e-06, "loss": 0.2003, "step": 14980 }, { "epoch": 1.3691998538545853, "grad_norm": 4.4139404296875, "learning_rate": 6.604901248304731e-06, "loss": 0.2236, "step": 14990 }, { "epoch": 1.3701132626963828, "grad_norm": 5.154973983764648, "learning_rate": 6.599867428864626e-06, "loss": 0.2082, "step": 15000 }, { "epoch": 1.3710266715381805, "grad_norm": 2.0037505626678467, "learning_rate": 6.59483180223281e-06, "loss": 0.2063, "step": 15010 }, { "epoch": 1.3719400803799782, "grad_norm": 6.052274703979492, "learning_rate": 6.589794374097465e-06, "loss": 0.2228, "step": 15020 }, { "epoch": 1.3728534892217756, "grad_norm": 4.056301593780518, "learning_rate": 6.5847551501488146e-06, "loss": 0.1981, "step": 15030 }, { "epoch": 1.3737668980635733, "grad_norm": 2.600829839706421, "learning_rate": 6.579714136079107e-06, "loss": 0.2147, "step": 15040 }, { "epoch": 1.3746803069053708, "grad_norm": 3.85402250289917, "learning_rate": 6.574671337582612e-06, "loss": 0.2365, "step": 15050 }, { "epoch": 1.3755937157471685, "grad_norm": 3.443230628967285, "learning_rate": 6.5696267603556186e-06, "loss": 0.2033, "step": 15060 }, { "epoch": 1.376507124588966, "grad_norm": 3.9591519832611084, "learning_rate": 6.564580410096419e-06, "loss": 0.1784, "step": 15070 }, { "epoch": 1.3774205334307636, "grad_norm": 2.688361406326294, "learning_rate": 6.5595322925053164e-06, "loss": 0.2493, "step": 15080 }, { "epoch": 1.3783339422725611, "grad_norm": 6.052970886230469, "learning_rate": 6.554482413284604e-06, "loss": 0.1957, "step": 15090 }, { "epoch": 1.3792473511143588, "grad_norm": 3.119614839553833, "learning_rate": 6.549430778138566e-06, "loss": 0.2237, "step": 15100 }, { "epoch": 1.3801607599561563, "grad_norm": 2.8588523864746094, "learning_rate": 6.544377392773471e-06, "loss": 0.198, "step": 15110 }, { "epoch": 1.381074168797954, "grad_norm": 2.7218074798583984, "learning_rate": 6.539322262897565e-06, "loss": 0.2285, "step": 15120 }, { "epoch": 1.3819875776397517, "grad_norm": 3.2045183181762695, "learning_rate": 6.534265394221066e-06, "loss": 0.2317, "step": 15130 }, { "epoch": 1.3829009864815491, "grad_norm": 2.8873298168182373, "learning_rate": 6.529206792456153e-06, "loss": 0.2087, "step": 15140 }, { "epoch": 1.3838143953233466, "grad_norm": 4.341024875640869, "learning_rate": 6.5241464633169636e-06, "loss": 0.1955, "step": 15150 }, { "epoch": 1.3847278041651443, "grad_norm": 10.121057510375977, "learning_rate": 6.519084412519585e-06, "loss": 0.2854, "step": 15160 }, { "epoch": 1.385641213006942, "grad_norm": 3.422942876815796, "learning_rate": 6.514020645782055e-06, "loss": 0.2321, "step": 15170 }, { "epoch": 1.3865546218487395, "grad_norm": 3.966043472290039, "learning_rate": 6.508955168824343e-06, "loss": 0.1797, "step": 15180 }, { "epoch": 1.3874680306905371, "grad_norm": 2.550900459289551, "learning_rate": 6.503887987368355e-06, "loss": 0.2584, "step": 15190 }, { "epoch": 1.3883814395323346, "grad_norm": 3.015872001647949, "learning_rate": 6.498819107137919e-06, "loss": 0.2307, "step": 15200 }, { "epoch": 1.3892948483741323, "grad_norm": 4.135222434997559, "learning_rate": 6.493748533858785e-06, "loss": 0.168, "step": 15210 }, { "epoch": 1.3902082572159298, "grad_norm": 3.587167263031006, "learning_rate": 6.48867627325861e-06, "loss": 0.2507, "step": 15220 }, { "epoch": 1.3911216660577275, "grad_norm": 4.501245975494385, "learning_rate": 6.4836023310669674e-06, "loss": 0.2222, "step": 15230 }, { "epoch": 1.3920350748995252, "grad_norm": 3.931795120239258, "learning_rate": 6.4785267130153174e-06, "loss": 0.1709, "step": 15240 }, { "epoch": 1.3929484837413226, "grad_norm": 4.762365818023682, "learning_rate": 6.473449424837023e-06, "loss": 0.2381, "step": 15250 }, { "epoch": 1.39386189258312, "grad_norm": 4.2850661277771, "learning_rate": 6.4683704722673255e-06, "loss": 0.161, "step": 15260 }, { "epoch": 1.3947753014249178, "grad_norm": 3.7258284091949463, "learning_rate": 6.463289861043358e-06, "loss": 0.2484, "step": 15270 }, { "epoch": 1.3956887102667155, "grad_norm": 4.6610589027404785, "learning_rate": 6.458207596904114e-06, "loss": 0.2563, "step": 15280 }, { "epoch": 1.396602119108513, "grad_norm": 4.123058319091797, "learning_rate": 6.453123685590464e-06, "loss": 0.1831, "step": 15290 }, { "epoch": 1.3975155279503104, "grad_norm": 2.4859402179718018, "learning_rate": 6.448038132845131e-06, "loss": 0.2006, "step": 15300 }, { "epoch": 1.3984289367921081, "grad_norm": 2.762698173522949, "learning_rate": 6.442950944412702e-06, "loss": 0.2011, "step": 15310 }, { "epoch": 1.3993423456339058, "grad_norm": 2.7418832778930664, "learning_rate": 6.437862126039602e-06, "loss": 0.2145, "step": 15320 }, { "epoch": 1.4002557544757033, "grad_norm": 2.4134113788604736, "learning_rate": 6.432771683474104e-06, "loss": 0.1915, "step": 15330 }, { "epoch": 1.401169163317501, "grad_norm": 3.5470900535583496, "learning_rate": 6.42767962246631e-06, "loss": 0.1435, "step": 15340 }, { "epoch": 1.4020825721592984, "grad_norm": 4.090826034545898, "learning_rate": 6.422585948768154e-06, "loss": 0.2447, "step": 15350 }, { "epoch": 1.4029959810010961, "grad_norm": 2.599471092224121, "learning_rate": 6.417490668133393e-06, "loss": 0.1789, "step": 15360 }, { "epoch": 1.4039093898428936, "grad_norm": 3.324401378631592, "learning_rate": 6.412393786317596e-06, "loss": 0.2175, "step": 15370 }, { "epoch": 1.4048227986846913, "grad_norm": 1.9480727910995483, "learning_rate": 6.407295309078139e-06, "loss": 0.1913, "step": 15380 }, { "epoch": 1.405736207526489, "grad_norm": 3.71199107170105, "learning_rate": 6.402195242174207e-06, "loss": 0.1931, "step": 15390 }, { "epoch": 1.4066496163682864, "grad_norm": 3.2143208980560303, "learning_rate": 6.397093591366772e-06, "loss": 0.2338, "step": 15400 }, { "epoch": 1.407563025210084, "grad_norm": 2.774604558944702, "learning_rate": 6.391990362418603e-06, "loss": 0.1636, "step": 15410 }, { "epoch": 1.4084764340518816, "grad_norm": 3.2456367015838623, "learning_rate": 6.386885561094248e-06, "loss": 0.2685, "step": 15420 }, { "epoch": 1.4093898428936793, "grad_norm": 3.5734922885894775, "learning_rate": 6.3817791931600315e-06, "loss": 0.2206, "step": 15430 }, { "epoch": 1.4103032517354768, "grad_norm": 2.9169046878814697, "learning_rate": 6.376671264384044e-06, "loss": 0.2308, "step": 15440 }, { "epoch": 1.4112166605772745, "grad_norm": 3.178170919418335, "learning_rate": 6.371561780536148e-06, "loss": 0.2201, "step": 15450 }, { "epoch": 1.412130069419072, "grad_norm": 3.0124218463897705, "learning_rate": 6.366450747387953e-06, "loss": 0.1976, "step": 15460 }, { "epoch": 1.4130434782608696, "grad_norm": 2.408907890319824, "learning_rate": 6.361338170712827e-06, "loss": 0.2013, "step": 15470 }, { "epoch": 1.413956887102667, "grad_norm": 4.400557994842529, "learning_rate": 6.356224056285874e-06, "loss": 0.2337, "step": 15480 }, { "epoch": 1.4148702959444648, "grad_norm": 2.5192930698394775, "learning_rate": 6.351108409883937e-06, "loss": 0.1922, "step": 15490 }, { "epoch": 1.4157837047862623, "grad_norm": 3.3259811401367188, "learning_rate": 6.345991237285596e-06, "loss": 0.2193, "step": 15500 }, { "epoch": 1.41669711362806, "grad_norm": 3.6572606563568115, "learning_rate": 6.340872544271146e-06, "loss": 0.2663, "step": 15510 }, { "epoch": 1.4176105224698574, "grad_norm": 3.926248788833618, "learning_rate": 6.335752336622605e-06, "loss": 0.2187, "step": 15520 }, { "epoch": 1.418523931311655, "grad_norm": 4.040346145629883, "learning_rate": 6.3306306201237e-06, "loss": 0.2735, "step": 15530 }, { "epoch": 1.4194373401534528, "grad_norm": 0.8825086355209351, "learning_rate": 6.32550740055986e-06, "loss": 0.2633, "step": 15540 }, { "epoch": 1.4203507489952503, "grad_norm": 1.5545097589492798, "learning_rate": 6.32038268371822e-06, "loss": 0.203, "step": 15550 }, { "epoch": 1.4212641578370477, "grad_norm": 3.3029236793518066, "learning_rate": 6.3152564753875964e-06, "loss": 0.2141, "step": 15560 }, { "epoch": 1.4221775666788454, "grad_norm": 4.235395431518555, "learning_rate": 6.310128781358497e-06, "loss": 0.1717, "step": 15570 }, { "epoch": 1.4230909755206431, "grad_norm": 2.9495558738708496, "learning_rate": 6.304999607423102e-06, "loss": 0.2306, "step": 15580 }, { "epoch": 1.4240043843624406, "grad_norm": 5.0329179763793945, "learning_rate": 6.2998689593752705e-06, "loss": 0.2343, "step": 15590 }, { "epoch": 1.4249177932042383, "grad_norm": 1.9405913352966309, "learning_rate": 6.294736843010523e-06, "loss": 0.1959, "step": 15600 }, { "epoch": 1.4258312020460358, "grad_norm": 3.578904867172241, "learning_rate": 6.289603264126037e-06, "loss": 0.2163, "step": 15610 }, { "epoch": 1.4267446108878334, "grad_norm": 2.5725467205047607, "learning_rate": 6.284468228520644e-06, "loss": 0.2031, "step": 15620 }, { "epoch": 1.427658019729631, "grad_norm": 3.9848148822784424, "learning_rate": 6.279331741994821e-06, "loss": 0.1949, "step": 15630 }, { "epoch": 1.4285714285714286, "grad_norm": 2.5954580307006836, "learning_rate": 6.274193810350684e-06, "loss": 0.1989, "step": 15640 }, { "epoch": 1.4294848374132263, "grad_norm": 3.8013882637023926, "learning_rate": 6.269054439391981e-06, "loss": 0.2113, "step": 15650 }, { "epoch": 1.4303982462550238, "grad_norm": 2.5184106826782227, "learning_rate": 6.2639136349240845e-06, "loss": 0.2033, "step": 15660 }, { "epoch": 1.4313116550968212, "grad_norm": 2.5984761714935303, "learning_rate": 6.258771402753992e-06, "loss": 0.1811, "step": 15670 }, { "epoch": 1.432225063938619, "grad_norm": 2.1849164962768555, "learning_rate": 6.253627748690302e-06, "loss": 0.1937, "step": 15680 }, { "epoch": 1.4331384727804166, "grad_norm": 4.360562324523926, "learning_rate": 6.248482678543234e-06, "loss": 0.2219, "step": 15690 }, { "epoch": 1.434051881622214, "grad_norm": 2.413933038711548, "learning_rate": 6.243336198124595e-06, "loss": 0.1733, "step": 15700 }, { "epoch": 1.4349652904640116, "grad_norm": 7.201540946960449, "learning_rate": 6.238188313247793e-06, "loss": 0.196, "step": 15710 }, { "epoch": 1.4358786993058092, "grad_norm": 2.7643470764160156, "learning_rate": 6.233039029727814e-06, "loss": 0.1595, "step": 15720 }, { "epoch": 1.436792108147607, "grad_norm": 2.7425003051757812, "learning_rate": 6.227888353381235e-06, "loss": 0.2712, "step": 15730 }, { "epoch": 1.4377055169894044, "grad_norm": 2.9494428634643555, "learning_rate": 6.222736290026195e-06, "loss": 0.2293, "step": 15740 }, { "epoch": 1.438618925831202, "grad_norm": 2.7962307929992676, "learning_rate": 6.217582845482411e-06, "loss": 0.2079, "step": 15750 }, { "epoch": 1.4395323346729996, "grad_norm": 2.821167230606079, "learning_rate": 6.212428025571149e-06, "loss": 0.2537, "step": 15760 }, { "epoch": 1.4404457435147973, "grad_norm": 2.8076224327087402, "learning_rate": 6.207271836115236e-06, "loss": 0.1968, "step": 15770 }, { "epoch": 1.4413591523565947, "grad_norm": 2.757390260696411, "learning_rate": 6.202114282939042e-06, "loss": 0.2542, "step": 15780 }, { "epoch": 1.4422725611983924, "grad_norm": 3.394080400466919, "learning_rate": 6.196955371868484e-06, "loss": 0.1852, "step": 15790 }, { "epoch": 1.4431859700401901, "grad_norm": 6.4298834800720215, "learning_rate": 6.191795108731002e-06, "loss": 0.2079, "step": 15800 }, { "epoch": 1.4440993788819876, "grad_norm": 4.135176181793213, "learning_rate": 6.186633499355576e-06, "loss": 0.2112, "step": 15810 }, { "epoch": 1.445012787723785, "grad_norm": 3.601177930831909, "learning_rate": 6.181470549572692e-06, "loss": 0.2353, "step": 15820 }, { "epoch": 1.4459261965655827, "grad_norm": 3.0361344814300537, "learning_rate": 6.176306265214367e-06, "loss": 0.2307, "step": 15830 }, { "epoch": 1.4468396054073804, "grad_norm": 3.035965919494629, "learning_rate": 6.171140652114111e-06, "loss": 0.1845, "step": 15840 }, { "epoch": 1.447753014249178, "grad_norm": 5.092061519622803, "learning_rate": 6.1659737161069445e-06, "loss": 0.2179, "step": 15850 }, { "epoch": 1.4486664230909756, "grad_norm": 3.8095786571502686, "learning_rate": 6.160805463029375e-06, "loss": 0.1793, "step": 15860 }, { "epoch": 1.449579831932773, "grad_norm": 2.589176654815674, "learning_rate": 6.155635898719402e-06, "loss": 0.2358, "step": 15870 }, { "epoch": 1.4504932407745708, "grad_norm": 7.876865386962891, "learning_rate": 6.150465029016507e-06, "loss": 0.234, "step": 15880 }, { "epoch": 1.4514066496163682, "grad_norm": 5.43609094619751, "learning_rate": 6.145292859761644e-06, "loss": 0.1738, "step": 15890 }, { "epoch": 1.452320058458166, "grad_norm": 13.68411636352539, "learning_rate": 6.140119396797235e-06, "loss": 0.2167, "step": 15900 }, { "epoch": 1.4532334672999634, "grad_norm": 3.572451591491699, "learning_rate": 6.134944645967165e-06, "loss": 0.1761, "step": 15910 }, { "epoch": 1.454146876141761, "grad_norm": 3.4407269954681396, "learning_rate": 6.129768613116769e-06, "loss": 0.2627, "step": 15920 }, { "epoch": 1.4550602849835585, "grad_norm": 3.138425588607788, "learning_rate": 6.1245913040928395e-06, "loss": 0.1859, "step": 15930 }, { "epoch": 1.4559736938253562, "grad_norm": 2.2452657222747803, "learning_rate": 6.1194127247435994e-06, "loss": 0.2005, "step": 15940 }, { "epoch": 1.456887102667154, "grad_norm": 6.45519495010376, "learning_rate": 6.114232880918717e-06, "loss": 0.1996, "step": 15950 }, { "epoch": 1.4578005115089514, "grad_norm": 6.371825695037842, "learning_rate": 6.109051778469277e-06, "loss": 0.2108, "step": 15960 }, { "epoch": 1.4587139203507489, "grad_norm": 3.59432315826416, "learning_rate": 6.1038694232478e-06, "loss": 0.2253, "step": 15970 }, { "epoch": 1.4596273291925466, "grad_norm": 2.896901845932007, "learning_rate": 6.09868582110821e-06, "loss": 0.2132, "step": 15980 }, { "epoch": 1.4605407380343443, "grad_norm": 5.494755268096924, "learning_rate": 6.0935009779058465e-06, "loss": 0.1959, "step": 15990 }, { "epoch": 1.4614541468761417, "grad_norm": 2.642092704772949, "learning_rate": 6.088314899497446e-06, "loss": 0.2169, "step": 16000 }, { "epoch": 1.4623675557179394, "grad_norm": 1.8047994375228882, "learning_rate": 6.083127591741144e-06, "loss": 0.2156, "step": 16010 }, { "epoch": 1.4632809645597369, "grad_norm": 3.150040626525879, "learning_rate": 6.077939060496462e-06, "loss": 0.2188, "step": 16020 }, { "epoch": 1.4641943734015346, "grad_norm": 4.752093315124512, "learning_rate": 6.072749311624307e-06, "loss": 0.2263, "step": 16030 }, { "epoch": 1.465107782243332, "grad_norm": 4.300808906555176, "learning_rate": 6.067558350986955e-06, "loss": 0.2092, "step": 16040 }, { "epoch": 1.4660211910851297, "grad_norm": 2.8665313720703125, "learning_rate": 6.062366184448061e-06, "loss": 0.1557, "step": 16050 }, { "epoch": 1.4669345999269274, "grad_norm": 3.3255178928375244, "learning_rate": 6.05717281787263e-06, "loss": 0.24, "step": 16060 }, { "epoch": 1.467848008768725, "grad_norm": 3.3077478408813477, "learning_rate": 6.051978257127034e-06, "loss": 0.2675, "step": 16070 }, { "epoch": 1.4687614176105224, "grad_norm": 4.466035842895508, "learning_rate": 6.046782508078986e-06, "loss": 0.1679, "step": 16080 }, { "epoch": 1.46967482645232, "grad_norm": 3.3253157138824463, "learning_rate": 6.041585576597544e-06, "loss": 0.2781, "step": 16090 }, { "epoch": 1.4705882352941178, "grad_norm": 2.82365083694458, "learning_rate": 6.036387468553101e-06, "loss": 0.2813, "step": 16100 }, { "epoch": 1.4715016441359152, "grad_norm": 3.464719533920288, "learning_rate": 6.031188189817381e-06, "loss": 0.2417, "step": 16110 }, { "epoch": 1.4724150529777127, "grad_norm": 2.5926101207733154, "learning_rate": 6.025987746263428e-06, "loss": 0.2559, "step": 16120 }, { "epoch": 1.4733284618195104, "grad_norm": 2.2432174682617188, "learning_rate": 6.020786143765604e-06, "loss": 0.2512, "step": 16130 }, { "epoch": 1.474241870661308, "grad_norm": 4.742743492126465, "learning_rate": 6.015583388199577e-06, "loss": 0.2911, "step": 16140 }, { "epoch": 1.4751552795031055, "grad_norm": 6.570094585418701, "learning_rate": 6.0103794854423216e-06, "loss": 0.2056, "step": 16150 }, { "epoch": 1.4760686883449032, "grad_norm": 5.570037364959717, "learning_rate": 6.005174441372105e-06, "loss": 0.2075, "step": 16160 }, { "epoch": 1.4769820971867007, "grad_norm": 5.214116096496582, "learning_rate": 5.999968261868485e-06, "loss": 0.209, "step": 16170 }, { "epoch": 1.4778955060284984, "grad_norm": 4.514930725097656, "learning_rate": 5.994760952812301e-06, "loss": 0.1969, "step": 16180 }, { "epoch": 1.4788089148702959, "grad_norm": 4.129974365234375, "learning_rate": 5.98955252008567e-06, "loss": 0.1819, "step": 16190 }, { "epoch": 1.4797223237120936, "grad_norm": 4.396636486053467, "learning_rate": 5.984342969571974e-06, "loss": 0.259, "step": 16200 }, { "epoch": 1.4806357325538912, "grad_norm": 1.517594337463379, "learning_rate": 5.979132307155867e-06, "loss": 0.2055, "step": 16210 }, { "epoch": 1.4815491413956887, "grad_norm": 4.738906383514404, "learning_rate": 5.973920538723248e-06, "loss": 0.1869, "step": 16220 }, { "epoch": 1.4824625502374862, "grad_norm": 3.4806971549987793, "learning_rate": 5.968707670161272e-06, "loss": 0.212, "step": 16230 }, { "epoch": 1.4833759590792839, "grad_norm": 10.074590682983398, "learning_rate": 5.963493707358335e-06, "loss": 0.2291, "step": 16240 }, { "epoch": 1.4842893679210816, "grad_norm": 4.613361358642578, "learning_rate": 5.958278656204067e-06, "loss": 0.2063, "step": 16250 }, { "epoch": 1.485202776762879, "grad_norm": 2.4719226360321045, "learning_rate": 5.953062522589332e-06, "loss": 0.2155, "step": 16260 }, { "epoch": 1.4861161856046767, "grad_norm": 2.402780771255493, "learning_rate": 5.947845312406213e-06, "loss": 0.1846, "step": 16270 }, { "epoch": 1.4870295944464742, "grad_norm": 3.018491744995117, "learning_rate": 5.942627031548011e-06, "loss": 0.2098, "step": 16280 }, { "epoch": 1.487943003288272, "grad_norm": 3.2780234813690186, "learning_rate": 5.937407685909233e-06, "loss": 0.1973, "step": 16290 }, { "epoch": 1.4888564121300694, "grad_norm": 3.8148062229156494, "learning_rate": 5.9321872813855944e-06, "loss": 0.235, "step": 16300 }, { "epoch": 1.489769820971867, "grad_norm": 4.270725727081299, "learning_rate": 5.926965823874002e-06, "loss": 0.172, "step": 16310 }, { "epoch": 1.4906832298136645, "grad_norm": 3.7470014095306396, "learning_rate": 5.9217433192725546e-06, "loss": 0.2138, "step": 16320 }, { "epoch": 1.4915966386554622, "grad_norm": 3.2222182750701904, "learning_rate": 5.916519773480533e-06, "loss": 0.2191, "step": 16330 }, { "epoch": 1.4925100474972597, "grad_norm": 5.767870903015137, "learning_rate": 5.911295192398392e-06, "loss": 0.2141, "step": 16340 }, { "epoch": 1.4934234563390574, "grad_norm": 10.515514373779297, "learning_rate": 5.906069581927762e-06, "loss": 0.2346, "step": 16350 }, { "epoch": 1.494336865180855, "grad_norm": 2.3011395931243896, "learning_rate": 5.900842947971427e-06, "loss": 0.2459, "step": 16360 }, { "epoch": 1.4952502740226525, "grad_norm": 3.8987410068511963, "learning_rate": 5.895615296433336e-06, "loss": 0.1879, "step": 16370 }, { "epoch": 1.49616368286445, "grad_norm": 2.893587589263916, "learning_rate": 5.890386633218581e-06, "loss": 0.2167, "step": 16380 }, { "epoch": 1.4970770917062477, "grad_norm": 1.6947801113128662, "learning_rate": 5.885156964233401e-06, "loss": 0.1679, "step": 16390 }, { "epoch": 1.4979905005480454, "grad_norm": 2.136077642440796, "learning_rate": 5.8799262953851675e-06, "loss": 0.2212, "step": 16400 }, { "epoch": 1.4989039093898429, "grad_norm": 2.8037281036376953, "learning_rate": 5.8746946325823875e-06, "loss": 0.1653, "step": 16410 }, { "epoch": 1.4998173182316406, "grad_norm": 4.6968464851379395, "learning_rate": 5.869461981734683e-06, "loss": 0.1981, "step": 16420 }, { "epoch": 1.500730727073438, "grad_norm": 7.161553859710693, "learning_rate": 5.8642283487527984e-06, "loss": 0.2338, "step": 16430 }, { "epoch": 1.5016441359152357, "grad_norm": 4.627858638763428, "learning_rate": 5.8589937395485805e-06, "loss": 0.2398, "step": 16440 }, { "epoch": 1.5025575447570332, "grad_norm": 3.320897102355957, "learning_rate": 5.853758160034989e-06, "loss": 0.1931, "step": 16450 }, { "epoch": 1.5034709535988309, "grad_norm": 2.5085501670837402, "learning_rate": 5.8485216161260695e-06, "loss": 0.2291, "step": 16460 }, { "epoch": 1.5043843624406286, "grad_norm": 3.9115395545959473, "learning_rate": 5.843284113736964e-06, "loss": 0.228, "step": 16470 }, { "epoch": 1.505297771282426, "grad_norm": 2.786682605743408, "learning_rate": 5.838045658783892e-06, "loss": 0.2275, "step": 16480 }, { "epoch": 1.5062111801242235, "grad_norm": 2.261547803878784, "learning_rate": 5.832806257184154e-06, "loss": 0.2094, "step": 16490 }, { "epoch": 1.5071245889660212, "grad_norm": 7.512785911560059, "learning_rate": 5.8275659148561145e-06, "loss": 0.1575, "step": 16500 }, { "epoch": 1.5080379978078189, "grad_norm": 3.2879831790924072, "learning_rate": 5.822324637719208e-06, "loss": 0.183, "step": 16510 }, { "epoch": 1.5089514066496164, "grad_norm": 5.776729583740234, "learning_rate": 5.817082431693916e-06, "loss": 0.232, "step": 16520 }, { "epoch": 1.5098648154914138, "grad_norm": 2.8525097370147705, "learning_rate": 5.811839302701775e-06, "loss": 0.2671, "step": 16530 }, { "epoch": 1.5107782243332115, "grad_norm": 2.2625041007995605, "learning_rate": 5.806595256665364e-06, "loss": 0.2438, "step": 16540 }, { "epoch": 1.5116916331750092, "grad_norm": 3.0947136878967285, "learning_rate": 5.801350299508296e-06, "loss": 0.2405, "step": 16550 }, { "epoch": 1.5126050420168067, "grad_norm": 7.179372310638428, "learning_rate": 5.796104437155213e-06, "loss": 0.1822, "step": 16560 }, { "epoch": 1.5135184508586041, "grad_norm": 2.15864634513855, "learning_rate": 5.7908576755317815e-06, "loss": 0.1729, "step": 16570 }, { "epoch": 1.514431859700402, "grad_norm": 3.3228933811187744, "learning_rate": 5.78561002056468e-06, "loss": 0.1994, "step": 16580 }, { "epoch": 1.5153452685421995, "grad_norm": 3.467407464981079, "learning_rate": 5.780361478181604e-06, "loss": 0.2182, "step": 16590 }, { "epoch": 1.516258677383997, "grad_norm": 3.6120080947875977, "learning_rate": 5.7751120543112404e-06, "loss": 0.229, "step": 16600 }, { "epoch": 1.5171720862257947, "grad_norm": 3.1206893920898438, "learning_rate": 5.769861754883282e-06, "loss": 0.2112, "step": 16610 }, { "epoch": 1.5180854950675924, "grad_norm": 2.281353235244751, "learning_rate": 5.764610585828404e-06, "loss": 0.1798, "step": 16620 }, { "epoch": 1.5189989039093899, "grad_norm": 3.5455265045166016, "learning_rate": 5.759358553078266e-06, "loss": 0.16, "step": 16630 }, { "epoch": 1.5199123127511873, "grad_norm": 6.807119846343994, "learning_rate": 5.754105662565503e-06, "loss": 0.1915, "step": 16640 }, { "epoch": 1.520825721592985, "grad_norm": 2.532334089279175, "learning_rate": 5.748851920223723e-06, "loss": 0.1666, "step": 16650 }, { "epoch": 1.5217391304347827, "grad_norm": 4.150599479675293, "learning_rate": 5.743597331987487e-06, "loss": 0.1923, "step": 16660 }, { "epoch": 1.5226525392765802, "grad_norm": 7.4044013023376465, "learning_rate": 5.73834190379232e-06, "loss": 0.1877, "step": 16670 }, { "epoch": 1.5235659481183776, "grad_norm": 3.2114574909210205, "learning_rate": 5.733085641574694e-06, "loss": 0.2332, "step": 16680 }, { "epoch": 1.5244793569601753, "grad_norm": 4.609951019287109, "learning_rate": 5.72782855127202e-06, "loss": 0.1992, "step": 16690 }, { "epoch": 1.525392765801973, "grad_norm": 2.514793634414673, "learning_rate": 5.722570638822646e-06, "loss": 0.2126, "step": 16700 }, { "epoch": 1.5263061746437705, "grad_norm": 3.196277618408203, "learning_rate": 5.71731191016585e-06, "loss": 0.1791, "step": 16710 }, { "epoch": 1.5272195834855682, "grad_norm": 2.311365842819214, "learning_rate": 5.712052371241829e-06, "loss": 0.2257, "step": 16720 }, { "epoch": 1.5281329923273659, "grad_norm": 2.7965362071990967, "learning_rate": 5.706792027991701e-06, "loss": 0.1825, "step": 16730 }, { "epoch": 1.5290464011691633, "grad_norm": 4.075442314147949, "learning_rate": 5.701530886357485e-06, "loss": 0.2039, "step": 16740 }, { "epoch": 1.5299598100109608, "grad_norm": 3.7073237895965576, "learning_rate": 5.6962689522821094e-06, "loss": 0.1647, "step": 16750 }, { "epoch": 1.5308732188527585, "grad_norm": 2.3755064010620117, "learning_rate": 5.69100623170939e-06, "loss": 0.1841, "step": 16760 }, { "epoch": 1.5317866276945562, "grad_norm": 2.336045026779175, "learning_rate": 5.685742730584036e-06, "loss": 0.1897, "step": 16770 }, { "epoch": 1.5327000365363537, "grad_norm": 3.1137471199035645, "learning_rate": 5.680478454851639e-06, "loss": 0.1986, "step": 16780 }, { "epoch": 1.5336134453781511, "grad_norm": 4.241494178771973, "learning_rate": 5.6752134104586646e-06, "loss": 0.2265, "step": 16790 }, { "epoch": 1.5345268542199488, "grad_norm": 2.6880757808685303, "learning_rate": 5.6699476033524425e-06, "loss": 0.1721, "step": 16800 }, { "epoch": 1.5354402630617465, "grad_norm": 5.256082534790039, "learning_rate": 5.66468103948117e-06, "loss": 0.2009, "step": 16810 }, { "epoch": 1.536353671903544, "grad_norm": 8.573625564575195, "learning_rate": 5.659413724793895e-06, "loss": 0.2806, "step": 16820 }, { "epoch": 1.5372670807453415, "grad_norm": 2.6147730350494385, "learning_rate": 5.654145665240518e-06, "loss": 0.2662, "step": 16830 }, { "epoch": 1.5381804895871392, "grad_norm": 2.1453990936279297, "learning_rate": 5.648876866771774e-06, "loss": 0.1814, "step": 16840 }, { "epoch": 1.5390938984289368, "grad_norm": 2.572502374649048, "learning_rate": 5.643607335339241e-06, "loss": 0.2437, "step": 16850 }, { "epoch": 1.5400073072707343, "grad_norm": 2.544875383377075, "learning_rate": 5.638337076895315e-06, "loss": 0.2155, "step": 16860 }, { "epoch": 1.540920716112532, "grad_norm": 6.378917217254639, "learning_rate": 5.633066097393225e-06, "loss": 0.2462, "step": 16870 }, { "epoch": 1.5418341249543297, "grad_norm": 3.9392313957214355, "learning_rate": 5.627794402787005e-06, "loss": 0.313, "step": 16880 }, { "epoch": 1.5427475337961272, "grad_norm": 3.492319107055664, "learning_rate": 5.622521999031502e-06, "loss": 0.2111, "step": 16890 }, { "epoch": 1.5436609426379246, "grad_norm": 3.5727779865264893, "learning_rate": 5.617248892082359e-06, "loss": 0.216, "step": 16900 }, { "epoch": 1.5445743514797223, "grad_norm": 3.656167984008789, "learning_rate": 5.611975087896019e-06, "loss": 0.2239, "step": 16910 }, { "epoch": 1.54548776032152, "grad_norm": 2.673614263534546, "learning_rate": 5.606700592429709e-06, "loss": 0.2364, "step": 16920 }, { "epoch": 1.5464011691633175, "grad_norm": 7.0991644859313965, "learning_rate": 5.601425411641441e-06, "loss": 0.2639, "step": 16930 }, { "epoch": 1.547314578005115, "grad_norm": 3.5728695392608643, "learning_rate": 5.5961495514899945e-06, "loss": 0.2522, "step": 16940 }, { "epoch": 1.5482279868469127, "grad_norm": 4.170254230499268, "learning_rate": 5.590873017934921e-06, "loss": 0.1744, "step": 16950 }, { "epoch": 1.5491413956887103, "grad_norm": 3.4760966300964355, "learning_rate": 5.585595816936531e-06, "loss": 0.2025, "step": 16960 }, { "epoch": 1.5500548045305078, "grad_norm": 2.612725019454956, "learning_rate": 5.580317954455893e-06, "loss": 0.236, "step": 16970 }, { "epoch": 1.5509682133723053, "grad_norm": 3.4184556007385254, "learning_rate": 5.575039436454815e-06, "loss": 0.2257, "step": 16980 }, { "epoch": 1.5518816222141032, "grad_norm": 2.7252728939056396, "learning_rate": 5.569760268895853e-06, "loss": 0.2468, "step": 16990 }, { "epoch": 1.5527950310559007, "grad_norm": 6.776663303375244, "learning_rate": 5.564480457742289e-06, "loss": 0.1812, "step": 17000 }, { "epoch": 1.5537084398976981, "grad_norm": 7.591137886047363, "learning_rate": 5.5592000089581434e-06, "loss": 0.2728, "step": 17010 }, { "epoch": 1.5546218487394958, "grad_norm": 3.1431570053100586, "learning_rate": 5.553918928508144e-06, "loss": 0.2178, "step": 17020 }, { "epoch": 1.5555352575812935, "grad_norm": 2.07150936126709, "learning_rate": 5.548637222357742e-06, "loss": 0.1887, "step": 17030 }, { "epoch": 1.556448666423091, "grad_norm": 3.1516575813293457, "learning_rate": 5.543354896473089e-06, "loss": 0.2411, "step": 17040 }, { "epoch": 1.5573620752648885, "grad_norm": 2.6478233337402344, "learning_rate": 5.538071956821039e-06, "loss": 0.2208, "step": 17050 }, { "epoch": 1.5582754841066861, "grad_norm": 8.117449760437012, "learning_rate": 5.532788409369141e-06, "loss": 0.222, "step": 17060 }, { "epoch": 1.5591888929484838, "grad_norm": 2.4808809757232666, "learning_rate": 5.527504260085629e-06, "loss": 0.1916, "step": 17070 }, { "epoch": 1.5601023017902813, "grad_norm": 3.8754422664642334, "learning_rate": 5.522219514939415e-06, "loss": 0.2247, "step": 17080 }, { "epoch": 1.5610157106320788, "grad_norm": 4.531260967254639, "learning_rate": 5.516934179900089e-06, "loss": 0.2278, "step": 17090 }, { "epoch": 1.5619291194738765, "grad_norm": 2.355390787124634, "learning_rate": 5.5116482609378984e-06, "loss": 0.1806, "step": 17100 }, { "epoch": 1.5628425283156742, "grad_norm": 3.267404556274414, "learning_rate": 5.506361764023763e-06, "loss": 0.1953, "step": 17110 }, { "epoch": 1.5637559371574716, "grad_norm": 6.963986396789551, "learning_rate": 5.5010746951292445e-06, "loss": 0.2196, "step": 17120 }, { "epoch": 1.5646693459992693, "grad_norm": 3.2322661876678467, "learning_rate": 5.495787060226556e-06, "loss": 0.2175, "step": 17130 }, { "epoch": 1.565582754841067, "grad_norm": 8.063596725463867, "learning_rate": 5.4904988652885475e-06, "loss": 0.209, "step": 17140 }, { "epoch": 1.5664961636828645, "grad_norm": 1.7485425472259521, "learning_rate": 5.485210116288704e-06, "loss": 0.203, "step": 17150 }, { "epoch": 1.567409572524662, "grad_norm": 3.8213436603546143, "learning_rate": 5.479920819201134e-06, "loss": 0.1926, "step": 17160 }, { "epoch": 1.5683229813664596, "grad_norm": 2.4016366004943848, "learning_rate": 5.474630980000567e-06, "loss": 0.2266, "step": 17170 }, { "epoch": 1.5692363902082573, "grad_norm": 2.2852139472961426, "learning_rate": 5.469340604662343e-06, "loss": 0.2336, "step": 17180 }, { "epoch": 1.5701497990500548, "grad_norm": 4.060347080230713, "learning_rate": 5.464049699162411e-06, "loss": 0.1999, "step": 17190 }, { "epoch": 1.5710632078918523, "grad_norm": 4.850257396697998, "learning_rate": 5.4587582694773135e-06, "loss": 0.2121, "step": 17200 }, { "epoch": 1.57197661673365, "grad_norm": 2.742845058441162, "learning_rate": 5.453466321584191e-06, "loss": 0.1668, "step": 17210 }, { "epoch": 1.5728900255754477, "grad_norm": 5.8409037590026855, "learning_rate": 5.448173861460764e-06, "loss": 0.1875, "step": 17220 }, { "epoch": 1.5738034344172451, "grad_norm": 3.0315442085266113, "learning_rate": 5.442880895085336e-06, "loss": 0.2094, "step": 17230 }, { "epoch": 1.5747168432590426, "grad_norm": 6.808951377868652, "learning_rate": 5.437587428436779e-06, "loss": 0.2147, "step": 17240 }, { "epoch": 1.5756302521008403, "grad_norm": 2.2542402744293213, "learning_rate": 5.432293467494534e-06, "loss": 0.1651, "step": 17250 }, { "epoch": 1.576543660942638, "grad_norm": 2.7639172077178955, "learning_rate": 5.426999018238594e-06, "loss": 0.2005, "step": 17260 }, { "epoch": 1.5774570697844355, "grad_norm": 4.895577907562256, "learning_rate": 5.421704086649513e-06, "loss": 0.2133, "step": 17270 }, { "epoch": 1.5783704786262331, "grad_norm": 2.4970266819000244, "learning_rate": 5.4164086787083794e-06, "loss": 0.2459, "step": 17280 }, { "epoch": 1.5792838874680308, "grad_norm": 2.551989793777466, "learning_rate": 5.411112800396827e-06, "loss": 0.2243, "step": 17290 }, { "epoch": 1.5801972963098283, "grad_norm": 2.8296141624450684, "learning_rate": 5.405816457697017e-06, "loss": 0.211, "step": 17300 }, { "epoch": 1.5811107051516258, "grad_norm": 7.677258014678955, "learning_rate": 5.400519656591641e-06, "loss": 0.2077, "step": 17310 }, { "epoch": 1.5820241139934235, "grad_norm": 3.5612876415252686, "learning_rate": 5.395222403063898e-06, "loss": 0.231, "step": 17320 }, { "epoch": 1.5829375228352212, "grad_norm": 5.6142730712890625, "learning_rate": 5.389924703097508e-06, "loss": 0.183, "step": 17330 }, { "epoch": 1.5838509316770186, "grad_norm": 7.220236301422119, "learning_rate": 5.384626562676692e-06, "loss": 0.2281, "step": 17340 }, { "epoch": 1.584764340518816, "grad_norm": 4.546471118927002, "learning_rate": 5.379327987786166e-06, "loss": 0.1952, "step": 17350 }, { "epoch": 1.5856777493606138, "grad_norm": 3.6837146282196045, "learning_rate": 5.3740289844111394e-06, "loss": 0.202, "step": 17360 }, { "epoch": 1.5865911582024115, "grad_norm": 3.069180965423584, "learning_rate": 5.368729558537306e-06, "loss": 0.1969, "step": 17370 }, { "epoch": 1.587504567044209, "grad_norm": 2.067448616027832, "learning_rate": 5.363429716150833e-06, "loss": 0.1913, "step": 17380 }, { "epoch": 1.5884179758860064, "grad_norm": 2.702258348464966, "learning_rate": 5.358129463238366e-06, "loss": 0.1962, "step": 17390 }, { "epoch": 1.5893313847278043, "grad_norm": 3.01741099357605, "learning_rate": 5.352828805787005e-06, "loss": 0.208, "step": 17400 }, { "epoch": 1.5902447935696018, "grad_norm": 2.522817850112915, "learning_rate": 5.347527749784313e-06, "loss": 0.2083, "step": 17410 }, { "epoch": 1.5911582024113993, "grad_norm": 4.744843482971191, "learning_rate": 5.342226301218301e-06, "loss": 0.1912, "step": 17420 }, { "epoch": 1.592071611253197, "grad_norm": 5.966942310333252, "learning_rate": 5.336924466077423e-06, "loss": 0.1871, "step": 17430 }, { "epoch": 1.5929850200949947, "grad_norm": 2.9549195766448975, "learning_rate": 5.3316222503505725e-06, "loss": 0.2641, "step": 17440 }, { "epoch": 1.5938984289367921, "grad_norm": 4.692993640899658, "learning_rate": 5.32631966002707e-06, "loss": 0.2426, "step": 17450 }, { "epoch": 1.5948118377785896, "grad_norm": 4.104866027832031, "learning_rate": 5.321016701096661e-06, "loss": 0.2581, "step": 17460 }, { "epoch": 1.5957252466203873, "grad_norm": 3.5279998779296875, "learning_rate": 5.3157133795495034e-06, "loss": 0.2039, "step": 17470 }, { "epoch": 1.596638655462185, "grad_norm": 4.539487838745117, "learning_rate": 5.31040970137617e-06, "loss": 0.2213, "step": 17480 }, { "epoch": 1.5975520643039824, "grad_norm": 2.614314079284668, "learning_rate": 5.305105672567636e-06, "loss": 0.2262, "step": 17490 }, { "epoch": 1.59846547314578, "grad_norm": 4.176724910736084, "learning_rate": 5.299801299115268e-06, "loss": 0.2417, "step": 17500 }, { "epoch": 1.5993788819875776, "grad_norm": 2.0199196338653564, "learning_rate": 5.294496587010827e-06, "loss": 0.1904, "step": 17510 }, { "epoch": 1.6002922908293753, "grad_norm": 2.7210614681243896, "learning_rate": 5.289191542246451e-06, "loss": 0.1386, "step": 17520 }, { "epoch": 1.6012056996711728, "grad_norm": 4.151637077331543, "learning_rate": 5.28388617081466e-06, "loss": 0.2387, "step": 17530 }, { "epoch": 1.6021191085129705, "grad_norm": 2.9205398559570312, "learning_rate": 5.278580478708341e-06, "loss": 0.2178, "step": 17540 }, { "epoch": 1.6030325173547681, "grad_norm": 2.6941235065460205, "learning_rate": 5.2732744719207394e-06, "loss": 0.2582, "step": 17550 }, { "epoch": 1.6039459261965656, "grad_norm": 5.336452960968018, "learning_rate": 5.26796815644546e-06, "loss": 0.234, "step": 17560 }, { "epoch": 1.604859335038363, "grad_norm": 4.651072025299072, "learning_rate": 5.262661538276456e-06, "loss": 0.2111, "step": 17570 }, { "epoch": 1.6057727438801608, "grad_norm": 2.655998468399048, "learning_rate": 5.257354623408021e-06, "loss": 0.2278, "step": 17580 }, { "epoch": 1.6066861527219585, "grad_norm": 4.018564224243164, "learning_rate": 5.252047417834786e-06, "loss": 0.2095, "step": 17590 }, { "epoch": 1.607599561563756, "grad_norm": 3.1494011878967285, "learning_rate": 5.246739927551707e-06, "loss": 0.2361, "step": 17600 }, { "epoch": 1.6085129704055534, "grad_norm": 4.070630073547363, "learning_rate": 5.241432158554065e-06, "loss": 0.2079, "step": 17610 }, { "epoch": 1.609426379247351, "grad_norm": 3.3399710655212402, "learning_rate": 5.2361241168374525e-06, "loss": 0.2376, "step": 17620 }, { "epoch": 1.6103397880891488, "grad_norm": 3.1718859672546387, "learning_rate": 5.230815808397775e-06, "loss": 0.1993, "step": 17630 }, { "epoch": 1.6112531969309463, "grad_norm": 2.9193320274353027, "learning_rate": 5.225507239231232e-06, "loss": 0.1553, "step": 17640 }, { "epoch": 1.6121666057727437, "grad_norm": 3.0346341133117676, "learning_rate": 5.220198415334327e-06, "loss": 0.197, "step": 17650 }, { "epoch": 1.6130800146145414, "grad_norm": 4.289315700531006, "learning_rate": 5.214889342703841e-06, "loss": 0.1925, "step": 17660 }, { "epoch": 1.6139934234563391, "grad_norm": 2.5116193294525146, "learning_rate": 5.2095800273368435e-06, "loss": 0.1264, "step": 17670 }, { "epoch": 1.6149068322981366, "grad_norm": 3.5499556064605713, "learning_rate": 5.204270475230676e-06, "loss": 0.2038, "step": 17680 }, { "epoch": 1.6158202411399343, "grad_norm": 2.918954849243164, "learning_rate": 5.198960692382946e-06, "loss": 0.2378, "step": 17690 }, { "epoch": 1.616733649981732, "grad_norm": 3.880783796310425, "learning_rate": 5.193650684791523e-06, "loss": 0.1909, "step": 17700 }, { "epoch": 1.6176470588235294, "grad_norm": 2.667391061782837, "learning_rate": 5.188340458454528e-06, "loss": 0.1872, "step": 17710 }, { "epoch": 1.618560467665327, "grad_norm": 5.731447696685791, "learning_rate": 5.1830300193703354e-06, "loss": 0.2547, "step": 17720 }, { "epoch": 1.6194738765071246, "grad_norm": 2.397023916244507, "learning_rate": 5.177719373537553e-06, "loss": 0.2396, "step": 17730 }, { "epoch": 1.6203872853489223, "grad_norm": 2.988708019256592, "learning_rate": 5.172408526955025e-06, "loss": 0.1979, "step": 17740 }, { "epoch": 1.6213006941907198, "grad_norm": 4.14589262008667, "learning_rate": 5.167097485621822e-06, "loss": 0.2014, "step": 17750 }, { "epoch": 1.6222141030325172, "grad_norm": 9.237394332885742, "learning_rate": 5.161786255537234e-06, "loss": 0.1871, "step": 17760 }, { "epoch": 1.623127511874315, "grad_norm": 3.617218494415283, "learning_rate": 5.1564748427007695e-06, "loss": 0.212, "step": 17770 }, { "epoch": 1.6240409207161126, "grad_norm": 4.523207664489746, "learning_rate": 5.151163253112132e-06, "loss": 0.174, "step": 17780 }, { "epoch": 1.62495432955791, "grad_norm": 4.542119979858398, "learning_rate": 5.145851492771238e-06, "loss": 0.2282, "step": 17790 }, { "epoch": 1.6258677383997076, "grad_norm": 3.811162233352661, "learning_rate": 5.140539567678186e-06, "loss": 0.2486, "step": 17800 }, { "epoch": 1.6267811472415052, "grad_norm": 4.787280082702637, "learning_rate": 5.135227483833266e-06, "loss": 0.2233, "step": 17810 }, { "epoch": 1.627694556083303, "grad_norm": 2.500563383102417, "learning_rate": 5.129915247236947e-06, "loss": 0.1887, "step": 17820 }, { "epoch": 1.6286079649251004, "grad_norm": 2.459972620010376, "learning_rate": 5.1246028638898716e-06, "loss": 0.2102, "step": 17830 }, { "epoch": 1.629521373766898, "grad_norm": 3.339264392852783, "learning_rate": 5.119290339792843e-06, "loss": 0.18, "step": 17840 }, { "epoch": 1.6304347826086958, "grad_norm": 6.045313358306885, "learning_rate": 5.113977680946829e-06, "loss": 0.263, "step": 17850 }, { "epoch": 1.6313481914504933, "grad_norm": 2.2315568923950195, "learning_rate": 5.108664893352947e-06, "loss": 0.2994, "step": 17860 }, { "epoch": 1.6322616002922907, "grad_norm": 4.343386650085449, "learning_rate": 5.103351983012461e-06, "loss": 0.2192, "step": 17870 }, { "epoch": 1.6331750091340884, "grad_norm": 2.2274677753448486, "learning_rate": 5.098038955926772e-06, "loss": 0.1967, "step": 17880 }, { "epoch": 1.634088417975886, "grad_norm": 5.767944812774658, "learning_rate": 5.0927258180974145e-06, "loss": 0.2054, "step": 17890 }, { "epoch": 1.6350018268176836, "grad_norm": 2.511986255645752, "learning_rate": 5.087412575526045e-06, "loss": 0.2052, "step": 17900 }, { "epoch": 1.635915235659481, "grad_norm": 3.8379595279693604, "learning_rate": 5.082099234214446e-06, "loss": 0.2061, "step": 17910 }, { "epoch": 1.6368286445012787, "grad_norm": 3.555284023284912, "learning_rate": 5.076785800164502e-06, "loss": 0.1978, "step": 17920 }, { "epoch": 1.6377420533430764, "grad_norm": 2.5273334980010986, "learning_rate": 5.071472279378208e-06, "loss": 0.1709, "step": 17930 }, { "epoch": 1.638655462184874, "grad_norm": 2.3954572677612305, "learning_rate": 5.0661586778576545e-06, "loss": 0.2211, "step": 17940 }, { "epoch": 1.6395688710266716, "grad_norm": 2.3065056800842285, "learning_rate": 5.060845001605025e-06, "loss": 0.1741, "step": 17950 }, { "epoch": 1.6404822798684693, "grad_norm": 3.2016544342041016, "learning_rate": 5.055531256622587e-06, "loss": 0.2101, "step": 17960 }, { "epoch": 1.6413956887102668, "grad_norm": 6.1913909912109375, "learning_rate": 5.050217448912686e-06, "loss": 0.1987, "step": 17970 }, { "epoch": 1.6423090975520642, "grad_norm": 4.558030605316162, "learning_rate": 5.044903584477734e-06, "loss": 0.2018, "step": 17980 }, { "epoch": 1.643222506393862, "grad_norm": 2.6570863723754883, "learning_rate": 5.039589669320214e-06, "loss": 0.2081, "step": 17990 }, { "epoch": 1.6441359152356596, "grad_norm": 3.6196632385253906, "learning_rate": 5.034275709442662e-06, "loss": 0.2173, "step": 18000 }, { "epoch": 1.645049324077457, "grad_norm": 6.570527076721191, "learning_rate": 5.028961710847667e-06, "loss": 0.1965, "step": 18010 }, { "epoch": 1.6459627329192545, "grad_norm": 5.405580043792725, "learning_rate": 5.023647679537859e-06, "loss": 0.2136, "step": 18020 }, { "epoch": 1.6468761417610522, "grad_norm": 3.8432416915893555, "learning_rate": 5.018333621515906e-06, "loss": 0.2037, "step": 18030 }, { "epoch": 1.64778955060285, "grad_norm": 4.88160514831543, "learning_rate": 5.013019542784504e-06, "loss": 0.2502, "step": 18040 }, { "epoch": 1.6487029594446474, "grad_norm": 2.2578604221343994, "learning_rate": 5.007705449346381e-06, "loss": 0.2145, "step": 18050 }, { "epoch": 1.6496163682864449, "grad_norm": 1.6345058679580688, "learning_rate": 5.002391347204269e-06, "loss": 0.2352, "step": 18060 }, { "epoch": 1.6505297771282426, "grad_norm": 2.1657330989837646, "learning_rate": 4.997077242360921e-06, "loss": 0.1525, "step": 18070 }, { "epoch": 1.6514431859700403, "grad_norm": 2.5434582233428955, "learning_rate": 4.991763140819084e-06, "loss": 0.1829, "step": 18080 }, { "epoch": 1.6523565948118377, "grad_norm": 3.3187758922576904, "learning_rate": 4.986449048581509e-06, "loss": 0.2568, "step": 18090 }, { "epoch": 1.6532700036536354, "grad_norm": 3.119008779525757, "learning_rate": 4.9811349716509325e-06, "loss": 0.2311, "step": 18100 }, { "epoch": 1.654183412495433, "grad_norm": 3.7345669269561768, "learning_rate": 4.9758209160300704e-06, "loss": 0.2038, "step": 18110 }, { "epoch": 1.6550968213372306, "grad_norm": 7.702077388763428, "learning_rate": 4.970506887721623e-06, "loss": 0.1842, "step": 18120 }, { "epoch": 1.656010230179028, "grad_norm": 9.065105438232422, "learning_rate": 4.965192892728256e-06, "loss": 0.1884, "step": 18130 }, { "epoch": 1.6569236390208257, "grad_norm": 3.689192056655884, "learning_rate": 4.959878937052591e-06, "loss": 0.2249, "step": 18140 }, { "epoch": 1.6578370478626234, "grad_norm": 17.291444778442383, "learning_rate": 4.9545650266972165e-06, "loss": 0.2228, "step": 18150 }, { "epoch": 1.658750456704421, "grad_norm": 2.6657028198242188, "learning_rate": 4.949251167664659e-06, "loss": 0.1971, "step": 18160 }, { "epoch": 1.6596638655462184, "grad_norm": 2.904822587966919, "learning_rate": 4.943937365957396e-06, "loss": 0.1574, "step": 18170 }, { "epoch": 1.660577274388016, "grad_norm": 7.456476211547852, "learning_rate": 4.938623627577835e-06, "loss": 0.1991, "step": 18180 }, { "epoch": 1.6614906832298137, "grad_norm": 2.1869006156921387, "learning_rate": 4.933309958528312e-06, "loss": 0.2204, "step": 18190 }, { "epoch": 1.6624040920716112, "grad_norm": 2.536646842956543, "learning_rate": 4.927996364811089e-06, "loss": 0.2132, "step": 18200 }, { "epoch": 1.6633175009134087, "grad_norm": 3.546971321105957, "learning_rate": 4.922682852428336e-06, "loss": 0.1884, "step": 18210 }, { "epoch": 1.6642309097552064, "grad_norm": 9.530715942382812, "learning_rate": 4.917369427382137e-06, "loss": 0.1794, "step": 18220 }, { "epoch": 1.665144318597004, "grad_norm": 1.6967371702194214, "learning_rate": 4.9120560956744775e-06, "loss": 0.1702, "step": 18230 }, { "epoch": 1.6660577274388015, "grad_norm": 3.811177968978882, "learning_rate": 4.9067428633072325e-06, "loss": 0.2274, "step": 18240 }, { "epoch": 1.6669711362805992, "grad_norm": 5.149228572845459, "learning_rate": 4.90142973628217e-06, "loss": 0.1977, "step": 18250 }, { "epoch": 1.667884545122397, "grad_norm": 8.089109420776367, "learning_rate": 4.8961167206009336e-06, "loss": 0.2302, "step": 18260 }, { "epoch": 1.6687979539641944, "grad_norm": 4.920757293701172, "learning_rate": 4.890803822265048e-06, "loss": 0.2213, "step": 18270 }, { "epoch": 1.6697113628059919, "grad_norm": 2.7209393978118896, "learning_rate": 4.8854910472758976e-06, "loss": 0.201, "step": 18280 }, { "epoch": 1.6706247716477896, "grad_norm": 4.077350616455078, "learning_rate": 4.8801784016347345e-06, "loss": 0.2305, "step": 18290 }, { "epoch": 1.6715381804895872, "grad_norm": 2.634143829345703, "learning_rate": 4.874865891342659e-06, "loss": 0.2252, "step": 18300 }, { "epoch": 1.6724515893313847, "grad_norm": 5.231472015380859, "learning_rate": 4.869553522400626e-06, "loss": 0.1992, "step": 18310 }, { "epoch": 1.6733649981731822, "grad_norm": 1.1254595518112183, "learning_rate": 4.864241300809421e-06, "loss": 0.1866, "step": 18320 }, { "epoch": 1.6742784070149799, "grad_norm": 5.104534149169922, "learning_rate": 4.858929232569671e-06, "loss": 0.1829, "step": 18330 }, { "epoch": 1.6751918158567776, "grad_norm": 2.1805419921875, "learning_rate": 4.853617323681824e-06, "loss": 0.1872, "step": 18340 }, { "epoch": 1.676105224698575, "grad_norm": 3.5107152462005615, "learning_rate": 4.848305580146154e-06, "loss": 0.2078, "step": 18350 }, { "epoch": 1.6770186335403725, "grad_norm": 3.2347047328948975, "learning_rate": 4.842994007962742e-06, "loss": 0.1727, "step": 18360 }, { "epoch": 1.6779320423821704, "grad_norm": 4.033168315887451, "learning_rate": 4.837682613131479e-06, "loss": 0.1698, "step": 18370 }, { "epoch": 1.6788454512239679, "grad_norm": 4.5475640296936035, "learning_rate": 4.832371401652058e-06, "loss": 0.202, "step": 18380 }, { "epoch": 1.6797588600657654, "grad_norm": 2.6656110286712646, "learning_rate": 4.827060379523957e-06, "loss": 0.2168, "step": 18390 }, { "epoch": 1.680672268907563, "grad_norm": 3.168297529220581, "learning_rate": 4.821749552746448e-06, "loss": 0.2117, "step": 18400 }, { "epoch": 1.6815856777493607, "grad_norm": 4.323173522949219, "learning_rate": 4.8164389273185806e-06, "loss": 0.1702, "step": 18410 }, { "epoch": 1.6824990865911582, "grad_norm": 2.416372537612915, "learning_rate": 4.811128509239174e-06, "loss": 0.1987, "step": 18420 }, { "epoch": 1.6834124954329557, "grad_norm": 2.672086238861084, "learning_rate": 4.8058183045068155e-06, "loss": 0.1944, "step": 18430 }, { "epoch": 1.6843259042747534, "grad_norm": 4.078382968902588, "learning_rate": 4.8005083191198495e-06, "loss": 0.1281, "step": 18440 }, { "epoch": 1.685239313116551, "grad_norm": 4.37666130065918, "learning_rate": 4.795198559076377e-06, "loss": 0.1678, "step": 18450 }, { "epoch": 1.6861527219583485, "grad_norm": 4.454237461090088, "learning_rate": 4.789889030374238e-06, "loss": 0.2008, "step": 18460 }, { "epoch": 1.687066130800146, "grad_norm": 4.268444538116455, "learning_rate": 4.784579739011015e-06, "loss": 0.2445, "step": 18470 }, { "epoch": 1.6879795396419437, "grad_norm": 4.070363998413086, "learning_rate": 4.779270690984026e-06, "loss": 0.1973, "step": 18480 }, { "epoch": 1.6888929484837414, "grad_norm": 4.616259574890137, "learning_rate": 4.773961892290303e-06, "loss": 0.225, "step": 18490 }, { "epoch": 1.6898063573255389, "grad_norm": 7.9323649406433105, "learning_rate": 4.76865334892661e-06, "loss": 0.2022, "step": 18500 }, { "epoch": 1.6907197661673365, "grad_norm": 10.531329154968262, "learning_rate": 4.763345066889415e-06, "loss": 0.1979, "step": 18510 }, { "epoch": 1.6916331750091342, "grad_norm": 3.627790927886963, "learning_rate": 4.75803705217489e-06, "loss": 0.2506, "step": 18520 }, { "epoch": 1.6925465838509317, "grad_norm": 4.513416767120361, "learning_rate": 4.752729310778908e-06, "loss": 0.2249, "step": 18530 }, { "epoch": 1.6934599926927292, "grad_norm": 3.0547168254852295, "learning_rate": 4.74742184869703e-06, "loss": 0.2699, "step": 18540 }, { "epoch": 1.6943734015345269, "grad_norm": 4.475724697113037, "learning_rate": 4.742114671924508e-06, "loss": 0.1937, "step": 18550 }, { "epoch": 1.6952868103763246, "grad_norm": 1.53419029712677, "learning_rate": 4.736807786456263e-06, "loss": 0.2068, "step": 18560 }, { "epoch": 1.696200219218122, "grad_norm": 3.5047860145568848, "learning_rate": 4.731501198286893e-06, "loss": 0.2304, "step": 18570 }, { "epoch": 1.6971136280599195, "grad_norm": 1.4004063606262207, "learning_rate": 4.726194913410658e-06, "loss": 0.2552, "step": 18580 }, { "epoch": 1.6980270369017172, "grad_norm": 2.536088466644287, "learning_rate": 4.72088893782148e-06, "loss": 0.1894, "step": 18590 }, { "epoch": 1.6989404457435149, "grad_norm": 3.5987677574157715, "learning_rate": 4.715583277512922e-06, "loss": 0.1917, "step": 18600 }, { "epoch": 1.6998538545853124, "grad_norm": 2.8235061168670654, "learning_rate": 4.710277938478199e-06, "loss": 0.2075, "step": 18610 }, { "epoch": 1.7007672634271098, "grad_norm": 3.088275909423828, "learning_rate": 4.70497292671016e-06, "loss": 0.1569, "step": 18620 }, { "epoch": 1.7016806722689075, "grad_norm": 3.4162168502807617, "learning_rate": 4.6996682482012865e-06, "loss": 0.1634, "step": 18630 }, { "epoch": 1.7025940811107052, "grad_norm": 3.5549349784851074, "learning_rate": 4.694363908943679e-06, "loss": 0.2289, "step": 18640 }, { "epoch": 1.7035074899525027, "grad_norm": 3.755814790725708, "learning_rate": 4.6890599149290585e-06, "loss": 0.1794, "step": 18650 }, { "epoch": 1.7044208987943004, "grad_norm": 2.989348888397217, "learning_rate": 4.683756272148757e-06, "loss": 0.1639, "step": 18660 }, { "epoch": 1.705334307636098, "grad_norm": 4.613002777099609, "learning_rate": 4.678452986593706e-06, "loss": 0.1424, "step": 18670 }, { "epoch": 1.7062477164778955, "grad_norm": 2.307985544204712, "learning_rate": 4.673150064254435e-06, "loss": 0.188, "step": 18680 }, { "epoch": 1.707161125319693, "grad_norm": 5.705870628356934, "learning_rate": 4.667847511121067e-06, "loss": 0.1476, "step": 18690 }, { "epoch": 1.7080745341614907, "grad_norm": 8.500395774841309, "learning_rate": 4.662545333183301e-06, "loss": 0.2133, "step": 18700 }, { "epoch": 1.7089879430032884, "grad_norm": 3.158360481262207, "learning_rate": 4.657243536430418e-06, "loss": 0.1965, "step": 18710 }, { "epoch": 1.7099013518450858, "grad_norm": 4.525452136993408, "learning_rate": 4.651942126851265e-06, "loss": 0.2016, "step": 18720 }, { "epoch": 1.7108147606868833, "grad_norm": 2.2774100303649902, "learning_rate": 4.6466411104342526e-06, "loss": 0.1872, "step": 18730 }, { "epoch": 1.711728169528681, "grad_norm": 3.894434928894043, "learning_rate": 4.6413404931673485e-06, "loss": 0.1547, "step": 18740 }, { "epoch": 1.7126415783704787, "grad_norm": 2.622410774230957, "learning_rate": 4.636040281038067e-06, "loss": 0.1954, "step": 18750 }, { "epoch": 1.7135549872122762, "grad_norm": 4.564425468444824, "learning_rate": 4.6307404800334695e-06, "loss": 0.2226, "step": 18760 }, { "epoch": 1.7144683960540736, "grad_norm": 1.589528203010559, "learning_rate": 4.625441096140146e-06, "loss": 0.1862, "step": 18770 }, { "epoch": 1.7153818048958716, "grad_norm": 2.0352234840393066, "learning_rate": 4.62014213534422e-06, "loss": 0.2055, "step": 18780 }, { "epoch": 1.716295213737669, "grad_norm": 4.928898334503174, "learning_rate": 4.6148436036313395e-06, "loss": 0.2115, "step": 18790 }, { "epoch": 1.7172086225794665, "grad_norm": 7.112977027893066, "learning_rate": 4.6095455069866614e-06, "loss": 0.1981, "step": 18800 }, { "epoch": 1.7181220314212642, "grad_norm": 2.5187644958496094, "learning_rate": 4.604247851394855e-06, "loss": 0.2515, "step": 18810 }, { "epoch": 1.7190354402630619, "grad_norm": 3.4672422409057617, "learning_rate": 4.5989506428400905e-06, "loss": 0.222, "step": 18820 }, { "epoch": 1.7199488491048593, "grad_norm": 3.3756484985351562, "learning_rate": 4.593653887306033e-06, "loss": 0.202, "step": 18830 }, { "epoch": 1.7208622579466568, "grad_norm": 4.761664867401123, "learning_rate": 4.588357590775839e-06, "loss": 0.213, "step": 18840 }, { "epoch": 1.7217756667884545, "grad_norm": 1.5276516675949097, "learning_rate": 4.583061759232139e-06, "loss": 0.2094, "step": 18850 }, { "epoch": 1.7226890756302522, "grad_norm": 5.932540416717529, "learning_rate": 4.577766398657047e-06, "loss": 0.154, "step": 18860 }, { "epoch": 1.7236024844720497, "grad_norm": 12.19300365447998, "learning_rate": 4.572471515032137e-06, "loss": 0.1881, "step": 18870 }, { "epoch": 1.7245158933138471, "grad_norm": 3.83575177192688, "learning_rate": 4.567177114338452e-06, "loss": 0.2652, "step": 18880 }, { "epoch": 1.7254293021556448, "grad_norm": 2.3019604682922363, "learning_rate": 4.5618832025564856e-06, "loss": 0.1935, "step": 18890 }, { "epoch": 1.7263427109974425, "grad_norm": 7.613281726837158, "learning_rate": 4.556589785666176e-06, "loss": 0.1578, "step": 18900 }, { "epoch": 1.72725611983924, "grad_norm": 3.1480658054351807, "learning_rate": 4.551296869646909e-06, "loss": 0.2357, "step": 18910 }, { "epoch": 1.7281695286810377, "grad_norm": 3.538295269012451, "learning_rate": 4.5460044604774986e-06, "loss": 0.2616, "step": 18920 }, { "epoch": 1.7290829375228354, "grad_norm": 3.1255362033843994, "learning_rate": 4.540712564136189e-06, "loss": 0.2293, "step": 18930 }, { "epoch": 1.7299963463646328, "grad_norm": 4.7816081047058105, "learning_rate": 4.535421186600648e-06, "loss": 0.2134, "step": 18940 }, { "epoch": 1.7309097552064303, "grad_norm": 2.631300926208496, "learning_rate": 4.530130333847951e-06, "loss": 0.2219, "step": 18950 }, { "epoch": 1.731823164048228, "grad_norm": 5.93371057510376, "learning_rate": 4.524840011854584e-06, "loss": 0.1604, "step": 18960 }, { "epoch": 1.7327365728900257, "grad_norm": 4.512164115905762, "learning_rate": 4.519550226596438e-06, "loss": 0.1981, "step": 18970 }, { "epoch": 1.7336499817318232, "grad_norm": 4.329872131347656, "learning_rate": 4.5142609840487875e-06, "loss": 0.2077, "step": 18980 }, { "epoch": 1.7345633905736206, "grad_norm": 1.7101919651031494, "learning_rate": 4.508972290186303e-06, "loss": 0.2026, "step": 18990 }, { "epoch": 1.7354767994154183, "grad_norm": 2.7956297397613525, "learning_rate": 4.50368415098303e-06, "loss": 0.1861, "step": 19000 }, { "epoch": 1.736390208257216, "grad_norm": 2.1071887016296387, "learning_rate": 4.498396572412391e-06, "loss": 0.1799, "step": 19010 }, { "epoch": 1.7373036170990135, "grad_norm": 3.3342785835266113, "learning_rate": 4.493109560447172e-06, "loss": 0.1898, "step": 19020 }, { "epoch": 1.738217025940811, "grad_norm": 2.567554473876953, "learning_rate": 4.48782312105952e-06, "loss": 0.2102, "step": 19030 }, { "epoch": 1.7391304347826086, "grad_norm": 2.171637535095215, "learning_rate": 4.482537260220939e-06, "loss": 0.1659, "step": 19040 }, { "epoch": 1.7400438436244063, "grad_norm": 7.524522304534912, "learning_rate": 4.4772519839022725e-06, "loss": 0.1695, "step": 19050 }, { "epoch": 1.7409572524662038, "grad_norm": 3.2748327255249023, "learning_rate": 4.4719672980737066e-06, "loss": 0.1902, "step": 19060 }, { "epoch": 1.7418706613080015, "grad_norm": 6.572103500366211, "learning_rate": 4.466683208704766e-06, "loss": 0.2558, "step": 19070 }, { "epoch": 1.7427840701497992, "grad_norm": 6.291947364807129, "learning_rate": 4.461399721764293e-06, "loss": 0.2367, "step": 19080 }, { "epoch": 1.7436974789915967, "grad_norm": 3.0684616565704346, "learning_rate": 4.456116843220456e-06, "loss": 0.2206, "step": 19090 }, { "epoch": 1.7446108878333941, "grad_norm": 3.75504994392395, "learning_rate": 4.450834579040731e-06, "loss": 0.2363, "step": 19100 }, { "epoch": 1.7455242966751918, "grad_norm": 2.5931203365325928, "learning_rate": 4.445552935191903e-06, "loss": 0.2293, "step": 19110 }, { "epoch": 1.7464377055169895, "grad_norm": 5.387319087982178, "learning_rate": 4.440271917640056e-06, "loss": 0.1691, "step": 19120 }, { "epoch": 1.747351114358787, "grad_norm": 4.580089092254639, "learning_rate": 4.434991532350567e-06, "loss": 0.2183, "step": 19130 }, { "epoch": 1.7482645232005845, "grad_norm": 6.017392635345459, "learning_rate": 4.429711785288097e-06, "loss": 0.3286, "step": 19140 }, { "epoch": 1.7491779320423821, "grad_norm": 2.7353713512420654, "learning_rate": 4.424432682416585e-06, "loss": 0.2497, "step": 19150 }, { "epoch": 1.7500913408841798, "grad_norm": 4.072706699371338, "learning_rate": 4.419154229699248e-06, "loss": 0.1976, "step": 19160 }, { "epoch": 1.7510047497259773, "grad_norm": 5.022295951843262, "learning_rate": 4.4138764330985664e-06, "loss": 0.2126, "step": 19170 }, { "epoch": 1.7519181585677748, "grad_norm": 2.2977635860443115, "learning_rate": 4.408599298576272e-06, "loss": 0.229, "step": 19180 }, { "epoch": 1.7528315674095727, "grad_norm": 2.3755483627319336, "learning_rate": 4.403322832093361e-06, "loss": 0.2362, "step": 19190 }, { "epoch": 1.7537449762513702, "grad_norm": 4.612023830413818, "learning_rate": 4.398047039610065e-06, "loss": 0.2124, "step": 19200 }, { "epoch": 1.7546583850931676, "grad_norm": 3.875185966491699, "learning_rate": 4.392771927085859e-06, "loss": 0.2089, "step": 19210 }, { "epoch": 1.7555717939349653, "grad_norm": 10.380887985229492, "learning_rate": 4.38749750047945e-06, "loss": 0.1989, "step": 19220 }, { "epoch": 1.756485202776763, "grad_norm": 4.775038719177246, "learning_rate": 4.382223765748768e-06, "loss": 0.2, "step": 19230 }, { "epoch": 1.7573986116185605, "grad_norm": 2.7789711952209473, "learning_rate": 4.376950728850961e-06, "loss": 0.2103, "step": 19240 }, { "epoch": 1.758312020460358, "grad_norm": 2.1151931285858154, "learning_rate": 4.371678395742395e-06, "loss": 0.1514, "step": 19250 }, { "epoch": 1.7592254293021556, "grad_norm": 3.3393490314483643, "learning_rate": 4.366406772378633e-06, "loss": 0.1769, "step": 19260 }, { "epoch": 1.7601388381439533, "grad_norm": 3.303938627243042, "learning_rate": 4.361135864714441e-06, "loss": 0.1759, "step": 19270 }, { "epoch": 1.7610522469857508, "grad_norm": 8.132279396057129, "learning_rate": 4.3558656787037746e-06, "loss": 0.2564, "step": 19280 }, { "epoch": 1.7619656558275483, "grad_norm": 2.3374550342559814, "learning_rate": 4.350596220299776e-06, "loss": 0.2414, "step": 19290 }, { "epoch": 1.762879064669346, "grad_norm": 5.8248610496521, "learning_rate": 4.345327495454761e-06, "loss": 0.2131, "step": 19300 }, { "epoch": 1.7637924735111437, "grad_norm": 6.14131498336792, "learning_rate": 4.340059510120224e-06, "loss": 0.1919, "step": 19310 }, { "epoch": 1.7647058823529411, "grad_norm": 4.372379302978516, "learning_rate": 4.334792270246819e-06, "loss": 0.2386, "step": 19320 }, { "epoch": 1.7656192911947388, "grad_norm": 2.4166738986968994, "learning_rate": 4.329525781784358e-06, "loss": 0.2141, "step": 19330 }, { "epoch": 1.7665327000365365, "grad_norm": 2.9302873611450195, "learning_rate": 4.324260050681805e-06, "loss": 0.1385, "step": 19340 }, { "epoch": 1.767446108878334, "grad_norm": 2.364776372909546, "learning_rate": 4.318995082887272e-06, "loss": 0.2036, "step": 19350 }, { "epoch": 1.7683595177201314, "grad_norm": 3.9309535026550293, "learning_rate": 4.313730884348003e-06, "loss": 0.1666, "step": 19360 }, { "epoch": 1.7692729265619291, "grad_norm": 3.9367661476135254, "learning_rate": 4.308467461010377e-06, "loss": 0.204, "step": 19370 }, { "epoch": 1.7701863354037268, "grad_norm": 3.6879611015319824, "learning_rate": 4.303204818819895e-06, "loss": 0.1504, "step": 19380 }, { "epoch": 1.7710997442455243, "grad_norm": 3.282850503921509, "learning_rate": 4.297942963721177e-06, "loss": 0.2089, "step": 19390 }, { "epoch": 1.7720131530873218, "grad_norm": 3.535284996032715, "learning_rate": 4.292681901657954e-06, "loss": 0.2039, "step": 19400 }, { "epoch": 1.7729265619291195, "grad_norm": 3.726290464401245, "learning_rate": 4.287421638573059e-06, "loss": 0.202, "step": 19410 }, { "epoch": 1.7738399707709172, "grad_norm": 2.295074701309204, "learning_rate": 4.282162180408428e-06, "loss": 0.2067, "step": 19420 }, { "epoch": 1.7747533796127146, "grad_norm": 4.557220935821533, "learning_rate": 4.276903533105078e-06, "loss": 0.2296, "step": 19430 }, { "epoch": 1.775666788454512, "grad_norm": 4.540415287017822, "learning_rate": 4.271645702603122e-06, "loss": 0.2008, "step": 19440 }, { "epoch": 1.7765801972963098, "grad_norm": 2.1913199424743652, "learning_rate": 4.266388694841743e-06, "loss": 0.1744, "step": 19450 }, { "epoch": 1.7774936061381075, "grad_norm": 3.1031455993652344, "learning_rate": 4.261132515759193e-06, "loss": 0.2072, "step": 19460 }, { "epoch": 1.778407014979905, "grad_norm": 3.319368600845337, "learning_rate": 4.255877171292795e-06, "loss": 0.224, "step": 19470 }, { "epoch": 1.7793204238217026, "grad_norm": 1.1504735946655273, "learning_rate": 4.2506226673789225e-06, "loss": 0.1809, "step": 19480 }, { "epoch": 1.7802338326635003, "grad_norm": 6.486485481262207, "learning_rate": 4.245369009953003e-06, "loss": 0.1687, "step": 19490 }, { "epoch": 1.7811472415052978, "grad_norm": 1.9712486267089844, "learning_rate": 4.240116204949508e-06, "loss": 0.1892, "step": 19500 }, { "epoch": 1.7820606503470953, "grad_norm": 4.45025634765625, "learning_rate": 4.234864258301943e-06, "loss": 0.209, "step": 19510 }, { "epoch": 1.782974059188893, "grad_norm": 4.056071758270264, "learning_rate": 4.2296131759428495e-06, "loss": 0.1862, "step": 19520 }, { "epoch": 1.7838874680306906, "grad_norm": 4.1930999755859375, "learning_rate": 4.2243629638037845e-06, "loss": 0.1959, "step": 19530 }, { "epoch": 1.7848008768724881, "grad_norm": 3.1487040519714355, "learning_rate": 4.21911362781533e-06, "loss": 0.1822, "step": 19540 }, { "epoch": 1.7857142857142856, "grad_norm": 3.5981740951538086, "learning_rate": 4.213865173907077e-06, "loss": 0.2157, "step": 19550 }, { "epoch": 1.7866276945560833, "grad_norm": 2.5381174087524414, "learning_rate": 4.208617608007613e-06, "loss": 0.2261, "step": 19560 }, { "epoch": 1.787541103397881, "grad_norm": 2.9769725799560547, "learning_rate": 4.2033709360445335e-06, "loss": 0.2434, "step": 19570 }, { "epoch": 1.7884545122396784, "grad_norm": 3.522160053253174, "learning_rate": 4.198125163944413e-06, "loss": 0.1873, "step": 19580 }, { "epoch": 1.789367921081476, "grad_norm": 3.23284912109375, "learning_rate": 4.1928802976328184e-06, "loss": 0.2082, "step": 19590 }, { "epoch": 1.7902813299232738, "grad_norm": 1.9188778400421143, "learning_rate": 4.18763634303429e-06, "loss": 0.1551, "step": 19600 }, { "epoch": 1.7911947387650713, "grad_norm": 4.511789798736572, "learning_rate": 4.182393306072335e-06, "loss": 0.2072, "step": 19610 }, { "epoch": 1.7921081476068688, "grad_norm": 2.559715509414673, "learning_rate": 4.177151192669428e-06, "loss": 0.214, "step": 19620 }, { "epoch": 1.7930215564486665, "grad_norm": 2.3086111545562744, "learning_rate": 4.171910008747004e-06, "loss": 0.1638, "step": 19630 }, { "epoch": 1.7939349652904641, "grad_norm": 3.6196725368499756, "learning_rate": 4.166669760225438e-06, "loss": 0.2054, "step": 19640 }, { "epoch": 1.7948483741322616, "grad_norm": 4.7241973876953125, "learning_rate": 4.161430453024058e-06, "loss": 0.194, "step": 19650 }, { "epoch": 1.795761782974059, "grad_norm": 4.605785369873047, "learning_rate": 4.156192093061123e-06, "loss": 0.186, "step": 19660 }, { "epoch": 1.7966751918158568, "grad_norm": 2.606980085372925, "learning_rate": 4.150954686253823e-06, "loss": 0.1663, "step": 19670 }, { "epoch": 1.7975886006576545, "grad_norm": 2.06592059135437, "learning_rate": 4.1457182385182735e-06, "loss": 0.2028, "step": 19680 }, { "epoch": 1.798502009499452, "grad_norm": 4.150547504425049, "learning_rate": 4.140482755769505e-06, "loss": 0.2265, "step": 19690 }, { "epoch": 1.7994154183412494, "grad_norm": 2.352757453918457, "learning_rate": 4.135248243921457e-06, "loss": 0.1897, "step": 19700 }, { "epoch": 1.800328827183047, "grad_norm": 2.9936370849609375, "learning_rate": 4.130014708886974e-06, "loss": 0.1789, "step": 19710 }, { "epoch": 1.8012422360248448, "grad_norm": 4.492934703826904, "learning_rate": 4.124782156577796e-06, "loss": 0.1964, "step": 19720 }, { "epoch": 1.8021556448666423, "grad_norm": 3.4143097400665283, "learning_rate": 4.119550592904556e-06, "loss": 0.1711, "step": 19730 }, { "epoch": 1.80306905370844, "grad_norm": 2.7055318355560303, "learning_rate": 4.114320023776765e-06, "loss": 0.2011, "step": 19740 }, { "epoch": 1.8039824625502376, "grad_norm": 3.3677427768707275, "learning_rate": 4.1090904551028145e-06, "loss": 0.2138, "step": 19750 }, { "epoch": 1.8048958713920351, "grad_norm": 2.6681747436523438, "learning_rate": 4.103861892789964e-06, "loss": 0.1902, "step": 19760 }, { "epoch": 1.8058092802338326, "grad_norm": 3.2497036457061768, "learning_rate": 4.098634342744337e-06, "loss": 0.2072, "step": 19770 }, { "epoch": 1.8067226890756303, "grad_norm": 2.7955868244171143, "learning_rate": 4.093407810870916e-06, "loss": 0.2406, "step": 19780 }, { "epoch": 1.807636097917428, "grad_norm": 3.5045392513275146, "learning_rate": 4.088182303073527e-06, "loss": 0.2095, "step": 19790 }, { "epoch": 1.8085495067592254, "grad_norm": 2.5751798152923584, "learning_rate": 4.0829578252548455e-06, "loss": 0.1519, "step": 19800 }, { "epoch": 1.809462915601023, "grad_norm": 9.29873275756836, "learning_rate": 4.077734383316378e-06, "loss": 0.2373, "step": 19810 }, { "epoch": 1.8103763244428206, "grad_norm": 3.164735794067383, "learning_rate": 4.0725119831584675e-06, "loss": 0.2295, "step": 19820 }, { "epoch": 1.8112897332846183, "grad_norm": 4.059249401092529, "learning_rate": 4.067290630680277e-06, "loss": 0.1928, "step": 19830 }, { "epoch": 1.8122031421264158, "grad_norm": 4.941408634185791, "learning_rate": 4.0620703317797824e-06, "loss": 0.1701, "step": 19840 }, { "epoch": 1.8131165509682132, "grad_norm": 2.1284525394439697, "learning_rate": 4.056851092353777e-06, "loss": 0.1908, "step": 19850 }, { "epoch": 1.814029959810011, "grad_norm": 4.409045219421387, "learning_rate": 4.051632918297849e-06, "loss": 0.2107, "step": 19860 }, { "epoch": 1.8149433686518086, "grad_norm": 2.386167526245117, "learning_rate": 4.0464158155063895e-06, "loss": 0.234, "step": 19870 }, { "epoch": 1.815856777493606, "grad_norm": 4.122093677520752, "learning_rate": 4.041199789872579e-06, "loss": 0.2239, "step": 19880 }, { "epoch": 1.8167701863354038, "grad_norm": 3.6071219444274902, "learning_rate": 4.0359848472883755e-06, "loss": 0.1981, "step": 19890 }, { "epoch": 1.8176835951772015, "grad_norm": 7.579836845397949, "learning_rate": 4.0307709936445215e-06, "loss": 0.1885, "step": 19900 }, { "epoch": 1.818597004018999, "grad_norm": 2.2512526512145996, "learning_rate": 4.0255582348305225e-06, "loss": 0.2046, "step": 19910 }, { "epoch": 1.8195104128607964, "grad_norm": 5.2043538093566895, "learning_rate": 4.020346576734653e-06, "loss": 0.1779, "step": 19920 }, { "epoch": 1.820423821702594, "grad_norm": 4.2529449462890625, "learning_rate": 4.015136025243942e-06, "loss": 0.2548, "step": 19930 }, { "epoch": 1.8213372305443918, "grad_norm": 7.590660095214844, "learning_rate": 4.009926586244165e-06, "loss": 0.157, "step": 19940 }, { "epoch": 1.8222506393861893, "grad_norm": 1.7408607006072998, "learning_rate": 4.0047182656198465e-06, "loss": 0.2672, "step": 19950 }, { "epoch": 1.8231640482279867, "grad_norm": 2.841778516769409, "learning_rate": 3.999511069254245e-06, "loss": 0.2104, "step": 19960 }, { "epoch": 1.8240774570697844, "grad_norm": 3.536263942718506, "learning_rate": 3.9943050030293485e-06, "loss": 0.2514, "step": 19970 }, { "epoch": 1.824990865911582, "grad_norm": 4.502190589904785, "learning_rate": 3.98910007282587e-06, "loss": 0.1775, "step": 19980 }, { "epoch": 1.8259042747533796, "grad_norm": 3.720668077468872, "learning_rate": 3.9838962845232366e-06, "loss": 0.2415, "step": 19990 }, { "epoch": 1.826817683595177, "grad_norm": 3.9353976249694824, "learning_rate": 3.978693643999587e-06, "loss": 0.2781, "step": 20000 }, { "epoch": 1.8277310924369747, "grad_norm": 3.8350307941436768, "learning_rate": 3.9734921571317665e-06, "loss": 0.1809, "step": 20010 }, { "epoch": 1.8286445012787724, "grad_norm": 3.9366941452026367, "learning_rate": 3.968291829795313e-06, "loss": 0.1559, "step": 20020 }, { "epoch": 1.82955791012057, "grad_norm": 2.298891067504883, "learning_rate": 3.9630926678644556e-06, "loss": 0.1896, "step": 20030 }, { "epoch": 1.8304713189623676, "grad_norm": 2.170747995376587, "learning_rate": 3.957894677212107e-06, "loss": 0.2179, "step": 20040 }, { "epoch": 1.8313847278041653, "grad_norm": 3.722423791885376, "learning_rate": 3.952697863709858e-06, "loss": 0.217, "step": 20050 }, { "epoch": 1.8322981366459627, "grad_norm": 3.0080673694610596, "learning_rate": 3.94750223322797e-06, "loss": 0.1963, "step": 20060 }, { "epoch": 1.8332115454877602, "grad_norm": 2.9001519680023193, "learning_rate": 3.942307791635365e-06, "loss": 0.1792, "step": 20070 }, { "epoch": 1.834124954329558, "grad_norm": 3.752213478088379, "learning_rate": 3.937114544799625e-06, "loss": 0.1798, "step": 20080 }, { "epoch": 1.8350383631713556, "grad_norm": 3.0106589794158936, "learning_rate": 3.93192249858698e-06, "loss": 0.1856, "step": 20090 }, { "epoch": 1.835951772013153, "grad_norm": 5.903666019439697, "learning_rate": 3.926731658862307e-06, "loss": 0.1627, "step": 20100 }, { "epoch": 1.8368651808549505, "grad_norm": 3.246351480484009, "learning_rate": 3.921542031489119e-06, "loss": 0.2084, "step": 20110 }, { "epoch": 1.8377785896967482, "grad_norm": 5.961856842041016, "learning_rate": 3.916353622329558e-06, "loss": 0.1997, "step": 20120 }, { "epoch": 1.838691998538546, "grad_norm": 3.898643970489502, "learning_rate": 3.911166437244389e-06, "loss": 0.1791, "step": 20130 }, { "epoch": 1.8396054073803434, "grad_norm": 5.234832763671875, "learning_rate": 3.905980482093e-06, "loss": 0.2043, "step": 20140 }, { "epoch": 1.840518816222141, "grad_norm": 4.748090744018555, "learning_rate": 3.900795762733383e-06, "loss": 0.2146, "step": 20150 }, { "epoch": 1.8414322250639388, "grad_norm": 4.253523349761963, "learning_rate": 3.895612285022138e-06, "loss": 0.2616, "step": 20160 }, { "epoch": 1.8423456339057362, "grad_norm": 8.357851028442383, "learning_rate": 3.89043005481446e-06, "loss": 0.2161, "step": 20170 }, { "epoch": 1.8432590427475337, "grad_norm": 4.2455925941467285, "learning_rate": 3.885249077964139e-06, "loss": 0.2189, "step": 20180 }, { "epoch": 1.8441724515893314, "grad_norm": 1.831407070159912, "learning_rate": 3.880069360323542e-06, "loss": 0.2041, "step": 20190 }, { "epoch": 1.845085860431129, "grad_norm": 2.810434579849243, "learning_rate": 3.874890907743622e-06, "loss": 0.186, "step": 20200 }, { "epoch": 1.8459992692729266, "grad_norm": 3.565648317337036, "learning_rate": 3.8697137260739e-06, "loss": 0.2502, "step": 20210 }, { "epoch": 1.846912678114724, "grad_norm": 5.634875297546387, "learning_rate": 3.8645378211624566e-06, "loss": 0.2021, "step": 20220 }, { "epoch": 1.8478260869565217, "grad_norm": 5.45308256149292, "learning_rate": 3.859363198855935e-06, "loss": 0.2029, "step": 20230 }, { "epoch": 1.8487394957983194, "grad_norm": 3.2547340393066406, "learning_rate": 3.85418986499953e-06, "loss": 0.1954, "step": 20240 }, { "epoch": 1.849652904640117, "grad_norm": 1.3060659170150757, "learning_rate": 3.849017825436979e-06, "loss": 0.1952, "step": 20250 }, { "epoch": 1.8505663134819144, "grad_norm": 2.9836249351501465, "learning_rate": 3.843847086010558e-06, "loss": 0.1754, "step": 20260 }, { "epoch": 1.851479722323712, "grad_norm": 4.550251483917236, "learning_rate": 3.838677652561074e-06, "loss": 0.1726, "step": 20270 }, { "epoch": 1.8523931311655097, "grad_norm": 12.449538230895996, "learning_rate": 3.833509530927857e-06, "loss": 0.221, "step": 20280 }, { "epoch": 1.8533065400073072, "grad_norm": 2.7925515174865723, "learning_rate": 3.828342726948763e-06, "loss": 0.2355, "step": 20290 }, { "epoch": 1.854219948849105, "grad_norm": 2.8055639266967773, "learning_rate": 3.823177246460149e-06, "loss": 0.2177, "step": 20300 }, { "epoch": 1.8551333576909026, "grad_norm": 6.432549953460693, "learning_rate": 3.818013095296885e-06, "loss": 0.282, "step": 20310 }, { "epoch": 1.8560467665327, "grad_norm": 6.261832237243652, "learning_rate": 3.812850279292333e-06, "loss": 0.1968, "step": 20320 }, { "epoch": 1.8569601753744975, "grad_norm": 3.2386510372161865, "learning_rate": 3.807688804278352e-06, "loss": 0.1708, "step": 20330 }, { "epoch": 1.8578735842162952, "grad_norm": 2.212608814239502, "learning_rate": 3.8025286760852863e-06, "loss": 0.151, "step": 20340 }, { "epoch": 1.858786993058093, "grad_norm": 2.8554725646972656, "learning_rate": 3.7973699005419528e-06, "loss": 0.1659, "step": 20350 }, { "epoch": 1.8597004018998904, "grad_norm": 6.708205699920654, "learning_rate": 3.7922124834756483e-06, "loss": 0.2159, "step": 20360 }, { "epoch": 1.8606138107416879, "grad_norm": 3.6177330017089844, "learning_rate": 3.7870564307121292e-06, "loss": 0.1687, "step": 20370 }, { "epoch": 1.8615272195834855, "grad_norm": 2.063164710998535, "learning_rate": 3.7819017480756114e-06, "loss": 0.2077, "step": 20380 }, { "epoch": 1.8624406284252832, "grad_norm": 3.3232367038726807, "learning_rate": 3.77674844138877e-06, "loss": 0.1907, "step": 20390 }, { "epoch": 1.8633540372670807, "grad_norm": 2.930521011352539, "learning_rate": 3.771596516472716e-06, "loss": 0.2292, "step": 20400 }, { "epoch": 1.8642674461088782, "grad_norm": 2.6825058460235596, "learning_rate": 3.766445979147005e-06, "loss": 0.2091, "step": 20410 }, { "epoch": 1.8651808549506759, "grad_norm": 4.577187538146973, "learning_rate": 3.7612968352296263e-06, "loss": 0.1775, "step": 20420 }, { "epoch": 1.8660942637924736, "grad_norm": 4.730849742889404, "learning_rate": 3.756149090536991e-06, "loss": 0.2475, "step": 20430 }, { "epoch": 1.867007672634271, "grad_norm": 3.312612533569336, "learning_rate": 3.751002750883933e-06, "loss": 0.1786, "step": 20440 }, { "epoch": 1.8679210814760687, "grad_norm": 3.2802631855010986, "learning_rate": 3.745857822083697e-06, "loss": 0.1848, "step": 20450 }, { "epoch": 1.8688344903178664, "grad_norm": 5.641161918640137, "learning_rate": 3.740714309947936e-06, "loss": 0.186, "step": 20460 }, { "epoch": 1.8697478991596639, "grad_norm": 3.357978343963623, "learning_rate": 3.7355722202866994e-06, "loss": 0.2518, "step": 20470 }, { "epoch": 1.8706613080014614, "grad_norm": 2.402343511581421, "learning_rate": 3.730431558908435e-06, "loss": 0.1851, "step": 20480 }, { "epoch": 1.871574716843259, "grad_norm": 2.605353593826294, "learning_rate": 3.725292331619974e-06, "loss": 0.1471, "step": 20490 }, { "epoch": 1.8724881256850567, "grad_norm": 4.008423805236816, "learning_rate": 3.7201545442265265e-06, "loss": 0.215, "step": 20500 }, { "epoch": 1.8734015345268542, "grad_norm": 2.966953754425049, "learning_rate": 3.7150182025316777e-06, "loss": 0.201, "step": 20510 }, { "epoch": 1.8743149433686517, "grad_norm": 3.517839193344116, "learning_rate": 3.7098833123373824e-06, "loss": 0.2043, "step": 20520 }, { "epoch": 1.8752283522104494, "grad_norm": 3.142629384994507, "learning_rate": 3.704749879443949e-06, "loss": 0.1597, "step": 20530 }, { "epoch": 1.876141761052247, "grad_norm": 5.1675310134887695, "learning_rate": 3.699617909650047e-06, "loss": 0.2179, "step": 20540 }, { "epoch": 1.8770551698940445, "grad_norm": 3.5633223056793213, "learning_rate": 3.6944874087526883e-06, "loss": 0.2396, "step": 20550 }, { "epoch": 1.877968578735842, "grad_norm": 2.4112274646759033, "learning_rate": 3.689358382547229e-06, "loss": 0.1848, "step": 20560 }, { "epoch": 1.87888198757764, "grad_norm": 4.064748287200928, "learning_rate": 3.6842308368273537e-06, "loss": 0.1921, "step": 20570 }, { "epoch": 1.8797953964194374, "grad_norm": 3.100271224975586, "learning_rate": 3.6791047773850835e-06, "loss": 0.2087, "step": 20580 }, { "epoch": 1.8807088052612349, "grad_norm": 17.286828994750977, "learning_rate": 3.6739802100107542e-06, "loss": 0.1813, "step": 20590 }, { "epoch": 1.8816222141030325, "grad_norm": 3.541158676147461, "learning_rate": 3.668857140493016e-06, "loss": 0.196, "step": 20600 }, { "epoch": 1.8825356229448302, "grad_norm": 3.5205845832824707, "learning_rate": 3.6637355746188307e-06, "loss": 0.2251, "step": 20610 }, { "epoch": 1.8834490317866277, "grad_norm": 3.3163585662841797, "learning_rate": 3.6586155181734594e-06, "loss": 0.1624, "step": 20620 }, { "epoch": 1.8843624406284252, "grad_norm": 2.935480833053589, "learning_rate": 3.6534969769404584e-06, "loss": 0.1885, "step": 20630 }, { "epoch": 1.8852758494702229, "grad_norm": 3.5615265369415283, "learning_rate": 3.6483799567016726e-06, "loss": 0.1515, "step": 20640 }, { "epoch": 1.8861892583120206, "grad_norm": 4.924865245819092, "learning_rate": 3.6432644632372275e-06, "loss": 0.2287, "step": 20650 }, { "epoch": 1.887102667153818, "grad_norm": 4.7560319900512695, "learning_rate": 3.6381505023255257e-06, "loss": 0.2272, "step": 20660 }, { "epoch": 1.8880160759956155, "grad_norm": 3.597555637359619, "learning_rate": 3.6330380797432417e-06, "loss": 0.1542, "step": 20670 }, { "epoch": 1.8889294848374132, "grad_norm": 1.7585787773132324, "learning_rate": 3.6279272012653046e-06, "loss": 0.2033, "step": 20680 }, { "epoch": 1.8898428936792109, "grad_norm": 2.422581911087036, "learning_rate": 3.622817872664905e-06, "loss": 0.2013, "step": 20690 }, { "epoch": 1.8907563025210083, "grad_norm": 3.2609729766845703, "learning_rate": 3.6177100997134817e-06, "loss": 0.1639, "step": 20700 }, { "epoch": 1.891669711362806, "grad_norm": 2.5542590618133545, "learning_rate": 3.612603888180716e-06, "loss": 0.1868, "step": 20710 }, { "epoch": 1.8925831202046037, "grad_norm": 1.9753425121307373, "learning_rate": 3.607499243834524e-06, "loss": 0.2303, "step": 20720 }, { "epoch": 1.8934965290464012, "grad_norm": 7.866260528564453, "learning_rate": 3.602396172441053e-06, "loss": 0.218, "step": 20730 }, { "epoch": 1.8944099378881987, "grad_norm": 4.95161771774292, "learning_rate": 3.5972946797646756e-06, "loss": 0.2216, "step": 20740 }, { "epoch": 1.8953233467299964, "grad_norm": 3.3141632080078125, "learning_rate": 3.592194771567975e-06, "loss": 0.2505, "step": 20750 }, { "epoch": 1.896236755571794, "grad_norm": 2.3051328659057617, "learning_rate": 3.58709645361175e-06, "loss": 0.1685, "step": 20760 }, { "epoch": 1.8971501644135915, "grad_norm": 5.320639610290527, "learning_rate": 3.5819997316550044e-06, "loss": 0.2152, "step": 20770 }, { "epoch": 1.898063573255389, "grad_norm": 2.899531841278076, "learning_rate": 3.576904611454932e-06, "loss": 0.1589, "step": 20780 }, { "epoch": 1.8989769820971867, "grad_norm": 5.703792572021484, "learning_rate": 3.571811098766924e-06, "loss": 0.2381, "step": 20790 }, { "epoch": 1.8998903909389844, "grad_norm": 5.180422306060791, "learning_rate": 3.566719199344554e-06, "loss": 0.2835, "step": 20800 }, { "epoch": 1.9008037997807818, "grad_norm": 5.892490386962891, "learning_rate": 3.5616289189395704e-06, "loss": 0.1833, "step": 20810 }, { "epoch": 1.9017172086225793, "grad_norm": 4.1245245933532715, "learning_rate": 3.5565402633018963e-06, "loss": 0.1933, "step": 20820 }, { "epoch": 1.902630617464377, "grad_norm": 2.450327157974243, "learning_rate": 3.551453238179617e-06, "loss": 0.1707, "step": 20830 }, { "epoch": 1.9035440263061747, "grad_norm": 1.7173906564712524, "learning_rate": 3.546367849318979e-06, "loss": 0.2036, "step": 20840 }, { "epoch": 1.9044574351479722, "grad_norm": 5.287848472595215, "learning_rate": 3.5412841024643752e-06, "loss": 0.1968, "step": 20850 }, { "epoch": 1.9053708439897699, "grad_norm": 3.1491591930389404, "learning_rate": 3.536202003358351e-06, "loss": 0.1813, "step": 20860 }, { "epoch": 1.9062842528315675, "grad_norm": 9.404170989990234, "learning_rate": 3.531121557741586e-06, "loss": 0.215, "step": 20870 }, { "epoch": 1.907197661673365, "grad_norm": 3.00777530670166, "learning_rate": 3.5260427713528905e-06, "loss": 0.2251, "step": 20880 }, { "epoch": 1.9081110705151625, "grad_norm": 4.075198173522949, "learning_rate": 3.5209656499292045e-06, "loss": 0.1688, "step": 20890 }, { "epoch": 1.9090244793569602, "grad_norm": 2.8359079360961914, "learning_rate": 3.515890199205586e-06, "loss": 0.2009, "step": 20900 }, { "epoch": 1.9099378881987579, "grad_norm": 3.699310541152954, "learning_rate": 3.510816424915203e-06, "loss": 0.1743, "step": 20910 }, { "epoch": 1.9108512970405553, "grad_norm": 3.5513832569122314, "learning_rate": 3.505744332789336e-06, "loss": 0.2126, "step": 20920 }, { "epoch": 1.9117647058823528, "grad_norm": 2.7247226238250732, "learning_rate": 3.5006739285573575e-06, "loss": 0.177, "step": 20930 }, { "epoch": 1.9126781147241505, "grad_norm": 2.881704330444336, "learning_rate": 3.4956052179467383e-06, "loss": 0.1581, "step": 20940 }, { "epoch": 1.9135915235659482, "grad_norm": 2.5409088134765625, "learning_rate": 3.4905382066830374e-06, "loss": 0.1991, "step": 20950 }, { "epoch": 1.9145049324077457, "grad_norm": 4.450331211090088, "learning_rate": 3.4854729004898903e-06, "loss": 0.1989, "step": 20960 }, { "epoch": 1.9154183412495431, "grad_norm": 8.495599746704102, "learning_rate": 3.4804093050890074e-06, "loss": 0.2022, "step": 20970 }, { "epoch": 1.916331750091341, "grad_norm": 2.8712286949157715, "learning_rate": 3.4753474262001703e-06, "loss": 0.2082, "step": 20980 }, { "epoch": 1.9172451589331385, "grad_norm": 2.9150359630584717, "learning_rate": 3.470287269541216e-06, "loss": 0.1993, "step": 20990 }, { "epoch": 1.918158567774936, "grad_norm": 3.311730146408081, "learning_rate": 3.46522884082804e-06, "loss": 0.1618, "step": 21000 }, { "epoch": 1.9190719766167337, "grad_norm": 2.4991331100463867, "learning_rate": 3.460172145774584e-06, "loss": 0.1828, "step": 21010 }, { "epoch": 1.9199853854585314, "grad_norm": 3.8922200202941895, "learning_rate": 3.4551171900928336e-06, "loss": 0.285, "step": 21020 }, { "epoch": 1.9208987943003288, "grad_norm": 2.2155814170837402, "learning_rate": 3.4500639794928064e-06, "loss": 0.1724, "step": 21030 }, { "epoch": 1.9218122031421263, "grad_norm": 3.1052117347717285, "learning_rate": 3.44501251968255e-06, "loss": 0.2121, "step": 21040 }, { "epoch": 1.922725611983924, "grad_norm": 2.3461010456085205, "learning_rate": 3.4399628163681386e-06, "loss": 0.1847, "step": 21050 }, { "epoch": 1.9236390208257217, "grad_norm": 3.152574062347412, "learning_rate": 3.4349148752536552e-06, "loss": 0.2277, "step": 21060 }, { "epoch": 1.9245524296675192, "grad_norm": 3.628368854522705, "learning_rate": 3.429868702041197e-06, "loss": 0.1919, "step": 21070 }, { "epoch": 1.9254658385093166, "grad_norm": 3.2155113220214844, "learning_rate": 3.4248243024308624e-06, "loss": 0.1573, "step": 21080 }, { "epoch": 1.9263792473511143, "grad_norm": 3.596201181411743, "learning_rate": 3.4197816821207463e-06, "loss": 0.2146, "step": 21090 }, { "epoch": 1.927292656192912, "grad_norm": 2.862607479095459, "learning_rate": 3.4147408468069364e-06, "loss": 0.1925, "step": 21100 }, { "epoch": 1.9282060650347095, "grad_norm": 3.080683946609497, "learning_rate": 3.4097018021834984e-06, "loss": 0.2074, "step": 21110 }, { "epoch": 1.9291194738765072, "grad_norm": 3.52921462059021, "learning_rate": 3.4046645539424815e-06, "loss": 0.2415, "step": 21120 }, { "epoch": 1.9300328827183049, "grad_norm": 1.5651183128356934, "learning_rate": 3.3996291077738995e-06, "loss": 0.1704, "step": 21130 }, { "epoch": 1.9309462915601023, "grad_norm": 2.379210948944092, "learning_rate": 3.394595469365738e-06, "loss": 0.1581, "step": 21140 }, { "epoch": 1.9318597004018998, "grad_norm": 4.417806148529053, "learning_rate": 3.3895636444039364e-06, "loss": 0.1428, "step": 21150 }, { "epoch": 1.9327731092436975, "grad_norm": 4.523859024047852, "learning_rate": 3.384533638572383e-06, "loss": 0.2186, "step": 21160 }, { "epoch": 1.9336865180854952, "grad_norm": 2.7495956420898438, "learning_rate": 3.379505457552916e-06, "loss": 0.1788, "step": 21170 }, { "epoch": 1.9345999269272927, "grad_norm": 4.610747814178467, "learning_rate": 3.3744791070253112e-06, "loss": 0.2099, "step": 21180 }, { "epoch": 1.9355133357690901, "grad_norm": 5.475003719329834, "learning_rate": 3.369454592667274e-06, "loss": 0.2283, "step": 21190 }, { "epoch": 1.9364267446108878, "grad_norm": 3.9777209758758545, "learning_rate": 3.364431920154439e-06, "loss": 0.2012, "step": 21200 }, { "epoch": 1.9373401534526855, "grad_norm": 3.232109546661377, "learning_rate": 3.3594110951603574e-06, "loss": 0.2688, "step": 21210 }, { "epoch": 1.938253562294483, "grad_norm": 7.473545074462891, "learning_rate": 3.354392123356497e-06, "loss": 0.2212, "step": 21220 }, { "epoch": 1.9391669711362804, "grad_norm": 11.799448013305664, "learning_rate": 3.349375010412227e-06, "loss": 0.23, "step": 21230 }, { "epoch": 1.9400803799780781, "grad_norm": 2.670046091079712, "learning_rate": 3.344359761994823e-06, "loss": 0.2106, "step": 21240 }, { "epoch": 1.9409937888198758, "grad_norm": 4.472700119018555, "learning_rate": 3.3393463837694517e-06, "loss": 0.1701, "step": 21250 }, { "epoch": 1.9419071976616733, "grad_norm": 3.456036329269409, "learning_rate": 3.334334881399166e-06, "loss": 0.2294, "step": 21260 }, { "epoch": 1.942820606503471, "grad_norm": 6.726315021514893, "learning_rate": 3.3293252605449e-06, "loss": 0.1733, "step": 21270 }, { "epoch": 1.9437340153452687, "grad_norm": 5.93501091003418, "learning_rate": 3.3243175268654656e-06, "loss": 0.19, "step": 21280 }, { "epoch": 1.9446474241870662, "grad_norm": 2.4673421382904053, "learning_rate": 3.3193116860175378e-06, "loss": 0.1656, "step": 21290 }, { "epoch": 1.9455608330288636, "grad_norm": 2.5548319816589355, "learning_rate": 3.3143077436556603e-06, "loss": 0.2299, "step": 21300 }, { "epoch": 1.9464742418706613, "grad_norm": 3.395618438720703, "learning_rate": 3.309305705432224e-06, "loss": 0.2426, "step": 21310 }, { "epoch": 1.947387650712459, "grad_norm": 4.950746536254883, "learning_rate": 3.304305576997475e-06, "loss": 0.1715, "step": 21320 }, { "epoch": 1.9483010595542565, "grad_norm": 4.537391662597656, "learning_rate": 3.299307363999503e-06, "loss": 0.1798, "step": 21330 }, { "epoch": 1.949214468396054, "grad_norm": 5.380403995513916, "learning_rate": 3.2943110720842284e-06, "loss": 0.1858, "step": 21340 }, { "epoch": 1.9501278772378516, "grad_norm": 4.565369606018066, "learning_rate": 3.2893167068954064e-06, "loss": 0.1849, "step": 21350 }, { "epoch": 1.9510412860796493, "grad_norm": 2.0187571048736572, "learning_rate": 3.284324274074614e-06, "loss": 0.1855, "step": 21360 }, { "epoch": 1.9519546949214468, "grad_norm": 6.949159145355225, "learning_rate": 3.2793337792612435e-06, "loss": 0.2084, "step": 21370 }, { "epoch": 1.9528681037632443, "grad_norm": 2.535529851913452, "learning_rate": 3.2743452280925027e-06, "loss": 0.1707, "step": 21380 }, { "epoch": 1.9537815126050422, "grad_norm": 2.3787803649902344, "learning_rate": 3.2693586262033985e-06, "loss": 0.224, "step": 21390 }, { "epoch": 1.9546949214468397, "grad_norm": 6.5898284912109375, "learning_rate": 3.2643739792267416e-06, "loss": 0.2032, "step": 21400 }, { "epoch": 1.9556083302886371, "grad_norm": 4.538728713989258, "learning_rate": 3.259391292793128e-06, "loss": 0.2185, "step": 21410 }, { "epoch": 1.9565217391304348, "grad_norm": 3.9661877155303955, "learning_rate": 3.2544105725309448e-06, "loss": 0.1929, "step": 21420 }, { "epoch": 1.9574351479722325, "grad_norm": 2.6184892654418945, "learning_rate": 3.2494318240663557e-06, "loss": 0.2474, "step": 21430 }, { "epoch": 1.95834855681403, "grad_norm": 2.1678168773651123, "learning_rate": 3.2444550530232984e-06, "loss": 0.1922, "step": 21440 }, { "epoch": 1.9592619656558274, "grad_norm": 3.6219687461853027, "learning_rate": 3.2394802650234738e-06, "loss": 0.2059, "step": 21450 }, { "epoch": 1.9601753744976251, "grad_norm": 3.2144317626953125, "learning_rate": 3.234507465686346e-06, "loss": 0.1748, "step": 21460 }, { "epoch": 1.9610887833394228, "grad_norm": 2.561450958251953, "learning_rate": 3.22953666062913e-06, "loss": 0.1966, "step": 21470 }, { "epoch": 1.9620021921812203, "grad_norm": 2.9424381256103516, "learning_rate": 3.2245678554667926e-06, "loss": 0.1826, "step": 21480 }, { "epoch": 1.9629156010230178, "grad_norm": 3.2352309226989746, "learning_rate": 3.219601055812035e-06, "loss": 0.1895, "step": 21490 }, { "epoch": 1.9638290098648155, "grad_norm": 3.6719725131988525, "learning_rate": 3.2146362672752997e-06, "loss": 0.1745, "step": 21500 }, { "epoch": 1.9647424187066131, "grad_norm": 3.8546993732452393, "learning_rate": 3.20967349546475e-06, "loss": 0.2459, "step": 21510 }, { "epoch": 1.9656558275484106, "grad_norm": 3.1922106742858887, "learning_rate": 3.204712745986279e-06, "loss": 0.1676, "step": 21520 }, { "epoch": 1.9665692363902083, "grad_norm": 2.1237525939941406, "learning_rate": 3.199754024443492e-06, "loss": 0.1813, "step": 21530 }, { "epoch": 1.967482645232006, "grad_norm": 2.6765012741088867, "learning_rate": 3.1947973364377034e-06, "loss": 0.2282, "step": 21540 }, { "epoch": 1.9683960540738035, "grad_norm": 2.1465816497802734, "learning_rate": 3.1898426875679283e-06, "loss": 0.1715, "step": 21550 }, { "epoch": 1.969309462915601, "grad_norm": 8.257621765136719, "learning_rate": 3.184890083430883e-06, "loss": 0.1674, "step": 21560 }, { "epoch": 1.9702228717573986, "grad_norm": 3.5949268341064453, "learning_rate": 3.17993952962097e-06, "loss": 0.2136, "step": 21570 }, { "epoch": 1.9711362805991963, "grad_norm": 2.3247873783111572, "learning_rate": 3.174991031730279e-06, "loss": 0.183, "step": 21580 }, { "epoch": 1.9720496894409938, "grad_norm": 4.731454849243164, "learning_rate": 3.1700445953485746e-06, "loss": 0.2286, "step": 21590 }, { "epoch": 1.9729630982827913, "grad_norm": 2.248162031173706, "learning_rate": 3.165100226063295e-06, "loss": 0.1916, "step": 21600 }, { "epoch": 1.973876507124589, "grad_norm": 3.6380207538604736, "learning_rate": 3.1601579294595404e-06, "loss": 0.1468, "step": 21610 }, { "epoch": 1.9747899159663866, "grad_norm": 2.7508530616760254, "learning_rate": 3.1552177111200744e-06, "loss": 0.1574, "step": 21620 }, { "epoch": 1.9757033248081841, "grad_norm": 1.6064592599868774, "learning_rate": 3.150279576625309e-06, "loss": 0.1963, "step": 21630 }, { "epoch": 1.9766167336499816, "grad_norm": 1.4188392162322998, "learning_rate": 3.145343531553306e-06, "loss": 0.1671, "step": 21640 }, { "epoch": 1.9775301424917793, "grad_norm": 4.103569030761719, "learning_rate": 3.1404095814797615e-06, "loss": 0.1545, "step": 21650 }, { "epoch": 1.978443551333577, "grad_norm": 4.2222089767456055, "learning_rate": 3.1354777319780106e-06, "loss": 0.2551, "step": 21660 }, { "epoch": 1.9793569601753744, "grad_norm": 4.963714599609375, "learning_rate": 3.1305479886190116e-06, "loss": 0.2002, "step": 21670 }, { "epoch": 1.9802703690171721, "grad_norm": 3.694347381591797, "learning_rate": 3.1256203569713472e-06, "loss": 0.1874, "step": 21680 }, { "epoch": 1.9811837778589698, "grad_norm": 2.1690940856933594, "learning_rate": 3.1206948426012115e-06, "loss": 0.1493, "step": 21690 }, { "epoch": 1.9820971867007673, "grad_norm": 3.0308618545532227, "learning_rate": 3.115771451072408e-06, "loss": 0.2029, "step": 21700 }, { "epoch": 1.9830105955425648, "grad_norm": 3.3117971420288086, "learning_rate": 3.110850187946345e-06, "loss": 0.2912, "step": 21710 }, { "epoch": 1.9839240043843624, "grad_norm": 2.7507150173187256, "learning_rate": 3.1059310587820236e-06, "loss": 0.1704, "step": 21720 }, { "epoch": 1.9848374132261601, "grad_norm": 2.759589672088623, "learning_rate": 3.101014069136034e-06, "loss": 0.1715, "step": 21730 }, { "epoch": 1.9857508220679576, "grad_norm": 2.5538713932037354, "learning_rate": 3.0960992245625527e-06, "loss": 0.2403, "step": 21740 }, { "epoch": 1.986664230909755, "grad_norm": 9.953274726867676, "learning_rate": 3.091186530613329e-06, "loss": 0.1835, "step": 21750 }, { "epoch": 1.9875776397515528, "grad_norm": 4.306700229644775, "learning_rate": 3.086275992837687e-06, "loss": 0.1931, "step": 21760 }, { "epoch": 1.9884910485933505, "grad_norm": 3.2226431369781494, "learning_rate": 3.081367616782511e-06, "loss": 0.1879, "step": 21770 }, { "epoch": 1.989404457435148, "grad_norm": 2.7904183864593506, "learning_rate": 3.0764614079922483e-06, "loss": 0.2304, "step": 21780 }, { "epoch": 1.9903178662769454, "grad_norm": 8.378860473632812, "learning_rate": 3.071557372008892e-06, "loss": 0.2125, "step": 21790 }, { "epoch": 1.9912312751187433, "grad_norm": 4.546781539916992, "learning_rate": 3.0666555143719876e-06, "loss": 0.1975, "step": 21800 }, { "epoch": 1.9921446839605408, "grad_norm": 2.9643616676330566, "learning_rate": 3.061755840618616e-06, "loss": 0.2039, "step": 21810 }, { "epoch": 1.9930580928023383, "grad_norm": 2.2375316619873047, "learning_rate": 3.056858356283392e-06, "loss": 0.1828, "step": 21820 }, { "epoch": 1.993971501644136, "grad_norm": 3.1812820434570312, "learning_rate": 3.051963066898457e-06, "loss": 0.2053, "step": 21830 }, { "epoch": 1.9948849104859336, "grad_norm": 9.96502685546875, "learning_rate": 3.047069977993472e-06, "loss": 0.2086, "step": 21840 }, { "epoch": 1.995798319327731, "grad_norm": 3.017766237258911, "learning_rate": 3.0421790950956147e-06, "loss": 0.2694, "step": 21850 }, { "epoch": 1.9967117281695286, "grad_norm": 10.595418930053711, "learning_rate": 3.037290423729571e-06, "loss": 0.2004, "step": 21860 }, { "epoch": 1.9976251370113263, "grad_norm": 2.954667091369629, "learning_rate": 3.032403969417523e-06, "loss": 0.1403, "step": 21870 }, { "epoch": 1.998538545853124, "grad_norm": 3.1461727619171143, "learning_rate": 3.027519737679159e-06, "loss": 0.2262, "step": 21880 }, { "epoch": 1.9994519546949214, "grad_norm": 4.5667805671691895, "learning_rate": 3.022637734031644e-06, "loss": 0.1714, "step": 21890 }, { "epoch": 2.000365363536719, "grad_norm": 2.447345495223999, "learning_rate": 3.0177579639896375e-06, "loss": 0.1983, "step": 21900 }, { "epoch": 2.001278772378517, "grad_norm": 3.2306392192840576, "learning_rate": 3.0128804330652707e-06, "loss": 0.1776, "step": 21910 }, { "epoch": 2.0021921812203143, "grad_norm": 6.548824787139893, "learning_rate": 3.0080051467681446e-06, "loss": 0.1237, "step": 21920 }, { "epoch": 2.0031055900621118, "grad_norm": 4.609522342681885, "learning_rate": 3.003132110605327e-06, "loss": 0.192, "step": 21930 }, { "epoch": 2.004018998903909, "grad_norm": 3.885193109512329, "learning_rate": 2.9982613300813434e-06, "loss": 0.1928, "step": 21940 }, { "epoch": 2.004932407745707, "grad_norm": 3.8460092544555664, "learning_rate": 2.99339281069817e-06, "loss": 0.1682, "step": 21950 }, { "epoch": 2.0058458165875046, "grad_norm": 3.7532596588134766, "learning_rate": 2.9885265579552303e-06, "loss": 0.1881, "step": 21960 }, { "epoch": 2.006759225429302, "grad_norm": 3.910675525665283, "learning_rate": 2.9836625773493867e-06, "loss": 0.1907, "step": 21970 }, { "epoch": 2.0076726342710995, "grad_norm": 4.5505900382995605, "learning_rate": 2.9788008743749346e-06, "loss": 0.1703, "step": 21980 }, { "epoch": 2.0085860431128975, "grad_norm": 3.612414598464966, "learning_rate": 2.9739414545235996e-06, "loss": 0.158, "step": 21990 }, { "epoch": 2.009499451954695, "grad_norm": 3.6752827167510986, "learning_rate": 2.9690843232845256e-06, "loss": 0.1518, "step": 22000 }, { "epoch": 2.0104128607964924, "grad_norm": 2.8304386138916016, "learning_rate": 2.9642294861442704e-06, "loss": 0.1901, "step": 22010 }, { "epoch": 2.0113262696382903, "grad_norm": 2.27457332611084, "learning_rate": 2.959376948586804e-06, "loss": 0.1979, "step": 22020 }, { "epoch": 2.012239678480088, "grad_norm": 7.495784282684326, "learning_rate": 2.954526716093493e-06, "loss": 0.2188, "step": 22030 }, { "epoch": 2.0131530873218852, "grad_norm": 2.060793399810791, "learning_rate": 2.949678794143108e-06, "loss": 0.1533, "step": 22040 }, { "epoch": 2.0140664961636827, "grad_norm": 4.081784725189209, "learning_rate": 2.944833188211802e-06, "loss": 0.2225, "step": 22050 }, { "epoch": 2.0149799050054806, "grad_norm": 4.322390079498291, "learning_rate": 2.939989903773117e-06, "loss": 0.1707, "step": 22060 }, { "epoch": 2.015893313847278, "grad_norm": 4.852595329284668, "learning_rate": 2.935148946297971e-06, "loss": 0.1508, "step": 22070 }, { "epoch": 2.0168067226890756, "grad_norm": 3.134838104248047, "learning_rate": 2.9303103212546513e-06, "loss": 0.1774, "step": 22080 }, { "epoch": 2.017720131530873, "grad_norm": 1.426194190979004, "learning_rate": 2.925474034108816e-06, "loss": 0.1353, "step": 22090 }, { "epoch": 2.018633540372671, "grad_norm": 5.508920669555664, "learning_rate": 2.920640090323479e-06, "loss": 0.1722, "step": 22100 }, { "epoch": 2.0195469492144684, "grad_norm": 2.2911195755004883, "learning_rate": 2.9158084953590036e-06, "loss": 0.171, "step": 22110 }, { "epoch": 2.020460358056266, "grad_norm": 5.641573429107666, "learning_rate": 2.910979254673108e-06, "loss": 0.2366, "step": 22120 }, { "epoch": 2.0213737668980634, "grad_norm": 3.50252628326416, "learning_rate": 2.9061523737208442e-06, "loss": 0.2434, "step": 22130 }, { "epoch": 2.0222871757398613, "grad_norm": 2.7768747806549072, "learning_rate": 2.901327857954598e-06, "loss": 0.1356, "step": 22140 }, { "epoch": 2.0232005845816587, "grad_norm": 3.271028995513916, "learning_rate": 2.8965057128240924e-06, "loss": 0.1431, "step": 22150 }, { "epoch": 2.024113993423456, "grad_norm": 3.8206565380096436, "learning_rate": 2.8916859437763634e-06, "loss": 0.2208, "step": 22160 }, { "epoch": 2.025027402265254, "grad_norm": 4.289015769958496, "learning_rate": 2.886868556255765e-06, "loss": 0.1762, "step": 22170 }, { "epoch": 2.0259408111070516, "grad_norm": 1.5583584308624268, "learning_rate": 2.882053555703964e-06, "loss": 0.165, "step": 22180 }, { "epoch": 2.026854219948849, "grad_norm": 2.6230976581573486, "learning_rate": 2.8772409475599317e-06, "loss": 0.1512, "step": 22190 }, { "epoch": 2.0277676287906465, "grad_norm": 3.1448771953582764, "learning_rate": 2.8724307372599314e-06, "loss": 0.1691, "step": 22200 }, { "epoch": 2.0286810376324445, "grad_norm": 3.988848924636841, "learning_rate": 2.867622930237522e-06, "loss": 0.1279, "step": 22210 }, { "epoch": 2.029594446474242, "grad_norm": 2.0142593383789062, "learning_rate": 2.8628175319235443e-06, "loss": 0.1611, "step": 22220 }, { "epoch": 2.0305078553160394, "grad_norm": 2.874131441116333, "learning_rate": 2.858014547746124e-06, "loss": 0.195, "step": 22230 }, { "epoch": 2.031421264157837, "grad_norm": 2.8047266006469727, "learning_rate": 2.8532139831306553e-06, "loss": 0.1536, "step": 22240 }, { "epoch": 2.0323346729996348, "grad_norm": 3.0776174068450928, "learning_rate": 2.8484158434998e-06, "loss": 0.1665, "step": 22250 }, { "epoch": 2.0332480818414322, "grad_norm": 4.965786457061768, "learning_rate": 2.8436201342734787e-06, "loss": 0.178, "step": 22260 }, { "epoch": 2.0341614906832297, "grad_norm": 3.069941759109497, "learning_rate": 2.838826860868872e-06, "loss": 0.1897, "step": 22270 }, { "epoch": 2.0350748995250276, "grad_norm": 3.840886116027832, "learning_rate": 2.8340360287004066e-06, "loss": 0.1756, "step": 22280 }, { "epoch": 2.035988308366825, "grad_norm": 3.038313388824463, "learning_rate": 2.8292476431797494e-06, "loss": 0.2242, "step": 22290 }, { "epoch": 2.0369017172086226, "grad_norm": 2.813964605331421, "learning_rate": 2.824461709715804e-06, "loss": 0.1913, "step": 22300 }, { "epoch": 2.03781512605042, "grad_norm": 4.3804612159729, "learning_rate": 2.819678233714709e-06, "loss": 0.1666, "step": 22310 }, { "epoch": 2.038728534892218, "grad_norm": 3.341142177581787, "learning_rate": 2.8148972205798207e-06, "loss": 0.1706, "step": 22320 }, { "epoch": 2.0396419437340154, "grad_norm": 5.812865734100342, "learning_rate": 2.810118675711718e-06, "loss": 0.1458, "step": 22330 }, { "epoch": 2.040555352575813, "grad_norm": 4.161381721496582, "learning_rate": 2.8053426045081855e-06, "loss": 0.2328, "step": 22340 }, { "epoch": 2.0414687614176104, "grad_norm": 2.6349499225616455, "learning_rate": 2.800569012364224e-06, "loss": 0.2089, "step": 22350 }, { "epoch": 2.0423821702594083, "grad_norm": 4.9546027183532715, "learning_rate": 2.795797904672022e-06, "loss": 0.2201, "step": 22360 }, { "epoch": 2.0432955791012057, "grad_norm": 7.29281759262085, "learning_rate": 2.7910292868209733e-06, "loss": 0.1381, "step": 22370 }, { "epoch": 2.044208987943003, "grad_norm": 2.7406411170959473, "learning_rate": 2.786263164197648e-06, "loss": 0.1665, "step": 22380 }, { "epoch": 2.0451223967848007, "grad_norm": 2.871340274810791, "learning_rate": 2.781499542185807e-06, "loss": 0.1793, "step": 22390 }, { "epoch": 2.0460358056265986, "grad_norm": 3.185056686401367, "learning_rate": 2.776738426166381e-06, "loss": 0.1532, "step": 22400 }, { "epoch": 2.046949214468396, "grad_norm": 1.518349528312683, "learning_rate": 2.771979821517472e-06, "loss": 0.1814, "step": 22410 }, { "epoch": 2.0478626233101935, "grad_norm": 2.0669658184051514, "learning_rate": 2.767223733614343e-06, "loss": 0.1641, "step": 22420 }, { "epoch": 2.0487760321519914, "grad_norm": 4.553879737854004, "learning_rate": 2.7624701678294173e-06, "loss": 0.1834, "step": 22430 }, { "epoch": 2.049689440993789, "grad_norm": 2.200589418411255, "learning_rate": 2.7577191295322684e-06, "loss": 0.1886, "step": 22440 }, { "epoch": 2.0506028498355864, "grad_norm": 3.1154184341430664, "learning_rate": 2.7529706240896113e-06, "loss": 0.2181, "step": 22450 }, { "epoch": 2.051516258677384, "grad_norm": 4.002124786376953, "learning_rate": 2.748224656865304e-06, "loss": 0.202, "step": 22460 }, { "epoch": 2.0524296675191818, "grad_norm": 2.686432361602783, "learning_rate": 2.7434812332203388e-06, "loss": 0.1732, "step": 22470 }, { "epoch": 2.0533430763609792, "grad_norm": 2.300462245941162, "learning_rate": 2.7387403585128304e-06, "loss": 0.1655, "step": 22480 }, { "epoch": 2.0542564852027767, "grad_norm": 3.1422176361083984, "learning_rate": 2.734002038098015e-06, "loss": 0.2401, "step": 22490 }, { "epoch": 2.055169894044574, "grad_norm": 2.555368423461914, "learning_rate": 2.729266277328243e-06, "loss": 0.1663, "step": 22500 }, { "epoch": 2.056083302886372, "grad_norm": 2.4573779106140137, "learning_rate": 2.724533081552979e-06, "loss": 0.1489, "step": 22510 }, { "epoch": 2.0569967117281696, "grad_norm": 4.263746738433838, "learning_rate": 2.7198024561187843e-06, "loss": 0.1473, "step": 22520 }, { "epoch": 2.057910120569967, "grad_norm": 3.380519390106201, "learning_rate": 2.715074406369318e-06, "loss": 0.1685, "step": 22530 }, { "epoch": 2.0588235294117645, "grad_norm": 3.3676977157592773, "learning_rate": 2.7103489376453286e-06, "loss": 0.2001, "step": 22540 }, { "epoch": 2.0597369382535624, "grad_norm": 2.1870522499084473, "learning_rate": 2.7056260552846524e-06, "loss": 0.1181, "step": 22550 }, { "epoch": 2.06065034709536, "grad_norm": 4.428742408752441, "learning_rate": 2.7009057646222046e-06, "loss": 0.1752, "step": 22560 }, { "epoch": 2.0615637559371573, "grad_norm": 3.5527279376983643, "learning_rate": 2.69618807098997e-06, "loss": 0.1352, "step": 22570 }, { "epoch": 2.0624771647789553, "grad_norm": 4.677976131439209, "learning_rate": 2.691472979716998e-06, "loss": 0.2036, "step": 22580 }, { "epoch": 2.0633905736207527, "grad_norm": 3.3277053833007812, "learning_rate": 2.6867604961294047e-06, "loss": 0.1372, "step": 22590 }, { "epoch": 2.06430398246255, "grad_norm": 2.9552929401397705, "learning_rate": 2.6820506255503555e-06, "loss": 0.162, "step": 22600 }, { "epoch": 2.0652173913043477, "grad_norm": 2.4577066898345947, "learning_rate": 2.6773433733000655e-06, "loss": 0.1612, "step": 22610 }, { "epoch": 2.0661308001461456, "grad_norm": 6.2337446212768555, "learning_rate": 2.6726387446957893e-06, "loss": 0.155, "step": 22620 }, { "epoch": 2.067044208987943, "grad_norm": 7.810189247131348, "learning_rate": 2.6679367450518267e-06, "loss": 0.1748, "step": 22630 }, { "epoch": 2.0679576178297405, "grad_norm": 2.8394131660461426, "learning_rate": 2.6632373796794965e-06, "loss": 0.1723, "step": 22640 }, { "epoch": 2.068871026671538, "grad_norm": 3.356693744659424, "learning_rate": 2.6585406538871523e-06, "loss": 0.2176, "step": 22650 }, { "epoch": 2.069784435513336, "grad_norm": 4.209534645080566, "learning_rate": 2.6538465729801567e-06, "loss": 0.194, "step": 22660 }, { "epoch": 2.0706978443551334, "grad_norm": 4.6598663330078125, "learning_rate": 2.649155142260893e-06, "loss": 0.1903, "step": 22670 }, { "epoch": 2.071611253196931, "grad_norm": 3.736055374145508, "learning_rate": 2.6444663670287452e-06, "loss": 0.1925, "step": 22680 }, { "epoch": 2.0725246620387283, "grad_norm": 3.722409725189209, "learning_rate": 2.6397802525801e-06, "loss": 0.1788, "step": 22690 }, { "epoch": 2.0734380708805262, "grad_norm": 4.327380657196045, "learning_rate": 2.635096804208335e-06, "loss": 0.1848, "step": 22700 }, { "epoch": 2.0743514797223237, "grad_norm": 2.185450315475464, "learning_rate": 2.6304160272038237e-06, "loss": 0.1708, "step": 22710 }, { "epoch": 2.075264888564121, "grad_norm": 2.4090828895568848, "learning_rate": 2.625737926853916e-06, "loss": 0.1641, "step": 22720 }, { "epoch": 2.076178297405919, "grad_norm": 5.846837043762207, "learning_rate": 2.621062508442939e-06, "loss": 0.1627, "step": 22730 }, { "epoch": 2.0770917062477166, "grad_norm": 3.6455435752868652, "learning_rate": 2.6163897772521885e-06, "loss": 0.2001, "step": 22740 }, { "epoch": 2.078005115089514, "grad_norm": 6.319693088531494, "learning_rate": 2.6117197385599334e-06, "loss": 0.1876, "step": 22750 }, { "epoch": 2.0789185239313115, "grad_norm": 6.800601959228516, "learning_rate": 2.6070523976413933e-06, "loss": 0.212, "step": 22760 }, { "epoch": 2.0798319327731094, "grad_norm": 3.0084424018859863, "learning_rate": 2.602387759768741e-06, "loss": 0.1775, "step": 22770 }, { "epoch": 2.080745341614907, "grad_norm": 3.2751858234405518, "learning_rate": 2.5977258302110963e-06, "loss": 0.187, "step": 22780 }, { "epoch": 2.0816587504567043, "grad_norm": 3.4728689193725586, "learning_rate": 2.5930666142345252e-06, "loss": 0.1645, "step": 22790 }, { "epoch": 2.082572159298502, "grad_norm": 3.689594030380249, "learning_rate": 2.5884101171020203e-06, "loss": 0.1301, "step": 22800 }, { "epoch": 2.0834855681402997, "grad_norm": 6.168824195861816, "learning_rate": 2.583756344073509e-06, "loss": 0.2096, "step": 22810 }, { "epoch": 2.084398976982097, "grad_norm": 3.1447808742523193, "learning_rate": 2.5791053004058365e-06, "loss": 0.2001, "step": 22820 }, { "epoch": 2.0853123858238947, "grad_norm": 4.1621222496032715, "learning_rate": 2.5744569913527685e-06, "loss": 0.1534, "step": 22830 }, { "epoch": 2.0862257946656926, "grad_norm": 2.5590014457702637, "learning_rate": 2.569811422164985e-06, "loss": 0.1689, "step": 22840 }, { "epoch": 2.08713920350749, "grad_norm": 5.949441432952881, "learning_rate": 2.5651685980900644e-06, "loss": 0.1904, "step": 22850 }, { "epoch": 2.0880526123492875, "grad_norm": 1.4681676626205444, "learning_rate": 2.5605285243724853e-06, "loss": 0.184, "step": 22860 }, { "epoch": 2.088966021191085, "grad_norm": 1.6545701026916504, "learning_rate": 2.5558912062536246e-06, "loss": 0.1766, "step": 22870 }, { "epoch": 2.089879430032883, "grad_norm": 3.2384257316589355, "learning_rate": 2.551256648971741e-06, "loss": 0.1664, "step": 22880 }, { "epoch": 2.0907928388746804, "grad_norm": 5.570948123931885, "learning_rate": 2.546624857761978e-06, "loss": 0.1789, "step": 22890 }, { "epoch": 2.091706247716478, "grad_norm": 5.022153854370117, "learning_rate": 2.541995837856349e-06, "loss": 0.2084, "step": 22900 }, { "epoch": 2.0926196565582753, "grad_norm": 2.852349281311035, "learning_rate": 2.5373695944837482e-06, "loss": 0.1694, "step": 22910 }, { "epoch": 2.0935330654000732, "grad_norm": 3.3367795944213867, "learning_rate": 2.5327461328699223e-06, "loss": 0.1456, "step": 22920 }, { "epoch": 2.0944464742418707, "grad_norm": 3.963184118270874, "learning_rate": 2.5281254582374786e-06, "loss": 0.2031, "step": 22930 }, { "epoch": 2.095359883083668, "grad_norm": 1.6649410724639893, "learning_rate": 2.5235075758058804e-06, "loss": 0.1373, "step": 22940 }, { "epoch": 2.0962732919254656, "grad_norm": 2.4949393272399902, "learning_rate": 2.5188924907914348e-06, "loss": 0.175, "step": 22950 }, { "epoch": 2.0971867007672635, "grad_norm": 3.7336626052856445, "learning_rate": 2.514280208407287e-06, "loss": 0.1695, "step": 22960 }, { "epoch": 2.098100109609061, "grad_norm": 2.9249022006988525, "learning_rate": 2.509670733863418e-06, "loss": 0.2042, "step": 22970 }, { "epoch": 2.0990135184508585, "grad_norm": 1.8535680770874023, "learning_rate": 2.505064072366635e-06, "loss": 0.1992, "step": 22980 }, { "epoch": 2.0999269272926564, "grad_norm": 2.719829797744751, "learning_rate": 2.5004602291205727e-06, "loss": 0.1985, "step": 22990 }, { "epoch": 2.100840336134454, "grad_norm": 2.236696720123291, "learning_rate": 2.4958592093256772e-06, "loss": 0.181, "step": 23000 }, { "epoch": 2.1017537449762513, "grad_norm": 4.617887020111084, "learning_rate": 2.491261018179208e-06, "loss": 0.1557, "step": 23010 }, { "epoch": 2.102667153818049, "grad_norm": 5.0086822509765625, "learning_rate": 2.486665660875224e-06, "loss": 0.1673, "step": 23020 }, { "epoch": 2.1035805626598467, "grad_norm": 4.240482330322266, "learning_rate": 2.4820731426045957e-06, "loss": 0.1645, "step": 23030 }, { "epoch": 2.104493971501644, "grad_norm": 3.5341274738311768, "learning_rate": 2.477483468554974e-06, "loss": 0.1855, "step": 23040 }, { "epoch": 2.1054073803434417, "grad_norm": 2.506225109100342, "learning_rate": 2.472896643910802e-06, "loss": 0.1781, "step": 23050 }, { "epoch": 2.106320789185239, "grad_norm": 5.535055637359619, "learning_rate": 2.468312673853302e-06, "loss": 0.1808, "step": 23060 }, { "epoch": 2.107234198027037, "grad_norm": 17.81688690185547, "learning_rate": 2.463731563560477e-06, "loss": 0.1702, "step": 23070 }, { "epoch": 2.1081476068688345, "grad_norm": 3.0537400245666504, "learning_rate": 2.4591533182070936e-06, "loss": 0.1933, "step": 23080 }, { "epoch": 2.109061015710632, "grad_norm": 7.497527599334717, "learning_rate": 2.454577942964686e-06, "loss": 0.1685, "step": 23090 }, { "epoch": 2.10997442455243, "grad_norm": 2.979519844055176, "learning_rate": 2.450005443001542e-06, "loss": 0.2036, "step": 23100 }, { "epoch": 2.1108878333942274, "grad_norm": 2.7559139728546143, "learning_rate": 2.445435823482709e-06, "loss": 0.1855, "step": 23110 }, { "epoch": 2.111801242236025, "grad_norm": 2.8852715492248535, "learning_rate": 2.440869089569972e-06, "loss": 0.2012, "step": 23120 }, { "epoch": 2.1127146510778223, "grad_norm": 2.92189884185791, "learning_rate": 2.4363052464218657e-06, "loss": 0.136, "step": 23130 }, { "epoch": 2.11362805991962, "grad_norm": 1.9618867635726929, "learning_rate": 2.431744299193649e-06, "loss": 0.1548, "step": 23140 }, { "epoch": 2.1145414687614177, "grad_norm": 2.430980682373047, "learning_rate": 2.42718625303732e-06, "loss": 0.1778, "step": 23150 }, { "epoch": 2.115454877603215, "grad_norm": 2.979595422744751, "learning_rate": 2.4226311131015927e-06, "loss": 0.1431, "step": 23160 }, { "epoch": 2.1163682864450126, "grad_norm": 2.732513904571533, "learning_rate": 2.4180788845319005e-06, "loss": 0.1863, "step": 23170 }, { "epoch": 2.1172816952868105, "grad_norm": 2.4581503868103027, "learning_rate": 2.4135295724703857e-06, "loss": 0.1587, "step": 23180 }, { "epoch": 2.118195104128608, "grad_norm": 3.3553550243377686, "learning_rate": 2.4089831820559024e-06, "loss": 0.1867, "step": 23190 }, { "epoch": 2.1191085129704055, "grad_norm": 3.833317756652832, "learning_rate": 2.4044397184239985e-06, "loss": 0.1713, "step": 23200 }, { "epoch": 2.120021921812203, "grad_norm": 2.8529393672943115, "learning_rate": 2.3998991867069167e-06, "loss": 0.1748, "step": 23210 }, { "epoch": 2.120935330654001, "grad_norm": 3.350764036178589, "learning_rate": 2.39536159203359e-06, "loss": 0.1931, "step": 23220 }, { "epoch": 2.1218487394957983, "grad_norm": 4.706206321716309, "learning_rate": 2.3908269395296358e-06, "loss": 0.1637, "step": 23230 }, { "epoch": 2.122762148337596, "grad_norm": 3.3329708576202393, "learning_rate": 2.386295234317342e-06, "loss": 0.1371, "step": 23240 }, { "epoch": 2.1236755571793937, "grad_norm": 2.7014682292938232, "learning_rate": 2.381766481515671e-06, "loss": 0.1758, "step": 23250 }, { "epoch": 2.124588966021191, "grad_norm": 4.746362209320068, "learning_rate": 2.3772406862402468e-06, "loss": 0.1829, "step": 23260 }, { "epoch": 2.1255023748629887, "grad_norm": 4.463862895965576, "learning_rate": 2.3727178536033607e-06, "loss": 0.2057, "step": 23270 }, { "epoch": 2.126415783704786, "grad_norm": 2.3942978382110596, "learning_rate": 2.3681979887139483e-06, "loss": 0.174, "step": 23280 }, { "epoch": 2.127329192546584, "grad_norm": 2.881181240081787, "learning_rate": 2.363681096677598e-06, "loss": 0.1758, "step": 23290 }, { "epoch": 2.1282426013883815, "grad_norm": 4.749820709228516, "learning_rate": 2.3591671825965344e-06, "loss": 0.1732, "step": 23300 }, { "epoch": 2.129156010230179, "grad_norm": 2.010133743286133, "learning_rate": 2.3546562515696295e-06, "loss": 0.1423, "step": 23310 }, { "epoch": 2.1300694190719764, "grad_norm": 2.8048973083496094, "learning_rate": 2.3501483086923767e-06, "loss": 0.1205, "step": 23320 }, { "epoch": 2.1309828279137744, "grad_norm": 3.524503231048584, "learning_rate": 2.345643359056894e-06, "loss": 0.1739, "step": 23330 }, { "epoch": 2.131896236755572, "grad_norm": 3.193326711654663, "learning_rate": 2.341141407751919e-06, "loss": 0.2206, "step": 23340 }, { "epoch": 2.1328096455973693, "grad_norm": 3.2348968982696533, "learning_rate": 2.3366424598628083e-06, "loss": 0.185, "step": 23350 }, { "epoch": 2.1337230544391668, "grad_norm": 3.3505282402038574, "learning_rate": 2.3321465204715194e-06, "loss": 0.2049, "step": 23360 }, { "epoch": 2.1346364632809647, "grad_norm": 5.073724269866943, "learning_rate": 2.3276535946566125e-06, "loss": 0.1528, "step": 23370 }, { "epoch": 2.135549872122762, "grad_norm": 3.0604512691497803, "learning_rate": 2.3231636874932427e-06, "loss": 0.189, "step": 23380 }, { "epoch": 2.1364632809645596, "grad_norm": 2.1668970584869385, "learning_rate": 2.318676804053162e-06, "loss": 0.1734, "step": 23390 }, { "epoch": 2.1373766898063575, "grad_norm": 2.4213898181915283, "learning_rate": 2.314192949404697e-06, "loss": 0.1317, "step": 23400 }, { "epoch": 2.138290098648155, "grad_norm": 6.1611714363098145, "learning_rate": 2.3097121286127623e-06, "loss": 0.2121, "step": 23410 }, { "epoch": 2.1392035074899525, "grad_norm": 2.9233381748199463, "learning_rate": 2.305234346738837e-06, "loss": 0.1744, "step": 23420 }, { "epoch": 2.14011691633175, "grad_norm": 5.222604751586914, "learning_rate": 2.3007596088409763e-06, "loss": 0.1673, "step": 23430 }, { "epoch": 2.141030325173548, "grad_norm": 8.771183013916016, "learning_rate": 2.296287919973789e-06, "loss": 0.1607, "step": 23440 }, { "epoch": 2.1419437340153453, "grad_norm": 2.6322989463806152, "learning_rate": 2.2918192851884447e-06, "loss": 0.1545, "step": 23450 }, { "epoch": 2.142857142857143, "grad_norm": 3.7538552284240723, "learning_rate": 2.287353709532658e-06, "loss": 0.1518, "step": 23460 }, { "epoch": 2.1437705516989403, "grad_norm": 3.188166618347168, "learning_rate": 2.2828911980506964e-06, "loss": 0.1974, "step": 23470 }, { "epoch": 2.144683960540738, "grad_norm": 4.92081356048584, "learning_rate": 2.2784317557833587e-06, "loss": 0.2045, "step": 23480 }, { "epoch": 2.1455973693825356, "grad_norm": 2.584730863571167, "learning_rate": 2.2739753877679767e-06, "loss": 0.1993, "step": 23490 }, { "epoch": 2.146510778224333, "grad_norm": 4.9678497314453125, "learning_rate": 2.2695220990384153e-06, "loss": 0.1785, "step": 23500 }, { "epoch": 2.1474241870661306, "grad_norm": 4.925375461578369, "learning_rate": 2.2650718946250587e-06, "loss": 0.1702, "step": 23510 }, { "epoch": 2.1483375959079285, "grad_norm": 3.6951634883880615, "learning_rate": 2.260624779554805e-06, "loss": 0.1863, "step": 23520 }, { "epoch": 2.149251004749726, "grad_norm": 7.643051624298096, "learning_rate": 2.2561807588510644e-06, "loss": 0.1507, "step": 23530 }, { "epoch": 2.1501644135915234, "grad_norm": 5.1193742752075195, "learning_rate": 2.251739837533747e-06, "loss": 0.151, "step": 23540 }, { "epoch": 2.1510778224333214, "grad_norm": 5.845807075500488, "learning_rate": 2.247302020619273e-06, "loss": 0.2269, "step": 23550 }, { "epoch": 2.151991231275119, "grad_norm": 1.6448769569396973, "learning_rate": 2.2428673131205447e-06, "loss": 0.191, "step": 23560 }, { "epoch": 2.1529046401169163, "grad_norm": 6.405743598937988, "learning_rate": 2.238435720046957e-06, "loss": 0.1646, "step": 23570 }, { "epoch": 2.1538180489587138, "grad_norm": 3.7058606147766113, "learning_rate": 2.234007246404385e-06, "loss": 0.1969, "step": 23580 }, { "epoch": 2.1547314578005117, "grad_norm": 2.7047057151794434, "learning_rate": 2.2295818971951815e-06, "loss": 0.1517, "step": 23590 }, { "epoch": 2.155644866642309, "grad_norm": 2.091266393661499, "learning_rate": 2.2251596774181734e-06, "loss": 0.1872, "step": 23600 }, { "epoch": 2.1565582754841066, "grad_norm": 3.392944574356079, "learning_rate": 2.2207405920686453e-06, "loss": 0.2268, "step": 23610 }, { "epoch": 2.157471684325904, "grad_norm": 7.39732027053833, "learning_rate": 2.2163246461383443e-06, "loss": 0.1519, "step": 23620 }, { "epoch": 2.158385093167702, "grad_norm": 2.8523459434509277, "learning_rate": 2.2119118446154755e-06, "loss": 0.1559, "step": 23630 }, { "epoch": 2.1592985020094995, "grad_norm": 2.624950647354126, "learning_rate": 2.207502192484685e-06, "loss": 0.167, "step": 23640 }, { "epoch": 2.160211910851297, "grad_norm": 2.60598087310791, "learning_rate": 2.203095694727066e-06, "loss": 0.1826, "step": 23650 }, { "epoch": 2.1611253196930944, "grad_norm": 5.358570575714111, "learning_rate": 2.198692356320146e-06, "loss": 0.1773, "step": 23660 }, { "epoch": 2.1620387285348923, "grad_norm": 5.431685924530029, "learning_rate": 2.194292182237887e-06, "loss": 0.1518, "step": 23670 }, { "epoch": 2.16295213737669, "grad_norm": 4.768185615539551, "learning_rate": 2.189895177450672e-06, "loss": 0.1252, "step": 23680 }, { "epoch": 2.1638655462184873, "grad_norm": 3.700977325439453, "learning_rate": 2.1855013469253107e-06, "loss": 0.1607, "step": 23690 }, { "epoch": 2.164778955060285, "grad_norm": 4.486044406890869, "learning_rate": 2.181110695625019e-06, "loss": 0.1798, "step": 23700 }, { "epoch": 2.1656923639020826, "grad_norm": 3.9662954807281494, "learning_rate": 2.1767232285094296e-06, "loss": 0.2037, "step": 23710 }, { "epoch": 2.16660577274388, "grad_norm": 3.7385199069976807, "learning_rate": 2.1723389505345722e-06, "loss": 0.1978, "step": 23720 }, { "epoch": 2.1675191815856776, "grad_norm": 5.130490779876709, "learning_rate": 2.1679578666528774e-06, "loss": 0.2159, "step": 23730 }, { "epoch": 2.1684325904274755, "grad_norm": 3.2995219230651855, "learning_rate": 2.1635799818131635e-06, "loss": 0.1866, "step": 23740 }, { "epoch": 2.169345999269273, "grad_norm": 3.2901649475097656, "learning_rate": 2.159205300960644e-06, "loss": 0.1451, "step": 23750 }, { "epoch": 2.1702594081110704, "grad_norm": 2.615583658218384, "learning_rate": 2.1548338290369043e-06, "loss": 0.1657, "step": 23760 }, { "epoch": 2.171172816952868, "grad_norm": 3.496723175048828, "learning_rate": 2.1504655709799073e-06, "loss": 0.1557, "step": 23770 }, { "epoch": 2.172086225794666, "grad_norm": 3.654268264770508, "learning_rate": 2.1461005317239887e-06, "loss": 0.1993, "step": 23780 }, { "epoch": 2.1729996346364633, "grad_norm": 4.748775482177734, "learning_rate": 2.1417387161998473e-06, "loss": 0.1614, "step": 23790 }, { "epoch": 2.1739130434782608, "grad_norm": 6.171451091766357, "learning_rate": 2.137380129334538e-06, "loss": 0.2214, "step": 23800 }, { "epoch": 2.1748264523200582, "grad_norm": 4.96385383605957, "learning_rate": 2.1330247760514706e-06, "loss": 0.1639, "step": 23810 }, { "epoch": 2.175739861161856, "grad_norm": 1.8064075708389282, "learning_rate": 2.1286726612703995e-06, "loss": 0.1412, "step": 23820 }, { "epoch": 2.1766532700036536, "grad_norm": 3.04226016998291, "learning_rate": 2.124323789907426e-06, "loss": 0.1369, "step": 23830 }, { "epoch": 2.177566678845451, "grad_norm": 2.768852949142456, "learning_rate": 2.119978166874984e-06, "loss": 0.1595, "step": 23840 }, { "epoch": 2.178480087687249, "grad_norm": 4.602421283721924, "learning_rate": 2.1156357970818385e-06, "loss": 0.2403, "step": 23850 }, { "epoch": 2.1793934965290465, "grad_norm": 1.9571430683135986, "learning_rate": 2.1112966854330792e-06, "loss": 0.2479, "step": 23860 }, { "epoch": 2.180306905370844, "grad_norm": 1.9217032194137573, "learning_rate": 2.1069608368301166e-06, "loss": 0.187, "step": 23870 }, { "epoch": 2.1812203142126414, "grad_norm": 2.3914573192596436, "learning_rate": 2.102628256170677e-06, "loss": 0.1898, "step": 23880 }, { "epoch": 2.1821337230544393, "grad_norm": 8.511054992675781, "learning_rate": 2.0982989483487932e-06, "loss": 0.1859, "step": 23890 }, { "epoch": 2.183047131896237, "grad_norm": 3.7717649936676025, "learning_rate": 2.093972918254798e-06, "loss": 0.1288, "step": 23900 }, { "epoch": 2.1839605407380343, "grad_norm": 2.5965826511383057, "learning_rate": 2.0896501707753286e-06, "loss": 0.1613, "step": 23910 }, { "epoch": 2.184873949579832, "grad_norm": 5.936268329620361, "learning_rate": 2.085330710793309e-06, "loss": 0.1319, "step": 23920 }, { "epoch": 2.1857873584216296, "grad_norm": 3.8059897422790527, "learning_rate": 2.0810145431879507e-06, "loss": 0.1549, "step": 23930 }, { "epoch": 2.186700767263427, "grad_norm": 2.582423448562622, "learning_rate": 2.076701672834745e-06, "loss": 0.1943, "step": 23940 }, { "epoch": 2.1876141761052246, "grad_norm": 4.095987796783447, "learning_rate": 2.0723921046054647e-06, "loss": 0.1818, "step": 23950 }, { "epoch": 2.1885275849470225, "grad_norm": 2.069925308227539, "learning_rate": 2.068085843368145e-06, "loss": 0.1729, "step": 23960 }, { "epoch": 2.18944099378882, "grad_norm": 5.492155075073242, "learning_rate": 2.0637828939870874e-06, "loss": 0.2302, "step": 23970 }, { "epoch": 2.1903544026306174, "grad_norm": 6.36174201965332, "learning_rate": 2.059483261322856e-06, "loss": 0.1615, "step": 23980 }, { "epoch": 2.191267811472415, "grad_norm": 4.7511796951293945, "learning_rate": 2.0551869502322676e-06, "loss": 0.2016, "step": 23990 }, { "epoch": 2.192181220314213, "grad_norm": 2.753790855407715, "learning_rate": 2.050893965568383e-06, "loss": 0.1661, "step": 24000 }, { "epoch": 2.1930946291560103, "grad_norm": 3.4273133277893066, "learning_rate": 2.0466043121805086e-06, "loss": 0.2244, "step": 24010 }, { "epoch": 2.1940080379978077, "grad_norm": 8.111573219299316, "learning_rate": 2.042317994914185e-06, "loss": 0.1702, "step": 24020 }, { "epoch": 2.194921446839605, "grad_norm": 2.7773423194885254, "learning_rate": 2.038035018611191e-06, "loss": 0.1744, "step": 24030 }, { "epoch": 2.195834855681403, "grad_norm": 4.684183597564697, "learning_rate": 2.033755388109524e-06, "loss": 0.1118, "step": 24040 }, { "epoch": 2.1967482645232006, "grad_norm": 2.3635447025299072, "learning_rate": 2.029479108243404e-06, "loss": 0.1801, "step": 24050 }, { "epoch": 2.197661673364998, "grad_norm": 3.3966095447540283, "learning_rate": 2.0252061838432685e-06, "loss": 0.2301, "step": 24060 }, { "epoch": 2.198575082206796, "grad_norm": 3.526059150695801, "learning_rate": 2.0209366197357654e-06, "loss": 0.1658, "step": 24070 }, { "epoch": 2.1994884910485935, "grad_norm": 2.935737371444702, "learning_rate": 2.016670420743743e-06, "loss": 0.1829, "step": 24080 }, { "epoch": 2.200401899890391, "grad_norm": 3.813380002975464, "learning_rate": 2.0124075916862514e-06, "loss": 0.1607, "step": 24090 }, { "epoch": 2.2013153087321884, "grad_norm": 2.7052130699157715, "learning_rate": 2.0081481373785288e-06, "loss": 0.1664, "step": 24100 }, { "epoch": 2.2022287175739863, "grad_norm": 3.37572979927063, "learning_rate": 2.003892062632011e-06, "loss": 0.1648, "step": 24110 }, { "epoch": 2.2031421264157838, "grad_norm": 2.6923885345458984, "learning_rate": 1.9996393722543076e-06, "loss": 0.2139, "step": 24120 }, { "epoch": 2.2040555352575812, "grad_norm": 5.216461181640625, "learning_rate": 1.9953900710492097e-06, "loss": 0.2039, "step": 24130 }, { "epoch": 2.2049689440993787, "grad_norm": 3.8822946548461914, "learning_rate": 1.9911441638166762e-06, "loss": 0.1554, "step": 24140 }, { "epoch": 2.2058823529411766, "grad_norm": 7.1958770751953125, "learning_rate": 1.9869016553528364e-06, "loss": 0.1367, "step": 24150 }, { "epoch": 2.206795761782974, "grad_norm": 5.908149242401123, "learning_rate": 1.9826625504499807e-06, "loss": 0.1576, "step": 24160 }, { "epoch": 2.2077091706247716, "grad_norm": 3.6105892658233643, "learning_rate": 1.9784268538965512e-06, "loss": 0.1732, "step": 24170 }, { "epoch": 2.208622579466569, "grad_norm": 3.4319231510162354, "learning_rate": 1.9741945704771392e-06, "loss": 0.1792, "step": 24180 }, { "epoch": 2.209535988308367, "grad_norm": 2.533782482147217, "learning_rate": 1.9699657049724873e-06, "loss": 0.1763, "step": 24190 }, { "epoch": 2.2104493971501644, "grad_norm": 2.430183172225952, "learning_rate": 1.965740262159471e-06, "loss": 0.2057, "step": 24200 }, { "epoch": 2.211362805991962, "grad_norm": 2.9546263217926025, "learning_rate": 1.9615182468111023e-06, "loss": 0.2168, "step": 24210 }, { "epoch": 2.21227621483376, "grad_norm": 4.125246047973633, "learning_rate": 1.9572996636965173e-06, "loss": 0.1693, "step": 24220 }, { "epoch": 2.2131896236755573, "grad_norm": 3.0135860443115234, "learning_rate": 1.9530845175809838e-06, "loss": 0.1404, "step": 24230 }, { "epoch": 2.2141030325173547, "grad_norm": 2.520442008972168, "learning_rate": 1.9488728132258796e-06, "loss": 0.2562, "step": 24240 }, { "epoch": 2.215016441359152, "grad_norm": 2.400655508041382, "learning_rate": 1.944664555388695e-06, "loss": 0.213, "step": 24250 }, { "epoch": 2.21592985020095, "grad_norm": 2.92391300201416, "learning_rate": 1.9404597488230314e-06, "loss": 0.1594, "step": 24260 }, { "epoch": 2.2168432590427476, "grad_norm": 3.692793607711792, "learning_rate": 1.9362583982785913e-06, "loss": 0.1672, "step": 24270 }, { "epoch": 2.217756667884545, "grad_norm": 6.003443241119385, "learning_rate": 1.9320605085011708e-06, "loss": 0.1824, "step": 24280 }, { "epoch": 2.2186700767263425, "grad_norm": 4.359122276306152, "learning_rate": 1.9278660842326562e-06, "loss": 0.1743, "step": 24290 }, { "epoch": 2.2195834855681404, "grad_norm": 4.050520896911621, "learning_rate": 1.923675130211019e-06, "loss": 0.1403, "step": 24300 }, { "epoch": 2.220496894409938, "grad_norm": 2.8684544563293457, "learning_rate": 1.9194876511703165e-06, "loss": 0.1963, "step": 24310 }, { "epoch": 2.2214103032517354, "grad_norm": 3.0937962532043457, "learning_rate": 1.915303651840674e-06, "loss": 0.2266, "step": 24320 }, { "epoch": 2.222323712093533, "grad_norm": 5.325260639190674, "learning_rate": 1.911123136948288e-06, "loss": 0.1976, "step": 24330 }, { "epoch": 2.2232371209353308, "grad_norm": 3.6305336952209473, "learning_rate": 1.9069461112154204e-06, "loss": 0.2364, "step": 24340 }, { "epoch": 2.2241505297771282, "grad_norm": 2.1484100818634033, "learning_rate": 1.902772579360393e-06, "loss": 0.1721, "step": 24350 }, { "epoch": 2.2250639386189257, "grad_norm": 2.353351354598999, "learning_rate": 1.8986025460975772e-06, "loss": 0.1898, "step": 24360 }, { "epoch": 2.2259773474607236, "grad_norm": 3.2999939918518066, "learning_rate": 1.894436016137395e-06, "loss": 0.2195, "step": 24370 }, { "epoch": 2.226890756302521, "grad_norm": 3.701462984085083, "learning_rate": 1.8902729941863079e-06, "loss": 0.1491, "step": 24380 }, { "epoch": 2.2278041651443186, "grad_norm": 3.12361741065979, "learning_rate": 1.886113484946821e-06, "loss": 0.1797, "step": 24390 }, { "epoch": 2.228717573986116, "grad_norm": 3.0642919540405273, "learning_rate": 1.8819574931174655e-06, "loss": 0.1952, "step": 24400 }, { "epoch": 2.229630982827914, "grad_norm": 3.805121660232544, "learning_rate": 1.877805023392803e-06, "loss": 0.227, "step": 24410 }, { "epoch": 2.2305443916697114, "grad_norm": 2.5260136127471924, "learning_rate": 1.8736560804634118e-06, "loss": 0.1606, "step": 24420 }, { "epoch": 2.231457800511509, "grad_norm": 2.6736133098602295, "learning_rate": 1.869510669015895e-06, "loss": 0.1515, "step": 24430 }, { "epoch": 2.2323712093533064, "grad_norm": 3.880039930343628, "learning_rate": 1.8653687937328558e-06, "loss": 0.1979, "step": 24440 }, { "epoch": 2.2332846181951043, "grad_norm": 4.333569049835205, "learning_rate": 1.8612304592929136e-06, "loss": 0.1835, "step": 24450 }, { "epoch": 2.2341980270369017, "grad_norm": 4.252482891082764, "learning_rate": 1.857095670370679e-06, "loss": 0.1925, "step": 24460 }, { "epoch": 2.235111435878699, "grad_norm": 3.6019065380096436, "learning_rate": 1.8529644316367652e-06, "loss": 0.1762, "step": 24470 }, { "epoch": 2.2360248447204967, "grad_norm": 5.047949314117432, "learning_rate": 1.8488367477577696e-06, "loss": 0.1876, "step": 24480 }, { "epoch": 2.2369382535622946, "grad_norm": 1.7849913835525513, "learning_rate": 1.844712623396277e-06, "loss": 0.2413, "step": 24490 }, { "epoch": 2.237851662404092, "grad_norm": 4.740047931671143, "learning_rate": 1.8405920632108476e-06, "loss": 0.1797, "step": 24500 }, { "epoch": 2.2387650712458895, "grad_norm": 7.756009101867676, "learning_rate": 1.8364750718560225e-06, "loss": 0.1834, "step": 24510 }, { "epoch": 2.2396784800876874, "grad_norm": 4.069055557250977, "learning_rate": 1.8323616539823057e-06, "loss": 0.1388, "step": 24520 }, { "epoch": 2.240591888929485, "grad_norm": 3.44559907913208, "learning_rate": 1.8282518142361645e-06, "loss": 0.1896, "step": 24530 }, { "epoch": 2.2415052977712824, "grad_norm": 3.255389451980591, "learning_rate": 1.8241455572600281e-06, "loss": 0.234, "step": 24540 }, { "epoch": 2.24241870661308, "grad_norm": 4.173270225524902, "learning_rate": 1.8200428876922787e-06, "loss": 0.1484, "step": 24550 }, { "epoch": 2.2433321154548778, "grad_norm": 10.802597045898438, "learning_rate": 1.8159438101672422e-06, "loss": 0.1819, "step": 24560 }, { "epoch": 2.2442455242966752, "grad_norm": 5.3239874839782715, "learning_rate": 1.8118483293151895e-06, "loss": 0.1821, "step": 24570 }, { "epoch": 2.2451589331384727, "grad_norm": 3.034489154815674, "learning_rate": 1.8077564497623262e-06, "loss": 0.2231, "step": 24580 }, { "epoch": 2.24607234198027, "grad_norm": 3.048393964767456, "learning_rate": 1.8036681761307956e-06, "loss": 0.1552, "step": 24590 }, { "epoch": 2.246985750822068, "grad_norm": 3.792559862136841, "learning_rate": 1.7995835130386625e-06, "loss": 0.1482, "step": 24600 }, { "epoch": 2.2478991596638656, "grad_norm": 3.648345470428467, "learning_rate": 1.7955024650999136e-06, "loss": 0.1474, "step": 24610 }, { "epoch": 2.248812568505663, "grad_norm": 2.821141242980957, "learning_rate": 1.7914250369244563e-06, "loss": 0.2285, "step": 24620 }, { "epoch": 2.2497259773474605, "grad_norm": 2.0771374702453613, "learning_rate": 1.7873512331181031e-06, "loss": 0.168, "step": 24630 }, { "epoch": 2.2506393861892584, "grad_norm": 2.4629671573638916, "learning_rate": 1.7832810582825793e-06, "loss": 0.1588, "step": 24640 }, { "epoch": 2.251552795031056, "grad_norm": 3.710310935974121, "learning_rate": 1.779214517015505e-06, "loss": 0.242, "step": 24650 }, { "epoch": 2.2524662038728533, "grad_norm": 3.3082263469696045, "learning_rate": 1.7751516139103968e-06, "loss": 0.2337, "step": 24660 }, { "epoch": 2.2533796127146513, "grad_norm": 3.316352605819702, "learning_rate": 1.771092353556666e-06, "loss": 0.1983, "step": 24670 }, { "epoch": 2.2542930215564487, "grad_norm": 43.68886947631836, "learning_rate": 1.7670367405396043e-06, "loss": 0.1651, "step": 24680 }, { "epoch": 2.255206430398246, "grad_norm": 3.404249429702759, "learning_rate": 1.762984779440386e-06, "loss": 0.1933, "step": 24690 }, { "epoch": 2.2561198392400437, "grad_norm": 3.149243116378784, "learning_rate": 1.7589364748360566e-06, "loss": 0.1695, "step": 24700 }, { "epoch": 2.2570332480818416, "grad_norm": 4.129607200622559, "learning_rate": 1.7548918312995384e-06, "loss": 0.1556, "step": 24710 }, { "epoch": 2.257946656923639, "grad_norm": 3.2787437438964844, "learning_rate": 1.7508508533996093e-06, "loss": 0.1623, "step": 24720 }, { "epoch": 2.2588600657654365, "grad_norm": 5.700830459594727, "learning_rate": 1.746813545700916e-06, "loss": 0.1734, "step": 24730 }, { "epoch": 2.2597734746072344, "grad_norm": 1.951832890510559, "learning_rate": 1.7427799127639505e-06, "loss": 0.1597, "step": 24740 }, { "epoch": 2.260686883449032, "grad_norm": 5.479414463043213, "learning_rate": 1.7387499591450612e-06, "loss": 0.1851, "step": 24750 }, { "epoch": 2.2616002922908294, "grad_norm": 4.379063129425049, "learning_rate": 1.7347236893964347e-06, "loss": 0.1812, "step": 24760 }, { "epoch": 2.262513701132627, "grad_norm": 6.096508026123047, "learning_rate": 1.7307011080660995e-06, "loss": 0.2112, "step": 24770 }, { "epoch": 2.2634271099744243, "grad_norm": 3.661438465118408, "learning_rate": 1.7266822196979154e-06, "loss": 0.1461, "step": 24780 }, { "epoch": 2.2643405188162222, "grad_norm": 3.5858147144317627, "learning_rate": 1.7226670288315745e-06, "loss": 0.236, "step": 24790 }, { "epoch": 2.2652539276580197, "grad_norm": 3.0277631282806396, "learning_rate": 1.718655540002589e-06, "loss": 0.1632, "step": 24800 }, { "epoch": 2.266167336499817, "grad_norm": 4.245289325714111, "learning_rate": 1.7146477577422882e-06, "loss": 0.17, "step": 24810 }, { "epoch": 2.267080745341615, "grad_norm": 2.662914276123047, "learning_rate": 1.7106436865778182e-06, "loss": 0.1697, "step": 24820 }, { "epoch": 2.2679941541834125, "grad_norm": 7.002992630004883, "learning_rate": 1.7066433310321329e-06, "loss": 0.1627, "step": 24830 }, { "epoch": 2.26890756302521, "grad_norm": 8.733967781066895, "learning_rate": 1.7026466956239862e-06, "loss": 0.1957, "step": 24840 }, { "epoch": 2.2698209718670075, "grad_norm": 3.479750871658325, "learning_rate": 1.6986537848679314e-06, "loss": 0.1313, "step": 24850 }, { "epoch": 2.2707343807088054, "grad_norm": 4.1442484855651855, "learning_rate": 1.694664603274312e-06, "loss": 0.1695, "step": 24860 }, { "epoch": 2.271647789550603, "grad_norm": 5.89766263961792, "learning_rate": 1.6906791553492652e-06, "loss": 0.2044, "step": 24870 }, { "epoch": 2.2725611983924003, "grad_norm": 1.492565393447876, "learning_rate": 1.6866974455947044e-06, "loss": 0.1864, "step": 24880 }, { "epoch": 2.2734746072341983, "grad_norm": 3.652796506881714, "learning_rate": 1.6827194785083218e-06, "loss": 0.1463, "step": 24890 }, { "epoch": 2.2743880160759957, "grad_norm": 3.374422311782837, "learning_rate": 1.6787452585835856e-06, "loss": 0.1906, "step": 24900 }, { "epoch": 2.275301424917793, "grad_norm": 3.1186845302581787, "learning_rate": 1.6747747903097261e-06, "loss": 0.2108, "step": 24910 }, { "epoch": 2.2762148337595907, "grad_norm": 4.519214630126953, "learning_rate": 1.6708080781717413e-06, "loss": 0.1746, "step": 24920 }, { "epoch": 2.2771282426013886, "grad_norm": 3.5118112564086914, "learning_rate": 1.6668451266503816e-06, "loss": 0.1901, "step": 24930 }, { "epoch": 2.278041651443186, "grad_norm": 2.0082621574401855, "learning_rate": 1.6628859402221486e-06, "loss": 0.1376, "step": 24940 }, { "epoch": 2.2789550602849835, "grad_norm": 4.137612819671631, "learning_rate": 1.6589305233592984e-06, "loss": 0.2232, "step": 24950 }, { "epoch": 2.279868469126781, "grad_norm": 3.0884008407592773, "learning_rate": 1.6549788805298207e-06, "loss": 0.1519, "step": 24960 }, { "epoch": 2.280781877968579, "grad_norm": 2.915191411972046, "learning_rate": 1.6510310161974468e-06, "loss": 0.2077, "step": 24970 }, { "epoch": 2.2816952868103764, "grad_norm": 3.192009925842285, "learning_rate": 1.6470869348216363e-06, "loss": 0.2148, "step": 24980 }, { "epoch": 2.282608695652174, "grad_norm": 3.174941301345825, "learning_rate": 1.6431466408575808e-06, "loss": 0.1768, "step": 24990 }, { "epoch": 2.2835221044939713, "grad_norm": 2.1452715396881104, "learning_rate": 1.639210138756188e-06, "loss": 0.1364, "step": 25000 }, { "epoch": 2.284435513335769, "grad_norm": 4.468832969665527, "learning_rate": 1.635277432964088e-06, "loss": 0.1805, "step": 25010 }, { "epoch": 2.2853489221775667, "grad_norm": 4.894355297088623, "learning_rate": 1.631348527923617e-06, "loss": 0.1818, "step": 25020 }, { "epoch": 2.286262331019364, "grad_norm": 3.2367336750030518, "learning_rate": 1.627423428072823e-06, "loss": 0.1738, "step": 25030 }, { "epoch": 2.287175739861162, "grad_norm": 5.97412109375, "learning_rate": 1.6235021378454518e-06, "loss": 0.1741, "step": 25040 }, { "epoch": 2.2880891487029595, "grad_norm": 2.904094934463501, "learning_rate": 1.6195846616709482e-06, "loss": 0.1787, "step": 25050 }, { "epoch": 2.289002557544757, "grad_norm": 4.531261444091797, "learning_rate": 1.6156710039744462e-06, "loss": 0.1695, "step": 25060 }, { "epoch": 2.2899159663865545, "grad_norm": 2.932264804840088, "learning_rate": 1.6117611691767703e-06, "loss": 0.2196, "step": 25070 }, { "epoch": 2.2908293752283524, "grad_norm": 2.6910948753356934, "learning_rate": 1.6078551616944238e-06, "loss": 0.1177, "step": 25080 }, { "epoch": 2.29174278407015, "grad_norm": 2.433668851852417, "learning_rate": 1.6039529859395857e-06, "loss": 0.1866, "step": 25090 }, { "epoch": 2.2926561929119473, "grad_norm": 1.3081915378570557, "learning_rate": 1.600054646320111e-06, "loss": 0.133, "step": 25100 }, { "epoch": 2.293569601753745, "grad_norm": 4.369492053985596, "learning_rate": 1.5961601472395193e-06, "loss": 0.1743, "step": 25110 }, { "epoch": 2.2944830105955427, "grad_norm": 2.9200408458709717, "learning_rate": 1.5922694930969907e-06, "loss": 0.2019, "step": 25120 }, { "epoch": 2.29539641943734, "grad_norm": 2.024066686630249, "learning_rate": 1.5883826882873627e-06, "loss": 0.1929, "step": 25130 }, { "epoch": 2.2963098282791377, "grad_norm": 2.2944204807281494, "learning_rate": 1.5844997372011233e-06, "loss": 0.1544, "step": 25140 }, { "epoch": 2.297223237120935, "grad_norm": 4.010076522827148, "learning_rate": 1.5806206442244132e-06, "loss": 0.1699, "step": 25150 }, { "epoch": 2.298136645962733, "grad_norm": 4.835600852966309, "learning_rate": 1.576745413739008e-06, "loss": 0.1696, "step": 25160 }, { "epoch": 2.2990500548045305, "grad_norm": 2.6763432025909424, "learning_rate": 1.5728740501223227e-06, "loss": 0.1799, "step": 25170 }, { "epoch": 2.299963463646328, "grad_norm": 3.2970027923583984, "learning_rate": 1.5690065577474074e-06, "loss": 0.2261, "step": 25180 }, { "epoch": 2.300876872488126, "grad_norm": 4.928851127624512, "learning_rate": 1.5651429409829332e-06, "loss": 0.2203, "step": 25190 }, { "epoch": 2.3017902813299234, "grad_norm": 2.9483540058135986, "learning_rate": 1.5612832041932008e-06, "loss": 0.1458, "step": 25200 }, { "epoch": 2.302703690171721, "grad_norm": 3.4723005294799805, "learning_rate": 1.5574273517381216e-06, "loss": 0.1731, "step": 25210 }, { "epoch": 2.3036170990135183, "grad_norm": 3.7293334007263184, "learning_rate": 1.5535753879732212e-06, "loss": 0.1983, "step": 25220 }, { "epoch": 2.304530507855316, "grad_norm": 1.9045898914337158, "learning_rate": 1.5497273172496363e-06, "loss": 0.2404, "step": 25230 }, { "epoch": 2.3054439166971137, "grad_norm": 2.6788017749786377, "learning_rate": 1.5458831439141008e-06, "loss": 0.184, "step": 25240 }, { "epoch": 2.306357325538911, "grad_norm": 3.987234354019165, "learning_rate": 1.5420428723089486e-06, "loss": 0.1826, "step": 25250 }, { "epoch": 2.3072707343807086, "grad_norm": 1.9607833623886108, "learning_rate": 1.5382065067721047e-06, "loss": 0.1057, "step": 25260 }, { "epoch": 2.3081841432225065, "grad_norm": 3.189500331878662, "learning_rate": 1.5343740516370864e-06, "loss": 0.1531, "step": 25270 }, { "epoch": 2.309097552064304, "grad_norm": 2.606783151626587, "learning_rate": 1.5305455112329887e-06, "loss": 0.1718, "step": 25280 }, { "epoch": 2.3100109609061015, "grad_norm": 7.485494613647461, "learning_rate": 1.526720889884485e-06, "loss": 0.2197, "step": 25290 }, { "epoch": 2.310924369747899, "grad_norm": 5.740393161773682, "learning_rate": 1.5229001919118246e-06, "loss": 0.1957, "step": 25300 }, { "epoch": 2.311837778589697, "grad_norm": 4.725992202758789, "learning_rate": 1.519083421630826e-06, "loss": 0.1598, "step": 25310 }, { "epoch": 2.3127511874314943, "grad_norm": 2.730675220489502, "learning_rate": 1.5152705833528663e-06, "loss": 0.1693, "step": 25320 }, { "epoch": 2.313664596273292, "grad_norm": 7.682080268859863, "learning_rate": 1.5114616813848842e-06, "loss": 0.1749, "step": 25330 }, { "epoch": 2.3145780051150897, "grad_norm": 2.6299760341644287, "learning_rate": 1.507656720029369e-06, "loss": 0.1936, "step": 25340 }, { "epoch": 2.315491413956887, "grad_norm": 3.0721359252929688, "learning_rate": 1.5038557035843643e-06, "loss": 0.1669, "step": 25350 }, { "epoch": 2.3164048227986846, "grad_norm": 3.763089656829834, "learning_rate": 1.5000586363434527e-06, "loss": 0.1753, "step": 25360 }, { "epoch": 2.317318231640482, "grad_norm": 2.4591617584228516, "learning_rate": 1.4962655225957556e-06, "loss": 0.1531, "step": 25370 }, { "epoch": 2.31823164048228, "grad_norm": 4.721521854400635, "learning_rate": 1.492476366625933e-06, "loss": 0.1657, "step": 25380 }, { "epoch": 2.3191450493240775, "grad_norm": 1.7060368061065674, "learning_rate": 1.4886911727141712e-06, "loss": 0.1643, "step": 25390 }, { "epoch": 2.320058458165875, "grad_norm": 6.21627140045166, "learning_rate": 1.4849099451361814e-06, "loss": 0.1971, "step": 25400 }, { "epoch": 2.3209718670076724, "grad_norm": 3.626256227493286, "learning_rate": 1.4811326881631937e-06, "loss": 0.1846, "step": 25410 }, { "epoch": 2.3218852758494704, "grad_norm": 2.234239101409912, "learning_rate": 1.4773594060619528e-06, "loss": 0.1634, "step": 25420 }, { "epoch": 2.322798684691268, "grad_norm": 4.463622570037842, "learning_rate": 1.4735901030947175e-06, "loss": 0.1598, "step": 25430 }, { "epoch": 2.3237120935330653, "grad_norm": 2.5935773849487305, "learning_rate": 1.469824783519247e-06, "loss": 0.1898, "step": 25440 }, { "epoch": 2.3246255023748628, "grad_norm": 4.020395755767822, "learning_rate": 1.4660634515888007e-06, "loss": 0.2326, "step": 25450 }, { "epoch": 2.3255389112166607, "grad_norm": 4.140101432800293, "learning_rate": 1.4623061115521393e-06, "loss": 0.2156, "step": 25460 }, { "epoch": 2.326452320058458, "grad_norm": 3.415390968322754, "learning_rate": 1.4585527676535088e-06, "loss": 0.1491, "step": 25470 }, { "epoch": 2.3273657289002556, "grad_norm": 1.8002947568893433, "learning_rate": 1.4548034241326415e-06, "loss": 0.1765, "step": 25480 }, { "epoch": 2.3282791377420535, "grad_norm": 2.213534116744995, "learning_rate": 1.4510580852247552e-06, "loss": 0.1997, "step": 25490 }, { "epoch": 2.329192546583851, "grad_norm": 3.0773096084594727, "learning_rate": 1.4473167551605388e-06, "loss": 0.1426, "step": 25500 }, { "epoch": 2.3301059554256485, "grad_norm": 4.658303737640381, "learning_rate": 1.4435794381661578e-06, "loss": 0.2552, "step": 25510 }, { "epoch": 2.331019364267446, "grad_norm": 3.588944673538208, "learning_rate": 1.439846138463241e-06, "loss": 0.2056, "step": 25520 }, { "epoch": 2.331932773109244, "grad_norm": 3.1200106143951416, "learning_rate": 1.4361168602688814e-06, "loss": 0.1398, "step": 25530 }, { "epoch": 2.3328461819510413, "grad_norm": 2.335510015487671, "learning_rate": 1.4323916077956252e-06, "loss": 0.1861, "step": 25540 }, { "epoch": 2.333759590792839, "grad_norm": 4.339201927185059, "learning_rate": 1.4286703852514784e-06, "loss": 0.1672, "step": 25550 }, { "epoch": 2.3346729996346367, "grad_norm": 2.3877363204956055, "learning_rate": 1.4249531968398895e-06, "loss": 0.1617, "step": 25560 }, { "epoch": 2.335586408476434, "grad_norm": 3.078481674194336, "learning_rate": 1.4212400467597492e-06, "loss": 0.1528, "step": 25570 }, { "epoch": 2.3364998173182316, "grad_norm": 3.4910364151000977, "learning_rate": 1.4175309392053916e-06, "loss": 0.1676, "step": 25580 }, { "epoch": 2.337413226160029, "grad_norm": 3.619432210922241, "learning_rate": 1.4138258783665814e-06, "loss": 0.2138, "step": 25590 }, { "epoch": 2.3383266350018266, "grad_norm": 5.362312316894531, "learning_rate": 1.410124868428513e-06, "loss": 0.1794, "step": 25600 }, { "epoch": 2.3392400438436245, "grad_norm": 3.246785879135132, "learning_rate": 1.4064279135718022e-06, "loss": 0.1887, "step": 25610 }, { "epoch": 2.340153452685422, "grad_norm": 2.9904377460479736, "learning_rate": 1.4027350179724864e-06, "loss": 0.1492, "step": 25620 }, { "epoch": 2.3410668615272194, "grad_norm": 6.446959018707275, "learning_rate": 1.39904618580202e-06, "loss": 0.2008, "step": 25630 }, { "epoch": 2.3419802703690173, "grad_norm": 6.149611949920654, "learning_rate": 1.3953614212272637e-06, "loss": 0.1373, "step": 25640 }, { "epoch": 2.342893679210815, "grad_norm": 2.879983425140381, "learning_rate": 1.391680728410484e-06, "loss": 0.1757, "step": 25650 }, { "epoch": 2.3438070880526123, "grad_norm": 3.4275922775268555, "learning_rate": 1.388004111509349e-06, "loss": 0.1506, "step": 25660 }, { "epoch": 2.3447204968944098, "grad_norm": 1.7919604778289795, "learning_rate": 1.3843315746769254e-06, "loss": 0.1709, "step": 25670 }, { "epoch": 2.3456339057362077, "grad_norm": 3.946394205093384, "learning_rate": 1.3806631220616657e-06, "loss": 0.218, "step": 25680 }, { "epoch": 2.346547314578005, "grad_norm": 6.273131847381592, "learning_rate": 1.3769987578074123e-06, "loss": 0.2309, "step": 25690 }, { "epoch": 2.3474607234198026, "grad_norm": 2.1227023601531982, "learning_rate": 1.3733384860533867e-06, "loss": 0.2012, "step": 25700 }, { "epoch": 2.3483741322616005, "grad_norm": 2.717991828918457, "learning_rate": 1.3696823109341928e-06, "loss": 0.1421, "step": 25710 }, { "epoch": 2.349287541103398, "grad_norm": 3.322444200515747, "learning_rate": 1.3660302365798018e-06, "loss": 0.1479, "step": 25720 }, { "epoch": 2.3502009499451955, "grad_norm": 3.9090938568115234, "learning_rate": 1.3623822671155529e-06, "loss": 0.1808, "step": 25730 }, { "epoch": 2.351114358786993, "grad_norm": 5.960702419281006, "learning_rate": 1.358738406662154e-06, "loss": 0.2142, "step": 25740 }, { "epoch": 2.3520277676287904, "grad_norm": 4.409420967102051, "learning_rate": 1.3550986593356652e-06, "loss": 0.1946, "step": 25750 }, { "epoch": 2.3529411764705883, "grad_norm": 4.048831939697266, "learning_rate": 1.3514630292475017e-06, "loss": 0.1775, "step": 25760 }, { "epoch": 2.353854585312386, "grad_norm": 3.4044201374053955, "learning_rate": 1.3478315205044323e-06, "loss": 0.1529, "step": 25770 }, { "epoch": 2.3547679941541833, "grad_norm": 2.8514256477355957, "learning_rate": 1.344204137208564e-06, "loss": 0.1614, "step": 25780 }, { "epoch": 2.355681402995981, "grad_norm": 1.8524969816207886, "learning_rate": 1.3405808834573503e-06, "loss": 0.1599, "step": 25790 }, { "epoch": 2.3565948118377786, "grad_norm": 4.301512718200684, "learning_rate": 1.336961763343575e-06, "loss": 0.1556, "step": 25800 }, { "epoch": 2.357508220679576, "grad_norm": 2.020092725753784, "learning_rate": 1.3333467809553525e-06, "loss": 0.1885, "step": 25810 }, { "epoch": 2.3584216295213736, "grad_norm": 2.2897560596466064, "learning_rate": 1.3297359403761285e-06, "loss": 0.2187, "step": 25820 }, { "epoch": 2.3593350383631715, "grad_norm": 4.967813968658447, "learning_rate": 1.3261292456846648e-06, "loss": 0.2126, "step": 25830 }, { "epoch": 2.360248447204969, "grad_norm": 3.227975845336914, "learning_rate": 1.3225267009550418e-06, "loss": 0.1627, "step": 25840 }, { "epoch": 2.3611618560467664, "grad_norm": 2.7992539405822754, "learning_rate": 1.3189283102566507e-06, "loss": 0.1439, "step": 25850 }, { "epoch": 2.3620752648885643, "grad_norm": 3.8331692218780518, "learning_rate": 1.3153340776541944e-06, "loss": 0.1608, "step": 25860 }, { "epoch": 2.362988673730362, "grad_norm": 2.469576835632324, "learning_rate": 1.311744007207677e-06, "loss": 0.1718, "step": 25870 }, { "epoch": 2.3639020825721593, "grad_norm": 3.3363962173461914, "learning_rate": 1.3081581029723994e-06, "loss": 0.2167, "step": 25880 }, { "epoch": 2.3648154914139567, "grad_norm": 3.642385721206665, "learning_rate": 1.3045763689989576e-06, "loss": 0.1949, "step": 25890 }, { "epoch": 2.3657289002557547, "grad_norm": 4.2370734214782715, "learning_rate": 1.3009988093332348e-06, "loss": 0.1775, "step": 25900 }, { "epoch": 2.366642309097552, "grad_norm": 7.786776542663574, "learning_rate": 1.2974254280164038e-06, "loss": 0.1328, "step": 25910 }, { "epoch": 2.3675557179393496, "grad_norm": 22.66819190979004, "learning_rate": 1.2938562290849144e-06, "loss": 0.1589, "step": 25920 }, { "epoch": 2.368469126781147, "grad_norm": 8.909988403320312, "learning_rate": 1.2902912165704895e-06, "loss": 0.1931, "step": 25930 }, { "epoch": 2.369382535622945, "grad_norm": 3.198134422302246, "learning_rate": 1.2867303945001298e-06, "loss": 0.1691, "step": 25940 }, { "epoch": 2.3702959444647425, "grad_norm": 1.3516119718551636, "learning_rate": 1.2831737668960954e-06, "loss": 0.1621, "step": 25950 }, { "epoch": 2.37120935330654, "grad_norm": 4.1546454429626465, "learning_rate": 1.2796213377759142e-06, "loss": 0.1933, "step": 25960 }, { "epoch": 2.3721227621483374, "grad_norm": 2.6292519569396973, "learning_rate": 1.2760731111523682e-06, "loss": 0.142, "step": 25970 }, { "epoch": 2.3730361709901353, "grad_norm": 5.290649890899658, "learning_rate": 1.2725290910334913e-06, "loss": 0.1987, "step": 25980 }, { "epoch": 2.3739495798319328, "grad_norm": 3.171541452407837, "learning_rate": 1.2689892814225708e-06, "loss": 0.1416, "step": 25990 }, { "epoch": 2.3748629886737302, "grad_norm": 3.7407851219177246, "learning_rate": 1.2654536863181328e-06, "loss": 0.1796, "step": 26000 }, { "epoch": 2.375776397515528, "grad_norm": 7.3916015625, "learning_rate": 1.2619223097139428e-06, "loss": 0.2053, "step": 26010 }, { "epoch": 2.3766898063573256, "grad_norm": 3.3788130283355713, "learning_rate": 1.2583951555990064e-06, "loss": 0.2222, "step": 26020 }, { "epoch": 2.377603215199123, "grad_norm": 4.482236385345459, "learning_rate": 1.2548722279575542e-06, "loss": 0.1366, "step": 26030 }, { "epoch": 2.3785166240409206, "grad_norm": 3.1578938961029053, "learning_rate": 1.2513535307690439e-06, "loss": 0.1924, "step": 26040 }, { "epoch": 2.3794300328827185, "grad_norm": 5.360093116760254, "learning_rate": 1.2478390680081565e-06, "loss": 0.1664, "step": 26050 }, { "epoch": 2.380343441724516, "grad_norm": 6.816747188568115, "learning_rate": 1.2443288436447865e-06, "loss": 0.1725, "step": 26060 }, { "epoch": 2.3812568505663134, "grad_norm": 3.4670586585998535, "learning_rate": 1.2408228616440454e-06, "loss": 0.2394, "step": 26070 }, { "epoch": 2.382170259408111, "grad_norm": 2.7638049125671387, "learning_rate": 1.2373211259662483e-06, "loss": 0.1634, "step": 26080 }, { "epoch": 2.383083668249909, "grad_norm": 3.184784412384033, "learning_rate": 1.2338236405669136e-06, "loss": 0.1988, "step": 26090 }, { "epoch": 2.3839970770917063, "grad_norm": 4.665610313415527, "learning_rate": 1.2303304093967638e-06, "loss": 0.1655, "step": 26100 }, { "epoch": 2.3849104859335037, "grad_norm": 3.957987070083618, "learning_rate": 1.2268414364017107e-06, "loss": 0.2024, "step": 26110 }, { "epoch": 2.385823894775301, "grad_norm": 3.792677402496338, "learning_rate": 1.2233567255228568e-06, "loss": 0.1921, "step": 26120 }, { "epoch": 2.386737303617099, "grad_norm": 4.371369361877441, "learning_rate": 1.2198762806964914e-06, "loss": 0.1637, "step": 26130 }, { "epoch": 2.3876507124588966, "grad_norm": 3.380955457687378, "learning_rate": 1.2164001058540847e-06, "loss": 0.1424, "step": 26140 }, { "epoch": 2.388564121300694, "grad_norm": 3.8780927658081055, "learning_rate": 1.2129282049222858e-06, "loss": 0.1773, "step": 26150 }, { "epoch": 2.389477530142492, "grad_norm": 2.2876288890838623, "learning_rate": 1.2094605818229116e-06, "loss": 0.1622, "step": 26160 }, { "epoch": 2.3903909389842894, "grad_norm": 2.552968740463257, "learning_rate": 1.2059972404729497e-06, "loss": 0.1335, "step": 26170 }, { "epoch": 2.391304347826087, "grad_norm": 3.1328492164611816, "learning_rate": 1.2025381847845485e-06, "loss": 0.1542, "step": 26180 }, { "epoch": 2.3922177566678844, "grad_norm": 4.433716297149658, "learning_rate": 1.1990834186650203e-06, "loss": 0.1654, "step": 26190 }, { "epoch": 2.3931311655096823, "grad_norm": 10.523158073425293, "learning_rate": 1.1956329460168275e-06, "loss": 0.2185, "step": 26200 }, { "epoch": 2.3940445743514798, "grad_norm": 3.63954758644104, "learning_rate": 1.1921867707375822e-06, "loss": 0.1407, "step": 26210 }, { "epoch": 2.3949579831932772, "grad_norm": 2.411461353302002, "learning_rate": 1.1887448967200472e-06, "loss": 0.2022, "step": 26220 }, { "epoch": 2.3958713920350747, "grad_norm": 5.022258758544922, "learning_rate": 1.185307327852121e-06, "loss": 0.1827, "step": 26230 }, { "epoch": 2.3967848008768726, "grad_norm": 3.2240359783172607, "learning_rate": 1.1818740680168433e-06, "loss": 0.2153, "step": 26240 }, { "epoch": 2.39769820971867, "grad_norm": 2.724196672439575, "learning_rate": 1.1784451210923847e-06, "loss": 0.1766, "step": 26250 }, { "epoch": 2.3986116185604676, "grad_norm": 3.562636137008667, "learning_rate": 1.175020490952042e-06, "loss": 0.186, "step": 26260 }, { "epoch": 2.399525027402265, "grad_norm": 1.3189464807510376, "learning_rate": 1.1716001814642398e-06, "loss": 0.1307, "step": 26270 }, { "epoch": 2.400438436244063, "grad_norm": 2.7444868087768555, "learning_rate": 1.168184196492519e-06, "loss": 0.1655, "step": 26280 }, { "epoch": 2.4013518450858604, "grad_norm": 3.214262008666992, "learning_rate": 1.1647725398955356e-06, "loss": 0.2171, "step": 26290 }, { "epoch": 2.402265253927658, "grad_norm": 4.597286224365234, "learning_rate": 1.1613652155270605e-06, "loss": 0.1843, "step": 26300 }, { "epoch": 2.403178662769456, "grad_norm": 3.8368465900421143, "learning_rate": 1.1579622272359659e-06, "loss": 0.2043, "step": 26310 }, { "epoch": 2.4040920716112533, "grad_norm": 2.524305582046509, "learning_rate": 1.1545635788662273e-06, "loss": 0.1648, "step": 26320 }, { "epoch": 2.4050054804530507, "grad_norm": 1.9864920377731323, "learning_rate": 1.1511692742569187e-06, "loss": 0.1795, "step": 26330 }, { "epoch": 2.405918889294848, "grad_norm": 2.822941303253174, "learning_rate": 1.1477793172422068e-06, "loss": 0.1569, "step": 26340 }, { "epoch": 2.406832298136646, "grad_norm": 11.415627479553223, "learning_rate": 1.144393711651351e-06, "loss": 0.1548, "step": 26350 }, { "epoch": 2.4077457069784436, "grad_norm": 2.8831944465637207, "learning_rate": 1.1410124613086899e-06, "loss": 0.2008, "step": 26360 }, { "epoch": 2.408659115820241, "grad_norm": 4.225461006164551, "learning_rate": 1.1376355700336429e-06, "loss": 0.1569, "step": 26370 }, { "epoch": 2.409572524662039, "grad_norm": 3.696843147277832, "learning_rate": 1.1342630416407107e-06, "loss": 0.1175, "step": 26380 }, { "epoch": 2.4104859335038364, "grad_norm": 3.1419639587402344, "learning_rate": 1.1308948799394603e-06, "loss": 0.1713, "step": 26390 }, { "epoch": 2.411399342345634, "grad_norm": 4.290574073791504, "learning_rate": 1.1275310887345293e-06, "loss": 0.1703, "step": 26400 }, { "epoch": 2.4123127511874314, "grad_norm": 8.510647773742676, "learning_rate": 1.1241716718256147e-06, "loss": 0.2104, "step": 26410 }, { "epoch": 2.413226160029229, "grad_norm": 4.761580467224121, "learning_rate": 1.1208166330074766e-06, "loss": 0.1847, "step": 26420 }, { "epoch": 2.4141395688710268, "grad_norm": 1.1382358074188232, "learning_rate": 1.117465976069929e-06, "loss": 0.1756, "step": 26430 }, { "epoch": 2.4150529777128242, "grad_norm": 4.323234558105469, "learning_rate": 1.1141197047978353e-06, "loss": 0.1965, "step": 26440 }, { "epoch": 2.4159663865546217, "grad_norm": 2.6092443466186523, "learning_rate": 1.110777822971103e-06, "loss": 0.1676, "step": 26450 }, { "epoch": 2.4168797953964196, "grad_norm": 2.5561840534210205, "learning_rate": 1.1074403343646828e-06, "loss": 0.2293, "step": 26460 }, { "epoch": 2.417793204238217, "grad_norm": 4.279473304748535, "learning_rate": 1.1041072427485655e-06, "loss": 0.2028, "step": 26470 }, { "epoch": 2.4187066130800146, "grad_norm": 3.4586308002471924, "learning_rate": 1.1007785518877711e-06, "loss": 0.1792, "step": 26480 }, { "epoch": 2.419620021921812, "grad_norm": 2.7711265087127686, "learning_rate": 1.0974542655423498e-06, "loss": 0.166, "step": 26490 }, { "epoch": 2.42053343076361, "grad_norm": 3.416233777999878, "learning_rate": 1.0941343874673787e-06, "loss": 0.2158, "step": 26500 }, { "epoch": 2.4214468396054074, "grad_norm": 4.619205951690674, "learning_rate": 1.0908189214129511e-06, "loss": 0.1836, "step": 26510 }, { "epoch": 2.422360248447205, "grad_norm": 5.168989181518555, "learning_rate": 1.087507871124182e-06, "loss": 0.1557, "step": 26520 }, { "epoch": 2.423273657289003, "grad_norm": 2.8395071029663086, "learning_rate": 1.0842012403411944e-06, "loss": 0.2138, "step": 26530 }, { "epoch": 2.4241870661308003, "grad_norm": 2.8371098041534424, "learning_rate": 1.0808990327991182e-06, "loss": 0.1575, "step": 26540 }, { "epoch": 2.4251004749725977, "grad_norm": 7.525110721588135, "learning_rate": 1.0776012522280915e-06, "loss": 0.1496, "step": 26550 }, { "epoch": 2.426013883814395, "grad_norm": 1.5912665128707886, "learning_rate": 1.0743079023532482e-06, "loss": 0.1406, "step": 26560 }, { "epoch": 2.4269272926561927, "grad_norm": 2.8119776248931885, "learning_rate": 1.0710189868947163e-06, "loss": 0.2279, "step": 26570 }, { "epoch": 2.4278407014979906, "grad_norm": 3.2452807426452637, "learning_rate": 1.0677345095676195e-06, "loss": 0.2062, "step": 26580 }, { "epoch": 2.428754110339788, "grad_norm": 3.8070225715637207, "learning_rate": 1.064454474082064e-06, "loss": 0.18, "step": 26590 }, { "epoch": 2.4296675191815855, "grad_norm": 3.282883644104004, "learning_rate": 1.0611788841431398e-06, "loss": 0.2072, "step": 26600 }, { "epoch": 2.4305809280233834, "grad_norm": 3.8248770236968994, "learning_rate": 1.0579077434509139e-06, "loss": 0.1567, "step": 26610 }, { "epoch": 2.431494336865181, "grad_norm": 2.6404836177825928, "learning_rate": 1.0546410557004306e-06, "loss": 0.1841, "step": 26620 }, { "epoch": 2.4324077457069784, "grad_norm": 4.56838846206665, "learning_rate": 1.0513788245817035e-06, "loss": 0.1918, "step": 26630 }, { "epoch": 2.433321154548776, "grad_norm": 6.796634197235107, "learning_rate": 1.0481210537797103e-06, "loss": 0.1511, "step": 26640 }, { "epoch": 2.4342345633905738, "grad_norm": 2.703427314758301, "learning_rate": 1.044867746974389e-06, "loss": 0.1971, "step": 26650 }, { "epoch": 2.4351479722323712, "grad_norm": 2.856462240219116, "learning_rate": 1.0416189078406408e-06, "loss": 0.1837, "step": 26660 }, { "epoch": 2.4360613810741687, "grad_norm": 2.8096017837524414, "learning_rate": 1.0383745400483148e-06, "loss": 0.1991, "step": 26670 }, { "epoch": 2.4369747899159666, "grad_norm": 3.9563207626342773, "learning_rate": 1.0351346472622114e-06, "loss": 0.1509, "step": 26680 }, { "epoch": 2.437888198757764, "grad_norm": 2.3060312271118164, "learning_rate": 1.0318992331420746e-06, "loss": 0.1442, "step": 26690 }, { "epoch": 2.4388016075995615, "grad_norm": 4.423116207122803, "learning_rate": 1.0286683013425925e-06, "loss": 0.1802, "step": 26700 }, { "epoch": 2.439715016441359, "grad_norm": 5.204353332519531, "learning_rate": 1.0254418555133893e-06, "loss": 0.2029, "step": 26710 }, { "epoch": 2.440628425283157, "grad_norm": 1.6058006286621094, "learning_rate": 1.022219899299019e-06, "loss": 0.2084, "step": 26720 }, { "epoch": 2.4415418341249544, "grad_norm": 2.4450764656066895, "learning_rate": 1.0190024363389668e-06, "loss": 0.2074, "step": 26730 }, { "epoch": 2.442455242966752, "grad_norm": 2.7154507637023926, "learning_rate": 1.0157894702676395e-06, "loss": 0.1418, "step": 26740 }, { "epoch": 2.4433686518085493, "grad_norm": 4.652408123016357, "learning_rate": 1.0125810047143691e-06, "loss": 0.1955, "step": 26750 }, { "epoch": 2.4442820606503473, "grad_norm": 3.3438689708709717, "learning_rate": 1.0093770433034005e-06, "loss": 0.1882, "step": 26760 }, { "epoch": 2.4451954694921447, "grad_norm": 3.0218703746795654, "learning_rate": 1.00617758965389e-06, "loss": 0.2027, "step": 26770 }, { "epoch": 2.446108878333942, "grad_norm": 3.4111216068267822, "learning_rate": 1.0029826473799053e-06, "loss": 0.1553, "step": 26780 }, { "epoch": 2.4470222871757397, "grad_norm": 3.401618719100952, "learning_rate": 9.997922200904154e-07, "loss": 0.1765, "step": 26790 }, { "epoch": 2.4479356960175376, "grad_norm": 3.7342824935913086, "learning_rate": 9.966063113892881e-07, "loss": 0.1721, "step": 26800 }, { "epoch": 2.448849104859335, "grad_norm": 2.5254499912261963, "learning_rate": 9.93424924875292e-07, "loss": 0.2418, "step": 26810 }, { "epoch": 2.4497625137011325, "grad_norm": 10.03165340423584, "learning_rate": 9.902480641420819e-07, "loss": 0.1905, "step": 26820 }, { "epoch": 2.4506759225429304, "grad_norm": 3.8252110481262207, "learning_rate": 9.870757327782055e-07, "loss": 0.1755, "step": 26830 }, { "epoch": 2.451589331384728, "grad_norm": 4.43165922164917, "learning_rate": 9.839079343670905e-07, "loss": 0.1614, "step": 26840 }, { "epoch": 2.4525027402265254, "grad_norm": 4.427452564239502, "learning_rate": 9.80744672487043e-07, "loss": 0.1834, "step": 26850 }, { "epoch": 2.453416149068323, "grad_norm": 3.359358787536621, "learning_rate": 9.775859507112506e-07, "loss": 0.1798, "step": 26860 }, { "epoch": 2.4543295579101208, "grad_norm": 4.574195861816406, "learning_rate": 9.744317726077667e-07, "loss": 0.1903, "step": 26870 }, { "epoch": 2.455242966751918, "grad_norm": 3.2146942615509033, "learning_rate": 9.712821417395153e-07, "loss": 0.192, "step": 26880 }, { "epoch": 2.4561563755937157, "grad_norm": 2.9865989685058594, "learning_rate": 9.681370616642811e-07, "loss": 0.1917, "step": 26890 }, { "epoch": 2.457069784435513, "grad_norm": 7.050412178039551, "learning_rate": 9.649965359347113e-07, "loss": 0.1781, "step": 26900 }, { "epoch": 2.457983193277311, "grad_norm": 7.602043628692627, "learning_rate": 9.61860568098309e-07, "loss": 0.1507, "step": 26910 }, { "epoch": 2.4588966021191085, "grad_norm": 3.1518962383270264, "learning_rate": 9.587291616974254e-07, "loss": 0.1565, "step": 26920 }, { "epoch": 2.459810010960906, "grad_norm": 3.0600478649139404, "learning_rate": 9.556023202692599e-07, "loss": 0.1127, "step": 26930 }, { "epoch": 2.4607234198027035, "grad_norm": 6.901859760284424, "learning_rate": 9.524800473458589e-07, "loss": 0.1812, "step": 26940 }, { "epoch": 2.4616368286445014, "grad_norm": 2.325261354446411, "learning_rate": 9.493623464541041e-07, "loss": 0.1529, "step": 26950 }, { "epoch": 2.462550237486299, "grad_norm": 4.260481357574463, "learning_rate": 9.462492211157154e-07, "loss": 0.1689, "step": 26960 }, { "epoch": 2.4634636463280963, "grad_norm": 1.592104196548462, "learning_rate": 9.431406748472404e-07, "loss": 0.1233, "step": 26970 }, { "epoch": 2.4643770551698942, "grad_norm": 3.882398843765259, "learning_rate": 9.4003671116006e-07, "loss": 0.2028, "step": 26980 }, { "epoch": 2.4652904640116917, "grad_norm": 3.907881736755371, "learning_rate": 9.369373335603738e-07, "loss": 0.2133, "step": 26990 }, { "epoch": 2.466203872853489, "grad_norm": 4.19924783706665, "learning_rate": 9.338425455492045e-07, "loss": 0.1783, "step": 27000 }, { "epoch": 2.4671172816952867, "grad_norm": 3.653663158416748, "learning_rate": 9.307523506223882e-07, "loss": 0.2177, "step": 27010 }, { "epoch": 2.4680306905370846, "grad_norm": 5.332494258880615, "learning_rate": 9.276667522705712e-07, "loss": 0.1851, "step": 27020 }, { "epoch": 2.468944099378882, "grad_norm": 3.445405960083008, "learning_rate": 9.245857539792131e-07, "loss": 0.1835, "step": 27030 }, { "epoch": 2.4698575082206795, "grad_norm": 3.47991943359375, "learning_rate": 9.215093592285718e-07, "loss": 0.1858, "step": 27040 }, { "epoch": 2.470770917062477, "grad_norm": 4.583671569824219, "learning_rate": 9.184375714937061e-07, "loss": 0.1727, "step": 27050 }, { "epoch": 2.471684325904275, "grad_norm": 2.305955648422241, "learning_rate": 9.153703942444747e-07, "loss": 0.1648, "step": 27060 }, { "epoch": 2.4725977347460724, "grad_norm": 3.7289841175079346, "learning_rate": 9.123078309455241e-07, "loss": 0.151, "step": 27070 }, { "epoch": 2.47351114358787, "grad_norm": 2.495635747909546, "learning_rate": 9.092498850562892e-07, "loss": 0.1418, "step": 27080 }, { "epoch": 2.4744245524296673, "grad_norm": 2.6553304195404053, "learning_rate": 9.061965600309924e-07, "loss": 0.14, "step": 27090 }, { "epoch": 2.475337961271465, "grad_norm": 1.9059860706329346, "learning_rate": 9.031478593186316e-07, "loss": 0.1821, "step": 27100 }, { "epoch": 2.4762513701132627, "grad_norm": 6.588783264160156, "learning_rate": 9.001037863629869e-07, "loss": 0.2024, "step": 27110 }, { "epoch": 2.47716477895506, "grad_norm": 2.2924134731292725, "learning_rate": 8.970643446026061e-07, "loss": 0.1662, "step": 27120 }, { "epoch": 2.478078187796858, "grad_norm": 2.6955862045288086, "learning_rate": 8.940295374708069e-07, "loss": 0.167, "step": 27130 }, { "epoch": 2.4789915966386555, "grad_norm": 3.915964126586914, "learning_rate": 8.909993683956736e-07, "loss": 0.1753, "step": 27140 }, { "epoch": 2.479905005480453, "grad_norm": 5.633628845214844, "learning_rate": 8.879738408000499e-07, "loss": 0.2103, "step": 27150 }, { "epoch": 2.4808184143222505, "grad_norm": 4.414962291717529, "learning_rate": 8.849529581015365e-07, "loss": 0.1647, "step": 27160 }, { "epoch": 2.4817318231640484, "grad_norm": 3.379923105239868, "learning_rate": 8.819367237124854e-07, "loss": 0.1958, "step": 27170 }, { "epoch": 2.482645232005846, "grad_norm": 4.256839752197266, "learning_rate": 8.789251410400024e-07, "loss": 0.1847, "step": 27180 }, { "epoch": 2.4835586408476433, "grad_norm": 3.7626469135284424, "learning_rate": 8.759182134859373e-07, "loss": 0.2596, "step": 27190 }, { "epoch": 2.4844720496894412, "grad_norm": 3.385770559310913, "learning_rate": 8.729159444468788e-07, "loss": 0.1608, "step": 27200 }, { "epoch": 2.4853854585312387, "grad_norm": 1.9095269441604614, "learning_rate": 8.699183373141545e-07, "loss": 0.1383, "step": 27210 }, { "epoch": 2.486298867373036, "grad_norm": 2.781400442123413, "learning_rate": 8.669253954738294e-07, "loss": 0.1735, "step": 27220 }, { "epoch": 2.4872122762148337, "grad_norm": 2.4905588626861572, "learning_rate": 8.639371223066944e-07, "loss": 0.1876, "step": 27230 }, { "epoch": 2.488125685056631, "grad_norm": 3.1464993953704834, "learning_rate": 8.609535211882681e-07, "loss": 0.1644, "step": 27240 }, { "epoch": 2.489039093898429, "grad_norm": 3.521432399749756, "learning_rate": 8.579745954887908e-07, "loss": 0.1829, "step": 27250 }, { "epoch": 2.4899525027402265, "grad_norm": 2.3307175636291504, "learning_rate": 8.550003485732244e-07, "loss": 0.1623, "step": 27260 }, { "epoch": 2.490865911582024, "grad_norm": 2.073084831237793, "learning_rate": 8.52030783801242e-07, "loss": 0.1526, "step": 27270 }, { "epoch": 2.491779320423822, "grad_norm": 3.356142520904541, "learning_rate": 8.490659045272315e-07, "loss": 0.1935, "step": 27280 }, { "epoch": 2.4926927292656194, "grad_norm": 2.990657329559326, "learning_rate": 8.461057141002854e-07, "loss": 0.1404, "step": 27290 }, { "epoch": 2.493606138107417, "grad_norm": 1.7818745374679565, "learning_rate": 8.431502158641991e-07, "loss": 0.1131, "step": 27300 }, { "epoch": 2.4945195469492143, "grad_norm": 3.872039318084717, "learning_rate": 8.401994131574715e-07, "loss": 0.1596, "step": 27310 }, { "epoch": 2.495432955791012, "grad_norm": 13.770163536071777, "learning_rate": 8.372533093132934e-07, "loss": 0.1589, "step": 27320 }, { "epoch": 2.4963463646328097, "grad_norm": 4.010361194610596, "learning_rate": 8.343119076595496e-07, "loss": 0.1974, "step": 27330 }, { "epoch": 2.497259773474607, "grad_norm": 4.33785343170166, "learning_rate": 8.313752115188151e-07, "loss": 0.1381, "step": 27340 }, { "epoch": 2.498173182316405, "grad_norm": 2.395570993423462, "learning_rate": 8.284432242083462e-07, "loss": 0.1335, "step": 27350 }, { "epoch": 2.4990865911582025, "grad_norm": 4.463876247406006, "learning_rate": 8.25515949040081e-07, "loss": 0.1843, "step": 27360 }, { "epoch": 2.5, "grad_norm": 2.4400553703308105, "learning_rate": 8.225933893206384e-07, "loss": 0.1694, "step": 27370 }, { "epoch": 2.5009134088417975, "grad_norm": 4.316111087799072, "learning_rate": 8.196755483513052e-07, "loss": 0.2211, "step": 27380 }, { "epoch": 2.501826817683595, "grad_norm": 3.558732032775879, "learning_rate": 8.16762429428043e-07, "loss": 0.1178, "step": 27390 }, { "epoch": 2.502740226525393, "grad_norm": 5.983776569366455, "learning_rate": 8.138540358414765e-07, "loss": 0.1795, "step": 27400 }, { "epoch": 2.5036536353671903, "grad_norm": 5.723694324493408, "learning_rate": 8.109503708768917e-07, "loss": 0.216, "step": 27410 }, { "epoch": 2.504567044208988, "grad_norm": 9.548382759094238, "learning_rate": 8.080514378142374e-07, "loss": 0.1512, "step": 27420 }, { "epoch": 2.5054804530507857, "grad_norm": 3.335613250732422, "learning_rate": 8.051572399281138e-07, "loss": 0.1649, "step": 27430 }, { "epoch": 2.506393861892583, "grad_norm": 7.521543502807617, "learning_rate": 8.022677804877732e-07, "loss": 0.2152, "step": 27440 }, { "epoch": 2.5073072707343806, "grad_norm": 3.4474916458129883, "learning_rate": 7.993830627571141e-07, "loss": 0.1288, "step": 27450 }, { "epoch": 2.508220679576178, "grad_norm": 4.5816731452941895, "learning_rate": 7.965030899946818e-07, "loss": 0.1617, "step": 27460 }, { "epoch": 2.509134088417976, "grad_norm": 4.887831211090088, "learning_rate": 7.936278654536611e-07, "loss": 0.1961, "step": 27470 }, { "epoch": 2.5100474972597735, "grad_norm": 2.3447556495666504, "learning_rate": 7.907573923818712e-07, "loss": 0.1419, "step": 27480 }, { "epoch": 2.510960906101571, "grad_norm": 3.1056530475616455, "learning_rate": 7.878916740217646e-07, "loss": 0.2034, "step": 27490 }, { "epoch": 2.511874314943369, "grad_norm": 4.3638529777526855, "learning_rate": 7.850307136104246e-07, "loss": 0.1611, "step": 27500 }, { "epoch": 2.5127877237851663, "grad_norm": 2.6464569568634033, "learning_rate": 7.821745143795589e-07, "loss": 0.1513, "step": 27510 }, { "epoch": 2.513701132626964, "grad_norm": 3.0807178020477295, "learning_rate": 7.793230795554968e-07, "loss": 0.1359, "step": 27520 }, { "epoch": 2.5146145414687613, "grad_norm": 3.8424861431121826, "learning_rate": 7.764764123591845e-07, "loss": 0.1321, "step": 27530 }, { "epoch": 2.5155279503105588, "grad_norm": 3.414522171020508, "learning_rate": 7.73634516006187e-07, "loss": 0.1492, "step": 27540 }, { "epoch": 2.5164413591523567, "grad_norm": 3.923959255218506, "learning_rate": 7.707973937066754e-07, "loss": 0.1469, "step": 27550 }, { "epoch": 2.517354767994154, "grad_norm": 6.356927871704102, "learning_rate": 7.679650486654317e-07, "loss": 0.2117, "step": 27560 }, { "epoch": 2.5182681768359516, "grad_norm": 3.039074182510376, "learning_rate": 7.651374840818398e-07, "loss": 0.2216, "step": 27570 }, { "epoch": 2.5191815856777495, "grad_norm": 4.826582431793213, "learning_rate": 7.623147031498818e-07, "loss": 0.1747, "step": 27580 }, { "epoch": 2.520094994519547, "grad_norm": 3.79316782951355, "learning_rate": 7.594967090581417e-07, "loss": 0.1677, "step": 27590 }, { "epoch": 2.5210084033613445, "grad_norm": 2.8477604389190674, "learning_rate": 7.566835049897908e-07, "loss": 0.1907, "step": 27600 }, { "epoch": 2.521921812203142, "grad_norm": 3.145052194595337, "learning_rate": 7.538750941225914e-07, "loss": 0.1725, "step": 27610 }, { "epoch": 2.52283522104494, "grad_norm": 5.485443592071533, "learning_rate": 7.510714796288937e-07, "loss": 0.148, "step": 27620 }, { "epoch": 2.5237486298867373, "grad_norm": 3.20564341545105, "learning_rate": 7.482726646756272e-07, "loss": 0.1541, "step": 27630 }, { "epoch": 2.524662038728535, "grad_norm": 2.183030605316162, "learning_rate": 7.454786524243013e-07, "loss": 0.1301, "step": 27640 }, { "epoch": 2.5255754475703327, "grad_norm": 4.55396842956543, "learning_rate": 7.426894460309981e-07, "loss": 0.1621, "step": 27650 }, { "epoch": 2.52648885641213, "grad_norm": 3.8069374561309814, "learning_rate": 7.399050486463738e-07, "loss": 0.1815, "step": 27660 }, { "epoch": 2.5274022652539276, "grad_norm": 3.7198147773742676, "learning_rate": 7.371254634156533e-07, "loss": 0.167, "step": 27670 }, { "epoch": 2.528315674095725, "grad_norm": 1.985144019126892, "learning_rate": 7.343506934786221e-07, "loss": 0.1919, "step": 27680 }, { "epoch": 2.5292290829375226, "grad_norm": 3.57143497467041, "learning_rate": 7.315807419696274e-07, "loss": 0.1315, "step": 27690 }, { "epoch": 2.5301424917793205, "grad_norm": 4.909097194671631, "learning_rate": 7.288156120175766e-07, "loss": 0.1703, "step": 27700 }, { "epoch": 2.531055900621118, "grad_norm": 5.025813102722168, "learning_rate": 7.260553067459275e-07, "loss": 0.2486, "step": 27710 }, { "epoch": 2.531969309462916, "grad_norm": 3.057039976119995, "learning_rate": 7.232998292726884e-07, "loss": 0.2034, "step": 27720 }, { "epoch": 2.5328827183047133, "grad_norm": 3.5553243160247803, "learning_rate": 7.205491827104139e-07, "loss": 0.1464, "step": 27730 }, { "epoch": 2.533796127146511, "grad_norm": 5.042836666107178, "learning_rate": 7.178033701662035e-07, "loss": 0.1792, "step": 27740 }, { "epoch": 2.5347095359883083, "grad_norm": 2.920931339263916, "learning_rate": 7.150623947416962e-07, "loss": 0.1985, "step": 27750 }, { "epoch": 2.5356229448301058, "grad_norm": 4.350718021392822, "learning_rate": 7.12326259533066e-07, "loss": 0.1414, "step": 27760 }, { "epoch": 2.5365363536719037, "grad_norm": 3.187411069869995, "learning_rate": 7.095949676310171e-07, "loss": 0.1229, "step": 27770 }, { "epoch": 2.537449762513701, "grad_norm": 5.014810085296631, "learning_rate": 7.068685221207878e-07, "loss": 0.1317, "step": 27780 }, { "epoch": 2.5383631713554986, "grad_norm": 2.2138333320617676, "learning_rate": 7.041469260821387e-07, "loss": 0.1485, "step": 27790 }, { "epoch": 2.5392765801972965, "grad_norm": 2.9966018199920654, "learning_rate": 7.014301825893532e-07, "loss": 0.1422, "step": 27800 }, { "epoch": 2.540189989039094, "grad_norm": 3.745661497116089, "learning_rate": 6.987182947112314e-07, "loss": 0.1806, "step": 27810 }, { "epoch": 2.5411033978808915, "grad_norm": 5.3374834060668945, "learning_rate": 6.960112655110929e-07, "loss": 0.2214, "step": 27820 }, { "epoch": 2.542016806722689, "grad_norm": 4.104368209838867, "learning_rate": 6.93309098046766e-07, "loss": 0.1561, "step": 27830 }, { "epoch": 2.5429302155644864, "grad_norm": 3.9064080715179443, "learning_rate": 6.906117953705859e-07, "loss": 0.1926, "step": 27840 }, { "epoch": 2.5438436244062843, "grad_norm": 2.7208499908447266, "learning_rate": 6.879193605293976e-07, "loss": 0.1955, "step": 27850 }, { "epoch": 2.544757033248082, "grad_norm": 2.655552864074707, "learning_rate": 6.852317965645411e-07, "loss": 0.187, "step": 27860 }, { "epoch": 2.5456704420898797, "grad_norm": 3.4026575088500977, "learning_rate": 6.8254910651186e-07, "loss": 0.1616, "step": 27870 }, { "epoch": 2.546583850931677, "grad_norm": 4.887857913970947, "learning_rate": 6.798712934016894e-07, "loss": 0.2019, "step": 27880 }, { "epoch": 2.5474972597734746, "grad_norm": 1.9662518501281738, "learning_rate": 6.771983602588538e-07, "loss": 0.1532, "step": 27890 }, { "epoch": 2.548410668615272, "grad_norm": 1.7089582681655884, "learning_rate": 6.745303101026707e-07, "loss": 0.1794, "step": 27900 }, { "epoch": 2.5493240774570696, "grad_norm": 5.332671642303467, "learning_rate": 6.718671459469372e-07, "loss": 0.1698, "step": 27910 }, { "epoch": 2.5502374862988675, "grad_norm": 3.9483468532562256, "learning_rate": 6.692088707999328e-07, "loss": 0.2145, "step": 27920 }, { "epoch": 2.551150895140665, "grad_norm": 5.112480163574219, "learning_rate": 6.665554876644125e-07, "loss": 0.145, "step": 27930 }, { "epoch": 2.5520643039824624, "grad_norm": 5.769342422485352, "learning_rate": 6.63906999537609e-07, "loss": 0.2035, "step": 27940 }, { "epoch": 2.5529777128242603, "grad_norm": 3.0805952548980713, "learning_rate": 6.612634094112252e-07, "loss": 0.206, "step": 27950 }, { "epoch": 2.553891121666058, "grad_norm": 3.413102388381958, "learning_rate": 6.586247202714274e-07, "loss": 0.1203, "step": 27960 }, { "epoch": 2.5548045305078553, "grad_norm": 15.552600860595703, "learning_rate": 6.559909350988486e-07, "loss": 0.1821, "step": 27970 }, { "epoch": 2.5557179393496527, "grad_norm": 1.2977948188781738, "learning_rate": 6.533620568685839e-07, "loss": 0.1414, "step": 27980 }, { "epoch": 2.5566313481914507, "grad_norm": 4.185473442077637, "learning_rate": 6.507380885501818e-07, "loss": 0.1495, "step": 27990 }, { "epoch": 2.557544757033248, "grad_norm": 3.548175096511841, "learning_rate": 6.481190331076476e-07, "loss": 0.1794, "step": 28000 }, { "epoch": 2.5584581658750456, "grad_norm": 6.094694137573242, "learning_rate": 6.45504893499434e-07, "loss": 0.1362, "step": 28010 }, { "epoch": 2.5593715747168435, "grad_norm": 2.630436420440674, "learning_rate": 6.42895672678444e-07, "loss": 0.2063, "step": 28020 }, { "epoch": 2.560284983558641, "grad_norm": 2.535360097885132, "learning_rate": 6.402913735920252e-07, "loss": 0.1892, "step": 28030 }, { "epoch": 2.5611983924004385, "grad_norm": 2.3915650844573975, "learning_rate": 6.376919991819619e-07, "loss": 0.1701, "step": 28040 }, { "epoch": 2.562111801242236, "grad_norm": 3.876722812652588, "learning_rate": 6.35097552384476e-07, "loss": 0.1809, "step": 28050 }, { "epoch": 2.5630252100840334, "grad_norm": 12.281547546386719, "learning_rate": 6.325080361302272e-07, "loss": 0.1664, "step": 28060 }, { "epoch": 2.5639386189258313, "grad_norm": 3.3736042976379395, "learning_rate": 6.299234533443021e-07, "loss": 0.1866, "step": 28070 }, { "epoch": 2.5648520277676288, "grad_norm": 2.8642570972442627, "learning_rate": 6.273438069462146e-07, "loss": 0.1272, "step": 28080 }, { "epoch": 2.5657654366094262, "grad_norm": 4.221220016479492, "learning_rate": 6.247690998499034e-07, "loss": 0.183, "step": 28090 }, { "epoch": 2.566678845451224, "grad_norm": 8.705109596252441, "learning_rate": 6.22199334963729e-07, "loss": 0.1397, "step": 28100 }, { "epoch": 2.5675922542930216, "grad_norm": 2.7913341522216797, "learning_rate": 6.196345151904676e-07, "loss": 0.1553, "step": 28110 }, { "epoch": 2.568505663134819, "grad_norm": 2.066905975341797, "learning_rate": 6.170746434273083e-07, "loss": 0.1302, "step": 28120 }, { "epoch": 2.5694190719766166, "grad_norm": 6.146513938903809, "learning_rate": 6.145197225658556e-07, "loss": 0.1395, "step": 28130 }, { "epoch": 2.5703324808184145, "grad_norm": 4.777243614196777, "learning_rate": 6.119697554921156e-07, "loss": 0.1801, "step": 28140 }, { "epoch": 2.571245889660212, "grad_norm": 6.405593395233154, "learning_rate": 6.094247450865037e-07, "loss": 0.1826, "step": 28150 }, { "epoch": 2.5721592985020094, "grad_norm": 2.827810764312744, "learning_rate": 6.068846942238338e-07, "loss": 0.2031, "step": 28160 }, { "epoch": 2.5730727073438073, "grad_norm": 3.35597825050354, "learning_rate": 6.043496057733161e-07, "loss": 0.1824, "step": 28170 }, { "epoch": 2.573986116185605, "grad_norm": 2.429029941558838, "learning_rate": 6.018194825985596e-07, "loss": 0.1723, "step": 28180 }, { "epoch": 2.5748995250274023, "grad_norm": 5.882177829742432, "learning_rate": 5.992943275575608e-07, "loss": 0.1755, "step": 28190 }, { "epoch": 2.5758129338691997, "grad_norm": 3.1148014068603516, "learning_rate": 5.967741435027064e-07, "loss": 0.1682, "step": 28200 }, { "epoch": 2.576726342710997, "grad_norm": 3.7537100315093994, "learning_rate": 5.942589332807647e-07, "loss": 0.1602, "step": 28210 }, { "epoch": 2.577639751552795, "grad_norm": 2.772531747817993, "learning_rate": 5.917486997328903e-07, "loss": 0.1622, "step": 28220 }, { "epoch": 2.5785531603945926, "grad_norm": 4.016494274139404, "learning_rate": 5.892434456946144e-07, "loss": 0.1689, "step": 28230 }, { "epoch": 2.57946656923639, "grad_norm": 4.787386894226074, "learning_rate": 5.867431739958424e-07, "loss": 0.1818, "step": 28240 }, { "epoch": 2.580379978078188, "grad_norm": 3.501722812652588, "learning_rate": 5.842478874608504e-07, "loss": 0.1845, "step": 28250 }, { "epoch": 2.5812933869199854, "grad_norm": 6.012413501739502, "learning_rate": 5.817575889082877e-07, "loss": 0.2241, "step": 28260 }, { "epoch": 2.582206795761783, "grad_norm": 6.76178503036499, "learning_rate": 5.792722811511659e-07, "loss": 0.1829, "step": 28270 }, { "epoch": 2.5831202046035804, "grad_norm": 4.450265884399414, "learning_rate": 5.767919669968591e-07, "loss": 0.1829, "step": 28280 }, { "epoch": 2.5840336134453783, "grad_norm": 4.499739646911621, "learning_rate": 5.743166492471009e-07, "loss": 0.1835, "step": 28290 }, { "epoch": 2.5849470222871758, "grad_norm": 3.4291162490844727, "learning_rate": 5.718463306979837e-07, "loss": 0.1983, "step": 28300 }, { "epoch": 2.5858604311289732, "grad_norm": 3.877037763595581, "learning_rate": 5.693810141399486e-07, "loss": 0.1969, "step": 28310 }, { "epoch": 2.586773839970771, "grad_norm": 4.093713283538818, "learning_rate": 5.669207023577911e-07, "loss": 0.2583, "step": 28320 }, { "epoch": 2.5876872488125686, "grad_norm": 4.7794694900512695, "learning_rate": 5.644653981306475e-07, "loss": 0.181, "step": 28330 }, { "epoch": 2.588600657654366, "grad_norm": 2.777939558029175, "learning_rate": 5.620151042320044e-07, "loss": 0.1628, "step": 28340 }, { "epoch": 2.5895140664961636, "grad_norm": 3.0715830326080322, "learning_rate": 5.595698234296837e-07, "loss": 0.1702, "step": 28350 }, { "epoch": 2.590427475337961, "grad_norm": 2.2324957847595215, "learning_rate": 5.571295584858466e-07, "loss": 0.1421, "step": 28360 }, { "epoch": 2.591340884179759, "grad_norm": 3.1777498722076416, "learning_rate": 5.546943121569864e-07, "loss": 0.1577, "step": 28370 }, { "epoch": 2.5922542930215564, "grad_norm": 4.63186502456665, "learning_rate": 5.522640871939322e-07, "loss": 0.2143, "step": 28380 }, { "epoch": 2.593167701863354, "grad_norm": 5.821626663208008, "learning_rate": 5.498388863418369e-07, "loss": 0.1369, "step": 28390 }, { "epoch": 2.594081110705152, "grad_norm": 4.614604949951172, "learning_rate": 5.47418712340177e-07, "loss": 0.1945, "step": 28400 }, { "epoch": 2.5949945195469493, "grad_norm": 2.700010061264038, "learning_rate": 5.450035679227556e-07, "loss": 0.1699, "step": 28410 }, { "epoch": 2.5959079283887467, "grad_norm": 3.229088306427002, "learning_rate": 5.425934558176915e-07, "loss": 0.1845, "step": 28420 }, { "epoch": 2.596821337230544, "grad_norm": 5.3979082107543945, "learning_rate": 5.401883787474193e-07, "loss": 0.161, "step": 28430 }, { "epoch": 2.597734746072342, "grad_norm": 11.498878479003906, "learning_rate": 5.377883394286859e-07, "loss": 0.2128, "step": 28440 }, { "epoch": 2.5986481549141396, "grad_norm": 2.627903699874878, "learning_rate": 5.353933405725464e-07, "loss": 0.1788, "step": 28450 }, { "epoch": 2.599561563755937, "grad_norm": 3.347543239593506, "learning_rate": 5.330033848843669e-07, "loss": 0.2156, "step": 28460 }, { "epoch": 2.600474972597735, "grad_norm": 3.9646146297454834, "learning_rate": 5.306184750638121e-07, "loss": 0.1096, "step": 28470 }, { "epoch": 2.6013883814395324, "grad_norm": 3.6762778759002686, "learning_rate": 5.282386138048485e-07, "loss": 0.1955, "step": 28480 }, { "epoch": 2.60230179028133, "grad_norm": 2.503816843032837, "learning_rate": 5.258638037957381e-07, "loss": 0.1823, "step": 28490 }, { "epoch": 2.6032151991231274, "grad_norm": 2.1628077030181885, "learning_rate": 5.234940477190409e-07, "loss": 0.1833, "step": 28500 }, { "epoch": 2.604128607964925, "grad_norm": 2.6601667404174805, "learning_rate": 5.211293482516066e-07, "loss": 0.167, "step": 28510 }, { "epoch": 2.6050420168067228, "grad_norm": 2.6038830280303955, "learning_rate": 5.187697080645709e-07, "loss": 0.1393, "step": 28520 }, { "epoch": 2.6059554256485202, "grad_norm": 6.139000415802002, "learning_rate": 5.16415129823355e-07, "loss": 0.1527, "step": 28530 }, { "epoch": 2.606868834490318, "grad_norm": 4.121825218200684, "learning_rate": 5.140656161876662e-07, "loss": 0.1751, "step": 28540 }, { "epoch": 2.6077822433321156, "grad_norm": 3.8292148113250732, "learning_rate": 5.117211698114854e-07, "loss": 0.2021, "step": 28550 }, { "epoch": 2.608695652173913, "grad_norm": 2.5773963928222656, "learning_rate": 5.093817933430734e-07, "loss": 0.2203, "step": 28560 }, { "epoch": 2.6096090610157106, "grad_norm": 3.8354101181030273, "learning_rate": 5.070474894249611e-07, "loss": 0.144, "step": 28570 }, { "epoch": 2.610522469857508, "grad_norm": 2.4685230255126953, "learning_rate": 5.047182606939527e-07, "loss": 0.1548, "step": 28580 }, { "epoch": 2.611435878699306, "grad_norm": 3.266267776489258, "learning_rate": 5.023941097811169e-07, "loss": 0.1519, "step": 28590 }, { "epoch": 2.6123492875411034, "grad_norm": 5.221317768096924, "learning_rate": 5.000750393117898e-07, "loss": 0.2061, "step": 28600 }, { "epoch": 2.613262696382901, "grad_norm": 4.749457359313965, "learning_rate": 4.977610519055631e-07, "loss": 0.153, "step": 28610 }, { "epoch": 2.614176105224699, "grad_norm": 3.9044079780578613, "learning_rate": 4.954521501762937e-07, "loss": 0.2127, "step": 28620 }, { "epoch": 2.6150895140664963, "grad_norm": 53.67898941040039, "learning_rate": 4.931483367320882e-07, "loss": 0.1465, "step": 28630 }, { "epoch": 2.6160029229082937, "grad_norm": 5.797778129577637, "learning_rate": 4.908496141753088e-07, "loss": 0.1396, "step": 28640 }, { "epoch": 2.616916331750091, "grad_norm": 3.8302512168884277, "learning_rate": 4.885559851025634e-07, "loss": 0.2396, "step": 28650 }, { "epoch": 2.6178297405918887, "grad_norm": 2.1766555309295654, "learning_rate": 4.862674521047117e-07, "loss": 0.1355, "step": 28660 }, { "epoch": 2.6187431494336866, "grad_norm": 3.777099609375, "learning_rate": 4.839840177668526e-07, "loss": 0.1492, "step": 28670 }, { "epoch": 2.619656558275484, "grad_norm": 2.358353614807129, "learning_rate": 4.817056846683277e-07, "loss": 0.1878, "step": 28680 }, { "epoch": 2.620569967117282, "grad_norm": 3.8859634399414062, "learning_rate": 4.794324553827129e-07, "loss": 0.1886, "step": 28690 }, { "epoch": 2.6214833759590794, "grad_norm": 2.903855562210083, "learning_rate": 4.771643324778264e-07, "loss": 0.1874, "step": 28700 }, { "epoch": 2.622396784800877, "grad_norm": 2.87847638130188, "learning_rate": 4.749013185157114e-07, "loss": 0.2038, "step": 28710 }, { "epoch": 2.6233101936426744, "grad_norm": 6.271610736846924, "learning_rate": 4.7264341605264165e-07, "loss": 0.1926, "step": 28720 }, { "epoch": 2.624223602484472, "grad_norm": 3.056499481201172, "learning_rate": 4.7039062763911735e-07, "loss": 0.1875, "step": 28730 }, { "epoch": 2.6251370113262698, "grad_norm": 4.430934906005859, "learning_rate": 4.6814295581986355e-07, "loss": 0.1524, "step": 28740 }, { "epoch": 2.6260504201680672, "grad_norm": 1.8527206182479858, "learning_rate": 4.659004031338238e-07, "loss": 0.0957, "step": 28750 }, { "epoch": 2.6269638290098647, "grad_norm": 3.014235496520996, "learning_rate": 4.6366297211415944e-07, "loss": 0.151, "step": 28760 }, { "epoch": 2.6278772378516626, "grad_norm": 2.866987943649292, "learning_rate": 4.614306652882455e-07, "loss": 0.1628, "step": 28770 }, { "epoch": 2.62879064669346, "grad_norm": 6.165231227874756, "learning_rate": 4.5920348517767164e-07, "loss": 0.1489, "step": 28780 }, { "epoch": 2.6297040555352575, "grad_norm": 3.83646297454834, "learning_rate": 4.569814342982348e-07, "loss": 0.1795, "step": 28790 }, { "epoch": 2.630617464377055, "grad_norm": 4.253725528717041, "learning_rate": 4.5476451515993723e-07, "loss": 0.1938, "step": 28800 }, { "epoch": 2.6315308732188525, "grad_norm": 3.998826742172241, "learning_rate": 4.525527302669841e-07, "loss": 0.1375, "step": 28810 }, { "epoch": 2.6324442820606504, "grad_norm": 3.889923572540283, "learning_rate": 4.503460821177841e-07, "loss": 0.1555, "step": 28820 }, { "epoch": 2.633357690902448, "grad_norm": 5.5610246658325195, "learning_rate": 4.4814457320494e-07, "loss": 0.1743, "step": 28830 }, { "epoch": 2.634271099744246, "grad_norm": 1.7337384223937988, "learning_rate": 4.4594820601525146e-07, "loss": 0.1322, "step": 28840 }, { "epoch": 2.6351845085860433, "grad_norm": 5.698246955871582, "learning_rate": 4.4375698302970784e-07, "loss": 0.1964, "step": 28850 }, { "epoch": 2.6360979174278407, "grad_norm": 3.3582777976989746, "learning_rate": 4.415709067234919e-07, "loss": 0.2045, "step": 28860 }, { "epoch": 2.637011326269638, "grad_norm": 2.0463366508483887, "learning_rate": 4.393899795659673e-07, "loss": 0.142, "step": 28870 }, { "epoch": 2.6379247351114357, "grad_norm": 2.583164930343628, "learning_rate": 4.3721420402068617e-07, "loss": 0.206, "step": 28880 }, { "epoch": 2.6388381439532336, "grad_norm": 3.1102864742279053, "learning_rate": 4.35043582545378e-07, "loss": 0.1311, "step": 28890 }, { "epoch": 2.639751552795031, "grad_norm": 3.153418779373169, "learning_rate": 4.328781175919533e-07, "loss": 0.1399, "step": 28900 }, { "epoch": 2.6406649616368285, "grad_norm": 4.9051618576049805, "learning_rate": 4.3071781160649527e-07, "loss": 0.1351, "step": 28910 }, { "epoch": 2.6415783704786264, "grad_norm": 3.146371364593506, "learning_rate": 4.2856266702925975e-07, "loss": 0.1514, "step": 28920 }, { "epoch": 2.642491779320424, "grad_norm": 3.16583251953125, "learning_rate": 4.264126862946738e-07, "loss": 0.1654, "step": 28930 }, { "epoch": 2.6434051881622214, "grad_norm": 2.9945950508117676, "learning_rate": 4.242678718313309e-07, "loss": 0.1551, "step": 28940 }, { "epoch": 2.644318597004019, "grad_norm": 3.448671579360962, "learning_rate": 4.221282260619891e-07, "loss": 0.2371, "step": 28950 }, { "epoch": 2.6452320058458167, "grad_norm": 2.3573901653289795, "learning_rate": 4.1999375140356625e-07, "loss": 0.1456, "step": 28960 }, { "epoch": 2.646145414687614, "grad_norm": 2.6368789672851562, "learning_rate": 4.1786445026713916e-07, "loss": 0.2561, "step": 28970 }, { "epoch": 2.6470588235294117, "grad_norm": 2.728585958480835, "learning_rate": 4.1574032505794505e-07, "loss": 0.1759, "step": 28980 }, { "epoch": 2.6479722323712096, "grad_norm": 6.874827861785889, "learning_rate": 4.1362137817536953e-07, "loss": 0.1499, "step": 28990 }, { "epoch": 2.648885641213007, "grad_norm": 3.402792453765869, "learning_rate": 4.1150761201295074e-07, "loss": 0.2165, "step": 29000 }, { "epoch": 2.6497990500548045, "grad_norm": 2.7407069206237793, "learning_rate": 4.093990289583727e-07, "loss": 0.2111, "step": 29010 }, { "epoch": 2.650712458896602, "grad_norm": 3.2457334995269775, "learning_rate": 4.072956313934684e-07, "loss": 0.1151, "step": 29020 }, { "epoch": 2.6516258677383995, "grad_norm": 4.972794055938721, "learning_rate": 4.051974216942112e-07, "loss": 0.1965, "step": 29030 }, { "epoch": 2.6525392765801974, "grad_norm": 2.689418315887451, "learning_rate": 4.03104402230714e-07, "loss": 0.2523, "step": 29040 }, { "epoch": 2.653452685421995, "grad_norm": 3.2815866470336914, "learning_rate": 4.0101657536722625e-07, "loss": 0.1956, "step": 29050 }, { "epoch": 2.6543660942637923, "grad_norm": 3.526332139968872, "learning_rate": 3.989339434621342e-07, "loss": 0.1433, "step": 29060 }, { "epoch": 2.6552795031055902, "grad_norm": 2.241903066635132, "learning_rate": 3.9685650886795546e-07, "loss": 0.1982, "step": 29070 }, { "epoch": 2.6561929119473877, "grad_norm": 3.8179118633270264, "learning_rate": 3.947842739313351e-07, "loss": 0.1527, "step": 29080 }, { "epoch": 2.657106320789185, "grad_norm": 4.28487491607666, "learning_rate": 3.927172409930446e-07, "loss": 0.1781, "step": 29090 }, { "epoch": 2.6580197296309827, "grad_norm": 4.094175815582275, "learning_rate": 3.90655412387983e-07, "loss": 0.1804, "step": 29100 }, { "epoch": 2.6589331384727806, "grad_norm": 3.7405788898468018, "learning_rate": 3.8859879044516603e-07, "loss": 0.1862, "step": 29110 }, { "epoch": 2.659846547314578, "grad_norm": 5.119068622589111, "learning_rate": 3.865473774877315e-07, "loss": 0.202, "step": 29120 }, { "epoch": 2.6607599561563755, "grad_norm": 2.7963080406188965, "learning_rate": 3.845011758329292e-07, "loss": 0.1871, "step": 29130 }, { "epoch": 2.6616733649981734, "grad_norm": 4.104891777038574, "learning_rate": 3.8246018779212747e-07, "loss": 0.1583, "step": 29140 }, { "epoch": 2.662586773839971, "grad_norm": 6.032232284545898, "learning_rate": 3.80424415670802e-07, "loss": 0.1635, "step": 29150 }, { "epoch": 2.6635001826817684, "grad_norm": 7.371508598327637, "learning_rate": 3.783938617685362e-07, "loss": 0.177, "step": 29160 }, { "epoch": 2.664413591523566, "grad_norm": 2.501544713973999, "learning_rate": 3.763685283790208e-07, "loss": 0.1718, "step": 29170 }, { "epoch": 2.6653270003653633, "grad_norm": 4.4881911277771, "learning_rate": 3.743484177900503e-07, "loss": 0.1642, "step": 29180 }, { "epoch": 2.666240409207161, "grad_norm": 2.8926961421966553, "learning_rate": 3.7233353228351755e-07, "loss": 0.2411, "step": 29190 }, { "epoch": 2.6671538180489587, "grad_norm": 3.6379425525665283, "learning_rate": 3.7032387413541317e-07, "loss": 0.1386, "step": 29200 }, { "epoch": 2.668067226890756, "grad_norm": 4.628418445587158, "learning_rate": 3.6831944561582287e-07, "loss": 0.1972, "step": 29210 }, { "epoch": 2.668980635732554, "grad_norm": 0.6931257843971252, "learning_rate": 3.663202489889284e-07, "loss": 0.1172, "step": 29220 }, { "epoch": 2.6698940445743515, "grad_norm": 4.300020217895508, "learning_rate": 3.6432628651299815e-07, "loss": 0.2098, "step": 29230 }, { "epoch": 2.670807453416149, "grad_norm": 4.559226036071777, "learning_rate": 3.623375604403878e-07, "loss": 0.1835, "step": 29240 }, { "epoch": 2.6717208622579465, "grad_norm": 4.785849571228027, "learning_rate": 3.603540730175392e-07, "loss": 0.2282, "step": 29250 }, { "epoch": 2.6726342710997444, "grad_norm": 5.065683364868164, "learning_rate": 3.5837582648497905e-07, "loss": 0.2007, "step": 29260 }, { "epoch": 2.673547679941542, "grad_norm": 2.564279556274414, "learning_rate": 3.5640282307731035e-07, "loss": 0.1743, "step": 29270 }, { "epoch": 2.6744610887833393, "grad_norm": 1.5829267501831055, "learning_rate": 3.5443506502321535e-07, "loss": 0.1837, "step": 29280 }, { "epoch": 2.6753744976251372, "grad_norm": 4.97260856628418, "learning_rate": 3.524725545454494e-07, "loss": 0.1734, "step": 29290 }, { "epoch": 2.6762879064669347, "grad_norm": 2.9145779609680176, "learning_rate": 3.5051529386084304e-07, "loss": 0.1722, "step": 29300 }, { "epoch": 2.677201315308732, "grad_norm": 4.117083549499512, "learning_rate": 3.4856328518029504e-07, "loss": 0.1632, "step": 29310 }, { "epoch": 2.6781147241505296, "grad_norm": 4.3244099617004395, "learning_rate": 3.46616530708771e-07, "loss": 0.1883, "step": 29320 }, { "epoch": 2.679028132992327, "grad_norm": 5.495502948760986, "learning_rate": 3.4467503264530243e-07, "loss": 0.1332, "step": 29330 }, { "epoch": 2.679941541834125, "grad_norm": 4.136027812957764, "learning_rate": 3.4273879318298397e-07, "loss": 0.2064, "step": 29340 }, { "epoch": 2.6808549506759225, "grad_norm": 4.216952323913574, "learning_rate": 3.408078145089677e-07, "loss": 0.1994, "step": 29350 }, { "epoch": 2.68176835951772, "grad_norm": 4.28631591796875, "learning_rate": 3.3888209880446664e-07, "loss": 0.1758, "step": 29360 }, { "epoch": 2.682681768359518, "grad_norm": 9.774356842041016, "learning_rate": 3.369616482447452e-07, "loss": 0.1466, "step": 29370 }, { "epoch": 2.6835951772013154, "grad_norm": 3.0883216857910156, "learning_rate": 3.350464649991242e-07, "loss": 0.1793, "step": 29380 }, { "epoch": 2.684508586043113, "grad_norm": 3.4956254959106445, "learning_rate": 3.3313655123097033e-07, "loss": 0.1595, "step": 29390 }, { "epoch": 2.6854219948849103, "grad_norm": 3.2147960662841797, "learning_rate": 3.312319090977012e-07, "loss": 0.2153, "step": 29400 }, { "epoch": 2.686335403726708, "grad_norm": 4.222472667694092, "learning_rate": 3.293325407507775e-07, "loss": 0.1875, "step": 29410 }, { "epoch": 2.6872488125685057, "grad_norm": 3.6156182289123535, "learning_rate": 3.2743844833570516e-07, "loss": 0.1913, "step": 29420 }, { "epoch": 2.688162221410303, "grad_norm": 4.582793235778809, "learning_rate": 3.255496339920283e-07, "loss": 0.1447, "step": 29430 }, { "epoch": 2.689075630252101, "grad_norm": 9.66757869720459, "learning_rate": 3.2366609985332865e-07, "loss": 0.1176, "step": 29440 }, { "epoch": 2.6899890390938985, "grad_norm": 7.198057651519775, "learning_rate": 3.2178784804722585e-07, "loss": 0.1652, "step": 29450 }, { "epoch": 2.690902447935696, "grad_norm": 5.453129768371582, "learning_rate": 3.199148806953717e-07, "loss": 0.2001, "step": 29460 }, { "epoch": 2.6918158567774935, "grad_norm": 6.442914962768555, "learning_rate": 3.1804719991344766e-07, "loss": 0.1762, "step": 29470 }, { "epoch": 2.692729265619291, "grad_norm": 2.5320165157318115, "learning_rate": 3.161848078111646e-07, "loss": 0.1598, "step": 29480 }, { "epoch": 2.693642674461089, "grad_norm": 4.487289905548096, "learning_rate": 3.143277064922579e-07, "loss": 0.1759, "step": 29490 }, { "epoch": 2.6945560833028863, "grad_norm": 4.368460178375244, "learning_rate": 3.124758980544901e-07, "loss": 0.159, "step": 29500 }, { "epoch": 2.6954694921446842, "grad_norm": 5.106991291046143, "learning_rate": 3.1062938458964077e-07, "loss": 0.1696, "step": 29510 }, { "epoch": 2.6963829009864817, "grad_norm": 3.059021472930908, "learning_rate": 3.087881681835103e-07, "loss": 0.1248, "step": 29520 }, { "epoch": 2.697296309828279, "grad_norm": 4.929970741271973, "learning_rate": 3.069522509159151e-07, "loss": 0.1737, "step": 29530 }, { "epoch": 2.6982097186700766, "grad_norm": 3.115908622741699, "learning_rate": 3.051216348606867e-07, "loss": 0.1507, "step": 29540 }, { "epoch": 2.699123127511874, "grad_norm": 2.8646724224090576, "learning_rate": 3.032963220856683e-07, "loss": 0.1733, "step": 29550 }, { "epoch": 2.700036536353672, "grad_norm": 4.097107887268066, "learning_rate": 3.0147631465271167e-07, "loss": 0.2251, "step": 29560 }, { "epoch": 2.7009499451954695, "grad_norm": 3.359964370727539, "learning_rate": 2.9966161461767506e-07, "loss": 0.1785, "step": 29570 }, { "epoch": 2.701863354037267, "grad_norm": 3.2329702377319336, "learning_rate": 2.9785222403042437e-07, "loss": 0.1609, "step": 29580 }, { "epoch": 2.702776762879065, "grad_norm": 2.9998605251312256, "learning_rate": 2.960481449348246e-07, "loss": 0.1863, "step": 29590 }, { "epoch": 2.7036901717208623, "grad_norm": 2.373302936553955, "learning_rate": 2.94249379368744e-07, "loss": 0.215, "step": 29600 }, { "epoch": 2.70460358056266, "grad_norm": 12.532777786254883, "learning_rate": 2.9245592936404556e-07, "loss": 0.1963, "step": 29610 }, { "epoch": 2.7055169894044573, "grad_norm": 3.2537379264831543, "learning_rate": 2.9066779694659155e-07, "loss": 0.1572, "step": 29620 }, { "epoch": 2.7064303982462548, "grad_norm": 4.4246649742126465, "learning_rate": 2.88884984136234e-07, "loss": 0.1489, "step": 29630 }, { "epoch": 2.7073438070880527, "grad_norm": 2.9845080375671387, "learning_rate": 2.871074929468193e-07, "loss": 0.1475, "step": 29640 }, { "epoch": 2.70825721592985, "grad_norm": 5.206557273864746, "learning_rate": 2.85335325386179e-07, "loss": 0.1799, "step": 29650 }, { "epoch": 2.709170624771648, "grad_norm": 6.973174095153809, "learning_rate": 2.8356848345613475e-07, "loss": 0.1778, "step": 29660 }, { "epoch": 2.7100840336134455, "grad_norm": 2.3810126781463623, "learning_rate": 2.818069691524905e-07, "loss": 0.1747, "step": 29670 }, { "epoch": 2.710997442455243, "grad_norm": 3.5602266788482666, "learning_rate": 2.800507844650313e-07, "loss": 0.2004, "step": 29680 }, { "epoch": 2.7119108512970405, "grad_norm": 3.728686809539795, "learning_rate": 2.7829993137752307e-07, "loss": 0.1915, "step": 29690 }, { "epoch": 2.712824260138838, "grad_norm": 2.01147198677063, "learning_rate": 2.765544118677105e-07, "loss": 0.149, "step": 29700 }, { "epoch": 2.713737668980636, "grad_norm": 3.1700830459594727, "learning_rate": 2.748142279073113e-07, "loss": 0.197, "step": 29710 }, { "epoch": 2.7146510778224333, "grad_norm": 3.611387252807617, "learning_rate": 2.7307938146201697e-07, "loss": 0.1787, "step": 29720 }, { "epoch": 2.715564486664231, "grad_norm": 3.922872304916382, "learning_rate": 2.7134987449149e-07, "loss": 0.1404, "step": 29730 }, { "epoch": 2.7164778955060287, "grad_norm": 2.985687255859375, "learning_rate": 2.69625708949362e-07, "loss": 0.1967, "step": 29740 }, { "epoch": 2.717391304347826, "grad_norm": 2.165677070617676, "learning_rate": 2.679068867832302e-07, "loss": 0.1774, "step": 29750 }, { "epoch": 2.7183047131896236, "grad_norm": 3.239450454711914, "learning_rate": 2.6619340993465634e-07, "loss": 0.1775, "step": 29760 }, { "epoch": 2.719218122031421, "grad_norm": 14.948457717895508, "learning_rate": 2.644852803391629e-07, "loss": 0.1656, "step": 29770 }, { "epoch": 2.720131530873219, "grad_norm": 4.637759685516357, "learning_rate": 2.627824999262352e-07, "loss": 0.1957, "step": 29780 }, { "epoch": 2.7210449397150165, "grad_norm": 4.274733543395996, "learning_rate": 2.6108507061931245e-07, "loss": 0.1931, "step": 29790 }, { "epoch": 2.721958348556814, "grad_norm": 3.9745938777923584, "learning_rate": 2.5939299433579256e-07, "loss": 0.1857, "step": 29800 }, { "epoch": 2.722871757398612, "grad_norm": 10.616950988769531, "learning_rate": 2.577062729870233e-07, "loss": 0.2393, "step": 29810 }, { "epoch": 2.7237851662404093, "grad_norm": 4.603240489959717, "learning_rate": 2.560249084783073e-07, "loss": 0.1392, "step": 29820 }, { "epoch": 2.724698575082207, "grad_norm": 3.2682995796203613, "learning_rate": 2.5434890270889467e-07, "loss": 0.2151, "step": 29830 }, { "epoch": 2.7256119839240043, "grad_norm": 3.8422954082489014, "learning_rate": 2.5267825757198106e-07, "loss": 0.1777, "step": 29840 }, { "epoch": 2.7265253927658017, "grad_norm": 2.5809288024902344, "learning_rate": 2.51012974954708e-07, "loss": 0.1773, "step": 29850 }, { "epoch": 2.7274388016075997, "grad_norm": 3.231849193572998, "learning_rate": 2.493530567381597e-07, "loss": 0.2033, "step": 29860 }, { "epoch": 2.728352210449397, "grad_norm": 3.7515792846679688, "learning_rate": 2.4769850479736124e-07, "loss": 0.173, "step": 29870 }, { "epoch": 2.7292656192911946, "grad_norm": 2.1019093990325928, "learning_rate": 2.4604932100127434e-07, "loss": 0.1739, "step": 29880 }, { "epoch": 2.7301790281329925, "grad_norm": 5.934996604919434, "learning_rate": 2.4440550721279766e-07, "loss": 0.2323, "step": 29890 }, { "epoch": 2.73109243697479, "grad_norm": 2.436202049255371, "learning_rate": 2.427670652887648e-07, "loss": 0.18, "step": 29900 }, { "epoch": 2.7320058458165875, "grad_norm": 5.293520450592041, "learning_rate": 2.411339970799409e-07, "loss": 0.1982, "step": 29910 }, { "epoch": 2.732919254658385, "grad_norm": 2.3441998958587646, "learning_rate": 2.3950630443102086e-07, "loss": 0.1963, "step": 29920 }, { "epoch": 2.733832663500183, "grad_norm": 1.8282734155654907, "learning_rate": 2.378839891806267e-07, "loss": 0.1366, "step": 29930 }, { "epoch": 2.7347460723419803, "grad_norm": 3.2099366188049316, "learning_rate": 2.3626705316130815e-07, "loss": 0.1386, "step": 29940 }, { "epoch": 2.7356594811837778, "grad_norm": 4.577935218811035, "learning_rate": 2.3465549819953691e-07, "loss": 0.1911, "step": 29950 }, { "epoch": 2.7365728900255757, "grad_norm": 5.309034824371338, "learning_rate": 2.330493261157063e-07, "loss": 0.243, "step": 29960 }, { "epoch": 2.737486298867373, "grad_norm": 3.530811309814453, "learning_rate": 2.3144853872412943e-07, "loss": 0.1302, "step": 29970 }, { "epoch": 2.7383997077091706, "grad_norm": 2.1390774250030518, "learning_rate": 2.2985313783303875e-07, "loss": 0.2041, "step": 29980 }, { "epoch": 2.739313116550968, "grad_norm": 5.1970624923706055, "learning_rate": 2.2826312524457993e-07, "loss": 0.1584, "step": 29990 }, { "epoch": 2.7402265253927656, "grad_norm": 2.199793577194214, "learning_rate": 2.2667850275481185e-07, "loss": 0.15, "step": 30000 }, { "epoch": 2.7411399342345635, "grad_norm": 2.8913819789886475, "learning_rate": 2.250992721537054e-07, "loss": 0.1976, "step": 30010 }, { "epoch": 2.742053343076361, "grad_norm": 2.243295192718506, "learning_rate": 2.235254352251437e-07, "loss": 0.1408, "step": 30020 }, { "epoch": 2.7429667519181584, "grad_norm": 3.4165101051330566, "learning_rate": 2.2195699374691236e-07, "loss": 0.184, "step": 30030 }, { "epoch": 2.7438801607599563, "grad_norm": 2.647298574447632, "learning_rate": 2.2039394949070647e-07, "loss": 0.169, "step": 30040 }, { "epoch": 2.744793569601754, "grad_norm": 4.701382160186768, "learning_rate": 2.1883630422212031e-07, "loss": 0.1575, "step": 30050 }, { "epoch": 2.7457069784435513, "grad_norm": 3.4917750358581543, "learning_rate": 2.172840597006548e-07, "loss": 0.1862, "step": 30060 }, { "epoch": 2.7466203872853487, "grad_norm": 2.6093637943267822, "learning_rate": 2.1573721767970512e-07, "loss": 0.1724, "step": 30070 }, { "epoch": 2.7475337961271467, "grad_norm": 3.6859781742095947, "learning_rate": 2.141957799065675e-07, "loss": 0.1716, "step": 30080 }, { "epoch": 2.748447204968944, "grad_norm": 1.6195764541625977, "learning_rate": 2.1265974812243074e-07, "loss": 0.1728, "step": 30090 }, { "epoch": 2.7493606138107416, "grad_norm": 3.8155574798583984, "learning_rate": 2.1112912406237973e-07, "loss": 0.1765, "step": 30100 }, { "epoch": 2.7502740226525395, "grad_norm": 5.951730251312256, "learning_rate": 2.0960390945538923e-07, "loss": 0.1898, "step": 30110 }, { "epoch": 2.751187431494337, "grad_norm": 2.8212239742279053, "learning_rate": 2.0808410602432495e-07, "loss": 0.1225, "step": 30120 }, { "epoch": 2.7521008403361344, "grad_norm": 2.9701151847839355, "learning_rate": 2.065697154859375e-07, "loss": 0.1869, "step": 30130 }, { "epoch": 2.753014249177932, "grad_norm": 6.513732433319092, "learning_rate": 2.0506073955086636e-07, "loss": 0.1456, "step": 30140 }, { "epoch": 2.7539276580197294, "grad_norm": 6.357501983642578, "learning_rate": 2.0355717992363245e-07, "loss": 0.1951, "step": 30150 }, { "epoch": 2.7548410668615273, "grad_norm": 4.359513759613037, "learning_rate": 2.0205903830263995e-07, "loss": 0.1561, "step": 30160 }, { "epoch": 2.7557544757033248, "grad_norm": 3.8683457374572754, "learning_rate": 2.0056631638017076e-07, "loss": 0.1993, "step": 30170 }, { "epoch": 2.7566678845451222, "grad_norm": 2.8823235034942627, "learning_rate": 1.990790158423883e-07, "loss": 0.1671, "step": 30180 }, { "epoch": 2.75758129338692, "grad_norm": 2.8691093921661377, "learning_rate": 1.9759713836932925e-07, "loss": 0.1657, "step": 30190 }, { "epoch": 2.7584947022287176, "grad_norm": 2.876791000366211, "learning_rate": 1.9612068563490462e-07, "loss": 0.1536, "step": 30200 }, { "epoch": 2.759408111070515, "grad_norm": 8.292618751525879, "learning_rate": 1.946496593068986e-07, "loss": 0.1381, "step": 30210 }, { "epoch": 2.7603215199123126, "grad_norm": 3.213650941848755, "learning_rate": 1.9318406104696597e-07, "loss": 0.2217, "step": 30220 }, { "epoch": 2.7612349287541105, "grad_norm": 2.8058667182922363, "learning_rate": 1.917238925106296e-07, "loss": 0.1499, "step": 30230 }, { "epoch": 2.762148337595908, "grad_norm": 2.8577322959899902, "learning_rate": 1.902691553472791e-07, "loss": 0.1722, "step": 30240 }, { "epoch": 2.7630617464377054, "grad_norm": 6.168932914733887, "learning_rate": 1.8881985120016655e-07, "loss": 0.1866, "step": 30250 }, { "epoch": 2.7639751552795033, "grad_norm": 6.6126508712768555, "learning_rate": 1.8737598170641246e-07, "loss": 0.1641, "step": 30260 }, { "epoch": 2.764888564121301, "grad_norm": 3.0332112312316895, "learning_rate": 1.8593754849699275e-07, "loss": 0.1508, "step": 30270 }, { "epoch": 2.7658019729630983, "grad_norm": 2.5082767009735107, "learning_rate": 1.8450455319674597e-07, "loss": 0.1479, "step": 30280 }, { "epoch": 2.7667153818048957, "grad_norm": 1.5599119663238525, "learning_rate": 1.8307699742436513e-07, "loss": 0.2017, "step": 30290 }, { "epoch": 2.767628790646693, "grad_norm": 2.5415139198303223, "learning_rate": 1.8165488279240307e-07, "loss": 0.1614, "step": 30300 }, { "epoch": 2.768542199488491, "grad_norm": 5.365577697753906, "learning_rate": 1.8023821090726368e-07, "loss": 0.1372, "step": 30310 }, { "epoch": 2.7694556083302886, "grad_norm": 3.359363317489624, "learning_rate": 1.7882698336920245e-07, "loss": 0.1573, "step": 30320 }, { "epoch": 2.7703690171720865, "grad_norm": 3.1379971504211426, "learning_rate": 1.7742120177232535e-07, "loss": 0.1633, "step": 30330 }, { "epoch": 2.771282426013884, "grad_norm": 2.1981897354125977, "learning_rate": 1.7602086770458771e-07, "loss": 0.1649, "step": 30340 }, { "epoch": 2.7721958348556814, "grad_norm": 2.3713231086730957, "learning_rate": 1.746259827477903e-07, "loss": 0.1303, "step": 30350 }, { "epoch": 2.773109243697479, "grad_norm": 2.2317235469818115, "learning_rate": 1.7323654847757886e-07, "loss": 0.2024, "step": 30360 }, { "epoch": 2.7740226525392764, "grad_norm": 3.6669251918792725, "learning_rate": 1.718525664634424e-07, "loss": 0.1337, "step": 30370 }, { "epoch": 2.7749360613810743, "grad_norm": 2.810298204421997, "learning_rate": 1.7047403826871035e-07, "loss": 0.1889, "step": 30380 }, { "epoch": 2.7758494702228718, "grad_norm": 3.6317391395568848, "learning_rate": 1.6910096545055377e-07, "loss": 0.1697, "step": 30390 }, { "epoch": 2.7767628790646692, "grad_norm": 2.515752077102661, "learning_rate": 1.6773334955997923e-07, "loss": 0.1893, "step": 30400 }, { "epoch": 2.777676287906467, "grad_norm": 6.00191068649292, "learning_rate": 1.6637119214182819e-07, "loss": 0.1509, "step": 30410 }, { "epoch": 2.7785896967482646, "grad_norm": 3.405580759048462, "learning_rate": 1.6501449473478037e-07, "loss": 0.1743, "step": 30420 }, { "epoch": 2.779503105590062, "grad_norm": 3.055898666381836, "learning_rate": 1.636632588713438e-07, "loss": 0.183, "step": 30430 }, { "epoch": 2.7804165144318596, "grad_norm": 3.6469919681549072, "learning_rate": 1.6231748607785924e-07, "loss": 0.1981, "step": 30440 }, { "epoch": 2.781329923273657, "grad_norm": 3.2503275871276855, "learning_rate": 1.6097717787449561e-07, "loss": 0.1566, "step": 30450 }, { "epoch": 2.782243332115455, "grad_norm": 2.882218360900879, "learning_rate": 1.596423357752508e-07, "loss": 0.1587, "step": 30460 }, { "epoch": 2.7831567409572524, "grad_norm": 3.681100606918335, "learning_rate": 1.5831296128794638e-07, "loss": 0.1814, "step": 30470 }, { "epoch": 2.7840701497990503, "grad_norm": 4.064323902130127, "learning_rate": 1.569890559142273e-07, "loss": 0.1471, "step": 30480 }, { "epoch": 2.784983558640848, "grad_norm": 7.852768898010254, "learning_rate": 1.5567062114956232e-07, "loss": 0.1989, "step": 30490 }, { "epoch": 2.7858969674826453, "grad_norm": 1.88850998878479, "learning_rate": 1.5435765848324126e-07, "loss": 0.1621, "step": 30500 }, { "epoch": 2.7868103763244427, "grad_norm": 2.551734209060669, "learning_rate": 1.5305016939837046e-07, "loss": 0.1415, "step": 30510 }, { "epoch": 2.78772378516624, "grad_norm": 6.696647644042969, "learning_rate": 1.517481553718747e-07, "loss": 0.1489, "step": 30520 }, { "epoch": 2.788637194008038, "grad_norm": 2.0427379608154297, "learning_rate": 1.5045161787449359e-07, "loss": 0.147, "step": 30530 }, { "epoch": 2.7895506028498356, "grad_norm": 2.1039366722106934, "learning_rate": 1.4916055837078113e-07, "loss": 0.2132, "step": 30540 }, { "epoch": 2.790464011691633, "grad_norm": 2.6952590942382812, "learning_rate": 1.478749783191036e-07, "loss": 0.1562, "step": 30550 }, { "epoch": 2.791377420533431, "grad_norm": 4.942740440368652, "learning_rate": 1.4659487917163596e-07, "loss": 0.1759, "step": 30560 }, { "epoch": 2.7922908293752284, "grad_norm": 2.848492383956909, "learning_rate": 1.4532026237436493e-07, "loss": 0.1603, "step": 30570 }, { "epoch": 2.793204238217026, "grad_norm": 3.250372886657715, "learning_rate": 1.440511293670821e-07, "loss": 0.2531, "step": 30580 }, { "epoch": 2.7941176470588234, "grad_norm": 4.785683631896973, "learning_rate": 1.4278748158338628e-07, "loss": 0.1871, "step": 30590 }, { "epoch": 2.795031055900621, "grad_norm": 3.747417688369751, "learning_rate": 1.4152932045067846e-07, "loss": 0.1885, "step": 30600 }, { "epoch": 2.7959444647424188, "grad_norm": 2.641806125640869, "learning_rate": 1.4027664739016232e-07, "loss": 0.1834, "step": 30610 }, { "epoch": 2.7968578735842162, "grad_norm": 2.6151208877563477, "learning_rate": 1.3902946381684435e-07, "loss": 0.1772, "step": 30620 }, { "epoch": 2.797771282426014, "grad_norm": 2.251767158508301, "learning_rate": 1.3778777113952812e-07, "loss": 0.185, "step": 30630 }, { "epoch": 2.7986846912678116, "grad_norm": 2.1561295986175537, "learning_rate": 1.3655157076081504e-07, "loss": 0.197, "step": 30640 }, { "epoch": 2.799598100109609, "grad_norm": 2.6720657348632812, "learning_rate": 1.3532086407710198e-07, "loss": 0.1678, "step": 30650 }, { "epoch": 2.8005115089514065, "grad_norm": 3.8517913818359375, "learning_rate": 1.3409565247858248e-07, "loss": 0.1923, "step": 30660 }, { "epoch": 2.801424917793204, "grad_norm": 1.7059541940689087, "learning_rate": 1.3287593734923998e-07, "loss": 0.1865, "step": 30670 }, { "epoch": 2.802338326635002, "grad_norm": 3.167004346847534, "learning_rate": 1.3166172006685186e-07, "loss": 0.2011, "step": 30680 }, { "epoch": 2.8032517354767994, "grad_norm": 6.902370452880859, "learning_rate": 1.304530020029826e-07, "loss": 0.2251, "step": 30690 }, { "epoch": 2.804165144318597, "grad_norm": 2.7574892044067383, "learning_rate": 1.2924978452298787e-07, "loss": 0.1787, "step": 30700 }, { "epoch": 2.805078553160395, "grad_norm": 1.7895187139511108, "learning_rate": 1.2805206898600654e-07, "loss": 0.1763, "step": 30710 }, { "epoch": 2.8059919620021923, "grad_norm": 3.509232521057129, "learning_rate": 1.268598567449647e-07, "loss": 0.2124, "step": 30720 }, { "epoch": 2.8069053708439897, "grad_norm": 1.854034662246704, "learning_rate": 1.256731491465718e-07, "loss": 0.1571, "step": 30730 }, { "epoch": 2.807818779685787, "grad_norm": 4.317585468292236, "learning_rate": 1.2449194753131832e-07, "loss": 0.1744, "step": 30740 }, { "epoch": 2.808732188527585, "grad_norm": 3.9074621200561523, "learning_rate": 1.2331625323347696e-07, "loss": 0.1481, "step": 30750 }, { "epoch": 2.8096455973693826, "grad_norm": 3.6889185905456543, "learning_rate": 1.221460675810976e-07, "loss": 0.1847, "step": 30760 }, { "epoch": 2.81055900621118, "grad_norm": 1.8562510013580322, "learning_rate": 1.209813918960079e-07, "loss": 0.1857, "step": 30770 }, { "epoch": 2.811472415052978, "grad_norm": 3.2423956394195557, "learning_rate": 1.1982222749381322e-07, "loss": 0.1985, "step": 30780 }, { "epoch": 2.8123858238947754, "grad_norm": 2.292555093765259, "learning_rate": 1.1866857568389235e-07, "loss": 0.1828, "step": 30790 }, { "epoch": 2.813299232736573, "grad_norm": 4.941780090332031, "learning_rate": 1.175204377693956e-07, "loss": 0.1705, "step": 30800 }, { "epoch": 2.8142126415783704, "grad_norm": 3.199408769607544, "learning_rate": 1.1637781504724721e-07, "loss": 0.1586, "step": 30810 }, { "epoch": 2.815126050420168, "grad_norm": 2.5203497409820557, "learning_rate": 1.1524070880814031e-07, "loss": 0.1282, "step": 30820 }, { "epoch": 2.8160394592619657, "grad_norm": 5.41642427444458, "learning_rate": 1.1410912033653743e-07, "loss": 0.1968, "step": 30830 }, { "epoch": 2.816952868103763, "grad_norm": 4.532895565032959, "learning_rate": 1.1298305091066664e-07, "loss": 0.1812, "step": 30840 }, { "epoch": 2.8178662769455607, "grad_norm": 2.4450063705444336, "learning_rate": 1.1186250180252433e-07, "loss": 0.173, "step": 30850 }, { "epoch": 2.8187796857873586, "grad_norm": 2.0755581855773926, "learning_rate": 1.1074747427786858e-07, "loss": 0.2488, "step": 30860 }, { "epoch": 2.819693094629156, "grad_norm": 3.8777756690979004, "learning_rate": 1.0963796959622241e-07, "loss": 0.155, "step": 30870 }, { "epoch": 2.8206065034709535, "grad_norm": 2.5026493072509766, "learning_rate": 1.0853398901086943e-07, "loss": 0.1381, "step": 30880 }, { "epoch": 2.821519912312751, "grad_norm": 3.350773334503174, "learning_rate": 1.0743553376885262e-07, "loss": 0.1472, "step": 30890 }, { "epoch": 2.822433321154549, "grad_norm": 4.115846633911133, "learning_rate": 1.0634260511097505e-07, "loss": 0.2085, "step": 30900 }, { "epoch": 2.8233467299963464, "grad_norm": 2.5741050243377686, "learning_rate": 1.0525520427179581e-07, "loss": 0.1364, "step": 30910 }, { "epoch": 2.824260138838144, "grad_norm": 5.363935947418213, "learning_rate": 1.0417333247963069e-07, "loss": 0.1749, "step": 30920 }, { "epoch": 2.8251735476799418, "grad_norm": 3.791522741317749, "learning_rate": 1.0309699095654879e-07, "loss": 0.2016, "step": 30930 }, { "epoch": 2.8260869565217392, "grad_norm": 2.532700300216675, "learning_rate": 1.0202618091837368e-07, "loss": 0.1442, "step": 30940 }, { "epoch": 2.8270003653635367, "grad_norm": 2.58868145942688, "learning_rate": 1.0096090357467947e-07, "loss": 0.2237, "step": 30950 }, { "epoch": 2.827913774205334, "grad_norm": 4.34386682510376, "learning_rate": 9.990116012879192e-08, "loss": 0.1538, "step": 30960 }, { "epoch": 2.8288271830471317, "grad_norm": 1.7251232862472534, "learning_rate": 9.884695177778347e-08, "loss": 0.1391, "step": 30970 }, { "epoch": 2.8297405918889296, "grad_norm": 3.1576437950134277, "learning_rate": 9.779827971247768e-08, "loss": 0.14, "step": 30980 }, { "epoch": 2.830654000730727, "grad_norm": 3.23321533203125, "learning_rate": 9.675514511744144e-08, "loss": 0.1542, "step": 30990 }, { "epoch": 2.8315674095725245, "grad_norm": 2.644649028778076, "learning_rate": 9.57175491709872e-08, "loss": 0.2283, "step": 31000 }, { "epoch": 2.8324808184143224, "grad_norm": 2.260758399963379, "learning_rate": 9.468549304517072e-08, "loss": 0.1587, "step": 31010 }, { "epoch": 2.83339422725612, "grad_norm": 3.0297763347625732, "learning_rate": 9.365897790579226e-08, "loss": 0.1843, "step": 31020 }, { "epoch": 2.8343076360979174, "grad_norm": 2.6031622886657715, "learning_rate": 9.263800491239039e-08, "loss": 0.2084, "step": 31030 }, { "epoch": 2.835221044939715, "grad_norm": 3.0719852447509766, "learning_rate": 9.162257521824425e-08, "loss": 0.1844, "step": 31040 }, { "epoch": 2.8361344537815127, "grad_norm": 2.040818452835083, "learning_rate": 9.061268997037187e-08, "loss": 0.1667, "step": 31050 }, { "epoch": 2.83704786262331, "grad_norm": 5.199827671051025, "learning_rate": 8.960835030952853e-08, "loss": 0.1927, "step": 31060 }, { "epoch": 2.8379612714651077, "grad_norm": 3.1799612045288086, "learning_rate": 8.860955737020338e-08, "loss": 0.1667, "step": 31070 }, { "epoch": 2.8388746803069056, "grad_norm": 4.023714065551758, "learning_rate": 8.761631228062228e-08, "loss": 0.1541, "step": 31080 }, { "epoch": 2.839788089148703, "grad_norm": 4.069933891296387, "learning_rate": 8.662861616274277e-08, "loss": 0.2211, "step": 31090 }, { "epoch": 2.8407014979905005, "grad_norm": 6.886049747467041, "learning_rate": 8.56464701322557e-08, "loss": 0.159, "step": 31100 }, { "epoch": 2.841614906832298, "grad_norm": 2.0380871295928955, "learning_rate": 8.466987529858083e-08, "loss": 0.1942, "step": 31110 }, { "epoch": 2.8425283156740955, "grad_norm": 2.441549062728882, "learning_rate": 8.369883276486912e-08, "loss": 0.1583, "step": 31120 }, { "epoch": 2.8434417245158934, "grad_norm": 2.1457529067993164, "learning_rate": 8.273334362799867e-08, "loss": 0.1745, "step": 31130 }, { "epoch": 2.844355133357691, "grad_norm": 2.8850314617156982, "learning_rate": 8.177340897857489e-08, "loss": 0.1444, "step": 31140 }, { "epoch": 2.8452685421994883, "grad_norm": 2.050032615661621, "learning_rate": 8.081902990092927e-08, "loss": 0.1668, "step": 31150 }, { "epoch": 2.8461819510412862, "grad_norm": 4.902590274810791, "learning_rate": 7.98702074731167e-08, "loss": 0.1688, "step": 31160 }, { "epoch": 2.8470953598830837, "grad_norm": 4.513327598571777, "learning_rate": 7.892694276691649e-08, "loss": 0.2275, "step": 31170 }, { "epoch": 2.848008768724881, "grad_norm": 7.831366539001465, "learning_rate": 7.798923684782911e-08, "loss": 0.1725, "step": 31180 }, { "epoch": 2.8489221775666786, "grad_norm": 3.2410125732421875, "learning_rate": 7.705709077507673e-08, "loss": 0.1827, "step": 31190 }, { "epoch": 2.8498355864084766, "grad_norm": 4.604362487792969, "learning_rate": 7.61305056016004e-08, "loss": 0.2036, "step": 31200 }, { "epoch": 2.850748995250274, "grad_norm": 3.6017391681671143, "learning_rate": 7.520948237405957e-08, "loss": 0.1324, "step": 31210 }, { "epoch": 2.8516624040920715, "grad_norm": 3.5544004440307617, "learning_rate": 7.429402213283254e-08, "loss": 0.1404, "step": 31220 }, { "epoch": 2.8525758129338694, "grad_norm": 2.5255112648010254, "learning_rate": 7.33841259120116e-08, "loss": 0.1677, "step": 31230 }, { "epoch": 2.853489221775667, "grad_norm": 8.60118579864502, "learning_rate": 7.247979473940569e-08, "loss": 0.1803, "step": 31240 }, { "epoch": 2.8544026306174644, "grad_norm": 2.789998769760132, "learning_rate": 7.158102963653657e-08, "loss": 0.1524, "step": 31250 }, { "epoch": 2.855316039459262, "grad_norm": 3.3740830421447754, "learning_rate": 7.06878316186388e-08, "loss": 0.1863, "step": 31260 }, { "epoch": 2.8562294483010593, "grad_norm": 1.7667673826217651, "learning_rate": 6.980020169465862e-08, "loss": 0.2085, "step": 31270 }, { "epoch": 2.857142857142857, "grad_norm": 1.835604190826416, "learning_rate": 6.891814086725235e-08, "loss": 0.1522, "step": 31280 }, { "epoch": 2.8580562659846547, "grad_norm": 8.092764854431152, "learning_rate": 6.804165013278518e-08, "loss": 0.1665, "step": 31290 }, { "epoch": 2.8589696748264526, "grad_norm": 2.9582271575927734, "learning_rate": 6.717073048133237e-08, "loss": 0.216, "step": 31300 }, { "epoch": 2.85988308366825, "grad_norm": 6.452463626861572, "learning_rate": 6.630538289667365e-08, "loss": 0.1899, "step": 31310 }, { "epoch": 2.8607964925100475, "grad_norm": 6.491169452667236, "learning_rate": 6.544560835629488e-08, "loss": 0.1919, "step": 31320 }, { "epoch": 2.861709901351845, "grad_norm": 2.004014730453491, "learning_rate": 6.459140783138863e-08, "loss": 0.2196, "step": 31330 }, { "epoch": 2.8626233101936425, "grad_norm": 1.4991315603256226, "learning_rate": 6.374278228685027e-08, "loss": 0.1703, "step": 31340 }, { "epoch": 2.8635367190354404, "grad_norm": 2.603942394256592, "learning_rate": 6.289973268127636e-08, "loss": 0.2119, "step": 31350 }, { "epoch": 2.864450127877238, "grad_norm": 3.665296792984009, "learning_rate": 6.206225996696624e-08, "loss": 0.1551, "step": 31360 }, { "epoch": 2.8653635367190353, "grad_norm": 4.199866771697998, "learning_rate": 6.123036508991931e-08, "loss": 0.2042, "step": 31370 }, { "epoch": 2.8662769455608332, "grad_norm": 1.779120922088623, "learning_rate": 6.040404898983499e-08, "loss": 0.1407, "step": 31380 }, { "epoch": 2.8671903544026307, "grad_norm": 3.4522266387939453, "learning_rate": 5.958331260010941e-08, "loss": 0.1704, "step": 31390 }, { "epoch": 2.868103763244428, "grad_norm": 2.976898670196533, "learning_rate": 5.876815684783765e-08, "loss": 0.1883, "step": 31400 }, { "epoch": 2.8690171720862256, "grad_norm": 6.595038414001465, "learning_rate": 5.7958582653809825e-08, "loss": 0.1598, "step": 31410 }, { "epoch": 2.869930580928023, "grad_norm": 6.684220790863037, "learning_rate": 5.7154590932511634e-08, "loss": 0.1782, "step": 31420 }, { "epoch": 2.870843989769821, "grad_norm": 3.1258671283721924, "learning_rate": 5.635618259212272e-08, "loss": 0.2053, "step": 31430 }, { "epoch": 2.8717573986116185, "grad_norm": 4.426680564880371, "learning_rate": 5.556335853451611e-08, "loss": 0.1803, "step": 31440 }, { "epoch": 2.8726708074534164, "grad_norm": 3.06663179397583, "learning_rate": 5.477611965525653e-08, "loss": 0.1417, "step": 31450 }, { "epoch": 2.873584216295214, "grad_norm": 8.209572792053223, "learning_rate": 5.3994466843600436e-08, "loss": 0.2015, "step": 31460 }, { "epoch": 2.8744976251370113, "grad_norm": 26.554346084594727, "learning_rate": 5.32184009824932e-08, "loss": 0.1416, "step": 31470 }, { "epoch": 2.875411033978809, "grad_norm": 2.7095699310302734, "learning_rate": 5.244792294856971e-08, "loss": 0.1628, "step": 31480 }, { "epoch": 2.8763244428206063, "grad_norm": 2.471179962158203, "learning_rate": 5.168303361215321e-08, "loss": 0.1772, "step": 31490 }, { "epoch": 2.877237851662404, "grad_norm": 3.6156671047210693, "learning_rate": 5.092373383725424e-08, "loss": 0.1458, "step": 31500 }, { "epoch": 2.8781512605042017, "grad_norm": 5.643335819244385, "learning_rate": 5.017002448156838e-08, "loss": 0.2089, "step": 31510 }, { "epoch": 2.879064669345999, "grad_norm": 3.167705535888672, "learning_rate": 4.9421906396476813e-08, "loss": 0.1957, "step": 31520 }, { "epoch": 2.879978078187797, "grad_norm": 2.8825111389160156, "learning_rate": 4.8679380427045786e-08, "loss": 0.2139, "step": 31530 }, { "epoch": 2.8808914870295945, "grad_norm": 4.2082743644714355, "learning_rate": 4.794244741202436e-08, "loss": 0.1549, "step": 31540 }, { "epoch": 2.881804895871392, "grad_norm": 4.76873779296875, "learning_rate": 4.721110818384223e-08, "loss": 0.1862, "step": 31550 }, { "epoch": 2.8827183047131895, "grad_norm": 3.814755916595459, "learning_rate": 4.6485363568612995e-08, "loss": 0.2259, "step": 31560 }, { "epoch": 2.8836317135549874, "grad_norm": 4.155381679534912, "learning_rate": 4.5765214386128686e-08, "loss": 0.1733, "step": 31570 }, { "epoch": 2.884545122396785, "grad_norm": 4.914824962615967, "learning_rate": 4.505066144986137e-08, "loss": 0.1491, "step": 31580 }, { "epoch": 2.8854585312385823, "grad_norm": 6.5866851806640625, "learning_rate": 4.434170556696205e-08, "loss": 0.1813, "step": 31590 }, { "epoch": 2.8863719400803802, "grad_norm": 5.213261127471924, "learning_rate": 4.3638347538259016e-08, "loss": 0.1393, "step": 31600 }, { "epoch": 2.8872853489221777, "grad_norm": 6.5349321365356445, "learning_rate": 4.2940588158256746e-08, "loss": 0.1715, "step": 31610 }, { "epoch": 2.888198757763975, "grad_norm": 5.136638641357422, "learning_rate": 4.224842821513753e-08, "loss": 0.1845, "step": 31620 }, { "epoch": 2.8891121666057726, "grad_norm": 3.2968218326568604, "learning_rate": 4.15618684907565e-08, "loss": 0.1595, "step": 31630 }, { "epoch": 2.89002557544757, "grad_norm": 2.8187365531921387, "learning_rate": 4.08809097606433e-08, "loss": 0.1774, "step": 31640 }, { "epoch": 2.890938984289368, "grad_norm": 3.3538355827331543, "learning_rate": 4.0205552794000955e-08, "loss": 0.1744, "step": 31650 }, { "epoch": 2.8918523931311655, "grad_norm": 3.591006278991699, "learning_rate": 3.9535798353705354e-08, "loss": 0.1929, "step": 31660 }, { "epoch": 2.892765801972963, "grad_norm": 0.6611663699150085, "learning_rate": 3.887164719630354e-08, "loss": 0.1384, "step": 31670 }, { "epoch": 2.893679210814761, "grad_norm": 6.558027744293213, "learning_rate": 3.8213100072012066e-08, "loss": 0.1802, "step": 31680 }, { "epoch": 2.8945926196565583, "grad_norm": 3.722982406616211, "learning_rate": 3.756015772471866e-08, "loss": 0.1732, "step": 31690 }, { "epoch": 2.895506028498356, "grad_norm": 4.339853763580322, "learning_rate": 3.691282089197945e-08, "loss": 0.1938, "step": 31700 }, { "epoch": 2.8964194373401533, "grad_norm": 5.944039344787598, "learning_rate": 3.627109030501841e-08, "loss": 0.1462, "step": 31710 }, { "epoch": 2.897332846181951, "grad_norm": 2.52124285697937, "learning_rate": 3.5634966688727344e-08, "loss": 0.1546, "step": 31720 }, { "epoch": 2.8982462550237487, "grad_norm": 1.9459630250930786, "learning_rate": 3.50044507616637e-08, "loss": 0.1781, "step": 31730 }, { "epoch": 2.899159663865546, "grad_norm": 1.337398648262024, "learning_rate": 3.4379543236051635e-08, "loss": 0.1881, "step": 31740 }, { "epoch": 2.900073072707344, "grad_norm": 2.6260488033294678, "learning_rate": 3.376024481777873e-08, "loss": 0.214, "step": 31750 }, { "epoch": 2.9009864815491415, "grad_norm": 4.424779891967773, "learning_rate": 3.3146556206398176e-08, "loss": 0.1711, "step": 31760 }, { "epoch": 2.901899890390939, "grad_norm": 4.590170860290527, "learning_rate": 3.253847809512378e-08, "loss": 0.1634, "step": 31770 }, { "epoch": 2.9028132992327365, "grad_norm": 4.049939155578613, "learning_rate": 3.1936011170835556e-08, "loss": 0.2071, "step": 31780 }, { "epoch": 2.903726708074534, "grad_norm": 4.37501335144043, "learning_rate": 3.13391561140719e-08, "loss": 0.2412, "step": 31790 }, { "epoch": 2.904640116916332, "grad_norm": 2.999466896057129, "learning_rate": 3.0747913599033506e-08, "loss": 0.1506, "step": 31800 }, { "epoch": 2.9055535257581293, "grad_norm": 5.8624420166015625, "learning_rate": 3.01622842935817e-08, "loss": 0.2235, "step": 31810 }, { "epoch": 2.9064669345999268, "grad_norm": 4.424445629119873, "learning_rate": 2.9582268859236206e-08, "loss": 0.2157, "step": 31820 }, { "epoch": 2.9073803434417247, "grad_norm": 3.584073543548584, "learning_rate": 2.9007867951175716e-08, "loss": 0.1925, "step": 31830 }, { "epoch": 2.908293752283522, "grad_norm": 2.906712055206299, "learning_rate": 2.843908221823677e-08, "loss": 0.1953, "step": 31840 }, { "epoch": 2.9092071611253196, "grad_norm": 3.018005609512329, "learning_rate": 2.7875912302912643e-08, "loss": 0.1594, "step": 31850 }, { "epoch": 2.910120569967117, "grad_norm": 4.6979875564575195, "learning_rate": 2.731835884135503e-08, "loss": 0.1986, "step": 31860 }, { "epoch": 2.911033978808915, "grad_norm": 6.03165864944458, "learning_rate": 2.676642246336847e-08, "loss": 0.1801, "step": 31870 }, { "epoch": 2.9119473876507125, "grad_norm": 4.625025749206543, "learning_rate": 2.6220103792414243e-08, "loss": 0.1868, "step": 31880 }, { "epoch": 2.91286079649251, "grad_norm": 3.4753518104553223, "learning_rate": 2.5679403445607596e-08, "loss": 0.1881, "step": 31890 }, { "epoch": 2.913774205334308, "grad_norm": 2.9014527797698975, "learning_rate": 2.5144322033717748e-08, "loss": 0.1304, "step": 31900 }, { "epoch": 2.9146876141761053, "grad_norm": 3.42879581451416, "learning_rate": 2.461486016116621e-08, "loss": 0.1791, "step": 31910 }, { "epoch": 2.915601023017903, "grad_norm": 2.8180999755859375, "learning_rate": 2.4091018426027347e-08, "loss": 0.1705, "step": 31920 }, { "epoch": 2.9165144318597003, "grad_norm": 3.277371644973755, "learning_rate": 2.3572797420025606e-08, "loss": 0.1782, "step": 31930 }, { "epoch": 2.9174278407014977, "grad_norm": 2.609182119369507, "learning_rate": 2.3060197728538847e-08, "loss": 0.1717, "step": 31940 }, { "epoch": 2.9183412495432957, "grad_norm": 2.1692848205566406, "learning_rate": 2.2553219930592784e-08, "loss": 0.1529, "step": 31950 }, { "epoch": 2.919254658385093, "grad_norm": 4.438725471496582, "learning_rate": 2.205186459886377e-08, "loss": 0.1642, "step": 31960 }, { "epoch": 2.9201680672268906, "grad_norm": 2.842299699783325, "learning_rate": 2.1556132299677124e-08, "loss": 0.194, "step": 31970 }, { "epoch": 2.9210814760686885, "grad_norm": 8.119446754455566, "learning_rate": 2.106602359300658e-08, "loss": 0.1765, "step": 31980 }, { "epoch": 2.921994884910486, "grad_norm": 3.152848720550537, "learning_rate": 2.058153903247262e-08, "loss": 0.1239, "step": 31990 }, { "epoch": 2.9229082937522834, "grad_norm": 4.436875820159912, "learning_rate": 2.0102679165344142e-08, "loss": 0.1388, "step": 32000 }, { "epoch": 2.923821702594081, "grad_norm": 9.933238983154297, "learning_rate": 1.962944453253568e-08, "loss": 0.1704, "step": 32010 }, { "epoch": 2.924735111435879, "grad_norm": 3.981863498687744, "learning_rate": 1.9161835668606853e-08, "loss": 0.1776, "step": 32020 }, { "epoch": 2.9256485202776763, "grad_norm": 3.1931474208831787, "learning_rate": 1.8699853101763476e-08, "loss": 0.2029, "step": 32030 }, { "epoch": 2.9265619291194738, "grad_norm": 5.394498348236084, "learning_rate": 1.8243497353856444e-08, "loss": 0.2224, "step": 32040 }, { "epoch": 2.9274753379612717, "grad_norm": 3.4285762310028076, "learning_rate": 1.77927689403784e-08, "loss": 0.1485, "step": 32050 }, { "epoch": 2.928388746803069, "grad_norm": 7.215638160705566, "learning_rate": 1.734766837046875e-08, "loss": 0.1424, "step": 32060 }, { "epoch": 2.9293021556448666, "grad_norm": 3.857783079147339, "learning_rate": 1.6908196146906418e-08, "loss": 0.181, "step": 32070 }, { "epoch": 2.930215564486664, "grad_norm": 16.0194149017334, "learning_rate": 1.6474352766114866e-08, "loss": 0.1418, "step": 32080 }, { "epoch": 2.9311289733284616, "grad_norm": 4.420904159545898, "learning_rate": 1.6046138718158745e-08, "loss": 0.1459, "step": 32090 }, { "epoch": 2.9320423821702595, "grad_norm": 6.008378505706787, "learning_rate": 1.5623554486743352e-08, "loss": 0.1575, "step": 32100 }, { "epoch": 2.932955791012057, "grad_norm": 3.4047980308532715, "learning_rate": 1.520660054921519e-08, "loss": 0.1875, "step": 32110 }, { "epoch": 2.933869199853855, "grad_norm": 2.3440921306610107, "learning_rate": 1.4795277376560279e-08, "loss": 0.1868, "step": 32120 }, { "epoch": 2.9347826086956523, "grad_norm": 3.612452507019043, "learning_rate": 1.4389585433404186e-08, "loss": 0.1518, "step": 32130 }, { "epoch": 2.93569601753745, "grad_norm": 4.185311794281006, "learning_rate": 1.3989525178012552e-08, "loss": 0.148, "step": 32140 }, { "epoch": 2.9366094263792473, "grad_norm": 2.9504358768463135, "learning_rate": 1.3595097062288343e-08, "loss": 0.2038, "step": 32150 }, { "epoch": 2.9375228352210447, "grad_norm": 2.922417640686035, "learning_rate": 1.320630153177349e-08, "loss": 0.155, "step": 32160 }, { "epoch": 2.9384362440628427, "grad_norm": 3.9673376083374023, "learning_rate": 1.2823139025646692e-08, "loss": 0.1777, "step": 32170 }, { "epoch": 2.93934965290464, "grad_norm": 3.4108378887176514, "learning_rate": 1.2445609976723395e-08, "loss": 0.2192, "step": 32180 }, { "epoch": 2.9402630617464376, "grad_norm": 4.616886138916016, "learning_rate": 1.207371481145636e-08, "loss": 0.2165, "step": 32190 }, { "epoch": 2.9411764705882355, "grad_norm": 4.887918949127197, "learning_rate": 1.1707453949934555e-08, "loss": 0.1657, "step": 32200 }, { "epoch": 2.942089879430033, "grad_norm": 3.7033891677856445, "learning_rate": 1.1346827805881478e-08, "loss": 0.182, "step": 32210 }, { "epoch": 2.9430032882718304, "grad_norm": 3.4210965633392334, "learning_rate": 1.0991836786656274e-08, "loss": 0.18, "step": 32220 }, { "epoch": 2.943916697113628, "grad_norm": 3.5135724544525146, "learning_rate": 1.0642481293252626e-08, "loss": 0.1754, "step": 32230 }, { "epoch": 2.9448301059554254, "grad_norm": 4.466515064239502, "learning_rate": 1.0298761720298756e-08, "loss": 0.1403, "step": 32240 }, { "epoch": 2.9457435147972233, "grad_norm": 2.8632428646087646, "learning_rate": 9.96067845605575e-09, "loss": 0.1604, "step": 32250 }, { "epoch": 2.9466569236390208, "grad_norm": 6.312026023864746, "learning_rate": 9.628231882419237e-09, "loss": 0.1854, "step": 32260 }, { "epoch": 2.9475703324808187, "grad_norm": 8.13270378112793, "learning_rate": 9.301422374916602e-09, "loss": 0.1641, "step": 32270 }, { "epoch": 2.948483741322616, "grad_norm": 3.8104419708251953, "learning_rate": 8.98025030270866e-09, "loss": 0.1385, "step": 32280 }, { "epoch": 2.9493971501644136, "grad_norm": 2.172661542892456, "learning_rate": 8.664716028586317e-09, "loss": 0.1403, "step": 32290 }, { "epoch": 2.950310559006211, "grad_norm": 2.9265196323394775, "learning_rate": 8.35481990897502e-09, "loss": 0.1709, "step": 32300 }, { "epoch": 2.9512239678480086, "grad_norm": 2.6787109375, "learning_rate": 8.050562293928643e-09, "loss": 0.2012, "step": 32310 }, { "epoch": 2.9521373766898065, "grad_norm": 5.205583572387695, "learning_rate": 7.751943527133932e-09, "loss": 0.1697, "step": 32320 }, { "epoch": 2.953050785531604, "grad_norm": 3.1661856174468994, "learning_rate": 7.458963945906616e-09, "loss": 0.1566, "step": 32330 }, { "epoch": 2.9539641943734014, "grad_norm": 1.5394933223724365, "learning_rate": 7.171623881193079e-09, "loss": 0.1544, "step": 32340 }, { "epoch": 2.9548776032151993, "grad_norm": 1.8122727870941162, "learning_rate": 6.889923657569797e-09, "loss": 0.1568, "step": 32350 }, { "epoch": 2.955791012056997, "grad_norm": 2.9886345863342285, "learning_rate": 6.613863593241676e-09, "loss": 0.1719, "step": 32360 }, { "epoch": 2.9567044208987943, "grad_norm": 2.487652063369751, "learning_rate": 6.343444000042609e-09, "loss": 0.1214, "step": 32370 }, { "epoch": 2.9576178297405917, "grad_norm": 5.858112335205078, "learning_rate": 6.078665183436583e-09, "loss": 0.1966, "step": 32380 }, { "epoch": 2.958531238582389, "grad_norm": 2.68648624420166, "learning_rate": 5.81952744251324e-09, "loss": 0.2057, "step": 32390 }, { "epoch": 2.959444647424187, "grad_norm": 1.9640896320343018, "learning_rate": 5.566031069992872e-09, "loss": 0.1845, "step": 32400 }, { "epoch": 2.9603580562659846, "grad_norm": 1.8511533737182617, "learning_rate": 5.318176352220872e-09, "loss": 0.1766, "step": 32410 }, { "epoch": 2.9612714651077825, "grad_norm": 2.9358420372009277, "learning_rate": 5.075963569172171e-09, "loss": 0.2013, "step": 32420 }, { "epoch": 2.96218487394958, "grad_norm": 2.5210113525390625, "learning_rate": 4.839392994446801e-09, "loss": 0.1335, "step": 32430 }, { "epoch": 2.9630982827913774, "grad_norm": 4.7350921630859375, "learning_rate": 4.6084648952721135e-09, "loss": 0.14, "step": 32440 }, { "epoch": 2.964011691633175, "grad_norm": 7.88287878036499, "learning_rate": 4.383179532502224e-09, "loss": 0.1478, "step": 32450 }, { "epoch": 2.9649251004749724, "grad_norm": 3.110056161880493, "learning_rate": 4.163537160616904e-09, "loss": 0.1549, "step": 32460 }, { "epoch": 2.9658385093167703, "grad_norm": 2.5169878005981445, "learning_rate": 3.949538027721022e-09, "loss": 0.1965, "step": 32470 }, { "epoch": 2.9667519181585678, "grad_norm": 2.535029172897339, "learning_rate": 3.7411823755462155e-09, "loss": 0.1469, "step": 32480 }, { "epoch": 2.9676653270003652, "grad_norm": 1.699958086013794, "learning_rate": 3.538470439448105e-09, "loss": 0.1649, "step": 32490 }, { "epoch": 2.968578735842163, "grad_norm": 2.5142931938171387, "learning_rate": 3.341402448408526e-09, "loss": 0.1638, "step": 32500 }, { "epoch": 2.9694921446839606, "grad_norm": 3.274827241897583, "learning_rate": 3.1499786250321904e-09, "loss": 0.2179, "step": 32510 }, { "epoch": 2.970405553525758, "grad_norm": 3.4348742961883545, "learning_rate": 2.9641991855494656e-09, "loss": 0.159, "step": 32520 }, { "epoch": 2.9713189623675555, "grad_norm": 2.715625047683716, "learning_rate": 2.7840643398152623e-09, "loss": 0.1611, "step": 32530 }, { "epoch": 2.9722323712093535, "grad_norm": 4.382182598114014, "learning_rate": 2.6095742913068155e-09, "loss": 0.1819, "step": 32540 }, { "epoch": 2.973145780051151, "grad_norm": 2.241014242172241, "learning_rate": 2.4407292371270152e-09, "loss": 0.1973, "step": 32550 }, { "epoch": 2.9740591888929484, "grad_norm": 2.328784227371216, "learning_rate": 2.2775293680010746e-09, "loss": 0.1994, "step": 32560 }, { "epoch": 2.9749725977347463, "grad_norm": 3.8127105236053467, "learning_rate": 2.119974868277086e-09, "loss": 0.1815, "step": 32570 }, { "epoch": 2.975886006576544, "grad_norm": 2.441767692565918, "learning_rate": 1.968065915927686e-09, "loss": 0.14, "step": 32580 }, { "epoch": 2.9767994154183413, "grad_norm": 4.922434329986572, "learning_rate": 1.821802682546725e-09, "loss": 0.1661, "step": 32590 }, { "epoch": 2.9777128242601387, "grad_norm": 2.3889098167419434, "learning_rate": 1.6811853333520423e-09, "loss": 0.1786, "step": 32600 }, { "epoch": 2.978626233101936, "grad_norm": 2.6719970703125, "learning_rate": 1.5462140271826908e-09, "loss": 0.1352, "step": 32610 }, { "epoch": 2.979539641943734, "grad_norm": 3.8577189445495605, "learning_rate": 1.416888916501158e-09, "loss": 0.281, "step": 32620 }, { "epoch": 2.9804530507855316, "grad_norm": 2.868408679962158, "learning_rate": 1.2932101473916991e-09, "loss": 0.2089, "step": 32630 }, { "epoch": 2.981366459627329, "grad_norm": 4.791360378265381, "learning_rate": 1.175177859559784e-09, "loss": 0.1694, "step": 32640 }, { "epoch": 2.982279868469127, "grad_norm": 2.602430820465088, "learning_rate": 1.0627921863337608e-09, "loss": 0.1745, "step": 32650 }, { "epoch": 2.9831932773109244, "grad_norm": 7.018901824951172, "learning_rate": 9.560532546637469e-10, "loss": 0.1563, "step": 32660 }, { "epoch": 2.984106686152722, "grad_norm": 4.816333770751953, "learning_rate": 8.549611851199624e-10, "loss": 0.1959, "step": 32670 }, { "epoch": 2.9850200949945194, "grad_norm": 2.108297348022461, "learning_rate": 7.595160918949518e-10, "loss": 0.1345, "step": 32680 }, { "epoch": 2.9859335038363173, "grad_norm": 3.251560926437378, "learning_rate": 6.69718082802473e-10, "loss": 0.1607, "step": 32690 }, { "epoch": 2.9868469126781148, "grad_norm": 2.628570556640625, "learning_rate": 5.855672592774975e-10, "loss": 0.1651, "step": 32700 }, { "epoch": 2.987760321519912, "grad_norm": 5.176426410675049, "learning_rate": 5.070637163756553e-10, "loss": 0.2508, "step": 32710 }, { "epoch": 2.98867373036171, "grad_norm": 4.461970806121826, "learning_rate": 4.3420754277379017e-10, "loss": 0.1223, "step": 32720 }, { "epoch": 2.9895871392035076, "grad_norm": 3.207904100418091, "learning_rate": 3.669988207694042e-10, "loss": 0.1718, "step": 32730 }, { "epoch": 2.990500548045305, "grad_norm": 1.8990813493728638, "learning_rate": 3.054376262801029e-10, "loss": 0.168, "step": 32740 }, { "epoch": 2.9914139568871025, "grad_norm": 6.5263190269470215, "learning_rate": 2.495240288452605e-10, "loss": 0.1854, "step": 32750 }, { "epoch": 2.9923273657289, "grad_norm": 15.536097526550293, "learning_rate": 1.9925809162435472e-10, "loss": 0.161, "step": 32760 }, { "epoch": 2.993240774570698, "grad_norm": 2.1718358993530273, "learning_rate": 1.5463987139696657e-10, "loss": 0.1525, "step": 32770 }, { "epoch": 2.9941541834124954, "grad_norm": 2.910853862762451, "learning_rate": 1.1566941856333557e-10, "loss": 0.1892, "step": 32780 }, { "epoch": 2.995067592254293, "grad_norm": 2.90128493309021, "learning_rate": 8.234677714435979e-11, "loss": 0.1497, "step": 32790 }, { "epoch": 2.995981001096091, "grad_norm": 3.8087680339813232, "learning_rate": 5.467198478048552e-11, "loss": 0.157, "step": 32800 }, { "epoch": 2.9968944099378882, "grad_norm": 2.825289011001587, "learning_rate": 3.2645072732262474e-11, "loss": 0.2077, "step": 32810 }, { "epoch": 2.9978078187796857, "grad_norm": 6.990581512451172, "learning_rate": 1.6266065882564186e-11, "loss": 0.1712, "step": 32820 }, { "epoch": 2.998721227621483, "grad_norm": 9.954340934753418, "learning_rate": 5.5349827315920316e-12, "loss": 0.1891, "step": 32830 }, { "epoch": 2.999634636463281, "grad_norm": 3.87980055809021, "learning_rate": 4.5183540131610306e-13, "loss": 0.1546, "step": 32840 }, { "epoch": 3.0, "step": 32844, "total_flos": 4.914745925118722e+18, "train_loss": 0.22788219073667784, "train_runtime": 14032.2828, "train_samples_per_second": 9.362, "train_steps_per_second": 2.341 } ], "logging_steps": 10, "max_steps": 32844, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.914745925118722e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }