{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6490066225165565, "eval_steps": 500, "global_step": 22000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012040939193257074, "grad_norm": 9.316359519958496, "learning_rate": 3.0102347983142685e-09, "loss": 0.9861, "step": 10 }, { "epoch": 0.002408187838651415, "grad_norm": 9.306455612182617, "learning_rate": 6.020469596628537e-09, "loss": 0.9917, "step": 20 }, { "epoch": 0.003612281757977122, "grad_norm": 10.237154006958008, "learning_rate": 9.030704394942806e-09, "loss": 0.9875, "step": 30 }, { "epoch": 0.00481637567730283, "grad_norm": 10.9087553024292, "learning_rate": 1.2040939193257074e-08, "loss": 1.0501, "step": 40 }, { "epoch": 0.006020469596628537, "grad_norm": 8.430444717407227, "learning_rate": 1.5051173991571343e-08, "loss": 0.9665, "step": 50 }, { "epoch": 0.007224563515954244, "grad_norm": 8.640472412109375, "learning_rate": 1.8061408789885613e-08, "loss": 1.0249, "step": 60 }, { "epoch": 0.008428657435279952, "grad_norm": 8.656485557556152, "learning_rate": 2.107164358819988e-08, "loss": 0.9942, "step": 70 }, { "epoch": 0.00963275135460566, "grad_norm": 11.827771186828613, "learning_rate": 2.4081878386514148e-08, "loss": 1.0359, "step": 80 }, { "epoch": 0.010836845273931367, "grad_norm": 7.204784870147705, "learning_rate": 2.7092113184828417e-08, "loss": 0.9532, "step": 90 }, { "epoch": 0.012040939193257074, "grad_norm": 7.546351432800293, "learning_rate": 3.010234798314269e-08, "loss": 1.0236, "step": 100 }, { "epoch": 0.013245033112582781, "grad_norm": 10.156384468078613, "learning_rate": 3.311258278145695e-08, "loss": 1.0031, "step": 110 }, { "epoch": 0.014449127031908489, "grad_norm": 9.753625869750977, "learning_rate": 3.6122817579771225e-08, "loss": 1.016, "step": 120 }, { "epoch": 0.015653220951234198, "grad_norm": 8.019911766052246, "learning_rate": 3.9133052378085485e-08, "loss": 0.9972, "step": 130 }, { "epoch": 0.016857314870559904, "grad_norm": 8.1049165725708, "learning_rate": 4.214328717639976e-08, "loss": 0.984, "step": 140 }, { "epoch": 0.018061408789885613, "grad_norm": 8.9203462600708, "learning_rate": 4.5153521974714023e-08, "loss": 0.9811, "step": 150 }, { "epoch": 0.01926550270921132, "grad_norm": 8.825779914855957, "learning_rate": 4.8163756773028296e-08, "loss": 1.0207, "step": 160 }, { "epoch": 0.020469596628537028, "grad_norm": 8.64220142364502, "learning_rate": 5.117399157134256e-08, "loss": 0.9971, "step": 170 }, { "epoch": 0.021673690547862733, "grad_norm": 8.120660781860352, "learning_rate": 5.4184226369656835e-08, "loss": 0.9606, "step": 180 }, { "epoch": 0.022877784467188442, "grad_norm": 8.181641578674316, "learning_rate": 5.71944611679711e-08, "loss": 0.9516, "step": 190 }, { "epoch": 0.024081878386514148, "grad_norm": 7.607439994812012, "learning_rate": 6.020469596628537e-08, "loss": 0.9919, "step": 200 }, { "epoch": 0.025285972305839857, "grad_norm": 7.434635162353516, "learning_rate": 6.321493076459963e-08, "loss": 0.9991, "step": 210 }, { "epoch": 0.026490066225165563, "grad_norm": 7.569486141204834, "learning_rate": 6.62251655629139e-08, "loss": 0.984, "step": 220 }, { "epoch": 0.027694160144491272, "grad_norm": 7.499971389770508, "learning_rate": 6.923540036122818e-08, "loss": 0.9598, "step": 230 }, { "epoch": 0.028898254063816978, "grad_norm": 6.992701053619385, "learning_rate": 7.224563515954245e-08, "loss": 0.905, "step": 240 }, { "epoch": 0.030102347983142687, "grad_norm": 6.3157877922058105, "learning_rate": 7.525586995785671e-08, "loss": 0.9493, "step": 250 }, { "epoch": 0.031306441902468396, "grad_norm": 6.263482570648193, "learning_rate": 7.826610475617097e-08, "loss": 0.9501, "step": 260 }, { "epoch": 0.0325105358217941, "grad_norm": 6.178393840789795, "learning_rate": 8.127633955448524e-08, "loss": 0.9056, "step": 270 }, { "epoch": 0.03371462974111981, "grad_norm": 4.896974086761475, "learning_rate": 8.428657435279951e-08, "loss": 0.8679, "step": 280 }, { "epoch": 0.034918723660445516, "grad_norm": 5.896145820617676, "learning_rate": 8.729680915111379e-08, "loss": 0.8658, "step": 290 }, { "epoch": 0.036122817579771226, "grad_norm": 5.6855573654174805, "learning_rate": 9.030704394942805e-08, "loss": 0.9227, "step": 300 }, { "epoch": 0.03732691149909693, "grad_norm": 4.907613277435303, "learning_rate": 9.331727874774232e-08, "loss": 0.8581, "step": 310 }, { "epoch": 0.03853100541842264, "grad_norm": 6.029637336730957, "learning_rate": 9.632751354605659e-08, "loss": 0.8184, "step": 320 }, { "epoch": 0.039735099337748346, "grad_norm": 5.0958333015441895, "learning_rate": 9.933774834437085e-08, "loss": 0.8524, "step": 330 }, { "epoch": 0.040939193257074055, "grad_norm": 6.18320369720459, "learning_rate": 1.0234798314268512e-07, "loss": 0.8371, "step": 340 }, { "epoch": 0.04214328717639976, "grad_norm": 4.874738693237305, "learning_rate": 1.0535821794099938e-07, "loss": 0.8348, "step": 350 }, { "epoch": 0.04334738109572547, "grad_norm": 5.273070812225342, "learning_rate": 1.0836845273931367e-07, "loss": 0.8286, "step": 360 }, { "epoch": 0.044551475015051176, "grad_norm": 5.052524089813232, "learning_rate": 1.1137868753762793e-07, "loss": 0.7585, "step": 370 }, { "epoch": 0.045755568934376885, "grad_norm": 4.216408729553223, "learning_rate": 1.143889223359422e-07, "loss": 0.7869, "step": 380 }, { "epoch": 0.04695966285370259, "grad_norm": 5.456339359283447, "learning_rate": 1.1739915713425646e-07, "loss": 0.7699, "step": 390 }, { "epoch": 0.048163756773028296, "grad_norm": 4.809760093688965, "learning_rate": 1.2040939193257075e-07, "loss": 0.763, "step": 400 }, { "epoch": 0.049367850692354005, "grad_norm": 4.933152675628662, "learning_rate": 1.23419626730885e-07, "loss": 0.7192, "step": 410 }, { "epoch": 0.050571944611679714, "grad_norm": 5.025005340576172, "learning_rate": 1.2642986152919927e-07, "loss": 0.7092, "step": 420 }, { "epoch": 0.05177603853100542, "grad_norm": 4.338512420654297, "learning_rate": 1.2944009632751355e-07, "loss": 0.7346, "step": 430 }, { "epoch": 0.052980132450331126, "grad_norm": 4.557036399841309, "learning_rate": 1.324503311258278e-07, "loss": 0.723, "step": 440 }, { "epoch": 0.054184226369656835, "grad_norm": 4.911799907684326, "learning_rate": 1.3546056592414207e-07, "loss": 0.7673, "step": 450 }, { "epoch": 0.055388320288982544, "grad_norm": 4.063588619232178, "learning_rate": 1.3847080072245636e-07, "loss": 0.708, "step": 460 }, { "epoch": 0.056592414208308246, "grad_norm": 4.037914752960205, "learning_rate": 1.4148103552077062e-07, "loss": 0.7641, "step": 470 }, { "epoch": 0.057796508127633955, "grad_norm": 4.673463344573975, "learning_rate": 1.444912703190849e-07, "loss": 0.7196, "step": 480 }, { "epoch": 0.059000602046959665, "grad_norm": 5.096141815185547, "learning_rate": 1.4750150511739913e-07, "loss": 0.689, "step": 490 }, { "epoch": 0.060204695966285374, "grad_norm": 4.904088973999023, "learning_rate": 1.5051173991571342e-07, "loss": 0.7211, "step": 500 }, { "epoch": 0.061408789885611076, "grad_norm": 5.234721660614014, "learning_rate": 1.535219747140277e-07, "loss": 0.7091, "step": 510 }, { "epoch": 0.06261288380493679, "grad_norm": 4.105505466461182, "learning_rate": 1.5653220951234194e-07, "loss": 0.7607, "step": 520 }, { "epoch": 0.0638169777242625, "grad_norm": 4.666725158691406, "learning_rate": 1.5954244431065622e-07, "loss": 0.7158, "step": 530 }, { "epoch": 0.0650210716435882, "grad_norm": 4.976656436920166, "learning_rate": 1.6255267910897048e-07, "loss": 0.7401, "step": 540 }, { "epoch": 0.06622516556291391, "grad_norm": 5.044974327087402, "learning_rate": 1.6556291390728477e-07, "loss": 0.6685, "step": 550 }, { "epoch": 0.06742925948223961, "grad_norm": 4.7259368896484375, "learning_rate": 1.6857314870559903e-07, "loss": 0.6866, "step": 560 }, { "epoch": 0.06863335340156532, "grad_norm": 5.358945369720459, "learning_rate": 1.715833835039133e-07, "loss": 0.7059, "step": 570 }, { "epoch": 0.06983744732089103, "grad_norm": 5.156592845916748, "learning_rate": 1.7459361830222757e-07, "loss": 0.7111, "step": 580 }, { "epoch": 0.07104154124021674, "grad_norm": 4.320924282073975, "learning_rate": 1.7760385310054183e-07, "loss": 0.6862, "step": 590 }, { "epoch": 0.07224563515954245, "grad_norm": 4.59999418258667, "learning_rate": 1.806140878988561e-07, "loss": 0.6893, "step": 600 }, { "epoch": 0.07344972907886815, "grad_norm": 4.4846391677856445, "learning_rate": 1.8362432269717038e-07, "loss": 0.7128, "step": 610 }, { "epoch": 0.07465382299819386, "grad_norm": 5.029007911682129, "learning_rate": 1.8663455749548464e-07, "loss": 0.7114, "step": 620 }, { "epoch": 0.07585791691751957, "grad_norm": 4.288726806640625, "learning_rate": 1.896447922937989e-07, "loss": 0.6848, "step": 630 }, { "epoch": 0.07706201083684527, "grad_norm": 4.063099384307861, "learning_rate": 1.9265502709211318e-07, "loss": 0.659, "step": 640 }, { "epoch": 0.07826610475617098, "grad_norm": 4.031120300292969, "learning_rate": 1.9566526189042744e-07, "loss": 0.6802, "step": 650 }, { "epoch": 0.07947019867549669, "grad_norm": 5.234511852264404, "learning_rate": 1.986754966887417e-07, "loss": 0.6685, "step": 660 }, { "epoch": 0.0806742925948224, "grad_norm": 5.434250831604004, "learning_rate": 2.01685731487056e-07, "loss": 0.6762, "step": 670 }, { "epoch": 0.08187838651414811, "grad_norm": 5.326634407043457, "learning_rate": 2.0469596628537025e-07, "loss": 0.6951, "step": 680 }, { "epoch": 0.08308248043347381, "grad_norm": 3.630930185317993, "learning_rate": 2.0770620108368453e-07, "loss": 0.6445, "step": 690 }, { "epoch": 0.08428657435279951, "grad_norm": 5.273288726806641, "learning_rate": 2.1071643588199877e-07, "loss": 0.65, "step": 700 }, { "epoch": 0.08549066827212523, "grad_norm": 4.212562084197998, "learning_rate": 2.1372667068031305e-07, "loss": 0.6485, "step": 710 }, { "epoch": 0.08669476219145093, "grad_norm": 4.293779373168945, "learning_rate": 2.1673690547862734e-07, "loss": 0.6734, "step": 720 }, { "epoch": 0.08789885611077664, "grad_norm": 4.917520999908447, "learning_rate": 2.1974714027694157e-07, "loss": 0.6593, "step": 730 }, { "epoch": 0.08910295003010235, "grad_norm": 4.624716281890869, "learning_rate": 2.2275737507525586e-07, "loss": 0.6712, "step": 740 }, { "epoch": 0.09030704394942805, "grad_norm": 5.3648552894592285, "learning_rate": 2.2576760987357014e-07, "loss": 0.6481, "step": 750 }, { "epoch": 0.09151113786875377, "grad_norm": 4.328650951385498, "learning_rate": 2.287778446718844e-07, "loss": 0.6418, "step": 760 }, { "epoch": 0.09271523178807947, "grad_norm": 4.933085918426514, "learning_rate": 2.3178807947019866e-07, "loss": 0.6622, "step": 770 }, { "epoch": 0.09391932570740517, "grad_norm": 4.703038215637207, "learning_rate": 2.3479831426851292e-07, "loss": 0.6317, "step": 780 }, { "epoch": 0.09512341962673089, "grad_norm": 4.468968391418457, "learning_rate": 2.378085490668272e-07, "loss": 0.6431, "step": 790 }, { "epoch": 0.09632751354605659, "grad_norm": 4.49053430557251, "learning_rate": 2.408187838651415e-07, "loss": 0.6212, "step": 800 }, { "epoch": 0.0975316074653823, "grad_norm": 4.350528240203857, "learning_rate": 2.438290186634557e-07, "loss": 0.623, "step": 810 }, { "epoch": 0.09873570138470801, "grad_norm": 4.772309303283691, "learning_rate": 2.4683925346177e-07, "loss": 0.6268, "step": 820 }, { "epoch": 0.09993979530403371, "grad_norm": 5.437624931335449, "learning_rate": 2.498494882600843e-07, "loss": 0.6497, "step": 830 }, { "epoch": 0.10114388922335943, "grad_norm": 4.474155902862549, "learning_rate": 2.5285972305839853e-07, "loss": 0.6092, "step": 840 }, { "epoch": 0.10234798314268513, "grad_norm": 4.417370796203613, "learning_rate": 2.558699578567128e-07, "loss": 0.6587, "step": 850 }, { "epoch": 0.10355207706201083, "grad_norm": 5.5498456954956055, "learning_rate": 2.588801926550271e-07, "loss": 0.6564, "step": 860 }, { "epoch": 0.10475617098133655, "grad_norm": 5.326292514801025, "learning_rate": 2.6189042745334134e-07, "loss": 0.6333, "step": 870 }, { "epoch": 0.10596026490066225, "grad_norm": 4.284069538116455, "learning_rate": 2.649006622516556e-07, "loss": 0.6325, "step": 880 }, { "epoch": 0.10716435881998795, "grad_norm": 4.672844886779785, "learning_rate": 2.679108970499699e-07, "loss": 0.6046, "step": 890 }, { "epoch": 0.10836845273931367, "grad_norm": 4.223860740661621, "learning_rate": 2.7092113184828414e-07, "loss": 0.6094, "step": 900 }, { "epoch": 0.10957254665863937, "grad_norm": 4.813838005065918, "learning_rate": 2.739313666465984e-07, "loss": 0.6332, "step": 910 }, { "epoch": 0.11077664057796509, "grad_norm": 3.5245296955108643, "learning_rate": 2.769416014449127e-07, "loss": 0.6161, "step": 920 }, { "epoch": 0.11198073449729079, "grad_norm": 4.577372074127197, "learning_rate": 2.7995183624322695e-07, "loss": 0.6254, "step": 930 }, { "epoch": 0.11318482841661649, "grad_norm": 4.295224666595459, "learning_rate": 2.8296207104154123e-07, "loss": 0.6089, "step": 940 }, { "epoch": 0.11438892233594221, "grad_norm": 4.899755477905273, "learning_rate": 2.8597230583985546e-07, "loss": 0.6255, "step": 950 }, { "epoch": 0.11559301625526791, "grad_norm": 5.047530651092529, "learning_rate": 2.889825406381698e-07, "loss": 0.6273, "step": 960 }, { "epoch": 0.11679711017459361, "grad_norm": 4.75305700302124, "learning_rate": 2.9199277543648404e-07, "loss": 0.6277, "step": 970 }, { "epoch": 0.11800120409391933, "grad_norm": 5.476251602172852, "learning_rate": 2.9500301023479827e-07, "loss": 0.6121, "step": 980 }, { "epoch": 0.11920529801324503, "grad_norm": 6.20451021194458, "learning_rate": 2.980132450331126e-07, "loss": 0.6121, "step": 990 }, { "epoch": 0.12040939193257075, "grad_norm": 4.61058235168457, "learning_rate": 3.0102347983142684e-07, "loss": 0.5862, "step": 1000 }, { "epoch": 0.12161348585189645, "grad_norm": 4.537725925445557, "learning_rate": 3.0403371462974107e-07, "loss": 0.6186, "step": 1010 }, { "epoch": 0.12281757977122215, "grad_norm": 4.347688674926758, "learning_rate": 3.070439494280554e-07, "loss": 0.6127, "step": 1020 }, { "epoch": 0.12402167369054787, "grad_norm": 4.965167045593262, "learning_rate": 3.1005418422636965e-07, "loss": 0.6026, "step": 1030 }, { "epoch": 0.12522576760987358, "grad_norm": 4.610491752624512, "learning_rate": 3.130644190246839e-07, "loss": 0.6327, "step": 1040 }, { "epoch": 0.12642986152919927, "grad_norm": 5.292304039001465, "learning_rate": 3.160746538229982e-07, "loss": 0.5972, "step": 1050 }, { "epoch": 0.127633955448525, "grad_norm": 6.372762680053711, "learning_rate": 3.1908488862131245e-07, "loss": 0.6393, "step": 1060 }, { "epoch": 0.1288380493678507, "grad_norm": 5.56066370010376, "learning_rate": 3.220951234196267e-07, "loss": 0.6196, "step": 1070 }, { "epoch": 0.1300421432871764, "grad_norm": 4.777896881103516, "learning_rate": 3.2510535821794097e-07, "loss": 0.6217, "step": 1080 }, { "epoch": 0.1312462372065021, "grad_norm": 4.9745683670043945, "learning_rate": 3.2811559301625525e-07, "loss": 0.6189, "step": 1090 }, { "epoch": 0.13245033112582782, "grad_norm": 3.71576189994812, "learning_rate": 3.3112582781456954e-07, "loss": 0.5893, "step": 1100 }, { "epoch": 0.1336544250451535, "grad_norm": 4.458312034606934, "learning_rate": 3.3413606261288377e-07, "loss": 0.5951, "step": 1110 }, { "epoch": 0.13485851896447923, "grad_norm": 4.835500240325928, "learning_rate": 3.3714629741119806e-07, "loss": 0.5791, "step": 1120 }, { "epoch": 0.13606261288380495, "grad_norm": 4.516515254974365, "learning_rate": 3.4015653220951235e-07, "loss": 0.5759, "step": 1130 }, { "epoch": 0.13726670680313063, "grad_norm": 5.564052104949951, "learning_rate": 3.431667670078266e-07, "loss": 0.5791, "step": 1140 }, { "epoch": 0.13847080072245635, "grad_norm": 5.586264610290527, "learning_rate": 3.4617700180614086e-07, "loss": 0.6142, "step": 1150 }, { "epoch": 0.13967489464178207, "grad_norm": 4.408708572387695, "learning_rate": 3.4918723660445515e-07, "loss": 0.617, "step": 1160 }, { "epoch": 0.14087898856110775, "grad_norm": 4.4068403244018555, "learning_rate": 3.521974714027694e-07, "loss": 0.6099, "step": 1170 }, { "epoch": 0.14208308248043347, "grad_norm": 3.947399854660034, "learning_rate": 3.5520770620108367e-07, "loss": 0.5555, "step": 1180 }, { "epoch": 0.1432871763997592, "grad_norm": 5.264540195465088, "learning_rate": 3.5821794099939795e-07, "loss": 0.5556, "step": 1190 }, { "epoch": 0.1444912703190849, "grad_norm": 4.486605644226074, "learning_rate": 3.612281757977122e-07, "loss": 0.5997, "step": 1200 }, { "epoch": 0.1456953642384106, "grad_norm": 6.195891857147217, "learning_rate": 3.642384105960264e-07, "loss": 0.6104, "step": 1210 }, { "epoch": 0.1468994581577363, "grad_norm": 4.5443572998046875, "learning_rate": 3.6724864539434076e-07, "loss": 0.5806, "step": 1220 }, { "epoch": 0.14810355207706202, "grad_norm": 4.380715370178223, "learning_rate": 3.70258880192655e-07, "loss": 0.5759, "step": 1230 }, { "epoch": 0.1493076459963877, "grad_norm": 5.033191680908203, "learning_rate": 3.732691149909693e-07, "loss": 0.5782, "step": 1240 }, { "epoch": 0.15051173991571343, "grad_norm": 4.244385719299316, "learning_rate": 3.7627934978928356e-07, "loss": 0.5658, "step": 1250 }, { "epoch": 0.15171583383503914, "grad_norm": 4.332985877990723, "learning_rate": 3.792895845875978e-07, "loss": 0.5702, "step": 1260 }, { "epoch": 0.15291992775436483, "grad_norm": 4.5175628662109375, "learning_rate": 3.822998193859121e-07, "loss": 0.5588, "step": 1270 }, { "epoch": 0.15412402167369055, "grad_norm": 4.519990921020508, "learning_rate": 3.8531005418422637e-07, "loss": 0.5871, "step": 1280 }, { "epoch": 0.15532811559301626, "grad_norm": 4.500414848327637, "learning_rate": 3.883202889825406e-07, "loss": 0.5977, "step": 1290 }, { "epoch": 0.15653220951234195, "grad_norm": 4.714526653289795, "learning_rate": 3.913305237808549e-07, "loss": 0.5647, "step": 1300 }, { "epoch": 0.15773630343166767, "grad_norm": 4.869201183319092, "learning_rate": 3.9434075857916917e-07, "loss": 0.5816, "step": 1310 }, { "epoch": 0.15894039735099338, "grad_norm": 5.167849540710449, "learning_rate": 3.973509933774834e-07, "loss": 0.5633, "step": 1320 }, { "epoch": 0.16014449127031907, "grad_norm": 4.805886745452881, "learning_rate": 4.003612281757977e-07, "loss": 0.5858, "step": 1330 }, { "epoch": 0.1613485851896448, "grad_norm": 4.569708824157715, "learning_rate": 4.03371462974112e-07, "loss": 0.5729, "step": 1340 }, { "epoch": 0.1625526791089705, "grad_norm": 4.649074554443359, "learning_rate": 4.0638169777242626e-07, "loss": 0.5904, "step": 1350 }, { "epoch": 0.16375677302829622, "grad_norm": 4.956695556640625, "learning_rate": 4.093919325707405e-07, "loss": 0.5743, "step": 1360 }, { "epoch": 0.1649608669476219, "grad_norm": 5.056834697723389, "learning_rate": 4.1240216736905473e-07, "loss": 0.5903, "step": 1370 }, { "epoch": 0.16616496086694763, "grad_norm": 4.751232624053955, "learning_rate": 4.1541240216736907e-07, "loss": 0.5697, "step": 1380 }, { "epoch": 0.16736905478627334, "grad_norm": 4.0161027908325195, "learning_rate": 4.184226369656833e-07, "loss": 0.5588, "step": 1390 }, { "epoch": 0.16857314870559903, "grad_norm": 4.591194152832031, "learning_rate": 4.2143287176399753e-07, "loss": 0.5792, "step": 1400 }, { "epoch": 0.16977724262492475, "grad_norm": 5.218972206115723, "learning_rate": 4.2444310656231187e-07, "loss": 0.5793, "step": 1410 }, { "epoch": 0.17098133654425046, "grad_norm": 4.32102108001709, "learning_rate": 4.274533413606261e-07, "loss": 0.57, "step": 1420 }, { "epoch": 0.17218543046357615, "grad_norm": 4.359175205230713, "learning_rate": 4.3046357615894034e-07, "loss": 0.5675, "step": 1430 }, { "epoch": 0.17338952438290187, "grad_norm": 5.192026615142822, "learning_rate": 4.334738109572547e-07, "loss": 0.5668, "step": 1440 }, { "epoch": 0.17459361830222758, "grad_norm": 4.002780914306641, "learning_rate": 4.364840457555689e-07, "loss": 0.5787, "step": 1450 }, { "epoch": 0.17579771222155327, "grad_norm": 5.319111347198486, "learning_rate": 4.3949428055388314e-07, "loss": 0.5734, "step": 1460 }, { "epoch": 0.177001806140879, "grad_norm": 4.700523376464844, "learning_rate": 4.425045153521975e-07, "loss": 0.5754, "step": 1470 }, { "epoch": 0.1782059000602047, "grad_norm": 4.4386372566223145, "learning_rate": 4.455147501505117e-07, "loss": 0.5459, "step": 1480 }, { "epoch": 0.1794099939795304, "grad_norm": 4.084826946258545, "learning_rate": 4.48524984948826e-07, "loss": 0.5399, "step": 1490 }, { "epoch": 0.1806140878988561, "grad_norm": 4.401342391967773, "learning_rate": 4.515352197471403e-07, "loss": 0.573, "step": 1500 }, { "epoch": 0.18181818181818182, "grad_norm": 4.5059685707092285, "learning_rate": 4.545454545454545e-07, "loss": 0.5724, "step": 1510 }, { "epoch": 0.18302227573750754, "grad_norm": 5.070437431335449, "learning_rate": 4.575556893437688e-07, "loss": 0.5711, "step": 1520 }, { "epoch": 0.18422636965683323, "grad_norm": 4.188956260681152, "learning_rate": 4.6056592414208304e-07, "loss": 0.5498, "step": 1530 }, { "epoch": 0.18543046357615894, "grad_norm": 4.391158580780029, "learning_rate": 4.635761589403973e-07, "loss": 0.5602, "step": 1540 }, { "epoch": 0.18663455749548466, "grad_norm": 5.272259712219238, "learning_rate": 4.665863937387116e-07, "loss": 0.5748, "step": 1550 }, { "epoch": 0.18783865141481035, "grad_norm": 4.982473373413086, "learning_rate": 4.6959662853702584e-07, "loss": 0.5584, "step": 1560 }, { "epoch": 0.18904274533413606, "grad_norm": 5.263506889343262, "learning_rate": 4.7260686333534013e-07, "loss": 0.5828, "step": 1570 }, { "epoch": 0.19024683925346178, "grad_norm": 4.1373724937438965, "learning_rate": 4.756170981336544e-07, "loss": 0.5494, "step": 1580 }, { "epoch": 0.19145093317278747, "grad_norm": 4.439697265625, "learning_rate": 4.786273329319686e-07, "loss": 0.5522, "step": 1590 }, { "epoch": 0.19265502709211318, "grad_norm": 4.79713249206543, "learning_rate": 4.81637567730283e-07, "loss": 0.5058, "step": 1600 }, { "epoch": 0.1938591210114389, "grad_norm": 3.973453998565674, "learning_rate": 4.846478025285972e-07, "loss": 0.5471, "step": 1610 }, { "epoch": 0.1950632149307646, "grad_norm": 4.748741149902344, "learning_rate": 4.876580373269115e-07, "loss": 0.5768, "step": 1620 }, { "epoch": 0.1962673088500903, "grad_norm": 5.98441743850708, "learning_rate": 4.906682721252258e-07, "loss": 0.5497, "step": 1630 }, { "epoch": 0.19747140276941602, "grad_norm": 5.55325174331665, "learning_rate": 4.9367850692354e-07, "loss": 0.5595, "step": 1640 }, { "epoch": 0.1986754966887417, "grad_norm": 5.114386081695557, "learning_rate": 4.966887417218543e-07, "loss": 0.5635, "step": 1650 }, { "epoch": 0.19987959060806743, "grad_norm": 4.869389533996582, "learning_rate": 4.996989765201686e-07, "loss": 0.5409, "step": 1660 }, { "epoch": 0.20108368452739314, "grad_norm": 4.4507222175598145, "learning_rate": 5.027092113184828e-07, "loss": 0.5598, "step": 1670 }, { "epoch": 0.20228777844671886, "grad_norm": 4.574100494384766, "learning_rate": 5.057194461167971e-07, "loss": 0.5432, "step": 1680 }, { "epoch": 0.20349187236604455, "grad_norm": 4.581476211547852, "learning_rate": 5.087296809151114e-07, "loss": 0.5509, "step": 1690 }, { "epoch": 0.20469596628537026, "grad_norm": 4.631548881530762, "learning_rate": 5.117399157134256e-07, "loss": 0.5712, "step": 1700 }, { "epoch": 0.20590006020469598, "grad_norm": 5.006454944610596, "learning_rate": 5.147501505117399e-07, "loss": 0.5586, "step": 1710 }, { "epoch": 0.20710415412402167, "grad_norm": 4.4788408279418945, "learning_rate": 5.177603853100542e-07, "loss": 0.5543, "step": 1720 }, { "epoch": 0.20830824804334738, "grad_norm": 4.614450931549072, "learning_rate": 5.207706201083684e-07, "loss": 0.5677, "step": 1730 }, { "epoch": 0.2095123419626731, "grad_norm": 4.377712249755859, "learning_rate": 5.237808549066827e-07, "loss": 0.5399, "step": 1740 }, { "epoch": 0.2107164358819988, "grad_norm": 6.157577991485596, "learning_rate": 5.26791089704997e-07, "loss": 0.5288, "step": 1750 }, { "epoch": 0.2119205298013245, "grad_norm": 4.206299781799316, "learning_rate": 5.298013245033112e-07, "loss": 0.5308, "step": 1760 }, { "epoch": 0.21312462372065022, "grad_norm": 4.296496868133545, "learning_rate": 5.328115593016255e-07, "loss": 0.552, "step": 1770 }, { "epoch": 0.2143287176399759, "grad_norm": 4.474640846252441, "learning_rate": 5.358217940999398e-07, "loss": 0.5505, "step": 1780 }, { "epoch": 0.21553281155930162, "grad_norm": 4.762406349182129, "learning_rate": 5.38832028898254e-07, "loss": 0.5669, "step": 1790 }, { "epoch": 0.21673690547862734, "grad_norm": 4.40052604675293, "learning_rate": 5.418422636965683e-07, "loss": 0.5386, "step": 1800 }, { "epoch": 0.21794099939795303, "grad_norm": 4.364424228668213, "learning_rate": 5.448524984948826e-07, "loss": 0.5446, "step": 1810 }, { "epoch": 0.21914509331727874, "grad_norm": 5.686670780181885, "learning_rate": 5.478627332931969e-07, "loss": 0.5708, "step": 1820 }, { "epoch": 0.22034918723660446, "grad_norm": 6.244655132293701, "learning_rate": 5.508729680915111e-07, "loss": 0.5353, "step": 1830 }, { "epoch": 0.22155328115593018, "grad_norm": 5.4936323165893555, "learning_rate": 5.538832028898254e-07, "loss": 0.5486, "step": 1840 }, { "epoch": 0.22275737507525586, "grad_norm": 4.955344200134277, "learning_rate": 5.568934376881397e-07, "loss": 0.5142, "step": 1850 }, { "epoch": 0.22396146899458158, "grad_norm": 4.333896636962891, "learning_rate": 5.599036724864539e-07, "loss": 0.5432, "step": 1860 }, { "epoch": 0.2251655629139073, "grad_norm": 4.568367958068848, "learning_rate": 5.629139072847681e-07, "loss": 0.5351, "step": 1870 }, { "epoch": 0.22636965683323299, "grad_norm": 5.548391342163086, "learning_rate": 5.659241420830825e-07, "loss": 0.5053, "step": 1880 }, { "epoch": 0.2275737507525587, "grad_norm": 4.526470184326172, "learning_rate": 5.689343768813967e-07, "loss": 0.5494, "step": 1890 }, { "epoch": 0.22877784467188442, "grad_norm": 4.453249454498291, "learning_rate": 5.719446116797109e-07, "loss": 0.5397, "step": 1900 }, { "epoch": 0.2299819385912101, "grad_norm": 7.503538131713867, "learning_rate": 5.749548464780253e-07, "loss": 0.5232, "step": 1910 }, { "epoch": 0.23118603251053582, "grad_norm": 5.740428924560547, "learning_rate": 5.779650812763396e-07, "loss": 0.5426, "step": 1920 }, { "epoch": 0.23239012642986154, "grad_norm": 5.185967445373535, "learning_rate": 5.809753160746537e-07, "loss": 0.5277, "step": 1930 }, { "epoch": 0.23359422034918723, "grad_norm": 5.1867547035217285, "learning_rate": 5.839855508729681e-07, "loss": 0.5233, "step": 1940 }, { "epoch": 0.23479831426851294, "grad_norm": 4.812213897705078, "learning_rate": 5.869957856712824e-07, "loss": 0.535, "step": 1950 }, { "epoch": 0.23600240818783866, "grad_norm": 5.038625240325928, "learning_rate": 5.900060204695965e-07, "loss": 0.5365, "step": 1960 }, { "epoch": 0.23720650210716435, "grad_norm": 4.050044536590576, "learning_rate": 5.930162552679109e-07, "loss": 0.5145, "step": 1970 }, { "epoch": 0.23841059602649006, "grad_norm": 4.956125736236572, "learning_rate": 5.960264900662252e-07, "loss": 0.5141, "step": 1980 }, { "epoch": 0.23961468994581578, "grad_norm": 4.40023136138916, "learning_rate": 5.990367248645393e-07, "loss": 0.544, "step": 1990 }, { "epoch": 0.2408187838651415, "grad_norm": 5.268930912017822, "learning_rate": 6.020469596628537e-07, "loss": 0.5514, "step": 2000 }, { "epoch": 0.24202287778446718, "grad_norm": 3.9441418647766113, "learning_rate": 6.05057194461168e-07, "loss": 0.5368, "step": 2010 }, { "epoch": 0.2432269717037929, "grad_norm": 4.060418605804443, "learning_rate": 6.080674292594821e-07, "loss": 0.5228, "step": 2020 }, { "epoch": 0.24443106562311862, "grad_norm": 4.1477861404418945, "learning_rate": 6.110776640577965e-07, "loss": 0.5221, "step": 2030 }, { "epoch": 0.2456351595424443, "grad_norm": 5.319125175476074, "learning_rate": 6.140878988561108e-07, "loss": 0.5441, "step": 2040 }, { "epoch": 0.24683925346177002, "grad_norm": 4.920033931732178, "learning_rate": 6.17098133654425e-07, "loss": 0.5307, "step": 2050 }, { "epoch": 0.24804334738109574, "grad_norm": 5.167773246765137, "learning_rate": 6.201083684527393e-07, "loss": 0.5304, "step": 2060 }, { "epoch": 0.24924744130042142, "grad_norm": 5.3018879890441895, "learning_rate": 6.231186032510536e-07, "loss": 0.5356, "step": 2070 }, { "epoch": 0.25045153521974717, "grad_norm": 4.822166919708252, "learning_rate": 6.261288380493678e-07, "loss": 0.513, "step": 2080 }, { "epoch": 0.25165562913907286, "grad_norm": 4.957582473754883, "learning_rate": 6.291390728476821e-07, "loss": 0.5069, "step": 2090 }, { "epoch": 0.25285972305839854, "grad_norm": 6.180065155029297, "learning_rate": 6.321493076459964e-07, "loss": 0.5329, "step": 2100 }, { "epoch": 0.2540638169777243, "grad_norm": 5.123517990112305, "learning_rate": 6.351595424443106e-07, "loss": 0.5169, "step": 2110 }, { "epoch": 0.25526791089705, "grad_norm": 5.372180938720703, "learning_rate": 6.381697772426249e-07, "loss": 0.508, "step": 2120 }, { "epoch": 0.25647200481637566, "grad_norm": 3.907548189163208, "learning_rate": 6.411800120409392e-07, "loss": 0.5082, "step": 2130 }, { "epoch": 0.2576760987357014, "grad_norm": 4.107047080993652, "learning_rate": 6.441902468392534e-07, "loss": 0.5257, "step": 2140 }, { "epoch": 0.2588801926550271, "grad_norm": 5.055625915527344, "learning_rate": 6.472004816375677e-07, "loss": 0.5458, "step": 2150 }, { "epoch": 0.2600842865743528, "grad_norm": 5.573007106781006, "learning_rate": 6.502107164358819e-07, "loss": 0.5178, "step": 2160 }, { "epoch": 0.26128838049367853, "grad_norm": 4.955606460571289, "learning_rate": 6.532209512341962e-07, "loss": 0.5355, "step": 2170 }, { "epoch": 0.2624924744130042, "grad_norm": 4.537413120269775, "learning_rate": 6.562311860325105e-07, "loss": 0.5393, "step": 2180 }, { "epoch": 0.2636965683323299, "grad_norm": 5.761811256408691, "learning_rate": 6.592414208308247e-07, "loss": 0.5497, "step": 2190 }, { "epoch": 0.26490066225165565, "grad_norm": 3.865335464477539, "learning_rate": 6.622516556291391e-07, "loss": 0.4914, "step": 2200 }, { "epoch": 0.26610475617098134, "grad_norm": 4.600432872772217, "learning_rate": 6.652618904274533e-07, "loss": 0.5099, "step": 2210 }, { "epoch": 0.267308850090307, "grad_norm": 4.737097263336182, "learning_rate": 6.682721252257675e-07, "loss": 0.5236, "step": 2220 }, { "epoch": 0.26851294400963277, "grad_norm": 4.7886247634887695, "learning_rate": 6.712823600240819e-07, "loss": 0.5152, "step": 2230 }, { "epoch": 0.26971703792895846, "grad_norm": 6.00905179977417, "learning_rate": 6.742925948223961e-07, "loss": 0.5369, "step": 2240 }, { "epoch": 0.27092113184828415, "grad_norm": 5.080295085906982, "learning_rate": 6.773028296207104e-07, "loss": 0.5135, "step": 2250 }, { "epoch": 0.2721252257676099, "grad_norm": 5.130943775177002, "learning_rate": 6.803130644190247e-07, "loss": 0.4921, "step": 2260 }, { "epoch": 0.2733293196869356, "grad_norm": 4.8161187171936035, "learning_rate": 6.833232992173389e-07, "loss": 0.5243, "step": 2270 }, { "epoch": 0.27453341360626127, "grad_norm": 5.960630416870117, "learning_rate": 6.863335340156532e-07, "loss": 0.525, "step": 2280 }, { "epoch": 0.275737507525587, "grad_norm": 6.012716770172119, "learning_rate": 6.893437688139675e-07, "loss": 0.5126, "step": 2290 }, { "epoch": 0.2769416014449127, "grad_norm": 4.913167476654053, "learning_rate": 6.923540036122817e-07, "loss": 0.531, "step": 2300 }, { "epoch": 0.2781456953642384, "grad_norm": 5.190576076507568, "learning_rate": 6.95364238410596e-07, "loss": 0.5147, "step": 2310 }, { "epoch": 0.27934978928356413, "grad_norm": 4.0760602951049805, "learning_rate": 6.983744732089103e-07, "loss": 0.5135, "step": 2320 }, { "epoch": 0.2805538832028898, "grad_norm": 4.385684490203857, "learning_rate": 7.013847080072245e-07, "loss": 0.5196, "step": 2330 }, { "epoch": 0.2817579771222155, "grad_norm": 4.470118045806885, "learning_rate": 7.043949428055388e-07, "loss": 0.502, "step": 2340 }, { "epoch": 0.28296207104154125, "grad_norm": 4.798367023468018, "learning_rate": 7.074051776038531e-07, "loss": 0.5078, "step": 2350 }, { "epoch": 0.28416616496086694, "grad_norm": 4.64969539642334, "learning_rate": 7.104154124021673e-07, "loss": 0.5126, "step": 2360 }, { "epoch": 0.28537025888019263, "grad_norm": 5.035313606262207, "learning_rate": 7.134256472004816e-07, "loss": 0.5068, "step": 2370 }, { "epoch": 0.2865743527995184, "grad_norm": 3.7338409423828125, "learning_rate": 7.164358819987959e-07, "loss": 0.4956, "step": 2380 }, { "epoch": 0.28777844671884406, "grad_norm": 5.102356910705566, "learning_rate": 7.194461167971101e-07, "loss": 0.5128, "step": 2390 }, { "epoch": 0.2889825406381698, "grad_norm": 5.0710320472717285, "learning_rate": 7.224563515954244e-07, "loss": 0.5064, "step": 2400 }, { "epoch": 0.2901866345574955, "grad_norm": 5.2054667472839355, "learning_rate": 7.254665863937387e-07, "loss": 0.5318, "step": 2410 }, { "epoch": 0.2913907284768212, "grad_norm": 4.590500831604004, "learning_rate": 7.284768211920528e-07, "loss": 0.5352, "step": 2420 }, { "epoch": 0.2925948223961469, "grad_norm": 5.737983226776123, "learning_rate": 7.314870559903672e-07, "loss": 0.5047, "step": 2430 }, { "epoch": 0.2937989163154726, "grad_norm": 5.184499263763428, "learning_rate": 7.344972907886815e-07, "loss": 0.4998, "step": 2440 }, { "epoch": 0.2950030102347983, "grad_norm": 5.553317070007324, "learning_rate": 7.375075255869959e-07, "loss": 0.5099, "step": 2450 }, { "epoch": 0.29620710415412405, "grad_norm": 4.864592552185059, "learning_rate": 7.4051776038531e-07, "loss": 0.5071, "step": 2460 }, { "epoch": 0.29741119807344973, "grad_norm": 4.1055803298950195, "learning_rate": 7.435279951836243e-07, "loss": 0.4985, "step": 2470 }, { "epoch": 0.2986152919927754, "grad_norm": 5.875371932983398, "learning_rate": 7.465382299819386e-07, "loss": 0.4982, "step": 2480 }, { "epoch": 0.29981938591210117, "grad_norm": 4.417768955230713, "learning_rate": 7.495484647802528e-07, "loss": 0.4999, "step": 2490 }, { "epoch": 0.30102347983142685, "grad_norm": 4.034854888916016, "learning_rate": 7.525586995785671e-07, "loss": 0.5063, "step": 2500 }, { "epoch": 0.30222757375075254, "grad_norm": 4.711478233337402, "learning_rate": 7.555689343768814e-07, "loss": 0.5194, "step": 2510 }, { "epoch": 0.3034316676700783, "grad_norm": 4.778373718261719, "learning_rate": 7.585791691751956e-07, "loss": 0.5178, "step": 2520 }, { "epoch": 0.304635761589404, "grad_norm": 3.896817922592163, "learning_rate": 7.615894039735099e-07, "loss": 0.5073, "step": 2530 }, { "epoch": 0.30583985550872966, "grad_norm": 4.729064464569092, "learning_rate": 7.645996387718242e-07, "loss": 0.5083, "step": 2540 }, { "epoch": 0.3070439494280554, "grad_norm": 4.760159015655518, "learning_rate": 7.676098735701384e-07, "loss": 0.5108, "step": 2550 }, { "epoch": 0.3082480433473811, "grad_norm": 4.362825870513916, "learning_rate": 7.706201083684527e-07, "loss": 0.5027, "step": 2560 }, { "epoch": 0.3094521372667068, "grad_norm": 4.749810695648193, "learning_rate": 7.73630343166767e-07, "loss": 0.5051, "step": 2570 }, { "epoch": 0.3106562311860325, "grad_norm": 4.157332897186279, "learning_rate": 7.766405779650812e-07, "loss": 0.5, "step": 2580 }, { "epoch": 0.3118603251053582, "grad_norm": 4.272891044616699, "learning_rate": 7.796508127633955e-07, "loss": 0.4946, "step": 2590 }, { "epoch": 0.3130644190246839, "grad_norm": 4.159026145935059, "learning_rate": 7.826610475617098e-07, "loss": 0.4992, "step": 2600 }, { "epoch": 0.31426851294400965, "grad_norm": 5.095447063446045, "learning_rate": 7.85671282360024e-07, "loss": 0.4968, "step": 2610 }, { "epoch": 0.31547260686333534, "grad_norm": 4.606817722320557, "learning_rate": 7.886815171583383e-07, "loss": 0.5018, "step": 2620 }, { "epoch": 0.316676700782661, "grad_norm": 4.154166221618652, "learning_rate": 7.916917519566526e-07, "loss": 0.4848, "step": 2630 }, { "epoch": 0.31788079470198677, "grad_norm": 4.749946117401123, "learning_rate": 7.947019867549668e-07, "loss": 0.4955, "step": 2640 }, { "epoch": 0.31908488862131246, "grad_norm": 6.158957481384277, "learning_rate": 7.977122215532812e-07, "loss": 0.5088, "step": 2650 }, { "epoch": 0.32028898254063815, "grad_norm": 4.356431484222412, "learning_rate": 8.007224563515954e-07, "loss": 0.5071, "step": 2660 }, { "epoch": 0.3214930764599639, "grad_norm": 5.454282760620117, "learning_rate": 8.037326911499096e-07, "loss": 0.518, "step": 2670 }, { "epoch": 0.3226971703792896, "grad_norm": 4.323178291320801, "learning_rate": 8.06742925948224e-07, "loss": 0.5077, "step": 2680 }, { "epoch": 0.32390126429861527, "grad_norm": 5.352051258087158, "learning_rate": 8.097531607465382e-07, "loss": 0.5042, "step": 2690 }, { "epoch": 0.325105358217941, "grad_norm": 4.680684566497803, "learning_rate": 8.127633955448525e-07, "loss": 0.5006, "step": 2700 }, { "epoch": 0.3263094521372667, "grad_norm": 5.054072380065918, "learning_rate": 8.157736303431668e-07, "loss": 0.5005, "step": 2710 }, { "epoch": 0.32751354605659244, "grad_norm": 4.090258598327637, "learning_rate": 8.18783865141481e-07, "loss": 0.4694, "step": 2720 }, { "epoch": 0.32871763997591813, "grad_norm": 4.663838863372803, "learning_rate": 8.217940999397953e-07, "loss": 0.502, "step": 2730 }, { "epoch": 0.3299217338952438, "grad_norm": 4.440493106842041, "learning_rate": 8.248043347381095e-07, "loss": 0.4933, "step": 2740 }, { "epoch": 0.33112582781456956, "grad_norm": 5.184099197387695, "learning_rate": 8.278145695364238e-07, "loss": 0.5088, "step": 2750 }, { "epoch": 0.33232992173389525, "grad_norm": 4.647283554077148, "learning_rate": 8.308248043347381e-07, "loss": 0.4909, "step": 2760 }, { "epoch": 0.33353401565322094, "grad_norm": 4.6232500076293945, "learning_rate": 8.338350391330523e-07, "loss": 0.4929, "step": 2770 }, { "epoch": 0.3347381095725467, "grad_norm": 5.234133720397949, "learning_rate": 8.368452739313666e-07, "loss": 0.5287, "step": 2780 }, { "epoch": 0.33594220349187237, "grad_norm": 4.967161178588867, "learning_rate": 8.398555087296809e-07, "loss": 0.5041, "step": 2790 }, { "epoch": 0.33714629741119806, "grad_norm": 4.8062591552734375, "learning_rate": 8.428657435279951e-07, "loss": 0.4878, "step": 2800 }, { "epoch": 0.3383503913305238, "grad_norm": 5.188631534576416, "learning_rate": 8.458759783263094e-07, "loss": 0.4907, "step": 2810 }, { "epoch": 0.3395544852498495, "grad_norm": 4.293895244598389, "learning_rate": 8.488862131246237e-07, "loss": 0.4952, "step": 2820 }, { "epoch": 0.3407585791691752, "grad_norm": 5.219202041625977, "learning_rate": 8.518964479229379e-07, "loss": 0.5046, "step": 2830 }, { "epoch": 0.3419626730885009, "grad_norm": 4.529453754425049, "learning_rate": 8.549066827212522e-07, "loss": 0.4951, "step": 2840 }, { "epoch": 0.3431667670078266, "grad_norm": 4.706615924835205, "learning_rate": 8.579169175195666e-07, "loss": 0.5083, "step": 2850 }, { "epoch": 0.3443708609271523, "grad_norm": 5.135066986083984, "learning_rate": 8.609271523178807e-07, "loss": 0.4823, "step": 2860 }, { "epoch": 0.34557495484647804, "grad_norm": 4.977953910827637, "learning_rate": 8.63937387116195e-07, "loss": 0.4845, "step": 2870 }, { "epoch": 0.34677904876580373, "grad_norm": 4.964434623718262, "learning_rate": 8.669476219145094e-07, "loss": 0.5008, "step": 2880 }, { "epoch": 0.3479831426851294, "grad_norm": 4.28712797164917, "learning_rate": 8.699578567128235e-07, "loss": 0.4819, "step": 2890 }, { "epoch": 0.34918723660445516, "grad_norm": 4.125621318817139, "learning_rate": 8.729680915111378e-07, "loss": 0.505, "step": 2900 }, { "epoch": 0.35039133052378085, "grad_norm": 4.779543399810791, "learning_rate": 8.759783263094522e-07, "loss": 0.5002, "step": 2910 }, { "epoch": 0.35159542444310654, "grad_norm": 4.9358320236206055, "learning_rate": 8.789885611077663e-07, "loss": 0.4854, "step": 2920 }, { "epoch": 0.3527995183624323, "grad_norm": 5.439524173736572, "learning_rate": 8.819987959060806e-07, "loss": 0.4893, "step": 2930 }, { "epoch": 0.354003612281758, "grad_norm": 5.939353942871094, "learning_rate": 8.85009030704395e-07, "loss": 0.4876, "step": 2940 }, { "epoch": 0.35520770620108366, "grad_norm": 5.600659370422363, "learning_rate": 8.880192655027092e-07, "loss": 0.4916, "step": 2950 }, { "epoch": 0.3564118001204094, "grad_norm": 6.2792134284973145, "learning_rate": 8.910295003010234e-07, "loss": 0.5139, "step": 2960 }, { "epoch": 0.3576158940397351, "grad_norm": 5.060665130615234, "learning_rate": 8.940397350993378e-07, "loss": 0.5138, "step": 2970 }, { "epoch": 0.3588199879590608, "grad_norm": 5.271560192108154, "learning_rate": 8.97049969897652e-07, "loss": 0.4971, "step": 2980 }, { "epoch": 0.3600240818783865, "grad_norm": 4.9547014236450195, "learning_rate": 9.000602046959662e-07, "loss": 0.4767, "step": 2990 }, { "epoch": 0.3612281757977122, "grad_norm": 5.039198398590088, "learning_rate": 9.030704394942806e-07, "loss": 0.5038, "step": 3000 }, { "epoch": 0.3624322697170379, "grad_norm": 3.5281832218170166, "learning_rate": 9.060806742925948e-07, "loss": 0.4837, "step": 3010 }, { "epoch": 0.36363636363636365, "grad_norm": 4.734562873840332, "learning_rate": 9.09090909090909e-07, "loss": 0.4925, "step": 3020 }, { "epoch": 0.36484045755568933, "grad_norm": 4.400488376617432, "learning_rate": 9.121011438892233e-07, "loss": 0.4819, "step": 3030 }, { "epoch": 0.3660445514750151, "grad_norm": 4.797727584838867, "learning_rate": 9.151113786875376e-07, "loss": 0.4779, "step": 3040 }, { "epoch": 0.36724864539434077, "grad_norm": 4.852715492248535, "learning_rate": 9.181216134858518e-07, "loss": 0.4581, "step": 3050 }, { "epoch": 0.36845273931366646, "grad_norm": 4.8324971199035645, "learning_rate": 9.211318482841661e-07, "loss": 0.5075, "step": 3060 }, { "epoch": 0.3696568332329922, "grad_norm": 4.099527835845947, "learning_rate": 9.241420830824804e-07, "loss": 0.4926, "step": 3070 }, { "epoch": 0.3708609271523179, "grad_norm": 4.540558338165283, "learning_rate": 9.271523178807946e-07, "loss": 0.4901, "step": 3080 }, { "epoch": 0.3720650210716436, "grad_norm": 4.567551612854004, "learning_rate": 9.301625526791089e-07, "loss": 0.4781, "step": 3090 }, { "epoch": 0.3732691149909693, "grad_norm": 5.362119674682617, "learning_rate": 9.331727874774232e-07, "loss": 0.4784, "step": 3100 }, { "epoch": 0.374473208910295, "grad_norm": 4.974254131317139, "learning_rate": 9.361830222757375e-07, "loss": 0.4985, "step": 3110 }, { "epoch": 0.3756773028296207, "grad_norm": 4.490511417388916, "learning_rate": 9.391932570740517e-07, "loss": 0.4619, "step": 3120 }, { "epoch": 0.37688139674894644, "grad_norm": 4.691735744476318, "learning_rate": 9.42203491872366e-07, "loss": 0.4892, "step": 3130 }, { "epoch": 0.37808549066827213, "grad_norm": 5.031266689300537, "learning_rate": 9.452137266706803e-07, "loss": 0.4653, "step": 3140 }, { "epoch": 0.3792895845875978, "grad_norm": 6.112424850463867, "learning_rate": 9.482239614689945e-07, "loss": 0.4887, "step": 3150 }, { "epoch": 0.38049367850692356, "grad_norm": 4.281744480133057, "learning_rate": 9.512341962673088e-07, "loss": 0.4828, "step": 3160 }, { "epoch": 0.38169777242624925, "grad_norm": 4.672320365905762, "learning_rate": 9.54244431065623e-07, "loss": 0.4807, "step": 3170 }, { "epoch": 0.38290186634557494, "grad_norm": 4.8247528076171875, "learning_rate": 9.572546658639373e-07, "loss": 0.4652, "step": 3180 }, { "epoch": 0.3841059602649007, "grad_norm": 4.806872844696045, "learning_rate": 9.602649006622515e-07, "loss": 0.4687, "step": 3190 }, { "epoch": 0.38531005418422637, "grad_norm": 4.877020835876465, "learning_rate": 9.63275135460566e-07, "loss": 0.4954, "step": 3200 }, { "epoch": 0.38651414810355206, "grad_norm": 5.005871295928955, "learning_rate": 9.662853702588802e-07, "loss": 0.5117, "step": 3210 }, { "epoch": 0.3877182420228778, "grad_norm": 4.2746100425720215, "learning_rate": 9.692956050571944e-07, "loss": 0.472, "step": 3220 }, { "epoch": 0.3889223359422035, "grad_norm": 4.155144691467285, "learning_rate": 9.723058398555087e-07, "loss": 0.4882, "step": 3230 }, { "epoch": 0.3901264298615292, "grad_norm": 4.557404041290283, "learning_rate": 9.75316074653823e-07, "loss": 0.4845, "step": 3240 }, { "epoch": 0.3913305237808549, "grad_norm": 4.442798614501953, "learning_rate": 9.783263094521371e-07, "loss": 0.4822, "step": 3250 }, { "epoch": 0.3925346177001806, "grad_norm": 5.363224029541016, "learning_rate": 9.813365442504516e-07, "loss": 0.4808, "step": 3260 }, { "epoch": 0.3937387116195063, "grad_norm": 4.809715747833252, "learning_rate": 9.843467790487658e-07, "loss": 0.4834, "step": 3270 }, { "epoch": 0.39494280553883204, "grad_norm": 4.954145431518555, "learning_rate": 9.8735701384708e-07, "loss": 0.4796, "step": 3280 }, { "epoch": 0.39614689945815773, "grad_norm": 4.381477355957031, "learning_rate": 9.903672486453943e-07, "loss": 0.465, "step": 3290 }, { "epoch": 0.3973509933774834, "grad_norm": 5.086960315704346, "learning_rate": 9.933774834437085e-07, "loss": 0.4996, "step": 3300 }, { "epoch": 0.39855508729680916, "grad_norm": 5.4834303855896, "learning_rate": 9.963877182420227e-07, "loss": 0.4854, "step": 3310 }, { "epoch": 0.39975918121613485, "grad_norm": 4.411494255065918, "learning_rate": 9.993979530403372e-07, "loss": 0.4882, "step": 3320 }, { "epoch": 0.40096327513546054, "grad_norm": 3.9291751384735107, "learning_rate": 9.999998233411383e-07, "loss": 0.4975, "step": 3330 }, { "epoch": 0.4021673690547863, "grad_norm": 4.288562774658203, "learning_rate": 9.999991056647273e-07, "loss": 0.4712, "step": 3340 }, { "epoch": 0.40337146297411197, "grad_norm": 4.603250026702881, "learning_rate": 9.999978359303795e-07, "loss": 0.4933, "step": 3350 }, { "epoch": 0.4045755568934377, "grad_norm": 4.753664970397949, "learning_rate": 9.999960141394973e-07, "loss": 0.4748, "step": 3360 }, { "epoch": 0.4057796508127634, "grad_norm": 4.143571376800537, "learning_rate": 9.99993640294092e-07, "loss": 0.46, "step": 3370 }, { "epoch": 0.4069837447320891, "grad_norm": 5.25679874420166, "learning_rate": 9.99990714396784e-07, "loss": 0.4859, "step": 3380 }, { "epoch": 0.40818783865141484, "grad_norm": 5.903568744659424, "learning_rate": 9.999872364508047e-07, "loss": 0.4942, "step": 3390 }, { "epoch": 0.4093919325707405, "grad_norm": 4.5355939865112305, "learning_rate": 9.999832064599938e-07, "loss": 0.4713, "step": 3400 }, { "epoch": 0.4105960264900662, "grad_norm": 4.297218322753906, "learning_rate": 9.999786244288008e-07, "loss": 0.4701, "step": 3410 }, { "epoch": 0.41180012040939196, "grad_norm": 4.364749908447266, "learning_rate": 9.99973490362285e-07, "loss": 0.4805, "step": 3420 }, { "epoch": 0.41300421432871764, "grad_norm": 5.253974914550781, "learning_rate": 9.999678042661147e-07, "loss": 0.4728, "step": 3430 }, { "epoch": 0.41420830824804333, "grad_norm": 3.7505037784576416, "learning_rate": 9.999615661465685e-07, "loss": 0.4666, "step": 3440 }, { "epoch": 0.4154124021673691, "grad_norm": 4.56821346282959, "learning_rate": 9.999547760105335e-07, "loss": 0.4654, "step": 3450 }, { "epoch": 0.41661649608669477, "grad_norm": 5.777834415435791, "learning_rate": 9.999474338655073e-07, "loss": 0.4708, "step": 3460 }, { "epoch": 0.41782059000602045, "grad_norm": 4.463301181793213, "learning_rate": 9.999395397195961e-07, "loss": 0.4736, "step": 3470 }, { "epoch": 0.4190246839253462, "grad_norm": 4.7559494972229, "learning_rate": 9.999310935815165e-07, "loss": 0.4858, "step": 3480 }, { "epoch": 0.4202287778446719, "grad_norm": 5.451569557189941, "learning_rate": 9.999220954605932e-07, "loss": 0.4945, "step": 3490 }, { "epoch": 0.4214328717639976, "grad_norm": 4.072139739990234, "learning_rate": 9.99912545366762e-07, "loss": 0.4685, "step": 3500 }, { "epoch": 0.4226369656833233, "grad_norm": 5.299817085266113, "learning_rate": 9.999024433105666e-07, "loss": 0.4782, "step": 3510 }, { "epoch": 0.423841059602649, "grad_norm": 4.960267543792725, "learning_rate": 9.998917893031615e-07, "loss": 0.4766, "step": 3520 }, { "epoch": 0.4250451535219747, "grad_norm": 5.582713603973389, "learning_rate": 9.998805833563096e-07, "loss": 0.4737, "step": 3530 }, { "epoch": 0.42624924744130044, "grad_norm": 4.434458255767822, "learning_rate": 9.998688254823836e-07, "loss": 0.4679, "step": 3540 }, { "epoch": 0.4274533413606261, "grad_norm": 4.943469524383545, "learning_rate": 9.99856515694366e-07, "loss": 0.4754, "step": 3550 }, { "epoch": 0.4286574352799518, "grad_norm": 5.145878314971924, "learning_rate": 9.998436540058476e-07, "loss": 0.4855, "step": 3560 }, { "epoch": 0.42986152919927756, "grad_norm": 4.884524822235107, "learning_rate": 9.998302404310296e-07, "loss": 0.4801, "step": 3570 }, { "epoch": 0.43106562311860325, "grad_norm": 4.950911045074463, "learning_rate": 9.998162749847223e-07, "loss": 0.51, "step": 3580 }, { "epoch": 0.43226971703792894, "grad_norm": 4.5520148277282715, "learning_rate": 9.99801757682345e-07, "loss": 0.4887, "step": 3590 }, { "epoch": 0.4334738109572547, "grad_norm": 5.745821952819824, "learning_rate": 9.997866885399265e-07, "loss": 0.4934, "step": 3600 }, { "epoch": 0.43467790487658037, "grad_norm": 4.750070095062256, "learning_rate": 9.997710675741049e-07, "loss": 0.4611, "step": 3610 }, { "epoch": 0.43588199879590606, "grad_norm": 4.3570966720581055, "learning_rate": 9.997548948021275e-07, "loss": 0.4819, "step": 3620 }, { "epoch": 0.4370860927152318, "grad_norm": 3.810598373413086, "learning_rate": 9.997381702418513e-07, "loss": 0.4514, "step": 3630 }, { "epoch": 0.4382901866345575, "grad_norm": 4.763775825500488, "learning_rate": 9.997208939117418e-07, "loss": 0.4686, "step": 3640 }, { "epoch": 0.4394942805538832, "grad_norm": 4.3974385261535645, "learning_rate": 9.997030658308745e-07, "loss": 0.4763, "step": 3650 }, { "epoch": 0.4406983744732089, "grad_norm": 4.901960372924805, "learning_rate": 9.996846860189332e-07, "loss": 0.4649, "step": 3660 }, { "epoch": 0.4419024683925346, "grad_norm": 3.764139175415039, "learning_rate": 9.996657544962118e-07, "loss": 0.4752, "step": 3670 }, { "epoch": 0.44310656231186035, "grad_norm": 4.972975730895996, "learning_rate": 9.996462712836126e-07, "loss": 0.4736, "step": 3680 }, { "epoch": 0.44431065623118604, "grad_norm": 3.928086757659912, "learning_rate": 9.996262364026477e-07, "loss": 0.4939, "step": 3690 }, { "epoch": 0.44551475015051173, "grad_norm": 4.017699718475342, "learning_rate": 9.99605649875438e-07, "loss": 0.4693, "step": 3700 }, { "epoch": 0.4467188440698375, "grad_norm": 6.103999137878418, "learning_rate": 9.995845117247129e-07, "loss": 0.4774, "step": 3710 }, { "epoch": 0.44792293798916316, "grad_norm": 6.031617641448975, "learning_rate": 9.99562821973812e-07, "loss": 0.4528, "step": 3720 }, { "epoch": 0.44912703190848885, "grad_norm": 4.691218852996826, "learning_rate": 9.99540580646683e-07, "loss": 0.4646, "step": 3730 }, { "epoch": 0.4503311258278146, "grad_norm": 4.680331230163574, "learning_rate": 9.995177877678832e-07, "loss": 0.469, "step": 3740 }, { "epoch": 0.4515352197471403, "grad_norm": 4.436509132385254, "learning_rate": 9.994944433625784e-07, "loss": 0.4619, "step": 3750 }, { "epoch": 0.45273931366646597, "grad_norm": 4.72512149810791, "learning_rate": 9.994705474565435e-07, "loss": 0.4404, "step": 3760 }, { "epoch": 0.4539434075857917, "grad_norm": 4.427882194519043, "learning_rate": 9.994461000761627e-07, "loss": 0.4826, "step": 3770 }, { "epoch": 0.4551475015051174, "grad_norm": 4.025267124176025, "learning_rate": 9.994211012484285e-07, "loss": 0.4671, "step": 3780 }, { "epoch": 0.4563515954244431, "grad_norm": 5.315865516662598, "learning_rate": 9.99395551000943e-07, "loss": 0.4922, "step": 3790 }, { "epoch": 0.45755568934376883, "grad_norm": 5.362889289855957, "learning_rate": 9.993694493619162e-07, "loss": 0.4554, "step": 3800 }, { "epoch": 0.4587597832630945, "grad_norm": 3.8804094791412354, "learning_rate": 9.993427963601674e-07, "loss": 0.4558, "step": 3810 }, { "epoch": 0.4599638771824202, "grad_norm": 3.8259241580963135, "learning_rate": 9.99315592025125e-07, "loss": 0.4756, "step": 3820 }, { "epoch": 0.46116797110174595, "grad_norm": 3.806236505508423, "learning_rate": 9.992878363868256e-07, "loss": 0.4801, "step": 3830 }, { "epoch": 0.46237206502107164, "grad_norm": 4.628232002258301, "learning_rate": 9.992595294759147e-07, "loss": 0.4953, "step": 3840 }, { "epoch": 0.46357615894039733, "grad_norm": 4.719220161437988, "learning_rate": 9.992306713236465e-07, "loss": 0.4658, "step": 3850 }, { "epoch": 0.4647802528597231, "grad_norm": 4.918371200561523, "learning_rate": 9.992012619618838e-07, "loss": 0.4691, "step": 3860 }, { "epoch": 0.46598434677904876, "grad_norm": 4.425540447235107, "learning_rate": 9.991713014230981e-07, "loss": 0.4648, "step": 3870 }, { "epoch": 0.46718844069837445, "grad_norm": 3.687819480895996, "learning_rate": 9.99140789740369e-07, "loss": 0.4714, "step": 3880 }, { "epoch": 0.4683925346177002, "grad_norm": 4.835513591766357, "learning_rate": 9.991097269473852e-07, "loss": 0.4866, "step": 3890 }, { "epoch": 0.4695966285370259, "grad_norm": 4.215537071228027, "learning_rate": 9.990781130784437e-07, "loss": 0.4697, "step": 3900 }, { "epoch": 0.4708007224563516, "grad_norm": 4.371738433837891, "learning_rate": 9.990459481684504e-07, "loss": 0.4655, "step": 3910 }, { "epoch": 0.4720048163756773, "grad_norm": 4.469852924346924, "learning_rate": 9.990132322529181e-07, "loss": 0.4416, "step": 3920 }, { "epoch": 0.473208910295003, "grad_norm": 4.61678409576416, "learning_rate": 9.989799653679701e-07, "loss": 0.4625, "step": 3930 }, { "epoch": 0.4744130042143287, "grad_norm": 5.12364387512207, "learning_rate": 9.989461475503362e-07, "loss": 0.4515, "step": 3940 }, { "epoch": 0.47561709813365444, "grad_norm": 5.4315924644470215, "learning_rate": 9.989117788373558e-07, "loss": 0.4773, "step": 3950 }, { "epoch": 0.4768211920529801, "grad_norm": 4.474724769592285, "learning_rate": 9.988768592669756e-07, "loss": 0.445, "step": 3960 }, { "epoch": 0.4780252859723058, "grad_norm": 4.433851718902588, "learning_rate": 9.98841388877751e-07, "loss": 0.4667, "step": 3970 }, { "epoch": 0.47922937989163156, "grad_norm": 4.388487815856934, "learning_rate": 9.988053677088456e-07, "loss": 0.443, "step": 3980 }, { "epoch": 0.48043347381095725, "grad_norm": 5.400040149688721, "learning_rate": 9.987687958000314e-07, "loss": 0.4702, "step": 3990 }, { "epoch": 0.481637567730283, "grad_norm": 4.436804294586182, "learning_rate": 9.987316731916872e-07, "loss": 0.4568, "step": 4000 }, { "epoch": 0.4828416616496087, "grad_norm": 5.063580513000488, "learning_rate": 9.986939999248014e-07, "loss": 0.4877, "step": 4010 }, { "epoch": 0.48404575556893437, "grad_norm": 4.696618556976318, "learning_rate": 9.986557760409694e-07, "loss": 0.464, "step": 4020 }, { "epoch": 0.4852498494882601, "grad_norm": 5.019808292388916, "learning_rate": 9.98617001582395e-07, "loss": 0.4533, "step": 4030 }, { "epoch": 0.4864539434075858, "grad_norm": 4.419073104858398, "learning_rate": 9.9857767659189e-07, "loss": 0.4416, "step": 4040 }, { "epoch": 0.4876580373269115, "grad_norm": 4.31454610824585, "learning_rate": 9.985378011128736e-07, "loss": 0.458, "step": 4050 }, { "epoch": 0.48886213124623723, "grad_norm": 5.41327428817749, "learning_rate": 9.98497375189373e-07, "loss": 0.4669, "step": 4060 }, { "epoch": 0.4900662251655629, "grad_norm": 4.439949035644531, "learning_rate": 9.98456398866023e-07, "loss": 0.4532, "step": 4070 }, { "epoch": 0.4912703190848886, "grad_norm": 4.076527118682861, "learning_rate": 9.98414872188067e-07, "loss": 0.4565, "step": 4080 }, { "epoch": 0.49247441300421435, "grad_norm": 4.239142894744873, "learning_rate": 9.983727952013545e-07, "loss": 0.4686, "step": 4090 }, { "epoch": 0.49367850692354004, "grad_norm": 4.340599060058594, "learning_rate": 9.98330167952344e-07, "loss": 0.4654, "step": 4100 }, { "epoch": 0.4948826008428657, "grad_norm": 4.37545108795166, "learning_rate": 9.982869904881007e-07, "loss": 0.4634, "step": 4110 }, { "epoch": 0.49608669476219147, "grad_norm": 4.235968112945557, "learning_rate": 9.982432628562976e-07, "loss": 0.4537, "step": 4120 }, { "epoch": 0.49729078868151716, "grad_norm": 5.080899715423584, "learning_rate": 9.981989851052153e-07, "loss": 0.4675, "step": 4130 }, { "epoch": 0.49849488260084285, "grad_norm": 4.327193260192871, "learning_rate": 9.98154157283742e-07, "loss": 0.4336, "step": 4140 }, { "epoch": 0.4996989765201686, "grad_norm": 4.647739887237549, "learning_rate": 9.981087794413721e-07, "loss": 0.4547, "step": 4150 }, { "epoch": 0.5009030704394943, "grad_norm": 4.411125659942627, "learning_rate": 9.980628516282088e-07, "loss": 0.4453, "step": 4160 }, { "epoch": 0.50210716435882, "grad_norm": 4.8657026290893555, "learning_rate": 9.980163738949615e-07, "loss": 0.4714, "step": 4170 }, { "epoch": 0.5033112582781457, "grad_norm": 4.7668776512146, "learning_rate": 9.97969346292947e-07, "loss": 0.4472, "step": 4180 }, { "epoch": 0.5045153521974715, "grad_norm": 5.490717887878418, "learning_rate": 9.979217688740895e-07, "loss": 0.4767, "step": 4190 }, { "epoch": 0.5057194461167971, "grad_norm": 4.896997928619385, "learning_rate": 9.978736416909201e-07, "loss": 0.4714, "step": 4200 }, { "epoch": 0.5069235400361228, "grad_norm": 4.777568340301514, "learning_rate": 9.978249647965768e-07, "loss": 0.4608, "step": 4210 }, { "epoch": 0.5081276339554486, "grad_norm": 4.839885711669922, "learning_rate": 9.977757382448047e-07, "loss": 0.4798, "step": 4220 }, { "epoch": 0.5093317278747742, "grad_norm": 4.311272144317627, "learning_rate": 9.977259620899557e-07, "loss": 0.4347, "step": 4230 }, { "epoch": 0.5105358217941, "grad_norm": 4.5723772048950195, "learning_rate": 9.976756363869883e-07, "loss": 0.4485, "step": 4240 }, { "epoch": 0.5117399157134257, "grad_norm": 4.344234943389893, "learning_rate": 9.976247611914681e-07, "loss": 0.4623, "step": 4250 }, { "epoch": 0.5129440096327513, "grad_norm": 4.216832160949707, "learning_rate": 9.975733365595678e-07, "loss": 0.4587, "step": 4260 }, { "epoch": 0.5141481035520771, "grad_norm": 4.828461647033691, "learning_rate": 9.975213625480656e-07, "loss": 0.4616, "step": 4270 }, { "epoch": 0.5153521974714028, "grad_norm": 4.608251571655273, "learning_rate": 9.974688392143473e-07, "loss": 0.4537, "step": 4280 }, { "epoch": 0.5165562913907285, "grad_norm": 5.024391174316406, "learning_rate": 9.974157666164047e-07, "loss": 0.4596, "step": 4290 }, { "epoch": 0.5177603853100542, "grad_norm": 4.869425296783447, "learning_rate": 9.973621448128362e-07, "loss": 0.468, "step": 4300 }, { "epoch": 0.5189644792293799, "grad_norm": 4.599194526672363, "learning_rate": 9.973079738628466e-07, "loss": 0.4475, "step": 4310 }, { "epoch": 0.5201685731487056, "grad_norm": 4.410305500030518, "learning_rate": 9.972532538262473e-07, "loss": 0.4684, "step": 4320 }, { "epoch": 0.5213726670680313, "grad_norm": 3.9566409587860107, "learning_rate": 9.971979847634552e-07, "loss": 0.4472, "step": 4330 }, { "epoch": 0.5225767609873571, "grad_norm": 4.608943462371826, "learning_rate": 9.971421667354944e-07, "loss": 0.4591, "step": 4340 }, { "epoch": 0.5237808549066827, "grad_norm": 4.722293853759766, "learning_rate": 9.97085799803994e-07, "loss": 0.4529, "step": 4350 }, { "epoch": 0.5249849488260084, "grad_norm": 4.868890762329102, "learning_rate": 9.9702888403119e-07, "loss": 0.4742, "step": 4360 }, { "epoch": 0.5261890427453342, "grad_norm": 4.125800132751465, "learning_rate": 9.969714194799243e-07, "loss": 0.4501, "step": 4370 }, { "epoch": 0.5273931366646598, "grad_norm": 4.570892810821533, "learning_rate": 9.969134062136442e-07, "loss": 0.4392, "step": 4380 }, { "epoch": 0.5285972305839856, "grad_norm": 3.8944973945617676, "learning_rate": 9.968548442964033e-07, "loss": 0.4525, "step": 4390 }, { "epoch": 0.5298013245033113, "grad_norm": 4.27981424331665, "learning_rate": 9.96795733792861e-07, "loss": 0.4607, "step": 4400 }, { "epoch": 0.5310054184226369, "grad_norm": 4.3153300285339355, "learning_rate": 9.96736074768282e-07, "loss": 0.4709, "step": 4410 }, { "epoch": 0.5322095123419627, "grad_norm": 5.543158531188965, "learning_rate": 9.966758672885373e-07, "loss": 0.4234, "step": 4420 }, { "epoch": 0.5334136062612884, "grad_norm": 3.463160991668701, "learning_rate": 9.966151114201027e-07, "loss": 0.4684, "step": 4430 }, { "epoch": 0.534617700180614, "grad_norm": 3.8580965995788574, "learning_rate": 9.965538072300598e-07, "loss": 0.4662, "step": 4440 }, { "epoch": 0.5358217940999398, "grad_norm": 4.317717552185059, "learning_rate": 9.96491954786096e-07, "loss": 0.441, "step": 4450 }, { "epoch": 0.5370258880192655, "grad_norm": 4.992043495178223, "learning_rate": 9.964295541565035e-07, "loss": 0.4575, "step": 4460 }, { "epoch": 0.5382299819385912, "grad_norm": 4.042685031890869, "learning_rate": 9.963666054101797e-07, "loss": 0.421, "step": 4470 }, { "epoch": 0.5394340758579169, "grad_norm": 4.4409260749816895, "learning_rate": 9.96303108616628e-07, "loss": 0.4684, "step": 4480 }, { "epoch": 0.5406381697772427, "grad_norm": 4.652424335479736, "learning_rate": 9.96239063845956e-07, "loss": 0.4562, "step": 4490 }, { "epoch": 0.5418422636965683, "grad_norm": 3.927960157394409, "learning_rate": 9.961744711688765e-07, "loss": 0.4636, "step": 4500 }, { "epoch": 0.543046357615894, "grad_norm": 4.20367956161499, "learning_rate": 9.961093306567074e-07, "loss": 0.4629, "step": 4510 }, { "epoch": 0.5442504515352198, "grad_norm": 5.0242791175842285, "learning_rate": 9.960436423813721e-07, "loss": 0.4699, "step": 4520 }, { "epoch": 0.5454545454545454, "grad_norm": 4.339791297912598, "learning_rate": 9.959774064153975e-07, "loss": 0.4393, "step": 4530 }, { "epoch": 0.5466586393738712, "grad_norm": 3.955888509750366, "learning_rate": 9.959106228319164e-07, "loss": 0.4419, "step": 4540 }, { "epoch": 0.5478627332931969, "grad_norm": 4.508617401123047, "learning_rate": 9.958432917046656e-07, "loss": 0.4534, "step": 4550 }, { "epoch": 0.5490668272125225, "grad_norm": 4.84667444229126, "learning_rate": 9.957754131079865e-07, "loss": 0.4621, "step": 4560 }, { "epoch": 0.5502709211318483, "grad_norm": 4.65517520904541, "learning_rate": 9.957069871168252e-07, "loss": 0.4644, "step": 4570 }, { "epoch": 0.551475015051174, "grad_norm": 4.428783416748047, "learning_rate": 9.95638013806732e-07, "loss": 0.4285, "step": 4580 }, { "epoch": 0.5526791089704997, "grad_norm": 5.219538688659668, "learning_rate": 9.955684932538615e-07, "loss": 0.4342, "step": 4590 }, { "epoch": 0.5538832028898254, "grad_norm": 4.356168270111084, "learning_rate": 9.954984255349726e-07, "loss": 0.4502, "step": 4600 }, { "epoch": 0.5550872968091511, "grad_norm": 4.607705116271973, "learning_rate": 9.954278107274286e-07, "loss": 0.4397, "step": 4610 }, { "epoch": 0.5562913907284768, "grad_norm": 4.667281150817871, "learning_rate": 9.95356648909196e-07, "loss": 0.4749, "step": 4620 }, { "epoch": 0.5574954846478025, "grad_norm": 5.4144673347473145, "learning_rate": 9.952849401588464e-07, "loss": 0.4516, "step": 4630 }, { "epoch": 0.5586995785671283, "grad_norm": 4.449268817901611, "learning_rate": 9.952126845555544e-07, "loss": 0.467, "step": 4640 }, { "epoch": 0.5599036724864539, "grad_norm": 4.58141565322876, "learning_rate": 9.951398821790988e-07, "loss": 0.4674, "step": 4650 }, { "epoch": 0.5611077664057796, "grad_norm": 4.779237747192383, "learning_rate": 9.95066533109862e-07, "loss": 0.4486, "step": 4660 }, { "epoch": 0.5623118603251054, "grad_norm": 4.009070873260498, "learning_rate": 9.949926374288298e-07, "loss": 0.4466, "step": 4670 }, { "epoch": 0.563515954244431, "grad_norm": 4.913680553436279, "learning_rate": 9.949181952175922e-07, "loss": 0.4574, "step": 4680 }, { "epoch": 0.5647200481637568, "grad_norm": 4.114124774932861, "learning_rate": 9.94843206558342e-07, "loss": 0.4556, "step": 4690 }, { "epoch": 0.5659241420830825, "grad_norm": 4.208637237548828, "learning_rate": 9.94767671533875e-07, "loss": 0.4446, "step": 4700 }, { "epoch": 0.5671282360024081, "grad_norm": 4.362401962280273, "learning_rate": 9.946915902275914e-07, "loss": 0.4591, "step": 4710 }, { "epoch": 0.5683323299217339, "grad_norm": 4.419969081878662, "learning_rate": 9.946149627234939e-07, "loss": 0.4352, "step": 4720 }, { "epoch": 0.5695364238410596, "grad_norm": 5.162231922149658, "learning_rate": 9.94537789106188e-07, "loss": 0.4613, "step": 4730 }, { "epoch": 0.5707405177603853, "grad_norm": 4.270598411560059, "learning_rate": 9.944600694608825e-07, "loss": 0.4628, "step": 4740 }, { "epoch": 0.571944611679711, "grad_norm": 4.181495666503906, "learning_rate": 9.943818038733891e-07, "loss": 0.4391, "step": 4750 }, { "epoch": 0.5731487055990367, "grad_norm": 4.3339033126831055, "learning_rate": 9.943029924301225e-07, "loss": 0.4406, "step": 4760 }, { "epoch": 0.5743527995183624, "grad_norm": 4.909811496734619, "learning_rate": 9.942236352180996e-07, "loss": 0.4575, "step": 4770 }, { "epoch": 0.5755568934376881, "grad_norm": 4.58059549331665, "learning_rate": 9.941437323249398e-07, "loss": 0.4613, "step": 4780 }, { "epoch": 0.5767609873570139, "grad_norm": 3.9194531440734863, "learning_rate": 9.94063283838866e-07, "loss": 0.4449, "step": 4790 }, { "epoch": 0.5779650812763396, "grad_norm": 4.602609634399414, "learning_rate": 9.93982289848702e-07, "loss": 0.4622, "step": 4800 }, { "epoch": 0.5791691751956652, "grad_norm": 4.630181789398193, "learning_rate": 9.939007504438754e-07, "loss": 0.442, "step": 4810 }, { "epoch": 0.580373269114991, "grad_norm": 3.903799057006836, "learning_rate": 9.938186657144149e-07, "loss": 0.4624, "step": 4820 }, { "epoch": 0.5815773630343167, "grad_norm": 5.423624515533447, "learning_rate": 9.937360357509522e-07, "loss": 0.4372, "step": 4830 }, { "epoch": 0.5827814569536424, "grad_norm": 4.571367263793945, "learning_rate": 9.936528606447198e-07, "loss": 0.4521, "step": 4840 }, { "epoch": 0.5839855508729681, "grad_norm": 3.8848462104797363, "learning_rate": 9.935691404875534e-07, "loss": 0.4399, "step": 4850 }, { "epoch": 0.5851896447922939, "grad_norm": 4.659217357635498, "learning_rate": 9.934848753718896e-07, "loss": 0.4345, "step": 4860 }, { "epoch": 0.5863937387116195, "grad_norm": 5.5009026527404785, "learning_rate": 9.934000653907672e-07, "loss": 0.4173, "step": 4870 }, { "epoch": 0.5875978326309452, "grad_norm": 3.984834671020508, "learning_rate": 9.933147106378263e-07, "loss": 0.4354, "step": 4880 }, { "epoch": 0.588801926550271, "grad_norm": 4.0750346183776855, "learning_rate": 9.932288112073086e-07, "loss": 0.4447, "step": 4890 }, { "epoch": 0.5900060204695966, "grad_norm": 4.871407985687256, "learning_rate": 9.931423671940575e-07, "loss": 0.4501, "step": 4900 }, { "epoch": 0.5912101143889223, "grad_norm": 4.388524055480957, "learning_rate": 9.93055378693517e-07, "loss": 0.4421, "step": 4910 }, { "epoch": 0.5924142083082481, "grad_norm": 4.511969566345215, "learning_rate": 9.929678458017328e-07, "loss": 0.4431, "step": 4920 }, { "epoch": 0.5936183022275737, "grad_norm": 4.788571834564209, "learning_rate": 9.928797686153514e-07, "loss": 0.4621, "step": 4930 }, { "epoch": 0.5948223961468995, "grad_norm": 5.144417762756348, "learning_rate": 9.927911472316205e-07, "loss": 0.4418, "step": 4940 }, { "epoch": 0.5960264900662252, "grad_norm": 4.649743556976318, "learning_rate": 9.927019817483887e-07, "loss": 0.4639, "step": 4950 }, { "epoch": 0.5972305839855508, "grad_norm": 4.76192045211792, "learning_rate": 9.92612272264105e-07, "loss": 0.4646, "step": 4960 }, { "epoch": 0.5984346779048766, "grad_norm": 4.137574195861816, "learning_rate": 9.925220188778193e-07, "loss": 0.4537, "step": 4970 }, { "epoch": 0.5996387718242023, "grad_norm": 4.616219997406006, "learning_rate": 9.924312216891819e-07, "loss": 0.4451, "step": 4980 }, { "epoch": 0.600842865743528, "grad_norm": 4.623941421508789, "learning_rate": 9.923398807984438e-07, "loss": 0.4441, "step": 4990 }, { "epoch": 0.6020469596628537, "grad_norm": 4.540246486663818, "learning_rate": 9.92247996306456e-07, "loss": 0.4477, "step": 5000 }, { "epoch": 0.6032510535821795, "grad_norm": 4.742766380310059, "learning_rate": 9.921555683146695e-07, "loss": 0.4672, "step": 5010 }, { "epoch": 0.6044551475015051, "grad_norm": 5.316002368927002, "learning_rate": 9.920625969251364e-07, "loss": 0.4593, "step": 5020 }, { "epoch": 0.6056592414208308, "grad_norm": 4.386168003082275, "learning_rate": 9.919690822405074e-07, "loss": 0.4438, "step": 5030 }, { "epoch": 0.6068633353401566, "grad_norm": 3.9734067916870117, "learning_rate": 9.91875024364034e-07, "loss": 0.4428, "step": 5040 }, { "epoch": 0.6080674292594822, "grad_norm": 4.917031764984131, "learning_rate": 9.917804233995673e-07, "loss": 0.4622, "step": 5050 }, { "epoch": 0.609271523178808, "grad_norm": 4.690892696380615, "learning_rate": 9.916852794515575e-07, "loss": 0.4513, "step": 5060 }, { "epoch": 0.6104756170981337, "grad_norm": 4.1330952644348145, "learning_rate": 9.915895926250552e-07, "loss": 0.4523, "step": 5070 }, { "epoch": 0.6116797110174593, "grad_norm": 4.932434558868408, "learning_rate": 9.9149336302571e-07, "loss": 0.4407, "step": 5080 }, { "epoch": 0.6128838049367851, "grad_norm": 4.421885967254639, "learning_rate": 9.913965907597702e-07, "loss": 0.4332, "step": 5090 }, { "epoch": 0.6140878988561108, "grad_norm": 5.199044704437256, "learning_rate": 9.91299275934084e-07, "loss": 0.426, "step": 5100 }, { "epoch": 0.6152919927754364, "grad_norm": 4.189499855041504, "learning_rate": 9.912014186560984e-07, "loss": 0.4326, "step": 5110 }, { "epoch": 0.6164960866947622, "grad_norm": 4.297112464904785, "learning_rate": 9.911030190338597e-07, "loss": 0.4622, "step": 5120 }, { "epoch": 0.6177001806140879, "grad_norm": 3.9968087673187256, "learning_rate": 9.910040771760122e-07, "loss": 0.447, "step": 5130 }, { "epoch": 0.6189042745334136, "grad_norm": 4.857995510101318, "learning_rate": 9.909045931917998e-07, "loss": 0.4343, "step": 5140 }, { "epoch": 0.6201083684527393, "grad_norm": 3.741711378097534, "learning_rate": 9.908045671910642e-07, "loss": 0.4366, "step": 5150 }, { "epoch": 0.621312462372065, "grad_norm": 4.424086093902588, "learning_rate": 9.907039992842461e-07, "loss": 0.448, "step": 5160 }, { "epoch": 0.6225165562913907, "grad_norm": 5.499582767486572, "learning_rate": 9.906028895823842e-07, "loss": 0.4546, "step": 5170 }, { "epoch": 0.6237206502107164, "grad_norm": 4.836984634399414, "learning_rate": 9.905012381971157e-07, "loss": 0.4605, "step": 5180 }, { "epoch": 0.6249247441300422, "grad_norm": 4.31553316116333, "learning_rate": 9.903990452406756e-07, "loss": 0.4302, "step": 5190 }, { "epoch": 0.6261288380493678, "grad_norm": 4.909146785736084, "learning_rate": 9.902963108258968e-07, "loss": 0.4445, "step": 5200 }, { "epoch": 0.6273329319686936, "grad_norm": 4.295082092285156, "learning_rate": 9.901930350662103e-07, "loss": 0.4364, "step": 5210 }, { "epoch": 0.6285370258880193, "grad_norm": 4.154002666473389, "learning_rate": 9.90089218075645e-07, "loss": 0.4526, "step": 5220 }, { "epoch": 0.6297411198073449, "grad_norm": 4.30592679977417, "learning_rate": 9.89984859968827e-07, "loss": 0.4442, "step": 5230 }, { "epoch": 0.6309452137266707, "grad_norm": 5.334674835205078, "learning_rate": 9.898799608609795e-07, "loss": 0.4415, "step": 5240 }, { "epoch": 0.6321493076459964, "grad_norm": 4.136261940002441, "learning_rate": 9.897745208679239e-07, "loss": 0.4442, "step": 5250 }, { "epoch": 0.633353401565322, "grad_norm": 4.585081577301025, "learning_rate": 9.896685401060782e-07, "loss": 0.4565, "step": 5260 }, { "epoch": 0.6345574954846478, "grad_norm": 4.742111682891846, "learning_rate": 9.895620186924578e-07, "loss": 0.4393, "step": 5270 }, { "epoch": 0.6357615894039735, "grad_norm": 3.9798941612243652, "learning_rate": 9.894549567446748e-07, "loss": 0.4255, "step": 5280 }, { "epoch": 0.6369656833232992, "grad_norm": 4.722369194030762, "learning_rate": 9.893473543809383e-07, "loss": 0.4377, "step": 5290 }, { "epoch": 0.6381697772426249, "grad_norm": 4.399467945098877, "learning_rate": 9.892392117200536e-07, "loss": 0.4215, "step": 5300 }, { "epoch": 0.6393738711619507, "grad_norm": 4.718751430511475, "learning_rate": 9.891305288814235e-07, "loss": 0.4372, "step": 5310 }, { "epoch": 0.6405779650812763, "grad_norm": 4.376132488250732, "learning_rate": 9.890213059850465e-07, "loss": 0.4567, "step": 5320 }, { "epoch": 0.641782059000602, "grad_norm": 5.186975955963135, "learning_rate": 9.889115431515173e-07, "loss": 0.4414, "step": 5330 }, { "epoch": 0.6429861529199278, "grad_norm": 4.560245037078857, "learning_rate": 9.888012405020271e-07, "loss": 0.4329, "step": 5340 }, { "epoch": 0.6441902468392534, "grad_norm": 5.553184986114502, "learning_rate": 9.886903981583632e-07, "loss": 0.4472, "step": 5350 }, { "epoch": 0.6453943407585792, "grad_norm": 5.126540660858154, "learning_rate": 9.885790162429086e-07, "loss": 0.4577, "step": 5360 }, { "epoch": 0.6465984346779049, "grad_norm": 5.031693935394287, "learning_rate": 9.884670948786417e-07, "loss": 0.4608, "step": 5370 }, { "epoch": 0.6478025285972305, "grad_norm": 4.265883445739746, "learning_rate": 9.883546341891373e-07, "loss": 0.4335, "step": 5380 }, { "epoch": 0.6490066225165563, "grad_norm": 3.7793495655059814, "learning_rate": 9.88241634298565e-07, "loss": 0.4481, "step": 5390 }, { "epoch": 0.650210716435882, "grad_norm": 4.184829235076904, "learning_rate": 9.881280953316903e-07, "loss": 0.4351, "step": 5400 }, { "epoch": 0.6514148103552077, "grad_norm": 5.431835174560547, "learning_rate": 9.880140174138735e-07, "loss": 0.4739, "step": 5410 }, { "epoch": 0.6526189042745334, "grad_norm": 5.218166828155518, "learning_rate": 9.878994006710695e-07, "loss": 0.4547, "step": 5420 }, { "epoch": 0.6538229981938591, "grad_norm": 5.319456100463867, "learning_rate": 9.877842452298293e-07, "loss": 0.453, "step": 5430 }, { "epoch": 0.6550270921131849, "grad_norm": 4.373801231384277, "learning_rate": 9.876685512172979e-07, "loss": 0.4245, "step": 5440 }, { "epoch": 0.6562311860325105, "grad_norm": 4.274784088134766, "learning_rate": 9.875523187612153e-07, "loss": 0.4327, "step": 5450 }, { "epoch": 0.6574352799518363, "grad_norm": 5.235876560211182, "learning_rate": 9.874355479899157e-07, "loss": 0.4365, "step": 5460 }, { "epoch": 0.658639373871162, "grad_norm": 4.505414962768555, "learning_rate": 9.873182390323275e-07, "loss": 0.4236, "step": 5470 }, { "epoch": 0.6598434677904876, "grad_norm": 5.843977451324463, "learning_rate": 9.87200392017974e-07, "loss": 0.4482, "step": 5480 }, { "epoch": 0.6610475617098134, "grad_norm": 4.754218578338623, "learning_rate": 9.870820070769723e-07, "loss": 0.4526, "step": 5490 }, { "epoch": 0.6622516556291391, "grad_norm": 4.734755992889404, "learning_rate": 9.869630843400329e-07, "loss": 0.4286, "step": 5500 }, { "epoch": 0.6634557495484648, "grad_norm": 4.781942367553711, "learning_rate": 9.868436239384608e-07, "loss": 0.4395, "step": 5510 }, { "epoch": 0.6646598434677905, "grad_norm": 4.710615634918213, "learning_rate": 9.86723626004154e-07, "loss": 0.4437, "step": 5520 }, { "epoch": 0.6658639373871162, "grad_norm": 3.9797275066375732, "learning_rate": 9.86603090669605e-07, "loss": 0.4285, "step": 5530 }, { "epoch": 0.6670680313064419, "grad_norm": 5.289978981018066, "learning_rate": 9.864820180678984e-07, "loss": 0.4482, "step": 5540 }, { "epoch": 0.6682721252257676, "grad_norm": 3.6335768699645996, "learning_rate": 9.86360408332713e-07, "loss": 0.4578, "step": 5550 }, { "epoch": 0.6694762191450934, "grad_norm": 3.998011589050293, "learning_rate": 9.862382615983201e-07, "loss": 0.439, "step": 5560 }, { "epoch": 0.670680313064419, "grad_norm": 4.6308369636535645, "learning_rate": 9.861155779995843e-07, "loss": 0.4416, "step": 5570 }, { "epoch": 0.6718844069837447, "grad_norm": 4.869227409362793, "learning_rate": 9.859923576719623e-07, "loss": 0.4271, "step": 5580 }, { "epoch": 0.6730885009030705, "grad_norm": 4.426019668579102, "learning_rate": 9.858686007515043e-07, "loss": 0.424, "step": 5590 }, { "epoch": 0.6742925948223961, "grad_norm": 4.659002304077148, "learning_rate": 9.857443073748526e-07, "loss": 0.4419, "step": 5600 }, { "epoch": 0.6754966887417219, "grad_norm": 3.8600122928619385, "learning_rate": 9.856194776792412e-07, "loss": 0.4397, "step": 5610 }, { "epoch": 0.6767007826610476, "grad_norm": 4.6182756423950195, "learning_rate": 9.854941118024973e-07, "loss": 0.454, "step": 5620 }, { "epoch": 0.6779048765803732, "grad_norm": 4.149092674255371, "learning_rate": 9.853682098830392e-07, "loss": 0.426, "step": 5630 }, { "epoch": 0.679108970499699, "grad_norm": 4.583498954772949, "learning_rate": 9.852417720598778e-07, "loss": 0.4226, "step": 5640 }, { "epoch": 0.6803130644190247, "grad_norm": 4.789090633392334, "learning_rate": 9.851147984726152e-07, "loss": 0.4506, "step": 5650 }, { "epoch": 0.6815171583383504, "grad_norm": 3.850926160812378, "learning_rate": 9.849872892614452e-07, "loss": 0.4149, "step": 5660 }, { "epoch": 0.6827212522576761, "grad_norm": 4.576216697692871, "learning_rate": 9.848592445671532e-07, "loss": 0.4364, "step": 5670 }, { "epoch": 0.6839253461770018, "grad_norm": 5.302231311798096, "learning_rate": 9.847306645311152e-07, "loss": 0.4529, "step": 5680 }, { "epoch": 0.6851294400963275, "grad_norm": 4.6318864822387695, "learning_rate": 9.846015492952993e-07, "loss": 0.4299, "step": 5690 }, { "epoch": 0.6863335340156532, "grad_norm": 4.18743896484375, "learning_rate": 9.844718990022634e-07, "loss": 0.4567, "step": 5700 }, { "epoch": 0.687537627934979, "grad_norm": 4.45042610168457, "learning_rate": 9.84341713795157e-07, "loss": 0.4461, "step": 5710 }, { "epoch": 0.6887417218543046, "grad_norm": 4.0155415534973145, "learning_rate": 9.842109938177197e-07, "loss": 0.4422, "step": 5720 }, { "epoch": 0.6899458157736303, "grad_norm": 4.72194242477417, "learning_rate": 9.840797392142819e-07, "loss": 0.4499, "step": 5730 }, { "epoch": 0.6911499096929561, "grad_norm": 4.1018595695495605, "learning_rate": 9.83947950129764e-07, "loss": 0.4305, "step": 5740 }, { "epoch": 0.6923540036122817, "grad_norm": 4.466518402099609, "learning_rate": 9.838156267096772e-07, "loss": 0.437, "step": 5750 }, { "epoch": 0.6935580975316075, "grad_norm": 4.084195137023926, "learning_rate": 9.836827691001215e-07, "loss": 0.4571, "step": 5760 }, { "epoch": 0.6947621914509332, "grad_norm": 4.3810319900512695, "learning_rate": 9.835493774477876e-07, "loss": 0.4358, "step": 5770 }, { "epoch": 0.6959662853702588, "grad_norm": 4.7473464012146, "learning_rate": 9.834154518999558e-07, "loss": 0.4307, "step": 5780 }, { "epoch": 0.6971703792895846, "grad_norm": 4.240455627441406, "learning_rate": 9.832809926044953e-07, "loss": 0.4456, "step": 5790 }, { "epoch": 0.6983744732089103, "grad_norm": 4.3158087730407715, "learning_rate": 9.831459997098653e-07, "loss": 0.4268, "step": 5800 }, { "epoch": 0.699578567128236, "grad_norm": 4.3610005378723145, "learning_rate": 9.83010473365114e-07, "loss": 0.4334, "step": 5810 }, { "epoch": 0.7007826610475617, "grad_norm": 4.417696952819824, "learning_rate": 9.828744137198778e-07, "loss": 0.4451, "step": 5820 }, { "epoch": 0.7019867549668874, "grad_norm": 4.091536998748779, "learning_rate": 9.827378209243833e-07, "loss": 0.4277, "step": 5830 }, { "epoch": 0.7031908488862131, "grad_norm": 5.2131028175354, "learning_rate": 9.826006951294448e-07, "loss": 0.4353, "step": 5840 }, { "epoch": 0.7043949428055388, "grad_norm": 4.724157810211182, "learning_rate": 9.824630364864653e-07, "loss": 0.4379, "step": 5850 }, { "epoch": 0.7055990367248646, "grad_norm": 3.924499034881592, "learning_rate": 9.82324845147436e-07, "loss": 0.4341, "step": 5860 }, { "epoch": 0.7068031306441902, "grad_norm": 3.9886951446533203, "learning_rate": 9.821861212649367e-07, "loss": 0.4458, "step": 5870 }, { "epoch": 0.708007224563516, "grad_norm": 5.176059246063232, "learning_rate": 9.820468649921348e-07, "loss": 0.4277, "step": 5880 }, { "epoch": 0.7092113184828417, "grad_norm": 5.795221328735352, "learning_rate": 9.819070764827856e-07, "loss": 0.4608, "step": 5890 }, { "epoch": 0.7104154124021673, "grad_norm": 4.0651702880859375, "learning_rate": 9.81766755891232e-07, "loss": 0.4349, "step": 5900 }, { "epoch": 0.7116195063214931, "grad_norm": 4.822697162628174, "learning_rate": 9.816259033724051e-07, "loss": 0.4368, "step": 5910 }, { "epoch": 0.7128236002408188, "grad_norm": 3.429680585861206, "learning_rate": 9.814845190818218e-07, "loss": 0.4119, "step": 5920 }, { "epoch": 0.7140276941601444, "grad_norm": 4.649044513702393, "learning_rate": 9.813426031755873e-07, "loss": 0.431, "step": 5930 }, { "epoch": 0.7152317880794702, "grad_norm": 4.576180458068848, "learning_rate": 9.812001558103937e-07, "loss": 0.4478, "step": 5940 }, { "epoch": 0.7164358819987959, "grad_norm": 4.996614933013916, "learning_rate": 9.810571771435196e-07, "loss": 0.4013, "step": 5950 }, { "epoch": 0.7176399759181216, "grad_norm": 5.006197929382324, "learning_rate": 9.809136673328305e-07, "loss": 0.4275, "step": 5960 }, { "epoch": 0.7188440698374473, "grad_norm": 3.766942024230957, "learning_rate": 9.807696265367776e-07, "loss": 0.4377, "step": 5970 }, { "epoch": 0.720048163756773, "grad_norm": 4.086816787719727, "learning_rate": 9.806250549143992e-07, "loss": 0.4384, "step": 5980 }, { "epoch": 0.7212522576760987, "grad_norm": 5.5871734619140625, "learning_rate": 9.804799526253196e-07, "loss": 0.4511, "step": 5990 }, { "epoch": 0.7224563515954244, "grad_norm": 4.023412704467773, "learning_rate": 9.803343198297484e-07, "loss": 0.4446, "step": 6000 }, { "epoch": 0.7236604455147502, "grad_norm": 4.708857536315918, "learning_rate": 9.80188156688482e-07, "loss": 0.4395, "step": 6010 }, { "epoch": 0.7248645394340758, "grad_norm": 3.879977226257324, "learning_rate": 9.80041463362901e-07, "loss": 0.4434, "step": 6020 }, { "epoch": 0.7260686333534015, "grad_norm": 4.743607997894287, "learning_rate": 9.798942400149726e-07, "loss": 0.4365, "step": 6030 }, { "epoch": 0.7272727272727273, "grad_norm": 3.6438701152801514, "learning_rate": 9.797464868072486e-07, "loss": 0.447, "step": 6040 }, { "epoch": 0.7284768211920529, "grad_norm": 4.472813129425049, "learning_rate": 9.79598203902866e-07, "loss": 0.443, "step": 6050 }, { "epoch": 0.7296809151113787, "grad_norm": 5.6175312995910645, "learning_rate": 9.794493914655467e-07, "loss": 0.4207, "step": 6060 }, { "epoch": 0.7308850090307044, "grad_norm": 4.9606404304504395, "learning_rate": 9.793000496595966e-07, "loss": 0.4279, "step": 6070 }, { "epoch": 0.7320891029500302, "grad_norm": 4.130514144897461, "learning_rate": 9.791501786499074e-07, "loss": 0.4183, "step": 6080 }, { "epoch": 0.7332931968693558, "grad_norm": 2.9547371864318848, "learning_rate": 9.78999778601954e-07, "loss": 0.4038, "step": 6090 }, { "epoch": 0.7344972907886815, "grad_norm": 4.06984281539917, "learning_rate": 9.788488496817958e-07, "loss": 0.4333, "step": 6100 }, { "epoch": 0.7357013847080073, "grad_norm": 3.900606870651245, "learning_rate": 9.78697392056076e-07, "loss": 0.418, "step": 6110 }, { "epoch": 0.7369054786273329, "grad_norm": 4.396324157714844, "learning_rate": 9.78545405892022e-07, "loss": 0.435, "step": 6120 }, { "epoch": 0.7381095725466587, "grad_norm": 4.068949222564697, "learning_rate": 9.78392891357444e-07, "loss": 0.4138, "step": 6130 }, { "epoch": 0.7393136664659844, "grad_norm": 4.090792655944824, "learning_rate": 9.782398486207364e-07, "loss": 0.4106, "step": 6140 }, { "epoch": 0.74051776038531, "grad_norm": 5.222830295562744, "learning_rate": 9.780862778508762e-07, "loss": 0.4534, "step": 6150 }, { "epoch": 0.7417218543046358, "grad_norm": 3.9300661087036133, "learning_rate": 9.779321792174238e-07, "loss": 0.4436, "step": 6160 }, { "epoch": 0.7429259482239615, "grad_norm": 4.139192581176758, "learning_rate": 9.77777552890522e-07, "loss": 0.4384, "step": 6170 }, { "epoch": 0.7441300421432872, "grad_norm": 4.677849292755127, "learning_rate": 9.776223990408969e-07, "loss": 0.4338, "step": 6180 }, { "epoch": 0.7453341360626129, "grad_norm": 4.7174391746521, "learning_rate": 9.77466717839856e-07, "loss": 0.4265, "step": 6190 }, { "epoch": 0.7465382299819386, "grad_norm": 4.314562797546387, "learning_rate": 9.773105094592903e-07, "loss": 0.4389, "step": 6200 }, { "epoch": 0.7477423239012643, "grad_norm": 4.679368495941162, "learning_rate": 9.77153774071672e-07, "loss": 0.4177, "step": 6210 }, { "epoch": 0.74894641782059, "grad_norm": 4.037609577178955, "learning_rate": 9.769965118500554e-07, "loss": 0.4376, "step": 6220 }, { "epoch": 0.7501505117399158, "grad_norm": 4.8901448249816895, "learning_rate": 9.768387229680765e-07, "loss": 0.4597, "step": 6230 }, { "epoch": 0.7513546056592414, "grad_norm": 4.4093122482299805, "learning_rate": 9.76680407599953e-07, "loss": 0.4332, "step": 6240 }, { "epoch": 0.7525586995785671, "grad_norm": 4.720508575439453, "learning_rate": 9.765215659204837e-07, "loss": 0.4579, "step": 6250 }, { "epoch": 0.7537627934978929, "grad_norm": 4.316104412078857, "learning_rate": 9.763621981050486e-07, "loss": 0.4499, "step": 6260 }, { "epoch": 0.7549668874172185, "grad_norm": 4.805814743041992, "learning_rate": 9.762023043296082e-07, "loss": 0.4229, "step": 6270 }, { "epoch": 0.7561709813365443, "grad_norm": 4.259012699127197, "learning_rate": 9.760418847707042e-07, "loss": 0.4307, "step": 6280 }, { "epoch": 0.75737507525587, "grad_norm": 4.74151086807251, "learning_rate": 9.75880939605459e-07, "loss": 0.4039, "step": 6290 }, { "epoch": 0.7585791691751956, "grad_norm": 4.7510294914245605, "learning_rate": 9.757194690115747e-07, "loss": 0.4302, "step": 6300 }, { "epoch": 0.7597832630945214, "grad_norm": 5.057920455932617, "learning_rate": 9.75557473167334e-07, "loss": 0.4196, "step": 6310 }, { "epoch": 0.7609873570138471, "grad_norm": 4.428061485290527, "learning_rate": 9.753949522515992e-07, "loss": 0.4271, "step": 6320 }, { "epoch": 0.7621914509331728, "grad_norm": 4.023929595947266, "learning_rate": 9.75231906443813e-07, "loss": 0.4125, "step": 6330 }, { "epoch": 0.7633955448524985, "grad_norm": 4.456701755523682, "learning_rate": 9.75068335923997e-07, "loss": 0.4177, "step": 6340 }, { "epoch": 0.7645996387718242, "grad_norm": 4.046926975250244, "learning_rate": 9.749042408727517e-07, "loss": 0.4172, "step": 6350 }, { "epoch": 0.7658037326911499, "grad_norm": 4.5811944007873535, "learning_rate": 9.747396214712584e-07, "loss": 0.4165, "step": 6360 }, { "epoch": 0.7670078266104756, "grad_norm": 3.6832375526428223, "learning_rate": 9.745744779012757e-07, "loss": 0.4183, "step": 6370 }, { "epoch": 0.7682119205298014, "grad_norm": 4.535373210906982, "learning_rate": 9.744088103451417e-07, "loss": 0.4205, "step": 6380 }, { "epoch": 0.769416014449127, "grad_norm": 4.3140363693237305, "learning_rate": 9.742426189857729e-07, "loss": 0.4414, "step": 6390 }, { "epoch": 0.7706201083684527, "grad_norm": 4.968809604644775, "learning_rate": 9.74075904006664e-07, "loss": 0.4421, "step": 6400 }, { "epoch": 0.7718242022877785, "grad_norm": 4.488393783569336, "learning_rate": 9.739086655918883e-07, "loss": 0.441, "step": 6410 }, { "epoch": 0.7730282962071041, "grad_norm": 4.255595684051514, "learning_rate": 9.737409039260966e-07, "loss": 0.4211, "step": 6420 }, { "epoch": 0.7742323901264299, "grad_norm": 4.285024642944336, "learning_rate": 9.735726191945175e-07, "loss": 0.42, "step": 6430 }, { "epoch": 0.7754364840457556, "grad_norm": 4.8813347816467285, "learning_rate": 9.734038115829571e-07, "loss": 0.433, "step": 6440 }, { "epoch": 0.7766405779650812, "grad_norm": 3.9893128871917725, "learning_rate": 9.732344812777987e-07, "loss": 0.3902, "step": 6450 }, { "epoch": 0.777844671884407, "grad_norm": 4.2948784828186035, "learning_rate": 9.730646284660035e-07, "loss": 0.4094, "step": 6460 }, { "epoch": 0.7790487658037327, "grad_norm": 4.328617572784424, "learning_rate": 9.728942533351087e-07, "loss": 0.4412, "step": 6470 }, { "epoch": 0.7802528597230584, "grad_norm": 4.67041015625, "learning_rate": 9.727233560732286e-07, "loss": 0.4157, "step": 6480 }, { "epoch": 0.7814569536423841, "grad_norm": 4.249061584472656, "learning_rate": 9.725519368690538e-07, "loss": 0.4398, "step": 6490 }, { "epoch": 0.7826610475617098, "grad_norm": 5.444673538208008, "learning_rate": 9.723799959118513e-07, "loss": 0.4299, "step": 6500 }, { "epoch": 0.7838651414810355, "grad_norm": 4.813880920410156, "learning_rate": 9.722075333914642e-07, "loss": 0.4483, "step": 6510 }, { "epoch": 0.7850692354003612, "grad_norm": 3.9406328201293945, "learning_rate": 9.720345494983116e-07, "loss": 0.4101, "step": 6520 }, { "epoch": 0.786273329319687, "grad_norm": 5.169934272766113, "learning_rate": 9.718610444233878e-07, "loss": 0.4284, "step": 6530 }, { "epoch": 0.7874774232390126, "grad_norm": 4.304941177368164, "learning_rate": 9.71687018358263e-07, "loss": 0.4232, "step": 6540 }, { "epoch": 0.7886815171583383, "grad_norm": 4.452000141143799, "learning_rate": 9.715124714950827e-07, "loss": 0.4506, "step": 6550 }, { "epoch": 0.7898856110776641, "grad_norm": 3.7503676414489746, "learning_rate": 9.713374040265668e-07, "loss": 0.4246, "step": 6560 }, { "epoch": 0.7910897049969897, "grad_norm": 4.534003257751465, "learning_rate": 9.71161816146011e-07, "loss": 0.4247, "step": 6570 }, { "epoch": 0.7922937989163155, "grad_norm": 5.637129306793213, "learning_rate": 9.709857080472845e-07, "loss": 0.4419, "step": 6580 }, { "epoch": 0.7934978928356412, "grad_norm": 3.844273805618286, "learning_rate": 9.708090799248313e-07, "loss": 0.4042, "step": 6590 }, { "epoch": 0.7947019867549668, "grad_norm": 4.556625843048096, "learning_rate": 9.706319319736703e-07, "loss": 0.4384, "step": 6600 }, { "epoch": 0.7959060806742926, "grad_norm": 4.6486053466796875, "learning_rate": 9.70454264389393e-07, "loss": 0.4091, "step": 6610 }, { "epoch": 0.7971101745936183, "grad_norm": 4.751596927642822, "learning_rate": 9.702760773681658e-07, "loss": 0.428, "step": 6620 }, { "epoch": 0.798314268512944, "grad_norm": 4.64603328704834, "learning_rate": 9.700973711067282e-07, "loss": 0.4376, "step": 6630 }, { "epoch": 0.7995183624322697, "grad_norm": 4.823798656463623, "learning_rate": 9.699181458023927e-07, "loss": 0.4057, "step": 6640 }, { "epoch": 0.8007224563515954, "grad_norm": 5.07472562789917, "learning_rate": 9.697384016530451e-07, "loss": 0.4103, "step": 6650 }, { "epoch": 0.8019265502709211, "grad_norm": 5.586597442626953, "learning_rate": 9.695581388571444e-07, "loss": 0.4401, "step": 6660 }, { "epoch": 0.8031306441902468, "grad_norm": 5.10539436340332, "learning_rate": 9.693773576137219e-07, "loss": 0.4298, "step": 6670 }, { "epoch": 0.8043347381095726, "grad_norm": 5.036708354949951, "learning_rate": 9.691960581223815e-07, "loss": 0.4299, "step": 6680 }, { "epoch": 0.8055388320288982, "grad_norm": 4.794188499450684, "learning_rate": 9.690142405832988e-07, "loss": 0.4296, "step": 6690 }, { "epoch": 0.8067429259482239, "grad_norm": 4.483447074890137, "learning_rate": 9.688319051972223e-07, "loss": 0.4063, "step": 6700 }, { "epoch": 0.8079470198675497, "grad_norm": 4.88456916809082, "learning_rate": 9.686490521654713e-07, "loss": 0.4548, "step": 6710 }, { "epoch": 0.8091511137868754, "grad_norm": 4.166242599487305, "learning_rate": 9.684656816899374e-07, "loss": 0.4344, "step": 6720 }, { "epoch": 0.8103552077062011, "grad_norm": 4.282528877258301, "learning_rate": 9.682817939730831e-07, "loss": 0.4143, "step": 6730 }, { "epoch": 0.8115593016255268, "grad_norm": 4.342618942260742, "learning_rate": 9.680973892179423e-07, "loss": 0.4224, "step": 6740 }, { "epoch": 0.8127633955448526, "grad_norm": 4.768647193908691, "learning_rate": 9.679124676281195e-07, "loss": 0.4251, "step": 6750 }, { "epoch": 0.8139674894641782, "grad_norm": 4.024239540100098, "learning_rate": 9.677270294077896e-07, "loss": 0.4415, "step": 6760 }, { "epoch": 0.8151715833835039, "grad_norm": 3.9242262840270996, "learning_rate": 9.675410747616984e-07, "loss": 0.4475, "step": 6770 }, { "epoch": 0.8163756773028297, "grad_norm": 4.580953121185303, "learning_rate": 9.67354603895162e-07, "loss": 0.4067, "step": 6780 }, { "epoch": 0.8175797712221553, "grad_norm": 4.859120845794678, "learning_rate": 9.67167617014066e-07, "loss": 0.4311, "step": 6790 }, { "epoch": 0.818783865141481, "grad_norm": 4.1437835693359375, "learning_rate": 9.66980114324866e-07, "loss": 0.4135, "step": 6800 }, { "epoch": 0.8199879590608068, "grad_norm": 4.027251243591309, "learning_rate": 9.667920960345872e-07, "loss": 0.4021, "step": 6810 }, { "epoch": 0.8211920529801324, "grad_norm": 4.283502101898193, "learning_rate": 9.666035623508237e-07, "loss": 0.4207, "step": 6820 }, { "epoch": 0.8223961468994582, "grad_norm": 4.910589694976807, "learning_rate": 9.66414513481739e-07, "loss": 0.4474, "step": 6830 }, { "epoch": 0.8236002408187839, "grad_norm": 5.238614559173584, "learning_rate": 9.662249496360653e-07, "loss": 0.4294, "step": 6840 }, { "epoch": 0.8248043347381095, "grad_norm": 4.113722801208496, "learning_rate": 9.660348710231036e-07, "loss": 0.4145, "step": 6850 }, { "epoch": 0.8260084286574353, "grad_norm": 4.979987144470215, "learning_rate": 9.65844277852723e-07, "loss": 0.421, "step": 6860 }, { "epoch": 0.827212522576761, "grad_norm": 5.396749973297119, "learning_rate": 9.656531703353608e-07, "loss": 0.4444, "step": 6870 }, { "epoch": 0.8284166164960867, "grad_norm": 4.567556858062744, "learning_rate": 9.654615486820222e-07, "loss": 0.4198, "step": 6880 }, { "epoch": 0.8296207104154124, "grad_norm": 5.2882304191589355, "learning_rate": 9.6526941310428e-07, "loss": 0.4274, "step": 6890 }, { "epoch": 0.8308248043347382, "grad_norm": 4.51816987991333, "learning_rate": 9.650767638142746e-07, "loss": 0.4465, "step": 6900 }, { "epoch": 0.8320288982540638, "grad_norm": 3.9410834312438965, "learning_rate": 9.648836010247137e-07, "loss": 0.4182, "step": 6910 }, { "epoch": 0.8332329921733895, "grad_norm": 4.620553493499756, "learning_rate": 9.646899249488714e-07, "loss": 0.4206, "step": 6920 }, { "epoch": 0.8344370860927153, "grad_norm": 4.430214881896973, "learning_rate": 9.644957358005892e-07, "loss": 0.4313, "step": 6930 }, { "epoch": 0.8356411800120409, "grad_norm": 4.277939796447754, "learning_rate": 9.643010337942747e-07, "loss": 0.4313, "step": 6940 }, { "epoch": 0.8368452739313667, "grad_norm": 5.185015678405762, "learning_rate": 9.64105819144902e-07, "loss": 0.4225, "step": 6950 }, { "epoch": 0.8380493678506924, "grad_norm": 4.402646541595459, "learning_rate": 9.63910092068011e-07, "loss": 0.417, "step": 6960 }, { "epoch": 0.839253461770018, "grad_norm": 3.664020538330078, "learning_rate": 9.637138527797074e-07, "loss": 0.4337, "step": 6970 }, { "epoch": 0.8404575556893438, "grad_norm": 4.9388041496276855, "learning_rate": 9.635171014966625e-07, "loss": 0.412, "step": 6980 }, { "epoch": 0.8416616496086695, "grad_norm": 4.200076103210449, "learning_rate": 9.63319838436113e-07, "loss": 0.4212, "step": 6990 }, { "epoch": 0.8428657435279951, "grad_norm": 4.56259822845459, "learning_rate": 9.631220638158605e-07, "loss": 0.4316, "step": 7000 }, { "epoch": 0.8440698374473209, "grad_norm": 3.910545587539673, "learning_rate": 9.629237778542714e-07, "loss": 0.4, "step": 7010 }, { "epoch": 0.8452739313666466, "grad_norm": 4.639405250549316, "learning_rate": 9.62724980770277e-07, "loss": 0.4084, "step": 7020 }, { "epoch": 0.8464780252859723, "grad_norm": 4.84975528717041, "learning_rate": 9.625256727833725e-07, "loss": 0.4331, "step": 7030 }, { "epoch": 0.847682119205298, "grad_norm": 3.9190306663513184, "learning_rate": 9.623258541136175e-07, "loss": 0.4171, "step": 7040 }, { "epoch": 0.8488862131246238, "grad_norm": 4.248600482940674, "learning_rate": 9.621255249816353e-07, "loss": 0.4255, "step": 7050 }, { "epoch": 0.8500903070439494, "grad_norm": 4.055094242095947, "learning_rate": 9.61924685608613e-07, "loss": 0.4257, "step": 7060 }, { "epoch": 0.8512944009632751, "grad_norm": 4.14054536819458, "learning_rate": 9.617233362163007e-07, "loss": 0.4046, "step": 7070 }, { "epoch": 0.8524984948826009, "grad_norm": 5.480048179626465, "learning_rate": 9.61521477027012e-07, "loss": 0.4007, "step": 7080 }, { "epoch": 0.8537025888019265, "grad_norm": 4.100722312927246, "learning_rate": 9.613191082636232e-07, "loss": 0.4148, "step": 7090 }, { "epoch": 0.8549066827212523, "grad_norm": 3.739861011505127, "learning_rate": 9.611162301495735e-07, "loss": 0.4156, "step": 7100 }, { "epoch": 0.856110776640578, "grad_norm": 4.769533634185791, "learning_rate": 9.60912842908864e-07, "loss": 0.4356, "step": 7110 }, { "epoch": 0.8573148705599036, "grad_norm": 4.347903728485107, "learning_rate": 9.60708946766058e-07, "loss": 0.4509, "step": 7120 }, { "epoch": 0.8585189644792294, "grad_norm": 4.265124797821045, "learning_rate": 9.605045419462813e-07, "loss": 0.4231, "step": 7130 }, { "epoch": 0.8597230583985551, "grad_norm": 5.108783721923828, "learning_rate": 9.602996286752206e-07, "loss": 0.4363, "step": 7140 }, { "epoch": 0.8609271523178808, "grad_norm": 5.001750946044922, "learning_rate": 9.600942071791248e-07, "loss": 0.4223, "step": 7150 }, { "epoch": 0.8621312462372065, "grad_norm": 4.6718668937683105, "learning_rate": 9.598882776848025e-07, "loss": 0.4206, "step": 7160 }, { "epoch": 0.8633353401565322, "grad_norm": 4.35657262802124, "learning_rate": 9.596818404196249e-07, "loss": 0.4136, "step": 7170 }, { "epoch": 0.8645394340758579, "grad_norm": 4.119489669799805, "learning_rate": 9.59474895611523e-07, "loss": 0.4254, "step": 7180 }, { "epoch": 0.8657435279951836, "grad_norm": 4.4842047691345215, "learning_rate": 9.59267443488988e-07, "loss": 0.4279, "step": 7190 }, { "epoch": 0.8669476219145094, "grad_norm": 4.105453014373779, "learning_rate": 9.590594842810714e-07, "loss": 0.4031, "step": 7200 }, { "epoch": 0.868151715833835, "grad_norm": 4.400485992431641, "learning_rate": 9.58851018217385e-07, "loss": 0.4098, "step": 7210 }, { "epoch": 0.8693558097531607, "grad_norm": 4.673033714294434, "learning_rate": 9.586420455280998e-07, "loss": 0.4299, "step": 7220 }, { "epoch": 0.8705599036724865, "grad_norm": 4.483117580413818, "learning_rate": 9.584325664439463e-07, "loss": 0.438, "step": 7230 }, { "epoch": 0.8717639975918121, "grad_norm": 5.068016052246094, "learning_rate": 9.58222581196214e-07, "loss": 0.4162, "step": 7240 }, { "epoch": 0.8729680915111379, "grad_norm": 4.488113880157471, "learning_rate": 9.580120900167513e-07, "loss": 0.4196, "step": 7250 }, { "epoch": 0.8741721854304636, "grad_norm": 4.887204647064209, "learning_rate": 9.578010931379654e-07, "loss": 0.439, "step": 7260 }, { "epoch": 0.8753762793497892, "grad_norm": 4.7396159172058105, "learning_rate": 9.575895907928217e-07, "loss": 0.4202, "step": 7270 }, { "epoch": 0.876580373269115, "grad_norm": 4.224496364593506, "learning_rate": 9.573775832148438e-07, "loss": 0.4027, "step": 7280 }, { "epoch": 0.8777844671884407, "grad_norm": 5.062420845031738, "learning_rate": 9.57165070638113e-07, "loss": 0.4123, "step": 7290 }, { "epoch": 0.8789885611077664, "grad_norm": 3.75753116607666, "learning_rate": 9.569520532972678e-07, "loss": 0.4066, "step": 7300 }, { "epoch": 0.8801926550270921, "grad_norm": 4.535136699676514, "learning_rate": 9.567385314275054e-07, "loss": 0.4067, "step": 7310 }, { "epoch": 0.8813967489464178, "grad_norm": 4.068704128265381, "learning_rate": 9.56524505264578e-07, "loss": 0.4238, "step": 7320 }, { "epoch": 0.8826008428657435, "grad_norm": 5.032285690307617, "learning_rate": 9.563099750447965e-07, "loss": 0.4392, "step": 7330 }, { "epoch": 0.8838049367850692, "grad_norm": 4.432474136352539, "learning_rate": 9.560949410050274e-07, "loss": 0.4394, "step": 7340 }, { "epoch": 0.885009030704395, "grad_norm": 3.7745227813720703, "learning_rate": 9.558794033826933e-07, "loss": 0.4228, "step": 7350 }, { "epoch": 0.8862131246237207, "grad_norm": 4.947648525238037, "learning_rate": 9.556633624157734e-07, "loss": 0.4324, "step": 7360 }, { "epoch": 0.8874172185430463, "grad_norm": 3.695946216583252, "learning_rate": 9.554468183428025e-07, "loss": 0.407, "step": 7370 }, { "epoch": 0.8886213124623721, "grad_norm": 4.399337291717529, "learning_rate": 9.552297714028703e-07, "loss": 0.4313, "step": 7380 }, { "epoch": 0.8898254063816978, "grad_norm": 4.042302131652832, "learning_rate": 9.550122218356227e-07, "loss": 0.4183, "step": 7390 }, { "epoch": 0.8910295003010235, "grad_norm": 4.341307163238525, "learning_rate": 9.5479416988126e-07, "loss": 0.4335, "step": 7400 }, { "epoch": 0.8922335942203492, "grad_norm": 3.7946054935455322, "learning_rate": 9.545756157805367e-07, "loss": 0.4123, "step": 7410 }, { "epoch": 0.893437688139675, "grad_norm": 5.04152250289917, "learning_rate": 9.543565597747632e-07, "loss": 0.4139, "step": 7420 }, { "epoch": 0.8946417820590006, "grad_norm": 3.8958561420440674, "learning_rate": 9.541370021058023e-07, "loss": 0.4084, "step": 7430 }, { "epoch": 0.8958458759783263, "grad_norm": 3.7490954399108887, "learning_rate": 9.53916943016072e-07, "loss": 0.4048, "step": 7440 }, { "epoch": 0.8970499698976521, "grad_norm": 4.4821858406066895, "learning_rate": 9.536963827485434e-07, "loss": 0.3984, "step": 7450 }, { "epoch": 0.8982540638169777, "grad_norm": 4.666491985321045, "learning_rate": 9.53475321546741e-07, "loss": 0.4098, "step": 7460 }, { "epoch": 0.8994581577363034, "grad_norm": 4.890908718109131, "learning_rate": 9.532537596547423e-07, "loss": 0.3982, "step": 7470 }, { "epoch": 0.9006622516556292, "grad_norm": 4.651495933532715, "learning_rate": 9.53031697317178e-07, "loss": 0.418, "step": 7480 }, { "epoch": 0.9018663455749548, "grad_norm": 4.55120849609375, "learning_rate": 9.528091347792308e-07, "loss": 0.4187, "step": 7490 }, { "epoch": 0.9030704394942806, "grad_norm": 5.57934045791626, "learning_rate": 9.525860722866362e-07, "loss": 0.4156, "step": 7500 }, { "epoch": 0.9042745334136063, "grad_norm": 3.860431432723999, "learning_rate": 9.523625100856813e-07, "loss": 0.4078, "step": 7510 }, { "epoch": 0.9054786273329319, "grad_norm": 4.670098781585693, "learning_rate": 9.521384484232054e-07, "loss": 0.4088, "step": 7520 }, { "epoch": 0.9066827212522577, "grad_norm": 4.332681655883789, "learning_rate": 9.519138875465986e-07, "loss": 0.422, "step": 7530 }, { "epoch": 0.9078868151715834, "grad_norm": 4.745145797729492, "learning_rate": 9.516888277038029e-07, "loss": 0.409, "step": 7540 }, { "epoch": 0.9090909090909091, "grad_norm": 4.109555721282959, "learning_rate": 9.514632691433106e-07, "loss": 0.4177, "step": 7550 }, { "epoch": 0.9102950030102348, "grad_norm": 5.039947032928467, "learning_rate": 9.512372121141652e-07, "loss": 0.4132, "step": 7560 }, { "epoch": 0.9114990969295605, "grad_norm": 4.389688968658447, "learning_rate": 9.510106568659599e-07, "loss": 0.4176, "step": 7570 }, { "epoch": 0.9127031908488862, "grad_norm": 4.67106819152832, "learning_rate": 9.50783603648839e-07, "loss": 0.4441, "step": 7580 }, { "epoch": 0.9139072847682119, "grad_norm": 3.6345438957214355, "learning_rate": 9.505560527134956e-07, "loss": 0.395, "step": 7590 }, { "epoch": 0.9151113786875377, "grad_norm": 4.544852256774902, "learning_rate": 9.503280043111728e-07, "loss": 0.4291, "step": 7600 }, { "epoch": 0.9163154726068633, "grad_norm": 5.17853307723999, "learning_rate": 9.50099458693663e-07, "loss": 0.42, "step": 7610 }, { "epoch": 0.917519566526189, "grad_norm": 4.111993789672852, "learning_rate": 9.498704161133073e-07, "loss": 0.4086, "step": 7620 }, { "epoch": 0.9187236604455148, "grad_norm": 3.93930721282959, "learning_rate": 9.49640876822996e-07, "loss": 0.4128, "step": 7630 }, { "epoch": 0.9199277543648404, "grad_norm": 4.442197322845459, "learning_rate": 9.494108410761672e-07, "loss": 0.4107, "step": 7640 }, { "epoch": 0.9211318482841662, "grad_norm": 4.266764163970947, "learning_rate": 9.491803091268077e-07, "loss": 0.4093, "step": 7650 }, { "epoch": 0.9223359422034919, "grad_norm": 4.633232593536377, "learning_rate": 9.48949281229452e-07, "loss": 0.4152, "step": 7660 }, { "epoch": 0.9235400361228175, "grad_norm": 4.4745073318481445, "learning_rate": 9.487177576391818e-07, "loss": 0.4423, "step": 7670 }, { "epoch": 0.9247441300421433, "grad_norm": 3.795365333557129, "learning_rate": 9.484857386116268e-07, "loss": 0.4013, "step": 7680 }, { "epoch": 0.925948223961469, "grad_norm": 4.76974630355835, "learning_rate": 9.48253224402963e-07, "loss": 0.4084, "step": 7690 }, { "epoch": 0.9271523178807947, "grad_norm": 4.584947109222412, "learning_rate": 9.48020215269914e-07, "loss": 0.4237, "step": 7700 }, { "epoch": 0.9283564118001204, "grad_norm": 4.877064228057861, "learning_rate": 9.477867114697486e-07, "loss": 0.409, "step": 7710 }, { "epoch": 0.9295605057194462, "grad_norm": 4.372793674468994, "learning_rate": 9.475527132602832e-07, "loss": 0.4142, "step": 7720 }, { "epoch": 0.9307645996387718, "grad_norm": 4.198723316192627, "learning_rate": 9.473182208998792e-07, "loss": 0.4057, "step": 7730 }, { "epoch": 0.9319686935580975, "grad_norm": 4.460008144378662, "learning_rate": 9.470832346474435e-07, "loss": 0.4235, "step": 7740 }, { "epoch": 0.9331727874774233, "grad_norm": 4.3058905601501465, "learning_rate": 9.468477547624289e-07, "loss": 0.4307, "step": 7750 }, { "epoch": 0.9343768813967489, "grad_norm": 4.6467132568359375, "learning_rate": 9.466117815048329e-07, "loss": 0.4127, "step": 7760 }, { "epoch": 0.9355809753160746, "grad_norm": 5.1491217613220215, "learning_rate": 9.463753151351978e-07, "loss": 0.4181, "step": 7770 }, { "epoch": 0.9367850692354004, "grad_norm": 5.166205883026123, "learning_rate": 9.461383559146102e-07, "loss": 0.4102, "step": 7780 }, { "epoch": 0.937989163154726, "grad_norm": 4.453047275543213, "learning_rate": 9.459009041047012e-07, "loss": 0.4135, "step": 7790 }, { "epoch": 0.9391932570740518, "grad_norm": 5.151276111602783, "learning_rate": 9.456629599676456e-07, "loss": 0.4072, "step": 7800 }, { "epoch": 0.9403973509933775, "grad_norm": 3.8332607746124268, "learning_rate": 9.454245237661615e-07, "loss": 0.4363, "step": 7810 }, { "epoch": 0.9416014449127031, "grad_norm": 4.51285982131958, "learning_rate": 9.451855957635108e-07, "loss": 0.4265, "step": 7820 }, { "epoch": 0.9428055388320289, "grad_norm": 4.756032466888428, "learning_rate": 9.449461762234981e-07, "loss": 0.4322, "step": 7830 }, { "epoch": 0.9440096327513546, "grad_norm": 3.7539730072021484, "learning_rate": 9.447062654104707e-07, "loss": 0.4052, "step": 7840 }, { "epoch": 0.9452137266706803, "grad_norm": 4.208081245422363, "learning_rate": 9.444658635893186e-07, "loss": 0.4101, "step": 7850 }, { "epoch": 0.946417820590006, "grad_norm": 3.338568925857544, "learning_rate": 9.442249710254737e-07, "loss": 0.4195, "step": 7860 }, { "epoch": 0.9476219145093318, "grad_norm": 4.421904563903809, "learning_rate": 9.439835879849096e-07, "loss": 0.4232, "step": 7870 }, { "epoch": 0.9488260084286574, "grad_norm": 4.675938129425049, "learning_rate": 9.437417147341417e-07, "loss": 0.4171, "step": 7880 }, { "epoch": 0.9500301023479831, "grad_norm": 5.047989845275879, "learning_rate": 9.434993515402267e-07, "loss": 0.4083, "step": 7890 }, { "epoch": 0.9512341962673089, "grad_norm": 4.1763916015625, "learning_rate": 9.432564986707621e-07, "loss": 0.3946, "step": 7900 }, { "epoch": 0.9524382901866345, "grad_norm": 4.706401348114014, "learning_rate": 9.43013156393886e-07, "loss": 0.4147, "step": 7910 }, { "epoch": 0.9536423841059603, "grad_norm": 4.3355255126953125, "learning_rate": 9.427693249782769e-07, "loss": 0.4244, "step": 7920 }, { "epoch": 0.954846478025286, "grad_norm": 5.126685619354248, "learning_rate": 9.425250046931537e-07, "loss": 0.4148, "step": 7930 }, { "epoch": 0.9560505719446116, "grad_norm": 3.4599716663360596, "learning_rate": 9.422801958082744e-07, "loss": 0.4237, "step": 7940 }, { "epoch": 0.9572546658639374, "grad_norm": 4.331906795501709, "learning_rate": 9.420348985939371e-07, "loss": 0.4097, "step": 7950 }, { "epoch": 0.9584587597832631, "grad_norm": 4.4911370277404785, "learning_rate": 9.417891133209787e-07, "loss": 0.4029, "step": 7960 }, { "epoch": 0.9596628537025887, "grad_norm": 4.601186275482178, "learning_rate": 9.415428402607754e-07, "loss": 0.4194, "step": 7970 }, { "epoch": 0.9608669476219145, "grad_norm": 4.048129558563232, "learning_rate": 9.412960796852412e-07, "loss": 0.4205, "step": 7980 }, { "epoch": 0.9620710415412402, "grad_norm": 4.8655571937561035, "learning_rate": 9.410488318668292e-07, "loss": 0.4229, "step": 7990 }, { "epoch": 0.963275135460566, "grad_norm": 3.7495744228363037, "learning_rate": 9.408010970785302e-07, "loss": 0.3761, "step": 8000 }, { "epoch": 0.9644792293798916, "grad_norm": 5.3356499671936035, "learning_rate": 9.405528755938725e-07, "loss": 0.4093, "step": 8010 }, { "epoch": 0.9656833232992174, "grad_norm": 5.407442569732666, "learning_rate": 9.403041676869217e-07, "loss": 0.4066, "step": 8020 }, { "epoch": 0.9668874172185431, "grad_norm": 3.860828161239624, "learning_rate": 9.400549736322807e-07, "loss": 0.3982, "step": 8030 }, { "epoch": 0.9680915111378687, "grad_norm": 4.087296962738037, "learning_rate": 9.398052937050892e-07, "loss": 0.3951, "step": 8040 }, { "epoch": 0.9692956050571945, "grad_norm": 4.309443473815918, "learning_rate": 9.395551281810233e-07, "loss": 0.4025, "step": 8050 }, { "epoch": 0.9704996989765202, "grad_norm": 4.655600547790527, "learning_rate": 9.39304477336295e-07, "loss": 0.4187, "step": 8060 }, { "epoch": 0.9717037928958459, "grad_norm": 4.34591007232666, "learning_rate": 9.390533414476527e-07, "loss": 0.4164, "step": 8070 }, { "epoch": 0.9729078868151716, "grad_norm": 4.547005653381348, "learning_rate": 9.388017207923798e-07, "loss": 0.4124, "step": 8080 }, { "epoch": 0.9741119807344973, "grad_norm": 5.021882057189941, "learning_rate": 9.385496156482953e-07, "loss": 0.4289, "step": 8090 }, { "epoch": 0.975316074653823, "grad_norm": 4.165801525115967, "learning_rate": 9.382970262937526e-07, "loss": 0.4058, "step": 8100 }, { "epoch": 0.9765201685731487, "grad_norm": 4.876884460449219, "learning_rate": 9.380439530076407e-07, "loss": 0.43, "step": 8110 }, { "epoch": 0.9777242624924745, "grad_norm": 4.928086757659912, "learning_rate": 9.377903960693818e-07, "loss": 0.423, "step": 8120 }, { "epoch": 0.9789283564118001, "grad_norm": 5.006045341491699, "learning_rate": 9.375363557589331e-07, "loss": 0.4354, "step": 8130 }, { "epoch": 0.9801324503311258, "grad_norm": 3.8796417713165283, "learning_rate": 9.372818323567846e-07, "loss": 0.4132, "step": 8140 }, { "epoch": 0.9813365442504516, "grad_norm": 4.275393962860107, "learning_rate": 9.370268261439604e-07, "loss": 0.4071, "step": 8150 }, { "epoch": 0.9825406381697772, "grad_norm": 5.467378616333008, "learning_rate": 9.367713374020174e-07, "loss": 0.4049, "step": 8160 }, { "epoch": 0.983744732089103, "grad_norm": 3.720611095428467, "learning_rate": 9.365153664130453e-07, "loss": 0.4008, "step": 8170 }, { "epoch": 0.9849488260084287, "grad_norm": 4.539004802703857, "learning_rate": 9.362589134596661e-07, "loss": 0.4118, "step": 8180 }, { "epoch": 0.9861529199277543, "grad_norm": 3.776636838912964, "learning_rate": 9.360019788250342e-07, "loss": 0.4334, "step": 8190 }, { "epoch": 0.9873570138470801, "grad_norm": 3.8309648036956787, "learning_rate": 9.357445627928355e-07, "loss": 0.4179, "step": 8200 }, { "epoch": 0.9885611077664058, "grad_norm": 4.798840045928955, "learning_rate": 9.354866656472881e-07, "loss": 0.4154, "step": 8210 }, { "epoch": 0.9897652016857315, "grad_norm": 4.182796955108643, "learning_rate": 9.352282876731403e-07, "loss": 0.4196, "step": 8220 }, { "epoch": 0.9909692956050572, "grad_norm": 4.6675801277160645, "learning_rate": 9.349694291556723e-07, "loss": 0.4182, "step": 8230 }, { "epoch": 0.9921733895243829, "grad_norm": 4.432309627532959, "learning_rate": 9.347100903806941e-07, "loss": 0.4206, "step": 8240 }, { "epoch": 0.9933774834437086, "grad_norm": 4.616915702819824, "learning_rate": 9.344502716345463e-07, "loss": 0.4153, "step": 8250 }, { "epoch": 0.9945815773630343, "grad_norm": 4.290421485900879, "learning_rate": 9.341899732040994e-07, "loss": 0.4162, "step": 8260 }, { "epoch": 0.9957856712823601, "grad_norm": 4.533810138702393, "learning_rate": 9.339291953767539e-07, "loss": 0.4113, "step": 8270 }, { "epoch": 0.9969897652016857, "grad_norm": 4.271683692932129, "learning_rate": 9.336679384404387e-07, "loss": 0.4166, "step": 8280 }, { "epoch": 0.9981938591210114, "grad_norm": 5.167937755584717, "learning_rate": 9.334062026836127e-07, "loss": 0.385, "step": 8290 }, { "epoch": 0.9993979530403372, "grad_norm": 4.525483131408691, "learning_rate": 9.331439883952628e-07, "loss": 0.3977, "step": 8300 }, { "epoch": 1.0006020469596628, "grad_norm": 4.6253533363342285, "learning_rate": 9.328812958649044e-07, "loss": 0.4123, "step": 8310 }, { "epoch": 1.0018061408789887, "grad_norm": 5.165332317352295, "learning_rate": 9.326181253825812e-07, "loss": 0.3842, "step": 8320 }, { "epoch": 1.0030102347983143, "grad_norm": 3.894192934036255, "learning_rate": 9.323544772388645e-07, "loss": 0.3528, "step": 8330 }, { "epoch": 1.00421432871764, "grad_norm": 3.8034422397613525, "learning_rate": 9.320903517248527e-07, "loss": 0.3817, "step": 8340 }, { "epoch": 1.0054184226369658, "grad_norm": 4.677804946899414, "learning_rate": 9.318257491321714e-07, "loss": 0.3772, "step": 8350 }, { "epoch": 1.0066225165562914, "grad_norm": 4.256035327911377, "learning_rate": 9.315606697529733e-07, "loss": 0.3858, "step": 8360 }, { "epoch": 1.007826610475617, "grad_norm": 4.362122058868408, "learning_rate": 9.312951138799371e-07, "loss": 0.3702, "step": 8370 }, { "epoch": 1.009030704394943, "grad_norm": 4.146007537841797, "learning_rate": 9.310290818062681e-07, "loss": 0.3869, "step": 8380 }, { "epoch": 1.0102347983142685, "grad_norm": 4.480301856994629, "learning_rate": 9.307625738256967e-07, "loss": 0.4082, "step": 8390 }, { "epoch": 1.0114388922335942, "grad_norm": 4.406433582305908, "learning_rate": 9.304955902324793e-07, "loss": 0.3846, "step": 8400 }, { "epoch": 1.01264298615292, "grad_norm": 4.386068820953369, "learning_rate": 9.302281313213972e-07, "loss": 0.3806, "step": 8410 }, { "epoch": 1.0138470800722457, "grad_norm": 4.706192970275879, "learning_rate": 9.299601973877566e-07, "loss": 0.385, "step": 8420 }, { "epoch": 1.0150511739915713, "grad_norm": 5.003023624420166, "learning_rate": 9.29691788727388e-07, "loss": 0.3785, "step": 8430 }, { "epoch": 1.0162552679108972, "grad_norm": 4.118617534637451, "learning_rate": 9.294229056366463e-07, "loss": 0.3649, "step": 8440 }, { "epoch": 1.0174593618302228, "grad_norm": 4.070971488952637, "learning_rate": 9.291535484124101e-07, "loss": 0.3897, "step": 8450 }, { "epoch": 1.0186634557495484, "grad_norm": 4.141367435455322, "learning_rate": 9.288837173520814e-07, "loss": 0.3712, "step": 8460 }, { "epoch": 1.0198675496688743, "grad_norm": 4.00056791305542, "learning_rate": 9.286134127535859e-07, "loss": 0.372, "step": 8470 }, { "epoch": 1.0210716435882, "grad_norm": 4.618954658508301, "learning_rate": 9.283426349153711e-07, "loss": 0.3708, "step": 8480 }, { "epoch": 1.0222757375075255, "grad_norm": 4.50954008102417, "learning_rate": 9.280713841364083e-07, "loss": 0.3831, "step": 8490 }, { "epoch": 1.0234798314268514, "grad_norm": 4.025129795074463, "learning_rate": 9.277996607161898e-07, "loss": 0.3807, "step": 8500 }, { "epoch": 1.024683925346177, "grad_norm": 4.727366924285889, "learning_rate": 9.275274649547307e-07, "loss": 0.3707, "step": 8510 }, { "epoch": 1.0258880192655027, "grad_norm": 4.731372833251953, "learning_rate": 9.272547971525669e-07, "loss": 0.3655, "step": 8520 }, { "epoch": 1.0270921131848285, "grad_norm": 4.237710475921631, "learning_rate": 9.269816576107559e-07, "loss": 0.365, "step": 8530 }, { "epoch": 1.0282962071041541, "grad_norm": 4.294924736022949, "learning_rate": 9.267080466308758e-07, "loss": 0.3774, "step": 8540 }, { "epoch": 1.0295003010234798, "grad_norm": 4.249452590942383, "learning_rate": 9.264339645150256e-07, "loss": 0.372, "step": 8550 }, { "epoch": 1.0307043949428056, "grad_norm": 4.078114986419678, "learning_rate": 9.26159411565824e-07, "loss": 0.3736, "step": 8560 }, { "epoch": 1.0319084888621313, "grad_norm": 5.815018177032471, "learning_rate": 9.258843880864101e-07, "loss": 0.3708, "step": 8570 }, { "epoch": 1.033112582781457, "grad_norm": 4.562671184539795, "learning_rate": 9.256088943804421e-07, "loss": 0.3926, "step": 8580 }, { "epoch": 1.0343166767007828, "grad_norm": 5.159687042236328, "learning_rate": 9.253329307520974e-07, "loss": 0.3754, "step": 8590 }, { "epoch": 1.0355207706201084, "grad_norm": 4.418034076690674, "learning_rate": 9.250564975060725e-07, "loss": 0.3756, "step": 8600 }, { "epoch": 1.036724864539434, "grad_norm": 4.661262035369873, "learning_rate": 9.247795949475823e-07, "loss": 0.3854, "step": 8610 }, { "epoch": 1.0379289584587599, "grad_norm": 4.768362522125244, "learning_rate": 9.245022233823598e-07, "loss": 0.3798, "step": 8620 }, { "epoch": 1.0391330523780855, "grad_norm": 4.090877056121826, "learning_rate": 9.242243831166558e-07, "loss": 0.3883, "step": 8630 }, { "epoch": 1.0403371462974111, "grad_norm": 4.2338032722473145, "learning_rate": 9.23946074457239e-07, "loss": 0.3784, "step": 8640 }, { "epoch": 1.041541240216737, "grad_norm": 4.812314510345459, "learning_rate": 9.236672977113947e-07, "loss": 0.3938, "step": 8650 }, { "epoch": 1.0427453341360626, "grad_norm": 5.055131435394287, "learning_rate": 9.233880531869253e-07, "loss": 0.3784, "step": 8660 }, { "epoch": 1.0439494280553883, "grad_norm": 4.143119812011719, "learning_rate": 9.231083411921497e-07, "loss": 0.368, "step": 8670 }, { "epoch": 1.0451535219747141, "grad_norm": 4.840368270874023, "learning_rate": 9.228281620359029e-07, "loss": 0.3771, "step": 8680 }, { "epoch": 1.0463576158940397, "grad_norm": 4.708595275878906, "learning_rate": 9.225475160275358e-07, "loss": 0.3572, "step": 8690 }, { "epoch": 1.0475617098133654, "grad_norm": 4.715826511383057, "learning_rate": 9.222664034769145e-07, "loss": 0.3929, "step": 8700 }, { "epoch": 1.0487658037326912, "grad_norm": 4.969057559967041, "learning_rate": 9.219848246944205e-07, "loss": 0.3895, "step": 8710 }, { "epoch": 1.0499698976520169, "grad_norm": 3.3560914993286133, "learning_rate": 9.217027799909499e-07, "loss": 0.379, "step": 8720 }, { "epoch": 1.0511739915713425, "grad_norm": 4.196804046630859, "learning_rate": 9.214202696779134e-07, "loss": 0.3692, "step": 8730 }, { "epoch": 1.0523780854906684, "grad_norm": 4.214865684509277, "learning_rate": 9.211372940672355e-07, "loss": 0.3673, "step": 8740 }, { "epoch": 1.053582179409994, "grad_norm": 4.642685890197754, "learning_rate": 9.208538534713548e-07, "loss": 0.3961, "step": 8750 }, { "epoch": 1.0547862733293196, "grad_norm": 4.921828269958496, "learning_rate": 9.20569948203223e-07, "loss": 0.3616, "step": 8760 }, { "epoch": 1.0559903672486455, "grad_norm": 3.9251582622528076, "learning_rate": 9.202855785763051e-07, "loss": 0.3958, "step": 8770 }, { "epoch": 1.0571944611679711, "grad_norm": 4.475203990936279, "learning_rate": 9.200007449045785e-07, "loss": 0.3782, "step": 8780 }, { "epoch": 1.0583985550872967, "grad_norm": 4.735462665557861, "learning_rate": 9.197154475025333e-07, "loss": 0.3571, "step": 8790 }, { "epoch": 1.0596026490066226, "grad_norm": 4.720487117767334, "learning_rate": 9.194296866851712e-07, "loss": 0.3632, "step": 8800 }, { "epoch": 1.0608067429259482, "grad_norm": 4.291871547698975, "learning_rate": 9.191434627680063e-07, "loss": 0.3722, "step": 8810 }, { "epoch": 1.0620108368452739, "grad_norm": 4.449291229248047, "learning_rate": 9.188567760670631e-07, "loss": 0.3857, "step": 8820 }, { "epoch": 1.0632149307645997, "grad_norm": 4.42001485824585, "learning_rate": 9.185696268988776e-07, "loss": 0.3798, "step": 8830 }, { "epoch": 1.0644190246839254, "grad_norm": 4.68118953704834, "learning_rate": 9.182820155804965e-07, "loss": 0.364, "step": 8840 }, { "epoch": 1.065623118603251, "grad_norm": 4.831759929656982, "learning_rate": 9.179939424294763e-07, "loss": 0.3656, "step": 8850 }, { "epoch": 1.0668272125225768, "grad_norm": 4.51068115234375, "learning_rate": 9.177054077638839e-07, "loss": 0.3779, "step": 8860 }, { "epoch": 1.0680313064419025, "grad_norm": 4.588883399963379, "learning_rate": 9.174164119022956e-07, "loss": 0.3766, "step": 8870 }, { "epoch": 1.069235400361228, "grad_norm": 4.487590789794922, "learning_rate": 9.171269551637968e-07, "loss": 0.3676, "step": 8880 }, { "epoch": 1.070439494280554, "grad_norm": 5.2501702308654785, "learning_rate": 9.168370378679819e-07, "loss": 0.3764, "step": 8890 }, { "epoch": 1.0716435881998796, "grad_norm": 4.199159145355225, "learning_rate": 9.165466603349539e-07, "loss": 0.3736, "step": 8900 }, { "epoch": 1.0728476821192052, "grad_norm": 4.138830184936523, "learning_rate": 9.162558228853235e-07, "loss": 0.3745, "step": 8910 }, { "epoch": 1.074051776038531, "grad_norm": 4.139305114746094, "learning_rate": 9.159645258402095e-07, "loss": 0.3693, "step": 8920 }, { "epoch": 1.0752558699578567, "grad_norm": 5.9480438232421875, "learning_rate": 9.156727695212386e-07, "loss": 0.3644, "step": 8930 }, { "epoch": 1.0764599638771823, "grad_norm": 4.251008987426758, "learning_rate": 9.153805542505438e-07, "loss": 0.3844, "step": 8940 }, { "epoch": 1.0776640577965082, "grad_norm": 4.630239486694336, "learning_rate": 9.150878803507654e-07, "loss": 0.3699, "step": 8950 }, { "epoch": 1.0788681517158338, "grad_norm": 5.171538829803467, "learning_rate": 9.147947481450498e-07, "loss": 0.4026, "step": 8960 }, { "epoch": 1.0800722456351595, "grad_norm": 4.777914524078369, "learning_rate": 9.145011579570491e-07, "loss": 0.3642, "step": 8970 }, { "epoch": 1.0812763395544853, "grad_norm": 5.336880207061768, "learning_rate": 9.142071101109224e-07, "loss": 0.3926, "step": 8980 }, { "epoch": 1.082480433473811, "grad_norm": 3.8747503757476807, "learning_rate": 9.139126049313321e-07, "loss": 0.3792, "step": 8990 }, { "epoch": 1.0836845273931366, "grad_norm": 4.528430461883545, "learning_rate": 9.136176427434475e-07, "loss": 0.3735, "step": 9000 }, { "epoch": 1.0848886213124624, "grad_norm": 5.05435848236084, "learning_rate": 9.133222238729412e-07, "loss": 0.3604, "step": 9010 }, { "epoch": 1.086092715231788, "grad_norm": 4.354115962982178, "learning_rate": 9.130263486459904e-07, "loss": 0.3995, "step": 9020 }, { "epoch": 1.0872968091511137, "grad_norm": 5.124173164367676, "learning_rate": 9.127300173892763e-07, "loss": 0.3622, "step": 9030 }, { "epoch": 1.0885009030704396, "grad_norm": 4.644625186920166, "learning_rate": 9.124332304299838e-07, "loss": 0.3704, "step": 9040 }, { "epoch": 1.0897049969897652, "grad_norm": 4.276961803436279, "learning_rate": 9.121359880958002e-07, "loss": 0.3771, "step": 9050 }, { "epoch": 1.0909090909090908, "grad_norm": 4.1808648109436035, "learning_rate": 9.118382907149163e-07, "loss": 0.3638, "step": 9060 }, { "epoch": 1.0921131848284167, "grad_norm": 4.921030521392822, "learning_rate": 9.115401386160251e-07, "loss": 0.3633, "step": 9070 }, { "epoch": 1.0933172787477423, "grad_norm": 4.0871663093566895, "learning_rate": 9.112415321283217e-07, "loss": 0.358, "step": 9080 }, { "epoch": 1.094521372667068, "grad_norm": 3.419311046600342, "learning_rate": 9.10942471581503e-07, "loss": 0.3601, "step": 9090 }, { "epoch": 1.0957254665863938, "grad_norm": 4.138514518737793, "learning_rate": 9.106429573057666e-07, "loss": 0.3764, "step": 9100 }, { "epoch": 1.0969295605057194, "grad_norm": 5.0829691886901855, "learning_rate": 9.10342989631812e-07, "loss": 0.3756, "step": 9110 }, { "epoch": 1.098133654425045, "grad_norm": 4.330390930175781, "learning_rate": 9.100425688908386e-07, "loss": 0.3587, "step": 9120 }, { "epoch": 1.099337748344371, "grad_norm": 5.1065592765808105, "learning_rate": 9.097416954145465e-07, "loss": 0.38, "step": 9130 }, { "epoch": 1.1005418422636966, "grad_norm": 4.509856224060059, "learning_rate": 9.094403695351352e-07, "loss": 0.38, "step": 9140 }, { "epoch": 1.1017459361830222, "grad_norm": 5.324617862701416, "learning_rate": 9.091385915853042e-07, "loss": 0.3658, "step": 9150 }, { "epoch": 1.102950030102348, "grad_norm": 5.061591148376465, "learning_rate": 9.088363618982521e-07, "loss": 0.3723, "step": 9160 }, { "epoch": 1.1041541240216737, "grad_norm": 5.028870582580566, "learning_rate": 9.085336808076758e-07, "loss": 0.3837, "step": 9170 }, { "epoch": 1.1053582179409993, "grad_norm": 4.214852809906006, "learning_rate": 9.082305486477708e-07, "loss": 0.3681, "step": 9180 }, { "epoch": 1.1065623118603252, "grad_norm": 4.787420272827148, "learning_rate": 9.079269657532311e-07, "loss": 0.3843, "step": 9190 }, { "epoch": 1.1077664057796508, "grad_norm": 3.78640079498291, "learning_rate": 9.076229324592477e-07, "loss": 0.3747, "step": 9200 }, { "epoch": 1.1089704996989764, "grad_norm": 4.786212921142578, "learning_rate": 9.073184491015094e-07, "loss": 0.3684, "step": 9210 }, { "epoch": 1.1101745936183023, "grad_norm": 3.932164430618286, "learning_rate": 9.070135160162015e-07, "loss": 0.3662, "step": 9220 }, { "epoch": 1.111378687537628, "grad_norm": 4.249774932861328, "learning_rate": 9.067081335400061e-07, "loss": 0.3722, "step": 9230 }, { "epoch": 1.1125827814569536, "grad_norm": 4.269323348999023, "learning_rate": 9.064023020101015e-07, "loss": 0.3765, "step": 9240 }, { "epoch": 1.1137868753762794, "grad_norm": 4.183831214904785, "learning_rate": 9.060960217641617e-07, "loss": 0.3657, "step": 9250 }, { "epoch": 1.114990969295605, "grad_norm": 4.336716175079346, "learning_rate": 9.057892931403563e-07, "loss": 0.3869, "step": 9260 }, { "epoch": 1.1161950632149307, "grad_norm": 4.948883533477783, "learning_rate": 9.054821164773498e-07, "loss": 0.3823, "step": 9270 }, { "epoch": 1.1173991571342565, "grad_norm": 4.687775611877441, "learning_rate": 9.051744921143014e-07, "loss": 0.3853, "step": 9280 }, { "epoch": 1.1186032510535822, "grad_norm": 4.803307056427002, "learning_rate": 9.048664203908647e-07, "loss": 0.3609, "step": 9290 }, { "epoch": 1.1198073449729078, "grad_norm": 4.377987861633301, "learning_rate": 9.045579016471871e-07, "loss": 0.3873, "step": 9300 }, { "epoch": 1.1210114388922336, "grad_norm": 4.264991760253906, "learning_rate": 9.042489362239096e-07, "loss": 0.3663, "step": 9310 }, { "epoch": 1.1222155328115593, "grad_norm": 4.69897985458374, "learning_rate": 9.039395244621667e-07, "loss": 0.3797, "step": 9320 }, { "epoch": 1.123419626730885, "grad_norm": 4.6573357582092285, "learning_rate": 9.036296667035853e-07, "loss": 0.3774, "step": 9330 }, { "epoch": 1.1246237206502108, "grad_norm": 4.6396307945251465, "learning_rate": 9.033193632902848e-07, "loss": 0.3708, "step": 9340 }, { "epoch": 1.1258278145695364, "grad_norm": 4.781702518463135, "learning_rate": 9.030086145648767e-07, "loss": 0.366, "step": 9350 }, { "epoch": 1.127031908488862, "grad_norm": 3.859081745147705, "learning_rate": 9.026974208704645e-07, "loss": 0.3592, "step": 9360 }, { "epoch": 1.1282360024081879, "grad_norm": 3.917964220046997, "learning_rate": 9.023857825506425e-07, "loss": 0.3828, "step": 9370 }, { "epoch": 1.1294400963275135, "grad_norm": 4.249654293060303, "learning_rate": 9.020736999494962e-07, "loss": 0.3816, "step": 9380 }, { "epoch": 1.1306441902468394, "grad_norm": 4.181410789489746, "learning_rate": 9.017611734116015e-07, "loss": 0.3881, "step": 9390 }, { "epoch": 1.131848284166165, "grad_norm": 4.529959678649902, "learning_rate": 9.014482032820245e-07, "loss": 0.3866, "step": 9400 }, { "epoch": 1.1330523780854906, "grad_norm": 4.115703105926514, "learning_rate": 9.011347899063212e-07, "loss": 0.4017, "step": 9410 }, { "epoch": 1.1342564720048163, "grad_norm": 5.330405235290527, "learning_rate": 9.008209336305369e-07, "loss": 0.382, "step": 9420 }, { "epoch": 1.1354605659241421, "grad_norm": 4.53489351272583, "learning_rate": 9.005066348012058e-07, "loss": 0.4002, "step": 9430 }, { "epoch": 1.1366646598434678, "grad_norm": 4.984791278839111, "learning_rate": 9.00191893765351e-07, "loss": 0.3699, "step": 9440 }, { "epoch": 1.1378687537627936, "grad_norm": 4.83209753036499, "learning_rate": 8.998767108704836e-07, "loss": 0.3612, "step": 9450 }, { "epoch": 1.1390728476821192, "grad_norm": 4.549959659576416, "learning_rate": 8.995610864646029e-07, "loss": 0.3552, "step": 9460 }, { "epoch": 1.1402769416014449, "grad_norm": 4.30760383605957, "learning_rate": 8.992450208961949e-07, "loss": 0.3796, "step": 9470 }, { "epoch": 1.1414810355207705, "grad_norm": 4.3470234870910645, "learning_rate": 8.989285145142338e-07, "loss": 0.3868, "step": 9480 }, { "epoch": 1.1426851294400964, "grad_norm": 4.755895614624023, "learning_rate": 8.986115676681796e-07, "loss": 0.3867, "step": 9490 }, { "epoch": 1.143889223359422, "grad_norm": 4.874184608459473, "learning_rate": 8.982941807079791e-07, "loss": 0.3866, "step": 9500 }, { "epoch": 1.1450933172787479, "grad_norm": 4.068636894226074, "learning_rate": 8.979763539840649e-07, "loss": 0.3558, "step": 9510 }, { "epoch": 1.1462974111980735, "grad_norm": 4.380646705627441, "learning_rate": 8.976580878473552e-07, "loss": 0.3704, "step": 9520 }, { "epoch": 1.1475015051173991, "grad_norm": 4.3028950691223145, "learning_rate": 8.973393826492531e-07, "loss": 0.3995, "step": 9530 }, { "epoch": 1.1487055990367248, "grad_norm": 4.423670768737793, "learning_rate": 8.97020238741647e-07, "loss": 0.38, "step": 9540 }, { "epoch": 1.1499096929560506, "grad_norm": 4.808249473571777, "learning_rate": 8.967006564769093e-07, "loss": 0.3779, "step": 9550 }, { "epoch": 1.1511137868753762, "grad_norm": 5.734920501708984, "learning_rate": 8.963806362078963e-07, "loss": 0.3713, "step": 9560 }, { "epoch": 1.152317880794702, "grad_norm": 4.730371952056885, "learning_rate": 8.960601782879483e-07, "loss": 0.3583, "step": 9570 }, { "epoch": 1.1535219747140277, "grad_norm": 5.035944938659668, "learning_rate": 8.957392830708886e-07, "loss": 0.39, "step": 9580 }, { "epoch": 1.1547260686333534, "grad_norm": 4.2402119636535645, "learning_rate": 8.95417950911023e-07, "loss": 0.3655, "step": 9590 }, { "epoch": 1.155930162552679, "grad_norm": 3.995563507080078, "learning_rate": 8.950961821631406e-07, "loss": 0.3657, "step": 9600 }, { "epoch": 1.1571342564720049, "grad_norm": 5.285823822021484, "learning_rate": 8.947739771825117e-07, "loss": 0.3825, "step": 9610 }, { "epoch": 1.1583383503913305, "grad_norm": 4.332102298736572, "learning_rate": 8.944513363248885e-07, "loss": 0.3808, "step": 9620 }, { "epoch": 1.1595424443106563, "grad_norm": 4.714332103729248, "learning_rate": 8.941282599465047e-07, "loss": 0.3904, "step": 9630 }, { "epoch": 1.160746538229982, "grad_norm": 3.8975484371185303, "learning_rate": 8.938047484040748e-07, "loss": 0.3559, "step": 9640 }, { "epoch": 1.1619506321493076, "grad_norm": 4.700948238372803, "learning_rate": 8.934808020547935e-07, "loss": 0.3676, "step": 9650 }, { "epoch": 1.1631547260686332, "grad_norm": 4.926019191741943, "learning_rate": 8.931564212563356e-07, "loss": 0.3913, "step": 9660 }, { "epoch": 1.164358819987959, "grad_norm": 4.402989864349365, "learning_rate": 8.92831606366856e-07, "loss": 0.3672, "step": 9670 }, { "epoch": 1.1655629139072847, "grad_norm": 4.371270656585693, "learning_rate": 8.925063577449886e-07, "loss": 0.3529, "step": 9680 }, { "epoch": 1.1667670078266106, "grad_norm": 5.072457790374756, "learning_rate": 8.92180675749846e-07, "loss": 0.3703, "step": 9690 }, { "epoch": 1.1679711017459362, "grad_norm": 5.789607524871826, "learning_rate": 8.918545607410197e-07, "loss": 0.3618, "step": 9700 }, { "epoch": 1.1691751956652618, "grad_norm": 4.929603576660156, "learning_rate": 8.91528013078579e-07, "loss": 0.3632, "step": 9710 }, { "epoch": 1.1703792895845875, "grad_norm": 4.385134220123291, "learning_rate": 8.91201033123071e-07, "loss": 0.3726, "step": 9720 }, { "epoch": 1.1715833835039133, "grad_norm": 4.493896961212158, "learning_rate": 8.908736212355201e-07, "loss": 0.396, "step": 9730 }, { "epoch": 1.172787477423239, "grad_norm": 5.4288859367370605, "learning_rate": 8.905457777774278e-07, "loss": 0.3693, "step": 9740 }, { "epoch": 1.1739915713425648, "grad_norm": 4.925263404846191, "learning_rate": 8.902175031107717e-07, "loss": 0.3809, "step": 9750 }, { "epoch": 1.1751956652618905, "grad_norm": 4.450766086578369, "learning_rate": 8.898887975980058e-07, "loss": 0.3747, "step": 9760 }, { "epoch": 1.176399759181216, "grad_norm": 5.003162860870361, "learning_rate": 8.895596616020595e-07, "loss": 0.3763, "step": 9770 }, { "epoch": 1.1776038531005417, "grad_norm": 5.204108238220215, "learning_rate": 8.89230095486338e-07, "loss": 0.3983, "step": 9780 }, { "epoch": 1.1788079470198676, "grad_norm": 5.1089372634887695, "learning_rate": 8.889000996147213e-07, "loss": 0.3757, "step": 9790 }, { "epoch": 1.1800120409391932, "grad_norm": 5.394412994384766, "learning_rate": 8.885696743515632e-07, "loss": 0.3764, "step": 9800 }, { "epoch": 1.181216134858519, "grad_norm": 4.811611175537109, "learning_rate": 8.882388200616926e-07, "loss": 0.3686, "step": 9810 }, { "epoch": 1.1824202287778447, "grad_norm": 4.908543109893799, "learning_rate": 8.879075371104113e-07, "loss": 0.368, "step": 9820 }, { "epoch": 1.1836243226971703, "grad_norm": 4.540360450744629, "learning_rate": 8.875758258634949e-07, "loss": 0.3698, "step": 9830 }, { "epoch": 1.1848284166164962, "grad_norm": 4.033935546875, "learning_rate": 8.872436866871917e-07, "loss": 0.3522, "step": 9840 }, { "epoch": 1.1860325105358218, "grad_norm": 5.225256443023682, "learning_rate": 8.869111199482225e-07, "loss": 0.3837, "step": 9850 }, { "epoch": 1.1872366044551474, "grad_norm": 4.02462100982666, "learning_rate": 8.865781260137801e-07, "loss": 0.381, "step": 9860 }, { "epoch": 1.1884406983744733, "grad_norm": 4.905768871307373, "learning_rate": 8.862447052515291e-07, "loss": 0.384, "step": 9870 }, { "epoch": 1.189644792293799, "grad_norm": 4.620838642120361, "learning_rate": 8.859108580296053e-07, "loss": 0.3533, "step": 9880 }, { "epoch": 1.1908488862131246, "grad_norm": 4.312672138214111, "learning_rate": 8.855765847166154e-07, "loss": 0.3591, "step": 9890 }, { "epoch": 1.1920529801324504, "grad_norm": 4.337918758392334, "learning_rate": 8.852418856816365e-07, "loss": 0.374, "step": 9900 }, { "epoch": 1.193257074051776, "grad_norm": 4.154960632324219, "learning_rate": 8.849067612942158e-07, "loss": 0.3551, "step": 9910 }, { "epoch": 1.1944611679711017, "grad_norm": 4.451188564300537, "learning_rate": 8.845712119243701e-07, "loss": 0.3699, "step": 9920 }, { "epoch": 1.1956652618904275, "grad_norm": 5.723966598510742, "learning_rate": 8.842352379425853e-07, "loss": 0.3875, "step": 9930 }, { "epoch": 1.1968693558097532, "grad_norm": 4.982749938964844, "learning_rate": 8.838988397198166e-07, "loss": 0.375, "step": 9940 }, { "epoch": 1.1980734497290788, "grad_norm": 4.661801338195801, "learning_rate": 8.835620176274869e-07, "loss": 0.3721, "step": 9950 }, { "epoch": 1.1992775436484047, "grad_norm": 5.228112697601318, "learning_rate": 8.832247720374879e-07, "loss": 0.366, "step": 9960 }, { "epoch": 1.2004816375677303, "grad_norm": 4.082928657531738, "learning_rate": 8.828871033221782e-07, "loss": 0.3621, "step": 9970 }, { "epoch": 1.201685731487056, "grad_norm": 3.532892942428589, "learning_rate": 8.82549011854384e-07, "loss": 0.365, "step": 9980 }, { "epoch": 1.2028898254063818, "grad_norm": 4.03758430480957, "learning_rate": 8.822104980073978e-07, "loss": 0.3786, "step": 9990 }, { "epoch": 1.2040939193257074, "grad_norm": 4.233405590057373, "learning_rate": 8.818715621549792e-07, "loss": 0.3664, "step": 10000 }, { "epoch": 1.205298013245033, "grad_norm": 4.029031753540039, "learning_rate": 8.815322046713531e-07, "loss": 0.3655, "step": 10010 }, { "epoch": 1.206502107164359, "grad_norm": 4.398824691772461, "learning_rate": 8.811924259312102e-07, "loss": 0.3818, "step": 10020 }, { "epoch": 1.2077062010836845, "grad_norm": 4.394994258880615, "learning_rate": 8.808522263097063e-07, "loss": 0.3875, "step": 10030 }, { "epoch": 1.2089102950030102, "grad_norm": 4.941735744476318, "learning_rate": 8.805116061824617e-07, "loss": 0.3635, "step": 10040 }, { "epoch": 1.210114388922336, "grad_norm": 4.183002471923828, "learning_rate": 8.801705659255616e-07, "loss": 0.3718, "step": 10050 }, { "epoch": 1.2113184828416617, "grad_norm": 3.9239907264709473, "learning_rate": 8.798291059155541e-07, "loss": 0.3562, "step": 10060 }, { "epoch": 1.2125225767609873, "grad_norm": 4.399021625518799, "learning_rate": 8.794872265294516e-07, "loss": 0.3577, "step": 10070 }, { "epoch": 1.2137266706803131, "grad_norm": 3.739692211151123, "learning_rate": 8.791449281447291e-07, "loss": 0.3715, "step": 10080 }, { "epoch": 1.2149307645996388, "grad_norm": 6.101430416107178, "learning_rate": 8.788022111393245e-07, "loss": 0.3791, "step": 10090 }, { "epoch": 1.2161348585189644, "grad_norm": 4.473653793334961, "learning_rate": 8.784590758916377e-07, "loss": 0.3733, "step": 10100 }, { "epoch": 1.2173389524382903, "grad_norm": 5.723465919494629, "learning_rate": 8.781155227805304e-07, "loss": 0.376, "step": 10110 }, { "epoch": 1.218543046357616, "grad_norm": 6.045252323150635, "learning_rate": 8.777715521853257e-07, "loss": 0.383, "step": 10120 }, { "epoch": 1.2197471402769415, "grad_norm": 4.978476524353027, "learning_rate": 8.774271644858078e-07, "loss": 0.3902, "step": 10130 }, { "epoch": 1.2209512341962674, "grad_norm": 4.655144691467285, "learning_rate": 8.770823600622212e-07, "loss": 0.3832, "step": 10140 }, { "epoch": 1.222155328115593, "grad_norm": 4.3407883644104, "learning_rate": 8.767371392952708e-07, "loss": 0.3582, "step": 10150 }, { "epoch": 1.2233594220349187, "grad_norm": 4.6942596435546875, "learning_rate": 8.763915025661206e-07, "loss": 0.3755, "step": 10160 }, { "epoch": 1.2245635159542445, "grad_norm": 4.285218715667725, "learning_rate": 8.760454502563947e-07, "loss": 0.3776, "step": 10170 }, { "epoch": 1.2257676098735701, "grad_norm": 4.890243053436279, "learning_rate": 8.756989827481755e-07, "loss": 0.37, "step": 10180 }, { "epoch": 1.2269717037928958, "grad_norm": 4.752533912658691, "learning_rate": 8.753521004240038e-07, "loss": 0.3717, "step": 10190 }, { "epoch": 1.2281757977122216, "grad_norm": 4.077126502990723, "learning_rate": 8.750048036668789e-07, "loss": 0.3811, "step": 10200 }, { "epoch": 1.2293798916315473, "grad_norm": 3.9369449615478516, "learning_rate": 8.74657092860257e-07, "loss": 0.3737, "step": 10210 }, { "epoch": 1.230583985550873, "grad_norm": 4.381350040435791, "learning_rate": 8.74308968388052e-07, "loss": 0.3528, "step": 10220 }, { "epoch": 1.2317880794701987, "grad_norm": 4.581336975097656, "learning_rate": 8.739604306346342e-07, "loss": 0.3728, "step": 10230 }, { "epoch": 1.2329921733895244, "grad_norm": 5.837801933288574, "learning_rate": 8.736114799848306e-07, "loss": 0.3812, "step": 10240 }, { "epoch": 1.23419626730885, "grad_norm": 4.347848892211914, "learning_rate": 8.732621168239236e-07, "loss": 0.3818, "step": 10250 }, { "epoch": 1.2354003612281759, "grad_norm": 4.717700004577637, "learning_rate": 8.729123415376514e-07, "loss": 0.3516, "step": 10260 }, { "epoch": 1.2366044551475015, "grad_norm": 4.809170722961426, "learning_rate": 8.725621545122072e-07, "loss": 0.3642, "step": 10270 }, { "epoch": 1.2378085490668271, "grad_norm": 4.547823905944824, "learning_rate": 8.722115561342387e-07, "loss": 0.3791, "step": 10280 }, { "epoch": 1.239012642986153, "grad_norm": 4.235891819000244, "learning_rate": 8.718605467908478e-07, "loss": 0.3663, "step": 10290 }, { "epoch": 1.2402167369054786, "grad_norm": 4.648200035095215, "learning_rate": 8.715091268695901e-07, "loss": 0.3623, "step": 10300 }, { "epoch": 1.2414208308248043, "grad_norm": 4.6003737449646, "learning_rate": 8.711572967584747e-07, "loss": 0.378, "step": 10310 }, { "epoch": 1.24262492474413, "grad_norm": 4.921525001525879, "learning_rate": 8.708050568459635e-07, "loss": 0.3602, "step": 10320 }, { "epoch": 1.2438290186634557, "grad_norm": 4.075355052947998, "learning_rate": 8.704524075209709e-07, "loss": 0.3698, "step": 10330 }, { "epoch": 1.2450331125827814, "grad_norm": 5.707545280456543, "learning_rate": 8.700993491728634e-07, "loss": 0.3538, "step": 10340 }, { "epoch": 1.2462372065021072, "grad_norm": 4.669870853424072, "learning_rate": 8.697458821914587e-07, "loss": 0.3685, "step": 10350 }, { "epoch": 1.2474413004214329, "grad_norm": 4.101998329162598, "learning_rate": 8.693920069670264e-07, "loss": 0.3823, "step": 10360 }, { "epoch": 1.2486453943407585, "grad_norm": 4.307315349578857, "learning_rate": 8.690377238902862e-07, "loss": 0.3718, "step": 10370 }, { "epoch": 1.2498494882600844, "grad_norm": 4.498570442199707, "learning_rate": 8.686830333524084e-07, "loss": 0.3894, "step": 10380 }, { "epoch": 1.25105358217941, "grad_norm": 4.348161697387695, "learning_rate": 8.68327935745013e-07, "loss": 0.3661, "step": 10390 }, { "epoch": 1.2522576760987358, "grad_norm": 4.509785175323486, "learning_rate": 8.679724314601701e-07, "loss": 0.3691, "step": 10400 }, { "epoch": 1.2534617700180615, "grad_norm": 4.251500606536865, "learning_rate": 8.676165208903978e-07, "loss": 0.3489, "step": 10410 }, { "epoch": 1.254665863937387, "grad_norm": 3.91599702835083, "learning_rate": 8.672602044286637e-07, "loss": 0.3835, "step": 10420 }, { "epoch": 1.2558699578567127, "grad_norm": 4.641791820526123, "learning_rate": 8.66903482468383e-07, "loss": 0.3676, "step": 10430 }, { "epoch": 1.2570740517760386, "grad_norm": 6.0034499168396, "learning_rate": 8.665463554034187e-07, "loss": 0.3728, "step": 10440 }, { "epoch": 1.2582781456953642, "grad_norm": 5.09488582611084, "learning_rate": 8.661888236280813e-07, "loss": 0.3718, "step": 10450 }, { "epoch": 1.25948223961469, "grad_norm": 5.368484020233154, "learning_rate": 8.658308875371279e-07, "loss": 0.3908, "step": 10460 }, { "epoch": 1.2606863335340157, "grad_norm": 5.200775623321533, "learning_rate": 8.654725475257621e-07, "loss": 0.3655, "step": 10470 }, { "epoch": 1.2618904274533413, "grad_norm": 4.358388900756836, "learning_rate": 8.651138039896338e-07, "loss": 0.3748, "step": 10480 }, { "epoch": 1.263094521372667, "grad_norm": 4.452842712402344, "learning_rate": 8.647546573248377e-07, "loss": 0.3731, "step": 10490 }, { "epoch": 1.2642986152919928, "grad_norm": 4.0504584312438965, "learning_rate": 8.643951079279144e-07, "loss": 0.3767, "step": 10500 }, { "epoch": 1.2655027092113185, "grad_norm": 5.186153411865234, "learning_rate": 8.640351561958486e-07, "loss": 0.362, "step": 10510 }, { "epoch": 1.2667068031306443, "grad_norm": 4.57370662689209, "learning_rate": 8.636748025260696e-07, "loss": 0.3766, "step": 10520 }, { "epoch": 1.26791089704997, "grad_norm": 5.416035175323486, "learning_rate": 8.633140473164502e-07, "loss": 0.3653, "step": 10530 }, { "epoch": 1.2691149909692956, "grad_norm": 4.351581573486328, "learning_rate": 8.629528909653065e-07, "loss": 0.3556, "step": 10540 }, { "epoch": 1.2703190848886212, "grad_norm": 5.305721759796143, "learning_rate": 8.625913338713982e-07, "loss": 0.3873, "step": 10550 }, { "epoch": 1.271523178807947, "grad_norm": 3.8972630500793457, "learning_rate": 8.622293764339264e-07, "loss": 0.3812, "step": 10560 }, { "epoch": 1.2727272727272727, "grad_norm": 5.005763530731201, "learning_rate": 8.61867019052535e-07, "loss": 0.3761, "step": 10570 }, { "epoch": 1.2739313666465986, "grad_norm": 4.1513848304748535, "learning_rate": 8.615042621273093e-07, "loss": 0.3525, "step": 10580 }, { "epoch": 1.2751354605659242, "grad_norm": 5.166493892669678, "learning_rate": 8.611411060587757e-07, "loss": 0.3866, "step": 10590 }, { "epoch": 1.2763395544852498, "grad_norm": 4.168553352355957, "learning_rate": 8.60777551247901e-07, "loss": 0.3735, "step": 10600 }, { "epoch": 1.2775436484045755, "grad_norm": 4.891838550567627, "learning_rate": 8.60413598096093e-07, "loss": 0.3603, "step": 10610 }, { "epoch": 1.2787477423239013, "grad_norm": 4.317160606384277, "learning_rate": 8.600492470051983e-07, "loss": 0.3765, "step": 10620 }, { "epoch": 1.279951836243227, "grad_norm": 4.056015968322754, "learning_rate": 8.59684498377504e-07, "loss": 0.3704, "step": 10630 }, { "epoch": 1.2811559301625528, "grad_norm": 4.8416242599487305, "learning_rate": 8.593193526157354e-07, "loss": 0.3475, "step": 10640 }, { "epoch": 1.2823600240818784, "grad_norm": 5.178276062011719, "learning_rate": 8.589538101230564e-07, "loss": 0.3823, "step": 10650 }, { "epoch": 1.283564118001204, "grad_norm": 4.507132053375244, "learning_rate": 8.58587871303069e-07, "loss": 0.3597, "step": 10660 }, { "epoch": 1.2847682119205297, "grad_norm": 4.44130277633667, "learning_rate": 8.582215365598127e-07, "loss": 0.3748, "step": 10670 }, { "epoch": 1.2859723058398556, "grad_norm": 4.559373378753662, "learning_rate": 8.578548062977644e-07, "loss": 0.3684, "step": 10680 }, { "epoch": 1.2871763997591812, "grad_norm": 4.59391450881958, "learning_rate": 8.574876809218374e-07, "loss": 0.3729, "step": 10690 }, { "epoch": 1.288380493678507, "grad_norm": 4.64610481262207, "learning_rate": 8.571201608373815e-07, "loss": 0.367, "step": 10700 }, { "epoch": 1.2895845875978327, "grad_norm": 5.637624740600586, "learning_rate": 8.56752246450182e-07, "loss": 0.3799, "step": 10710 }, { "epoch": 1.2907886815171583, "grad_norm": 4.1183271408081055, "learning_rate": 8.563839381664599e-07, "loss": 0.3744, "step": 10720 }, { "epoch": 1.291992775436484, "grad_norm": 5.679279327392578, "learning_rate": 8.560152363928709e-07, "loss": 0.3636, "step": 10730 }, { "epoch": 1.2931968693558098, "grad_norm": 4.73154878616333, "learning_rate": 8.556461415365052e-07, "loss": 0.3772, "step": 10740 }, { "epoch": 1.2944009632751354, "grad_norm": 4.206639289855957, "learning_rate": 8.552766540048871e-07, "loss": 0.3652, "step": 10750 }, { "epoch": 1.2956050571944613, "grad_norm": 4.551361083984375, "learning_rate": 8.549067742059741e-07, "loss": 0.36, "step": 10760 }, { "epoch": 1.296809151113787, "grad_norm": 4.472609043121338, "learning_rate": 8.545365025481574e-07, "loss": 0.3949, "step": 10770 }, { "epoch": 1.2980132450331126, "grad_norm": 3.9386298656463623, "learning_rate": 8.541658394402605e-07, "loss": 0.3736, "step": 10780 }, { "epoch": 1.2992173389524382, "grad_norm": 5.128427505493164, "learning_rate": 8.537947852915388e-07, "loss": 0.3708, "step": 10790 }, { "epoch": 1.300421432871764, "grad_norm": 4.362430095672607, "learning_rate": 8.534233405116804e-07, "loss": 0.3707, "step": 10800 }, { "epoch": 1.3016255267910897, "grad_norm": 5.032322883605957, "learning_rate": 8.530515055108036e-07, "loss": 0.3694, "step": 10810 }, { "epoch": 1.3028296207104155, "grad_norm": 3.745659828186035, "learning_rate": 8.526792806994585e-07, "loss": 0.3531, "step": 10820 }, { "epoch": 1.3040337146297412, "grad_norm": 3.8410699367523193, "learning_rate": 8.523066664886248e-07, "loss": 0.3591, "step": 10830 }, { "epoch": 1.3052378085490668, "grad_norm": 6.065695285797119, "learning_rate": 8.519336632897128e-07, "loss": 0.3748, "step": 10840 }, { "epoch": 1.3064419024683924, "grad_norm": 4.5033464431762695, "learning_rate": 8.515602715145615e-07, "loss": 0.3661, "step": 10850 }, { "epoch": 1.3076459963877183, "grad_norm": 4.6679558753967285, "learning_rate": 8.511864915754399e-07, "loss": 0.3835, "step": 10860 }, { "epoch": 1.308850090307044, "grad_norm": 4.266571998596191, "learning_rate": 8.50812323885045e-07, "loss": 0.3799, "step": 10870 }, { "epoch": 1.3100541842263698, "grad_norm": 4.90196418762207, "learning_rate": 8.504377688565019e-07, "loss": 0.3551, "step": 10880 }, { "epoch": 1.3112582781456954, "grad_norm": 4.301276683807373, "learning_rate": 8.500628269033635e-07, "loss": 0.3825, "step": 10890 }, { "epoch": 1.312462372065021, "grad_norm": 4.9276580810546875, "learning_rate": 8.4968749843961e-07, "loss": 0.37, "step": 10900 }, { "epoch": 1.3136664659843467, "grad_norm": 4.929906845092773, "learning_rate": 8.493117838796482e-07, "loss": 0.3751, "step": 10910 }, { "epoch": 1.3148705599036725, "grad_norm": 4.179794788360596, "learning_rate": 8.489356836383112e-07, "loss": 0.3714, "step": 10920 }, { "epoch": 1.3160746538229982, "grad_norm": 4.671365261077881, "learning_rate": 8.485591981308583e-07, "loss": 0.3665, "step": 10930 }, { "epoch": 1.317278747742324, "grad_norm": 4.073710918426514, "learning_rate": 8.481823277729734e-07, "loss": 0.3602, "step": 10940 }, { "epoch": 1.3184828416616496, "grad_norm": 4.633068084716797, "learning_rate": 8.478050729807663e-07, "loss": 0.3682, "step": 10950 }, { "epoch": 1.3196869355809753, "grad_norm": 5.233600616455078, "learning_rate": 8.474274341707701e-07, "loss": 0.3781, "step": 10960 }, { "epoch": 1.320891029500301, "grad_norm": 4.329504013061523, "learning_rate": 8.470494117599431e-07, "loss": 0.3763, "step": 10970 }, { "epoch": 1.3220951234196268, "grad_norm": 4.211668968200684, "learning_rate": 8.466710061656664e-07, "loss": 0.3325, "step": 10980 }, { "epoch": 1.3232992173389524, "grad_norm": 4.388267993927002, "learning_rate": 8.462922178057443e-07, "loss": 0.3709, "step": 10990 }, { "epoch": 1.3245033112582782, "grad_norm": 5.167718887329102, "learning_rate": 8.45913047098404e-07, "loss": 0.362, "step": 11000 }, { "epoch": 1.3257074051776039, "grad_norm": 4.614595890045166, "learning_rate": 8.455334944622945e-07, "loss": 0.3549, "step": 11010 }, { "epoch": 1.3269114990969295, "grad_norm": 4.618056774139404, "learning_rate": 8.451535603164864e-07, "loss": 0.3773, "step": 11020 }, { "epoch": 1.3281155930162551, "grad_norm": 4.563729763031006, "learning_rate": 8.447732450804723e-07, "loss": 0.3688, "step": 11030 }, { "epoch": 1.329319686935581, "grad_norm": 4.429327011108398, "learning_rate": 8.443925491741646e-07, "loss": 0.3429, "step": 11040 }, { "epoch": 1.3305237808549066, "grad_norm": 4.474249362945557, "learning_rate": 8.440114730178966e-07, "loss": 0.3879, "step": 11050 }, { "epoch": 1.3317278747742325, "grad_norm": 4.212963581085205, "learning_rate": 8.436300170324215e-07, "loss": 0.349, "step": 11060 }, { "epoch": 1.3329319686935581, "grad_norm": 4.393470287322998, "learning_rate": 8.432481816389112e-07, "loss": 0.3609, "step": 11070 }, { "epoch": 1.3341360626128838, "grad_norm": 4.512639045715332, "learning_rate": 8.428659672589574e-07, "loss": 0.3446, "step": 11080 }, { "epoch": 1.3353401565322094, "grad_norm": 5.399291515350342, "learning_rate": 8.424833743145696e-07, "loss": 0.3643, "step": 11090 }, { "epoch": 1.3365442504515352, "grad_norm": 4.692162990570068, "learning_rate": 8.421004032281756e-07, "loss": 0.3782, "step": 11100 }, { "epoch": 1.3377483443708609, "grad_norm": 4.4849677085876465, "learning_rate": 8.417170544226203e-07, "loss": 0.36, "step": 11110 }, { "epoch": 1.3389524382901867, "grad_norm": 4.692328453063965, "learning_rate": 8.413333283211664e-07, "loss": 0.3626, "step": 11120 }, { "epoch": 1.3401565322095124, "grad_norm": 4.903812408447266, "learning_rate": 8.409492253474925e-07, "loss": 0.3576, "step": 11130 }, { "epoch": 1.341360626128838, "grad_norm": 4.484142780303955, "learning_rate": 8.405647459256937e-07, "loss": 0.3611, "step": 11140 }, { "epoch": 1.3425647200481636, "grad_norm": 4.777652263641357, "learning_rate": 8.401798904802804e-07, "loss": 0.3654, "step": 11150 }, { "epoch": 1.3437688139674895, "grad_norm": 4.49363374710083, "learning_rate": 8.397946594361785e-07, "loss": 0.3684, "step": 11160 }, { "epoch": 1.3449729078868151, "grad_norm": 5.207254886627197, "learning_rate": 8.394090532187284e-07, "loss": 0.3706, "step": 11170 }, { "epoch": 1.346177001806141, "grad_norm": 5.246047496795654, "learning_rate": 8.390230722536849e-07, "loss": 0.365, "step": 11180 }, { "epoch": 1.3473810957254666, "grad_norm": 4.5202317237854, "learning_rate": 8.386367169672164e-07, "loss": 0.3549, "step": 11190 }, { "epoch": 1.3485851896447922, "grad_norm": 5.0257368087768555, "learning_rate": 8.382499877859046e-07, "loss": 0.3765, "step": 11200 }, { "epoch": 1.3497892835641179, "grad_norm": 3.513502597808838, "learning_rate": 8.378628851367441e-07, "loss": 0.3435, "step": 11210 }, { "epoch": 1.3509933774834437, "grad_norm": 4.943020820617676, "learning_rate": 8.374754094471421e-07, "loss": 0.3754, "step": 11220 }, { "epoch": 1.3521974714027694, "grad_norm": 4.6621012687683105, "learning_rate": 8.37087561144917e-07, "loss": 0.3823, "step": 11230 }, { "epoch": 1.3534015653220952, "grad_norm": 3.8831217288970947, "learning_rate": 8.366993406582996e-07, "loss": 0.3606, "step": 11240 }, { "epoch": 1.3546056592414208, "grad_norm": 4.315981388092041, "learning_rate": 8.363107484159305e-07, "loss": 0.3647, "step": 11250 }, { "epoch": 1.3558097531607465, "grad_norm": 4.6641011238098145, "learning_rate": 8.359217848468616e-07, "loss": 0.377, "step": 11260 }, { "epoch": 1.357013847080072, "grad_norm": 4.609387397766113, "learning_rate": 8.355324503805545e-07, "loss": 0.369, "step": 11270 }, { "epoch": 1.358217940999398, "grad_norm": 4.37289571762085, "learning_rate": 8.351427454468805e-07, "loss": 0.3594, "step": 11280 }, { "epoch": 1.3594220349187236, "grad_norm": 5.407008171081543, "learning_rate": 8.347526704761192e-07, "loss": 0.3732, "step": 11290 }, { "epoch": 1.3606261288380495, "grad_norm": 4.5802083015441895, "learning_rate": 8.3436222589896e-07, "loss": 0.3506, "step": 11300 }, { "epoch": 1.361830222757375, "grad_norm": 4.10429048538208, "learning_rate": 8.339714121464994e-07, "loss": 0.3917, "step": 11310 }, { "epoch": 1.3630343166767007, "grad_norm": 4.250566005706787, "learning_rate": 8.335802296502419e-07, "loss": 0.3515, "step": 11320 }, { "epoch": 1.3642384105960264, "grad_norm": 5.012816429138184, "learning_rate": 8.33188678842099e-07, "loss": 0.354, "step": 11330 }, { "epoch": 1.3654425045153522, "grad_norm": 4.53849983215332, "learning_rate": 8.327967601543891e-07, "loss": 0.3612, "step": 11340 }, { "epoch": 1.3666465984346778, "grad_norm": 4.784470081329346, "learning_rate": 8.324044740198364e-07, "loss": 0.356, "step": 11350 }, { "epoch": 1.3678506923540037, "grad_norm": 4.100750923156738, "learning_rate": 8.320118208715714e-07, "loss": 0.3769, "step": 11360 }, { "epoch": 1.3690547862733293, "grad_norm": 5.738262176513672, "learning_rate": 8.316188011431291e-07, "loss": 0.3797, "step": 11370 }, { "epoch": 1.370258880192655, "grad_norm": 4.102308750152588, "learning_rate": 8.312254152684495e-07, "loss": 0.3723, "step": 11380 }, { "epoch": 1.3714629741119808, "grad_norm": 3.786195993423462, "learning_rate": 8.308316636818773e-07, "loss": 0.3638, "step": 11390 }, { "epoch": 1.3726670680313064, "grad_norm": 4.1659159660339355, "learning_rate": 8.304375468181606e-07, "loss": 0.3487, "step": 11400 }, { "epoch": 1.373871161950632, "grad_norm": 4.081630229949951, "learning_rate": 8.300430651124505e-07, "loss": 0.3602, "step": 11410 }, { "epoch": 1.375075255869958, "grad_norm": 4.725644111633301, "learning_rate": 8.296482190003019e-07, "loss": 0.3746, "step": 11420 }, { "epoch": 1.3762793497892836, "grad_norm": 4.421098709106445, "learning_rate": 8.292530089176709e-07, "loss": 0.3632, "step": 11430 }, { "epoch": 1.3774834437086092, "grad_norm": 4.213558197021484, "learning_rate": 8.288574353009164e-07, "loss": 0.3748, "step": 11440 }, { "epoch": 1.378687537627935, "grad_norm": 5.2602458000183105, "learning_rate": 8.284614985867979e-07, "loss": 0.355, "step": 11450 }, { "epoch": 1.3798916315472607, "grad_norm": 4.735654354095459, "learning_rate": 8.280651992124766e-07, "loss": 0.3619, "step": 11460 }, { "epoch": 1.3810957254665863, "grad_norm": 5.071203708648682, "learning_rate": 8.276685376155133e-07, "loss": 0.3693, "step": 11470 }, { "epoch": 1.3822998193859122, "grad_norm": 4.431037902832031, "learning_rate": 8.272715142338694e-07, "loss": 0.3652, "step": 11480 }, { "epoch": 1.3835039133052378, "grad_norm": 4.460841178894043, "learning_rate": 8.268741295059056e-07, "loss": 0.3732, "step": 11490 }, { "epoch": 1.3847080072245634, "grad_norm": 5.048714637756348, "learning_rate": 8.264763838703812e-07, "loss": 0.364, "step": 11500 }, { "epoch": 1.3859121011438893, "grad_norm": 4.322780132293701, "learning_rate": 8.260782777664544e-07, "loss": 0.3606, "step": 11510 }, { "epoch": 1.387116195063215, "grad_norm": 4.763073921203613, "learning_rate": 8.256798116336813e-07, "loss": 0.3885, "step": 11520 }, { "epoch": 1.3883202889825406, "grad_norm": 4.54296350479126, "learning_rate": 8.252809859120153e-07, "loss": 0.3629, "step": 11530 }, { "epoch": 1.3895243829018664, "grad_norm": 4.481988430023193, "learning_rate": 8.248818010418073e-07, "loss": 0.3641, "step": 11540 }, { "epoch": 1.390728476821192, "grad_norm": 4.431914806365967, "learning_rate": 8.244822574638041e-07, "loss": 0.3591, "step": 11550 }, { "epoch": 1.3919325707405177, "grad_norm": 4.374257564544678, "learning_rate": 8.240823556191489e-07, "loss": 0.3634, "step": 11560 }, { "epoch": 1.3931366646598435, "grad_norm": 3.9488606452941895, "learning_rate": 8.23682095949381e-07, "loss": 0.3466, "step": 11570 }, { "epoch": 1.3943407585791692, "grad_norm": 4.069718837738037, "learning_rate": 8.232814788964336e-07, "loss": 0.3286, "step": 11580 }, { "epoch": 1.3955448524984948, "grad_norm": 4.749855995178223, "learning_rate": 8.228805049026355e-07, "loss": 0.3546, "step": 11590 }, { "epoch": 1.3967489464178207, "grad_norm": 3.9409117698669434, "learning_rate": 8.224791744107089e-07, "loss": 0.3663, "step": 11600 }, { "epoch": 1.3979530403371463, "grad_norm": 4.028295993804932, "learning_rate": 8.220774878637704e-07, "loss": 0.3705, "step": 11610 }, { "epoch": 1.399157134256472, "grad_norm": 4.911005973815918, "learning_rate": 8.21675445705329e-07, "loss": 0.3691, "step": 11620 }, { "epoch": 1.4003612281757978, "grad_norm": 4.403053283691406, "learning_rate": 8.212730483792868e-07, "loss": 0.3736, "step": 11630 }, { "epoch": 1.4015653220951234, "grad_norm": 4.316033840179443, "learning_rate": 8.208702963299376e-07, "loss": 0.373, "step": 11640 }, { "epoch": 1.402769416014449, "grad_norm": 5.129039764404297, "learning_rate": 8.204671900019676e-07, "loss": 0.37, "step": 11650 }, { "epoch": 1.403973509933775, "grad_norm": 4.374388694763184, "learning_rate": 8.200637298404531e-07, "loss": 0.3621, "step": 11660 }, { "epoch": 1.4051776038531005, "grad_norm": 4.385969161987305, "learning_rate": 8.19659916290862e-07, "loss": 0.3744, "step": 11670 }, { "epoch": 1.4063816977724262, "grad_norm": 4.8797149658203125, "learning_rate": 8.192557497990521e-07, "loss": 0.3554, "step": 11680 }, { "epoch": 1.407585791691752, "grad_norm": 3.9471144676208496, "learning_rate": 8.188512308112707e-07, "loss": 0.3702, "step": 11690 }, { "epoch": 1.4087898856110777, "grad_norm": 4.70519495010376, "learning_rate": 8.184463597741544e-07, "loss": 0.3422, "step": 11700 }, { "epoch": 1.4099939795304035, "grad_norm": 5.044809818267822, "learning_rate": 8.180411371347287e-07, "loss": 0.3702, "step": 11710 }, { "epoch": 1.4111980734497291, "grad_norm": 4.174710273742676, "learning_rate": 8.17635563340407e-07, "loss": 0.3513, "step": 11720 }, { "epoch": 1.4124021673690548, "grad_norm": 4.635099411010742, "learning_rate": 8.172296388389907e-07, "loss": 0.3779, "step": 11730 }, { "epoch": 1.4136062612883804, "grad_norm": 5.230491638183594, "learning_rate": 8.168233640786682e-07, "loss": 0.3601, "step": 11740 }, { "epoch": 1.4148103552077063, "grad_norm": 4.704545497894287, "learning_rate": 8.164167395080149e-07, "loss": 0.3569, "step": 11750 }, { "epoch": 1.416014449127032, "grad_norm": 4.232817649841309, "learning_rate": 8.160097655759917e-07, "loss": 0.374, "step": 11760 }, { "epoch": 1.4172185430463577, "grad_norm": 5.304251670837402, "learning_rate": 8.156024427319463e-07, "loss": 0.3668, "step": 11770 }, { "epoch": 1.4184226369656834, "grad_norm": 4.5971245765686035, "learning_rate": 8.151947714256111e-07, "loss": 0.3778, "step": 11780 }, { "epoch": 1.419626730885009, "grad_norm": 4.492901802062988, "learning_rate": 8.14786752107103e-07, "loss": 0.3418, "step": 11790 }, { "epoch": 1.4208308248043346, "grad_norm": 4.80876350402832, "learning_rate": 8.143783852269237e-07, "loss": 0.3633, "step": 11800 }, { "epoch": 1.4220349187236605, "grad_norm": 3.9827497005462646, "learning_rate": 8.13969671235958e-07, "loss": 0.3649, "step": 11810 }, { "epoch": 1.4232390126429861, "grad_norm": 4.20520544052124, "learning_rate": 8.135606105854747e-07, "loss": 0.3495, "step": 11820 }, { "epoch": 1.424443106562312, "grad_norm": 4.29602575302124, "learning_rate": 8.131512037271247e-07, "loss": 0.3678, "step": 11830 }, { "epoch": 1.4256472004816376, "grad_norm": 4.648280143737793, "learning_rate": 8.127414511129416e-07, "loss": 0.3789, "step": 11840 }, { "epoch": 1.4268512944009633, "grad_norm": 4.162654399871826, "learning_rate": 8.123313531953404e-07, "loss": 0.372, "step": 11850 }, { "epoch": 1.4280553883202889, "grad_norm": 4.688777446746826, "learning_rate": 8.119209104271176e-07, "loss": 0.3576, "step": 11860 }, { "epoch": 1.4292594822396147, "grad_norm": 4.464323997497559, "learning_rate": 8.115101232614506e-07, "loss": 0.3817, "step": 11870 }, { "epoch": 1.4304635761589404, "grad_norm": 4.280879974365234, "learning_rate": 8.110989921518965e-07, "loss": 0.3604, "step": 11880 }, { "epoch": 1.4316676700782662, "grad_norm": 3.778425693511963, "learning_rate": 8.106875175523926e-07, "loss": 0.3553, "step": 11890 }, { "epoch": 1.4328717639975919, "grad_norm": 4.960265159606934, "learning_rate": 8.102756999172554e-07, "loss": 0.3723, "step": 11900 }, { "epoch": 1.4340758579169175, "grad_norm": 4.935343265533447, "learning_rate": 8.098635397011802e-07, "loss": 0.3714, "step": 11910 }, { "epoch": 1.4352799518362431, "grad_norm": 4.417319297790527, "learning_rate": 8.094510373592402e-07, "loss": 0.3612, "step": 11920 }, { "epoch": 1.436484045755569, "grad_norm": 4.819094181060791, "learning_rate": 8.090381933468868e-07, "loss": 0.3602, "step": 11930 }, { "epoch": 1.4376881396748946, "grad_norm": 4.769229888916016, "learning_rate": 8.086250081199484e-07, "loss": 0.3597, "step": 11940 }, { "epoch": 1.4388922335942205, "grad_norm": 4.872611999511719, "learning_rate": 8.082114821346302e-07, "loss": 0.3698, "step": 11950 }, { "epoch": 1.440096327513546, "grad_norm": 4.3483686447143555, "learning_rate": 8.077976158475135e-07, "loss": 0.366, "step": 11960 }, { "epoch": 1.4413004214328717, "grad_norm": 4.28345251083374, "learning_rate": 8.073834097155555e-07, "loss": 0.3564, "step": 11970 }, { "epoch": 1.4425045153521974, "grad_norm": 4.1988606452941895, "learning_rate": 8.069688641960888e-07, "loss": 0.3557, "step": 11980 }, { "epoch": 1.4437086092715232, "grad_norm": 4.156854152679443, "learning_rate": 8.065539797468201e-07, "loss": 0.3631, "step": 11990 }, { "epoch": 1.4449127031908489, "grad_norm": 5.002780914306641, "learning_rate": 8.061387568258312e-07, "loss": 0.362, "step": 12000 }, { "epoch": 1.4461167971101747, "grad_norm": 4.551509380340576, "learning_rate": 8.057231958915767e-07, "loss": 0.3545, "step": 12010 }, { "epoch": 1.4473208910295003, "grad_norm": 3.529510498046875, "learning_rate": 8.053072974028851e-07, "loss": 0.3698, "step": 12020 }, { "epoch": 1.448524984948826, "grad_norm": 5.073483467102051, "learning_rate": 8.048910618189573e-07, "loss": 0.3762, "step": 12030 }, { "epoch": 1.4497290788681516, "grad_norm": 4.148519992828369, "learning_rate": 8.044744895993665e-07, "loss": 0.3714, "step": 12040 }, { "epoch": 1.4509331727874775, "grad_norm": 5.03234338760376, "learning_rate": 8.040575812040574e-07, "loss": 0.3651, "step": 12050 }, { "epoch": 1.452137266706803, "grad_norm": 4.286599159240723, "learning_rate": 8.03640337093346e-07, "loss": 0.3646, "step": 12060 }, { "epoch": 1.453341360626129, "grad_norm": 5.805792808532715, "learning_rate": 8.03222757727919e-07, "loss": 0.3662, "step": 12070 }, { "epoch": 1.4545454545454546, "grad_norm": 5.614697456359863, "learning_rate": 8.028048435688333e-07, "loss": 0.3661, "step": 12080 }, { "epoch": 1.4557495484647802, "grad_norm": 4.117318630218506, "learning_rate": 8.023865950775153e-07, "loss": 0.3611, "step": 12090 }, { "epoch": 1.4569536423841059, "grad_norm": 4.437227249145508, "learning_rate": 8.019680127157606e-07, "loss": 0.3551, "step": 12100 }, { "epoch": 1.4581577363034317, "grad_norm": 4.852316856384277, "learning_rate": 8.015490969457337e-07, "loss": 0.3738, "step": 12110 }, { "epoch": 1.4593618302227573, "grad_norm": 4.06812047958374, "learning_rate": 8.011298482299666e-07, "loss": 0.3535, "step": 12120 }, { "epoch": 1.4605659241420832, "grad_norm": 4.921239376068115, "learning_rate": 8.007102670313595e-07, "loss": 0.3586, "step": 12130 }, { "epoch": 1.4617700180614088, "grad_norm": 3.9317848682403564, "learning_rate": 8.002903538131794e-07, "loss": 0.3527, "step": 12140 }, { "epoch": 1.4629741119807345, "grad_norm": 5.692650318145752, "learning_rate": 7.998701090390601e-07, "loss": 0.364, "step": 12150 }, { "epoch": 1.46417820590006, "grad_norm": 4.238543510437012, "learning_rate": 7.994495331730013e-07, "loss": 0.3516, "step": 12160 }, { "epoch": 1.465382299819386, "grad_norm": 4.356393814086914, "learning_rate": 7.990286266793685e-07, "loss": 0.3464, "step": 12170 }, { "epoch": 1.4665863937387116, "grad_norm": 4.616797924041748, "learning_rate": 7.986073900228916e-07, "loss": 0.3465, "step": 12180 }, { "epoch": 1.4677904876580374, "grad_norm": 3.8541862964630127, "learning_rate": 7.981858236686661e-07, "loss": 0.3546, "step": 12190 }, { "epoch": 1.468994581577363, "grad_norm": 5.685515880584717, "learning_rate": 7.977639280821505e-07, "loss": 0.3563, "step": 12200 }, { "epoch": 1.4701986754966887, "grad_norm": 4.1002702713012695, "learning_rate": 7.973417037291672e-07, "loss": 0.3771, "step": 12210 }, { "epoch": 1.4714027694160143, "grad_norm": 4.752336025238037, "learning_rate": 7.969191510759019e-07, "loss": 0.366, "step": 12220 }, { "epoch": 1.4726068633353402, "grad_norm": 4.7561774253845215, "learning_rate": 7.964962705889027e-07, "loss": 0.3621, "step": 12230 }, { "epoch": 1.4738109572546658, "grad_norm": 4.569270133972168, "learning_rate": 7.96073062735079e-07, "loss": 0.3662, "step": 12240 }, { "epoch": 1.4750150511739917, "grad_norm": 3.9785332679748535, "learning_rate": 7.956495279817025e-07, "loss": 0.3711, "step": 12250 }, { "epoch": 1.4762191450933173, "grad_norm": 4.953578948974609, "learning_rate": 7.952256667964053e-07, "loss": 0.3671, "step": 12260 }, { "epoch": 1.477423239012643, "grad_norm": 4.805257320404053, "learning_rate": 7.948014796471802e-07, "loss": 0.3707, "step": 12270 }, { "epoch": 1.4786273329319686, "grad_norm": 4.094834804534912, "learning_rate": 7.943769670023799e-07, "loss": 0.3699, "step": 12280 }, { "epoch": 1.4798314268512944, "grad_norm": 5.696323394775391, "learning_rate": 7.939521293307161e-07, "loss": 0.3753, "step": 12290 }, { "epoch": 1.48103552077062, "grad_norm": 4.848500728607178, "learning_rate": 7.935269671012599e-07, "loss": 0.3643, "step": 12300 }, { "epoch": 1.482239614689946, "grad_norm": 4.916533946990967, "learning_rate": 7.931014807834404e-07, "loss": 0.3621, "step": 12310 }, { "epoch": 1.4834437086092715, "grad_norm": 4.234400272369385, "learning_rate": 7.926756708470447e-07, "loss": 0.3464, "step": 12320 }, { "epoch": 1.4846478025285972, "grad_norm": 4.844507217407227, "learning_rate": 7.922495377622171e-07, "loss": 0.3535, "step": 12330 }, { "epoch": 1.4858518964479228, "grad_norm": 5.471369743347168, "learning_rate": 7.918230819994588e-07, "loss": 0.3592, "step": 12340 }, { "epoch": 1.4870559903672487, "grad_norm": 5.131628036499023, "learning_rate": 7.913963040296272e-07, "loss": 0.376, "step": 12350 }, { "epoch": 1.4882600842865743, "grad_norm": 4.308112144470215, "learning_rate": 7.909692043239353e-07, "loss": 0.3526, "step": 12360 }, { "epoch": 1.4894641782059002, "grad_norm": 4.5161356925964355, "learning_rate": 7.905417833539518e-07, "loss": 0.3548, "step": 12370 }, { "epoch": 1.4906682721252258, "grad_norm": 4.657468795776367, "learning_rate": 7.901140415915995e-07, "loss": 0.3727, "step": 12380 }, { "epoch": 1.4918723660445514, "grad_norm": 4.615851879119873, "learning_rate": 7.896859795091562e-07, "loss": 0.3728, "step": 12390 }, { "epoch": 1.493076459963877, "grad_norm": 3.6912169456481934, "learning_rate": 7.892575975792523e-07, "loss": 0.3646, "step": 12400 }, { "epoch": 1.494280553883203, "grad_norm": 4.871870517730713, "learning_rate": 7.888288962748723e-07, "loss": 0.3416, "step": 12410 }, { "epoch": 1.4954846478025285, "grad_norm": 4.7089385986328125, "learning_rate": 7.883998760693529e-07, "loss": 0.3883, "step": 12420 }, { "epoch": 1.4966887417218544, "grad_norm": 4.376954078674316, "learning_rate": 7.87970537436383e-07, "loss": 0.3427, "step": 12430 }, { "epoch": 1.49789283564118, "grad_norm": 4.280700206756592, "learning_rate": 7.875408808500028e-07, "loss": 0.3651, "step": 12440 }, { "epoch": 1.4990969295605057, "grad_norm": 4.794469356536865, "learning_rate": 7.871109067846041e-07, "loss": 0.3731, "step": 12450 }, { "epoch": 1.5003010234798313, "grad_norm": 4.945312023162842, "learning_rate": 7.86680615714929e-07, "loss": 0.3586, "step": 12460 }, { "epoch": 1.5015051173991572, "grad_norm": 3.5225555896759033, "learning_rate": 7.862500081160692e-07, "loss": 0.3595, "step": 12470 }, { "epoch": 1.502709211318483, "grad_norm": 4.152462005615234, "learning_rate": 7.858190844634664e-07, "loss": 0.3777, "step": 12480 }, { "epoch": 1.5039133052378086, "grad_norm": 5.704073905944824, "learning_rate": 7.853878452329113e-07, "loss": 0.375, "step": 12490 }, { "epoch": 1.5051173991571343, "grad_norm": 5.431835174560547, "learning_rate": 7.849562909005425e-07, "loss": 0.3596, "step": 12500 }, { "epoch": 1.50632149307646, "grad_norm": 4.785493850708008, "learning_rate": 7.845244219428469e-07, "loss": 0.3888, "step": 12510 }, { "epoch": 1.5075255869957855, "grad_norm": 4.297571182250977, "learning_rate": 7.84092238836659e-07, "loss": 0.3576, "step": 12520 }, { "epoch": 1.5087296809151114, "grad_norm": 4.948373317718506, "learning_rate": 7.836597420591595e-07, "loss": 0.3766, "step": 12530 }, { "epoch": 1.5099337748344372, "grad_norm": 4.502270698547363, "learning_rate": 7.832269320878762e-07, "loss": 0.3624, "step": 12540 }, { "epoch": 1.5111378687537629, "grad_norm": 3.8894035816192627, "learning_rate": 7.827938094006821e-07, "loss": 0.3743, "step": 12550 }, { "epoch": 1.5123419626730885, "grad_norm": 4.615447044372559, "learning_rate": 7.823603744757956e-07, "loss": 0.3586, "step": 12560 }, { "epoch": 1.5135460565924141, "grad_norm": 4.9232401847839355, "learning_rate": 7.8192662779178e-07, "loss": 0.3488, "step": 12570 }, { "epoch": 1.5147501505117398, "grad_norm": 4.241856575012207, "learning_rate": 7.81492569827543e-07, "loss": 0.355, "step": 12580 }, { "epoch": 1.5159542444310656, "grad_norm": 5.041738986968994, "learning_rate": 7.810582010623354e-07, "loss": 0.3755, "step": 12590 }, { "epoch": 1.5171583383503915, "grad_norm": 4.944552421569824, "learning_rate": 7.806235219757518e-07, "loss": 0.3643, "step": 12600 }, { "epoch": 1.5183624322697171, "grad_norm": 5.554732799530029, "learning_rate": 7.801885330477289e-07, "loss": 0.3687, "step": 12610 }, { "epoch": 1.5195665261890428, "grad_norm": 6.034419059753418, "learning_rate": 7.797532347585459e-07, "loss": 0.3595, "step": 12620 }, { "epoch": 1.5207706201083684, "grad_norm": 4.2550048828125, "learning_rate": 7.793176275888231e-07, "loss": 0.3727, "step": 12630 }, { "epoch": 1.521974714027694, "grad_norm": 4.084836006164551, "learning_rate": 7.788817120195226e-07, "loss": 0.3646, "step": 12640 }, { "epoch": 1.5231788079470199, "grad_norm": 4.183859825134277, "learning_rate": 7.784454885319464e-07, "loss": 0.3846, "step": 12650 }, { "epoch": 1.5243829018663457, "grad_norm": 4.216065406799316, "learning_rate": 7.780089576077364e-07, "loss": 0.3794, "step": 12660 }, { "epoch": 1.5255869957856714, "grad_norm": 4.975666522979736, "learning_rate": 7.775721197288744e-07, "loss": 0.3903, "step": 12670 }, { "epoch": 1.526791089704997, "grad_norm": 4.360125541687012, "learning_rate": 7.77134975377681e-07, "loss": 0.3481, "step": 12680 }, { "epoch": 1.5279951836243226, "grad_norm": 5.113675594329834, "learning_rate": 7.766975250368149e-07, "loss": 0.3624, "step": 12690 }, { "epoch": 1.5291992775436483, "grad_norm": 4.466128349304199, "learning_rate": 7.76259769189273e-07, "loss": 0.3619, "step": 12700 }, { "epoch": 1.5304033714629741, "grad_norm": 4.945206165313721, "learning_rate": 7.758217083183891e-07, "loss": 0.358, "step": 12710 }, { "epoch": 1.5316074653823, "grad_norm": 4.3737287521362305, "learning_rate": 7.753833429078342e-07, "loss": 0.3566, "step": 12720 }, { "epoch": 1.5328115593016256, "grad_norm": 4.813685894012451, "learning_rate": 7.749446734416152e-07, "loss": 0.344, "step": 12730 }, { "epoch": 1.5340156532209512, "grad_norm": 3.858191728591919, "learning_rate": 7.745057004040751e-07, "loss": 0.3461, "step": 12740 }, { "epoch": 1.5352197471402769, "grad_norm": 4.396629333496094, "learning_rate": 7.740664242798919e-07, "loss": 0.3496, "step": 12750 }, { "epoch": 1.5364238410596025, "grad_norm": 4.17794132232666, "learning_rate": 7.73626845554078e-07, "loss": 0.3584, "step": 12760 }, { "epoch": 1.5376279349789284, "grad_norm": 6.110503673553467, "learning_rate": 7.731869647119801e-07, "loss": 0.3741, "step": 12770 }, { "epoch": 1.5388320288982542, "grad_norm": 4.858775615692139, "learning_rate": 7.727467822392787e-07, "loss": 0.3489, "step": 12780 }, { "epoch": 1.5400361228175798, "grad_norm": 4.899129390716553, "learning_rate": 7.723062986219871e-07, "loss": 0.3574, "step": 12790 }, { "epoch": 1.5412402167369055, "grad_norm": 4.589954853057861, "learning_rate": 7.718655143464508e-07, "loss": 0.3697, "step": 12800 }, { "epoch": 1.542444310656231, "grad_norm": 4.615177154541016, "learning_rate": 7.71424429899348e-07, "loss": 0.3574, "step": 12810 }, { "epoch": 1.5436484045755567, "grad_norm": 5.081363201141357, "learning_rate": 7.709830457676876e-07, "loss": 0.3793, "step": 12820 }, { "epoch": 1.5448524984948826, "grad_norm": 5.210774898529053, "learning_rate": 7.7054136243881e-07, "loss": 0.3562, "step": 12830 }, { "epoch": 1.5460565924142085, "grad_norm": 4.458885192871094, "learning_rate": 7.700993804003855e-07, "loss": 0.3619, "step": 12840 }, { "epoch": 1.547260686333534, "grad_norm": 4.320379734039307, "learning_rate": 7.696571001404142e-07, "loss": 0.3629, "step": 12850 }, { "epoch": 1.5484647802528597, "grad_norm": 4.779387474060059, "learning_rate": 7.692145221472258e-07, "loss": 0.3633, "step": 12860 }, { "epoch": 1.5496688741721854, "grad_norm": 4.924083709716797, "learning_rate": 7.687716469094786e-07, "loss": 0.3624, "step": 12870 }, { "epoch": 1.550872968091511, "grad_norm": 5.194228649139404, "learning_rate": 7.68328474916159e-07, "loss": 0.3592, "step": 12880 }, { "epoch": 1.5520770620108368, "grad_norm": 4.606070041656494, "learning_rate": 7.67885006656581e-07, "loss": 0.3686, "step": 12890 }, { "epoch": 1.5532811559301627, "grad_norm": 4.206083297729492, "learning_rate": 7.674412426203859e-07, "loss": 0.3551, "step": 12900 }, { "epoch": 1.5544852498494883, "grad_norm": 4.67086124420166, "learning_rate": 7.669971832975416e-07, "loss": 0.3569, "step": 12910 }, { "epoch": 1.555689343768814, "grad_norm": 5.904470443725586, "learning_rate": 7.665528291783417e-07, "loss": 0.3407, "step": 12920 }, { "epoch": 1.5568934376881396, "grad_norm": 4.242117404937744, "learning_rate": 7.661081807534058e-07, "loss": 0.3422, "step": 12930 }, { "epoch": 1.5580975316074652, "grad_norm": 4.790373802185059, "learning_rate": 7.656632385136778e-07, "loss": 0.3573, "step": 12940 }, { "epoch": 1.559301625526791, "grad_norm": 4.904318809509277, "learning_rate": 7.652180029504268e-07, "loss": 0.3606, "step": 12950 }, { "epoch": 1.560505719446117, "grad_norm": 4.863579750061035, "learning_rate": 7.64772474555245e-07, "loss": 0.361, "step": 12960 }, { "epoch": 1.5617098133654426, "grad_norm": 5.459078311920166, "learning_rate": 7.643266538200483e-07, "loss": 0.3577, "step": 12970 }, { "epoch": 1.5629139072847682, "grad_norm": 5.426388740539551, "learning_rate": 7.638805412370755e-07, "loss": 0.3725, "step": 12980 }, { "epoch": 1.5641180012040938, "grad_norm": 4.903288841247559, "learning_rate": 7.634341372988872e-07, "loss": 0.3562, "step": 12990 }, { "epoch": 1.5653220951234195, "grad_norm": 4.128101825714111, "learning_rate": 7.629874424983664e-07, "loss": 0.3405, "step": 13000 }, { "epoch": 1.5665261890427453, "grad_norm": 4.6488213539123535, "learning_rate": 7.625404573287163e-07, "loss": 0.3731, "step": 13010 }, { "epoch": 1.5677302829620712, "grad_norm": 4.610156059265137, "learning_rate": 7.620931822834614e-07, "loss": 0.3575, "step": 13020 }, { "epoch": 1.5689343768813968, "grad_norm": 5.422335147857666, "learning_rate": 7.616456178564462e-07, "loss": 0.3833, "step": 13030 }, { "epoch": 1.5701384708007224, "grad_norm": 4.844593048095703, "learning_rate": 7.611977645418343e-07, "loss": 0.3647, "step": 13040 }, { "epoch": 1.571342564720048, "grad_norm": 4.274131774902344, "learning_rate": 7.607496228341088e-07, "loss": 0.3542, "step": 13050 }, { "epoch": 1.5725466586393737, "grad_norm": 4.641569137573242, "learning_rate": 7.60301193228071e-07, "loss": 0.3704, "step": 13060 }, { "epoch": 1.5737507525586996, "grad_norm": 4.771531105041504, "learning_rate": 7.598524762188395e-07, "loss": 0.3529, "step": 13070 }, { "epoch": 1.5749548464780254, "grad_norm": 5.63432502746582, "learning_rate": 7.594034723018514e-07, "loss": 0.3554, "step": 13080 }, { "epoch": 1.576158940397351, "grad_norm": 3.5664002895355225, "learning_rate": 7.589541819728596e-07, "loss": 0.3617, "step": 13090 }, { "epoch": 1.5773630343166767, "grad_norm": 4.43233060836792, "learning_rate": 7.585046057279337e-07, "loss": 0.3795, "step": 13100 }, { "epoch": 1.5785671282360023, "grad_norm": 4.293588638305664, "learning_rate": 7.580547440634587e-07, "loss": 0.3361, "step": 13110 }, { "epoch": 1.5797712221553282, "grad_norm": 4.606287479400635, "learning_rate": 7.576045974761351e-07, "loss": 0.3573, "step": 13120 }, { "epoch": 1.5809753160746538, "grad_norm": 4.9702558517456055, "learning_rate": 7.571541664629775e-07, "loss": 0.3718, "step": 13130 }, { "epoch": 1.5821794099939797, "grad_norm": 4.685069561004639, "learning_rate": 7.567034515213151e-07, "loss": 0.3704, "step": 13140 }, { "epoch": 1.5833835039133053, "grad_norm": 4.804528713226318, "learning_rate": 7.562524531487902e-07, "loss": 0.3511, "step": 13150 }, { "epoch": 1.584587597832631, "grad_norm": 5.332268714904785, "learning_rate": 7.558011718433582e-07, "loss": 0.3573, "step": 13160 }, { "epoch": 1.5857916917519566, "grad_norm": 4.4862284660339355, "learning_rate": 7.553496081032867e-07, "loss": 0.3423, "step": 13170 }, { "epoch": 1.5869957856712824, "grad_norm": 4.632198810577393, "learning_rate": 7.548977624271555e-07, "loss": 0.3719, "step": 13180 }, { "epoch": 1.588199879590608, "grad_norm": 4.4371137619018555, "learning_rate": 7.544456353138553e-07, "loss": 0.3515, "step": 13190 }, { "epoch": 1.589403973509934, "grad_norm": 4.162461757659912, "learning_rate": 7.539932272625879e-07, "loss": 0.363, "step": 13200 }, { "epoch": 1.5906080674292595, "grad_norm": 4.980907917022705, "learning_rate": 7.535405387728648e-07, "loss": 0.362, "step": 13210 }, { "epoch": 1.5918121613485852, "grad_norm": 4.321689128875732, "learning_rate": 7.530875703445077e-07, "loss": 0.3441, "step": 13220 }, { "epoch": 1.5930162552679108, "grad_norm": 4.930966854095459, "learning_rate": 7.526343224776471e-07, "loss": 0.3505, "step": 13230 }, { "epoch": 1.5942203491872367, "grad_norm": 4.267889499664307, "learning_rate": 7.52180795672722e-07, "loss": 0.3678, "step": 13240 }, { "epoch": 1.5954244431065623, "grad_norm": 3.8834383487701416, "learning_rate": 7.517269904304794e-07, "loss": 0.3648, "step": 13250 }, { "epoch": 1.5966285370258881, "grad_norm": 4.397730827331543, "learning_rate": 7.512729072519739e-07, "loss": 0.3601, "step": 13260 }, { "epoch": 1.5978326309452138, "grad_norm": 4.559187889099121, "learning_rate": 7.508185466385666e-07, "loss": 0.3508, "step": 13270 }, { "epoch": 1.5990367248645394, "grad_norm": 4.514613628387451, "learning_rate": 7.503639090919255e-07, "loss": 0.3578, "step": 13280 }, { "epoch": 1.600240818783865, "grad_norm": 4.5233073234558105, "learning_rate": 7.499089951140237e-07, "loss": 0.3516, "step": 13290 }, { "epoch": 1.601444912703191, "grad_norm": 4.616694450378418, "learning_rate": 7.494538052071402e-07, "loss": 0.3616, "step": 13300 }, { "epoch": 1.6026490066225165, "grad_norm": 4.6488518714904785, "learning_rate": 7.489983398738579e-07, "loss": 0.3582, "step": 13310 }, { "epoch": 1.6038531005418424, "grad_norm": 4.645969390869141, "learning_rate": 7.485425996170644e-07, "loss": 0.3548, "step": 13320 }, { "epoch": 1.605057194461168, "grad_norm": 5.864965438842773, "learning_rate": 7.480865849399507e-07, "loss": 0.3587, "step": 13330 }, { "epoch": 1.6062612883804936, "grad_norm": 4.283803939819336, "learning_rate": 7.476302963460108e-07, "loss": 0.3626, "step": 13340 }, { "epoch": 1.6074653822998193, "grad_norm": 4.545533657073975, "learning_rate": 7.47173734339041e-07, "loss": 0.3526, "step": 13350 }, { "epoch": 1.6086694762191451, "grad_norm": 4.885293483734131, "learning_rate": 7.467168994231393e-07, "loss": 0.3685, "step": 13360 }, { "epoch": 1.6098735701384708, "grad_norm": 4.112198829650879, "learning_rate": 7.462597921027056e-07, "loss": 0.3727, "step": 13370 }, { "epoch": 1.6110776640577966, "grad_norm": 4.272058963775635, "learning_rate": 7.458024128824403e-07, "loss": 0.3567, "step": 13380 }, { "epoch": 1.6122817579771223, "grad_norm": 4.891336441040039, "learning_rate": 7.453447622673438e-07, "loss": 0.3566, "step": 13390 }, { "epoch": 1.6134858518964479, "grad_norm": 5.003636837005615, "learning_rate": 7.448868407627163e-07, "loss": 0.3717, "step": 13400 }, { "epoch": 1.6146899458157735, "grad_norm": 3.9844002723693848, "learning_rate": 7.444286488741571e-07, "loss": 0.3537, "step": 13410 }, { "epoch": 1.6158940397350994, "grad_norm": 4.326488018035889, "learning_rate": 7.439701871075641e-07, "loss": 0.3353, "step": 13420 }, { "epoch": 1.617098133654425, "grad_norm": 4.168161392211914, "learning_rate": 7.435114559691333e-07, "loss": 0.3506, "step": 13430 }, { "epoch": 1.6183022275737509, "grad_norm": 5.062152862548828, "learning_rate": 7.430524559653575e-07, "loss": 0.3536, "step": 13440 }, { "epoch": 1.6195063214930765, "grad_norm": 5.29563570022583, "learning_rate": 7.425931876030272e-07, "loss": 0.359, "step": 13450 }, { "epoch": 1.6207104154124021, "grad_norm": 4.655216693878174, "learning_rate": 7.421336513892284e-07, "loss": 0.3459, "step": 13460 }, { "epoch": 1.6219145093317278, "grad_norm": 4.558264255523682, "learning_rate": 7.416738478313438e-07, "loss": 0.3603, "step": 13470 }, { "epoch": 1.6231186032510536, "grad_norm": 4.36596155166626, "learning_rate": 7.412137774370501e-07, "loss": 0.3632, "step": 13480 }, { "epoch": 1.6243226971703792, "grad_norm": 4.248297214508057, "learning_rate": 7.407534407143198e-07, "loss": 0.3575, "step": 13490 }, { "epoch": 1.625526791089705, "grad_norm": 4.935293197631836, "learning_rate": 7.402928381714184e-07, "loss": 0.3583, "step": 13500 }, { "epoch": 1.6267308850090307, "grad_norm": 4.29832649230957, "learning_rate": 7.398319703169057e-07, "loss": 0.3593, "step": 13510 }, { "epoch": 1.6279349789283564, "grad_norm": 4.707507610321045, "learning_rate": 7.39370837659634e-07, "loss": 0.3486, "step": 13520 }, { "epoch": 1.629139072847682, "grad_norm": 4.7867326736450195, "learning_rate": 7.389094407087481e-07, "loss": 0.3708, "step": 13530 }, { "epoch": 1.6303431667670079, "grad_norm": 5.004173755645752, "learning_rate": 7.384477799736847e-07, "loss": 0.3693, "step": 13540 }, { "epoch": 1.6315472606863335, "grad_norm": 4.378966331481934, "learning_rate": 7.379858559641716e-07, "loss": 0.3792, "step": 13550 }, { "epoch": 1.6327513546056593, "grad_norm": 4.35708475112915, "learning_rate": 7.375236691902272e-07, "loss": 0.357, "step": 13560 }, { "epoch": 1.633955448524985, "grad_norm": 4.158879280090332, "learning_rate": 7.370612201621606e-07, "loss": 0.3705, "step": 13570 }, { "epoch": 1.6351595424443106, "grad_norm": 4.620648384094238, "learning_rate": 7.365985093905693e-07, "loss": 0.3288, "step": 13580 }, { "epoch": 1.6363636363636362, "grad_norm": 4.588129997253418, "learning_rate": 7.361355373863413e-07, "loss": 0.3545, "step": 13590 }, { "epoch": 1.637567730282962, "grad_norm": 4.273639678955078, "learning_rate": 7.356723046606517e-07, "loss": 0.3597, "step": 13600 }, { "epoch": 1.6387718242022877, "grad_norm": 4.793459415435791, "learning_rate": 7.352088117249644e-07, "loss": 0.3532, "step": 13610 }, { "epoch": 1.6399759181216136, "grad_norm": 4.27385950088501, "learning_rate": 7.347450590910299e-07, "loss": 0.3787, "step": 13620 }, { "epoch": 1.6411800120409392, "grad_norm": 4.229093551635742, "learning_rate": 7.34281047270886e-07, "loss": 0.3592, "step": 13630 }, { "epoch": 1.6423841059602649, "grad_norm": 4.402678489685059, "learning_rate": 7.338167767768564e-07, "loss": 0.3612, "step": 13640 }, { "epoch": 1.6435881998795905, "grad_norm": 4.09978723526001, "learning_rate": 7.333522481215503e-07, "loss": 0.3571, "step": 13650 }, { "epoch": 1.6447922937989163, "grad_norm": 4.659477710723877, "learning_rate": 7.32887461817862e-07, "loss": 0.3725, "step": 13660 }, { "epoch": 1.645996387718242, "grad_norm": 4.500072002410889, "learning_rate": 7.324224183789707e-07, "loss": 0.3458, "step": 13670 }, { "epoch": 1.6472004816375678, "grad_norm": 5.1016526222229, "learning_rate": 7.319571183183388e-07, "loss": 0.3734, "step": 13680 }, { "epoch": 1.6484045755568935, "grad_norm": 4.819193363189697, "learning_rate": 7.314915621497129e-07, "loss": 0.3601, "step": 13690 }, { "epoch": 1.649608669476219, "grad_norm": 4.4075026512146, "learning_rate": 7.310257503871214e-07, "loss": 0.3556, "step": 13700 }, { "epoch": 1.6508127633955447, "grad_norm": 4.471024036407471, "learning_rate": 7.305596835448753e-07, "loss": 0.3625, "step": 13710 }, { "epoch": 1.6520168573148706, "grad_norm": 4.29016637802124, "learning_rate": 7.300933621375676e-07, "loss": 0.3619, "step": 13720 }, { "epoch": 1.6532209512341962, "grad_norm": 4.514208793640137, "learning_rate": 7.296267866800722e-07, "loss": 0.3622, "step": 13730 }, { "epoch": 1.654425045153522, "grad_norm": 4.275468826293945, "learning_rate": 7.291599576875432e-07, "loss": 0.3667, "step": 13740 }, { "epoch": 1.6556291390728477, "grad_norm": 4.0805559158325195, "learning_rate": 7.286928756754148e-07, "loss": 0.371, "step": 13750 }, { "epoch": 1.6568332329921733, "grad_norm": 4.84345006942749, "learning_rate": 7.282255411594006e-07, "loss": 0.3696, "step": 13760 }, { "epoch": 1.658037326911499, "grad_norm": 4.703734874725342, "learning_rate": 7.277579546554931e-07, "loss": 0.3673, "step": 13770 }, { "epoch": 1.6592414208308248, "grad_norm": 4.18894624710083, "learning_rate": 7.272901166799627e-07, "loss": 0.3365, "step": 13780 }, { "epoch": 1.6604455147501507, "grad_norm": 4.9901204109191895, "learning_rate": 7.268220277493578e-07, "loss": 0.3588, "step": 13790 }, { "epoch": 1.6616496086694763, "grad_norm": 4.896132946014404, "learning_rate": 7.263536883805039e-07, "loss": 0.3659, "step": 13800 }, { "epoch": 1.662853702588802, "grad_norm": 4.311833381652832, "learning_rate": 7.258850990905025e-07, "loss": 0.3707, "step": 13810 }, { "epoch": 1.6640577965081276, "grad_norm": 4.157628059387207, "learning_rate": 7.254162603967317e-07, "loss": 0.3498, "step": 13820 }, { "epoch": 1.6652618904274532, "grad_norm": 5.240469932556152, "learning_rate": 7.249471728168443e-07, "loss": 0.3559, "step": 13830 }, { "epoch": 1.666465984346779, "grad_norm": 4.077708721160889, "learning_rate": 7.244778368687687e-07, "loss": 0.3745, "step": 13840 }, { "epoch": 1.667670078266105, "grad_norm": 4.9550395011901855, "learning_rate": 7.240082530707069e-07, "loss": 0.3563, "step": 13850 }, { "epoch": 1.6688741721854305, "grad_norm": 5.530270576477051, "learning_rate": 7.235384219411348e-07, "loss": 0.3764, "step": 13860 }, { "epoch": 1.6700782661047562, "grad_norm": 4.50790548324585, "learning_rate": 7.230683439988012e-07, "loss": 0.3471, "step": 13870 }, { "epoch": 1.6712823600240818, "grad_norm": 4.373943328857422, "learning_rate": 7.225980197627277e-07, "loss": 0.3601, "step": 13880 }, { "epoch": 1.6724864539434074, "grad_norm": 3.9449055194854736, "learning_rate": 7.221274497522076e-07, "loss": 0.3533, "step": 13890 }, { "epoch": 1.6736905478627333, "grad_norm": 4.625890254974365, "learning_rate": 7.216566344868058e-07, "loss": 0.3771, "step": 13900 }, { "epoch": 1.6748946417820592, "grad_norm": 4.7843475341796875, "learning_rate": 7.211855744863577e-07, "loss": 0.3477, "step": 13910 }, { "epoch": 1.6760987357013848, "grad_norm": 4.275618076324463, "learning_rate": 7.207142702709688e-07, "loss": 0.3452, "step": 13920 }, { "epoch": 1.6773028296207104, "grad_norm": 5.26132869720459, "learning_rate": 7.202427223610152e-07, "loss": 0.3568, "step": 13930 }, { "epoch": 1.678506923540036, "grad_norm": 4.528031826019287, "learning_rate": 7.197709312771406e-07, "loss": 0.347, "step": 13940 }, { "epoch": 1.6797110174593617, "grad_norm": 4.68961763381958, "learning_rate": 7.192988975402583e-07, "loss": 0.3687, "step": 13950 }, { "epoch": 1.6809151113786875, "grad_norm": 4.3820719718933105, "learning_rate": 7.188266216715493e-07, "loss": 0.3572, "step": 13960 }, { "epoch": 1.6821192052980134, "grad_norm": 3.974177598953247, "learning_rate": 7.183541041924616e-07, "loss": 0.34, "step": 13970 }, { "epoch": 1.683323299217339, "grad_norm": 4.8562331199646, "learning_rate": 7.178813456247102e-07, "loss": 0.3532, "step": 13980 }, { "epoch": 1.6845273931366647, "grad_norm": 3.9439549446105957, "learning_rate": 7.174083464902763e-07, "loss": 0.3459, "step": 13990 }, { "epoch": 1.6857314870559903, "grad_norm": 4.226308345794678, "learning_rate": 7.16935107311407e-07, "loss": 0.3352, "step": 14000 }, { "epoch": 1.686935580975316, "grad_norm": 4.850135326385498, "learning_rate": 7.164616286106135e-07, "loss": 0.3661, "step": 14010 }, { "epoch": 1.6881396748946418, "grad_norm": 4.845891952514648, "learning_rate": 7.159879109106725e-07, "loss": 0.3868, "step": 14020 }, { "epoch": 1.6893437688139676, "grad_norm": 5.063507556915283, "learning_rate": 7.155139547346242e-07, "loss": 0.3628, "step": 14030 }, { "epoch": 1.6905478627332933, "grad_norm": 4.6817216873168945, "learning_rate": 7.15039760605772e-07, "loss": 0.3744, "step": 14040 }, { "epoch": 1.691751956652619, "grad_norm": 4.315075874328613, "learning_rate": 7.145653290476819e-07, "loss": 0.3613, "step": 14050 }, { "epoch": 1.6929560505719445, "grad_norm": 4.234760284423828, "learning_rate": 7.140906605841825e-07, "loss": 0.3733, "step": 14060 }, { "epoch": 1.6941601444912702, "grad_norm": 5.843511581420898, "learning_rate": 7.136157557393637e-07, "loss": 0.3443, "step": 14070 }, { "epoch": 1.695364238410596, "grad_norm": 4.704221248626709, "learning_rate": 7.131406150375762e-07, "loss": 0.3384, "step": 14080 }, { "epoch": 1.6965683323299219, "grad_norm": 4.1078200340271, "learning_rate": 7.126652390034316e-07, "loss": 0.3554, "step": 14090 }, { "epoch": 1.6977724262492475, "grad_norm": 4.6124773025512695, "learning_rate": 7.12189628161801e-07, "loss": 0.3323, "step": 14100 }, { "epoch": 1.6989765201685731, "grad_norm": 3.9569902420043945, "learning_rate": 7.117137830378146e-07, "loss": 0.3581, "step": 14110 }, { "epoch": 1.7001806140878988, "grad_norm": 4.327024459838867, "learning_rate": 7.112377041568617e-07, "loss": 0.3605, "step": 14120 }, { "epoch": 1.7013847080072244, "grad_norm": 4.041974067687988, "learning_rate": 7.107613920445895e-07, "loss": 0.3514, "step": 14130 }, { "epoch": 1.7025888019265503, "grad_norm": 4.295658588409424, "learning_rate": 7.102848472269026e-07, "loss": 0.3489, "step": 14140 }, { "epoch": 1.7037928958458761, "grad_norm": 4.117722988128662, "learning_rate": 7.098080702299628e-07, "loss": 0.3382, "step": 14150 }, { "epoch": 1.7049969897652018, "grad_norm": 5.249290943145752, "learning_rate": 7.093310615801879e-07, "loss": 0.3696, "step": 14160 }, { "epoch": 1.7062010836845274, "grad_norm": 3.8647286891937256, "learning_rate": 7.088538218042518e-07, "loss": 0.3403, "step": 14170 }, { "epoch": 1.707405177603853, "grad_norm": 4.454891204833984, "learning_rate": 7.083763514290834e-07, "loss": 0.3743, "step": 14180 }, { "epoch": 1.7086092715231787, "grad_norm": 4.183931827545166, "learning_rate": 7.078986509818662e-07, "loss": 0.3493, "step": 14190 }, { "epoch": 1.7098133654425045, "grad_norm": 3.9510889053344727, "learning_rate": 7.074207209900379e-07, "loss": 0.3469, "step": 14200 }, { "epoch": 1.7110174593618304, "grad_norm": 4.839264869689941, "learning_rate": 7.069425619812896e-07, "loss": 0.3444, "step": 14210 }, { "epoch": 1.712221553281156, "grad_norm": 4.237350940704346, "learning_rate": 7.064641744835649e-07, "loss": 0.3474, "step": 14220 }, { "epoch": 1.7134256472004816, "grad_norm": 4.17114782333374, "learning_rate": 7.059855590250603e-07, "loss": 0.3465, "step": 14230 }, { "epoch": 1.7146297411198073, "grad_norm": 4.114003658294678, "learning_rate": 7.055067161342233e-07, "loss": 0.3674, "step": 14240 }, { "epoch": 1.715833835039133, "grad_norm": 4.886813640594482, "learning_rate": 7.050276463397533e-07, "loss": 0.3848, "step": 14250 }, { "epoch": 1.7170379289584587, "grad_norm": 4.069955348968506, "learning_rate": 7.045483501705996e-07, "loss": 0.3493, "step": 14260 }, { "epoch": 1.7182420228777846, "grad_norm": 4.502857685089111, "learning_rate": 7.040688281559617e-07, "loss": 0.3548, "step": 14270 }, { "epoch": 1.7194461167971102, "grad_norm": 4.283501148223877, "learning_rate": 7.035890808252884e-07, "loss": 0.3571, "step": 14280 }, { "epoch": 1.7206502107164359, "grad_norm": 4.563022136688232, "learning_rate": 7.031091087082772e-07, "loss": 0.3485, "step": 14290 }, { "epoch": 1.7218543046357615, "grad_norm": 4.165189266204834, "learning_rate": 7.02628912334874e-07, "loss": 0.3417, "step": 14300 }, { "epoch": 1.7230583985550871, "grad_norm": 4.657063961029053, "learning_rate": 7.021484922352721e-07, "loss": 0.3611, "step": 14310 }, { "epoch": 1.724262492474413, "grad_norm": 6.094346046447754, "learning_rate": 7.016678489399121e-07, "loss": 0.3371, "step": 14320 }, { "epoch": 1.7254665863937388, "grad_norm": 4.576262474060059, "learning_rate": 7.011869829794806e-07, "loss": 0.3624, "step": 14330 }, { "epoch": 1.7266706803130645, "grad_norm": 5.231967449188232, "learning_rate": 7.007058948849105e-07, "loss": 0.3745, "step": 14340 }, { "epoch": 1.72787477423239, "grad_norm": 4.39863395690918, "learning_rate": 7.002245851873794e-07, "loss": 0.3545, "step": 14350 }, { "epoch": 1.7290788681517157, "grad_norm": 4.428983211517334, "learning_rate": 6.997430544183103e-07, "loss": 0.3534, "step": 14360 }, { "epoch": 1.7302829620710414, "grad_norm": 5.451033115386963, "learning_rate": 6.992613031093698e-07, "loss": 0.3584, "step": 14370 }, { "epoch": 1.7314870559903672, "grad_norm": 4.715031147003174, "learning_rate": 6.987793317924682e-07, "loss": 0.3643, "step": 14380 }, { "epoch": 1.732691149909693, "grad_norm": 4.199245452880859, "learning_rate": 6.982971409997583e-07, "loss": 0.3539, "step": 14390 }, { "epoch": 1.7338952438290187, "grad_norm": 5.606119632720947, "learning_rate": 6.97814731263636e-07, "loss": 0.3613, "step": 14400 }, { "epoch": 1.7350993377483444, "grad_norm": 5.036284923553467, "learning_rate": 6.973321031167382e-07, "loss": 0.3679, "step": 14410 }, { "epoch": 1.73630343166767, "grad_norm": 4.951879978179932, "learning_rate": 6.968492570919434e-07, "loss": 0.3572, "step": 14420 }, { "epoch": 1.7375075255869958, "grad_norm": 4.428969860076904, "learning_rate": 6.963661937223703e-07, "loss": 0.3538, "step": 14430 }, { "epoch": 1.7387116195063215, "grad_norm": 3.7024569511413574, "learning_rate": 6.958829135413782e-07, "loss": 0.3644, "step": 14440 }, { "epoch": 1.7399157134256473, "grad_norm": 4.4168381690979, "learning_rate": 6.95399417082565e-07, "loss": 0.3498, "step": 14450 }, { "epoch": 1.741119807344973, "grad_norm": 4.818751335144043, "learning_rate": 6.949157048797678e-07, "loss": 0.3726, "step": 14460 }, { "epoch": 1.7423239012642986, "grad_norm": 5.769382953643799, "learning_rate": 6.944317774670622e-07, "loss": 0.3517, "step": 14470 }, { "epoch": 1.7435279951836242, "grad_norm": 4.914524078369141, "learning_rate": 6.939476353787607e-07, "loss": 0.349, "step": 14480 }, { "epoch": 1.74473208910295, "grad_norm": 4.6800456047058105, "learning_rate": 6.934632791494134e-07, "loss": 0.3725, "step": 14490 }, { "epoch": 1.7459361830222757, "grad_norm": 4.627834796905518, "learning_rate": 6.929787093138067e-07, "loss": 0.359, "step": 14500 }, { "epoch": 1.7471402769416016, "grad_norm": 5.098109245300293, "learning_rate": 6.924939264069626e-07, "loss": 0.3502, "step": 14510 }, { "epoch": 1.7483443708609272, "grad_norm": 4.18192720413208, "learning_rate": 6.920089309641388e-07, "loss": 0.3448, "step": 14520 }, { "epoch": 1.7495484647802528, "grad_norm": 4.4052815437316895, "learning_rate": 6.915237235208274e-07, "loss": 0.3459, "step": 14530 }, { "epoch": 1.7507525586995785, "grad_norm": 5.557136058807373, "learning_rate": 6.910383046127544e-07, "loss": 0.355, "step": 14540 }, { "epoch": 1.7519566526189043, "grad_norm": 5.7654128074646, "learning_rate": 6.905526747758796e-07, "loss": 0.3624, "step": 14550 }, { "epoch": 1.75316074653823, "grad_norm": 5.040695667266846, "learning_rate": 6.900668345463957e-07, "loss": 0.3513, "step": 14560 }, { "epoch": 1.7543648404575558, "grad_norm": 4.529175758361816, "learning_rate": 6.895807844607274e-07, "loss": 0.348, "step": 14570 }, { "epoch": 1.7555689343768814, "grad_norm": 4.473850727081299, "learning_rate": 6.890945250555312e-07, "loss": 0.3708, "step": 14580 }, { "epoch": 1.756773028296207, "grad_norm": 4.2242865562438965, "learning_rate": 6.88608056867695e-07, "loss": 0.3536, "step": 14590 }, { "epoch": 1.7579771222155327, "grad_norm": 4.953219413757324, "learning_rate": 6.881213804343369e-07, "loss": 0.3564, "step": 14600 }, { "epoch": 1.7591812161348586, "grad_norm": 4.626575469970703, "learning_rate": 6.876344962928051e-07, "loss": 0.3624, "step": 14610 }, { "epoch": 1.7603853100541842, "grad_norm": 5.615645408630371, "learning_rate": 6.87147404980677e-07, "loss": 0.3711, "step": 14620 }, { "epoch": 1.76158940397351, "grad_norm": 4.350038051605225, "learning_rate": 6.866601070357587e-07, "loss": 0.3517, "step": 14630 }, { "epoch": 1.7627934978928357, "grad_norm": 4.5289387702941895, "learning_rate": 6.861726029960849e-07, "loss": 0.3602, "step": 14640 }, { "epoch": 1.7639975918121613, "grad_norm": 5.127388954162598, "learning_rate": 6.856848933999173e-07, "loss": 0.345, "step": 14650 }, { "epoch": 1.765201685731487, "grad_norm": 4.675601482391357, "learning_rate": 6.851969787857447e-07, "loss": 0.3484, "step": 14660 }, { "epoch": 1.7664057796508128, "grad_norm": 3.9305527210235596, "learning_rate": 6.847088596922824e-07, "loss": 0.3478, "step": 14670 }, { "epoch": 1.7676098735701384, "grad_norm": 4.547889709472656, "learning_rate": 6.842205366584715e-07, "loss": 0.3627, "step": 14680 }, { "epoch": 1.7688139674894643, "grad_norm": 5.042651653289795, "learning_rate": 6.837320102234781e-07, "loss": 0.3595, "step": 14690 }, { "epoch": 1.77001806140879, "grad_norm": 4.645577907562256, "learning_rate": 6.832432809266928e-07, "loss": 0.3417, "step": 14700 }, { "epoch": 1.7712221553281156, "grad_norm": 5.52669095993042, "learning_rate": 6.827543493077306e-07, "loss": 0.352, "step": 14710 }, { "epoch": 1.7724262492474412, "grad_norm": 4.48500394821167, "learning_rate": 6.822652159064293e-07, "loss": 0.3427, "step": 14720 }, { "epoch": 1.773630343166767, "grad_norm": 4.676848411560059, "learning_rate": 6.817758812628503e-07, "loss": 0.3568, "step": 14730 }, { "epoch": 1.7748344370860927, "grad_norm": 4.112384796142578, "learning_rate": 6.812863459172764e-07, "loss": 0.3626, "step": 14740 }, { "epoch": 1.7760385310054185, "grad_norm": 4.3355326652526855, "learning_rate": 6.807966104102122e-07, "loss": 0.3408, "step": 14750 }, { "epoch": 1.7772426249247442, "grad_norm": 4.12075138092041, "learning_rate": 6.803066752823837e-07, "loss": 0.3516, "step": 14760 }, { "epoch": 1.7784467188440698, "grad_norm": 4.14115571975708, "learning_rate": 6.79816541074737e-07, "loss": 0.3442, "step": 14770 }, { "epoch": 1.7796508127633954, "grad_norm": 4.440965175628662, "learning_rate": 6.793262083284377e-07, "loss": 0.348, "step": 14780 }, { "epoch": 1.7808549066827213, "grad_norm": 4.727054595947266, "learning_rate": 6.788356775848712e-07, "loss": 0.3545, "step": 14790 }, { "epoch": 1.782059000602047, "grad_norm": 4.421995639801025, "learning_rate": 6.783449493856411e-07, "loss": 0.3584, "step": 14800 }, { "epoch": 1.7832630945213728, "grad_norm": 4.619497776031494, "learning_rate": 6.778540242725695e-07, "loss": 0.3621, "step": 14810 }, { "epoch": 1.7844671884406984, "grad_norm": 4.975179672241211, "learning_rate": 6.773629027876952e-07, "loss": 0.3433, "step": 14820 }, { "epoch": 1.785671282360024, "grad_norm": 4.3249030113220215, "learning_rate": 6.768715854732743e-07, "loss": 0.362, "step": 14830 }, { "epoch": 1.7868753762793497, "grad_norm": 4.467803001403809, "learning_rate": 6.763800728717792e-07, "loss": 0.3589, "step": 14840 }, { "epoch": 1.7880794701986755, "grad_norm": 5.496029376983643, "learning_rate": 6.758883655258976e-07, "loss": 0.3395, "step": 14850 }, { "epoch": 1.7892835641180012, "grad_norm": 4.524773120880127, "learning_rate": 6.753964639785321e-07, "loss": 0.3544, "step": 14860 }, { "epoch": 1.790487658037327, "grad_norm": 4.625549793243408, "learning_rate": 6.749043687728005e-07, "loss": 0.3721, "step": 14870 }, { "epoch": 1.7916917519566526, "grad_norm": 5.1430230140686035, "learning_rate": 6.744120804520335e-07, "loss": 0.3516, "step": 14880 }, { "epoch": 1.7928958458759783, "grad_norm": 5.0784173011779785, "learning_rate": 6.739195995597757e-07, "loss": 0.3579, "step": 14890 }, { "epoch": 1.794099939795304, "grad_norm": 4.529468536376953, "learning_rate": 6.734269266397836e-07, "loss": 0.3573, "step": 14900 }, { "epoch": 1.7953040337146298, "grad_norm": 4.950248718261719, "learning_rate": 6.729340622360267e-07, "loss": 0.3615, "step": 14910 }, { "epoch": 1.7965081276339554, "grad_norm": 3.968449831008911, "learning_rate": 6.724410068926852e-07, "loss": 0.3361, "step": 14920 }, { "epoch": 1.7977122215532813, "grad_norm": 4.806743144989014, "learning_rate": 6.7194776115415e-07, "loss": 0.3497, "step": 14930 }, { "epoch": 1.7989163154726069, "grad_norm": 4.263092517852783, "learning_rate": 6.714543255650229e-07, "loss": 0.3659, "step": 14940 }, { "epoch": 1.8001204093919325, "grad_norm": 4.752941131591797, "learning_rate": 6.709607006701148e-07, "loss": 0.3363, "step": 14950 }, { "epoch": 1.8013245033112582, "grad_norm": 5.102241516113281, "learning_rate": 6.704668870144458e-07, "loss": 0.3487, "step": 14960 }, { "epoch": 1.802528597230584, "grad_norm": 3.8051202297210693, "learning_rate": 6.699728851432442e-07, "loss": 0.3373, "step": 14970 }, { "epoch": 1.8037326911499096, "grad_norm": 4.386908054351807, "learning_rate": 6.694786956019467e-07, "loss": 0.3646, "step": 14980 }, { "epoch": 1.8049367850692355, "grad_norm": 4.566622257232666, "learning_rate": 6.689843189361962e-07, "loss": 0.3698, "step": 14990 }, { "epoch": 1.8061408789885611, "grad_norm": 4.474935054779053, "learning_rate": 6.684897556918434e-07, "loss": 0.3567, "step": 15000 }, { "epoch": 1.8073449729078868, "grad_norm": 4.712069034576416, "learning_rate": 6.67995006414944e-07, "loss": 0.3573, "step": 15010 }, { "epoch": 1.8085490668272124, "grad_norm": 4.497696876525879, "learning_rate": 6.675000716517595e-07, "loss": 0.3373, "step": 15020 }, { "epoch": 1.8097531607465382, "grad_norm": 4.327920436859131, "learning_rate": 6.670049519487565e-07, "loss": 0.3689, "step": 15030 }, { "epoch": 1.810957254665864, "grad_norm": 6.609139919281006, "learning_rate": 6.665096478526053e-07, "loss": 0.3465, "step": 15040 }, { "epoch": 1.8121613485851897, "grad_norm": 4.8396196365356445, "learning_rate": 6.6601415991018e-07, "loss": 0.3628, "step": 15050 }, { "epoch": 1.8133654425045154, "grad_norm": 5.569112777709961, "learning_rate": 6.655184886685577e-07, "loss": 0.3484, "step": 15060 }, { "epoch": 1.814569536423841, "grad_norm": 4.458260536193848, "learning_rate": 6.650226346750178e-07, "loss": 0.3523, "step": 15070 }, { "epoch": 1.8157736303431666, "grad_norm": 4.671230316162109, "learning_rate": 6.645265984770417e-07, "loss": 0.3501, "step": 15080 }, { "epoch": 1.8169777242624925, "grad_norm": 4.7510504722595215, "learning_rate": 6.640303806223116e-07, "loss": 0.3565, "step": 15090 }, { "epoch": 1.8181818181818183, "grad_norm": 4.930042266845703, "learning_rate": 6.635339816587108e-07, "loss": 0.3519, "step": 15100 }, { "epoch": 1.819385912101144, "grad_norm": 4.401383876800537, "learning_rate": 6.63037402134322e-07, "loss": 0.3444, "step": 15110 }, { "epoch": 1.8205900060204696, "grad_norm": 4.55552864074707, "learning_rate": 6.625406425974277e-07, "loss": 0.3593, "step": 15120 }, { "epoch": 1.8217940999397952, "grad_norm": 4.647222995758057, "learning_rate": 6.620437035965088e-07, "loss": 0.3513, "step": 15130 }, { "epoch": 1.8229981938591209, "grad_norm": 4.750911235809326, "learning_rate": 6.615465856802446e-07, "loss": 0.3754, "step": 15140 }, { "epoch": 1.8242022877784467, "grad_norm": 3.9289968013763428, "learning_rate": 6.610492893975117e-07, "loss": 0.3511, "step": 15150 }, { "epoch": 1.8254063816977726, "grad_norm": 3.834213972091675, "learning_rate": 6.605518152973842e-07, "loss": 0.3446, "step": 15160 }, { "epoch": 1.8266104756170982, "grad_norm": 5.1060075759887695, "learning_rate": 6.600541639291316e-07, "loss": 0.3548, "step": 15170 }, { "epoch": 1.8278145695364238, "grad_norm": 4.696617603302002, "learning_rate": 6.595563358422202e-07, "loss": 0.3576, "step": 15180 }, { "epoch": 1.8290186634557495, "grad_norm": 4.141697883605957, "learning_rate": 6.590583315863105e-07, "loss": 0.3513, "step": 15190 }, { "epoch": 1.8302227573750751, "grad_norm": 5.357382774353027, "learning_rate": 6.58560151711258e-07, "loss": 0.3508, "step": 15200 }, { "epoch": 1.831426851294401, "grad_norm": 4.808011054992676, "learning_rate": 6.58061796767112e-07, "loss": 0.3568, "step": 15210 }, { "epoch": 1.8326309452137268, "grad_norm": 4.633763790130615, "learning_rate": 6.575632673041151e-07, "loss": 0.355, "step": 15220 }, { "epoch": 1.8338350391330525, "grad_norm": 4.953246116638184, "learning_rate": 6.570645638727026e-07, "loss": 0.3604, "step": 15230 }, { "epoch": 1.835039133052378, "grad_norm": 4.354135513305664, "learning_rate": 6.565656870235019e-07, "loss": 0.337, "step": 15240 }, { "epoch": 1.8362432269717037, "grad_norm": 5.245918273925781, "learning_rate": 6.560666373073316e-07, "loss": 0.3711, "step": 15250 }, { "epoch": 1.8374473208910294, "grad_norm": 5.532114028930664, "learning_rate": 6.555674152752016e-07, "loss": 0.3618, "step": 15260 }, { "epoch": 1.8386514148103552, "grad_norm": 5.3348212242126465, "learning_rate": 6.55068021478312e-07, "loss": 0.3646, "step": 15270 }, { "epoch": 1.839855508729681, "grad_norm": 4.423579216003418, "learning_rate": 6.54568456468052e-07, "loss": 0.3522, "step": 15280 }, { "epoch": 1.8410596026490067, "grad_norm": 4.966454982757568, "learning_rate": 6.540687207960005e-07, "loss": 0.3592, "step": 15290 }, { "epoch": 1.8422636965683323, "grad_norm": 4.406902313232422, "learning_rate": 6.535688150139246e-07, "loss": 0.3637, "step": 15300 }, { "epoch": 1.843467790487658, "grad_norm": 4.565004348754883, "learning_rate": 6.530687396737791e-07, "loss": 0.343, "step": 15310 }, { "epoch": 1.8446718844069836, "grad_norm": 4.898248672485352, "learning_rate": 6.525684953277061e-07, "loss": 0.3589, "step": 15320 }, { "epoch": 1.8458759783263095, "grad_norm": 4.416904449462891, "learning_rate": 6.520680825280344e-07, "loss": 0.3297, "step": 15330 }, { "epoch": 1.8470800722456353, "grad_norm": 4.844006538391113, "learning_rate": 6.515675018272786e-07, "loss": 0.3692, "step": 15340 }, { "epoch": 1.848284166164961, "grad_norm": 4.351726531982422, "learning_rate": 6.510667537781389e-07, "loss": 0.3627, "step": 15350 }, { "epoch": 1.8494882600842866, "grad_norm": 4.276306629180908, "learning_rate": 6.505658389335e-07, "loss": 0.3581, "step": 15360 }, { "epoch": 1.8506923540036122, "grad_norm": 4.866278648376465, "learning_rate": 6.500647578464311e-07, "loss": 0.3756, "step": 15370 }, { "epoch": 1.8518964479229378, "grad_norm": 4.005789279937744, "learning_rate": 6.495635110701847e-07, "loss": 0.3551, "step": 15380 }, { "epoch": 1.8531005418422637, "grad_norm": 4.069939136505127, "learning_rate": 6.490620991581963e-07, "loss": 0.3426, "step": 15390 }, { "epoch": 1.8543046357615895, "grad_norm": 5.377545356750488, "learning_rate": 6.485605226640836e-07, "loss": 0.363, "step": 15400 }, { "epoch": 1.8555087296809152, "grad_norm": 4.171127796173096, "learning_rate": 6.480587821416465e-07, "loss": 0.3601, "step": 15410 }, { "epoch": 1.8567128236002408, "grad_norm": 4.944298267364502, "learning_rate": 6.475568781448654e-07, "loss": 0.3445, "step": 15420 }, { "epoch": 1.8579169175195664, "grad_norm": 4.719433784484863, "learning_rate": 6.470548112279015e-07, "loss": 0.349, "step": 15430 }, { "epoch": 1.859121011438892, "grad_norm": 4.289638042449951, "learning_rate": 6.465525819450959e-07, "loss": 0.3675, "step": 15440 }, { "epoch": 1.860325105358218, "grad_norm": 4.580896377563477, "learning_rate": 6.46050190850969e-07, "loss": 0.362, "step": 15450 }, { "epoch": 1.8615291992775438, "grad_norm": 4.68642520904541, "learning_rate": 6.455476385002195e-07, "loss": 0.3544, "step": 15460 }, { "epoch": 1.8627332931968694, "grad_norm": 4.221519470214844, "learning_rate": 6.450449254477246e-07, "loss": 0.3557, "step": 15470 }, { "epoch": 1.863937387116195, "grad_norm": 5.103092670440674, "learning_rate": 6.445420522485387e-07, "loss": 0.3575, "step": 15480 }, { "epoch": 1.8651414810355207, "grad_norm": 5.300514221191406, "learning_rate": 6.440390194578933e-07, "loss": 0.3655, "step": 15490 }, { "epoch": 1.8663455749548463, "grad_norm": 5.2280049324035645, "learning_rate": 6.435358276311955e-07, "loss": 0.3615, "step": 15500 }, { "epoch": 1.8675496688741722, "grad_norm": 4.393173694610596, "learning_rate": 6.430324773240287e-07, "loss": 0.3617, "step": 15510 }, { "epoch": 1.868753762793498, "grad_norm": 3.9914498329162598, "learning_rate": 6.425289690921508e-07, "loss": 0.3482, "step": 15520 }, { "epoch": 1.8699578567128237, "grad_norm": 4.967134475708008, "learning_rate": 6.420253034914943e-07, "loss": 0.3635, "step": 15530 }, { "epoch": 1.8711619506321493, "grad_norm": 4.27791166305542, "learning_rate": 6.415214810781653e-07, "loss": 0.3508, "step": 15540 }, { "epoch": 1.872366044551475, "grad_norm": 4.6500163078308105, "learning_rate": 6.410175024084431e-07, "loss": 0.3589, "step": 15550 }, { "epoch": 1.8735701384708006, "grad_norm": 4.22102689743042, "learning_rate": 6.405133680387797e-07, "loss": 0.3558, "step": 15560 }, { "epoch": 1.8747742323901264, "grad_norm": 4.9325947761535645, "learning_rate": 6.400090785257987e-07, "loss": 0.3696, "step": 15570 }, { "epoch": 1.8759783263094523, "grad_norm": 3.8292155265808105, "learning_rate": 6.395046344262951e-07, "loss": 0.356, "step": 15580 }, { "epoch": 1.877182420228778, "grad_norm": 4.739902973175049, "learning_rate": 6.390000362972348e-07, "loss": 0.3407, "step": 15590 }, { "epoch": 1.8783865141481035, "grad_norm": 3.770754814147949, "learning_rate": 6.384952846957535e-07, "loss": 0.3502, "step": 15600 }, { "epoch": 1.8795906080674292, "grad_norm": 4.367559432983398, "learning_rate": 6.379903801791566e-07, "loss": 0.3566, "step": 15610 }, { "epoch": 1.8807947019867548, "grad_norm": 5.16295862197876, "learning_rate": 6.374853233049182e-07, "loss": 0.3668, "step": 15620 }, { "epoch": 1.8819987959060807, "grad_norm": 4.346946716308594, "learning_rate": 6.369801146306802e-07, "loss": 0.3483, "step": 15630 }, { "epoch": 1.8832028898254065, "grad_norm": 4.716429710388184, "learning_rate": 6.36474754714253e-07, "loss": 0.3452, "step": 15640 }, { "epoch": 1.8844069837447321, "grad_norm": 4.5193891525268555, "learning_rate": 6.359692441136131e-07, "loss": 0.361, "step": 15650 }, { "epoch": 1.8856110776640578, "grad_norm": 3.9874355792999268, "learning_rate": 6.354635833869042e-07, "loss": 0.358, "step": 15660 }, { "epoch": 1.8868151715833834, "grad_norm": 4.598703861236572, "learning_rate": 6.349577730924349e-07, "loss": 0.35, "step": 15670 }, { "epoch": 1.8880192655027093, "grad_norm": 5.374682426452637, "learning_rate": 6.344518137886798e-07, "loss": 0.3639, "step": 15680 }, { "epoch": 1.889223359422035, "grad_norm": 6.002275466918945, "learning_rate": 6.339457060342772e-07, "loss": 0.3546, "step": 15690 }, { "epoch": 1.8904274533413608, "grad_norm": 4.864243984222412, "learning_rate": 6.3343945038803e-07, "loss": 0.3543, "step": 15700 }, { "epoch": 1.8916315472606864, "grad_norm": 3.9879305362701416, "learning_rate": 6.329330474089039e-07, "loss": 0.3549, "step": 15710 }, { "epoch": 1.892835641180012, "grad_norm": 4.457694053649902, "learning_rate": 6.324264976560277e-07, "loss": 0.3584, "step": 15720 }, { "epoch": 1.8940397350993377, "grad_norm": 3.741135835647583, "learning_rate": 6.319198016886918e-07, "loss": 0.3618, "step": 15730 }, { "epoch": 1.8952438290186635, "grad_norm": 4.002588272094727, "learning_rate": 6.314129600663484e-07, "loss": 0.3492, "step": 15740 }, { "epoch": 1.8964479229379891, "grad_norm": 4.551817893981934, "learning_rate": 6.309059733486102e-07, "loss": 0.3567, "step": 15750 }, { "epoch": 1.897652016857315, "grad_norm": 4.268725872039795, "learning_rate": 6.303988420952505e-07, "loss": 0.3591, "step": 15760 }, { "epoch": 1.8988561107766406, "grad_norm": 4.963777542114258, "learning_rate": 6.298915668662017e-07, "loss": 0.3551, "step": 15770 }, { "epoch": 1.9000602046959663, "grad_norm": 4.293519973754883, "learning_rate": 6.293841482215558e-07, "loss": 0.3586, "step": 15780 }, { "epoch": 1.901264298615292, "grad_norm": 4.556762218475342, "learning_rate": 6.288765867215625e-07, "loss": 0.3538, "step": 15790 }, { "epoch": 1.9024683925346177, "grad_norm": 3.792178153991699, "learning_rate": 6.283688829266297e-07, "loss": 0.3331, "step": 15800 }, { "epoch": 1.9036724864539434, "grad_norm": 5.197310447692871, "learning_rate": 6.278610373973219e-07, "loss": 0.3515, "step": 15810 }, { "epoch": 1.9048765803732692, "grad_norm": 5.082350730895996, "learning_rate": 6.273530506943609e-07, "loss": 0.3389, "step": 15820 }, { "epoch": 1.9060806742925949, "grad_norm": 4.892045021057129, "learning_rate": 6.268449233786236e-07, "loss": 0.3531, "step": 15830 }, { "epoch": 1.9072847682119205, "grad_norm": 4.555123805999756, "learning_rate": 6.263366560111423e-07, "loss": 0.3414, "step": 15840 }, { "epoch": 1.9084888621312461, "grad_norm": 4.728994846343994, "learning_rate": 6.258282491531043e-07, "loss": 0.3556, "step": 15850 }, { "epoch": 1.909692956050572, "grad_norm": 4.745967388153076, "learning_rate": 6.253197033658507e-07, "loss": 0.343, "step": 15860 }, { "epoch": 1.9108970499698976, "grad_norm": 4.600861072540283, "learning_rate": 6.248110192108757e-07, "loss": 0.3475, "step": 15870 }, { "epoch": 1.9121011438892235, "grad_norm": 4.099234580993652, "learning_rate": 6.243021972498269e-07, "loss": 0.3624, "step": 15880 }, { "epoch": 1.913305237808549, "grad_norm": 4.272284030914307, "learning_rate": 6.237932380445034e-07, "loss": 0.3565, "step": 15890 }, { "epoch": 1.9145093317278747, "grad_norm": 3.7602131366729736, "learning_rate": 6.232841421568565e-07, "loss": 0.3499, "step": 15900 }, { "epoch": 1.9157134256472004, "grad_norm": 4.971080303192139, "learning_rate": 6.227749101489877e-07, "loss": 0.3701, "step": 15910 }, { "epoch": 1.9169175195665262, "grad_norm": 5.319652080535889, "learning_rate": 6.222655425831495e-07, "loss": 0.3451, "step": 15920 }, { "epoch": 1.9181216134858519, "grad_norm": 4.283812522888184, "learning_rate": 6.217560400217433e-07, "loss": 0.3559, "step": 15930 }, { "epoch": 1.9193257074051777, "grad_norm": 5.055164813995361, "learning_rate": 6.212464030273204e-07, "loss": 0.3562, "step": 15940 }, { "epoch": 1.9205298013245033, "grad_norm": 4.813416004180908, "learning_rate": 6.207366321625798e-07, "loss": 0.3606, "step": 15950 }, { "epoch": 1.921733895243829, "grad_norm": 4.402296543121338, "learning_rate": 6.202267279903686e-07, "loss": 0.353, "step": 15960 }, { "epoch": 1.9229379891631546, "grad_norm": 4.458485126495361, "learning_rate": 6.197166910736814e-07, "loss": 0.3523, "step": 15970 }, { "epoch": 1.9241420830824805, "grad_norm": 3.5323286056518555, "learning_rate": 6.192065219756587e-07, "loss": 0.357, "step": 15980 }, { "epoch": 1.925346177001806, "grad_norm": 4.047741413116455, "learning_rate": 6.186962212595876e-07, "loss": 0.3513, "step": 15990 }, { "epoch": 1.926550270921132, "grad_norm": 4.608432769775391, "learning_rate": 6.181857894889e-07, "loss": 0.3556, "step": 16000 }, { "epoch": 1.9277543648404576, "grad_norm": 4.246164321899414, "learning_rate": 6.17675227227173e-07, "loss": 0.3274, "step": 16010 }, { "epoch": 1.9289584587597832, "grad_norm": 4.55797004699707, "learning_rate": 6.171645350381272e-07, "loss": 0.3537, "step": 16020 }, { "epoch": 1.9301625526791089, "grad_norm": 4.349902629852295, "learning_rate": 6.166537134856272e-07, "loss": 0.3454, "step": 16030 }, { "epoch": 1.9313666465984347, "grad_norm": 4.9922614097595215, "learning_rate": 6.161427631336799e-07, "loss": 0.3377, "step": 16040 }, { "epoch": 1.9325707405177603, "grad_norm": 4.467525005340576, "learning_rate": 6.156316845464351e-07, "loss": 0.345, "step": 16050 }, { "epoch": 1.9337748344370862, "grad_norm": 4.589630603790283, "learning_rate": 6.151204782881835e-07, "loss": 0.3393, "step": 16060 }, { "epoch": 1.9349789283564118, "grad_norm": 4.475553035736084, "learning_rate": 6.146091449233571e-07, "loss": 0.3544, "step": 16070 }, { "epoch": 1.9361830222757375, "grad_norm": 4.827112197875977, "learning_rate": 6.140976850165283e-07, "loss": 0.3447, "step": 16080 }, { "epoch": 1.937387116195063, "grad_norm": 3.81062388420105, "learning_rate": 6.135860991324092e-07, "loss": 0.3493, "step": 16090 }, { "epoch": 1.938591210114389, "grad_norm": 4.450663089752197, "learning_rate": 6.130743878358505e-07, "loss": 0.3601, "step": 16100 }, { "epoch": 1.9397953040337146, "grad_norm": 3.878636598587036, "learning_rate": 6.125625516918421e-07, "loss": 0.3638, "step": 16110 }, { "epoch": 1.9409993979530404, "grad_norm": 4.681748390197754, "learning_rate": 6.120505912655114e-07, "loss": 0.3542, "step": 16120 }, { "epoch": 1.942203491872366, "grad_norm": 5.228558540344238, "learning_rate": 6.115385071221231e-07, "loss": 0.3538, "step": 16130 }, { "epoch": 1.9434075857916917, "grad_norm": 5.1694488525390625, "learning_rate": 6.110262998270781e-07, "loss": 0.3689, "step": 16140 }, { "epoch": 1.9446116797110173, "grad_norm": 4.253943920135498, "learning_rate": 6.10513969945914e-07, "loss": 0.3518, "step": 16150 }, { "epoch": 1.9458157736303432, "grad_norm": 4.636354446411133, "learning_rate": 6.100015180443031e-07, "loss": 0.3643, "step": 16160 }, { "epoch": 1.9470198675496688, "grad_norm": 3.8941125869750977, "learning_rate": 6.094889446880529e-07, "loss": 0.3444, "step": 16170 }, { "epoch": 1.9482239614689947, "grad_norm": 4.6928391456604, "learning_rate": 6.089762504431046e-07, "loss": 0.3541, "step": 16180 }, { "epoch": 1.9494280553883203, "grad_norm": 4.19013786315918, "learning_rate": 6.084634358755334e-07, "loss": 0.357, "step": 16190 }, { "epoch": 1.950632149307646, "grad_norm": 4.565307140350342, "learning_rate": 6.079505015515465e-07, "loss": 0.3419, "step": 16200 }, { "epoch": 1.9518362432269716, "grad_norm": 5.345344543457031, "learning_rate": 6.074374480374843e-07, "loss": 0.3569, "step": 16210 }, { "epoch": 1.9530403371462974, "grad_norm": 4.672290802001953, "learning_rate": 6.069242758998181e-07, "loss": 0.3564, "step": 16220 }, { "epoch": 1.954244431065623, "grad_norm": 4.522906303405762, "learning_rate": 6.064109857051505e-07, "loss": 0.35, "step": 16230 }, { "epoch": 1.955448524984949, "grad_norm": 4.692704200744629, "learning_rate": 6.058975780202143e-07, "loss": 0.334, "step": 16240 }, { "epoch": 1.9566526189042746, "grad_norm": 4.350996971130371, "learning_rate": 6.053840534118722e-07, "loss": 0.3512, "step": 16250 }, { "epoch": 1.9578567128236002, "grad_norm": 4.869346618652344, "learning_rate": 6.04870412447116e-07, "loss": 0.3415, "step": 16260 }, { "epoch": 1.9590608067429258, "grad_norm": 4.5982818603515625, "learning_rate": 6.043566556930655e-07, "loss": 0.3697, "step": 16270 }, { "epoch": 1.9602649006622517, "grad_norm": 4.133756637573242, "learning_rate": 6.038427837169688e-07, "loss": 0.3498, "step": 16280 }, { "epoch": 1.9614689945815773, "grad_norm": 4.6877546310424805, "learning_rate": 6.033287970862013e-07, "loss": 0.3622, "step": 16290 }, { "epoch": 1.9626730885009032, "grad_norm": 5.100693702697754, "learning_rate": 6.028146963682648e-07, "loss": 0.3571, "step": 16300 }, { "epoch": 1.9638771824202288, "grad_norm": 5.0933685302734375, "learning_rate": 6.023004821307867e-07, "loss": 0.3247, "step": 16310 }, { "epoch": 1.9650812763395544, "grad_norm": 3.7194926738739014, "learning_rate": 6.017861549415207e-07, "loss": 0.3519, "step": 16320 }, { "epoch": 1.96628537025888, "grad_norm": 4.424744606018066, "learning_rate": 6.012717153683442e-07, "loss": 0.3401, "step": 16330 }, { "epoch": 1.967489464178206, "grad_norm": 3.9198262691497803, "learning_rate": 6.007571639792593e-07, "loss": 0.3434, "step": 16340 }, { "epoch": 1.9686935580975318, "grad_norm": 3.9350152015686035, "learning_rate": 6.002425013423913e-07, "loss": 0.3447, "step": 16350 }, { "epoch": 1.9698976520168574, "grad_norm": 4.852246284484863, "learning_rate": 5.997277280259885e-07, "loss": 0.3457, "step": 16360 }, { "epoch": 1.971101745936183, "grad_norm": 4.658691883087158, "learning_rate": 5.992128445984212e-07, "loss": 0.3692, "step": 16370 }, { "epoch": 1.9723058398555087, "grad_norm": 4.637414932250977, "learning_rate": 5.986978516281815e-07, "loss": 0.3555, "step": 16380 }, { "epoch": 1.9735099337748343, "grad_norm": 4.982326984405518, "learning_rate": 5.981827496838822e-07, "loss": 0.3526, "step": 16390 }, { "epoch": 1.9747140276941602, "grad_norm": 4.729382514953613, "learning_rate": 5.976675393342566e-07, "loss": 0.3558, "step": 16400 }, { "epoch": 1.975918121613486, "grad_norm": 4.774322509765625, "learning_rate": 5.971522211481575e-07, "loss": 0.358, "step": 16410 }, { "epoch": 1.9771222155328116, "grad_norm": 4.948471546173096, "learning_rate": 5.966367956945572e-07, "loss": 0.359, "step": 16420 }, { "epoch": 1.9783263094521373, "grad_norm": 4.0199198722839355, "learning_rate": 5.961212635425459e-07, "loss": 0.3423, "step": 16430 }, { "epoch": 1.979530403371463, "grad_norm": 4.141156196594238, "learning_rate": 5.956056252613319e-07, "loss": 0.3475, "step": 16440 }, { "epoch": 1.9807344972907885, "grad_norm": 4.316824913024902, "learning_rate": 5.950898814202407e-07, "loss": 0.3436, "step": 16450 }, { "epoch": 1.9819385912101144, "grad_norm": 5.594763278961182, "learning_rate": 5.945740325887144e-07, "loss": 0.3435, "step": 16460 }, { "epoch": 1.9831426851294403, "grad_norm": 4.995075702667236, "learning_rate": 5.940580793363105e-07, "loss": 0.3539, "step": 16470 }, { "epoch": 1.9843467790487659, "grad_norm": 4.139880180358887, "learning_rate": 5.935420222327028e-07, "loss": 0.3544, "step": 16480 }, { "epoch": 1.9855508729680915, "grad_norm": 3.917797088623047, "learning_rate": 5.930258618476785e-07, "loss": 0.3331, "step": 16490 }, { "epoch": 1.9867549668874172, "grad_norm": 5.234194755554199, "learning_rate": 5.9250959875114e-07, "loss": 0.3477, "step": 16500 }, { "epoch": 1.9879590608067428, "grad_norm": 4.324552059173584, "learning_rate": 5.919932335131022e-07, "loss": 0.341, "step": 16510 }, { "epoch": 1.9891631547260686, "grad_norm": 5.321447849273682, "learning_rate": 5.914767667036936e-07, "loss": 0.3606, "step": 16520 }, { "epoch": 1.9903672486453945, "grad_norm": 4.159404277801514, "learning_rate": 5.90960198893154e-07, "loss": 0.3484, "step": 16530 }, { "epoch": 1.9915713425647201, "grad_norm": 4.632839202880859, "learning_rate": 5.904435306518354e-07, "loss": 0.35, "step": 16540 }, { "epoch": 1.9927754364840458, "grad_norm": 4.1767168045043945, "learning_rate": 5.899267625502004e-07, "loss": 0.356, "step": 16550 }, { "epoch": 1.9939795304033714, "grad_norm": 4.770878314971924, "learning_rate": 5.894098951588218e-07, "loss": 0.3338, "step": 16560 }, { "epoch": 1.995183624322697, "grad_norm": 4.481430530548096, "learning_rate": 5.888929290483821e-07, "loss": 0.3569, "step": 16570 }, { "epoch": 1.9963877182420229, "grad_norm": 4.496611595153809, "learning_rate": 5.883758647896729e-07, "loss": 0.3602, "step": 16580 }, { "epoch": 1.9975918121613487, "grad_norm": 3.9505410194396973, "learning_rate": 5.878587029535942e-07, "loss": 0.3403, "step": 16590 }, { "epoch": 1.9987959060806744, "grad_norm": 4.308087348937988, "learning_rate": 5.873414441111532e-07, "loss": 0.3556, "step": 16600 }, { "epoch": 2.0, "grad_norm": 4.440168857574463, "learning_rate": 5.868240888334652e-07, "loss": 0.3312, "step": 16610 }, { "epoch": 2.0012040939193256, "grad_norm": 4.038889408111572, "learning_rate": 5.863066376917508e-07, "loss": 0.3224, "step": 16620 }, { "epoch": 2.0024081878386513, "grad_norm": 4.833006381988525, "learning_rate": 5.857890912573376e-07, "loss": 0.3001, "step": 16630 }, { "epoch": 2.0036122817579773, "grad_norm": 4.160131931304932, "learning_rate": 5.852714501016572e-07, "loss": 0.2985, "step": 16640 }, { "epoch": 2.004816375677303, "grad_norm": 5.080901622772217, "learning_rate": 5.84753714796247e-07, "loss": 0.3228, "step": 16650 }, { "epoch": 2.0060204695966286, "grad_norm": 4.37393856048584, "learning_rate": 5.842358859127478e-07, "loss": 0.3036, "step": 16660 }, { "epoch": 2.0072245635159542, "grad_norm": 4.473939895629883, "learning_rate": 5.837179640229032e-07, "loss": 0.3135, "step": 16670 }, { "epoch": 2.00842865743528, "grad_norm": 5.297366619110107, "learning_rate": 5.831999496985605e-07, "loss": 0.3059, "step": 16680 }, { "epoch": 2.0096327513546055, "grad_norm": 5.174331188201904, "learning_rate": 5.826818435116683e-07, "loss": 0.3123, "step": 16690 }, { "epoch": 2.0108368452739316, "grad_norm": 4.679065704345703, "learning_rate": 5.821636460342769e-07, "loss": 0.3232, "step": 16700 }, { "epoch": 2.012040939193257, "grad_norm": 4.446617126464844, "learning_rate": 5.816453578385375e-07, "loss": 0.3063, "step": 16710 }, { "epoch": 2.013245033112583, "grad_norm": 5.05123233795166, "learning_rate": 5.811269794967014e-07, "loss": 0.3095, "step": 16720 }, { "epoch": 2.0144491270319085, "grad_norm": 4.649383544921875, "learning_rate": 5.806085115811191e-07, "loss": 0.309, "step": 16730 }, { "epoch": 2.015653220951234, "grad_norm": 4.328246116638184, "learning_rate": 5.800899546642406e-07, "loss": 0.2981, "step": 16740 }, { "epoch": 2.0168573148705597, "grad_norm": 4.504574775695801, "learning_rate": 5.795713093186136e-07, "loss": 0.3162, "step": 16750 }, { "epoch": 2.018061408789886, "grad_norm": 4.636085033416748, "learning_rate": 5.790525761168839e-07, "loss": 0.318, "step": 16760 }, { "epoch": 2.0192655027092115, "grad_norm": 5.4193291664123535, "learning_rate": 5.785337556317938e-07, "loss": 0.3216, "step": 16770 }, { "epoch": 2.020469596628537, "grad_norm": 4.318239212036133, "learning_rate": 5.780148484361826e-07, "loss": 0.3018, "step": 16780 }, { "epoch": 2.0216736905478627, "grad_norm": 4.4032087326049805, "learning_rate": 5.774958551029847e-07, "loss": 0.3078, "step": 16790 }, { "epoch": 2.0228777844671884, "grad_norm": 4.946054458618164, "learning_rate": 5.769767762052301e-07, "loss": 0.3155, "step": 16800 }, { "epoch": 2.024081878386514, "grad_norm": 4.1051344871521, "learning_rate": 5.764576123160429e-07, "loss": 0.3183, "step": 16810 }, { "epoch": 2.02528597230584, "grad_norm": 4.6641459465026855, "learning_rate": 5.759383640086415e-07, "loss": 0.3063, "step": 16820 }, { "epoch": 2.0264900662251657, "grad_norm": 4.728779315948486, "learning_rate": 5.75419031856337e-07, "loss": 0.3153, "step": 16830 }, { "epoch": 2.0276941601444913, "grad_norm": 5.103392124176025, "learning_rate": 5.748996164325331e-07, "loss": 0.304, "step": 16840 }, { "epoch": 2.028898254063817, "grad_norm": 5.283243656158447, "learning_rate": 5.743801183107261e-07, "loss": 0.3188, "step": 16850 }, { "epoch": 2.0301023479831426, "grad_norm": 4.704992294311523, "learning_rate": 5.73860538064503e-07, "loss": 0.306, "step": 16860 }, { "epoch": 2.0313064419024682, "grad_norm": 5.523532390594482, "learning_rate": 5.733408762675414e-07, "loss": 0.3164, "step": 16870 }, { "epoch": 2.0325105358217943, "grad_norm": 4.29448127746582, "learning_rate": 5.728211334936093e-07, "loss": 0.3011, "step": 16880 }, { "epoch": 2.03371462974112, "grad_norm": 4.910971164703369, "learning_rate": 5.723013103165642e-07, "loss": 0.3093, "step": 16890 }, { "epoch": 2.0349187236604456, "grad_norm": 4.527739524841309, "learning_rate": 5.717814073103519e-07, "loss": 0.2994, "step": 16900 }, { "epoch": 2.036122817579771, "grad_norm": 4.409666061401367, "learning_rate": 5.712614250490064e-07, "loss": 0.3165, "step": 16910 }, { "epoch": 2.037326911499097, "grad_norm": 4.129342079162598, "learning_rate": 5.707413641066497e-07, "loss": 0.3159, "step": 16920 }, { "epoch": 2.0385310054184225, "grad_norm": 4.361571788787842, "learning_rate": 5.702212250574904e-07, "loss": 0.3008, "step": 16930 }, { "epoch": 2.0397350993377485, "grad_norm": 4.482879638671875, "learning_rate": 5.697010084758232e-07, "loss": 0.3169, "step": 16940 }, { "epoch": 2.040939193257074, "grad_norm": 4.7954535484313965, "learning_rate": 5.691807149360285e-07, "loss": 0.3057, "step": 16950 }, { "epoch": 2.0421432871764, "grad_norm": 4.840571403503418, "learning_rate": 5.686603450125717e-07, "loss": 0.2973, "step": 16960 }, { "epoch": 2.0433473810957254, "grad_norm": 4.597223281860352, "learning_rate": 5.681398992800024e-07, "loss": 0.3144, "step": 16970 }, { "epoch": 2.044551475015051, "grad_norm": 4.794790744781494, "learning_rate": 5.676193783129542e-07, "loss": 0.3087, "step": 16980 }, { "epoch": 2.0457555689343767, "grad_norm": 4.340571403503418, "learning_rate": 5.670987826861435e-07, "loss": 0.3083, "step": 16990 }, { "epoch": 2.046959662853703, "grad_norm": 4.629497051239014, "learning_rate": 5.665781129743693e-07, "loss": 0.3088, "step": 17000 }, { "epoch": 2.0481637567730284, "grad_norm": 4.827451229095459, "learning_rate": 5.660573697525121e-07, "loss": 0.3039, "step": 17010 }, { "epoch": 2.049367850692354, "grad_norm": 4.8336381912231445, "learning_rate": 5.655365535955342e-07, "loss": 0.306, "step": 17020 }, { "epoch": 2.0505719446116797, "grad_norm": 5.4790940284729, "learning_rate": 5.650156650784777e-07, "loss": 0.3129, "step": 17030 }, { "epoch": 2.0517760385310053, "grad_norm": 3.705552577972412, "learning_rate": 5.64494704776465e-07, "loss": 0.3062, "step": 17040 }, { "epoch": 2.052980132450331, "grad_norm": 4.869053840637207, "learning_rate": 5.639736732646976e-07, "loss": 0.3169, "step": 17050 }, { "epoch": 2.054184226369657, "grad_norm": 4.759436130523682, "learning_rate": 5.634525711184556e-07, "loss": 0.3129, "step": 17060 }, { "epoch": 2.0553883202889827, "grad_norm": 4.388055324554443, "learning_rate": 5.629313989130975e-07, "loss": 0.3026, "step": 17070 }, { "epoch": 2.0565924142083083, "grad_norm": 5.617096900939941, "learning_rate": 5.624101572240587e-07, "loss": 0.3064, "step": 17080 }, { "epoch": 2.057796508127634, "grad_norm": 4.787253379821777, "learning_rate": 5.618888466268513e-07, "loss": 0.3174, "step": 17090 }, { "epoch": 2.0590006020469596, "grad_norm": 4.347087383270264, "learning_rate": 5.613674676970638e-07, "loss": 0.3028, "step": 17100 }, { "epoch": 2.060204695966285, "grad_norm": 4.601030349731445, "learning_rate": 5.608460210103598e-07, "loss": 0.3136, "step": 17110 }, { "epoch": 2.0614087898856113, "grad_norm": 4.6767048835754395, "learning_rate": 5.603245071424783e-07, "loss": 0.3126, "step": 17120 }, { "epoch": 2.062612883804937, "grad_norm": 5.636801719665527, "learning_rate": 5.598029266692315e-07, "loss": 0.3107, "step": 17130 }, { "epoch": 2.0638169777242625, "grad_norm": 5.514817714691162, "learning_rate": 5.592812801665061e-07, "loss": 0.3191, "step": 17140 }, { "epoch": 2.065021071643588, "grad_norm": 4.12761116027832, "learning_rate": 5.587595682102611e-07, "loss": 0.3119, "step": 17150 }, { "epoch": 2.066225165562914, "grad_norm": 4.940089702606201, "learning_rate": 5.582377913765283e-07, "loss": 0.3072, "step": 17160 }, { "epoch": 2.0674292594822394, "grad_norm": 4.235925674438477, "learning_rate": 5.577159502414103e-07, "loss": 0.3168, "step": 17170 }, { "epoch": 2.0686333534015655, "grad_norm": 5.036463260650635, "learning_rate": 5.57194045381082e-07, "loss": 0.3236, "step": 17180 }, { "epoch": 2.069837447320891, "grad_norm": 3.9009006023406982, "learning_rate": 5.56672077371787e-07, "loss": 0.3111, "step": 17190 }, { "epoch": 2.0710415412402168, "grad_norm": 4.592634677886963, "learning_rate": 5.5615004678984e-07, "loss": 0.3001, "step": 17200 }, { "epoch": 2.0722456351595424, "grad_norm": 4.5537004470825195, "learning_rate": 5.556279542116242e-07, "loss": 0.305, "step": 17210 }, { "epoch": 2.073449729078868, "grad_norm": 4.557441711425781, "learning_rate": 5.551058002135913e-07, "loss": 0.2978, "step": 17220 }, { "epoch": 2.0746538229981937, "grad_norm": 3.7024407386779785, "learning_rate": 5.545835853722608e-07, "loss": 0.3134, "step": 17230 }, { "epoch": 2.0758579169175198, "grad_norm": 5.503789901733398, "learning_rate": 5.540613102642195e-07, "loss": 0.3217, "step": 17240 }, { "epoch": 2.0770620108368454, "grad_norm": 4.864404678344727, "learning_rate": 5.535389754661208e-07, "loss": 0.2983, "step": 17250 }, { "epoch": 2.078266104756171, "grad_norm": 5.232902526855469, "learning_rate": 5.530165815546835e-07, "loss": 0.3154, "step": 17260 }, { "epoch": 2.0794701986754967, "grad_norm": 4.34998083114624, "learning_rate": 5.524941291066923e-07, "loss": 0.3078, "step": 17270 }, { "epoch": 2.0806742925948223, "grad_norm": 4.243396282196045, "learning_rate": 5.519716186989962e-07, "loss": 0.2971, "step": 17280 }, { "epoch": 2.081878386514148, "grad_norm": 4.376738548278809, "learning_rate": 5.514490509085083e-07, "loss": 0.3081, "step": 17290 }, { "epoch": 2.083082480433474, "grad_norm": 4.597198486328125, "learning_rate": 5.50926426312205e-07, "loss": 0.3279, "step": 17300 }, { "epoch": 2.0842865743527996, "grad_norm": 4.825913906097412, "learning_rate": 5.504037454871258e-07, "loss": 0.3164, "step": 17310 }, { "epoch": 2.0854906682721253, "grad_norm": 4.312431812286377, "learning_rate": 5.498810090103711e-07, "loss": 0.29, "step": 17320 }, { "epoch": 2.086694762191451, "grad_norm": 4.7181854248046875, "learning_rate": 5.493582174591045e-07, "loss": 0.2962, "step": 17330 }, { "epoch": 2.0878988561107765, "grad_norm": 5.4123759269714355, "learning_rate": 5.488353714105488e-07, "loss": 0.3044, "step": 17340 }, { "epoch": 2.089102950030102, "grad_norm": 4.742303371429443, "learning_rate": 5.48312471441988e-07, "loss": 0.287, "step": 17350 }, { "epoch": 2.0903070439494282, "grad_norm": 3.8717334270477295, "learning_rate": 5.477895181307651e-07, "loss": 0.3205, "step": 17360 }, { "epoch": 2.091511137868754, "grad_norm": 4.724112510681152, "learning_rate": 5.472665120542824e-07, "loss": 0.2851, "step": 17370 }, { "epoch": 2.0927152317880795, "grad_norm": 5.797724723815918, "learning_rate": 5.4674345379e-07, "loss": 0.3136, "step": 17380 }, { "epoch": 2.093919325707405, "grad_norm": 4.77787446975708, "learning_rate": 5.462203439154361e-07, "loss": 0.3059, "step": 17390 }, { "epoch": 2.0951234196267308, "grad_norm": 4.670202732086182, "learning_rate": 5.456971830081655e-07, "loss": 0.3219, "step": 17400 }, { "epoch": 2.0963275135460564, "grad_norm": 4.7208099365234375, "learning_rate": 5.451739716458195e-07, "loss": 0.3146, "step": 17410 }, { "epoch": 2.0975316074653825, "grad_norm": 4.647831439971924, "learning_rate": 5.446507104060851e-07, "loss": 0.3266, "step": 17420 }, { "epoch": 2.098735701384708, "grad_norm": 4.2992987632751465, "learning_rate": 5.441273998667046e-07, "loss": 0.3091, "step": 17430 }, { "epoch": 2.0999397953040337, "grad_norm": 4.718204975128174, "learning_rate": 5.436040406054742e-07, "loss": 0.3103, "step": 17440 }, { "epoch": 2.1011438892233594, "grad_norm": 4.716932773590088, "learning_rate": 5.430806332002443e-07, "loss": 0.3044, "step": 17450 }, { "epoch": 2.102347983142685, "grad_norm": 4.856298923492432, "learning_rate": 5.425571782289185e-07, "loss": 0.3039, "step": 17460 }, { "epoch": 2.1035520770620106, "grad_norm": 5.1161208152771, "learning_rate": 5.420336762694524e-07, "loss": 0.3014, "step": 17470 }, { "epoch": 2.1047561709813367, "grad_norm": 4.895595550537109, "learning_rate": 5.415101278998543e-07, "loss": 0.3113, "step": 17480 }, { "epoch": 2.1059602649006623, "grad_norm": 4.259979248046875, "learning_rate": 5.409865336981832e-07, "loss": 0.3158, "step": 17490 }, { "epoch": 2.107164358819988, "grad_norm": 5.523928642272949, "learning_rate": 5.404628942425484e-07, "loss": 0.3293, "step": 17500 }, { "epoch": 2.1083684527393136, "grad_norm": 5.490001201629639, "learning_rate": 5.399392101111102e-07, "loss": 0.3253, "step": 17510 }, { "epoch": 2.1095725466586392, "grad_norm": 4.070251941680908, "learning_rate": 5.39415481882077e-07, "loss": 0.3341, "step": 17520 }, { "epoch": 2.110776640577965, "grad_norm": 4.516000270843506, "learning_rate": 5.388917101337069e-07, "loss": 0.3115, "step": 17530 }, { "epoch": 2.111980734497291, "grad_norm": 4.881539821624756, "learning_rate": 5.383678954443056e-07, "loss": 0.2962, "step": 17540 }, { "epoch": 2.1131848284166166, "grad_norm": 4.361866474151611, "learning_rate": 5.378440383922261e-07, "loss": 0.2959, "step": 17550 }, { "epoch": 2.1143889223359422, "grad_norm": 4.218469619750977, "learning_rate": 5.373201395558683e-07, "loss": 0.3004, "step": 17560 }, { "epoch": 2.115593016255268, "grad_norm": 5.058506488800049, "learning_rate": 5.367961995136782e-07, "loss": 0.3177, "step": 17570 }, { "epoch": 2.1167971101745935, "grad_norm": 5.340724468231201, "learning_rate": 5.362722188441476e-07, "loss": 0.3116, "step": 17580 }, { "epoch": 2.118001204093919, "grad_norm": 4.867612361907959, "learning_rate": 5.357481981258128e-07, "loss": 0.3287, "step": 17590 }, { "epoch": 2.119205298013245, "grad_norm": 4.499852180480957, "learning_rate": 5.352241379372545e-07, "loss": 0.3057, "step": 17600 }, { "epoch": 2.120409391932571, "grad_norm": 5.446403980255127, "learning_rate": 5.347000388570966e-07, "loss": 0.3206, "step": 17610 }, { "epoch": 2.1216134858518965, "grad_norm": 4.157654762268066, "learning_rate": 5.341759014640067e-07, "loss": 0.2985, "step": 17620 }, { "epoch": 2.122817579771222, "grad_norm": 5.162617206573486, "learning_rate": 5.336517263366939e-07, "loss": 0.3057, "step": 17630 }, { "epoch": 2.1240216736905477, "grad_norm": 4.874579906463623, "learning_rate": 5.331275140539094e-07, "loss": 0.3096, "step": 17640 }, { "epoch": 2.125225767609874, "grad_norm": 4.7379350662231445, "learning_rate": 5.326032651944453e-07, "loss": 0.3178, "step": 17650 }, { "epoch": 2.1264298615291994, "grad_norm": 4.660308361053467, "learning_rate": 5.320789803371344e-07, "loss": 0.3121, "step": 17660 }, { "epoch": 2.127633955448525, "grad_norm": 4.264311790466309, "learning_rate": 5.315546600608486e-07, "loss": 0.3041, "step": 17670 }, { "epoch": 2.1288380493678507, "grad_norm": 5.007218360900879, "learning_rate": 5.310303049444995e-07, "loss": 0.3133, "step": 17680 }, { "epoch": 2.1300421432871763, "grad_norm": 4.878419399261475, "learning_rate": 5.305059155670369e-07, "loss": 0.307, "step": 17690 }, { "epoch": 2.131246237206502, "grad_norm": 4.373286724090576, "learning_rate": 5.299814925074485e-07, "loss": 0.2988, "step": 17700 }, { "epoch": 2.1324503311258276, "grad_norm": 4.705572128295898, "learning_rate": 5.294570363447589e-07, "loss": 0.3101, "step": 17710 }, { "epoch": 2.1336544250451537, "grad_norm": 5.6706461906433105, "learning_rate": 5.2893254765803e-07, "loss": 0.3182, "step": 17720 }, { "epoch": 2.1348585189644793, "grad_norm": 4.4038896560668945, "learning_rate": 5.284080270263586e-07, "loss": 0.3055, "step": 17730 }, { "epoch": 2.136062612883805, "grad_norm": 4.746342658996582, "learning_rate": 5.278834750288776e-07, "loss": 0.3098, "step": 17740 }, { "epoch": 2.1372667068031306, "grad_norm": 4.472485065460205, "learning_rate": 5.273588922447543e-07, "loss": 0.3192, "step": 17750 }, { "epoch": 2.138470800722456, "grad_norm": 5.553606033325195, "learning_rate": 5.268342792531897e-07, "loss": 0.3328, "step": 17760 }, { "epoch": 2.1396748946417823, "grad_norm": 5.298537731170654, "learning_rate": 5.263096366334183e-07, "loss": 0.3072, "step": 17770 }, { "epoch": 2.140878988561108, "grad_norm": 4.98936128616333, "learning_rate": 5.257849649647077e-07, "loss": 0.3131, "step": 17780 }, { "epoch": 2.1420830824804336, "grad_norm": 4.389891147613525, "learning_rate": 5.252602648263569e-07, "loss": 0.3142, "step": 17790 }, { "epoch": 2.143287176399759, "grad_norm": 4.614076614379883, "learning_rate": 5.24735536797697e-07, "loss": 0.3075, "step": 17800 }, { "epoch": 2.144491270319085, "grad_norm": 5.098964214324951, "learning_rate": 5.242107814580893e-07, "loss": 0.3125, "step": 17810 }, { "epoch": 2.1456953642384105, "grad_norm": 4.502909183502197, "learning_rate": 5.236859993869258e-07, "loss": 0.2986, "step": 17820 }, { "epoch": 2.146899458157736, "grad_norm": 5.02591609954834, "learning_rate": 5.231611911636276e-07, "loss": 0.294, "step": 17830 }, { "epoch": 2.148103552077062, "grad_norm": 4.412136077880859, "learning_rate": 5.226363573676447e-07, "loss": 0.3085, "step": 17840 }, { "epoch": 2.149307645996388, "grad_norm": 4.393168926239014, "learning_rate": 5.221114985784558e-07, "loss": 0.3145, "step": 17850 }, { "epoch": 2.1505117399157134, "grad_norm": 4.741860389709473, "learning_rate": 5.215866153755666e-07, "loss": 0.3194, "step": 17860 }, { "epoch": 2.151715833835039, "grad_norm": 4.4850006103515625, "learning_rate": 5.210617083385101e-07, "loss": 0.3015, "step": 17870 }, { "epoch": 2.1529199277543647, "grad_norm": 5.466598033905029, "learning_rate": 5.205367780468455e-07, "loss": 0.311, "step": 17880 }, { "epoch": 2.1541240216736908, "grad_norm": 5.164214611053467, "learning_rate": 5.200118250801578e-07, "loss": 0.3161, "step": 17890 }, { "epoch": 2.1553281155930164, "grad_norm": 4.714061737060547, "learning_rate": 5.194868500180567e-07, "loss": 0.3171, "step": 17900 }, { "epoch": 2.156532209512342, "grad_norm": 4.755367279052734, "learning_rate": 5.189618534401768e-07, "loss": 0.3059, "step": 17910 }, { "epoch": 2.1577363034316677, "grad_norm": 4.605241298675537, "learning_rate": 5.184368359261761e-07, "loss": 0.3207, "step": 17920 }, { "epoch": 2.1589403973509933, "grad_norm": 5.180820465087891, "learning_rate": 5.179117980557357e-07, "loss": 0.3097, "step": 17930 }, { "epoch": 2.160144491270319, "grad_norm": 5.053746700286865, "learning_rate": 5.173867404085594e-07, "loss": 0.3208, "step": 17940 }, { "epoch": 2.1613485851896446, "grad_norm": 4.809300899505615, "learning_rate": 5.168616635643728e-07, "loss": 0.3009, "step": 17950 }, { "epoch": 2.1625526791089706, "grad_norm": 4.434291839599609, "learning_rate": 5.163365681029224e-07, "loss": 0.3118, "step": 17960 }, { "epoch": 2.1637567730282963, "grad_norm": 3.94570255279541, "learning_rate": 5.158114546039756e-07, "loss": 0.3081, "step": 17970 }, { "epoch": 2.164960866947622, "grad_norm": 4.972118854522705, "learning_rate": 5.152863236473195e-07, "loss": 0.3, "step": 17980 }, { "epoch": 2.1661649608669475, "grad_norm": 5.422942161560059, "learning_rate": 5.147611758127608e-07, "loss": 0.3039, "step": 17990 }, { "epoch": 2.167369054786273, "grad_norm": 4.45037317276001, "learning_rate": 5.142360116801242e-07, "loss": 0.3158, "step": 18000 }, { "epoch": 2.1685731487055993, "grad_norm": 5.098633289337158, "learning_rate": 5.137108318292533e-07, "loss": 0.2949, "step": 18010 }, { "epoch": 2.169777242624925, "grad_norm": 5.256601810455322, "learning_rate": 5.131856368400082e-07, "loss": 0.3037, "step": 18020 }, { "epoch": 2.1709813365442505, "grad_norm": 5.189584732055664, "learning_rate": 5.126604272922659e-07, "loss": 0.3256, "step": 18030 }, { "epoch": 2.172185430463576, "grad_norm": 4.259381294250488, "learning_rate": 5.121352037659201e-07, "loss": 0.3051, "step": 18040 }, { "epoch": 2.173389524382902, "grad_norm": 4.795348644256592, "learning_rate": 5.116099668408791e-07, "loss": 0.3002, "step": 18050 }, { "epoch": 2.1745936183022274, "grad_norm": 5.63735818862915, "learning_rate": 5.110847170970665e-07, "loss": 0.313, "step": 18060 }, { "epoch": 2.175797712221553, "grad_norm": 6.581758975982666, "learning_rate": 5.1055945511442e-07, "loss": 0.3014, "step": 18070 }, { "epoch": 2.177001806140879, "grad_norm": 5.026032447814941, "learning_rate": 5.100341814728904e-07, "loss": 0.3009, "step": 18080 }, { "epoch": 2.1782059000602048, "grad_norm": 4.6837263107299805, "learning_rate": 5.095088967524423e-07, "loss": 0.3251, "step": 18090 }, { "epoch": 2.1794099939795304, "grad_norm": 4.637839317321777, "learning_rate": 5.089836015330513e-07, "loss": 0.3177, "step": 18100 }, { "epoch": 2.180614087898856, "grad_norm": 4.267435550689697, "learning_rate": 5.084582963947057e-07, "loss": 0.3003, "step": 18110 }, { "epoch": 2.1818181818181817, "grad_norm": 4.481462001800537, "learning_rate": 5.07932981917404e-07, "loss": 0.3084, "step": 18120 }, { "epoch": 2.1830222757375077, "grad_norm": 5.001600742340088, "learning_rate": 5.074076586811554e-07, "loss": 0.3117, "step": 18130 }, { "epoch": 2.1842263696568334, "grad_norm": 4.785762310028076, "learning_rate": 5.068823272659785e-07, "loss": 0.3044, "step": 18140 }, { "epoch": 2.185430463576159, "grad_norm": 4.241122245788574, "learning_rate": 5.063569882519014e-07, "loss": 0.3114, "step": 18150 }, { "epoch": 2.1866345574954846, "grad_norm": 4.614393711090088, "learning_rate": 5.0583164221896e-07, "loss": 0.3143, "step": 18160 }, { "epoch": 2.1878386514148103, "grad_norm": 5.790137767791748, "learning_rate": 5.053062897471985e-07, "loss": 0.3086, "step": 18170 }, { "epoch": 2.189042745334136, "grad_norm": 5.027008056640625, "learning_rate": 5.047809314166677e-07, "loss": 0.2996, "step": 18180 }, { "epoch": 2.190246839253462, "grad_norm": 4.725672245025635, "learning_rate": 5.042555678074251e-07, "loss": 0.3101, "step": 18190 }, { "epoch": 2.1914509331727876, "grad_norm": 4.756001949310303, "learning_rate": 5.037301994995342e-07, "loss": 0.2892, "step": 18200 }, { "epoch": 2.1926550270921132, "grad_norm": 3.9560751914978027, "learning_rate": 5.032048270730634e-07, "loss": 0.3118, "step": 18210 }, { "epoch": 2.193859121011439, "grad_norm": 4.681294918060303, "learning_rate": 5.026794511080859e-07, "loss": 0.306, "step": 18220 }, { "epoch": 2.1950632149307645, "grad_norm": 5.220909118652344, "learning_rate": 5.021540721846787e-07, "loss": 0.3089, "step": 18230 }, { "epoch": 2.19626730885009, "grad_norm": 4.095883369445801, "learning_rate": 5.016286908829218e-07, "loss": 0.3179, "step": 18240 }, { "epoch": 2.197471402769416, "grad_norm": 4.485768795013428, "learning_rate": 5.011033077828982e-07, "loss": 0.3037, "step": 18250 }, { "epoch": 2.198675496688742, "grad_norm": 4.850970268249512, "learning_rate": 5.00577923464693e-07, "loss": 0.3098, "step": 18260 }, { "epoch": 2.1998795906080675, "grad_norm": 4.3276848793029785, "learning_rate": 5.000525385083919e-07, "loss": 0.3117, "step": 18270 }, { "epoch": 2.201083684527393, "grad_norm": 4.39775276184082, "learning_rate": 4.995271534940823e-07, "loss": 0.3185, "step": 18280 }, { "epoch": 2.2022877784467187, "grad_norm": 4.972282409667969, "learning_rate": 4.99001769001851e-07, "loss": 0.3131, "step": 18290 }, { "epoch": 2.2034918723660444, "grad_norm": 4.450355052947998, "learning_rate": 4.984763856117842e-07, "loss": 0.3052, "step": 18300 }, { "epoch": 2.2046959662853705, "grad_norm": 4.771944046020508, "learning_rate": 4.979510039039674e-07, "loss": 0.3087, "step": 18310 }, { "epoch": 2.205900060204696, "grad_norm": 4.077056407928467, "learning_rate": 4.974256244584838e-07, "loss": 0.2991, "step": 18320 }, { "epoch": 2.2071041541240217, "grad_norm": 4.485861778259277, "learning_rate": 4.969002478554139e-07, "loss": 0.3117, "step": 18330 }, { "epoch": 2.2083082480433474, "grad_norm": 4.26900053024292, "learning_rate": 4.963748746748358e-07, "loss": 0.299, "step": 18340 }, { "epoch": 2.209512341962673, "grad_norm": 5.258630752563477, "learning_rate": 4.958495054968235e-07, "loss": 0.3109, "step": 18350 }, { "epoch": 2.2107164358819986, "grad_norm": 5.4050774574279785, "learning_rate": 4.953241409014459e-07, "loss": 0.3263, "step": 18360 }, { "epoch": 2.2119205298013247, "grad_norm": 4.431223392486572, "learning_rate": 4.947987814687679e-07, "loss": 0.3131, "step": 18370 }, { "epoch": 2.2131246237206503, "grad_norm": 5.015274524688721, "learning_rate": 4.942734277788481e-07, "loss": 0.3122, "step": 18380 }, { "epoch": 2.214328717639976, "grad_norm": 5.460362911224365, "learning_rate": 4.937480804117392e-07, "loss": 0.3049, "step": 18390 }, { "epoch": 2.2155328115593016, "grad_norm": 4.469453811645508, "learning_rate": 4.93222739947486e-07, "loss": 0.3109, "step": 18400 }, { "epoch": 2.2167369054786272, "grad_norm": 4.560921669006348, "learning_rate": 4.926974069661265e-07, "loss": 0.3155, "step": 18410 }, { "epoch": 2.217940999397953, "grad_norm": 4.696376800537109, "learning_rate": 4.921720820476904e-07, "loss": 0.3256, "step": 18420 }, { "epoch": 2.219145093317279, "grad_norm": 4.80272102355957, "learning_rate": 4.916467657721984e-07, "loss": 0.3172, "step": 18430 }, { "epoch": 2.2203491872366046, "grad_norm": 4.686549663543701, "learning_rate": 4.911214587196612e-07, "loss": 0.3044, "step": 18440 }, { "epoch": 2.22155328115593, "grad_norm": 4.5141921043396, "learning_rate": 4.9059616147008e-07, "loss": 0.296, "step": 18450 }, { "epoch": 2.222757375075256, "grad_norm": 4.311396598815918, "learning_rate": 4.900708746034446e-07, "loss": 0.3052, "step": 18460 }, { "epoch": 2.2239614689945815, "grad_norm": 4.644687175750732, "learning_rate": 4.895455986997341e-07, "loss": 0.3091, "step": 18470 }, { "epoch": 2.225165562913907, "grad_norm": 4.708485126495361, "learning_rate": 4.890203343389144e-07, "loss": 0.3126, "step": 18480 }, { "epoch": 2.226369656833233, "grad_norm": 4.648069381713867, "learning_rate": 4.884950821009394e-07, "loss": 0.3303, "step": 18490 }, { "epoch": 2.227573750752559, "grad_norm": 5.3636555671691895, "learning_rate": 4.8796984256575e-07, "loss": 0.308, "step": 18500 }, { "epoch": 2.2287778446718844, "grad_norm": 4.061014652252197, "learning_rate": 4.874446163132719e-07, "loss": 0.2957, "step": 18510 }, { "epoch": 2.22998193859121, "grad_norm": 6.169346332550049, "learning_rate": 4.869194039234169e-07, "loss": 0.318, "step": 18520 }, { "epoch": 2.2311860325105357, "grad_norm": 4.9474053382873535, "learning_rate": 4.863942059760817e-07, "loss": 0.3112, "step": 18530 }, { "epoch": 2.2323901264298613, "grad_norm": 4.635356903076172, "learning_rate": 4.858690230511465e-07, "loss": 0.3006, "step": 18540 }, { "epoch": 2.2335942203491874, "grad_norm": 4.872357368469238, "learning_rate": 4.85343855728475e-07, "loss": 0.315, "step": 18550 }, { "epoch": 2.234798314268513, "grad_norm": 4.909818172454834, "learning_rate": 4.848187045879141e-07, "loss": 0.2983, "step": 18560 }, { "epoch": 2.2360024081878387, "grad_norm": 5.507841110229492, "learning_rate": 4.842935702092923e-07, "loss": 0.2919, "step": 18570 }, { "epoch": 2.2372065021071643, "grad_norm": 4.438649654388428, "learning_rate": 4.837684531724202e-07, "loss": 0.3012, "step": 18580 }, { "epoch": 2.23841059602649, "grad_norm": 4.70427942276001, "learning_rate": 4.832433540570885e-07, "loss": 0.3076, "step": 18590 }, { "epoch": 2.2396146899458156, "grad_norm": 4.81848669052124, "learning_rate": 4.827182734430687e-07, "loss": 0.3021, "step": 18600 }, { "epoch": 2.2408187838651417, "grad_norm": 4.911860466003418, "learning_rate": 4.821932119101116e-07, "loss": 0.3109, "step": 18610 }, { "epoch": 2.2420228777844673, "grad_norm": 5.092623233795166, "learning_rate": 4.816681700379472e-07, "loss": 0.3243, "step": 18620 }, { "epoch": 2.243226971703793, "grad_norm": 4.224728584289551, "learning_rate": 4.811431484062832e-07, "loss": 0.3128, "step": 18630 }, { "epoch": 2.2444310656231186, "grad_norm": 4.93331241607666, "learning_rate": 4.806181475948057e-07, "loss": 0.3147, "step": 18640 }, { "epoch": 2.245635159542444, "grad_norm": 6.220354080200195, "learning_rate": 4.800931681831773e-07, "loss": 0.2964, "step": 18650 }, { "epoch": 2.24683925346177, "grad_norm": 5.004923343658447, "learning_rate": 4.795682107510375e-07, "loss": 0.3172, "step": 18660 }, { "epoch": 2.248043347381096, "grad_norm": 5.164400577545166, "learning_rate": 4.790432758780005e-07, "loss": 0.3063, "step": 18670 }, { "epoch": 2.2492474413004215, "grad_norm": 5.098756313323975, "learning_rate": 4.785183641436569e-07, "loss": 0.3045, "step": 18680 }, { "epoch": 2.250451535219747, "grad_norm": 4.363048553466797, "learning_rate": 4.779934761275706e-07, "loss": 0.3084, "step": 18690 }, { "epoch": 2.251655629139073, "grad_norm": 5.233163833618164, "learning_rate": 4.774686124092804e-07, "loss": 0.316, "step": 18700 }, { "epoch": 2.2528597230583984, "grad_norm": 4.870039463043213, "learning_rate": 4.769437735682972e-07, "loss": 0.3008, "step": 18710 }, { "epoch": 2.254063816977724, "grad_norm": 5.44446325302124, "learning_rate": 4.7641896018410506e-07, "loss": 0.3139, "step": 18720 }, { "epoch": 2.25526791089705, "grad_norm": 4.950879096984863, "learning_rate": 4.758941728361599e-07, "loss": 0.3108, "step": 18730 }, { "epoch": 2.2564720048163758, "grad_norm": 4.887548446655273, "learning_rate": 4.7536941210388895e-07, "loss": 0.3195, "step": 18740 }, { "epoch": 2.2576760987357014, "grad_norm": 6.180630207061768, "learning_rate": 4.7484467856668946e-07, "loss": 0.3112, "step": 18750 }, { "epoch": 2.258880192655027, "grad_norm": 5.481302738189697, "learning_rate": 4.743199728039294e-07, "loss": 0.3124, "step": 18760 }, { "epoch": 2.2600842865743527, "grad_norm": 4.6261677742004395, "learning_rate": 4.737952953949457e-07, "loss": 0.3058, "step": 18770 }, { "epoch": 2.2612883804936788, "grad_norm": 4.097585201263428, "learning_rate": 4.732706469190442e-07, "loss": 0.3271, "step": 18780 }, { "epoch": 2.2624924744130044, "grad_norm": 5.000282287597656, "learning_rate": 4.7274602795549836e-07, "loss": 0.317, "step": 18790 }, { "epoch": 2.26369656833233, "grad_norm": 4.3350958824157715, "learning_rate": 4.7222143908354943e-07, "loss": 0.3083, "step": 18800 }, { "epoch": 2.2649006622516556, "grad_norm": 4.336573123931885, "learning_rate": 4.7169688088240555e-07, "loss": 0.3139, "step": 18810 }, { "epoch": 2.2661047561709813, "grad_norm": 4.1952900886535645, "learning_rate": 4.7117235393124064e-07, "loss": 0.294, "step": 18820 }, { "epoch": 2.267308850090307, "grad_norm": 5.418072700500488, "learning_rate": 4.7064785880919414e-07, "loss": 0.3185, "step": 18830 }, { "epoch": 2.2685129440096325, "grad_norm": 5.001430511474609, "learning_rate": 4.701233960953708e-07, "loss": 0.3108, "step": 18840 }, { "epoch": 2.2697170379289586, "grad_norm": 5.28980827331543, "learning_rate": 4.69598966368839e-07, "loss": 0.3149, "step": 18850 }, { "epoch": 2.2709211318482843, "grad_norm": 5.221833229064941, "learning_rate": 4.6907457020863095e-07, "loss": 0.3106, "step": 18860 }, { "epoch": 2.27212522576761, "grad_norm": 4.259886264801025, "learning_rate": 4.6855020819374196e-07, "loss": 0.3159, "step": 18870 }, { "epoch": 2.2733293196869355, "grad_norm": 5.210353851318359, "learning_rate": 4.680258809031293e-07, "loss": 0.306, "step": 18880 }, { "epoch": 2.274533413606261, "grad_norm": 4.933556079864502, "learning_rate": 4.6750158891571246e-07, "loss": 0.2988, "step": 18890 }, { "epoch": 2.2757375075255872, "grad_norm": 5.060166358947754, "learning_rate": 4.669773328103712e-07, "loss": 0.3298, "step": 18900 }, { "epoch": 2.276941601444913, "grad_norm": 5.316260814666748, "learning_rate": 4.664531131659461e-07, "loss": 0.3193, "step": 18910 }, { "epoch": 2.2781456953642385, "grad_norm": 4.371904373168945, "learning_rate": 4.659289305612375e-07, "loss": 0.3181, "step": 18920 }, { "epoch": 2.279349789283564, "grad_norm": 4.114840984344482, "learning_rate": 4.65404785575005e-07, "loss": 0.3089, "step": 18930 }, { "epoch": 2.2805538832028898, "grad_norm": 4.94135046005249, "learning_rate": 4.64880678785966e-07, "loss": 0.3158, "step": 18940 }, { "epoch": 2.2817579771222154, "grad_norm": 5.033153057098389, "learning_rate": 4.6435661077279633e-07, "loss": 0.3087, "step": 18950 }, { "epoch": 2.282962071041541, "grad_norm": 4.434708595275879, "learning_rate": 4.638325821141289e-07, "loss": 0.3031, "step": 18960 }, { "epoch": 2.284166164960867, "grad_norm": 4.674195766448975, "learning_rate": 4.6330859338855325e-07, "loss": 0.3227, "step": 18970 }, { "epoch": 2.2853702588801927, "grad_norm": 4.624505043029785, "learning_rate": 4.6278464517461434e-07, "loss": 0.2994, "step": 18980 }, { "epoch": 2.2865743527995184, "grad_norm": 4.435290336608887, "learning_rate": 4.622607380508129e-07, "loss": 0.3125, "step": 18990 }, { "epoch": 2.287778446718844, "grad_norm": 4.538943767547607, "learning_rate": 4.6173687259560417e-07, "loss": 0.3166, "step": 19000 }, { "epoch": 2.2889825406381696, "grad_norm": 5.1769890785217285, "learning_rate": 4.6121304938739754e-07, "loss": 0.2978, "step": 19010 }, { "epoch": 2.2901866345574957, "grad_norm": 4.897463321685791, "learning_rate": 4.606892690045551e-07, "loss": 0.2857, "step": 19020 }, { "epoch": 2.2913907284768213, "grad_norm": 5.332199573516846, "learning_rate": 4.601655320253924e-07, "loss": 0.3082, "step": 19030 }, { "epoch": 2.292594822396147, "grad_norm": 4.842720985412598, "learning_rate": 4.5964183902817677e-07, "loss": 0.3003, "step": 19040 }, { "epoch": 2.2937989163154726, "grad_norm": 4.277060031890869, "learning_rate": 4.5911819059112724e-07, "loss": 0.3027, "step": 19050 }, { "epoch": 2.2950030102347982, "grad_norm": 4.499503135681152, "learning_rate": 4.5859458729241287e-07, "loss": 0.311, "step": 19060 }, { "epoch": 2.296207104154124, "grad_norm": 5.2861762046813965, "learning_rate": 4.580710297101537e-07, "loss": 0.3197, "step": 19070 }, { "epoch": 2.2974111980734495, "grad_norm": 4.3773112297058105, "learning_rate": 4.5754751842241905e-07, "loss": 0.3113, "step": 19080 }, { "epoch": 2.2986152919927756, "grad_norm": 4.447787284851074, "learning_rate": 4.5702405400722703e-07, "loss": 0.3037, "step": 19090 }, { "epoch": 2.299819385912101, "grad_norm": 5.014771938323975, "learning_rate": 4.5650063704254395e-07, "loss": 0.3018, "step": 19100 }, { "epoch": 2.301023479831427, "grad_norm": 4.333285331726074, "learning_rate": 4.55977268106284e-07, "loss": 0.3176, "step": 19110 }, { "epoch": 2.3022275737507525, "grad_norm": 6.291433334350586, "learning_rate": 4.5545394777630786e-07, "loss": 0.3335, "step": 19120 }, { "epoch": 2.303431667670078, "grad_norm": 4.657562255859375, "learning_rate": 4.5493067663042325e-07, "loss": 0.3059, "step": 19130 }, { "epoch": 2.304635761589404, "grad_norm": 4.472227573394775, "learning_rate": 4.544074552463829e-07, "loss": 0.3074, "step": 19140 }, { "epoch": 2.30583985550873, "grad_norm": 5.011964797973633, "learning_rate": 4.5388428420188486e-07, "loss": 0.3036, "step": 19150 }, { "epoch": 2.3070439494280555, "grad_norm": 5.620879173278809, "learning_rate": 4.533611640745718e-07, "loss": 0.31, "step": 19160 }, { "epoch": 2.308248043347381, "grad_norm": 5.25240421295166, "learning_rate": 4.5283809544202996e-07, "loss": 0.328, "step": 19170 }, { "epoch": 2.3094521372667067, "grad_norm": 4.3917317390441895, "learning_rate": 4.5231507888178856e-07, "loss": 0.3129, "step": 19180 }, { "epoch": 2.3106562311860324, "grad_norm": 4.568994998931885, "learning_rate": 4.517921149713196e-07, "loss": 0.3057, "step": 19190 }, { "epoch": 2.311860325105358, "grad_norm": 4.5026726722717285, "learning_rate": 4.512692042880372e-07, "loss": 0.2997, "step": 19200 }, { "epoch": 2.313064419024684, "grad_norm": 3.986133098602295, "learning_rate": 4.507463474092959e-07, "loss": 0.2952, "step": 19210 }, { "epoch": 2.3142685129440097, "grad_norm": 4.367317199707031, "learning_rate": 4.5022354491239145e-07, "loss": 0.3036, "step": 19220 }, { "epoch": 2.3154726068633353, "grad_norm": 5.649072170257568, "learning_rate": 4.497007973745595e-07, "loss": 0.3173, "step": 19230 }, { "epoch": 2.316676700782661, "grad_norm": 5.655643463134766, "learning_rate": 4.4917810537297514e-07, "loss": 0.327, "step": 19240 }, { "epoch": 2.3178807947019866, "grad_norm": 5.137732982635498, "learning_rate": 4.4865546948475147e-07, "loss": 0.3065, "step": 19250 }, { "epoch": 2.3190848886213127, "grad_norm": 4.715443134307861, "learning_rate": 4.481328902869404e-07, "loss": 0.3207, "step": 19260 }, { "epoch": 2.3202889825406383, "grad_norm": 3.9082722663879395, "learning_rate": 4.476103683565308e-07, "loss": 0.3074, "step": 19270 }, { "epoch": 2.321493076459964, "grad_norm": 4.448252201080322, "learning_rate": 4.4708790427044887e-07, "loss": 0.3063, "step": 19280 }, { "epoch": 2.3226971703792896, "grad_norm": 4.547604560852051, "learning_rate": 4.465654986055559e-07, "loss": 0.3098, "step": 19290 }, { "epoch": 2.323901264298615, "grad_norm": 5.669996738433838, "learning_rate": 4.460431519386497e-07, "loss": 0.3188, "step": 19300 }, { "epoch": 2.325105358217941, "grad_norm": 5.271092891693115, "learning_rate": 4.4552086484646246e-07, "loss": 0.2948, "step": 19310 }, { "epoch": 2.3263094521372665, "grad_norm": 5.6719231605529785, "learning_rate": 4.4499863790566087e-07, "loss": 0.3089, "step": 19320 }, { "epoch": 2.3275135460565926, "grad_norm": 5.9080657958984375, "learning_rate": 4.444764716928447e-07, "loss": 0.3195, "step": 19330 }, { "epoch": 2.328717639975918, "grad_norm": 5.201897144317627, "learning_rate": 4.43954366784547e-07, "loss": 0.2979, "step": 19340 }, { "epoch": 2.329921733895244, "grad_norm": 4.319961071014404, "learning_rate": 4.4343232375723343e-07, "loss": 0.3059, "step": 19350 }, { "epoch": 2.3311258278145695, "grad_norm": 4.492523670196533, "learning_rate": 4.4291034318730086e-07, "loss": 0.2941, "step": 19360 }, { "epoch": 2.332329921733895, "grad_norm": 5.589833736419678, "learning_rate": 4.4238842565107715e-07, "loss": 0.3089, "step": 19370 }, { "epoch": 2.333534015653221, "grad_norm": 4.234698295593262, "learning_rate": 4.4186657172482105e-07, "loss": 0.3012, "step": 19380 }, { "epoch": 2.334738109572547, "grad_norm": 4.777867317199707, "learning_rate": 4.413447819847206e-07, "loss": 0.3083, "step": 19390 }, { "epoch": 2.3359422034918724, "grad_norm": 5.0551533699035645, "learning_rate": 4.4082305700689334e-07, "loss": 0.3056, "step": 19400 }, { "epoch": 2.337146297411198, "grad_norm": 4.407803535461426, "learning_rate": 4.40301397367385e-07, "loss": 0.3137, "step": 19410 }, { "epoch": 2.3383503913305237, "grad_norm": 4.408458709716797, "learning_rate": 4.3977980364216925e-07, "loss": 0.3234, "step": 19420 }, { "epoch": 2.3395544852498493, "grad_norm": 5.100025653839111, "learning_rate": 4.392582764071471e-07, "loss": 0.3053, "step": 19430 }, { "epoch": 2.340758579169175, "grad_norm": 4.870809078216553, "learning_rate": 4.3873681623814634e-07, "loss": 0.2973, "step": 19440 }, { "epoch": 2.341962673088501, "grad_norm": 5.078246116638184, "learning_rate": 4.3821542371092e-07, "loss": 0.3042, "step": 19450 }, { "epoch": 2.3431667670078267, "grad_norm": 4.400288105010986, "learning_rate": 4.3769409940114706e-07, "loss": 0.3012, "step": 19460 }, { "epoch": 2.3443708609271523, "grad_norm": 5.289750576019287, "learning_rate": 4.3717284388443123e-07, "loss": 0.3149, "step": 19470 }, { "epoch": 2.345574954846478, "grad_norm": 4.133148670196533, "learning_rate": 4.3665165773629955e-07, "loss": 0.311, "step": 19480 }, { "epoch": 2.3467790487658036, "grad_norm": 4.689704418182373, "learning_rate": 4.361305415322032e-07, "loss": 0.2985, "step": 19490 }, { "epoch": 2.3479831426851296, "grad_norm": 5.3425822257995605, "learning_rate": 4.35609495847516e-07, "loss": 0.3252, "step": 19500 }, { "epoch": 2.3491872366044553, "grad_norm": 4.8020524978637695, "learning_rate": 4.350885212575338e-07, "loss": 0.3017, "step": 19510 }, { "epoch": 2.350391330523781, "grad_norm": 3.823481798171997, "learning_rate": 4.345676183374737e-07, "loss": 0.3163, "step": 19520 }, { "epoch": 2.3515954244431065, "grad_norm": 5.067866802215576, "learning_rate": 4.3404678766247393e-07, "loss": 0.2985, "step": 19530 }, { "epoch": 2.352799518362432, "grad_norm": 4.470125198364258, "learning_rate": 4.335260298075931e-07, "loss": 0.3215, "step": 19540 }, { "epoch": 2.354003612281758, "grad_norm": 4.854072093963623, "learning_rate": 4.330053453478094e-07, "loss": 0.3139, "step": 19550 }, { "epoch": 2.3552077062010834, "grad_norm": 4.061732292175293, "learning_rate": 4.3248473485801943e-07, "loss": 0.2944, "step": 19560 }, { "epoch": 2.3564118001204095, "grad_norm": 4.881399154663086, "learning_rate": 4.319641989130387e-07, "loss": 0.2958, "step": 19570 }, { "epoch": 2.357615894039735, "grad_norm": 4.650146007537842, "learning_rate": 4.3144373808760026e-07, "loss": 0.3092, "step": 19580 }, { "epoch": 2.358819987959061, "grad_norm": 5.014580249786377, "learning_rate": 4.3092335295635444e-07, "loss": 0.3143, "step": 19590 }, { "epoch": 2.3600240818783864, "grad_norm": 5.064713478088379, "learning_rate": 4.304030440938673e-07, "loss": 0.3106, "step": 19600 }, { "epoch": 2.361228175797712, "grad_norm": 4.044290065765381, "learning_rate": 4.298828120746213e-07, "loss": 0.3024, "step": 19610 }, { "epoch": 2.362432269717038, "grad_norm": 5.447383403778076, "learning_rate": 4.29362657473014e-07, "loss": 0.3147, "step": 19620 }, { "epoch": 2.3636363636363638, "grad_norm": 4.447105884552002, "learning_rate": 4.2884258086335745e-07, "loss": 0.303, "step": 19630 }, { "epoch": 2.3648404575556894, "grad_norm": 4.2513957023620605, "learning_rate": 4.2832258281987724e-07, "loss": 0.3107, "step": 19640 }, { "epoch": 2.366044551475015, "grad_norm": 5.619822025299072, "learning_rate": 4.2780266391671277e-07, "loss": 0.3212, "step": 19650 }, { "epoch": 2.3672486453943407, "grad_norm": 5.056023597717285, "learning_rate": 4.272828247279155e-07, "loss": 0.298, "step": 19660 }, { "epoch": 2.3684527393136663, "grad_norm": 4.584505558013916, "learning_rate": 4.267630658274495e-07, "loss": 0.3069, "step": 19670 }, { "epoch": 2.3696568332329924, "grad_norm": 5.227287292480469, "learning_rate": 4.2624338778918964e-07, "loss": 0.296, "step": 19680 }, { "epoch": 2.370860927152318, "grad_norm": 4.425261974334717, "learning_rate": 4.2572379118692155e-07, "loss": 0.3093, "step": 19690 }, { "epoch": 2.3720650210716436, "grad_norm": 4.10771369934082, "learning_rate": 4.2520427659434134e-07, "loss": 0.295, "step": 19700 }, { "epoch": 2.3732691149909693, "grad_norm": 4.561648845672607, "learning_rate": 4.2468484458505456e-07, "loss": 0.3006, "step": 19710 }, { "epoch": 2.374473208910295, "grad_norm": 3.9050345420837402, "learning_rate": 4.241654957325748e-07, "loss": 0.3016, "step": 19720 }, { "epoch": 2.3756773028296205, "grad_norm": 5.106329917907715, "learning_rate": 4.2364623061032477e-07, "loss": 0.3043, "step": 19730 }, { "epoch": 2.3768813967489466, "grad_norm": 6.0447211265563965, "learning_rate": 4.231270497916343e-07, "loss": 0.3114, "step": 19740 }, { "epoch": 2.3780854906682722, "grad_norm": 4.171956539154053, "learning_rate": 4.2260795384974037e-07, "loss": 0.3033, "step": 19750 }, { "epoch": 2.379289584587598, "grad_norm": 4.500546932220459, "learning_rate": 4.2208894335778573e-07, "loss": 0.3066, "step": 19760 }, { "epoch": 2.3804936785069235, "grad_norm": 5.30014181137085, "learning_rate": 4.215700188888192e-07, "loss": 0.3008, "step": 19770 }, { "epoch": 2.381697772426249, "grad_norm": 4.23181676864624, "learning_rate": 4.2105118101579497e-07, "loss": 0.2925, "step": 19780 }, { "epoch": 2.3829018663455748, "grad_norm": 4.446700096130371, "learning_rate": 4.205324303115706e-07, "loss": 0.3142, "step": 19790 }, { "epoch": 2.384105960264901, "grad_norm": 5.344078063964844, "learning_rate": 4.2001376734890824e-07, "loss": 0.3053, "step": 19800 }, { "epoch": 2.3853100541842265, "grad_norm": 5.066955089569092, "learning_rate": 4.1949519270047295e-07, "loss": 0.3071, "step": 19810 }, { "epoch": 2.386514148103552, "grad_norm": 4.834653377532959, "learning_rate": 4.1897670693883255e-07, "loss": 0.3039, "step": 19820 }, { "epoch": 2.3877182420228777, "grad_norm": 4.982695579528809, "learning_rate": 4.1845831063645586e-07, "loss": 0.3007, "step": 19830 }, { "epoch": 2.3889223359422034, "grad_norm": 5.261125564575195, "learning_rate": 4.1794000436571374e-07, "loss": 0.3121, "step": 19840 }, { "epoch": 2.390126429861529, "grad_norm": 5.1389570236206055, "learning_rate": 4.174217886988775e-07, "loss": 0.3058, "step": 19850 }, { "epoch": 2.391330523780855, "grad_norm": 4.307366371154785, "learning_rate": 4.169036642081183e-07, "loss": 0.3008, "step": 19860 }, { "epoch": 2.3925346177001807, "grad_norm": 5.068446636199951, "learning_rate": 4.163856314655064e-07, "loss": 0.3145, "step": 19870 }, { "epoch": 2.3937387116195064, "grad_norm": 5.377712249755859, "learning_rate": 4.1586769104301124e-07, "loss": 0.3047, "step": 19880 }, { "epoch": 2.394942805538832, "grad_norm": 5.161853313446045, "learning_rate": 4.153498435124999e-07, "loss": 0.3111, "step": 19890 }, { "epoch": 2.3961468994581576, "grad_norm": 4.217031002044678, "learning_rate": 4.1483208944573745e-07, "loss": 0.2886, "step": 19900 }, { "epoch": 2.3973509933774833, "grad_norm": 4.948873996734619, "learning_rate": 4.1431442941438486e-07, "loss": 0.3138, "step": 19910 }, { "epoch": 2.3985550872968093, "grad_norm": 5.304249286651611, "learning_rate": 4.1379686399000016e-07, "loss": 0.3013, "step": 19920 }, { "epoch": 2.399759181216135, "grad_norm": 5.372039318084717, "learning_rate": 4.132793937440365e-07, "loss": 0.316, "step": 19930 }, { "epoch": 2.4009632751354606, "grad_norm": 5.1526265144348145, "learning_rate": 4.127620192478421e-07, "loss": 0.3177, "step": 19940 }, { "epoch": 2.4021673690547862, "grad_norm": 4.650707244873047, "learning_rate": 4.122447410726591e-07, "loss": 0.3014, "step": 19950 }, { "epoch": 2.403371462974112, "grad_norm": 4.576737403869629, "learning_rate": 4.1172755978962395e-07, "loss": 0.3069, "step": 19960 }, { "epoch": 2.4045755568934375, "grad_norm": 5.201079845428467, "learning_rate": 4.1121047596976534e-07, "loss": 0.3151, "step": 19970 }, { "epoch": 2.4057796508127636, "grad_norm": 4.859030723571777, "learning_rate": 4.1069349018400503e-07, "loss": 0.298, "step": 19980 }, { "epoch": 2.406983744732089, "grad_norm": 5.44400691986084, "learning_rate": 4.101766030031562e-07, "loss": 0.303, "step": 19990 }, { "epoch": 2.408187838651415, "grad_norm": 4.533078193664551, "learning_rate": 4.0965981499792307e-07, "loss": 0.3055, "step": 20000 }, { "epoch": 2.4093919325707405, "grad_norm": 5.147141456604004, "learning_rate": 4.0914312673890054e-07, "loss": 0.3141, "step": 20010 }, { "epoch": 2.410596026490066, "grad_norm": 4.530623912811279, "learning_rate": 4.0862653879657373e-07, "loss": 0.3205, "step": 20020 }, { "epoch": 2.411800120409392, "grad_norm": 4.804474830627441, "learning_rate": 4.08110051741316e-07, "loss": 0.3113, "step": 20030 }, { "epoch": 2.413004214328718, "grad_norm": 4.642183780670166, "learning_rate": 4.0759366614339015e-07, "loss": 0.3115, "step": 20040 }, { "epoch": 2.4142083082480434, "grad_norm": 4.975921630859375, "learning_rate": 4.0707738257294685e-07, "loss": 0.3165, "step": 20050 }, { "epoch": 2.415412402167369, "grad_norm": 4.621540546417236, "learning_rate": 4.065612016000241e-07, "loss": 0.2914, "step": 20060 }, { "epoch": 2.4166164960866947, "grad_norm": 4.194451808929443, "learning_rate": 4.060451237945462e-07, "loss": 0.3035, "step": 20070 }, { "epoch": 2.4178205900060203, "grad_norm": 4.82729959487915, "learning_rate": 4.05529149726324e-07, "loss": 0.3068, "step": 20080 }, { "epoch": 2.419024683925346, "grad_norm": 5.17459774017334, "learning_rate": 4.050132799650538e-07, "loss": 0.3092, "step": 20090 }, { "epoch": 2.420228777844672, "grad_norm": 5.787187576293945, "learning_rate": 4.0449751508031666e-07, "loss": 0.3168, "step": 20100 }, { "epoch": 2.4214328717639977, "grad_norm": 4.466209411621094, "learning_rate": 4.039818556415775e-07, "loss": 0.296, "step": 20110 }, { "epoch": 2.4226369656833233, "grad_norm": 4.929852485656738, "learning_rate": 4.034663022181852e-07, "loss": 0.3135, "step": 20120 }, { "epoch": 2.423841059602649, "grad_norm": 4.523739337921143, "learning_rate": 4.029508553793718e-07, "loss": 0.288, "step": 20130 }, { "epoch": 2.4250451535219746, "grad_norm": 7.000367641448975, "learning_rate": 4.0243551569425095e-07, "loss": 0.3105, "step": 20140 }, { "epoch": 2.4262492474413007, "grad_norm": 6.229575157165527, "learning_rate": 4.019202837318185e-07, "loss": 0.3166, "step": 20150 }, { "epoch": 2.4274533413606263, "grad_norm": 5.243337154388428, "learning_rate": 4.0140516006095134e-07, "loss": 0.3046, "step": 20160 }, { "epoch": 2.428657435279952, "grad_norm": 4.598159313201904, "learning_rate": 4.0089014525040685e-07, "loss": 0.3064, "step": 20170 }, { "epoch": 2.4298615291992776, "grad_norm": 4.482394695281982, "learning_rate": 4.003752398688218e-07, "loss": 0.3097, "step": 20180 }, { "epoch": 2.431065623118603, "grad_norm": 5.39198637008667, "learning_rate": 3.9986044448471244e-07, "loss": 0.3112, "step": 20190 }, { "epoch": 2.432269717037929, "grad_norm": 4.356963634490967, "learning_rate": 3.9934575966647375e-07, "loss": 0.3006, "step": 20200 }, { "epoch": 2.4334738109572545, "grad_norm": 4.211975574493408, "learning_rate": 3.9883118598237837e-07, "loss": 0.2989, "step": 20210 }, { "epoch": 2.4346779048765805, "grad_norm": 5.301422119140625, "learning_rate": 3.9831672400057605e-07, "loss": 0.3178, "step": 20220 }, { "epoch": 2.435881998795906, "grad_norm": 4.181766510009766, "learning_rate": 3.978023742890937e-07, "loss": 0.3066, "step": 20230 }, { "epoch": 2.437086092715232, "grad_norm": 5.18208122253418, "learning_rate": 3.9728813741583383e-07, "loss": 0.3001, "step": 20240 }, { "epoch": 2.4382901866345574, "grad_norm": 5.382752418518066, "learning_rate": 3.967740139485748e-07, "loss": 0.3088, "step": 20250 }, { "epoch": 2.439494280553883, "grad_norm": 5.215182304382324, "learning_rate": 3.9626000445496934e-07, "loss": 0.2882, "step": 20260 }, { "epoch": 2.440698374473209, "grad_norm": 5.133399963378906, "learning_rate": 3.957461095025444e-07, "loss": 0.3303, "step": 20270 }, { "epoch": 2.4419024683925348, "grad_norm": 5.194669246673584, "learning_rate": 3.952323296587007e-07, "loss": 0.3172, "step": 20280 }, { "epoch": 2.4431065623118604, "grad_norm": 4.95144510269165, "learning_rate": 3.947186654907119e-07, "loss": 0.3138, "step": 20290 }, { "epoch": 2.444310656231186, "grad_norm": 5.0588812828063965, "learning_rate": 3.9420511756572346e-07, "loss": 0.3058, "step": 20300 }, { "epoch": 2.4455147501505117, "grad_norm": 5.033606052398682, "learning_rate": 3.936916864507529e-07, "loss": 0.3161, "step": 20310 }, { "epoch": 2.4467188440698373, "grad_norm": 5.006187915802002, "learning_rate": 3.9317837271268876e-07, "loss": 0.2993, "step": 20320 }, { "epoch": 2.447922937989163, "grad_norm": 4.955638408660889, "learning_rate": 3.926651769182901e-07, "loss": 0.3023, "step": 20330 }, { "epoch": 2.449127031908489, "grad_norm": 4.786928653717041, "learning_rate": 3.9215209963418513e-07, "loss": 0.3207, "step": 20340 }, { "epoch": 2.4503311258278146, "grad_norm": 4.456767559051514, "learning_rate": 3.9163914142687177e-07, "loss": 0.3142, "step": 20350 }, { "epoch": 2.4515352197471403, "grad_norm": 5.671106338500977, "learning_rate": 3.911263028627164e-07, "loss": 0.3125, "step": 20360 }, { "epoch": 2.452739313666466, "grad_norm": 5.525556564331055, "learning_rate": 3.9061358450795344e-07, "loss": 0.2972, "step": 20370 }, { "epoch": 2.4539434075857915, "grad_norm": 4.18988561630249, "learning_rate": 3.9010098692868397e-07, "loss": 0.2971, "step": 20380 }, { "epoch": 2.4551475015051176, "grad_norm": 5.705048561096191, "learning_rate": 3.895885106908763e-07, "loss": 0.3094, "step": 20390 }, { "epoch": 2.4563515954244433, "grad_norm": 5.453742980957031, "learning_rate": 3.890761563603647e-07, "loss": 0.3079, "step": 20400 }, { "epoch": 2.457555689343769, "grad_norm": 4.007357120513916, "learning_rate": 3.885639245028488e-07, "loss": 0.3119, "step": 20410 }, { "epoch": 2.4587597832630945, "grad_norm": 5.247729301452637, "learning_rate": 3.8805181568389255e-07, "loss": 0.3047, "step": 20420 }, { "epoch": 2.45996387718242, "grad_norm": 4.143746852874756, "learning_rate": 3.8753983046892465e-07, "loss": 0.3062, "step": 20430 }, { "epoch": 2.461167971101746, "grad_norm": 4.356471538543701, "learning_rate": 3.8702796942323736e-07, "loss": 0.3095, "step": 20440 }, { "epoch": 2.4623720650210714, "grad_norm": 4.553625106811523, "learning_rate": 3.8651623311198516e-07, "loss": 0.3117, "step": 20450 }, { "epoch": 2.4635761589403975, "grad_norm": 4.882122039794922, "learning_rate": 3.860046221001855e-07, "loss": 0.322, "step": 20460 }, { "epoch": 2.464780252859723, "grad_norm": 5.218991756439209, "learning_rate": 3.854931369527172e-07, "loss": 0.3138, "step": 20470 }, { "epoch": 2.4659843467790488, "grad_norm": 5.427024841308594, "learning_rate": 3.849817782343201e-07, "loss": 0.3125, "step": 20480 }, { "epoch": 2.4671884406983744, "grad_norm": 4.729675769805908, "learning_rate": 3.8447054650959447e-07, "loss": 0.2925, "step": 20490 }, { "epoch": 2.4683925346177, "grad_norm": 5.330557346343994, "learning_rate": 3.8395944234300053e-07, "loss": 0.2968, "step": 20500 }, { "epoch": 2.469596628537026, "grad_norm": 4.960201740264893, "learning_rate": 3.834484662988573e-07, "loss": 0.3147, "step": 20510 }, { "epoch": 2.4708007224563517, "grad_norm": 4.888551235198975, "learning_rate": 3.829376189413427e-07, "loss": 0.3098, "step": 20520 }, { "epoch": 2.4720048163756774, "grad_norm": 4.717561721801758, "learning_rate": 3.824269008344924e-07, "loss": 0.3018, "step": 20530 }, { "epoch": 2.473208910295003, "grad_norm": 4.666635990142822, "learning_rate": 3.8191631254219927e-07, "loss": 0.2942, "step": 20540 }, { "epoch": 2.4744130042143286, "grad_norm": 5.138599872589111, "learning_rate": 3.8140585462821296e-07, "loss": 0.2922, "step": 20550 }, { "epoch": 2.4756170981336543, "grad_norm": 5.150256633758545, "learning_rate": 3.808955276561395e-07, "loss": 0.3039, "step": 20560 }, { "epoch": 2.47682119205298, "grad_norm": 5.677982807159424, "learning_rate": 3.8038533218943954e-07, "loss": 0.2928, "step": 20570 }, { "epoch": 2.478025285972306, "grad_norm": 4.552664756774902, "learning_rate": 3.798752687914292e-07, "loss": 0.3108, "step": 20580 }, { "epoch": 2.4792293798916316, "grad_norm": 4.48048210144043, "learning_rate": 3.7936533802527855e-07, "loss": 0.3159, "step": 20590 }, { "epoch": 2.4804334738109572, "grad_norm": 4.3352370262146, "learning_rate": 3.7885554045401147e-07, "loss": 0.3079, "step": 20600 }, { "epoch": 2.481637567730283, "grad_norm": 4.1587653160095215, "learning_rate": 3.783458766405042e-07, "loss": 0.3036, "step": 20610 }, { "epoch": 2.4828416616496085, "grad_norm": 4.668213844299316, "learning_rate": 3.7783634714748584e-07, "loss": 0.3003, "step": 20620 }, { "epoch": 2.4840457555689346, "grad_norm": 4.186696529388428, "learning_rate": 3.7732695253753697e-07, "loss": 0.3192, "step": 20630 }, { "epoch": 2.48524984948826, "grad_norm": 4.841115951538086, "learning_rate": 3.7681769337308954e-07, "loss": 0.3064, "step": 20640 }, { "epoch": 2.486453943407586, "grad_norm": 4.4625020027160645, "learning_rate": 3.7630857021642514e-07, "loss": 0.3059, "step": 20650 }, { "epoch": 2.4876580373269115, "grad_norm": 4.459711074829102, "learning_rate": 3.757995836296761e-07, "loss": 0.2925, "step": 20660 }, { "epoch": 2.488862131246237, "grad_norm": 4.983307361602783, "learning_rate": 3.7529073417482345e-07, "loss": 0.2961, "step": 20670 }, { "epoch": 2.4900662251655628, "grad_norm": 4.813161373138428, "learning_rate": 3.747820224136973e-07, "loss": 0.3138, "step": 20680 }, { "epoch": 2.4912703190848884, "grad_norm": 4.922794342041016, "learning_rate": 3.742734489079748e-07, "loss": 0.3219, "step": 20690 }, { "epoch": 2.4924744130042145, "grad_norm": 5.428676128387451, "learning_rate": 3.737650142191814e-07, "loss": 0.3077, "step": 20700 }, { "epoch": 2.49367850692354, "grad_norm": 4.670940399169922, "learning_rate": 3.7325671890868895e-07, "loss": 0.3035, "step": 20710 }, { "epoch": 2.4948826008428657, "grad_norm": 4.245230674743652, "learning_rate": 3.727485635377153e-07, "loss": 0.3102, "step": 20720 }, { "epoch": 2.4960866947621914, "grad_norm": 4.281071186065674, "learning_rate": 3.7224054866732366e-07, "loss": 0.2848, "step": 20730 }, { "epoch": 2.497290788681517, "grad_norm": 4.969486236572266, "learning_rate": 3.717326748584227e-07, "loss": 0.3109, "step": 20740 }, { "epoch": 2.498494882600843, "grad_norm": 6.3518500328063965, "learning_rate": 3.712249426717647e-07, "loss": 0.321, "step": 20750 }, { "epoch": 2.4996989765201687, "grad_norm": 4.896385192871094, "learning_rate": 3.707173526679458e-07, "loss": 0.3096, "step": 20760 }, { "epoch": 2.5009030704394943, "grad_norm": 4.546391487121582, "learning_rate": 3.702099054074054e-07, "loss": 0.3153, "step": 20770 }, { "epoch": 2.50210716435882, "grad_norm": 4.817781925201416, "learning_rate": 3.6970260145042475e-07, "loss": 0.3072, "step": 20780 }, { "epoch": 2.5033112582781456, "grad_norm": 4.495319366455078, "learning_rate": 3.691954413571276e-07, "loss": 0.316, "step": 20790 }, { "epoch": 2.5045153521974717, "grad_norm": 4.200586318969727, "learning_rate": 3.6868842568747826e-07, "loss": 0.3146, "step": 20800 }, { "epoch": 2.505719446116797, "grad_norm": 5.999356269836426, "learning_rate": 3.681815550012816e-07, "loss": 0.3087, "step": 20810 }, { "epoch": 2.506923540036123, "grad_norm": 4.140690326690674, "learning_rate": 3.676748298581828e-07, "loss": 0.2786, "step": 20820 }, { "epoch": 2.5081276339554486, "grad_norm": 4.519384384155273, "learning_rate": 3.6716825081766634e-07, "loss": 0.3073, "step": 20830 }, { "epoch": 2.509331727874774, "grad_norm": 4.580509185791016, "learning_rate": 3.6666181843905477e-07, "loss": 0.3224, "step": 20840 }, { "epoch": 2.5105358217941, "grad_norm": 4.371671676635742, "learning_rate": 3.661555332815092e-07, "loss": 0.303, "step": 20850 }, { "epoch": 2.5117399157134255, "grad_norm": 5.235719680786133, "learning_rate": 3.656493959040283e-07, "loss": 0.3104, "step": 20860 }, { "epoch": 2.5129440096327516, "grad_norm": 5.564718246459961, "learning_rate": 3.651434068654474e-07, "loss": 0.3111, "step": 20870 }, { "epoch": 2.514148103552077, "grad_norm": 4.76020622253418, "learning_rate": 3.646375667244378e-07, "loss": 0.3153, "step": 20880 }, { "epoch": 2.515352197471403, "grad_norm": 4.534407138824463, "learning_rate": 3.6413187603950667e-07, "loss": 0.305, "step": 20890 }, { "epoch": 2.5165562913907285, "grad_norm": 5.413814067840576, "learning_rate": 3.636263353689962e-07, "loss": 0.3088, "step": 20900 }, { "epoch": 2.517760385310054, "grad_norm": 5.003753185272217, "learning_rate": 3.6312094527108307e-07, "loss": 0.3146, "step": 20910 }, { "epoch": 2.51896447922938, "grad_norm": 5.368070125579834, "learning_rate": 3.6261570630377713e-07, "loss": 0.3131, "step": 20920 }, { "epoch": 2.5201685731487053, "grad_norm": 5.054159641265869, "learning_rate": 3.621106190249219e-07, "loss": 0.2967, "step": 20930 }, { "epoch": 2.5213726670680314, "grad_norm": 5.523135185241699, "learning_rate": 3.616056839921932e-07, "loss": 0.3154, "step": 20940 }, { "epoch": 2.522576760987357, "grad_norm": 5.352376937866211, "learning_rate": 3.6110090176309914e-07, "loss": 0.3033, "step": 20950 }, { "epoch": 2.5237808549066827, "grad_norm": 3.677163600921631, "learning_rate": 3.605962728949783e-07, "loss": 0.3198, "step": 20960 }, { "epoch": 2.5249849488260083, "grad_norm": 4.4316840171813965, "learning_rate": 3.6009179794500067e-07, "loss": 0.304, "step": 20970 }, { "epoch": 2.526189042745334, "grad_norm": 4.927300453186035, "learning_rate": 3.5958747747016603e-07, "loss": 0.3221, "step": 20980 }, { "epoch": 2.52739313666466, "grad_norm": 5.448822975158691, "learning_rate": 3.590833120273038e-07, "loss": 0.3186, "step": 20990 }, { "epoch": 2.5285972305839857, "grad_norm": 4.188570022583008, "learning_rate": 3.5857930217307163e-07, "loss": 0.3015, "step": 21000 }, { "epoch": 2.5298013245033113, "grad_norm": 4.157015323638916, "learning_rate": 3.580754484639561e-07, "loss": 0.2909, "step": 21010 }, { "epoch": 2.531005418422637, "grad_norm": 4.773519992828369, "learning_rate": 3.5757175145627107e-07, "loss": 0.3034, "step": 21020 }, { "epoch": 2.5322095123419626, "grad_norm": 5.435080051422119, "learning_rate": 3.570682117061573e-07, "loss": 0.3148, "step": 21030 }, { "epoch": 2.5334136062612886, "grad_norm": 4.959787368774414, "learning_rate": 3.56564829769582e-07, "loss": 0.3115, "step": 21040 }, { "epoch": 2.534617700180614, "grad_norm": 4.7358880043029785, "learning_rate": 3.5606160620233815e-07, "loss": 0.3078, "step": 21050 }, { "epoch": 2.53582179409994, "grad_norm": 4.220034599304199, "learning_rate": 3.5555854156004404e-07, "loss": 0.298, "step": 21060 }, { "epoch": 2.5370258880192655, "grad_norm": 4.433871746063232, "learning_rate": 3.550556363981422e-07, "loss": 0.2809, "step": 21070 }, { "epoch": 2.538229981938591, "grad_norm": 4.491239070892334, "learning_rate": 3.5455289127189907e-07, "loss": 0.3179, "step": 21080 }, { "epoch": 2.539434075857917, "grad_norm": 4.969503879547119, "learning_rate": 3.540503067364047e-07, "loss": 0.3018, "step": 21090 }, { "epoch": 2.5406381697772424, "grad_norm": 4.266849040985107, "learning_rate": 3.535478833465717e-07, "loss": 0.3121, "step": 21100 }, { "epoch": 2.5418422636965685, "grad_norm": 4.8507771492004395, "learning_rate": 3.5304562165713435e-07, "loss": 0.317, "step": 21110 }, { "epoch": 2.543046357615894, "grad_norm": 4.610383987426758, "learning_rate": 3.525435222226491e-07, "loss": 0.3083, "step": 21120 }, { "epoch": 2.54425045153522, "grad_norm": 4.408012390136719, "learning_rate": 3.5204158559749275e-07, "loss": 0.3141, "step": 21130 }, { "epoch": 2.5454545454545454, "grad_norm": 5.178010940551758, "learning_rate": 3.5153981233586274e-07, "loss": 0.3106, "step": 21140 }, { "epoch": 2.546658639373871, "grad_norm": 4.6306681632995605, "learning_rate": 3.5103820299177535e-07, "loss": 0.3086, "step": 21150 }, { "epoch": 2.547862733293197, "grad_norm": 5.366611003875732, "learning_rate": 3.505367581190668e-07, "loss": 0.2985, "step": 21160 }, { "epoch": 2.5490668272125223, "grad_norm": 5.572306156158447, "learning_rate": 3.5003547827139125e-07, "loss": 0.2976, "step": 21170 }, { "epoch": 2.5502709211318484, "grad_norm": 5.326085090637207, "learning_rate": 3.495343640022209e-07, "loss": 0.2971, "step": 21180 }, { "epoch": 2.551475015051174, "grad_norm": 7.600101947784424, "learning_rate": 3.4903341586484456e-07, "loss": 0.2961, "step": 21190 }, { "epoch": 2.5526791089704997, "grad_norm": 4.568670272827148, "learning_rate": 3.4853263441236834e-07, "loss": 0.3142, "step": 21200 }, { "epoch": 2.5538832028898253, "grad_norm": 4.9445695877075195, "learning_rate": 3.480320201977138e-07, "loss": 0.2988, "step": 21210 }, { "epoch": 2.555087296809151, "grad_norm": 5.26786994934082, "learning_rate": 3.475315737736183e-07, "loss": 0.3074, "step": 21220 }, { "epoch": 2.556291390728477, "grad_norm": 4.316328525543213, "learning_rate": 3.4703129569263323e-07, "loss": 0.2917, "step": 21230 }, { "epoch": 2.5574954846478026, "grad_norm": 4.018758773803711, "learning_rate": 3.465311865071248e-07, "loss": 0.2967, "step": 21240 }, { "epoch": 2.5586995785671283, "grad_norm": 5.121528625488281, "learning_rate": 3.460312467692725e-07, "loss": 0.3061, "step": 21250 }, { "epoch": 2.559903672486454, "grad_norm": 4.710129261016846, "learning_rate": 3.4553147703106886e-07, "loss": 0.3074, "step": 21260 }, { "epoch": 2.5611077664057795, "grad_norm": 4.447737216949463, "learning_rate": 3.4503187784431825e-07, "loss": 0.3062, "step": 21270 }, { "epoch": 2.5623118603251056, "grad_norm": 4.8179612159729, "learning_rate": 3.445324497606372e-07, "loss": 0.3007, "step": 21280 }, { "epoch": 2.563515954244431, "grad_norm": 4.53162956237793, "learning_rate": 3.440331933314532e-07, "loss": 0.3103, "step": 21290 }, { "epoch": 2.564720048163757, "grad_norm": 4.889903545379639, "learning_rate": 3.435341091080042e-07, "loss": 0.3109, "step": 21300 }, { "epoch": 2.5659241420830825, "grad_norm": 4.858291149139404, "learning_rate": 3.430351976413378e-07, "loss": 0.3191, "step": 21310 }, { "epoch": 2.567128236002408, "grad_norm": 4.58107852935791, "learning_rate": 3.425364594823114e-07, "loss": 0.2853, "step": 21320 }, { "epoch": 2.5683323299217338, "grad_norm": 5.6206207275390625, "learning_rate": 3.420378951815903e-07, "loss": 0.3081, "step": 21330 }, { "epoch": 2.5695364238410594, "grad_norm": 5.069255352020264, "learning_rate": 3.4153950528964866e-07, "loss": 0.3034, "step": 21340 }, { "epoch": 2.5707405177603855, "grad_norm": 5.086771488189697, "learning_rate": 3.4104129035676743e-07, "loss": 0.318, "step": 21350 }, { "epoch": 2.571944611679711, "grad_norm": 5.416161060333252, "learning_rate": 3.4054325093303447e-07, "loss": 0.3062, "step": 21360 }, { "epoch": 2.5731487055990367, "grad_norm": 4.536307334899902, "learning_rate": 3.4004538756834415e-07, "loss": 0.3028, "step": 21370 }, { "epoch": 2.5743527995183624, "grad_norm": 4.512822151184082, "learning_rate": 3.3954770081239657e-07, "loss": 0.3046, "step": 21380 }, { "epoch": 2.575556893437688, "grad_norm": 5.5262322425842285, "learning_rate": 3.39050191214696e-07, "loss": 0.3012, "step": 21390 }, { "epoch": 2.576760987357014, "grad_norm": 5.3342509269714355, "learning_rate": 3.38552859324552e-07, "loss": 0.3046, "step": 21400 }, { "epoch": 2.5779650812763397, "grad_norm": 4.271503925323486, "learning_rate": 3.380557056910778e-07, "loss": 0.3097, "step": 21410 }, { "epoch": 2.5791691751956654, "grad_norm": 4.600352764129639, "learning_rate": 3.375587308631891e-07, "loss": 0.3094, "step": 21420 }, { "epoch": 2.580373269114991, "grad_norm": 4.630692958831787, "learning_rate": 3.3706193538960493e-07, "loss": 0.3117, "step": 21430 }, { "epoch": 2.5815773630343166, "grad_norm": 4.425769329071045, "learning_rate": 3.3656531981884604e-07, "loss": 0.3097, "step": 21440 }, { "epoch": 2.5827814569536423, "grad_norm": 4.963135242462158, "learning_rate": 3.3606888469923474e-07, "loss": 0.3079, "step": 21450 }, { "epoch": 2.583985550872968, "grad_norm": 5.204167366027832, "learning_rate": 3.3557263057889344e-07, "loss": 0.2965, "step": 21460 }, { "epoch": 2.585189644792294, "grad_norm": 4.431160926818848, "learning_rate": 3.3507655800574554e-07, "loss": 0.2973, "step": 21470 }, { "epoch": 2.5863937387116196, "grad_norm": 5.386955261230469, "learning_rate": 3.345806675275134e-07, "loss": 0.3035, "step": 21480 }, { "epoch": 2.5875978326309452, "grad_norm": 4.363948345184326, "learning_rate": 3.340849596917189e-07, "loss": 0.2848, "step": 21490 }, { "epoch": 2.588801926550271, "grad_norm": 4.813036918640137, "learning_rate": 3.3358943504568147e-07, "loss": 0.3086, "step": 21500 }, { "epoch": 2.5900060204695965, "grad_norm": 4.847212791442871, "learning_rate": 3.3309409413651895e-07, "loss": 0.2939, "step": 21510 }, { "epoch": 2.5912101143889226, "grad_norm": 6.291325569152832, "learning_rate": 3.3259893751114606e-07, "loss": 0.3117, "step": 21520 }, { "epoch": 2.592414208308248, "grad_norm": 5.317537307739258, "learning_rate": 3.321039657162742e-07, "loss": 0.3222, "step": 21530 }, { "epoch": 2.593618302227574, "grad_norm": 4.0502190589904785, "learning_rate": 3.3160917929841027e-07, "loss": 0.2994, "step": 21540 }, { "epoch": 2.5948223961468995, "grad_norm": 5.079105377197266, "learning_rate": 3.3111457880385686e-07, "loss": 0.3002, "step": 21550 }, { "epoch": 2.596026490066225, "grad_norm": 5.073225975036621, "learning_rate": 3.3062016477871147e-07, "loss": 0.2969, "step": 21560 }, { "epoch": 2.5972305839855507, "grad_norm": 5.702369689941406, "learning_rate": 3.3012593776886524e-07, "loss": 0.3229, "step": 21570 }, { "epoch": 2.5984346779048764, "grad_norm": 5.685046672821045, "learning_rate": 3.296318983200028e-07, "loss": 0.3149, "step": 21580 }, { "epoch": 2.5996387718242024, "grad_norm": 5.351219654083252, "learning_rate": 3.2913804697760244e-07, "loss": 0.3116, "step": 21590 }, { "epoch": 2.600842865743528, "grad_norm": 4.610897541046143, "learning_rate": 3.286443842869338e-07, "loss": 0.3092, "step": 21600 }, { "epoch": 2.6020469596628537, "grad_norm": 4.982673168182373, "learning_rate": 3.2815091079305895e-07, "loss": 0.2942, "step": 21610 }, { "epoch": 2.6032510535821793, "grad_norm": 5.005990982055664, "learning_rate": 3.2765762704083067e-07, "loss": 0.311, "step": 21620 }, { "epoch": 2.604455147501505, "grad_norm": 4.512310028076172, "learning_rate": 3.271645335748923e-07, "loss": 0.3267, "step": 21630 }, { "epoch": 2.605659241420831, "grad_norm": 4.117137432098389, "learning_rate": 3.2667163093967716e-07, "loss": 0.3003, "step": 21640 }, { "epoch": 2.6068633353401567, "grad_norm": 5.019242763519287, "learning_rate": 3.2617891967940806e-07, "loss": 0.2979, "step": 21650 }, { "epoch": 2.6080674292594823, "grad_norm": 4.304302215576172, "learning_rate": 3.2568640033809597e-07, "loss": 0.3009, "step": 21660 }, { "epoch": 2.609271523178808, "grad_norm": 5.543119430541992, "learning_rate": 3.2519407345954043e-07, "loss": 0.3085, "step": 21670 }, { "epoch": 2.6104756170981336, "grad_norm": 4.892364025115967, "learning_rate": 3.247019395873283e-07, "loss": 0.2965, "step": 21680 }, { "epoch": 2.611679711017459, "grad_norm": 3.9560534954071045, "learning_rate": 3.242099992648336e-07, "loss": 0.2994, "step": 21690 }, { "epoch": 2.612883804936785, "grad_norm": 4.653574466705322, "learning_rate": 3.2371825303521604e-07, "loss": 0.3072, "step": 21700 }, { "epoch": 2.614087898856111, "grad_norm": 4.340296268463135, "learning_rate": 3.232267014414216e-07, "loss": 0.2965, "step": 21710 }, { "epoch": 2.6152919927754366, "grad_norm": 3.889099597930908, "learning_rate": 3.2273534502618136e-07, "loss": 0.3212, "step": 21720 }, { "epoch": 2.616496086694762, "grad_norm": 4.952009201049805, "learning_rate": 3.2224418433201033e-07, "loss": 0.3121, "step": 21730 }, { "epoch": 2.617700180614088, "grad_norm": 5.229816913604736, "learning_rate": 3.2175321990120797e-07, "loss": 0.304, "step": 21740 }, { "epoch": 2.6189042745334135, "grad_norm": 4.951354503631592, "learning_rate": 3.2126245227585693e-07, "loss": 0.3024, "step": 21750 }, { "epoch": 2.6201083684527395, "grad_norm": 5.034163475036621, "learning_rate": 3.2077188199782257e-07, "loss": 0.3057, "step": 21760 }, { "epoch": 2.621312462372065, "grad_norm": 5.984414100646973, "learning_rate": 3.20281509608752e-07, "loss": 0.3209, "step": 21770 }, { "epoch": 2.622516556291391, "grad_norm": 4.373472213745117, "learning_rate": 3.1979133565007434e-07, "loss": 0.2947, "step": 21780 }, { "epoch": 2.6237206502107164, "grad_norm": 4.750053405761719, "learning_rate": 3.193013606629994e-07, "loss": 0.3196, "step": 21790 }, { "epoch": 2.624924744130042, "grad_norm": 4.528110027313232, "learning_rate": 3.188115851885174e-07, "loss": 0.3053, "step": 21800 }, { "epoch": 2.6261288380493677, "grad_norm": 4.8642072677612305, "learning_rate": 3.1832200976739786e-07, "loss": 0.3328, "step": 21810 }, { "epoch": 2.6273329319686933, "grad_norm": 4.624762535095215, "learning_rate": 3.1783263494019e-07, "loss": 0.3123, "step": 21820 }, { "epoch": 2.6285370258880194, "grad_norm": 4.700741767883301, "learning_rate": 3.1734346124722135e-07, "loss": 0.3011, "step": 21830 }, { "epoch": 2.629741119807345, "grad_norm": 5.0118021965026855, "learning_rate": 3.1685448922859716e-07, "loss": 0.3163, "step": 21840 }, { "epoch": 2.6309452137266707, "grad_norm": 5.321165084838867, "learning_rate": 3.1636571942420014e-07, "loss": 0.3019, "step": 21850 }, { "epoch": 2.6321493076459963, "grad_norm": 5.864070892333984, "learning_rate": 3.1587715237368996e-07, "loss": 0.3027, "step": 21860 }, { "epoch": 2.633353401565322, "grad_norm": 4.458745956420898, "learning_rate": 3.1538878861650194e-07, "loss": 0.3152, "step": 21870 }, { "epoch": 2.634557495484648, "grad_norm": 4.945919036865234, "learning_rate": 3.149006286918474e-07, "loss": 0.3238, "step": 21880 }, { "epoch": 2.6357615894039736, "grad_norm": 4.671433448791504, "learning_rate": 3.144126731387126e-07, "loss": 0.2941, "step": 21890 }, { "epoch": 2.6369656833232993, "grad_norm": 5.389127731323242, "learning_rate": 3.1392492249585744e-07, "loss": 0.3223, "step": 21900 }, { "epoch": 2.638169777242625, "grad_norm": 5.42547607421875, "learning_rate": 3.134373773018165e-07, "loss": 0.305, "step": 21910 }, { "epoch": 2.6393738711619505, "grad_norm": 5.633350849151611, "learning_rate": 3.129500380948973e-07, "loss": 0.296, "step": 21920 }, { "epoch": 2.640577965081276, "grad_norm": 4.668237209320068, "learning_rate": 3.1246290541317937e-07, "loss": 0.3032, "step": 21930 }, { "epoch": 2.641782059000602, "grad_norm": 4.56117057800293, "learning_rate": 3.119759797945147e-07, "loss": 0.3036, "step": 21940 }, { "epoch": 2.642986152919928, "grad_norm": 5.208002090454102, "learning_rate": 3.114892617765266e-07, "loss": 0.2983, "step": 21950 }, { "epoch": 2.6441902468392535, "grad_norm": 4.775214195251465, "learning_rate": 3.110027518966094e-07, "loss": 0.3104, "step": 21960 }, { "epoch": 2.645394340758579, "grad_norm": 4.55642032623291, "learning_rate": 3.1051645069192675e-07, "loss": 0.3162, "step": 21970 }, { "epoch": 2.646598434677905, "grad_norm": 4.810263156890869, "learning_rate": 3.1003035869941295e-07, "loss": 0.2958, "step": 21980 }, { "epoch": 2.6478025285972304, "grad_norm": 4.988792896270752, "learning_rate": 3.0954447645577063e-07, "loss": 0.308, "step": 21990 }, { "epoch": 2.6490066225165565, "grad_norm": 4.394057273864746, "learning_rate": 3.0905880449747134e-07, "loss": 0.2995, "step": 22000 } ], "logging_steps": 10, "max_steps": 33220, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.886664442836628e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }