diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28139 @@ +{ + "best_metric": 0.9950745058918885, + "best_model_checkpoint": "/content/drive/My Drive/results/checkpoint-32080", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 40100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012468827930174563, + "grad_norm": 1.4657020568847656, + "learning_rate": 1.9995012468827932e-05, + "loss": 1.0656, + "step": 10 + }, + { + "epoch": 0.0024937655860349127, + "grad_norm": 1.603716492652893, + "learning_rate": 1.9990024937655863e-05, + "loss": 1.0429, + "step": 20 + }, + { + "epoch": 0.003740648379052369, + "grad_norm": 1.6681715250015259, + "learning_rate": 1.998503740648379e-05, + "loss": 1.0375, + "step": 30 + }, + { + "epoch": 0.004987531172069825, + "grad_norm": 2.473482608795166, + "learning_rate": 1.998004987531172e-05, + "loss": 1.0195, + "step": 40 + }, + { + "epoch": 0.006234413965087282, + "grad_norm": 3.1133217811584473, + "learning_rate": 1.997506234413965e-05, + "loss": 0.9203, + "step": 50 + }, + { + "epoch": 0.007481296758104738, + "grad_norm": 4.7094550132751465, + "learning_rate": 1.9970074812967582e-05, + "loss": 0.8761, + "step": 60 + }, + { + "epoch": 0.008728179551122194, + "grad_norm": 5.289208889007568, + "learning_rate": 1.9965087281795513e-05, + "loss": 0.7031, + "step": 70 + }, + { + "epoch": 0.00997506234413965, + "grad_norm": 8.144381523132324, + "learning_rate": 1.9960099750623443e-05, + "loss": 0.5534, + "step": 80 + }, + { + "epoch": 0.011221945137157107, + "grad_norm": 10.670867919921875, + "learning_rate": 1.9955112219451374e-05, + "loss": 0.5365, + "step": 90 + }, + { + "epoch": 0.012468827930174564, + "grad_norm": 9.720280647277832, + "learning_rate": 1.995062344139651e-05, + "loss": 0.3846, + "step": 100 + }, + { + "epoch": 0.01371571072319202, + "grad_norm": 
8.397249221801758, + "learning_rate": 1.994563591022444e-05, + "loss": 0.4167, + "step": 110 + }, + { + "epoch": 0.014962593516209476, + "grad_norm": 5.264407157897949, + "learning_rate": 1.9940648379052368e-05, + "loss": 0.3376, + "step": 120 + }, + { + "epoch": 0.016209476309226933, + "grad_norm": 12.360579490661621, + "learning_rate": 1.99356608478803e-05, + "loss": 0.3407, + "step": 130 + }, + { + "epoch": 0.017456359102244388, + "grad_norm": 6.203169345855713, + "learning_rate": 1.993067331670823e-05, + "loss": 0.3516, + "step": 140 + }, + { + "epoch": 0.018703241895261846, + "grad_norm": 10.887129783630371, + "learning_rate": 1.9925685785536163e-05, + "loss": 0.3273, + "step": 150 + }, + { + "epoch": 0.0199501246882793, + "grad_norm": 4.2213287353515625, + "learning_rate": 1.992069825436409e-05, + "loss": 0.3442, + "step": 160 + }, + { + "epoch": 0.02119700748129676, + "grad_norm": 12.920991897583008, + "learning_rate": 1.991620947630923e-05, + "loss": 0.2905, + "step": 170 + }, + { + "epoch": 0.022443890274314215, + "grad_norm": 10.486888885498047, + "learning_rate": 1.9911221945137158e-05, + "loss": 0.3545, + "step": 180 + }, + { + "epoch": 0.02369077306733167, + "grad_norm": 6.999630928039551, + "learning_rate": 1.990623441396509e-05, + "loss": 0.1938, + "step": 190 + }, + { + "epoch": 0.02493765586034913, + "grad_norm": 21.676868438720703, + "learning_rate": 1.990124688279302e-05, + "loss": 0.263, + "step": 200 + }, + { + "epoch": 0.026184538653366583, + "grad_norm": 24.033039093017578, + "learning_rate": 1.9896758104738156e-05, + "loss": 0.2844, + "step": 210 + }, + { + "epoch": 0.02743142144638404, + "grad_norm": 5.284111976623535, + "learning_rate": 1.9891770573566086e-05, + "loss": 0.2864, + "step": 220 + }, + { + "epoch": 0.028678304239401497, + "grad_norm": 21.66871452331543, + "learning_rate": 1.9886783042394017e-05, + "loss": 0.3661, + "step": 230 + }, + { + "epoch": 0.029925187032418952, + "grad_norm": 2.484323740005493, + "learning_rate": 
1.9881795511221947e-05, + "loss": 0.1745, + "step": 240 + }, + { + "epoch": 0.03117206982543641, + "grad_norm": 12.316446304321289, + "learning_rate": 1.9876807980049878e-05, + "loss": 0.2384, + "step": 250 + }, + { + "epoch": 0.032418952618453865, + "grad_norm": 15.095812797546387, + "learning_rate": 1.987182044887781e-05, + "loss": 0.2639, + "step": 260 + }, + { + "epoch": 0.03366583541147132, + "grad_norm": 4.764219760894775, + "learning_rate": 1.9866832917705736e-05, + "loss": 0.1828, + "step": 270 + }, + { + "epoch": 0.034912718204488775, + "grad_norm": 6.600683689117432, + "learning_rate": 1.9861845386533667e-05, + "loss": 0.2929, + "step": 280 + }, + { + "epoch": 0.03615960099750624, + "grad_norm": 10.377800941467285, + "learning_rate": 1.9856857855361597e-05, + "loss": 0.2242, + "step": 290 + }, + { + "epoch": 0.03740648379052369, + "grad_norm": 14.586189270019531, + "learning_rate": 1.9851870324189528e-05, + "loss": 0.1666, + "step": 300 + }, + { + "epoch": 0.03865336658354115, + "grad_norm": 10.822226524353027, + "learning_rate": 1.984688279301746e-05, + "loss": 0.3677, + "step": 310 + }, + { + "epoch": 0.0399002493765586, + "grad_norm": 18.322172164916992, + "learning_rate": 1.984189526184539e-05, + "loss": 0.3073, + "step": 320 + }, + { + "epoch": 0.04114713216957606, + "grad_norm": 12.16933822631836, + "learning_rate": 1.983690773067332e-05, + "loss": 0.2328, + "step": 330 + }, + { + "epoch": 0.04239401496259352, + "grad_norm": 18.187196731567383, + "learning_rate": 1.983192019950125e-05, + "loss": 0.237, + "step": 340 + }, + { + "epoch": 0.043640897755610975, + "grad_norm": 19.095233917236328, + "learning_rate": 1.982693266832918e-05, + "loss": 0.2531, + "step": 350 + }, + { + "epoch": 0.04488778054862843, + "grad_norm": 19.445892333984375, + "learning_rate": 1.9821945137157108e-05, + "loss": 0.1914, + "step": 360 + }, + { + "epoch": 0.046134663341645885, + "grad_norm": 7.831539630889893, + "learning_rate": 1.981695760598504e-05, + "loss": 0.2373, + 
"step": 370 + }, + { + "epoch": 0.04738154613466334, + "grad_norm": 19.73242950439453, + "learning_rate": 1.981197007481297e-05, + "loss": 0.2471, + "step": 380 + }, + { + "epoch": 0.048628428927680795, + "grad_norm": 18.188596725463867, + "learning_rate": 1.98069825436409e-05, + "loss": 0.2306, + "step": 390 + }, + { + "epoch": 0.04987531172069826, + "grad_norm": 15.301477432250977, + "learning_rate": 1.9801995012468827e-05, + "loss": 0.2787, + "step": 400 + }, + { + "epoch": 0.05112219451371571, + "grad_norm": 0.6682356595993042, + "learning_rate": 1.9797007481296758e-05, + "loss": 0.1546, + "step": 410 + }, + { + "epoch": 0.05236907730673317, + "grad_norm": 12.32183837890625, + "learning_rate": 1.979201995012469e-05, + "loss": 0.2832, + "step": 420 + }, + { + "epoch": 0.05361596009975062, + "grad_norm": 13.420507431030273, + "learning_rate": 1.9787032418952622e-05, + "loss": 0.2139, + "step": 430 + }, + { + "epoch": 0.05486284289276808, + "grad_norm": 21.027982711791992, + "learning_rate": 1.978204488778055e-05, + "loss": 0.2048, + "step": 440 + }, + { + "epoch": 0.05610972568578554, + "grad_norm": 1.9423059225082397, + "learning_rate": 1.977705735660848e-05, + "loss": 0.1517, + "step": 450 + }, + { + "epoch": 0.057356608478802994, + "grad_norm": 4.442639350891113, + "learning_rate": 1.977206982543641e-05, + "loss": 0.168, + "step": 460 + }, + { + "epoch": 0.05860349127182045, + "grad_norm": 25.09494972229004, + "learning_rate": 1.976708229426434e-05, + "loss": 0.2924, + "step": 470 + }, + { + "epoch": 0.059850374064837904, + "grad_norm": 8.644868850708008, + "learning_rate": 1.976209476309227e-05, + "loss": 0.335, + "step": 480 + }, + { + "epoch": 0.06109725685785536, + "grad_norm": 15.42319393157959, + "learning_rate": 1.97571072319202e-05, + "loss": 0.2315, + "step": 490 + }, + { + "epoch": 0.06234413965087282, + "grad_norm": 11.491703987121582, + "learning_rate": 1.975211970074813e-05, + "loss": 0.1148, + "step": 500 + }, + { + "epoch": 0.06359102244389027, 
+ "grad_norm": 9.482577323913574, + "learning_rate": 1.974713216957606e-05, + "loss": 0.1658, + "step": 510 + }, + { + "epoch": 0.06483790523690773, + "grad_norm": 0.5576394200325012, + "learning_rate": 1.974214463840399e-05, + "loss": 0.232, + "step": 520 + }, + { + "epoch": 0.06608478802992519, + "grad_norm": 0.857613742351532, + "learning_rate": 1.9737157107231922e-05, + "loss": 0.0998, + "step": 530 + }, + { + "epoch": 0.06733167082294264, + "grad_norm": 10.808552742004395, + "learning_rate": 1.9732169576059853e-05, + "loss": 0.1623, + "step": 540 + }, + { + "epoch": 0.0685785536159601, + "grad_norm": 6.505871772766113, + "learning_rate": 1.9727182044887783e-05, + "loss": 0.1348, + "step": 550 + }, + { + "epoch": 0.06982543640897755, + "grad_norm": 11.519365310668945, + "learning_rate": 1.9722194513715714e-05, + "loss": 0.1733, + "step": 560 + }, + { + "epoch": 0.07107231920199501, + "grad_norm": 5.1333537101745605, + "learning_rate": 1.971720698254364e-05, + "loss": 0.1829, + "step": 570 + }, + { + "epoch": 0.07231920199501247, + "grad_norm": 24.81167221069336, + "learning_rate": 1.971221945137157e-05, + "loss": 0.2117, + "step": 580 + }, + { + "epoch": 0.07356608478802992, + "grad_norm": 21.29195213317871, + "learning_rate": 1.9707231920199502e-05, + "loss": 0.173, + "step": 590 + }, + { + "epoch": 0.07481296758104738, + "grad_norm": 1.9257557392120361, + "learning_rate": 1.9702244389027433e-05, + "loss": 0.1143, + "step": 600 + }, + { + "epoch": 0.07605985037406483, + "grad_norm": 9.582220077514648, + "learning_rate": 1.9697256857855363e-05, + "loss": 0.1527, + "step": 610 + }, + { + "epoch": 0.0773067331670823, + "grad_norm": 10.984932899475098, + "learning_rate": 1.9692269326683294e-05, + "loss": 0.3024, + "step": 620 + }, + { + "epoch": 0.07855361596009976, + "grad_norm": 10.277746200561523, + "learning_rate": 1.9687281795511225e-05, + "loss": 0.1069, + "step": 630 + }, + { + "epoch": 0.0798004987531172, + "grad_norm": 8.234357833862305, + 
"learning_rate": 1.9682294264339155e-05, + "loss": 0.0953, + "step": 640 + }, + { + "epoch": 0.08104738154613467, + "grad_norm": 5.079126834869385, + "learning_rate": 1.9677306733167083e-05, + "loss": 0.0765, + "step": 650 + }, + { + "epoch": 0.08229426433915212, + "grad_norm": 32.218772888183594, + "learning_rate": 1.9672319201995013e-05, + "loss": 0.1209, + "step": 660 + }, + { + "epoch": 0.08354114713216958, + "grad_norm": 7.151175022125244, + "learning_rate": 1.9667331670822944e-05, + "loss": 0.1674, + "step": 670 + }, + { + "epoch": 0.08478802992518704, + "grad_norm": 18.98018455505371, + "learning_rate": 1.9662344139650874e-05, + "loss": 0.2542, + "step": 680 + }, + { + "epoch": 0.08603491271820449, + "grad_norm": 8.492568969726562, + "learning_rate": 1.9657356608478805e-05, + "loss": 0.177, + "step": 690 + }, + { + "epoch": 0.08728179551122195, + "grad_norm": 15.768592834472656, + "learning_rate": 1.9652369077306736e-05, + "loss": 0.1671, + "step": 700 + }, + { + "epoch": 0.0885286783042394, + "grad_norm": 8.789374351501465, + "learning_rate": 1.9647381546134666e-05, + "loss": 0.1056, + "step": 710 + }, + { + "epoch": 0.08977556109725686, + "grad_norm": 18.865314483642578, + "learning_rate": 1.9642394014962597e-05, + "loss": 0.1371, + "step": 720 + }, + { + "epoch": 0.09102244389027432, + "grad_norm": 5.666704177856445, + "learning_rate": 1.9637406483790524e-05, + "loss": 0.2404, + "step": 730 + }, + { + "epoch": 0.09226932668329177, + "grad_norm": 3.111618995666504, + "learning_rate": 1.9632418952618455e-05, + "loss": 0.2291, + "step": 740 + }, + { + "epoch": 0.09351620947630923, + "grad_norm": 23.726316452026367, + "learning_rate": 1.9627431421446385e-05, + "loss": 0.1431, + "step": 750 + }, + { + "epoch": 0.09476309226932668, + "grad_norm": 0.19223853945732117, + "learning_rate": 1.9622443890274316e-05, + "loss": 0.0933, + "step": 760 + }, + { + "epoch": 0.09600997506234414, + "grad_norm": 3.894308090209961, + "learning_rate": 1.9617456359102243e-05, + 
"loss": 0.1174, + "step": 770 + }, + { + "epoch": 0.09725685785536159, + "grad_norm": 28.369335174560547, + "learning_rate": 1.9612468827930177e-05, + "loss": 0.1417, + "step": 780 + }, + { + "epoch": 0.09850374064837905, + "grad_norm": 2.8270206451416016, + "learning_rate": 1.9607481296758108e-05, + "loss": 0.2054, + "step": 790 + }, + { + "epoch": 0.09975062344139651, + "grad_norm": 27.438745498657227, + "learning_rate": 1.960249376558604e-05, + "loss": 0.1436, + "step": 800 + }, + { + "epoch": 0.10099750623441396, + "grad_norm": 5.45671272277832, + "learning_rate": 1.959750623441397e-05, + "loss": 0.1969, + "step": 810 + }, + { + "epoch": 0.10224438902743142, + "grad_norm": 9.771449089050293, + "learning_rate": 1.9592518703241896e-05, + "loss": 0.1026, + "step": 820 + }, + { + "epoch": 0.10349127182044887, + "grad_norm": 13.02491569519043, + "learning_rate": 1.9587531172069827e-05, + "loss": 0.125, + "step": 830 + }, + { + "epoch": 0.10473815461346633, + "grad_norm": 4.949501991271973, + "learning_rate": 1.9582543640897758e-05, + "loss": 0.0813, + "step": 840 + }, + { + "epoch": 0.1059850374064838, + "grad_norm": 25.449613571166992, + "learning_rate": 1.9577556109725688e-05, + "loss": 0.1862, + "step": 850 + }, + { + "epoch": 0.10723192019950124, + "grad_norm": 5.5177412033081055, + "learning_rate": 1.9572568578553615e-05, + "loss": 0.0964, + "step": 860 + }, + { + "epoch": 0.1084788029925187, + "grad_norm": 8.13312816619873, + "learning_rate": 1.9567581047381546e-05, + "loss": 0.1136, + "step": 870 + }, + { + "epoch": 0.10972568578553615, + "grad_norm": 19.513381958007812, + "learning_rate": 1.9562593516209477e-05, + "loss": 0.1429, + "step": 880 + }, + { + "epoch": 0.11097256857855362, + "grad_norm": 16.46674346923828, + "learning_rate": 1.9557605985037407e-05, + "loss": 0.0936, + "step": 890 + }, + { + "epoch": 0.11221945137157108, + "grad_norm": 36.8519401550293, + "learning_rate": 1.9552618453865338e-05, + "loss": 0.1374, + "step": 900 + }, + { + "epoch": 
0.11346633416458853, + "grad_norm": 4.3099188804626465, + "learning_rate": 1.954763092269327e-05, + "loss": 0.1874, + "step": 910 + }, + { + "epoch": 0.11471321695760599, + "grad_norm": 15.681669235229492, + "learning_rate": 1.95426433915212e-05, + "loss": 0.1291, + "step": 920 + }, + { + "epoch": 0.11596009975062344, + "grad_norm": 46.113014221191406, + "learning_rate": 1.953765586034913e-05, + "loss": 0.1844, + "step": 930 + }, + { + "epoch": 0.1172069825436409, + "grad_norm": 0.08473481982946396, + "learning_rate": 1.9532668329177057e-05, + "loss": 0.1307, + "step": 940 + }, + { + "epoch": 0.11845386533665836, + "grad_norm": 0.33060723543167114, + "learning_rate": 1.9527680798004988e-05, + "loss": 0.1662, + "step": 950 + }, + { + "epoch": 0.11970074812967581, + "grad_norm": 24.14071273803711, + "learning_rate": 1.9522693266832918e-05, + "loss": 0.2013, + "step": 960 + }, + { + "epoch": 0.12094763092269327, + "grad_norm": 1.8103233575820923, + "learning_rate": 1.951770573566085e-05, + "loss": 0.2362, + "step": 970 + }, + { + "epoch": 0.12219451371571072, + "grad_norm": 1.4162346124649048, + "learning_rate": 1.951271820448878e-05, + "loss": 0.0292, + "step": 980 + }, + { + "epoch": 0.12344139650872818, + "grad_norm": 0.10805538296699524, + "learning_rate": 1.950773067331671e-05, + "loss": 0.1698, + "step": 990 + }, + { + "epoch": 0.12468827930174564, + "grad_norm": 25.68360137939453, + "learning_rate": 1.950274314214464e-05, + "loss": 0.2032, + "step": 1000 + }, + { + "epoch": 0.1259351620947631, + "grad_norm": 26.8183536529541, + "learning_rate": 1.949775561097257e-05, + "loss": 0.0992, + "step": 1010 + }, + { + "epoch": 0.12718204488778054, + "grad_norm": 21.154848098754883, + "learning_rate": 1.9492768079800502e-05, + "loss": 0.2059, + "step": 1020 + }, + { + "epoch": 0.128428927680798, + "grad_norm": 1.1379319429397583, + "learning_rate": 1.948778054862843e-05, + "loss": 0.1347, + "step": 1030 + }, + { + "epoch": 0.12967581047381546, + "grad_norm": 
19.544143676757812, + "learning_rate": 1.948279301745636e-05, + "loss": 0.1401, + "step": 1040 + }, + { + "epoch": 0.13092269326683292, + "grad_norm": 0.07502997666597366, + "learning_rate": 1.947780548628429e-05, + "loss": 0.2728, + "step": 1050 + }, + { + "epoch": 0.13216957605985039, + "grad_norm": 19.109968185424805, + "learning_rate": 1.947281795511222e-05, + "loss": 0.3007, + "step": 1060 + }, + { + "epoch": 0.13341645885286782, + "grad_norm": 16.674196243286133, + "learning_rate": 1.946783042394015e-05, + "loss": 0.1649, + "step": 1070 + }, + { + "epoch": 0.13466334164588528, + "grad_norm": 20.961565017700195, + "learning_rate": 1.9462842892768082e-05, + "loss": 0.1938, + "step": 1080 + }, + { + "epoch": 0.13591022443890274, + "grad_norm": 14.865165710449219, + "learning_rate": 1.9457855361596013e-05, + "loss": 0.1775, + "step": 1090 + }, + { + "epoch": 0.1371571072319202, + "grad_norm": 33.68144226074219, + "learning_rate": 1.9452867830423943e-05, + "loss": 0.243, + "step": 1100 + }, + { + "epoch": 0.13840399002493767, + "grad_norm": 0.23615525662899017, + "learning_rate": 1.944788029925187e-05, + "loss": 0.1302, + "step": 1110 + }, + { + "epoch": 0.1396508728179551, + "grad_norm": 2.4923934936523438, + "learning_rate": 1.94428927680798e-05, + "loss": 0.1697, + "step": 1120 + }, + { + "epoch": 0.14089775561097256, + "grad_norm": 0.05718168243765831, + "learning_rate": 1.9437905236907732e-05, + "loss": 0.1866, + "step": 1130 + }, + { + "epoch": 0.14214463840399003, + "grad_norm": 0.2824549674987793, + "learning_rate": 1.9432917705735663e-05, + "loss": 0.0844, + "step": 1140 + }, + { + "epoch": 0.1433915211970075, + "grad_norm": 28.53221893310547, + "learning_rate": 1.9427930174563593e-05, + "loss": 0.2335, + "step": 1150 + }, + { + "epoch": 0.14463840399002495, + "grad_norm": 3.9056732654571533, + "learning_rate": 1.9422942643391524e-05, + "loss": 0.2637, + "step": 1160 + }, + { + "epoch": 0.14588528678304238, + "grad_norm": 0.9593342542648315, + 
"learning_rate": 1.9417955112219454e-05, + "loss": 0.1542, + "step": 1170 + }, + { + "epoch": 0.14713216957605985, + "grad_norm": 3.5426342487335205, + "learning_rate": 1.9412967581047385e-05, + "loss": 0.0539, + "step": 1180 + }, + { + "epoch": 0.1483790523690773, + "grad_norm": 24.645750045776367, + "learning_rate": 1.9407980049875312e-05, + "loss": 0.0829, + "step": 1190 + }, + { + "epoch": 0.14962593516209477, + "grad_norm": 7.14509916305542, + "learning_rate": 1.9402992518703243e-05, + "loss": 0.1406, + "step": 1200 + }, + { + "epoch": 0.15087281795511223, + "grad_norm": 0.049528706818819046, + "learning_rate": 1.9398004987531174e-05, + "loss": 0.1438, + "step": 1210 + }, + { + "epoch": 0.15211970074812967, + "grad_norm": 13.300673484802246, + "learning_rate": 1.9393017456359104e-05, + "loss": 0.1346, + "step": 1220 + }, + { + "epoch": 0.15336658354114713, + "grad_norm": 1.943932056427002, + "learning_rate": 1.938802992518703e-05, + "loss": 0.0558, + "step": 1230 + }, + { + "epoch": 0.1546134663341646, + "grad_norm": 5.144111156463623, + "learning_rate": 1.9383042394014962e-05, + "loss": 0.1436, + "step": 1240 + }, + { + "epoch": 0.15586034912718205, + "grad_norm": 16.302080154418945, + "learning_rate": 1.9378054862842896e-05, + "loss": 0.1697, + "step": 1250 + }, + { + "epoch": 0.1571072319201995, + "grad_norm": 22.913436889648438, + "learning_rate": 1.9373067331670827e-05, + "loss": 0.1536, + "step": 1260 + }, + { + "epoch": 0.15835411471321695, + "grad_norm": 5.865168571472168, + "learning_rate": 1.9368079800498757e-05, + "loss": 0.107, + "step": 1270 + }, + { + "epoch": 0.1596009975062344, + "grad_norm": 0.23293504118919373, + "learning_rate": 1.9363092269326684e-05, + "loss": 0.0561, + "step": 1280 + }, + { + "epoch": 0.16084788029925187, + "grad_norm": 8.98757553100586, + "learning_rate": 1.9358104738154615e-05, + "loss": 0.0798, + "step": 1290 + }, + { + "epoch": 0.16209476309226933, + "grad_norm": 39.03519821166992, + "learning_rate": 
1.9353117206982546e-05, + "loss": 0.3176, + "step": 1300 + }, + { + "epoch": 0.1633416458852868, + "grad_norm": 1.581081509590149, + "learning_rate": 1.9348129675810476e-05, + "loss": 0.1908, + "step": 1310 + }, + { + "epoch": 0.16458852867830423, + "grad_norm": 8.705527305603027, + "learning_rate": 1.9343142144638404e-05, + "loss": 0.2558, + "step": 1320 + }, + { + "epoch": 0.1658354114713217, + "grad_norm": 2.133392333984375, + "learning_rate": 1.9338154613466334e-05, + "loss": 0.1147, + "step": 1330 + }, + { + "epoch": 0.16708229426433915, + "grad_norm": 8.2285737991333, + "learning_rate": 1.9333167082294265e-05, + "loss": 0.1026, + "step": 1340 + }, + { + "epoch": 0.16832917705735662, + "grad_norm": 12.087708473205566, + "learning_rate": 1.9328179551122195e-05, + "loss": 0.2513, + "step": 1350 + }, + { + "epoch": 0.16957605985037408, + "grad_norm": 4.111790180206299, + "learning_rate": 1.9323192019950126e-05, + "loss": 0.1766, + "step": 1360 + }, + { + "epoch": 0.1708229426433915, + "grad_norm": 11.881768226623535, + "learning_rate": 1.9318204488778057e-05, + "loss": 0.14, + "step": 1370 + }, + { + "epoch": 0.17206982543640897, + "grad_norm": 11.08498764038086, + "learning_rate": 1.9313216957605987e-05, + "loss": 0.1501, + "step": 1380 + }, + { + "epoch": 0.17331670822942644, + "grad_norm": 0.2787410020828247, + "learning_rate": 1.9308229426433918e-05, + "loss": 0.1337, + "step": 1390 + }, + { + "epoch": 0.1745635910224439, + "grad_norm": 35.38869094848633, + "learning_rate": 1.9303241895261845e-05, + "loss": 0.1991, + "step": 1400 + }, + { + "epoch": 0.17581047381546136, + "grad_norm": 0.29990383982658386, + "learning_rate": 1.9298254364089776e-05, + "loss": 0.0835, + "step": 1410 + }, + { + "epoch": 0.1770573566084788, + "grad_norm": 0.33510634303092957, + "learning_rate": 1.9293266832917706e-05, + "loss": 0.1892, + "step": 1420 + }, + { + "epoch": 0.17830423940149626, + "grad_norm": 0.3308704197406769, + "learning_rate": 1.9288279301745637e-05, + "loss": 
0.0766, + "step": 1430 + }, + { + "epoch": 0.17955112219451372, + "grad_norm": 29.095256805419922, + "learning_rate": 1.9283291770573568e-05, + "loss": 0.1195, + "step": 1440 + }, + { + "epoch": 0.18079800498753118, + "grad_norm": 0.04486589506268501, + "learning_rate": 1.9278304239401498e-05, + "loss": 0.0551, + "step": 1450 + }, + { + "epoch": 0.18204488778054864, + "grad_norm": 30.3422794342041, + "learning_rate": 1.927331670822943e-05, + "loss": 0.1743, + "step": 1460 + }, + { + "epoch": 0.18329177057356608, + "grad_norm": 10.932034492492676, + "learning_rate": 1.926832917705736e-05, + "loss": 0.0966, + "step": 1470 + }, + { + "epoch": 0.18453865336658354, + "grad_norm": 16.97438621520996, + "learning_rate": 1.9263341645885287e-05, + "loss": 0.1924, + "step": 1480 + }, + { + "epoch": 0.185785536159601, + "grad_norm": 0.11332409828901291, + "learning_rate": 1.9258354114713217e-05, + "loss": 0.1246, + "step": 1490 + }, + { + "epoch": 0.18703241895261846, + "grad_norm": 32.340240478515625, + "learning_rate": 1.9253366583541148e-05, + "loss": 0.1624, + "step": 1500 + }, + { + "epoch": 0.1882793017456359, + "grad_norm": 22.712034225463867, + "learning_rate": 1.924837905236908e-05, + "loss": 0.2361, + "step": 1510 + }, + { + "epoch": 0.18952618453865336, + "grad_norm": 6.54374885559082, + "learning_rate": 1.924339152119701e-05, + "loss": 0.2299, + "step": 1520 + }, + { + "epoch": 0.19077306733167082, + "grad_norm": 13.710122108459473, + "learning_rate": 1.923840399002494e-05, + "loss": 0.0964, + "step": 1530 + }, + { + "epoch": 0.19201995012468828, + "grad_norm": 2.097973585128784, + "learning_rate": 1.923341645885287e-05, + "loss": 0.0844, + "step": 1540 + }, + { + "epoch": 0.19326683291770574, + "grad_norm": 0.21922890841960907, + "learning_rate": 1.92284289276808e-05, + "loss": 0.2207, + "step": 1550 + }, + { + "epoch": 0.19451371571072318, + "grad_norm": 21.300539016723633, + "learning_rate": 1.922344139650873e-05, + "loss": 0.1941, + "step": 1560 + }, + { + 
"epoch": 0.19576059850374064, + "grad_norm": 27.9648494720459, + "learning_rate": 1.921845386533666e-05, + "loss": 0.1927, + "step": 1570 + }, + { + "epoch": 0.1970074812967581, + "grad_norm": 0.3763630986213684, + "learning_rate": 1.921346633416459e-05, + "loss": 0.058, + "step": 1580 + }, + { + "epoch": 0.19825436408977556, + "grad_norm": 7.284142971038818, + "learning_rate": 1.920847880299252e-05, + "loss": 0.0967, + "step": 1590 + }, + { + "epoch": 0.19950124688279303, + "grad_norm": 0.2558720111846924, + "learning_rate": 1.920349127182045e-05, + "loss": 0.0766, + "step": 1600 + }, + { + "epoch": 0.20074812967581046, + "grad_norm": 0.2345203459262848, + "learning_rate": 1.919850374064838e-05, + "loss": 0.1259, + "step": 1610 + }, + { + "epoch": 0.20199501246882792, + "grad_norm": 0.19105514883995056, + "learning_rate": 1.9193516209476312e-05, + "loss": 0.0979, + "step": 1620 + }, + { + "epoch": 0.20324189526184538, + "grad_norm": 16.620332717895508, + "learning_rate": 1.9188528678304243e-05, + "loss": 0.1425, + "step": 1630 + }, + { + "epoch": 0.20448877805486285, + "grad_norm": 34.320369720458984, + "learning_rate": 1.9183541147132173e-05, + "loss": 0.1714, + "step": 1640 + }, + { + "epoch": 0.2057356608478803, + "grad_norm": 3.3170464038848877, + "learning_rate": 1.91785536159601e-05, + "loss": 0.0134, + "step": 1650 + }, + { + "epoch": 0.20698254364089774, + "grad_norm": 9.176933288574219, + "learning_rate": 1.917356608478803e-05, + "loss": 0.1046, + "step": 1660 + }, + { + "epoch": 0.2082294264339152, + "grad_norm": 0.061658814549446106, + "learning_rate": 1.9168578553615962e-05, + "loss": 0.0464, + "step": 1670 + }, + { + "epoch": 0.20947630922693267, + "grad_norm": 1.5302953720092773, + "learning_rate": 1.9163591022443892e-05, + "loss": 0.1487, + "step": 1680 + }, + { + "epoch": 0.21072319201995013, + "grad_norm": 0.038375142961740494, + "learning_rate": 1.915860349127182e-05, + "loss": 0.079, + "step": 1690 + }, + { + "epoch": 0.2119700748129676, + 
"grad_norm": 0.4806802570819855, + "learning_rate": 1.915361596009975e-05, + "loss": 0.0285, + "step": 1700 + }, + { + "epoch": 0.21321695760598502, + "grad_norm": 0.042496830224990845, + "learning_rate": 1.914862842892768e-05, + "loss": 0.1011, + "step": 1710 + }, + { + "epoch": 0.2144638403990025, + "grad_norm": 2.657435894012451, + "learning_rate": 1.9143640897755615e-05, + "loss": 0.1359, + "step": 1720 + }, + { + "epoch": 0.21571072319201995, + "grad_norm": 22.948381423950195, + "learning_rate": 1.9138653366583542e-05, + "loss": 0.1223, + "step": 1730 + }, + { + "epoch": 0.2169576059850374, + "grad_norm": 2.4010567665100098, + "learning_rate": 1.9133665835411473e-05, + "loss": 0.0725, + "step": 1740 + }, + { + "epoch": 0.21820448877805487, + "grad_norm": 20.701522827148438, + "learning_rate": 1.9128678304239403e-05, + "loss": 0.1088, + "step": 1750 + }, + { + "epoch": 0.2194513715710723, + "grad_norm": 0.9114007949829102, + "learning_rate": 1.9123690773067334e-05, + "loss": 0.1289, + "step": 1760 + }, + { + "epoch": 0.22069825436408977, + "grad_norm": 0.5298766493797302, + "learning_rate": 1.9118703241895265e-05, + "loss": 0.1754, + "step": 1770 + }, + { + "epoch": 0.22194513715710723, + "grad_norm": 20.310009002685547, + "learning_rate": 1.9113715710723192e-05, + "loss": 0.0428, + "step": 1780 + }, + { + "epoch": 0.2231920199501247, + "grad_norm": 16.548736572265625, + "learning_rate": 1.9108728179551122e-05, + "loss": 0.1378, + "step": 1790 + }, + { + "epoch": 0.22443890274314215, + "grad_norm": 0.10499307513237, + "learning_rate": 1.9103740648379053e-05, + "loss": 0.1144, + "step": 1800 + }, + { + "epoch": 0.2256857855361596, + "grad_norm": 32.94569778442383, + "learning_rate": 1.9098753117206984e-05, + "loss": 0.1308, + "step": 1810 + }, + { + "epoch": 0.22693266832917705, + "grad_norm": 12.977508544921875, + "learning_rate": 1.9093765586034914e-05, + "loss": 0.0695, + "step": 1820 + }, + { + "epoch": 0.2281795511221945, + "grad_norm": 22.582475662231445, 
+ "learning_rate": 1.9088778054862845e-05, + "loss": 0.1223, + "step": 1830 + }, + { + "epoch": 0.22942643391521197, + "grad_norm": 0.45582443475723267, + "learning_rate": 1.9083790523690775e-05, + "loss": 0.1618, + "step": 1840 + }, + { + "epoch": 0.23067331670822944, + "grad_norm": 14.70036792755127, + "learning_rate": 1.9078802992518706e-05, + "loss": 0.1359, + "step": 1850 + }, + { + "epoch": 0.23192019950124687, + "grad_norm": 5.940314292907715, + "learning_rate": 1.9073815461346633e-05, + "loss": 0.1857, + "step": 1860 + }, + { + "epoch": 0.23316708229426433, + "grad_norm": 2.6022796630859375, + "learning_rate": 1.9068827930174564e-05, + "loss": 0.1109, + "step": 1870 + }, + { + "epoch": 0.2344139650872818, + "grad_norm": 7.17425012588501, + "learning_rate": 1.9063840399002495e-05, + "loss": 0.0246, + "step": 1880 + }, + { + "epoch": 0.23566084788029926, + "grad_norm": 0.07173322141170502, + "learning_rate": 1.9058852867830425e-05, + "loss": 0.1983, + "step": 1890 + }, + { + "epoch": 0.23690773067331672, + "grad_norm": 7.803640842437744, + "learning_rate": 1.9053865336658356e-05, + "loss": 0.2229, + "step": 1900 + }, + { + "epoch": 0.23815461346633415, + "grad_norm": 6.28038215637207, + "learning_rate": 1.9048877805486286e-05, + "loss": 0.0816, + "step": 1910 + }, + { + "epoch": 0.23940149625935161, + "grad_norm": 17.327741622924805, + "learning_rate": 1.9043890274314217e-05, + "loss": 0.0725, + "step": 1920 + }, + { + "epoch": 0.24064837905236908, + "grad_norm": 0.18219555914402008, + "learning_rate": 1.9038902743142148e-05, + "loss": 0.0918, + "step": 1930 + }, + { + "epoch": 0.24189526184538654, + "grad_norm": 7.153271675109863, + "learning_rate": 1.9033915211970075e-05, + "loss": 0.1641, + "step": 1940 + }, + { + "epoch": 0.243142144638404, + "grad_norm": 0.31039583683013916, + "learning_rate": 1.9028927680798005e-05, + "loss": 0.0713, + "step": 1950 + }, + { + "epoch": 0.24438902743142144, + "grad_norm": 21.150863647460938, + "learning_rate": 
1.9023940149625936e-05, + "loss": 0.1273, + "step": 1960 + }, + { + "epoch": 0.2456359102244389, + "grad_norm": 2.9307498931884766, + "learning_rate": 1.9018952618453867e-05, + "loss": 0.1131, + "step": 1970 + }, + { + "epoch": 0.24688279301745636, + "grad_norm": 32.69029998779297, + "learning_rate": 1.9013965087281797e-05, + "loss": 0.154, + "step": 1980 + }, + { + "epoch": 0.24812967581047382, + "grad_norm": 2.6511595249176025, + "learning_rate": 1.9008977556109728e-05, + "loss": 0.0656, + "step": 1990 + }, + { + "epoch": 0.24937655860349128, + "grad_norm": 15.334160804748535, + "learning_rate": 1.900399002493766e-05, + "loss": 0.0894, + "step": 2000 + }, + { + "epoch": 0.2506234413965087, + "grad_norm": 29.95343780517578, + "learning_rate": 1.899900249376559e-05, + "loss": 0.1466, + "step": 2010 + }, + { + "epoch": 0.2518703241895262, + "grad_norm": 1.1614725589752197, + "learning_rate": 1.899401496259352e-05, + "loss": 0.156, + "step": 2020 + }, + { + "epoch": 0.25311720698254364, + "grad_norm": 0.3376743495464325, + "learning_rate": 1.8989027431421447e-05, + "loss": 0.0605, + "step": 2030 + }, + { + "epoch": 0.2543640897755611, + "grad_norm": 0.05432422459125519, + "learning_rate": 1.8984039900249378e-05, + "loss": 0.1247, + "step": 2040 + }, + { + "epoch": 0.25561097256857856, + "grad_norm": 0.2011519819498062, + "learning_rate": 1.8979052369077308e-05, + "loss": 0.0973, + "step": 2050 + }, + { + "epoch": 0.256857855361596, + "grad_norm": 23.83899688720703, + "learning_rate": 1.897406483790524e-05, + "loss": 0.1737, + "step": 2060 + }, + { + "epoch": 0.2581047381546135, + "grad_norm": 16.04006004333496, + "learning_rate": 1.896907730673317e-05, + "loss": 0.1346, + "step": 2070 + }, + { + "epoch": 0.2593516209476309, + "grad_norm": 6.27106237411499, + "learning_rate": 1.89640897755611e-05, + "loss": 0.118, + "step": 2080 + }, + { + "epoch": 0.26059850374064836, + "grad_norm": 3.775754451751709, + "learning_rate": 1.895910224438903e-05, + "loss": 0.2552, + 
"step": 2090 + }, + { + "epoch": 0.26184538653366585, + "grad_norm": 0.19130301475524902, + "learning_rate": 1.895411471321696e-05, + "loss": 0.1049, + "step": 2100 + }, + { + "epoch": 0.2630922693266833, + "grad_norm": 12.918098449707031, + "learning_rate": 1.894912718204489e-05, + "loss": 0.1938, + "step": 2110 + }, + { + "epoch": 0.26433915211970077, + "grad_norm": 33.832454681396484, + "learning_rate": 1.894413965087282e-05, + "loss": 0.0929, + "step": 2120 + }, + { + "epoch": 0.2655860349127182, + "grad_norm": 3.3025131225585938, + "learning_rate": 1.893915211970075e-05, + "loss": 0.0542, + "step": 2130 + }, + { + "epoch": 0.26683291770573564, + "grad_norm": 0.16082660853862762, + "learning_rate": 1.893416458852868e-05, + "loss": 0.0435, + "step": 2140 + }, + { + "epoch": 0.26807980049875313, + "grad_norm": 22.90357780456543, + "learning_rate": 1.8929177057356608e-05, + "loss": 0.064, + "step": 2150 + }, + { + "epoch": 0.26932668329177056, + "grad_norm": 0.03351182863116264, + "learning_rate": 1.892418952618454e-05, + "loss": 0.0409, + "step": 2160 + }, + { + "epoch": 0.27057356608478805, + "grad_norm": 0.09229867160320282, + "learning_rate": 1.891920199501247e-05, + "loss": 0.0076, + "step": 2170 + }, + { + "epoch": 0.2718204488778055, + "grad_norm": 0.03719809651374817, + "learning_rate": 1.8914214463840403e-05, + "loss": 0.1627, + "step": 2180 + }, + { + "epoch": 0.2730673316708229, + "grad_norm": 8.800865173339844, + "learning_rate": 1.890922693266833e-05, + "loss": 0.1737, + "step": 2190 + }, + { + "epoch": 0.2743142144638404, + "grad_norm": 0.03148549422621727, + "learning_rate": 1.890423940149626e-05, + "loss": 0.2566, + "step": 2200 + }, + { + "epoch": 0.27556109725685785, + "grad_norm": 0.2769756019115448, + "learning_rate": 1.889925187032419e-05, + "loss": 0.0604, + "step": 2210 + }, + { + "epoch": 0.27680798004987534, + "grad_norm": 37.58738708496094, + "learning_rate": 1.8894264339152122e-05, + "loss": 0.0611, + "step": 2220 + }, + { + "epoch": 
0.27805486284289277, + "grad_norm": 0.1621832549571991, + "learning_rate": 1.888927680798005e-05, + "loss": 0.0724, + "step": 2230 + }, + { + "epoch": 0.2793017456359102, + "grad_norm": 0.07667220383882523, + "learning_rate": 1.888428927680798e-05, + "loss": 0.1044, + "step": 2240 + }, + { + "epoch": 0.2805486284289277, + "grad_norm": 0.16220073401927948, + "learning_rate": 1.887930174563591e-05, + "loss": 0.1377, + "step": 2250 + }, + { + "epoch": 0.2817955112219451, + "grad_norm": 0.0622367337346077, + "learning_rate": 1.887431421446384e-05, + "loss": 0.1296, + "step": 2260 + }, + { + "epoch": 0.2830423940149626, + "grad_norm": 0.31281012296676636, + "learning_rate": 1.8869326683291772e-05, + "loss": 0.1597, + "step": 2270 + }, + { + "epoch": 0.28428927680798005, + "grad_norm": 17.161144256591797, + "learning_rate": 1.8864339152119702e-05, + "loss": 0.0763, + "step": 2280 + }, + { + "epoch": 0.2855361596009975, + "grad_norm": 21.191686630249023, + "learning_rate": 1.8859351620947633e-05, + "loss": 0.0821, + "step": 2290 + }, + { + "epoch": 0.286783042394015, + "grad_norm": 0.03687586635351181, + "learning_rate": 1.8854364089775564e-05, + "loss": 0.0243, + "step": 2300 + }, + { + "epoch": 0.2880299251870324, + "grad_norm": 0.7001633048057556, + "learning_rate": 1.8849376558603494e-05, + "loss": 0.1851, + "step": 2310 + }, + { + "epoch": 0.2892768079800499, + "grad_norm": 46.858665466308594, + "learning_rate": 1.884438902743142e-05, + "loss": 0.1349, + "step": 2320 + }, + { + "epoch": 0.29052369077306733, + "grad_norm": 0.0679154023528099, + "learning_rate": 1.8839401496259352e-05, + "loss": 0.0741, + "step": 2330 + }, + { + "epoch": 0.29177057356608477, + "grad_norm": 21.36118507385254, + "learning_rate": 1.8834413965087283e-05, + "loss": 0.0935, + "step": 2340 + }, + { + "epoch": 0.29301745635910226, + "grad_norm": 4.055055141448975, + "learning_rate": 1.8829426433915213e-05, + "loss": 0.1776, + "step": 2350 + }, + { + "epoch": 0.2942643391521197, + "grad_norm": 
0.02688492275774479, + "learning_rate": 1.8824438902743144e-05, + "loss": 0.0141, + "step": 2360 + }, + { + "epoch": 0.2955112219451372, + "grad_norm": 43.924827575683594, + "learning_rate": 1.8819451371571075e-05, + "loss": 0.1315, + "step": 2370 + }, + { + "epoch": 0.2967581047381546, + "grad_norm": 22.235816955566406, + "learning_rate": 1.8814463840399005e-05, + "loss": 0.0706, + "step": 2380 + }, + { + "epoch": 0.29800498753117205, + "grad_norm": 0.8161360621452332, + "learning_rate": 1.8809476309226936e-05, + "loss": 0.1809, + "step": 2390 + }, + { + "epoch": 0.29925187032418954, + "grad_norm": 6.17124080657959, + "learning_rate": 1.8804488778054863e-05, + "loss": 0.1138, + "step": 2400 + }, + { + "epoch": 0.300498753117207, + "grad_norm": 6.6597771644592285, + "learning_rate": 1.8799501246882794e-05, + "loss": 0.0869, + "step": 2410 + }, + { + "epoch": 0.30174563591022446, + "grad_norm": 0.28154247999191284, + "learning_rate": 1.8794513715710724e-05, + "loss": 0.0437, + "step": 2420 + }, + { + "epoch": 0.3029925187032419, + "grad_norm": 0.0688408687710762, + "learning_rate": 1.8789526184538655e-05, + "loss": 0.043, + "step": 2430 + }, + { + "epoch": 0.30423940149625933, + "grad_norm": 25.441835403442383, + "learning_rate": 1.8784538653366586e-05, + "loss": 0.0242, + "step": 2440 + }, + { + "epoch": 0.3054862842892768, + "grad_norm": 0.04695994779467583, + "learning_rate": 1.8779551122194516e-05, + "loss": 0.0904, + "step": 2450 + }, + { + "epoch": 0.30673316708229426, + "grad_norm": 0.020629514008760452, + "learning_rate": 1.8774563591022447e-05, + "loss": 0.0748, + "step": 2460 + }, + { + "epoch": 0.30798004987531175, + "grad_norm": 3.5206122398376465, + "learning_rate": 1.8770074812967583e-05, + "loss": 0.091, + "step": 2470 + }, + { + "epoch": 0.3092269326683292, + "grad_norm": 29.27650260925293, + "learning_rate": 1.8765087281795514e-05, + "loss": 0.1235, + "step": 2480 + }, + { + "epoch": 0.3104738154613466, + "grad_norm": 4.11005973815918, + 
"learning_rate": 1.8760099750623445e-05, + "loss": 0.1624, + "step": 2490 + }, + { + "epoch": 0.3117206982543641, + "grad_norm": 0.38684889674186707, + "learning_rate": 1.8755112219451372e-05, + "loss": 0.0606, + "step": 2500 + }, + { + "epoch": 0.31296758104738154, + "grad_norm": 0.18241973221302032, + "learning_rate": 1.8750124688279302e-05, + "loss": 0.1873, + "step": 2510 + }, + { + "epoch": 0.314214463840399, + "grad_norm": 0.10438118875026703, + "learning_rate": 1.8745137157107233e-05, + "loss": 0.0626, + "step": 2520 + }, + { + "epoch": 0.31546134663341646, + "grad_norm": 0.040058329701423645, + "learning_rate": 1.8740149625935164e-05, + "loss": 0.1456, + "step": 2530 + }, + { + "epoch": 0.3167082294264339, + "grad_norm": 9.054058074951172, + "learning_rate": 1.8735162094763094e-05, + "loss": 0.0662, + "step": 2540 + }, + { + "epoch": 0.3179551122194514, + "grad_norm": 9.296021461486816, + "learning_rate": 1.8730174563591025e-05, + "loss": 0.1729, + "step": 2550 + }, + { + "epoch": 0.3192019950124688, + "grad_norm": 0.03742093965411186, + "learning_rate": 1.8725187032418955e-05, + "loss": 0.1425, + "step": 2560 + }, + { + "epoch": 0.3204488778054863, + "grad_norm": 30.912044525146484, + "learning_rate": 1.8720199501246886e-05, + "loss": 0.105, + "step": 2570 + }, + { + "epoch": 0.32169576059850374, + "grad_norm": 0.7337010502815247, + "learning_rate": 1.8715211970074813e-05, + "loss": 0.1298, + "step": 2580 + }, + { + "epoch": 0.3229426433915212, + "grad_norm": 21.679298400878906, + "learning_rate": 1.8710224438902744e-05, + "loss": 0.0749, + "step": 2590 + }, + { + "epoch": 0.32418952618453867, + "grad_norm": 2.8091278076171875, + "learning_rate": 1.8705236907730675e-05, + "loss": 0.0869, + "step": 2600 + }, + { + "epoch": 0.3254364089775561, + "grad_norm": 0.2369414120912552, + "learning_rate": 1.8700249376558605e-05, + "loss": 0.0693, + "step": 2610 + }, + { + "epoch": 0.3266832917705736, + "grad_norm": 5.367069721221924, + "learning_rate": 
1.8695261845386536e-05, + "loss": 0.1397, + "step": 2620 + }, + { + "epoch": 0.327930174563591, + "grad_norm": 12.290024757385254, + "learning_rate": 1.8690274314214466e-05, + "loss": 0.2084, + "step": 2630 + }, + { + "epoch": 0.32917705735660846, + "grad_norm": 9.538579940795898, + "learning_rate": 1.8685286783042397e-05, + "loss": 0.1456, + "step": 2640 + }, + { + "epoch": 0.33042394014962595, + "grad_norm": 28.887372970581055, + "learning_rate": 1.8680299251870328e-05, + "loss": 0.1234, + "step": 2650 + }, + { + "epoch": 0.3316708229426434, + "grad_norm": 0.04460681602358818, + "learning_rate": 1.8675311720698255e-05, + "loss": 0.0909, + "step": 2660 + }, + { + "epoch": 0.3329177057356609, + "grad_norm": 14.794498443603516, + "learning_rate": 1.8670324189526186e-05, + "loss": 0.1336, + "step": 2670 + }, + { + "epoch": 0.3341645885286783, + "grad_norm": 12.941160202026367, + "learning_rate": 1.8665336658354116e-05, + "loss": 0.1972, + "step": 2680 + }, + { + "epoch": 0.33541147132169574, + "grad_norm": 24.143878936767578, + "learning_rate": 1.8660349127182047e-05, + "loss": 0.1048, + "step": 2690 + }, + { + "epoch": 0.33665835411471323, + "grad_norm": 26.018558502197266, + "learning_rate": 1.8655361596009977e-05, + "loss": 0.1691, + "step": 2700 + }, + { + "epoch": 0.33790523690773067, + "grad_norm": 2.2344202995300293, + "learning_rate": 1.8650374064837905e-05, + "loss": 0.084, + "step": 2710 + }, + { + "epoch": 0.33915211970074816, + "grad_norm": 2.037916660308838, + "learning_rate": 1.8645386533665835e-05, + "loss": 0.0796, + "step": 2720 + }, + { + "epoch": 0.3403990024937656, + "grad_norm": 0.05853952094912529, + "learning_rate": 1.864039900249377e-05, + "loss": 0.0909, + "step": 2730 + }, + { + "epoch": 0.341645885286783, + "grad_norm": 20.358482360839844, + "learning_rate": 1.86354114713217e-05, + "loss": 0.148, + "step": 2740 + }, + { + "epoch": 0.3428927680798005, + "grad_norm": 0.28583335876464844, + "learning_rate": 1.8630423940149627e-05, + "loss": 
0.1022, + "step": 2750 + }, + { + "epoch": 0.34413965087281795, + "grad_norm": 15.605045318603516, + "learning_rate": 1.8625436408977558e-05, + "loss": 0.0773, + "step": 2760 + }, + { + "epoch": 0.34538653366583544, + "grad_norm": 0.038003236055374146, + "learning_rate": 1.862044887780549e-05, + "loss": 0.1025, + "step": 2770 + }, + { + "epoch": 0.34663341645885287, + "grad_norm": 0.06002043932676315, + "learning_rate": 1.861546134663342e-05, + "loss": 0.1234, + "step": 2780 + }, + { + "epoch": 0.3478802992518703, + "grad_norm": 40.75321960449219, + "learning_rate": 1.8610473815461346e-05, + "loss": 0.0429, + "step": 2790 + }, + { + "epoch": 0.3491271820448878, + "grad_norm": 4.2325029373168945, + "learning_rate": 1.8605486284289277e-05, + "loss": 0.1183, + "step": 2800 + }, + { + "epoch": 0.35037406483790523, + "grad_norm": 0.026608416810631752, + "learning_rate": 1.8600498753117207e-05, + "loss": 0.0885, + "step": 2810 + }, + { + "epoch": 0.3516209476309227, + "grad_norm": 7.794644832611084, + "learning_rate": 1.8595511221945138e-05, + "loss": 0.083, + "step": 2820 + }, + { + "epoch": 0.35286783042394015, + "grad_norm": 12.012767791748047, + "learning_rate": 1.859052369077307e-05, + "loss": 0.0543, + "step": 2830 + }, + { + "epoch": 0.3541147132169576, + "grad_norm": 1.2812813520431519, + "learning_rate": 1.8585536159601e-05, + "loss": 0.1791, + "step": 2840 + }, + { + "epoch": 0.3553615960099751, + "grad_norm": 0.8094510436058044, + "learning_rate": 1.858054862842893e-05, + "loss": 0.0956, + "step": 2850 + }, + { + "epoch": 0.3566084788029925, + "grad_norm": 21.321758270263672, + "learning_rate": 1.857556109725686e-05, + "loss": 0.099, + "step": 2860 + }, + { + "epoch": 0.35785536159601, + "grad_norm": 0.2500157058238983, + "learning_rate": 1.8570573566084788e-05, + "loss": 0.0875, + "step": 2870 + }, + { + "epoch": 0.35910224438902744, + "grad_norm": 1.042913556098938, + "learning_rate": 1.856558603491272e-05, + "loss": 0.0707, + "step": 2880 + }, + { + 
"epoch": 0.36034912718204487, + "grad_norm": 7.620708465576172, + "learning_rate": 1.856059850374065e-05, + "loss": 0.2129, + "step": 2890 + }, + { + "epoch": 0.36159600997506236, + "grad_norm": 54.09572219848633, + "learning_rate": 1.855561097256858e-05, + "loss": 0.0507, + "step": 2900 + }, + { + "epoch": 0.3628428927680798, + "grad_norm": 8.779691696166992, + "learning_rate": 1.855062344139651e-05, + "loss": 0.0428, + "step": 2910 + }, + { + "epoch": 0.3640897755610973, + "grad_norm": 16.20435333251953, + "learning_rate": 1.854563591022444e-05, + "loss": 0.1638, + "step": 2920 + }, + { + "epoch": 0.3653366583541147, + "grad_norm": 24.456098556518555, + "learning_rate": 1.854064837905237e-05, + "loss": 0.1414, + "step": 2930 + }, + { + "epoch": 0.36658354114713215, + "grad_norm": 16.864614486694336, + "learning_rate": 1.8535660847880302e-05, + "loss": 0.1618, + "step": 2940 + }, + { + "epoch": 0.36783042394014964, + "grad_norm": 8.25745677947998, + "learning_rate": 1.8530673316708233e-05, + "loss": 0.1248, + "step": 2950 + }, + { + "epoch": 0.3690773067331671, + "grad_norm": 1.0005909204483032, + "learning_rate": 1.852568578553616e-05, + "loss": 0.1164, + "step": 2960 + }, + { + "epoch": 0.37032418952618457, + "grad_norm": 13.815901756286621, + "learning_rate": 1.852069825436409e-05, + "loss": 0.1161, + "step": 2970 + }, + { + "epoch": 0.371571072319202, + "grad_norm": 0.46969351172447205, + "learning_rate": 1.851571072319202e-05, + "loss": 0.02, + "step": 2980 + }, + { + "epoch": 0.37281795511221943, + "grad_norm": 0.06556516140699387, + "learning_rate": 1.8510723192019952e-05, + "loss": 0.184, + "step": 2990 + }, + { + "epoch": 0.3740648379052369, + "grad_norm": 0.3786510229110718, + "learning_rate": 1.8505735660847882e-05, + "loss": 0.0972, + "step": 3000 + }, + { + "epoch": 0.37531172069825436, + "grad_norm": 0.15150892734527588, + "learning_rate": 1.8500748129675813e-05, + "loss": 0.0366, + "step": 3010 + }, + { + "epoch": 0.3765586034912718, + "grad_norm": 
0.38234204053878784, + "learning_rate": 1.8495760598503744e-05, + "loss": 0.0643, + "step": 3020 + }, + { + "epoch": 0.3778054862842893, + "grad_norm": 0.059004172682762146, + "learning_rate": 1.8490773067331674e-05, + "loss": 0.029, + "step": 3030 + }, + { + "epoch": 0.3790523690773067, + "grad_norm": 0.02871227078139782, + "learning_rate": 1.84857855361596e-05, + "loss": 0.0238, + "step": 3040 + }, + { + "epoch": 0.3802992518703242, + "grad_norm": 0.2601139545440674, + "learning_rate": 1.8480798004987532e-05, + "loss": 0.1566, + "step": 3050 + }, + { + "epoch": 0.38154613466334164, + "grad_norm": 0.06762529909610748, + "learning_rate": 1.8475810473815463e-05, + "loss": 0.0609, + "step": 3060 + }, + { + "epoch": 0.3827930174563591, + "grad_norm": 26.341711044311523, + "learning_rate": 1.8470822942643393e-05, + "loss": 0.0876, + "step": 3070 + }, + { + "epoch": 0.38403990024937656, + "grad_norm": 9.868642807006836, + "learning_rate": 1.8465835411471324e-05, + "loss": 0.1248, + "step": 3080 + }, + { + "epoch": 0.385286783042394, + "grad_norm": 0.05572517588734627, + "learning_rate": 1.8460847880299255e-05, + "loss": 0.1603, + "step": 3090 + }, + { + "epoch": 0.3865336658354115, + "grad_norm": 8.873476028442383, + "learning_rate": 1.8455860349127185e-05, + "loss": 0.1626, + "step": 3100 + }, + { + "epoch": 0.3877805486284289, + "grad_norm": 26.02020835876465, + "learning_rate": 1.8450872817955116e-05, + "loss": 0.1455, + "step": 3110 + }, + { + "epoch": 0.38902743142144636, + "grad_norm": 0.302876353263855, + "learning_rate": 1.8445885286783043e-05, + "loss": 0.0821, + "step": 3120 + }, + { + "epoch": 0.39027431421446385, + "grad_norm": 25.011768341064453, + "learning_rate": 1.8440897755610974e-05, + "loss": 0.0415, + "step": 3130 + }, + { + "epoch": 0.3915211970074813, + "grad_norm": 0.10454079508781433, + "learning_rate": 1.8435910224438904e-05, + "loss": 0.0653, + "step": 3140 + }, + { + "epoch": 0.39276807980049877, + "grad_norm": 6.67221212387085, + 
"learning_rate": 1.8430922693266835e-05, + "loss": 0.0632, + "step": 3150 + }, + { + "epoch": 0.3940149625935162, + "grad_norm": 34.12371826171875, + "learning_rate": 1.8425935162094762e-05, + "loss": 0.0664, + "step": 3160 + }, + { + "epoch": 0.39526184538653364, + "grad_norm": 0.04642521217465401, + "learning_rate": 1.8420947630922693e-05, + "loss": 0.0891, + "step": 3170 + }, + { + "epoch": 0.39650872817955113, + "grad_norm": 0.056110795587301254, + "learning_rate": 1.8415960099750623e-05, + "loss": 0.1448, + "step": 3180 + }, + { + "epoch": 0.39775561097256856, + "grad_norm": 0.01612841710448265, + "learning_rate": 1.8410972568578554e-05, + "loss": 0.1943, + "step": 3190 + }, + { + "epoch": 0.39900249376558605, + "grad_norm": 0.07397930324077606, + "learning_rate": 1.8405985037406488e-05, + "loss": 0.1414, + "step": 3200 + }, + { + "epoch": 0.4002493765586035, + "grad_norm": 21.90389633178711, + "learning_rate": 1.8400997506234415e-05, + "loss": 0.0627, + "step": 3210 + }, + { + "epoch": 0.4014962593516209, + "grad_norm": 0.032755568623542786, + "learning_rate": 1.8396009975062346e-05, + "loss": 0.0087, + "step": 3220 + }, + { + "epoch": 0.4027431421446384, + "grad_norm": 26.460538864135742, + "learning_rate": 1.8391022443890276e-05, + "loss": 0.1188, + "step": 3230 + }, + { + "epoch": 0.40399002493765584, + "grad_norm": 0.40684857964515686, + "learning_rate": 1.8386034912718207e-05, + "loss": 0.0714, + "step": 3240 + }, + { + "epoch": 0.40523690773067333, + "grad_norm": 0.03340421989560127, + "learning_rate": 1.8381047381546134e-05, + "loss": 0.0362, + "step": 3250 + }, + { + "epoch": 0.40648379052369077, + "grad_norm": 0.2820630371570587, + "learning_rate": 1.8376059850374065e-05, + "loss": 0.1361, + "step": 3260 + }, + { + "epoch": 0.4077306733167082, + "grad_norm": 0.018967803567647934, + "learning_rate": 1.8371072319201996e-05, + "loss": 0.1086, + "step": 3270 + }, + { + "epoch": 0.4089775561097257, + "grad_norm": 0.6365144848823547, + "learning_rate": 
1.8366084788029926e-05, + "loss": 0.0465, + "step": 3280 + }, + { + "epoch": 0.4102244389027431, + "grad_norm": 0.10352272540330887, + "learning_rate": 1.8361097256857857e-05, + "loss": 0.0639, + "step": 3290 + }, + { + "epoch": 0.4114713216957606, + "grad_norm": 20.832256317138672, + "learning_rate": 1.8356109725685787e-05, + "loss": 0.1182, + "step": 3300 + }, + { + "epoch": 0.41271820448877805, + "grad_norm": 0.034411683678627014, + "learning_rate": 1.8351122194513718e-05, + "loss": 0.0996, + "step": 3310 + }, + { + "epoch": 0.4139650872817955, + "grad_norm": 15.220785140991211, + "learning_rate": 1.834613466334165e-05, + "loss": 0.2009, + "step": 3320 + }, + { + "epoch": 0.415211970074813, + "grad_norm": 0.23576247692108154, + "learning_rate": 1.8341147132169576e-05, + "loss": 0.0933, + "step": 3330 + }, + { + "epoch": 0.4164588528678304, + "grad_norm": 0.03238127753138542, + "learning_rate": 1.8336159600997507e-05, + "loss": 0.1636, + "step": 3340 + }, + { + "epoch": 0.4177057356608479, + "grad_norm": 10.25538444519043, + "learning_rate": 1.8331172069825437e-05, + "loss": 0.118, + "step": 3350 + }, + { + "epoch": 0.41895261845386533, + "grad_norm": 0.4214819073677063, + "learning_rate": 1.8326184538653368e-05, + "loss": 0.0542, + "step": 3360 + }, + { + "epoch": 0.42019950124688277, + "grad_norm": 2.8215088844299316, + "learning_rate": 1.83211970074813e-05, + "loss": 0.152, + "step": 3370 + }, + { + "epoch": 0.42144638403990026, + "grad_norm": 29.304697036743164, + "learning_rate": 1.831620947630923e-05, + "loss": 0.128, + "step": 3380 + }, + { + "epoch": 0.4226932668329177, + "grad_norm": 0.18514405190944672, + "learning_rate": 1.831122194513716e-05, + "loss": 0.1495, + "step": 3390 + }, + { + "epoch": 0.4239401496259352, + "grad_norm": 0.17302575707435608, + "learning_rate": 1.830623441396509e-05, + "loss": 0.0485, + "step": 3400 + }, + { + "epoch": 0.4251870324189526, + "grad_norm": 4.4047532081604, + "learning_rate": 1.8301246882793017e-05, + "loss": 
0.0401, + "step": 3410 + }, + { + "epoch": 0.42643391521197005, + "grad_norm": 2.6274476051330566, + "learning_rate": 1.8296259351620948e-05, + "loss": 0.1061, + "step": 3420 + }, + { + "epoch": 0.42768079800498754, + "grad_norm": 0.21970237791538239, + "learning_rate": 1.829127182044888e-05, + "loss": 0.0306, + "step": 3430 + }, + { + "epoch": 0.428927680798005, + "grad_norm": 1.312694787979126, + "learning_rate": 1.828628428927681e-05, + "loss": 0.1095, + "step": 3440 + }, + { + "epoch": 0.43017456359102246, + "grad_norm": 0.2038409262895584, + "learning_rate": 1.828129675810474e-05, + "loss": 0.1052, + "step": 3450 + }, + { + "epoch": 0.4314214463840399, + "grad_norm": 0.17797330021858215, + "learning_rate": 1.827630922693267e-05, + "loss": 0.0035, + "step": 3460 + }, + { + "epoch": 0.43266832917705733, + "grad_norm": 1.5515074729919434, + "learning_rate": 1.82713216957606e-05, + "loss": 0.0753, + "step": 3470 + }, + { + "epoch": 0.4339152119700748, + "grad_norm": 10.284847259521484, + "learning_rate": 1.8266334164588532e-05, + "loss": 0.1561, + "step": 3480 + }, + { + "epoch": 0.43516209476309226, + "grad_norm": 5.16575813293457, + "learning_rate": 1.8261346633416462e-05, + "loss": 0.1061, + "step": 3490 + }, + { + "epoch": 0.43640897755610975, + "grad_norm": 0.04115693271160126, + "learning_rate": 1.825635910224439e-05, + "loss": 0.1139, + "step": 3500 + }, + { + "epoch": 0.4376558603491272, + "grad_norm": 0.7454102635383606, + "learning_rate": 1.825137157107232e-05, + "loss": 0.0631, + "step": 3510 + }, + { + "epoch": 0.4389027431421446, + "grad_norm": 26.38698387145996, + "learning_rate": 1.824638403990025e-05, + "loss": 0.0937, + "step": 3520 + }, + { + "epoch": 0.4401496259351621, + "grad_norm": 29.491291046142578, + "learning_rate": 1.824139650872818e-05, + "loss": 0.1123, + "step": 3530 + }, + { + "epoch": 0.44139650872817954, + "grad_norm": 23.8365421295166, + "learning_rate": 1.823640897755611e-05, + "loss": 0.0995, + "step": 3540 + }, + { + "epoch": 
0.442643391521197, + "grad_norm": 41.31846237182617, + "learning_rate": 1.8231421446384043e-05, + "loss": 0.0294, + "step": 3550 + }, + { + "epoch": 0.44389027431421446, + "grad_norm": 34.70884704589844, + "learning_rate": 1.8226433915211973e-05, + "loss": 0.1029, + "step": 3560 + }, + { + "epoch": 0.4451371571072319, + "grad_norm": 9.22887134552002, + "learning_rate": 1.8221446384039904e-05, + "loss": 0.1245, + "step": 3570 + }, + { + "epoch": 0.4463840399002494, + "grad_norm": 0.04959265887737274, + "learning_rate": 1.821645885286783e-05, + "loss": 0.0174, + "step": 3580 + }, + { + "epoch": 0.4476309226932668, + "grad_norm": 0.8959947824478149, + "learning_rate": 1.8211471321695762e-05, + "loss": 0.0333, + "step": 3590 + }, + { + "epoch": 0.4488778054862843, + "grad_norm": 0.14847318828105927, + "learning_rate": 1.8206483790523692e-05, + "loss": 0.1173, + "step": 3600 + }, + { + "epoch": 0.45012468827930174, + "grad_norm": 0.10429783910512924, + "learning_rate": 1.8201496259351623e-05, + "loss": 0.0177, + "step": 3610 + }, + { + "epoch": 0.4513715710723192, + "grad_norm": 0.1884181648492813, + "learning_rate": 1.819650872817955e-05, + "loss": 0.0785, + "step": 3620 + }, + { + "epoch": 0.45261845386533667, + "grad_norm": 0.03715066611766815, + "learning_rate": 1.819152119700748e-05, + "loss": 0.012, + "step": 3630 + }, + { + "epoch": 0.4538653366583541, + "grad_norm": 27.58835220336914, + "learning_rate": 1.818653366583541e-05, + "loss": 0.1235, + "step": 3640 + }, + { + "epoch": 0.4551122194513716, + "grad_norm": 16.778362274169922, + "learning_rate": 1.8181546134663342e-05, + "loss": 0.2646, + "step": 3650 + }, + { + "epoch": 0.456359102244389, + "grad_norm": 22.702903747558594, + "learning_rate": 1.8176558603491273e-05, + "loss": 0.0658, + "step": 3660 + }, + { + "epoch": 0.45760598503740646, + "grad_norm": 0.09582442045211792, + "learning_rate": 1.8171571072319203e-05, + "loss": 0.104, + "step": 3670 + }, + { + "epoch": 0.45885286783042395, + "grad_norm": 
0.026133723556995392, + "learning_rate": 1.8166583541147134e-05, + "loss": 0.0753, + "step": 3680 + }, + { + "epoch": 0.4600997506234414, + "grad_norm": 2.048128128051758, + "learning_rate": 1.8161596009975065e-05, + "loss": 0.0564, + "step": 3690 + }, + { + "epoch": 0.4613466334164589, + "grad_norm": 1.270678997039795, + "learning_rate": 1.8156608478802995e-05, + "loss": 0.146, + "step": 3700 + }, + { + "epoch": 0.4625935162094763, + "grad_norm": 17.18134307861328, + "learning_rate": 1.8151620947630923e-05, + "loss": 0.0391, + "step": 3710 + }, + { + "epoch": 0.46384039900249374, + "grad_norm": 8.475462913513184, + "learning_rate": 1.8146633416458853e-05, + "loss": 0.0857, + "step": 3720 + }, + { + "epoch": 0.46508728179551123, + "grad_norm": 0.06224285066127777, + "learning_rate": 1.8141645885286784e-05, + "loss": 0.069, + "step": 3730 + }, + { + "epoch": 0.46633416458852867, + "grad_norm": 5.85067081451416, + "learning_rate": 1.8136658354114714e-05, + "loss": 0.1032, + "step": 3740 + }, + { + "epoch": 0.46758104738154616, + "grad_norm": 0.02149471826851368, + "learning_rate": 1.8131670822942645e-05, + "loss": 0.0613, + "step": 3750 + }, + { + "epoch": 0.4688279301745636, + "grad_norm": 21.774948120117188, + "learning_rate": 1.8126683291770576e-05, + "loss": 0.0499, + "step": 3760 + }, + { + "epoch": 0.470074812967581, + "grad_norm": 0.01490458007901907, + "learning_rate": 1.8121695760598506e-05, + "loss": 0.034, + "step": 3770 + }, + { + "epoch": 0.4713216957605985, + "grad_norm": 0.31544768810272217, + "learning_rate": 1.8116708229426437e-05, + "loss": 0.0978, + "step": 3780 + }, + { + "epoch": 0.47256857855361595, + "grad_norm": 1.336230754852295, + "learning_rate": 1.8111720698254364e-05, + "loss": 0.0361, + "step": 3790 + }, + { + "epoch": 0.47381546134663344, + "grad_norm": 3.5526254177093506, + "learning_rate": 1.8106733167082295e-05, + "loss": 0.0204, + "step": 3800 + }, + { + "epoch": 0.47506234413965087, + "grad_norm": 0.0623275563120842, + 
"learning_rate": 1.8101745635910225e-05, + "loss": 0.102, + "step": 3810 + }, + { + "epoch": 0.4763092269326683, + "grad_norm": 3.013119697570801, + "learning_rate": 1.8096758104738156e-05, + "loss": 0.0479, + "step": 3820 + }, + { + "epoch": 0.4775561097256858, + "grad_norm": 0.018023619428277016, + "learning_rate": 1.8091770573566087e-05, + "loss": 0.058, + "step": 3830 + }, + { + "epoch": 0.47880299251870323, + "grad_norm": 0.05830911546945572, + "learning_rate": 1.8086783042394017e-05, + "loss": 0.014, + "step": 3840 + }, + { + "epoch": 0.4800498753117207, + "grad_norm": 0.3777295649051666, + "learning_rate": 1.8081795511221948e-05, + "loss": 0.08, + "step": 3850 + }, + { + "epoch": 0.48129675810473815, + "grad_norm": 0.010745448060333729, + "learning_rate": 1.807680798004988e-05, + "loss": 0.1088, + "step": 3860 + }, + { + "epoch": 0.4825436408977556, + "grad_norm": 0.2874574065208435, + "learning_rate": 1.8071820448877806e-05, + "loss": 0.062, + "step": 3870 + }, + { + "epoch": 0.4837905236907731, + "grad_norm": 0.011869000270962715, + "learning_rate": 1.8066832917705736e-05, + "loss": 0.1795, + "step": 3880 + }, + { + "epoch": 0.4850374064837905, + "grad_norm": 5.51703405380249, + "learning_rate": 1.8061845386533667e-05, + "loss": 0.0493, + "step": 3890 + }, + { + "epoch": 0.486284289276808, + "grad_norm": 0.057681404054164886, + "learning_rate": 1.8056857855361597e-05, + "loss": 0.0127, + "step": 3900 + }, + { + "epoch": 0.48753117206982544, + "grad_norm": 0.4616217613220215, + "learning_rate": 1.8051870324189528e-05, + "loss": 0.0861, + "step": 3910 + }, + { + "epoch": 0.48877805486284287, + "grad_norm": 38.90608596801758, + "learning_rate": 1.804688279301746e-05, + "loss": 0.1033, + "step": 3920 + }, + { + "epoch": 0.49002493765586036, + "grad_norm": 0.015819426625967026, + "learning_rate": 1.804189526184539e-05, + "loss": 0.077, + "step": 3930 + }, + { + "epoch": 0.4912718204488778, + "grad_norm": 55.003570556640625, + "learning_rate": 
1.803690773067332e-05, + "loss": 0.1412, + "step": 3940 + }, + { + "epoch": 0.4925187032418953, + "grad_norm": 0.5927708148956299, + "learning_rate": 1.803192019950125e-05, + "loss": 0.0562, + "step": 3950 + }, + { + "epoch": 0.4937655860349127, + "grad_norm": 30.10116195678711, + "learning_rate": 1.8026932668329178e-05, + "loss": 0.1947, + "step": 3960 + }, + { + "epoch": 0.49501246882793015, + "grad_norm": 28.36803436279297, + "learning_rate": 1.802194513715711e-05, + "loss": 0.0687, + "step": 3970 + }, + { + "epoch": 0.49625935162094764, + "grad_norm": 4.661812782287598, + "learning_rate": 1.801695760598504e-05, + "loss": 0.1681, + "step": 3980 + }, + { + "epoch": 0.4975062344139651, + "grad_norm": 6.973540306091309, + "learning_rate": 1.801197007481297e-05, + "loss": 0.1108, + "step": 3990 + }, + { + "epoch": 0.49875311720698257, + "grad_norm": 0.3708229660987854, + "learning_rate": 1.8006982543640897e-05, + "loss": 0.081, + "step": 4000 + }, + { + "epoch": 0.5, + "grad_norm": 0.08219872415065765, + "learning_rate": 1.8001995012468828e-05, + "loss": 0.067, + "step": 4010 + }, + { + "epoch": 0.5012468827930174, + "grad_norm": 0.05088973417878151, + "learning_rate": 1.799700748129676e-05, + "loss": 0.1103, + "step": 4020 + }, + { + "epoch": 0.5024937655860349, + "grad_norm": 9.498299598693848, + "learning_rate": 1.7992019950124692e-05, + "loss": 0.0871, + "step": 4030 + }, + { + "epoch": 0.5037406483790524, + "grad_norm": 10.723158836364746, + "learning_rate": 1.798703241895262e-05, + "loss": 0.0854, + "step": 4040 + }, + { + "epoch": 0.5049875311720698, + "grad_norm": 1.3653846979141235, + "learning_rate": 1.798204488778055e-05, + "loss": 0.0603, + "step": 4050 + }, + { + "epoch": 0.5062344139650873, + "grad_norm": 0.060665953904390335, + "learning_rate": 1.797705735660848e-05, + "loss": 0.0416, + "step": 4060 + }, + { + "epoch": 0.5074812967581047, + "grad_norm": 0.05798611417412758, + "learning_rate": 1.797206982543641e-05, + "loss": 0.1228, + "step": 4070 + 
}, + { + "epoch": 0.5087281795511222, + "grad_norm": 0.26482218503952026, + "learning_rate": 1.796708229426434e-05, + "loss": 0.0735, + "step": 4080 + }, + { + "epoch": 0.5099750623441397, + "grad_norm": 0.2417094111442566, + "learning_rate": 1.796209476309227e-05, + "loss": 0.0061, + "step": 4090 + }, + { + "epoch": 0.5112219451371571, + "grad_norm": 0.032584238797426224, + "learning_rate": 1.79571072319202e-05, + "loss": 0.0511, + "step": 4100 + }, + { + "epoch": 0.5124688279301746, + "grad_norm": 57.22048568725586, + "learning_rate": 1.795211970074813e-05, + "loss": 0.1156, + "step": 4110 + }, + { + "epoch": 0.513715710723192, + "grad_norm": 40.46946334838867, + "learning_rate": 1.794713216957606e-05, + "loss": 0.1286, + "step": 4120 + }, + { + "epoch": 0.5149625935162094, + "grad_norm": 0.05879024416208267, + "learning_rate": 1.794214463840399e-05, + "loss": 0.1155, + "step": 4130 + }, + { + "epoch": 0.516209476309227, + "grad_norm": 0.017375554889440536, + "learning_rate": 1.7937157107231922e-05, + "loss": 0.0445, + "step": 4140 + }, + { + "epoch": 0.5174563591022444, + "grad_norm": 0.07895904779434204, + "learning_rate": 1.7932169576059853e-05, + "loss": 0.0931, + "step": 4150 + }, + { + "epoch": 0.5187032418952618, + "grad_norm": 17.329853057861328, + "learning_rate": 1.792718204488778e-05, + "loss": 0.1106, + "step": 4160 + }, + { + "epoch": 0.5199501246882793, + "grad_norm": 6.252292156219482, + "learning_rate": 1.792219451371571e-05, + "loss": 0.1499, + "step": 4170 + }, + { + "epoch": 0.5211970074812967, + "grad_norm": 4.099524974822998, + "learning_rate": 1.791720698254364e-05, + "loss": 0.0033, + "step": 4180 + }, + { + "epoch": 0.5224438902743143, + "grad_norm": 26.322385787963867, + "learning_rate": 1.7912219451371572e-05, + "loss": 0.0409, + "step": 4190 + }, + { + "epoch": 0.5236907730673317, + "grad_norm": 44.511905670166016, + "learning_rate": 1.7907231920199503e-05, + "loss": 0.1004, + "step": 4200 + }, + { + "epoch": 0.5249376558603491, + 
"grad_norm": 0.012727702967822552, + "learning_rate": 1.7902244389027433e-05, + "loss": 0.0249, + "step": 4210 + }, + { + "epoch": 0.5261845386533666, + "grad_norm": 5.822264194488525, + "learning_rate": 1.7897256857855364e-05, + "loss": 0.0436, + "step": 4220 + }, + { + "epoch": 0.527431421446384, + "grad_norm": 0.01628972217440605, + "learning_rate": 1.7892269326683294e-05, + "loss": 0.1117, + "step": 4230 + }, + { + "epoch": 0.5286783042394015, + "grad_norm": 0.15717430412769318, + "learning_rate": 1.7887281795511225e-05, + "loss": 0.0526, + "step": 4240 + }, + { + "epoch": 0.529925187032419, + "grad_norm": 0.03874291479587555, + "learning_rate": 1.7882294264339152e-05, + "loss": 0.0353, + "step": 4250 + }, + { + "epoch": 0.5311720698254364, + "grad_norm": 0.02741248533129692, + "learning_rate": 1.7877306733167083e-05, + "loss": 0.2038, + "step": 4260 + }, + { + "epoch": 0.5324189526184538, + "grad_norm": 0.10414528846740723, + "learning_rate": 1.7872319201995013e-05, + "loss": 0.0228, + "step": 4270 + }, + { + "epoch": 0.5336658354114713, + "grad_norm": 11.053817749023438, + "learning_rate": 1.7867331670822944e-05, + "loss": 0.1856, + "step": 4280 + }, + { + "epoch": 0.5349127182044888, + "grad_norm": 0.044340793043375015, + "learning_rate": 1.7862344139650875e-05, + "loss": 0.1366, + "step": 4290 + }, + { + "epoch": 0.5361596009975063, + "grad_norm": 0.7331774830818176, + "learning_rate": 1.7857356608478805e-05, + "loss": 0.075, + "step": 4300 + }, + { + "epoch": 0.5374064837905237, + "grad_norm": 0.03085249662399292, + "learning_rate": 1.7852369077306736e-05, + "loss": 0.0312, + "step": 4310 + }, + { + "epoch": 0.5386533665835411, + "grad_norm": 0.10700695961713791, + "learning_rate": 1.7847381546134667e-05, + "loss": 0.0418, + "step": 4320 + }, + { + "epoch": 0.5399002493765586, + "grad_norm": 14.032788276672363, + "learning_rate": 1.7842394014962594e-05, + "loss": 0.1199, + "step": 4330 + }, + { + "epoch": 0.5411471321695761, + "grad_norm": 
1.1185815334320068, + "learning_rate": 1.7837406483790524e-05, + "loss": 0.1206, + "step": 4340 + }, + { + "epoch": 0.5423940149625935, + "grad_norm": 0.02402268722653389, + "learning_rate": 1.7832418952618455e-05, + "loss": 0.0587, + "step": 4350 + }, + { + "epoch": 0.543640897755611, + "grad_norm": 10.920904159545898, + "learning_rate": 1.7827431421446386e-05, + "loss": 0.096, + "step": 4360 + }, + { + "epoch": 0.5448877805486284, + "grad_norm": 0.791688859462738, + "learning_rate": 1.7822443890274316e-05, + "loss": 0.0948, + "step": 4370 + }, + { + "epoch": 0.5461346633416458, + "grad_norm": 0.16529826819896698, + "learning_rate": 1.7817456359102247e-05, + "loss": 0.0734, + "step": 4380 + }, + { + "epoch": 0.5473815461346634, + "grad_norm": 0.02176223322749138, + "learning_rate": 1.7812468827930178e-05, + "loss": 0.0816, + "step": 4390 + }, + { + "epoch": 0.5486284289276808, + "grad_norm": 0.0856514424085617, + "learning_rate": 1.7807481296758108e-05, + "loss": 0.0703, + "step": 4400 + }, + { + "epoch": 0.5498753117206983, + "grad_norm": 0.014872372150421143, + "learning_rate": 1.7802493765586035e-05, + "loss": 0.0823, + "step": 4410 + }, + { + "epoch": 0.5511221945137157, + "grad_norm": 5.188063621520996, + "learning_rate": 1.7797506234413966e-05, + "loss": 0.0336, + "step": 4420 + }, + { + "epoch": 0.5523690773067331, + "grad_norm": 0.14992894232273102, + "learning_rate": 1.7792518703241897e-05, + "loss": 0.0741, + "step": 4430 + }, + { + "epoch": 0.5536159600997507, + "grad_norm": 1.0823159217834473, + "learning_rate": 1.7787531172069827e-05, + "loss": 0.0574, + "step": 4440 + }, + { + "epoch": 0.5548628428927681, + "grad_norm": 0.0768449455499649, + "learning_rate": 1.7782543640897758e-05, + "loss": 0.0845, + "step": 4450 + }, + { + "epoch": 0.5561097256857855, + "grad_norm": 0.015782279893755913, + "learning_rate": 1.7777556109725685e-05, + "loss": 0.0828, + "step": 4460 + }, + { + "epoch": 0.557356608478803, + "grad_norm": 0.022945543751120567, + 
"learning_rate": 1.7772568578553616e-05, + "loss": 0.1476, + "step": 4470 + }, + { + "epoch": 0.5586034912718204, + "grad_norm": 0.04019872471690178, + "learning_rate": 1.7767581047381546e-05, + "loss": 0.0036, + "step": 4480 + }, + { + "epoch": 0.559850374064838, + "grad_norm": 30.77685546875, + "learning_rate": 1.776259351620948e-05, + "loss": 0.0562, + "step": 4490 + }, + { + "epoch": 0.5610972568578554, + "grad_norm": 0.035899385809898376, + "learning_rate": 1.7757605985037408e-05, + "loss": 0.0271, + "step": 4500 + }, + { + "epoch": 0.5623441396508728, + "grad_norm": null, + "learning_rate": 1.7753117206982544e-05, + "loss": 0.1555, + "step": 4510 + }, + { + "epoch": 0.5635910224438903, + "grad_norm": 47.183685302734375, + "learning_rate": 1.7748129675810475e-05, + "loss": 0.1514, + "step": 4520 + }, + { + "epoch": 0.5648379052369077, + "grad_norm": 0.1414586901664734, + "learning_rate": 1.7743142144638405e-05, + "loss": 0.0446, + "step": 4530 + }, + { + "epoch": 0.5660847880299252, + "grad_norm": 6.158328056335449, + "learning_rate": 1.7738154613466336e-05, + "loss": 0.092, + "step": 4540 + }, + { + "epoch": 0.5673316708229427, + "grad_norm": 0.13686908781528473, + "learning_rate": 1.7733167082294263e-05, + "loss": 0.0638, + "step": 4550 + }, + { + "epoch": 0.5685785536159601, + "grad_norm": 0.09912706911563873, + "learning_rate": 1.7728179551122194e-05, + "loss": 0.0048, + "step": 4560 + }, + { + "epoch": 0.5698254364089775, + "grad_norm": 31.120927810668945, + "learning_rate": 1.7723192019950128e-05, + "loss": 0.0572, + "step": 4570 + }, + { + "epoch": 0.571072319201995, + "grad_norm": 0.1557435244321823, + "learning_rate": 1.771820448877806e-05, + "loss": 0.0498, + "step": 4580 + }, + { + "epoch": 0.5723192019950125, + "grad_norm": 0.07351752370595932, + "learning_rate": 1.771321695760599e-05, + "loss": 0.0842, + "step": 4590 + }, + { + "epoch": 0.57356608478803, + "grad_norm": 0.2594575583934784, + "learning_rate": 1.7708229426433916e-05, + "loss": 
0.0907, + "step": 4600 + }, + { + "epoch": 0.5748129675810474, + "grad_norm": 0.011486229486763477, + "learning_rate": 1.7703241895261847e-05, + "loss": 0.0171, + "step": 4610 + }, + { + "epoch": 0.5760598503740648, + "grad_norm": 0.23140795528888702, + "learning_rate": 1.7698254364089778e-05, + "loss": 0.0596, + "step": 4620 + }, + { + "epoch": 0.5773067331670823, + "grad_norm": 0.12582992017269135, + "learning_rate": 1.7693266832917708e-05, + "loss": 0.0593, + "step": 4630 + }, + { + "epoch": 0.5785536159600998, + "grad_norm": 1.3327033519744873, + "learning_rate": 1.7688279301745635e-05, + "loss": 0.1281, + "step": 4640 + }, + { + "epoch": 0.5798004987531172, + "grad_norm": 0.009501133114099503, + "learning_rate": 1.7683291770573566e-05, + "loss": 0.0942, + "step": 4650 + }, + { + "epoch": 0.5810473815461347, + "grad_norm": 0.15668757259845734, + "learning_rate": 1.7678304239401497e-05, + "loss": 0.0793, + "step": 4660 + }, + { + "epoch": 0.5822942643391521, + "grad_norm": 0.04126293212175369, + "learning_rate": 1.7673316708229427e-05, + "loss": 0.0937, + "step": 4670 + }, + { + "epoch": 0.5835411471321695, + "grad_norm": 0.015120592899620533, + "learning_rate": 1.7668329177057358e-05, + "loss": 0.0046, + "step": 4680 + }, + { + "epoch": 0.5847880299251871, + "grad_norm": 0.05965186282992363, + "learning_rate": 1.766334164588529e-05, + "loss": 0.1212, + "step": 4690 + }, + { + "epoch": 0.5860349127182045, + "grad_norm": 15.11082649230957, + "learning_rate": 1.765835411471322e-05, + "loss": 0.1122, + "step": 4700 + }, + { + "epoch": 0.587281795511222, + "grad_norm": 10.484663009643555, + "learning_rate": 1.765336658354115e-05, + "loss": 0.0907, + "step": 4710 + }, + { + "epoch": 0.5885286783042394, + "grad_norm": 24.538185119628906, + "learning_rate": 1.7648379052369077e-05, + "loss": 0.1617, + "step": 4720 + }, + { + "epoch": 0.5897755610972568, + "grad_norm": 4.25324821472168, + "learning_rate": 1.7643391521197008e-05, + "loss": 0.0357, + "step": 4730 + }, + { 
+ "epoch": 0.5910224438902744, + "grad_norm": 4.515763282775879, + "learning_rate": 1.7638403990024938e-05, + "loss": 0.0999, + "step": 4740 + }, + { + "epoch": 0.5922693266832918, + "grad_norm": 0.033570531755685806, + "learning_rate": 1.763341645885287e-05, + "loss": 0.0471, + "step": 4750 + }, + { + "epoch": 0.5935162094763092, + "grad_norm": 0.023912442848086357, + "learning_rate": 1.76284289276808e-05, + "loss": 0.0029, + "step": 4760 + }, + { + "epoch": 0.5947630922693267, + "grad_norm": 0.4626505374908447, + "learning_rate": 1.762344139650873e-05, + "loss": 0.1512, + "step": 4770 + }, + { + "epoch": 0.5960099750623441, + "grad_norm": 0.5773617029190063, + "learning_rate": 1.761845386533666e-05, + "loss": 0.0808, + "step": 4780 + }, + { + "epoch": 0.5972568578553616, + "grad_norm": 0.0236464012414217, + "learning_rate": 1.761346633416459e-05, + "loss": 0.0687, + "step": 4790 + }, + { + "epoch": 0.5985037406483791, + "grad_norm": 5.944710731506348, + "learning_rate": 1.760847880299252e-05, + "loss": 0.0253, + "step": 4800 + }, + { + "epoch": 0.5997506234413965, + "grad_norm": 19.9612979888916, + "learning_rate": 1.760349127182045e-05, + "loss": 0.1472, + "step": 4810 + }, + { + "epoch": 0.600997506234414, + "grad_norm": 31.07427406311035, + "learning_rate": 1.759850374064838e-05, + "loss": 0.0273, + "step": 4820 + }, + { + "epoch": 0.6022443890274314, + "grad_norm": 0.054695580154657364, + "learning_rate": 1.759351620947631e-05, + "loss": 0.0612, + "step": 4830 + }, + { + "epoch": 0.6034912718204489, + "grad_norm": 0.5819992423057556, + "learning_rate": 1.758852867830424e-05, + "loss": 0.0695, + "step": 4840 + }, + { + "epoch": 0.6047381546134664, + "grad_norm": 0.04540835693478584, + "learning_rate": 1.758354114713217e-05, + "loss": 0.0961, + "step": 4850 + }, + { + "epoch": 0.6059850374064838, + "grad_norm": 0.22553811967372894, + "learning_rate": 1.7578553615960102e-05, + "loss": 0.0476, + "step": 4860 + }, + { + "epoch": 0.6072319201995012, + "grad_norm": 
0.016304977238178253, + "learning_rate": 1.7573566084788033e-05, + "loss": 0.1701, + "step": 4870 + }, + { + "epoch": 0.6084788029925187, + "grad_norm": 3.0572760105133057, + "learning_rate": 1.7568578553615963e-05, + "loss": 0.0097, + "step": 4880 + }, + { + "epoch": 0.6097256857855362, + "grad_norm": 11.772346496582031, + "learning_rate": 1.756359102244389e-05, + "loss": 0.0492, + "step": 4890 + }, + { + "epoch": 0.6109725685785536, + "grad_norm": 6.350620746612549, + "learning_rate": 1.755860349127182e-05, + "loss": 0.0661, + "step": 4900 + }, + { + "epoch": 0.6122194513715711, + "grad_norm": 0.015547297894954681, + "learning_rate": 1.7553615960099752e-05, + "loss": 0.0835, + "step": 4910 + }, + { + "epoch": 0.6134663341645885, + "grad_norm": 37.15751266479492, + "learning_rate": 1.7548628428927683e-05, + "loss": 0.0571, + "step": 4920 + }, + { + "epoch": 0.614713216957606, + "grad_norm": 0.03281101956963539, + "learning_rate": 1.7543640897755613e-05, + "loss": 0.1247, + "step": 4930 + }, + { + "epoch": 0.6159600997506235, + "grad_norm": 6.25891637802124, + "learning_rate": 1.7538653366583544e-05, + "loss": 0.0423, + "step": 4940 + }, + { + "epoch": 0.6172069825436409, + "grad_norm": 30.621646881103516, + "learning_rate": 1.7533665835411474e-05, + "loss": 0.0622, + "step": 4950 + }, + { + "epoch": 0.6184538653366584, + "grad_norm": 37.61522674560547, + "learning_rate": 1.7528678304239405e-05, + "loss": 0.1214, + "step": 4960 + }, + { + "epoch": 0.6197007481296758, + "grad_norm": 0.08783353865146637, + "learning_rate": 1.7523690773067332e-05, + "loss": 0.0865, + "step": 4970 + }, + { + "epoch": 0.6209476309226932, + "grad_norm": 5.21088171005249, + "learning_rate": 1.7518703241895263e-05, + "loss": 0.1417, + "step": 4980 + }, + { + "epoch": 0.6221945137157108, + "grad_norm": 1.4768476486206055, + "learning_rate": 1.7513715710723193e-05, + "loss": 0.0691, + "step": 4990 + }, + { + "epoch": 0.6234413965087282, + "grad_norm": 0.037035878747701645, + "learning_rate": 
1.7508728179551124e-05, + "loss": 0.0474, + "step": 5000 + }, + { + "epoch": 0.6246882793017456, + "grad_norm": 7.823322296142578, + "learning_rate": 1.750374064837905e-05, + "loss": 0.1349, + "step": 5010 + }, + { + "epoch": 0.6259351620947631, + "grad_norm": 0.01354975439608097, + "learning_rate": 1.7498753117206982e-05, + "loss": 0.086, + "step": 5020 + }, + { + "epoch": 0.6271820448877805, + "grad_norm": 0.1404193490743637, + "learning_rate": 1.7493765586034913e-05, + "loss": 0.0822, + "step": 5030 + }, + { + "epoch": 0.628428927680798, + "grad_norm": 27.83915138244629, + "learning_rate": 1.7488778054862847e-05, + "loss": 0.0993, + "step": 5040 + }, + { + "epoch": 0.6296758104738155, + "grad_norm": 0.03925345838069916, + "learning_rate": 1.7483790523690774e-05, + "loss": 0.0913, + "step": 5050 + }, + { + "epoch": 0.6309226932668329, + "grad_norm": 21.24605369567871, + "learning_rate": 1.7478802992518704e-05, + "loss": 0.0682, + "step": 5060 + }, + { + "epoch": 0.6321695760598504, + "grad_norm": 7.983184337615967, + "learning_rate": 1.7473815461346635e-05, + "loss": 0.1279, + "step": 5070 + }, + { + "epoch": 0.6334164588528678, + "grad_norm": 0.03581385314464569, + "learning_rate": 1.7468827930174566e-05, + "loss": 0.0188, + "step": 5080 + }, + { + "epoch": 0.6346633416458853, + "grad_norm": 0.03261757269501686, + "learning_rate": 1.7463840399002496e-05, + "loss": 0.0554, + "step": 5090 + }, + { + "epoch": 0.6359102244389028, + "grad_norm": 0.060816071927547455, + "learning_rate": 1.7458852867830424e-05, + "loss": 0.0434, + "step": 5100 + }, + { + "epoch": 0.6371571072319202, + "grad_norm": 20.245588302612305, + "learning_rate": 1.7453865336658354e-05, + "loss": 0.1272, + "step": 5110 + }, + { + "epoch": 0.6384039900249376, + "grad_norm": 18.135963439941406, + "learning_rate": 1.7448877805486285e-05, + "loss": 0.1471, + "step": 5120 + }, + { + "epoch": 0.6396508728179551, + "grad_norm": 25.722951889038086, + "learning_rate": 1.7443890274314215e-05, + "loss": 
0.0608, + "step": 5130 + }, + { + "epoch": 0.6408977556109726, + "grad_norm": 8.285656929016113, + "learning_rate": 1.7438902743142146e-05, + "loss": 0.111, + "step": 5140 + }, + { + "epoch": 0.64214463840399, + "grad_norm": 0.15213392674922943, + "learning_rate": 1.7433915211970077e-05, + "loss": 0.1607, + "step": 5150 + }, + { + "epoch": 0.6433915211970075, + "grad_norm": 9.305551528930664, + "learning_rate": 1.7428927680798007e-05, + "loss": 0.0749, + "step": 5160 + }, + { + "epoch": 0.6446384039900249, + "grad_norm": 27.343847274780273, + "learning_rate": 1.7423940149625938e-05, + "loss": 0.0931, + "step": 5170 + }, + { + "epoch": 0.6458852867830424, + "grad_norm": 0.03424638509750366, + "learning_rate": 1.7418952618453865e-05, + "loss": 0.0532, + "step": 5180 + }, + { + "epoch": 0.6471321695760599, + "grad_norm": 3.35919451713562, + "learning_rate": 1.7413965087281796e-05, + "loss": 0.022, + "step": 5190 + }, + { + "epoch": 0.6483790523690773, + "grad_norm": 5.452242374420166, + "learning_rate": 1.7408977556109726e-05, + "loss": 0.0829, + "step": 5200 + }, + { + "epoch": 0.6496259351620948, + "grad_norm": 0.03790761157870293, + "learning_rate": 1.7403990024937657e-05, + "loss": 0.0938, + "step": 5210 + }, + { + "epoch": 0.6508728179551122, + "grad_norm": 0.05794584006071091, + "learning_rate": 1.7399002493765588e-05, + "loss": 0.0363, + "step": 5220 + }, + { + "epoch": 0.6521197007481296, + "grad_norm": 33.54241180419922, + "learning_rate": 1.7394014962593518e-05, + "loss": 0.0982, + "step": 5230 + }, + { + "epoch": 0.6533665835411472, + "grad_norm": 31.3079891204834, + "learning_rate": 1.738902743142145e-05, + "loss": 0.1463, + "step": 5240 + }, + { + "epoch": 0.6546134663341646, + "grad_norm": 0.4372890591621399, + "learning_rate": 1.738403990024938e-05, + "loss": 0.0741, + "step": 5250 + }, + { + "epoch": 0.655860349127182, + "grad_norm": 25.03449821472168, + "learning_rate": 1.7379052369077307e-05, + "loss": 0.1668, + "step": 5260 + }, + { + "epoch": 
0.6571072319201995, + "grad_norm": 8.57763957977295, + "learning_rate": 1.7374064837905237e-05, + "loss": 0.0704, + "step": 5270 + }, + { + "epoch": 0.6583541147132169, + "grad_norm": 0.027418160811066628, + "learning_rate": 1.7369077306733168e-05, + "loss": 0.0578, + "step": 5280 + }, + { + "epoch": 0.6596009975062345, + "grad_norm": 0.06828586012125015, + "learning_rate": 1.73640897755611e-05, + "loss": 0.1153, + "step": 5290 + }, + { + "epoch": 0.6608478802992519, + "grad_norm": 16.321455001831055, + "learning_rate": 1.735910224438903e-05, + "loss": 0.0171, + "step": 5300 + }, + { + "epoch": 0.6620947630922693, + "grad_norm": 0.0578501932322979, + "learning_rate": 1.735411471321696e-05, + "loss": 0.0581, + "step": 5310 + }, + { + "epoch": 0.6633416458852868, + "grad_norm": 0.021183906123042107, + "learning_rate": 1.734912718204489e-05, + "loss": 0.025, + "step": 5320 + }, + { + "epoch": 0.6645885286783042, + "grad_norm": 0.0674271285533905, + "learning_rate": 1.734413965087282e-05, + "loss": 0.0311, + "step": 5330 + }, + { + "epoch": 0.6658354114713217, + "grad_norm": 0.015874937176704407, + "learning_rate": 1.733915211970075e-05, + "loss": 0.0561, + "step": 5340 + }, + { + "epoch": 0.6670822942643392, + "grad_norm": 19.91120719909668, + "learning_rate": 1.733416458852868e-05, + "loss": 0.0661, + "step": 5350 + }, + { + "epoch": 0.6683291770573566, + "grad_norm": 0.144506573677063, + "learning_rate": 1.732917705735661e-05, + "loss": 0.0181, + "step": 5360 + }, + { + "epoch": 0.669576059850374, + "grad_norm": 0.03757374733686447, + "learning_rate": 1.732418952618454e-05, + "loss": 0.0721, + "step": 5370 + }, + { + "epoch": 0.6708229426433915, + "grad_norm": 5.466207504272461, + "learning_rate": 1.731920199501247e-05, + "loss": 0.035, + "step": 5380 + }, + { + "epoch": 0.672069825436409, + "grad_norm": 0.16552552580833435, + "learning_rate": 1.73142144638404e-05, + "loss": 0.0149, + "step": 5390 + }, + { + "epoch": 0.6733167082294265, + "grad_norm": 
17.71544647216797, + "learning_rate": 1.7309226932668332e-05, + "loss": 0.0357, + "step": 5400 + }, + { + "epoch": 0.6745635910224439, + "grad_norm": 21.00069236755371, + "learning_rate": 1.7304239401496263e-05, + "loss": 0.102, + "step": 5410 + }, + { + "epoch": 0.6758104738154613, + "grad_norm": 0.01819881983101368, + "learning_rate": 1.7299251870324193e-05, + "loss": 0.0978, + "step": 5420 + }, + { + "epoch": 0.6770573566084788, + "grad_norm": 0.01410367339849472, + "learning_rate": 1.729426433915212e-05, + "loss": 0.0038, + "step": 5430 + }, + { + "epoch": 0.6783042394014963, + "grad_norm": 34.391624450683594, + "learning_rate": 1.728927680798005e-05, + "loss": 0.0995, + "step": 5440 + }, + { + "epoch": 0.6795511221945137, + "grad_norm": 0.0064294287003576756, + "learning_rate": 1.728428927680798e-05, + "loss": 0.0772, + "step": 5450 + }, + { + "epoch": 0.6807980049875312, + "grad_norm": 11.470839500427246, + "learning_rate": 1.7279301745635912e-05, + "loss": 0.0381, + "step": 5460 + }, + { + "epoch": 0.6820448877805486, + "grad_norm": 0.05597744882106781, + "learning_rate": 1.727431421446384e-05, + "loss": 0.0378, + "step": 5470 + }, + { + "epoch": 0.683291770573566, + "grad_norm": 22.418514251708984, + "learning_rate": 1.726932668329177e-05, + "loss": 0.1113, + "step": 5480 + }, + { + "epoch": 0.6845386533665836, + "grad_norm": 23.031198501586914, + "learning_rate": 1.72643391521197e-05, + "loss": 0.0742, + "step": 5490 + }, + { + "epoch": 0.685785536159601, + "grad_norm": 0.4775936007499695, + "learning_rate": 1.725935162094763e-05, + "loss": 0.1278, + "step": 5500 + }, + { + "epoch": 0.6870324189526185, + "grad_norm": 0.008405367843806744, + "learning_rate": 1.7254364089775562e-05, + "loss": 0.0549, + "step": 5510 + }, + { + "epoch": 0.6882793017456359, + "grad_norm": 0.021200506016612053, + "learning_rate": 1.7249376558603493e-05, + "loss": 0.0112, + "step": 5520 + }, + { + "epoch": 0.6895261845386533, + "grad_norm": 0.009464691393077374, + 
"learning_rate": 1.7244389027431423e-05, + "loss": 0.0338, + "step": 5530 + }, + { + "epoch": 0.6907730673316709, + "grad_norm": 0.1951027363538742, + "learning_rate": 1.7239401496259354e-05, + "loss": 0.0759, + "step": 5540 + }, + { + "epoch": 0.6920199501246883, + "grad_norm": 6.1231231689453125, + "learning_rate": 1.723441396508728e-05, + "loss": 0.1388, + "step": 5550 + }, + { + "epoch": 0.6932668329177057, + "grad_norm": 19.70039939880371, + "learning_rate": 1.7229426433915212e-05, + "loss": 0.0925, + "step": 5560 + }, + { + "epoch": 0.6945137157107232, + "grad_norm": 4.138513088226318, + "learning_rate": 1.7224438902743142e-05, + "loss": 0.0567, + "step": 5570 + }, + { + "epoch": 0.6957605985037406, + "grad_norm": 0.1160452663898468, + "learning_rate": 1.7219451371571073e-05, + "loss": 0.1006, + "step": 5580 + }, + { + "epoch": 0.6970074812967582, + "grad_norm": 15.692948341369629, + "learning_rate": 1.7214463840399004e-05, + "loss": 0.1361, + "step": 5590 + }, + { + "epoch": 0.6982543640897756, + "grad_norm": 0.011452429927885532, + "learning_rate": 1.7209476309226934e-05, + "loss": 0.005, + "step": 5600 + }, + { + "epoch": 0.699501246882793, + "grad_norm": 0.11285385489463806, + "learning_rate": 1.7204488778054865e-05, + "loss": 0.0102, + "step": 5610 + }, + { + "epoch": 0.7007481296758105, + "grad_norm": 23.67371940612793, + "learning_rate": 1.7199501246882795e-05, + "loss": 0.0358, + "step": 5620 + }, + { + "epoch": 0.7019950124688279, + "grad_norm": 5.408916473388672, + "learning_rate": 1.7194513715710726e-05, + "loss": 0.0636, + "step": 5630 + }, + { + "epoch": 0.7032418952618454, + "grad_norm": 0.05330492928624153, + "learning_rate": 1.7189526184538653e-05, + "loss": 0.1374, + "step": 5640 + }, + { + "epoch": 0.7044887780548629, + "grad_norm": 19.458311080932617, + "learning_rate": 1.7184538653366584e-05, + "loss": 0.0404, + "step": 5650 + }, + { + "epoch": 0.7057356608478803, + "grad_norm": 0.012190482579171658, + "learning_rate": 
1.7179551122194515e-05, + "loss": 0.1912, + "step": 5660 + }, + { + "epoch": 0.7069825436408977, + "grad_norm": 1.1746265888214111, + "learning_rate": 1.7174563591022445e-05, + "loss": 0.0023, + "step": 5670 + }, + { + "epoch": 0.7082294264339152, + "grad_norm": 0.18074272572994232, + "learning_rate": 1.7169576059850376e-05, + "loss": 0.1154, + "step": 5680 + }, + { + "epoch": 0.7094763092269327, + "grad_norm": 0.013185709714889526, + "learning_rate": 1.7164588528678306e-05, + "loss": 0.0299, + "step": 5690 + }, + { + "epoch": 0.7107231920199502, + "grad_norm": 1.1641533374786377, + "learning_rate": 1.7159600997506237e-05, + "loss": 0.049, + "step": 5700 + }, + { + "epoch": 0.7119700748129676, + "grad_norm": 0.016002580523490906, + "learning_rate": 1.7154613466334168e-05, + "loss": 0.0609, + "step": 5710 + }, + { + "epoch": 0.713216957605985, + "grad_norm": 0.015217593871057034, + "learning_rate": 1.7149625935162095e-05, + "loss": 0.1336, + "step": 5720 + }, + { + "epoch": 0.7144638403990025, + "grad_norm": 0.008298971690237522, + "learning_rate": 1.7144638403990025e-05, + "loss": 0.0302, + "step": 5730 + }, + { + "epoch": 0.71571072319202, + "grad_norm": 6.263095855712891, + "learning_rate": 1.7139650872817956e-05, + "loss": 0.0637, + "step": 5740 + }, + { + "epoch": 0.7169576059850374, + "grad_norm": 0.22118152678012848, + "learning_rate": 1.7134663341645887e-05, + "loss": 0.0101, + "step": 5750 + }, + { + "epoch": 0.7182044887780549, + "grad_norm": 5.958626747131348, + "learning_rate": 1.7129675810473817e-05, + "loss": 0.0364, + "step": 5760 + }, + { + "epoch": 0.7194513715710723, + "grad_norm": 1.9810842275619507, + "learning_rate": 1.7124688279301748e-05, + "loss": 0.156, + "step": 5770 + }, + { + "epoch": 0.7206982543640897, + "grad_norm": 0.03880726546049118, + "learning_rate": 1.711970074812968e-05, + "loss": 0.0693, + "step": 5780 + }, + { + "epoch": 0.7219451371571073, + "grad_norm": 30.136775970458984, + "learning_rate": 1.711471321695761e-05, + "loss": 
0.0967, + "step": 5790 + }, + { + "epoch": 0.7231920199501247, + "grad_norm": 0.06249859556555748, + "learning_rate": 1.7109725685785536e-05, + "loss": 0.0188, + "step": 5800 + }, + { + "epoch": 0.7244389027431422, + "grad_norm": 0.1799100935459137, + "learning_rate": 1.7104738154613467e-05, + "loss": 0.1193, + "step": 5810 + }, + { + "epoch": 0.7256857855361596, + "grad_norm": 13.862339973449707, + "learning_rate": 1.7099750623441398e-05, + "loss": 0.0895, + "step": 5820 + }, + { + "epoch": 0.726932668329177, + "grad_norm": 0.24228820204734802, + "learning_rate": 1.7094763092269328e-05, + "loss": 0.0109, + "step": 5830 + }, + { + "epoch": 0.7281795511221946, + "grad_norm": 0.2275868058204651, + "learning_rate": 1.708977556109726e-05, + "loss": 0.0442, + "step": 5840 + }, + { + "epoch": 0.729426433915212, + "grad_norm": 0.032161250710487366, + "learning_rate": 1.708478802992519e-05, + "loss": 0.1272, + "step": 5850 + }, + { + "epoch": 0.7306733167082294, + "grad_norm": 38.00654220581055, + "learning_rate": 1.707980049875312e-05, + "loss": 0.0593, + "step": 5860 + }, + { + "epoch": 0.7319201995012469, + "grad_norm": 12.633605003356934, + "learning_rate": 1.707481296758105e-05, + "loss": 0.1155, + "step": 5870 + }, + { + "epoch": 0.7331670822942643, + "grad_norm": 0.16538533568382263, + "learning_rate": 1.706982543640898e-05, + "loss": 0.0581, + "step": 5880 + }, + { + "epoch": 0.7344139650872819, + "grad_norm": 0.48946434259414673, + "learning_rate": 1.706483790523691e-05, + "loss": 0.0584, + "step": 5890 + }, + { + "epoch": 0.7356608478802993, + "grad_norm": 0.027832500636577606, + "learning_rate": 1.705985037406484e-05, + "loss": 0.049, + "step": 5900 + }, + { + "epoch": 0.7369077306733167, + "grad_norm": 0.07207493484020233, + "learning_rate": 1.705486284289277e-05, + "loss": 0.0297, + "step": 5910 + }, + { + "epoch": 0.7381546134663342, + "grad_norm": 6.354367256164551, + "learning_rate": 1.70498753117207e-05, + "loss": 0.0807, + "step": 5920 + }, + { + "epoch": 
0.7394014962593516, + "grad_norm": 6.380330562591553, + "learning_rate": 1.7044887780548628e-05, + "loss": 0.1014, + "step": 5930 + }, + { + "epoch": 0.7406483790523691, + "grad_norm": 43.741695404052734, + "learning_rate": 1.7039900249376558e-05, + "loss": 0.0899, + "step": 5940 + }, + { + "epoch": 0.7418952618453866, + "grad_norm": 0.0922226682305336, + "learning_rate": 1.703491271820449e-05, + "loss": 0.1055, + "step": 5950 + }, + { + "epoch": 0.743142144638404, + "grad_norm": 2.604271173477173, + "learning_rate": 1.702992518703242e-05, + "loss": 0.0821, + "step": 5960 + }, + { + "epoch": 0.7443890274314214, + "grad_norm": 21.0607967376709, + "learning_rate": 1.702493765586035e-05, + "loss": 0.1462, + "step": 5970 + }, + { + "epoch": 0.7456359102244389, + "grad_norm": 0.054813966155052185, + "learning_rate": 1.701995012468828e-05, + "loss": 0.1546, + "step": 5980 + }, + { + "epoch": 0.7468827930174564, + "grad_norm": 41.906089782714844, + "learning_rate": 1.701496259351621e-05, + "loss": 0.0519, + "step": 5990 + }, + { + "epoch": 0.7481296758104738, + "grad_norm": 0.035245224833488464, + "learning_rate": 1.7009975062344142e-05, + "loss": 0.0809, + "step": 6000 + }, + { + "epoch": 0.7493765586034913, + "grad_norm": 0.029683228582143784, + "learning_rate": 1.700498753117207e-05, + "loss": 0.0532, + "step": 6010 + }, + { + "epoch": 0.7506234413965087, + "grad_norm": 0.02576983906328678, + "learning_rate": 1.7e-05, + "loss": 0.0713, + "step": 6020 + }, + { + "epoch": 0.7518703241895262, + "grad_norm": 0.014055408537387848, + "learning_rate": 1.699501246882793e-05, + "loss": 0.0065, + "step": 6030 + }, + { + "epoch": 0.7531172069825436, + "grad_norm": 0.011719699949026108, + "learning_rate": 1.699002493765586e-05, + "loss": 0.0325, + "step": 6040 + }, + { + "epoch": 0.7543640897755611, + "grad_norm": 0.021255536004900932, + "learning_rate": 1.6985037406483792e-05, + "loss": 0.1086, + "step": 6050 + }, + { + "epoch": 0.7556109725685786, + "grad_norm": 
0.6811969876289368, + "learning_rate": 1.6980049875311722e-05, + "loss": 0.0901, + "step": 6060 + }, + { + "epoch": 0.756857855361596, + "grad_norm": 0.38240328431129456, + "learning_rate": 1.6975062344139653e-05, + "loss": 0.0455, + "step": 6070 + }, + { + "epoch": 0.7581047381546134, + "grad_norm": 0.0566604882478714, + "learning_rate": 1.6970074812967584e-05, + "loss": 0.0675, + "step": 6080 + }, + { + "epoch": 0.7593516209476309, + "grad_norm": 0.12242597341537476, + "learning_rate": 1.6965087281795514e-05, + "loss": 0.0114, + "step": 6090 + }, + { + "epoch": 0.7605985037406484, + "grad_norm": 0.2005094587802887, + "learning_rate": 1.696009975062344e-05, + "loss": 0.0902, + "step": 6100 + }, + { + "epoch": 0.7618453865336658, + "grad_norm": 0.3194235861301422, + "learning_rate": 1.6955112219451372e-05, + "loss": 0.0054, + "step": 6110 + }, + { + "epoch": 0.7630922693266833, + "grad_norm": 4.714141845703125, + "learning_rate": 1.6950124688279303e-05, + "loss": 0.1559, + "step": 6120 + }, + { + "epoch": 0.7643391521197007, + "grad_norm": 2.5545177459716797, + "learning_rate": 1.6945137157107233e-05, + "loss": 0.0985, + "step": 6130 + }, + { + "epoch": 0.7655860349127181, + "grad_norm": 0.014316755346953869, + "learning_rate": 1.6940149625935164e-05, + "loss": 0.0605, + "step": 6140 + }, + { + "epoch": 0.7668329177057357, + "grad_norm": 0.5875915288925171, + "learning_rate": 1.6935162094763095e-05, + "loss": 0.009, + "step": 6150 + }, + { + "epoch": 0.7680798004987531, + "grad_norm": 0.05648162588477135, + "learning_rate": 1.6930174563591025e-05, + "loss": 0.0426, + "step": 6160 + }, + { + "epoch": 0.7693266832917706, + "grad_norm": 5.96530294418335, + "learning_rate": 1.6925187032418956e-05, + "loss": 0.0853, + "step": 6170 + }, + { + "epoch": 0.770573566084788, + "grad_norm": 0.21136033535003662, + "learning_rate": 1.6920199501246883e-05, + "loss": 0.0251, + "step": 6180 + }, + { + "epoch": 0.7718204488778054, + "grad_norm": 1.1626219749450684, + 
"learning_rate": 1.6915211970074814e-05, + "loss": 0.0397, + "step": 6190 + }, + { + "epoch": 0.773067331670823, + "grad_norm": 0.010134602896869183, + "learning_rate": 1.6910224438902744e-05, + "loss": 0.0024, + "step": 6200 + }, + { + "epoch": 0.7743142144638404, + "grad_norm": 0.15311940014362335, + "learning_rate": 1.6905236907730675e-05, + "loss": 0.0406, + "step": 6210 + }, + { + "epoch": 0.7755610972568578, + "grad_norm": 0.00632174639031291, + "learning_rate": 1.6900249376558605e-05, + "loss": 0.0354, + "step": 6220 + }, + { + "epoch": 0.7768079800498753, + "grad_norm": 2.030371904373169, + "learning_rate": 1.6895261845386536e-05, + "loss": 0.0799, + "step": 6230 + }, + { + "epoch": 0.7780548628428927, + "grad_norm": 3.4602127075195312, + "learning_rate": 1.6890274314214467e-05, + "loss": 0.0697, + "step": 6240 + }, + { + "epoch": 0.7793017456359103, + "grad_norm": 2.2719037532806396, + "learning_rate": 1.6885286783042397e-05, + "loss": 0.1192, + "step": 6250 + }, + { + "epoch": 0.7805486284289277, + "grad_norm": 0.015602881088852882, + "learning_rate": 1.6880299251870325e-05, + "loss": 0.0238, + "step": 6260 + }, + { + "epoch": 0.7817955112219451, + "grad_norm": 2.9303834438323975, + "learning_rate": 1.6875311720698255e-05, + "loss": 0.0726, + "step": 6270 + }, + { + "epoch": 0.7830423940149626, + "grad_norm": 0.4048525393009186, + "learning_rate": 1.6870324189526186e-05, + "loss": 0.0489, + "step": 6280 + }, + { + "epoch": 0.78428927680798, + "grad_norm": 0.10114028304815292, + "learning_rate": 1.6865336658354116e-05, + "loss": 0.0263, + "step": 6290 + }, + { + "epoch": 0.7855361596009975, + "grad_norm": 10.986588478088379, + "learning_rate": 1.6860349127182044e-05, + "loss": 0.0775, + "step": 6300 + }, + { + "epoch": 0.786783042394015, + "grad_norm": 16.34410285949707, + "learning_rate": 1.6855361596009974e-05, + "loss": 0.1214, + "step": 6310 + }, + { + "epoch": 0.7880299251870324, + "grad_norm": 0.12011321634054184, + "learning_rate": 
1.6850374064837908e-05, + "loss": 0.0858, + "step": 6320 + }, + { + "epoch": 0.7892768079800498, + "grad_norm": 0.021219773218035698, + "learning_rate": 1.684538653366584e-05, + "loss": 0.1699, + "step": 6330 + }, + { + "epoch": 0.7905236907730673, + "grad_norm": 0.5230464935302734, + "learning_rate": 1.684039900249377e-05, + "loss": 0.0289, + "step": 6340 + }, + { + "epoch": 0.7917705735660848, + "grad_norm": 1.6605863571166992, + "learning_rate": 1.6835411471321697e-05, + "loss": 0.0072, + "step": 6350 + }, + { + "epoch": 0.7930174563591023, + "grad_norm": 0.14761345088481903, + "learning_rate": 1.6830423940149627e-05, + "loss": 0.0995, + "step": 6360 + }, + { + "epoch": 0.7942643391521197, + "grad_norm": 0.01933436468243599, + "learning_rate": 1.6825436408977558e-05, + "loss": 0.0627, + "step": 6370 + }, + { + "epoch": 0.7955112219451371, + "grad_norm": 3.975325107574463, + "learning_rate": 1.682044887780549e-05, + "loss": 0.1487, + "step": 6380 + }, + { + "epoch": 0.7967581047381546, + "grad_norm": 14.431473731994629, + "learning_rate": 1.6815461346633416e-05, + "loss": 0.1239, + "step": 6390 + }, + { + "epoch": 0.7980049875311721, + "grad_norm": 0.07805877178907394, + "learning_rate": 1.6810473815461346e-05, + "loss": 0.0531, + "step": 6400 + }, + { + "epoch": 0.7992518703241895, + "grad_norm": 0.10893228650093079, + "learning_rate": 1.6805486284289277e-05, + "loss": 0.0493, + "step": 6410 + }, + { + "epoch": 0.800498753117207, + "grad_norm": 0.12542615830898285, + "learning_rate": 1.6800498753117208e-05, + "loss": 0.0231, + "step": 6420 + }, + { + "epoch": 0.8017456359102244, + "grad_norm": 0.028069086372852325, + "learning_rate": 1.679551122194514e-05, + "loss": 0.2296, + "step": 6430 + }, + { + "epoch": 0.8029925187032418, + "grad_norm": 25.13450813293457, + "learning_rate": 1.679052369077307e-05, + "loss": 0.0988, + "step": 6440 + }, + { + "epoch": 0.8042394014962594, + "grad_norm": 0.1958978772163391, + "learning_rate": 1.6785536159601e-05, + "loss": 
0.015, + "step": 6450 + }, + { + "epoch": 0.8054862842892768, + "grad_norm": 4.110210418701172, + "learning_rate": 1.678054862842893e-05, + "loss": 0.0694, + "step": 6460 + }, + { + "epoch": 0.8067331670822943, + "grad_norm": 0.1173863485455513, + "learning_rate": 1.6775561097256857e-05, + "loss": 0.0204, + "step": 6470 + }, + { + "epoch": 0.8079800498753117, + "grad_norm": 0.0829818993806839, + "learning_rate": 1.6770573566084788e-05, + "loss": 0.0636, + "step": 6480 + }, + { + "epoch": 0.8092269326683291, + "grad_norm": 0.2234659045934677, + "learning_rate": 1.676558603491272e-05, + "loss": 0.1161, + "step": 6490 + }, + { + "epoch": 0.8104738154613467, + "grad_norm": 16.363990783691406, + "learning_rate": 1.676059850374065e-05, + "loss": 0.0608, + "step": 6500 + }, + { + "epoch": 0.8117206982543641, + "grad_norm": 1.963186264038086, + "learning_rate": 1.675561097256858e-05, + "loss": 0.0062, + "step": 6510 + }, + { + "epoch": 0.8129675810473815, + "grad_norm": 0.018916714936494827, + "learning_rate": 1.675062344139651e-05, + "loss": 0.1132, + "step": 6520 + }, + { + "epoch": 0.814214463840399, + "grad_norm": 10.125832557678223, + "learning_rate": 1.674563591022444e-05, + "loss": 0.0553, + "step": 6530 + }, + { + "epoch": 0.8154613466334164, + "grad_norm": 0.034013696014881134, + "learning_rate": 1.6740648379052372e-05, + "loss": 0.0283, + "step": 6540 + }, + { + "epoch": 0.816708229426434, + "grad_norm": null, + "learning_rate": 1.6736159600997508e-05, + "loss": 0.0506, + "step": 6550 + }, + { + "epoch": 0.8179551122194514, + "grad_norm": 18.428625106811523, + "learning_rate": 1.673117206982544e-05, + "loss": 0.0326, + "step": 6560 + }, + { + "epoch": 0.8192019950124688, + "grad_norm": 18.967618942260742, + "learning_rate": 1.6726184538653366e-05, + "loss": 0.0608, + "step": 6570 + }, + { + "epoch": 0.8204488778054863, + "grad_norm": 9.87349796295166, + "learning_rate": 1.6721197007481297e-05, + "loss": 0.066, + "step": 6580 + }, + { + "epoch": 
0.8216957605985037, + "grad_norm": 0.5438946485519409, + "learning_rate": 1.6716209476309227e-05, + "loss": 0.0908, + "step": 6590 + }, + { + "epoch": 0.8229426433915212, + "grad_norm": 15.214516639709473, + "learning_rate": 1.6711221945137158e-05, + "loss": 0.0403, + "step": 6600 + }, + { + "epoch": 0.8241895261845387, + "grad_norm": 3.780027151107788, + "learning_rate": 1.670623441396509e-05, + "loss": 0.0534, + "step": 6610 + }, + { + "epoch": 0.8254364089775561, + "grad_norm": 0.5979506373405457, + "learning_rate": 1.670124688279302e-05, + "loss": 0.0648, + "step": 6620 + }, + { + "epoch": 0.8266832917705735, + "grad_norm": 0.009059612639248371, + "learning_rate": 1.669625935162095e-05, + "loss": 0.0309, + "step": 6630 + }, + { + "epoch": 0.827930174563591, + "grad_norm": 0.012184210121631622, + "learning_rate": 1.669127182044888e-05, + "loss": 0.059, + "step": 6640 + }, + { + "epoch": 0.8291770573566085, + "grad_norm": 0.005635616835206747, + "learning_rate": 1.6686284289276808e-05, + "loss": 0.003, + "step": 6650 + }, + { + "epoch": 0.830423940149626, + "grad_norm": 0.09480497986078262, + "learning_rate": 1.668129675810474e-05, + "loss": 0.075, + "step": 6660 + }, + { + "epoch": 0.8316708229426434, + "grad_norm": 0.024640752002596855, + "learning_rate": 1.667630922693267e-05, + "loss": 0.0301, + "step": 6670 + }, + { + "epoch": 0.8329177057356608, + "grad_norm": 0.7184823155403137, + "learning_rate": 1.66713216957606e-05, + "loss": 0.0377, + "step": 6680 + }, + { + "epoch": 0.8341645885286783, + "grad_norm": 38.88700485229492, + "learning_rate": 1.666633416458853e-05, + "loss": 0.0133, + "step": 6690 + }, + { + "epoch": 0.8354114713216958, + "grad_norm": 14.80359172821045, + "learning_rate": 1.666134663341646e-05, + "loss": 0.0799, + "step": 6700 + }, + { + "epoch": 0.8366583541147132, + "grad_norm": 0.06874788552522659, + "learning_rate": 1.665635910224439e-05, + "loss": 0.0161, + "step": 6710 + }, + { + "epoch": 0.8379052369077307, + "grad_norm": 
0.01392221450805664, + "learning_rate": 1.6651371571072322e-05, + "loss": 0.0068, + "step": 6720 + }, + { + "epoch": 0.8391521197007481, + "grad_norm": 0.03665358945727348, + "learning_rate": 1.664638403990025e-05, + "loss": 0.0389, + "step": 6730 + }, + { + "epoch": 0.8403990024937655, + "grad_norm": 0.10448456555604935, + "learning_rate": 1.664139650872818e-05, + "loss": 0.0791, + "step": 6740 + }, + { + "epoch": 0.8416458852867831, + "grad_norm": 0.1475229412317276, + "learning_rate": 1.663640897755611e-05, + "loss": 0.1261, + "step": 6750 + }, + { + "epoch": 0.8428927680798005, + "grad_norm": 5.749692916870117, + "learning_rate": 1.663142144638404e-05, + "loss": 0.0223, + "step": 6760 + }, + { + "epoch": 0.844139650872818, + "grad_norm": 20.07712173461914, + "learning_rate": 1.6626433915211972e-05, + "loss": 0.0936, + "step": 6770 + }, + { + "epoch": 0.8453865336658354, + "grad_norm": 6.963824272155762, + "learning_rate": 1.6621446384039902e-05, + "loss": 0.074, + "step": 6780 + }, + { + "epoch": 0.8466334164588528, + "grad_norm": 0.057135455310344696, + "learning_rate": 1.6616458852867833e-05, + "loss": 0.1048, + "step": 6790 + }, + { + "epoch": 0.8478802992518704, + "grad_norm": 0.057420868426561356, + "learning_rate": 1.6611471321695764e-05, + "loss": 0.0706, + "step": 6800 + }, + { + "epoch": 0.8491271820448878, + "grad_norm": 14.05396842956543, + "learning_rate": 1.6606483790523694e-05, + "loss": 0.098, + "step": 6810 + }, + { + "epoch": 0.8503740648379052, + "grad_norm": 0.3075912594795227, + "learning_rate": 1.660149625935162e-05, + "loss": 0.0735, + "step": 6820 + }, + { + "epoch": 0.8516209476309227, + "grad_norm": 0.05939553305506706, + "learning_rate": 1.6596508728179552e-05, + "loss": 0.0436, + "step": 6830 + }, + { + "epoch": 0.8528678304239401, + "grad_norm": 0.012576499953866005, + "learning_rate": 1.6591521197007483e-05, + "loss": 0.0904, + "step": 6840 + }, + { + "epoch": 0.8541147132169576, + "grad_norm": 35.2950325012207, + "learning_rate": 
1.6586533665835413e-05, + "loss": 0.0344, + "step": 6850 + }, + { + "epoch": 0.8553615960099751, + "grad_norm": 0.36233705282211304, + "learning_rate": 1.658154613466334e-05, + "loss": 0.0047, + "step": 6860 + }, + { + "epoch": 0.8566084788029925, + "grad_norm": 4.004518508911133, + "learning_rate": 1.6576558603491275e-05, + "loss": 0.0688, + "step": 6870 + }, + { + "epoch": 0.85785536159601, + "grad_norm": 48.95320129394531, + "learning_rate": 1.6571571072319205e-05, + "loss": 0.0667, + "step": 6880 + }, + { + "epoch": 0.8591022443890274, + "grad_norm": 30.85384178161621, + "learning_rate": 1.6566583541147136e-05, + "loss": 0.1294, + "step": 6890 + }, + { + "epoch": 0.8603491271820449, + "grad_norm": 0.15950468182563782, + "learning_rate": 1.6561596009975063e-05, + "loss": 0.1183, + "step": 6900 + }, + { + "epoch": 0.8615960099750624, + "grad_norm": 34.85268020629883, + "learning_rate": 1.6556608478802994e-05, + "loss": 0.0239, + "step": 6910 + }, + { + "epoch": 0.8628428927680798, + "grad_norm": 4.5437164306640625, + "learning_rate": 1.6551620947630924e-05, + "loss": 0.0897, + "step": 6920 + }, + { + "epoch": 0.8640897755610972, + "grad_norm": 0.06113681197166443, + "learning_rate": 1.6546633416458855e-05, + "loss": 0.076, + "step": 6930 + }, + { + "epoch": 0.8653366583541147, + "grad_norm": 0.1872865855693817, + "learning_rate": 1.6541645885286782e-05, + "loss": 0.1372, + "step": 6940 + }, + { + "epoch": 0.8665835411471322, + "grad_norm": 0.03576788678765297, + "learning_rate": 1.6536658354114713e-05, + "loss": 0.0023, + "step": 6950 + }, + { + "epoch": 0.8678304239401496, + "grad_norm": 0.040885813534259796, + "learning_rate": 1.6531670822942643e-05, + "loss": 0.0762, + "step": 6960 + }, + { + "epoch": 0.8690773067331671, + "grad_norm": 13.959723472595215, + "learning_rate": 1.6526683291770574e-05, + "loss": 0.0301, + "step": 6970 + }, + { + "epoch": 0.8703241895261845, + "grad_norm": 0.5481641292572021, + "learning_rate": 1.6521695760598505e-05, + "loss": 
0.0881, + "step": 6980 + }, + { + "epoch": 0.871571072319202, + "grad_norm": 0.04001607373356819, + "learning_rate": 1.6516708229426435e-05, + "loss": 0.0146, + "step": 6990 + }, + { + "epoch": 0.8728179551122195, + "grad_norm": 0.044817376881837845, + "learning_rate": 1.6511720698254366e-05, + "loss": 0.0412, + "step": 7000 + }, + { + "epoch": 0.8740648379052369, + "grad_norm": 0.027472732588648796, + "learning_rate": 1.6506733167082296e-05, + "loss": 0.0923, + "step": 7010 + }, + { + "epoch": 0.8753117206982544, + "grad_norm": 0.013095473870635033, + "learning_rate": 1.6501745635910227e-05, + "loss": 0.024, + "step": 7020 + }, + { + "epoch": 0.8765586034912718, + "grad_norm": 0.2899605333805084, + "learning_rate": 1.6496758104738154e-05, + "loss": 0.0865, + "step": 7030 + }, + { + "epoch": 0.8778054862842892, + "grad_norm": 0.07437755167484283, + "learning_rate": 1.6491770573566085e-05, + "loss": 0.0572, + "step": 7040 + }, + { + "epoch": 0.8790523690773068, + "grad_norm": 0.01817794144153595, + "learning_rate": 1.6486783042394016e-05, + "loss": 0.0611, + "step": 7050 + }, + { + "epoch": 0.8802992518703242, + "grad_norm": 2.1446876525878906, + "learning_rate": 1.6481795511221946e-05, + "loss": 0.1099, + "step": 7060 + }, + { + "epoch": 0.8815461346633416, + "grad_norm": 0.42781832814216614, + "learning_rate": 1.6476807980049877e-05, + "loss": 0.1443, + "step": 7070 + }, + { + "epoch": 0.8827930174563591, + "grad_norm": 0.02748112753033638, + "learning_rate": 1.6471820448877807e-05, + "loss": 0.0817, + "step": 7080 + }, + { + "epoch": 0.8840399002493765, + "grad_norm": 0.025286074727773666, + "learning_rate": 1.6466832917705738e-05, + "loss": 0.0076, + "step": 7090 + }, + { + "epoch": 0.885286783042394, + "grad_norm": 0.04752347618341446, + "learning_rate": 1.646184538653367e-05, + "loss": 0.0026, + "step": 7100 + }, + { + "epoch": 0.8865336658354115, + "grad_norm": 0.01973377726972103, + "learning_rate": 1.6456857855361596e-05, + "loss": 0.058, + "step": 7110 + 
}, + { + "epoch": 0.8877805486284289, + "grad_norm": 0.05321444198489189, + "learning_rate": 1.6451870324189526e-05, + "loss": 0.0864, + "step": 7120 + }, + { + "epoch": 0.8890274314214464, + "grad_norm": 0.23317782580852509, + "learning_rate": 1.6446882793017457e-05, + "loss": 0.0535, + "step": 7130 + }, + { + "epoch": 0.8902743142144638, + "grad_norm": 0.06026599556207657, + "learning_rate": 1.6441895261845388e-05, + "loss": 0.0212, + "step": 7140 + }, + { + "epoch": 0.8915211970074813, + "grad_norm": 33.116111755371094, + "learning_rate": 1.643690773067332e-05, + "loss": 0.0691, + "step": 7150 + }, + { + "epoch": 0.8927680798004988, + "grad_norm": 0.029996905475854874, + "learning_rate": 1.643192019950125e-05, + "loss": 0.0297, + "step": 7160 + }, + { + "epoch": 0.8940149625935162, + "grad_norm": 34.09917068481445, + "learning_rate": 1.642693266832918e-05, + "loss": 0.1996, + "step": 7170 + }, + { + "epoch": 0.8952618453865336, + "grad_norm": 14.287615776062012, + "learning_rate": 1.642194513715711e-05, + "loss": 0.0957, + "step": 7180 + }, + { + "epoch": 0.8965087281795511, + "grad_norm": 0.03745246306061745, + "learning_rate": 1.6416957605985037e-05, + "loss": 0.1025, + "step": 7190 + }, + { + "epoch": 0.8977556109725686, + "grad_norm": 1.472991943359375, + "learning_rate": 1.6411970074812968e-05, + "loss": 0.0165, + "step": 7200 + }, + { + "epoch": 0.899002493765586, + "grad_norm": 10.71480655670166, + "learning_rate": 1.64069825436409e-05, + "loss": 0.0921, + "step": 7210 + }, + { + "epoch": 0.9002493765586035, + "grad_norm": 10.575648307800293, + "learning_rate": 1.640199501246883e-05, + "loss": 0.032, + "step": 7220 + }, + { + "epoch": 0.9014962593516209, + "grad_norm": 0.08652441203594208, + "learning_rate": 1.639700748129676e-05, + "loss": 0.0408, + "step": 7230 + }, + { + "epoch": 0.9027431421446384, + "grad_norm": 0.15267132222652435, + "learning_rate": 1.639201995012469e-05, + "loss": 0.0088, + "step": 7240 + }, + { + "epoch": 0.9039900249376559, + 
"grad_norm": 0.027022046968340874, + "learning_rate": 1.638703241895262e-05, + "loss": 0.0372, + "step": 7250 + }, + { + "epoch": 0.9052369077306733, + "grad_norm": 0.02087612822651863, + "learning_rate": 1.6382044887780552e-05, + "loss": 0.0057, + "step": 7260 + }, + { + "epoch": 0.9064837905236908, + "grad_norm": 0.007287871092557907, + "learning_rate": 1.6377057356608482e-05, + "loss": 0.0169, + "step": 7270 + }, + { + "epoch": 0.9077306733167082, + "grad_norm": 1.5269182920455933, + "learning_rate": 1.637206982543641e-05, + "loss": 0.0477, + "step": 7280 + }, + { + "epoch": 0.9089775561097256, + "grad_norm": 0.03564201295375824, + "learning_rate": 1.636708229426434e-05, + "loss": 0.0856, + "step": 7290 + }, + { + "epoch": 0.9102244389027432, + "grad_norm": 0.033967025578022, + "learning_rate": 1.636209476309227e-05, + "loss": 0.0049, + "step": 7300 + }, + { + "epoch": 0.9114713216957606, + "grad_norm": 0.0063847266137599945, + "learning_rate": 1.63571072319202e-05, + "loss": 0.0798, + "step": 7310 + }, + { + "epoch": 0.912718204488778, + "grad_norm": 0.018450727686285973, + "learning_rate": 1.635211970074813e-05, + "loss": 0.1612, + "step": 7320 + }, + { + "epoch": 0.9139650872817955, + "grad_norm": 0.16694334149360657, + "learning_rate": 1.634713216957606e-05, + "loss": 0.0057, + "step": 7330 + }, + { + "epoch": 0.9152119700748129, + "grad_norm": 0.015072043053805828, + "learning_rate": 1.6342144638403993e-05, + "loss": 0.1719, + "step": 7340 + }, + { + "epoch": 0.9164588528678305, + "grad_norm": 0.013904220424592495, + "learning_rate": 1.6337157107231924e-05, + "loss": 0.0412, + "step": 7350 + }, + { + "epoch": 0.9177057356608479, + "grad_norm": 0.04418414086103439, + "learning_rate": 1.633216957605985e-05, + "loss": 0.0191, + "step": 7360 + }, + { + "epoch": 0.9189526184538653, + "grad_norm": 8.246712684631348, + "learning_rate": 1.6327182044887782e-05, + "loss": 0.0405, + "step": 7370 + }, + { + "epoch": 0.9201995012468828, + "grad_norm": 
0.02130391076207161, + "learning_rate": 1.6322194513715712e-05, + "loss": 0.0013, + "step": 7380 + }, + { + "epoch": 0.9214463840399002, + "grad_norm": 0.07500559091567993, + "learning_rate": 1.6317206982543643e-05, + "loss": 0.0669, + "step": 7390 + }, + { + "epoch": 0.9226932668329177, + "grad_norm": 0.14592240750789642, + "learning_rate": 1.631221945137157e-05, + "loss": 0.0455, + "step": 7400 + }, + { + "epoch": 0.9239401496259352, + "grad_norm": 0.12824711203575134, + "learning_rate": 1.63072319201995e-05, + "loss": 0.1218, + "step": 7410 + }, + { + "epoch": 0.9251870324189526, + "grad_norm": 29.46427345275879, + "learning_rate": 1.630224438902743e-05, + "loss": 0.0587, + "step": 7420 + }, + { + "epoch": 0.92643391521197, + "grad_norm": 0.04521302878856659, + "learning_rate": 1.6297256857855362e-05, + "loss": 0.0363, + "step": 7430 + }, + { + "epoch": 0.9276807980049875, + "grad_norm": 0.17441897094249725, + "learning_rate": 1.6292269326683293e-05, + "loss": 0.0856, + "step": 7440 + }, + { + "epoch": 0.928927680798005, + "grad_norm": 0.02611939236521721, + "learning_rate": 1.6287281795511223e-05, + "loss": 0.0012, + "step": 7450 + }, + { + "epoch": 0.9301745635910225, + "grad_norm": 0.2440149188041687, + "learning_rate": 1.6282294264339154e-05, + "loss": 0.0674, + "step": 7460 + }, + { + "epoch": 0.9314214463840399, + "grad_norm": 0.029897717759013176, + "learning_rate": 1.6277306733167085e-05, + "loss": 0.0296, + "step": 7470 + }, + { + "epoch": 0.9326683291770573, + "grad_norm": 0.022027527913451195, + "learning_rate": 1.6272319201995012e-05, + "loss": 0.0202, + "step": 7480 + }, + { + "epoch": 0.9339152119700748, + "grad_norm": 6.706576347351074, + "learning_rate": 1.6267331670822942e-05, + "loss": 0.0872, + "step": 7490 + }, + { + "epoch": 0.9351620947630923, + "grad_norm": 0.02055455558001995, + "learning_rate": 1.6262344139650873e-05, + "loss": 0.0233, + "step": 7500 + }, + { + "epoch": 0.9364089775561097, + "grad_norm": 0.25276321172714233, + 
"learning_rate": 1.6257356608478804e-05, + "loss": 0.0446, + "step": 7510 + }, + { + "epoch": 0.9376558603491272, + "grad_norm": 0.08888930082321167, + "learning_rate": 1.6252369077306734e-05, + "loss": 0.0068, + "step": 7520 + }, + { + "epoch": 0.9389027431421446, + "grad_norm": 1.5026854276657104, + "learning_rate": 1.6247381546134665e-05, + "loss": 0.0756, + "step": 7530 + }, + { + "epoch": 0.940149625935162, + "grad_norm": 0.0982433333992958, + "learning_rate": 1.6242394014962596e-05, + "loss": 0.2113, + "step": 7540 + }, + { + "epoch": 0.9413965087281796, + "grad_norm": 0.02279825694859028, + "learning_rate": 1.6237406483790526e-05, + "loss": 0.0353, + "step": 7550 + }, + { + "epoch": 0.942643391521197, + "grad_norm": 0.6083737015724182, + "learning_rate": 1.6232418952618457e-05, + "loss": 0.0942, + "step": 7560 + }, + { + "epoch": 0.9438902743142145, + "grad_norm": 0.08139675855636597, + "learning_rate": 1.6227431421446384e-05, + "loss": 0.0854, + "step": 7570 + }, + { + "epoch": 0.9451371571072319, + "grad_norm": 10.0869140625, + "learning_rate": 1.6222443890274315e-05, + "loss": 0.0515, + "step": 7580 + }, + { + "epoch": 0.9463840399002493, + "grad_norm": 1.962074875831604, + "learning_rate": 1.6217456359102245e-05, + "loss": 0.1042, + "step": 7590 + }, + { + "epoch": 0.9476309226932669, + "grad_norm": 4.804053783416748, + "learning_rate": 1.6212468827930176e-05, + "loss": 0.0433, + "step": 7600 + }, + { + "epoch": 0.9488778054862843, + "grad_norm": 0.2560361921787262, + "learning_rate": 1.6207481296758107e-05, + "loss": 0.0221, + "step": 7610 + }, + { + "epoch": 0.9501246882793017, + "grad_norm": 0.042769655585289, + "learning_rate": 1.6202493765586037e-05, + "loss": 0.0464, + "step": 7620 + }, + { + "epoch": 0.9513715710723192, + "grad_norm": 0.06933890283107758, + "learning_rate": 1.6197506234413968e-05, + "loss": 0.0249, + "step": 7630 + }, + { + "epoch": 0.9526184538653366, + "grad_norm": 9.516181945800781, + "learning_rate": 1.61925187032419e-05, + 
"loss": 0.0285, + "step": 7640 + }, + { + "epoch": 0.9538653366583542, + "grad_norm": 6.549726963043213, + "learning_rate": 1.6187531172069826e-05, + "loss": 0.0574, + "step": 7650 + }, + { + "epoch": 0.9551122194513716, + "grad_norm": 0.3095720410346985, + "learning_rate": 1.6182543640897756e-05, + "loss": 0.001, + "step": 7660 + }, + { + "epoch": 0.956359102244389, + "grad_norm": 38.27455139160156, + "learning_rate": 1.6177556109725687e-05, + "loss": 0.1111, + "step": 7670 + }, + { + "epoch": 0.9576059850374065, + "grad_norm": 21.775671005249023, + "learning_rate": 1.6172568578553617e-05, + "loss": 0.1198, + "step": 7680 + }, + { + "epoch": 0.9588528678304239, + "grad_norm": 3.9372639656066895, + "learning_rate": 1.6167581047381548e-05, + "loss": 0.0646, + "step": 7690 + }, + { + "epoch": 0.9600997506234414, + "grad_norm": 0.037020809948444366, + "learning_rate": 1.616259351620948e-05, + "loss": 0.094, + "step": 7700 + }, + { + "epoch": 0.9613466334164589, + "grad_norm": 26.675029754638672, + "learning_rate": 1.615760598503741e-05, + "loss": 0.0267, + "step": 7710 + }, + { + "epoch": 0.9625935162094763, + "grad_norm": 0.5148128271102905, + "learning_rate": 1.615261845386534e-05, + "loss": 0.0383, + "step": 7720 + }, + { + "epoch": 0.9638403990024937, + "grad_norm": 0.024793069809675217, + "learning_rate": 1.614763092269327e-05, + "loss": 0.0373, + "step": 7730 + }, + { + "epoch": 0.9650872817955112, + "grad_norm": 0.036278147250413895, + "learning_rate": 1.6142643391521198e-05, + "loss": 0.0275, + "step": 7740 + }, + { + "epoch": 0.9663341645885287, + "grad_norm": 37.560386657714844, + "learning_rate": 1.613765586034913e-05, + "loss": 0.1835, + "step": 7750 + }, + { + "epoch": 0.9675810473815462, + "grad_norm": 0.013977359049022198, + "learning_rate": 1.613266832917706e-05, + "loss": 0.0191, + "step": 7760 + }, + { + "epoch": 0.9688279301745636, + "grad_norm": 5.472465991973877, + "learning_rate": 1.612768079800499e-05, + "loss": 0.0751, + "step": 7770 + }, + { + 
"epoch": 0.970074812967581, + "grad_norm": 0.008948855102062225, + "learning_rate": 1.6122693266832917e-05, + "loss": 0.0461, + "step": 7780 + }, + { + "epoch": 0.9713216957605985, + "grad_norm": 0.03792479261755943, + "learning_rate": 1.6117705735660847e-05, + "loss": 0.1022, + "step": 7790 + }, + { + "epoch": 0.972568578553616, + "grad_norm": 13.02475357055664, + "learning_rate": 1.6112718204488778e-05, + "loss": 0.0728, + "step": 7800 + }, + { + "epoch": 0.9738154613466334, + "grad_norm": 1.2862738370895386, + "learning_rate": 1.6107730673316712e-05, + "loss": 0.0599, + "step": 7810 + }, + { + "epoch": 0.9750623441396509, + "grad_norm": 0.07089701294898987, + "learning_rate": 1.610274314214464e-05, + "loss": 0.1124, + "step": 7820 + }, + { + "epoch": 0.9763092269326683, + "grad_norm": 0.04689112678170204, + "learning_rate": 1.609775561097257e-05, + "loss": 0.0802, + "step": 7830 + }, + { + "epoch": 0.9775561097256857, + "grad_norm": 0.01914226822555065, + "learning_rate": 1.60927680798005e-05, + "loss": 0.096, + "step": 7840 + }, + { + "epoch": 0.9788029925187033, + "grad_norm": 0.05238153040409088, + "learning_rate": 1.608778054862843e-05, + "loss": 0.0627, + "step": 7850 + }, + { + "epoch": 0.9800498753117207, + "grad_norm": 0.011097576469182968, + "learning_rate": 1.608279301745636e-05, + "loss": 0.1045, + "step": 7860 + }, + { + "epoch": 0.9812967581047382, + "grad_norm": 0.01885097473859787, + "learning_rate": 1.607780548628429e-05, + "loss": 0.0469, + "step": 7870 + }, + { + "epoch": 0.9825436408977556, + "grad_norm": 15.283947944641113, + "learning_rate": 1.607281795511222e-05, + "loss": 0.114, + "step": 7880 + }, + { + "epoch": 0.983790523690773, + "grad_norm": 16.590517044067383, + "learning_rate": 1.606783042394015e-05, + "loss": 0.1792, + "step": 7890 + }, + { + "epoch": 0.9850374064837906, + "grad_norm": 0.038308024406433105, + "learning_rate": 1.606284289276808e-05, + "loss": 0.155, + "step": 7900 + }, + { + "epoch": 0.986284289276808, + 
"grad_norm": 0.04093620926141739, + "learning_rate": 1.605785536159601e-05, + "loss": 0.0326, + "step": 7910 + }, + { + "epoch": 0.9875311720698254, + "grad_norm": 0.23573100566864014, + "learning_rate": 1.6052867830423942e-05, + "loss": 0.041, + "step": 7920 + }, + { + "epoch": 0.9887780548628429, + "grad_norm": 0.023155642673373222, + "learning_rate": 1.6047880299251873e-05, + "loss": 0.0076, + "step": 7930 + }, + { + "epoch": 0.9900249376558603, + "grad_norm": 0.01830834336578846, + "learning_rate": 1.60428927680798e-05, + "loss": 0.0559, + "step": 7940 + }, + { + "epoch": 0.9912718204488778, + "grad_norm": 0.010360955260694027, + "learning_rate": 1.603790523690773e-05, + "loss": 0.0761, + "step": 7950 + }, + { + "epoch": 0.9925187032418953, + "grad_norm": 32.94133377075195, + "learning_rate": 1.603291770573566e-05, + "loss": 0.1009, + "step": 7960 + }, + { + "epoch": 0.9937655860349127, + "grad_norm": 5.8751630783081055, + "learning_rate": 1.6027930174563592e-05, + "loss": 0.0327, + "step": 7970 + }, + { + "epoch": 0.9950124688279302, + "grad_norm": 0.03165939822793007, + "learning_rate": 1.6022942643391522e-05, + "loss": 0.0085, + "step": 7980 + }, + { + "epoch": 0.9962593516209476, + "grad_norm": 33.61581039428711, + "learning_rate": 1.6017955112219453e-05, + "loss": 0.0331, + "step": 7990 + }, + { + "epoch": 0.9975062344139651, + "grad_norm": 0.19740180671215057, + "learning_rate": 1.6012967581047384e-05, + "loss": 0.0377, + "step": 8000 + }, + { + "epoch": 0.9987531172069826, + "grad_norm": 4.508144378662109, + "learning_rate": 1.6007980049875314e-05, + "loss": 0.0937, + "step": 8010 + }, + { + "epoch": 1.0, + "grad_norm": 0.10585662722587585, + "learning_rate": 1.6002992518703245e-05, + "loss": 0.0008, + "step": 8020 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.990273707837147, + "eval_loss": 0.045430995523929596, + "eval_runtime": 17.5172, + "eval_samples_per_second": 915.616, + "eval_steps_per_second": 57.258, + "step": 8020 + }, + { + "epoch": 
1.0012468827930174, + "grad_norm": 0.32553237676620483, + "learning_rate": 1.5998004987531172e-05, + "loss": 0.0008, + "step": 8030 + }, + { + "epoch": 1.0024937655860349, + "grad_norm": 42.72439193725586, + "learning_rate": 1.5993017456359103e-05, + "loss": 0.0373, + "step": 8040 + }, + { + "epoch": 1.0037406483790523, + "grad_norm": 0.026368023827672005, + "learning_rate": 1.5988029925187033e-05, + "loss": 0.001, + "step": 8050 + }, + { + "epoch": 1.0049875311720697, + "grad_norm": 0.006805025972425938, + "learning_rate": 1.5983042394014964e-05, + "loss": 0.0352, + "step": 8060 + }, + { + "epoch": 1.0062344139650872, + "grad_norm": 0.007014899980276823, + "learning_rate": 1.5978054862842895e-05, + "loss": 0.0006, + "step": 8070 + }, + { + "epoch": 1.0074812967581048, + "grad_norm": 0.007944006472826004, + "learning_rate": 1.5973067331670825e-05, + "loss": 0.0684, + "step": 8080 + }, + { + "epoch": 1.0087281795511223, + "grad_norm": 0.13892197608947754, + "learning_rate": 1.5968079800498756e-05, + "loss": 0.0397, + "step": 8090 + }, + { + "epoch": 1.0099750623441397, + "grad_norm": 0.007291398011147976, + "learning_rate": 1.5963092269326687e-05, + "loss": 0.028, + "step": 8100 + }, + { + "epoch": 1.0112219451371571, + "grad_norm": 0.007756800390779972, + "learning_rate": 1.5958104738154614e-05, + "loss": 0.0021, + "step": 8110 + }, + { + "epoch": 1.0124688279301746, + "grad_norm": 0.15053221583366394, + "learning_rate": 1.5953117206982544e-05, + "loss": 0.005, + "step": 8120 + }, + { + "epoch": 1.013715710723192, + "grad_norm": 0.029048511758446693, + "learning_rate": 1.5948129675810475e-05, + "loss": 0.0087, + "step": 8130 + }, + { + "epoch": 1.0149625935162094, + "grad_norm": 0.8408064842224121, + "learning_rate": 1.5943142144638406e-05, + "loss": 0.058, + "step": 8140 + }, + { + "epoch": 1.0162094763092269, + "grad_norm": 0.011089330539107323, + "learning_rate": 1.5938154613466333e-05, + "loss": 0.0189, + "step": 8150 + }, + { + "epoch": 1.0174563591022443, + 
"grad_norm": 0.1405433863401413, + "learning_rate": 1.5933167082294267e-05, + "loss": 0.0594, + "step": 8160 + }, + { + "epoch": 1.018703241895262, + "grad_norm": 0.020905321463942528, + "learning_rate": 1.5928179551122197e-05, + "loss": 0.0215, + "step": 8170 + }, + { + "epoch": 1.0199501246882794, + "grad_norm": 0.018605615943670273, + "learning_rate": 1.5923192019950128e-05, + "loss": 0.0087, + "step": 8180 + }, + { + "epoch": 1.0211970074812968, + "grad_norm": 0.017665930092334747, + "learning_rate": 1.5918204488778055e-05, + "loss": 0.0563, + "step": 8190 + }, + { + "epoch": 1.0224438902743143, + "grad_norm": 0.005314531270414591, + "learning_rate": 1.5913216957605986e-05, + "loss": 0.0773, + "step": 8200 + }, + { + "epoch": 1.0236907730673317, + "grad_norm": 74.58911895751953, + "learning_rate": 1.5908229426433917e-05, + "loss": 0.063, + "step": 8210 + }, + { + "epoch": 1.0249376558603491, + "grad_norm": 18.045536041259766, + "learning_rate": 1.5903241895261847e-05, + "loss": 0.06, + "step": 8220 + }, + { + "epoch": 1.0261845386533666, + "grad_norm": 26.885278701782227, + "learning_rate": 1.5898254364089778e-05, + "loss": 0.0587, + "step": 8230 + }, + { + "epoch": 1.027431421446384, + "grad_norm": 0.007998127490282059, + "learning_rate": 1.5893266832917705e-05, + "loss": 0.0427, + "step": 8240 + }, + { + "epoch": 1.0286783042394014, + "grad_norm": 0.010522495023906231, + "learning_rate": 1.5888279301745636e-05, + "loss": 0.0416, + "step": 8250 + }, + { + "epoch": 1.0299251870324189, + "grad_norm": 0.05275670811533928, + "learning_rate": 1.5883291770573566e-05, + "loss": 0.0016, + "step": 8260 + }, + { + "epoch": 1.0311720698254363, + "grad_norm": 0.1919606626033783, + "learning_rate": 1.5878304239401497e-05, + "loss": 0.0005, + "step": 8270 + }, + { + "epoch": 1.032418952618454, + "grad_norm": 0.13268867135047913, + "learning_rate": 1.5873316708229428e-05, + "loss": 0.0356, + "step": 8280 + }, + { + "epoch": 1.0336658354114714, + "grad_norm": 
0.007612032815814018, + "learning_rate": 1.5868329177057358e-05, + "loss": 0.0397, + "step": 8290 + }, + { + "epoch": 1.0349127182044888, + "grad_norm": 2.647951364517212, + "learning_rate": 1.586334164588529e-05, + "loss": 0.0069, + "step": 8300 + }, + { + "epoch": 1.0361596009975063, + "grad_norm": 0.402752548456192, + "learning_rate": 1.585835411471322e-05, + "loss": 0.0495, + "step": 8310 + }, + { + "epoch": 1.0374064837905237, + "grad_norm": 0.4408397972583771, + "learning_rate": 1.5853366583541147e-05, + "loss": 0.0061, + "step": 8320 + }, + { + "epoch": 1.0386533665835411, + "grad_norm": 0.057391341775655746, + "learning_rate": 1.5848379052369077e-05, + "loss": 0.0003, + "step": 8330 + }, + { + "epoch": 1.0399002493765586, + "grad_norm": 0.18651100993156433, + "learning_rate": 1.5843391521197008e-05, + "loss": 0.0004, + "step": 8340 + }, + { + "epoch": 1.041147132169576, + "grad_norm": 0.04775964096188545, + "learning_rate": 1.583840399002494e-05, + "loss": 0.0992, + "step": 8350 + }, + { + "epoch": 1.0423940149625934, + "grad_norm": 49.80867385864258, + "learning_rate": 1.583341645885287e-05, + "loss": 0.0121, + "step": 8360 + }, + { + "epoch": 1.043640897755611, + "grad_norm": 0.006509189493954182, + "learning_rate": 1.58284289276808e-05, + "loss": 0.0172, + "step": 8370 + }, + { + "epoch": 1.0448877805486285, + "grad_norm": 0.08325016498565674, + "learning_rate": 1.582344139650873e-05, + "loss": 0.0464, + "step": 8380 + }, + { + "epoch": 1.046134663341646, + "grad_norm": 0.007545843254774809, + "learning_rate": 1.581845386533666e-05, + "loss": 0.0351, + "step": 8390 + }, + { + "epoch": 1.0473815461346634, + "grad_norm": 0.013103622011840343, + "learning_rate": 1.5813466334164588e-05, + "loss": 0.0497, + "step": 8400 + }, + { + "epoch": 1.0486284289276808, + "grad_norm": 83.83370208740234, + "learning_rate": 1.580847880299252e-05, + "loss": 0.0322, + "step": 8410 + }, + { + "epoch": 1.0498753117206983, + "grad_norm": 0.11098748445510864, + "learning_rate": 
1.580349127182045e-05, + "loss": 0.0285, + "step": 8420 + }, + { + "epoch": 1.0511221945137157, + "grad_norm": 0.003446460235863924, + "learning_rate": 1.579850374064838e-05, + "loss": 0.0917, + "step": 8430 + }, + { + "epoch": 1.0523690773067331, + "grad_norm": 0.4329495131969452, + "learning_rate": 1.579351620947631e-05, + "loss": 0.004, + "step": 8440 + }, + { + "epoch": 1.0536159600997506, + "grad_norm": 0.017273055389523506, + "learning_rate": 1.578852867830424e-05, + "loss": 0.0577, + "step": 8450 + }, + { + "epoch": 1.054862842892768, + "grad_norm": 0.05111582577228546, + "learning_rate": 1.5783541147132172e-05, + "loss": 0.0187, + "step": 8460 + }, + { + "epoch": 1.0561097256857854, + "grad_norm": 19.794246673583984, + "learning_rate": 1.5778553615960103e-05, + "loss": 0.0926, + "step": 8470 + }, + { + "epoch": 1.057356608478803, + "grad_norm": 0.003474411554634571, + "learning_rate": 1.5773566084788033e-05, + "loss": 0.0509, + "step": 8480 + }, + { + "epoch": 1.0586034912718205, + "grad_norm": 0.0037080910988152027, + "learning_rate": 1.576857855361596e-05, + "loss": 0.0011, + "step": 8490 + }, + { + "epoch": 1.059850374064838, + "grad_norm": 0.19036197662353516, + "learning_rate": 1.576359102244389e-05, + "loss": 0.0427, + "step": 8500 + }, + { + "epoch": 1.0610972568578554, + "grad_norm": 0.0166669599711895, + "learning_rate": 1.575860349127182e-05, + "loss": 0.0569, + "step": 8510 + }, + { + "epoch": 1.0623441396508728, + "grad_norm": 0.02685883827507496, + "learning_rate": 1.5753615960099752e-05, + "loss": 0.0405, + "step": 8520 + }, + { + "epoch": 1.0635910224438903, + "grad_norm": 0.003904626239091158, + "learning_rate": 1.5748628428927683e-05, + "loss": 0.045, + "step": 8530 + }, + { + "epoch": 1.0648379052369077, + "grad_norm": 0.045057203620672226, + "learning_rate": 1.5743640897755613e-05, + "loss": 0.0318, + "step": 8540 + }, + { + "epoch": 1.0660847880299251, + "grad_norm": 37.062721252441406, + "learning_rate": 1.5738653366583544e-05, + 
"loss": 0.0082, + "step": 8550 + }, + { + "epoch": 1.0673316708229426, + "grad_norm": 2.563587188720703, + "learning_rate": 1.5733665835411475e-05, + "loss": 0.0009, + "step": 8560 + }, + { + "epoch": 1.0685785536159602, + "grad_norm": 16.887672424316406, + "learning_rate": 1.5728678304239402e-05, + "loss": 0.1206, + "step": 8570 + }, + { + "epoch": 1.0698254364089776, + "grad_norm": 0.2500365376472473, + "learning_rate": 1.5723690773067333e-05, + "loss": 0.044, + "step": 8580 + }, + { + "epoch": 1.071072319201995, + "grad_norm": 0.046570807695388794, + "learning_rate": 1.5718703241895263e-05, + "loss": 0.0458, + "step": 8590 + }, + { + "epoch": 1.0723192019950125, + "grad_norm": 6.890371322631836, + "learning_rate": 1.5713715710723194e-05, + "loss": 0.0034, + "step": 8600 + }, + { + "epoch": 1.07356608478803, + "grad_norm": 0.011060179211199284, + "learning_rate": 1.570872817955112e-05, + "loss": 0.1253, + "step": 8610 + }, + { + "epoch": 1.0748129675810474, + "grad_norm": 0.009608889929950237, + "learning_rate": 1.570374064837905e-05, + "loss": 0.0238, + "step": 8620 + }, + { + "epoch": 1.0760598503740648, + "grad_norm": 5.49344539642334, + "learning_rate": 1.5698753117206986e-05, + "loss": 0.1225, + "step": 8630 + }, + { + "epoch": 1.0773067331670823, + "grad_norm": 0.20928065478801727, + "learning_rate": 1.5693765586034916e-05, + "loss": 0.0018, + "step": 8640 + }, + { + "epoch": 1.0785536159600997, + "grad_norm": 0.00485580787062645, + "learning_rate": 1.5688778054862844e-05, + "loss": 0.0448, + "step": 8650 + }, + { + "epoch": 1.0798004987531171, + "grad_norm": 0.007073690649122, + "learning_rate": 1.5684289276807983e-05, + "loss": 0.0339, + "step": 8660 + }, + { + "epoch": 1.0810473815461346, + "grad_norm": 24.870899200439453, + "learning_rate": 1.567930174563591e-05, + "loss": 0.0324, + "step": 8670 + }, + { + "epoch": 1.0822942643391522, + "grad_norm": 0.05342073366045952, + "learning_rate": 1.567431421446384e-05, + "loss": 0.0957, + "step": 8680 + }, + { 
+ "epoch": 1.0835411471321696, + "grad_norm": 0.01296048704534769, + "learning_rate": 1.5669326683291772e-05, + "loss": 0.0448, + "step": 8690 + }, + { + "epoch": 1.084788029925187, + "grad_norm": 0.008593573234975338, + "learning_rate": 1.5664339152119703e-05, + "loss": 0.0046, + "step": 8700 + }, + { + "epoch": 1.0860349127182045, + "grad_norm": 0.13003209233283997, + "learning_rate": 1.5659351620947633e-05, + "loss": 0.0026, + "step": 8710 + }, + { + "epoch": 1.087281795511222, + "grad_norm": 0.06288526207208633, + "learning_rate": 1.5654364089775564e-05, + "loss": 0.0531, + "step": 8720 + }, + { + "epoch": 1.0885286783042394, + "grad_norm": 0.01975036971271038, + "learning_rate": 1.5649376558603494e-05, + "loss": 0.0335, + "step": 8730 + }, + { + "epoch": 1.0897755610972568, + "grad_norm": 0.0037824225146323442, + "learning_rate": 1.5644389027431425e-05, + "loss": 0.0229, + "step": 8740 + }, + { + "epoch": 1.0910224438902743, + "grad_norm": 0.01450043823570013, + "learning_rate": 1.5639401496259352e-05, + "loss": 0.0006, + "step": 8750 + }, + { + "epoch": 1.0922693266832917, + "grad_norm": 0.015385148115456104, + "learning_rate": 1.5634413965087283e-05, + "loss": 0.0302, + "step": 8760 + }, + { + "epoch": 1.0935162094763093, + "grad_norm": 0.04634137824177742, + "learning_rate": 1.5629426433915213e-05, + "loss": 0.0027, + "step": 8770 + }, + { + "epoch": 1.0947630922693268, + "grad_norm": 0.01995709352195263, + "learning_rate": 1.5624438902743144e-05, + "loss": 0.0533, + "step": 8780 + }, + { + "epoch": 1.0960099750623442, + "grad_norm": 0.9856222867965698, + "learning_rate": 1.561945137157107e-05, + "loss": 0.109, + "step": 8790 + }, + { + "epoch": 1.0972568578553616, + "grad_norm": 0.011172446422278881, + "learning_rate": 1.5614463840399002e-05, + "loss": 0.0347, + "step": 8800 + }, + { + "epoch": 1.098503740648379, + "grad_norm": 63.454627990722656, + "learning_rate": 1.5609476309226933e-05, + "loss": 0.0124, + "step": 8810 + }, + { + "epoch": 
1.0997506234413965, + "grad_norm": 0.06844215095043182, + "learning_rate": 1.5604488778054863e-05, + "loss": 0.0149, + "step": 8820 + }, + { + "epoch": 1.100997506234414, + "grad_norm": 13.142332077026367, + "learning_rate": 1.5599501246882794e-05, + "loss": 0.0348, + "step": 8830 + }, + { + "epoch": 1.1022443890274314, + "grad_norm": 0.0036785067059099674, + "learning_rate": 1.5594513715710724e-05, + "loss": 0.0002, + "step": 8840 + }, + { + "epoch": 1.1034912718204488, + "grad_norm": 0.002891476731747389, + "learning_rate": 1.5589526184538655e-05, + "loss": 0.0345, + "step": 8850 + }, + { + "epoch": 1.1047381546134662, + "grad_norm": 0.021872134879231453, + "learning_rate": 1.5584538653366586e-05, + "loss": 0.0163, + "step": 8860 + }, + { + "epoch": 1.1059850374064837, + "grad_norm": 0.09742742776870728, + "learning_rate": 1.5579551122194513e-05, + "loss": 0.1558, + "step": 8870 + }, + { + "epoch": 1.1072319201995013, + "grad_norm": 0.005483236163854599, + "learning_rate": 1.5574563591022444e-05, + "loss": 0.0449, + "step": 8880 + }, + { + "epoch": 1.1084788029925188, + "grad_norm": 14.885406494140625, + "learning_rate": 1.5569576059850374e-05, + "loss": 0.0061, + "step": 8890 + }, + { + "epoch": 1.1097256857855362, + "grad_norm": 5.764937400817871, + "learning_rate": 1.5564588528678305e-05, + "loss": 0.1297, + "step": 8900 + }, + { + "epoch": 1.1109725685785536, + "grad_norm": 0.0030513282399624586, + "learning_rate": 1.5559600997506235e-05, + "loss": 0.1447, + "step": 8910 + }, + { + "epoch": 1.112219451371571, + "grad_norm": 0.19194713234901428, + "learning_rate": 1.5554613466334166e-05, + "loss": 0.0968, + "step": 8920 + }, + { + "epoch": 1.1134663341645885, + "grad_norm": 0.012002095580101013, + "learning_rate": 1.5549625935162097e-05, + "loss": 0.0003, + "step": 8930 + }, + { + "epoch": 1.114713216957606, + "grad_norm": 0.005370179656893015, + "learning_rate": 1.5544638403990027e-05, + "loss": 0.0005, + "step": 8940 + }, + { + "epoch": 1.1159600997506234, + 
"grad_norm": 0.054277919232845306, + "learning_rate": 1.5539650872817958e-05, + "loss": 0.0459, + "step": 8950 + }, + { + "epoch": 1.1172069825436408, + "grad_norm": 0.006045327056199312, + "learning_rate": 1.5534663341645885e-05, + "loss": 0.064, + "step": 8960 + }, + { + "epoch": 1.1184538653366585, + "grad_norm": 0.020867647603154182, + "learning_rate": 1.5529675810473816e-05, + "loss": 0.0516, + "step": 8970 + }, + { + "epoch": 1.119700748129676, + "grad_norm": 0.011230267584323883, + "learning_rate": 1.5524688279301746e-05, + "loss": 0.001, + "step": 8980 + }, + { + "epoch": 1.1209476309226933, + "grad_norm": 0.05982297286391258, + "learning_rate": 1.5519700748129677e-05, + "loss": 0.0004, + "step": 8990 + }, + { + "epoch": 1.1221945137157108, + "grad_norm": 9.108420372009277, + "learning_rate": 1.5514713216957608e-05, + "loss": 0.0456, + "step": 9000 + }, + { + "epoch": 1.1234413965087282, + "grad_norm": 0.022048041224479675, + "learning_rate": 1.5509725685785538e-05, + "loss": 0.0168, + "step": 9010 + }, + { + "epoch": 1.1246882793017456, + "grad_norm": 0.010482951998710632, + "learning_rate": 1.550473815461347e-05, + "loss": 0.022, + "step": 9020 + }, + { + "epoch": 1.125935162094763, + "grad_norm": 57.64718246459961, + "learning_rate": 1.54997506234414e-05, + "loss": 0.0169, + "step": 9030 + }, + { + "epoch": 1.1271820448877805, + "grad_norm": 0.008993241004645824, + "learning_rate": 1.5494763092269327e-05, + "loss": 0.0206, + "step": 9040 + }, + { + "epoch": 1.128428927680798, + "grad_norm": 0.4927586019039154, + "learning_rate": 1.5489775561097257e-05, + "loss": 0.0098, + "step": 9050 + }, + { + "epoch": 1.1296758104738154, + "grad_norm": 0.0061515928246080875, + "learning_rate": 1.5484788029925188e-05, + "loss": 0.134, + "step": 9060 + }, + { + "epoch": 1.1309226932668328, + "grad_norm": 0.20207062363624573, + "learning_rate": 1.547980049875312e-05, + "loss": 0.1086, + "step": 9070 + }, + { + "epoch": 1.1321695760598505, + "grad_norm": 
0.006458123680204153, + "learning_rate": 1.547481296758105e-05, + "loss": 0.0006, + "step": 9080 + }, + { + "epoch": 1.133416458852868, + "grad_norm": 0.19042325019836426, + "learning_rate": 1.546982543640898e-05, + "loss": 0.0299, + "step": 9090 + }, + { + "epoch": 1.1346633416458853, + "grad_norm": 0.015476653352379799, + "learning_rate": 1.546483790523691e-05, + "loss": 0.0454, + "step": 9100 + }, + { + "epoch": 1.1359102244389028, + "grad_norm": 0.015153790824115276, + "learning_rate": 1.545985037406484e-05, + "loss": 0.0818, + "step": 9110 + }, + { + "epoch": 1.1371571072319202, + "grad_norm": 19.77805519104004, + "learning_rate": 1.5454862842892768e-05, + "loss": 0.1194, + "step": 9120 + }, + { + "epoch": 1.1384039900249376, + "grad_norm": 0.10593175888061523, + "learning_rate": 1.54498753117207e-05, + "loss": 0.0019, + "step": 9130 + }, + { + "epoch": 1.139650872817955, + "grad_norm": 0.013980482704937458, + "learning_rate": 1.544488778054863e-05, + "loss": 0.0429, + "step": 9140 + }, + { + "epoch": 1.1408977556109725, + "grad_norm": 0.025008754804730415, + "learning_rate": 1.543990024937656e-05, + "loss": 0.0121, + "step": 9150 + }, + { + "epoch": 1.14214463840399, + "grad_norm": 0.19609186053276062, + "learning_rate": 1.543491271820449e-05, + "loss": 0.0212, + "step": 9160 + }, + { + "epoch": 1.1433915211970076, + "grad_norm": 0.013282079249620438, + "learning_rate": 1.5429925187032418e-05, + "loss": 0.0507, + "step": 9170 + }, + { + "epoch": 1.144638403990025, + "grad_norm": 0.003723228583112359, + "learning_rate": 1.5424937655860352e-05, + "loss": 0.0539, + "step": 9180 + }, + { + "epoch": 1.1458852867830425, + "grad_norm": 5.059774398803711, + "learning_rate": 1.5419950124688283e-05, + "loss": 0.0509, + "step": 9190 + }, + { + "epoch": 1.14713216957606, + "grad_norm": 0.01049418281763792, + "learning_rate": 1.5414962593516213e-05, + "loss": 0.1348, + "step": 9200 + }, + { + "epoch": 1.1483790523690773, + "grad_norm": 0.020952526479959488, + 
"learning_rate": 1.540997506234414e-05, + "loss": 0.0618, + "step": 9210 + }, + { + "epoch": 1.1496259351620948, + "grad_norm": 0.03677148371934891, + "learning_rate": 1.540498753117207e-05, + "loss": 0.0568, + "step": 9220 + }, + { + "epoch": 1.1508728179551122, + "grad_norm": 0.006018058396875858, + "learning_rate": 1.54e-05, + "loss": 0.0224, + "step": 9230 + }, + { + "epoch": 1.1521197007481296, + "grad_norm": 16.032939910888672, + "learning_rate": 1.5395012468827932e-05, + "loss": 0.0931, + "step": 9240 + }, + { + "epoch": 1.153366583541147, + "grad_norm": 0.0084098344668746, + "learning_rate": 1.539002493765586e-05, + "loss": 0.0289, + "step": 9250 + }, + { + "epoch": 1.1546134663341645, + "grad_norm": 0.015302467159926891, + "learning_rate": 1.538503740648379e-05, + "loss": 0.0055, + "step": 9260 + }, + { + "epoch": 1.155860349127182, + "grad_norm": 0.08080907166004181, + "learning_rate": 1.538004987531172e-05, + "loss": 0.0461, + "step": 9270 + }, + { + "epoch": 1.1571072319201996, + "grad_norm": 0.13290953636169434, + "learning_rate": 1.537506234413965e-05, + "loss": 0.0356, + "step": 9280 + }, + { + "epoch": 1.158354114713217, + "grad_norm": 0.007370785344392061, + "learning_rate": 1.5370074812967582e-05, + "loss": 0.0503, + "step": 9290 + }, + { + "epoch": 1.1596009975062345, + "grad_norm": 0.4330526292324066, + "learning_rate": 1.5365087281795513e-05, + "loss": 0.0256, + "step": 9300 + }, + { + "epoch": 1.160847880299252, + "grad_norm": 0.03412933647632599, + "learning_rate": 1.5360099750623443e-05, + "loss": 0.0217, + "step": 9310 + }, + { + "epoch": 1.1620947630922693, + "grad_norm": 0.03334099426865578, + "learning_rate": 1.5355112219451374e-05, + "loss": 0.0815, + "step": 9320 + }, + { + "epoch": 1.1633416458852868, + "grad_norm": 0.007466933690011501, + "learning_rate": 1.53501246882793e-05, + "loss": 0.0378, + "step": 9330 + }, + { + "epoch": 1.1645885286783042, + "grad_norm": 11.656465530395508, + "learning_rate": 1.534513715710723e-05, + "loss": 
0.0019, + "step": 9340 + }, + { + "epoch": 1.1658354114713216, + "grad_norm": 0.007075449451804161, + "learning_rate": 1.5340149625935162e-05, + "loss": 0.0461, + "step": 9350 + }, + { + "epoch": 1.167082294264339, + "grad_norm": 0.012630700133740902, + "learning_rate": 1.5335162094763093e-05, + "loss": 0.0148, + "step": 9360 + }, + { + "epoch": 1.1683291770573567, + "grad_norm": 0.0029928339645266533, + "learning_rate": 1.5330174563591024e-05, + "loss": 0.0267, + "step": 9370 + }, + { + "epoch": 1.1695760598503742, + "grad_norm": 0.007222963962703943, + "learning_rate": 1.5325187032418954e-05, + "loss": 0.0554, + "step": 9380 + }, + { + "epoch": 1.1708229426433916, + "grad_norm": 26.852447509765625, + "learning_rate": 1.5320199501246885e-05, + "loss": 0.0709, + "step": 9390 + }, + { + "epoch": 1.172069825436409, + "grad_norm": 4.241142272949219, + "learning_rate": 1.5315211970074815e-05, + "loss": 0.082, + "step": 9400 + }, + { + "epoch": 1.1733167082294265, + "grad_norm": 0.007325719576328993, + "learning_rate": 1.5310224438902746e-05, + "loss": 0.0004, + "step": 9410 + }, + { + "epoch": 1.174563591022444, + "grad_norm": 3.9386239051818848, + "learning_rate": 1.5305236907730673e-05, + "loss": 0.0012, + "step": 9420 + }, + { + "epoch": 1.1758104738154613, + "grad_norm": 0.01459722314029932, + "learning_rate": 1.5300249376558604e-05, + "loss": 0.0516, + "step": 9430 + }, + { + "epoch": 1.1770573566084788, + "grad_norm": 2.8290350437164307, + "learning_rate": 1.5295261845386534e-05, + "loss": 0.0472, + "step": 9440 + }, + { + "epoch": 1.1783042394014962, + "grad_norm": 0.013476746156811714, + "learning_rate": 1.5290274314214465e-05, + "loss": 0.0376, + "step": 9450 + }, + { + "epoch": 1.1795511221945136, + "grad_norm": 0.18037371337413788, + "learning_rate": 1.5285286783042396e-05, + "loss": 0.0848, + "step": 9460 + }, + { + "epoch": 1.180798004987531, + "grad_norm": 1.987277865409851, + "learning_rate": 1.5280299251870326e-05, + "loss": 0.0579, + "step": 9470 + }, 
+ { + "epoch": 1.1820448877805487, + "grad_norm": 0.14287962019443512, + "learning_rate": 1.5275311720698257e-05, + "loss": 0.0394, + "step": 9480 + }, + { + "epoch": 1.1832917705735662, + "grad_norm": 0.09158501774072647, + "learning_rate": 1.5270324189526188e-05, + "loss": 0.0104, + "step": 9490 + }, + { + "epoch": 1.1845386533665836, + "grad_norm": 0.1351417750120163, + "learning_rate": 1.5265336658354115e-05, + "loss": 0.0261, + "step": 9500 + }, + { + "epoch": 1.185785536159601, + "grad_norm": 0.029170798137784004, + "learning_rate": 1.5260349127182045e-05, + "loss": 0.1452, + "step": 9510 + }, + { + "epoch": 1.1870324189526185, + "grad_norm": 0.015503687784075737, + "learning_rate": 1.5255361596009976e-05, + "loss": 0.0232, + "step": 9520 + }, + { + "epoch": 1.188279301745636, + "grad_norm": 0.00761051382869482, + "learning_rate": 1.5250374064837907e-05, + "loss": 0.0007, + "step": 9530 + }, + { + "epoch": 1.1895261845386533, + "grad_norm": 0.03034345805644989, + "learning_rate": 1.5245386533665836e-05, + "loss": 0.0061, + "step": 9540 + }, + { + "epoch": 1.1907730673316708, + "grad_norm": 0.20128989219665527, + "learning_rate": 1.5240399002493766e-05, + "loss": 0.1083, + "step": 9550 + }, + { + "epoch": 1.1920199501246882, + "grad_norm": 1.2821415662765503, + "learning_rate": 1.5235411471321697e-05, + "loss": 0.0026, + "step": 9560 + }, + { + "epoch": 1.1932668329177059, + "grad_norm": 0.18182295560836792, + "learning_rate": 1.5230423940149627e-05, + "loss": 0.0444, + "step": 9570 + }, + { + "epoch": 1.1945137157107233, + "grad_norm": 0.008188813924789429, + "learning_rate": 1.5225436408977556e-05, + "loss": 0.0649, + "step": 9580 + }, + { + "epoch": 1.1957605985037407, + "grad_norm": 0.0054941922426223755, + "learning_rate": 1.5220448877805487e-05, + "loss": 0.0004, + "step": 9590 + }, + { + "epoch": 1.1970074812967582, + "grad_norm": 0.005160823930054903, + "learning_rate": 1.5215461346633418e-05, + "loss": 0.0357, + "step": 9600 + }, + { + "epoch": 
1.1982543640897756, + "grad_norm": 0.010838705115020275, + "learning_rate": 1.5210473815461348e-05, + "loss": 0.049, + "step": 9610 + }, + { + "epoch": 1.199501246882793, + "grad_norm": 0.008219705894589424, + "learning_rate": 1.5205486284289277e-05, + "loss": 0.0287, + "step": 9620 + }, + { + "epoch": 1.2007481296758105, + "grad_norm": 0.12174854427576065, + "learning_rate": 1.5200498753117208e-05, + "loss": 0.0009, + "step": 9630 + }, + { + "epoch": 1.201995012468828, + "grad_norm": 0.006584263406693935, + "learning_rate": 1.5195511221945138e-05, + "loss": 0.0149, + "step": 9640 + }, + { + "epoch": 1.2032418952618453, + "grad_norm": 0.0048215193673968315, + "learning_rate": 1.5190523690773069e-05, + "loss": 0.002, + "step": 9650 + }, + { + "epoch": 1.2044887780548628, + "grad_norm": 0.08884479105472565, + "learning_rate": 1.5185536159601e-05, + "loss": 0.0005, + "step": 9660 + }, + { + "epoch": 1.2057356608478802, + "grad_norm": 27.005584716796875, + "learning_rate": 1.5180548628428929e-05, + "loss": 0.0372, + "step": 9670 + }, + { + "epoch": 1.2069825436408976, + "grad_norm": 0.00920418743044138, + "learning_rate": 1.517556109725686e-05, + "loss": 0.0308, + "step": 9680 + }, + { + "epoch": 1.2082294264339153, + "grad_norm": 0.009550421498715878, + "learning_rate": 1.517057356608479e-05, + "loss": 0.0009, + "step": 9690 + }, + { + "epoch": 1.2094763092269327, + "grad_norm": 0.8975104689598083, + "learning_rate": 1.516558603491272e-05, + "loss": 0.0436, + "step": 9700 + }, + { + "epoch": 1.2107231920199502, + "grad_norm": 0.02298056147992611, + "learning_rate": 1.516059850374065e-05, + "loss": 0.0955, + "step": 9710 + }, + { + "epoch": 1.2119700748129676, + "grad_norm": 0.002887186361476779, + "learning_rate": 1.515561097256858e-05, + "loss": 0.0004, + "step": 9720 + }, + { + "epoch": 1.213216957605985, + "grad_norm": 0.015636254101991653, + "learning_rate": 1.515062344139651e-05, + "loss": 0.0628, + "step": 9730 + }, + { + "epoch": 1.2144638403990025, + 
"grad_norm": 24.223073959350586, + "learning_rate": 1.5145635910224441e-05, + "loss": 0.0733, + "step": 9740 + }, + { + "epoch": 1.21571072319202, + "grad_norm": 0.024109138175845146, + "learning_rate": 1.514064837905237e-05, + "loss": 0.0172, + "step": 9750 + }, + { + "epoch": 1.2169576059850373, + "grad_norm": 0.006183837074786425, + "learning_rate": 1.51356608478803e-05, + "loss": 0.1089, + "step": 9760 + }, + { + "epoch": 1.218204488778055, + "grad_norm": 0.013143481686711311, + "learning_rate": 1.5130673316708231e-05, + "loss": 0.0008, + "step": 9770 + }, + { + "epoch": 1.2194513715710724, + "grad_norm": 8.031755447387695, + "learning_rate": 1.5125685785536162e-05, + "loss": 0.0874, + "step": 9780 + }, + { + "epoch": 1.2206982543640899, + "grad_norm": 0.026733698323369026, + "learning_rate": 1.5120698254364091e-05, + "loss": 0.0888, + "step": 9790 + }, + { + "epoch": 1.2219451371571073, + "grad_norm": 0.8957703113555908, + "learning_rate": 1.5115710723192022e-05, + "loss": 0.0762, + "step": 9800 + }, + { + "epoch": 1.2231920199501247, + "grad_norm": 5.752279758453369, + "learning_rate": 1.5110723192019952e-05, + "loss": 0.0462, + "step": 9810 + }, + { + "epoch": 1.2244389027431422, + "grad_norm": 0.04652789980173111, + "learning_rate": 1.5105735660847883e-05, + "loss": 0.0072, + "step": 9820 + }, + { + "epoch": 1.2256857855361596, + "grad_norm": 0.035636574029922485, + "learning_rate": 1.510074812967581e-05, + "loss": 0.0919, + "step": 9830 + }, + { + "epoch": 1.226932668329177, + "grad_norm": 0.010265025310218334, + "learning_rate": 1.509576059850374e-05, + "loss": 0.0213, + "step": 9840 + }, + { + "epoch": 1.2281795511221945, + "grad_norm": 31.54733657836914, + "learning_rate": 1.5090773067331673e-05, + "loss": 0.0269, + "step": 9850 + }, + { + "epoch": 1.229426433915212, + "grad_norm": 0.1989414095878601, + "learning_rate": 1.5085785536159604e-05, + "loss": 0.0064, + "step": 9860 + }, + { + "epoch": 1.2306733167082293, + "grad_norm": 0.182249516248703, + 
"learning_rate": 1.508079800498753e-05, + "loss": 0.0546, + "step": 9870 + }, + { + "epoch": 1.2319201995012468, + "grad_norm": 0.022076517343521118, + "learning_rate": 1.5075810473815461e-05, + "loss": 0.0712, + "step": 9880 + }, + { + "epoch": 1.2331670822942644, + "grad_norm": 0.8623358011245728, + "learning_rate": 1.5070822942643392e-05, + "loss": 0.0008, + "step": 9890 + }, + { + "epoch": 1.2344139650872819, + "grad_norm": 0.004727656487375498, + "learning_rate": 1.5065835411471323e-05, + "loss": 0.017, + "step": 9900 + }, + { + "epoch": 1.2356608478802993, + "grad_norm": 0.03601466864347458, + "learning_rate": 1.5060847880299255e-05, + "loss": 0.0348, + "step": 9910 + }, + { + "epoch": 1.2369077306733167, + "grad_norm": 0.004477897193282843, + "learning_rate": 1.5055860349127182e-05, + "loss": 0.0005, + "step": 9920 + }, + { + "epoch": 1.2381546134663342, + "grad_norm": 25.25444221496582, + "learning_rate": 1.5050872817955113e-05, + "loss": 0.14, + "step": 9930 + }, + { + "epoch": 1.2394014962593516, + "grad_norm": 0.05620008334517479, + "learning_rate": 1.5045885286783043e-05, + "loss": 0.0469, + "step": 9940 + }, + { + "epoch": 1.240648379052369, + "grad_norm": 0.049697790294885635, + "learning_rate": 1.5040897755610974e-05, + "loss": 0.0004, + "step": 9950 + }, + { + "epoch": 1.2418952618453865, + "grad_norm": 0.041144270449876785, + "learning_rate": 1.5035910224438903e-05, + "loss": 0.0318, + "step": 9960 + }, + { + "epoch": 1.2431421446384041, + "grad_norm": 0.009246827103197575, + "learning_rate": 1.5030922693266834e-05, + "loss": 0.0779, + "step": 9970 + }, + { + "epoch": 1.2443890274314215, + "grad_norm": 0.18010537326335907, + "learning_rate": 1.5025935162094764e-05, + "loss": 0.0013, + "step": 9980 + }, + { + "epoch": 1.245635910224439, + "grad_norm": 0.0025472913403064013, + "learning_rate": 1.5020947630922695e-05, + "loss": 0.0244, + "step": 9990 + }, + { + "epoch": 1.2468827930174564, + "grad_norm": 35.89253616333008, + "learning_rate": 
1.5015960099750624e-05, + "loss": 0.0626, + "step": 10000 + }, + { + "epoch": 1.2481296758104738, + "grad_norm": 0.004349586088210344, + "learning_rate": 1.5010972568578554e-05, + "loss": 0.0659, + "step": 10010 + }, + { + "epoch": 1.2493765586034913, + "grad_norm": 0.0075108809396624565, + "learning_rate": 1.5005985037406485e-05, + "loss": 0.0926, + "step": 10020 + }, + { + "epoch": 1.2506234413965087, + "grad_norm": 21.287639617919922, + "learning_rate": 1.5000997506234416e-05, + "loss": 0.0954, + "step": 10030 + }, + { + "epoch": 1.2518703241895262, + "grad_norm": 0.01195969246327877, + "learning_rate": 1.4996009975062345e-05, + "loss": 0.0552, + "step": 10040 + }, + { + "epoch": 1.2531172069825436, + "grad_norm": 0.0076157813891768456, + "learning_rate": 1.4991022443890275e-05, + "loss": 0.0252, + "step": 10050 + }, + { + "epoch": 1.254364089775561, + "grad_norm": 0.005119314882904291, + "learning_rate": 1.4986034912718206e-05, + "loss": 0.0726, + "step": 10060 + }, + { + "epoch": 1.2556109725685785, + "grad_norm": 0.005796648096293211, + "learning_rate": 1.4981047381546136e-05, + "loss": 0.0626, + "step": 10070 + }, + { + "epoch": 1.2568578553615959, + "grad_norm": 0.007139543071389198, + "learning_rate": 1.4976059850374065e-05, + "loss": 0.0278, + "step": 10080 + }, + { + "epoch": 1.2581047381546135, + "grad_norm": 0.003264715662226081, + "learning_rate": 1.4971072319201996e-05, + "loss": 0.0508, + "step": 10090 + }, + { + "epoch": 1.259351620947631, + "grad_norm": 34.05919647216797, + "learning_rate": 1.4966084788029927e-05, + "loss": 0.0393, + "step": 10100 + }, + { + "epoch": 1.2605985037406484, + "grad_norm": 0.0849919244647026, + "learning_rate": 1.4961097256857857e-05, + "loss": 0.0342, + "step": 10110 + }, + { + "epoch": 1.2618453865336658, + "grad_norm": 0.14518681168556213, + "learning_rate": 1.4956109725685786e-05, + "loss": 0.0008, + "step": 10120 + }, + { + "epoch": 1.2630922693266833, + "grad_norm": 0.1475505381822586, + "learning_rate": 
1.4951122194513717e-05, + "loss": 0.018, + "step": 10130 + }, + { + "epoch": 1.2643391521197007, + "grad_norm": 0.006064166314899921, + "learning_rate": 1.4946134663341647e-05, + "loss": 0.0796, + "step": 10140 + }, + { + "epoch": 1.2655860349127181, + "grad_norm": 0.004532515071332455, + "learning_rate": 1.4941147132169578e-05, + "loss": 0.0015, + "step": 10150 + }, + { + "epoch": 1.2668329177057356, + "grad_norm": 0.04352598637342453, + "learning_rate": 1.4936159600997509e-05, + "loss": 0.0003, + "step": 10160 + }, + { + "epoch": 1.2680798004987532, + "grad_norm": 0.49034398794174194, + "learning_rate": 1.4931172069825438e-05, + "loss": 0.0086, + "step": 10170 + }, + { + "epoch": 1.2693266832917707, + "grad_norm": 0.003610881045460701, + "learning_rate": 1.4926184538653368e-05, + "loss": 0.0022, + "step": 10180 + }, + { + "epoch": 1.270573566084788, + "grad_norm": 0.003048134967684746, + "learning_rate": 1.4921197007481299e-05, + "loss": 0.1527, + "step": 10190 + }, + { + "epoch": 1.2718204488778055, + "grad_norm": 0.06748661398887634, + "learning_rate": 1.491620947630923e-05, + "loss": 0.0343, + "step": 10200 + }, + { + "epoch": 1.273067331670823, + "grad_norm": 0.014995983801782131, + "learning_rate": 1.4911221945137158e-05, + "loss": 0.017, + "step": 10210 + }, + { + "epoch": 1.2743142144638404, + "grad_norm": 0.004312419332563877, + "learning_rate": 1.4906234413965089e-05, + "loss": 0.0005, + "step": 10220 + }, + { + "epoch": 1.2755610972568578, + "grad_norm": 3.3320200443267822, + "learning_rate": 1.490124688279302e-05, + "loss": 0.0468, + "step": 10230 + }, + { + "epoch": 1.2768079800498753, + "grad_norm": 0.144317626953125, + "learning_rate": 1.489625935162095e-05, + "loss": 0.0271, + "step": 10240 + }, + { + "epoch": 1.2780548628428927, + "grad_norm": 1.370514154434204, + "learning_rate": 1.4891271820448877e-05, + "loss": 0.0018, + "step": 10250 + }, + { + "epoch": 1.2793017456359101, + "grad_norm": 0.012159171514213085, + "learning_rate": 
1.488628428927681e-05, + "loss": 0.0184, + "step": 10260 + }, + { + "epoch": 1.2805486284289276, + "grad_norm": 4.12328577041626, + "learning_rate": 1.488129675810474e-05, + "loss": 0.0019, + "step": 10270 + }, + { + "epoch": 1.281795511221945, + "grad_norm": 0.1401759535074234, + "learning_rate": 1.4876309226932671e-05, + "loss": 0.0284, + "step": 10280 + }, + { + "epoch": 1.2830423940149627, + "grad_norm": 0.011864711530506611, + "learning_rate": 1.4871321695760598e-05, + "loss": 0.0447, + "step": 10290 + }, + { + "epoch": 1.28428927680798, + "grad_norm": 0.021031511947512627, + "learning_rate": 1.4866334164588529e-05, + "loss": 0.0722, + "step": 10300 + }, + { + "epoch": 1.2855361596009975, + "grad_norm": 0.021378498524427414, + "learning_rate": 1.486134663341646e-05, + "loss": 0.028, + "step": 10310 + }, + { + "epoch": 1.286783042394015, + "grad_norm": 0.014980390667915344, + "learning_rate": 1.4856359102244392e-05, + "loss": 0.031, + "step": 10320 + }, + { + "epoch": 1.2880299251870324, + "grad_norm": 0.0073977308347821236, + "learning_rate": 1.4851371571072319e-05, + "loss": 0.0417, + "step": 10330 + }, + { + "epoch": 1.2892768079800498, + "grad_norm": 17.954195022583008, + "learning_rate": 1.484638403990025e-05, + "loss": 0.0598, + "step": 10340 + }, + { + "epoch": 1.2905236907730673, + "grad_norm": 0.3461831510066986, + "learning_rate": 1.484139650872818e-05, + "loss": 0.0336, + "step": 10350 + }, + { + "epoch": 1.2917705735660847, + "grad_norm": 28.57352638244629, + "learning_rate": 1.483640897755611e-05, + "loss": 0.0314, + "step": 10360 + }, + { + "epoch": 1.2930174563591024, + "grad_norm": 0.00324003747664392, + "learning_rate": 1.483142144638404e-05, + "loss": 0.0225, + "step": 10370 + }, + { + "epoch": 1.2942643391521198, + "grad_norm": 31.616849899291992, + "learning_rate": 1.482643391521197e-05, + "loss": 0.0034, + "step": 10380 + }, + { + "epoch": 1.2955112219451372, + "grad_norm": 0.2275863140821457, + "learning_rate": 1.4821446384039901e-05, + 
"loss": 0.0004, + "step": 10390 + }, + { + "epoch": 1.2967581047381547, + "grad_norm": 11.458198547363281, + "learning_rate": 1.4816458852867832e-05, + "loss": 0.0021, + "step": 10400 + }, + { + "epoch": 1.298004987531172, + "grad_norm": 0.006564176641404629, + "learning_rate": 1.4811471321695762e-05, + "loss": 0.0001, + "step": 10410 + }, + { + "epoch": 1.2992518703241895, + "grad_norm": 0.0027762737590819597, + "learning_rate": 1.4806483790523691e-05, + "loss": 0.1087, + "step": 10420 + }, + { + "epoch": 1.300498753117207, + "grad_norm": 19.512054443359375, + "learning_rate": 1.4801496259351622e-05, + "loss": 0.042, + "step": 10430 + }, + { + "epoch": 1.3017456359102244, + "grad_norm": 35.38471984863281, + "learning_rate": 1.4796508728179552e-05, + "loss": 0.0666, + "step": 10440 + }, + { + "epoch": 1.3029925187032418, + "grad_norm": 0.01785680651664734, + "learning_rate": 1.4791521197007483e-05, + "loss": 0.0299, + "step": 10450 + }, + { + "epoch": 1.3042394014962593, + "grad_norm": 0.0048726629465818405, + "learning_rate": 1.4786533665835412e-05, + "loss": 0.0096, + "step": 10460 + }, + { + "epoch": 1.3054862842892767, + "grad_norm": 32.89852523803711, + "learning_rate": 1.4781546134663343e-05, + "loss": 0.0799, + "step": 10470 + }, + { + "epoch": 1.3067331670822941, + "grad_norm": 0.004758730530738831, + "learning_rate": 1.4776558603491273e-05, + "loss": 0.0005, + "step": 10480 + }, + { + "epoch": 1.3079800498753118, + "grad_norm": 0.002490266226232052, + "learning_rate": 1.4771571072319204e-05, + "loss": 0.0568, + "step": 10490 + }, + { + "epoch": 1.3092269326683292, + "grad_norm": 0.006255331449210644, + "learning_rate": 1.4766583541147133e-05, + "loss": 0.0442, + "step": 10500 + }, + { + "epoch": 1.3104738154613467, + "grad_norm": 12.40581226348877, + "learning_rate": 1.4761596009975063e-05, + "loss": 0.0257, + "step": 10510 + }, + { + "epoch": 1.311720698254364, + "grad_norm": 0.01563987322151661, + "learning_rate": 1.4756608478802994e-05, + "loss": 
0.1362, + "step": 10520 + }, + { + "epoch": 1.3129675810473815, + "grad_norm": 0.005479677580296993, + "learning_rate": 1.4751620947630925e-05, + "loss": 0.0003, + "step": 10530 + }, + { + "epoch": 1.314214463840399, + "grad_norm": 12.577947616577148, + "learning_rate": 1.4746633416458853e-05, + "loss": 0.0335, + "step": 10540 + }, + { + "epoch": 1.3154613466334164, + "grad_norm": 0.015153160318732262, + "learning_rate": 1.4741645885286784e-05, + "loss": 0.0022, + "step": 10550 + }, + { + "epoch": 1.3167082294264338, + "grad_norm": 0.1170441284775734, + "learning_rate": 1.4736658354114715e-05, + "loss": 0.0674, + "step": 10560 + }, + { + "epoch": 1.3179551122194515, + "grad_norm": 0.003648881334811449, + "learning_rate": 1.4731670822942645e-05, + "loss": 0.0116, + "step": 10570 + }, + { + "epoch": 1.319201995012469, + "grad_norm": 0.002758126938715577, + "learning_rate": 1.4726683291770574e-05, + "loss": 0.0733, + "step": 10580 + }, + { + "epoch": 1.3204488778054864, + "grad_norm": 0.028896719217300415, + "learning_rate": 1.4721695760598505e-05, + "loss": 0.0008, + "step": 10590 + }, + { + "epoch": 1.3216957605985038, + "grad_norm": 3.604020595550537, + "learning_rate": 1.4716708229426436e-05, + "loss": 0.0425, + "step": 10600 + }, + { + "epoch": 1.3229426433915212, + "grad_norm": 0.5918417572975159, + "learning_rate": 1.4711720698254366e-05, + "loss": 0.0613, + "step": 10610 + }, + { + "epoch": 1.3241895261845387, + "grad_norm": 0.012655685655772686, + "learning_rate": 1.4706733167082295e-05, + "loss": 0.1018, + "step": 10620 + }, + { + "epoch": 1.325436408977556, + "grad_norm": 0.0313897430896759, + "learning_rate": 1.4701745635910226e-05, + "loss": 0.0305, + "step": 10630 + }, + { + "epoch": 1.3266832917705735, + "grad_norm": 2.2387187480926514, + "learning_rate": 1.4696758104738156e-05, + "loss": 0.0364, + "step": 10640 + }, + { + "epoch": 1.327930174563591, + "grad_norm": 37.755985260009766, + "learning_rate": 1.4691770573566087e-05, + "loss": 0.0127, + 
"step": 10650 + }, + { + "epoch": 1.3291770573566084, + "grad_norm": 0.16674213111400604, + "learning_rate": 1.4686783042394018e-05, + "loss": 0.0011, + "step": 10660 + }, + { + "epoch": 1.3304239401496258, + "grad_norm": 0.018290970474481583, + "learning_rate": 1.4681795511221946e-05, + "loss": 0.0714, + "step": 10670 + }, + { + "epoch": 1.3316708229426433, + "grad_norm": 0.700610876083374, + "learning_rate": 1.4676807980049877e-05, + "loss": 0.0035, + "step": 10680 + }, + { + "epoch": 1.332917705735661, + "grad_norm": 0.004379532765597105, + "learning_rate": 1.4671820448877808e-05, + "loss": 0.0004, + "step": 10690 + }, + { + "epoch": 1.3341645885286784, + "grad_norm": 0.020778026431798935, + "learning_rate": 1.4666832917705738e-05, + "loss": 0.0046, + "step": 10700 + }, + { + "epoch": 1.3354114713216958, + "grad_norm": 19.630970001220703, + "learning_rate": 1.4661845386533666e-05, + "loss": 0.0661, + "step": 10710 + }, + { + "epoch": 1.3366583541147132, + "grad_norm": 44.413639068603516, + "learning_rate": 1.4656857855361596e-05, + "loss": 0.0341, + "step": 10720 + }, + { + "epoch": 1.3379052369077307, + "grad_norm": 0.032850079238414764, + "learning_rate": 1.4651870324189528e-05, + "loss": 0.0003, + "step": 10730 + }, + { + "epoch": 1.339152119700748, + "grad_norm": 0.002132246969267726, + "learning_rate": 1.4646882793017459e-05, + "loss": 0.0006, + "step": 10740 + }, + { + "epoch": 1.3403990024937655, + "grad_norm": 22.134477615356445, + "learning_rate": 1.4641895261845386e-05, + "loss": 0.0573, + "step": 10750 + }, + { + "epoch": 1.341645885286783, + "grad_norm": 1.7164827585220337, + "learning_rate": 1.4636907730673317e-05, + "loss": 0.0053, + "step": 10760 + }, + { + "epoch": 1.3428927680798006, + "grad_norm": 0.007867738604545593, + "learning_rate": 1.4631920199501248e-05, + "loss": 0.0013, + "step": 10770 + }, + { + "epoch": 1.344139650872818, + "grad_norm": 0.04650270566344261, + "learning_rate": 1.4626932668329178e-05, + "loss": 0.0003, + "step": 10780 
+ }, + { + "epoch": 1.3453865336658355, + "grad_norm": 0.009355682879686356, + "learning_rate": 1.4621945137157107e-05, + "loss": 0.1536, + "step": 10790 + }, + { + "epoch": 1.346633416458853, + "grad_norm": 0.01851280778646469, + "learning_rate": 1.4616957605985038e-05, + "loss": 0.0736, + "step": 10800 + }, + { + "epoch": 1.3478802992518704, + "grad_norm": 0.838029682636261, + "learning_rate": 1.4611970074812968e-05, + "loss": 0.1233, + "step": 10810 + }, + { + "epoch": 1.3491271820448878, + "grad_norm": 0.28855791687965393, + "learning_rate": 1.4606982543640899e-05, + "loss": 0.004, + "step": 10820 + }, + { + "epoch": 1.3503740648379052, + "grad_norm": 0.021240094676613808, + "learning_rate": 1.4601995012468828e-05, + "loss": 0.0722, + "step": 10830 + }, + { + "epoch": 1.3516209476309227, + "grad_norm": 0.1711479127407074, + "learning_rate": 1.4597007481296759e-05, + "loss": 0.0011, + "step": 10840 + }, + { + "epoch": 1.35286783042394, + "grad_norm": 0.00700349360704422, + "learning_rate": 1.4592019950124689e-05, + "loss": 0.0229, + "step": 10850 + }, + { + "epoch": 1.3541147132169575, + "grad_norm": 22.966121673583984, + "learning_rate": 1.458703241895262e-05, + "loss": 0.0711, + "step": 10860 + }, + { + "epoch": 1.355361596009975, + "grad_norm": 14.840107917785645, + "learning_rate": 1.4582044887780549e-05, + "loss": 0.1087, + "step": 10870 + }, + { + "epoch": 1.3566084788029924, + "grad_norm": 0.008188108913600445, + "learning_rate": 1.457705735660848e-05, + "loss": 0.0099, + "step": 10880 + }, + { + "epoch": 1.35785536159601, + "grad_norm": 0.03862173855304718, + "learning_rate": 1.457206982543641e-05, + "loss": 0.0301, + "step": 10890 + }, + { + "epoch": 1.3591022443890275, + "grad_norm": 0.029760289937257767, + "learning_rate": 1.456708229426434e-05, + "loss": 0.0249, + "step": 10900 + }, + { + "epoch": 1.360349127182045, + "grad_norm": 0.00440254807472229, + "learning_rate": 1.4562094763092271e-05, + "loss": 0.0615, + "step": 10910 + }, + { + "epoch": 
1.3615960099750624, + "grad_norm": 0.009855284355580807, + "learning_rate": 1.45571072319202e-05, + "loss": 0.0212, + "step": 10920 + }, + { + "epoch": 1.3628428927680798, + "grad_norm": 2.0530319213867188, + "learning_rate": 1.455211970074813e-05, + "loss": 0.0594, + "step": 10930 + }, + { + "epoch": 1.3640897755610972, + "grad_norm": 0.09458717703819275, + "learning_rate": 1.4547132169576061e-05, + "loss": 0.1251, + "step": 10940 + }, + { + "epoch": 1.3653366583541147, + "grad_norm": 0.03143605589866638, + "learning_rate": 1.4542144638403992e-05, + "loss": 0.032, + "step": 10950 + }, + { + "epoch": 1.366583541147132, + "grad_norm": 0.02471662499010563, + "learning_rate": 1.4537157107231921e-05, + "loss": 0.0062, + "step": 10960 + }, + { + "epoch": 1.3678304239401498, + "grad_norm": 0.014755766838788986, + "learning_rate": 1.4532169576059851e-05, + "loss": 0.001, + "step": 10970 + }, + { + "epoch": 1.3690773067331672, + "grad_norm": 0.0027742686215788126, + "learning_rate": 1.4527182044887782e-05, + "loss": 0.0038, + "step": 10980 + }, + { + "epoch": 1.3703241895261846, + "grad_norm": 0.08576522767543793, + "learning_rate": 1.4522194513715713e-05, + "loss": 0.0157, + "step": 10990 + }, + { + "epoch": 1.371571072319202, + "grad_norm": 0.0026629208587110043, + "learning_rate": 1.4517206982543642e-05, + "loss": 0.0017, + "step": 11000 + }, + { + "epoch": 1.3728179551122195, + "grad_norm": 0.023445982486009598, + "learning_rate": 1.4512219451371572e-05, + "loss": 0.0692, + "step": 11010 + }, + { + "epoch": 1.374064837905237, + "grad_norm": 0.005688042845577002, + "learning_rate": 1.4507231920199503e-05, + "loss": 0.0359, + "step": 11020 + }, + { + "epoch": 1.3753117206982544, + "grad_norm": 11.719991683959961, + "learning_rate": 1.4502244389027434e-05, + "loss": 0.0445, + "step": 11030 + }, + { + "epoch": 1.3765586034912718, + "grad_norm": 0.037872131913900375, + "learning_rate": 1.4497256857855362e-05, + "loss": 0.0844, + "step": 11040 + }, + { + "epoch": 
1.3778054862842892, + "grad_norm": 0.07881703227758408, + "learning_rate": 1.4492269326683293e-05, + "loss": 0.0597, + "step": 11050 + }, + { + "epoch": 1.3790523690773067, + "grad_norm": 0.012651286087930202, + "learning_rate": 1.4487281795511224e-05, + "loss": 0.0433, + "step": 11060 + }, + { + "epoch": 1.380299251870324, + "grad_norm": 55.89772033691406, + "learning_rate": 1.4482294264339154e-05, + "loss": 0.0772, + "step": 11070 + }, + { + "epoch": 1.3815461346633415, + "grad_norm": 0.003019166411831975, + "learning_rate": 1.4477306733167083e-05, + "loss": 0.0618, + "step": 11080 + }, + { + "epoch": 1.382793017456359, + "grad_norm": 0.6327642798423767, + "learning_rate": 1.4472319201995014e-05, + "loss": 0.0481, + "step": 11090 + }, + { + "epoch": 1.3840399002493766, + "grad_norm": 0.029262885451316833, + "learning_rate": 1.4467331670822944e-05, + "loss": 0.0041, + "step": 11100 + }, + { + "epoch": 1.385286783042394, + "grad_norm": 0.0019791433587670326, + "learning_rate": 1.4462344139650875e-05, + "loss": 0.0026, + "step": 11110 + }, + { + "epoch": 1.3865336658354115, + "grad_norm": 0.2362566888332367, + "learning_rate": 1.4457356608478802e-05, + "loss": 0.0016, + "step": 11120 + }, + { + "epoch": 1.387780548628429, + "grad_norm": 0.05378960818052292, + "learning_rate": 1.4452369077306735e-05, + "loss": 0.0029, + "step": 11130 + }, + { + "epoch": 1.3890274314214464, + "grad_norm": 0.015104220248758793, + "learning_rate": 1.4447381546134665e-05, + "loss": 0.0276, + "step": 11140 + }, + { + "epoch": 1.3902743142144638, + "grad_norm": 23.29336929321289, + "learning_rate": 1.4442394014962596e-05, + "loss": 0.0701, + "step": 11150 + }, + { + "epoch": 1.3915211970074812, + "grad_norm": 1.6370915174484253, + "learning_rate": 1.4437406483790526e-05, + "loss": 0.0766, + "step": 11160 + }, + { + "epoch": 1.3927680798004989, + "grad_norm": 0.08965132385492325, + "learning_rate": 1.4432418952618454e-05, + "loss": 0.0046, + "step": 11170 + }, + { + "epoch": 
1.3940149625935163, + "grad_norm": 0.14832371473312378, + "learning_rate": 1.4427431421446384e-05, + "loss": 0.0101, + "step": 11180 + }, + { + "epoch": 1.3952618453865338, + "grad_norm": 0.012930690310895443, + "learning_rate": 1.4422443890274315e-05, + "loss": 0.1335, + "step": 11190 + }, + { + "epoch": 1.3965087281795512, + "grad_norm": 0.018670091405510902, + "learning_rate": 1.4417456359102247e-05, + "loss": 0.0581, + "step": 11200 + }, + { + "epoch": 1.3977556109725686, + "grad_norm": 0.009946133941411972, + "learning_rate": 1.4412468827930174e-05, + "loss": 0.0004, + "step": 11210 + }, + { + "epoch": 1.399002493765586, + "grad_norm": 0.007296042516827583, + "learning_rate": 1.4407481296758105e-05, + "loss": 0.064, + "step": 11220 + }, + { + "epoch": 1.4002493765586035, + "grad_norm": 0.031165320426225662, + "learning_rate": 1.4402493765586036e-05, + "loss": 0.0197, + "step": 11230 + }, + { + "epoch": 1.401496259351621, + "grad_norm": 0.020699728280305862, + "learning_rate": 1.4397506234413966e-05, + "loss": 0.001, + "step": 11240 + }, + { + "epoch": 1.4027431421446384, + "grad_norm": 0.05278054252266884, + "learning_rate": 1.4392518703241895e-05, + "loss": 0.0429, + "step": 11250 + }, + { + "epoch": 1.4039900249376558, + "grad_norm": 0.10893259197473526, + "learning_rate": 1.4387531172069826e-05, + "loss": 0.0664, + "step": 11260 + }, + { + "epoch": 1.4052369077306732, + "grad_norm": 0.23036803305149078, + "learning_rate": 1.4382543640897757e-05, + "loss": 0.0108, + "step": 11270 + }, + { + "epoch": 1.4064837905236907, + "grad_norm": 0.04595217853784561, + "learning_rate": 1.4377556109725687e-05, + "loss": 0.0779, + "step": 11280 + }, + { + "epoch": 1.407730673316708, + "grad_norm": 0.0674215629696846, + "learning_rate": 1.4372568578553616e-05, + "loss": 0.0235, + "step": 11290 + }, + { + "epoch": 1.4089775561097257, + "grad_norm": 0.0016587678110226989, + "learning_rate": 1.4368079800498756e-05, + "loss": 0.0588, + "step": 11300 + }, + { + "epoch": 
1.4102244389027432, + "grad_norm": 0.007998665794730186, + "learning_rate": 1.4363092269326683e-05, + "loss": 0.0003, + "step": 11310 + }, + { + "epoch": 1.4114713216957606, + "grad_norm": 0.008180646225810051, + "learning_rate": 1.4358104738154614e-05, + "loss": 0.006, + "step": 11320 + }, + { + "epoch": 1.412718204488778, + "grad_norm": 0.04490172863006592, + "learning_rate": 1.4353117206982544e-05, + "loss": 0.032, + "step": 11330 + }, + { + "epoch": 1.4139650872817955, + "grad_norm": 29.317243576049805, + "learning_rate": 1.4348129675810477e-05, + "loss": 0.1154, + "step": 11340 + }, + { + "epoch": 1.415211970074813, + "grad_norm": 0.005312860477715731, + "learning_rate": 1.4343142144638404e-05, + "loss": 0.0432, + "step": 11350 + }, + { + "epoch": 1.4164588528678304, + "grad_norm": 4.058748245239258, + "learning_rate": 1.4338154613466335e-05, + "loss": 0.004, + "step": 11360 + }, + { + "epoch": 1.417705735660848, + "grad_norm": 14.437801361083984, + "learning_rate": 1.4333167082294265e-05, + "loss": 0.0123, + "step": 11370 + }, + { + "epoch": 1.4189526184538654, + "grad_norm": 0.009661520831286907, + "learning_rate": 1.4328179551122196e-05, + "loss": 0.1112, + "step": 11380 + }, + { + "epoch": 1.4201995012468829, + "grad_norm": 0.010127179324626923, + "learning_rate": 1.4323192019950125e-05, + "loss": 0.0103, + "step": 11390 + }, + { + "epoch": 1.4214463840399003, + "grad_norm": 0.005153048317879438, + "learning_rate": 1.4318204488778055e-05, + "loss": 0.0009, + "step": 11400 + }, + { + "epoch": 1.4226932668329177, + "grad_norm": 0.008940218947827816, + "learning_rate": 1.4313216957605986e-05, + "loss": 0.0538, + "step": 11410 + }, + { + "epoch": 1.4239401496259352, + "grad_norm": 0.01359684206545353, + "learning_rate": 1.4308229426433917e-05, + "loss": 0.0435, + "step": 11420 + }, + { + "epoch": 1.4251870324189526, + "grad_norm": 0.008474809117615223, + "learning_rate": 1.4303241895261846e-05, + "loss": 0.0064, + "step": 11430 + }, + { + "epoch": 
1.42643391521197, + "grad_norm": 0.004166824277490377, + "learning_rate": 1.4298254364089776e-05, + "loss": 0.0953, + "step": 11440 + }, + { + "epoch": 1.4276807980049875, + "grad_norm": 0.021383510902523994, + "learning_rate": 1.4293266832917707e-05, + "loss": 0.0212, + "step": 11450 + }, + { + "epoch": 1.428927680798005, + "grad_norm": 1.1953846216201782, + "learning_rate": 1.4288279301745637e-05, + "loss": 0.0579, + "step": 11460 + }, + { + "epoch": 1.4301745635910224, + "grad_norm": 0.0063395812176167965, + "learning_rate": 1.4283291770573566e-05, + "loss": 0.0174, + "step": 11470 + }, + { + "epoch": 1.4314214463840398, + "grad_norm": 0.003997775260359049, + "learning_rate": 1.4278304239401497e-05, + "loss": 0.0678, + "step": 11480 + }, + { + "epoch": 1.4326683291770572, + "grad_norm": 7.376005172729492, + "learning_rate": 1.4273316708229428e-05, + "loss": 0.051, + "step": 11490 + }, + { + "epoch": 1.4339152119700749, + "grad_norm": 38.96922302246094, + "learning_rate": 1.4268329177057358e-05, + "loss": 0.048, + "step": 11500 + }, + { + "epoch": 1.4351620947630923, + "grad_norm": 0.06101633608341217, + "learning_rate": 1.4263341645885287e-05, + "loss": 0.0408, + "step": 11510 + }, + { + "epoch": 1.4364089775561097, + "grad_norm": 0.11863374710083008, + "learning_rate": 1.4258354114713218e-05, + "loss": 0.0763, + "step": 11520 + }, + { + "epoch": 1.4376558603491272, + "grad_norm": 0.006517260335385799, + "learning_rate": 1.4253366583541148e-05, + "loss": 0.0835, + "step": 11530 + }, + { + "epoch": 1.4389027431421446, + "grad_norm": 0.005422229878604412, + "learning_rate": 1.4248379052369079e-05, + "loss": 0.0375, + "step": 11540 + }, + { + "epoch": 1.440149625935162, + "grad_norm": 1.8197985887527466, + "learning_rate": 1.424339152119701e-05, + "loss": 0.0052, + "step": 11550 + }, + { + "epoch": 1.4413965087281795, + "grad_norm": 0.018879203125834465, + "learning_rate": 1.4238403990024939e-05, + "loss": 0.0119, + "step": 11560 + }, + { + "epoch": 
1.4426433915211971, + "grad_norm": 7.549375534057617, + "learning_rate": 1.423341645885287e-05, + "loss": 0.036, + "step": 11570 + }, + { + "epoch": 1.4438902743142146, + "grad_norm": 0.015450420789420605, + "learning_rate": 1.42284289276808e-05, + "loss": 0.0054, + "step": 11580 + }, + { + "epoch": 1.445137157107232, + "grad_norm": 30.17746925354004, + "learning_rate": 1.422344139650873e-05, + "loss": 0.0805, + "step": 11590 + }, + { + "epoch": 1.4463840399002494, + "grad_norm": 0.004945869091898203, + "learning_rate": 1.421845386533666e-05, + "loss": 0.0387, + "step": 11600 + }, + { + "epoch": 1.4476309226932669, + "grad_norm": 32.93588638305664, + "learning_rate": 1.421346633416459e-05, + "loss": 0.0309, + "step": 11610 + }, + { + "epoch": 1.4488778054862843, + "grad_norm": 0.02093518152832985, + "learning_rate": 1.420847880299252e-05, + "loss": 0.0953, + "step": 11620 + }, + { + "epoch": 1.4501246882793017, + "grad_norm": 0.004532126244157553, + "learning_rate": 1.4203491271820451e-05, + "loss": 0.0002, + "step": 11630 + }, + { + "epoch": 1.4513715710723192, + "grad_norm": 0.005582903511822224, + "learning_rate": 1.419850374064838e-05, + "loss": 0.056, + "step": 11640 + }, + { + "epoch": 1.4526184538653366, + "grad_norm": 33.18028259277344, + "learning_rate": 1.419351620947631e-05, + "loss": 0.0984, + "step": 11650 + }, + { + "epoch": 1.453865336658354, + "grad_norm": 0.005102763418108225, + "learning_rate": 1.4188528678304241e-05, + "loss": 0.0693, + "step": 11660 + }, + { + "epoch": 1.4551122194513715, + "grad_norm": 0.011876431293785572, + "learning_rate": 1.4183541147132172e-05, + "loss": 0.0454, + "step": 11670 + }, + { + "epoch": 1.456359102244389, + "grad_norm": 0.047975603491067886, + "learning_rate": 1.4178553615960101e-05, + "loss": 0.0018, + "step": 11680 + }, + { + "epoch": 1.4576059850374063, + "grad_norm": 0.016569582745432854, + "learning_rate": 1.4173566084788032e-05, + "loss": 0.0607, + "step": 11690 + }, + { + "epoch": 1.458852867830424, + 
"grad_norm": 1.9552041292190552, + "learning_rate": 1.4168578553615962e-05, + "loss": 0.0601, + "step": 11700 + }, + { + "epoch": 1.4600997506234414, + "grad_norm": 0.05047905817627907, + "learning_rate": 1.4163591022443893e-05, + "loss": 0.1526, + "step": 11710 + }, + { + "epoch": 1.4613466334164589, + "grad_norm": 0.08394069969654083, + "learning_rate": 1.415860349127182e-05, + "loss": 0.0009, + "step": 11720 + }, + { + "epoch": 1.4625935162094763, + "grad_norm": 0.04425472021102905, + "learning_rate": 1.415361596009975e-05, + "loss": 0.0538, + "step": 11730 + }, + { + "epoch": 1.4638403990024937, + "grad_norm": 0.0035213951487094164, + "learning_rate": 1.4148628428927681e-05, + "loss": 0.0362, + "step": 11740 + }, + { + "epoch": 1.4650872817955112, + "grad_norm": 6.682374000549316, + "learning_rate": 1.4143640897755614e-05, + "loss": 0.0823, + "step": 11750 + }, + { + "epoch": 1.4663341645885286, + "grad_norm": 0.15339747071266174, + "learning_rate": 1.413865336658354e-05, + "loss": 0.0878, + "step": 11760 + }, + { + "epoch": 1.4675810473815463, + "grad_norm": 0.010146928951144218, + "learning_rate": 1.4133665835411471e-05, + "loss": 0.0315, + "step": 11770 + }, + { + "epoch": 1.4688279301745637, + "grad_norm": 0.591601550579071, + "learning_rate": 1.4128678304239402e-05, + "loss": 0.0008, + "step": 11780 + }, + { + "epoch": 1.4700748129675811, + "grad_norm": 0.13875390589237213, + "learning_rate": 1.4123690773067333e-05, + "loss": 0.0831, + "step": 11790 + }, + { + "epoch": 1.4713216957605986, + "grad_norm": 0.046373747289180756, + "learning_rate": 1.4118703241895263e-05, + "loss": 0.0239, + "step": 11800 + }, + { + "epoch": 1.472568578553616, + "grad_norm": 0.11153218150138855, + "learning_rate": 1.4113715710723192e-05, + "loss": 0.0091, + "step": 11810 + }, + { + "epoch": 1.4738154613466334, + "grad_norm": 0.008640721440315247, + "learning_rate": 1.4108728179551123e-05, + "loss": 0.0008, + "step": 11820 + }, + { + "epoch": 1.4750623441396509, + "grad_norm": 
0.022386932745575905, + "learning_rate": 1.4103740648379053e-05, + "loss": 0.0166, + "step": 11830 + }, + { + "epoch": 1.4763092269326683, + "grad_norm": 2.148536443710327, + "learning_rate": 1.4098753117206984e-05, + "loss": 0.0432, + "step": 11840 + }, + { + "epoch": 1.4775561097256857, + "grad_norm": 0.008330950513482094, + "learning_rate": 1.4093765586034913e-05, + "loss": 0.0562, + "step": 11850 + }, + { + "epoch": 1.4788029925187032, + "grad_norm": 5.3832597732543945, + "learning_rate": 1.4088778054862844e-05, + "loss": 0.0942, + "step": 11860 + }, + { + "epoch": 1.4800498753117206, + "grad_norm": 0.01949029415845871, + "learning_rate": 1.4083790523690774e-05, + "loss": 0.0111, + "step": 11870 + }, + { + "epoch": 1.481296758104738, + "grad_norm": 0.006169204134494066, + "learning_rate": 1.4078802992518705e-05, + "loss": 0.0007, + "step": 11880 + }, + { + "epoch": 1.4825436408977555, + "grad_norm": 0.0029075045604258776, + "learning_rate": 1.4073815461346634e-05, + "loss": 0.0033, + "step": 11890 + }, + { + "epoch": 1.4837905236907731, + "grad_norm": 0.09177600592374802, + "learning_rate": 1.4068827930174564e-05, + "loss": 0.0989, + "step": 11900 + }, + { + "epoch": 1.4850374064837906, + "grad_norm": 0.0033431975170969963, + "learning_rate": 1.4063840399002495e-05, + "loss": 0.027, + "step": 11910 + }, + { + "epoch": 1.486284289276808, + "grad_norm": 0.006316428538411856, + "learning_rate": 1.4058852867830426e-05, + "loss": 0.0007, + "step": 11920 + }, + { + "epoch": 1.4875311720698254, + "grad_norm": 0.3648694157600403, + "learning_rate": 1.4053865336658355e-05, + "loss": 0.0083, + "step": 11930 + }, + { + "epoch": 1.4887780548628429, + "grad_norm": 0.10771086066961288, + "learning_rate": 1.4048877805486285e-05, + "loss": 0.0285, + "step": 11940 + }, + { + "epoch": 1.4900249376558603, + "grad_norm": 20.01485824584961, + "learning_rate": 1.4043890274314216e-05, + "loss": 0.0287, + "step": 11950 + }, + { + "epoch": 1.4912718204488777, + "grad_norm": 
0.0025918507017195225, + "learning_rate": 1.4038902743142146e-05, + "loss": 0.059, + "step": 11960 + }, + { + "epoch": 1.4925187032418954, + "grad_norm": 0.0017820337088778615, + "learning_rate": 1.4033915211970075e-05, + "loss": 0.0087, + "step": 11970 + }, + { + "epoch": 1.4937655860349128, + "grad_norm": 40.38088607788086, + "learning_rate": 1.4028927680798006e-05, + "loss": 0.1003, + "step": 11980 + }, + { + "epoch": 1.4950124688279303, + "grad_norm": 0.1557849496603012, + "learning_rate": 1.4023940149625937e-05, + "loss": 0.0726, + "step": 11990 + }, + { + "epoch": 1.4962593516209477, + "grad_norm": 0.012030398473143578, + "learning_rate": 1.4018952618453867e-05, + "loss": 0.0073, + "step": 12000 + }, + { + "epoch": 1.4975062344139651, + "grad_norm": 0.0018893389496952295, + "learning_rate": 1.4013965087281796e-05, + "loss": 0.0332, + "step": 12010 + }, + { + "epoch": 1.4987531172069826, + "grad_norm": 17.814115524291992, + "learning_rate": 1.4008977556109727e-05, + "loss": 0.0321, + "step": 12020 + }, + { + "epoch": 1.5, + "grad_norm": 0.0038544260896742344, + "learning_rate": 1.4003990024937657e-05, + "loss": 0.0243, + "step": 12030 + }, + { + "epoch": 1.5012468827930174, + "grad_norm": 48.38309097290039, + "learning_rate": 1.3999002493765588e-05, + "loss": 0.0636, + "step": 12040 + }, + { + "epoch": 1.5024937655860349, + "grad_norm": 0.007168745622038841, + "learning_rate": 1.3994014962593519e-05, + "loss": 0.0393, + "step": 12050 + }, + { + "epoch": 1.5037406483790523, + "grad_norm": 0.006634199526160955, + "learning_rate": 1.3989027431421447e-05, + "loss": 0.0642, + "step": 12060 + }, + { + "epoch": 1.5049875311720697, + "grad_norm": 0.005328933708369732, + "learning_rate": 1.3984039900249378e-05, + "loss": 0.037, + "step": 12070 + }, + { + "epoch": 1.5062344139650872, + "grad_norm": 0.6847673058509827, + "learning_rate": 1.3979052369077309e-05, + "loss": 0.0006, + "step": 12080 + }, + { + "epoch": 1.5074812967581046, + "grad_norm": 12.247014999389648, + 
"learning_rate": 1.397406483790524e-05, + "loss": 0.0148, + "step": 12090 + }, + { + "epoch": 1.508728179551122, + "grad_norm": 0.012827611528337002, + "learning_rate": 1.3969077306733168e-05, + "loss": 0.0064, + "step": 12100 + }, + { + "epoch": 1.5099750623441397, + "grad_norm": 0.002687269588932395, + "learning_rate": 1.3964089775561099e-05, + "loss": 0.0304, + "step": 12110 + }, + { + "epoch": 1.5112219451371571, + "grad_norm": 0.009229120798408985, + "learning_rate": 1.395910224438903e-05, + "loss": 0.0832, + "step": 12120 + }, + { + "epoch": 1.5124688279301746, + "grad_norm": 0.010797587223351002, + "learning_rate": 1.395411471321696e-05, + "loss": 0.057, + "step": 12130 + }, + { + "epoch": 1.513715710723192, + "grad_norm": 0.017812317237257957, + "learning_rate": 1.3949127182044887e-05, + "loss": 0.0554, + "step": 12140 + }, + { + "epoch": 1.5149625935162094, + "grad_norm": 0.09482496231794357, + "learning_rate": 1.394413965087282e-05, + "loss": 0.0457, + "step": 12150 + }, + { + "epoch": 1.516209476309227, + "grad_norm": 1.7302439212799072, + "learning_rate": 1.393915211970075e-05, + "loss": 0.0728, + "step": 12160 + }, + { + "epoch": 1.5174563591022445, + "grad_norm": 0.19030161201953888, + "learning_rate": 1.3934164588528681e-05, + "loss": 0.1303, + "step": 12170 + }, + { + "epoch": 1.518703241895262, + "grad_norm": 7.949008941650391, + "learning_rate": 1.3929177057356608e-05, + "loss": 0.0347, + "step": 12180 + }, + { + "epoch": 1.5199501246882794, + "grad_norm": 0.012247119098901749, + "learning_rate": 1.3924189526184539e-05, + "loss": 0.0021, + "step": 12190 + }, + { + "epoch": 1.5211970074812968, + "grad_norm": 0.005694092251360416, + "learning_rate": 1.391920199501247e-05, + "loss": 0.0012, + "step": 12200 + }, + { + "epoch": 1.5224438902743143, + "grad_norm": 0.00615575909614563, + "learning_rate": 1.3914214463840402e-05, + "loss": 0.0228, + "step": 12210 + }, + { + "epoch": 1.5236907730673317, + "grad_norm": 0.013162520714104176, + "learning_rate": 
1.3909226932668329e-05, + "loss": 0.0133, + "step": 12220 + }, + { + "epoch": 1.5249376558603491, + "grad_norm": 23.256078720092773, + "learning_rate": 1.390423940149626e-05, + "loss": 0.0441, + "step": 12230 + }, + { + "epoch": 1.5261845386533666, + "grad_norm": 0.007707939483225346, + "learning_rate": 1.389925187032419e-05, + "loss": 0.0024, + "step": 12240 + }, + { + "epoch": 1.527431421446384, + "grad_norm": 0.014242901466786861, + "learning_rate": 1.389426433915212e-05, + "loss": 0.001, + "step": 12250 + }, + { + "epoch": 1.5286783042394014, + "grad_norm": 0.003064389806240797, + "learning_rate": 1.388927680798005e-05, + "loss": 0.0003, + "step": 12260 + }, + { + "epoch": 1.5299251870324189, + "grad_norm": 0.0030908039771020412, + "learning_rate": 1.388428927680798e-05, + "loss": 0.043, + "step": 12270 + }, + { + "epoch": 1.5311720698254363, + "grad_norm": 0.010099162347614765, + "learning_rate": 1.3879301745635911e-05, + "loss": 0.0003, + "step": 12280 + }, + { + "epoch": 1.5324189526184537, + "grad_norm": 2.3775289058685303, + "learning_rate": 1.3874314214463842e-05, + "loss": 0.0066, + "step": 12290 + }, + { + "epoch": 1.5336658354114712, + "grad_norm": 0.00410651508718729, + "learning_rate": 1.3869326683291772e-05, + "loss": 0.0009, + "step": 12300 + }, + { + "epoch": 1.5349127182044888, + "grad_norm": 0.007365523837506771, + "learning_rate": 1.3864339152119701e-05, + "loss": 0.0376, + "step": 12310 + }, + { + "epoch": 1.5361596009975063, + "grad_norm": 0.004369727335870266, + "learning_rate": 1.3859351620947632e-05, + "loss": 0.0124, + "step": 12320 + }, + { + "epoch": 1.5374064837905237, + "grad_norm": 2.023164987564087, + "learning_rate": 1.3854364089775562e-05, + "loss": 0.0051, + "step": 12330 + }, + { + "epoch": 1.5386533665835411, + "grad_norm": 0.008770514279603958, + "learning_rate": 1.3849376558603493e-05, + "loss": 0.0002, + "step": 12340 + }, + { + "epoch": 1.5399002493765586, + "grad_norm": 0.001449818373657763, + "learning_rate": 
1.3844389027431422e-05, + "loss": 0.0028, + "step": 12350 + }, + { + "epoch": 1.5411471321695762, + "grad_norm": 0.11426911503076553, + "learning_rate": 1.3839401496259353e-05, + "loss": 0.027, + "step": 12360 + }, + { + "epoch": 1.5423940149625937, + "grad_norm": 0.009318443946540356, + "learning_rate": 1.3834413965087283e-05, + "loss": 0.0007, + "step": 12370 + }, + { + "epoch": 1.543640897755611, + "grad_norm": 0.0020017463248223066, + "learning_rate": 1.3829426433915214e-05, + "loss": 0.0002, + "step": 12380 + }, + { + "epoch": 1.5448877805486285, + "grad_norm": 0.002024906687438488, + "learning_rate": 1.3824438902743143e-05, + "loss": 0.0023, + "step": 12390 + }, + { + "epoch": 1.546134663341646, + "grad_norm": 0.01683424413204193, + "learning_rate": 1.3819451371571073e-05, + "loss": 0.0022, + "step": 12400 + }, + { + "epoch": 1.5473815461346634, + "grad_norm": 0.19490936398506165, + "learning_rate": 1.3814463840399004e-05, + "loss": 0.0003, + "step": 12410 + }, + { + "epoch": 1.5486284289276808, + "grad_norm": 0.007523283362388611, + "learning_rate": 1.3809476309226935e-05, + "loss": 0.0524, + "step": 12420 + }, + { + "epoch": 1.5498753117206983, + "grad_norm": 0.009266285225749016, + "learning_rate": 1.3804488778054863e-05, + "loss": 0.0669, + "step": 12430 + }, + { + "epoch": 1.5511221945137157, + "grad_norm": 0.006378598511219025, + "learning_rate": 1.3799501246882794e-05, + "loss": 0.0714, + "step": 12440 + }, + { + "epoch": 1.5523690773067331, + "grad_norm": 0.26278775930404663, + "learning_rate": 1.3794513715710725e-05, + "loss": 0.0538, + "step": 12450 + }, + { + "epoch": 1.5536159600997506, + "grad_norm": 0.7849764823913574, + "learning_rate": 1.3789526184538655e-05, + "loss": 0.0009, + "step": 12460 + }, + { + "epoch": 1.554862842892768, + "grad_norm": 0.04737240821123123, + "learning_rate": 1.3784538653366584e-05, + "loss": 0.0342, + "step": 12470 + }, + { + "epoch": 1.5561097256857854, + "grad_norm": 32.681884765625, + "learning_rate": 
1.3779551122194515e-05, + "loss": 0.0475, + "step": 12480 + }, + { + "epoch": 1.5573566084788029, + "grad_norm": 24.724851608276367, + "learning_rate": 1.3774563591022445e-05, + "loss": 0.0479, + "step": 12490 + }, + { + "epoch": 1.5586034912718203, + "grad_norm": 0.0013840860920026898, + "learning_rate": 1.3769576059850376e-05, + "loss": 0.0367, + "step": 12500 + }, + { + "epoch": 1.559850374064838, + "grad_norm": 0.008518923074007034, + "learning_rate": 1.3764588528678305e-05, + "loss": 0.1191, + "step": 12510 + }, + { + "epoch": 1.5610972568578554, + "grad_norm": 0.00492417486384511, + "learning_rate": 1.3759600997506236e-05, + "loss": 0.0017, + "step": 12520 + }, + { + "epoch": 1.5623441396508728, + "grad_norm": 2.3020267486572266, + "learning_rate": 1.3754613466334166e-05, + "loss": 0.0278, + "step": 12530 + }, + { + "epoch": 1.5635910224438903, + "grad_norm": 4.384720325469971, + "learning_rate": 1.3749625935162097e-05, + "loss": 0.0561, + "step": 12540 + }, + { + "epoch": 1.5648379052369077, + "grad_norm": 0.007074257824569941, + "learning_rate": 1.3744638403990028e-05, + "loss": 0.0002, + "step": 12550 + }, + { + "epoch": 1.5660847880299253, + "grad_norm": 0.016326969489455223, + "learning_rate": 1.3739650872817956e-05, + "loss": 0.0003, + "step": 12560 + }, + { + "epoch": 1.5673316708229428, + "grad_norm": 0.010336228646337986, + "learning_rate": 1.3734663341645887e-05, + "loss": 0.0397, + "step": 12570 + }, + { + "epoch": 1.5685785536159602, + "grad_norm": 0.15242627263069153, + "learning_rate": 1.3729675810473818e-05, + "loss": 0.0902, + "step": 12580 + }, + { + "epoch": 1.5698254364089776, + "grad_norm": 0.1268034428358078, + "learning_rate": 1.3724688279301748e-05, + "loss": 0.0435, + "step": 12590 + }, + { + "epoch": 1.571072319201995, + "grad_norm": 9.81682014465332, + "learning_rate": 1.3719700748129676e-05, + "loss": 0.04, + "step": 12600 + }, + { + "epoch": 1.5723192019950125, + "grad_norm": 0.034833166748285294, + "learning_rate": 
1.3714713216957606e-05, + "loss": 0.0007, + "step": 12610 + }, + { + "epoch": 1.57356608478803, + "grad_norm": 0.01569850742816925, + "learning_rate": 1.3709725685785538e-05, + "loss": 0.041, + "step": 12620 + }, + { + "epoch": 1.5748129675810474, + "grad_norm": 0.0046609025448560715, + "learning_rate": 1.3704738154613469e-05, + "loss": 0.0856, + "step": 12630 + }, + { + "epoch": 1.5760598503740648, + "grad_norm": 0.024893207475543022, + "learning_rate": 1.3699750623441396e-05, + "loss": 0.0223, + "step": 12640 + }, + { + "epoch": 1.5773067331670823, + "grad_norm": 0.09220171719789505, + "learning_rate": 1.3694763092269327e-05, + "loss": 0.0718, + "step": 12650 + }, + { + "epoch": 1.5785536159600997, + "grad_norm": 0.04910089075565338, + "learning_rate": 1.3689775561097258e-05, + "loss": 0.0545, + "step": 12660 + }, + { + "epoch": 1.5798004987531171, + "grad_norm": 0.01659906841814518, + "learning_rate": 1.3684788029925188e-05, + "loss": 0.03, + "step": 12670 + }, + { + "epoch": 1.5810473815461346, + "grad_norm": 0.02599448524415493, + "learning_rate": 1.3679800498753117e-05, + "loss": 0.0515, + "step": 12680 + }, + { + "epoch": 1.582294264339152, + "grad_norm": 0.030495228245854378, + "learning_rate": 1.3674812967581048e-05, + "loss": 0.0084, + "step": 12690 + }, + { + "epoch": 1.5835411471321694, + "grad_norm": 0.06779367476701736, + "learning_rate": 1.3669825436408978e-05, + "loss": 0.0029, + "step": 12700 + }, + { + "epoch": 1.584788029925187, + "grad_norm": 0.02692810632288456, + "learning_rate": 1.3664837905236909e-05, + "loss": 0.1027, + "step": 12710 + }, + { + "epoch": 1.5860349127182045, + "grad_norm": 42.45810317993164, + "learning_rate": 1.3659850374064838e-05, + "loss": 0.0173, + "step": 12720 + }, + { + "epoch": 1.587281795511222, + "grad_norm": 0.17942014336585999, + "learning_rate": 1.3654862842892769e-05, + "loss": 0.0006, + "step": 12730 + }, + { + "epoch": 1.5885286783042394, + "grad_norm": 0.03577357530593872, + "learning_rate": 
1.3649875311720699e-05, + "loss": 0.001, + "step": 12740 + }, + { + "epoch": 1.5897755610972568, + "grad_norm": 0.2650788724422455, + "learning_rate": 1.364488778054863e-05, + "loss": 0.0389, + "step": 12750 + }, + { + "epoch": 1.5910224438902745, + "grad_norm": 0.018843036144971848, + "learning_rate": 1.3639900249376559e-05, + "loss": 0.0141, + "step": 12760 + }, + { + "epoch": 1.592269326683292, + "grad_norm": 0.693320631980896, + "learning_rate": 1.363491271820449e-05, + "loss": 0.0018, + "step": 12770 + }, + { + "epoch": 1.5935162094763093, + "grad_norm": 0.0032859332859516144, + "learning_rate": 1.362992518703242e-05, + "loss": 0.0008, + "step": 12780 + }, + { + "epoch": 1.5947630922693268, + "grad_norm": 0.018069753423333168, + "learning_rate": 1.362493765586035e-05, + "loss": 0.0003, + "step": 12790 + }, + { + "epoch": 1.5960099750623442, + "grad_norm": 0.003422915004193783, + "learning_rate": 1.3619950124688281e-05, + "loss": 0.0004, + "step": 12800 + }, + { + "epoch": 1.5972568578553616, + "grad_norm": 71.4630355834961, + "learning_rate": 1.361496259351621e-05, + "loss": 0.0492, + "step": 12810 + }, + { + "epoch": 1.598503740648379, + "grad_norm": 0.07465198636054993, + "learning_rate": 1.360997506234414e-05, + "loss": 0.0666, + "step": 12820 + }, + { + "epoch": 1.5997506234413965, + "grad_norm": 0.005183480214327574, + "learning_rate": 1.3604987531172071e-05, + "loss": 0.0027, + "step": 12830 + }, + { + "epoch": 1.600997506234414, + "grad_norm": 20.43458366394043, + "learning_rate": 1.3600000000000002e-05, + "loss": 0.0305, + "step": 12840 + }, + { + "epoch": 1.6022443890274314, + "grad_norm": 0.0220947228372097, + "learning_rate": 1.3595012468827931e-05, + "loss": 0.0226, + "step": 12850 + }, + { + "epoch": 1.6034912718204488, + "grad_norm": 0.0029411078430712223, + "learning_rate": 1.3590024937655861e-05, + "loss": 0.0654, + "step": 12860 + }, + { + "epoch": 1.6047381546134662, + "grad_norm": 0.0013770603109151125, + "learning_rate": 
1.3585037406483792e-05, + "loss": 0.0466, + "step": 12870 + }, + { + "epoch": 1.6059850374064837, + "grad_norm": 0.35909491777420044, + "learning_rate": 1.3580049875311723e-05, + "loss": 0.0055, + "step": 12880 + }, + { + "epoch": 1.6072319201995011, + "grad_norm": 0.06752067804336548, + "learning_rate": 1.3575062344139652e-05, + "loss": 0.0771, + "step": 12890 + }, + { + "epoch": 1.6084788029925186, + "grad_norm": 0.002513469662517309, + "learning_rate": 1.3570074812967582e-05, + "loss": 0.0009, + "step": 12900 + }, + { + "epoch": 1.6097256857855362, + "grad_norm": 0.00214858609251678, + "learning_rate": 1.3565087281795513e-05, + "loss": 0.0031, + "step": 12910 + }, + { + "epoch": 1.6109725685785536, + "grad_norm": 0.0014683667104691267, + "learning_rate": 1.3560099750623443e-05, + "loss": 0.0013, + "step": 12920 + }, + { + "epoch": 1.612219451371571, + "grad_norm": 0.3527969419956207, + "learning_rate": 1.3555112219451372e-05, + "loss": 0.076, + "step": 12930 + }, + { + "epoch": 1.6134663341645885, + "grad_norm": 5.842223167419434, + "learning_rate": 1.3550124688279303e-05, + "loss": 0.0978, + "step": 12940 + }, + { + "epoch": 1.614713216957606, + "grad_norm": 7.368631362915039, + "learning_rate": 1.3545137157107234e-05, + "loss": 0.0027, + "step": 12950 + }, + { + "epoch": 1.6159600997506236, + "grad_norm": 1.2361329793930054, + "learning_rate": 1.3540149625935164e-05, + "loss": 0.018, + "step": 12960 + }, + { + "epoch": 1.617206982543641, + "grad_norm": 4.243699550628662, + "learning_rate": 1.3535162094763093e-05, + "loss": 0.0365, + "step": 12970 + }, + { + "epoch": 1.6184538653366585, + "grad_norm": 0.009675173088908195, + "learning_rate": 1.3530174563591024e-05, + "loss": 0.0002, + "step": 12980 + }, + { + "epoch": 1.619700748129676, + "grad_norm": 6.819517612457275, + "learning_rate": 1.3525187032418954e-05, + "loss": 0.106, + "step": 12990 + }, + { + "epoch": 1.6209476309226933, + "grad_norm": 0.08170343935489655, + "learning_rate": 1.3520199501246885e-05, 
+ "loss": 0.0457, + "step": 13000 + }, + { + "epoch": 1.6221945137157108, + "grad_norm": 0.0533466599881649, + "learning_rate": 1.3515211970074812e-05, + "loss": 0.0109, + "step": 13010 + }, + { + "epoch": 1.6234413965087282, + "grad_norm": 0.22970125079154968, + "learning_rate": 1.3510224438902743e-05, + "loss": 0.0215, + "step": 13020 + }, + { + "epoch": 1.6246882793017456, + "grad_norm": 0.006124209146946669, + "learning_rate": 1.3505236907730675e-05, + "loss": 0.0003, + "step": 13030 + }, + { + "epoch": 1.625935162094763, + "grad_norm": 0.005711245816200972, + "learning_rate": 1.3500249376558606e-05, + "loss": 0.0916, + "step": 13040 + }, + { + "epoch": 1.6271820448877805, + "grad_norm": 0.06058807671070099, + "learning_rate": 1.3495261845386536e-05, + "loss": 0.0927, + "step": 13050 + }, + { + "epoch": 1.628428927680798, + "grad_norm": 0.021231012418866158, + "learning_rate": 1.3490274314214464e-05, + "loss": 0.0043, + "step": 13060 + }, + { + "epoch": 1.6296758104738154, + "grad_norm": 0.044012606143951416, + "learning_rate": 1.3485286783042394e-05, + "loss": 0.0078, + "step": 13070 + }, + { + "epoch": 1.6309226932668328, + "grad_norm": 0.0040282756090164185, + "learning_rate": 1.3480299251870325e-05, + "loss": 0.0294, + "step": 13080 + }, + { + "epoch": 1.6321695760598502, + "grad_norm": 13.578664779663086, + "learning_rate": 1.3475311720698257e-05, + "loss": 0.0791, + "step": 13090 + }, + { + "epoch": 1.6334164588528677, + "grad_norm": 0.018825553357601166, + "learning_rate": 1.3470324189526184e-05, + "loss": 0.0313, + "step": 13100 + }, + { + "epoch": 1.6346633416458853, + "grad_norm": 0.18112289905548096, + "learning_rate": 1.3465336658354115e-05, + "loss": 0.0004, + "step": 13110 + }, + { + "epoch": 1.6359102244389028, + "grad_norm": 0.03131983429193497, + "learning_rate": 1.3460349127182046e-05, + "loss": 0.0008, + "step": 13120 + }, + { + "epoch": 1.6371571072319202, + "grad_norm": 6.653219699859619, + "learning_rate": 1.3455361596009976e-05, + "loss": 
0.0314, + "step": 13130 + }, + { + "epoch": 1.6384039900249376, + "grad_norm": 1.7269296646118164, + "learning_rate": 1.3450374064837905e-05, + "loss": 0.0793, + "step": 13140 + }, + { + "epoch": 1.639650872817955, + "grad_norm": 0.011295067146420479, + "learning_rate": 1.3445386533665836e-05, + "loss": 0.0018, + "step": 13150 + }, + { + "epoch": 1.6408977556109727, + "grad_norm": 0.5535463690757751, + "learning_rate": 1.3440399002493767e-05, + "loss": 0.0448, + "step": 13160 + }, + { + "epoch": 1.6421446384039902, + "grad_norm": 0.045436304062604904, + "learning_rate": 1.3435411471321697e-05, + "loss": 0.0146, + "step": 13170 + }, + { + "epoch": 1.6433915211970076, + "grad_norm": 0.12149259448051453, + "learning_rate": 1.3430423940149626e-05, + "loss": 0.0252, + "step": 13180 + }, + { + "epoch": 1.644638403990025, + "grad_norm": 0.10686697065830231, + "learning_rate": 1.3425436408977557e-05, + "loss": 0.0019, + "step": 13190 + }, + { + "epoch": 1.6458852867830425, + "grad_norm": 0.021616408601403236, + "learning_rate": 1.3420448877805487e-05, + "loss": 0.0019, + "step": 13200 + }, + { + "epoch": 1.64713216957606, + "grad_norm": 41.76093673706055, + "learning_rate": 1.3415461346633418e-05, + "loss": 0.018, + "step": 13210 + }, + { + "epoch": 1.6483790523690773, + "grad_norm": 0.0063141207210719585, + "learning_rate": 1.3410473815461347e-05, + "loss": 0.0422, + "step": 13220 + }, + { + "epoch": 1.6496259351620948, + "grad_norm": 19.624000549316406, + "learning_rate": 1.3405486284289277e-05, + "loss": 0.0317, + "step": 13230 + }, + { + "epoch": 1.6508728179551122, + "grad_norm": 0.09289072453975677, + "learning_rate": 1.3400498753117208e-05, + "loss": 0.053, + "step": 13240 + }, + { + "epoch": 1.6521197007481296, + "grad_norm": 0.008266476914286613, + "learning_rate": 1.3395511221945139e-05, + "loss": 0.0634, + "step": 13250 + }, + { + "epoch": 1.653366583541147, + "grad_norm": 0.05262904614210129, + "learning_rate": 1.3390523690773068e-05, + "loss": 0.0002, + 
"step": 13260 + }, + { + "epoch": 1.6546134663341645, + "grad_norm": 0.014744916930794716, + "learning_rate": 1.3385536159600998e-05, + "loss": 0.0099, + "step": 13270 + }, + { + "epoch": 1.655860349127182, + "grad_norm": 0.012456094846129417, + "learning_rate": 1.3380548628428929e-05, + "loss": 0.0959, + "step": 13280 + }, + { + "epoch": 1.6571072319201994, + "grad_norm": 0.3287002742290497, + "learning_rate": 1.337556109725686e-05, + "loss": 0.065, + "step": 13290 + }, + { + "epoch": 1.6583541147132168, + "grad_norm": 15.193024635314941, + "learning_rate": 1.337057356608479e-05, + "loss": 0.0479, + "step": 13300 + }, + { + "epoch": 1.6596009975062345, + "grad_norm": 26.678428649902344, + "learning_rate": 1.3365586034912719e-05, + "loss": 0.0896, + "step": 13310 + }, + { + "epoch": 1.660847880299252, + "grad_norm": 0.01345492247492075, + "learning_rate": 1.336059850374065e-05, + "loss": 0.0075, + "step": 13320 + }, + { + "epoch": 1.6620947630922693, + "grad_norm": 0.003237128956243396, + "learning_rate": 1.335561097256858e-05, + "loss": 0.0008, + "step": 13330 + }, + { + "epoch": 1.6633416458852868, + "grad_norm": 0.012729836627840996, + "learning_rate": 1.3350623441396511e-05, + "loss": 0.105, + "step": 13340 + }, + { + "epoch": 1.6645885286783042, + "grad_norm": 0.001644920208491385, + "learning_rate": 1.334563591022444e-05, + "loss": 0.0701, + "step": 13350 + }, + { + "epoch": 1.6658354114713219, + "grad_norm": 20.716304779052734, + "learning_rate": 1.334064837905237e-05, + "loss": 0.0816, + "step": 13360 + }, + { + "epoch": 1.6670822942643393, + "grad_norm": 0.09023711085319519, + "learning_rate": 1.3335660847880301e-05, + "loss": 0.1163, + "step": 13370 + }, + { + "epoch": 1.6683291770573567, + "grad_norm": 8.607697486877441, + "learning_rate": 1.3330673316708232e-05, + "loss": 0.067, + "step": 13380 + }, + { + "epoch": 1.6695760598503742, + "grad_norm": 0.024321427568793297, + "learning_rate": 1.332568578553616e-05, + "loss": 0.1202, + "step": 13390 + }, + { 
+ "epoch": 1.6708229426433916, + "grad_norm": 0.2328629195690155, + "learning_rate": 1.3320698254364091e-05, + "loss": 0.0063, + "step": 13400 + }, + { + "epoch": 1.672069825436409, + "grad_norm": 0.053208332508802414, + "learning_rate": 1.3315710723192022e-05, + "loss": 0.0353, + "step": 13410 + }, + { + "epoch": 1.6733167082294265, + "grad_norm": 0.006619223393499851, + "learning_rate": 1.3310723192019952e-05, + "loss": 0.0316, + "step": 13420 + }, + { + "epoch": 1.674563591022444, + "grad_norm": 0.006179279647767544, + "learning_rate": 1.330573566084788e-05, + "loss": 0.0117, + "step": 13430 + }, + { + "epoch": 1.6758104738154613, + "grad_norm": 4.701325416564941, + "learning_rate": 1.3300748129675812e-05, + "loss": 0.0027, + "step": 13440 + }, + { + "epoch": 1.6770573566084788, + "grad_norm": 2.834035634994507, + "learning_rate": 1.3296259351620949e-05, + "loss": 0.0534, + "step": 13450 + }, + { + "epoch": 1.6783042394014962, + "grad_norm": 0.005688248202204704, + "learning_rate": 1.3291271820448879e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 1.6795511221945136, + "grad_norm": 0.11691449582576752, + "learning_rate": 1.328628428927681e-05, + "loss": 0.0756, + "step": 13470 + }, + { + "epoch": 1.680798004987531, + "grad_norm": 0.005437224637717009, + "learning_rate": 1.328129675810474e-05, + "loss": 0.0323, + "step": 13480 + }, + { + "epoch": 1.6820448877805485, + "grad_norm": 0.0028406723868101835, + "learning_rate": 1.327630922693267e-05, + "loss": 0.0679, + "step": 13490 + }, + { + "epoch": 1.683291770573566, + "grad_norm": 1.3356423377990723, + "learning_rate": 1.32713216957606e-05, + "loss": 0.0044, + "step": 13500 + }, + { + "epoch": 1.6845386533665836, + "grad_norm": 9.16796875, + "learning_rate": 1.326633416458853e-05, + "loss": 0.044, + "step": 13510 + }, + { + "epoch": 1.685785536159601, + "grad_norm": 0.0038144674617797136, + "learning_rate": 1.3261346633416461e-05, + "loss": 0.0342, + "step": 13520 + }, + { + "epoch": 
1.6870324189526185, + "grad_norm": 0.0027151876129209995, + "learning_rate": 1.325635910224439e-05, + "loss": 0.077, + "step": 13530 + }, + { + "epoch": 1.688279301745636, + "grad_norm": 47.315982818603516, + "learning_rate": 1.325137157107232e-05, + "loss": 0.0227, + "step": 13540 + }, + { + "epoch": 1.6895261845386533, + "grad_norm": 0.04255508631467819, + "learning_rate": 1.3246384039900251e-05, + "loss": 0.0003, + "step": 13550 + }, + { + "epoch": 1.690773067331671, + "grad_norm": 3.864309310913086, + "learning_rate": 1.3241396508728182e-05, + "loss": 0.0153, + "step": 13560 + }, + { + "epoch": 1.6920199501246884, + "grad_norm": 0.44293212890625, + "learning_rate": 1.323640897755611e-05, + "loss": 0.0138, + "step": 13570 + }, + { + "epoch": 1.6932668329177059, + "grad_norm": 9.212722778320312, + "learning_rate": 1.3231421446384041e-05, + "loss": 0.084, + "step": 13580 + }, + { + "epoch": 1.6945137157107233, + "grad_norm": 0.07577641308307648, + "learning_rate": 1.3226433915211972e-05, + "loss": 0.0004, + "step": 13590 + }, + { + "epoch": 1.6957605985037407, + "grad_norm": 14.254758834838867, + "learning_rate": 1.3221446384039903e-05, + "loss": 0.0978, + "step": 13600 + }, + { + "epoch": 1.6970074812967582, + "grad_norm": 0.5562378168106079, + "learning_rate": 1.321645885286783e-05, + "loss": 0.0175, + "step": 13610 + }, + { + "epoch": 1.6982543640897756, + "grad_norm": 0.31031671166419983, + "learning_rate": 1.321147132169576e-05, + "loss": 0.0909, + "step": 13620 + }, + { + "epoch": 1.699501246882793, + "grad_norm": 0.018233343958854675, + "learning_rate": 1.3206483790523691e-05, + "loss": 0.0012, + "step": 13630 + }, + { + "epoch": 1.7007481296758105, + "grad_norm": 0.01966504380106926, + "learning_rate": 1.3201496259351624e-05, + "loss": 0.0036, + "step": 13640 + }, + { + "epoch": 1.701995012468828, + "grad_norm": 0.004362176638096571, + "learning_rate": 1.319650872817955e-05, + "loss": 0.045, + "step": 13650 + }, + { + "epoch": 1.7032418952618453, + 
"grad_norm": 0.02185465581715107, + "learning_rate": 1.3191521197007481e-05, + "loss": 0.0368, + "step": 13660 + }, + { + "epoch": 1.7044887780548628, + "grad_norm": 0.01592567190527916, + "learning_rate": 1.3186533665835412e-05, + "loss": 0.0017, + "step": 13670 + }, + { + "epoch": 1.7057356608478802, + "grad_norm": 0.002704000100493431, + "learning_rate": 1.3181546134663343e-05, + "loss": 0.0002, + "step": 13680 + }, + { + "epoch": 1.7069825436408976, + "grad_norm": 0.01973789371550083, + "learning_rate": 1.3176558603491272e-05, + "loss": 0.2209, + "step": 13690 + }, + { + "epoch": 1.708229426433915, + "grad_norm": 0.7285020351409912, + "learning_rate": 1.3171571072319202e-05, + "loss": 0.0378, + "step": 13700 + }, + { + "epoch": 1.7094763092269327, + "grad_norm": 0.01300446130335331, + "learning_rate": 1.3166583541147133e-05, + "loss": 0.0008, + "step": 13710 + }, + { + "epoch": 1.7107231920199502, + "grad_norm": 0.048056092113256454, + "learning_rate": 1.3161596009975063e-05, + "loss": 0.0412, + "step": 13720 + }, + { + "epoch": 1.7119700748129676, + "grad_norm": 0.01191894430667162, + "learning_rate": 1.3156608478802994e-05, + "loss": 0.0009, + "step": 13730 + }, + { + "epoch": 1.713216957605985, + "grad_norm": 0.01139442902058363, + "learning_rate": 1.3151620947630923e-05, + "loss": 0.0161, + "step": 13740 + }, + { + "epoch": 1.7144638403990025, + "grad_norm": 0.028019532561302185, + "learning_rate": 1.3146633416458854e-05, + "loss": 0.0103, + "step": 13750 + }, + { + "epoch": 1.7157107231920201, + "grad_norm": 0.05028171464800835, + "learning_rate": 1.3141645885286784e-05, + "loss": 0.0328, + "step": 13760 + }, + { + "epoch": 1.7169576059850375, + "grad_norm": 0.002151243155822158, + "learning_rate": 1.3136658354114715e-05, + "loss": 0.0004, + "step": 13770 + }, + { + "epoch": 1.718204488778055, + "grad_norm": 0.06736582517623901, + "learning_rate": 1.3131670822942644e-05, + "loss": 0.0813, + "step": 13780 + }, + { + "epoch": 1.7194513715710724, + 
"grad_norm": 0.1269422173500061, + "learning_rate": 1.3126683291770574e-05, + "loss": 0.0327, + "step": 13790 + }, + { + "epoch": 1.7206982543640899, + "grad_norm": 0.00913538970053196, + "learning_rate": 1.3121695760598505e-05, + "loss": 0.0058, + "step": 13800 + }, + { + "epoch": 1.7219451371571073, + "grad_norm": 0.0841362401843071, + "learning_rate": 1.3116708229426436e-05, + "loss": 0.0288, + "step": 13810 + }, + { + "epoch": 1.7231920199501247, + "grad_norm": 0.007267114706337452, + "learning_rate": 1.3111720698254365e-05, + "loss": 0.036, + "step": 13820 + }, + { + "epoch": 1.7244389027431422, + "grad_norm": 0.013743101619184017, + "learning_rate": 1.3106733167082295e-05, + "loss": 0.1493, + "step": 13830 + }, + { + "epoch": 1.7256857855361596, + "grad_norm": 0.0051359133794903755, + "learning_rate": 1.3101745635910226e-05, + "loss": 0.0403, + "step": 13840 + }, + { + "epoch": 1.726932668329177, + "grad_norm": 0.01619003526866436, + "learning_rate": 1.3096758104738156e-05, + "loss": 0.0269, + "step": 13850 + }, + { + "epoch": 1.7281795511221945, + "grad_norm": 0.017638299614191055, + "learning_rate": 1.3091770573566085e-05, + "loss": 0.0004, + "step": 13860 + }, + { + "epoch": 1.729426433915212, + "grad_norm": 25.916101455688477, + "learning_rate": 1.3086783042394016e-05, + "loss": 0.0045, + "step": 13870 + }, + { + "epoch": 1.7306733167082293, + "grad_norm": 0.033160511404275894, + "learning_rate": 1.3081795511221947e-05, + "loss": 0.0809, + "step": 13880 + }, + { + "epoch": 1.7319201995012468, + "grad_norm": 0.00405894685536623, + "learning_rate": 1.3076807980049877e-05, + "loss": 0.0233, + "step": 13890 + }, + { + "epoch": 1.7331670822942642, + "grad_norm": 0.003699847497045994, + "learning_rate": 1.3071820448877806e-05, + "loss": 0.0149, + "step": 13900 + }, + { + "epoch": 1.7344139650872819, + "grad_norm": 0.003829776542261243, + "learning_rate": 1.3066832917705737e-05, + "loss": 0.0105, + "step": 13910 + }, + { + "epoch": 1.7356608478802993, + 
"grad_norm": 0.0024333177134394646, + "learning_rate": 1.3061845386533667e-05, + "loss": 0.0835, + "step": 13920 + }, + { + "epoch": 1.7369077306733167, + "grad_norm": 0.0065351747907698154, + "learning_rate": 1.3056857855361598e-05, + "loss": 0.0005, + "step": 13930 + }, + { + "epoch": 1.7381546134663342, + "grad_norm": 0.6707267165184021, + "learning_rate": 1.3051870324189527e-05, + "loss": 0.002, + "step": 13940 + }, + { + "epoch": 1.7394014962593516, + "grad_norm": 0.0017735377186909318, + "learning_rate": 1.3046882793017457e-05, + "loss": 0.0132, + "step": 13950 + }, + { + "epoch": 1.7406483790523692, + "grad_norm": 0.004138738848268986, + "learning_rate": 1.3041895261845388e-05, + "loss": 0.0674, + "step": 13960 + }, + { + "epoch": 1.7418952618453867, + "grad_norm": 0.003494670381769538, + "learning_rate": 1.3036907730673319e-05, + "loss": 0.0539, + "step": 13970 + }, + { + "epoch": 1.7431421446384041, + "grad_norm": 0.015017693862318993, + "learning_rate": 1.303192019950125e-05, + "loss": 0.0721, + "step": 13980 + }, + { + "epoch": 1.7443890274314215, + "grad_norm": 0.008463400416076183, + "learning_rate": 1.3026932668329178e-05, + "loss": 0.0173, + "step": 13990 + }, + { + "epoch": 1.745635910224439, + "grad_norm": 0.008293639868497849, + "learning_rate": 1.3021945137157109e-05, + "loss": 0.0527, + "step": 14000 + }, + { + "epoch": 1.7468827930174564, + "grad_norm": 0.012015490792691708, + "learning_rate": 1.301695760598504e-05, + "loss": 0.0784, + "step": 14010 + }, + { + "epoch": 1.7481296758104738, + "grad_norm": 0.009153264574706554, + "learning_rate": 1.301197007481297e-05, + "loss": 0.0005, + "step": 14020 + }, + { + "epoch": 1.7493765586034913, + "grad_norm": 0.014061570167541504, + "learning_rate": 1.3006982543640897e-05, + "loss": 0.0163, + "step": 14030 + }, + { + "epoch": 1.7506234413965087, + "grad_norm": 0.09979971498250961, + "learning_rate": 1.3001995012468828e-05, + "loss": 0.0026, + "step": 14040 + }, + { + "epoch": 1.7518703241895262, + 
"grad_norm": 0.013468071818351746, + "learning_rate": 1.299700748129676e-05, + "loss": 0.0016, + "step": 14050 + }, + { + "epoch": 1.7531172069825436, + "grad_norm": 0.037439607083797455, + "learning_rate": 1.2992019950124691e-05, + "loss": 0.0009, + "step": 14060 + }, + { + "epoch": 1.754364089775561, + "grad_norm": 0.004306161310523748, + "learning_rate": 1.2987032418952618e-05, + "loss": 0.0004, + "step": 14070 + }, + { + "epoch": 1.7556109725685785, + "grad_norm": 0.009254061616957188, + "learning_rate": 1.2982044887780549e-05, + "loss": 0.0003, + "step": 14080 + }, + { + "epoch": 1.7568578553615959, + "grad_norm": 0.002783331088721752, + "learning_rate": 1.297705735660848e-05, + "loss": 0.0178, + "step": 14090 + }, + { + "epoch": 1.7581047381546133, + "grad_norm": 2.467909097671509, + "learning_rate": 1.297206982543641e-05, + "loss": 0.099, + "step": 14100 + }, + { + "epoch": 1.7593516209476308, + "grad_norm": 0.0018527753418311477, + "learning_rate": 1.2967082294264339e-05, + "loss": 0.0005, + "step": 14110 + }, + { + "epoch": 1.7605985037406484, + "grad_norm": 0.011839174665510654, + "learning_rate": 1.296209476309227e-05, + "loss": 0.0006, + "step": 14120 + }, + { + "epoch": 1.7618453865336658, + "grad_norm": 0.2751910388469696, + "learning_rate": 1.29571072319202e-05, + "loss": 0.0003, + "step": 14130 + }, + { + "epoch": 1.7630922693266833, + "grad_norm": 0.005288688465952873, + "learning_rate": 1.295211970074813e-05, + "loss": 0.0527, + "step": 14140 + }, + { + "epoch": 1.7643391521197007, + "grad_norm": 0.0022090657148510218, + "learning_rate": 1.294713216957606e-05, + "loss": 0.043, + "step": 14150 + }, + { + "epoch": 1.7655860349127181, + "grad_norm": 0.07446596771478653, + "learning_rate": 1.294214463840399e-05, + "loss": 0.0411, + "step": 14160 + }, + { + "epoch": 1.7668329177057358, + "grad_norm": 15.950480461120605, + "learning_rate": 1.2937157107231921e-05, + "loss": 0.0487, + "step": 14170 + }, + { + "epoch": 1.7680798004987532, + "grad_norm": 
0.001634975546039641, + "learning_rate": 1.2932169576059852e-05, + "loss": 0.0003, + "step": 14180 + }, + { + "epoch": 1.7693266832917707, + "grad_norm": 0.007447084411978722, + "learning_rate": 1.292718204488778e-05, + "loss": 0.0424, + "step": 14190 + }, + { + "epoch": 1.770573566084788, + "grad_norm": 27.95421600341797, + "learning_rate": 1.2922194513715711e-05, + "loss": 0.0173, + "step": 14200 + }, + { + "epoch": 1.7718204488778055, + "grad_norm": 0.0031554321758449078, + "learning_rate": 1.2917206982543642e-05, + "loss": 0.0199, + "step": 14210 + }, + { + "epoch": 1.773067331670823, + "grad_norm": 0.0069986735470592976, + "learning_rate": 1.2912219451371572e-05, + "loss": 0.0115, + "step": 14220 + }, + { + "epoch": 1.7743142144638404, + "grad_norm": 0.01298275962471962, + "learning_rate": 1.2907231920199503e-05, + "loss": 0.0625, + "step": 14230 + }, + { + "epoch": 1.7755610972568578, + "grad_norm": 0.01622236892580986, + "learning_rate": 1.2902244389027432e-05, + "loss": 0.008, + "step": 14240 + }, + { + "epoch": 1.7768079800498753, + "grad_norm": 0.012705344706773758, + "learning_rate": 1.2897256857855363e-05, + "loss": 0.0186, + "step": 14250 + }, + { + "epoch": 1.7780548628428927, + "grad_norm": 21.812990188598633, + "learning_rate": 1.2892269326683293e-05, + "loss": 0.0209, + "step": 14260 + }, + { + "epoch": 1.7793017456359101, + "grad_norm": 0.06607460975646973, + "learning_rate": 1.2887281795511224e-05, + "loss": 0.0421, + "step": 14270 + }, + { + "epoch": 1.7805486284289276, + "grad_norm": 0.559027910232544, + "learning_rate": 1.2882294264339153e-05, + "loss": 0.0309, + "step": 14280 + }, + { + "epoch": 1.781795511221945, + "grad_norm": 0.007746709045022726, + "learning_rate": 1.2877306733167083e-05, + "loss": 0.0147, + "step": 14290 + }, + { + "epoch": 1.7830423940149625, + "grad_norm": 0.002600023988634348, + "learning_rate": 1.2872319201995014e-05, + "loss": 0.0383, + "step": 14300 + }, + { + "epoch": 1.7842892768079799, + "grad_norm": 
0.004319756757467985, + "learning_rate": 1.2867331670822945e-05, + "loss": 0.0005, + "step": 14310 + }, + { + "epoch": 1.7855361596009975, + "grad_norm": 0.004818111192435026, + "learning_rate": 1.2862344139650873e-05, + "loss": 0.0221, + "step": 14320 + }, + { + "epoch": 1.786783042394015, + "grad_norm": 0.007039282936602831, + "learning_rate": 1.2857356608478804e-05, + "loss": 0.0003, + "step": 14330 + }, + { + "epoch": 1.7880299251870324, + "grad_norm": 0.022477056831121445, + "learning_rate": 1.2852369077306735e-05, + "loss": 0.0019, + "step": 14340 + }, + { + "epoch": 1.7892768079800498, + "grad_norm": 0.12508761882781982, + "learning_rate": 1.2847381546134665e-05, + "loss": 0.0665, + "step": 14350 + }, + { + "epoch": 1.7905236907730673, + "grad_norm": 1.3086934089660645, + "learning_rate": 1.2842394014962594e-05, + "loss": 0.0309, + "step": 14360 + }, + { + "epoch": 1.791770573566085, + "grad_norm": 0.00528644397854805, + "learning_rate": 1.2837406483790525e-05, + "loss": 0.0002, + "step": 14370 + }, + { + "epoch": 1.7930174563591024, + "grad_norm": 3.16678524017334, + "learning_rate": 1.2832418952618455e-05, + "loss": 0.052, + "step": 14380 + }, + { + "epoch": 1.7942643391521198, + "grad_norm": 0.11913002282381058, + "learning_rate": 1.2827431421446386e-05, + "loss": 0.0003, + "step": 14390 + }, + { + "epoch": 1.7955112219451372, + "grad_norm": 16.50526237487793, + "learning_rate": 1.2822443890274315e-05, + "loss": 0.0356, + "step": 14400 + }, + { + "epoch": 1.7967581047381547, + "grad_norm": 0.0016488219844177365, + "learning_rate": 1.2817456359102246e-05, + "loss": 0.0841, + "step": 14410 + }, + { + "epoch": 1.798004987531172, + "grad_norm": 36.24075698852539, + "learning_rate": 1.2812468827930176e-05, + "loss": 0.0096, + "step": 14420 + }, + { + "epoch": 1.7992518703241895, + "grad_norm": 7.698852062225342, + "learning_rate": 1.2807481296758107e-05, + "loss": 0.0015, + "step": 14430 + }, + { + "epoch": 1.800498753117207, + "grad_norm": 
0.0017419640207663178, + "learning_rate": 1.2802493765586034e-05, + "loss": 0.0724, + "step": 14440 + }, + { + "epoch": 1.8017456359102244, + "grad_norm": 12.817113876342773, + "learning_rate": 1.2797506234413965e-05, + "loss": 0.0966, + "step": 14450 + }, + { + "epoch": 1.8029925187032418, + "grad_norm": 0.006786948535591364, + "learning_rate": 1.2792518703241897e-05, + "loss": 0.0309, + "step": 14460 + }, + { + "epoch": 1.8042394014962593, + "grad_norm": 0.046667564660310745, + "learning_rate": 1.2787531172069828e-05, + "loss": 0.0011, + "step": 14470 + }, + { + "epoch": 1.8054862842892767, + "grad_norm": 18.406494140625, + "learning_rate": 1.2782543640897758e-05, + "loss": 0.0441, + "step": 14480 + }, + { + "epoch": 1.8067331670822941, + "grad_norm": 0.14043211936950684, + "learning_rate": 1.2777556109725686e-05, + "loss": 0.0286, + "step": 14490 + }, + { + "epoch": 1.8079800498753116, + "grad_norm": 0.0052274251356720924, + "learning_rate": 1.2772568578553616e-05, + "loss": 0.0556, + "step": 14500 + }, + { + "epoch": 1.809226932668329, + "grad_norm": 0.038864616304636, + "learning_rate": 1.2767581047381547e-05, + "loss": 0.0005, + "step": 14510 + }, + { + "epoch": 1.8104738154613467, + "grad_norm": 0.011043121106922626, + "learning_rate": 1.2762593516209479e-05, + "loss": 0.0702, + "step": 14520 + }, + { + "epoch": 1.811720698254364, + "grad_norm": 0.34502744674682617, + "learning_rate": 1.2757605985037406e-05, + "loss": 0.0249, + "step": 14530 + }, + { + "epoch": 1.8129675810473815, + "grad_norm": 0.7066327929496765, + "learning_rate": 1.2752618453865337e-05, + "loss": 0.0004, + "step": 14540 + }, + { + "epoch": 1.814214463840399, + "grad_norm": 0.008470536209642887, + "learning_rate": 1.2747630922693268e-05, + "loss": 0.0475, + "step": 14550 + }, + { + "epoch": 1.8154613466334164, + "grad_norm": 0.02327272854745388, + "learning_rate": 1.2742643391521198e-05, + "loss": 0.0482, + "step": 14560 + }, + { + "epoch": 1.816708229426434, + "grad_norm": 
0.025802046060562134, + "learning_rate": 1.2737655860349127e-05, + "loss": 0.0017, + "step": 14570 + }, + { + "epoch": 1.8179551122194515, + "grad_norm": 0.004240632988512516, + "learning_rate": 1.2732668329177058e-05, + "loss": 0.0158, + "step": 14580 + }, + { + "epoch": 1.819201995012469, + "grad_norm": 0.03840193897485733, + "learning_rate": 1.2727680798004988e-05, + "loss": 0.0602, + "step": 14590 + }, + { + "epoch": 1.8204488778054864, + "grad_norm": 71.08118438720703, + "learning_rate": 1.2722693266832919e-05, + "loss": 0.0875, + "step": 14600 + }, + { + "epoch": 1.8216957605985038, + "grad_norm": 0.010022374801337719, + "learning_rate": 1.2717705735660848e-05, + "loss": 0.0301, + "step": 14610 + }, + { + "epoch": 1.8229426433915212, + "grad_norm": 41.78736877441406, + "learning_rate": 1.2712718204488778e-05, + "loss": 0.0334, + "step": 14620 + }, + { + "epoch": 1.8241895261845387, + "grad_norm": 0.00742789451032877, + "learning_rate": 1.2707730673316709e-05, + "loss": 0.0003, + "step": 14630 + }, + { + "epoch": 1.825436408977556, + "grad_norm": 0.013341421261429787, + "learning_rate": 1.270274314214464e-05, + "loss": 0.0258, + "step": 14640 + }, + { + "epoch": 1.8266832917705735, + "grad_norm": 0.02488613687455654, + "learning_rate": 1.2697755610972569e-05, + "loss": 0.0536, + "step": 14650 + }, + { + "epoch": 1.827930174563591, + "grad_norm": 0.0047700293362140656, + "learning_rate": 1.26927680798005e-05, + "loss": 0.0003, + "step": 14660 + }, + { + "epoch": 1.8291770573566084, + "grad_norm": 0.18093907833099365, + "learning_rate": 1.268778054862843e-05, + "loss": 0.0003, + "step": 14670 + }, + { + "epoch": 1.8304239401496258, + "grad_norm": 0.004082898609340191, + "learning_rate": 1.268279301745636e-05, + "loss": 0.0738, + "step": 14680 + }, + { + "epoch": 1.8316708229426433, + "grad_norm": 0.01219853200018406, + "learning_rate": 1.2677805486284291e-05, + "loss": 0.01, + "step": 14690 + }, + { + "epoch": 1.8329177057356607, + "grad_norm": 
0.012780722230672836, + "learning_rate": 1.267281795511222e-05, + "loss": 0.1225, + "step": 14700 + }, + { + "epoch": 1.8341645885286781, + "grad_norm": 0.011909456923604012, + "learning_rate": 1.266783042394015e-05, + "loss": 0.1042, + "step": 14710 + }, + { + "epoch": 1.8354114713216958, + "grad_norm": 0.010442191734910011, + "learning_rate": 1.2662842892768081e-05, + "loss": 0.0065, + "step": 14720 + }, + { + "epoch": 1.8366583541147132, + "grad_norm": 2.012455940246582, + "learning_rate": 1.2657855361596012e-05, + "loss": 0.0284, + "step": 14730 + }, + { + "epoch": 1.8379052369077307, + "grad_norm": 0.023272445425391197, + "learning_rate": 1.265286783042394e-05, + "loss": 0.0042, + "step": 14740 + }, + { + "epoch": 1.839152119700748, + "grad_norm": 0.013164395466446877, + "learning_rate": 1.2647880299251871e-05, + "loss": 0.0005, + "step": 14750 + }, + { + "epoch": 1.8403990024937655, + "grad_norm": 0.006373642943799496, + "learning_rate": 1.2642892768079802e-05, + "loss": 0.0036, + "step": 14760 + }, + { + "epoch": 1.8416458852867832, + "grad_norm": 0.021037481725215912, + "learning_rate": 1.2637905236907733e-05, + "loss": 0.0721, + "step": 14770 + }, + { + "epoch": 1.8428927680798006, + "grad_norm": 23.66687774658203, + "learning_rate": 1.2632917705735662e-05, + "loss": 0.0069, + "step": 14780 + }, + { + "epoch": 1.844139650872818, + "grad_norm": 0.4130190908908844, + "learning_rate": 1.2627930174563592e-05, + "loss": 0.0017, + "step": 14790 + }, + { + "epoch": 1.8453865336658355, + "grad_norm": 0.0021154251880943775, + "learning_rate": 1.2622942643391523e-05, + "loss": 0.0153, + "step": 14800 + }, + { + "epoch": 1.846633416458853, + "grad_norm": 0.003588662948459387, + "learning_rate": 1.2617955112219453e-05, + "loss": 0.0021, + "step": 14810 + }, + { + "epoch": 1.8478802992518704, + "grad_norm": 0.004530202131718397, + "learning_rate": 1.2612967581047382e-05, + "loss": 0.0238, + "step": 14820 + }, + { + "epoch": 1.8491271820448878, + "grad_norm": 
0.02361251227557659, + "learning_rate": 1.2607980049875313e-05, + "loss": 0.0414, + "step": 14830 + }, + { + "epoch": 1.8503740648379052, + "grad_norm": 0.012375129386782646, + "learning_rate": 1.2602992518703244e-05, + "loss": 0.0006, + "step": 14840 + }, + { + "epoch": 1.8516209476309227, + "grad_norm": 0.00249878759495914, + "learning_rate": 1.2598004987531174e-05, + "loss": 0.0198, + "step": 14850 + }, + { + "epoch": 1.85286783042394, + "grad_norm": 0.008120937272906303, + "learning_rate": 1.2593017456359101e-05, + "loss": 0.1278, + "step": 14860 + }, + { + "epoch": 1.8541147132169575, + "grad_norm": 1.1773933172225952, + "learning_rate": 1.2588029925187034e-05, + "loss": 0.0037, + "step": 14870 + }, + { + "epoch": 1.855361596009975, + "grad_norm": 0.10049566626548767, + "learning_rate": 1.2583042394014964e-05, + "loss": 0.0004, + "step": 14880 + }, + { + "epoch": 1.8566084788029924, + "grad_norm": 0.029792172834277153, + "learning_rate": 1.2578054862842895e-05, + "loss": 0.0048, + "step": 14890 + }, + { + "epoch": 1.8578553615960098, + "grad_norm": 12.35927963256836, + "learning_rate": 1.2573067331670822e-05, + "loss": 0.0016, + "step": 14900 + }, + { + "epoch": 1.8591022443890273, + "grad_norm": 0.00204622489400208, + "learning_rate": 1.2568079800498753e-05, + "loss": 0.0002, + "step": 14910 + }, + { + "epoch": 1.860349127182045, + "grad_norm": 0.014671762473881245, + "learning_rate": 1.2563092269326684e-05, + "loss": 0.0143, + "step": 14920 + }, + { + "epoch": 1.8615960099750624, + "grad_norm": 0.0035495534539222717, + "learning_rate": 1.2558104738154616e-05, + "loss": 0.0276, + "step": 14930 + }, + { + "epoch": 1.8628428927680798, + "grad_norm": 0.0024204726796597242, + "learning_rate": 1.2553117206982546e-05, + "loss": 0.0546, + "step": 14940 + }, + { + "epoch": 1.8640897755610972, + "grad_norm": 0.002721786266192794, + "learning_rate": 1.2548129675810474e-05, + "loss": 0.0434, + "step": 14950 + }, + { + "epoch": 1.8653366583541147, + "grad_norm": 
14.899277687072754, + "learning_rate": 1.2543142144638404e-05, + "loss": 0.0351, + "step": 14960 + }, + { + "epoch": 1.8665835411471323, + "grad_norm": 0.0012579448521137238, + "learning_rate": 1.2538154613466335e-05, + "loss": 0.0452, + "step": 14970 + }, + { + "epoch": 1.8678304239401498, + "grad_norm": 0.006773448083549738, + "learning_rate": 1.2533167082294266e-05, + "loss": 0.0017, + "step": 14980 + }, + { + "epoch": 1.8690773067331672, + "grad_norm": 0.0034950890112668276, + "learning_rate": 1.2528179551122194e-05, + "loss": 0.058, + "step": 14990 + }, + { + "epoch": 1.8703241895261846, + "grad_norm": 0.007557340431958437, + "learning_rate": 1.2523192019950125e-05, + "loss": 0.0452, + "step": 15000 + }, + { + "epoch": 1.871571072319202, + "grad_norm": 0.025978926569223404, + "learning_rate": 1.2518204488778056e-05, + "loss": 0.0006, + "step": 15010 + }, + { + "epoch": 1.8728179551122195, + "grad_norm": 0.003996451385319233, + "learning_rate": 1.2513216957605986e-05, + "loss": 0.0024, + "step": 15020 + }, + { + "epoch": 1.874064837905237, + "grad_norm": 0.0036994144320487976, + "learning_rate": 1.2508229426433915e-05, + "loss": 0.0314, + "step": 15030 + }, + { + "epoch": 1.8753117206982544, + "grad_norm": 0.004972072783857584, + "learning_rate": 1.2503241895261846e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 1.8765586034912718, + "grad_norm": 0.0011615961557254195, + "learning_rate": 1.2498254364089776e-05, + "loss": 0.0122, + "step": 15050 + }, + { + "epoch": 1.8778054862842892, + "grad_norm": 0.009926113300025463, + "learning_rate": 1.2493266832917707e-05, + "loss": 0.0533, + "step": 15060 + }, + { + "epoch": 1.8790523690773067, + "grad_norm": 0.0035635512322187424, + "learning_rate": 1.2488279301745636e-05, + "loss": 0.0677, + "step": 15070 + }, + { + "epoch": 1.880299251870324, + "grad_norm": 0.0073900381103158, + "learning_rate": 1.2483291770573567e-05, + "loss": 0.0003, + "step": 15080 + }, + { + "epoch": 1.8815461346633415, + "grad_norm": 
0.015382417477667332, + "learning_rate": 1.2478304239401497e-05, + "loss": 0.0003, + "step": 15090 + }, + { + "epoch": 1.882793017456359, + "grad_norm": 0.005023096688091755, + "learning_rate": 1.2473316708229428e-05, + "loss": 0.0011, + "step": 15100 + }, + { + "epoch": 1.8840399002493764, + "grad_norm": 0.0024068865459412336, + "learning_rate": 1.2468329177057357e-05, + "loss": 0.0302, + "step": 15110 + }, + { + "epoch": 1.885286783042394, + "grad_norm": 0.01790803112089634, + "learning_rate": 1.2463341645885287e-05, + "loss": 0.0005, + "step": 15120 + }, + { + "epoch": 1.8865336658354115, + "grad_norm": 0.002026566304266453, + "learning_rate": 1.2458354114713218e-05, + "loss": 0.0379, + "step": 15130 + }, + { + "epoch": 1.887780548628429, + "grad_norm": 0.05049702525138855, + "learning_rate": 1.2453366583541149e-05, + "loss": 0.0142, + "step": 15140 + }, + { + "epoch": 1.8890274314214464, + "grad_norm": 27.768470764160156, + "learning_rate": 1.2448379052369078e-05, + "loss": 0.0653, + "step": 15150 + }, + { + "epoch": 1.8902743142144638, + "grad_norm": 0.016509858891367912, + "learning_rate": 1.2443391521197008e-05, + "loss": 0.0102, + "step": 15160 + }, + { + "epoch": 1.8915211970074814, + "grad_norm": 0.9496610760688782, + "learning_rate": 1.2438403990024939e-05, + "loss": 0.0006, + "step": 15170 + }, + { + "epoch": 1.8927680798004989, + "grad_norm": 54.05585479736328, + "learning_rate": 1.243341645885287e-05, + "loss": 0.1083, + "step": 15180 + }, + { + "epoch": 1.8940149625935163, + "grad_norm": 0.021726036444306374, + "learning_rate": 1.24284289276808e-05, + "loss": 0.0922, + "step": 15190 + }, + { + "epoch": 1.8952618453865338, + "grad_norm": 0.008410331793129444, + "learning_rate": 1.2423441396508729e-05, + "loss": 0.0066, + "step": 15200 + }, + { + "epoch": 1.8965087281795512, + "grad_norm": 10.102691650390625, + "learning_rate": 1.241845386533666e-05, + "loss": 0.0943, + "step": 15210 + }, + { + "epoch": 1.8977556109725686, + "grad_norm": 
16.93928337097168, + "learning_rate": 1.241346633416459e-05, + "loss": 0.0458, + "step": 15220 + }, + { + "epoch": 1.899002493765586, + "grad_norm": 0.0049775936640799046, + "learning_rate": 1.2408478802992521e-05, + "loss": 0.0003, + "step": 15230 + }, + { + "epoch": 1.9002493765586035, + "grad_norm": 40.914817810058594, + "learning_rate": 1.240349127182045e-05, + "loss": 0.0349, + "step": 15240 + }, + { + "epoch": 1.901496259351621, + "grad_norm": 0.0028333088848739862, + "learning_rate": 1.239850374064838e-05, + "loss": 0.0035, + "step": 15250 + }, + { + "epoch": 1.9027431421446384, + "grad_norm": 0.3201017379760742, + "learning_rate": 1.2393516209476311e-05, + "loss": 0.0065, + "step": 15260 + }, + { + "epoch": 1.9039900249376558, + "grad_norm": 0.007169653195887804, + "learning_rate": 1.2388528678304242e-05, + "loss": 0.0231, + "step": 15270 + }, + { + "epoch": 1.9052369077306732, + "grad_norm": 0.002448942745104432, + "learning_rate": 1.238354114713217e-05, + "loss": 0.0394, + "step": 15280 + }, + { + "epoch": 1.9064837905236907, + "grad_norm": 2.1332430839538574, + "learning_rate": 1.2378553615960101e-05, + "loss": 0.0195, + "step": 15290 + }, + { + "epoch": 1.907730673316708, + "grad_norm": 0.1876264065504074, + "learning_rate": 1.2373566084788032e-05, + "loss": 0.0685, + "step": 15300 + }, + { + "epoch": 1.9089775561097255, + "grad_norm": 0.012347985059022903, + "learning_rate": 1.2368578553615962e-05, + "loss": 0.0733, + "step": 15310 + }, + { + "epoch": 1.9102244389027432, + "grad_norm": 0.007458592299371958, + "learning_rate": 1.236359102244389e-05, + "loss": 0.049, + "step": 15320 + }, + { + "epoch": 1.9114713216957606, + "grad_norm": 0.0171657707542181, + "learning_rate": 1.2358603491271822e-05, + "loss": 0.0267, + "step": 15330 + }, + { + "epoch": 1.912718204488778, + "grad_norm": 0.037479467689991, + "learning_rate": 1.2353615960099753e-05, + "loss": 0.0227, + "step": 15340 + }, + { + "epoch": 1.9139650872817955, + "grad_norm": 0.23706205189228058, 
+ "learning_rate": 1.2348628428927683e-05, + "loss": 0.0251, + "step": 15350 + }, + { + "epoch": 1.915211970074813, + "grad_norm": 0.02107255533337593, + "learning_rate": 1.234364089775561e-05, + "loss": 0.0005, + "step": 15360 + }, + { + "epoch": 1.9164588528678306, + "grad_norm": 0.0038479601498693228, + "learning_rate": 1.2338653366583541e-05, + "loss": 0.091, + "step": 15370 + }, + { + "epoch": 1.917705735660848, + "grad_norm": 0.0024802994448691607, + "learning_rate": 1.2333665835411472e-05, + "loss": 0.0436, + "step": 15380 + }, + { + "epoch": 1.9189526184538654, + "grad_norm": 0.00154173094779253, + "learning_rate": 1.2328678304239404e-05, + "loss": 0.0002, + "step": 15390 + }, + { + "epoch": 1.9201995012468829, + "grad_norm": 0.021371448412537575, + "learning_rate": 1.2323690773067331e-05, + "loss": 0.1347, + "step": 15400 + }, + { + "epoch": 1.9214463840399003, + "grad_norm": 0.809636652469635, + "learning_rate": 1.2318703241895262e-05, + "loss": 0.0564, + "step": 15410 + }, + { + "epoch": 1.9226932668329177, + "grad_norm": 0.01050503272563219, + "learning_rate": 1.2313715710723192e-05, + "loss": 0.0089, + "step": 15420 + }, + { + "epoch": 1.9239401496259352, + "grad_norm": 0.02801765315234661, + "learning_rate": 1.2308728179551123e-05, + "loss": 0.0365, + "step": 15430 + }, + { + "epoch": 1.9251870324189526, + "grad_norm": 0.005082667805254459, + "learning_rate": 1.2303740648379054e-05, + "loss": 0.0059, + "step": 15440 + }, + { + "epoch": 1.92643391521197, + "grad_norm": 0.005822836421430111, + "learning_rate": 1.2298753117206983e-05, + "loss": 0.0035, + "step": 15450 + }, + { + "epoch": 1.9276807980049875, + "grad_norm": 0.0021450971253216267, + "learning_rate": 1.2293765586034913e-05, + "loss": 0.0004, + "step": 15460 + }, + { + "epoch": 1.928927680798005, + "grad_norm": 0.003937265835702419, + "learning_rate": 1.2288778054862844e-05, + "loss": 0.0006, + "step": 15470 + }, + { + "epoch": 1.9301745635910224, + "grad_norm": 0.008908499032258987, + 
"learning_rate": 1.2283790523690774e-05, + "loss": 0.0385, + "step": 15480 + }, + { + "epoch": 1.9314214463840398, + "grad_norm": 0.15880395472049713, + "learning_rate": 1.2278802992518703e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 1.9326683291770572, + "grad_norm": 0.00402474170550704, + "learning_rate": 1.2273815461346634e-05, + "loss": 0.0295, + "step": 15500 + }, + { + "epoch": 1.9339152119700747, + "grad_norm": 0.038588814437389374, + "learning_rate": 1.2268827930174565e-05, + "loss": 0.0339, + "step": 15510 + }, + { + "epoch": 1.9351620947630923, + "grad_norm": 0.004865721333771944, + "learning_rate": 1.2263840399002495e-05, + "loss": 0.0102, + "step": 15520 + }, + { + "epoch": 1.9364089775561097, + "grad_norm": 0.415129154920578, + "learning_rate": 1.2258852867830424e-05, + "loss": 0.1301, + "step": 15530 + }, + { + "epoch": 1.9376558603491272, + "grad_norm": 0.01758623868227005, + "learning_rate": 1.2253865336658355e-05, + "loss": 0.0191, + "step": 15540 + }, + { + "epoch": 1.9389027431421446, + "grad_norm": 19.56462860107422, + "learning_rate": 1.2248877805486285e-05, + "loss": 0.0375, + "step": 15550 + }, + { + "epoch": 1.940149625935162, + "grad_norm": 0.0069420519284904, + "learning_rate": 1.2243890274314216e-05, + "loss": 0.0007, + "step": 15560 + }, + { + "epoch": 1.9413965087281797, + "grad_norm": 0.005758563056588173, + "learning_rate": 1.2238902743142145e-05, + "loss": 0.0156, + "step": 15570 + }, + { + "epoch": 1.9426433915211971, + "grad_norm": 0.014544529840350151, + "learning_rate": 1.2233915211970076e-05, + "loss": 0.0002, + "step": 15580 + }, + { + "epoch": 1.9438902743142146, + "grad_norm": 0.051376599818468094, + "learning_rate": 1.2228927680798006e-05, + "loss": 0.116, + "step": 15590 + }, + { + "epoch": 1.945137157107232, + "grad_norm": 0.005440854001790285, + "learning_rate": 1.2223940149625937e-05, + "loss": 0.0246, + "step": 15600 + }, + { + "epoch": 1.9463840399002494, + "grad_norm": 0.012521665543317795, + 
"learning_rate": 1.2218952618453866e-05, + "loss": 0.0378, + "step": 15610 + }, + { + "epoch": 1.9476309226932669, + "grad_norm": 9.66970443725586, + "learning_rate": 1.2213965087281796e-05, + "loss": 0.0433, + "step": 15620 + }, + { + "epoch": 1.9488778054862843, + "grad_norm": 0.010639001615345478, + "learning_rate": 1.2208977556109727e-05, + "loss": 0.0421, + "step": 15630 + }, + { + "epoch": 1.9501246882793017, + "grad_norm": 12.684861183166504, + "learning_rate": 1.2203990024937658e-05, + "loss": 0.1112, + "step": 15640 + }, + { + "epoch": 1.9513715710723192, + "grad_norm": 0.013296867720782757, + "learning_rate": 1.2199002493765587e-05, + "loss": 0.0455, + "step": 15650 + }, + { + "epoch": 1.9526184538653366, + "grad_norm": 0.018770115450024605, + "learning_rate": 1.2194014962593517e-05, + "loss": 0.0144, + "step": 15660 + }, + { + "epoch": 1.953865336658354, + "grad_norm": 0.017290420830249786, + "learning_rate": 1.2189027431421448e-05, + "loss": 0.0195, + "step": 15670 + }, + { + "epoch": 1.9551122194513715, + "grad_norm": 0.9697034955024719, + "learning_rate": 1.2184039900249378e-05, + "loss": 0.0517, + "step": 15680 + }, + { + "epoch": 1.956359102244389, + "grad_norm": 0.011194895952939987, + "learning_rate": 1.2179052369077309e-05, + "loss": 0.0004, + "step": 15690 + }, + { + "epoch": 1.9576059850374063, + "grad_norm": 23.957162857055664, + "learning_rate": 1.2174563591022446e-05, + "loss": 0.1551, + "step": 15700 + }, + { + "epoch": 1.9588528678304238, + "grad_norm": 0.026693115010857582, + "learning_rate": 1.2169576059850374e-05, + "loss": 0.0011, + "step": 15710 + }, + { + "epoch": 1.9600997506234414, + "grad_norm": 0.3291858434677124, + "learning_rate": 1.2164588528678305e-05, + "loss": 0.0524, + "step": 15720 + }, + { + "epoch": 1.9613466334164589, + "grad_norm": 0.006115868221968412, + "learning_rate": 1.2159600997506236e-05, + "loss": 0.0008, + "step": 15730 + }, + { + "epoch": 1.9625935162094763, + "grad_norm": 0.002324812114238739, + 
"learning_rate": 1.2154613466334166e-05, + "loss": 0.0008, + "step": 15740 + }, + { + "epoch": 1.9638403990024937, + "grad_norm": 0.013870062306523323, + "learning_rate": 1.2149625935162095e-05, + "loss": 0.0449, + "step": 15750 + }, + { + "epoch": 1.9650872817955112, + "grad_norm": 0.02065792866051197, + "learning_rate": 1.2144638403990026e-05, + "loss": 0.101, + "step": 15760 + }, + { + "epoch": 1.9663341645885288, + "grad_norm": 0.06540193408727646, + "learning_rate": 1.2139650872817957e-05, + "loss": 0.0092, + "step": 15770 + }, + { + "epoch": 1.9675810473815463, + "grad_norm": 0.766767144203186, + "learning_rate": 1.2134663341645887e-05, + "loss": 0.0006, + "step": 15780 + }, + { + "epoch": 1.9688279301745637, + "grad_norm": 0.024145884439349174, + "learning_rate": 1.2129675810473816e-05, + "loss": 0.0008, + "step": 15790 + }, + { + "epoch": 1.9700748129675811, + "grad_norm": 0.009867561981081963, + "learning_rate": 1.2124688279301747e-05, + "loss": 0.0454, + "step": 15800 + }, + { + "epoch": 1.9713216957605986, + "grad_norm": 0.01759646274149418, + "learning_rate": 1.2119700748129677e-05, + "loss": 0.0003, + "step": 15810 + }, + { + "epoch": 1.972568578553616, + "grad_norm": 0.018786191940307617, + "learning_rate": 1.2114713216957608e-05, + "loss": 0.0423, + "step": 15820 + }, + { + "epoch": 1.9738154613466334, + "grad_norm": 0.02426152490079403, + "learning_rate": 1.2109725685785537e-05, + "loss": 0.0601, + "step": 15830 + }, + { + "epoch": 1.9750623441396509, + "grad_norm": 0.004218484740704298, + "learning_rate": 1.2104738154613467e-05, + "loss": 0.0002, + "step": 15840 + }, + { + "epoch": 1.9763092269326683, + "grad_norm": 0.012382776476442814, + "learning_rate": 1.2099750623441398e-05, + "loss": 0.0194, + "step": 15850 + }, + { + "epoch": 1.9775561097256857, + "grad_norm": 0.010796369053423405, + "learning_rate": 1.2094763092269329e-05, + "loss": 0.022, + "step": 15860 + }, + { + "epoch": 1.9788029925187032, + "grad_norm": 0.008351016789674759, + 
"learning_rate": 1.208977556109726e-05, + "loss": 0.0394, + "step": 15870 + }, + { + "epoch": 1.9800498753117206, + "grad_norm": 0.0044389450922608376, + "learning_rate": 1.2084788029925188e-05, + "loss": 0.045, + "step": 15880 + }, + { + "epoch": 1.981296758104738, + "grad_norm": 0.03672898933291435, + "learning_rate": 1.2079800498753119e-05, + "loss": 0.0536, + "step": 15890 + }, + { + "epoch": 1.9825436408977555, + "grad_norm": 0.05319588631391525, + "learning_rate": 1.207481296758105e-05, + "loss": 0.0828, + "step": 15900 + }, + { + "epoch": 1.983790523690773, + "grad_norm": 5.178584098815918, + "learning_rate": 1.206982543640898e-05, + "loss": 0.1052, + "step": 15910 + }, + { + "epoch": 1.9850374064837906, + "grad_norm": 0.005634423345327377, + "learning_rate": 1.2064837905236907e-05, + "loss": 0.0469, + "step": 15920 + }, + { + "epoch": 1.986284289276808, + "grad_norm": 0.03831524774432182, + "learning_rate": 1.2059850374064838e-05, + "loss": 0.0012, + "step": 15930 + }, + { + "epoch": 1.9875311720698254, + "grad_norm": 0.006244161166250706, + "learning_rate": 1.205486284289277e-05, + "loss": 0.0126, + "step": 15940 + }, + { + "epoch": 1.9887780548628429, + "grad_norm": 7.760480880737305, + "learning_rate": 1.2049875311720701e-05, + "loss": 0.0419, + "step": 15950 + }, + { + "epoch": 1.9900249376558603, + "grad_norm": 5.048219203948975, + "learning_rate": 1.2044887780548628e-05, + "loss": 0.0377, + "step": 15960 + }, + { + "epoch": 1.991271820448878, + "grad_norm": 0.0169609934091568, + "learning_rate": 1.2039900249376559e-05, + "loss": 0.0487, + "step": 15970 + }, + { + "epoch": 1.9925187032418954, + "grad_norm": 0.020963355898857117, + "learning_rate": 1.203491271820449e-05, + "loss": 0.0217, + "step": 15980 + }, + { + "epoch": 1.9937655860349128, + "grad_norm": 0.008009551092982292, + "learning_rate": 1.202992518703242e-05, + "loss": 0.0015, + "step": 15990 + }, + { + "epoch": 1.9950124688279303, + "grad_norm": 0.01416187733411789, + "learning_rate": 
1.2024937655860349e-05, + "loss": 0.0373, + "step": 16000 + }, + { + "epoch": 1.9962593516209477, + "grad_norm": 0.038335274904966354, + "learning_rate": 1.201995012468828e-05, + "loss": 0.0381, + "step": 16010 + }, + { + "epoch": 1.9975062344139651, + "grad_norm": 0.04764119163155556, + "learning_rate": 1.201496259351621e-05, + "loss": 0.0308, + "step": 16020 + }, + { + "epoch": 1.9987531172069826, + "grad_norm": 9.592329978942871, + "learning_rate": 1.200997506234414e-05, + "loss": 0.0543, + "step": 16030 + }, + { + "epoch": 2.0, + "grad_norm": 0.008706348016858101, + "learning_rate": 1.200498753117207e-05, + "loss": 0.0003, + "step": 16040 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.9936405012781345, + "eval_loss": 0.03504941612482071, + "eval_runtime": 17.7412, + "eval_samples_per_second": 904.053, + "eval_steps_per_second": 56.535, + "step": 16040 + }, + { + "epoch": 2.0012468827930174, + "grad_norm": 0.006675767712295055, + "learning_rate": 1.2e-05, + "loss": 0.0004, + "step": 16050 + }, + { + "epoch": 2.002493765586035, + "grad_norm": 0.0029956961516290903, + "learning_rate": 1.1995012468827931e-05, + "loss": 0.0002, + "step": 16060 + }, + { + "epoch": 2.0037406483790523, + "grad_norm": 0.024461830034852028, + "learning_rate": 1.1990024937655862e-05, + "loss": 0.0002, + "step": 16070 + }, + { + "epoch": 2.0049875311720697, + "grad_norm": 0.024466920644044876, + "learning_rate": 1.198503740648379e-05, + "loss": 0.0002, + "step": 16080 + }, + { + "epoch": 2.006234413965087, + "grad_norm": 0.004781090654432774, + "learning_rate": 1.1980049875311721e-05, + "loss": 0.0254, + "step": 16090 + }, + { + "epoch": 2.0074812967581046, + "grad_norm": 55.558677673339844, + "learning_rate": 1.1975062344139652e-05, + "loss": 0.0569, + "step": 16100 + }, + { + "epoch": 2.008728179551122, + "grad_norm": 0.007196883670985699, + "learning_rate": 1.1970074812967582e-05, + "loss": 0.0002, + "step": 16110 + }, + { + "epoch": 2.0099750623441395, + "grad_norm": 
0.002488025464117527, + "learning_rate": 1.1965087281795513e-05, + "loss": 0.0018, + "step": 16120 + }, + { + "epoch": 2.011221945137157, + "grad_norm": 0.0017350780544802547, + "learning_rate": 1.1960099750623442e-05, + "loss": 0.0004, + "step": 16130 + }, + { + "epoch": 2.0124688279301743, + "grad_norm": 15.045056343078613, + "learning_rate": 1.1955112219451372e-05, + "loss": 0.0436, + "step": 16140 + }, + { + "epoch": 2.013715710723192, + "grad_norm": 28.597244262695312, + "learning_rate": 1.1950124688279303e-05, + "loss": 0.0061, + "step": 16150 + }, + { + "epoch": 2.0149625935162097, + "grad_norm": 4.736278057098389, + "learning_rate": 1.1945137157107234e-05, + "loss": 0.0434, + "step": 16160 + }, + { + "epoch": 2.016209476309227, + "grad_norm": 0.0017900534439831972, + "learning_rate": 1.1940149625935163e-05, + "loss": 0.0428, + "step": 16170 + }, + { + "epoch": 2.0174563591022445, + "grad_norm": 0.004499204456806183, + "learning_rate": 1.1935162094763093e-05, + "loss": 0.0003, + "step": 16180 + }, + { + "epoch": 2.018703241895262, + "grad_norm": 0.002972670830786228, + "learning_rate": 1.1930174563591024e-05, + "loss": 0.0634, + "step": 16190 + }, + { + "epoch": 2.0199501246882794, + "grad_norm": 0.011449567042291164, + "learning_rate": 1.1925187032418955e-05, + "loss": 0.0002, + "step": 16200 + }, + { + "epoch": 2.021197007481297, + "grad_norm": 0.020797425881028175, + "learning_rate": 1.1920199501246883e-05, + "loss": 0.0002, + "step": 16210 + }, + { + "epoch": 2.0224438902743143, + "grad_norm": 0.4915730953216553, + "learning_rate": 1.1915211970074814e-05, + "loss": 0.0731, + "step": 16220 + }, + { + "epoch": 2.0236907730673317, + "grad_norm": 0.9496850371360779, + "learning_rate": 1.1910224438902745e-05, + "loss": 0.0157, + "step": 16230 + }, + { + "epoch": 2.024937655860349, + "grad_norm": 0.004698851145803928, + "learning_rate": 1.1905236907730675e-05, + "loss": 0.0004, + "step": 16240 + }, + { + "epoch": 2.0261845386533666, + "grad_norm": 
0.19270732998847961, + "learning_rate": 1.1900249376558604e-05, + "loss": 0.0355, + "step": 16250 + }, + { + "epoch": 2.027431421446384, + "grad_norm": 0.017890894785523415, + "learning_rate": 1.1895261845386535e-05, + "loss": 0.0002, + "step": 16260 + }, + { + "epoch": 2.0286783042394014, + "grad_norm": 0.018559958785772324, + "learning_rate": 1.1890274314214465e-05, + "loss": 0.0003, + "step": 16270 + }, + { + "epoch": 2.029925187032419, + "grad_norm": 0.019640089944005013, + "learning_rate": 1.1885286783042396e-05, + "loss": 0.0003, + "step": 16280 + }, + { + "epoch": 2.0311720698254363, + "grad_norm": 0.04718825966119766, + "learning_rate": 1.1880299251870325e-05, + "loss": 0.0003, + "step": 16290 + }, + { + "epoch": 2.0324189526184537, + "grad_norm": 0.001198633573949337, + "learning_rate": 1.1875311720698256e-05, + "loss": 0.0006, + "step": 16300 + }, + { + "epoch": 2.033665835411471, + "grad_norm": 0.1309710144996643, + "learning_rate": 1.1870324189526186e-05, + "loss": 0.1054, + "step": 16310 + }, + { + "epoch": 2.0349127182044886, + "grad_norm": 0.008803281933069229, + "learning_rate": 1.1865336658354117e-05, + "loss": 0.0249, + "step": 16320 + }, + { + "epoch": 2.036159600997506, + "grad_norm": 0.45416969060897827, + "learning_rate": 1.1860349127182044e-05, + "loss": 0.0191, + "step": 16330 + }, + { + "epoch": 2.037406483790524, + "grad_norm": 0.022696293890476227, + "learning_rate": 1.1855361596009975e-05, + "loss": 0.0002, + "step": 16340 + }, + { + "epoch": 2.0386533665835413, + "grad_norm": 0.018615229055285454, + "learning_rate": 1.1850374064837907e-05, + "loss": 0.0252, + "step": 16350 + }, + { + "epoch": 2.039900249376559, + "grad_norm": 0.004977565258741379, + "learning_rate": 1.1845386533665838e-05, + "loss": 0.0043, + "step": 16360 + }, + { + "epoch": 2.041147132169576, + "grad_norm": 0.024628182873129845, + "learning_rate": 1.1840399002493768e-05, + "loss": 0.0001, + "step": 16370 + }, + { + "epoch": 2.0423940149625937, + "grad_norm": 
0.0027379884850233793, + "learning_rate": 1.1835411471321695e-05, + "loss": 0.0002, + "step": 16380 + }, + { + "epoch": 2.043640897755611, + "grad_norm": 0.0037043734919279814, + "learning_rate": 1.1830423940149626e-05, + "loss": 0.0092, + "step": 16390 + }, + { + "epoch": 2.0448877805486285, + "grad_norm": 0.009499566629529, + "learning_rate": 1.1825436408977557e-05, + "loss": 0.0001, + "step": 16400 + }, + { + "epoch": 2.046134663341646, + "grad_norm": 0.0012607629178091884, + "learning_rate": 1.1820448877805489e-05, + "loss": 0.0664, + "step": 16410 + }, + { + "epoch": 2.0473815461346634, + "grad_norm": 0.0217350572347641, + "learning_rate": 1.1815461346633416e-05, + "loss": 0.0002, + "step": 16420 + }, + { + "epoch": 2.048628428927681, + "grad_norm": 0.14604200422763824, + "learning_rate": 1.1810473815461347e-05, + "loss": 0.0083, + "step": 16430 + }, + { + "epoch": 2.0498753117206983, + "grad_norm": 9.621197700500488, + "learning_rate": 1.1805486284289278e-05, + "loss": 0.0328, + "step": 16440 + }, + { + "epoch": 2.0511221945137157, + "grad_norm": 3.1825685501098633, + "learning_rate": 1.1800498753117208e-05, + "loss": 0.0006, + "step": 16450 + }, + { + "epoch": 2.052369077306733, + "grad_norm": 0.002401788020506501, + "learning_rate": 1.1795511221945137e-05, + "loss": 0.0001, + "step": 16460 + }, + { + "epoch": 2.0536159600997506, + "grad_norm": 0.039967939257621765, + "learning_rate": 1.1790523690773068e-05, + "loss": 0.0001, + "step": 16470 + }, + { + "epoch": 2.054862842892768, + "grad_norm": 0.014854871667921543, + "learning_rate": 1.1785536159600998e-05, + "loss": 0.0103, + "step": 16480 + }, + { + "epoch": 2.0561097256857854, + "grad_norm": 0.005126704927533865, + "learning_rate": 1.1780548628428929e-05, + "loss": 0.0003, + "step": 16490 + }, + { + "epoch": 2.057356608478803, + "grad_norm": 0.0005838332581333816, + "learning_rate": 1.1775561097256858e-05, + "loss": 0.0002, + "step": 16500 + }, + { + "epoch": 2.0586034912718203, + "grad_norm": 
0.004360176622867584, + "learning_rate": 1.1770573566084788e-05, + "loss": 0.0024, + "step": 16510 + }, + { + "epoch": 2.0598503740648377, + "grad_norm": 0.0009690506267361343, + "learning_rate": 1.1765586034912719e-05, + "loss": 0.0628, + "step": 16520 + }, + { + "epoch": 2.061097256857855, + "grad_norm": 0.005594516638666391, + "learning_rate": 1.176059850374065e-05, + "loss": 0.0001, + "step": 16530 + }, + { + "epoch": 2.0623441396508726, + "grad_norm": 0.0011889605084434152, + "learning_rate": 1.1755610972568579e-05, + "loss": 0.0207, + "step": 16540 + }, + { + "epoch": 2.0635910224438905, + "grad_norm": 0.0011221276363357902, + "learning_rate": 1.175062344139651e-05, + "loss": 0.018, + "step": 16550 + }, + { + "epoch": 2.064837905236908, + "grad_norm": 7.532267093658447, + "learning_rate": 1.174563591022444e-05, + "loss": 0.0719, + "step": 16560 + }, + { + "epoch": 2.0660847880299253, + "grad_norm": 0.0037209379952400923, + "learning_rate": 1.174064837905237e-05, + "loss": 0.0422, + "step": 16570 + }, + { + "epoch": 2.067331670822943, + "grad_norm": 3.0293376445770264, + "learning_rate": 1.17356608478803e-05, + "loss": 0.0082, + "step": 16580 + }, + { + "epoch": 2.06857855361596, + "grad_norm": 8.686629295349121, + "learning_rate": 1.173067331670823e-05, + "loss": 0.0012, + "step": 16590 + }, + { + "epoch": 2.0698254364089776, + "grad_norm": 0.02134956605732441, + "learning_rate": 1.172568578553616e-05, + "loss": 0.0003, + "step": 16600 + }, + { + "epoch": 2.071072319201995, + "grad_norm": 0.004345687106251717, + "learning_rate": 1.1720698254364091e-05, + "loss": 0.0003, + "step": 16610 + }, + { + "epoch": 2.0723192019950125, + "grad_norm": 0.004607527982443571, + "learning_rate": 1.1715710723192022e-05, + "loss": 0.0553, + "step": 16620 + }, + { + "epoch": 2.07356608478803, + "grad_norm": 0.0037352608051151037, + "learning_rate": 1.171072319201995e-05, + "loss": 0.0442, + "step": 16630 + }, + { + "epoch": 2.0748129675810474, + "grad_norm": 2.7062952518463135, 
+ "learning_rate": 1.1705735660847881e-05, + "loss": 0.0388, + "step": 16640 + }, + { + "epoch": 2.076059850374065, + "grad_norm": 0.2463214248418808, + "learning_rate": 1.1700748129675812e-05, + "loss": 0.0012, + "step": 16650 + }, + { + "epoch": 2.0773067331670823, + "grad_norm": 0.0029989476315677166, + "learning_rate": 1.1695760598503743e-05, + "loss": 0.0003, + "step": 16660 + }, + { + "epoch": 2.0785536159600997, + "grad_norm": 0.01061512902379036, + "learning_rate": 1.1690773067331672e-05, + "loss": 0.0006, + "step": 16670 + }, + { + "epoch": 2.079800498753117, + "grad_norm": 0.0029966856818646193, + "learning_rate": 1.1685785536159602e-05, + "loss": 0.0022, + "step": 16680 + }, + { + "epoch": 2.0810473815461346, + "grad_norm": 0.0038471422158181667, + "learning_rate": 1.1680798004987533e-05, + "loss": 0.0263, + "step": 16690 + }, + { + "epoch": 2.082294264339152, + "grad_norm": 0.013609787449240685, + "learning_rate": 1.1675810473815463e-05, + "loss": 0.0007, + "step": 16700 + }, + { + "epoch": 2.0835411471321694, + "grad_norm": 0.0019859191961586475, + "learning_rate": 1.1670822942643392e-05, + "loss": 0.0372, + "step": 16710 + }, + { + "epoch": 2.084788029925187, + "grad_norm": 0.0010857946472242475, + "learning_rate": 1.1665835411471323e-05, + "loss": 0.028, + "step": 16720 + }, + { + "epoch": 2.0860349127182043, + "grad_norm": 3.9083871841430664, + "learning_rate": 1.1660847880299254e-05, + "loss": 0.0014, + "step": 16730 + }, + { + "epoch": 2.087281795511222, + "grad_norm": 51.425758361816406, + "learning_rate": 1.1655860349127184e-05, + "loss": 0.0603, + "step": 16740 + }, + { + "epoch": 2.0885286783042396, + "grad_norm": 0.0008689459064044058, + "learning_rate": 1.1650872817955111e-05, + "loss": 0.0034, + "step": 16750 + }, + { + "epoch": 2.089775561097257, + "grad_norm": 0.0023854838218539953, + "learning_rate": 1.1645885286783044e-05, + "loss": 0.0001, + "step": 16760 + }, + { + "epoch": 2.0910224438902745, + "grad_norm": 0.010867654345929623, + 
"learning_rate": 1.1640897755610974e-05, + "loss": 0.0562, + "step": 16770 + }, + { + "epoch": 2.092269326683292, + "grad_norm": 0.002745213219895959, + "learning_rate": 1.1635910224438905e-05, + "loss": 0.0013, + "step": 16780 + }, + { + "epoch": 2.0935162094763093, + "grad_norm": 0.004098067991435528, + "learning_rate": 1.1630922693266832e-05, + "loss": 0.0698, + "step": 16790 + }, + { + "epoch": 2.0947630922693268, + "grad_norm": 0.0039171320386230946, + "learning_rate": 1.1625935162094763e-05, + "loss": 0.0032, + "step": 16800 + }, + { + "epoch": 2.096009975062344, + "grad_norm": 1.781934380531311, + "learning_rate": 1.1620947630922693e-05, + "loss": 0.0178, + "step": 16810 + }, + { + "epoch": 2.0972568578553616, + "grad_norm": 0.0031692027114331722, + "learning_rate": 1.1615960099750626e-05, + "loss": 0.0001, + "step": 16820 + }, + { + "epoch": 2.098503740648379, + "grad_norm": 0.07670797407627106, + "learning_rate": 1.1610972568578553e-05, + "loss": 0.0285, + "step": 16830 + }, + { + "epoch": 2.0997506234413965, + "grad_norm": 0.03737671673297882, + "learning_rate": 1.1605985037406484e-05, + "loss": 0.0001, + "step": 16840 + }, + { + "epoch": 2.100997506234414, + "grad_norm": 0.008449270389974117, + "learning_rate": 1.1600997506234414e-05, + "loss": 0.0002, + "step": 16850 + }, + { + "epoch": 2.1022443890274314, + "grad_norm": 0.004309145733714104, + "learning_rate": 1.1596009975062345e-05, + "loss": 0.0519, + "step": 16860 + }, + { + "epoch": 2.103491271820449, + "grad_norm": 0.053323499858379364, + "learning_rate": 1.1591022443890276e-05, + "loss": 0.0136, + "step": 16870 + }, + { + "epoch": 2.1047381546134662, + "grad_norm": 0.003335277084261179, + "learning_rate": 1.1586034912718204e-05, + "loss": 0.0001, + "step": 16880 + }, + { + "epoch": 2.1059850374064837, + "grad_norm": 0.008609478361904621, + "learning_rate": 1.1581047381546135e-05, + "loss": 0.0623, + "step": 16890 + }, + { + "epoch": 2.107231920199501, + "grad_norm": 0.003915522713214159, + 
"learning_rate": 1.1576059850374066e-05, + "loss": 0.0002, + "step": 16900 + }, + { + "epoch": 2.1084788029925186, + "grad_norm": 0.005430086050182581, + "learning_rate": 1.1571072319201996e-05, + "loss": 0.0002, + "step": 16910 + }, + { + "epoch": 2.109725685785536, + "grad_norm": 0.01652948372066021, + "learning_rate": 1.1566084788029925e-05, + "loss": 0.0227, + "step": 16920 + }, + { + "epoch": 2.1109725685785534, + "grad_norm": 0.015955012291669846, + "learning_rate": 1.1561097256857856e-05, + "loss": 0.0037, + "step": 16930 + }, + { + "epoch": 2.112219451371571, + "grad_norm": 0.04435069486498833, + "learning_rate": 1.1556109725685786e-05, + "loss": 0.0716, + "step": 16940 + }, + { + "epoch": 2.1134663341645887, + "grad_norm": 0.01482419017702341, + "learning_rate": 1.1551122194513717e-05, + "loss": 0.04, + "step": 16950 + }, + { + "epoch": 2.114713216957606, + "grad_norm": 0.06014908477663994, + "learning_rate": 1.1546134663341646e-05, + "loss": 0.0003, + "step": 16960 + }, + { + "epoch": 2.1159600997506236, + "grad_norm": 0.07580575346946716, + "learning_rate": 1.1541147132169577e-05, + "loss": 0.0386, + "step": 16970 + }, + { + "epoch": 2.117206982543641, + "grad_norm": 7.51279354095459, + "learning_rate": 1.1536159600997507e-05, + "loss": 0.0024, + "step": 16980 + }, + { + "epoch": 2.1184538653366585, + "grad_norm": 2.0569701194763184, + "learning_rate": 1.1531172069825438e-05, + "loss": 0.0009, + "step": 16990 + }, + { + "epoch": 2.119700748129676, + "grad_norm": 27.27562141418457, + "learning_rate": 1.1526184538653367e-05, + "loss": 0.0072, + "step": 17000 + }, + { + "epoch": 2.1209476309226933, + "grad_norm": 0.011304572224617004, + "learning_rate": 1.1521197007481297e-05, + "loss": 0.0005, + "step": 17010 + }, + { + "epoch": 2.1221945137157108, + "grad_norm": 15.838469505310059, + "learning_rate": 1.1516209476309228e-05, + "loss": 0.0097, + "step": 17020 + }, + { + "epoch": 2.123441396508728, + "grad_norm": 0.011107388883829117, + "learning_rate": 
1.1511221945137159e-05, + "loss": 0.0005, + "step": 17030 + }, + { + "epoch": 2.1246882793017456, + "grad_norm": 0.06625682860612869, + "learning_rate": 1.1506234413965088e-05, + "loss": 0.0605, + "step": 17040 + }, + { + "epoch": 2.125935162094763, + "grad_norm": 0.0023085640277713537, + "learning_rate": 1.1501246882793018e-05, + "loss": 0.0268, + "step": 17050 + }, + { + "epoch": 2.1271820448877805, + "grad_norm": 5.297941207885742, + "learning_rate": 1.1496259351620949e-05, + "loss": 0.0378, + "step": 17060 + }, + { + "epoch": 2.128428927680798, + "grad_norm": 0.020840341225266457, + "learning_rate": 1.149127182044888e-05, + "loss": 0.0342, + "step": 17070 + }, + { + "epoch": 2.1296758104738154, + "grad_norm": 0.0028252899646759033, + "learning_rate": 1.1486284289276808e-05, + "loss": 0.0006, + "step": 17080 + }, + { + "epoch": 2.130922693266833, + "grad_norm": 0.0014341454952955246, + "learning_rate": 1.1481296758104739e-05, + "loss": 0.011, + "step": 17090 + }, + { + "epoch": 2.1321695760598502, + "grad_norm": 0.0027584710624068975, + "learning_rate": 1.147630922693267e-05, + "loss": 0.0848, + "step": 17100 + }, + { + "epoch": 2.1334164588528677, + "grad_norm": 81.5111083984375, + "learning_rate": 1.14713216957606e-05, + "loss": 0.0485, + "step": 17110 + }, + { + "epoch": 2.134663341645885, + "grad_norm": 0.558646023273468, + "learning_rate": 1.146633416458853e-05, + "loss": 0.0664, + "step": 17120 + }, + { + "epoch": 2.1359102244389025, + "grad_norm": 0.04680400714278221, + "learning_rate": 1.146134663341646e-05, + "loss": 0.0025, + "step": 17130 + }, + { + "epoch": 2.1371571072319204, + "grad_norm": 0.2647761404514313, + "learning_rate": 1.145635910224439e-05, + "loss": 0.008, + "step": 17140 + }, + { + "epoch": 2.138403990024938, + "grad_norm": 0.0029155698139220476, + "learning_rate": 1.1451371571072321e-05, + "loss": 0.0001, + "step": 17150 + }, + { + "epoch": 2.1396508728179553, + "grad_norm": 13.286086082458496, + "learning_rate": 
1.1446384039900252e-05, + "loss": 0.0207, + "step": 17160 + }, + { + "epoch": 2.1408977556109727, + "grad_norm": 0.7246861457824707, + "learning_rate": 1.144139650872818e-05, + "loss": 0.002, + "step": 17170 + }, + { + "epoch": 2.14214463840399, + "grad_norm": 0.1160615086555481, + "learning_rate": 1.1436408977556111e-05, + "loss": 0.0561, + "step": 17180 + }, + { + "epoch": 2.1433915211970076, + "grad_norm": 0.0032474566251039505, + "learning_rate": 1.1431421446384042e-05, + "loss": 0.0001, + "step": 17190 + }, + { + "epoch": 2.144638403990025, + "grad_norm": 0.009272860363125801, + "learning_rate": 1.1426433915211972e-05, + "loss": 0.0001, + "step": 17200 + }, + { + "epoch": 2.1458852867830425, + "grad_norm": 0.012639547698199749, + "learning_rate": 1.14214463840399e-05, + "loss": 0.0001, + "step": 17210 + }, + { + "epoch": 2.14713216957606, + "grad_norm": 0.0023768385872244835, + "learning_rate": 1.141645885286783e-05, + "loss": 0.0006, + "step": 17220 + }, + { + "epoch": 2.1483790523690773, + "grad_norm": 0.0028579439967870712, + "learning_rate": 1.1411471321695763e-05, + "loss": 0.0001, + "step": 17230 + }, + { + "epoch": 2.1496259351620948, + "grad_norm": 0.015758490189909935, + "learning_rate": 1.1406483790523693e-05, + "loss": 0.0247, + "step": 17240 + }, + { + "epoch": 2.150872817955112, + "grad_norm": 26.11306381225586, + "learning_rate": 1.140149625935162e-05, + "loss": 0.0048, + "step": 17250 + }, + { + "epoch": 2.1521197007481296, + "grad_norm": 0.002438984578475356, + "learning_rate": 1.1396508728179551e-05, + "loss": 0.0003, + "step": 17260 + }, + { + "epoch": 2.153366583541147, + "grad_norm": 0.00045664477511309087, + "learning_rate": 1.1391521197007482e-05, + "loss": 0.0002, + "step": 17270 + }, + { + "epoch": 2.1546134663341645, + "grad_norm": 8.155107498168945, + "learning_rate": 1.1386533665835412e-05, + "loss": 0.0429, + "step": 17280 + }, + { + "epoch": 2.155860349127182, + "grad_norm": 0.0036463397555053234, + "learning_rate": 
1.1381546134663341e-05, + "loss": 0.0088, + "step": 17290 + }, + { + "epoch": 2.1571072319201994, + "grad_norm": 0.0040887449868023396, + "learning_rate": 1.1376558603491272e-05, + "loss": 0.0276, + "step": 17300 + }, + { + "epoch": 2.158354114713217, + "grad_norm": 0.0022863755002617836, + "learning_rate": 1.1371571072319202e-05, + "loss": 0.0001, + "step": 17310 + }, + { + "epoch": 2.1596009975062342, + "grad_norm": 0.0571252815425396, + "learning_rate": 1.1366583541147133e-05, + "loss": 0.0597, + "step": 17320 + }, + { + "epoch": 2.1608478802992517, + "grad_norm": 0.0020809960551559925, + "learning_rate": 1.1361596009975062e-05, + "loss": 0.0001, + "step": 17330 + }, + { + "epoch": 2.162094763092269, + "grad_norm": 0.003306854283437133, + "learning_rate": 1.1356608478802993e-05, + "loss": 0.0356, + "step": 17340 + }, + { + "epoch": 2.163341645885287, + "grad_norm": 0.0009579791803844273, + "learning_rate": 1.1351620947630923e-05, + "loss": 0.0359, + "step": 17350 + }, + { + "epoch": 2.1645885286783044, + "grad_norm": 0.012262257747352123, + "learning_rate": 1.1346633416458854e-05, + "loss": 0.0001, + "step": 17360 + }, + { + "epoch": 2.165835411471322, + "grad_norm": 0.00868857093155384, + "learning_rate": 1.1341645885286784e-05, + "loss": 0.0001, + "step": 17370 + }, + { + "epoch": 2.1670822942643393, + "grad_norm": 0.0008276562439277768, + "learning_rate": 1.1336658354114713e-05, + "loss": 0.0081, + "step": 17380 + }, + { + "epoch": 2.1683291770573567, + "grad_norm": 0.0012495616683736444, + "learning_rate": 1.1331670822942644e-05, + "loss": 0.0001, + "step": 17390 + }, + { + "epoch": 2.169576059850374, + "grad_norm": 0.007158514112234116, + "learning_rate": 1.1326683291770575e-05, + "loss": 0.0001, + "step": 17400 + }, + { + "epoch": 2.1708229426433916, + "grad_norm": 0.015533825382590294, + "learning_rate": 1.1321695760598505e-05, + "loss": 0.0979, + "step": 17410 + }, + { + "epoch": 2.172069825436409, + "grad_norm": 0.0036152431275695562, + "learning_rate": 
1.1316708229426434e-05, + "loss": 0.0502, + "step": 17420 + }, + { + "epoch": 2.1733167082294265, + "grad_norm": 0.05553935095667839, + "learning_rate": 1.1311720698254365e-05, + "loss": 0.0002, + "step": 17430 + }, + { + "epoch": 2.174563591022444, + "grad_norm": 0.0016775907715782523, + "learning_rate": 1.1306733167082295e-05, + "loss": 0.0262, + "step": 17440 + }, + { + "epoch": 2.1758104738154613, + "grad_norm": 0.03827022761106491, + "learning_rate": 1.1301745635910226e-05, + "loss": 0.0414, + "step": 17450 + }, + { + "epoch": 2.1770573566084788, + "grad_norm": 0.0065053971484303474, + "learning_rate": 1.1296758104738155e-05, + "loss": 0.1002, + "step": 17460 + }, + { + "epoch": 2.178304239401496, + "grad_norm": 0.001964627066627145, + "learning_rate": 1.1291770573566086e-05, + "loss": 0.0297, + "step": 17470 + }, + { + "epoch": 2.1795511221945136, + "grad_norm": 0.0028284601867198944, + "learning_rate": 1.1286783042394016e-05, + "loss": 0.0001, + "step": 17480 + }, + { + "epoch": 2.180798004987531, + "grad_norm": 0.009003949351608753, + "learning_rate": 1.1281795511221947e-05, + "loss": 0.0197, + "step": 17490 + }, + { + "epoch": 2.1820448877805485, + "grad_norm": 0.007605138700455427, + "learning_rate": 1.1276807980049876e-05, + "loss": 0.0002, + "step": 17500 + }, + { + "epoch": 2.183291770573566, + "grad_norm": 0.008635989390313625, + "learning_rate": 1.1271820448877806e-05, + "loss": 0.0888, + "step": 17510 + }, + { + "epoch": 2.1845386533665834, + "grad_norm": 0.05233432725071907, + "learning_rate": 1.1266832917705737e-05, + "loss": 0.0114, + "step": 17520 + }, + { + "epoch": 2.185785536159601, + "grad_norm": 0.0250040665268898, + "learning_rate": 1.1261845386533668e-05, + "loss": 0.0003, + "step": 17530 + }, + { + "epoch": 2.1870324189526187, + "grad_norm": 0.0034498500172048807, + "learning_rate": 1.1256857855361597e-05, + "loss": 0.0484, + "step": 17540 + }, + { + "epoch": 2.188279301745636, + "grad_norm": 0.0024719752836972475, + "learning_rate": 
1.1251870324189527e-05, + "loss": 0.0359, + "step": 17550 + }, + { + "epoch": 2.1895261845386536, + "grad_norm": 0.0028829684015363455, + "learning_rate": 1.1246882793017458e-05, + "loss": 0.0002, + "step": 17560 + }, + { + "epoch": 2.190773067331671, + "grad_norm": 0.0012650667922571301, + "learning_rate": 1.1241895261845388e-05, + "loss": 0.0322, + "step": 17570 + }, + { + "epoch": 2.1920199501246884, + "grad_norm": 0.01969640515744686, + "learning_rate": 1.1236907730673317e-05, + "loss": 0.0165, + "step": 17580 + }, + { + "epoch": 2.193266832917706, + "grad_norm": 49.82617950439453, + "learning_rate": 1.1231920199501248e-05, + "loss": 0.0073, + "step": 17590 + }, + { + "epoch": 2.1945137157107233, + "grad_norm": 39.034271240234375, + "learning_rate": 1.1226932668329179e-05, + "loss": 0.026, + "step": 17600 + }, + { + "epoch": 2.1957605985037407, + "grad_norm": 26.204391479492188, + "learning_rate": 1.122194513715711e-05, + "loss": 0.0747, + "step": 17610 + }, + { + "epoch": 2.197007481296758, + "grad_norm": 0.0015059361467137933, + "learning_rate": 1.121695760598504e-05, + "loss": 0.0002, + "step": 17620 + }, + { + "epoch": 2.1982543640897756, + "grad_norm": 0.016991477459669113, + "learning_rate": 1.1211970074812967e-05, + "loss": 0.0005, + "step": 17630 + }, + { + "epoch": 2.199501246882793, + "grad_norm": 0.006192653905600309, + "learning_rate": 1.12069825436409e-05, + "loss": 0.0422, + "step": 17640 + }, + { + "epoch": 2.2007481296758105, + "grad_norm": 0.014278242364525795, + "learning_rate": 1.120199501246883e-05, + "loss": 0.0144, + "step": 17650 + }, + { + "epoch": 2.201995012468828, + "grad_norm": 1.6394509077072144, + "learning_rate": 1.119700748129676e-05, + "loss": 0.0006, + "step": 17660 + }, + { + "epoch": 2.2032418952618453, + "grad_norm": 0.09551705420017242, + "learning_rate": 1.1192019950124688e-05, + "loss": 0.0002, + "step": 17670 + }, + { + "epoch": 2.2044887780548628, + "grad_norm": 0.0012173291761428118, + "learning_rate": 
1.1187032418952618e-05, + "loss": 0.001, + "step": 17680 + }, + { + "epoch": 2.20573566084788, + "grad_norm": 0.004221228417009115, + "learning_rate": 1.1182044887780549e-05, + "loss": 0.0003, + "step": 17690 + }, + { + "epoch": 2.2069825436408976, + "grad_norm": 0.004237989895045757, + "learning_rate": 1.1177057356608481e-05, + "loss": 0.0001, + "step": 17700 + }, + { + "epoch": 2.208229426433915, + "grad_norm": 0.0009970074752345681, + "learning_rate": 1.1172069825436409e-05, + "loss": 0.0438, + "step": 17710 + }, + { + "epoch": 2.2094763092269325, + "grad_norm": 0.037249162793159485, + "learning_rate": 1.116708229426434e-05, + "loss": 0.0002, + "step": 17720 + }, + { + "epoch": 2.21072319201995, + "grad_norm": 0.0009120830800384283, + "learning_rate": 1.116209476309227e-05, + "loss": 0.0542, + "step": 17730 + }, + { + "epoch": 2.2119700748129674, + "grad_norm": 0.011899283155798912, + "learning_rate": 1.11571072319202e-05, + "loss": 0.0282, + "step": 17740 + }, + { + "epoch": 2.213216957605985, + "grad_norm": 0.0017361732898280025, + "learning_rate": 1.115211970074813e-05, + "loss": 0.0087, + "step": 17750 + }, + { + "epoch": 2.2144638403990027, + "grad_norm": 0.0160725899040699, + "learning_rate": 1.114713216957606e-05, + "loss": 0.0008, + "step": 17760 + }, + { + "epoch": 2.21571072319202, + "grad_norm": 0.029333338141441345, + "learning_rate": 1.114214463840399e-05, + "loss": 0.0059, + "step": 17770 + }, + { + "epoch": 2.2169576059850375, + "grad_norm": 21.30148696899414, + "learning_rate": 1.1137157107231921e-05, + "loss": 0.0894, + "step": 17780 + }, + { + "epoch": 2.218204488778055, + "grad_norm": 0.014875189401209354, + "learning_rate": 1.113216957605985e-05, + "loss": 0.0252, + "step": 17790 + }, + { + "epoch": 2.2194513715710724, + "grad_norm": 0.0027588570956140757, + "learning_rate": 1.112718204488778e-05, + "loss": 0.0001, + "step": 17800 + }, + { + "epoch": 2.22069825436409, + "grad_norm": 0.0018637663451954722, + "learning_rate": 
1.1122194513715711e-05, + "loss": 0.0002, + "step": 17810 + }, + { + "epoch": 2.2219451371571073, + "grad_norm": 0.0022036132868379354, + "learning_rate": 1.1117206982543642e-05, + "loss": 0.0685, + "step": 17820 + }, + { + "epoch": 2.2231920199501247, + "grad_norm": 0.43087127804756165, + "learning_rate": 1.1112219451371571e-05, + "loss": 0.001, + "step": 17830 + }, + { + "epoch": 2.224438902743142, + "grad_norm": 0.0076804617419838905, + "learning_rate": 1.1107231920199502e-05, + "loss": 0.0005, + "step": 17840 + }, + { + "epoch": 2.2256857855361596, + "grad_norm": 0.04199754819273949, + "learning_rate": 1.1102743142144638e-05, + "loss": 0.0408, + "step": 17850 + }, + { + "epoch": 2.226932668329177, + "grad_norm": 0.004518990404903889, + "learning_rate": 1.1097755610972569e-05, + "loss": 0.0005, + "step": 17860 + }, + { + "epoch": 2.2281795511221945, + "grad_norm": 0.006416243966668844, + "learning_rate": 1.10927680798005e-05, + "loss": 0.0002, + "step": 17870 + }, + { + "epoch": 2.229426433915212, + "grad_norm": 25.688091278076172, + "learning_rate": 1.108778054862843e-05, + "loss": 0.0553, + "step": 17880 + }, + { + "epoch": 2.2306733167082293, + "grad_norm": 0.002934435848146677, + "learning_rate": 1.1082793017456359e-05, + "loss": 0.0019, + "step": 17890 + }, + { + "epoch": 2.2319201995012468, + "grad_norm": 0.0010024199727922678, + "learning_rate": 1.107780548628429e-05, + "loss": 0.0001, + "step": 17900 + }, + { + "epoch": 2.233167082294264, + "grad_norm": 0.0025311210192739964, + "learning_rate": 1.107281795511222e-05, + "loss": 0.0002, + "step": 17910 + }, + { + "epoch": 2.2344139650872816, + "grad_norm": 0.0057137515395879745, + "learning_rate": 1.106783042394015e-05, + "loss": 0.0377, + "step": 17920 + }, + { + "epoch": 2.235660847880299, + "grad_norm": 0.0014953430509194732, + "learning_rate": 1.106284289276808e-05, + "loss": 0.0317, + "step": 17930 + }, + { + "epoch": 2.236907730673317, + "grad_norm": 0.0010911138961091638, + "learning_rate": 
1.105785536159601e-05, + "loss": 0.0792, + "step": 17940 + }, + { + "epoch": 2.2381546134663344, + "grad_norm": 0.0012966920621693134, + "learning_rate": 1.1052867830423941e-05, + "loss": 0.0123, + "step": 17950 + }, + { + "epoch": 2.239401496259352, + "grad_norm": 0.05284218490123749, + "learning_rate": 1.1047880299251872e-05, + "loss": 0.0001, + "step": 17960 + }, + { + "epoch": 2.2406483790523692, + "grad_norm": 0.0015723456162959337, + "learning_rate": 1.10428927680798e-05, + "loss": 0.0001, + "step": 17970 + }, + { + "epoch": 2.2418952618453867, + "grad_norm": 0.075335793197155, + "learning_rate": 1.1037905236907731e-05, + "loss": 0.0676, + "step": 17980 + }, + { + "epoch": 2.243142144638404, + "grad_norm": 0.0010825825156643987, + "learning_rate": 1.1032917705735662e-05, + "loss": 0.0001, + "step": 17990 + }, + { + "epoch": 2.2443890274314215, + "grad_norm": 0.0036625145003199577, + "learning_rate": 1.1027930174563592e-05, + "loss": 0.0016, + "step": 18000 + }, + { + "epoch": 2.245635910224439, + "grad_norm": 0.001976029947400093, + "learning_rate": 1.1022942643391523e-05, + "loss": 0.0451, + "step": 18010 + }, + { + "epoch": 2.2468827930174564, + "grad_norm": 0.005771087482571602, + "learning_rate": 1.1017955112219452e-05, + "loss": 0.0546, + "step": 18020 + }, + { + "epoch": 2.248129675810474, + "grad_norm": 0.0013193414779379964, + "learning_rate": 1.1012967581047382e-05, + "loss": 0.0255, + "step": 18030 + }, + { + "epoch": 2.2493765586034913, + "grad_norm": 0.0013921220088377595, + "learning_rate": 1.1007980049875313e-05, + "loss": 0.0002, + "step": 18040 + }, + { + "epoch": 2.2506234413965087, + "grad_norm": 0.16599981486797333, + "learning_rate": 1.1002992518703244e-05, + "loss": 0.0003, + "step": 18050 + }, + { + "epoch": 2.251870324189526, + "grad_norm": 0.003775638760998845, + "learning_rate": 1.0998004987531173e-05, + "loss": 0.0001, + "step": 18060 + }, + { + "epoch": 2.2531172069825436, + "grad_norm": 0.0013987654820084572, + "learning_rate": 
1.0993017456359103e-05, + "loss": 0.0002, + "step": 18070 + }, + { + "epoch": 2.254364089775561, + "grad_norm": 0.0029886842239648104, + "learning_rate": 1.0988029925187034e-05, + "loss": 0.0535, + "step": 18080 + }, + { + "epoch": 2.2556109725685785, + "grad_norm": 0.001835786853916943, + "learning_rate": 1.0983042394014964e-05, + "loss": 0.0001, + "step": 18090 + }, + { + "epoch": 2.256857855361596, + "grad_norm": 0.02437090501189232, + "learning_rate": 1.0978054862842893e-05, + "loss": 0.0012, + "step": 18100 + }, + { + "epoch": 2.2581047381546133, + "grad_norm": 0.958751380443573, + "learning_rate": 1.0973067331670824e-05, + "loss": 0.052, + "step": 18110 + }, + { + "epoch": 2.2593516209476308, + "grad_norm": 0.3797219395637512, + "learning_rate": 1.0968079800498755e-05, + "loss": 0.0012, + "step": 18120 + }, + { + "epoch": 2.260598503740648, + "grad_norm": 0.0013616789365187287, + "learning_rate": 1.0963092269326685e-05, + "loss": 0.0001, + "step": 18130 + }, + { + "epoch": 2.2618453865336656, + "grad_norm": 0.0048110876232385635, + "learning_rate": 1.0958104738154614e-05, + "loss": 0.0148, + "step": 18140 + }, + { + "epoch": 2.263092269326683, + "grad_norm": 0.0008074498036876321, + "learning_rate": 1.0953117206982545e-05, + "loss": 0.0002, + "step": 18150 + }, + { + "epoch": 2.264339152119701, + "grad_norm": 0.0034766916651278734, + "learning_rate": 1.0948129675810475e-05, + "loss": 0.0346, + "step": 18160 + }, + { + "epoch": 2.2655860349127184, + "grad_norm": 0.0013960660435259342, + "learning_rate": 1.0943142144638406e-05, + "loss": 0.0001, + "step": 18170 + }, + { + "epoch": 2.266832917705736, + "grad_norm": 0.0030948331113904715, + "learning_rate": 1.0938154613466333e-05, + "loss": 0.0376, + "step": 18180 + }, + { + "epoch": 2.2680798004987532, + "grad_norm": 0.00950552523136139, + "learning_rate": 1.0933167082294266e-05, + "loss": 0.0444, + "step": 18190 + }, + { + "epoch": 2.2693266832917707, + "grad_norm": 0.30063608288764954, + "learning_rate": 
1.0928179551122196e-05, + "loss": 0.0001, + "step": 18200 + }, + { + "epoch": 2.270573566084788, + "grad_norm": 0.0005822066450491548, + "learning_rate": 1.0923192019950127e-05, + "loss": 0.1135, + "step": 18210 + }, + { + "epoch": 2.2718204488778055, + "grad_norm": 0.0010623615235090256, + "learning_rate": 1.0918204488778054e-05, + "loss": 0.0456, + "step": 18220 + }, + { + "epoch": 2.273067331670823, + "grad_norm": 0.00394933158531785, + "learning_rate": 1.0913216957605985e-05, + "loss": 0.0294, + "step": 18230 + }, + { + "epoch": 2.2743142144638404, + "grad_norm": 0.08037488162517548, + "learning_rate": 1.0908229426433915e-05, + "loss": 0.0003, + "step": 18240 + }, + { + "epoch": 2.275561097256858, + "grad_norm": 8.893105506896973, + "learning_rate": 1.0903241895261848e-05, + "loss": 0.04, + "step": 18250 + }, + { + "epoch": 2.2768079800498753, + "grad_norm": 0.12478364259004593, + "learning_rate": 1.0898254364089778e-05, + "loss": 0.0167, + "step": 18260 + }, + { + "epoch": 2.2780548628428927, + "grad_norm": 0.013608187437057495, + "learning_rate": 1.0893266832917705e-05, + "loss": 0.0263, + "step": 18270 + }, + { + "epoch": 2.27930174563591, + "grad_norm": 0.015265119262039661, + "learning_rate": 1.0888279301745636e-05, + "loss": 0.0161, + "step": 18280 + }, + { + "epoch": 2.2805486284289276, + "grad_norm": 0.004604879766702652, + "learning_rate": 1.0883291770573567e-05, + "loss": 0.0419, + "step": 18290 + }, + { + "epoch": 2.281795511221945, + "grad_norm": 0.0062502785585820675, + "learning_rate": 1.0878304239401497e-05, + "loss": 0.0004, + "step": 18300 + }, + { + "epoch": 2.2830423940149625, + "grad_norm": 0.3815927505493164, + "learning_rate": 1.0873316708229426e-05, + "loss": 0.0002, + "step": 18310 + }, + { + "epoch": 2.28428927680798, + "grad_norm": 0.0019634002819657326, + "learning_rate": 1.0868329177057357e-05, + "loss": 0.0146, + "step": 18320 + }, + { + "epoch": 2.2855361596009973, + "grad_norm": 0.00816398486495018, + "learning_rate": 
1.0863341645885288e-05, + "loss": 0.0426, + "step": 18330 + }, + { + "epoch": 2.286783042394015, + "grad_norm": 0.0013969374122098088, + "learning_rate": 1.0858354114713218e-05, + "loss": 0.0677, + "step": 18340 + }, + { + "epoch": 2.2880299251870326, + "grad_norm": 0.002813225844874978, + "learning_rate": 1.0853366583541147e-05, + "loss": 0.0005, + "step": 18350 + }, + { + "epoch": 2.28927680798005, + "grad_norm": 21.107683181762695, + "learning_rate": 1.0848379052369078e-05, + "loss": 0.1074, + "step": 18360 + }, + { + "epoch": 2.2905236907730675, + "grad_norm": 0.007905518636107445, + "learning_rate": 1.0843391521197008e-05, + "loss": 0.0124, + "step": 18370 + }, + { + "epoch": 2.291770573566085, + "grad_norm": 0.05258958786725998, + "learning_rate": 1.0838403990024939e-05, + "loss": 0.019, + "step": 18380 + }, + { + "epoch": 2.2930174563591024, + "grad_norm": 0.009827799163758755, + "learning_rate": 1.0833416458852868e-05, + "loss": 0.0448, + "step": 18390 + }, + { + "epoch": 2.29426433915212, + "grad_norm": 0.009128672070801258, + "learning_rate": 1.0828428927680798e-05, + "loss": 0.0133, + "step": 18400 + }, + { + "epoch": 2.2955112219451372, + "grad_norm": 28.526371002197266, + "learning_rate": 1.0823441396508729e-05, + "loss": 0.0464, + "step": 18410 + }, + { + "epoch": 2.2967581047381547, + "grad_norm": 0.014480763114988804, + "learning_rate": 1.081845386533666e-05, + "loss": 0.0222, + "step": 18420 + }, + { + "epoch": 2.298004987531172, + "grad_norm": 1.6726791858673096, + "learning_rate": 1.0813466334164589e-05, + "loss": 0.0547, + "step": 18430 + }, + { + "epoch": 2.2992518703241895, + "grad_norm": 14.110183715820312, + "learning_rate": 1.080847880299252e-05, + "loss": 0.0386, + "step": 18440 + }, + { + "epoch": 2.300498753117207, + "grad_norm": 18.027050018310547, + "learning_rate": 1.080349127182045e-05, + "loss": 0.0216, + "step": 18450 + }, + { + "epoch": 2.3017456359102244, + "grad_norm": 0.0013066193787381053, + "learning_rate": 
1.079850374064838e-05, + "loss": 0.0446, + "step": 18460 + }, + { + "epoch": 2.302992518703242, + "grad_norm": 0.21965332329273224, + "learning_rate": 1.079351620947631e-05, + "loss": 0.0004, + "step": 18470 + }, + { + "epoch": 2.3042394014962593, + "grad_norm": 0.0025097858160734177, + "learning_rate": 1.078852867830424e-05, + "loss": 0.0262, + "step": 18480 + }, + { + "epoch": 2.3054862842892767, + "grad_norm": 0.0015427289763465524, + "learning_rate": 1.078354114713217e-05, + "loss": 0.0283, + "step": 18490 + }, + { + "epoch": 2.306733167082294, + "grad_norm": 0.00836893543601036, + "learning_rate": 1.0778553615960101e-05, + "loss": 0.0423, + "step": 18500 + }, + { + "epoch": 2.3079800498753116, + "grad_norm": 0.010444067418575287, + "learning_rate": 1.0773566084788032e-05, + "loss": 0.0002, + "step": 18510 + }, + { + "epoch": 2.309226932668329, + "grad_norm": 0.00873679667711258, + "learning_rate": 1.076857855361596e-05, + "loss": 0.0003, + "step": 18520 + }, + { + "epoch": 2.3104738154613464, + "grad_norm": 0.0064266156405210495, + "learning_rate": 1.0763591022443891e-05, + "loss": 0.0003, + "step": 18530 + }, + { + "epoch": 2.311720698254364, + "grad_norm": 0.020968349650502205, + "learning_rate": 1.0758603491271822e-05, + "loss": 0.0052, + "step": 18540 + }, + { + "epoch": 2.3129675810473813, + "grad_norm": 0.03227841854095459, + "learning_rate": 1.0753615960099753e-05, + "loss": 0.0062, + "step": 18550 + }, + { + "epoch": 2.314214463840399, + "grad_norm": 0.001537321018986404, + "learning_rate": 1.0748628428927682e-05, + "loss": 0.0034, + "step": 18560 + }, + { + "epoch": 2.3154613466334166, + "grad_norm": 0.019995620474219322, + "learning_rate": 1.0743640897755612e-05, + "loss": 0.0001, + "step": 18570 + }, + { + "epoch": 2.316708229426434, + "grad_norm": 0.002400660654529929, + "learning_rate": 1.0738653366583543e-05, + "loss": 0.0111, + "step": 18580 + }, + { + "epoch": 2.3179551122194515, + "grad_norm": 0.0035773960407823324, + "learning_rate": 
1.0733665835411473e-05, + "loss": 0.0077, + "step": 18590 + }, + { + "epoch": 2.319201995012469, + "grad_norm": 0.006284439004957676, + "learning_rate": 1.0728678304239402e-05, + "loss": 0.0373, + "step": 18600 + }, + { + "epoch": 2.3204488778054864, + "grad_norm": 0.015236135572195053, + "learning_rate": 1.0723690773067333e-05, + "loss": 0.0001, + "step": 18610 + }, + { + "epoch": 2.321695760598504, + "grad_norm": 0.004189270548522472, + "learning_rate": 1.0718703241895264e-05, + "loss": 0.0002, + "step": 18620 + }, + { + "epoch": 2.3229426433915212, + "grad_norm": 0.0024999042507261038, + "learning_rate": 1.0713715710723194e-05, + "loss": 0.0001, + "step": 18630 + }, + { + "epoch": 2.3241895261845387, + "grad_norm": 0.0008350514690391719, + "learning_rate": 1.0708728179551121e-05, + "loss": 0.0188, + "step": 18640 + }, + { + "epoch": 2.325436408977556, + "grad_norm": 0.005228615365922451, + "learning_rate": 1.0703740648379052e-05, + "loss": 0.0433, + "step": 18650 + }, + { + "epoch": 2.3266832917705735, + "grad_norm": 0.02997826412320137, + "learning_rate": 1.0698753117206984e-05, + "loss": 0.0227, + "step": 18660 + }, + { + "epoch": 2.327930174563591, + "grad_norm": 0.001615180866792798, + "learning_rate": 1.0693765586034915e-05, + "loss": 0.0002, + "step": 18670 + }, + { + "epoch": 2.3291770573566084, + "grad_norm": 0.0009207409457303584, + "learning_rate": 1.0688778054862842e-05, + "loss": 0.0001, + "step": 18680 + }, + { + "epoch": 2.330423940149626, + "grad_norm": 11.185653686523438, + "learning_rate": 1.0683790523690773e-05, + "loss": 0.001, + "step": 18690 + }, + { + "epoch": 2.3316708229426433, + "grad_norm": 0.0005180281004868448, + "learning_rate": 1.0678802992518703e-05, + "loss": 0.0031, + "step": 18700 + }, + { + "epoch": 2.3329177057356607, + "grad_norm": 0.0009638606570661068, + "learning_rate": 1.0673815461346634e-05, + "loss": 0.0016, + "step": 18710 + }, + { + "epoch": 2.334164588528678, + "grad_norm": 18.893842697143555, + "learning_rate": 
1.0668827930174563e-05, + "loss": 0.0544, + "step": 18720 + }, + { + "epoch": 2.3354114713216956, + "grad_norm": 0.010233055800199509, + "learning_rate": 1.0663840399002494e-05, + "loss": 0.045, + "step": 18730 + }, + { + "epoch": 2.3366583541147135, + "grad_norm": 0.0015732423635199666, + "learning_rate": 1.0658852867830424e-05, + "loss": 0.0543, + "step": 18740 + }, + { + "epoch": 2.337905236907731, + "grad_norm": 0.003005718346685171, + "learning_rate": 1.0653865336658355e-05, + "loss": 0.022, + "step": 18750 + }, + { + "epoch": 2.3391521197007483, + "grad_norm": 0.003704602364450693, + "learning_rate": 1.0648877805486286e-05, + "loss": 0.0001, + "step": 18760 + }, + { + "epoch": 2.3403990024937658, + "grad_norm": 0.07532232254743576, + "learning_rate": 1.0643890274314214e-05, + "loss": 0.0028, + "step": 18770 + }, + { + "epoch": 2.341645885286783, + "grad_norm": 0.0018420940032228827, + "learning_rate": 1.0638902743142145e-05, + "loss": 0.0467, + "step": 18780 + }, + { + "epoch": 2.3428927680798006, + "grad_norm": 0.005858957301825285, + "learning_rate": 1.0633915211970076e-05, + "loss": 0.0387, + "step": 18790 + }, + { + "epoch": 2.344139650872818, + "grad_norm": 0.020084038376808167, + "learning_rate": 1.0628927680798006e-05, + "loss": 0.0513, + "step": 18800 + }, + { + "epoch": 2.3453865336658355, + "grad_norm": 0.008996500633656979, + "learning_rate": 1.0623940149625935e-05, + "loss": 0.0003, + "step": 18810 + }, + { + "epoch": 2.346633416458853, + "grad_norm": 0.003694218583405018, + "learning_rate": 1.0618952618453866e-05, + "loss": 0.0001, + "step": 18820 + }, + { + "epoch": 2.3478802992518704, + "grad_norm": 0.018681691959500313, + "learning_rate": 1.0613965087281796e-05, + "loss": 0.0012, + "step": 18830 + }, + { + "epoch": 2.349127182044888, + "grad_norm": 0.00553112244233489, + "learning_rate": 1.0608977556109727e-05, + "loss": 0.0102, + "step": 18840 + }, + { + "epoch": 2.3503740648379052, + "grad_norm": 0.10294589400291443, + "learning_rate": 
1.0603990024937656e-05, + "loss": 0.0603, + "step": 18850 + }, + { + "epoch": 2.3516209476309227, + "grad_norm": 0.0040004318580031395, + "learning_rate": 1.0599002493765587e-05, + "loss": 0.0424, + "step": 18860 + }, + { + "epoch": 2.35286783042394, + "grad_norm": 0.0010672288481146097, + "learning_rate": 1.0594014962593517e-05, + "loss": 0.0001, + "step": 18870 + }, + { + "epoch": 2.3541147132169575, + "grad_norm": 0.003163151443004608, + "learning_rate": 1.0589027431421448e-05, + "loss": 0.0031, + "step": 18880 + }, + { + "epoch": 2.355361596009975, + "grad_norm": 0.24321340024471283, + "learning_rate": 1.0584039900249377e-05, + "loss": 0.0003, + "step": 18890 + }, + { + "epoch": 2.3566084788029924, + "grad_norm": 0.7143842577934265, + "learning_rate": 1.0579052369077307e-05, + "loss": 0.0274, + "step": 18900 + }, + { + "epoch": 2.35785536159601, + "grad_norm": 0.0029214757960289717, + "learning_rate": 1.0574064837905238e-05, + "loss": 0.0373, + "step": 18910 + }, + { + "epoch": 2.3591022443890273, + "grad_norm": 0.0009077019640244544, + "learning_rate": 1.0569077306733169e-05, + "loss": 0.0375, + "step": 18920 + }, + { + "epoch": 2.3603491271820447, + "grad_norm": 0.007901903241872787, + "learning_rate": 1.0564089775561098e-05, + "loss": 0.0001, + "step": 18930 + }, + { + "epoch": 2.361596009975062, + "grad_norm": 23.092758178710938, + "learning_rate": 1.0559102244389028e-05, + "loss": 0.028, + "step": 18940 + }, + { + "epoch": 2.3628428927680796, + "grad_norm": 0.006665545050054789, + "learning_rate": 1.0554114713216959e-05, + "loss": 0.029, + "step": 18950 + }, + { + "epoch": 2.3640897755610975, + "grad_norm": 0.1113353744149208, + "learning_rate": 1.054912718204489e-05, + "loss": 0.0002, + "step": 18960 + }, + { + "epoch": 2.365336658354115, + "grad_norm": 0.0011409720173105597, + "learning_rate": 1.0544139650872818e-05, + "loss": 0.0443, + "step": 18970 + }, + { + "epoch": 2.3665835411471323, + "grad_norm": 0.006274055223912001, + "learning_rate": 
1.0539152119700749e-05, + "loss": 0.0001, + "step": 18980 + }, + { + "epoch": 2.3678304239401498, + "grad_norm": 0.03493626415729523, + "learning_rate": 1.053416458852868e-05, + "loss": 0.0148, + "step": 18990 + }, + { + "epoch": 2.369077306733167, + "grad_norm": 0.0024186724331229925, + "learning_rate": 1.052917705735661e-05, + "loss": 0.0008, + "step": 19000 + }, + { + "epoch": 2.3703241895261846, + "grad_norm": 0.0004868563555646688, + "learning_rate": 1.052418952618454e-05, + "loss": 0.0154, + "step": 19010 + }, + { + "epoch": 2.371571072319202, + "grad_norm": 0.01056770421564579, + "learning_rate": 1.051920199501247e-05, + "loss": 0.0014, + "step": 19020 + }, + { + "epoch": 2.3728179551122195, + "grad_norm": 0.0035313288681209087, + "learning_rate": 1.05142144638404e-05, + "loss": 0.0001, + "step": 19030 + }, + { + "epoch": 2.374064837905237, + "grad_norm": 62.090789794921875, + "learning_rate": 1.0509226932668331e-05, + "loss": 0.0211, + "step": 19040 + }, + { + "epoch": 2.3753117206982544, + "grad_norm": 0.0009852793300524354, + "learning_rate": 1.0504239401496262e-05, + "loss": 0.0001, + "step": 19050 + }, + { + "epoch": 2.376558603491272, + "grad_norm": 0.017761345952749252, + "learning_rate": 1.0499251870324189e-05, + "loss": 0.0371, + "step": 19060 + }, + { + "epoch": 2.3778054862842892, + "grad_norm": 0.003993969410657883, + "learning_rate": 1.0494264339152121e-05, + "loss": 0.011, + "step": 19070 + }, + { + "epoch": 2.3790523690773067, + "grad_norm": 0.14724621176719666, + "learning_rate": 1.0489276807980052e-05, + "loss": 0.0002, + "step": 19080 + }, + { + "epoch": 2.380299251870324, + "grad_norm": 102.79788208007812, + "learning_rate": 1.0484289276807982e-05, + "loss": 0.0175, + "step": 19090 + }, + { + "epoch": 2.3815461346633415, + "grad_norm": 49.15989303588867, + "learning_rate": 1.047930174563591e-05, + "loss": 0.0631, + "step": 19100 + }, + { + "epoch": 2.382793017456359, + "grad_norm": 0.16368083655834198, + "learning_rate": 
1.047431421446384e-05, + "loss": 0.004, + "step": 19110 + }, + { + "epoch": 2.3840399002493764, + "grad_norm": 0.0005702089983969927, + "learning_rate": 1.0469326683291771e-05, + "loss": 0.0001, + "step": 19120 + }, + { + "epoch": 2.385286783042394, + "grad_norm": 0.0047392770648002625, + "learning_rate": 1.0464339152119703e-05, + "loss": 0.0477, + "step": 19130 + }, + { + "epoch": 2.3865336658354117, + "grad_norm": 0.0008517673704773188, + "learning_rate": 1.045935162094763e-05, + "loss": 0.0668, + "step": 19140 + }, + { + "epoch": 2.387780548628429, + "grad_norm": 0.0008810373838059604, + "learning_rate": 1.0454364089775561e-05, + "loss": 0.0002, + "step": 19150 + }, + { + "epoch": 2.3890274314214466, + "grad_norm": 0.0014551517087966204, + "learning_rate": 1.0449376558603492e-05, + "loss": 0.0002, + "step": 19160 + }, + { + "epoch": 2.390274314214464, + "grad_norm": 0.00041260942816734314, + "learning_rate": 1.0444389027431422e-05, + "loss": 0.0601, + "step": 19170 + }, + { + "epoch": 2.3915211970074814, + "grad_norm": 0.012174108065664768, + "learning_rate": 1.0439401496259351e-05, + "loss": 0.0321, + "step": 19180 + }, + { + "epoch": 2.392768079800499, + "grad_norm": 0.055804114788770676, + "learning_rate": 1.0434413965087282e-05, + "loss": 0.0318, + "step": 19190 + }, + { + "epoch": 2.3940149625935163, + "grad_norm": 0.005796500016003847, + "learning_rate": 1.0429426433915212e-05, + "loss": 0.0001, + "step": 19200 + }, + { + "epoch": 2.3952618453865338, + "grad_norm": 0.0011061924742534757, + "learning_rate": 1.0424438902743143e-05, + "loss": 0.0374, + "step": 19210 + }, + { + "epoch": 2.396508728179551, + "grad_norm": 0.0011768280528485775, + "learning_rate": 1.0419451371571072e-05, + "loss": 0.0058, + "step": 19220 + }, + { + "epoch": 2.3977556109725686, + "grad_norm": 0.004094341304153204, + "learning_rate": 1.0414463840399003e-05, + "loss": 0.0001, + "step": 19230 + }, + { + "epoch": 2.399002493765586, + "grad_norm": 0.005375190172344446, + 
"learning_rate": 1.0409476309226933e-05, + "loss": 0.0001, + "step": 19240 + }, + { + "epoch": 2.4002493765586035, + "grad_norm": 0.0006093172705732286, + "learning_rate": 1.0404488778054864e-05, + "loss": 0.0007, + "step": 19250 + }, + { + "epoch": 2.401496259351621, + "grad_norm": 3.3103253841400146, + "learning_rate": 1.0399501246882794e-05, + "loss": 0.0166, + "step": 19260 + }, + { + "epoch": 2.4027431421446384, + "grad_norm": 0.0015977061120793223, + "learning_rate": 1.0394513715710723e-05, + "loss": 0.0, + "step": 19270 + }, + { + "epoch": 2.403990024937656, + "grad_norm": 23.932458877563477, + "learning_rate": 1.0389526184538654e-05, + "loss": 0.0425, + "step": 19280 + }, + { + "epoch": 2.4052369077306732, + "grad_norm": 0.0015871457289904356, + "learning_rate": 1.0384538653366585e-05, + "loss": 0.042, + "step": 19290 + }, + { + "epoch": 2.4064837905236907, + "grad_norm": 0.002823758404701948, + "learning_rate": 1.0379551122194515e-05, + "loss": 0.0002, + "step": 19300 + }, + { + "epoch": 2.407730673316708, + "grad_norm": 0.004873698577284813, + "learning_rate": 1.0374563591022444e-05, + "loss": 0.0028, + "step": 19310 + }, + { + "epoch": 2.4089775561097255, + "grad_norm": 70.15239715576172, + "learning_rate": 1.0369576059850375e-05, + "loss": 0.0039, + "step": 19320 + }, + { + "epoch": 2.410224438902743, + "grad_norm": 8.596805572509766, + "learning_rate": 1.0364588528678305e-05, + "loss": 0.048, + "step": 19330 + }, + { + "epoch": 2.4114713216957604, + "grad_norm": 0.0022795642726123333, + "learning_rate": 1.0359600997506236e-05, + "loss": 0.0956, + "step": 19340 + }, + { + "epoch": 2.412718204488778, + "grad_norm": 0.0011459199013188481, + "learning_rate": 1.0354613466334165e-05, + "loss": 0.0231, + "step": 19350 + }, + { + "epoch": 2.4139650872817953, + "grad_norm": 0.014403182081878185, + "learning_rate": 1.0349625935162096e-05, + "loss": 0.0007, + "step": 19360 + }, + { + "epoch": 2.415211970074813, + "grad_norm": 0.0014125104062259197, + 
"learning_rate": 1.0344638403990026e-05, + "loss": 0.0001, + "step": 19370 + }, + { + "epoch": 2.4164588528678306, + "grad_norm": 20.214357376098633, + "learning_rate": 1.0339650872817957e-05, + "loss": 0.0022, + "step": 19380 + }, + { + "epoch": 2.417705735660848, + "grad_norm": 0.015539759770035744, + "learning_rate": 1.0334663341645886e-05, + "loss": 0.0359, + "step": 19390 + }, + { + "epoch": 2.4189526184538654, + "grad_norm": 0.002788018202409148, + "learning_rate": 1.0329675810473816e-05, + "loss": 0.0505, + "step": 19400 + }, + { + "epoch": 2.420199501246883, + "grad_norm": 0.008958714082837105, + "learning_rate": 1.0324688279301747e-05, + "loss": 0.0033, + "step": 19410 + }, + { + "epoch": 2.4214463840399003, + "grad_norm": 0.03293564170598984, + "learning_rate": 1.0319700748129678e-05, + "loss": 0.0016, + "step": 19420 + }, + { + "epoch": 2.4226932668329177, + "grad_norm": 0.002108257031068206, + "learning_rate": 1.0314713216957607e-05, + "loss": 0.0005, + "step": 19430 + }, + { + "epoch": 2.423940149625935, + "grad_norm": 0.0034332843497395515, + "learning_rate": 1.0309725685785537e-05, + "loss": 0.0068, + "step": 19440 + }, + { + "epoch": 2.4251870324189526, + "grad_norm": 0.01002184022217989, + "learning_rate": 1.0304738154613468e-05, + "loss": 0.0025, + "step": 19450 + }, + { + "epoch": 2.42643391521197, + "grad_norm": 0.0007235811208374798, + "learning_rate": 1.0299750623441398e-05, + "loss": 0.0333, + "step": 19460 + }, + { + "epoch": 2.4276807980049875, + "grad_norm": 87.51153564453125, + "learning_rate": 1.0294763092269327e-05, + "loss": 0.021, + "step": 19470 + }, + { + "epoch": 2.428927680798005, + "grad_norm": 0.01124797947704792, + "learning_rate": 1.0289775561097258e-05, + "loss": 0.0001, + "step": 19480 + }, + { + "epoch": 2.4301745635910224, + "grad_norm": 0.06198972091078758, + "learning_rate": 1.0284788029925189e-05, + "loss": 0.0001, + "step": 19490 + }, + { + "epoch": 2.43142144638404, + "grad_norm": 0.00533120846375823, + 
"learning_rate": 1.027980049875312e-05, + "loss": 0.0658, + "step": 19500 + }, + { + "epoch": 2.432668329177057, + "grad_norm": 9.353033065795898, + "learning_rate": 1.027481296758105e-05, + "loss": 0.0092, + "step": 19510 + }, + { + "epoch": 2.4339152119700747, + "grad_norm": 0.00038408240652643144, + "learning_rate": 1.0269825436408977e-05, + "loss": 0.0513, + "step": 19520 + }, + { + "epoch": 2.435162094763092, + "grad_norm": 0.010251792147755623, + "learning_rate": 1.026483790523691e-05, + "loss": 0.0001, + "step": 19530 + }, + { + "epoch": 2.43640897755611, + "grad_norm": 0.0018415587255731225, + "learning_rate": 1.025985037406484e-05, + "loss": 0.0001, + "step": 19540 + }, + { + "epoch": 2.4376558603491274, + "grad_norm": 0.0017100636614486575, + "learning_rate": 1.025486284289277e-05, + "loss": 0.0023, + "step": 19550 + }, + { + "epoch": 2.438902743142145, + "grad_norm": 0.0007173551712185144, + "learning_rate": 1.0249875311720698e-05, + "loss": 0.0741, + "step": 19560 + }, + { + "epoch": 2.4401496259351623, + "grad_norm": 0.001977028325200081, + "learning_rate": 1.0244887780548628e-05, + "loss": 0.0235, + "step": 19570 + }, + { + "epoch": 2.4413965087281797, + "grad_norm": 23.780227661132812, + "learning_rate": 1.0239900249376559e-05, + "loss": 0.044, + "step": 19580 + }, + { + "epoch": 2.442643391521197, + "grad_norm": 0.012191710062325, + "learning_rate": 1.0234912718204491e-05, + "loss": 0.0306, + "step": 19590 + }, + { + "epoch": 2.4438902743142146, + "grad_norm": 0.007011815905570984, + "learning_rate": 1.0229925187032419e-05, + "loss": 0.0174, + "step": 19600 + }, + { + "epoch": 2.445137157107232, + "grad_norm": 6.738776683807373, + "learning_rate": 1.022493765586035e-05, + "loss": 0.0007, + "step": 19610 + }, + { + "epoch": 2.4463840399002494, + "grad_norm": 0.0014683338813483715, + "learning_rate": 1.021995012468828e-05, + "loss": 0.0327, + "step": 19620 + }, + { + "epoch": 2.447630922693267, + "grad_norm": 0.003384089795872569, + "learning_rate": 
1.021496259351621e-05, + "loss": 0.0271, + "step": 19630 + }, + { + "epoch": 2.4488778054862843, + "grad_norm": 36.67137908935547, + "learning_rate": 1.020997506234414e-05, + "loss": 0.0254, + "step": 19640 + }, + { + "epoch": 2.4501246882793017, + "grad_norm": 0.01410730741918087, + "learning_rate": 1.020498753117207e-05, + "loss": 0.0142, + "step": 19650 + }, + { + "epoch": 2.451371571072319, + "grad_norm": 0.060422949492931366, + "learning_rate": 1.02e-05, + "loss": 0.034, + "step": 19660 + }, + { + "epoch": 2.4526184538653366, + "grad_norm": 0.0010058670304715633, + "learning_rate": 1.0195012468827931e-05, + "loss": 0.0001, + "step": 19670 + }, + { + "epoch": 2.453865336658354, + "grad_norm": 0.4131668210029602, + "learning_rate": 1.019002493765586e-05, + "loss": 0.0342, + "step": 19680 + }, + { + "epoch": 2.4551122194513715, + "grad_norm": 0.0024416018277406693, + "learning_rate": 1.018503740648379e-05, + "loss": 0.0509, + "step": 19690 + }, + { + "epoch": 2.456359102244389, + "grad_norm": 0.012069313786923885, + "learning_rate": 1.0180049875311721e-05, + "loss": 0.0849, + "step": 19700 + }, + { + "epoch": 2.4576059850374063, + "grad_norm": 0.01398361474275589, + "learning_rate": 1.0175062344139652e-05, + "loss": 0.0031, + "step": 19710 + }, + { + "epoch": 2.458852867830424, + "grad_norm": 0.004862997680902481, + "learning_rate": 1.0170074812967581e-05, + "loss": 0.0204, + "step": 19720 + }, + { + "epoch": 2.460099750623441, + "grad_norm": 20.287076950073242, + "learning_rate": 1.0165087281795512e-05, + "loss": 0.0088, + "step": 19730 + }, + { + "epoch": 2.4613466334164587, + "grad_norm": 0.06296839565038681, + "learning_rate": 1.0160099750623442e-05, + "loss": 0.0001, + "step": 19740 + }, + { + "epoch": 2.462593516209476, + "grad_norm": 0.0026271676179021597, + "learning_rate": 1.0155112219451373e-05, + "loss": 0.0116, + "step": 19750 + }, + { + "epoch": 2.4638403990024935, + "grad_norm": 0.03639456257224083, + "learning_rate": 1.0150124688279303e-05, + 
"loss": 0.0002, + "step": 19760 + }, + { + "epoch": 2.4650872817955114, + "grad_norm": 1.8751825094223022, + "learning_rate": 1.0145137157107232e-05, + "loss": 0.0206, + "step": 19770 + }, + { + "epoch": 2.466334164588529, + "grad_norm": 0.011576034128665924, + "learning_rate": 1.0140149625935163e-05, + "loss": 0.048, + "step": 19780 + }, + { + "epoch": 2.4675810473815463, + "grad_norm": 25.818098068237305, + "learning_rate": 1.0135162094763094e-05, + "loss": 0.108, + "step": 19790 + }, + { + "epoch": 2.4688279301745637, + "grad_norm": 0.0033021382987499237, + "learning_rate": 1.0130174563591024e-05, + "loss": 0.0261, + "step": 19800 + }, + { + "epoch": 2.470074812967581, + "grad_norm": 0.0013052476570010185, + "learning_rate": 1.0125187032418953e-05, + "loss": 0.0002, + "step": 19810 + }, + { + "epoch": 2.4713216957605986, + "grad_norm": 0.018260836601257324, + "learning_rate": 1.0120199501246884e-05, + "loss": 0.0285, + "step": 19820 + }, + { + "epoch": 2.472568578553616, + "grad_norm": 0.005579350516200066, + "learning_rate": 1.0115211970074814e-05, + "loss": 0.0009, + "step": 19830 + }, + { + "epoch": 2.4738154613466334, + "grad_norm": 0.007938280701637268, + "learning_rate": 1.0110224438902745e-05, + "loss": 0.0001, + "step": 19840 + }, + { + "epoch": 2.475062344139651, + "grad_norm": 0.0019651507027447224, + "learning_rate": 1.0105236907730674e-05, + "loss": 0.0751, + "step": 19850 + }, + { + "epoch": 2.4763092269326683, + "grad_norm": 0.005481358617544174, + "learning_rate": 1.0100249376558605e-05, + "loss": 0.0428, + "step": 19860 + }, + { + "epoch": 2.4775561097256857, + "grad_norm": 0.041737351566553116, + "learning_rate": 1.0095261845386535e-05, + "loss": 0.0002, + "step": 19870 + }, + { + "epoch": 2.478802992518703, + "grad_norm": 0.0019266968593001366, + "learning_rate": 1.0090274314214466e-05, + "loss": 0.0009, + "step": 19880 + }, + { + "epoch": 2.4800498753117206, + "grad_norm": 0.014034237712621689, + "learning_rate": 1.0085286783042395e-05, + 
"loss": 0.0298, + "step": 19890 + }, + { + "epoch": 2.481296758104738, + "grad_norm": 0.286851167678833, + "learning_rate": 1.0080299251870325e-05, + "loss": 0.0002, + "step": 19900 + }, + { + "epoch": 2.4825436408977555, + "grad_norm": 0.002897009951993823, + "learning_rate": 1.0075311720698256e-05, + "loss": 0.0002, + "step": 19910 + }, + { + "epoch": 2.483790523690773, + "grad_norm": 0.4794400930404663, + "learning_rate": 1.0070324189526187e-05, + "loss": 0.0003, + "step": 19920 + }, + { + "epoch": 2.4850374064837903, + "grad_norm": 0.02620755136013031, + "learning_rate": 1.0065336658354114e-05, + "loss": 0.0392, + "step": 19930 + }, + { + "epoch": 2.4862842892768082, + "grad_norm": Infinity, + "learning_rate": 1.0060847880299254e-05, + "loss": 0.0221, + "step": 19940 + }, + { + "epoch": 2.4875311720698257, + "grad_norm": 1.459934115409851, + "learning_rate": 1.0055860349127183e-05, + "loss": 0.0003, + "step": 19950 + }, + { + "epoch": 2.488778054862843, + "grad_norm": 0.012412721291184425, + "learning_rate": 1.0050872817955113e-05, + "loss": 0.0384, + "step": 19960 + }, + { + "epoch": 2.4900249376558605, + "grad_norm": 0.005780264735221863, + "learning_rate": 1.0045885286783044e-05, + "loss": 0.0007, + "step": 19970 + }, + { + "epoch": 2.491271820448878, + "grad_norm": 0.00027117590070702136, + "learning_rate": 1.0040897755610974e-05, + "loss": 0.0019, + "step": 19980 + }, + { + "epoch": 2.4925187032418954, + "grad_norm": 0.3980230391025543, + "learning_rate": 1.0035910224438903e-05, + "loss": 0.0002, + "step": 19990 + }, + { + "epoch": 2.493765586034913, + "grad_norm": 0.0018493568059056997, + "learning_rate": 1.0030922693266834e-05, + "loss": 0.0006, + "step": 20000 + }, + { + "epoch": 2.4950124688279303, + "grad_norm": 0.000776842818595469, + "learning_rate": 1.0025935162094765e-05, + "loss": 0.0001, + "step": 20010 + }, + { + "epoch": 2.4962593516209477, + "grad_norm": 0.003493920899927616, + "learning_rate": 1.0020947630922695e-05, + "loss": 0.0608, + 
"step": 20020 + }, + { + "epoch": 2.497506234413965, + "grad_norm": 0.012550266459584236, + "learning_rate": 1.0015960099750624e-05, + "loss": 0.0002, + "step": 20030 + }, + { + "epoch": 2.4987531172069826, + "grad_norm": 23.514270782470703, + "learning_rate": 1.0010972568578555e-05, + "loss": 0.0664, + "step": 20040 + }, + { + "epoch": 2.5, + "grad_norm": 0.0014747095992788672, + "learning_rate": 1.0005985037406485e-05, + "loss": 0.0001, + "step": 20050 + }, + { + "epoch": 2.5012468827930174, + "grad_norm": 0.0006397130782715976, + "learning_rate": 1.0000997506234416e-05, + "loss": 0.0527, + "step": 20060 + }, + { + "epoch": 2.502493765586035, + "grad_norm": 0.004135058261454105, + "learning_rate": 9.996009975062345e-06, + "loss": 0.0001, + "step": 20070 + }, + { + "epoch": 2.5037406483790523, + "grad_norm": 0.00247123371809721, + "learning_rate": 9.991022443890276e-06, + "loss": 0.0001, + "step": 20080 + }, + { + "epoch": 2.5049875311720697, + "grad_norm": 0.2668535113334656, + "learning_rate": 9.986034912718206e-06, + "loss": 0.0003, + "step": 20090 + }, + { + "epoch": 2.506234413965087, + "grad_norm": 0.001860244432464242, + "learning_rate": 9.981047381546135e-06, + "loss": 0.0459, + "step": 20100 + }, + { + "epoch": 2.5074812967581046, + "grad_norm": 0.0009490547818131745, + "learning_rate": 9.976059850374066e-06, + "loss": 0.0171, + "step": 20110 + }, + { + "epoch": 2.508728179551122, + "grad_norm": 0.002527831355109811, + "learning_rate": 9.971072319201995e-06, + "loss": 0.0003, + "step": 20120 + }, + { + "epoch": 2.5099750623441395, + "grad_norm": 0.0007606217986904085, + "learning_rate": 9.966084788029925e-06, + "loss": 0.0406, + "step": 20130 + }, + { + "epoch": 2.511221945137157, + "grad_norm": 0.0025609172880649567, + "learning_rate": 9.961097256857856e-06, + "loss": 0.0, + "step": 20140 + }, + { + "epoch": 2.5124688279301743, + "grad_norm": 0.008907945826649666, + "learning_rate": 9.956109725685787e-06, + "loss": 0.0002, + "step": 20150 + }, + { + 
"epoch": 2.5137157107231918, + "grad_norm": 0.001625910634174943, + "learning_rate": 9.951122194513715e-06, + "loss": 0.0094, + "step": 20160 + }, + { + "epoch": 2.514962593516209, + "grad_norm": 0.0023298319429159164, + "learning_rate": 9.946134663341646e-06, + "loss": 0.0001, + "step": 20170 + }, + { + "epoch": 2.516209476309227, + "grad_norm": 0.0006635018507950008, + "learning_rate": 9.941147132169577e-06, + "loss": 0.0, + "step": 20180 + }, + { + "epoch": 2.5174563591022445, + "grad_norm": 0.008249226957559586, + "learning_rate": 9.936159600997507e-06, + "loss": 0.0, + "step": 20190 + }, + { + "epoch": 2.518703241895262, + "grad_norm": 15.991927146911621, + "learning_rate": 9.931172069825438e-06, + "loss": 0.0927, + "step": 20200 + }, + { + "epoch": 2.5199501246882794, + "grad_norm": 0.01762673631310463, + "learning_rate": 9.926184538653367e-06, + "loss": 0.0001, + "step": 20210 + }, + { + "epoch": 2.521197007481297, + "grad_norm": 0.24744418263435364, + "learning_rate": 9.921197007481297e-06, + "loss": 0.0003, + "step": 20220 + }, + { + "epoch": 2.5224438902743143, + "grad_norm": 0.003450005082413554, + "learning_rate": 9.916209476309228e-06, + "loss": 0.0606, + "step": 20230 + }, + { + "epoch": 2.5236907730673317, + "grad_norm": 0.42235738039016724, + "learning_rate": 9.911221945137159e-06, + "loss": 0.0005, + "step": 20240 + }, + { + "epoch": 2.524937655860349, + "grad_norm": 0.01129044871777296, + "learning_rate": 9.906234413965088e-06, + "loss": 0.0001, + "step": 20250 + }, + { + "epoch": 2.5261845386533666, + "grad_norm": 0.0008669699891470373, + "learning_rate": 9.901246882793018e-06, + "loss": 0.0352, + "step": 20260 + }, + { + "epoch": 2.527431421446384, + "grad_norm": 0.001846897415816784, + "learning_rate": 9.896259351620949e-06, + "loss": 0.0014, + "step": 20270 + }, + { + "epoch": 2.5286783042394014, + "grad_norm": 0.0006753307534381747, + "learning_rate": 9.89127182044888e-06, + "loss": 0.0023, + "step": 20280 + }, + { + "epoch": 
2.529925187032419, + "grad_norm": 0.0007630666368640959, + "learning_rate": 9.886284289276808e-06, + "loss": 0.0, + "step": 20290 + }, + { + "epoch": 2.5311720698254363, + "grad_norm": 0.0007382095791399479, + "learning_rate": 9.881296758104739e-06, + "loss": 0.0001, + "step": 20300 + }, + { + "epoch": 2.5324189526184537, + "grad_norm": 0.0006792868953198195, + "learning_rate": 9.876309226932668e-06, + "loss": 0.0114, + "step": 20310 + }, + { + "epoch": 2.533665835411471, + "grad_norm": 0.001210409332998097, + "learning_rate": 9.8713216957606e-06, + "loss": 0.0006, + "step": 20320 + }, + { + "epoch": 2.534912718204489, + "grad_norm": 0.5116429924964905, + "learning_rate": 9.86633416458853e-06, + "loss": 0.0005, + "step": 20330 + }, + { + "epoch": 2.5361596009975065, + "grad_norm": 0.031346336007118225, + "learning_rate": 9.86134663341646e-06, + "loss": 0.0445, + "step": 20340 + }, + { + "epoch": 2.537406483790524, + "grad_norm": 0.0006276296335272491, + "learning_rate": 9.856359102244389e-06, + "loss": 0.0015, + "step": 20350 + }, + { + "epoch": 2.5386533665835413, + "grad_norm": 46.322044372558594, + "learning_rate": 9.85137157107232e-06, + "loss": 0.0762, + "step": 20360 + }, + { + "epoch": 2.539900249376559, + "grad_norm": 0.0012356149964034557, + "learning_rate": 9.84638403990025e-06, + "loss": 0.0908, + "step": 20370 + }, + { + "epoch": 2.541147132169576, + "grad_norm": 0.0005176261183805764, + "learning_rate": 9.84139650872818e-06, + "loss": 0.0001, + "step": 20380 + }, + { + "epoch": 2.5423940149625937, + "grad_norm": 0.009803502820432186, + "learning_rate": 9.83640897755611e-06, + "loss": 0.0001, + "step": 20390 + }, + { + "epoch": 2.543640897755611, + "grad_norm": 0.006273037288337946, + "learning_rate": 9.83142144638404e-06, + "loss": 0.0001, + "step": 20400 + }, + { + "epoch": 2.5448877805486285, + "grad_norm": 0.0008425627020187676, + "learning_rate": 9.82643391521197e-06, + "loss": 0.003, + "step": 20410 + }, + { + "epoch": 2.546134663341646, + 
"grad_norm": 0.0024014932569116354, + "learning_rate": 9.821446384039901e-06, + "loss": 0.0062, + "step": 20420 + }, + { + "epoch": 2.5473815461346634, + "grad_norm": 0.0010444270446896553, + "learning_rate": 9.816458852867832e-06, + "loss": 0.03, + "step": 20430 + }, + { + "epoch": 2.548628428927681, + "grad_norm": 0.002096327021718025, + "learning_rate": 9.811471321695761e-06, + "loss": 0.0051, + "step": 20440 + }, + { + "epoch": 2.5498753117206983, + "grad_norm": 0.18898934125900269, + "learning_rate": 9.806483790523692e-06, + "loss": 0.0004, + "step": 20450 + }, + { + "epoch": 2.5511221945137157, + "grad_norm": 0.006348905619233847, + "learning_rate": 9.801496259351622e-06, + "loss": 0.0003, + "step": 20460 + }, + { + "epoch": 2.552369077306733, + "grad_norm": 0.0026317578740417957, + "learning_rate": 9.796508728179553e-06, + "loss": 0.0468, + "step": 20470 + }, + { + "epoch": 2.5536159600997506, + "grad_norm": 0.0028476768638938665, + "learning_rate": 9.791521197007482e-06, + "loss": 0.0614, + "step": 20480 + }, + { + "epoch": 2.554862842892768, + "grad_norm": 0.0013454663567245007, + "learning_rate": 9.786533665835412e-06, + "loss": 0.0615, + "step": 20490 + }, + { + "epoch": 2.5561097256857854, + "grad_norm": 0.0007891925633884966, + "learning_rate": 9.781546134663343e-06, + "loss": 0.0, + "step": 20500 + }, + { + "epoch": 2.557356608478803, + "grad_norm": 0.001992990029975772, + "learning_rate": 9.776558603491274e-06, + "loss": 0.0553, + "step": 20510 + }, + { + "epoch": 2.5586034912718203, + "grad_norm": 0.0006816174718551338, + "learning_rate": 9.771571072319203e-06, + "loss": 0.0001, + "step": 20520 + }, + { + "epoch": 2.5598503740648377, + "grad_norm": 0.004548321943730116, + "learning_rate": 9.766583541147133e-06, + "loss": 0.0001, + "step": 20530 + }, + { + "epoch": 2.561097256857855, + "grad_norm": 0.002307687886059284, + "learning_rate": 9.761596009975062e-06, + "loss": 0.0056, + "step": 20540 + }, + { + "epoch": 2.5623441396508726, + "grad_norm": 
0.001487944507971406, + "learning_rate": 9.756608478802994e-06, + "loss": 0.0143, + "step": 20550 + }, + { + "epoch": 2.56359102244389, + "grad_norm": 2.5814740657806396, + "learning_rate": 9.751620947630923e-06, + "loss": 0.0006, + "step": 20560 + }, + { + "epoch": 2.5648379052369075, + "grad_norm": 0.0018412600038573146, + "learning_rate": 9.746633416458854e-06, + "loss": 0.0002, + "step": 20570 + }, + { + "epoch": 2.5660847880299253, + "grad_norm": 0.007456095889210701, + "learning_rate": 9.741645885286783e-06, + "loss": 0.0001, + "step": 20580 + }, + { + "epoch": 2.567331670822943, + "grad_norm": 0.003610810497775674, + "learning_rate": 9.736658354114713e-06, + "loss": 0.0019, + "step": 20590 + }, + { + "epoch": 2.56857855361596, + "grad_norm": 0.008344599045813084, + "learning_rate": 9.731670822942644e-06, + "loss": 0.0067, + "step": 20600 + }, + { + "epoch": 2.5698254364089776, + "grad_norm": 75.59593200683594, + "learning_rate": 9.726683291770575e-06, + "loss": 0.037, + "step": 20610 + }, + { + "epoch": 2.571072319201995, + "grad_norm": 0.18108241260051727, + "learning_rate": 9.721695760598504e-06, + "loss": 0.0001, + "step": 20620 + }, + { + "epoch": 2.5723192019950125, + "grad_norm": 0.7683297395706177, + "learning_rate": 9.716708229426434e-06, + "loss": 0.0002, + "step": 20630 + }, + { + "epoch": 2.57356608478803, + "grad_norm": 9.048687934875488, + "learning_rate": 9.711720698254365e-06, + "loss": 0.0009, + "step": 20640 + }, + { + "epoch": 2.5748129675810474, + "grad_norm": 0.001917938468977809, + "learning_rate": 9.706733167082295e-06, + "loss": 0.0022, + "step": 20650 + }, + { + "epoch": 2.576059850374065, + "grad_norm": 0.0009557070443406701, + "learning_rate": 9.701745635910226e-06, + "loss": 0.0024, + "step": 20660 + }, + { + "epoch": 2.5773067331670823, + "grad_norm": 0.000653248920571059, + "learning_rate": 9.696758104738155e-06, + "loss": 0.0277, + "step": 20670 + }, + { + "epoch": 2.5785536159600997, + "grad_norm": 0.0034873723052442074, + 
"learning_rate": 9.691770573566086e-06, + "loss": 0.0077, + "step": 20680 + }, + { + "epoch": 2.579800498753117, + "grad_norm": 0.0020467734429985285, + "learning_rate": 9.686783042394016e-06, + "loss": 0.0054, + "step": 20690 + }, + { + "epoch": 2.5810473815461346, + "grad_norm": 0.003967711236327887, + "learning_rate": 9.681795511221947e-06, + "loss": 0.0, + "step": 20700 + }, + { + "epoch": 2.582294264339152, + "grad_norm": 0.0005544735467992723, + "learning_rate": 9.676807980049876e-06, + "loss": 0.043, + "step": 20710 + }, + { + "epoch": 2.5835411471321694, + "grad_norm": 19.80573272705078, + "learning_rate": 9.671820448877806e-06, + "loss": 0.073, + "step": 20720 + }, + { + "epoch": 2.5847880299251873, + "grad_norm": 0.06793244928121567, + "learning_rate": 9.666832917705737e-06, + "loss": 0.0036, + "step": 20730 + }, + { + "epoch": 2.5860349127182047, + "grad_norm": 38.223018646240234, + "learning_rate": 9.661845386533668e-06, + "loss": 0.005, + "step": 20740 + }, + { + "epoch": 2.587281795511222, + "grad_norm": 0.0006797302630729973, + "learning_rate": 9.656857855361597e-06, + "loss": 0.0001, + "step": 20750 + }, + { + "epoch": 2.5885286783042396, + "grad_norm": 0.013156076893210411, + "learning_rate": 9.651870324189527e-06, + "loss": 0.0262, + "step": 20760 + }, + { + "epoch": 2.589775561097257, + "grad_norm": 0.0013134418986737728, + "learning_rate": 9.646882793017456e-06, + "loss": 0.0, + "step": 20770 + }, + { + "epoch": 2.5910224438902745, + "grad_norm": 0.0012859961716458201, + "learning_rate": 9.641895261845387e-06, + "loss": 0.0, + "step": 20780 + }, + { + "epoch": 2.592269326683292, + "grad_norm": 0.0007756856502965093, + "learning_rate": 9.636907730673317e-06, + "loss": 0.0273, + "step": 20790 + }, + { + "epoch": 2.5935162094763093, + "grad_norm": 0.0018667828990146518, + "learning_rate": 9.631920199501248e-06, + "loss": 0.0001, + "step": 20800 + }, + { + "epoch": 2.5947630922693268, + "grad_norm": 0.0006253106403164566, + "learning_rate": 
9.626932668329177e-06, + "loss": 0.0324, + "step": 20810 + }, + { + "epoch": 2.596009975062344, + "grad_norm": 0.004920145496726036, + "learning_rate": 9.621945137157108e-06, + "loss": 0.044, + "step": 20820 + }, + { + "epoch": 2.5972568578553616, + "grad_norm": 0.00701934564858675, + "learning_rate": 9.616957605985038e-06, + "loss": 0.0001, + "step": 20830 + }, + { + "epoch": 2.598503740648379, + "grad_norm": 0.0005125249153934419, + "learning_rate": 9.611970074812969e-06, + "loss": 0.0004, + "step": 20840 + }, + { + "epoch": 2.5997506234413965, + "grad_norm": 0.000786464661359787, + "learning_rate": 9.606982543640898e-06, + "loss": 0.0, + "step": 20850 + }, + { + "epoch": 2.600997506234414, + "grad_norm": 0.0011195202823728323, + "learning_rate": 9.601995012468828e-06, + "loss": 0.0, + "step": 20860 + }, + { + "epoch": 2.6022443890274314, + "grad_norm": 0.0008278349414467812, + "learning_rate": 9.597007481296759e-06, + "loss": 0.0644, + "step": 20870 + }, + { + "epoch": 2.603491271820449, + "grad_norm": 0.004355038516223431, + "learning_rate": 9.59201995012469e-06, + "loss": 0.0001, + "step": 20880 + }, + { + "epoch": 2.6047381546134662, + "grad_norm": 0.005529096815735102, + "learning_rate": 9.587032418952618e-06, + "loss": 0.0002, + "step": 20890 + }, + { + "epoch": 2.6059850374064837, + "grad_norm": 0.013243569061160088, + "learning_rate": 9.582044887780549e-06, + "loss": 0.0489, + "step": 20900 + }, + { + "epoch": 2.607231920199501, + "grad_norm": 2.0397703647613525, + "learning_rate": 9.57705735660848e-06, + "loss": 0.0005, + "step": 20910 + }, + { + "epoch": 2.6084788029925186, + "grad_norm": 0.0008958008256740868, + "learning_rate": 9.57206982543641e-06, + "loss": 0.0571, + "step": 20920 + }, + { + "epoch": 2.609725685785536, + "grad_norm": 0.022373545914888382, + "learning_rate": 9.567082294264341e-06, + "loss": 0.0812, + "step": 20930 + }, + { + "epoch": 2.6109725685785534, + "grad_norm": 11.973950386047363, + "learning_rate": 9.56209476309227e-06, + 
"loss": 0.0421, + "step": 20940 + }, + { + "epoch": 2.612219451371571, + "grad_norm": 0.0004938875208608806, + "learning_rate": 9.5571072319202e-06, + "loss": 0.001, + "step": 20950 + }, + { + "epoch": 2.6134663341645883, + "grad_norm": 0.004167089704424143, + "learning_rate": 9.552119700748131e-06, + "loss": 0.0583, + "step": 20960 + }, + { + "epoch": 2.6147132169576057, + "grad_norm": 0.11523910611867905, + "learning_rate": 9.547132169576062e-06, + "loss": 0.0001, + "step": 20970 + }, + { + "epoch": 2.6159600997506236, + "grad_norm": 0.0013754901010543108, + "learning_rate": 9.54214463840399e-06, + "loss": 0.0001, + "step": 20980 + }, + { + "epoch": 2.617206982543641, + "grad_norm": 28.791500091552734, + "learning_rate": 9.537157107231921e-06, + "loss": 0.0372, + "step": 20990 + }, + { + "epoch": 2.6184538653366585, + "grad_norm": 0.0019191583851352334, + "learning_rate": 9.53216957605985e-06, + "loss": 0.0003, + "step": 21000 + }, + { + "epoch": 2.619700748129676, + "grad_norm": 0.001109414966776967, + "learning_rate": 9.527182044887781e-06, + "loss": 0.0002, + "step": 21010 + }, + { + "epoch": 2.6209476309226933, + "grad_norm": 0.0011104693403467536, + "learning_rate": 9.522194513715711e-06, + "loss": 0.0176, + "step": 21020 + }, + { + "epoch": 2.6221945137157108, + "grad_norm": 6.231688022613525, + "learning_rate": 9.517206982543642e-06, + "loss": 0.001, + "step": 21030 + }, + { + "epoch": 2.623441396508728, + "grad_norm": 0.0015027527697384357, + "learning_rate": 9.512219451371571e-06, + "loss": 0.0525, + "step": 21040 + }, + { + "epoch": 2.6246882793017456, + "grad_norm": 0.15147195756435394, + "learning_rate": 9.507231920199502e-06, + "loss": 0.0001, + "step": 21050 + }, + { + "epoch": 2.625935162094763, + "grad_norm": 0.00728128245100379, + "learning_rate": 9.502244389027432e-06, + "loss": 0.0008, + "step": 21060 + }, + { + "epoch": 2.6271820448877805, + "grad_norm": 0.029355598613619804, + "learning_rate": 9.497256857855363e-06, + "loss": 0.0006, + 
"step": 21070 + }, + { + "epoch": 2.628428927680798, + "grad_norm": 0.005401694681495428, + "learning_rate": 9.492269326683292e-06, + "loss": 0.0001, + "step": 21080 + }, + { + "epoch": 2.6296758104738154, + "grad_norm": 0.09019399434328079, + "learning_rate": 9.487281795511222e-06, + "loss": 0.0001, + "step": 21090 + }, + { + "epoch": 2.630922693266833, + "grad_norm": 0.013616259209811687, + "learning_rate": 9.482294264339153e-06, + "loss": 0.0075, + "step": 21100 + }, + { + "epoch": 2.6321695760598502, + "grad_norm": 0.16121892631053925, + "learning_rate": 9.477306733167084e-06, + "loss": 0.039, + "step": 21110 + }, + { + "epoch": 2.6334164588528677, + "grad_norm": 0.028935719281435013, + "learning_rate": 9.472319201995013e-06, + "loss": 0.0344, + "step": 21120 + }, + { + "epoch": 2.6346633416458856, + "grad_norm": 0.003628489328548312, + "learning_rate": 9.467331670822943e-06, + "loss": 0.0001, + "step": 21130 + }, + { + "epoch": 2.635910224438903, + "grad_norm": 0.08887571841478348, + "learning_rate": 9.462344139650874e-06, + "loss": 0.0367, + "step": 21140 + }, + { + "epoch": 2.6371571072319204, + "grad_norm": 43.324520111083984, + "learning_rate": 9.457356608478804e-06, + "loss": 0.077, + "step": 21150 + }, + { + "epoch": 2.638403990024938, + "grad_norm": 0.0015528800431638956, + "learning_rate": 9.452369077306735e-06, + "loss": 0.0102, + "step": 21160 + }, + { + "epoch": 2.6396508728179553, + "grad_norm": 0.001877202419564128, + "learning_rate": 9.447381546134664e-06, + "loss": 0.0363, + "step": 21170 + }, + { + "epoch": 2.6408977556109727, + "grad_norm": 0.0009081983589567244, + "learning_rate": 9.442394014962595e-06, + "loss": 0.0257, + "step": 21180 + }, + { + "epoch": 2.64214463840399, + "grad_norm": 0.007661967538297176, + "learning_rate": 9.437406483790524e-06, + "loss": 0.0042, + "step": 21190 + }, + { + "epoch": 2.6433915211970076, + "grad_norm": 0.0014459396479651332, + "learning_rate": 9.432418952618456e-06, + "loss": 0.1062, + "step": 21200 + }, + 
{ + "epoch": 2.644638403990025, + "grad_norm": 0.005056493915617466, + "learning_rate": 9.427431421446385e-06, + "loss": 0.0654, + "step": 21210 + }, + { + "epoch": 2.6458852867830425, + "grad_norm": 0.002910393523052335, + "learning_rate": 9.422443890274315e-06, + "loss": 0.0004, + "step": 21220 + }, + { + "epoch": 2.64713216957606, + "grad_norm": 0.013651818037033081, + "learning_rate": 9.417456359102244e-06, + "loss": 0.03, + "step": 21230 + }, + { + "epoch": 2.6483790523690773, + "grad_norm": 0.000517072097864002, + "learning_rate": 9.412468827930175e-06, + "loss": 0.031, + "step": 21240 + }, + { + "epoch": 2.6496259351620948, + "grad_norm": 0.0005630677915178239, + "learning_rate": 9.407481296758106e-06, + "loss": 0.0001, + "step": 21250 + }, + { + "epoch": 2.650872817955112, + "grad_norm": 0.01756085641682148, + "learning_rate": 9.402493765586036e-06, + "loss": 0.0407, + "step": 21260 + }, + { + "epoch": 2.6521197007481296, + "grad_norm": 0.003300454467535019, + "learning_rate": 9.397506234413965e-06, + "loss": 0.0068, + "step": 21270 + }, + { + "epoch": 2.653366583541147, + "grad_norm": 0.01421553548425436, + "learning_rate": 9.392518703241896e-06, + "loss": 0.0445, + "step": 21280 + }, + { + "epoch": 2.6546134663341645, + "grad_norm": 0.010401526466012001, + "learning_rate": 9.387531172069826e-06, + "loss": 0.0026, + "step": 21290 + }, + { + "epoch": 2.655860349127182, + "grad_norm": 0.0018934487598016858, + "learning_rate": 9.382543640897757e-06, + "loss": 0.0295, + "step": 21300 + }, + { + "epoch": 2.6571072319201994, + "grad_norm": 0.0010117872152477503, + "learning_rate": 9.377556109725686e-06, + "loss": 0.0109, + "step": 21310 + }, + { + "epoch": 2.658354114713217, + "grad_norm": 0.1258365958929062, + "learning_rate": 9.372568578553616e-06, + "loss": 0.1095, + "step": 21320 + }, + { + "epoch": 2.6596009975062342, + "grad_norm": 0.002599473111331463, + "learning_rate": 9.367581047381547e-06, + "loss": 0.0659, + "step": 21330 + }, + { + "epoch": 
2.6608478802992517, + "grad_norm": 0.01711544394493103, + "learning_rate": 9.362593516209478e-06, + "loss": 0.0001, + "step": 21340 + }, + { + "epoch": 2.662094763092269, + "grad_norm": 0.0035147909075021744, + "learning_rate": 9.357605985037407e-06, + "loss": 0.0365, + "step": 21350 + }, + { + "epoch": 2.6633416458852865, + "grad_norm": 0.3379318118095398, + "learning_rate": 9.352618453865337e-06, + "loss": 0.0303, + "step": 21360 + }, + { + "epoch": 2.664588528678304, + "grad_norm": 4.695526123046875, + "learning_rate": 9.347630922693268e-06, + "loss": 0.001, + "step": 21370 + }, + { + "epoch": 2.665835411471322, + "grad_norm": 0.007908876985311508, + "learning_rate": 9.342643391521199e-06, + "loss": 0.0356, + "step": 21380 + }, + { + "epoch": 2.6670822942643393, + "grad_norm": 0.006391298491507769, + "learning_rate": 9.337655860349127e-06, + "loss": 0.0015, + "step": 21390 + }, + { + "epoch": 2.6683291770573567, + "grad_norm": 0.000589667062740773, + "learning_rate": 9.332668329177058e-06, + "loss": 0.0001, + "step": 21400 + }, + { + "epoch": 2.669576059850374, + "grad_norm": 0.0071742599830031395, + "learning_rate": 9.327680798004989e-06, + "loss": 0.0726, + "step": 21410 + }, + { + "epoch": 2.6708229426433916, + "grad_norm": 26.3955078125, + "learning_rate": 9.322693266832918e-06, + "loss": 0.0816, + "step": 21420 + }, + { + "epoch": 2.672069825436409, + "grad_norm": 0.006245684809982777, + "learning_rate": 9.31770573566085e-06, + "loss": 0.0301, + "step": 21430 + }, + { + "epoch": 2.6733167082294265, + "grad_norm": 0.0037288444582372904, + "learning_rate": 9.312718204488779e-06, + "loss": 0.0007, + "step": 21440 + }, + { + "epoch": 2.674563591022444, + "grad_norm": 0.023546254262328148, + "learning_rate": 9.30773067331671e-06, + "loss": 0.0205, + "step": 21450 + }, + { + "epoch": 2.6758104738154613, + "grad_norm": 0.05411310866475105, + "learning_rate": 9.302743142144638e-06, + "loss": 0.0353, + "step": 21460 + }, + { + "epoch": 2.6770573566084788, + 
"grad_norm": 0.0028213809709995985, + "learning_rate": 9.297755610972569e-06, + "loss": 0.0014, + "step": 21470 + }, + { + "epoch": 2.678304239401496, + "grad_norm": 0.1361941546201706, + "learning_rate": 9.2927680798005e-06, + "loss": 0.0416, + "step": 21480 + }, + { + "epoch": 2.6795511221945136, + "grad_norm": 0.00903173815459013, + "learning_rate": 9.28778054862843e-06, + "loss": 0.0002, + "step": 21490 + }, + { + "epoch": 2.680798004987531, + "grad_norm": 0.17789961397647858, + "learning_rate": 9.28279301745636e-06, + "loss": 0.0472, + "step": 21500 + }, + { + "epoch": 2.6820448877805485, + "grad_norm": 0.026107098907232285, + "learning_rate": 9.27780548628429e-06, + "loss": 0.0002, + "step": 21510 + }, + { + "epoch": 2.683291770573566, + "grad_norm": 0.0030536989215761423, + "learning_rate": 9.27281795511222e-06, + "loss": 0.0064, + "step": 21520 + }, + { + "epoch": 2.684538653366584, + "grad_norm": 0.004354922566562891, + "learning_rate": 9.267830423940151e-06, + "loss": 0.0357, + "step": 21530 + }, + { + "epoch": 2.6857855361596013, + "grad_norm": 0.026045413687825203, + "learning_rate": 9.26284289276808e-06, + "loss": 0.0243, + "step": 21540 + }, + { + "epoch": 2.6870324189526187, + "grad_norm": 0.060480982065200806, + "learning_rate": 9.25785536159601e-06, + "loss": 0.0013, + "step": 21550 + }, + { + "epoch": 2.688279301745636, + "grad_norm": 0.03292575851082802, + "learning_rate": 9.252867830423941e-06, + "loss": 0.0135, + "step": 21560 + }, + { + "epoch": 2.6895261845386536, + "grad_norm": 4.122298717498779, + "learning_rate": 9.247880299251872e-06, + "loss": 0.0265, + "step": 21570 + }, + { + "epoch": 2.690773067331671, + "grad_norm": 0.0019274103688076138, + "learning_rate": 9.2428927680798e-06, + "loss": 0.036, + "step": 21580 + }, + { + "epoch": 2.6920199501246884, + "grad_norm": 0.008120754733681679, + "learning_rate": 9.237905236907731e-06, + "loss": 0.0259, + "step": 21590 + }, + { + "epoch": 2.693266832917706, + "grad_norm": 0.23278528451919556, 
+ "learning_rate": 9.232917705735662e-06, + "loss": 0.0005, + "step": 21600 + }, + { + "epoch": 2.6945137157107233, + "grad_norm": 0.0032407217659056187, + "learning_rate": 9.227930174563593e-06, + "loss": 0.0034, + "step": 21610 + }, + { + "epoch": 2.6957605985037407, + "grad_norm": 0.0015782383270561695, + "learning_rate": 9.222942643391522e-06, + "loss": 0.0001, + "step": 21620 + }, + { + "epoch": 2.697007481296758, + "grad_norm": 0.002903692191466689, + "learning_rate": 9.217955112219452e-06, + "loss": 0.004, + "step": 21630 + }, + { + "epoch": 2.6982543640897756, + "grad_norm": 0.09239204227924347, + "learning_rate": 9.212967581047381e-06, + "loss": 0.0275, + "step": 21640 + }, + { + "epoch": 2.699501246882793, + "grad_norm": 0.36612099409103394, + "learning_rate": 9.207980049875312e-06, + "loss": 0.0287, + "step": 21650 + }, + { + "epoch": 2.7007481296758105, + "grad_norm": 0.0029661785811185837, + "learning_rate": 9.202992518703244e-06, + "loss": 0.0003, + "step": 21660 + }, + { + "epoch": 2.701995012468828, + "grad_norm": 0.0013707918114960194, + "learning_rate": 9.198004987531173e-06, + "loss": 0.0363, + "step": 21670 + }, + { + "epoch": 2.7032418952618453, + "grad_norm": 0.008741063997149467, + "learning_rate": 9.193017456359104e-06, + "loss": 0.0053, + "step": 21680 + }, + { + "epoch": 2.7044887780548628, + "grad_norm": 0.015000330284237862, + "learning_rate": 9.188029925187032e-06, + "loss": 0.0001, + "step": 21690 + }, + { + "epoch": 2.70573566084788, + "grad_norm": 0.021080205217003822, + "learning_rate": 9.183042394014963e-06, + "loss": 0.0001, + "step": 21700 + }, + { + "epoch": 2.7069825436408976, + "grad_norm": 19.31171226501465, + "learning_rate": 9.178054862842894e-06, + "loss": 0.0308, + "step": 21710 + }, + { + "epoch": 2.708229426433915, + "grad_norm": 0.0010757783893495798, + "learning_rate": 9.173067331670824e-06, + "loss": 0.0178, + "step": 21720 + }, + { + "epoch": 2.7094763092269325, + "grad_norm": 0.0015340207610279322, + 
"learning_rate": 9.168079800498753e-06, + "loss": 0.0034, + "step": 21730 + }, + { + "epoch": 2.71072319201995, + "grad_norm": 0.0007147280848585069, + "learning_rate": 9.163092269326684e-06, + "loss": 0.0, + "step": 21740 + }, + { + "epoch": 2.7119700748129674, + "grad_norm": 0.0007995369378477335, + "learning_rate": 9.158104738154614e-06, + "loss": 0.0003, + "step": 21750 + }, + { + "epoch": 2.713216957605985, + "grad_norm": 0.03697559982538223, + "learning_rate": 9.153117206982545e-06, + "loss": 0.0007, + "step": 21760 + }, + { + "epoch": 2.7144638403990022, + "grad_norm": 0.0008297286694869399, + "learning_rate": 9.148129675810474e-06, + "loss": 0.0001, + "step": 21770 + }, + { + "epoch": 2.71571072319202, + "grad_norm": 0.000999205862171948, + "learning_rate": 9.143142144638405e-06, + "loss": 0.0006, + "step": 21780 + }, + { + "epoch": 2.7169576059850375, + "grad_norm": 0.00030942095327191055, + "learning_rate": 9.138154613466335e-06, + "loss": 0.0001, + "step": 21790 + }, + { + "epoch": 2.718204488778055, + "grad_norm": 0.011203780770301819, + "learning_rate": 9.133167082294266e-06, + "loss": 0.0004, + "step": 21800 + }, + { + "epoch": 2.7194513715710724, + "grad_norm": 0.001825497834943235, + "learning_rate": 9.128179551122195e-06, + "loss": 0.0001, + "step": 21810 + }, + { + "epoch": 2.72069825436409, + "grad_norm": 0.0016962387599050999, + "learning_rate": 9.123192019950125e-06, + "loss": 0.019, + "step": 21820 + }, + { + "epoch": 2.7219451371571073, + "grad_norm": 27.44611358642578, + "learning_rate": 9.118204488778054e-06, + "loss": 0.0996, + "step": 21830 + }, + { + "epoch": 2.7231920199501247, + "grad_norm": 0.0013832871336489916, + "learning_rate": 9.113216957605987e-06, + "loss": 0.0408, + "step": 21840 + }, + { + "epoch": 2.724438902743142, + "grad_norm": 0.017434384673833847, + "learning_rate": 9.108229426433916e-06, + "loss": 0.0005, + "step": 21850 + }, + { + "epoch": 2.7256857855361596, + "grad_norm": 0.0008209617808461189, + "learning_rate": 
9.103241895261846e-06, + "loss": 0.0001, + "step": 21860 + }, + { + "epoch": 2.726932668329177, + "grad_norm": 0.057141683995723724, + "learning_rate": 9.098254364089775e-06, + "loss": 0.0026, + "step": 21870 + }, + { + "epoch": 2.7281795511221945, + "grad_norm": 0.0015385545557364821, + "learning_rate": 9.093266832917706e-06, + "loss": 0.0108, + "step": 21880 + }, + { + "epoch": 2.729426433915212, + "grad_norm": 0.0029364656656980515, + "learning_rate": 9.088279301745636e-06, + "loss": 0.0513, + "step": 21890 + }, + { + "epoch": 2.7306733167082293, + "grad_norm": 0.005240010563284159, + "learning_rate": 9.083291770573567e-06, + "loss": 0.0025, + "step": 21900 + }, + { + "epoch": 2.7319201995012468, + "grad_norm": 0.0024220088962465525, + "learning_rate": 9.078304239401498e-06, + "loss": 0.0074, + "step": 21910 + }, + { + "epoch": 2.733167082294264, + "grad_norm": 0.0010143570834770799, + "learning_rate": 9.073316708229427e-06, + "loss": 0.0833, + "step": 21920 + }, + { + "epoch": 2.734413965087282, + "grad_norm": 0.002402370562776923, + "learning_rate": 9.068329177057357e-06, + "loss": 0.0035, + "step": 21930 + }, + { + "epoch": 2.7356608478802995, + "grad_norm": 0.0024646620731800795, + "learning_rate": 9.063341645885288e-06, + "loss": 0.0393, + "step": 21940 + }, + { + "epoch": 2.736907730673317, + "grad_norm": 0.00634728604927659, + "learning_rate": 9.058354114713218e-06, + "loss": 0.0001, + "step": 21950 + }, + { + "epoch": 2.7381546134663344, + "grad_norm": 0.0004894645535387099, + "learning_rate": 9.053366583541147e-06, + "loss": 0.0424, + "step": 21960 + }, + { + "epoch": 2.739401496259352, + "grad_norm": 0.008502400480210781, + "learning_rate": 9.048379052369078e-06, + "loss": 0.0002, + "step": 21970 + }, + { + "epoch": 2.7406483790523692, + "grad_norm": 0.0029366957023739815, + "learning_rate": 9.043391521197009e-06, + "loss": 0.0819, + "step": 21980 + }, + { + "epoch": 2.7418952618453867, + "grad_norm": 0.003992959391325712, + "learning_rate": 
9.03840399002494e-06, + "loss": 0.0001, + "step": 21990 + }, + { + "epoch": 2.743142144638404, + "grad_norm": 0.004475648049265146, + "learning_rate": 9.033416458852868e-06, + "loss": 0.065, + "step": 22000 + }, + { + "epoch": 2.7443890274314215, + "grad_norm": 0.004949849098920822, + "learning_rate": 9.028428927680799e-06, + "loss": 0.0007, + "step": 22010 + }, + { + "epoch": 2.745635910224439, + "grad_norm": 0.0036813318729400635, + "learning_rate": 9.02344139650873e-06, + "loss": 0.103, + "step": 22020 + }, + { + "epoch": 2.7468827930174564, + "grad_norm": 0.006201781798154116, + "learning_rate": 9.01845386533666e-06, + "loss": 0.0162, + "step": 22030 + }, + { + "epoch": 2.748129675810474, + "grad_norm": 0.003080939408391714, + "learning_rate": 9.013466334164589e-06, + "loss": 0.0005, + "step": 22040 + }, + { + "epoch": 2.7493765586034913, + "grad_norm": 0.07051017880439758, + "learning_rate": 9.00847880299252e-06, + "loss": 0.0008, + "step": 22050 + }, + { + "epoch": 2.7506234413965087, + "grad_norm": 0.11431020498275757, + "learning_rate": 9.003491271820448e-06, + "loss": 0.0003, + "step": 22060 + }, + { + "epoch": 2.751870324189526, + "grad_norm": 57.69249725341797, + "learning_rate": 8.99850374064838e-06, + "loss": 0.0093, + "step": 22070 + }, + { + "epoch": 2.7531172069825436, + "grad_norm": 0.06492702662944794, + "learning_rate": 8.99351620947631e-06, + "loss": 0.0018, + "step": 22080 + }, + { + "epoch": 2.754364089775561, + "grad_norm": 0.004266063217073679, + "learning_rate": 8.98852867830424e-06, + "loss": 0.0132, + "step": 22090 + }, + { + "epoch": 2.7556109725685785, + "grad_norm": 0.01503852941095829, + "learning_rate": 8.98354114713217e-06, + "loss": 0.0246, + "step": 22100 + }, + { + "epoch": 2.756857855361596, + "grad_norm": 0.00257158768363297, + "learning_rate": 8.9785536159601e-06, + "loss": 0.0002, + "step": 22110 + }, + { + "epoch": 2.7581047381546133, + "grad_norm": 61.2979736328125, + "learning_rate": 8.97356608478803e-06, + "loss": 0.0168, 
+ "step": 22120 + }, + { + "epoch": 2.7593516209476308, + "grad_norm": 0.01783956028521061, + "learning_rate": 8.968578553615961e-06, + "loss": 0.0084, + "step": 22130 + }, + { + "epoch": 2.760598503740648, + "grad_norm": 0.004634057637304068, + "learning_rate": 8.96359102244389e-06, + "loss": 0.0007, + "step": 22140 + }, + { + "epoch": 2.7618453865336656, + "grad_norm": 0.004496326670050621, + "learning_rate": 8.95860349127182e-06, + "loss": 0.0002, + "step": 22150 + }, + { + "epoch": 2.763092269326683, + "grad_norm": 0.0019250494660809636, + "learning_rate": 8.953615960099751e-06, + "loss": 0.0097, + "step": 22160 + }, + { + "epoch": 2.7643391521197005, + "grad_norm": 0.0009380208211950958, + "learning_rate": 8.948628428927682e-06, + "loss": 0.0089, + "step": 22170 + }, + { + "epoch": 2.765586034912718, + "grad_norm": 0.07003989815711975, + "learning_rate": 8.943640897755613e-06, + "loss": 0.0016, + "step": 22180 + }, + { + "epoch": 2.766832917705736, + "grad_norm": 0.0017002633539959788, + "learning_rate": 8.938653366583541e-06, + "loss": 0.0001, + "step": 22190 + }, + { + "epoch": 2.7680798004987532, + "grad_norm": 0.0055083842016756535, + "learning_rate": 8.933665835411472e-06, + "loss": 0.0108, + "step": 22200 + }, + { + "epoch": 2.7693266832917707, + "grad_norm": 22.551620483398438, + "learning_rate": 8.928678304239403e-06, + "loss": 0.0031, + "step": 22210 + }, + { + "epoch": 2.770573566084788, + "grad_norm": 0.017204271629452705, + "learning_rate": 8.923690773067333e-06, + "loss": 0.0211, + "step": 22220 + }, + { + "epoch": 2.7718204488778055, + "grad_norm": 0.01693701557815075, + "learning_rate": 8.918703241895262e-06, + "loss": 0.0008, + "step": 22230 + }, + { + "epoch": 2.773067331670823, + "grad_norm": 0.0010547166457399726, + "learning_rate": 8.913715710723193e-06, + "loss": 0.0356, + "step": 22240 + }, + { + "epoch": 2.7743142144638404, + "grad_norm": 0.0005315632442943752, + "learning_rate": 8.908728179551123e-06, + "loss": 0.0001, + "step": 22250 + 
}, + { + "epoch": 2.775561097256858, + "grad_norm": 0.04161824285984039, + "learning_rate": 8.903740648379054e-06, + "loss": 0.0002, + "step": 22260 + }, + { + "epoch": 2.7768079800498753, + "grad_norm": 0.29313403367996216, + "learning_rate": 8.898753117206983e-06, + "loss": 0.0003, + "step": 22270 + }, + { + "epoch": 2.7780548628428927, + "grad_norm": 0.002154151676222682, + "learning_rate": 8.893765586034914e-06, + "loss": 0.0001, + "step": 22280 + }, + { + "epoch": 2.77930174563591, + "grad_norm": 0.001438229694031179, + "learning_rate": 8.888778054862843e-06, + "loss": 0.0002, + "step": 22290 + }, + { + "epoch": 2.7805486284289276, + "grad_norm": 0.0007435880834236741, + "learning_rate": 8.883790523690773e-06, + "loss": 0.0002, + "step": 22300 + }, + { + "epoch": 2.781795511221945, + "grad_norm": 0.0008660277235321701, + "learning_rate": 8.878802992518704e-06, + "loss": 0.0001, + "step": 22310 + }, + { + "epoch": 2.7830423940149625, + "grad_norm": 42.99732971191406, + "learning_rate": 8.873815461346634e-06, + "loss": 0.0339, + "step": 22320 + }, + { + "epoch": 2.78428927680798, + "grad_norm": 0.0006988770910538733, + "learning_rate": 8.868827930174563e-06, + "loss": 0.0, + "step": 22330 + }, + { + "epoch": 2.7855361596009978, + "grad_norm": 0.0033045527525246143, + "learning_rate": 8.863840399002494e-06, + "loss": 0.0001, + "step": 22340 + }, + { + "epoch": 2.786783042394015, + "grad_norm": 0.3833758533000946, + "learning_rate": 8.858852867830425e-06, + "loss": 0.0001, + "step": 22350 + }, + { + "epoch": 2.7880299251870326, + "grad_norm": 0.00027476897230371833, + "learning_rate": 8.853865336658355e-06, + "loss": 0.0488, + "step": 22360 + }, + { + "epoch": 2.78927680798005, + "grad_norm": 0.20554883778095245, + "learning_rate": 8.848877805486284e-06, + "loss": 0.0352, + "step": 22370 + }, + { + "epoch": 2.7905236907730675, + "grad_norm": 0.0017478206427767873, + "learning_rate": 8.843890274314215e-06, + "loss": 0.0, + "step": 22380 + }, + { + "epoch": 
2.791770573566085, + "grad_norm": 0.0004887666436843574, + "learning_rate": 8.838902743142145e-06, + "loss": 0.0001, + "step": 22390 + }, + { + "epoch": 2.7930174563591024, + "grad_norm": 0.0020243488252162933, + "learning_rate": 8.833915211970076e-06, + "loss": 0.0153, + "step": 22400 + }, + { + "epoch": 2.79426433915212, + "grad_norm": 0.013695642352104187, + "learning_rate": 8.828927680798007e-06, + "loss": 0.0293, + "step": 22410 + }, + { + "epoch": 2.7955112219451372, + "grad_norm": 0.0038832302670925856, + "learning_rate": 8.823940149625936e-06, + "loss": 0.0003, + "step": 22420 + }, + { + "epoch": 2.7967581047381547, + "grad_norm": 0.000641527003608644, + "learning_rate": 8.818952618453866e-06, + "loss": 0.0, + "step": 22430 + }, + { + "epoch": 2.798004987531172, + "grad_norm": 0.0015550283715128899, + "learning_rate": 8.813965087281797e-06, + "loss": 0.0002, + "step": 22440 + }, + { + "epoch": 2.7992518703241895, + "grad_norm": 0.005692547652870417, + "learning_rate": 8.808977556109727e-06, + "loss": 0.0004, + "step": 22450 + }, + { + "epoch": 2.800498753117207, + "grad_norm": 0.0040212650783360004, + "learning_rate": 8.803990024937656e-06, + "loss": 0.0436, + "step": 22460 + }, + { + "epoch": 2.8017456359102244, + "grad_norm": 29.452655792236328, + "learning_rate": 8.799002493765587e-06, + "loss": 0.0255, + "step": 22470 + }, + { + "epoch": 2.802992518703242, + "grad_norm": 0.04328829422593117, + "learning_rate": 8.794014962593518e-06, + "loss": 0.0001, + "step": 22480 + }, + { + "epoch": 2.8042394014962593, + "grad_norm": 0.001273810165002942, + "learning_rate": 8.789027431421448e-06, + "loss": 0.066, + "step": 22490 + }, + { + "epoch": 2.8054862842892767, + "grad_norm": 0.0005523357540369034, + "learning_rate": 8.784039900249377e-06, + "loss": 0.0, + "step": 22500 + }, + { + "epoch": 2.806733167082294, + "grad_norm": 0.004913564305752516, + "learning_rate": 8.779052369077308e-06, + "loss": 0.0017, + "step": 22510 + }, + { + "epoch": 2.8079800498753116, + 
"grad_norm": 0.0007415362633764744, + "learning_rate": 8.774064837905237e-06, + "loss": 0.0137, + "step": 22520 + }, + { + "epoch": 2.809226932668329, + "grad_norm": 0.0008837560890242457, + "learning_rate": 8.769077306733167e-06, + "loss": 0.0089, + "step": 22530 + }, + { + "epoch": 2.8104738154613464, + "grad_norm": 0.00038785228389315307, + "learning_rate": 8.764089775561098e-06, + "loss": 0.0, + "step": 22540 + }, + { + "epoch": 2.811720698254364, + "grad_norm": 0.0018591403495520353, + "learning_rate": 8.759102244389028e-06, + "loss": 0.0005, + "step": 22550 + }, + { + "epoch": 2.8129675810473813, + "grad_norm": 0.0027543804608285427, + "learning_rate": 8.754114713216957e-06, + "loss": 0.0017, + "step": 22560 + }, + { + "epoch": 2.8142144638403987, + "grad_norm": 0.000695584574714303, + "learning_rate": 8.749127182044888e-06, + "loss": 0.0001, + "step": 22570 + }, + { + "epoch": 2.815461346633416, + "grad_norm": 0.0011798815103247762, + "learning_rate": 8.744139650872819e-06, + "loss": 0.0117, + "step": 22580 + }, + { + "epoch": 2.816708229426434, + "grad_norm": 0.001240040990523994, + "learning_rate": 8.73915211970075e-06, + "loss": 0.0001, + "step": 22590 + }, + { + "epoch": 2.8179551122194515, + "grad_norm": 0.0005738515174016356, + "learning_rate": 8.734164588528678e-06, + "loss": 0.0325, + "step": 22600 + }, + { + "epoch": 2.819201995012469, + "grad_norm": 0.0006897413404658437, + "learning_rate": 8.729177057356609e-06, + "loss": 0.0274, + "step": 22610 + }, + { + "epoch": 2.8204488778054864, + "grad_norm": 0.0004890425479970872, + "learning_rate": 8.72418952618454e-06, + "loss": 0.0001, + "step": 22620 + }, + { + "epoch": 2.821695760598504, + "grad_norm": 0.0007049014675430954, + "learning_rate": 8.71920199501247e-06, + "loss": 0.0398, + "step": 22630 + }, + { + "epoch": 2.8229426433915212, + "grad_norm": 0.0005111999926157296, + "learning_rate": 8.7142144638404e-06, + "loss": 0.0023, + "step": 22640 + }, + { + "epoch": 2.8241895261845387, + "grad_norm": 
0.05086338147521019, + "learning_rate": 8.70922693266833e-06, + "loss": 0.0001, + "step": 22650 + }, + { + "epoch": 2.825436408977556, + "grad_norm": 0.00038924190448597074, + "learning_rate": 8.70423940149626e-06, + "loss": 0.0002, + "step": 22660 + }, + { + "epoch": 2.8266832917705735, + "grad_norm": 0.0014980288688093424, + "learning_rate": 8.69925187032419e-06, + "loss": 0.0, + "step": 22670 + }, + { + "epoch": 2.827930174563591, + "grad_norm": 0.0011230731615796685, + "learning_rate": 8.694264339152121e-06, + "loss": 0.0157, + "step": 22680 + }, + { + "epoch": 2.8291770573566084, + "grad_norm": 0.0012151073897257447, + "learning_rate": 8.68927680798005e-06, + "loss": 0.0001, + "step": 22690 + }, + { + "epoch": 2.830423940149626, + "grad_norm": 0.0005313754081726074, + "learning_rate": 8.684289276807981e-06, + "loss": 0.0001, + "step": 22700 + }, + { + "epoch": 2.8316708229426433, + "grad_norm": 0.0022479502949863672, + "learning_rate": 8.679301745635912e-06, + "loss": 0.0, + "step": 22710 + }, + { + "epoch": 2.8329177057356607, + "grad_norm": 0.0011304900981485844, + "learning_rate": 8.674314214463842e-06, + "loss": 0.0101, + "step": 22720 + }, + { + "epoch": 2.834164588528678, + "grad_norm": 11.917057991027832, + "learning_rate": 8.669326683291771e-06, + "loss": 0.0283, + "step": 22730 + }, + { + "epoch": 2.835411471321696, + "grad_norm": 17.62745475769043, + "learning_rate": 8.664339152119702e-06, + "loss": 0.0024, + "step": 22740 + }, + { + "epoch": 2.8366583541147135, + "grad_norm": 48.56819152832031, + "learning_rate": 8.65935162094763e-06, + "loss": 0.0499, + "step": 22750 + }, + { + "epoch": 2.837905236907731, + "grad_norm": 0.04258447512984276, + "learning_rate": 8.654364089775561e-06, + "loss": 0.0001, + "step": 22760 + }, + { + "epoch": 2.8391521197007483, + "grad_norm": 0.0031998485792428255, + "learning_rate": 8.649376558603492e-06, + "loss": 0.0159, + "step": 22770 + }, + { + "epoch": 2.8403990024937658, + "grad_norm": 0.00028625703998841345, + 
"learning_rate": 8.644389027431423e-06, + "loss": 0.0, + "step": 22780 + }, + { + "epoch": 2.841645885286783, + "grad_norm": 0.06957925111055374, + "learning_rate": 8.639401496259351e-06, + "loss": 0.0473, + "step": 22790 + }, + { + "epoch": 2.8428927680798006, + "grad_norm": 0.019344905391335487, + "learning_rate": 8.634413965087282e-06, + "loss": 0.0443, + "step": 22800 + }, + { + "epoch": 2.844139650872818, + "grad_norm": 4.3518805503845215, + "learning_rate": 8.629426433915213e-06, + "loss": 0.0028, + "step": 22810 + }, + { + "epoch": 2.8453865336658355, + "grad_norm": 0.001046427059918642, + "learning_rate": 8.624937655860351e-06, + "loss": 0.028, + "step": 22820 + }, + { + "epoch": 2.846633416458853, + "grad_norm": 0.0013077203184366226, + "learning_rate": 8.61995012468828e-06, + "loss": 0.0308, + "step": 22830 + }, + { + "epoch": 2.8478802992518704, + "grad_norm": 0.003886470338329673, + "learning_rate": 8.61496259351621e-06, + "loss": 0.0003, + "step": 22840 + }, + { + "epoch": 2.849127182044888, + "grad_norm": 0.00042426149593666196, + "learning_rate": 8.60997506234414e-06, + "loss": 0.0001, + "step": 22850 + }, + { + "epoch": 2.8503740648379052, + "grad_norm": 0.03965604305267334, + "learning_rate": 8.604987531172072e-06, + "loss": 0.039, + "step": 22860 + }, + { + "epoch": 2.8516209476309227, + "grad_norm": 0.00838763639330864, + "learning_rate": 8.6e-06, + "loss": 0.0503, + "step": 22870 + }, + { + "epoch": 2.85286783042394, + "grad_norm": 0.026502016931772232, + "learning_rate": 8.595012468827931e-06, + "loss": 0.0001, + "step": 22880 + }, + { + "epoch": 2.8541147132169575, + "grad_norm": 0.0036835346836596727, + "learning_rate": 8.59002493765586e-06, + "loss": 0.0, + "step": 22890 + }, + { + "epoch": 2.855361596009975, + "grad_norm": 0.06453216820955276, + "learning_rate": 8.58503740648379e-06, + "loss": 0.0002, + "step": 22900 + }, + { + "epoch": 2.8566084788029924, + "grad_norm": 0.032697517424821854, + "learning_rate": 8.580049875311721e-06, + 
"loss": 0.0003, + "step": 22910 + }, + { + "epoch": 2.85785536159601, + "grad_norm": 0.0004190378822386265, + "learning_rate": 8.575062344139652e-06, + "loss": 0.0018, + "step": 22920 + }, + { + "epoch": 2.8591022443890273, + "grad_norm": 0.030965005978941917, + "learning_rate": 8.570074812967581e-06, + "loss": 0.0004, + "step": 22930 + }, + { + "epoch": 2.8603491271820447, + "grad_norm": 0.00719516770914197, + "learning_rate": 8.565087281795512e-06, + "loss": 0.0032, + "step": 22940 + }, + { + "epoch": 2.861596009975062, + "grad_norm": 0.0006492987158708274, + "learning_rate": 8.560099750623442e-06, + "loss": 0.0452, + "step": 22950 + }, + { + "epoch": 2.8628428927680796, + "grad_norm": 23.008272171020508, + "learning_rate": 8.555112219451373e-06, + "loss": 0.003, + "step": 22960 + }, + { + "epoch": 2.864089775561097, + "grad_norm": 0.0006293976912274957, + "learning_rate": 8.550124688279302e-06, + "loss": 0.0003, + "step": 22970 + }, + { + "epoch": 2.8653366583541144, + "grad_norm": 0.0036729283165186644, + "learning_rate": 8.545137157107232e-06, + "loss": 0.0, + "step": 22980 + }, + { + "epoch": 2.8665835411471323, + "grad_norm": 0.01885777898132801, + "learning_rate": 8.540149625935163e-06, + "loss": 0.0327, + "step": 22990 + }, + { + "epoch": 2.8678304239401498, + "grad_norm": 0.0021062095183879137, + "learning_rate": 8.535162094763094e-06, + "loss": 0.0315, + "step": 23000 + }, + { + "epoch": 2.869077306733167, + "grad_norm": 0.0004612091288436204, + "learning_rate": 8.530174563591023e-06, + "loss": 0.0001, + "step": 23010 + }, + { + "epoch": 2.8703241895261846, + "grad_norm": 0.00048473486094735563, + "learning_rate": 8.525187032418953e-06, + "loss": 0.0, + "step": 23020 + }, + { + "epoch": 2.871571072319202, + "grad_norm": 0.39935484528541565, + "learning_rate": 8.520199501246884e-06, + "loss": 0.0002, + "step": 23030 + }, + { + "epoch": 2.8728179551122195, + "grad_norm": 0.002812149003148079, + "learning_rate": 8.515211970074814e-06, + "loss": 0.0045, + 
"step": 23040 + }, + { + "epoch": 2.874064837905237, + "grad_norm": 0.00135338946711272, + "learning_rate": 8.510224438902743e-06, + "loss": 0.0, + "step": 23050 + }, + { + "epoch": 2.8753117206982544, + "grad_norm": 0.002936942270025611, + "learning_rate": 8.505236907730674e-06, + "loss": 0.001, + "step": 23060 + }, + { + "epoch": 2.876558603491272, + "grad_norm": 0.0027010170742869377, + "learning_rate": 8.500249376558605e-06, + "loss": 0.0078, + "step": 23070 + }, + { + "epoch": 2.8778054862842892, + "grad_norm": 0.001148201641626656, + "learning_rate": 8.495261845386534e-06, + "loss": 0.0001, + "step": 23080 + }, + { + "epoch": 2.8790523690773067, + "grad_norm": 0.1928788721561432, + "learning_rate": 8.490274314214466e-06, + "loss": 0.0481, + "step": 23090 + }, + { + "epoch": 2.880299251870324, + "grad_norm": 0.0031981042120605707, + "learning_rate": 8.485286783042395e-06, + "loss": 0.0001, + "step": 23100 + }, + { + "epoch": 2.8815461346633415, + "grad_norm": 0.0007050613639876246, + "learning_rate": 8.480299251870325e-06, + "loss": 0.0, + "step": 23110 + }, + { + "epoch": 2.882793017456359, + "grad_norm": 0.0008105888264253736, + "learning_rate": 8.475311720698254e-06, + "loss": 0.0974, + "step": 23120 + }, + { + "epoch": 2.8840399002493764, + "grad_norm": 4.346730709075928, + "learning_rate": 8.470324189526185e-06, + "loss": 0.0094, + "step": 23130 + }, + { + "epoch": 2.8852867830423943, + "grad_norm": 10.774336814880371, + "learning_rate": 8.465336658354116e-06, + "loss": 0.0009, + "step": 23140 + }, + { + "epoch": 2.8865336658354117, + "grad_norm": 0.001677769236266613, + "learning_rate": 8.460349127182046e-06, + "loss": 0.0, + "step": 23150 + }, + { + "epoch": 2.887780548628429, + "grad_norm": 0.0012615503510460258, + "learning_rate": 8.455361596009975e-06, + "loss": 0.0, + "step": 23160 + }, + { + "epoch": 2.8890274314214466, + "grad_norm": 0.0020845013204962015, + "learning_rate": 8.450374064837906e-06, + "loss": 0.0137, + "step": 23170 + }, + { + 
"epoch": 2.890274314214464, + "grad_norm": 0.015877505764365196, + "learning_rate": 8.445386533665836e-06, + "loss": 0.0001, + "step": 23180 + }, + { + "epoch": 2.8915211970074814, + "grad_norm": 2.8694908618927, + "learning_rate": 8.440399002493767e-06, + "loss": 0.0003, + "step": 23190 + }, + { + "epoch": 2.892768079800499, + "grad_norm": 0.00029893871396780014, + "learning_rate": 8.435411471321696e-06, + "loss": 0.0001, + "step": 23200 + }, + { + "epoch": 2.8940149625935163, + "grad_norm": 0.0019202682888135314, + "learning_rate": 8.430423940149626e-06, + "loss": 0.0197, + "step": 23210 + }, + { + "epoch": 2.8952618453865338, + "grad_norm": 78.83489227294922, + "learning_rate": 8.425436408977557e-06, + "loss": 0.0308, + "step": 23220 + }, + { + "epoch": 2.896508728179551, + "grad_norm": 0.0004875132581219077, + "learning_rate": 8.420448877805488e-06, + "loss": 0.0, + "step": 23230 + }, + { + "epoch": 2.8977556109725686, + "grad_norm": 0.0007964399410411716, + "learning_rate": 8.415461346633417e-06, + "loss": 0.0, + "step": 23240 + }, + { + "epoch": 2.899002493765586, + "grad_norm": 0.007691626902669668, + "learning_rate": 8.410473815461347e-06, + "loss": 0.0275, + "step": 23250 + }, + { + "epoch": 2.9002493765586035, + "grad_norm": 0.012193195521831512, + "learning_rate": 8.405486284289278e-06, + "loss": 0.0571, + "step": 23260 + }, + { + "epoch": 2.901496259351621, + "grad_norm": 0.0132495928555727, + "learning_rate": 8.400498753117209e-06, + "loss": 0.0, + "step": 23270 + }, + { + "epoch": 2.9027431421446384, + "grad_norm": 0.0009290704620070755, + "learning_rate": 8.395511221945137e-06, + "loss": 0.0, + "step": 23280 + }, + { + "epoch": 2.903990024937656, + "grad_norm": 22.11893081665039, + "learning_rate": 8.390523690773068e-06, + "loss": 0.022, + "step": 23290 + }, + { + "epoch": 2.9052369077306732, + "grad_norm": 0.003784743370488286, + "learning_rate": 8.385536159600997e-06, + "loss": 0.0, + "step": 23300 + }, + { + "epoch": 2.9064837905236907, + 
"grad_norm": 0.0003365647862665355, + "learning_rate": 8.380548628428928e-06, + "loss": 0.0001, + "step": 23310 + }, + { + "epoch": 2.907730673316708, + "grad_norm": 0.018379248678684235, + "learning_rate": 8.37556109725686e-06, + "loss": 0.0001, + "step": 23320 + }, + { + "epoch": 2.9089775561097255, + "grad_norm": 0.00030551591771654785, + "learning_rate": 8.370573566084789e-06, + "loss": 0.0439, + "step": 23330 + }, + { + "epoch": 2.910224438902743, + "grad_norm": 0.11214791983366013, + "learning_rate": 8.36558603491272e-06, + "loss": 0.0001, + "step": 23340 + }, + { + "epoch": 2.9114713216957604, + "grad_norm": 0.0002639228187035769, + "learning_rate": 8.360598503740648e-06, + "loss": 0.0199, + "step": 23350 + }, + { + "epoch": 2.912718204488778, + "grad_norm": 0.003870609914883971, + "learning_rate": 8.355610972568579e-06, + "loss": 0.0242, + "step": 23360 + }, + { + "epoch": 2.9139650872817953, + "grad_norm": 0.3105866611003876, + "learning_rate": 8.35062344139651e-06, + "loss": 0.0004, + "step": 23370 + }, + { + "epoch": 2.9152119700748127, + "grad_norm": 0.0002682608610484749, + "learning_rate": 8.34563591022444e-06, + "loss": 0.0048, + "step": 23380 + }, + { + "epoch": 2.9164588528678306, + "grad_norm": 0.00046468869550153613, + "learning_rate": 8.34064837905237e-06, + "loss": 0.0001, + "step": 23390 + }, + { + "epoch": 2.917705735660848, + "grad_norm": 0.0008718192693777382, + "learning_rate": 8.3356608478803e-06, + "loss": 0.0862, + "step": 23400 + }, + { + "epoch": 2.9189526184538654, + "grad_norm": 0.003581983968615532, + "learning_rate": 8.33067331670823e-06, + "loss": 0.0001, + "step": 23410 + }, + { + "epoch": 2.920199501246883, + "grad_norm": 0.0020457198843359947, + "learning_rate": 8.325685785536161e-06, + "loss": 0.0001, + "step": 23420 + }, + { + "epoch": 2.9214463840399003, + "grad_norm": 14.641356468200684, + "learning_rate": 8.32069825436409e-06, + "loss": 0.0014, + "step": 23430 + }, + { + "epoch": 2.9226932668329177, + "grad_norm": 
0.19841350615024567, + "learning_rate": 8.31571072319202e-06, + "loss": 0.0001, + "step": 23440 + }, + { + "epoch": 2.923940149625935, + "grad_norm": 22.013063430786133, + "learning_rate": 8.310723192019951e-06, + "loss": 0.0313, + "step": 23450 + }, + { + "epoch": 2.9251870324189526, + "grad_norm": 0.0051285261288285255, + "learning_rate": 8.305735660847882e-06, + "loss": 0.0002, + "step": 23460 + }, + { + "epoch": 2.92643391521197, + "grad_norm": 0.0011192033998668194, + "learning_rate": 8.30074812967581e-06, + "loss": 0.0528, + "step": 23470 + }, + { + "epoch": 2.9276807980049875, + "grad_norm": 0.017419705167412758, + "learning_rate": 8.295760598503741e-06, + "loss": 0.0013, + "step": 23480 + }, + { + "epoch": 2.928927680798005, + "grad_norm": 0.006357110105454922, + "learning_rate": 8.29077306733167e-06, + "loss": 0.0414, + "step": 23490 + }, + { + "epoch": 2.9301745635910224, + "grad_norm": 0.007139955647289753, + "learning_rate": 8.285785536159603e-06, + "loss": 0.111, + "step": 23500 + }, + { + "epoch": 2.93142144638404, + "grad_norm": 0.014205764047801495, + "learning_rate": 8.280798004987532e-06, + "loss": 0.0093, + "step": 23510 + }, + { + "epoch": 2.932668329177057, + "grad_norm": 0.009623657912015915, + "learning_rate": 8.275810473815462e-06, + "loss": 0.0053, + "step": 23520 + }, + { + "epoch": 2.9339152119700747, + "grad_norm": 0.0029291717801243067, + "learning_rate": 8.270822942643391e-06, + "loss": 0.0002, + "step": 23530 + }, + { + "epoch": 2.9351620947630925, + "grad_norm": 0.0059037720784544945, + "learning_rate": 8.265835411471322e-06, + "loss": 0.0044, + "step": 23540 + }, + { + "epoch": 2.93640897755611, + "grad_norm": 24.23624038696289, + "learning_rate": 8.260847880299252e-06, + "loss": 0.0675, + "step": 23550 + }, + { + "epoch": 2.9376558603491274, + "grad_norm": 0.009634561836719513, + "learning_rate": 8.255860349127183e-06, + "loss": 0.0352, + "step": 23560 + }, + { + "epoch": 2.938902743142145, + "grad_norm": 0.001296225585974753, + 
"learning_rate": 8.250872817955114e-06, + "loss": 0.0014, + "step": 23570 + }, + { + "epoch": 2.9401496259351623, + "grad_norm": 0.02580132707953453, + "learning_rate": 8.245885286783042e-06, + "loss": 0.0, + "step": 23580 + }, + { + "epoch": 2.9413965087281797, + "grad_norm": 0.003978101536631584, + "learning_rate": 8.240897755610973e-06, + "loss": 0.0271, + "step": 23590 + }, + { + "epoch": 2.942643391521197, + "grad_norm": 0.009165945462882519, + "learning_rate": 8.235910224438904e-06, + "loss": 0.0424, + "step": 23600 + }, + { + "epoch": 2.9438902743142146, + "grad_norm": 1.0541449785232544, + "learning_rate": 8.230922693266834e-06, + "loss": 0.0003, + "step": 23610 + }, + { + "epoch": 2.945137157107232, + "grad_norm": 0.00030957505805417895, + "learning_rate": 8.225935162094763e-06, + "loss": 0.0397, + "step": 23620 + }, + { + "epoch": 2.9463840399002494, + "grad_norm": 0.0014495253562927246, + "learning_rate": 8.220947630922694e-06, + "loss": 0.0941, + "step": 23630 + }, + { + "epoch": 2.947630922693267, + "grad_norm": 0.002056701574474573, + "learning_rate": 8.215960099750624e-06, + "loss": 0.0001, + "step": 23640 + }, + { + "epoch": 2.9488778054862843, + "grad_norm": 0.002144606551155448, + "learning_rate": 8.210972568578555e-06, + "loss": 0.0005, + "step": 23650 + }, + { + "epoch": 2.9501246882793017, + "grad_norm": 0.0010960723739117384, + "learning_rate": 8.205985037406484e-06, + "loss": 0.0039, + "step": 23660 + }, + { + "epoch": 2.951371571072319, + "grad_norm": 0.0012292738538235426, + "learning_rate": 8.200997506234415e-06, + "loss": 0.0001, + "step": 23670 + }, + { + "epoch": 2.9526184538653366, + "grad_norm": 0.0013217430096119642, + "learning_rate": 8.196009975062345e-06, + "loss": 0.0064, + "step": 23680 + }, + { + "epoch": 2.953865336658354, + "grad_norm": 0.0012824861332774162, + "learning_rate": 8.191022443890276e-06, + "loss": 0.0004, + "step": 23690 + }, + { + "epoch": 2.9551122194513715, + "grad_norm": 0.0026409977581351995, + 
"learning_rate": 8.186034912718205e-06, + "loss": 0.0032, + "step": 23700 + }, + { + "epoch": 2.956359102244389, + "grad_norm": 0.0022691749036312103, + "learning_rate": 8.181047381546135e-06, + "loss": 0.0408, + "step": 23710 + }, + { + "epoch": 2.9576059850374063, + "grad_norm": 0.0014671648386865854, + "learning_rate": 8.176059850374064e-06, + "loss": 0.0003, + "step": 23720 + }, + { + "epoch": 2.958852867830424, + "grad_norm": 0.0031290799379348755, + "learning_rate": 8.171072319201997e-06, + "loss": 0.0203, + "step": 23730 + }, + { + "epoch": 2.960099750623441, + "grad_norm": 0.0008347875555045903, + "learning_rate": 8.166084788029926e-06, + "loss": 0.0919, + "step": 23740 + }, + { + "epoch": 2.9613466334164587, + "grad_norm": 0.021511143073439598, + "learning_rate": 8.161097256857856e-06, + "loss": 0.0019, + "step": 23750 + }, + { + "epoch": 2.962593516209476, + "grad_norm": 0.0033409115858376026, + "learning_rate": 8.156109725685785e-06, + "loss": 0.0001, + "step": 23760 + }, + { + "epoch": 2.9638403990024935, + "grad_norm": 0.8422829508781433, + "learning_rate": 8.151122194513716e-06, + "loss": 0.0598, + "step": 23770 + }, + { + "epoch": 2.965087281795511, + "grad_norm": 0.020311573520302773, + "learning_rate": 8.146134663341646e-06, + "loss": 0.0001, + "step": 23780 + }, + { + "epoch": 2.966334164588529, + "grad_norm": 0.00174625008367002, + "learning_rate": 8.141147132169577e-06, + "loss": 0.0236, + "step": 23790 + }, + { + "epoch": 2.9675810473815463, + "grad_norm": 0.0019468831596896052, + "learning_rate": 8.136159600997506e-06, + "loss": 0.0306, + "step": 23800 + }, + { + "epoch": 2.9688279301745637, + "grad_norm": 0.014754627831280231, + "learning_rate": 8.131172069825437e-06, + "loss": 0.0641, + "step": 23810 + }, + { + "epoch": 2.970074812967581, + "grad_norm": 0.08833127468824387, + "learning_rate": 8.126184538653367e-06, + "loss": 0.0026, + "step": 23820 + }, + { + "epoch": 2.9713216957605986, + "grad_norm": 0.09355904906988144, + "learning_rate": 
8.121197007481298e-06, + "loss": 0.0004, + "step": 23830 + }, + { + "epoch": 2.972568578553616, + "grad_norm": 0.15409357845783234, + "learning_rate": 8.116209476309228e-06, + "loss": 0.0003, + "step": 23840 + }, + { + "epoch": 2.9738154613466334, + "grad_norm": 0.0047310395166277885, + "learning_rate": 8.111221945137157e-06, + "loss": 0.0001, + "step": 23850 + }, + { + "epoch": 2.975062344139651, + "grad_norm": 0.00219229725189507, + "learning_rate": 8.106234413965088e-06, + "loss": 0.0001, + "step": 23860 + }, + { + "epoch": 2.9763092269326683, + "grad_norm": 0.001115076127462089, + "learning_rate": 8.101246882793019e-06, + "loss": 0.0001, + "step": 23870 + }, + { + "epoch": 2.9775561097256857, + "grad_norm": 0.003730418160557747, + "learning_rate": 8.09625935162095e-06, + "loss": 0.0584, + "step": 23880 + }, + { + "epoch": 2.978802992518703, + "grad_norm": 0.0017962405690923333, + "learning_rate": 8.091271820448878e-06, + "loss": 0.0002, + "step": 23890 + }, + { + "epoch": 2.9800498753117206, + "grad_norm": 58.13080596923828, + "learning_rate": 8.086284289276809e-06, + "loss": 0.032, + "step": 23900 + }, + { + "epoch": 2.981296758104738, + "grad_norm": 0.04415411129593849, + "learning_rate": 8.08129675810474e-06, + "loss": 0.0055, + "step": 23910 + }, + { + "epoch": 2.9825436408977555, + "grad_norm": 0.12564721703529358, + "learning_rate": 8.07630922693267e-06, + "loss": 0.0619, + "step": 23920 + }, + { + "epoch": 2.983790523690773, + "grad_norm": 0.0032362595666199923, + "learning_rate": 8.071321695760599e-06, + "loss": 0.0002, + "step": 23930 + }, + { + "epoch": 2.985037406483791, + "grad_norm": 29.849327087402344, + "learning_rate": 8.06633416458853e-06, + "loss": 0.1159, + "step": 23940 + }, + { + "epoch": 2.9862842892768082, + "grad_norm": 0.010884604416787624, + "learning_rate": 8.061346633416458e-06, + "loss": 0.0005, + "step": 23950 + }, + { + "epoch": 2.9875311720698257, + "grad_norm": 0.0014786362880840898, + "learning_rate": 8.056359102244389e-06, + 
"loss": 0.0053, + "step": 23960 + }, + { + "epoch": 2.988778054862843, + "grad_norm": 0.002504403702914715, + "learning_rate": 8.05137157107232e-06, + "loss": 0.0001, + "step": 23970 + }, + { + "epoch": 2.9900249376558605, + "grad_norm": 0.0029743423219770193, + "learning_rate": 8.04638403990025e-06, + "loss": 0.0307, + "step": 23980 + }, + { + "epoch": 2.991271820448878, + "grad_norm": 0.002073820913210511, + "learning_rate": 8.04139650872818e-06, + "loss": 0.0285, + "step": 23990 + }, + { + "epoch": 2.9925187032418954, + "grad_norm": 0.0032272618263959885, + "learning_rate": 8.03640897755611e-06, + "loss": 0.0024, + "step": 24000 + }, + { + "epoch": 2.993765586034913, + "grad_norm": 0.003961468581110239, + "learning_rate": 8.03142144638404e-06, + "loss": 0.0135, + "step": 24010 + }, + { + "epoch": 2.9950124688279303, + "grad_norm": 4.218677520751953, + "learning_rate": 8.026433915211971e-06, + "loss": 0.002, + "step": 24020 + }, + { + "epoch": 2.9962593516209477, + "grad_norm": 1.7304545640945435, + "learning_rate": 8.0214463840399e-06, + "loss": 0.0088, + "step": 24030 + }, + { + "epoch": 2.997506234413965, + "grad_norm": 0.0033019916154444218, + "learning_rate": 8.01645885286783e-06, + "loss": 0.0006, + "step": 24040 + }, + { + "epoch": 2.9987531172069826, + "grad_norm": 17.089313507080078, + "learning_rate": 8.011471321695761e-06, + "loss": 0.0184, + "step": 24050 + }, + { + "epoch": 3.0, + "grad_norm": 0.001758578815497458, + "learning_rate": 8.006483790523692e-06, + "loss": 0.0008, + "step": 24060 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.9931417170646549, + "eval_loss": 0.03674669936299324, + "eval_runtime": 17.4363, + "eval_samples_per_second": 919.864, + "eval_steps_per_second": 57.524, + "step": 24060 + }, + { + "epoch": 3.0012468827930174, + "grad_norm": 0.004041421227157116, + "learning_rate": 8.001496259351622e-06, + "loss": 0.0425, + "step": 24070 + }, + { + "epoch": 3.002493765586035, + "grad_norm": 0.007590838707983494, + "learning_rate": 
7.996508728179551e-06, + "loss": 0.0422, + "step": 24080 + }, + { + "epoch": 3.0037406483790523, + "grad_norm": 0.0012900944566354156, + "learning_rate": 7.991521197007482e-06, + "loss": 0.0002, + "step": 24090 + }, + { + "epoch": 3.0049875311720697, + "grad_norm": 0.02778124250471592, + "learning_rate": 7.986533665835413e-06, + "loss": 0.0001, + "step": 24100 + }, + { + "epoch": 3.006234413965087, + "grad_norm": 0.001751819159835577, + "learning_rate": 7.981546134663343e-06, + "loss": 0.0009, + "step": 24110 + }, + { + "epoch": 3.0074812967581046, + "grad_norm": 0.005507446825504303, + "learning_rate": 7.976558603491272e-06, + "loss": 0.0065, + "step": 24120 + }, + { + "epoch": 3.008728179551122, + "grad_norm": 0.02041839249432087, + "learning_rate": 7.971571072319203e-06, + "loss": 0.0001, + "step": 24130 + }, + { + "epoch": 3.0099750623441395, + "grad_norm": 0.009164830669760704, + "learning_rate": 7.966583541147133e-06, + "loss": 0.0001, + "step": 24140 + }, + { + "epoch": 3.011221945137157, + "grad_norm": 0.002013957826420665, + "learning_rate": 7.961596009975064e-06, + "loss": 0.0001, + "step": 24150 + }, + { + "epoch": 3.0124688279301743, + "grad_norm": 0.0525008849799633, + "learning_rate": 7.956608478802993e-06, + "loss": 0.0001, + "step": 24160 + }, + { + "epoch": 3.013715710723192, + "grad_norm": 0.0016975915059447289, + "learning_rate": 7.951620947630924e-06, + "loss": 0.0001, + "step": 24170 + }, + { + "epoch": 3.0149625935162097, + "grad_norm": 0.0022598865907639265, + "learning_rate": 7.946633416458853e-06, + "loss": 0.0002, + "step": 24180 + }, + { + "epoch": 3.016209476309227, + "grad_norm": 0.0035758563317358494, + "learning_rate": 7.941645885286783e-06, + "loss": 0.0001, + "step": 24190 + }, + { + "epoch": 3.0174563591022445, + "grad_norm": 0.007171344943344593, + "learning_rate": 7.936658354114714e-06, + "loss": 0.0001, + "step": 24200 + }, + { + "epoch": 3.018703241895262, + "grad_norm": 0.18562623858451843, + "learning_rate": 
7.931670822942644e-06, + "loss": 0.0002, + "step": 24210 + }, + { + "epoch": 3.0199501246882794, + "grad_norm": 0.001060682232491672, + "learning_rate": 7.926683291770573e-06, + "loss": 0.0012, + "step": 24220 + }, + { + "epoch": 3.021197007481297, + "grad_norm": 20.250171661376953, + "learning_rate": 7.921695760598504e-06, + "loss": 0.0029, + "step": 24230 + }, + { + "epoch": 3.0224438902743143, + "grad_norm": 0.0019098323537036777, + "learning_rate": 7.916708229426435e-06, + "loss": 0.0405, + "step": 24240 + }, + { + "epoch": 3.0236907730673317, + "grad_norm": 0.019634464755654335, + "learning_rate": 7.911720698254365e-06, + "loss": 0.0001, + "step": 24250 + }, + { + "epoch": 3.024937655860349, + "grad_norm": 56.8670768737793, + "learning_rate": 7.906733167082294e-06, + "loss": 0.023, + "step": 24260 + }, + { + "epoch": 3.0261845386533666, + "grad_norm": 42.67612838745117, + "learning_rate": 7.901745635910225e-06, + "loss": 0.0519, + "step": 24270 + }, + { + "epoch": 3.027431421446384, + "grad_norm": 0.000763443938922137, + "learning_rate": 7.896758104738155e-06, + "loss": 0.0, + "step": 24280 + }, + { + "epoch": 3.0286783042394014, + "grad_norm": 0.000828297808766365, + "learning_rate": 7.891770573566086e-06, + "loss": 0.0, + "step": 24290 + }, + { + "epoch": 3.029925187032419, + "grad_norm": 0.0021556930150836706, + "learning_rate": 7.886783042394017e-06, + "loss": 0.0001, + "step": 24300 + }, + { + "epoch": 3.0311720698254363, + "grad_norm": 0.003809295129030943, + "learning_rate": 7.881795511221945e-06, + "loss": 0.0, + "step": 24310 + }, + { + "epoch": 3.0324189526184537, + "grad_norm": 0.011999325826764107, + "learning_rate": 7.876807980049876e-06, + "loss": 0.0001, + "step": 24320 + }, + { + "epoch": 3.033665835411471, + "grad_norm": 0.001197949517518282, + "learning_rate": 7.871820448877807e-06, + "loss": 0.0, + "step": 24330 + }, + { + "epoch": 3.0349127182044886, + "grad_norm": 0.01645563915371895, + "learning_rate": 7.866832917705737e-06, + "loss": 
0.0001, + "step": 24340 + }, + { + "epoch": 3.036159600997506, + "grad_norm": 0.0022310710046440363, + "learning_rate": 7.861845386533666e-06, + "loss": 0.001, + "step": 24350 + }, + { + "epoch": 3.037406483790524, + "grad_norm": 0.06693558394908905, + "learning_rate": 7.856857855361597e-06, + "loss": 0.0706, + "step": 24360 + }, + { + "epoch": 3.0386533665835413, + "grad_norm": 0.000548367970623076, + "learning_rate": 7.851870324189526e-06, + "loss": 0.0001, + "step": 24370 + }, + { + "epoch": 3.039900249376559, + "grad_norm": 0.0035804430954158306, + "learning_rate": 7.846882793017458e-06, + "loss": 0.0013, + "step": 24380 + }, + { + "epoch": 3.041147132169576, + "grad_norm": 0.000565096503123641, + "learning_rate": 7.841895261845387e-06, + "loss": 0.0, + "step": 24390 + }, + { + "epoch": 3.0423940149625937, + "grad_norm": 0.000567717244848609, + "learning_rate": 7.836907730673318e-06, + "loss": 0.0001, + "step": 24400 + }, + { + "epoch": 3.043640897755611, + "grad_norm": 0.10077092796564102, + "learning_rate": 7.831920199501247e-06, + "loss": 0.0004, + "step": 24410 + }, + { + "epoch": 3.0448877805486285, + "grad_norm": 0.005670513026416302, + "learning_rate": 7.826932668329177e-06, + "loss": 0.0175, + "step": 24420 + }, + { + "epoch": 3.046134663341646, + "grad_norm": 0.01621229201555252, + "learning_rate": 7.821945137157108e-06, + "loss": 0.0002, + "step": 24430 + }, + { + "epoch": 3.0473815461346634, + "grad_norm": 0.05025783181190491, + "learning_rate": 7.816957605985038e-06, + "loss": 0.0711, + "step": 24440 + }, + { + "epoch": 3.048628428927681, + "grad_norm": 18.436004638671875, + "learning_rate": 7.811970074812967e-06, + "loss": 0.0026, + "step": 24450 + }, + { + "epoch": 3.0498753117206983, + "grad_norm": 2.656740665435791, + "learning_rate": 7.806982543640898e-06, + "loss": 0.0015, + "step": 24460 + }, + { + "epoch": 3.0511221945137157, + "grad_norm": 0.004006446339190006, + "learning_rate": 7.801995012468829e-06, + "loss": 0.0001, + "step": 24470 + }, 
+ { + "epoch": 3.052369077306733, + "grad_norm": 0.0018630550475791097, + "learning_rate": 7.79700748129676e-06, + "loss": 0.0002, + "step": 24480 + }, + { + "epoch": 3.0536159600997506, + "grad_norm": 0.0004907494876533747, + "learning_rate": 7.792019950124688e-06, + "loss": 0.0211, + "step": 24490 + }, + { + "epoch": 3.054862842892768, + "grad_norm": 0.0018739727092906833, + "learning_rate": 7.787032418952619e-06, + "loss": 0.0116, + "step": 24500 + }, + { + "epoch": 3.0561097256857854, + "grad_norm": 0.0038170109037309885, + "learning_rate": 7.78204488778055e-06, + "loss": 0.0002, + "step": 24510 + }, + { + "epoch": 3.057356608478803, + "grad_norm": 0.0017289503011852503, + "learning_rate": 7.77705735660848e-06, + "loss": 0.0001, + "step": 24520 + }, + { + "epoch": 3.0586034912718203, + "grad_norm": 0.0016951598227024078, + "learning_rate": 7.772069825436409e-06, + "loss": 0.0003, + "step": 24530 + }, + { + "epoch": 3.0598503740648377, + "grad_norm": 0.003196330275386572, + "learning_rate": 7.76708229426434e-06, + "loss": 0.0001, + "step": 24540 + }, + { + "epoch": 3.061097256857855, + "grad_norm": 86.77098846435547, + "learning_rate": 7.76209476309227e-06, + "loss": 0.0368, + "step": 24550 + }, + { + "epoch": 3.0623441396508726, + "grad_norm": 0.0013502277433872223, + "learning_rate": 7.7571072319202e-06, + "loss": 0.0, + "step": 24560 + }, + { + "epoch": 3.0635910224438905, + "grad_norm": 0.0012974691344425082, + "learning_rate": 7.752119700748131e-06, + "loss": 0.0001, + "step": 24570 + }, + { + "epoch": 3.064837905236908, + "grad_norm": 0.4907504916191101, + "learning_rate": 7.74713216957606e-06, + "loss": 0.0094, + "step": 24580 + }, + { + "epoch": 3.0660847880299253, + "grad_norm": 0.0008519966504536569, + "learning_rate": 7.742144638403991e-06, + "loss": 0.0, + "step": 24590 + }, + { + "epoch": 3.067331670822943, + "grad_norm": 0.004892734810709953, + "learning_rate": 7.73715710723192e-06, + "loss": 0.0014, + "step": 24600 + }, + { + "epoch": 
3.06857855361596, + "grad_norm": 0.01416561659425497, + "learning_rate": 7.732169576059852e-06, + "loss": 0.0006, + "step": 24610 + }, + { + "epoch": 3.0698254364089776, + "grad_norm": 0.0011699148453772068, + "learning_rate": 7.727182044887781e-06, + "loss": 0.0002, + "step": 24620 + }, + { + "epoch": 3.071072319201995, + "grad_norm": 0.40756234526634216, + "learning_rate": 7.722194513715712e-06, + "loss": 0.0212, + "step": 24630 + }, + { + "epoch": 3.0723192019950125, + "grad_norm": 0.0006810550694353878, + "learning_rate": 7.71720698254364e-06, + "loss": 0.0001, + "step": 24640 + }, + { + "epoch": 3.07356608478803, + "grad_norm": 0.0026810094714164734, + "learning_rate": 7.712219451371571e-06, + "loss": 0.0001, + "step": 24650 + }, + { + "epoch": 3.0748129675810474, + "grad_norm": 0.001643951516598463, + "learning_rate": 7.707231920199502e-06, + "loss": 0.0334, + "step": 24660 + }, + { + "epoch": 3.076059850374065, + "grad_norm": 0.0028612790629267693, + "learning_rate": 7.702244389027433e-06, + "loss": 0.0, + "step": 24670 + }, + { + "epoch": 3.0773067331670823, + "grad_norm": 0.019000135362148285, + "learning_rate": 7.697256857855361e-06, + "loss": 0.0, + "step": 24680 + }, + { + "epoch": 3.0785536159600997, + "grad_norm": 0.0008096080855466425, + "learning_rate": 7.692269326683292e-06, + "loss": 0.0032, + "step": 24690 + }, + { + "epoch": 3.079800498753117, + "grad_norm": 0.00035487712011672556, + "learning_rate": 7.687281795511223e-06, + "loss": 0.0275, + "step": 24700 + }, + { + "epoch": 3.0810473815461346, + "grad_norm": 0.003728532698005438, + "learning_rate": 7.682294264339153e-06, + "loss": 0.0001, + "step": 24710 + }, + { + "epoch": 3.082294264339152, + "grad_norm": 0.018092088401317596, + "learning_rate": 7.677306733167082e-06, + "loss": 0.0002, + "step": 24720 + }, + { + "epoch": 3.0835411471321694, + "grad_norm": 0.0015284158289432526, + "learning_rate": 7.672319201995013e-06, + "loss": 0.0333, + "step": 24730 + }, + { + "epoch": 3.084788029925187, 
+ "grad_norm": 0.06528772413730621, + "learning_rate": 7.667331670822943e-06, + "loss": 0.0001, + "step": 24740 + }, + { + "epoch": 3.0860349127182043, + "grad_norm": 0.0012805687729269266, + "learning_rate": 7.662344139650874e-06, + "loss": 0.0, + "step": 24750 + }, + { + "epoch": 3.087281795511222, + "grad_norm": 24.267126083374023, + "learning_rate": 7.657356608478803e-06, + "loss": 0.0254, + "step": 24760 + }, + { + "epoch": 3.0885286783042396, + "grad_norm": 48.1351203918457, + "learning_rate": 7.652369077306734e-06, + "loss": 0.0196, + "step": 24770 + }, + { + "epoch": 3.089775561097257, + "grad_norm": 0.001478642807342112, + "learning_rate": 7.647381546134664e-06, + "loss": 0.0001, + "step": 24780 + }, + { + "epoch": 3.0910224438902745, + "grad_norm": 0.010895784012973309, + "learning_rate": 7.642394014962595e-06, + "loss": 0.0004, + "step": 24790 + }, + { + "epoch": 3.092269326683292, + "grad_norm": 0.0012013005325570703, + "learning_rate": 7.637406483790526e-06, + "loss": 0.0055, + "step": 24800 + }, + { + "epoch": 3.0935162094763093, + "grad_norm": 0.0007267069304361939, + "learning_rate": 7.632418952618454e-06, + "loss": 0.0001, + "step": 24810 + }, + { + "epoch": 3.0947630922693268, + "grad_norm": 0.0029194443486630917, + "learning_rate": 7.627431421446385e-06, + "loss": 0.0, + "step": 24820 + }, + { + "epoch": 3.096009975062344, + "grad_norm": 0.09928423166275024, + "learning_rate": 7.622443890274315e-06, + "loss": 0.0001, + "step": 24830 + }, + { + "epoch": 3.0972568578553616, + "grad_norm": 0.0014014774933457375, + "learning_rate": 7.6174563591022455e-06, + "loss": 0.0, + "step": 24840 + }, + { + "epoch": 3.098503740648379, + "grad_norm": 11.482453346252441, + "learning_rate": 7.612468827930175e-06, + "loss": 0.0066, + "step": 24850 + }, + { + "epoch": 3.0997506234413965, + "grad_norm": 0.0034667307045310736, + "learning_rate": 7.607481296758106e-06, + "loss": 0.0315, + "step": 24860 + }, + { + "epoch": 3.100997506234414, + "grad_norm": 
0.0012264890829101205, + "learning_rate": 7.602493765586036e-06, + "loss": 0.0001, + "step": 24870 + }, + { + "epoch": 3.1022443890274314, + "grad_norm": 0.03299201279878616, + "learning_rate": 7.597506234413966e-06, + "loss": 0.0113, + "step": 24880 + }, + { + "epoch": 3.103491271820449, + "grad_norm": 2.718482255935669, + "learning_rate": 7.592518703241896e-06, + "loss": 0.0006, + "step": 24890 + }, + { + "epoch": 3.1047381546134662, + "grad_norm": 0.0009285429841838777, + "learning_rate": 7.587531172069827e-06, + "loss": 0.0423, + "step": 24900 + }, + { + "epoch": 3.1059850374064837, + "grad_norm": 0.0012682249071076512, + "learning_rate": 7.5825436408977556e-06, + "loss": 0.0, + "step": 24910 + }, + { + "epoch": 3.107231920199501, + "grad_norm": 0.0003040742303710431, + "learning_rate": 7.577556109725687e-06, + "loss": 0.0529, + "step": 24920 + }, + { + "epoch": 3.1084788029925186, + "grad_norm": 0.022092144936323166, + "learning_rate": 7.572568578553616e-06, + "loss": 0.0, + "step": 24930 + }, + { + "epoch": 3.109725685785536, + "grad_norm": 0.0005716979503631592, + "learning_rate": 7.5675810473815466e-06, + "loss": 0.0012, + "step": 24940 + }, + { + "epoch": 3.1109725685785534, + "grad_norm": 0.013232385739684105, + "learning_rate": 7.562593516209476e-06, + "loss": 0.0254, + "step": 24950 + }, + { + "epoch": 3.112219451371571, + "grad_norm": 0.0007010533008724451, + "learning_rate": 7.557605985037407e-06, + "loss": 0.0001, + "step": 24960 + }, + { + "epoch": 3.1134663341645887, + "grad_norm": 0.0009610903216525912, + "learning_rate": 7.552618453865337e-06, + "loss": 0.0313, + "step": 24970 + }, + { + "epoch": 3.114713216957606, + "grad_norm": 0.0008341351058334112, + "learning_rate": 7.547630922693267e-06, + "loss": 0.0213, + "step": 24980 + }, + { + "epoch": 3.1159600997506236, + "grad_norm": 0.0012973628472536802, + "learning_rate": 7.542643391521197e-06, + "loss": 0.0001, + "step": 24990 + }, + { + "epoch": 3.117206982543641, + "grad_norm": 
0.0037772981449961662, + "learning_rate": 7.537655860349128e-06, + "loss": 0.0042, + "step": 25000 + }, + { + "epoch": 3.1184538653366585, + "grad_norm": 3.0461456775665283, + "learning_rate": 7.5326683291770575e-06, + "loss": 0.0283, + "step": 25010 + }, + { + "epoch": 3.119700748129676, + "grad_norm": 0.001213893759995699, + "learning_rate": 7.527680798004988e-06, + "loss": 0.0, + "step": 25020 + }, + { + "epoch": 3.1209476309226933, + "grad_norm": 0.0026515673380345106, + "learning_rate": 7.522693266832918e-06, + "loss": 0.0488, + "step": 25030 + }, + { + "epoch": 3.1221945137157108, + "grad_norm": 0.00047578109661117196, + "learning_rate": 7.5177057356608485e-06, + "loss": 0.0, + "step": 25040 + }, + { + "epoch": 3.123441396508728, + "grad_norm": 0.01547097135335207, + "learning_rate": 7.512718204488779e-06, + "loss": 0.0007, + "step": 25050 + }, + { + "epoch": 3.1246882793017456, + "grad_norm": 0.010162184946238995, + "learning_rate": 7.507730673316709e-06, + "loss": 0.0001, + "step": 25060 + }, + { + "epoch": 3.125935162094763, + "grad_norm": 0.0032746554352343082, + "learning_rate": 7.5027431421446395e-06, + "loss": 0.0001, + "step": 25070 + }, + { + "epoch": 3.1271820448877805, + "grad_norm": 0.0003806925960816443, + "learning_rate": 7.497755610972569e-06, + "loss": 0.0003, + "step": 25080 + }, + { + "epoch": 3.128428927680798, + "grad_norm": 0.0004894788726232946, + "learning_rate": 7.4927680798005e-06, + "loss": 0.0, + "step": 25090 + }, + { + "epoch": 3.1296758104738154, + "grad_norm": 0.00042781481170095503, + "learning_rate": 7.48778054862843e-06, + "loss": 0.0, + "step": 25100 + }, + { + "epoch": 3.130922693266833, + "grad_norm": 0.006498263217508793, + "learning_rate": 7.48279301745636e-06, + "loss": 0.0437, + "step": 25110 + }, + { + "epoch": 3.1321695760598502, + "grad_norm": 0.0033179358579218388, + "learning_rate": 7.47780548628429e-06, + "loss": 0.0, + "step": 25120 + }, + { + "epoch": 3.1334164588528677, + "grad_norm": 0.0005243680789135396, + 
"learning_rate": 7.472817955112221e-06, + "loss": 0.0239, + "step": 25130 + }, + { + "epoch": 3.134663341645885, + "grad_norm": 0.0005233477568253875, + "learning_rate": 7.46783042394015e-06, + "loss": 0.0, + "step": 25140 + }, + { + "epoch": 3.1359102244389025, + "grad_norm": 0.0014765688683837652, + "learning_rate": 7.462842892768081e-06, + "loss": 0.0, + "step": 25150 + }, + { + "epoch": 3.1371571072319204, + "grad_norm": Infinity, + "learning_rate": 7.458354114713218e-06, + "loss": 0.0085, + "step": 25160 + }, + { + "epoch": 3.138403990024938, + "grad_norm": 0.0004119553486816585, + "learning_rate": 7.453366583541147e-06, + "loss": 0.0001, + "step": 25170 + }, + { + "epoch": 3.1396508728179553, + "grad_norm": 0.00120373978279531, + "learning_rate": 7.448379052369078e-06, + "loss": 0.0001, + "step": 25180 + }, + { + "epoch": 3.1408977556109727, + "grad_norm": 0.0002969623601529747, + "learning_rate": 7.443391521197008e-06, + "loss": 0.0001, + "step": 25190 + }, + { + "epoch": 3.14214463840399, + "grad_norm": 0.0029246362391859293, + "learning_rate": 7.4384039900249384e-06, + "loss": 0.0, + "step": 25200 + }, + { + "epoch": 3.1433915211970076, + "grad_norm": 0.0017841997323557734, + "learning_rate": 7.433416458852868e-06, + "loss": 0.0004, + "step": 25210 + }, + { + "epoch": 3.144638403990025, + "grad_norm": 0.0006051593809388578, + "learning_rate": 7.428428927680799e-06, + "loss": 0.0, + "step": 25220 + }, + { + "epoch": 3.1458852867830425, + "grad_norm": 0.0006382103892974555, + "learning_rate": 7.4234413965087294e-06, + "loss": 0.0, + "step": 25230 + }, + { + "epoch": 3.14713216957606, + "grad_norm": 0.0008329673437401652, + "learning_rate": 7.418453865336659e-06, + "loss": 0.0, + "step": 25240 + }, + { + "epoch": 3.1483790523690773, + "grad_norm": 0.0003178466286044568, + "learning_rate": 7.41346633416459e-06, + "loss": 0.0001, + "step": 25250 + }, + { + "epoch": 3.1496259351620948, + "grad_norm": 4.301796913146973, + "learning_rate": 7.40847880299252e-06, + 
"loss": 0.0007, + "step": 25260 + }, + { + "epoch": 3.150872817955112, + "grad_norm": 0.0017022284446284175, + "learning_rate": 7.40349127182045e-06, + "loss": 0.0001, + "step": 25270 + }, + { + "epoch": 3.1521197007481296, + "grad_norm": 0.002003247383981943, + "learning_rate": 7.398503740648379e-06, + "loss": 0.0424, + "step": 25280 + }, + { + "epoch": 3.153366583541147, + "grad_norm": 0.000499337911605835, + "learning_rate": 7.393516209476311e-06, + "loss": 0.0005, + "step": 25290 + }, + { + "epoch": 3.1546134663341645, + "grad_norm": 0.003963457886129618, + "learning_rate": 7.3885286783042395e-06, + "loss": 0.0, + "step": 25300 + }, + { + "epoch": 3.155860349127182, + "grad_norm": 0.0008439902449026704, + "learning_rate": 7.38354114713217e-06, + "loss": 0.0009, + "step": 25310 + }, + { + "epoch": 3.1571072319201994, + "grad_norm": 0.04566599801182747, + "learning_rate": 7.3785536159601e-06, + "loss": 0.0003, + "step": 25320 + }, + { + "epoch": 3.158354114713217, + "grad_norm": 0.0065649570897221565, + "learning_rate": 7.3735660847880306e-06, + "loss": 0.0, + "step": 25330 + }, + { + "epoch": 3.1596009975062342, + "grad_norm": 0.062061987817287445, + "learning_rate": 7.36857855361596e-06, + "loss": 0.0352, + "step": 25340 + }, + { + "epoch": 3.1608478802992517, + "grad_norm": 0.0006778668030165136, + "learning_rate": 7.363591022443891e-06, + "loss": 0.0, + "step": 25350 + }, + { + "epoch": 3.162094763092269, + "grad_norm": 0.001736179576255381, + "learning_rate": 7.358603491271821e-06, + "loss": 0.0, + "step": 25360 + }, + { + "epoch": 3.163341645885287, + "grad_norm": 0.0019771254155784845, + "learning_rate": 7.353615960099751e-06, + "loss": 0.0, + "step": 25370 + }, + { + "epoch": 3.1645885286783044, + "grad_norm": 0.0007458809996023774, + "learning_rate": 7.348628428927681e-06, + "loss": 0.0246, + "step": 25380 + }, + { + "epoch": 3.165835411471322, + "grad_norm": 0.001378356129862368, + "learning_rate": 7.343640897755612e-06, + "loss": 0.0001, + "step": 
25390 + }, + { + "epoch": 3.1670822942643393, + "grad_norm": 0.0007513531600125134, + "learning_rate": 7.3386533665835415e-06, + "loss": 0.0001, + "step": 25400 + }, + { + "epoch": 3.1683291770573567, + "grad_norm": 0.00032524106791242957, + "learning_rate": 7.333665835411472e-06, + "loss": 0.0, + "step": 25410 + }, + { + "epoch": 3.169576059850374, + "grad_norm": 0.0005543709266930819, + "learning_rate": 7.328678304239402e-06, + "loss": 0.0001, + "step": 25420 + }, + { + "epoch": 3.1708229426433916, + "grad_norm": 0.0007538437494076788, + "learning_rate": 7.3236907730673325e-06, + "loss": 0.0001, + "step": 25430 + }, + { + "epoch": 3.172069825436409, + "grad_norm": 0.0003479034348856658, + "learning_rate": 7.318703241895262e-06, + "loss": 0.0, + "step": 25440 + }, + { + "epoch": 3.1733167082294265, + "grad_norm": 0.00032112517510540783, + "learning_rate": 7.313715710723193e-06, + "loss": 0.0514, + "step": 25450 + }, + { + "epoch": 3.174563591022444, + "grad_norm": 68.04743194580078, + "learning_rate": 7.308728179551122e-06, + "loss": 0.0246, + "step": 25460 + }, + { + "epoch": 3.1758104738154613, + "grad_norm": 0.003141113556921482, + "learning_rate": 7.303740648379053e-06, + "loss": 0.0, + "step": 25470 + }, + { + "epoch": 3.1770573566084788, + "grad_norm": 127.886474609375, + "learning_rate": 7.298753117206984e-06, + "loss": 0.0086, + "step": 25480 + }, + { + "epoch": 3.178304239401496, + "grad_norm": 0.0004362338804639876, + "learning_rate": 7.293765586034913e-06, + "loss": 0.0138, + "step": 25490 + }, + { + "epoch": 3.1795511221945136, + "grad_norm": 0.43064260482788086, + "learning_rate": 7.288778054862844e-06, + "loss": 0.0002, + "step": 25500 + }, + { + "epoch": 3.180798004987531, + "grad_norm": 0.0011624015169218183, + "learning_rate": 7.283790523690773e-06, + "loss": 0.0, + "step": 25510 + }, + { + "epoch": 3.1820448877805485, + "grad_norm": 0.0005321303615346551, + "learning_rate": 7.278802992518704e-06, + "loss": 0.004, + "step": 25520 + }, + { + 
"epoch": 3.183291770573566, + "grad_norm": 0.0004226687888149172, + "learning_rate": 7.273815461346634e-06, + "loss": 0.0, + "step": 25530 + }, + { + "epoch": 3.1845386533665834, + "grad_norm": 0.00025975590688176453, + "learning_rate": 7.268827930174564e-06, + "loss": 0.0082, + "step": 25540 + }, + { + "epoch": 3.185785536159601, + "grad_norm": 0.00041354497079737484, + "learning_rate": 7.263840399002494e-06, + "loss": 0.0548, + "step": 25550 + }, + { + "epoch": 3.1870324189526187, + "grad_norm": 0.0013360625598579645, + "learning_rate": 7.258852867830425e-06, + "loss": 0.0001, + "step": 25560 + }, + { + "epoch": 3.188279301745636, + "grad_norm": 0.00040121175698004663, + "learning_rate": 7.253865336658354e-06, + "loss": 0.0464, + "step": 25570 + }, + { + "epoch": 3.1895261845386536, + "grad_norm": 0.001519997720606625, + "learning_rate": 7.248877805486285e-06, + "loss": 0.0, + "step": 25580 + }, + { + "epoch": 3.190773067331671, + "grad_norm": 0.0005633990513160825, + "learning_rate": 7.243890274314215e-06, + "loss": 0.0, + "step": 25590 + }, + { + "epoch": 3.1920199501246884, + "grad_norm": 0.0007187969167716801, + "learning_rate": 7.238902743142145e-06, + "loss": 0.0, + "step": 25600 + }, + { + "epoch": 3.193266832917706, + "grad_norm": 0.026836052536964417, + "learning_rate": 7.233915211970075e-06, + "loss": 0.0002, + "step": 25610 + }, + { + "epoch": 3.1945137157107233, + "grad_norm": 0.001607574988156557, + "learning_rate": 7.228927680798006e-06, + "loss": 0.0034, + "step": 25620 + }, + { + "epoch": 3.1957605985037407, + "grad_norm": 0.0025988956913352013, + "learning_rate": 7.223940149625936e-06, + "loss": 0.0, + "step": 25630 + }, + { + "epoch": 3.197007481296758, + "grad_norm": 86.0445556640625, + "learning_rate": 7.218952618453866e-06, + "loss": 0.0128, + "step": 25640 + }, + { + "epoch": 3.1982543640897756, + "grad_norm": 0.008842726238071918, + "learning_rate": 7.213965087281796e-06, + "loss": 0.0, + "step": 25650 + }, + { + "epoch": 3.199501246882793, 
+ "grad_norm": 0.0012176205636933446, + "learning_rate": 7.208977556109727e-06, + "loss": 0.0242, + "step": 25660 + }, + { + "epoch": 3.2007481296758105, + "grad_norm": 0.00039986352203413844, + "learning_rate": 7.203990024937656e-06, + "loss": 0.0, + "step": 25670 + }, + { + "epoch": 3.201995012468828, + "grad_norm": 0.0003596430760808289, + "learning_rate": 7.199002493765587e-06, + "loss": 0.0, + "step": 25680 + }, + { + "epoch": 3.2032418952618453, + "grad_norm": 0.0017086360603570938, + "learning_rate": 7.194014962593516e-06, + "loss": 0.0317, + "step": 25690 + }, + { + "epoch": 3.2044887780548628, + "grad_norm": 0.005245603155344725, + "learning_rate": 7.189027431421447e-06, + "loss": 0.0358, + "step": 25700 + }, + { + "epoch": 3.20573566084788, + "grad_norm": 0.0008312583668157458, + "learning_rate": 7.184039900249378e-06, + "loss": 0.0, + "step": 25710 + }, + { + "epoch": 3.2069825436408976, + "grad_norm": 22.51250648498535, + "learning_rate": 7.179052369077307e-06, + "loss": 0.0022, + "step": 25720 + }, + { + "epoch": 3.208229426433915, + "grad_norm": 0.001020867028273642, + "learning_rate": 7.174064837905238e-06, + "loss": 0.0004, + "step": 25730 + }, + { + "epoch": 3.2094763092269325, + "grad_norm": 0.00029622670263051987, + "learning_rate": 7.169077306733167e-06, + "loss": 0.0034, + "step": 25740 + }, + { + "epoch": 3.21072319201995, + "grad_norm": 0.001556044677272439, + "learning_rate": 7.164089775561098e-06, + "loss": 0.033, + "step": 25750 + }, + { + "epoch": 3.2119700748129674, + "grad_norm": 0.0003500843304209411, + "learning_rate": 7.159102244389028e-06, + "loss": 0.049, + "step": 25760 + }, + { + "epoch": 3.213216957605985, + "grad_norm": 0.0003617958864197135, + "learning_rate": 7.154114713216958e-06, + "loss": 0.0001, + "step": 25770 + }, + { + "epoch": 3.2144638403990027, + "grad_norm": 0.0004431404813658446, + "learning_rate": 7.149127182044888e-06, + "loss": 0.0, + "step": 25780 + }, + { + "epoch": 3.21571072319202, + "grad_norm": 
0.0016715474193915725, + "learning_rate": 7.144139650872819e-06, + "loss": 0.0, + "step": 25790 + }, + { + "epoch": 3.2169576059850375, + "grad_norm": 0.001232789596542716, + "learning_rate": 7.1391521197007485e-06, + "loss": 0.0, + "step": 25800 + }, + { + "epoch": 3.218204488778055, + "grad_norm": 0.0007746867486275733, + "learning_rate": 7.134164588528679e-06, + "loss": 0.0, + "step": 25810 + }, + { + "epoch": 3.2194513715710724, + "grad_norm": 0.0025138193741440773, + "learning_rate": 7.129177057356609e-06, + "loss": 0.0002, + "step": 25820 + }, + { + "epoch": 3.22069825436409, + "grad_norm": 0.00021336287318263203, + "learning_rate": 7.1241895261845395e-06, + "loss": 0.048, + "step": 25830 + }, + { + "epoch": 3.2219451371571073, + "grad_norm": 0.008427153341472149, + "learning_rate": 7.119201995012469e-06, + "loss": 0.0344, + "step": 25840 + }, + { + "epoch": 3.2231920199501247, + "grad_norm": 15.330975532531738, + "learning_rate": 7.1142144638404e-06, + "loss": 0.1138, + "step": 25850 + }, + { + "epoch": 3.224438902743142, + "grad_norm": 0.01085032057017088, + "learning_rate": 7.10922693266833e-06, + "loss": 0.0, + "step": 25860 + }, + { + "epoch": 3.2256857855361596, + "grad_norm": 0.0007842654013074934, + "learning_rate": 7.10423940149626e-06, + "loss": 0.0265, + "step": 25870 + }, + { + "epoch": 3.226932668329177, + "grad_norm": 0.0009141101036220789, + "learning_rate": 7.09925187032419e-06, + "loss": 0.0001, + "step": 25880 + }, + { + "epoch": 3.2281795511221945, + "grad_norm": 0.004496920388191938, + "learning_rate": 7.094264339152121e-06, + "loss": 0.0116, + "step": 25890 + }, + { + "epoch": 3.229426433915212, + "grad_norm": 0.0016870100516825914, + "learning_rate": 7.0892768079800504e-06, + "loss": 0.004, + "step": 25900 + }, + { + "epoch": 3.2306733167082293, + "grad_norm": 0.0009073169785551727, + "learning_rate": 7.084289276807981e-06, + "loss": 0.0001, + "step": 25910 + }, + { + "epoch": 3.2319201995012468, + "grad_norm": 0.0010009161196649075, + 
"learning_rate": 7.07930174563591e-06, + "loss": 0.0469, + "step": 25920 + }, + { + "epoch": 3.233167082294264, + "grad_norm": 0.002670197281986475, + "learning_rate": 7.074314214463841e-06, + "loss": 0.0142, + "step": 25930 + }, + { + "epoch": 3.2344139650872816, + "grad_norm": 0.0038882438093423843, + "learning_rate": 7.06932668329177e-06, + "loss": 0.0001, + "step": 25940 + }, + { + "epoch": 3.235660847880299, + "grad_norm": 6.603517532348633, + "learning_rate": 7.064339152119701e-06, + "loss": 0.0008, + "step": 25950 + }, + { + "epoch": 3.236907730673317, + "grad_norm": 0.3903234601020813, + "learning_rate": 7.059351620947632e-06, + "loss": 0.0002, + "step": 25960 + }, + { + "epoch": 3.2381546134663344, + "grad_norm": 0.0041964612901210785, + "learning_rate": 7.054364089775561e-06, + "loss": 0.0066, + "step": 25970 + }, + { + "epoch": 3.239401496259352, + "grad_norm": 0.0007558322977274656, + "learning_rate": 7.049376558603492e-06, + "loss": 0.0002, + "step": 25980 + }, + { + "epoch": 3.2406483790523692, + "grad_norm": 0.00039787104469724, + "learning_rate": 7.044389027431422e-06, + "loss": 0.0, + "step": 25990 + }, + { + "epoch": 3.2418952618453867, + "grad_norm": 0.0004548293072730303, + "learning_rate": 7.039401496259352e-06, + "loss": 0.0003, + "step": 26000 + }, + { + "epoch": 3.243142144638404, + "grad_norm": 0.0005862874095328152, + "learning_rate": 7.034413965087282e-06, + "loss": 0.0, + "step": 26010 + }, + { + "epoch": 3.2443890274314215, + "grad_norm": 0.021068502217531204, + "learning_rate": 7.029426433915213e-06, + "loss": 0.0, + "step": 26020 + }, + { + "epoch": 3.245635910224439, + "grad_norm": 0.01115128118544817, + "learning_rate": 7.0244389027431426e-06, + "loss": 0.0, + "step": 26030 + }, + { + "epoch": 3.2468827930174564, + "grad_norm": 0.012454736977815628, + "learning_rate": 7.019451371571073e-06, + "loss": 0.0034, + "step": 26040 + }, + { + "epoch": 3.248129675810474, + "grad_norm": 0.003131842939183116, + "learning_rate": 
7.014463840399003e-06, + "loss": 0.0001, + "step": 26050 + }, + { + "epoch": 3.2493765586034913, + "grad_norm": 0.0019079704070463777, + "learning_rate": 7.009476309226934e-06, + "loss": 0.0, + "step": 26060 + }, + { + "epoch": 3.2506234413965087, + "grad_norm": 0.0006699951482005417, + "learning_rate": 7.004488778054863e-06, + "loss": 0.0001, + "step": 26070 + }, + { + "epoch": 3.251870324189526, + "grad_norm": 21.092519760131836, + "learning_rate": 6.999501246882794e-06, + "loss": 0.043, + "step": 26080 + }, + { + "epoch": 3.2531172069825436, + "grad_norm": 15.397884368896484, + "learning_rate": 6.994513715710724e-06, + "loss": 0.0011, + "step": 26090 + }, + { + "epoch": 3.254364089775561, + "grad_norm": 0.0016310204518958926, + "learning_rate": 6.989526184538654e-06, + "loss": 0.0001, + "step": 26100 + }, + { + "epoch": 3.2556109725685785, + "grad_norm": 0.0010182487312704325, + "learning_rate": 6.984538653366584e-06, + "loss": 0.0289, + "step": 26110 + }, + { + "epoch": 3.256857855361596, + "grad_norm": 0.0008488223538734019, + "learning_rate": 6.979551122194515e-06, + "loss": 0.0003, + "step": 26120 + }, + { + "epoch": 3.2581047381546133, + "grad_norm": 0.040211670100688934, + "learning_rate": 6.974563591022444e-06, + "loss": 0.0001, + "step": 26130 + }, + { + "epoch": 3.2593516209476308, + "grad_norm": 0.0003044347686227411, + "learning_rate": 6.969576059850375e-06, + "loss": 0.0, + "step": 26140 + }, + { + "epoch": 3.260598503740648, + "grad_norm": 0.0003406737232580781, + "learning_rate": 6.964588528678304e-06, + "loss": 0.0283, + "step": 26150 + }, + { + "epoch": 3.2618453865336656, + "grad_norm": 0.0006763077690266073, + "learning_rate": 6.959600997506235e-06, + "loss": 0.0006, + "step": 26160 + }, + { + "epoch": 3.263092269326683, + "grad_norm": 20.708433151245117, + "learning_rate": 6.9546134663341645e-06, + "loss": 0.041, + "step": 26170 + }, + { + "epoch": 3.264339152119701, + "grad_norm": 0.000766753451898694, + "learning_rate": 
6.949625935162095e-06, + "loss": 0.0, + "step": 26180 + }, + { + "epoch": 3.2655860349127184, + "grad_norm": 0.00931212492287159, + "learning_rate": 6.944638403990025e-06, + "loss": 0.0, + "step": 26190 + }, + { + "epoch": 3.266832917705736, + "grad_norm": 0.0005553375813178718, + "learning_rate": 6.9396508728179555e-06, + "loss": 0.0, + "step": 26200 + }, + { + "epoch": 3.2680798004987532, + "grad_norm": 0.0011248165974393487, + "learning_rate": 6.934663341645886e-06, + "loss": 0.0187, + "step": 26210 + }, + { + "epoch": 3.2693266832917707, + "grad_norm": 0.02715372107923031, + "learning_rate": 6.929675810473816e-06, + "loss": 0.0, + "step": 26220 + }, + { + "epoch": 3.270573566084788, + "grad_norm": 0.0005466627771966159, + "learning_rate": 6.9246882793017465e-06, + "loss": 0.0, + "step": 26230 + }, + { + "epoch": 3.2718204488778055, + "grad_norm": 0.0001991336466744542, + "learning_rate": 6.919700748129676e-06, + "loss": 0.0, + "step": 26240 + }, + { + "epoch": 3.273067331670823, + "grad_norm": 0.0004780891176778823, + "learning_rate": 6.914713216957607e-06, + "loss": 0.0, + "step": 26250 + }, + { + "epoch": 3.2743142144638404, + "grad_norm": 0.0012583925854414701, + "learning_rate": 6.909725685785537e-06, + "loss": 0.0, + "step": 26260 + }, + { + "epoch": 3.275561097256858, + "grad_norm": 0.0002583898603916168, + "learning_rate": 6.904738154613467e-06, + "loss": 0.0001, + "step": 26270 + }, + { + "epoch": 3.2768079800498753, + "grad_norm": 0.008905318565666676, + "learning_rate": 6.899750623441397e-06, + "loss": 0.0256, + "step": 26280 + }, + { + "epoch": 3.2780548628428927, + "grad_norm": 0.00047971465392038226, + "learning_rate": 6.894763092269328e-06, + "loss": 0.0, + "step": 26290 + }, + { + "epoch": 3.27930174563591, + "grad_norm": 0.01915428787469864, + "learning_rate": 6.8897755610972574e-06, + "loss": 0.0001, + "step": 26300 + }, + { + "epoch": 3.2805486284289276, + "grad_norm": 0.00032235312392003834, + "learning_rate": 6.884788029925188e-06, + "loss": 
0.0, + "step": 26310 + }, + { + "epoch": 3.281795511221945, + "grad_norm": 0.0005441360990516841, + "learning_rate": 6.879800498753118e-06, + "loss": 0.0, + "step": 26320 + }, + { + "epoch": 3.2830423940149625, + "grad_norm": 42.98784637451172, + "learning_rate": 6.8748129675810484e-06, + "loss": 0.06, + "step": 26330 + }, + { + "epoch": 3.28428927680798, + "grad_norm": 0.000571958429645747, + "learning_rate": 6.869825436408978e-06, + "loss": 0.0167, + "step": 26340 + }, + { + "epoch": 3.2855361596009973, + "grad_norm": 0.00067878607660532, + "learning_rate": 6.864837905236909e-06, + "loss": 0.0001, + "step": 26350 + }, + { + "epoch": 3.286783042394015, + "grad_norm": 0.0006823897711001337, + "learning_rate": 6.859850374064838e-06, + "loss": 0.0, + "step": 26360 + }, + { + "epoch": 3.2880299251870326, + "grad_norm": 0.0009005233878269792, + "learning_rate": 6.854862842892769e-06, + "loss": 0.0408, + "step": 26370 + }, + { + "epoch": 3.28927680798005, + "grad_norm": 0.0013887096429243684, + "learning_rate": 6.849875311720698e-06, + "loss": 0.0, + "step": 26380 + }, + { + "epoch": 3.2905236907730675, + "grad_norm": 0.6036227345466614, + "learning_rate": 6.844887780548629e-06, + "loss": 0.0001, + "step": 26390 + }, + { + "epoch": 3.291770573566085, + "grad_norm": 0.0027249318081885576, + "learning_rate": 6.8399002493765585e-06, + "loss": 0.0, + "step": 26400 + }, + { + "epoch": 3.2930174563591024, + "grad_norm": 0.0006757620139978826, + "learning_rate": 6.834912718204489e-06, + "loss": 0.0, + "step": 26410 + }, + { + "epoch": 3.29426433915212, + "grad_norm": 0.0004033853765577078, + "learning_rate": 6.829925187032419e-06, + "loss": 0.0, + "step": 26420 + }, + { + "epoch": 3.2955112219451372, + "grad_norm": 0.0029158780816942453, + "learning_rate": 6.8249376558603496e-06, + "loss": 0.0004, + "step": 26430 + }, + { + "epoch": 3.2967581047381547, + "grad_norm": 0.0003885283076670021, + "learning_rate": 6.819950124688279e-06, + "loss": 0.0, + "step": 26440 + }, + { + 
"epoch": 3.298004987531172, + "grad_norm": 0.0016651960322633386, + "learning_rate": 6.81496259351621e-06, + "loss": 0.0, + "step": 26450 + }, + { + "epoch": 3.2992518703241895, + "grad_norm": 27.317020416259766, + "learning_rate": 6.8099750623441406e-06, + "loss": 0.0071, + "step": 26460 + }, + { + "epoch": 3.300498753117207, + "grad_norm": 0.002943522296845913, + "learning_rate": 6.80498753117207e-06, + "loss": 0.0001, + "step": 26470 + }, + { + "epoch": 3.3017456359102244, + "grad_norm": 0.0005533623043447733, + "learning_rate": 6.800000000000001e-06, + "loss": 0.0461, + "step": 26480 + }, + { + "epoch": 3.302992518703242, + "grad_norm": 0.0010401762556284666, + "learning_rate": 6.795012468827931e-06, + "loss": 0.0, + "step": 26490 + }, + { + "epoch": 3.3042394014962593, + "grad_norm": 0.0009254756732843816, + "learning_rate": 6.790024937655861e-06, + "loss": 0.0021, + "step": 26500 + }, + { + "epoch": 3.3054862842892767, + "grad_norm": 0.0019067944958806038, + "learning_rate": 6.785037406483791e-06, + "loss": 0.0001, + "step": 26510 + }, + { + "epoch": 3.306733167082294, + "grad_norm": 0.0010872195707634091, + "learning_rate": 6.780049875311722e-06, + "loss": 0.0, + "step": 26520 + }, + { + "epoch": 3.3079800498753116, + "grad_norm": 0.0006007709307596087, + "learning_rate": 6.7750623441396515e-06, + "loss": 0.0001, + "step": 26530 + }, + { + "epoch": 3.309226932668329, + "grad_norm": 39.467369079589844, + "learning_rate": 6.770074812967582e-06, + "loss": 0.0233, + "step": 26540 + }, + { + "epoch": 3.3104738154613464, + "grad_norm": 20.561660766601562, + "learning_rate": 6.765087281795512e-06, + "loss": 0.0417, + "step": 26550 + }, + { + "epoch": 3.311720698254364, + "grad_norm": 0.000338366546202451, + "learning_rate": 6.7600997506234425e-06, + "loss": 0.0001, + "step": 26560 + }, + { + "epoch": 3.3129675810473813, + "grad_norm": 0.0014671689132228494, + "learning_rate": 6.7551122194513715e-06, + "loss": 0.0097, + "step": 26570 + }, + { + "epoch": 
3.314214463840399, + "grad_norm": 0.0009753374033607543, + "learning_rate": 6.750124688279303e-06, + "loss": 0.0286, + "step": 26580 + }, + { + "epoch": 3.3154613466334166, + "grad_norm": 0.0007874347502365708, + "learning_rate": 6.745137157107232e-06, + "loss": 0.0, + "step": 26590 + }, + { + "epoch": 3.316708229426434, + "grad_norm": 0.003849888453260064, + "learning_rate": 6.7401496259351625e-06, + "loss": 0.0001, + "step": 26600 + }, + { + "epoch": 3.3179551122194515, + "grad_norm": 0.0004690260102506727, + "learning_rate": 6.735162094763092e-06, + "loss": 0.0, + "step": 26610 + }, + { + "epoch": 3.319201995012469, + "grad_norm": 0.00019917161262128502, + "learning_rate": 6.730174563591023e-06, + "loss": 0.029, + "step": 26620 + }, + { + "epoch": 3.3204488778054864, + "grad_norm": 0.008112260140478611, + "learning_rate": 6.725187032418953e-06, + "loss": 0.0001, + "step": 26630 + }, + { + "epoch": 3.321695760598504, + "grad_norm": 0.000990871456451714, + "learning_rate": 6.720199501246883e-06, + "loss": 0.0001, + "step": 26640 + }, + { + "epoch": 3.3229426433915212, + "grad_norm": 0.0007548572611995041, + "learning_rate": 6.715211970074813e-06, + "loss": 0.0, + "step": 26650 + }, + { + "epoch": 3.3241895261845387, + "grad_norm": 0.00047548863221891224, + "learning_rate": 6.710224438902744e-06, + "loss": 0.0001, + "step": 26660 + }, + { + "epoch": 3.325436408977556, + "grad_norm": 0.004564858041703701, + "learning_rate": 6.705236907730673e-06, + "loss": 0.0115, + "step": 26670 + }, + { + "epoch": 3.3266832917705735, + "grad_norm": 0.0006807594327256083, + "learning_rate": 6.700249376558604e-06, + "loss": 0.0003, + "step": 26680 + }, + { + "epoch": 3.327930174563591, + "grad_norm": 0.0005415382911451161, + "learning_rate": 6.695261845386534e-06, + "loss": 0.0018, + "step": 26690 + }, + { + "epoch": 3.3291770573566084, + "grad_norm": 0.0002564255555626005, + "learning_rate": 6.6902743142144644e-06, + "loss": 0.0, + "step": 26700 + }, + { + "epoch": 
3.330423940149626, + "grad_norm": 0.000789106881711632, + "learning_rate": 6.685286783042395e-06, + "loss": 0.0001, + "step": 26710 + }, + { + "epoch": 3.3316708229426433, + "grad_norm": 0.0043001859448850155, + "learning_rate": 6.680299251870325e-06, + "loss": 0.0, + "step": 26720 + }, + { + "epoch": 3.3329177057356607, + "grad_norm": 0.00039159294101409614, + "learning_rate": 6.6753117206982554e-06, + "loss": 0.0, + "step": 26730 + }, + { + "epoch": 3.334164588528678, + "grad_norm": 0.00038974048220552504, + "learning_rate": 6.670324189526185e-06, + "loss": 0.0, + "step": 26740 + }, + { + "epoch": 3.3354114713216956, + "grad_norm": 0.00043641115189529955, + "learning_rate": 6.665336658354116e-06, + "loss": 0.0, + "step": 26750 + }, + { + "epoch": 3.3366583541147135, + "grad_norm": 0.0005020300159230828, + "learning_rate": 6.660349127182046e-06, + "loss": 0.0018, + "step": 26760 + }, + { + "epoch": 3.337905236907731, + "grad_norm": 0.0005201539606787264, + "learning_rate": 6.655361596009976e-06, + "loss": 0.0553, + "step": 26770 + }, + { + "epoch": 3.3391521197007483, + "grad_norm": 0.0006099382881075144, + "learning_rate": 6.650374064837906e-06, + "loss": 0.0, + "step": 26780 + }, + { + "epoch": 3.3403990024937658, + "grad_norm": 0.0018274527974426746, + "learning_rate": 6.645386533665837e-06, + "loss": 0.0, + "step": 26790 + }, + { + "epoch": 3.341645885286783, + "grad_norm": 0.0006980765028856695, + "learning_rate": 6.6403990024937655e-06, + "loss": 0.0045, + "step": 26800 + }, + { + "epoch": 3.3428927680798006, + "grad_norm": 0.00030875447555445135, + "learning_rate": 6.635411471321697e-06, + "loss": 0.0091, + "step": 26810 + }, + { + "epoch": 3.344139650872818, + "grad_norm": 1.3320343494415283, + "learning_rate": 6.630423940149626e-06, + "loss": 0.0002, + "step": 26820 + }, + { + "epoch": 3.3453865336658355, + "grad_norm": 0.019607802852988243, + "learning_rate": 6.6254364089775565e-06, + "loss": 0.0, + "step": 26830 + }, + { + "epoch": 3.346633416458853, + 
"grad_norm": 0.00037203048123046756, + "learning_rate": 6.620448877805486e-06, + "loss": 0.0, + "step": 26840 + }, + { + "epoch": 3.3478802992518704, + "grad_norm": 0.0021146514918655157, + "learning_rate": 6.615461346633417e-06, + "loss": 0.0001, + "step": 26850 + }, + { + "epoch": 3.349127182044888, + "grad_norm": 0.002595689380541444, + "learning_rate": 6.610473815461347e-06, + "loss": 0.0751, + "step": 26860 + }, + { + "epoch": 3.3503740648379052, + "grad_norm": 0.00038239543209783733, + "learning_rate": 6.605486284289277e-06, + "loss": 0.0, + "step": 26870 + }, + { + "epoch": 3.3516209476309227, + "grad_norm": 0.0037817058619111776, + "learning_rate": 6.600498753117207e-06, + "loss": 0.0565, + "step": 26880 + }, + { + "epoch": 3.35286783042394, + "grad_norm": 0.0012505949707701802, + "learning_rate": 6.595511221945138e-06, + "loss": 0.0, + "step": 26890 + }, + { + "epoch": 3.3541147132169575, + "grad_norm": 0.0010186401195824146, + "learning_rate": 6.5905236907730675e-06, + "loss": 0.0002, + "step": 26900 + }, + { + "epoch": 3.355361596009975, + "grad_norm": 0.0003353485371917486, + "learning_rate": 6.585536159600998e-06, + "loss": 0.0006, + "step": 26910 + }, + { + "epoch": 3.3566084788029924, + "grad_norm": 0.0005550871719606221, + "learning_rate": 6.580548628428928e-06, + "loss": 0.0004, + "step": 26920 + }, + { + "epoch": 3.35785536159601, + "grad_norm": 0.0023827122058719397, + "learning_rate": 6.5755610972568585e-06, + "loss": 0.0031, + "step": 26930 + }, + { + "epoch": 3.3591022443890273, + "grad_norm": 0.007178848143666983, + "learning_rate": 6.570573566084788e-06, + "loss": 0.049, + "step": 26940 + }, + { + "epoch": 3.3603491271820447, + "grad_norm": 0.0006301432149484754, + "learning_rate": 6.565586034912719e-06, + "loss": 0.0, + "step": 26950 + }, + { + "epoch": 3.361596009975062, + "grad_norm": 0.0008799605420790613, + "learning_rate": 6.5605985037406495e-06, + "loss": 0.0, + "step": 26960 + }, + { + "epoch": 3.3628428927680796, + "grad_norm": 
0.000438713381299749, + "learning_rate": 6.555610972568579e-06, + "loss": 0.0382, + "step": 26970 + }, + { + "epoch": 3.3640897755610975, + "grad_norm": 0.13686221837997437, + "learning_rate": 6.55062344139651e-06, + "loss": 0.0001, + "step": 26980 + }, + { + "epoch": 3.365336658354115, + "grad_norm": 0.4949225187301636, + "learning_rate": 6.54563591022444e-06, + "loss": 0.0001, + "step": 26990 + }, + { + "epoch": 3.3665835411471323, + "grad_norm": 0.03666635975241661, + "learning_rate": 6.54064837905237e-06, + "loss": 0.053, + "step": 27000 + }, + { + "epoch": 3.3678304239401498, + "grad_norm": 0.0034022931940853596, + "learning_rate": 6.535660847880299e-06, + "loss": 0.0001, + "step": 27010 + }, + { + "epoch": 3.369077306733167, + "grad_norm": 0.006523266434669495, + "learning_rate": 6.530673316708231e-06, + "loss": 0.0147, + "step": 27020 + }, + { + "epoch": 3.3703241895261846, + "grad_norm": 0.03280205652117729, + "learning_rate": 6.52568578553616e-06, + "loss": 0.0372, + "step": 27030 + }, + { + "epoch": 3.371571072319202, + "grad_norm": 0.002701780293136835, + "learning_rate": 6.52069825436409e-06, + "loss": 0.0098, + "step": 27040 + }, + { + "epoch": 3.3728179551122195, + "grad_norm": 0.0005019395030103624, + "learning_rate": 6.51571072319202e-06, + "loss": 0.0017, + "step": 27050 + }, + { + "epoch": 3.374064837905237, + "grad_norm": 0.000222506292629987, + "learning_rate": 6.510723192019951e-06, + "loss": 0.0001, + "step": 27060 + }, + { + "epoch": 3.3753117206982544, + "grad_norm": 0.05252045765519142, + "learning_rate": 6.50573566084788e-06, + "loss": 0.0001, + "step": 27070 + }, + { + "epoch": 3.376558603491272, + "grad_norm": 0.0026390505954623222, + "learning_rate": 6.500748129675811e-06, + "loss": 0.0, + "step": 27080 + }, + { + "epoch": 3.3778054862842892, + "grad_norm": 0.0017233664402738214, + "learning_rate": 6.495760598503741e-06, + "loss": 0.0001, + "step": 27090 + }, + { + "epoch": 3.3790523690773067, + "grad_norm": 0.000762695271987468, + 
"learning_rate": 6.490773067331671e-06, + "loss": 0.0001, + "step": 27100 + }, + { + "epoch": 3.380299251870324, + "grad_norm": 0.03908080980181694, + "learning_rate": 6.485785536159601e-06, + "loss": 0.0, + "step": 27110 + }, + { + "epoch": 3.3815461346633415, + "grad_norm": 0.009048929437994957, + "learning_rate": 6.480798004987532e-06, + "loss": 0.0783, + "step": 27120 + }, + { + "epoch": 3.382793017456359, + "grad_norm": 0.0003922952164430171, + "learning_rate": 6.475810473815462e-06, + "loss": 0.0, + "step": 27130 + }, + { + "epoch": 3.3840399002493764, + "grad_norm": 0.0037547594401985407, + "learning_rate": 6.470822942643392e-06, + "loss": 0.0003, + "step": 27140 + }, + { + "epoch": 3.385286783042394, + "grad_norm": 0.002254948951303959, + "learning_rate": 6.465835411471322e-06, + "loss": 0.0, + "step": 27150 + }, + { + "epoch": 3.3865336658354117, + "grad_norm": 0.0015663010999560356, + "learning_rate": 6.460847880299253e-06, + "loss": 0.0001, + "step": 27160 + }, + { + "epoch": 3.387780548628429, + "grad_norm": 0.009484532289206982, + "learning_rate": 6.455860349127182e-06, + "loss": 0.0005, + "step": 27170 + }, + { + "epoch": 3.3890274314214466, + "grad_norm": 1.5235114097595215, + "learning_rate": 6.450872817955113e-06, + "loss": 0.0036, + "step": 27180 + }, + { + "epoch": 3.390274314214464, + "grad_norm": 0.000923574494663626, + "learning_rate": 6.445885286783043e-06, + "loss": 0.0002, + "step": 27190 + }, + { + "epoch": 3.3915211970074814, + "grad_norm": 0.00219345442019403, + "learning_rate": 6.440897755610973e-06, + "loss": 0.0, + "step": 27200 + }, + { + "epoch": 3.392768079800499, + "grad_norm": 0.00026506150607019663, + "learning_rate": 6.435910224438904e-06, + "loss": 0.0404, + "step": 27210 + }, + { + "epoch": 3.3940149625935163, + "grad_norm": 0.00028258541715331376, + "learning_rate": 6.430922693266834e-06, + "loss": 0.0002, + "step": 27220 + }, + { + "epoch": 3.3952618453865338, + "grad_norm": 0.0068185459822416306, + "learning_rate": 
6.425935162094764e-06, + "loss": 0.0003, + "step": 27230 + }, + { + "epoch": 3.396508728179551, + "grad_norm": 0.0003236836346331984, + "learning_rate": 6.421446384039901e-06, + "loss": 0.0054, + "step": 27240 + }, + { + "epoch": 3.3977556109725686, + "grad_norm": 0.0008242797921411693, + "learning_rate": 6.416458852867831e-06, + "loss": 0.0, + "step": 27250 + }, + { + "epoch": 3.399002493765586, + "grad_norm": 0.00034484322532080114, + "learning_rate": 6.411471321695761e-06, + "loss": 0.0397, + "step": 27260 + }, + { + "epoch": 3.4002493765586035, + "grad_norm": 0.0011520058615133166, + "learning_rate": 6.406483790523691e-06, + "loss": 0.0, + "step": 27270 + }, + { + "epoch": 3.401496259351621, + "grad_norm": 0.00017111722263507545, + "learning_rate": 6.401496259351622e-06, + "loss": 0.0, + "step": 27280 + }, + { + "epoch": 3.4027431421446384, + "grad_norm": 0.0003394628001842648, + "learning_rate": 6.3965087281795515e-06, + "loss": 0.0, + "step": 27290 + }, + { + "epoch": 3.403990024937656, + "grad_norm": 0.00040050517418421805, + "learning_rate": 6.391521197007482e-06, + "loss": 0.0, + "step": 27300 + }, + { + "epoch": 3.4052369077306732, + "grad_norm": 0.0007083789096213877, + "learning_rate": 6.386533665835412e-06, + "loss": 0.0, + "step": 27310 + }, + { + "epoch": 3.4064837905236907, + "grad_norm": 0.0008792931330390275, + "learning_rate": 6.3815461346633425e-06, + "loss": 0.0, + "step": 27320 + }, + { + "epoch": 3.407730673316708, + "grad_norm": 0.00042291832505725324, + "learning_rate": 6.376558603491272e-06, + "loss": 0.0002, + "step": 27330 + }, + { + "epoch": 3.4089775561097255, + "grad_norm": 0.015671808272600174, + "learning_rate": 6.371571072319203e-06, + "loss": 0.0001, + "step": 27340 + }, + { + "epoch": 3.410224438902743, + "grad_norm": 0.00033854617504402995, + "learning_rate": 6.366583541147132e-06, + "loss": 0.0002, + "step": 27350 + }, + { + "epoch": 3.4114713216957604, + "grad_norm": 0.015324210748076439, + "learning_rate": 
6.361596009975063e-06, + "loss": 0.0, + "step": 27360 + }, + { + "epoch": 3.412718204488778, + "grad_norm": 0.00865886453539133, + "learning_rate": 6.356608478802994e-06, + "loss": 0.0, + "step": 27370 + }, + { + "epoch": 3.4139650872817953, + "grad_norm": 0.00048793648602440953, + "learning_rate": 6.351620947630923e-06, + "loss": 0.0015, + "step": 27380 + }, + { + "epoch": 3.415211970074813, + "grad_norm": 0.0030084741301834583, + "learning_rate": 6.346633416458854e-06, + "loss": 0.0, + "step": 27390 + }, + { + "epoch": 3.4164588528678306, + "grad_norm": 0.0003942911571357399, + "learning_rate": 6.341645885286783e-06, + "loss": 0.0, + "step": 27400 + }, + { + "epoch": 3.417705735660848, + "grad_norm": 0.002323322230949998, + "learning_rate": 6.336658354114714e-06, + "loss": 0.0, + "step": 27410 + }, + { + "epoch": 3.4189526184538654, + "grad_norm": 0.0004297247505746782, + "learning_rate": 6.331670822942644e-06, + "loss": 0.0, + "step": 27420 + }, + { + "epoch": 3.420199501246883, + "grad_norm": 0.005649714730679989, + "learning_rate": 6.326683291770574e-06, + "loss": 0.0001, + "step": 27430 + }, + { + "epoch": 3.4214463840399003, + "grad_norm": 0.013090058229863644, + "learning_rate": 6.321695760598504e-06, + "loss": 0.0142, + "step": 27440 + }, + { + "epoch": 3.4226932668329177, + "grad_norm": 0.018311014398932457, + "learning_rate": 6.316708229426435e-06, + "loss": 0.0008, + "step": 27450 + }, + { + "epoch": 3.423940149625935, + "grad_norm": 0.0004497721092775464, + "learning_rate": 6.311720698254364e-06, + "loss": 0.0001, + "step": 27460 + }, + { + "epoch": 3.4251870324189526, + "grad_norm": 59.970619201660156, + "learning_rate": 6.306733167082295e-06, + "loss": 0.0225, + "step": 27470 + }, + { + "epoch": 3.42643391521197, + "grad_norm": 0.00020840381330344826, + "learning_rate": 6.301745635910225e-06, + "loss": 0.0004, + "step": 27480 + }, + { + "epoch": 3.4276807980049875, + "grad_norm": 0.002133867936208844, + "learning_rate": 6.296758104738155e-06, + 
"loss": 0.0, + "step": 27490 + }, + { + "epoch": 3.428927680798005, + "grad_norm": 0.00448261946439743, + "learning_rate": 6.291770573566085e-06, + "loss": 0.0, + "step": 27500 + }, + { + "epoch": 3.4301745635910224, + "grad_norm": 7.075658321380615, + "learning_rate": 6.286783042394016e-06, + "loss": 0.0531, + "step": 27510 + }, + { + "epoch": 3.43142144638404, + "grad_norm": 0.00027553230756893754, + "learning_rate": 6.2817955112219456e-06, + "loss": 0.0, + "step": 27520 + }, + { + "epoch": 3.432668329177057, + "grad_norm": 0.0008446983993053436, + "learning_rate": 6.276807980049876e-06, + "loss": 0.0001, + "step": 27530 + }, + { + "epoch": 3.4339152119700747, + "grad_norm": 0.00023807137040421367, + "learning_rate": 6.271820448877806e-06, + "loss": 0.0, + "step": 27540 + }, + { + "epoch": 3.435162094763092, + "grad_norm": 0.00029441763763315976, + "learning_rate": 6.266832917705737e-06, + "loss": 0.1117, + "step": 27550 + }, + { + "epoch": 3.43640897755611, + "grad_norm": 0.005209818482398987, + "learning_rate": 6.2618453865336655e-06, + "loss": 0.0056, + "step": 27560 + }, + { + "epoch": 3.4376558603491274, + "grad_norm": 0.035485655069351196, + "learning_rate": 6.256857855361597e-06, + "loss": 0.0, + "step": 27570 + }, + { + "epoch": 3.438902743142145, + "grad_norm": 0.0030119556467980146, + "learning_rate": 6.251870324189526e-06, + "loss": 0.0, + "step": 27580 + }, + { + "epoch": 3.4401496259351623, + "grad_norm": 0.0007518244674429297, + "learning_rate": 6.2468827930174565e-06, + "loss": 0.0, + "step": 27590 + }, + { + "epoch": 3.4413965087281797, + "grad_norm": 0.000781845476012677, + "learning_rate": 6.241895261845386e-06, + "loss": 0.0, + "step": 27600 + }, + { + "epoch": 3.442643391521197, + "grad_norm": 0.046625442802906036, + "learning_rate": 6.236907730673317e-06, + "loss": 0.0, + "step": 27610 + }, + { + "epoch": 3.4438902743142146, + "grad_norm": 0.0009465779294259846, + "learning_rate": 6.2319201995012475e-06, + "loss": 0.0124, + "step": 27620 + }, 
+ { + "epoch": 3.445137157107232, + "grad_norm": 6.235825538635254, + "learning_rate": 6.226932668329177e-06, + "loss": 0.0005, + "step": 27630 + }, + { + "epoch": 3.4463840399002494, + "grad_norm": 0.00022234499920159578, + "learning_rate": 6.221945137157108e-06, + "loss": 0.0, + "step": 27640 + }, + { + "epoch": 3.447630922693267, + "grad_norm": 0.00037174602039158344, + "learning_rate": 6.216957605985038e-06, + "loss": 0.0032, + "step": 27650 + }, + { + "epoch": 3.4488778054862843, + "grad_norm": 0.0011786530958488584, + "learning_rate": 6.211970074812968e-06, + "loss": 0.0, + "step": 27660 + }, + { + "epoch": 3.4501246882793017, + "grad_norm": 0.0008283031056635082, + "learning_rate": 6.206982543640898e-06, + "loss": 0.0002, + "step": 27670 + }, + { + "epoch": 3.451371571072319, + "grad_norm": 0.00013709701306652278, + "learning_rate": 6.201995012468829e-06, + "loss": 0.0264, + "step": 27680 + }, + { + "epoch": 3.4526184538653366, + "grad_norm": 0.0002293088473379612, + "learning_rate": 6.1970074812967585e-06, + "loss": 0.0, + "step": 27690 + }, + { + "epoch": 3.453865336658354, + "grad_norm": 0.0011270396644249558, + "learning_rate": 6.192019950124689e-06, + "loss": 0.0186, + "step": 27700 + }, + { + "epoch": 3.4551122194513715, + "grad_norm": 0.002886722795665264, + "learning_rate": 6.187032418952619e-06, + "loss": 0.0, + "step": 27710 + }, + { + "epoch": 3.456359102244389, + "grad_norm": 0.002366556553170085, + "learning_rate": 6.1820448877805495e-06, + "loss": 0.0, + "step": 27720 + }, + { + "epoch": 3.4576059850374063, + "grad_norm": 0.00040443125180900097, + "learning_rate": 6.177057356608479e-06, + "loss": 0.0, + "step": 27730 + }, + { + "epoch": 3.458852867830424, + "grad_norm": 8.750553131103516, + "learning_rate": 6.17206982543641e-06, + "loss": 0.0339, + "step": 27740 + }, + { + "epoch": 3.460099750623441, + "grad_norm": 0.00026490926393307745, + "learning_rate": 6.16708229426434e-06, + "loss": 0.0, + "step": 27750 + }, + { + "epoch": 
3.4613466334164587, + "grad_norm": 0.0013308553025126457, + "learning_rate": 6.16209476309227e-06, + "loss": 0.0, + "step": 27760 + }, + { + "epoch": 3.462593516209476, + "grad_norm": 0.000745519355405122, + "learning_rate": 6.1571072319202e-06, + "loss": 0.0037, + "step": 27770 + }, + { + "epoch": 3.4638403990024935, + "grad_norm": 0.00035905808908864856, + "learning_rate": 6.152119700748131e-06, + "loss": 0.0097, + "step": 27780 + }, + { + "epoch": 3.4650872817955114, + "grad_norm": 0.04515613242983818, + "learning_rate": 6.14713216957606e-06, + "loss": 0.0001, + "step": 27790 + }, + { + "epoch": 3.466334164588529, + "grad_norm": 0.0003502692561596632, + "learning_rate": 6.142144638403991e-06, + "loss": 0.0001, + "step": 27800 + }, + { + "epoch": 3.4675810473815463, + "grad_norm": 0.0015125253703445196, + "learning_rate": 6.13715710723192e-06, + "loss": 0.0, + "step": 27810 + }, + { + "epoch": 3.4688279301745637, + "grad_norm": 0.00017602364823687822, + "learning_rate": 6.132169576059851e-06, + "loss": 0.0, + "step": 27820 + }, + { + "epoch": 3.470074812967581, + "grad_norm": 0.0025234976783394814, + "learning_rate": 6.12718204488778e-06, + "loss": 0.0, + "step": 27830 + }, + { + "epoch": 3.4713216957605986, + "grad_norm": 0.0006344380090013146, + "learning_rate": 6.122194513715711e-06, + "loss": 0.0, + "step": 27840 + }, + { + "epoch": 3.472568578553616, + "grad_norm": 0.0031119780614972115, + "learning_rate": 6.117206982543641e-06, + "loss": 0.0, + "step": 27850 + }, + { + "epoch": 3.4738154613466334, + "grad_norm": 0.0002057456149486825, + "learning_rate": 6.112219451371571e-06, + "loss": 0.0011, + "step": 27860 + }, + { + "epoch": 3.475062344139651, + "grad_norm": 0.0012670621508732438, + "learning_rate": 6.107231920199502e-06, + "loss": 0.0003, + "step": 27870 + }, + { + "epoch": 3.4763092269326683, + "grad_norm": 0.0004024482041131705, + "learning_rate": 6.102244389027432e-06, + "loss": 0.0059, + "step": 27880 + }, + { + "epoch": 3.4775561097256857, + 
"grad_norm": 53.51555252075195, + "learning_rate": 6.097256857855362e-06, + "loss": 0.0405, + "step": 27890 + }, + { + "epoch": 3.478802992518703, + "grad_norm": 0.00030399305978789926, + "learning_rate": 6.092269326683292e-06, + "loss": 0.0, + "step": 27900 + }, + { + "epoch": 3.4800498753117206, + "grad_norm": 0.058239106088876724, + "learning_rate": 6.087281795511223e-06, + "loss": 0.0238, + "step": 27910 + }, + { + "epoch": 3.481296758104738, + "grad_norm": 0.0014490766916424036, + "learning_rate": 6.0822942643391526e-06, + "loss": 0.0, + "step": 27920 + }, + { + "epoch": 3.4825436408977555, + "grad_norm": 0.0002840702945832163, + "learning_rate": 6.077306733167083e-06, + "loss": 0.0029, + "step": 27930 + }, + { + "epoch": 3.483790523690773, + "grad_norm": 0.005863568279892206, + "learning_rate": 6.072319201995013e-06, + "loss": 0.0, + "step": 27940 + }, + { + "epoch": 3.4850374064837903, + "grad_norm": 0.00029472613823600113, + "learning_rate": 6.0673316708229436e-06, + "loss": 0.0, + "step": 27950 + }, + { + "epoch": 3.4862842892768082, + "grad_norm": 0.0002991229121107608, + "learning_rate": 6.062344139650873e-06, + "loss": 0.0, + "step": 27960 + }, + { + "epoch": 3.4875311720698257, + "grad_norm": 0.0006273513427004218, + "learning_rate": 6.057356608478804e-06, + "loss": 0.0002, + "step": 27970 + }, + { + "epoch": 3.488778054862843, + "grad_norm": 0.0022930046543478966, + "learning_rate": 6.052369077306734e-06, + "loss": 0.0002, + "step": 27980 + }, + { + "epoch": 3.4900249376558605, + "grad_norm": 0.00040200044168159366, + "learning_rate": 6.047381546134664e-06, + "loss": 0.0, + "step": 27990 + }, + { + "epoch": 3.491271820448878, + "grad_norm": 0.0002788232814054936, + "learning_rate": 6.042394014962594e-06, + "loss": 0.0032, + "step": 28000 + }, + { + "epoch": 3.4925187032418954, + "grad_norm": 0.0012403487926349044, + "learning_rate": 6.037406483790525e-06, + "loss": 0.0, + "step": 28010 + }, + { + "epoch": 3.493765586034913, + "grad_norm": 
0.0010406679939478636, + "learning_rate": 6.032418952618454e-06, + "loss": 0.0553, + "step": 28020 + }, + { + "epoch": 3.4950124688279303, + "grad_norm": 0.0004845515941269696, + "learning_rate": 6.027431421446385e-06, + "loss": 0.0039, + "step": 28030 + }, + { + "epoch": 3.4962593516209477, + "grad_norm": 0.11747261136770248, + "learning_rate": 6.022443890274314e-06, + "loss": 0.001, + "step": 28040 + }, + { + "epoch": 3.497506234413965, + "grad_norm": 0.0910341814160347, + "learning_rate": 6.017456359102245e-06, + "loss": 0.0, + "step": 28050 + }, + { + "epoch": 3.4987531172069826, + "grad_norm": 0.0005173732060939074, + "learning_rate": 6.0124688279301745e-06, + "loss": 0.0159, + "step": 28060 + }, + { + "epoch": 3.5, + "grad_norm": 0.002428569132462144, + "learning_rate": 6.007481296758105e-06, + "loss": 0.0007, + "step": 28070 + }, + { + "epoch": 3.5012468827930174, + "grad_norm": 0.00020148402836639434, + "learning_rate": 6.002493765586035e-06, + "loss": 0.0, + "step": 28080 + }, + { + "epoch": 3.502493765586035, + "grad_norm": 26.454116821289062, + "learning_rate": 5.9975062344139655e-06, + "loss": 0.0262, + "step": 28090 + }, + { + "epoch": 3.5037406483790523, + "grad_norm": 0.00031432858668267727, + "learning_rate": 5.992518703241895e-06, + "loss": 0.0001, + "step": 28100 + }, + { + "epoch": 3.5049875311720697, + "grad_norm": 0.00043840528815053403, + "learning_rate": 5.987531172069826e-06, + "loss": 0.0011, + "step": 28110 + }, + { + "epoch": 3.506234413965087, + "grad_norm": 0.007529322523623705, + "learning_rate": 5.9825436408977565e-06, + "loss": 0.0, + "step": 28120 + }, + { + "epoch": 3.5074812967581046, + "grad_norm": 0.00011467208241811022, + "learning_rate": 5.977556109725686e-06, + "loss": 0.0, + "step": 28130 + }, + { + "epoch": 3.508728179551122, + "grad_norm": 0.002503705210983753, + "learning_rate": 5.972568578553617e-06, + "loss": 0.0522, + "step": 28140 + }, + { + "epoch": 3.5099750623441395, + "grad_norm": 0.01109258271753788, + 
"learning_rate": 5.967581047381547e-06, + "loss": 0.0007, + "step": 28150 + }, + { + "epoch": 3.511221945137157, + "grad_norm": 0.002344650449231267, + "learning_rate": 5.962593516209477e-06, + "loss": 0.0001, + "step": 28160 + }, + { + "epoch": 3.5124688279301743, + "grad_norm": 0.0004216936940792948, + "learning_rate": 5.957605985037407e-06, + "loss": 0.0543, + "step": 28170 + }, + { + "epoch": 3.5137157107231918, + "grad_norm": 0.0004381619510240853, + "learning_rate": 5.952618453865338e-06, + "loss": 0.0217, + "step": 28180 + }, + { + "epoch": 3.514962593516209, + "grad_norm": 0.00039310386637225747, + "learning_rate": 5.947630922693267e-06, + "loss": 0.0498, + "step": 28190 + }, + { + "epoch": 3.516209476309227, + "grad_norm": 0.0005476093501783907, + "learning_rate": 5.942643391521198e-06, + "loss": 0.0009, + "step": 28200 + }, + { + "epoch": 3.5174563591022445, + "grad_norm": 0.00018081016605719924, + "learning_rate": 5.937655860349128e-06, + "loss": 0.0, + "step": 28210 + }, + { + "epoch": 3.518703241895262, + "grad_norm": 23.967710494995117, + "learning_rate": 5.9326683291770584e-06, + "loss": 0.0058, + "step": 28220 + }, + { + "epoch": 3.5199501246882794, + "grad_norm": 0.0003085663774982095, + "learning_rate": 5.927680798004987e-06, + "loss": 0.0055, + "step": 28230 + }, + { + "epoch": 3.521197007481297, + "grad_norm": 0.0006709218141622841, + "learning_rate": 5.922693266832919e-06, + "loss": 0.0149, + "step": 28240 + }, + { + "epoch": 3.5224438902743143, + "grad_norm": 0.00043505855137482285, + "learning_rate": 5.917705735660848e-06, + "loss": 0.0757, + "step": 28250 + }, + { + "epoch": 3.5236907730673317, + "grad_norm": 0.00016124600369948894, + "learning_rate": 5.912718204488778e-06, + "loss": 0.0001, + "step": 28260 + }, + { + "epoch": 3.524937655860349, + "grad_norm": 0.008092806674540043, + "learning_rate": 5.907730673316708e-06, + "loss": 0.0004, + "step": 28270 + }, + { + "epoch": 3.5261845386533666, + "grad_norm": 0.0011323315557092428, + 
"learning_rate": 5.902743142144639e-06, + "loss": 0.0, + "step": 28280 + }, + { + "epoch": 3.527431421446384, + "grad_norm": 0.08847978711128235, + "learning_rate": 5.8977556109725685e-06, + "loss": 0.0344, + "step": 28290 + }, + { + "epoch": 3.5286783042394014, + "grad_norm": 0.0006359029794111848, + "learning_rate": 5.892768079800499e-06, + "loss": 0.0, + "step": 28300 + }, + { + "epoch": 3.529925187032419, + "grad_norm": 0.0006071751704439521, + "learning_rate": 5.887780548628429e-06, + "loss": 0.0331, + "step": 28310 + }, + { + "epoch": 3.5311720698254363, + "grad_norm": 0.0004466983664315194, + "learning_rate": 5.8827930174563595e-06, + "loss": 0.0, + "step": 28320 + }, + { + "epoch": 3.5324189526184537, + "grad_norm": 0.0018139990279451013, + "learning_rate": 5.877805486284289e-06, + "loss": 0.0, + "step": 28330 + }, + { + "epoch": 3.533665835411471, + "grad_norm": 0.0002320937201147899, + "learning_rate": 5.87281795511222e-06, + "loss": 0.0003, + "step": 28340 + }, + { + "epoch": 3.534912718204489, + "grad_norm": 0.0004413308924995363, + "learning_rate": 5.86783042394015e-06, + "loss": 0.0, + "step": 28350 + }, + { + "epoch": 3.5361596009975065, + "grad_norm": 0.27286967635154724, + "learning_rate": 5.86284289276808e-06, + "loss": 0.0516, + "step": 28360 + }, + { + "epoch": 3.537406483790524, + "grad_norm": 0.00041090993909165263, + "learning_rate": 5.857855361596011e-06, + "loss": 0.0285, + "step": 28370 + }, + { + "epoch": 3.5386533665835413, + "grad_norm": 0.00043929347884841263, + "learning_rate": 5.852867830423941e-06, + "loss": 0.0, + "step": 28380 + }, + { + "epoch": 3.539900249376559, + "grad_norm": 1.708524227142334, + "learning_rate": 5.847880299251871e-06, + "loss": 0.0391, + "step": 28390 + }, + { + "epoch": 3.541147132169576, + "grad_norm": 0.0006791963241994381, + "learning_rate": 5.842892768079801e-06, + "loss": 0.0002, + "step": 28400 + }, + { + "epoch": 3.5423940149625937, + "grad_norm": 0.0003175217134412378, + "learning_rate": 
5.837905236907732e-06, + "loss": 0.0, + "step": 28410 + }, + { + "epoch": 3.543640897755611, + "grad_norm": 0.00026248840731568635, + "learning_rate": 5.8329177057356615e-06, + "loss": 0.0011, + "step": 28420 + }, + { + "epoch": 3.5448877805486285, + "grad_norm": 28.28713607788086, + "learning_rate": 5.827930174563592e-06, + "loss": 0.0493, + "step": 28430 + }, + { + "epoch": 3.546134663341646, + "grad_norm": 0.0004381363105494529, + "learning_rate": 5.822942643391522e-06, + "loss": 0.0476, + "step": 28440 + }, + { + "epoch": 3.5473815461346634, + "grad_norm": 0.00020880838565062732, + "learning_rate": 5.8179551122194525e-06, + "loss": 0.0, + "step": 28450 + }, + { + "epoch": 3.548628428927681, + "grad_norm": 0.0002543655864428729, + "learning_rate": 5.8129675810473814e-06, + "loss": 0.0513, + "step": 28460 + }, + { + "epoch": 3.5498753117206983, + "grad_norm": 0.0017961261328309774, + "learning_rate": 5.807980049875313e-06, + "loss": 0.0253, + "step": 28470 + }, + { + "epoch": 3.5511221945137157, + "grad_norm": 0.0062349289655685425, + "learning_rate": 5.802992518703242e-06, + "loss": 0.0, + "step": 28480 + }, + { + "epoch": 3.552369077306733, + "grad_norm": 45.63996124267578, + "learning_rate": 5.7980049875311725e-06, + "loss": 0.0339, + "step": 28490 + }, + { + "epoch": 3.5536159600997506, + "grad_norm": 0.021966515108942986, + "learning_rate": 5.793017456359102e-06, + "loss": 0.0, + "step": 28500 + }, + { + "epoch": 3.554862842892768, + "grad_norm": 0.007263632025569677, + "learning_rate": 5.788029925187033e-06, + "loss": 0.0001, + "step": 28510 + }, + { + "epoch": 3.5561097256857854, + "grad_norm": 0.001259212614968419, + "learning_rate": 5.783042394014963e-06, + "loss": 0.0, + "step": 28520 + }, + { + "epoch": 3.557356608478803, + "grad_norm": 0.0005705059738829732, + "learning_rate": 5.778054862842893e-06, + "loss": 0.0, + "step": 28530 + }, + { + "epoch": 3.5586034912718203, + "grad_norm": 0.012875759974122047, + "learning_rate": 5.773067331670823e-06, + 
"loss": 0.0002, + "step": 28540 + }, + { + "epoch": 3.5598503740648377, + "grad_norm": 0.002192272339016199, + "learning_rate": 5.768079800498754e-06, + "loss": 0.0009, + "step": 28550 + }, + { + "epoch": 3.561097256857855, + "grad_norm": 0.00031267275335267186, + "learning_rate": 5.763092269326683e-06, + "loss": 0.0152, + "step": 28560 + }, + { + "epoch": 3.5623441396508726, + "grad_norm": 0.00037283176789060235, + "learning_rate": 5.758104738154614e-06, + "loss": 0.0001, + "step": 28570 + }, + { + "epoch": 3.56359102244389, + "grad_norm": 0.0001530810259282589, + "learning_rate": 5.753117206982544e-06, + "loss": 0.0, + "step": 28580 + }, + { + "epoch": 3.5648379052369075, + "grad_norm": 0.0004657857643906027, + "learning_rate": 5.748129675810474e-06, + "loss": 0.0034, + "step": 28590 + }, + { + "epoch": 3.5660847880299253, + "grad_norm": 0.000526457850355655, + "learning_rate": 5.743142144638404e-06, + "loss": 0.0, + "step": 28600 + }, + { + "epoch": 3.567331670822943, + "grad_norm": 0.0016893032006919384, + "learning_rate": 5.738154613466335e-06, + "loss": 0.0001, + "step": 28610 + }, + { + "epoch": 3.56857855361596, + "grad_norm": 0.0004379678575787693, + "learning_rate": 5.733167082294265e-06, + "loss": 0.0001, + "step": 28620 + }, + { + "epoch": 3.5698254364089776, + "grad_norm": 0.00044744761544279754, + "learning_rate": 5.728179551122195e-06, + "loss": 0.0, + "step": 28630 + }, + { + "epoch": 3.571072319201995, + "grad_norm": 0.0023382350336760283, + "learning_rate": 5.723192019950126e-06, + "loss": 0.0, + "step": 28640 + }, + { + "epoch": 3.5723192019950125, + "grad_norm": 0.0007506693946197629, + "learning_rate": 5.718204488778056e-06, + "loss": 0.0, + "step": 28650 + }, + { + "epoch": 3.57356608478803, + "grad_norm": 0.0003731877659447491, + "learning_rate": 5.713216957605986e-06, + "loss": 0.0001, + "step": 28660 + }, + { + "epoch": 3.5748129675810474, + "grad_norm": 0.0005579161224886775, + "learning_rate": 5.708229426433915e-06, + "loss": 0.0001, + 
"step": 28670 + }, + { + "epoch": 3.576059850374065, + "grad_norm": 0.000176784407813102, + "learning_rate": 5.703241895261847e-06, + "loss": 0.0002, + "step": 28680 + }, + { + "epoch": 3.5773067331670823, + "grad_norm": 0.0005845030536875129, + "learning_rate": 5.6982543640897755e-06, + "loss": 0.0216, + "step": 28690 + }, + { + "epoch": 3.5785536159600997, + "grad_norm": 4.049169540405273, + "learning_rate": 5.693266832917706e-06, + "loss": 0.0007, + "step": 28700 + }, + { + "epoch": 3.579800498753117, + "grad_norm": 0.0001853818102972582, + "learning_rate": 5.688279301745636e-06, + "loss": 0.0041, + "step": 28710 + }, + { + "epoch": 3.5810473815461346, + "grad_norm": 0.003477283287793398, + "learning_rate": 5.6832917705735665e-06, + "loss": 0.0, + "step": 28720 + }, + { + "epoch": 3.582294264339152, + "grad_norm": 0.002856282517313957, + "learning_rate": 5.678304239401496e-06, + "loss": 0.0, + "step": 28730 + }, + { + "epoch": 3.5835411471321694, + "grad_norm": 0.00022709465702064335, + "learning_rate": 5.673316708229427e-06, + "loss": 0.0001, + "step": 28740 + }, + { + "epoch": 3.5847880299251873, + "grad_norm": 0.0019931760616600513, + "learning_rate": 5.668329177057357e-06, + "loss": 0.0, + "step": 28750 + }, + { + "epoch": 3.5860349127182047, + "grad_norm": 0.0034395332913845778, + "learning_rate": 5.663341645885287e-06, + "loss": 0.0, + "step": 28760 + }, + { + "epoch": 3.587281795511222, + "grad_norm": 0.0003281449025962502, + "learning_rate": 5.658354114713217e-06, + "loss": 0.0, + "step": 28770 + }, + { + "epoch": 3.5885286783042396, + "grad_norm": 0.00019891714327968657, + "learning_rate": 5.653366583541148e-06, + "loss": 0.0, + "step": 28780 + }, + { + "epoch": 3.589775561097257, + "grad_norm": 0.0002504756848793477, + "learning_rate": 5.6483790523690775e-06, + "loss": 0.0, + "step": 28790 + }, + { + "epoch": 3.5910224438902745, + "grad_norm": 0.0025514832232147455, + "learning_rate": 5.643391521197008e-06, + "loss": 0.0, + "step": 28800 + }, + { + 
"epoch": 3.592269326683292, + "grad_norm": 0.00017150930943898857, + "learning_rate": 5.638403990024938e-06, + "loss": 0.0, + "step": 28810 + }, + { + "epoch": 3.5935162094763093, + "grad_norm": 0.153523251414299, + "learning_rate": 5.6334164588528685e-06, + "loss": 0.0001, + "step": 28820 + }, + { + "epoch": 3.5947630922693268, + "grad_norm": 0.00042072118958458304, + "learning_rate": 5.628428927680798e-06, + "loss": 0.0, + "step": 28830 + }, + { + "epoch": 3.596009975062344, + "grad_norm": 0.0004099764919374138, + "learning_rate": 5.623441396508729e-06, + "loss": 0.0404, + "step": 28840 + }, + { + "epoch": 3.5972568578553616, + "grad_norm": 0.00011178813292644918, + "learning_rate": 5.618453865336659e-06, + "loss": 0.0001, + "step": 28850 + }, + { + "epoch": 3.598503740648379, + "grad_norm": 0.005052752792835236, + "learning_rate": 5.613466334164589e-06, + "loss": 0.0, + "step": 28860 + }, + { + "epoch": 3.5997506234413965, + "grad_norm": 0.00014898031076882035, + "learning_rate": 5.60847880299252e-06, + "loss": 0.0001, + "step": 28870 + }, + { + "epoch": 3.600997506234414, + "grad_norm": 0.001589166116900742, + "learning_rate": 5.60349127182045e-06, + "loss": 0.0727, + "step": 28880 + }, + { + "epoch": 3.6022443890274314, + "grad_norm": 0.0010678882244974375, + "learning_rate": 5.59850374064838e-06, + "loss": 0.0553, + "step": 28890 + }, + { + "epoch": 3.603491271820449, + "grad_norm": 0.0002836325438693166, + "learning_rate": 5.593516209476309e-06, + "loss": 0.0, + "step": 28900 + }, + { + "epoch": 3.6047381546134662, + "grad_norm": 0.0003612026630435139, + "learning_rate": 5.588528678304241e-06, + "loss": 0.0001, + "step": 28910 + }, + { + "epoch": 3.6059850374064837, + "grad_norm": 0.0014816455077379942, + "learning_rate": 5.58354114713217e-06, + "loss": 0.0009, + "step": 28920 + }, + { + "epoch": 3.607231920199501, + "grad_norm": 0.00015595743025187403, + "learning_rate": 5.5785536159601e-06, + "loss": 0.0001, + "step": 28930 + }, + { + "epoch": 
3.6084788029925186, + "grad_norm": 0.0009463661117479205, + "learning_rate": 5.57356608478803e-06, + "loss": 0.0, + "step": 28940 + }, + { + "epoch": 3.609725685785536, + "grad_norm": 0.11780844628810883, + "learning_rate": 5.568578553615961e-06, + "loss": 0.0004, + "step": 28950 + }, + { + "epoch": 3.6109725685785534, + "grad_norm": 0.0011268043890595436, + "learning_rate": 5.56359102244389e-06, + "loss": 0.0, + "step": 28960 + }, + { + "epoch": 3.612219451371571, + "grad_norm": 0.0016340049915015697, + "learning_rate": 5.558603491271821e-06, + "loss": 0.0, + "step": 28970 + }, + { + "epoch": 3.6134663341645883, + "grad_norm": 0.004008717834949493, + "learning_rate": 5.553615960099751e-06, + "loss": 0.0, + "step": 28980 + }, + { + "epoch": 3.6147132169576057, + "grad_norm": 0.0002216768334619701, + "learning_rate": 5.548628428927681e-06, + "loss": 0.0, + "step": 28990 + }, + { + "epoch": 3.6159600997506236, + "grad_norm": 0.00013463239884003997, + "learning_rate": 5.543640897755611e-06, + "loss": 0.0003, + "step": 29000 + }, + { + "epoch": 3.617206982543641, + "grad_norm": 0.00029779941542074084, + "learning_rate": 5.538653366583542e-06, + "loss": 0.0, + "step": 29010 + }, + { + "epoch": 3.6184538653366585, + "grad_norm": 0.00019660868565551937, + "learning_rate": 5.5336658354114716e-06, + "loss": 0.0042, + "step": 29020 + }, + { + "epoch": 3.619700748129676, + "grad_norm": 0.0009630105341784656, + "learning_rate": 5.528678304239402e-06, + "loss": 0.0, + "step": 29030 + }, + { + "epoch": 3.6209476309226933, + "grad_norm": 0.03896058723330498, + "learning_rate": 5.523690773067332e-06, + "loss": 0.0, + "step": 29040 + }, + { + "epoch": 3.6221945137157108, + "grad_norm": 0.0003263501566834748, + "learning_rate": 5.5187032418952626e-06, + "loss": 0.0077, + "step": 29050 + }, + { + "epoch": 3.623441396508728, + "grad_norm": 0.003972560167312622, + "learning_rate": 5.513715710723192e-06, + "loss": 0.0, + "step": 29060 + }, + { + "epoch": 3.6246882793017456, + 
"grad_norm": 0.001412399928085506, + "learning_rate": 5.508728179551123e-06, + "loss": 0.0, + "step": 29070 + }, + { + "epoch": 3.625935162094763, + "grad_norm": 0.00019415126007515937, + "learning_rate": 5.503740648379052e-06, + "loss": 0.0039, + "step": 29080 + }, + { + "epoch": 3.6271820448877805, + "grad_norm": 0.00024378967646043748, + "learning_rate": 5.498753117206983e-06, + "loss": 0.0444, + "step": 29090 + }, + { + "epoch": 3.628428927680798, + "grad_norm": 0.0002585103502497077, + "learning_rate": 5.493765586034914e-06, + "loss": 0.0, + "step": 29100 + }, + { + "epoch": 3.6296758104738154, + "grad_norm": 0.0010230403859168291, + "learning_rate": 5.488778054862843e-06, + "loss": 0.0, + "step": 29110 + }, + { + "epoch": 3.630922693266833, + "grad_norm": 0.00033736188197508454, + "learning_rate": 5.483790523690774e-06, + "loss": 0.0, + "step": 29120 + }, + { + "epoch": 3.6321695760598502, + "grad_norm": 0.00012601564230863005, + "learning_rate": 5.478802992518703e-06, + "loss": 0.0, + "step": 29130 + }, + { + "epoch": 3.6334164588528677, + "grad_norm": 0.0002682583872228861, + "learning_rate": 5.473815461346634e-06, + "loss": 0.0, + "step": 29140 + }, + { + "epoch": 3.6346633416458856, + "grad_norm": 0.00011863713734783232, + "learning_rate": 5.468827930174564e-06, + "loss": 0.0, + "step": 29150 + }, + { + "epoch": 3.635910224438903, + "grad_norm": 0.0008033208432607353, + "learning_rate": 5.463840399002494e-06, + "loss": 0.0, + "step": 29160 + }, + { + "epoch": 3.6371571072319204, + "grad_norm": 0.00037883687764406204, + "learning_rate": 5.458852867830424e-06, + "loss": 0.003, + "step": 29170 + }, + { + "epoch": 3.638403990024938, + "grad_norm": 3.7584238052368164, + "learning_rate": 5.453865336658355e-06, + "loss": 0.0423, + "step": 29180 + }, + { + "epoch": 3.6396508728179553, + "grad_norm": 0.0013585427077487111, + "learning_rate": 5.4488778054862845e-06, + "loss": 0.0391, + "step": 29190 + }, + { + "epoch": 3.6408977556109727, + "grad_norm": 
0.0007598469965159893, + "learning_rate": 5.443890274314215e-06, + "loss": 0.0022, + "step": 29200 + }, + { + "epoch": 3.64214463840399, + "grad_norm": 0.0015173618448898196, + "learning_rate": 5.438902743142145e-06, + "loss": 0.0, + "step": 29210 + }, + { + "epoch": 3.6433915211970076, + "grad_norm": 0.011424501426517963, + "learning_rate": 5.4339152119700755e-06, + "loss": 0.0, + "step": 29220 + }, + { + "epoch": 3.644638403990025, + "grad_norm": 0.01082046888768673, + "learning_rate": 5.428927680798005e-06, + "loss": 0.0017, + "step": 29230 + }, + { + "epoch": 3.6458852867830425, + "grad_norm": 0.0007360957097262144, + "learning_rate": 5.423940149625936e-06, + "loss": 0.0002, + "step": 29240 + }, + { + "epoch": 3.64713216957606, + "grad_norm": 0.0010136303026229143, + "learning_rate": 5.418952618453866e-06, + "loss": 0.0, + "step": 29250 + }, + { + "epoch": 3.6483790523690773, + "grad_norm": 9.919052124023438, + "learning_rate": 5.413965087281796e-06, + "loss": 0.0114, + "step": 29260 + }, + { + "epoch": 3.6496259351620948, + "grad_norm": 0.17939278483390808, + "learning_rate": 5.408977556109726e-06, + "loss": 0.0007, + "step": 29270 + }, + { + "epoch": 3.650872817955112, + "grad_norm": 0.00015363430429715663, + "learning_rate": 5.403990024937657e-06, + "loss": 0.0003, + "step": 29280 + }, + { + "epoch": 3.6521197007481296, + "grad_norm": 0.0038780230097472668, + "learning_rate": 5.3990024937655864e-06, + "loss": 0.0, + "step": 29290 + }, + { + "epoch": 3.653366583541147, + "grad_norm": 0.00012658373452723026, + "learning_rate": 5.394014962593517e-06, + "loss": 0.0, + "step": 29300 + }, + { + "epoch": 3.6546134663341645, + "grad_norm": 0.02487073466181755, + "learning_rate": 5.389027431421446e-06, + "loss": 0.0001, + "step": 29310 + }, + { + "epoch": 3.655860349127182, + "grad_norm": 0.00010112265590578318, + "learning_rate": 5.3840399002493774e-06, + "loss": 0.0108, + "step": 29320 + }, + { + "epoch": 3.6571072319201994, + "grad_norm": 0.0008333465084433556, + 
"learning_rate": 5.379052369077306e-06, + "loss": 0.0, + "step": 29330 + }, + { + "epoch": 3.658354114713217, + "grad_norm": 0.007760242559015751, + "learning_rate": 5.374064837905237e-06, + "loss": 0.01, + "step": 29340 + }, + { + "epoch": 3.6596009975062342, + "grad_norm": 0.00010809869854710996, + "learning_rate": 5.3690773067331685e-06, + "loss": 0.0001, + "step": 29350 + }, + { + "epoch": 3.6608478802992517, + "grad_norm": 0.001450029551051557, + "learning_rate": 5.364089775561097e-06, + "loss": 0.0, + "step": 29360 + }, + { + "epoch": 3.662094763092269, + "grad_norm": 13.586308479309082, + "learning_rate": 5.359102244389028e-06, + "loss": 0.002, + "step": 29370 + }, + { + "epoch": 3.6633416458852865, + "grad_norm": 0.001481684041209519, + "learning_rate": 5.354114713216958e-06, + "loss": 0.0, + "step": 29380 + }, + { + "epoch": 3.664588528678304, + "grad_norm": 0.0003070503589697182, + "learning_rate": 5.349127182044888e-06, + "loss": 0.0, + "step": 29390 + }, + { + "epoch": 3.665835411471322, + "grad_norm": 0.0039036672096699476, + "learning_rate": 5.344139650872818e-06, + "loss": 0.0, + "step": 29400 + }, + { + "epoch": 3.6670822942643393, + "grad_norm": 0.0003815802629105747, + "learning_rate": 5.339152119700749e-06, + "loss": 0.0128, + "step": 29410 + }, + { + "epoch": 3.6683291770573567, + "grad_norm": 0.0005427736323326826, + "learning_rate": 5.3341645885286786e-06, + "loss": 0.024, + "step": 29420 + }, + { + "epoch": 3.669576059850374, + "grad_norm": 0.00020941866387147456, + "learning_rate": 5.329177057356609e-06, + "loss": 0.0, + "step": 29430 + }, + { + "epoch": 3.6708229426433916, + "grad_norm": 0.0005622466560453176, + "learning_rate": 5.324189526184539e-06, + "loss": 0.0002, + "step": 29440 + }, + { + "epoch": 3.672069825436409, + "grad_norm": 0.00019208036246709526, + "learning_rate": 5.3192019950124696e-06, + "loss": 0.003, + "step": 29450 + }, + { + "epoch": 3.6733167082294265, + "grad_norm": 0.00985199399292469, + "learning_rate": 
5.314214463840399e-06, + "loss": 0.0054, + "step": 29460 + }, + { + "epoch": 3.674563591022444, + "grad_norm": 0.00011043824633816257, + "learning_rate": 5.30922693266833e-06, + "loss": 0.0216, + "step": 29470 + }, + { + "epoch": 3.6758104738154613, + "grad_norm": 0.0001325913763139397, + "learning_rate": 5.30423940149626e-06, + "loss": 0.0, + "step": 29480 + }, + { + "epoch": 3.6770573566084788, + "grad_norm": 0.005065642762929201, + "learning_rate": 5.29925187032419e-06, + "loss": 0.0281, + "step": 29490 + }, + { + "epoch": 3.678304239401496, + "grad_norm": 0.0005468591116368771, + "learning_rate": 5.29426433915212e-06, + "loss": 0.0, + "step": 29500 + }, + { + "epoch": 3.6795511221945136, + "grad_norm": 0.0002817381464410573, + "learning_rate": 5.289276807980051e-06, + "loss": 0.0001, + "step": 29510 + }, + { + "epoch": 3.680798004987531, + "grad_norm": 0.0014619999565184116, + "learning_rate": 5.2842892768079805e-06, + "loss": 0.0, + "step": 29520 + }, + { + "epoch": 3.6820448877805485, + "grad_norm": 0.00014814763562753797, + "learning_rate": 5.279301745635911e-06, + "loss": 0.0015, + "step": 29530 + }, + { + "epoch": 3.683291770573566, + "grad_norm": 0.001574231660924852, + "learning_rate": 5.27431421446384e-06, + "loss": 0.0001, + "step": 29540 + }, + { + "epoch": 3.684538653366584, + "grad_norm": 0.00010609447053866461, + "learning_rate": 5.2693266832917715e-06, + "loss": 0.0, + "step": 29550 + }, + { + "epoch": 3.6857855361596013, + "grad_norm": 0.00022953233565203846, + "learning_rate": 5.2643391521197004e-06, + "loss": 0.0, + "step": 29560 + }, + { + "epoch": 3.6870324189526187, + "grad_norm": 9.888015483738855e-05, + "learning_rate": 5.259351620947631e-06, + "loss": 0.0002, + "step": 29570 + }, + { + "epoch": 3.688279301745636, + "grad_norm": 0.00013748396304436028, + "learning_rate": 5.254364089775561e-06, + "loss": 0.0, + "step": 29580 + }, + { + "epoch": 3.6895261845386536, + "grad_norm": 0.00013026520900893956, + "learning_rate": 
5.2493765586034915e-06, + "loss": 0.0219, + "step": 29590 + }, + { + "epoch": 3.690773067331671, + "grad_norm": 0.0008388891001231968, + "learning_rate": 5.244389027431422e-06, + "loss": 0.0617, + "step": 29600 + }, + { + "epoch": 3.6920199501246884, + "grad_norm": 0.0006873853853903711, + "learning_rate": 5.239401496259352e-06, + "loss": 0.0381, + "step": 29610 + }, + { + "epoch": 3.693266832917706, + "grad_norm": 7.518016338348389, + "learning_rate": 5.2344139650872825e-06, + "loss": 0.0623, + "step": 29620 + }, + { + "epoch": 3.6945137157107233, + "grad_norm": 0.0008727543754503131, + "learning_rate": 5.229426433915212e-06, + "loss": 0.0, + "step": 29630 + }, + { + "epoch": 3.6957605985037407, + "grad_norm": 0.005849814973771572, + "learning_rate": 5.224438902743143e-06, + "loss": 0.055, + "step": 29640 + }, + { + "epoch": 3.697007481296758, + "grad_norm": 0.0005927455495111644, + "learning_rate": 5.219451371571073e-06, + "loss": 0.0, + "step": 29650 + }, + { + "epoch": 3.6982543640897756, + "grad_norm": 0.0024080451112240553, + "learning_rate": 5.214463840399003e-06, + "loss": 0.0135, + "step": 29660 + }, + { + "epoch": 3.699501246882793, + "grad_norm": 0.0016026614466682076, + "learning_rate": 5.209476309226933e-06, + "loss": 0.0504, + "step": 29670 + }, + { + "epoch": 3.7007481296758105, + "grad_norm": 0.0008264243369922042, + "learning_rate": 5.204488778054864e-06, + "loss": 0.0024, + "step": 29680 + }, + { + "epoch": 3.701995012468828, + "grad_norm": 0.0018659487832337618, + "learning_rate": 5.199501246882793e-06, + "loss": 0.0001, + "step": 29690 + }, + { + "epoch": 3.7032418952618453, + "grad_norm": 0.018036233261227608, + "learning_rate": 5.194513715710724e-06, + "loss": 0.0001, + "step": 29700 + }, + { + "epoch": 3.7044887780548628, + "grad_norm": 0.017439868301153183, + "learning_rate": 5.189526184538654e-06, + "loss": 0.0348, + "step": 29710 + }, + { + "epoch": 3.70573566084788, + "grad_norm": 0.0008811484440229833, + "learning_rate": 
5.1845386533665844e-06, + "loss": 0.0, + "step": 29720 + }, + { + "epoch": 3.7069825436408976, + "grad_norm": 0.00037909552338533103, + "learning_rate": 5.179551122194514e-06, + "loss": 0.0, + "step": 29730 + }, + { + "epoch": 3.708229426433915, + "grad_norm": 0.00022248552704695612, + "learning_rate": 5.174563591022445e-06, + "loss": 0.0, + "step": 29740 + }, + { + "epoch": 3.7094763092269325, + "grad_norm": 0.014188253320753574, + "learning_rate": 5.169576059850374e-06, + "loss": 0.0001, + "step": 29750 + }, + { + "epoch": 3.71072319201995, + "grad_norm": 0.013957753777503967, + "learning_rate": 5.164588528678305e-06, + "loss": 0.0, + "step": 29760 + }, + { + "epoch": 3.7119700748129674, + "grad_norm": 0.0008046287694014609, + "learning_rate": 5.159600997506234e-06, + "loss": 0.0, + "step": 29770 + }, + { + "epoch": 3.713216957605985, + "grad_norm": 0.0015382746933028102, + "learning_rate": 5.154613466334165e-06, + "loss": 0.0004, + "step": 29780 + }, + { + "epoch": 3.7144638403990022, + "grad_norm": 0.0011267090449109674, + "learning_rate": 5.1496259351620945e-06, + "loss": 0.0, + "step": 29790 + }, + { + "epoch": 3.71571072319202, + "grad_norm": 0.0019451412372291088, + "learning_rate": 5.144638403990025e-06, + "loss": 0.0, + "step": 29800 + }, + { + "epoch": 3.7169576059850375, + "grad_norm": 0.0013380858581513166, + "learning_rate": 5.139650872817955e-06, + "loss": 0.0003, + "step": 29810 + }, + { + "epoch": 3.718204488778055, + "grad_norm": 0.0008705657673999667, + "learning_rate": 5.1346633416458855e-06, + "loss": 0.0001, + "step": 29820 + }, + { + "epoch": 3.7194513715710724, + "grad_norm": 0.0007306481129489839, + "learning_rate": 5.129675810473815e-06, + "loss": 0.0, + "step": 29830 + }, + { + "epoch": 3.72069825436409, + "grad_norm": 0.0004138894146308303, + "learning_rate": 5.124688279301746e-06, + "loss": 0.0, + "step": 29840 + }, + { + "epoch": 3.7219451371571073, + "grad_norm": 0.0006331994663923979, + "learning_rate": 5.1197007481296766e-06, + 
"loss": 0.0255, + "step": 29850 + }, + { + "epoch": 3.7231920199501247, + "grad_norm": 0.0023739382158964872, + "learning_rate": 5.114713216957606e-06, + "loss": 0.0, + "step": 29860 + }, + { + "epoch": 3.724438902743142, + "grad_norm": 0.0019165941048413515, + "learning_rate": 5.109725685785537e-06, + "loss": 0.0, + "step": 29870 + }, + { + "epoch": 3.7256857855361596, + "grad_norm": 0.00022552658629138023, + "learning_rate": 5.104738154613467e-06, + "loss": 0.0, + "step": 29880 + }, + { + "epoch": 3.726932668329177, + "grad_norm": 0.0007717168191447854, + "learning_rate": 5.099750623441397e-06, + "loss": 0.0, + "step": 29890 + }, + { + "epoch": 3.7281795511221945, + "grad_norm": 0.027365686371922493, + "learning_rate": 5.094763092269327e-06, + "loss": 0.0, + "step": 29900 + }, + { + "epoch": 3.729426433915212, + "grad_norm": 0.00012416514800861478, + "learning_rate": 5.089775561097258e-06, + "loss": 0.0, + "step": 29910 + }, + { + "epoch": 3.7306733167082293, + "grad_norm": 0.0002640146412886679, + "learning_rate": 5.0847880299251875e-06, + "loss": 0.0, + "step": 29920 + }, + { + "epoch": 3.7319201995012468, + "grad_norm": 0.00022865763457957655, + "learning_rate": 5.079800498753118e-06, + "loss": 0.0511, + "step": 29930 + }, + { + "epoch": 3.733167082294264, + "grad_norm": 0.007766488939523697, + "learning_rate": 5.074812967581048e-06, + "loss": 0.0, + "step": 29940 + }, + { + "epoch": 3.734413965087282, + "grad_norm": 0.003563706064596772, + "learning_rate": 5.0698254364089785e-06, + "loss": 0.0365, + "step": 29950 + }, + { + "epoch": 3.7356608478802995, + "grad_norm": 0.0021255253814160824, + "learning_rate": 5.064837905236908e-06, + "loss": 0.0001, + "step": 29960 + }, + { + "epoch": 3.736907730673317, + "grad_norm": 0.00031094119185581803, + "learning_rate": 5.059850374064839e-06, + "loss": 0.0146, + "step": 29970 + }, + { + "epoch": 3.7381546134663344, + "grad_norm": 0.0007570157176814973, + "learning_rate": 5.054862842892768e-06, + "loss": 0.0001, + 
"step": 29980 + }, + { + "epoch": 3.739401496259352, + "grad_norm": 0.0021029124036431313, + "learning_rate": 5.049875311720699e-06, + "loss": 0.0011, + "step": 29990 + }, + { + "epoch": 3.7406483790523692, + "grad_norm": 0.0010909464908763766, + "learning_rate": 5.044887780548628e-06, + "loss": 0.0037, + "step": 30000 + }, + { + "epoch": 3.7418952618453867, + "grad_norm": 0.0006604224909096956, + "learning_rate": 5.039900249376559e-06, + "loss": 0.0007, + "step": 30010 + }, + { + "epoch": 3.743142144638404, + "grad_norm": 0.00019923903164453804, + "learning_rate": 5.034912718204489e-06, + "loss": 0.0551, + "step": 30020 + }, + { + "epoch": 3.7443890274314215, + "grad_norm": 0.00044206419261172414, + "learning_rate": 5.029925187032419e-06, + "loss": 0.0, + "step": 30030 + }, + { + "epoch": 3.745635910224439, + "grad_norm": 0.0007878596661612391, + "learning_rate": 5.024937655860349e-06, + "loss": 0.0001, + "step": 30040 + }, + { + "epoch": 3.7468827930174564, + "grad_norm": 0.000546826864592731, + "learning_rate": 5.01995012468828e-06, + "loss": 0.0, + "step": 30050 + }, + { + "epoch": 3.748129675810474, + "grad_norm": 0.0003211061120964587, + "learning_rate": 5.014962593516209e-06, + "loss": 0.0002, + "step": 30060 + }, + { + "epoch": 3.7493765586034913, + "grad_norm": 9.711940219858661e-05, + "learning_rate": 5.00997506234414e-06, + "loss": 0.0362, + "step": 30070 + }, + { + "epoch": 3.7506234413965087, + "grad_norm": 0.0026123167481273413, + "learning_rate": 5.00498753117207e-06, + "loss": 0.0, + "step": 30080 + }, + { + "epoch": 3.751870324189526, + "grad_norm": 0.005424060393124819, + "learning_rate": 5e-06, + "loss": 0.0, + "step": 30090 + }, + { + "epoch": 3.7531172069825436, + "grad_norm": 0.00042586520430631936, + "learning_rate": 4.99501246882793e-06, + "loss": 0.0006, + "step": 30100 + }, + { + "epoch": 3.754364089775561, + "grad_norm": 0.0006117548909969628, + "learning_rate": 4.990024937655861e-06, + "loss": 0.0, + "step": 30110 + }, + { + "epoch": 
3.7556109725685785, + "grad_norm": 0.00031122061773203313, + "learning_rate": 4.9850374064837906e-06, + "loss": 0.0001, + "step": 30120 + }, + { + "epoch": 3.756857855361596, + "grad_norm": 0.0004732254019472748, + "learning_rate": 4.980049875311721e-06, + "loss": 0.0, + "step": 30130 + }, + { + "epoch": 3.7581047381546133, + "grad_norm": 0.00018621271010488272, + "learning_rate": 4.975561097256858e-06, + "loss": 0.0076, + "step": 30140 + }, + { + "epoch": 3.7593516209476308, + "grad_norm": 0.0011101874988526106, + "learning_rate": 4.970573566084788e-06, + "loss": 0.0023, + "step": 30150 + }, + { + "epoch": 3.760598503740648, + "grad_norm": 0.00034718040842562914, + "learning_rate": 4.965586034912719e-06, + "loss": 0.0009, + "step": 30160 + }, + { + "epoch": 3.7618453865336656, + "grad_norm": 0.0011685335775837302, + "learning_rate": 4.960598503740649e-06, + "loss": 0.0001, + "step": 30170 + }, + { + "epoch": 3.763092269326683, + "grad_norm": 0.0011077482486143708, + "learning_rate": 4.955610972568579e-06, + "loss": 0.0001, + "step": 30180 + }, + { + "epoch": 3.7643391521197005, + "grad_norm": 0.00016491774294991046, + "learning_rate": 4.950623441396509e-06, + "loss": 0.0, + "step": 30190 + }, + { + "epoch": 3.765586034912718, + "grad_norm": 0.0005304404185153544, + "learning_rate": 4.94563591022444e-06, + "loss": 0.0007, + "step": 30200 + }, + { + "epoch": 3.766832917705736, + "grad_norm": 0.0007268518093042076, + "learning_rate": 4.9406483790523695e-06, + "loss": 0.0, + "step": 30210 + }, + { + "epoch": 3.7680798004987532, + "grad_norm": 0.0003884352627210319, + "learning_rate": 4.9356608478803e-06, + "loss": 0.0, + "step": 30220 + }, + { + "epoch": 3.7693266832917707, + "grad_norm": 0.03600273281335831, + "learning_rate": 4.93067331670823e-06, + "loss": 0.0, + "step": 30230 + }, + { + "epoch": 3.770573566084788, + "grad_norm": 0.00010087342525366694, + "learning_rate": 4.92568578553616e-06, + "loss": 0.0, + "step": 30240 + }, + { + "epoch": 3.7718204488778055, + 
"grad_norm": 0.0014833903405815363, + "learning_rate": 4.92069825436409e-06, + "loss": 0.0, + "step": 30250 + }, + { + "epoch": 3.773067331670823, + "grad_norm": 0.0001319284929195419, + "learning_rate": 4.91571072319202e-06, + "loss": 0.0, + "step": 30260 + }, + { + "epoch": 3.7743142144638404, + "grad_norm": 0.00032203830778598785, + "learning_rate": 4.910723192019951e-06, + "loss": 0.0001, + "step": 30270 + }, + { + "epoch": 3.775561097256858, + "grad_norm": 0.0009229084243997931, + "learning_rate": 4.9057356608478805e-06, + "loss": 0.0392, + "step": 30280 + }, + { + "epoch": 3.7768079800498753, + "grad_norm": 0.0002168490318581462, + "learning_rate": 4.900748129675811e-06, + "loss": 0.0, + "step": 30290 + }, + { + "epoch": 3.7780548628428927, + "grad_norm": 0.00036489724880084395, + "learning_rate": 4.895760598503741e-06, + "loss": 0.0, + "step": 30300 + }, + { + "epoch": 3.77930174563591, + "grad_norm": 0.00013854789722245187, + "learning_rate": 4.8907730673316715e-06, + "loss": 0.0, + "step": 30310 + }, + { + "epoch": 3.7805486284289276, + "grad_norm": 0.00039082334842532873, + "learning_rate": 4.885785536159601e-06, + "loss": 0.0, + "step": 30320 + }, + { + "epoch": 3.781795511221945, + "grad_norm": 0.0018215180607512593, + "learning_rate": 4.880798004987531e-06, + "loss": 0.021, + "step": 30330 + }, + { + "epoch": 3.7830423940149625, + "grad_norm": 0.00018214010924566537, + "learning_rate": 4.875810473815462e-06, + "loss": 0.0717, + "step": 30340 + }, + { + "epoch": 3.78428927680798, + "grad_norm": 0.0006838123081251979, + "learning_rate": 4.8708229426433914e-06, + "loss": 0.0, + "step": 30350 + }, + { + "epoch": 3.7855361596009978, + "grad_norm": 0.004681428894400597, + "learning_rate": 4.865835411471322e-06, + "loss": 0.0463, + "step": 30360 + }, + { + "epoch": 3.786783042394015, + "grad_norm": 0.0029412105213850737, + "learning_rate": 4.860847880299252e-06, + "loss": 0.0, + "step": 30370 + }, + { + "epoch": 3.7880299251870326, + "grad_norm": 
0.00215451349504292, + "learning_rate": 4.8558603491271824e-06, + "loss": 0.0, + "step": 30380 + }, + { + "epoch": 3.78927680798005, + "grad_norm": 0.0009547757799737155, + "learning_rate": 4.850872817955113e-06, + "loss": 0.0, + "step": 30390 + }, + { + "epoch": 3.7905236907730675, + "grad_norm": 0.0007622092380188406, + "learning_rate": 4.845885286783043e-06, + "loss": 0.0, + "step": 30400 + }, + { + "epoch": 3.791770573566085, + "grad_norm": 0.00015706811973359436, + "learning_rate": 4.8408977556109734e-06, + "loss": 0.0001, + "step": 30410 + }, + { + "epoch": 3.7930174563591024, + "grad_norm": 0.0003935690619982779, + "learning_rate": 4.835910224438903e-06, + "loss": 0.0402, + "step": 30420 + }, + { + "epoch": 3.79426433915212, + "grad_norm": 0.0001764619373716414, + "learning_rate": 4.830922693266834e-06, + "loss": 0.0, + "step": 30430 + }, + { + "epoch": 3.7955112219451372, + "grad_norm": 0.00022006318613421172, + "learning_rate": 4.825935162094764e-06, + "loss": 0.0, + "step": 30440 + }, + { + "epoch": 3.7967581047381547, + "grad_norm": 0.004599731881171465, + "learning_rate": 4.820947630922693e-06, + "loss": 0.0, + "step": 30450 + }, + { + "epoch": 3.798004987531172, + "grad_norm": 0.00025898893363773823, + "learning_rate": 4.815960099750624e-06, + "loss": 0.0, + "step": 30460 + }, + { + "epoch": 3.7992518703241895, + "grad_norm": 0.1059185266494751, + "learning_rate": 4.810972568578554e-06, + "loss": 0.0496, + "step": 30470 + }, + { + "epoch": 3.800498753117207, + "grad_norm": 0.00024309437139891088, + "learning_rate": 4.805985037406484e-06, + "loss": 0.0, + "step": 30480 + }, + { + "epoch": 3.8017456359102244, + "grad_norm": 0.000419674877775833, + "learning_rate": 4.800997506234414e-06, + "loss": 0.0436, + "step": 30490 + }, + { + "epoch": 3.802992518703242, + "grad_norm": 0.000803522125352174, + "learning_rate": 4.796009975062345e-06, + "loss": 0.0732, + "step": 30500 + }, + { + "epoch": 3.8042394014962593, + "grad_norm": 0.0010885476367548108, + 
"learning_rate": 4.7910224438902746e-06, + "loss": 0.0, + "step": 30510 + }, + { + "epoch": 3.8054862842892767, + "grad_norm": 0.012714964337646961, + "learning_rate": 4.786034912718205e-06, + "loss": 0.0049, + "step": 30520 + }, + { + "epoch": 3.806733167082294, + "grad_norm": 0.00023770575353410095, + "learning_rate": 4.781047381546135e-06, + "loss": 0.0, + "step": 30530 + }, + { + "epoch": 3.8079800498753116, + "grad_norm": 0.0004311330849304795, + "learning_rate": 4.7760598503740656e-06, + "loss": 0.0, + "step": 30540 + }, + { + "epoch": 3.809226932668329, + "grad_norm": 0.00026620420976541936, + "learning_rate": 4.771072319201995e-06, + "loss": 0.0068, + "step": 30550 + }, + { + "epoch": 3.8104738154613464, + "grad_norm": 0.0027953782118856907, + "learning_rate": 4.766084788029925e-06, + "loss": 0.0, + "step": 30560 + }, + { + "epoch": 3.811720698254364, + "grad_norm": 0.0024098586291074753, + "learning_rate": 4.761097256857856e-06, + "loss": 0.0, + "step": 30570 + }, + { + "epoch": 3.8129675810473813, + "grad_norm": 58.85100173950195, + "learning_rate": 4.7561097256857855e-06, + "loss": 0.0309, + "step": 30580 + }, + { + "epoch": 3.8142144638403987, + "grad_norm": 0.0023890752345323563, + "learning_rate": 4.751122194513716e-06, + "loss": 0.0554, + "step": 30590 + }, + { + "epoch": 3.815461346633416, + "grad_norm": 0.0031661030370742083, + "learning_rate": 4.746134663341646e-06, + "loss": 0.0, + "step": 30600 + }, + { + "epoch": 3.816708229426434, + "grad_norm": 0.0001850063999881968, + "learning_rate": 4.7411471321695765e-06, + "loss": 0.001, + "step": 30610 + }, + { + "epoch": 3.8179551122194515, + "grad_norm": 0.020202938467264175, + "learning_rate": 4.736159600997506e-06, + "loss": 0.0289, + "step": 30620 + }, + { + "epoch": 3.819201995012469, + "grad_norm": 0.0003594272129703313, + "learning_rate": 4.731172069825437e-06, + "loss": 0.0438, + "step": 30630 + }, + { + "epoch": 3.8204488778054864, + "grad_norm": 0.01444461289793253, + "learning_rate": 
4.7261845386533675e-06, + "loss": 0.0, + "step": 30640 + }, + { + "epoch": 3.821695760598504, + "grad_norm": 0.0005201894673518836, + "learning_rate": 4.721197007481297e-06, + "loss": 0.0, + "step": 30650 + }, + { + "epoch": 3.8229426433915212, + "grad_norm": 0.0026227838825434446, + "learning_rate": 4.716209476309228e-06, + "loss": 0.0001, + "step": 30660 + }, + { + "epoch": 3.8241895261845387, + "grad_norm": 0.0017443817341700196, + "learning_rate": 4.711221945137158e-06, + "loss": 0.0006, + "step": 30670 + }, + { + "epoch": 3.825436408977556, + "grad_norm": 0.0010206064907833934, + "learning_rate": 4.7062344139650875e-06, + "loss": 0.0002, + "step": 30680 + }, + { + "epoch": 3.8266832917705735, + "grad_norm": 0.009462667629122734, + "learning_rate": 4.701246882793018e-06, + "loss": 0.0001, + "step": 30690 + }, + { + "epoch": 3.827930174563591, + "grad_norm": 0.0020114071667194366, + "learning_rate": 4.696259351620948e-06, + "loss": 0.0, + "step": 30700 + }, + { + "epoch": 3.8291770573566084, + "grad_norm": 0.00016912985302042216, + "learning_rate": 4.6912718204488785e-06, + "loss": 0.0, + "step": 30710 + }, + { + "epoch": 3.830423940149626, + "grad_norm": 0.0017660657176747918, + "learning_rate": 4.686284289276808e-06, + "loss": 0.0001, + "step": 30720 + }, + { + "epoch": 3.8316708229426433, + "grad_norm": 0.000822771864477545, + "learning_rate": 4.681296758104739e-06, + "loss": 0.0, + "step": 30730 + }, + { + "epoch": 3.8329177057356607, + "grad_norm": 0.0009312349720858037, + "learning_rate": 4.676309226932669e-06, + "loss": 0.0, + "step": 30740 + }, + { + "epoch": 3.834164588528678, + "grad_norm": 0.0004992979229427874, + "learning_rate": 4.671321695760599e-06, + "loss": 0.0114, + "step": 30750 + }, + { + "epoch": 3.835411471321696, + "grad_norm": 0.009063317440450191, + "learning_rate": 4.666334164588529e-06, + "loss": 0.0001, + "step": 30760 + }, + { + "epoch": 3.8366583541147135, + "grad_norm": 0.0017595086246728897, + "learning_rate": 
4.661346633416459e-06, + "loss": 0.0112, + "step": 30770 + }, + { + "epoch": 3.837905236907731, + "grad_norm": 0.016222162172198296, + "learning_rate": 4.6563591022443894e-06, + "loss": 0.0222, + "step": 30780 + }, + { + "epoch": 3.8391521197007483, + "grad_norm": 0.009317958727478981, + "learning_rate": 4.651371571072319e-06, + "loss": 0.0, + "step": 30790 + }, + { + "epoch": 3.8403990024937658, + "grad_norm": 0.0005668486119247973, + "learning_rate": 4.64638403990025e-06, + "loss": 0.0362, + "step": 30800 + }, + { + "epoch": 3.841645885286783, + "grad_norm": 0.00031317881075665355, + "learning_rate": 4.64139650872818e-06, + "loss": 0.0007, + "step": 30810 + }, + { + "epoch": 3.8428927680798006, + "grad_norm": 0.00042529948404990137, + "learning_rate": 4.63640897755611e-06, + "loss": 0.0022, + "step": 30820 + }, + { + "epoch": 3.844139650872818, + "grad_norm": 0.000505742384120822, + "learning_rate": 4.63142144638404e-06, + "loss": 0.0, + "step": 30830 + }, + { + "epoch": 3.8453865336658355, + "grad_norm": 0.0003615685855038464, + "learning_rate": 4.626433915211971e-06, + "loss": 0.0001, + "step": 30840 + }, + { + "epoch": 3.846633416458853, + "grad_norm": 0.0002692066482268274, + "learning_rate": 4.6214463840399e-06, + "loss": 0.0001, + "step": 30850 + }, + { + "epoch": 3.8478802992518704, + "grad_norm": 0.00105293991509825, + "learning_rate": 4.616458852867831e-06, + "loss": 0.0, + "step": 30860 + }, + { + "epoch": 3.849127182044888, + "grad_norm": 0.0077606611885130405, + "learning_rate": 4.611471321695761e-06, + "loss": 0.0025, + "step": 30870 + }, + { + "epoch": 3.8503740648379052, + "grad_norm": 0.0007624849677085876, + "learning_rate": 4.6064837905236905e-06, + "loss": 0.0, + "step": 30880 + }, + { + "epoch": 3.8516209476309227, + "grad_norm": 0.00021824889699928463, + "learning_rate": 4.601496259351622e-06, + "loss": 0.0, + "step": 30890 + }, + { + "epoch": 3.85286783042394, + "grad_norm": 0.0004284057067707181, + "learning_rate": 4.596508728179552e-06, + 
"loss": 0.0003, + "step": 30900 + }, + { + "epoch": 3.8541147132169575, + "grad_norm": 0.0011223534820601344, + "learning_rate": 4.5915211970074815e-06, + "loss": 0.0001, + "step": 30910 + }, + { + "epoch": 3.855361596009975, + "grad_norm": 0.0003385642194189131, + "learning_rate": 4.586533665835412e-06, + "loss": 0.0002, + "step": 30920 + }, + { + "epoch": 3.8566084788029924, + "grad_norm": 7.129866571631283e-05, + "learning_rate": 4.581546134663342e-06, + "loss": 0.0, + "step": 30930 + }, + { + "epoch": 3.85785536159601, + "grad_norm": 0.0835379809141159, + "learning_rate": 4.5765586034912726e-06, + "loss": 0.0001, + "step": 30940 + }, + { + "epoch": 3.8591022443890273, + "grad_norm": 0.0009691972518339753, + "learning_rate": 4.571571072319202e-06, + "loss": 0.0281, + "step": 30950 + }, + { + "epoch": 3.8603491271820447, + "grad_norm": 0.004851430654525757, + "learning_rate": 4.566583541147133e-06, + "loss": 0.0, + "step": 30960 + }, + { + "epoch": 3.861596009975062, + "grad_norm": 0.00014346891839522868, + "learning_rate": 4.561596009975063e-06, + "loss": 0.0, + "step": 30970 + }, + { + "epoch": 3.8628428927680796, + "grad_norm": 0.0014384101377800107, + "learning_rate": 4.556608478802993e-06, + "loss": 0.0, + "step": 30980 + }, + { + "epoch": 3.864089775561097, + "grad_norm": 0.0002044325665337965, + "learning_rate": 4.551620947630923e-06, + "loss": 0.0, + "step": 30990 + }, + { + "epoch": 3.8653366583541144, + "grad_norm": 0.0003074342093896121, + "learning_rate": 4.546633416458853e-06, + "loss": 0.0364, + "step": 31000 + }, + { + "epoch": 3.8665835411471323, + "grad_norm": 0.0023151414934545755, + "learning_rate": 4.5416458852867835e-06, + "loss": 0.0, + "step": 31010 + }, + { + "epoch": 3.8678304239401498, + "grad_norm": 0.005174377933144569, + "learning_rate": 4.536658354114713e-06, + "loss": 0.0, + "step": 31020 + }, + { + "epoch": 3.869077306733167, + "grad_norm": 0.002776858163997531, + "learning_rate": 4.531670822942644e-06, + "loss": 0.0, + "step": 
31030 + }, + { + "epoch": 3.8703241895261846, + "grad_norm": 0.008138804696500301, + "learning_rate": 4.526683291770574e-06, + "loss": 0.0, + "step": 31040 + }, + { + "epoch": 3.871571072319202, + "grad_norm": 0.00016253614739980549, + "learning_rate": 4.521695760598504e-06, + "loss": 0.0001, + "step": 31050 + }, + { + "epoch": 3.8728179551122195, + "grad_norm": 0.0030649008695036173, + "learning_rate": 4.516708229426434e-06, + "loss": 0.0431, + "step": 31060 + }, + { + "epoch": 3.874064837905237, + "grad_norm": 0.00010678763646865264, + "learning_rate": 4.511720698254365e-06, + "loss": 0.0127, + "step": 31070 + }, + { + "epoch": 3.8753117206982544, + "grad_norm": 0.00013451423728838563, + "learning_rate": 4.5067331670822945e-06, + "loss": 0.0003, + "step": 31080 + }, + { + "epoch": 3.876558603491272, + "grad_norm": 0.003577812807634473, + "learning_rate": 4.501745635910224e-06, + "loss": 0.0004, + "step": 31090 + }, + { + "epoch": 3.8778054862842892, + "grad_norm": 0.00018554333655629307, + "learning_rate": 4.496758104738155e-06, + "loss": 0.0, + "step": 31100 + }, + { + "epoch": 3.8790523690773067, + "grad_norm": 0.0001478716148994863, + "learning_rate": 4.491770573566085e-06, + "loss": 0.0353, + "step": 31110 + }, + { + "epoch": 3.880299251870324, + "grad_norm": 0.0002082760474877432, + "learning_rate": 4.486783042394015e-06, + "loss": 0.0, + "step": 31120 + }, + { + "epoch": 3.8815461346633415, + "grad_norm": 0.0001366027572657913, + "learning_rate": 4.481795511221945e-06, + "loss": 0.0, + "step": 31130 + }, + { + "epoch": 3.882793017456359, + "grad_norm": 0.004299758467823267, + "learning_rate": 4.476807980049876e-06, + "loss": 0.0, + "step": 31140 + }, + { + "epoch": 3.8840399002493764, + "grad_norm": 0.00022500457998830825, + "learning_rate": 4.471820448877806e-06, + "loss": 0.0001, + "step": 31150 + }, + { + "epoch": 3.8852867830423943, + "grad_norm": 0.0007693602237850428, + "learning_rate": 4.466832917705736e-06, + "loss": 0.0003, + "step": 31160 + }, + { 
+ "epoch": 3.8865336658354117, + "grad_norm": 0.002516511594876647, + "learning_rate": 4.461845386533667e-06, + "loss": 0.0001, + "step": 31170 + }, + { + "epoch": 3.887780548628429, + "grad_norm": 0.0010788318468257785, + "learning_rate": 4.456857855361596e-06, + "loss": 0.0, + "step": 31180 + }, + { + "epoch": 3.8890274314214466, + "grad_norm": 0.00012673945457208902, + "learning_rate": 4.451870324189527e-06, + "loss": 0.0, + "step": 31190 + }, + { + "epoch": 3.890274314214464, + "grad_norm": 105.11888122558594, + "learning_rate": 4.446882793017457e-06, + "loss": 0.0167, + "step": 31200 + }, + { + "epoch": 3.8915211970074814, + "grad_norm": 0.0005449632881209254, + "learning_rate": 4.441895261845387e-06, + "loss": 0.0417, + "step": 31210 + }, + { + "epoch": 3.892768079800499, + "grad_norm": 0.004787981975823641, + "learning_rate": 4.436907730673317e-06, + "loss": 0.0012, + "step": 31220 + }, + { + "epoch": 3.8940149625935163, + "grad_norm": 0.000260724569670856, + "learning_rate": 4.431920199501247e-06, + "loss": 0.0, + "step": 31230 + }, + { + "epoch": 3.8952618453865338, + "grad_norm": 0.00011065945727750659, + "learning_rate": 4.426932668329178e-06, + "loss": 0.0, + "step": 31240 + }, + { + "epoch": 3.896508728179551, + "grad_norm": 0.007168873678892851, + "learning_rate": 4.421945137157107e-06, + "loss": 0.0, + "step": 31250 + }, + { + "epoch": 3.8977556109725686, + "grad_norm": 0.05084488168358803, + "learning_rate": 4.416957605985038e-06, + "loss": 0.0, + "step": 31260 + }, + { + "epoch": 3.899002493765586, + "grad_norm": 0.0007356269052252173, + "learning_rate": 4.411970074812968e-06, + "loss": 0.0004, + "step": 31270 + }, + { + "epoch": 3.9002493765586035, + "grad_norm": 0.0008322421344928443, + "learning_rate": 4.406982543640898e-06, + "loss": 0.0, + "step": 31280 + }, + { + "epoch": 3.901496259351621, + "grad_norm": 1.249976396560669, + "learning_rate": 4.401995012468828e-06, + "loss": 0.0002, + "step": 31290 + }, + { + "epoch": 3.9027431421446384, + 
"grad_norm": 0.00026053888723254204, + "learning_rate": 4.397007481296759e-06, + "loss": 0.0, + "step": 31300 + }, + { + "epoch": 3.903990024937656, + "grad_norm": 0.00010390794341219589, + "learning_rate": 4.3920199501246885e-06, + "loss": 0.0, + "step": 31310 + }, + { + "epoch": 3.9052369077306732, + "grad_norm": 0.0004543437680695206, + "learning_rate": 4.387032418952618e-06, + "loss": 0.0, + "step": 31320 + }, + { + "epoch": 3.9064837905236907, + "grad_norm": 0.0006542736664414406, + "learning_rate": 4.382044887780549e-06, + "loss": 0.0, + "step": 31330 + }, + { + "epoch": 3.907730673316708, + "grad_norm": 0.011358565650880337, + "learning_rate": 4.377057356608479e-06, + "loss": 0.0001, + "step": 31340 + }, + { + "epoch": 3.9089775561097255, + "grad_norm": 0.00022623350378125906, + "learning_rate": 4.372069825436409e-06, + "loss": 0.0, + "step": 31350 + }, + { + "epoch": 3.910224438902743, + "grad_norm": 0.0005444650305435061, + "learning_rate": 4.367082294264339e-06, + "loss": 0.0, + "step": 31360 + }, + { + "epoch": 3.9114713216957604, + "grad_norm": 0.0046605272218585014, + "learning_rate": 4.36209476309227e-06, + "loss": 0.0148, + "step": 31370 + }, + { + "epoch": 3.912718204488778, + "grad_norm": 0.0033968230709433556, + "learning_rate": 4.3571072319202e-06, + "loss": 0.0011, + "step": 31380 + }, + { + "epoch": 3.9139650872817953, + "grad_norm": 0.0001498850469943136, + "learning_rate": 4.35211970074813e-06, + "loss": 0.0006, + "step": 31390 + }, + { + "epoch": 3.9152119700748127, + "grad_norm": 0.0003460958832874894, + "learning_rate": 4.347132169576061e-06, + "loss": 0.0, + "step": 31400 + }, + { + "epoch": 3.9164588528678306, + "grad_norm": 0.0003900064912158996, + "learning_rate": 4.3421446384039905e-06, + "loss": 0.0155, + "step": 31410 + }, + { + "epoch": 3.917705735660848, + "grad_norm": 0.00030247578979469836, + "learning_rate": 4.337157107231921e-06, + "loss": 0.003, + "step": 31420 + }, + { + "epoch": 3.9189526184538654, + "grad_norm": 
0.0009005562751553953, + "learning_rate": 4.332169576059851e-06, + "loss": 0.0, + "step": 31430 + }, + { + "epoch": 3.920199501246883, + "grad_norm": 0.0002896441437769681, + "learning_rate": 4.327182044887781e-06, + "loss": 0.0234, + "step": 31440 + }, + { + "epoch": 3.9214463840399003, + "grad_norm": 0.0017723769415169954, + "learning_rate": 4.322194513715711e-06, + "loss": 0.0, + "step": 31450 + }, + { + "epoch": 3.9226932668329177, + "grad_norm": 0.0003186598187312484, + "learning_rate": 4.317206982543641e-06, + "loss": 0.0254, + "step": 31460 + }, + { + "epoch": 3.923940149625935, + "grad_norm": 0.4577343761920929, + "learning_rate": 4.312219451371572e-06, + "loss": 0.0052, + "step": 31470 + }, + { + "epoch": 3.9251870324189526, + "grad_norm": 0.00017377693438902497, + "learning_rate": 4.3072319201995014e-06, + "loss": 0.0, + "step": 31480 + }, + { + "epoch": 3.92643391521197, + "grad_norm": 0.00017991484492085874, + "learning_rate": 4.302244389027432e-06, + "loss": 0.0613, + "step": 31490 + }, + { + "epoch": 3.9276807980049875, + "grad_norm": 0.009794537909328938, + "learning_rate": 4.297256857855362e-06, + "loss": 0.0, + "step": 31500 + }, + { + "epoch": 3.928927680798005, + "grad_norm": 0.00017601059516891837, + "learning_rate": 4.2922693266832925e-06, + "loss": 0.0, + "step": 31510 + }, + { + "epoch": 3.9301745635910224, + "grad_norm": 0.0005012759938836098, + "learning_rate": 4.287281795511222e-06, + "loss": 0.0002, + "step": 31520 + }, + { + "epoch": 3.93142144638404, + "grad_norm": 0.0002343037340324372, + "learning_rate": 4.282294264339152e-06, + "loss": 0.0016, + "step": 31530 + }, + { + "epoch": 3.932668329177057, + "grad_norm": 0.002151391003280878, + "learning_rate": 4.277306733167083e-06, + "loss": 0.0, + "step": 31540 + }, + { + "epoch": 3.9339152119700747, + "grad_norm": 0.00020666795899160206, + "learning_rate": 4.272319201995012e-06, + "loss": 0.0001, + "step": 31550 + }, + { + "epoch": 3.9351620947630925, + "grad_norm": 0.0015923150349408388, 
+ "learning_rate": 4.267331670822943e-06, + "loss": 0.0109, + "step": 31560 + }, + { + "epoch": 3.93640897755611, + "grad_norm": 0.00036555586848407984, + "learning_rate": 4.262344139650873e-06, + "loss": 0.0002, + "step": 31570 + }, + { + "epoch": 3.9376558603491274, + "grad_norm": 0.000919479236472398, + "learning_rate": 4.257356608478803e-06, + "loss": 0.0, + "step": 31580 + }, + { + "epoch": 3.938902743142145, + "grad_norm": 0.00040298415115103126, + "learning_rate": 4.252369077306733e-06, + "loss": 0.0001, + "step": 31590 + }, + { + "epoch": 3.9401496259351623, + "grad_norm": 0.0005349696730263531, + "learning_rate": 4.247381546134664e-06, + "loss": 0.0, + "step": 31600 + }, + { + "epoch": 3.9413965087281797, + "grad_norm": 0.0037811186630278826, + "learning_rate": 4.2423940149625936e-06, + "loss": 0.0, + "step": 31610 + }, + { + "epoch": 3.942643391521197, + "grad_norm": 0.00016215858340729028, + "learning_rate": 4.237406483790524e-06, + "loss": 0.0, + "step": 31620 + }, + { + "epoch": 3.9438902743142146, + "grad_norm": 0.0003500250750221312, + "learning_rate": 4.232418952618455e-06, + "loss": 0.0, + "step": 31630 + }, + { + "epoch": 3.945137157107232, + "grad_norm": 0.00022250486654229462, + "learning_rate": 4.227930174563591e-06, + "loss": 0.024, + "step": 31640 + }, + { + "epoch": 3.9463840399002494, + "grad_norm": 0.00041795289143919945, + "learning_rate": 4.222942643391522e-06, + "loss": 0.0, + "step": 31650 + }, + { + "epoch": 3.947630922693267, + "grad_norm": 0.0001636535016587004, + "learning_rate": 4.217955112219452e-06, + "loss": 0.0115, + "step": 31660 + }, + { + "epoch": 3.9488778054862843, + "grad_norm": 0.00020100557594560087, + "learning_rate": 4.2129675810473815e-06, + "loss": 0.0211, + "step": 31670 + }, + { + "epoch": 3.9501246882793017, + "grad_norm": 0.00010202820703852922, + "learning_rate": 4.207980049875312e-06, + "loss": 0.0, + "step": 31680 + }, + { + "epoch": 3.951371571072319, + "grad_norm": 0.0005976628744974732, + "learning_rate": 
4.202992518703242e-06, + "loss": 0.0308, + "step": 31690 + }, + { + "epoch": 3.9526184538653366, + "grad_norm": 0.00015598548634443432, + "learning_rate": 4.1980049875311725e-06, + "loss": 0.0001, + "step": 31700 + }, + { + "epoch": 3.953865336658354, + "grad_norm": 0.0006848460761830211, + "learning_rate": 4.193017456359102e-06, + "loss": 0.0001, + "step": 31710 + }, + { + "epoch": 3.9551122194513715, + "grad_norm": 0.00013792548270430416, + "learning_rate": 4.188029925187033e-06, + "loss": 0.0, + "step": 31720 + }, + { + "epoch": 3.956359102244389, + "grad_norm": 7.759319123579189e-05, + "learning_rate": 4.183042394014963e-06, + "loss": 0.0, + "step": 31730 + }, + { + "epoch": 3.9576059850374063, + "grad_norm": 0.0001390322286169976, + "learning_rate": 4.178054862842893e-06, + "loss": 0.0, + "step": 31740 + }, + { + "epoch": 3.958852867830424, + "grad_norm": 1.8627551794052124, + "learning_rate": 4.173067331670823e-06, + "loss": 0.0003, + "step": 31750 + }, + { + "epoch": 3.960099750623441, + "grad_norm": 72.14037322998047, + "learning_rate": 4.168079800498753e-06, + "loss": 0.045, + "step": 31760 + }, + { + "epoch": 3.9613466334164587, + "grad_norm": 0.00022149724827613682, + "learning_rate": 4.1630922693266835e-06, + "loss": 0.0, + "step": 31770 + }, + { + "epoch": 3.962593516209476, + "grad_norm": 0.0002591174270492047, + "learning_rate": 4.158104738154613e-06, + "loss": 0.0002, + "step": 31780 + }, + { + "epoch": 3.9638403990024935, + "grad_norm": 0.0002038097009062767, + "learning_rate": 4.153117206982544e-06, + "loss": 0.0, + "step": 31790 + }, + { + "epoch": 3.965087281795511, + "grad_norm": 0.0007396162254735827, + "learning_rate": 4.1481296758104745e-06, + "loss": 0.0, + "step": 31800 + }, + { + "epoch": 3.966334164588529, + "grad_norm": 0.0005356416804715991, + "learning_rate": 4.143142144638404e-06, + "loss": 0.0, + "step": 31810 + }, + { + "epoch": 3.9675810473815463, + "grad_norm": 0.0005426599527709186, + "learning_rate": 4.138154613466335e-06, + 
"loss": 0.0, + "step": 31820 + }, + { + "epoch": 3.9688279301745637, + "grad_norm": 0.004455675836652517, + "learning_rate": 4.133167082294265e-06, + "loss": 0.0, + "step": 31830 + }, + { + "epoch": 3.970074812967581, + "grad_norm": 0.0010512182489037514, + "learning_rate": 4.128179551122195e-06, + "loss": 0.0, + "step": 31840 + }, + { + "epoch": 3.9713216957605986, + "grad_norm": 0.00018547069339547306, + "learning_rate": 4.123192019950125e-06, + "loss": 0.0, + "step": 31850 + }, + { + "epoch": 3.972568578553616, + "grad_norm": 0.003366265445947647, + "learning_rate": 4.118204488778056e-06, + "loss": 0.0452, + "step": 31860 + }, + { + "epoch": 3.9738154613466334, + "grad_norm": 0.10161428898572922, + "learning_rate": 4.1132169576059854e-06, + "loss": 0.0001, + "step": 31870 + }, + { + "epoch": 3.975062344139651, + "grad_norm": 0.0009560501202940941, + "learning_rate": 4.108229426433916e-06, + "loss": 0.0, + "step": 31880 + }, + { + "epoch": 3.9763092269326683, + "grad_norm": 0.0022827445063740015, + "learning_rate": 4.103241895261846e-06, + "loss": 0.0159, + "step": 31890 + }, + { + "epoch": 3.9775561097256857, + "grad_norm": 0.00025452362024225295, + "learning_rate": 4.098254364089776e-06, + "loss": 0.0, + "step": 31900 + }, + { + "epoch": 3.978802992518703, + "grad_norm": 0.002267055446282029, + "learning_rate": 4.093266832917706e-06, + "loss": 0.0, + "step": 31910 + }, + { + "epoch": 3.9800498753117206, + "grad_norm": 0.007032580208033323, + "learning_rate": 4.088279301745636e-06, + "loss": 0.0001, + "step": 31920 + }, + { + "epoch": 3.981296758104738, + "grad_norm": 0.0001074229076039046, + "learning_rate": 4.083291770573567e-06, + "loss": 0.0, + "step": 31930 + }, + { + "epoch": 3.9825436408977555, + "grad_norm": 14.748251914978027, + "learning_rate": 4.078304239401496e-06, + "loss": 0.001, + "step": 31940 + }, + { + "epoch": 3.983790523690773, + "grad_norm": 7.268755143741146e-05, + "learning_rate": 4.073316708229427e-06, + "loss": 0.0, + "step": 31950 + }, 
+ { + "epoch": 3.985037406483791, + "grad_norm": 0.0001998672669287771, + "learning_rate": 4.068329177057357e-06, + "loss": 0.037, + "step": 31960 + }, + { + "epoch": 3.9862842892768082, + "grad_norm": 9.201698412653059e-05, + "learning_rate": 4.063341645885287e-06, + "loss": 0.0314, + "step": 31970 + }, + { + "epoch": 3.9875311720698257, + "grad_norm": 0.0019448976963758469, + "learning_rate": 4.058354114713217e-06, + "loss": 0.0, + "step": 31980 + }, + { + "epoch": 3.988778054862843, + "grad_norm": 0.0184951052069664, + "learning_rate": 4.053366583541147e-06, + "loss": 0.0688, + "step": 31990 + }, + { + "epoch": 3.9900249376558605, + "grad_norm": 0.534318208694458, + "learning_rate": 4.0483790523690776e-06, + "loss": 0.0193, + "step": 32000 + }, + { + "epoch": 3.991271820448878, + "grad_norm": 0.0036638781893998384, + "learning_rate": 4.043391521197007e-06, + "loss": 0.0, + "step": 32010 + }, + { + "epoch": 3.9925187032418954, + "grad_norm": 0.00014929140161257237, + "learning_rate": 4.038403990024938e-06, + "loss": 0.0, + "step": 32020 + }, + { + "epoch": 3.993765586034913, + "grad_norm": 0.00022338645067065954, + "learning_rate": 4.033416458852868e-06, + "loss": 0.0, + "step": 32030 + }, + { + "epoch": 3.9950124688279303, + "grad_norm": 0.004427746869623661, + "learning_rate": 4.028428927680798e-06, + "loss": 0.0, + "step": 32040 + }, + { + "epoch": 3.9962593516209477, + "grad_norm": 0.0004692091897595674, + "learning_rate": 4.023441396508729e-06, + "loss": 0.0, + "step": 32050 + }, + { + "epoch": 3.997506234413965, + "grad_norm": 0.0003176057361997664, + "learning_rate": 4.018453865336659e-06, + "loss": 0.0, + "step": 32060 + }, + { + "epoch": 3.9987531172069826, + "grad_norm": 0.0016481290804222226, + "learning_rate": 4.013466334164589e-06, + "loss": 0.0245, + "step": 32070 + }, + { + "epoch": 4.0, + "grad_norm": 0.0001445809903088957, + "learning_rate": 4.008478802992519e-06, + "loss": 0.0, + "step": 32080 + }, + { + "epoch": 4.0, + "eval_accuracy": 
0.9950745058918885, + "eval_loss": 0.03809603676199913, + "eval_runtime": 17.8987, + "eval_samples_per_second": 896.097, + "eval_steps_per_second": 56.038, + "step": 32080 + }, + { + "epoch": 4.001246882793017, + "grad_norm": 0.0001878750917967409, + "learning_rate": 4.00349127182045e-06, + "loss": 0.0001, + "step": 32090 + }, + { + "epoch": 4.002493765586035, + "grad_norm": 0.00025198431103490293, + "learning_rate": 3.9985037406483795e-06, + "loss": 0.0, + "step": 32100 + }, + { + "epoch": 4.003740648379052, + "grad_norm": 0.0003719111846294254, + "learning_rate": 3.993516209476309e-06, + "loss": 0.0197, + "step": 32110 + }, + { + "epoch": 4.00498753117207, + "grad_norm": 8.772647561272606e-05, + "learning_rate": 3.98852867830424e-06, + "loss": 0.0403, + "step": 32120 + }, + { + "epoch": 4.006234413965087, + "grad_norm": 0.0019759670831263065, + "learning_rate": 3.98354114713217e-06, + "loss": 0.0, + "step": 32130 + }, + { + "epoch": 4.007481296758105, + "grad_norm": 0.004112126771360636, + "learning_rate": 3.9785536159601e-06, + "loss": 0.0, + "step": 32140 + }, + { + "epoch": 4.008728179551122, + "grad_norm": 0.0010385045316070318, + "learning_rate": 3.97356608478803e-06, + "loss": 0.0, + "step": 32150 + }, + { + "epoch": 4.0099750623441395, + "grad_norm": 0.0010680771665647626, + "learning_rate": 3.968578553615961e-06, + "loss": 0.0, + "step": 32160 + }, + { + "epoch": 4.011221945137157, + "grad_norm": 0.00011815309699159116, + "learning_rate": 3.9635910224438905e-06, + "loss": 0.0027, + "step": 32170 + }, + { + "epoch": 4.012468827930174, + "grad_norm": 0.005580600816756487, + "learning_rate": 3.958603491271821e-06, + "loss": 0.0, + "step": 32180 + }, + { + "epoch": 4.013715710723192, + "grad_norm": 0.0022200201638042927, + "learning_rate": 3.953615960099751e-06, + "loss": 0.0, + "step": 32190 + }, + { + "epoch": 4.014962593516209, + "grad_norm": 0.0008524726727046072, + "learning_rate": 3.9486284289276815e-06, + "loss": 0.0, + "step": 32200 + }, + { + 
"epoch": 4.016209476309227, + "grad_norm": 1.3903931379318237, + "learning_rate": 3.943640897755611e-06, + "loss": 0.0001, + "step": 32210 + }, + { + "epoch": 4.017456359102244, + "grad_norm": 0.001720804488286376, + "learning_rate": 3.938653366583541e-06, + "loss": 0.0, + "step": 32220 + }, + { + "epoch": 4.0187032418952615, + "grad_norm": 0.0005697371670976281, + "learning_rate": 3.933665835411472e-06, + "loss": 0.0, + "step": 32230 + }, + { + "epoch": 4.019950124688279, + "grad_norm": 7.924262899905443e-05, + "learning_rate": 3.928678304239401e-06, + "loss": 0.0, + "step": 32240 + }, + { + "epoch": 4.021197007481296, + "grad_norm": 0.0015229410491883755, + "learning_rate": 3.923690773067332e-06, + "loss": 0.0001, + "step": 32250 + }, + { + "epoch": 4.022443890274314, + "grad_norm": 0.0004081030492670834, + "learning_rate": 3.918703241895262e-06, + "loss": 0.0, + "step": 32260 + }, + { + "epoch": 4.023690773067331, + "grad_norm": 0.002482213545590639, + "learning_rate": 3.913715710723192e-06, + "loss": 0.0, + "step": 32270 + }, + { + "epoch": 4.024937655860349, + "grad_norm": 0.05672341212630272, + "learning_rate": 3.908728179551122e-06, + "loss": 0.0001, + "step": 32280 + }, + { + "epoch": 4.026184538653367, + "grad_norm": 0.002141641452908516, + "learning_rate": 3.903740648379053e-06, + "loss": 0.0, + "step": 32290 + }, + { + "epoch": 4.027431421446384, + "grad_norm": 9.009883797261864e-05, + "learning_rate": 3.8987531172069834e-06, + "loss": 0.0, + "step": 32300 + }, + { + "epoch": 4.028678304239402, + "grad_norm": 0.0011927575105801225, + "learning_rate": 3.893765586034913e-06, + "loss": 0.0, + "step": 32310 + }, + { + "epoch": 4.029925187032419, + "grad_norm": 0.0026731493417173624, + "learning_rate": 3.888778054862844e-06, + "loss": 0.0, + "step": 32320 + }, + { + "epoch": 4.031172069825437, + "grad_norm": 0.009158428758382797, + "learning_rate": 3.883790523690774e-06, + "loss": 0.0023, + "step": 32330 + }, + { + "epoch": 4.032418952618454, + "grad_norm": 
7.04668927937746e-05, + "learning_rate": 3.878802992518703e-06, + "loss": 0.0257, + "step": 32340 + }, + { + "epoch": 4.033665835411472, + "grad_norm": 0.0011140556307509542, + "learning_rate": 3.873815461346634e-06, + "loss": 0.0, + "step": 32350 + }, + { + "epoch": 4.034912718204489, + "grad_norm": 0.0011954933870583773, + "learning_rate": 3.868827930174564e-06, + "loss": 0.0, + "step": 32360 + }, + { + "epoch": 4.0361596009975065, + "grad_norm": 0.00039964847383089364, + "learning_rate": 3.863840399002494e-06, + "loss": 0.0215, + "step": 32370 + }, + { + "epoch": 4.037406483790524, + "grad_norm": 0.09617782384157181, + "learning_rate": 3.858852867830424e-06, + "loss": 0.0125, + "step": 32380 + }, + { + "epoch": 4.038653366583541, + "grad_norm": 0.00016554733156226575, + "learning_rate": 3.853865336658355e-06, + "loss": 0.006, + "step": 32390 + }, + { + "epoch": 4.039900249376559, + "grad_norm": 0.00015381313278339803, + "learning_rate": 3.8488778054862845e-06, + "loss": 0.0, + "step": 32400 + }, + { + "epoch": 4.041147132169576, + "grad_norm": 0.00016141528612934053, + "learning_rate": 3.843890274314215e-06, + "loss": 0.0, + "step": 32410 + }, + { + "epoch": 4.042394014962594, + "grad_norm": 0.002103125909343362, + "learning_rate": 3.838902743142145e-06, + "loss": 0.0, + "step": 32420 + }, + { + "epoch": 4.043640897755611, + "grad_norm": 0.00045178926666267216, + "learning_rate": 3.833915211970075e-06, + "loss": 0.0011, + "step": 32430 + }, + { + "epoch": 4.0448877805486285, + "grad_norm": 0.00012241245713084936, + "learning_rate": 3.828927680798005e-06, + "loss": 0.0001, + "step": 32440 + }, + { + "epoch": 4.046134663341646, + "grad_norm": 0.0002882802509702742, + "learning_rate": 3.823940149625935e-06, + "loss": 0.0, + "step": 32450 + }, + { + "epoch": 4.047381546134663, + "grad_norm": 0.00012750252790283412, + "learning_rate": 3.818952618453866e-06, + "loss": 0.0001, + "step": 32460 + }, + { + "epoch": 4.048628428927681, + "grad_norm": 9.009744098875672e-05, 
+ "learning_rate": 3.813965087281796e-06, + "loss": 0.0, + "step": 32470 + }, + { + "epoch": 4.049875311720698, + "grad_norm": 0.017066551372408867, + "learning_rate": 3.8089775561097257e-06, + "loss": 0.0, + "step": 32480 + }, + { + "epoch": 4.051122194513716, + "grad_norm": 0.0004483947705011815, + "learning_rate": 3.803990024937656e-06, + "loss": 0.0, + "step": 32490 + }, + { + "epoch": 4.052369077306733, + "grad_norm": 0.009565845131874084, + "learning_rate": 3.799002493765586e-06, + "loss": 0.0, + "step": 32500 + }, + { + "epoch": 4.053615960099751, + "grad_norm": 0.0004378945450298488, + "learning_rate": 3.7940149625935163e-06, + "loss": 0.0, + "step": 32510 + }, + { + "epoch": 4.054862842892768, + "grad_norm": 0.00025369730428792536, + "learning_rate": 3.7890274314214465e-06, + "loss": 0.0, + "step": 32520 + }, + { + "epoch": 4.056109725685785, + "grad_norm": 0.00017954749637283385, + "learning_rate": 3.7840399002493767e-06, + "loss": 0.0, + "step": 32530 + }, + { + "epoch": 4.057356608478803, + "grad_norm": 0.00015994634304661304, + "learning_rate": 3.779052369077307e-06, + "loss": 0.0, + "step": 32540 + }, + { + "epoch": 4.05860349127182, + "grad_norm": 0.0004566339775919914, + "learning_rate": 3.7740648379052375e-06, + "loss": 0.0, + "step": 32550 + }, + { + "epoch": 4.059850374064838, + "grad_norm": 0.015646522864699364, + "learning_rate": 3.7690773067331677e-06, + "loss": 0.0, + "step": 32560 + }, + { + "epoch": 4.061097256857855, + "grad_norm": 0.0808909609913826, + "learning_rate": 3.764089775561098e-06, + "loss": 0.0, + "step": 32570 + }, + { + "epoch": 4.062344139650873, + "grad_norm": 0.0001575273199705407, + "learning_rate": 3.759102244389028e-06, + "loss": 0.0, + "step": 32580 + }, + { + "epoch": 4.06359102244389, + "grad_norm": 0.0014995918609201908, + "learning_rate": 3.7541147132169583e-06, + "loss": 0.0001, + "step": 32590 + }, + { + "epoch": 4.0648379052369075, + "grad_norm": 0.00016508818953298032, + "learning_rate": 3.749127182044888e-06, 
+ "loss": 0.006, + "step": 32600 + }, + { + "epoch": 4.066084788029925, + "grad_norm": 8.671959221828729e-05, + "learning_rate": 3.7441396508728182e-06, + "loss": 0.0, + "step": 32610 + }, + { + "epoch": 4.067331670822942, + "grad_norm": 0.0003152689023409039, + "learning_rate": 3.7391521197007484e-06, + "loss": 0.0, + "step": 32620 + }, + { + "epoch": 4.06857855361596, + "grad_norm": 8.40279899421148e-05, + "learning_rate": 3.7341645885286786e-06, + "loss": 0.0575, + "step": 32630 + }, + { + "epoch": 4.069825436408977, + "grad_norm": 0.00015720934607088566, + "learning_rate": 3.729177057356609e-06, + "loss": 0.0, + "step": 32640 + }, + { + "epoch": 4.071072319201995, + "grad_norm": 0.00012393532961141318, + "learning_rate": 3.724189526184539e-06, + "loss": 0.0, + "step": 32650 + }, + { + "epoch": 4.072319201995012, + "grad_norm": 0.001939267385751009, + "learning_rate": 3.7192019950124692e-06, + "loss": 0.0, + "step": 32660 + }, + { + "epoch": 4.0735660847880295, + "grad_norm": 0.00018286392150912434, + "learning_rate": 3.7142144638403994e-06, + "loss": 0.0, + "step": 32670 + }, + { + "epoch": 4.074812967581048, + "grad_norm": 7.55258442950435e-05, + "learning_rate": 3.7092269326683296e-06, + "loss": 0.0, + "step": 32680 + }, + { + "epoch": 4.076059850374065, + "grad_norm": 0.00010201766417594627, + "learning_rate": 3.70423940149626e-06, + "loss": 0.0, + "step": 32690 + }, + { + "epoch": 4.077306733167083, + "grad_norm": 0.0008396367775276303, + "learning_rate": 3.6992518703241896e-06, + "loss": 0.0, + "step": 32700 + }, + { + "epoch": 4.0785536159601, + "grad_norm": 0.0001478978811064735, + "learning_rate": 3.6942643391521198e-06, + "loss": 0.0105, + "step": 32710 + }, + { + "epoch": 4.079800498753118, + "grad_norm": 0.01776493340730667, + "learning_rate": 3.68927680798005e-06, + "loss": 0.0, + "step": 32720 + }, + { + "epoch": 4.081047381546135, + "grad_norm": 0.004389037843793631, + "learning_rate": 3.68428927680798e-06, + "loss": 0.0, + "step": 32730 + }, + { 
+ "epoch": 4.082294264339152, + "grad_norm": 0.002210435224696994, + "learning_rate": 3.6793017456359104e-06, + "loss": 0.0, + "step": 32740 + }, + { + "epoch": 4.08354114713217, + "grad_norm": 0.00010050527635030448, + "learning_rate": 3.6743142144638406e-06, + "loss": 0.0349, + "step": 32750 + }, + { + "epoch": 4.084788029925187, + "grad_norm": 0.00010919102351181209, + "learning_rate": 3.6693266832917707e-06, + "loss": 0.0, + "step": 32760 + }, + { + "epoch": 4.086034912718205, + "grad_norm": 0.0007898484473116696, + "learning_rate": 3.664339152119701e-06, + "loss": 0.0287, + "step": 32770 + }, + { + "epoch": 4.087281795511222, + "grad_norm": 0.0010982404928654432, + "learning_rate": 3.659351620947631e-06, + "loss": 0.0128, + "step": 32780 + }, + { + "epoch": 4.08852867830424, + "grad_norm": 0.000133848880068399, + "learning_rate": 3.654364089775561e-06, + "loss": 0.0, + "step": 32790 + }, + { + "epoch": 4.089775561097257, + "grad_norm": 0.0003971258702222258, + "learning_rate": 3.649376558603492e-06, + "loss": 0.0, + "step": 32800 + }, + { + "epoch": 4.0910224438902745, + "grad_norm": 0.00021624041255563498, + "learning_rate": 3.644389027431422e-06, + "loss": 0.0, + "step": 32810 + }, + { + "epoch": 4.092269326683292, + "grad_norm": 0.00039008044404909015, + "learning_rate": 3.639401496259352e-06, + "loss": 0.0002, + "step": 32820 + }, + { + "epoch": 4.093516209476309, + "grad_norm": 0.0006581239867955446, + "learning_rate": 3.634413965087282e-06, + "loss": 0.0002, + "step": 32830 + }, + { + "epoch": 4.094763092269327, + "grad_norm": 0.0005299003678373992, + "learning_rate": 3.6294264339152123e-06, + "loss": 0.0, + "step": 32840 + }, + { + "epoch": 4.096009975062344, + "grad_norm": 0.00010087557166116312, + "learning_rate": 3.6244389027431425e-06, + "loss": 0.0001, + "step": 32850 + }, + { + "epoch": 4.097256857855362, + "grad_norm": 0.0001816202566260472, + "learning_rate": 3.6194513715710727e-06, + "loss": 0.0001, + "step": 32860 + }, + { + "epoch": 
4.098503740648379, + "grad_norm": 0.0035556235816329718, + "learning_rate": 3.614463840399003e-06, + "loss": 0.0001, + "step": 32870 + }, + { + "epoch": 4.0997506234413965, + "grad_norm": 0.0010074899764731526, + "learning_rate": 3.609476309226933e-06, + "loss": 0.0, + "step": 32880 + }, + { + "epoch": 4.100997506234414, + "grad_norm": 0.000434140587458387, + "learning_rate": 3.6044887780548633e-06, + "loss": 0.0, + "step": 32890 + }, + { + "epoch": 4.102244389027431, + "grad_norm": 0.00044290581718087196, + "learning_rate": 3.5995012468827935e-06, + "loss": 0.0, + "step": 32900 + }, + { + "epoch": 4.103491271820449, + "grad_norm": 0.00020887772552669048, + "learning_rate": 3.5945137157107237e-06, + "loss": 0.0, + "step": 32910 + }, + { + "epoch": 4.104738154613466, + "grad_norm": 0.0007029934786260128, + "learning_rate": 3.5895261845386535e-06, + "loss": 0.0, + "step": 32920 + }, + { + "epoch": 4.105985037406484, + "grad_norm": 0.0001252719812327996, + "learning_rate": 3.5845386533665837e-06, + "loss": 0.0, + "step": 32930 + }, + { + "epoch": 4.107231920199501, + "grad_norm": 0.0001325898483628407, + "learning_rate": 3.579551122194514e-06, + "loss": 0.0, + "step": 32940 + }, + { + "epoch": 4.1084788029925186, + "grad_norm": 0.00044591727782972157, + "learning_rate": 3.574563591022444e-06, + "loss": 0.0, + "step": 32950 + }, + { + "epoch": 4.109725685785536, + "grad_norm": 0.0007702121511101723, + "learning_rate": 3.5695760598503742e-06, + "loss": 0.0, + "step": 32960 + }, + { + "epoch": 4.110972568578553, + "grad_norm": 0.00013088526611682028, + "learning_rate": 3.5645885286783044e-06, + "loss": 0.0625, + "step": 32970 + }, + { + "epoch": 4.112219451371571, + "grad_norm": 0.10816075652837753, + "learning_rate": 3.5596009975062346e-06, + "loss": 0.0, + "step": 32980 + }, + { + "epoch": 4.113466334164588, + "grad_norm": 0.00041107553988695145, + "learning_rate": 3.554613466334165e-06, + "loss": 0.0, + "step": 32990 + }, + { + "epoch": 4.114713216957606, + 
"grad_norm": 0.00045265851076692343, + "learning_rate": 3.549625935162095e-06, + "loss": 0.0, + "step": 33000 + }, + { + "epoch": 4.115960099750623, + "grad_norm": 0.00031145347747951746, + "learning_rate": 3.5446384039900252e-06, + "loss": 0.0, + "step": 33010 + }, + { + "epoch": 4.117206982543641, + "grad_norm": 0.0004202498821541667, + "learning_rate": 3.539650872817955e-06, + "loss": 0.0011, + "step": 33020 + }, + { + "epoch": 4.118453865336658, + "grad_norm": 0.00154070311691612, + "learning_rate": 3.534663341645885e-06, + "loss": 0.0, + "step": 33030 + }, + { + "epoch": 4.1197007481296755, + "grad_norm": 0.03240791708230972, + "learning_rate": 3.529675810473816e-06, + "loss": 0.0, + "step": 33040 + }, + { + "epoch": 4.120947630922693, + "grad_norm": 0.00011043441918445751, + "learning_rate": 3.524688279301746e-06, + "loss": 0.0, + "step": 33050 + }, + { + "epoch": 4.12219451371571, + "grad_norm": 0.00030088701169006526, + "learning_rate": 3.519700748129676e-06, + "loss": 0.0, + "step": 33060 + }, + { + "epoch": 4.123441396508728, + "grad_norm": 0.0001598861563252285, + "learning_rate": 3.5147132169576064e-06, + "loss": 0.0432, + "step": 33070 + }, + { + "epoch": 4.124688279301745, + "grad_norm": 0.0008438412332907319, + "learning_rate": 3.5097256857855366e-06, + "loss": 0.0, + "step": 33080 + }, + { + "epoch": 4.1259351620947635, + "grad_norm": 0.0008505359292030334, + "learning_rate": 3.504738154613467e-06, + "loss": 0.0002, + "step": 33090 + }, + { + "epoch": 4.127182044887781, + "grad_norm": 0.0002946627791970968, + "learning_rate": 3.499750623441397e-06, + "loss": 0.0, + "step": 33100 + }, + { + "epoch": 4.128428927680798, + "grad_norm": 0.0001894429442472756, + "learning_rate": 3.494763092269327e-06, + "loss": 0.0, + "step": 33110 + }, + { + "epoch": 4.129675810473816, + "grad_norm": 0.00012741118553094566, + "learning_rate": 3.4897755610972574e-06, + "loss": 0.0, + "step": 33120 + }, + { + "epoch": 4.130922693266833, + "grad_norm": 0.002783777192234993, 
+ "learning_rate": 3.4847880299251876e-06, + "loss": 0.0, + "step": 33130 + }, + { + "epoch": 4.132169576059851, + "grad_norm": 8.558244735468179e-05, + "learning_rate": 3.4798004987531173e-06, + "loss": 0.0, + "step": 33140 + }, + { + "epoch": 4.133416458852868, + "grad_norm": 0.005597286857664585, + "learning_rate": 3.4748129675810475e-06, + "loss": 0.0, + "step": 33150 + }, + { + "epoch": 4.134663341645886, + "grad_norm": 0.0004539691435638815, + "learning_rate": 3.4698254364089777e-06, + "loss": 0.0, + "step": 33160 + }, + { + "epoch": 4.135910224438903, + "grad_norm": 0.0011278591118752956, + "learning_rate": 3.464837905236908e-06, + "loss": 0.0001, + "step": 33170 + }, + { + "epoch": 4.13715710723192, + "grad_norm": 41.194732666015625, + "learning_rate": 3.459850374064838e-06, + "loss": 0.0266, + "step": 33180 + }, + { + "epoch": 4.138403990024938, + "grad_norm": 0.0007619396201334894, + "learning_rate": 3.4548628428927683e-06, + "loss": 0.0, + "step": 33190 + }, + { + "epoch": 4.139650872817955, + "grad_norm": 0.0006730849854648113, + "learning_rate": 3.4498753117206985e-06, + "loss": 0.0, + "step": 33200 + }, + { + "epoch": 4.140897755610973, + "grad_norm": 0.004926951136440039, + "learning_rate": 3.4448877805486287e-06, + "loss": 0.0009, + "step": 33210 + }, + { + "epoch": 4.14214463840399, + "grad_norm": 0.0021408579777926207, + "learning_rate": 3.439900249376559e-06, + "loss": 0.0, + "step": 33220 + }, + { + "epoch": 4.143391521197008, + "grad_norm": 0.0005824111867696047, + "learning_rate": 3.434912718204489e-06, + "loss": 0.0, + "step": 33230 + }, + { + "epoch": 4.144638403990025, + "grad_norm": 0.00010687837493605912, + "learning_rate": 3.429925187032419e-06, + "loss": 0.0, + "step": 33240 + }, + { + "epoch": 4.1458852867830425, + "grad_norm": 9.500970190856606e-05, + "learning_rate": 3.424937655860349e-06, + "loss": 0.0, + "step": 33250 + }, + { + "epoch": 4.14713216957606, + "grad_norm": 0.00021169520914554596, + "learning_rate": 
3.4199501246882793e-06, + "loss": 0.0, + "step": 33260 + }, + { + "epoch": 4.148379052369077, + "grad_norm": 0.0015832402277737856, + "learning_rate": 3.4149625935162095e-06, + "loss": 0.0027, + "step": 33270 + }, + { + "epoch": 4.149625935162095, + "grad_norm": 0.0007897487957961857, + "learning_rate": 3.4099750623441397e-06, + "loss": 0.0, + "step": 33280 + }, + { + "epoch": 4.150872817955112, + "grad_norm": 0.00020493895863182843, + "learning_rate": 3.4049875311720703e-06, + "loss": 0.0, + "step": 33290 + }, + { + "epoch": 4.15211970074813, + "grad_norm": 0.0025839328300207853, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.0003, + "step": 33300 + }, + { + "epoch": 4.153366583541147, + "grad_norm": 0.00012474734103307128, + "learning_rate": 3.3950124688279307e-06, + "loss": 0.0, + "step": 33310 + }, + { + "epoch": 4.1546134663341645, + "grad_norm": 0.00013193800987210125, + "learning_rate": 3.390024937655861e-06, + "loss": 0.0, + "step": 33320 + }, + { + "epoch": 4.155860349127182, + "grad_norm": 0.000410831329645589, + "learning_rate": 3.385037406483791e-06, + "loss": 0.001, + "step": 33330 + }, + { + "epoch": 4.157107231920199, + "grad_norm": 0.00024243925872724503, + "learning_rate": 3.3800498753117213e-06, + "loss": 0.0029, + "step": 33340 + }, + { + "epoch": 4.158354114713217, + "grad_norm": 0.0016697397222742438, + "learning_rate": 3.3750623441396515e-06, + "loss": 0.0, + "step": 33350 + }, + { + "epoch": 4.159600997506234, + "grad_norm": 0.0011250133393332362, + "learning_rate": 3.3700748129675812e-06, + "loss": 0.0, + "step": 33360 + }, + { + "epoch": 4.160847880299252, + "grad_norm": 0.0002701326156966388, + "learning_rate": 3.3650872817955114e-06, + "loss": 0.0001, + "step": 33370 + }, + { + "epoch": 4.162094763092269, + "grad_norm": 0.013247229158878326, + "learning_rate": 3.3600997506234416e-06, + "loss": 0.0278, + "step": 33380 + }, + { + "epoch": 4.1633416458852865, + "grad_norm": 0.00168042560108006, + "learning_rate": 
3.355112219451372e-06, + "loss": 0.0001, + "step": 33390 + }, + { + "epoch": 4.164588528678304, + "grad_norm": 0.00028758039115928113, + "learning_rate": 3.350124688279302e-06, + "loss": 0.0, + "step": 33400 + }, + { + "epoch": 4.165835411471321, + "grad_norm": 0.00012562373012769967, + "learning_rate": 3.3451371571072322e-06, + "loss": 0.0, + "step": 33410 + }, + { + "epoch": 4.167082294264339, + "grad_norm": 0.00406464422121644, + "learning_rate": 3.3401496259351624e-06, + "loss": 0.0, + "step": 33420 + }, + { + "epoch": 4.168329177057356, + "grad_norm": 0.00012497564603108913, + "learning_rate": 3.3351620947630926e-06, + "loss": 0.0, + "step": 33430 + }, + { + "epoch": 4.169576059850374, + "grad_norm": 0.00038507606950588524, + "learning_rate": 3.330174563591023e-06, + "loss": 0.0, + "step": 33440 + }, + { + "epoch": 4.170822942643391, + "grad_norm": 6.687084533041343e-05, + "learning_rate": 3.325187032418953e-06, + "loss": 0.0, + "step": 33450 + }, + { + "epoch": 4.172069825436409, + "grad_norm": 4.686644388129935e-05, + "learning_rate": 3.3201995012468828e-06, + "loss": 0.0, + "step": 33460 + }, + { + "epoch": 4.173316708229426, + "grad_norm": 0.0007018337491899729, + "learning_rate": 3.315211970074813e-06, + "loss": 0.0, + "step": 33470 + }, + { + "epoch": 4.174563591022444, + "grad_norm": 0.004306942690163851, + "learning_rate": 3.310224438902743e-06, + "loss": 0.0, + "step": 33480 + }, + { + "epoch": 4.175810473815462, + "grad_norm": 0.03564343601465225, + "learning_rate": 3.3052369077306734e-06, + "loss": 0.0, + "step": 33490 + }, + { + "epoch": 4.177057356608479, + "grad_norm": 0.00011154711683047935, + "learning_rate": 3.3002493765586036e-06, + "loss": 0.0, + "step": 33500 + }, + { + "epoch": 4.178304239401497, + "grad_norm": 0.001141234184615314, + "learning_rate": 3.2952618453865337e-06, + "loss": 0.0, + "step": 33510 + }, + { + "epoch": 4.179551122194514, + "grad_norm": 0.00015788464224897325, + "learning_rate": 3.290274314214464e-06, + "loss": 0.0, + 
"step": 33520 + }, + { + "epoch": 4.1807980049875315, + "grad_norm": 9.262099047191441e-05, + "learning_rate": 3.285286783042394e-06, + "loss": 0.0, + "step": 33530 + }, + { + "epoch": 4.182044887780549, + "grad_norm": 9.043302270583808e-05, + "learning_rate": 3.2802992518703248e-06, + "loss": 0.0, + "step": 33540 + }, + { + "epoch": 4.183291770573566, + "grad_norm": 0.00017372763250023127, + "learning_rate": 3.275311720698255e-06, + "loss": 0.0, + "step": 33550 + }, + { + "epoch": 4.184538653366584, + "grad_norm": 0.00010978298087138683, + "learning_rate": 3.270324189526185e-06, + "loss": 0.0, + "step": 33560 + }, + { + "epoch": 4.185785536159601, + "grad_norm": 0.0002875495993066579, + "learning_rate": 3.2653366583541153e-06, + "loss": 0.0003, + "step": 33570 + }, + { + "epoch": 4.187032418952619, + "grad_norm": 0.33438995480537415, + "learning_rate": 3.260349127182045e-06, + "loss": 0.0001, + "step": 33580 + }, + { + "epoch": 4.188279301745636, + "grad_norm": 0.0013525570975616574, + "learning_rate": 3.2553615960099753e-06, + "loss": 0.0, + "step": 33590 + }, + { + "epoch": 4.1895261845386536, + "grad_norm": 0.00015009859635028988, + "learning_rate": 3.2503740648379055e-06, + "loss": 0.0, + "step": 33600 + }, + { + "epoch": 4.190773067331671, + "grad_norm": 0.2856062352657318, + "learning_rate": 3.2453865336658357e-06, + "loss": 0.0001, + "step": 33610 + }, + { + "epoch": 4.192019950124688, + "grad_norm": 0.00011341932986397296, + "learning_rate": 3.240399002493766e-06, + "loss": 0.0, + "step": 33620 + }, + { + "epoch": 4.193266832917706, + "grad_norm": 0.000367656844900921, + "learning_rate": 3.235411471321696e-06, + "loss": 0.0035, + "step": 33630 + }, + { + "epoch": 4.194513715710723, + "grad_norm": 0.0016489146510139108, + "learning_rate": 3.2304239401496263e-06, + "loss": 0.0, + "step": 33640 + }, + { + "epoch": 4.195760598503741, + "grad_norm": 0.039787132292985916, + "learning_rate": 3.2254364089775565e-06, + "loss": 0.0, + "step": 33650 + }, + { + 
"epoch": 4.197007481296758, + "grad_norm": 0.00011767564137699082, + "learning_rate": 3.2204488778054867e-06, + "loss": 0.0, + "step": 33660 + }, + { + "epoch": 4.198254364089776, + "grad_norm": 0.007115756161510944, + "learning_rate": 3.215461346633417e-06, + "loss": 0.0, + "step": 33670 + }, + { + "epoch": 4.199501246882793, + "grad_norm": 0.001513071358203888, + "learning_rate": 3.2104738154613467e-06, + "loss": 0.0486, + "step": 33680 + }, + { + "epoch": 4.2007481296758105, + "grad_norm": 0.00044022343354299664, + "learning_rate": 3.205486284289277e-06, + "loss": 0.0, + "step": 33690 + }, + { + "epoch": 4.201995012468828, + "grad_norm": 0.00032083020778372884, + "learning_rate": 3.200498753117207e-06, + "loss": 0.0, + "step": 33700 + }, + { + "epoch": 4.203241895261845, + "grad_norm": 0.0002220886672148481, + "learning_rate": 3.1955112219451372e-06, + "loss": 0.0, + "step": 33710 + }, + { + "epoch": 4.204488778054863, + "grad_norm": 0.0044029937125742435, + "learning_rate": 3.1905236907730674e-06, + "loss": 0.0154, + "step": 33720 + }, + { + "epoch": 4.20573566084788, + "grad_norm": 0.004143120255321264, + "learning_rate": 3.1855361596009976e-06, + "loss": 0.0483, + "step": 33730 + }, + { + "epoch": 4.206982543640898, + "grad_norm": 0.00012014318781439215, + "learning_rate": 3.180548628428928e-06, + "loss": 0.0, + "step": 33740 + }, + { + "epoch": 4.208229426433915, + "grad_norm": 0.0004365080676507205, + "learning_rate": 3.175561097256858e-06, + "loss": 0.0, + "step": 33750 + }, + { + "epoch": 4.2094763092269325, + "grad_norm": 0.0001294610119657591, + "learning_rate": 3.1705735660847882e-06, + "loss": 0.0, + "step": 33760 + }, + { + "epoch": 4.21072319201995, + "grad_norm": 0.0007133973995223641, + "learning_rate": 3.1655860349127184e-06, + "loss": 0.0001, + "step": 33770 + }, + { + "epoch": 4.211970074812967, + "grad_norm": 0.00016860711912158877, + "learning_rate": 3.160598503740648e-06, + "loss": 0.0412, + "step": 33780 + }, + { + "epoch": 
4.213216957605985, + "grad_norm": 0.00022174940386321396, + "learning_rate": 3.1556109725685792e-06, + "loss": 0.0001, + "step": 33790 + }, + { + "epoch": 4.214463840399002, + "grad_norm": 0.0011991856154054403, + "learning_rate": 3.1506234413965094e-06, + "loss": 0.0, + "step": 33800 + }, + { + "epoch": 4.21571072319202, + "grad_norm": 0.0002086485328618437, + "learning_rate": 3.145635910224439e-06, + "loss": 0.0, + "step": 33810 + }, + { + "epoch": 4.216957605985037, + "grad_norm": 0.00021221544011496007, + "learning_rate": 3.1406483790523694e-06, + "loss": 0.0007, + "step": 33820 + }, + { + "epoch": 4.2182044887780545, + "grad_norm": 0.0001043640440911986, + "learning_rate": 3.1356608478802996e-06, + "loss": 0.0062, + "step": 33830 + }, + { + "epoch": 4.219451371571072, + "grad_norm": 0.0013198963133618236, + "learning_rate": 3.13067331670823e-06, + "loss": 0.0023, + "step": 33840 + }, + { + "epoch": 4.220698254364089, + "grad_norm": 0.0001692106743576005, + "learning_rate": 3.12568578553616e-06, + "loss": 0.0616, + "step": 33850 + }, + { + "epoch": 4.221945137157107, + "grad_norm": 0.0012058233842253685, + "learning_rate": 3.12069825436409e-06, + "loss": 0.0, + "step": 33860 + }, + { + "epoch": 4.223192019950124, + "grad_norm": 0.0029543384443968534, + "learning_rate": 3.1157107231920204e-06, + "loss": 0.0, + "step": 33870 + }, + { + "epoch": 4.224438902743142, + "grad_norm": 0.00011094296496594325, + "learning_rate": 3.1107231920199506e-06, + "loss": 0.0298, + "step": 33880 + }, + { + "epoch": 4.225685785536159, + "grad_norm": 0.00020665784541051835, + "learning_rate": 3.1057356608478808e-06, + "loss": 0.0001, + "step": 33890 + }, + { + "epoch": 4.2269326683291775, + "grad_norm": 0.0001479942729929462, + "learning_rate": 3.1007481296758105e-06, + "loss": 0.0, + "step": 33900 + }, + { + "epoch": 4.228179551122195, + "grad_norm": 42.932716369628906, + "learning_rate": 3.0957605985037407e-06, + "loss": 0.0041, + "step": 33910 + }, + { + "epoch": 
4.229426433915212, + "grad_norm": 0.00013323694292921573, + "learning_rate": 3.090773067331671e-06, + "loss": 0.0, + "step": 33920 + }, + { + "epoch": 4.23067331670823, + "grad_norm": 0.008350051939487457, + "learning_rate": 3.085785536159601e-06, + "loss": 0.0006, + "step": 33930 + }, + { + "epoch": 4.231920199501247, + "grad_norm": 0.003039455274119973, + "learning_rate": 3.0807980049875313e-06, + "loss": 0.0001, + "step": 33940 + }, + { + "epoch": 4.233167082294265, + "grad_norm": 0.00027307405252940953, + "learning_rate": 3.0758104738154615e-06, + "loss": 0.0, + "step": 33950 + }, + { + "epoch": 4.234413965087282, + "grad_norm": 0.0001598122325958684, + "learning_rate": 3.0708229426433917e-06, + "loss": 0.0, + "step": 33960 + }, + { + "epoch": 4.2356608478802995, + "grad_norm": 0.00011701675248332322, + "learning_rate": 3.065835411471322e-06, + "loss": 0.0184, + "step": 33970 + }, + { + "epoch": 4.236907730673317, + "grad_norm": 0.0001561442913953215, + "learning_rate": 3.060847880299252e-06, + "loss": 0.0, + "step": 33980 + }, + { + "epoch": 4.238154613466334, + "grad_norm": 0.0002617475111037493, + "learning_rate": 3.0558603491271823e-06, + "loss": 0.0016, + "step": 33990 + }, + { + "epoch": 4.239401496259352, + "grad_norm": 0.0004843343631364405, + "learning_rate": 3.050872817955112e-06, + "loss": 0.0103, + "step": 34000 + }, + { + "epoch": 4.240648379052369, + "grad_norm": 0.00012468890054151416, + "learning_rate": 3.0458852867830423e-06, + "loss": 0.0002, + "step": 34010 + }, + { + "epoch": 4.241895261845387, + "grad_norm": 0.0029277161229401827, + "learning_rate": 3.0408977556109725e-06, + "loss": 0.0, + "step": 34020 + }, + { + "epoch": 4.243142144638404, + "grad_norm": 9.913599933497608e-05, + "learning_rate": 3.035910224438903e-06, + "loss": 0.0, + "step": 34030 + }, + { + "epoch": 4.2443890274314215, + "grad_norm": 8.990171772893518e-05, + "learning_rate": 3.0309226932668333e-06, + "loss": 0.0285, + "step": 34040 + }, + { + "epoch": 4.245635910224439, 
+ "grad_norm": 0.00037068844540044665, + "learning_rate": 3.0259351620947635e-06, + "loss": 0.0455, + "step": 34050 + }, + { + "epoch": 4.246882793017456, + "grad_norm": 0.0005540825659409165, + "learning_rate": 3.0209476309226937e-06, + "loss": 0.0752, + "step": 34060 + }, + { + "epoch": 4.248129675810474, + "grad_norm": 0.20835602283477783, + "learning_rate": 3.015960099750624e-06, + "loss": 0.0, + "step": 34070 + }, + { + "epoch": 4.249376558603491, + "grad_norm": 0.006817484740167856, + "learning_rate": 3.010972568578554e-06, + "loss": 0.0, + "step": 34080 + }, + { + "epoch": 4.250623441396509, + "grad_norm": 0.00453070318326354, + "learning_rate": 3.0059850374064843e-06, + "loss": 0.0, + "step": 34090 + }, + { + "epoch": 4.251870324189526, + "grad_norm": 0.00016027323727030307, + "learning_rate": 3.0009975062344145e-06, + "loss": 0.0, + "step": 34100 + }, + { + "epoch": 4.253117206982544, + "grad_norm": 0.0012369597097858787, + "learning_rate": 2.9960099750623447e-06, + "loss": 0.0001, + "step": 34110 + }, + { + "epoch": 4.254364089775561, + "grad_norm": 0.00018037218251265585, + "learning_rate": 2.9910224438902744e-06, + "loss": 0.0006, + "step": 34120 + }, + { + "epoch": 4.2556109725685785, + "grad_norm": 0.001004421734251082, + "learning_rate": 2.9860349127182046e-06, + "loss": 0.0, + "step": 34130 + }, + { + "epoch": 4.256857855361596, + "grad_norm": 0.00010146049316972494, + "learning_rate": 2.981047381546135e-06, + "loss": 0.0, + "step": 34140 + }, + { + "epoch": 4.258104738154613, + "grad_norm": 0.00021762358665000647, + "learning_rate": 2.976059850374065e-06, + "loss": 0.0022, + "step": 34150 + }, + { + "epoch": 4.259351620947631, + "grad_norm": 0.0008794396417215466, + "learning_rate": 2.9710723192019952e-06, + "loss": 0.0, + "step": 34160 + }, + { + "epoch": 4.260598503740648, + "grad_norm": 0.0002460590039845556, + "learning_rate": 2.9660847880299254e-06, + "loss": 0.0, + "step": 34170 + }, + { + "epoch": 4.261845386533666, + "grad_norm": 
0.0002297249884577468, + "learning_rate": 2.9610972568578556e-06, + "loss": 0.0013, + "step": 34180 + }, + { + "epoch": 4.263092269326683, + "grad_norm": 0.2807936668395996, + "learning_rate": 2.956109725685786e-06, + "loss": 0.0001, + "step": 34190 + }, + { + "epoch": 4.2643391521197005, + "grad_norm": 0.0002086867461912334, + "learning_rate": 2.951122194513716e-06, + "loss": 0.0, + "step": 34200 + }, + { + "epoch": 4.265586034912718, + "grad_norm": 0.00010574016778264195, + "learning_rate": 2.946134663341646e-06, + "loss": 0.0, + "step": 34210 + }, + { + "epoch": 4.266832917705735, + "grad_norm": 0.0005821581580676138, + "learning_rate": 2.941147132169576e-06, + "loss": 0.0055, + "step": 34220 + }, + { + "epoch": 4.268079800498753, + "grad_norm": 0.08548212796449661, + "learning_rate": 2.936159600997506e-06, + "loss": 0.0001, + "step": 34230 + }, + { + "epoch": 4.26932668329177, + "grad_norm": 0.0003080704773310572, + "learning_rate": 2.9311720698254364e-06, + "loss": 0.0, + "step": 34240 + }, + { + "epoch": 4.270573566084788, + "grad_norm": 0.024417024105787277, + "learning_rate": 2.9261845386533666e-06, + "loss": 0.0, + "step": 34250 + }, + { + "epoch": 4.271820448877805, + "grad_norm": 0.00023198517737910151, + "learning_rate": 2.9211970074812967e-06, + "loss": 0.0352, + "step": 34260 + }, + { + "epoch": 4.2730673316708225, + "grad_norm": 0.0014956368831917644, + "learning_rate": 2.916209476309227e-06, + "loss": 0.0, + "step": 34270 + }, + { + "epoch": 4.274314214463841, + "grad_norm": 0.000349792797351256, + "learning_rate": 2.9112219451371576e-06, + "loss": 0.0, + "step": 34280 + }, + { + "epoch": 4.275561097256858, + "grad_norm": 0.0010751100489869714, + "learning_rate": 2.9062344139650878e-06, + "loss": 0.0, + "step": 34290 + }, + { + "epoch": 4.276807980049876, + "grad_norm": 0.00015223190712276846, + "learning_rate": 2.901246882793018e-06, + "loss": 0.0, + "step": 34300 + }, + { + "epoch": 4.278054862842893, + "grad_norm": 0.00022834049013908952, + 
"learning_rate": 2.896259351620948e-06, + "loss": 0.0, + "step": 34310 + }, + { + "epoch": 4.279301745635911, + "grad_norm": 0.0009195749298669398, + "learning_rate": 2.8912718204488783e-06, + "loss": 0.0, + "step": 34320 + }, + { + "epoch": 4.280548628428928, + "grad_norm": 0.00018121050379704684, + "learning_rate": 2.8862842892768085e-06, + "loss": 0.0, + "step": 34330 + }, + { + "epoch": 4.2817955112219455, + "grad_norm": 0.003172141034156084, + "learning_rate": 2.8812967581047383e-06, + "loss": 0.0, + "step": 34340 + }, + { + "epoch": 4.283042394014963, + "grad_norm": 0.00037091257399879396, + "learning_rate": 2.8763092269326685e-06, + "loss": 0.0, + "step": 34350 + }, + { + "epoch": 4.28428927680798, + "grad_norm": 0.0002679352182894945, + "learning_rate": 2.8713216957605987e-06, + "loss": 0.0, + "step": 34360 + }, + { + "epoch": 4.285536159600998, + "grad_norm": 0.00014105028822086751, + "learning_rate": 2.866334164588529e-06, + "loss": 0.0, + "step": 34370 + }, + { + "epoch": 4.286783042394015, + "grad_norm": 0.0001462107029510662, + "learning_rate": 2.861346633416459e-06, + "loss": 0.0, + "step": 34380 + }, + { + "epoch": 4.288029925187033, + "grad_norm": 0.007252872921526432, + "learning_rate": 2.8563591022443893e-06, + "loss": 0.0032, + "step": 34390 + }, + { + "epoch": 4.28927680798005, + "grad_norm": 8.567462646169588e-05, + "learning_rate": 2.8513715710723195e-06, + "loss": 0.0085, + "step": 34400 + }, + { + "epoch": 4.2905236907730675, + "grad_norm": 6.732952897436917e-05, + "learning_rate": 2.8463840399002497e-06, + "loss": 0.0, + "step": 34410 + }, + { + "epoch": 4.291770573566085, + "grad_norm": 22.605783462524414, + "learning_rate": 2.84139650872818e-06, + "loss": 0.0234, + "step": 34420 + }, + { + "epoch": 4.293017456359102, + "grad_norm": 0.00013546543777920306, + "learning_rate": 2.83640897755611e-06, + "loss": 0.0, + "step": 34430 + }, + { + "epoch": 4.29426433915212, + "grad_norm": 0.0021784852724522352, + "learning_rate": 
2.83142144638404e-06, + "loss": 0.0, + "step": 34440 + }, + { + "epoch": 4.295511221945137, + "grad_norm": 0.0005654957494698465, + "learning_rate": 2.82643391521197e-06, + "loss": 0.026, + "step": 34450 + }, + { + "epoch": 4.296758104738155, + "grad_norm": 0.00012377439998090267, + "learning_rate": 2.8214463840399002e-06, + "loss": 0.024, + "step": 34460 + }, + { + "epoch": 4.298004987531172, + "grad_norm": 0.00014131332864053547, + "learning_rate": 2.8164588528678304e-06, + "loss": 0.0, + "step": 34470 + }, + { + "epoch": 4.2992518703241895, + "grad_norm": 0.0001872741268016398, + "learning_rate": 2.8114713216957606e-06, + "loss": 0.0, + "step": 34480 + }, + { + "epoch": 4.300498753117207, + "grad_norm": 0.000619906117208302, + "learning_rate": 2.806483790523691e-06, + "loss": 0.0158, + "step": 34490 + }, + { + "epoch": 4.301745635910224, + "grad_norm": 0.008459209464490414, + "learning_rate": 2.801496259351621e-06, + "loss": 0.0001, + "step": 34500 + }, + { + "epoch": 4.302992518703242, + "grad_norm": 0.0034343558363616467, + "learning_rate": 2.7965087281795512e-06, + "loss": 0.0, + "step": 34510 + }, + { + "epoch": 4.304239401496259, + "grad_norm": 0.0003617657348513603, + "learning_rate": 2.7915211970074814e-06, + "loss": 0.0335, + "step": 34520 + }, + { + "epoch": 4.305486284289277, + "grad_norm": 0.00014825259859208018, + "learning_rate": 2.786533665835412e-06, + "loss": 0.0001, + "step": 34530 + }, + { + "epoch": 4.306733167082294, + "grad_norm": 0.00040610635187476873, + "learning_rate": 2.7815461346633422e-06, + "loss": 0.0, + "step": 34540 + }, + { + "epoch": 4.307980049875312, + "grad_norm": 0.0001523315440863371, + "learning_rate": 2.7765586034912724e-06, + "loss": 0.0, + "step": 34550 + }, + { + "epoch": 4.309226932668329, + "grad_norm": 0.00016806507483124733, + "learning_rate": 2.7715710723192026e-06, + "loss": 0.0, + "step": 34560 + }, + { + "epoch": 4.3104738154613464, + "grad_norm": 0.000623812957201153, + "learning_rate": 2.7665835411471324e-06, 
+ "loss": 0.0, + "step": 34570 + }, + { + "epoch": 4.311720698254364, + "grad_norm": 0.0008510535699315369, + "learning_rate": 2.7615960099750626e-06, + "loss": 0.0286, + "step": 34580 + }, + { + "epoch": 4.312967581047381, + "grad_norm": 0.0007971362792886794, + "learning_rate": 2.756608478802993e-06, + "loss": 0.0, + "step": 34590 + }, + { + "epoch": 4.314214463840399, + "grad_norm": 0.0081477090716362, + "learning_rate": 2.751620947630923e-06, + "loss": 0.0001, + "step": 34600 + }, + { + "epoch": 4.315461346633416, + "grad_norm": 0.00035156356170773506, + "learning_rate": 2.746633416458853e-06, + "loss": 0.0, + "step": 34610 + }, + { + "epoch": 4.316708229426434, + "grad_norm": 0.0002370004658587277, + "learning_rate": 2.7416458852867834e-06, + "loss": 0.0, + "step": 34620 + }, + { + "epoch": 4.317955112219451, + "grad_norm": 0.0005394717445597053, + "learning_rate": 2.7366583541147136e-06, + "loss": 0.0, + "step": 34630 + }, + { + "epoch": 4.3192019950124685, + "grad_norm": 4.46143421868328e-05, + "learning_rate": 2.7316708229426438e-06, + "loss": 0.0, + "step": 34640 + }, + { + "epoch": 4.320448877805486, + "grad_norm": 5.566642357734963e-05, + "learning_rate": 2.726683291770574e-06, + "loss": 0.0163, + "step": 34650 + }, + { + "epoch": 4.321695760598503, + "grad_norm": 0.00012989880633540452, + "learning_rate": 2.7216957605985037e-06, + "loss": 0.0, + "step": 34660 + }, + { + "epoch": 4.322942643391521, + "grad_norm": 0.00028789890347979963, + "learning_rate": 2.716708229426434e-06, + "loss": 0.0103, + "step": 34670 + }, + { + "epoch": 4.324189526184538, + "grad_norm": 0.00021803946583531797, + "learning_rate": 2.711720698254364e-06, + "loss": 0.0, + "step": 34680 + }, + { + "epoch": 4.325436408977556, + "grad_norm": 0.0003036449197679758, + "learning_rate": 2.7067331670822943e-06, + "loss": 0.0, + "step": 34690 + }, + { + "epoch": 4.326683291770574, + "grad_norm": 0.0001597816008143127, + "learning_rate": 2.7017456359102245e-06, + "loss": 0.0, + "step": 
34700 + }, + { + "epoch": 4.327930174563591, + "grad_norm": 51.117496490478516, + "learning_rate": 2.6967581047381547e-06, + "loss": 0.0202, + "step": 34710 + }, + { + "epoch": 4.329177057356609, + "grad_norm": 0.00017285061767324805, + "learning_rate": 2.691770573566085e-06, + "loss": 0.0, + "step": 34720 + }, + { + "epoch": 4.330423940149626, + "grad_norm": 0.006054338067770004, + "learning_rate": 2.686783042394015e-06, + "loss": 0.0, + "step": 34730 + }, + { + "epoch": 4.331670822942644, + "grad_norm": 0.0001271928776986897, + "learning_rate": 2.6817955112219453e-06, + "loss": 0.0, + "step": 34740 + }, + { + "epoch": 4.332917705735661, + "grad_norm": 0.0003265069390181452, + "learning_rate": 2.6768079800498755e-06, + "loss": 0.0, + "step": 34750 + }, + { + "epoch": 4.334164588528679, + "grad_norm": 0.00024125789059326053, + "learning_rate": 2.6718204488778053e-06, + "loss": 0.0135, + "step": 34760 + }, + { + "epoch": 4.335411471321696, + "grad_norm": 0.00010157257929677144, + "learning_rate": 2.6668329177057355e-06, + "loss": 0.0, + "step": 34770 + }, + { + "epoch": 4.3366583541147135, + "grad_norm": 0.0003305143618490547, + "learning_rate": 2.6618453865336665e-06, + "loss": 0.0, + "step": 34780 + }, + { + "epoch": 4.337905236907731, + "grad_norm": 0.00010430354450363666, + "learning_rate": 2.6568578553615963e-06, + "loss": 0.0, + "step": 34790 + }, + { + "epoch": 4.339152119700748, + "grad_norm": 0.0006443153251893818, + "learning_rate": 2.6518703241895265e-06, + "loss": 0.0001, + "step": 34800 + }, + { + "epoch": 4.340399002493766, + "grad_norm": 0.00023026694543659687, + "learning_rate": 2.6468827930174567e-06, + "loss": 0.0001, + "step": 34810 + }, + { + "epoch": 4.341645885286783, + "grad_norm": 0.0033188401721417904, + "learning_rate": 2.641895261845387e-06, + "loss": 0.0001, + "step": 34820 + }, + { + "epoch": 4.342892768079801, + "grad_norm": 0.00031385180773213506, + "learning_rate": 2.636907730673317e-06, + "loss": 0.0, + "step": 34830 + }, + { + 
"epoch": 4.344139650872818, + "grad_norm": 0.00039108918281272054, + "learning_rate": 2.6319201995012473e-06, + "loss": 0.0, + "step": 34840 + }, + { + "epoch": 4.3453865336658355, + "grad_norm": 0.00011719971371348947, + "learning_rate": 2.6269326683291775e-06, + "loss": 0.0, + "step": 34850 + }, + { + "epoch": 4.346633416458853, + "grad_norm": 5.473527198773809e-05, + "learning_rate": 2.6219451371571077e-06, + "loss": 0.0001, + "step": 34860 + }, + { + "epoch": 4.34788029925187, + "grad_norm": 0.00020321720512583852, + "learning_rate": 2.616957605985038e-06, + "loss": 0.0, + "step": 34870 + }, + { + "epoch": 4.349127182044888, + "grad_norm": 0.0006901020533405244, + "learning_rate": 2.6119700748129676e-06, + "loss": 0.0, + "step": 34880 + }, + { + "epoch": 4.350374064837905, + "grad_norm": 0.0014119262341409922, + "learning_rate": 2.606982543640898e-06, + "loss": 0.0, + "step": 34890 + }, + { + "epoch": 4.351620947630923, + "grad_norm": 0.0002134747483069077, + "learning_rate": 2.601995012468828e-06, + "loss": 0.0, + "step": 34900 + }, + { + "epoch": 4.35286783042394, + "grad_norm": 0.00012892240192741156, + "learning_rate": 2.597007481296758e-06, + "loss": 0.0, + "step": 34910 + }, + { + "epoch": 4.3541147132169575, + "grad_norm": 0.00012061430606991053, + "learning_rate": 2.5920199501246884e-06, + "loss": 0.0023, + "step": 34920 + }, + { + "epoch": 4.355361596009975, + "grad_norm": 0.00020210719958413392, + "learning_rate": 2.5870324189526186e-06, + "loss": 0.0, + "step": 34930 + }, + { + "epoch": 4.356608478802992, + "grad_norm": 0.001012521330267191, + "learning_rate": 2.582044887780549e-06, + "loss": 0.0039, + "step": 34940 + }, + { + "epoch": 4.35785536159601, + "grad_norm": 0.00021262998052407056, + "learning_rate": 2.577057356608479e-06, + "loss": 0.023, + "step": 34950 + }, + { + "epoch": 4.359102244389027, + "grad_norm": 0.000502545852214098, + "learning_rate": 2.572069825436409e-06, + "loss": 0.0, + "step": 34960 + }, + { + "epoch": 4.360349127182045, 
+ "grad_norm": 6.385482993209735e-05, + "learning_rate": 2.5670822942643394e-06, + "loss": 0.0, + "step": 34970 + }, + { + "epoch": 4.361596009975062, + "grad_norm": 7.826248474884778e-05, + "learning_rate": 2.562094763092269e-06, + "loss": 0.0, + "step": 34980 + }, + { + "epoch": 4.36284289276808, + "grad_norm": 5.5803859140723944e-05, + "learning_rate": 2.5571072319201994e-06, + "loss": 0.0525, + "step": 34990 + }, + { + "epoch": 4.364089775561097, + "grad_norm": 4.6033244871068746e-05, + "learning_rate": 2.5521197007481296e-06, + "loss": 0.0, + "step": 35000 + }, + { + "epoch": 4.365336658354114, + "grad_norm": 0.00029482910758815706, + "learning_rate": 2.5471321695760597e-06, + "loss": 0.0, + "step": 35010 + }, + { + "epoch": 4.366583541147132, + "grad_norm": 0.00016616334323771298, + "learning_rate": 2.5421446384039904e-06, + "loss": 0.0, + "step": 35020 + }, + { + "epoch": 4.367830423940149, + "grad_norm": 0.00011380357318557799, + "learning_rate": 2.5371571072319206e-06, + "loss": 0.0, + "step": 35030 + }, + { + "epoch": 4.369077306733167, + "grad_norm": 7.7957367466297e-05, + "learning_rate": 2.5321695760598508e-06, + "loss": 0.0, + "step": 35040 + }, + { + "epoch": 4.370324189526184, + "grad_norm": 0.0003632204607129097, + "learning_rate": 2.527182044887781e-06, + "loss": 0.0045, + "step": 35050 + }, + { + "epoch": 4.371571072319202, + "grad_norm": 0.1488853245973587, + "learning_rate": 2.522194513715711e-06, + "loss": 0.0175, + "step": 35060 + }, + { + "epoch": 4.372817955112219, + "grad_norm": 9.009828499983996e-05, + "learning_rate": 2.5172069825436413e-06, + "loss": 0.0, + "step": 35070 + }, + { + "epoch": 4.374064837905237, + "grad_norm": 0.06685556471347809, + "learning_rate": 2.5122194513715715e-06, + "loss": 0.0, + "step": 35080 + }, + { + "epoch": 4.375311720698255, + "grad_norm": 0.0001171770563814789, + "learning_rate": 2.5072319201995017e-06, + "loss": 0.0, + "step": 35090 + }, + { + "epoch": 4.376558603491272, + "grad_norm": 
0.00015780334069859236, + "learning_rate": 2.5022443890274315e-06, + "loss": 0.0, + "step": 35100 + }, + { + "epoch": 4.37780548628429, + "grad_norm": 0.0005297464085742831, + "learning_rate": 2.4972568578553617e-06, + "loss": 0.0001, + "step": 35110 + }, + { + "epoch": 4.379052369077307, + "grad_norm": 0.0001470494898967445, + "learning_rate": 2.492269326683292e-06, + "loss": 0.0, + "step": 35120 + }, + { + "epoch": 4.3802992518703245, + "grad_norm": 0.008314243517816067, + "learning_rate": 2.487281795511222e-06, + "loss": 0.0, + "step": 35130 + }, + { + "epoch": 4.381546134663342, + "grad_norm": 9.446181502426043e-05, + "learning_rate": 2.4822942643391523e-06, + "loss": 0.0, + "step": 35140 + }, + { + "epoch": 4.382793017456359, + "grad_norm": 7.607245788676664e-05, + "learning_rate": 2.4773067331670825e-06, + "loss": 0.0, + "step": 35150 + }, + { + "epoch": 4.384039900249377, + "grad_norm": 0.0001715715043246746, + "learning_rate": 2.4723192019950127e-06, + "loss": 0.0, + "step": 35160 + }, + { + "epoch": 4.385286783042394, + "grad_norm": 0.0006570697296410799, + "learning_rate": 2.467331670822943e-06, + "loss": 0.0, + "step": 35170 + }, + { + "epoch": 4.386533665835412, + "grad_norm": 0.0072289216332137585, + "learning_rate": 2.462344139650873e-06, + "loss": 0.0, + "step": 35180 + }, + { + "epoch": 4.387780548628429, + "grad_norm": 0.00012217392213642597, + "learning_rate": 2.4573566084788033e-06, + "loss": 0.0, + "step": 35190 + }, + { + "epoch": 4.389027431421447, + "grad_norm": 0.00018129154341295362, + "learning_rate": 2.452369077306733e-06, + "loss": 0.0, + "step": 35200 + }, + { + "epoch": 4.390274314214464, + "grad_norm": 0.0001524377439636737, + "learning_rate": 2.4473815461346637e-06, + "loss": 0.0, + "step": 35210 + }, + { + "epoch": 4.3915211970074814, + "grad_norm": 0.00016739932470954955, + "learning_rate": 2.442394014962594e-06, + "loss": 0.0, + "step": 35220 + }, + { + "epoch": 4.392768079800499, + "grad_norm": 9.303903061663732e-05, + 
"learning_rate": 2.437406483790524e-06, + "loss": 0.0001, + "step": 35230 + }, + { + "epoch": 4.394014962593516, + "grad_norm": 0.06675028055906296, + "learning_rate": 2.4324189526184543e-06, + "loss": 0.0, + "step": 35240 + }, + { + "epoch": 4.395261845386534, + "grad_norm": 0.00039317607297562063, + "learning_rate": 2.4274314214463844e-06, + "loss": 0.0, + "step": 35250 + }, + { + "epoch": 4.396508728179551, + "grad_norm": 6.668036803603172e-05, + "learning_rate": 2.4224438902743142e-06, + "loss": 0.0, + "step": 35260 + }, + { + "epoch": 4.397755610972569, + "grad_norm": 0.002230043290182948, + "learning_rate": 2.4174563591022444e-06, + "loss": 0.0005, + "step": 35270 + }, + { + "epoch": 4.399002493765586, + "grad_norm": 0.00021028569608461112, + "learning_rate": 2.4124688279301746e-06, + "loss": 0.0, + "step": 35280 + }, + { + "epoch": 4.4002493765586035, + "grad_norm": 6.0643418692052364e-05, + "learning_rate": 2.407481296758105e-06, + "loss": 0.001, + "step": 35290 + }, + { + "epoch": 4.401496259351621, + "grad_norm": 0.00025467347586527467, + "learning_rate": 2.402493765586035e-06, + "loss": 0.0, + "step": 35300 + }, + { + "epoch": 4.402743142144638, + "grad_norm": 0.0018303323304280639, + "learning_rate": 2.397506234413965e-06, + "loss": 0.0, + "step": 35310 + }, + { + "epoch": 4.403990024937656, + "grad_norm": 0.0001063878953573294, + "learning_rate": 2.3925187032418954e-06, + "loss": 0.0, + "step": 35320 + }, + { + "epoch": 4.405236907730673, + "grad_norm": 6.145804218249395e-05, + "learning_rate": 2.3875311720698256e-06, + "loss": 0.0624, + "step": 35330 + }, + { + "epoch": 4.406483790523691, + "grad_norm": 0.0006972160190343857, + "learning_rate": 2.382543640897756e-06, + "loss": 0.0, + "step": 35340 + }, + { + "epoch": 4.407730673316708, + "grad_norm": 0.00030007187160663307, + "learning_rate": 2.377556109725686e-06, + "loss": 0.012, + "step": 35350 + }, + { + "epoch": 4.4089775561097255, + "grad_norm": 0.0004714836541097611, + "learning_rate": 
2.372568578553616e-06, + "loss": 0.0, + "step": 35360 + }, + { + "epoch": 4.410224438902743, + "grad_norm": 8.451470785075799e-05, + "learning_rate": 2.3675810473815464e-06, + "loss": 0.0, + "step": 35370 + }, + { + "epoch": 4.41147132169576, + "grad_norm": 0.00015619279292877764, + "learning_rate": 2.3625935162094766e-06, + "loss": 0.0, + "step": 35380 + }, + { + "epoch": 4.412718204488778, + "grad_norm": 6.922364264028147e-05, + "learning_rate": 2.3576059850374068e-06, + "loss": 0.0, + "step": 35390 + }, + { + "epoch": 4.413965087281795, + "grad_norm": 0.0002487407182343304, + "learning_rate": 2.352618453865337e-06, + "loss": 0.0, + "step": 35400 + }, + { + "epoch": 4.415211970074813, + "grad_norm": 0.0003662613744381815, + "learning_rate": 2.347630922693267e-06, + "loss": 0.0, + "step": 35410 + }, + { + "epoch": 4.41645885286783, + "grad_norm": 5.250598042039201e-05, + "learning_rate": 2.342643391521197e-06, + "loss": 0.0, + "step": 35420 + }, + { + "epoch": 4.417705735660848, + "grad_norm": 0.00037984587834216654, + "learning_rate": 2.337655860349127e-06, + "loss": 0.0, + "step": 35430 + }, + { + "epoch": 4.418952618453865, + "grad_norm": 0.0001668424520175904, + "learning_rate": 2.3326683291770573e-06, + "loss": 0.0, + "step": 35440 + }, + { + "epoch": 4.420199501246882, + "grad_norm": 0.00022639970120508224, + "learning_rate": 2.3276807980049875e-06, + "loss": 0.0006, + "step": 35450 + }, + { + "epoch": 4.4214463840399, + "grad_norm": 0.00010051777644548565, + "learning_rate": 2.322693266832918e-06, + "loss": 0.0, + "step": 35460 + }, + { + "epoch": 4.422693266832917, + "grad_norm": 7.006821397226304e-05, + "learning_rate": 2.3177057356608483e-06, + "loss": 0.0, + "step": 35470 + }, + { + "epoch": 4.423940149625935, + "grad_norm": 0.0008471541223116219, + "learning_rate": 2.312718204488778e-06, + "loss": 0.0, + "step": 35480 + }, + { + "epoch": 4.425187032418952, + "grad_norm": 5.219184822635725e-05, + "learning_rate": 2.3077306733167083e-06, + "loss": 0.0, + 
"step": 35490 + }, + { + "epoch": 4.42643391521197, + "grad_norm": 0.00016879831673577428, + "learning_rate": 2.3027431421446385e-06, + "loss": 0.0444, + "step": 35500 + }, + { + "epoch": 4.427680798004988, + "grad_norm": 0.011657807044684887, + "learning_rate": 2.2977556109725687e-06, + "loss": 0.0, + "step": 35510 + }, + { + "epoch": 4.428927680798005, + "grad_norm": 0.002720105927437544, + "learning_rate": 2.292768079800499e-06, + "loss": 0.0, + "step": 35520 + }, + { + "epoch": 4.430174563591023, + "grad_norm": 3.922603355022147e-05, + "learning_rate": 2.287780548628429e-06, + "loss": 0.0, + "step": 35530 + }, + { + "epoch": 4.43142144638404, + "grad_norm": 0.0001942600356414914, + "learning_rate": 2.2827930174563593e-06, + "loss": 0.0, + "step": 35540 + }, + { + "epoch": 4.432668329177058, + "grad_norm": 0.0001732075761537999, + "learning_rate": 2.2778054862842895e-06, + "loss": 0.0083, + "step": 35550 + }, + { + "epoch": 4.433915211970075, + "grad_norm": 4.679305857280269e-05, + "learning_rate": 2.2728179551122197e-06, + "loss": 0.0, + "step": 35560 + }, + { + "epoch": 4.4351620947630925, + "grad_norm": 0.00015754872583784163, + "learning_rate": 2.26783042394015e-06, + "loss": 0.0, + "step": 35570 + }, + { + "epoch": 4.43640897755611, + "grad_norm": 5.866608626092784e-05, + "learning_rate": 2.26284289276808e-06, + "loss": 0.0008, + "step": 35580 + }, + { + "epoch": 4.437655860349127, + "grad_norm": 6.611185381188989e-05, + "learning_rate": 2.2578553615960103e-06, + "loss": 0.0, + "step": 35590 + }, + { + "epoch": 4.438902743142145, + "grad_norm": 5.2472081733867526e-05, + "learning_rate": 2.2528678304239405e-06, + "loss": 0.0144, + "step": 35600 + }, + { + "epoch": 4.440149625935162, + "grad_norm": 0.00015604296640958637, + "learning_rate": 2.2478802992518707e-06, + "loss": 0.0, + "step": 35610 + }, + { + "epoch": 4.44139650872818, + "grad_norm": 0.0003281154204159975, + "learning_rate": 2.242892768079801e-06, + "loss": 0.0, + "step": 35620 + }, + { + 
"epoch": 4.442643391521197, + "grad_norm": 0.00010192135232500732, + "learning_rate": 2.237905236907731e-06, + "loss": 0.0269, + "step": 35630 + }, + { + "epoch": 4.443890274314215, + "grad_norm": 9.66684747254476e-05, + "learning_rate": 2.232917705735661e-06, + "loss": 0.0, + "step": 35640 + }, + { + "epoch": 4.445137157107232, + "grad_norm": 0.0003734457422979176, + "learning_rate": 2.227930174563591e-06, + "loss": 0.0, + "step": 35650 + }, + { + "epoch": 4.446384039900249, + "grad_norm": 7.611950422869995e-05, + "learning_rate": 2.222942643391521e-06, + "loss": 0.0, + "step": 35660 + }, + { + "epoch": 4.447630922693267, + "grad_norm": 0.0004907181719318032, + "learning_rate": 2.2179551122194514e-06, + "loss": 0.0, + "step": 35670 + }, + { + "epoch": 4.448877805486284, + "grad_norm": 0.0001631807826925069, + "learning_rate": 2.2129675810473816e-06, + "loss": 0.0, + "step": 35680 + }, + { + "epoch": 4.450124688279302, + "grad_norm": 9.863793820841238e-05, + "learning_rate": 2.207980049875312e-06, + "loss": 0.0, + "step": 35690 + }, + { + "epoch": 4.451371571072319, + "grad_norm": 0.0002982678124681115, + "learning_rate": 2.2029925187032424e-06, + "loss": 0.0008, + "step": 35700 + }, + { + "epoch": 4.452618453865337, + "grad_norm": 0.002366115804761648, + "learning_rate": 2.198004987531172e-06, + "loss": 0.0, + "step": 35710 + }, + { + "epoch": 4.453865336658354, + "grad_norm": 0.09239999949932098, + "learning_rate": 2.1930174563591024e-06, + "loss": 0.0487, + "step": 35720 + }, + { + "epoch": 4.4551122194513715, + "grad_norm": 9.49343666434288e-05, + "learning_rate": 2.1880299251870326e-06, + "loss": 0.0, + "step": 35730 + }, + { + "epoch": 4.456359102244389, + "grad_norm": 0.0003900736046489328, + "learning_rate": 2.1830423940149628e-06, + "loss": 0.0, + "step": 35740 + }, + { + "epoch": 4.457605985037406, + "grad_norm": 9.119904279941693e-05, + "learning_rate": 2.178054862842893e-06, + "loss": 0.0, + "step": 35750 + }, + { + "epoch": 4.458852867830424, + 
"grad_norm": 4.107060158275999e-05, + "learning_rate": 2.173067331670823e-06, + "loss": 0.0015, + "step": 35760 + }, + { + "epoch": 4.460099750623441, + "grad_norm": 8.932932541938499e-05, + "learning_rate": 2.1680798004987534e-06, + "loss": 0.0, + "step": 35770 + }, + { + "epoch": 4.461346633416459, + "grad_norm": 9.386024612467736e-05, + "learning_rate": 2.1630922693266836e-06, + "loss": 0.0, + "step": 35780 + }, + { + "epoch": 4.462593516209476, + "grad_norm": 0.0001702239242149517, + "learning_rate": 2.1581047381546138e-06, + "loss": 0.0326, + "step": 35790 + }, + { + "epoch": 4.4638403990024935, + "grad_norm": 0.00030721109942533076, + "learning_rate": 2.1531172069825435e-06, + "loss": 0.0, + "step": 35800 + }, + { + "epoch": 4.465087281795511, + "grad_norm": 0.00013089546700939536, + "learning_rate": 2.1481296758104737e-06, + "loss": 0.0, + "step": 35810 + }, + { + "epoch": 4.466334164588528, + "grad_norm": 4.946505214320496e-05, + "learning_rate": 2.143142144638404e-06, + "loss": 0.004, + "step": 35820 + }, + { + "epoch": 4.467581047381546, + "grad_norm": 0.0005132012884132564, + "learning_rate": 2.1381546134663345e-06, + "loss": 0.0003, + "step": 35830 + }, + { + "epoch": 4.468827930174563, + "grad_norm": 0.00019803202303592116, + "learning_rate": 2.1331670822942647e-06, + "loss": 0.0, + "step": 35840 + }, + { + "epoch": 4.470074812967581, + "grad_norm": 0.0005909769097343087, + "learning_rate": 2.128179551122195e-06, + "loss": 0.0, + "step": 35850 + }, + { + "epoch": 4.471321695760598, + "grad_norm": 0.0003146138333249837, + "learning_rate": 2.1231920199501247e-06, + "loss": 0.0, + "step": 35860 + }, + { + "epoch": 4.472568578553616, + "grad_norm": 0.00033271979191340506, + "learning_rate": 2.118204488778055e-06, + "loss": 0.0, + "step": 35870 + }, + { + "epoch": 4.473815461346634, + "grad_norm": 23.879375457763672, + "learning_rate": 2.113216957605985e-06, + "loss": 0.0013, + "step": 35880 + }, + { + "epoch": 4.475062344139651, + "grad_norm": 
0.0009702108800411224, + "learning_rate": 2.1082294264339153e-06, + "loss": 0.0, + "step": 35890 + }, + { + "epoch": 4.476309226932669, + "grad_norm": 0.00046513532288372517, + "learning_rate": 2.1032418952618455e-06, + "loss": 0.0, + "step": 35900 + }, + { + "epoch": 4.477556109725686, + "grad_norm": 0.00018407838069833815, + "learning_rate": 2.0982543640897757e-06, + "loss": 0.0, + "step": 35910 + }, + { + "epoch": 4.478802992518704, + "grad_norm": 0.00024080314324237406, + "learning_rate": 2.093266832917706e-06, + "loss": 0.0051, + "step": 35920 + }, + { + "epoch": 4.480049875311721, + "grad_norm": 0.00024162052432075143, + "learning_rate": 2.088279301745636e-06, + "loss": 0.0, + "step": 35930 + }, + { + "epoch": 4.4812967581047385, + "grad_norm": 5.4415671911556274e-05, + "learning_rate": 2.0832917705735663e-06, + "loss": 0.0, + "step": 35940 + }, + { + "epoch": 4.482543640897756, + "grad_norm": 9.320250683231279e-05, + "learning_rate": 2.0783042394014965e-06, + "loss": 0.0, + "step": 35950 + }, + { + "epoch": 4.483790523690773, + "grad_norm": 7.990971789695323e-05, + "learning_rate": 2.0733167082294267e-06, + "loss": 0.0, + "step": 35960 + }, + { + "epoch": 4.485037406483791, + "grad_norm": 9.405690798303112e-05, + "learning_rate": 2.068329177057357e-06, + "loss": 0.0, + "step": 35970 + }, + { + "epoch": 4.486284289276808, + "grad_norm": 0.03336402401328087, + "learning_rate": 2.063341645885287e-06, + "loss": 0.0, + "step": 35980 + }, + { + "epoch": 4.487531172069826, + "grad_norm": 0.0044538709335029125, + "learning_rate": 2.0583541147132173e-06, + "loss": 0.0045, + "step": 35990 + }, + { + "epoch": 4.488778054862843, + "grad_norm": 5.0544094847282395e-05, + "learning_rate": 2.0533665835411474e-06, + "loss": 0.0, + "step": 36000 + }, + { + "epoch": 4.4900249376558605, + "grad_norm": 9.530440001981333e-05, + "learning_rate": 2.0483790523690776e-06, + "loss": 0.025, + "step": 36010 + }, + { + "epoch": 4.491271820448878, + "grad_norm": 0.0016637337394058704, + 
"learning_rate": 2.0433915211970074e-06, + "loss": 0.0, + "step": 36020 + }, + { + "epoch": 4.492518703241895, + "grad_norm": 0.0019435989670455456, + "learning_rate": 2.0384039900249376e-06, + "loss": 0.0001, + "step": 36030 + }, + { + "epoch": 4.493765586034913, + "grad_norm": 0.00041165429865941405, + "learning_rate": 2.033416458852868e-06, + "loss": 0.0409, + "step": 36040 + }, + { + "epoch": 4.49501246882793, + "grad_norm": 0.00011268148227827623, + "learning_rate": 2.028428927680798e-06, + "loss": 0.0106, + "step": 36050 + }, + { + "epoch": 4.496259351620948, + "grad_norm": 0.0026401542127132416, + "learning_rate": 2.023441396508728e-06, + "loss": 0.0004, + "step": 36060 + }, + { + "epoch": 4.497506234413965, + "grad_norm": 0.00018546386854723096, + "learning_rate": 2.018453865336659e-06, + "loss": 0.0, + "step": 36070 + }, + { + "epoch": 4.498753117206983, + "grad_norm": 0.0018113835249096155, + "learning_rate": 2.013466334164589e-06, + "loss": 0.0, + "step": 36080 + }, + { + "epoch": 4.5, + "grad_norm": 0.0028145157266408205, + "learning_rate": 2.0084788029925188e-06, + "loss": 0.0, + "step": 36090 + }, + { + "epoch": 4.501246882793017, + "grad_norm": 0.0008215096895582974, + "learning_rate": 2.003491271820449e-06, + "loss": 0.0, + "step": 36100 + }, + { + "epoch": 4.502493765586035, + "grad_norm": 9.587730892235413e-05, + "learning_rate": 1.998503740648379e-06, + "loss": 0.0, + "step": 36110 + }, + { + "epoch": 4.503740648379052, + "grad_norm": 7.04293925082311e-05, + "learning_rate": 1.9935162094763094e-06, + "loss": 0.0, + "step": 36120 + }, + { + "epoch": 4.50498753117207, + "grad_norm": 5.8869190979748964e-05, + "learning_rate": 1.9885286783042396e-06, + "loss": 0.0, + "step": 36130 + }, + { + "epoch": 4.506234413965087, + "grad_norm": 0.0003809872141573578, + "learning_rate": 1.9835411471321698e-06, + "loss": 0.0, + "step": 36140 + }, + { + "epoch": 4.507481296758105, + "grad_norm": 0.016797462478280067, + "learning_rate": 1.9785536159601e-06, + 
"loss": 0.0016, + "step": 36150 + }, + { + "epoch": 4.508728179551122, + "grad_norm": 0.0005834007752127945, + "learning_rate": 1.97356608478803e-06, + "loss": 0.0, + "step": 36160 + }, + { + "epoch": 4.5099750623441395, + "grad_norm": 0.000800897425506264, + "learning_rate": 1.9685785536159604e-06, + "loss": 0.0, + "step": 36170 + }, + { + "epoch": 4.511221945137157, + "grad_norm": 0.00021014439698774368, + "learning_rate": 1.96359102244389e-06, + "loss": 0.0, + "step": 36180 + }, + { + "epoch": 4.512468827930174, + "grad_norm": 3.088951052632183e-05, + "learning_rate": 1.9586034912718203e-06, + "loss": 0.0, + "step": 36190 + }, + { + "epoch": 4.513715710723192, + "grad_norm": 0.00011189231736352667, + "learning_rate": 1.953615960099751e-06, + "loss": 0.0, + "step": 36200 + }, + { + "epoch": 4.514962593516209, + "grad_norm": 0.00038580026011914015, + "learning_rate": 1.948628428927681e-06, + "loss": 0.0, + "step": 36210 + }, + { + "epoch": 4.516209476309227, + "grad_norm": 0.0007074935710988939, + "learning_rate": 1.9436408977556113e-06, + "loss": 0.0, + "step": 36220 + }, + { + "epoch": 4.517456359102244, + "grad_norm": 0.00019117812917102128, + "learning_rate": 1.9386533665835415e-06, + "loss": 0.0007, + "step": 36230 + }, + { + "epoch": 4.5187032418952615, + "grad_norm": 5.8276415074942634e-05, + "learning_rate": 1.9336658354114713e-06, + "loss": 0.0, + "step": 36240 + }, + { + "epoch": 4.519950124688279, + "grad_norm": 5.8334331697551534e-05, + "learning_rate": 1.9286783042394015e-06, + "loss": 0.0, + "step": 36250 + }, + { + "epoch": 4.521197007481296, + "grad_norm": 7.67667661421001e-05, + "learning_rate": 1.9236907730673317e-06, + "loss": 0.0001, + "step": 36260 + }, + { + "epoch": 4.522443890274314, + "grad_norm": 0.0006992538692429662, + "learning_rate": 1.918703241895262e-06, + "loss": 0.0, + "step": 36270 + }, + { + "epoch": 4.523690773067331, + "grad_norm": 6.20846840320155e-05, + "learning_rate": 1.913715710723192e-06, + "loss": 0.0, + "step": 36280 + 
}, + { + "epoch": 4.524937655860349, + "grad_norm": 0.00012284106924198568, + "learning_rate": 1.9092269326683295e-06, + "loss": 0.0059, + "step": 36290 + }, + { + "epoch": 4.526184538653366, + "grad_norm": 4.695883035310544e-05, + "learning_rate": 1.9042394014962595e-06, + "loss": 0.0, + "step": 36300 + }, + { + "epoch": 4.5274314214463836, + "grad_norm": 0.012509011663496494, + "learning_rate": 1.8992518703241897e-06, + "loss": 0.0, + "step": 36310 + }, + { + "epoch": 4.528678304239402, + "grad_norm": 0.000993928057141602, + "learning_rate": 1.8942643391521199e-06, + "loss": 0.0, + "step": 36320 + }, + { + "epoch": 4.529925187032419, + "grad_norm": 0.00010343554458813742, + "learning_rate": 1.88927680798005e-06, + "loss": 0.0, + "step": 36330 + }, + { + "epoch": 4.531172069825437, + "grad_norm": 6.238223431864753e-05, + "learning_rate": 1.88428927680798e-06, + "loss": 0.0, + "step": 36340 + }, + { + "epoch": 4.532418952618454, + "grad_norm": 7.380228635156527e-05, + "learning_rate": 1.8793017456359102e-06, + "loss": 0.0, + "step": 36350 + }, + { + "epoch": 4.533665835411472, + "grad_norm": 0.000599766499362886, + "learning_rate": 1.8743142144638404e-06, + "loss": 0.0264, + "step": 36360 + }, + { + "epoch": 4.534912718204489, + "grad_norm": 0.00018769617599900812, + "learning_rate": 1.8693266832917708e-06, + "loss": 0.0, + "step": 36370 + }, + { + "epoch": 4.5361596009975065, + "grad_norm": 0.0028365144971758127, + "learning_rate": 1.864339152119701e-06, + "loss": 0.0, + "step": 36380 + }, + { + "epoch": 4.537406483790524, + "grad_norm": 5.832207898492925e-05, + "learning_rate": 1.8593516209476312e-06, + "loss": 0.0, + "step": 36390 + }, + { + "epoch": 4.538653366583541, + "grad_norm": 0.0001258625416085124, + "learning_rate": 1.8543640897755614e-06, + "loss": 0.0, + "step": 36400 + }, + { + "epoch": 4.539900249376559, + "grad_norm": 0.0002180346637032926, + "learning_rate": 1.8493765586034914e-06, + "loss": 0.0, + "step": 36410 + }, + { + "epoch": 
4.541147132169576, + "grad_norm": 0.00015187938697636127, + "learning_rate": 1.8443890274314216e-06, + "loss": 0.0, + "step": 36420 + }, + { + "epoch": 4.542394014962594, + "grad_norm": 0.0003344623255543411, + "learning_rate": 1.8394014962593518e-06, + "loss": 0.0007, + "step": 36430 + }, + { + "epoch": 4.543640897755611, + "grad_norm": 0.0008474570931866765, + "learning_rate": 1.834413965087282e-06, + "loss": 0.0517, + "step": 36440 + }, + { + "epoch": 4.5448877805486285, + "grad_norm": 7.383363117696717e-05, + "learning_rate": 1.829426433915212e-06, + "loss": 0.0, + "step": 36450 + }, + { + "epoch": 4.546134663341646, + "grad_norm": 0.00031828609644435346, + "learning_rate": 1.8244389027431422e-06, + "loss": 0.0, + "step": 36460 + }, + { + "epoch": 4.547381546134663, + "grad_norm": 0.0006407342152670026, + "learning_rate": 1.8194513715710724e-06, + "loss": 0.0, + "step": 36470 + }, + { + "epoch": 4.548628428927681, + "grad_norm": 0.0001172191696241498, + "learning_rate": 1.8144638403990026e-06, + "loss": 0.0001, + "step": 36480 + }, + { + "epoch": 4.549875311720698, + "grad_norm": 0.0008851775201037526, + "learning_rate": 1.8094763092269328e-06, + "loss": 0.0, + "step": 36490 + }, + { + "epoch": 4.551122194513716, + "grad_norm": 0.00010299222049070522, + "learning_rate": 1.8044887780548632e-06, + "loss": 0.0, + "step": 36500 + }, + { + "epoch": 4.552369077306733, + "grad_norm": 0.00030066914041526616, + "learning_rate": 1.7995012468827934e-06, + "loss": 0.0, + "step": 36510 + }, + { + "epoch": 4.553615960099751, + "grad_norm": 0.0028322464786469936, + "learning_rate": 1.7945137157107233e-06, + "loss": 0.0, + "step": 36520 + }, + { + "epoch": 4.554862842892768, + "grad_norm": 0.00024018692784011364, + "learning_rate": 1.7895261845386535e-06, + "loss": 0.006, + "step": 36530 + }, + { + "epoch": 4.556109725685785, + "grad_norm": 7.3418123065494e-05, + "learning_rate": 1.7845386533665837e-06, + "loss": 0.0, + "step": 36540 + }, + { + "epoch": 4.557356608478803, + 
"grad_norm": 63.39867401123047, + "learning_rate": 1.779551122194514e-06, + "loss": 0.0361, + "step": 36550 + }, + { + "epoch": 4.55860349127182, + "grad_norm": 0.00010892859427258372, + "learning_rate": 1.774563591022444e-06, + "loss": 0.0, + "step": 36560 + }, + { + "epoch": 4.559850374064838, + "grad_norm": 0.00015360671386588365, + "learning_rate": 1.7695760598503741e-06, + "loss": 0.0057, + "step": 36570 + }, + { + "epoch": 4.561097256857855, + "grad_norm": 5.321098797139712e-05, + "learning_rate": 1.7645885286783043e-06, + "loss": 0.0157, + "step": 36580 + }, + { + "epoch": 4.562344139650873, + "grad_norm": 8.898463420337066e-05, + "learning_rate": 1.7596009975062345e-06, + "loss": 0.0, + "step": 36590 + }, + { + "epoch": 4.56359102244389, + "grad_norm": 0.0003251841408200562, + "learning_rate": 1.7546134663341647e-06, + "loss": 0.0329, + "step": 36600 + }, + { + "epoch": 4.5648379052369075, + "grad_norm": 0.00042449383181519806, + "learning_rate": 1.7496259351620947e-06, + "loss": 0.0, + "step": 36610 + }, + { + "epoch": 4.566084788029925, + "grad_norm": 0.005706509575247765, + "learning_rate": 1.7446384039900253e-06, + "loss": 0.0, + "step": 36620 + }, + { + "epoch": 4.567331670822942, + "grad_norm": 7.963182724779472e-05, + "learning_rate": 1.7396508728179553e-06, + "loss": 0.0001, + "step": 36630 + }, + { + "epoch": 4.56857855361596, + "grad_norm": 0.0028878054581582546, + "learning_rate": 1.7346633416458855e-06, + "loss": 0.0, + "step": 36640 + }, + { + "epoch": 4.569825436408977, + "grad_norm": 0.0009514418779872358, + "learning_rate": 1.7296758104738157e-06, + "loss": 0.0, + "step": 36650 + }, + { + "epoch": 4.571072319201995, + "grad_norm": 0.0022119847126305103, + "learning_rate": 1.7246882793017459e-06, + "loss": 0.0, + "step": 36660 + }, + { + "epoch": 4.572319201995013, + "grad_norm": 0.0009768361924216151, + "learning_rate": 1.719700748129676e-06, + "loss": 0.0, + "step": 36670 + }, + { + "epoch": 4.57356608478803, + "grad_norm": 
6.927720096427947e-05, + "learning_rate": 1.714713216957606e-06, + "loss": 0.0, + "step": 36680 + }, + { + "epoch": 4.574812967581048, + "grad_norm": 0.001001956406980753, + "learning_rate": 1.7097256857855363e-06, + "loss": 0.0, + "step": 36690 + }, + { + "epoch": 4.576059850374065, + "grad_norm": 0.00015158270252868533, + "learning_rate": 1.7047381546134664e-06, + "loss": 0.0, + "step": 36700 + }, + { + "epoch": 4.577306733167083, + "grad_norm": 0.0009215485770255327, + "learning_rate": 1.6997506234413966e-06, + "loss": 0.0, + "step": 36710 + }, + { + "epoch": 4.5785536159601, + "grad_norm": 0.0002993463131133467, + "learning_rate": 1.6947630922693266e-06, + "loss": 0.0, + "step": 36720 + }, + { + "epoch": 4.579800498753118, + "grad_norm": 0.000504608207847923, + "learning_rate": 1.6897755610972568e-06, + "loss": 0.0001, + "step": 36730 + }, + { + "epoch": 4.581047381546135, + "grad_norm": 6.463566387537867e-05, + "learning_rate": 1.6847880299251872e-06, + "loss": 0.0001, + "step": 36740 + }, + { + "epoch": 4.582294264339152, + "grad_norm": 5.9693622461054474e-05, + "learning_rate": 1.6798004987531174e-06, + "loss": 0.0, + "step": 36750 + }, + { + "epoch": 4.58354114713217, + "grad_norm": 0.00012298337242100388, + "learning_rate": 1.6748129675810476e-06, + "loss": 0.0, + "step": 36760 + }, + { + "epoch": 4.584788029925187, + "grad_norm": 0.0008290084660984576, + "learning_rate": 1.6698254364089778e-06, + "loss": 0.0741, + "step": 36770 + }, + { + "epoch": 4.586034912718205, + "grad_norm": 0.0001284991012653336, + "learning_rate": 1.664837905236908e-06, + "loss": 0.0, + "step": 36780 + }, + { + "epoch": 4.587281795511222, + "grad_norm": 0.00019413598056416959, + "learning_rate": 1.659850374064838e-06, + "loss": 0.0, + "step": 36790 + }, + { + "epoch": 4.58852867830424, + "grad_norm": 0.00030527933267876506, + "learning_rate": 1.6548628428927682e-06, + "loss": 0.0, + "step": 36800 + }, + { + "epoch": 4.589775561097257, + "grad_norm": 0.0001714515092317015, + 
"learning_rate": 1.6498753117206984e-06, + "loss": 0.0, + "step": 36810 + }, + { + "epoch": 4.5910224438902745, + "grad_norm": 6.510254024760798e-05, + "learning_rate": 1.6448877805486286e-06, + "loss": 0.0, + "step": 36820 + }, + { + "epoch": 4.592269326683292, + "grad_norm": 8.50441720103845e-05, + "learning_rate": 1.6399002493765586e-06, + "loss": 0.0, + "step": 36830 + }, + { + "epoch": 4.593516209476309, + "grad_norm": 0.011633110232651234, + "learning_rate": 1.6349127182044888e-06, + "loss": 0.0, + "step": 36840 + }, + { + "epoch": 4.594763092269327, + "grad_norm": 0.008615621365606785, + "learning_rate": 1.629925187032419e-06, + "loss": 0.0, + "step": 36850 + }, + { + "epoch": 4.596009975062344, + "grad_norm": 0.00014177417324390262, + "learning_rate": 1.6249376558603492e-06, + "loss": 0.0286, + "step": 36860 + }, + { + "epoch": 4.597256857855362, + "grad_norm": 8.73705284902826e-05, + "learning_rate": 1.6199501246882796e-06, + "loss": 0.0, + "step": 36870 + }, + { + "epoch": 4.598503740648379, + "grad_norm": 0.000408838881412521, + "learning_rate": 1.6149625935162098e-06, + "loss": 0.0001, + "step": 36880 + }, + { + "epoch": 4.5997506234413965, + "grad_norm": 5.491747651831247e-05, + "learning_rate": 1.60997506234414e-06, + "loss": 0.0397, + "step": 36890 + }, + { + "epoch": 4.600997506234414, + "grad_norm": 5.760222120443359e-05, + "learning_rate": 1.60498753117207e-06, + "loss": 0.0, + "step": 36900 + }, + { + "epoch": 4.602244389027431, + "grad_norm": 0.00023852268350310624, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0, + "step": 36910 + }, + { + "epoch": 4.603491271820449, + "grad_norm": 0.00017992674838751554, + "learning_rate": 1.5950124688279303e-06, + "loss": 0.0, + "step": 36920 + }, + { + "epoch": 4.604738154613466, + "grad_norm": 0.00035725522320717573, + "learning_rate": 1.5900249376558605e-06, + "loss": 0.0, + "step": 36930 + }, + { + "epoch": 4.605985037406484, + "grad_norm": 0.0001332864776486531, + "learning_rate": 
1.5850374064837905e-06, + "loss": 0.0253, + "step": 36940 + }, + { + "epoch": 4.607231920199501, + "grad_norm": 0.00014862054376862943, + "learning_rate": 1.5800498753117207e-06, + "loss": 0.0, + "step": 36950 + }, + { + "epoch": 4.6084788029925186, + "grad_norm": 8.171637455234304e-05, + "learning_rate": 1.575062344139651e-06, + "loss": 0.0, + "step": 36960 + }, + { + "epoch": 4.609725685785536, + "grad_norm": 7.45083307265304e-05, + "learning_rate": 1.570074812967581e-06, + "loss": 0.0, + "step": 36970 + }, + { + "epoch": 4.610972568578553, + "grad_norm": 7.226823072414845e-05, + "learning_rate": 1.5650872817955113e-06, + "loss": 0.027, + "step": 36980 + }, + { + "epoch": 4.612219451371571, + "grad_norm": 8.009558951016515e-05, + "learning_rate": 1.5600997506234417e-06, + "loss": 0.0, + "step": 36990 + }, + { + "epoch": 4.613466334164588, + "grad_norm": 0.00047921130317263305, + "learning_rate": 1.555112219451372e-06, + "loss": 0.0, + "step": 37000 + }, + { + "epoch": 4.614713216957606, + "grad_norm": 0.00016305805183947086, + "learning_rate": 1.5501246882793019e-06, + "loss": 0.0004, + "step": 37010 + }, + { + "epoch": 4.615960099750623, + "grad_norm": 0.0001415027945768088, + "learning_rate": 1.545137157107232e-06, + "loss": 0.0, + "step": 37020 + }, + { + "epoch": 4.617206982543641, + "grad_norm": 0.00013072905130684376, + "learning_rate": 1.5401496259351623e-06, + "loss": 0.0, + "step": 37030 + }, + { + "epoch": 4.618453865336658, + "grad_norm": 0.00042544634197838604, + "learning_rate": 1.5351620947630925e-06, + "loss": 0.0078, + "step": 37040 + }, + { + "epoch": 4.6197007481296755, + "grad_norm": 8.365141547983512e-05, + "learning_rate": 1.5301745635910227e-06, + "loss": 0.0, + "step": 37050 + }, + { + "epoch": 4.620947630922693, + "grad_norm": 0.0003506205976009369, + "learning_rate": 1.5251870324189527e-06, + "loss": 0.0, + "step": 37060 + }, + { + "epoch": 4.62219451371571, + "grad_norm": 9.824489097809419e-05, + "learning_rate": 1.5201995012468829e-06, 
+ "loss": 0.0, + "step": 37070 + }, + { + "epoch": 4.623441396508728, + "grad_norm": 0.0017115233931690454, + "learning_rate": 1.515211970074813e-06, + "loss": 0.0, + "step": 37080 + }, + { + "epoch": 4.624688279301745, + "grad_norm": 0.0024657100439071655, + "learning_rate": 1.5102244389027432e-06, + "loss": 0.0, + "step": 37090 + }, + { + "epoch": 4.625935162094763, + "grad_norm": 0.0031729680486023426, + "learning_rate": 1.5052369077306732e-06, + "loss": 0.0, + "step": 37100 + }, + { + "epoch": 4.62718204488778, + "grad_norm": 0.0001139767118729651, + "learning_rate": 1.5002493765586034e-06, + "loss": 0.0183, + "step": 37110 + }, + { + "epoch": 4.628428927680798, + "grad_norm": 0.00011955766240134835, + "learning_rate": 1.4952618453865338e-06, + "loss": 0.0, + "step": 37120 + }, + { + "epoch": 4.629675810473816, + "grad_norm": 9.51491019804962e-05, + "learning_rate": 1.490274314214464e-06, + "loss": 0.0025, + "step": 37130 + }, + { + "epoch": 4.630922693266833, + "grad_norm": 6.892336386954412e-05, + "learning_rate": 1.4852867830423942e-06, + "loss": 0.0, + "step": 37140 + }, + { + "epoch": 4.632169576059851, + "grad_norm": 0.000913655967451632, + "learning_rate": 1.4802992518703244e-06, + "loss": 0.0, + "step": 37150 + }, + { + "epoch": 4.633416458852868, + "grad_norm": 0.00010036984895123169, + "learning_rate": 1.4753117206982546e-06, + "loss": 0.0, + "step": 37160 + }, + { + "epoch": 4.634663341645886, + "grad_norm": 0.0002912537893280387, + "learning_rate": 1.4703241895261846e-06, + "loss": 0.0, + "step": 37170 + }, + { + "epoch": 4.635910224438903, + "grad_norm": 0.00021481663861777633, + "learning_rate": 1.4653366583541148e-06, + "loss": 0.0288, + "step": 37180 + }, + { + "epoch": 4.63715710723192, + "grad_norm": 0.00040280946996062994, + "learning_rate": 1.460349127182045e-06, + "loss": 0.0, + "step": 37190 + }, + { + "epoch": 4.638403990024938, + "grad_norm": 9.865140600595623e-05, + "learning_rate": 1.4553615960099752e-06, + "loss": 0.0, + "step": 37200 
+ }, + { + "epoch": 4.639650872817955, + "grad_norm": 0.00029368087416514754, + "learning_rate": 1.4503740648379052e-06, + "loss": 0.0, + "step": 37210 + }, + { + "epoch": 4.640897755610973, + "grad_norm": 0.009541047737002373, + "learning_rate": 1.4453865336658354e-06, + "loss": 0.0, + "step": 37220 + }, + { + "epoch": 4.64214463840399, + "grad_norm": 0.00014524144353345037, + "learning_rate": 1.4403990024937656e-06, + "loss": 0.0, + "step": 37230 + }, + { + "epoch": 4.643391521197008, + "grad_norm": 8.261830225819722e-05, + "learning_rate": 1.435411471321696e-06, + "loss": 0.0, + "step": 37240 + }, + { + "epoch": 4.644638403990025, + "grad_norm": 0.010629824362695217, + "learning_rate": 1.4304239401496262e-06, + "loss": 0.0, + "step": 37250 + }, + { + "epoch": 4.6458852867830425, + "grad_norm": 0.0003399306442588568, + "learning_rate": 1.4254364089775564e-06, + "loss": 0.001, + "step": 37260 + }, + { + "epoch": 4.64713216957606, + "grad_norm": 0.0002243259659735486, + "learning_rate": 1.4204488778054866e-06, + "loss": 0.0, + "step": 37270 + }, + { + "epoch": 4.648379052369077, + "grad_norm": 8.796357724349946e-05, + "learning_rate": 1.4154613466334165e-06, + "loss": 0.0102, + "step": 37280 + }, + { + "epoch": 4.649625935162095, + "grad_norm": 0.006956281140446663, + "learning_rate": 1.4104738154613467e-06, + "loss": 0.0, + "step": 37290 + }, + { + "epoch": 4.650872817955112, + "grad_norm": 0.0003823429869953543, + "learning_rate": 1.405486284289277e-06, + "loss": 0.0, + "step": 37300 + }, + { + "epoch": 4.65211970074813, + "grad_norm": 4.093679672223516e-05, + "learning_rate": 1.4004987531172071e-06, + "loss": 0.0, + "step": 37310 + }, + { + "epoch": 4.653366583541147, + "grad_norm": 0.028871528804302216, + "learning_rate": 1.3955112219451371e-06, + "loss": 0.0, + "step": 37320 + }, + { + "epoch": 4.6546134663341645, + "grad_norm": 0.0019481899216771126, + "learning_rate": 1.3905236907730673e-06, + "loss": 0.0, + "step": 37330 + }, + { + "epoch": 
4.655860349127182, + "grad_norm": 0.00046627031406387687, + "learning_rate": 1.3855361596009975e-06, + "loss": 0.0, + "step": 37340 + }, + { + "epoch": 4.657107231920199, + "grad_norm": 0.00011581995931919664, + "learning_rate": 1.3805486284289277e-06, + "loss": 0.0001, + "step": 37350 + }, + { + "epoch": 4.658354114713217, + "grad_norm": 0.00014464250125456601, + "learning_rate": 1.3755610972568581e-06, + "loss": 0.0, + "step": 37360 + }, + { + "epoch": 4.659600997506234, + "grad_norm": 0.0001020960189634934, + "learning_rate": 1.3705735660847883e-06, + "loss": 0.0001, + "step": 37370 + }, + { + "epoch": 4.660847880299252, + "grad_norm": 0.0036403797566890717, + "learning_rate": 1.3655860349127185e-06, + "loss": 0.0, + "step": 37380 + }, + { + "epoch": 4.662094763092269, + "grad_norm": 7.654747605556622e-05, + "learning_rate": 1.3605985037406485e-06, + "loss": 0.0, + "step": 37390 + }, + { + "epoch": 4.6633416458852865, + "grad_norm": 0.000111608904262539, + "learning_rate": 1.3556109725685787e-06, + "loss": 0.0105, + "step": 37400 + }, + { + "epoch": 4.664588528678304, + "grad_norm": 0.000193897751159966, + "learning_rate": 1.3506234413965089e-06, + "loss": 0.0, + "step": 37410 + }, + { + "epoch": 4.665835411471321, + "grad_norm": 0.001126578776165843, + "learning_rate": 1.345635910224439e-06, + "loss": 0.0, + "step": 37420 + }, + { + "epoch": 4.667082294264339, + "grad_norm": 0.04877956211566925, + "learning_rate": 1.3406483790523693e-06, + "loss": 0.0, + "step": 37430 + }, + { + "epoch": 4.668329177057356, + "grad_norm": 0.0032060265075415373, + "learning_rate": 1.3356608478802993e-06, + "loss": 0.0, + "step": 37440 + }, + { + "epoch": 4.669576059850374, + "grad_norm": 0.00014094925427343696, + "learning_rate": 1.3306733167082294e-06, + "loss": 0.0, + "step": 37450 + }, + { + "epoch": 4.670822942643391, + "grad_norm": 0.1798996776342392, + "learning_rate": 1.3256857855361596e-06, + "loss": 0.0, + "step": 37460 + }, + { + "epoch": 4.6720698254364095, + 
"grad_norm": 0.002865089802071452, + "learning_rate": 1.3206982543640898e-06, + "loss": 0.0, + "step": 37470 + }, + { + "epoch": 4.673316708229427, + "grad_norm": 0.00010365377966081724, + "learning_rate": 1.3157107231920198e-06, + "loss": 0.0, + "step": 37480 + }, + { + "epoch": 4.674563591022444, + "grad_norm": 0.0003474355034995824, + "learning_rate": 1.3107231920199504e-06, + "loss": 0.0, + "step": 37490 + }, + { + "epoch": 4.675810473815462, + "grad_norm": 0.016976043581962585, + "learning_rate": 1.3057356608478804e-06, + "loss": 0.0, + "step": 37500 + }, + { + "epoch": 4.677057356608479, + "grad_norm": 0.00019507709657773376, + "learning_rate": 1.3007481296758106e-06, + "loss": 0.0, + "step": 37510 + }, + { + "epoch": 4.678304239401497, + "grad_norm": 7.908452244009823e-05, + "learning_rate": 1.2957605985037408e-06, + "loss": 0.0164, + "step": 37520 + }, + { + "epoch": 4.679551122194514, + "grad_norm": 0.00025233966880477965, + "learning_rate": 1.290773067331671e-06, + "loss": 0.0219, + "step": 37530 + }, + { + "epoch": 4.6807980049875315, + "grad_norm": 5.758203042205423e-05, + "learning_rate": 1.2857855361596012e-06, + "loss": 0.0, + "step": 37540 + }, + { + "epoch": 4.682044887780549, + "grad_norm": 0.000734044355340302, + "learning_rate": 1.2807980049875312e-06, + "loss": 0.0, + "step": 37550 + }, + { + "epoch": 4.683291770573566, + "grad_norm": 4.313397585065104e-05, + "learning_rate": 1.2758104738154614e-06, + "loss": 0.0465, + "step": 37560 + }, + { + "epoch": 4.684538653366584, + "grad_norm": 32.49720764160156, + "learning_rate": 1.2708229426433916e-06, + "loss": 0.0364, + "step": 37570 + }, + { + "epoch": 4.685785536159601, + "grad_norm": 0.00016528677952010185, + "learning_rate": 1.2658354114713218e-06, + "loss": 0.0002, + "step": 37580 + }, + { + "epoch": 4.687032418952619, + "grad_norm": 0.004134850576519966, + "learning_rate": 1.2608478802992518e-06, + "loss": 0.0, + "step": 37590 + }, + { + "epoch": 4.688279301745636, + "grad_norm": 
0.0005740217748098075, + "learning_rate": 1.255860349127182e-06, + "loss": 0.0, + "step": 37600 + }, + { + "epoch": 4.6895261845386536, + "grad_norm": 5.220138336881064e-05, + "learning_rate": 1.2508728179551124e-06, + "loss": 0.0, + "step": 37610 + }, + { + "epoch": 4.690773067331671, + "grad_norm": 9.989990940084681e-05, + "learning_rate": 1.2458852867830426e-06, + "loss": 0.0, + "step": 37620 + }, + { + "epoch": 4.692019950124688, + "grad_norm": 0.001709539326839149, + "learning_rate": 1.2408977556109726e-06, + "loss": 0.0, + "step": 37630 + }, + { + "epoch": 4.693266832917706, + "grad_norm": 0.02466406300663948, + "learning_rate": 1.235910224438903e-06, + "loss": 0.0, + "step": 37640 + }, + { + "epoch": 4.694513715710723, + "grad_norm": 7.842500053811818e-05, + "learning_rate": 1.2309226932668332e-06, + "loss": 0.0, + "step": 37650 + }, + { + "epoch": 4.695760598503741, + "grad_norm": 0.00015703195822425187, + "learning_rate": 1.2259351620947631e-06, + "loss": 0.0001, + "step": 37660 + }, + { + "epoch": 4.697007481296758, + "grad_norm": 0.0005103639559820294, + "learning_rate": 1.2209476309226933e-06, + "loss": 0.0, + "step": 37670 + }, + { + "epoch": 4.698254364089776, + "grad_norm": 0.0009048631764017045, + "learning_rate": 1.2159600997506235e-06, + "loss": 0.0, + "step": 37680 + }, + { + "epoch": 4.699501246882793, + "grad_norm": 0.008464858867228031, + "learning_rate": 1.2109725685785537e-06, + "loss": 0.0001, + "step": 37690 + }, + { + "epoch": 4.7007481296758105, + "grad_norm": 7.08570732967928e-05, + "learning_rate": 1.205985037406484e-06, + "loss": 0.0, + "step": 37700 + }, + { + "epoch": 4.701995012468828, + "grad_norm": 4.024140434921719e-05, + "learning_rate": 1.2009975062344141e-06, + "loss": 0.0, + "step": 37710 + }, + { + "epoch": 4.703241895261845, + "grad_norm": 0.0004707244224846363, + "learning_rate": 1.1960099750623443e-06, + "loss": 0.0092, + "step": 37720 + }, + { + "epoch": 4.704488778054863, + "grad_norm": 0.010790945962071419, + 
"learning_rate": 1.1910224438902745e-06, + "loss": 0.0, + "step": 37730 + }, + { + "epoch": 4.70573566084788, + "grad_norm": 5.667881850968115e-05, + "learning_rate": 1.1860349127182045e-06, + "loss": 0.0, + "step": 37740 + }, + { + "epoch": 4.706982543640898, + "grad_norm": 8.554881787858903e-05, + "learning_rate": 1.1810473815461347e-06, + "loss": 0.0011, + "step": 37750 + }, + { + "epoch": 4.708229426433915, + "grad_norm": 6.647824920946732e-05, + "learning_rate": 1.1760598503740649e-06, + "loss": 0.0, + "step": 37760 + }, + { + "epoch": 4.7094763092269325, + "grad_norm": 0.0001823468046495691, + "learning_rate": 1.171072319201995e-06, + "loss": 0.0718, + "step": 37770 + }, + { + "epoch": 4.71072319201995, + "grad_norm": 3.875693801091984e-05, + "learning_rate": 1.1660847880299253e-06, + "loss": 0.0112, + "step": 37780 + }, + { + "epoch": 4.711970074812967, + "grad_norm": 0.0006465655169449747, + "learning_rate": 1.1610972568578555e-06, + "loss": 0.0, + "step": 37790 + }, + { + "epoch": 4.713216957605985, + "grad_norm": 0.0006873203092254698, + "learning_rate": 1.1561097256857857e-06, + "loss": 0.0, + "step": 37800 + }, + { + "epoch": 4.714463840399002, + "grad_norm": 0.00017582096916157752, + "learning_rate": 1.1511221945137159e-06, + "loss": 0.0, + "step": 37810 + }, + { + "epoch": 4.71571072319202, + "grad_norm": 0.002465422498062253, + "learning_rate": 1.1461346633416458e-06, + "loss": 0.0049, + "step": 37820 + }, + { + "epoch": 4.716957605985037, + "grad_norm": 0.00022965417883824557, + "learning_rate": 1.1411471321695763e-06, + "loss": 0.0, + "step": 37830 + }, + { + "epoch": 4.7182044887780545, + "grad_norm": 0.00011124199227197096, + "learning_rate": 1.1361596009975065e-06, + "loss": 0.0, + "step": 37840 + }, + { + "epoch": 4.719451371571072, + "grad_norm": 0.00040846472256816924, + "learning_rate": 1.1311720698254364e-06, + "loss": 0.0741, + "step": 37850 + }, + { + "epoch": 4.720698254364089, + "grad_norm": 0.00880060438066721, + "learning_rate": 
1.1261845386533666e-06, + "loss": 0.0, + "step": 37860 + }, + { + "epoch": 4.721945137157107, + "grad_norm": 5.207054709899239e-05, + "learning_rate": 1.1211970074812968e-06, + "loss": 0.091, + "step": 37870 + }, + { + "epoch": 4.723192019950124, + "grad_norm": 0.00037851626984775066, + "learning_rate": 1.116209476309227e-06, + "loss": 0.0, + "step": 37880 + }, + { + "epoch": 4.724438902743142, + "grad_norm": 0.0003735979844350368, + "learning_rate": 1.1112219451371572e-06, + "loss": 0.0, + "step": 37890 + }, + { + "epoch": 4.725685785536159, + "grad_norm": 0.00029443929088301957, + "learning_rate": 1.1062344139650874e-06, + "loss": 0.0, + "step": 37900 + }, + { + "epoch": 4.726932668329177, + "grad_norm": 9.832724026637152e-05, + "learning_rate": 1.1012468827930176e-06, + "loss": 0.0, + "step": 37910 + }, + { + "epoch": 4.728179551122195, + "grad_norm": 0.00041520065860822797, + "learning_rate": 1.0962593516209478e-06, + "loss": 0.0, + "step": 37920 + }, + { + "epoch": 4.729426433915212, + "grad_norm": 7.619358802912757e-05, + "learning_rate": 1.0912718204488778e-06, + "loss": 0.0, + "step": 37930 + }, + { + "epoch": 4.73067331670823, + "grad_norm": 0.00065153295872733, + "learning_rate": 1.086284289276808e-06, + "loss": 0.0, + "step": 37940 + }, + { + "epoch": 4.731920199501247, + "grad_norm": 0.0009183948859572411, + "learning_rate": 1.0812967581047384e-06, + "loss": 0.0, + "step": 37950 + }, + { + "epoch": 4.733167082294265, + "grad_norm": 0.0008579808054491878, + "learning_rate": 1.0763092269326684e-06, + "loss": 0.0001, + "step": 37960 + }, + { + "epoch": 4.734413965087282, + "grad_norm": 0.00019635986245702952, + "learning_rate": 1.0713216957605986e-06, + "loss": 0.0003, + "step": 37970 + }, + { + "epoch": 4.7356608478802995, + "grad_norm": 0.0011371946893632412, + "learning_rate": 1.0663341645885288e-06, + "loss": 0.0, + "step": 37980 + }, + { + "epoch": 4.736907730673317, + "grad_norm": 5.076320667285472e-05, + "learning_rate": 1.061346633416459e-06, + 
"loss": 0.0011, + "step": 37990 + }, + { + "epoch": 4.738154613466334, + "grad_norm": 4.8724108637543395e-05, + "learning_rate": 1.0563591022443892e-06, + "loss": 0.0, + "step": 38000 + }, + { + "epoch": 4.739401496259352, + "grad_norm": 0.00011148832709295675, + "learning_rate": 1.0513715710723194e-06, + "loss": 0.0, + "step": 38010 + }, + { + "epoch": 4.740648379052369, + "grad_norm": 7.638114766450599e-05, + "learning_rate": 1.0463840399002496e-06, + "loss": 0.0, + "step": 38020 + }, + { + "epoch": 4.741895261845387, + "grad_norm": 0.00010628465679474175, + "learning_rate": 1.0413965087281798e-06, + "loss": 0.0129, + "step": 38030 + }, + { + "epoch": 4.743142144638404, + "grad_norm": 6.248387217056006e-05, + "learning_rate": 1.0364089775561097e-06, + "loss": 0.0, + "step": 38040 + }, + { + "epoch": 4.7443890274314215, + "grad_norm": 5.823820174555294e-05, + "learning_rate": 1.03142144638404e-06, + "loss": 0.0, + "step": 38050 + }, + { + "epoch": 4.745635910224439, + "grad_norm": 0.005552787333726883, + "learning_rate": 1.0264339152119701e-06, + "loss": 0.0, + "step": 38060 + }, + { + "epoch": 4.746882793017456, + "grad_norm": 0.004743535071611404, + "learning_rate": 1.0214463840399003e-06, + "loss": 0.0, + "step": 38070 + }, + { + "epoch": 4.748129675810474, + "grad_norm": 5.940657501923852e-05, + "learning_rate": 1.0164588528678305e-06, + "loss": 0.0, + "step": 38080 + }, + { + "epoch": 4.749376558603491, + "grad_norm": 7.897276373114437e-05, + "learning_rate": 1.0114713216957607e-06, + "loss": 0.0, + "step": 38090 + }, + { + "epoch": 4.750623441396509, + "grad_norm": 8.272264676634222e-05, + "learning_rate": 1.006483790523691e-06, + "loss": 0.0, + "step": 38100 + }, + { + "epoch": 4.751870324189526, + "grad_norm": 0.0009390998166054487, + "learning_rate": 1.0014962593516211e-06, + "loss": 0.0, + "step": 38110 + }, + { + "epoch": 4.753117206982544, + "grad_norm": 6.277462671278045e-05, + "learning_rate": 9.96508728179551e-07, + "loss": 0.0, + "step": 38120 + }, 
+ { + "epoch": 4.754364089775561, + "grad_norm": 3.5075980122201145e-05, + "learning_rate": 9.915211970074813e-07, + "loss": 0.0075, + "step": 38130 + }, + { + "epoch": 4.7556109725685785, + "grad_norm": 0.00023939134553074837, + "learning_rate": 9.865336658354117e-07, + "loss": 0.0, + "step": 38140 + }, + { + "epoch": 4.756857855361596, + "grad_norm": 0.00011813923629233614, + "learning_rate": 9.815461346633417e-07, + "loss": 0.0001, + "step": 38150 + }, + { + "epoch": 4.758104738154613, + "grad_norm": 0.00019262704881839454, + "learning_rate": 9.765586034912719e-07, + "loss": 0.0, + "step": 38160 + }, + { + "epoch": 4.759351620947631, + "grad_norm": 4.599255771609023e-05, + "learning_rate": 9.71571072319202e-07, + "loss": 0.0, + "step": 38170 + }, + { + "epoch": 4.760598503740648, + "grad_norm": 6.70359586365521e-05, + "learning_rate": 9.665835411471323e-07, + "loss": 0.0, + "step": 38180 + }, + { + "epoch": 4.761845386533666, + "grad_norm": 0.0001310739608015865, + "learning_rate": 9.615960099750625e-07, + "loss": 0.0, + "step": 38190 + }, + { + "epoch": 4.763092269326683, + "grad_norm": 0.0007167681469582021, + "learning_rate": 9.566084788029927e-07, + "loss": 0.0, + "step": 38200 + }, + { + "epoch": 4.7643391521197005, + "grad_norm": 0.0002816536871250719, + "learning_rate": 9.516209476309229e-07, + "loss": 0.0, + "step": 38210 + }, + { + "epoch": 4.765586034912718, + "grad_norm": 0.0004467430990189314, + "learning_rate": 9.466334164588529e-07, + "loss": 0.0, + "step": 38220 + }, + { + "epoch": 4.766832917705735, + "grad_norm": 0.00010718198609538376, + "learning_rate": 9.416458852867831e-07, + "loss": 0.0, + "step": 38230 + }, + { + "epoch": 4.768079800498753, + "grad_norm": 0.03801560774445534, + "learning_rate": 9.366583541147132e-07, + "loss": 0.0, + "step": 38240 + }, + { + "epoch": 4.76932668329177, + "grad_norm": 4.085026739630848e-05, + "learning_rate": 9.316708229426434e-07, + "loss": 0.0, + "step": 38250 + }, + { + "epoch": 4.770573566084788, + 
"grad_norm": 46.325836181640625, + "learning_rate": 9.266832917705737e-07, + "loss": 0.0069, + "step": 38260 + }, + { + "epoch": 4.771820448877805, + "grad_norm": 0.0001395172148477286, + "learning_rate": 9.216957605985038e-07, + "loss": 0.0, + "step": 38270 + }, + { + "epoch": 4.773067331670823, + "grad_norm": 0.0033392098266631365, + "learning_rate": 9.16708229426434e-07, + "loss": 0.0, + "step": 38280 + }, + { + "epoch": 4.774314214463841, + "grad_norm": 7.12929613655433e-05, + "learning_rate": 9.117206982543641e-07, + "loss": 0.0, + "step": 38290 + }, + { + "epoch": 4.775561097256858, + "grad_norm": 0.0020014506299048662, + "learning_rate": 9.067331670822943e-07, + "loss": 0.0, + "step": 38300 + }, + { + "epoch": 4.776807980049876, + "grad_norm": 0.00013769141514785588, + "learning_rate": 9.017456359102245e-07, + "loss": 0.0, + "step": 38310 + }, + { + "epoch": 4.778054862842893, + "grad_norm": 8.867657015798613e-05, + "learning_rate": 8.967581047381548e-07, + "loss": 0.0006, + "step": 38320 + }, + { + "epoch": 4.779301745635911, + "grad_norm": 0.0008881071116775274, + "learning_rate": 8.917705735660849e-07, + "loss": 0.0002, + "step": 38330 + }, + { + "epoch": 4.780548628428928, + "grad_norm": 0.0002598560240585357, + "learning_rate": 8.867830423940151e-07, + "loss": 0.0, + "step": 38340 + }, + { + "epoch": 4.7817955112219455, + "grad_norm": 0.004675508942455053, + "learning_rate": 8.817955112219452e-07, + "loss": 0.0, + "step": 38350 + }, + { + "epoch": 4.783042394014963, + "grad_norm": 0.009007451124489307, + "learning_rate": 8.768079800498754e-07, + "loss": 0.0, + "step": 38360 + }, + { + "epoch": 4.78428927680798, + "grad_norm": 4.994167829863727e-05, + "learning_rate": 8.718204488778055e-07, + "loss": 0.0, + "step": 38370 + }, + { + "epoch": 4.785536159600998, + "grad_norm": 0.000342125742463395, + "learning_rate": 8.668329177057358e-07, + "loss": 0.0, + "step": 38380 + }, + { + "epoch": 4.786783042394015, + "grad_norm": 0.0014879869995638728, + 
"learning_rate": 8.61845386533666e-07, + "loss": 0.0, + "step": 38390 + }, + { + "epoch": 4.788029925187033, + "grad_norm": 0.0005899175885133445, + "learning_rate": 8.568578553615962e-07, + "loss": 0.0, + "step": 38400 + }, + { + "epoch": 4.78927680798005, + "grad_norm": 0.0007447946700267494, + "learning_rate": 8.518703241895262e-07, + "loss": 0.0, + "step": 38410 + }, + { + "epoch": 4.7905236907730675, + "grad_norm": 0.00011481838737381622, + "learning_rate": 8.468827930174564e-07, + "loss": 0.0, + "step": 38420 + }, + { + "epoch": 4.791770573566085, + "grad_norm": 0.00012223895464558154, + "learning_rate": 8.418952618453865e-07, + "loss": 0.0, + "step": 38430 + }, + { + "epoch": 4.793017456359102, + "grad_norm": 5.043927740189247e-05, + "learning_rate": 8.369077306733167e-07, + "loss": 0.0, + "step": 38440 + }, + { + "epoch": 4.79426433915212, + "grad_norm": 0.0004362256149761379, + "learning_rate": 8.31920199501247e-07, + "loss": 0.0002, + "step": 38450 + }, + { + "epoch": 4.795511221945137, + "grad_norm": 0.0016937785549089313, + "learning_rate": 8.269326683291771e-07, + "loss": 0.035, + "step": 38460 + }, + { + "epoch": 4.796758104738155, + "grad_norm": 0.00018390719196759164, + "learning_rate": 8.219451371571073e-07, + "loss": 0.0, + "step": 38470 + }, + { + "epoch": 4.798004987531172, + "grad_norm": 5.6730925280135125e-05, + "learning_rate": 8.174563591022444e-07, + "loss": 0.03, + "step": 38480 + }, + { + "epoch": 4.7992518703241895, + "grad_norm": 0.00022890319814905524, + "learning_rate": 8.124688279301746e-07, + "loss": 0.0368, + "step": 38490 + }, + { + "epoch": 4.800498753117207, + "grad_norm": 0.00020031262829434127, + "learning_rate": 8.074812967581049e-07, + "loss": 0.0, + "step": 38500 + }, + { + "epoch": 4.801745635910224, + "grad_norm": 45.181060791015625, + "learning_rate": 8.02493765586035e-07, + "loss": 0.0328, + "step": 38510 + }, + { + "epoch": 4.802992518703242, + "grad_norm": 0.0005216131103225052, + "learning_rate": 
7.975062344139652e-07, + "loss": 0.0091, + "step": 38520 + }, + { + "epoch": 4.804239401496259, + "grad_norm": 6.901921005919576e-05, + "learning_rate": 7.925187032418953e-07, + "loss": 0.0, + "step": 38530 + }, + { + "epoch": 4.805486284289277, + "grad_norm": 0.024445077404379845, + "learning_rate": 7.875311720698255e-07, + "loss": 0.0, + "step": 38540 + }, + { + "epoch": 4.806733167082294, + "grad_norm": 0.003273892914876342, + "learning_rate": 7.825436408977556e-07, + "loss": 0.0486, + "step": 38550 + }, + { + "epoch": 4.807980049875312, + "grad_norm": 0.00015414993686135858, + "learning_rate": 7.77556109725686e-07, + "loss": 0.0, + "step": 38560 + }, + { + "epoch": 4.809226932668329, + "grad_norm": 6.870167999295518e-05, + "learning_rate": 7.72568578553616e-07, + "loss": 0.0, + "step": 38570 + }, + { + "epoch": 4.8104738154613464, + "grad_norm": 0.001188624300993979, + "learning_rate": 7.675810473815462e-07, + "loss": 0.0, + "step": 38580 + }, + { + "epoch": 4.811720698254364, + "grad_norm": 6.985004438320175e-05, + "learning_rate": 7.625935162094763e-07, + "loss": 0.0, + "step": 38590 + }, + { + "epoch": 4.812967581047381, + "grad_norm": 7.557850040029734e-05, + "learning_rate": 7.576059850374065e-07, + "loss": 0.0, + "step": 38600 + }, + { + "epoch": 4.814214463840399, + "grad_norm": 0.00018456621910445392, + "learning_rate": 7.526184538653366e-07, + "loss": 0.0, + "step": 38610 + }, + { + "epoch": 4.815461346633416, + "grad_norm": 0.00035883314558304846, + "learning_rate": 7.476309226932669e-07, + "loss": 0.0, + "step": 38620 + }, + { + "epoch": 4.816708229426434, + "grad_norm": 0.0002625574416015297, + "learning_rate": 7.426433915211971e-07, + "loss": 0.0061, + "step": 38630 + }, + { + "epoch": 4.817955112219451, + "grad_norm": 0.00019378215074539185, + "learning_rate": 7.376558603491273e-07, + "loss": 0.0002, + "step": 38640 + }, + { + "epoch": 4.8192019950124685, + "grad_norm": 7.995535270310938e-05, + "learning_rate": 7.326683291770574e-07, + "loss": 
0.0, + "step": 38650 + }, + { + "epoch": 4.820448877805486, + "grad_norm": 0.0010213602799922228, + "learning_rate": 7.276807980049876e-07, + "loss": 0.0, + "step": 38660 + }, + { + "epoch": 4.821695760598503, + "grad_norm": 9.212228178512305e-05, + "learning_rate": 7.226932668329177e-07, + "loss": 0.0056, + "step": 38670 + }, + { + "epoch": 4.822942643391521, + "grad_norm": 6.223101081559435e-05, + "learning_rate": 7.17705735660848e-07, + "loss": 0.0, + "step": 38680 + }, + { + "epoch": 4.824189526184538, + "grad_norm": 0.0006266386481001973, + "learning_rate": 7.127182044887782e-07, + "loss": 0.0, + "step": 38690 + }, + { + "epoch": 4.825436408977556, + "grad_norm": 6.241526716621593e-05, + "learning_rate": 7.077306733167083e-07, + "loss": 0.0, + "step": 38700 + }, + { + "epoch": 4.826683291770573, + "grad_norm": 0.00010659959662007168, + "learning_rate": 7.027431421446385e-07, + "loss": 0.0, + "step": 38710 + }, + { + "epoch": 4.8279301745635905, + "grad_norm": 5.2077397413086146e-05, + "learning_rate": 6.977556109725686e-07, + "loss": 0.0, + "step": 38720 + }, + { + "epoch": 4.829177057356609, + "grad_norm": 0.0012044019531458616, + "learning_rate": 6.927680798004988e-07, + "loss": 0.0, + "step": 38730 + }, + { + "epoch": 4.830423940149626, + "grad_norm": 0.0006541931070387363, + "learning_rate": 6.877805486284291e-07, + "loss": 0.0, + "step": 38740 + }, + { + "epoch": 4.831670822942644, + "grad_norm": 7.856899901526049e-05, + "learning_rate": 6.827930174563593e-07, + "loss": 0.0, + "step": 38750 + }, + { + "epoch": 4.832917705735661, + "grad_norm": 0.01197484228760004, + "learning_rate": 6.778054862842893e-07, + "loss": 0.0532, + "step": 38760 + }, + { + "epoch": 4.834164588528679, + "grad_norm": 0.00016221591795329005, + "learning_rate": 6.728179551122195e-07, + "loss": 0.0, + "step": 38770 + }, + { + "epoch": 4.835411471321696, + "grad_norm": 0.006540379952639341, + "learning_rate": 6.678304239401496e-07, + "loss": 0.0, + "step": 38780 + }, + { + "epoch": 
4.8366583541147135, + "grad_norm": 4.641401028493419e-05, + "learning_rate": 6.628428927680798e-07, + "loss": 0.0, + "step": 38790 + }, + { + "epoch": 4.837905236907731, + "grad_norm": 0.00019889388931915164, + "learning_rate": 6.578553615960099e-07, + "loss": 0.0, + "step": 38800 + }, + { + "epoch": 4.839152119700748, + "grad_norm": 0.003469712333753705, + "learning_rate": 6.528678304239402e-07, + "loss": 0.0, + "step": 38810 + }, + { + "epoch": 4.840399002493766, + "grad_norm": 0.00012375341611914337, + "learning_rate": 6.478802992518704e-07, + "loss": 0.0, + "step": 38820 + }, + { + "epoch": 4.841645885286783, + "grad_norm": 0.0009892029920592904, + "learning_rate": 6.428927680798006e-07, + "loss": 0.0001, + "step": 38830 + }, + { + "epoch": 4.842892768079801, + "grad_norm": 0.014581103809177876, + "learning_rate": 6.379052369077307e-07, + "loss": 0.0002, + "step": 38840 + }, + { + "epoch": 4.844139650872818, + "grad_norm": 6.556464359164238e-05, + "learning_rate": 6.329177057356609e-07, + "loss": 0.0, + "step": 38850 + }, + { + "epoch": 4.8453865336658355, + "grad_norm": 7.368988735834137e-05, + "learning_rate": 6.27930174563591e-07, + "loss": 0.0, + "step": 38860 + }, + { + "epoch": 4.846633416458853, + "grad_norm": 0.0001805610372684896, + "learning_rate": 6.229426433915213e-07, + "loss": 0.0, + "step": 38870 + }, + { + "epoch": 4.84788029925187, + "grad_norm": 0.00016412808327004313, + "learning_rate": 6.179551122194515e-07, + "loss": 0.0, + "step": 38880 + }, + { + "epoch": 4.849127182044888, + "grad_norm": 0.00047609827015548944, + "learning_rate": 6.129675810473816e-07, + "loss": 0.0163, + "step": 38890 + }, + { + "epoch": 4.850374064837905, + "grad_norm": 0.004086254630237818, + "learning_rate": 6.079800498753118e-07, + "loss": 0.0018, + "step": 38900 + }, + { + "epoch": 4.851620947630923, + "grad_norm": 0.0005081315175630152, + "learning_rate": 6.02992518703242e-07, + "loss": 0.0001, + "step": 38910 + }, + { + "epoch": 4.85286783042394, + "grad_norm": 
0.0003619830240495503, + "learning_rate": 5.980049875311722e-07, + "loss": 0.0003, + "step": 38920 + }, + { + "epoch": 4.8541147132169575, + "grad_norm": 8.470124157611281e-05, + "learning_rate": 5.930174563591022e-07, + "loss": 0.001, + "step": 38930 + }, + { + "epoch": 4.855361596009975, + "grad_norm": 0.00033788950531743467, + "learning_rate": 5.880299251870324e-07, + "loss": 0.0, + "step": 38940 + }, + { + "epoch": 4.856608478802992, + "grad_norm": 0.009631326422095299, + "learning_rate": 5.830423940149626e-07, + "loss": 0.0001, + "step": 38950 + }, + { + "epoch": 4.85785536159601, + "grad_norm": 6.478117575170472e-05, + "learning_rate": 5.780548628428928e-07, + "loss": 0.0, + "step": 38960 + }, + { + "epoch": 4.859102244389027, + "grad_norm": 0.00014513339556287974, + "learning_rate": 5.730673316708229e-07, + "loss": 0.0371, + "step": 38970 + }, + { + "epoch": 4.860349127182045, + "grad_norm": 0.002222485141828656, + "learning_rate": 5.680798004987532e-07, + "loss": 0.0, + "step": 38980 + }, + { + "epoch": 4.861596009975062, + "grad_norm": 6.835142994532362e-05, + "learning_rate": 5.630922693266833e-07, + "loss": 0.0, + "step": 38990 + }, + { + "epoch": 4.86284289276808, + "grad_norm": 8.790072752162814e-05, + "learning_rate": 5.581047381546135e-07, + "loss": 0.0001, + "step": 39000 + }, + { + "epoch": 4.864089775561097, + "grad_norm": 6.300320819718763e-05, + "learning_rate": 5.531172069825437e-07, + "loss": 0.0, + "step": 39010 + }, + { + "epoch": 4.865336658354114, + "grad_norm": 0.00011180248111486435, + "learning_rate": 5.481296758104739e-07, + "loss": 0.0, + "step": 39020 + }, + { + "epoch": 4.866583541147132, + "grad_norm": 7.89304613135755e-05, + "learning_rate": 5.43142144638404e-07, + "loss": 0.0, + "step": 39030 + }, + { + "epoch": 4.867830423940149, + "grad_norm": 0.00046899239532649517, + "learning_rate": 5.381546134663342e-07, + "loss": 0.0, + "step": 39040 + }, + { + "epoch": 4.869077306733167, + "grad_norm": 0.00407341867685318, + 
"learning_rate": 5.331670822942644e-07, + "loss": 0.0, + "step": 39050 + }, + { + "epoch": 4.870324189526184, + "grad_norm": 0.000564980844501406, + "learning_rate": 5.281795511221946e-07, + "loss": 0.0, + "step": 39060 + }, + { + "epoch": 4.871571072319202, + "grad_norm": 9.515364217804745e-05, + "learning_rate": 5.231920199501248e-07, + "loss": 0.0, + "step": 39070 + }, + { + "epoch": 4.87281795511222, + "grad_norm": 0.00015221108333207667, + "learning_rate": 5.182044887780549e-07, + "loss": 0.002, + "step": 39080 + }, + { + "epoch": 4.874064837905237, + "grad_norm": 0.007447673939168453, + "learning_rate": 5.132169576059851e-07, + "loss": 0.0, + "step": 39090 + }, + { + "epoch": 4.875311720698255, + "grad_norm": 0.00013376775314100087, + "learning_rate": 5.082294264339153e-07, + "loss": 0.0329, + "step": 39100 + }, + { + "epoch": 4.876558603491272, + "grad_norm": 0.00014891306636855006, + "learning_rate": 5.032418952618455e-07, + "loss": 0.0, + "step": 39110 + }, + { + "epoch": 4.87780548628429, + "grad_norm": 0.3053286671638489, + "learning_rate": 4.982543640897755e-07, + "loss": 0.0, + "step": 39120 + }, + { + "epoch": 4.879052369077307, + "grad_norm": 6.43767198198475e-05, + "learning_rate": 4.932668329177058e-07, + "loss": 0.0, + "step": 39130 + }, + { + "epoch": 4.8802992518703245, + "grad_norm": 0.00010566661512712017, + "learning_rate": 4.882793017456359e-07, + "loss": 0.0, + "step": 39140 + }, + { + "epoch": 4.881546134663342, + "grad_norm": 0.00010986766574205831, + "learning_rate": 4.832917705735661e-07, + "loss": 0.0, + "step": 39150 + }, + { + "epoch": 4.882793017456359, + "grad_norm": 9.44228595471941e-05, + "learning_rate": 4.783042394014963e-07, + "loss": 0.0, + "step": 39160 + }, + { + "epoch": 4.884039900249377, + "grad_norm": 0.00012891492224298418, + "learning_rate": 4.7331670822942647e-07, + "loss": 0.0, + "step": 39170 + }, + { + "epoch": 4.885286783042394, + "grad_norm": 0.00010865272633964196, + "learning_rate": 4.683291770573566e-07, + 
"loss": 0.0, + "step": 39180 + }, + { + "epoch": 4.886533665835412, + "grad_norm": 0.0020185792818665504, + "learning_rate": 4.6334164588528686e-07, + "loss": 0.0042, + "step": 39190 + }, + { + "epoch": 4.887780548628429, + "grad_norm": 0.00019098054326605052, + "learning_rate": 4.58354114713217e-07, + "loss": 0.0, + "step": 39200 + }, + { + "epoch": 4.889027431421447, + "grad_norm": 3.40030892402865e-05, + "learning_rate": 4.5336658354114715e-07, + "loss": 0.0, + "step": 39210 + }, + { + "epoch": 4.890274314214464, + "grad_norm": 0.00010806202044477686, + "learning_rate": 4.483790523690774e-07, + "loss": 0.0, + "step": 39220 + }, + { + "epoch": 4.8915211970074814, + "grad_norm": 7.696980173932388e-05, + "learning_rate": 4.4339152119700754e-07, + "loss": 0.0, + "step": 39230 + }, + { + "epoch": 4.892768079800499, + "grad_norm": 3.1001432944322005e-05, + "learning_rate": 4.384039900249377e-07, + "loss": 0.0, + "step": 39240 + }, + { + "epoch": 4.894014962593516, + "grad_norm": 0.000295392848784104, + "learning_rate": 4.334164588528679e-07, + "loss": 0.0, + "step": 39250 + }, + { + "epoch": 4.895261845386534, + "grad_norm": 0.0001579842937644571, + "learning_rate": 4.284289276807981e-07, + "loss": 0.0494, + "step": 39260 + }, + { + "epoch": 4.896508728179551, + "grad_norm": 0.000137755909236148, + "learning_rate": 4.234413965087282e-07, + "loss": 0.0, + "step": 39270 + }, + { + "epoch": 4.897755610972569, + "grad_norm": 0.00016627574223093688, + "learning_rate": 4.1845386533665836e-07, + "loss": 0.0006, + "step": 39280 + }, + { + "epoch": 4.899002493765586, + "grad_norm": 0.00019424950005486608, + "learning_rate": 4.1346633416458856e-07, + "loss": 0.0, + "step": 39290 + }, + { + "epoch": 4.9002493765586035, + "grad_norm": 0.00012249869178049266, + "learning_rate": 4.084788029925187e-07, + "loss": 0.0016, + "step": 39300 + }, + { + "epoch": 4.901496259351621, + "grad_norm": 3.534234201651998e-05, + "learning_rate": 4.034912718204489e-07, + "loss": 0.0, + "step": 39310 
+ }, + { + "epoch": 4.902743142144638, + "grad_norm": 5.2465853514149785e-05, + "learning_rate": 3.985037406483791e-07, + "loss": 0.0, + "step": 39320 + }, + { + "epoch": 4.903990024937656, + "grad_norm": 0.00019393919501453638, + "learning_rate": 3.9351620947630924e-07, + "loss": 0.0, + "step": 39330 + }, + { + "epoch": 4.905236907730673, + "grad_norm": 0.0001156614744104445, + "learning_rate": 3.885286783042394e-07, + "loss": 0.0, + "step": 39340 + }, + { + "epoch": 4.906483790523691, + "grad_norm": 0.00013547898561228067, + "learning_rate": 3.8354114713216963e-07, + "loss": 0.0, + "step": 39350 + }, + { + "epoch": 4.907730673316708, + "grad_norm": 0.001548072206787765, + "learning_rate": 3.7855361596009977e-07, + "loss": 0.0002, + "step": 39360 + }, + { + "epoch": 4.9089775561097255, + "grad_norm": 0.00010577407374512404, + "learning_rate": 3.735660847880299e-07, + "loss": 0.0001, + "step": 39370 + }, + { + "epoch": 4.910224438902743, + "grad_norm": 0.0005236375727690756, + "learning_rate": 3.6857855361596016e-07, + "loss": 0.0, + "step": 39380 + }, + { + "epoch": 4.91147132169576, + "grad_norm": 0.00018029804050456733, + "learning_rate": 3.635910224438903e-07, + "loss": 0.0, + "step": 39390 + }, + { + "epoch": 4.912718204488778, + "grad_norm": 7.623321289429441e-05, + "learning_rate": 3.5860349127182045e-07, + "loss": 0.0, + "step": 39400 + }, + { + "epoch": 4.913965087281795, + "grad_norm": 6.454013055190444e-05, + "learning_rate": 3.536159600997507e-07, + "loss": 0.0, + "step": 39410 + }, + { + "epoch": 4.915211970074813, + "grad_norm": 0.00047880125930532813, + "learning_rate": 3.4862842892768084e-07, + "loss": 0.0, + "step": 39420 + }, + { + "epoch": 4.91645885286783, + "grad_norm": 0.0002257343294331804, + "learning_rate": 3.43640897755611e-07, + "loss": 0.0, + "step": 39430 + }, + { + "epoch": 4.917705735660848, + "grad_norm": 5.091218918096274e-05, + "learning_rate": 3.386533665835412e-07, + "loss": 0.0001, + "step": 39440 + }, + { + "epoch": 
4.918952618453865, + "grad_norm": 9.202091314364225e-05, + "learning_rate": 3.336658354114714e-07, + "loss": 0.0, + "step": 39450 + }, + { + "epoch": 4.920199501246882, + "grad_norm": 0.00017863092944025993, + "learning_rate": 3.286783042394015e-07, + "loss": 0.0, + "step": 39460 + }, + { + "epoch": 4.9214463840399, + "grad_norm": 0.0003498582518659532, + "learning_rate": 3.236907730673317e-07, + "loss": 0.0, + "step": 39470 + }, + { + "epoch": 4.922693266832917, + "grad_norm": 0.021878132596611977, + "learning_rate": 3.1870324189526186e-07, + "loss": 0.0, + "step": 39480 + }, + { + "epoch": 4.923940149625935, + "grad_norm": 0.00010034618026111275, + "learning_rate": 3.13715710723192e-07, + "loss": 0.0, + "step": 39490 + }, + { + "epoch": 4.925187032418952, + "grad_norm": 0.0004102317616343498, + "learning_rate": 3.087281795511222e-07, + "loss": 0.0, + "step": 39500 + }, + { + "epoch": 4.92643391521197, + "grad_norm": 0.0001442254288122058, + "learning_rate": 3.037406483790524e-07, + "loss": 0.0, + "step": 39510 + }, + { + "epoch": 4.927680798004987, + "grad_norm": 0.00011706881923601031, + "learning_rate": 2.9875311720698254e-07, + "loss": 0.0, + "step": 39520 + }, + { + "epoch": 4.928927680798005, + "grad_norm": 0.0002453117340337485, + "learning_rate": 2.9376558603491273e-07, + "loss": 0.0, + "step": 39530 + }, + { + "epoch": 4.930174563591023, + "grad_norm": 0.0002756392350420356, + "learning_rate": 2.8877805486284293e-07, + "loss": 0.0, + "step": 39540 + }, + { + "epoch": 4.93142144638404, + "grad_norm": 7.422151247737929e-05, + "learning_rate": 2.8379052369077307e-07, + "loss": 0.0, + "step": 39550 + }, + { + "epoch": 4.932668329177058, + "grad_norm": 7.200430991360918e-05, + "learning_rate": 2.7880299251870327e-07, + "loss": 0.0, + "step": 39560 + }, + { + "epoch": 4.933915211970075, + "grad_norm": 0.00013249997573439032, + "learning_rate": 2.7381546134663346e-07, + "loss": 0.0, + "step": 39570 + }, + { + "epoch": 4.9351620947630925, + "grad_norm": 
0.008125074207782745, + "learning_rate": 2.688279301745636e-07, + "loss": 0.0, + "step": 39580 + }, + { + "epoch": 4.93640897755611, + "grad_norm": 7.183442357927561e-05, + "learning_rate": 2.638403990024938e-07, + "loss": 0.0, + "step": 39590 + }, + { + "epoch": 4.937655860349127, + "grad_norm": 0.00014083849964663386, + "learning_rate": 2.58852867830424e-07, + "loss": 0.0, + "step": 39600 + }, + { + "epoch": 4.938902743142145, + "grad_norm": 0.00042223025229759514, + "learning_rate": 2.5386533665835414e-07, + "loss": 0.0, + "step": 39610 + }, + { + "epoch": 4.940149625935162, + "grad_norm": 8.984528540167958e-05, + "learning_rate": 2.4887780548628434e-07, + "loss": 0.0, + "step": 39620 + }, + { + "epoch": 4.94139650872818, + "grad_norm": 0.000292430748231709, + "learning_rate": 2.438902743142145e-07, + "loss": 0.0, + "step": 39630 + }, + { + "epoch": 4.942643391521197, + "grad_norm": 5.6125212722690776e-05, + "learning_rate": 2.389027431421447e-07, + "loss": 0.0, + "step": 39640 + }, + { + "epoch": 4.943890274314215, + "grad_norm": 0.00033320783404633403, + "learning_rate": 2.3391521197007484e-07, + "loss": 0.0, + "step": 39650 + }, + { + "epoch": 4.945137157107232, + "grad_norm": 0.00011107645696029067, + "learning_rate": 2.2892768079800501e-07, + "loss": 0.0, + "step": 39660 + }, + { + "epoch": 4.946384039900249, + "grad_norm": 0.00048359643551521003, + "learning_rate": 2.2394014962593518e-07, + "loss": 0.0428, + "step": 39670 + }, + { + "epoch": 4.947630922693267, + "grad_norm": 0.0021955021657049656, + "learning_rate": 2.1895261845386535e-07, + "loss": 0.0, + "step": 39680 + }, + { + "epoch": 4.948877805486284, + "grad_norm": 0.0004385665524750948, + "learning_rate": 2.1396508728179552e-07, + "loss": 0.0, + "step": 39690 + }, + { + "epoch": 4.950124688279302, + "grad_norm": 4.744509351439774e-05, + "learning_rate": 2.089775561097257e-07, + "loss": 0.0005, + "step": 39700 + }, + { + "epoch": 4.951371571072319, + "grad_norm": 4.5862499973736703e-05, + 
"learning_rate": 2.039900249376559e-07, + "loss": 0.0, + "step": 39710 + }, + { + "epoch": 4.952618453865337, + "grad_norm": 0.0032922455575317144, + "learning_rate": 1.9900249376558603e-07, + "loss": 0.0001, + "step": 39720 + }, + { + "epoch": 4.953865336658354, + "grad_norm": 0.00016673465142957866, + "learning_rate": 1.9401496259351623e-07, + "loss": 0.0, + "step": 39730 + }, + { + "epoch": 4.9551122194513715, + "grad_norm": 4.499312854022719e-05, + "learning_rate": 1.8902743142144642e-07, + "loss": 0.0, + "step": 39740 + }, + { + "epoch": 4.956359102244389, + "grad_norm": 0.00012304374831728637, + "learning_rate": 1.8403990024937656e-07, + "loss": 0.0, + "step": 39750 + }, + { + "epoch": 4.957605985037406, + "grad_norm": 0.00027020517154596746, + "learning_rate": 1.7905236907730676e-07, + "loss": 0.0063, + "step": 39760 + }, + { + "epoch": 4.958852867830424, + "grad_norm": 0.1422053575515747, + "learning_rate": 1.7406483790523693e-07, + "loss": 0.0, + "step": 39770 + }, + { + "epoch": 4.960099750623441, + "grad_norm": 7.926650869194418e-05, + "learning_rate": 1.6907730673316707e-07, + "loss": 0.0, + "step": 39780 + }, + { + "epoch": 4.961346633416459, + "grad_norm": 5.627061182167381e-05, + "learning_rate": 1.6408977556109727e-07, + "loss": 0.0, + "step": 39790 + }, + { + "epoch": 4.962593516209476, + "grad_norm": 0.00012039417924825102, + "learning_rate": 1.5910224438902747e-07, + "loss": 0.0, + "step": 39800 + }, + { + "epoch": 4.9638403990024935, + "grad_norm": 0.0011466448195278645, + "learning_rate": 1.541147132169576e-07, + "loss": 0.0, + "step": 39810 + }, + { + "epoch": 4.965087281795511, + "grad_norm": 0.00031051429687067866, + "learning_rate": 1.491271820448878e-07, + "loss": 0.0, + "step": 39820 + }, + { + "epoch": 4.966334164588528, + "grad_norm": 9.515901183476672e-05, + "learning_rate": 1.4413965087281797e-07, + "loss": 0.0, + "step": 39830 + }, + { + "epoch": 4.967581047381546, + "grad_norm": 1.0523102283477783, + "learning_rate": 
1.3915211970074814e-07, + "loss": 0.0001, + "step": 39840 + }, + { + "epoch": 4.968827930174563, + "grad_norm": 0.0002691926492843777, + "learning_rate": 1.341645885286783e-07, + "loss": 0.0, + "step": 39850 + }, + { + "epoch": 4.970074812967581, + "grad_norm": 0.002537587657570839, + "learning_rate": 1.2917705735660848e-07, + "loss": 0.0, + "step": 39860 + }, + { + "epoch": 4.971321695760598, + "grad_norm": 4.5559128921013325e-05, + "learning_rate": 1.2418952618453865e-07, + "loss": 0.0, + "step": 39870 + }, + { + "epoch": 4.9725685785536164, + "grad_norm": 3.524202838889323e-05, + "learning_rate": 1.1920199501246885e-07, + "loss": 0.0, + "step": 39880 + }, + { + "epoch": 4.973815461346634, + "grad_norm": 0.00018697594350669533, + "learning_rate": 1.1421446384039902e-07, + "loss": 0.0, + "step": 39890 + }, + { + "epoch": 4.975062344139651, + "grad_norm": 0.0023984755389392376, + "learning_rate": 1.0922693266832919e-07, + "loss": 0.0, + "step": 39900 + }, + { + "epoch": 4.976309226932669, + "grad_norm": 0.0003739015955943614, + "learning_rate": 1.0423940149625936e-07, + "loss": 0.0, + "step": 39910 + }, + { + "epoch": 4.977556109725686, + "grad_norm": 5.126370524521917e-05, + "learning_rate": 9.925187032418954e-08, + "loss": 0.0, + "step": 39920 + }, + { + "epoch": 4.978802992518704, + "grad_norm": 5.107426113681868e-05, + "learning_rate": 9.426433915211971e-08, + "loss": 0.0, + "step": 39930 + }, + { + "epoch": 4.980049875311721, + "grad_norm": 9.020322613650933e-05, + "learning_rate": 8.927680798004988e-08, + "loss": 0.0, + "step": 39940 + }, + { + "epoch": 4.9812967581047385, + "grad_norm": 9.316183422924951e-05, + "learning_rate": 8.428927680798006e-08, + "loss": 0.0, + "step": 39950 + }, + { + "epoch": 4.982543640897756, + "grad_norm": 0.01621038280427456, + "learning_rate": 7.930174563591023e-08, + "loss": 0.0, + "step": 39960 + }, + { + "epoch": 4.983790523690773, + "grad_norm": 0.0002640775346662849, + "learning_rate": 7.43142144638404e-08, + "loss": 0.0, + 
"step": 39970 + }, + { + "epoch": 4.985037406483791, + "grad_norm": 9.524546476313844e-05, + "learning_rate": 6.932668329177058e-08, + "loss": 0.0, + "step": 39980 + }, + { + "epoch": 4.986284289276808, + "grad_norm": 4.487358091864735e-05, + "learning_rate": 6.433915211970075e-08, + "loss": 0.0, + "step": 39990 + }, + { + "epoch": 4.987531172069826, + "grad_norm": 0.0012119744205847383, + "learning_rate": 5.935162094763093e-08, + "loss": 0.0255, + "step": 40000 + }, + { + "epoch": 4.988778054862843, + "grad_norm": 59.783939361572266, + "learning_rate": 5.43640897755611e-08, + "loss": 0.033, + "step": 40010 + }, + { + "epoch": 4.9900249376558605, + "grad_norm": 0.00014162635488901287, + "learning_rate": 4.937655860349127e-08, + "loss": 0.0, + "step": 40020 + }, + { + "epoch": 4.991271820448878, + "grad_norm": 5.852362301084213e-05, + "learning_rate": 4.4389027431421455e-08, + "loss": 0.0, + "step": 40030 + }, + { + "epoch": 4.992518703241895, + "grad_norm": 0.00034787526237778366, + "learning_rate": 3.940149625935162e-08, + "loss": 0.0, + "step": 40040 + }, + { + "epoch": 4.993765586034913, + "grad_norm": 0.000130303087644279, + "learning_rate": 3.44139650872818e-08, + "loss": 0.0, + "step": 40050 + }, + { + "epoch": 4.99501246882793, + "grad_norm": 0.0005980631103739142, + "learning_rate": 2.9426433915211973e-08, + "loss": 0.0, + "step": 40060 + }, + { + "epoch": 4.996259351620948, + "grad_norm": 0.00011551733041414991, + "learning_rate": 2.4438902743142146e-08, + "loss": 0.0, + "step": 40070 + }, + { + "epoch": 4.997506234413965, + "grad_norm": 8.48295385367237e-05, + "learning_rate": 1.9451371571072322e-08, + "loss": 0.0, + "step": 40080 + }, + { + "epoch": 4.998753117206983, + "grad_norm": 0.00042257612221874297, + "learning_rate": 1.4463840399002495e-08, + "loss": 0.0, + "step": 40090 + }, + { + "epoch": 5.0, + "grad_norm": 0.0019580533262342215, + "learning_rate": 9.476309226932669e-09, + "loss": 0.0, + "step": 40100 + } + ], + "logging_steps": 10, + 
"max_steps": 40100, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.12473217322432e+16, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}