diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,46078 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.99878308487983, + "eval_steps": 500, + "global_step": 6572, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000608457560085184, + "grad_norm": 9.286478996276855, + "learning_rate": 1.9011406844106465e-07, + "loss": 0.8452, + "step": 1 + }, + { + "epoch": 0.001216915120170368, + "grad_norm": 9.28686809539795, + "learning_rate": 3.802281368821293e-07, + "loss": 0.8357, + "step": 2 + }, + { + "epoch": 0.0018253726802555522, + "grad_norm": 9.397164344787598, + "learning_rate": 5.70342205323194e-07, + "loss": 0.8038, + "step": 3 + }, + { + "epoch": 0.002433830240340736, + "grad_norm": 8.887619972229004, + "learning_rate": 7.604562737642586e-07, + "loss": 0.7907, + "step": 4 + }, + { + "epoch": 0.0030422878004259203, + "grad_norm": 10.438508033752441, + "learning_rate": 9.505703422053232e-07, + "loss": 0.8433, + "step": 5 + }, + { + "epoch": 0.0036507453605111044, + "grad_norm": 9.955141067504883, + "learning_rate": 1.140684410646388e-06, + "loss": 0.8832, + "step": 6 + }, + { + "epoch": 0.0042592029205962886, + "grad_norm": 7.124298095703125, + "learning_rate": 1.3307984790874525e-06, + "loss": 0.7259, + "step": 7 + }, + { + "epoch": 0.004867660480681472, + "grad_norm": 6.805534839630127, + "learning_rate": 1.5209125475285172e-06, + "loss": 0.7877, + "step": 8 + }, + { + "epoch": 0.005476118040766657, + "grad_norm": 5.027353286743164, + "learning_rate": 1.711026615969582e-06, + "loss": 0.6938, + "step": 9 + }, + { + "epoch": 0.0060845756008518406, + "grad_norm": 4.865087509155273, + "learning_rate": 1.9011406844106463e-06, + "loss": 0.704, + "step": 10 + }, + { + "epoch": 0.006693033160937024, + "grad_norm": 4.999395847320557, + "learning_rate": 2.091254752851711e-06, + "loss": 0.6225, + "step": 11 + }, + { + "epoch": 0.007301490721022209, + "grad_norm": 4.837018013000488, + "learning_rate": 2.281368821292776e-06, + "loss": 0.6357, + "step": 12 + }, + { + "epoch": 0.007909948281107393, + "grad_norm": 4.025296211242676, + "learning_rate": 2.4714828897338406e-06, + "loss": 0.6002, + "step": 13 + }, + { + "epoch": 0.008518405841192577, + "grad_norm": 3.6638612747192383, + "learning_rate": 2.661596958174905e-06, + "loss": 0.5499, + "step": 14 + }, + { + "epoch": 0.009126863401277762, + "grad_norm": 5.011624336242676, + "learning_rate": 2.8517110266159697e-06, + "loss": 0.6042, + "step": 15 + }, + { + "epoch": 0.009735320961362945, + "grad_norm": 4.899325847625732, + "learning_rate": 3.0418250950570345e-06, + "loss": 0.6279, + "step": 16 + }, + { + "epoch": 0.010343778521448129, + "grad_norm": 4.226297378540039, + "learning_rate": 3.2319391634980988e-06, + "loss": 0.6134, + "step": 17 + }, + { + "epoch": 0.010952236081533314, + "grad_norm": 4.012551784515381, + "learning_rate": 3.422053231939164e-06, + "loss": 0.6285, + "step": 18 + }, + { + "epoch": 0.011560693641618497, + "grad_norm": 3.9116461277008057, + "learning_rate": 3.612167300380228e-06, + "loss": 0.5381, + "step": 19 + }, + { + "epoch": 0.012169151201703681, + "grad_norm": 2.996412515640259, + "learning_rate": 3.8022813688212926e-06, + "loss": 0.4892, + "step": 20 + }, + { + "epoch": 0.012777608761788866, + "grad_norm": 2.7711269855499268, + "learning_rate": 3.992395437262358e-06, + "loss": 0.4954, + "step": 21 + }, + { + "epoch": 0.013386066321874049, + "grad_norm": 3.1407854557037354, + "learning_rate": 4.182509505703422e-06, + "loss": 0.6063, + "step": 22 + }, + { + "epoch": 0.013994523881959233, + "grad_norm": 2.8379178047180176, + "learning_rate": 4.3726235741444865e-06, + "loss": 0.5944, + "step": 23 + }, + { + "epoch": 0.014602981442044418, + "grad_norm": 2.903754234313965, + "learning_rate": 4.562737642585552e-06, + "loss": 0.5006, + "step": 24 + }, + { + "epoch": 0.015211439002129602, + "grad_norm": 3.051934003829956, + "learning_rate": 4.752851711026616e-06, + "loss": 0.5743, + "step": 25 + }, + { + "epoch": 0.015819896562214785, + "grad_norm": 2.664247751235962, + "learning_rate": 4.942965779467681e-06, + "loss": 0.5162, + "step": 26 + }, + { + "epoch": 0.016428354122299968, + "grad_norm": 3.0488321781158447, + "learning_rate": 5.1330798479087455e-06, + "loss": 0.5195, + "step": 27 + }, + { + "epoch": 0.017036811682385154, + "grad_norm": 2.9393467903137207, + "learning_rate": 5.32319391634981e-06, + "loss": 0.5466, + "step": 28 + }, + { + "epoch": 0.017645269242470337, + "grad_norm": 2.603747844696045, + "learning_rate": 5.513307984790875e-06, + "loss": 0.5294, + "step": 29 + }, + { + "epoch": 0.018253726802555523, + "grad_norm": 2.919405698776245, + "learning_rate": 5.703422053231939e-06, + "loss": 0.5621, + "step": 30 + }, + { + "epoch": 0.018862184362640706, + "grad_norm": 2.5989558696746826, + "learning_rate": 5.893536121673004e-06, + "loss": 0.5719, + "step": 31 + }, + { + "epoch": 0.01947064192272589, + "grad_norm": 2.815772294998169, + "learning_rate": 6.083650190114069e-06, + "loss": 0.5748, + "step": 32 + }, + { + "epoch": 0.020079099482811075, + "grad_norm": 2.713179111480713, + "learning_rate": 6.273764258555133e-06, + "loss": 0.4973, + "step": 33 + }, + { + "epoch": 0.020687557042896258, + "grad_norm": 2.5474116802215576, + "learning_rate": 6.4638783269961976e-06, + "loss": 0.5012, + "step": 34 + }, + { + "epoch": 0.02129601460298144, + "grad_norm": 2.737952947616577, + "learning_rate": 6.653992395437263e-06, + "loss": 0.4963, + "step": 35 + }, + { + "epoch": 0.021904472163066627, + "grad_norm": 2.6271746158599854, + "learning_rate": 6.844106463878328e-06, + "loss": 0.5512, + "step": 36 + }, + { + "epoch": 0.02251292972315181, + "grad_norm": 2.547055721282959, + "learning_rate": 7.034220532319392e-06, + "loss": 0.5489, + "step": 37 + }, + { + "epoch": 0.023121387283236993, + "grad_norm": 2.6234169006347656, + "learning_rate": 7.224334600760456e-06, + "loss": 0.4627, + "step": 38 + }, + { + "epoch": 0.02372984484332218, + "grad_norm": 2.4380102157592773, + "learning_rate": 7.414448669201521e-06, + "loss": 0.5985, + "step": 39 + }, + { + "epoch": 0.024338302403407362, + "grad_norm": 2.633112668991089, + "learning_rate": 7.604562737642585e-06, + "loss": 0.4824, + "step": 40 + }, + { + "epoch": 0.024946759963492545, + "grad_norm": 2.357767343521118, + "learning_rate": 7.79467680608365e-06, + "loss": 0.4725, + "step": 41 + }, + { + "epoch": 0.02555521752357773, + "grad_norm": 2.5731077194213867, + "learning_rate": 7.984790874524716e-06, + "loss": 0.5152, + "step": 42 + }, + { + "epoch": 0.026163675083662914, + "grad_norm": 2.6900103092193604, + "learning_rate": 8.17490494296578e-06, + "loss": 0.4789, + "step": 43 + }, + { + "epoch": 0.026772132643748097, + "grad_norm": 2.833528757095337, + "learning_rate": 8.365019011406844e-06, + "loss": 0.534, + "step": 44 + }, + { + "epoch": 0.027380590203833283, + "grad_norm": 2.824359893798828, + "learning_rate": 8.55513307984791e-06, + "loss": 0.4799, + "step": 45 + }, + { + "epoch": 0.027989047763918466, + "grad_norm": 2.2743606567382812, + "learning_rate": 8.745247148288973e-06, + "loss": 0.4938, + "step": 46 + }, + { + "epoch": 0.02859750532400365, + "grad_norm": 2.405674934387207, + "learning_rate": 8.935361216730038e-06, + "loss": 0.5277, + "step": 47 + }, + { + "epoch": 0.029205962884088835, + "grad_norm": 2.473686695098877, + "learning_rate": 9.125475285171103e-06, + "loss": 0.4787, + "step": 48 + }, + { + "epoch": 0.029814420444174018, + "grad_norm": 2.3129618167877197, + "learning_rate": 9.315589353612169e-06, + "loss": 0.4755, + "step": 49 + }, + { + "epoch": 0.030422878004259205, + "grad_norm": 2.2328269481658936, + "learning_rate": 9.505703422053232e-06, + "loss": 0.5602, + "step": 50 + }, + { + "epoch": 0.031031335564344387, + "grad_norm": 2.6735682487487793, + "learning_rate": 9.695817490494297e-06, + "loss": 0.5262, + "step": 51 + }, + { + "epoch": 0.03163979312442957, + "grad_norm": 2.37196946144104, + "learning_rate": 9.885931558935362e-06, + "loss": 0.5204, + "step": 52 + }, + { + "epoch": 0.032248250684514757, + "grad_norm": 2.7100112438201904, + "learning_rate": 1.0076045627376426e-05, + "loss": 0.5656, + "step": 53 + }, + { + "epoch": 0.032856708244599936, + "grad_norm": 2.4364941120147705, + "learning_rate": 1.0266159695817491e-05, + "loss": 0.4961, + "step": 54 + }, + { + "epoch": 0.03346516580468512, + "grad_norm": 2.3422157764434814, + "learning_rate": 1.0456273764258556e-05, + "loss": 0.492, + "step": 55 + }, + { + "epoch": 0.03407362336477031, + "grad_norm": 2.0109498500823975, + "learning_rate": 1.064638783269962e-05, + "loss": 0.503, + "step": 56 + }, + { + "epoch": 0.03468208092485549, + "grad_norm": 2.42881441116333, + "learning_rate": 1.0836501901140685e-05, + "loss": 0.5093, + "step": 57 + }, + { + "epoch": 0.035290538484940674, + "grad_norm": 2.187835693359375, + "learning_rate": 1.102661596958175e-05, + "loss": 0.4663, + "step": 58 + }, + { + "epoch": 0.03589899604502586, + "grad_norm": 2.3914639949798584, + "learning_rate": 1.1216730038022814e-05, + "loss": 0.4773, + "step": 59 + }, + { + "epoch": 0.03650745360511105, + "grad_norm": 2.4356846809387207, + "learning_rate": 1.1406844106463879e-05, + "loss": 0.5378, + "step": 60 + }, + { + "epoch": 0.037115911165196226, + "grad_norm": 2.398271083831787, + "learning_rate": 1.1596958174904944e-05, + "loss": 0.4864, + "step": 61 + }, + { + "epoch": 0.03772436872528141, + "grad_norm": 2.2733852863311768, + "learning_rate": 1.1787072243346007e-05, + "loss": 0.4751, + "step": 62 + }, + { + "epoch": 0.0383328262853666, + "grad_norm": 2.115436315536499, + "learning_rate": 1.1977186311787073e-05, + "loss": 0.4688, + "step": 63 + }, + { + "epoch": 0.03894128384545178, + "grad_norm": 2.285353660583496, + "learning_rate": 1.2167300380228138e-05, + "loss": 0.5107, + "step": 64 + }, + { + "epoch": 0.039549741405536964, + "grad_norm": 2.3437185287475586, + "learning_rate": 1.2357414448669203e-05, + "loss": 0.5132, + "step": 65 + }, + { + "epoch": 0.04015819896562215, + "grad_norm": 2.5334279537200928, + "learning_rate": 1.2547528517110266e-05, + "loss": 0.4868, + "step": 66 + }, + { + "epoch": 0.04076665652570733, + "grad_norm": 2.2732203006744385, + "learning_rate": 1.2737642585551332e-05, + "loss": 0.4784, + "step": 67 + }, + { + "epoch": 0.041375114085792516, + "grad_norm": 2.3344926834106445, + "learning_rate": 1.2927756653992395e-05, + "loss": 0.5702, + "step": 68 + }, + { + "epoch": 0.0419835716458777, + "grad_norm": 2.6183888912200928, + "learning_rate": 1.3117870722433462e-05, + "loss": 0.4828, + "step": 69 + }, + { + "epoch": 0.04259202920596288, + "grad_norm": 3.995020627975464, + "learning_rate": 1.3307984790874526e-05, + "loss": 0.5182, + "step": 70 + }, + { + "epoch": 0.04320048676604807, + "grad_norm": 2.3043413162231445, + "learning_rate": 1.3498098859315589e-05, + "loss": 0.5346, + "step": 71 + }, + { + "epoch": 0.043808944326133255, + "grad_norm": 2.5298848152160645, + "learning_rate": 1.3688212927756656e-05, + "loss": 0.6079, + "step": 72 + }, + { + "epoch": 0.044417401886218434, + "grad_norm": 2.1517183780670166, + "learning_rate": 1.387832699619772e-05, + "loss": 0.4962, + "step": 73 + }, + { + "epoch": 0.04502585944630362, + "grad_norm": 2.3889400959014893, + "learning_rate": 1.4068441064638785e-05, + "loss": 0.5005, + "step": 74 + }, + { + "epoch": 0.04563431700638881, + "grad_norm": 2.265465259552002, + "learning_rate": 1.4258555133079848e-05, + "loss": 0.5684, + "step": 75 + }, + { + "epoch": 0.046242774566473986, + "grad_norm": 2.3578243255615234, + "learning_rate": 1.4448669201520912e-05, + "loss": 0.5401, + "step": 76 + }, + { + "epoch": 0.04685123212655917, + "grad_norm": 2.2916693687438965, + "learning_rate": 1.4638783269961978e-05, + "loss": 0.482, + "step": 77 + }, + { + "epoch": 0.04745968968664436, + "grad_norm": 2.678074359893799, + "learning_rate": 1.4828897338403042e-05, + "loss": 0.5157, + "step": 78 + }, + { + "epoch": 0.04806814724672954, + "grad_norm": 2.3698818683624268, + "learning_rate": 1.5019011406844107e-05, + "loss": 0.4558, + "step": 79 + }, + { + "epoch": 0.048676604806814724, + "grad_norm": 2.1744048595428467, + "learning_rate": 1.520912547528517e-05, + "loss": 0.4658, + "step": 80 + }, + { + "epoch": 0.04928506236689991, + "grad_norm": 2.1933300495147705, + "learning_rate": 1.5399239543726237e-05, + "loss": 0.518, + "step": 81 + }, + { + "epoch": 0.04989351992698509, + "grad_norm": 2.573246479034424, + "learning_rate": 1.55893536121673e-05, + "loss": 0.5668, + "step": 82 + }, + { + "epoch": 0.050501977487070276, + "grad_norm": 2.249875068664551, + "learning_rate": 1.5779467680608364e-05, + "loss": 0.4757, + "step": 83 + }, + { + "epoch": 0.05111043504715546, + "grad_norm": 2.1911234855651855, + "learning_rate": 1.596958174904943e-05, + "loss": 0.4753, + "step": 84 + }, + { + "epoch": 0.05171889260724064, + "grad_norm": 3.867871046066284, + "learning_rate": 1.6159695817490495e-05, + "loss": 0.4797, + "step": 85 + }, + { + "epoch": 0.05232735016732583, + "grad_norm": 2.745781421661377, + "learning_rate": 1.634980988593156e-05, + "loss": 0.4968, + "step": 86 + }, + { + "epoch": 0.052935807727411015, + "grad_norm": 2.0692405700683594, + "learning_rate": 1.6539923954372625e-05, + "loss": 0.481, + "step": 87 + }, + { + "epoch": 0.053544265287496194, + "grad_norm": 2.262392997741699, + "learning_rate": 1.673003802281369e-05, + "loss": 0.5532, + "step": 88 + }, + { + "epoch": 0.05415272284758138, + "grad_norm": 2.1288435459136963, + "learning_rate": 1.6920152091254756e-05, + "loss": 0.4839, + "step": 89 + }, + { + "epoch": 0.05476118040766657, + "grad_norm": 2.0853445529937744, + "learning_rate": 1.711026615969582e-05, + "loss": 0.5071, + "step": 90 + }, + { + "epoch": 0.055369637967751746, + "grad_norm": 2.296994686126709, + "learning_rate": 1.7300380228136882e-05, + "loss": 0.5614, + "step": 91 + }, + { + "epoch": 0.05597809552783693, + "grad_norm": 2.261611223220825, + "learning_rate": 1.7490494296577946e-05, + "loss": 0.536, + "step": 92 + }, + { + "epoch": 0.05658655308792212, + "grad_norm": 2.2473561763763428, + "learning_rate": 1.7680608365019013e-05, + "loss": 0.4652, + "step": 93 + }, + { + "epoch": 0.0571950106480073, + "grad_norm": 2.532961368560791, + "learning_rate": 1.7870722433460076e-05, + "loss": 0.5331, + "step": 94 + }, + { + "epoch": 0.057803468208092484, + "grad_norm": 2.3481945991516113, + "learning_rate": 1.806083650190114e-05, + "loss": 0.52, + "step": 95 + }, + { + "epoch": 0.05841192576817767, + "grad_norm": 2.024817943572998, + "learning_rate": 1.8250950570342207e-05, + "loss": 0.4745, + "step": 96 + }, + { + "epoch": 0.05902038332826286, + "grad_norm": 2.2880470752716064, + "learning_rate": 1.844106463878327e-05, + "loss": 0.5019, + "step": 97 + }, + { + "epoch": 0.059628840888348036, + "grad_norm": 1.9540607929229736, + "learning_rate": 1.8631178707224337e-05, + "loss": 0.5138, + "step": 98 + }, + { + "epoch": 0.06023729844843322, + "grad_norm": 2.5749783515930176, + "learning_rate": 1.88212927756654e-05, + "loss": 0.5639, + "step": 99 + }, + { + "epoch": 0.06084575600851841, + "grad_norm": 2.0296337604522705, + "learning_rate": 1.9011406844106464e-05, + "loss": 0.4929, + "step": 100 + }, + { + "epoch": 0.06145421356860359, + "grad_norm": 2.068000316619873, + "learning_rate": 1.920152091254753e-05, + "loss": 0.4568, + "step": 101 + }, + { + "epoch": 0.062062671128688775, + "grad_norm": 2.4328064918518066, + "learning_rate": 1.9391634980988594e-05, + "loss": 0.5764, + "step": 102 + }, + { + "epoch": 0.06267112868877396, + "grad_norm": 1.961745262145996, + "learning_rate": 1.958174904942966e-05, + "loss": 0.4756, + "step": 103 + }, + { + "epoch": 0.06327958624885914, + "grad_norm": 2.189243793487549, + "learning_rate": 1.9771863117870725e-05, + "loss": 0.5254, + "step": 104 + }, + { + "epoch": 0.06388804380894432, + "grad_norm": 1.9438084363937378, + "learning_rate": 1.9961977186311788e-05, + "loss": 0.4921, + "step": 105 + }, + { + "epoch": 0.06449650136902951, + "grad_norm": 1.9308326244354248, + "learning_rate": 2.0152091254752852e-05, + "loss": 0.4508, + "step": 106 + }, + { + "epoch": 0.06510495892911469, + "grad_norm": 1.9742590188980103, + "learning_rate": 2.0342205323193915e-05, + "loss": 0.4899, + "step": 107 + }, + { + "epoch": 0.06571341648919987, + "grad_norm": 2.1796038150787354, + "learning_rate": 2.0532319391634982e-05, + "loss": 0.5594, + "step": 108 + }, + { + "epoch": 0.06632187404928507, + "grad_norm": 2.2020134925842285, + "learning_rate": 2.0722433460076046e-05, + "loss": 0.4946, + "step": 109 + }, + { + "epoch": 0.06693033160937024, + "grad_norm": 1.9152429103851318, + "learning_rate": 2.0912547528517112e-05, + "loss": 0.485, + "step": 110 + }, + { + "epoch": 0.06753878916945542, + "grad_norm": 2.1203103065490723, + "learning_rate": 2.1102661596958176e-05, + "loss": 0.5062, + "step": 111 + }, + { + "epoch": 0.06814724672954062, + "grad_norm": 1.934693694114685, + "learning_rate": 2.129277566539924e-05, + "loss": 0.4841, + "step": 112 + }, + { + "epoch": 0.0687557042896258, + "grad_norm": 2.1393649578094482, + "learning_rate": 2.1482889733840306e-05, + "loss": 0.585, + "step": 113 + }, + { + "epoch": 0.06936416184971098, + "grad_norm": 2.0923867225646973, + "learning_rate": 2.167300380228137e-05, + "loss": 0.5467, + "step": 114 + }, + { + "epoch": 0.06997261940979617, + "grad_norm": 1.947996735572815, + "learning_rate": 2.1863117870722437e-05, + "loss": 0.4804, + "step": 115 + }, + { + "epoch": 0.07058107696988135, + "grad_norm": 2.1590495109558105, + "learning_rate": 2.20532319391635e-05, + "loss": 0.5335, + "step": 116 + }, + { + "epoch": 0.07118953452996654, + "grad_norm": 2.0173380374908447, + "learning_rate": 2.2243346007604564e-05, + "loss": 0.5273, + "step": 117 + }, + { + "epoch": 0.07179799209005172, + "grad_norm": 2.251988649368286, + "learning_rate": 2.2433460076045627e-05, + "loss": 0.5975, + "step": 118 + }, + { + "epoch": 0.0724064496501369, + "grad_norm": 2.279703378677368, + "learning_rate": 2.262357414448669e-05, + "loss": 0.5642, + "step": 119 + }, + { + "epoch": 0.0730149072102221, + "grad_norm": 2.0928151607513428, + "learning_rate": 2.2813688212927758e-05, + "loss": 0.5462, + "step": 120 + }, + { + "epoch": 0.07362336477030727, + "grad_norm": 9.408754348754883, + "learning_rate": 2.300380228136882e-05, + "loss": 0.5737, + "step": 121 + }, + { + "epoch": 0.07423182233039245, + "grad_norm": 2.0510871410369873, + "learning_rate": 2.3193916349809888e-05, + "loss": 0.4958, + "step": 122 + }, + { + "epoch": 0.07484027989047765, + "grad_norm": 1.8674732446670532, + "learning_rate": 2.338403041825095e-05, + "loss": 0.4752, + "step": 123 + }, + { + "epoch": 0.07544873745056282, + "grad_norm": 1.8963392972946167, + "learning_rate": 2.3574144486692015e-05, + "loss": 0.5461, + "step": 124 + }, + { + "epoch": 0.076057195010648, + "grad_norm": 1.7347912788391113, + "learning_rate": 2.3764258555133082e-05, + "loss": 0.4789, + "step": 125 + }, + { + "epoch": 0.0766656525707332, + "grad_norm": 2.5174317359924316, + "learning_rate": 2.3954372623574145e-05, + "loss": 0.5642, + "step": 126 + }, + { + "epoch": 0.07727411013081838, + "grad_norm": 2.2092862129211426, + "learning_rate": 2.4144486692015212e-05, + "loss": 0.5812, + "step": 127 + }, + { + "epoch": 0.07788256769090356, + "grad_norm": 2.6390881538391113, + "learning_rate": 2.4334600760456276e-05, + "loss": 0.5021, + "step": 128 + }, + { + "epoch": 0.07849102525098875, + "grad_norm": 1.8941545486450195, + "learning_rate": 2.452471482889734e-05, + "loss": 0.4816, + "step": 129 + }, + { + "epoch": 0.07909948281107393, + "grad_norm": 1.7453467845916748, + "learning_rate": 2.4714828897338406e-05, + "loss": 0.4823, + "step": 130 + }, + { + "epoch": 0.07970794037115911, + "grad_norm": 1.9925953149795532, + "learning_rate": 2.490494296577947e-05, + "loss": 0.5208, + "step": 131 + }, + { + "epoch": 0.0803163979312443, + "grad_norm": 2.222625494003296, + "learning_rate": 2.5095057034220533e-05, + "loss": 0.4962, + "step": 132 + }, + { + "epoch": 0.08092485549132948, + "grad_norm": 1.99649178981781, + "learning_rate": 2.5285171102661596e-05, + "loss": 0.5195, + "step": 133 + }, + { + "epoch": 0.08153331305141466, + "grad_norm": 4.0735273361206055, + "learning_rate": 2.5475285171102663e-05, + "loss": 0.5421, + "step": 134 + }, + { + "epoch": 0.08214177061149985, + "grad_norm": 2.117365837097168, + "learning_rate": 2.5665399239543723e-05, + "loss": 0.5745, + "step": 135 + }, + { + "epoch": 0.08275022817158503, + "grad_norm": 2.24652099609375, + "learning_rate": 2.585551330798479e-05, + "loss": 0.5328, + "step": 136 + }, + { + "epoch": 0.08335868573167021, + "grad_norm": 2.288801670074463, + "learning_rate": 2.6045627376425857e-05, + "loss": 0.5145, + "step": 137 + }, + { + "epoch": 0.0839671432917554, + "grad_norm": 1.9129204750061035, + "learning_rate": 2.6235741444866924e-05, + "loss": 0.5466, + "step": 138 + }, + { + "epoch": 0.08457560085184058, + "grad_norm": 2.242582082748413, + "learning_rate": 2.6425855513307984e-05, + "loss": 0.5592, + "step": 139 + }, + { + "epoch": 0.08518405841192576, + "grad_norm": 1.8476616144180298, + "learning_rate": 2.661596958174905e-05, + "loss": 0.5257, + "step": 140 + }, + { + "epoch": 0.08579251597201096, + "grad_norm": 1.9070931673049927, + "learning_rate": 2.6806083650190118e-05, + "loss": 0.5186, + "step": 141 + }, + { + "epoch": 0.08640097353209614, + "grad_norm": 1.9647867679595947, + "learning_rate": 2.6996197718631178e-05, + "loss": 0.595, + "step": 142 + }, + { + "epoch": 0.08700943109218132, + "grad_norm": 2.1235222816467285, + "learning_rate": 2.7186311787072245e-05, + "loss": 0.4921, + "step": 143 + }, + { + "epoch": 0.08761788865226651, + "grad_norm": 2.1548256874084473, + "learning_rate": 2.7376425855513312e-05, + "loss": 0.5166, + "step": 144 + }, + { + "epoch": 0.08822634621235169, + "grad_norm": 3.3486247062683105, + "learning_rate": 2.7566539923954375e-05, + "loss": 0.5909, + "step": 145 + }, + { + "epoch": 0.08883480377243687, + "grad_norm": 2.245957136154175, + "learning_rate": 2.775665399239544e-05, + "loss": 0.5726, + "step": 146 + }, + { + "epoch": 0.08944326133252206, + "grad_norm": 1.7607914209365845, + "learning_rate": 2.7946768060836502e-05, + "loss": 0.4802, + "step": 147 + }, + { + "epoch": 0.09005171889260724, + "grad_norm": 1.840922474861145, + "learning_rate": 2.813688212927757e-05, + "loss": 0.5054, + "step": 148 + }, + { + "epoch": 0.09066017645269242, + "grad_norm": 1.794522762298584, + "learning_rate": 2.832699619771863e-05, + "loss": 0.5443, + "step": 149 + }, + { + "epoch": 0.09126863401277761, + "grad_norm": 1.9434462785720825, + "learning_rate": 2.8517110266159696e-05, + "loss": 0.5159, + "step": 150 + }, + { + "epoch": 0.09187709157286279, + "grad_norm": 2.252189874649048, + "learning_rate": 2.8707224334600763e-05, + "loss": 0.5802, + "step": 151 + }, + { + "epoch": 0.09248554913294797, + "grad_norm": 1.9505751132965088, + "learning_rate": 2.8897338403041823e-05, + "loss": 0.5033, + "step": 152 + }, + { + "epoch": 0.09309400669303317, + "grad_norm": 2.0647778511047363, + "learning_rate": 2.908745247148289e-05, + "loss": 0.5465, + "step": 153 + }, + { + "epoch": 0.09370246425311834, + "grad_norm": 2.034170150756836, + "learning_rate": 2.9277566539923957e-05, + "loss": 0.5278, + "step": 154 + }, + { + "epoch": 0.09431092181320352, + "grad_norm": 2.2246432304382324, + "learning_rate": 2.9467680608365024e-05, + "loss": 0.5276, + "step": 155 + }, + { + "epoch": 0.09491937937328872, + "grad_norm": 1.8863154649734497, + "learning_rate": 2.9657794676806084e-05, + "loss": 0.4909, + "step": 156 + }, + { + "epoch": 0.0955278369333739, + "grad_norm": 2.914320945739746, + "learning_rate": 2.984790874524715e-05, + "loss": 0.674, + "step": 157 + }, + { + "epoch": 0.09613629449345908, + "grad_norm": 4.749186038970947, + "learning_rate": 3.0038022813688214e-05, + "loss": 0.5386, + "step": 158 + }, + { + "epoch": 0.09674475205354427, + "grad_norm": 2.0162932872772217, + "learning_rate": 3.0228136882129278e-05, + "loss": 0.5441, + "step": 159 + }, + { + "epoch": 0.09735320961362945, + "grad_norm": 1.8768587112426758, + "learning_rate": 3.041825095057034e-05, + "loss": 0.5373, + "step": 160 + }, + { + "epoch": 0.09796166717371463, + "grad_norm": 1.9952895641326904, + "learning_rate": 3.060836501901141e-05, + "loss": 0.5037, + "step": 161 + }, + { + "epoch": 0.09857012473379982, + "grad_norm": 1.961302638053894, + "learning_rate": 3.0798479087452475e-05, + "loss": 0.4827, + "step": 162 + }, + { + "epoch": 0.099178582293885, + "grad_norm": 2.2090559005737305, + "learning_rate": 3.098859315589354e-05, + "loss": 0.5783, + "step": 163 + }, + { + "epoch": 0.09978703985397018, + "grad_norm": 1.7766873836517334, + "learning_rate": 3.11787072243346e-05, + "loss": 0.4749, + "step": 164 + }, + { + "epoch": 0.10039549741405537, + "grad_norm": 2.083379030227661, + "learning_rate": 3.1368821292775665e-05, + "loss": 0.5149, + "step": 165 + }, + { + "epoch": 0.10100395497414055, + "grad_norm": 2.4703352451324463, + "learning_rate": 3.155893536121673e-05, + "loss": 0.5672, + "step": 166 + }, + { + "epoch": 0.10161241253422573, + "grad_norm": 2.7795681953430176, + "learning_rate": 3.174904942965779e-05, + "loss": 0.583, + "step": 167 + }, + { + "epoch": 0.10222087009431093, + "grad_norm": 1.8572745323181152, + "learning_rate": 3.193916349809886e-05, + "loss": 0.4859, + "step": 168 + }, + { + "epoch": 0.1028293276543961, + "grad_norm": 2.1731905937194824, + "learning_rate": 3.2129277566539926e-05, + "loss": 0.4851, + "step": 169 + }, + { + "epoch": 0.10343778521448128, + "grad_norm": 2.5642776489257812, + "learning_rate": 3.231939163498099e-05, + "loss": 0.5155, + "step": 170 + }, + { + "epoch": 0.10404624277456648, + "grad_norm": 2.716291904449463, + "learning_rate": 3.250950570342205e-05, + "loss": 0.5387, + "step": 171 + }, + { + "epoch": 0.10465470033465166, + "grad_norm": 2.027435541152954, + "learning_rate": 3.269961977186312e-05, + "loss": 0.5266, + "step": 172 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 2.161743402481079, + "learning_rate": 3.288973384030418e-05, + "loss": 0.58, + "step": 173 + }, + { + "epoch": 0.10587161545482203, + "grad_norm": 2.0740175247192383, + "learning_rate": 3.307984790874525e-05, + "loss": 0.5476, + "step": 174 + }, + { + "epoch": 0.10648007301490721, + "grad_norm": 1.8054178953170776, + "learning_rate": 3.3269961977186314e-05, + "loss": 0.513, + "step": 175 + }, + { + "epoch": 0.10708853057499239, + "grad_norm": 1.872541069984436, + "learning_rate": 3.346007604562738e-05, + "loss": 0.5266, + "step": 176 + }, + { + "epoch": 0.10769698813507758, + "grad_norm": 2.594137191772461, + "learning_rate": 3.365019011406844e-05, + "loss": 0.56, + "step": 177 + }, + { + "epoch": 0.10830544569516276, + "grad_norm": 1.95430588722229, + "learning_rate": 3.384030418250951e-05, + "loss": 0.5971, + "step": 178 + }, + { + "epoch": 0.10891390325524794, + "grad_norm": 2.0000500679016113, + "learning_rate": 3.4030418250950574e-05, + "loss": 0.4828, + "step": 179 + }, + { + "epoch": 0.10952236081533313, + "grad_norm": 1.7855273485183716, + "learning_rate": 3.422053231939164e-05, + "loss": 0.5368, + "step": 180 + }, + { + "epoch": 0.11013081837541831, + "grad_norm": 2.0093138217926025, + "learning_rate": 3.44106463878327e-05, + "loss": 0.5519, + "step": 181 + }, + { + "epoch": 0.11073927593550349, + "grad_norm": 1.8398433923721313, + "learning_rate": 3.4600760456273765e-05, + "loss": 0.5089, + "step": 182 + }, + { + "epoch": 0.11134773349558869, + "grad_norm": 1.9832158088684082, + "learning_rate": 3.479087452471483e-05, + "loss": 0.578, + "step": 183 + }, + { + "epoch": 0.11195619105567386, + "grad_norm": 1.8576923608779907, + "learning_rate": 3.498098859315589e-05, + "loss": 0.545, + "step": 184 + }, + { + "epoch": 0.11256464861575904, + "grad_norm": 1.8127909898757935, + "learning_rate": 3.517110266159696e-05, + "loss": 0.5694, + "step": 185 + }, + { + "epoch": 0.11317310617584424, + "grad_norm": 2.124783515930176, + "learning_rate": 3.5361216730038026e-05, + "loss": 0.6131, + "step": 186 + }, + { + "epoch": 0.11378156373592942, + "grad_norm": 1.8797577619552612, + "learning_rate": 3.555133079847909e-05, + "loss": 0.5098, + "step": 187 + }, + { + "epoch": 0.1143900212960146, + "grad_norm": 1.871032476425171, + "learning_rate": 3.574144486692015e-05, + "loss": 0.5785, + "step": 188 + }, + { + "epoch": 0.11499847885609979, + "grad_norm": 1.6941313743591309, + "learning_rate": 3.593155893536122e-05, + "loss": 0.5115, + "step": 189 + }, + { + "epoch": 0.11560693641618497, + "grad_norm": 2.301297903060913, + "learning_rate": 3.612167300380228e-05, + "loss": 0.587, + "step": 190 + }, + { + "epoch": 0.11621539397627015, + "grad_norm": 1.9337679147720337, + "learning_rate": 3.631178707224335e-05, + "loss": 0.5393, + "step": 191 + }, + { + "epoch": 0.11682385153635534, + "grad_norm": 2.24143648147583, + "learning_rate": 3.6501901140684413e-05, + "loss": 0.5152, + "step": 192 + }, + { + "epoch": 0.11743230909644052, + "grad_norm": 2.0369713306427, + "learning_rate": 3.669201520912548e-05, + "loss": 0.5691, + "step": 193 + }, + { + "epoch": 0.11804076665652571, + "grad_norm": 1.8279515504837036, + "learning_rate": 3.688212927756654e-05, + "loss": 0.4814, + "step": 194 + }, + { + "epoch": 0.1186492242166109, + "grad_norm": 2.0841150283813477, + "learning_rate": 3.7072243346007604e-05, + "loss": 0.5266, + "step": 195 + }, + { + "epoch": 0.11925768177669607, + "grad_norm": 1.830251693725586, + "learning_rate": 3.7262357414448674e-05, + "loss": 0.5533, + "step": 196 + }, + { + "epoch": 0.11986613933678127, + "grad_norm": 1.8735334873199463, + "learning_rate": 3.745247148288973e-05, + "loss": 0.5265, + "step": 197 + }, + { + "epoch": 0.12047459689686645, + "grad_norm": 2.144216299057007, + "learning_rate": 3.76425855513308e-05, + "loss": 0.5795, + "step": 198 + }, + { + "epoch": 0.12108305445695162, + "grad_norm": 1.7879762649536133, + "learning_rate": 3.7832699619771865e-05, + "loss": 0.5834, + "step": 199 + }, + { + "epoch": 0.12169151201703682, + "grad_norm": 2.255518674850464, + "learning_rate": 3.802281368821293e-05, + "loss": 0.5094, + "step": 200 + }, + { + "epoch": 0.122299969577122, + "grad_norm": 1.92243230342865, + "learning_rate": 3.821292775665399e-05, + "loss": 0.5215, + "step": 201 + }, + { + "epoch": 0.12290842713720718, + "grad_norm": 2.395334005355835, + "learning_rate": 3.840304182509506e-05, + "loss": 0.5715, + "step": 202 + }, + { + "epoch": 0.12351688469729237, + "grad_norm": 2.1944241523742676, + "learning_rate": 3.8593155893536125e-05, + "loss": 0.5486, + "step": 203 + }, + { + "epoch": 0.12412534225737755, + "grad_norm": 1.983839750289917, + "learning_rate": 3.878326996197719e-05, + "loss": 0.5876, + "step": 204 + }, + { + "epoch": 0.12473379981746273, + "grad_norm": 2.557204246520996, + "learning_rate": 3.897338403041825e-05, + "loss": 0.5066, + "step": 205 + }, + { + "epoch": 0.12534225737754792, + "grad_norm": 1.9440828561782837, + "learning_rate": 3.916349809885932e-05, + "loss": 0.567, + "step": 206 + }, + { + "epoch": 0.1259507149376331, + "grad_norm": 1.8540195226669312, + "learning_rate": 3.935361216730038e-05, + "loss": 0.5652, + "step": 207 + }, + { + "epoch": 0.12655917249771828, + "grad_norm": 1.9962955713272095, + "learning_rate": 3.954372623574145e-05, + "loss": 0.5995, + "step": 208 + }, + { + "epoch": 0.12716763005780346, + "grad_norm": 2.1377549171447754, + "learning_rate": 3.973384030418251e-05, + "loss": 0.671, + "step": 209 + }, + { + "epoch": 0.12777608761788864, + "grad_norm": 2.1312196254730225, + "learning_rate": 3.9923954372623577e-05, + "loss": 0.5925, + "step": 210 + }, + { + "epoch": 0.12838454517797385, + "grad_norm": 1.659692645072937, + "learning_rate": 4.011406844106464e-05, + "loss": 0.5313, + "step": 211 + }, + { + "epoch": 0.12899300273805903, + "grad_norm": 2.2088122367858887, + "learning_rate": 4.0304182509505703e-05, + "loss": 0.5368, + "step": 212 + }, + { + "epoch": 0.1296014602981442, + "grad_norm": 1.8686944246292114, + "learning_rate": 4.0494296577946774e-05, + "loss": 0.5576, + "step": 213 + }, + { + "epoch": 0.13020991785822938, + "grad_norm": 1.934857726097107, + "learning_rate": 4.068441064638783e-05, + "loss": 0.5519, + "step": 214 + }, + { + "epoch": 0.13081837541831456, + "grad_norm": 1.8353568315505981, + "learning_rate": 4.08745247148289e-05, + "loss": 0.5312, + "step": 215 + }, + { + "epoch": 0.13142683297839974, + "grad_norm": 1.9443809986114502, + "learning_rate": 4.1064638783269964e-05, + "loss": 0.5108, + "step": 216 + }, + { + "epoch": 0.13203529053848495, + "grad_norm": 1.8545550107955933, + "learning_rate": 4.125475285171103e-05, + "loss": 0.559, + "step": 217 + }, + { + "epoch": 0.13264374809857013, + "grad_norm": 1.9417598247528076, + "learning_rate": 4.144486692015209e-05, + "loss": 0.5631, + "step": 218 + }, + { + "epoch": 0.1332522056586553, + "grad_norm": 1.7785390615463257, + "learning_rate": 4.163498098859316e-05, + "loss": 0.5519, + "step": 219 + }, + { + "epoch": 0.1338606632187405, + "grad_norm": 1.9775593280792236, + "learning_rate": 4.1825095057034225e-05, + "loss": 0.6203, + "step": 220 + }, + { + "epoch": 0.13446912077882567, + "grad_norm": 1.7184250354766846, + "learning_rate": 4.201520912547529e-05, + "loss": 0.5356, + "step": 221 + }, + { + "epoch": 0.13507757833891085, + "grad_norm": 1.7691705226898193, + "learning_rate": 4.220532319391635e-05, + "loss": 0.5185, + "step": 222 + }, + { + "epoch": 0.13568603589899605, + "grad_norm": 1.759832739830017, + "learning_rate": 4.2395437262357415e-05, + "loss": 0.5276, + "step": 223 + }, + { + "epoch": 0.13629449345908123, + "grad_norm": 1.8581064939498901, + "learning_rate": 4.258555133079848e-05, + "loss": 0.5341, + "step": 224 + }, + { + "epoch": 0.1369029510191664, + "grad_norm": 1.9540525674819946, + "learning_rate": 4.277566539923954e-05, + "loss": 0.5544, + "step": 225 + }, + { + "epoch": 0.1375114085792516, + "grad_norm": 2.0600829124450684, + "learning_rate": 4.296577946768061e-05, + "loss": 0.5259, + "step": 226 + }, + { + "epoch": 0.13811986613933677, + "grad_norm": 1.8912417888641357, + "learning_rate": 4.3155893536121676e-05, + "loss": 0.5448, + "step": 227 + }, + { + "epoch": 0.13872832369942195, + "grad_norm": 1.7389650344848633, + "learning_rate": 4.334600760456274e-05, + "loss": 0.5143, + "step": 228 + }, + { + "epoch": 0.13933678125950716, + "grad_norm": 1.844450831413269, + "learning_rate": 4.35361216730038e-05, + "loss": 0.542, + "step": 229 + }, + { + "epoch": 0.13994523881959234, + "grad_norm": 1.8148777484893799, + "learning_rate": 4.3726235741444873e-05, + "loss": 0.5151, + "step": 230 + }, + { + "epoch": 0.14055369637967752, + "grad_norm": 1.9488646984100342, + "learning_rate": 4.391634980988593e-05, + "loss": 0.6014, + "step": 231 + }, + { + "epoch": 0.1411621539397627, + "grad_norm": 1.8368064165115356, + "learning_rate": 4.4106463878327e-05, + "loss": 0.5752, + "step": 232 + }, + { + "epoch": 0.14177061149984788, + "grad_norm": 2.520059823989868, + "learning_rate": 4.4296577946768064e-05, + "loss": 0.5604, + "step": 233 + }, + { + "epoch": 0.14237906905993308, + "grad_norm": 1.6602476835250854, + "learning_rate": 4.448669201520913e-05, + "loss": 0.5702, + "step": 234 + }, + { + "epoch": 0.14298752662001826, + "grad_norm": 2.0854058265686035, + "learning_rate": 4.467680608365019e-05, + "loss": 0.6663, + "step": 235 + }, + { + "epoch": 0.14359598418010344, + "grad_norm": 1.8286073207855225, + "learning_rate": 4.4866920152091254e-05, + "loss": 0.5896, + "step": 236 + }, + { + "epoch": 0.14420444174018862, + "grad_norm": 1.8150510787963867, + "learning_rate": 4.5057034220532325e-05, + "loss": 0.5698, + "step": 237 + }, + { + "epoch": 0.1448128993002738, + "grad_norm": 1.7325129508972168, + "learning_rate": 4.524714828897338e-05, + "loss": 0.5399, + "step": 238 + }, + { + "epoch": 0.14542135686035898, + "grad_norm": 1.7553200721740723, + "learning_rate": 4.543726235741445e-05, + "loss": 0.5335, + "step": 239 + }, + { + "epoch": 0.1460298144204442, + "grad_norm": 1.677822470664978, + "learning_rate": 4.5627376425855515e-05, + "loss": 0.5662, + "step": 240 + }, + { + "epoch": 0.14663827198052937, + "grad_norm": 2.0692317485809326, + "learning_rate": 4.581749049429658e-05, + "loss": 0.7082, + "step": 241 + }, + { + "epoch": 0.14724672954061455, + "grad_norm": 1.5937222242355347, + "learning_rate": 4.600760456273764e-05, + "loss": 0.5736, + "step": 242 + }, + { + "epoch": 0.14785518710069973, + "grad_norm": 1.6691651344299316, + "learning_rate": 4.619771863117871e-05, + "loss": 0.5081, + "step": 243 + }, + { + "epoch": 0.1484636446607849, + "grad_norm": 1.916918396949768, + "learning_rate": 4.6387832699619776e-05, + "loss": 0.5834, + "step": 244 + }, + { + "epoch": 0.14907210222087008, + "grad_norm": 1.7552460432052612, + "learning_rate": 4.657794676806084e-05, + "loss": 0.5644, + "step": 245 + }, + { + "epoch": 0.1496805597809553, + "grad_norm": 1.794980764389038, + "learning_rate": 4.67680608365019e-05, + "loss": 0.5694, + "step": 246 + }, + { + "epoch": 0.15028901734104047, + "grad_norm": 1.563038945198059, + "learning_rate": 4.695817490494297e-05, + "loss": 0.5564, + "step": 247 + }, + { + "epoch": 0.15089747490112565, + "grad_norm": 1.6664254665374756, + "learning_rate": 4.714828897338403e-05, + "loss": 0.5837, + "step": 248 + }, + { + "epoch": 0.15150593246121083, + "grad_norm": 1.7116878032684326, + "learning_rate": 4.73384030418251e-05, + "loss": 0.5491, + "step": 249 + }, + { + "epoch": 0.152114390021296, + "grad_norm": 1.987626075744629, + "learning_rate": 4.7528517110266163e-05, + "loss": 0.6203, + "step": 250 + }, + { + "epoch": 0.1527228475813812, + "grad_norm": 1.6196045875549316, + "learning_rate": 4.771863117870723e-05, + "loss": 0.5393, + "step": 251 + }, + { + "epoch": 0.1533313051414664, + "grad_norm": 1.8573944568634033, + "learning_rate": 4.790874524714829e-05, + "loss": 0.575, + "step": 252 + }, + { + "epoch": 0.15393976270155157, + "grad_norm": 1.9413925409317017, + "learning_rate": 4.8098859315589354e-05, + "loss": 0.5713, + "step": 253 + }, + { + "epoch": 0.15454822026163675, + "grad_norm": 1.883753776550293, + "learning_rate": 4.8288973384030424e-05, + "loss": 0.5714, + "step": 254 + }, + { + "epoch": 0.15515667782172193, + "grad_norm": 1.7571375370025635, + "learning_rate": 4.847908745247148e-05, + "loss": 0.6265, + "step": 255 + }, + { + "epoch": 0.1557651353818071, + "grad_norm": 1.780029058456421, + "learning_rate": 4.866920152091255e-05, + "loss": 0.5677, + "step": 256 + }, + { + "epoch": 0.1563735929418923, + "grad_norm": 2.0636849403381348, + "learning_rate": 4.8859315589353615e-05, + "loss": 0.5038, + "step": 257 + }, + { + "epoch": 0.1569820505019775, + "grad_norm": 1.6824311017990112, + "learning_rate": 4.904942965779468e-05, + "loss": 0.5905, + "step": 258 + }, + { + "epoch": 0.15759050806206268, + "grad_norm": 2.048978328704834, + "learning_rate": 4.923954372623574e-05, + "loss": 0.6073, + "step": 259 + }, + { + "epoch": 0.15819896562214786, + "grad_norm": 2.075801134109497, + "learning_rate": 4.942965779467681e-05, + "loss": 0.5321, + "step": 260 + }, + { + "epoch": 0.15880742318223304, + "grad_norm": 1.8397325277328491, + "learning_rate": 4.9619771863117875e-05, + "loss": 0.6598, + "step": 261 + }, + { + "epoch": 0.15941588074231822, + "grad_norm": 2.7808053493499756, + "learning_rate": 4.980988593155894e-05, + "loss": 0.5265, + "step": 262 + }, + { + "epoch": 0.1600243383024034, + "grad_norm": 2.129298686981201, + "learning_rate": 5e-05, + "loss": 0.5235, + "step": 263 + }, + { + "epoch": 0.1606327958624886, + "grad_norm": 6.710923671722412, + "learning_rate": 4.999999690052103e-05, + "loss": 0.562, + "step": 264 + }, + { + "epoch": 0.16124125342257378, + "grad_norm": 4.849277973175049, + "learning_rate": 4.9999987602084876e-05, + "loss": 0.5677, + "step": 265 + }, + { + "epoch": 0.16184971098265896, + "grad_norm": 1.6120680570602417, + "learning_rate": 4.999997210469385e-05, + "loss": 0.5322, + "step": 266 + }, + { + "epoch": 0.16245816854274414, + "grad_norm": 27.193222045898438, + "learning_rate": 4.9999950408351784e-05, + "loss": 0.7448, + "step": 267 + }, + { + "epoch": 0.16306662610282932, + "grad_norm": 2.693523645401001, + "learning_rate": 4.999992251306407e-05, + "loss": 0.6533, + "step": 268 + }, + { + "epoch": 0.1636750836629145, + "grad_norm": 1.7629886865615845, + "learning_rate": 4.999988841883763e-05, + "loss": 0.5668, + "step": 269 + }, + { + "epoch": 0.1642835412229997, + "grad_norm": 1.9537672996520996, + "learning_rate": 4.999984812568089e-05, + "loss": 0.5254, + "step": 270 + }, + { + "epoch": 0.1648919987830849, + "grad_norm": 4.926598072052002, + "learning_rate": 4.999980163360388e-05, + "loss": 0.5795, + "step": 271 + }, + { + "epoch": 0.16550045634317007, + "grad_norm": 2.0768604278564453, + "learning_rate": 4.9999748942618094e-05, + "loss": 0.6014, + "step": 272 + }, + { + "epoch": 0.16610891390325525, + "grad_norm": 2.3579049110412598, + "learning_rate": 4.999969005273661e-05, + "loss": 0.6518, + "step": 273 + }, + { + "epoch": 0.16671737146334042, + "grad_norm": 2.4885222911834717, + "learning_rate": 4.9999624963974045e-05, + "loss": 0.6194, + "step": 274 + }, + { + "epoch": 0.1673258290234256, + "grad_norm": 2.2712252140045166, + "learning_rate": 4.999955367634652e-05, + "loss": 0.6094, + "step": 275 + }, + { + "epoch": 0.1679342865835108, + "grad_norm": 1.9794126749038696, + "learning_rate": 4.999947618987171e-05, + "loss": 0.5214, + "step": 276 + }, + { + "epoch": 0.168542744143596, + "grad_norm": 1.6654919385910034, + "learning_rate": 4.999939250456884e-05, + "loss": 0.5547, + "step": 277 + }, + { + "epoch": 0.16915120170368117, + "grad_norm": 1.932141900062561, + "learning_rate": 4.999930262045865e-05, + "loss": 0.5427, + "step": 278 + }, + { + "epoch": 0.16975965926376635, + "grad_norm": 1.752447247505188, + "learning_rate": 4.999920653756344e-05, + "loss": 0.5661, + "step": 279 + }, + { + "epoch": 0.17036811682385153, + "grad_norm": 2.4955854415893555, + "learning_rate": 4.9999104255907015e-05, + "loss": 0.6635, + "step": 280 + }, + { + "epoch": 0.1709765743839367, + "grad_norm": 1.774271011352539, + "learning_rate": 4.999899577551476e-05, + "loss": 0.6433, + "step": 281 + }, + { + "epoch": 0.17158503194402192, + "grad_norm": 1.7379838228225708, + "learning_rate": 4.9998881096413554e-05, + "loss": 0.4936, + "step": 282 + }, + { + "epoch": 0.1721934895041071, + "grad_norm": 1.5458259582519531, + "learning_rate": 4.9998760218631845e-05, + "loss": 0.5219, + "step": 283 + }, + { + "epoch": 0.17280194706419227, + "grad_norm": 1.8059486150741577, + "learning_rate": 4.99986331421996e-05, + "loss": 0.5384, + "step": 284 + }, + { + "epoch": 0.17341040462427745, + "grad_norm": 1.9652469158172607, + "learning_rate": 4.999849986714833e-05, + "loss": 0.6101, + "step": 285 + }, + { + "epoch": 0.17401886218436263, + "grad_norm": 1.7066302299499512, + "learning_rate": 4.999836039351108e-05, + "loss": 0.5981, + "step": 286 + }, + { + "epoch": 0.1746273197444478, + "grad_norm": 1.5256247520446777, + "learning_rate": 4.999821472132244e-05, + "loss": 0.5109, + "step": 287 + }, + { + "epoch": 0.17523577730453302, + "grad_norm": 1.713955283164978, + "learning_rate": 4.999806285061852e-05, + "loss": 0.557, + "step": 288 + }, + { + "epoch": 0.1758442348646182, + "grad_norm": 1.8485389947891235, + "learning_rate": 4.999790478143699e-05, + "loss": 0.575, + "step": 289 + }, + { + "epoch": 0.17645269242470338, + "grad_norm": 1.7782764434814453, + "learning_rate": 4.999774051381704e-05, + "loss": 0.5693, + "step": 290 + }, + { + "epoch": 0.17706114998478856, + "grad_norm": 2.34918212890625, + "learning_rate": 4.999757004779939e-05, + "loss": 0.6003, + "step": 291 + }, + { + "epoch": 0.17766960754487374, + "grad_norm": 1.9476317167282104, + "learning_rate": 4.999739338342633e-05, + "loss": 0.6231, + "step": 292 + }, + { + "epoch": 0.17827806510495892, + "grad_norm": 1.6623529195785522, + "learning_rate": 4.999721052074164e-05, + "loss": 0.5573, + "step": 293 + }, + { + "epoch": 0.17888652266504412, + "grad_norm": 1.4444576501846313, + "learning_rate": 4.999702145979069e-05, + "loss": 0.5658, + "step": 294 + }, + { + "epoch": 0.1794949802251293, + "grad_norm": 1.9570586681365967, + "learning_rate": 4.9996826200620336e-05, + "loss": 0.538, + "step": 295 + }, + { + "epoch": 0.18010343778521448, + "grad_norm": 1.6489269733428955, + "learning_rate": 4.999662474327901e-05, + "loss": 0.6005, + "step": 296 + }, + { + "epoch": 0.18071189534529966, + "grad_norm": 1.8315165042877197, + "learning_rate": 4.999641708781665e-05, + "loss": 0.5694, + "step": 297 + }, + { + "epoch": 0.18132035290538484, + "grad_norm": 1.9754319190979004, + "learning_rate": 4.9996203234284755e-05, + "loss": 0.5658, + "step": 298 + }, + { + "epoch": 0.18192881046547002, + "grad_norm": 1.5500773191452026, + "learning_rate": 4.999598318273636e-05, + "loss": 0.5115, + "step": 299 + }, + { + "epoch": 0.18253726802555523, + "grad_norm": 2.730600595474243, + "learning_rate": 4.999575693322601e-05, + "loss": 0.5655, + "step": 300 + }, + { + "epoch": 0.1831457255856404, + "grad_norm": 2.1181812286376953, + "learning_rate": 4.9995524485809816e-05, + "loss": 0.5391, + "step": 301 + }, + { + "epoch": 0.18375418314572559, + "grad_norm": 1.870579481124878, + "learning_rate": 4.9995285840545425e-05, + "loss": 0.5846, + "step": 302 + }, + { + "epoch": 0.18436264070581077, + "grad_norm": 1.8520623445510864, + "learning_rate": 4.9995040997491993e-05, + "loss": 0.5664, + "step": 303 + }, + { + "epoch": 0.18497109826589594, + "grad_norm": 1.7064025402069092, + "learning_rate": 4.999478995671024e-05, + "loss": 0.5567, + "step": 304 + }, + { + "epoch": 0.18557955582598112, + "grad_norm": 2.016923427581787, + "learning_rate": 4.99945327182624e-05, + "loss": 0.5576, + "step": 305 + }, + { + "epoch": 0.18618801338606633, + "grad_norm": 1.6377376317977905, + "learning_rate": 4.999426928221229e-05, + "loss": 0.5462, + "step": 306 + }, + { + "epoch": 0.1867964709461515, + "grad_norm": 1.6940040588378906, + "learning_rate": 4.9993999648625197e-05, + "loss": 0.5148, + "step": 307 + }, + { + "epoch": 0.1874049285062367, + "grad_norm": 2.552048921585083, + "learning_rate": 4.9993723817567996e-05, + "loss": 0.6583, + "step": 308 + }, + { + "epoch": 0.18801338606632187, + "grad_norm": 1.4757635593414307, + "learning_rate": 4.9993441789109074e-05, + "loss": 0.5491, + "step": 309 + }, + { + "epoch": 0.18862184362640705, + "grad_norm": 1.6392966508865356, + "learning_rate": 4.999315356331837e-05, + "loss": 0.5338, + "step": 310 + }, + { + "epoch": 0.18923030118649226, + "grad_norm": 1.7045416831970215, + "learning_rate": 4.999285914026734e-05, + "loss": 0.627, + "step": 311 + }, + { + "epoch": 0.18983875874657744, + "grad_norm": 1.4857970476150513, + "learning_rate": 4.999255852002901e-05, + "loss": 0.6172, + "step": 312 + }, + { + "epoch": 0.19044721630666261, + "grad_norm": 3.2289485931396484, + "learning_rate": 4.9992251702677904e-05, + "loss": 0.5705, + "step": 313 + }, + { + "epoch": 0.1910556738667478, + "grad_norm": 1.866363763809204, + "learning_rate": 4.9991938688290105e-05, + "loss": 0.5102, + "step": 314 + }, + { + "epoch": 0.19166413142683297, + "grad_norm": 2.0050652027130127, + "learning_rate": 4.999161947694322e-05, + "loss": 0.5714, + "step": 315 + }, + { + "epoch": 0.19227258898691815, + "grad_norm": 1.7371540069580078, + "learning_rate": 4.9991294068716416e-05, + "loss": 0.5933, + "step": 316 + }, + { + "epoch": 0.19288104654700336, + "grad_norm": 1.7616161108016968, + "learning_rate": 4.9990962463690364e-05, + "loss": 0.5356, + "step": 317 + }, + { + "epoch": 0.19348950410708854, + "grad_norm": 1.7778440713882446, + "learning_rate": 4.99906246619473e-05, + "loss": 0.5461, + "step": 318 + }, + { + "epoch": 0.19409796166717372, + "grad_norm": 1.7068995237350464, + "learning_rate": 4.999028066357098e-05, + "loss": 0.509, + "step": 319 + }, + { + "epoch": 0.1947064192272589, + "grad_norm": 1.5592914819717407, + "learning_rate": 4.9989930468646703e-05, + "loss": 0.5702, + "step": 320 + }, + { + "epoch": 0.19531487678734408, + "grad_norm": 1.4004979133605957, + "learning_rate": 4.99895740772613e-05, + "loss": 0.5483, + "step": 321 + }, + { + "epoch": 0.19592333434742926, + "grad_norm": 1.4833515882492065, + "learning_rate": 4.998921148950314e-05, + "loss": 0.5739, + "step": 322 + }, + { + "epoch": 0.19653179190751446, + "grad_norm": 1.715824007987976, + "learning_rate": 4.998884270546214e-05, + "loss": 0.5488, + "step": 323 + }, + { + "epoch": 0.19714024946759964, + "grad_norm": 1.557228684425354, + "learning_rate": 4.998846772522972e-05, + "loss": 0.5267, + "step": 324 + }, + { + "epoch": 0.19774870702768482, + "grad_norm": 1.7454534769058228, + "learning_rate": 4.998808654889888e-05, + "loss": 0.533, + "step": 325 + }, + { + "epoch": 0.19835716458777, + "grad_norm": 1.6410638093948364, + "learning_rate": 4.998769917656414e-05, + "loss": 0.5841, + "step": 326 + }, + { + "epoch": 0.19896562214785518, + "grad_norm": 1.7179322242736816, + "learning_rate": 4.998730560832154e-05, + "loss": 0.6051, + "step": 327 + }, + { + "epoch": 0.19957407970794036, + "grad_norm": 1.5742508172988892, + "learning_rate": 4.9986905844268667e-05, + "loss": 0.501, + "step": 328 + }, + { + "epoch": 0.20018253726802557, + "grad_norm": 1.944952130317688, + "learning_rate": 4.998649988450465e-05, + "loss": 0.543, + "step": 329 + }, + { + "epoch": 0.20079099482811075, + "grad_norm": 2.1101162433624268, + "learning_rate": 4.998608772913015e-05, + "loss": 0.7782, + "step": 330 + }, + { + "epoch": 0.20139945238819593, + "grad_norm": 1.778678297996521, + "learning_rate": 4.998566937824737e-05, + "loss": 0.5768, + "step": 331 + }, + { + "epoch": 0.2020079099482811, + "grad_norm": 1.5352647304534912, + "learning_rate": 4.9985244831960034e-05, + "loss": 0.5784, + "step": 332 + }, + { + "epoch": 0.20261636750836629, + "grad_norm": 1.6977012157440186, + "learning_rate": 4.998481409037342e-05, + "loss": 0.537, + "step": 333 + }, + { + "epoch": 0.20322482506845146, + "grad_norm": 1.7605286836624146, + "learning_rate": 4.9984377153594327e-05, + "loss": 0.5381, + "step": 334 + }, + { + "epoch": 0.20383328262853667, + "grad_norm": 2.0058248043060303, + "learning_rate": 4.99839340217311e-05, + "loss": 0.6097, + "step": 335 + }, + { + "epoch": 0.20444174018862185, + "grad_norm": 2.2246720790863037, + "learning_rate": 4.9983484694893615e-05, + "loss": 0.5455, + "step": 336 + }, + { + "epoch": 0.20505019774870703, + "grad_norm": 1.8565617799758911, + "learning_rate": 4.99830291731933e-05, + "loss": 0.6721, + "step": 337 + }, + { + "epoch": 0.2056586553087922, + "grad_norm": 1.691021203994751, + "learning_rate": 4.998256745674308e-05, + "loss": 0.58, + "step": 338 + }, + { + "epoch": 0.2062671128688774, + "grad_norm": 2.6199169158935547, + "learning_rate": 4.998209954565746e-05, + "loss": 0.6639, + "step": 339 + }, + { + "epoch": 0.20687557042896257, + "grad_norm": 1.632177710533142, + "learning_rate": 4.998162544005246e-05, + "loss": 0.5949, + "step": 340 + }, + { + "epoch": 0.20748402798904778, + "grad_norm": 2.9773964881896973, + "learning_rate": 4.9981145140045634e-05, + "loss": 0.547, + "step": 341 + }, + { + "epoch": 0.20809248554913296, + "grad_norm": 2.0288548469543457, + "learning_rate": 4.998065864575608e-05, + "loss": 0.619, + "step": 342 + }, + { + "epoch": 0.20870094310921813, + "grad_norm": 1.6673290729522705, + "learning_rate": 4.998016595730442e-05, + "loss": 0.592, + "step": 343 + }, + { + "epoch": 0.2093094006693033, + "grad_norm": 1.5016260147094727, + "learning_rate": 4.997966707481284e-05, + "loss": 0.6014, + "step": 344 + }, + { + "epoch": 0.2099178582293885, + "grad_norm": 2.3136346340179443, + "learning_rate": 4.9979161998405024e-05, + "loss": 0.5583, + "step": 345 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 1.6642321348190308, + "learning_rate": 4.997865072820621e-05, + "loss": 0.5468, + "step": 346 + }, + { + "epoch": 0.21113477334955888, + "grad_norm": 1.8811125755310059, + "learning_rate": 4.9978133264343186e-05, + "loss": 0.6072, + "step": 347 + }, + { + "epoch": 0.21174323090964406, + "grad_norm": 1.5865939855575562, + "learning_rate": 4.997760960694424e-05, + "loss": 0.509, + "step": 348 + }, + { + "epoch": 0.21235168846972924, + "grad_norm": 1.6846966743469238, + "learning_rate": 4.9977079756139247e-05, + "loss": 0.6028, + "step": 349 + }, + { + "epoch": 0.21296014602981442, + "grad_norm": 2.0997796058654785, + "learning_rate": 4.997654371205955e-05, + "loss": 0.5462, + "step": 350 + }, + { + "epoch": 0.2135686035898996, + "grad_norm": 2.4784786701202393, + "learning_rate": 4.9976001474838105e-05, + "loss": 0.5719, + "step": 351 + }, + { + "epoch": 0.21417706114998478, + "grad_norm": 2.554870367050171, + "learning_rate": 4.997545304460933e-05, + "loss": 0.5194, + "step": 352 + }, + { + "epoch": 0.21478551871006998, + "grad_norm": 1.4950354099273682, + "learning_rate": 4.997489842150924e-05, + "loss": 0.5605, + "step": 353 + }, + { + "epoch": 0.21539397627015516, + "grad_norm": 1.4311949014663696, + "learning_rate": 4.9974337605675335e-05, + "loss": 0.5508, + "step": 354 + }, + { + "epoch": 0.21600243383024034, + "grad_norm": 1.7525572776794434, + "learning_rate": 4.9973770597246696e-05, + "loss": 0.5666, + "step": 355 + }, + { + "epoch": 0.21661089139032552, + "grad_norm": 2.4212021827697754, + "learning_rate": 4.99731973963639e-05, + "loss": 0.5436, + "step": 356 + }, + { + "epoch": 0.2172193489504107, + "grad_norm": 6.587791919708252, + "learning_rate": 4.997261800316909e-05, + "loss": 0.6509, + "step": 357 + }, + { + "epoch": 0.21782780651049588, + "grad_norm": 1.5659176111221313, + "learning_rate": 4.997203241780592e-05, + "loss": 0.5326, + "step": 358 + }, + { + "epoch": 0.2184362640705811, + "grad_norm": 1.5538829565048218, + "learning_rate": 4.99714406404196e-05, + "loss": 0.6136, + "step": 359 + }, + { + "epoch": 0.21904472163066627, + "grad_norm": 1.7956405878067017, + "learning_rate": 4.997084267115686e-05, + "loss": 0.604, + "step": 360 + }, + { + "epoch": 0.21965317919075145, + "grad_norm": 1.8146167993545532, + "learning_rate": 4.997023851016598e-05, + "loss": 0.5673, + "step": 361 + }, + { + "epoch": 0.22026163675083663, + "grad_norm": 1.5407328605651855, + "learning_rate": 4.996962815759675e-05, + "loss": 0.5848, + "step": 362 + }, + { + "epoch": 0.2208700943109218, + "grad_norm": 1.560318946838379, + "learning_rate": 4.9969011613600525e-05, + "loss": 0.4974, + "step": 363 + }, + { + "epoch": 0.22147855187100698, + "grad_norm": 1.5309618711471558, + "learning_rate": 4.996838887833018e-05, + "loss": 0.6155, + "step": 364 + }, + { + "epoch": 0.2220870094310922, + "grad_norm": 1.6208105087280273, + "learning_rate": 4.9967759951940127e-05, + "loss": 0.5615, + "step": 365 + }, + { + "epoch": 0.22269546699117737, + "grad_norm": 1.5246007442474365, + "learning_rate": 4.996712483458632e-05, + "loss": 0.5311, + "step": 366 + }, + { + "epoch": 0.22330392455126255, + "grad_norm": 1.9340999126434326, + "learning_rate": 4.9966483526426223e-05, + "loss": 0.5678, + "step": 367 + }, + { + "epoch": 0.22391238211134773, + "grad_norm": 1.4264390468597412, + "learning_rate": 4.996583602761887e-05, + "loss": 0.5468, + "step": 368 + }, + { + "epoch": 0.2245208396714329, + "grad_norm": 1.5261754989624023, + "learning_rate": 4.996518233832481e-05, + "loss": 0.5593, + "step": 369 + }, + { + "epoch": 0.2251292972315181, + "grad_norm": 1.5662906169891357, + "learning_rate": 4.996452245870614e-05, + "loss": 0.5155, + "step": 370 + }, + { + "epoch": 0.2257377547916033, + "grad_norm": 1.6063148975372314, + "learning_rate": 4.9963856388926464e-05, + "loss": 0.5384, + "step": 371 + }, + { + "epoch": 0.22634621235168847, + "grad_norm": 1.5906771421432495, + "learning_rate": 4.996318412915095e-05, + "loss": 0.5049, + "step": 372 + }, + { + "epoch": 0.22695466991177365, + "grad_norm": 1.676358938217163, + "learning_rate": 4.9962505679546285e-05, + "loss": 0.5344, + "step": 373 + }, + { + "epoch": 0.22756312747185883, + "grad_norm": 1.6183301210403442, + "learning_rate": 4.9961821040280697e-05, + "loss": 0.5611, + "step": 374 + }, + { + "epoch": 0.228171585031944, + "grad_norm": 1.670375108718872, + "learning_rate": 4.996113021152397e-05, + "loss": 0.5533, + "step": 375 + }, + { + "epoch": 0.2287800425920292, + "grad_norm": 1.6146153211593628, + "learning_rate": 4.996043319344736e-05, + "loss": 0.5368, + "step": 376 + }, + { + "epoch": 0.2293885001521144, + "grad_norm": 1.5858700275421143, + "learning_rate": 4.9959729986223725e-05, + "loss": 0.5109, + "step": 377 + }, + { + "epoch": 0.22999695771219958, + "grad_norm": 1.696811318397522, + "learning_rate": 4.995902059002743e-05, + "loss": 0.5879, + "step": 378 + }, + { + "epoch": 0.23060541527228476, + "grad_norm": 1.5121724605560303, + "learning_rate": 4.995830500503438e-05, + "loss": 0.518, + "step": 379 + }, + { + "epoch": 0.23121387283236994, + "grad_norm": 1.4365136623382568, + "learning_rate": 4.995758323142199e-05, + "loss": 0.5384, + "step": 380 + }, + { + "epoch": 0.23182233039245512, + "grad_norm": 1.4836268424987793, + "learning_rate": 4.995685526936924e-05, + "loss": 0.5867, + "step": 381 + }, + { + "epoch": 0.2324307879525403, + "grad_norm": 1.8685383796691895, + "learning_rate": 4.9956121119056646e-05, + "loss": 0.628, + "step": 382 + }, + { + "epoch": 0.2330392455126255, + "grad_norm": 1.426859974861145, + "learning_rate": 4.9955380780666233e-05, + "loss": 0.4974, + "step": 383 + }, + { + "epoch": 0.23364770307271068, + "grad_norm": 2.282662868499756, + "learning_rate": 4.9954634254381576e-05, + "loss": 0.5333, + "step": 384 + }, + { + "epoch": 0.23425616063279586, + "grad_norm": 1.881532073020935, + "learning_rate": 4.995388154038779e-05, + "loss": 0.6067, + "step": 385 + }, + { + "epoch": 0.23486461819288104, + "grad_norm": 1.4905718564987183, + "learning_rate": 4.9953122638871505e-05, + "loss": 0.5767, + "step": 386 + }, + { + "epoch": 0.23547307575296622, + "grad_norm": 1.5915948152542114, + "learning_rate": 4.99523575500209e-05, + "loss": 0.5713, + "step": 387 + }, + { + "epoch": 0.23608153331305143, + "grad_norm": 1.8757609128952026, + "learning_rate": 4.9951586274025695e-05, + "loss": 0.5355, + "step": 388 + }, + { + "epoch": 0.2366899908731366, + "grad_norm": 1.7897573709487915, + "learning_rate": 4.9950808811077135e-05, + "loss": 0.5522, + "step": 389 + }, + { + "epoch": 0.2372984484332218, + "grad_norm": 1.5204428434371948, + "learning_rate": 4.995002516136797e-05, + "loss": 0.5935, + "step": 390 + }, + { + "epoch": 0.23790690599330697, + "grad_norm": 1.376928448677063, + "learning_rate": 4.994923532509255e-05, + "loss": 0.5006, + "step": 391 + }, + { + "epoch": 0.23851536355339215, + "grad_norm": 1.6038637161254883, + "learning_rate": 4.99484393024467e-05, + "loss": 0.4999, + "step": 392 + }, + { + "epoch": 0.23912382111347733, + "grad_norm": 1.59404456615448, + "learning_rate": 4.99476370936278e-05, + "loss": 0.5816, + "step": 393 + }, + { + "epoch": 0.23973227867356253, + "grad_norm": 1.748207688331604, + "learning_rate": 4.994682869883478e-05, + "loss": 0.5596, + "step": 394 + }, + { + "epoch": 0.2403407362336477, + "grad_norm": 1.8243966102600098, + "learning_rate": 4.994601411826807e-05, + "loss": 0.6351, + "step": 395 + }, + { + "epoch": 0.2409491937937329, + "grad_norm": 1.70762300491333, + "learning_rate": 4.994519335212966e-05, + "loss": 0.644, + "step": 396 + }, + { + "epoch": 0.24155765135381807, + "grad_norm": 1.5090755224227905, + "learning_rate": 4.9944366400623066e-05, + "loss": 0.5991, + "step": 397 + }, + { + "epoch": 0.24216610891390325, + "grad_norm": 1.3629839420318604, + "learning_rate": 4.994353326395334e-05, + "loss": 0.4862, + "step": 398 + }, + { + "epoch": 0.24277456647398843, + "grad_norm": 1.7295219898223877, + "learning_rate": 4.9942693942327054e-05, + "loss": 0.5848, + "step": 399 + }, + { + "epoch": 0.24338302403407364, + "grad_norm": 1.5481480360031128, + "learning_rate": 4.994184843595234e-05, + "loss": 0.4668, + "step": 400 + }, + { + "epoch": 0.24399148159415882, + "grad_norm": 1.3394376039505005, + "learning_rate": 4.994099674503885e-05, + "loss": 0.5137, + "step": 401 + }, + { + "epoch": 0.244599939154244, + "grad_norm": 1.7005079984664917, + "learning_rate": 4.994013886979775e-05, + "loss": 0.6468, + "step": 402 + }, + { + "epoch": 0.24520839671432917, + "grad_norm": 1.682056188583374, + "learning_rate": 4.993927481044176e-05, + "loss": 0.5919, + "step": 403 + }, + { + "epoch": 0.24581685427441435, + "grad_norm": 1.6984502077102661, + "learning_rate": 4.993840456718515e-05, + "loss": 0.5864, + "step": 404 + }, + { + "epoch": 0.24642531183449953, + "grad_norm": 1.8722606897354126, + "learning_rate": 4.993752814024368e-05, + "loss": 0.4948, + "step": 405 + }, + { + "epoch": 0.24703376939458474, + "grad_norm": 1.4154351949691772, + "learning_rate": 4.993664552983469e-05, + "loss": 0.5495, + "step": 406 + }, + { + "epoch": 0.24764222695466992, + "grad_norm": 1.7346272468566895, + "learning_rate": 4.9935756736177006e-05, + "loss": 0.6606, + "step": 407 + }, + { + "epoch": 0.2482506845147551, + "grad_norm": 1.3692924976348877, + "learning_rate": 4.993486175949104e-05, + "loss": 0.4961, + "step": 408 + }, + { + "epoch": 0.24885914207484028, + "grad_norm": 1.9384909868240356, + "learning_rate": 4.993396059999868e-05, + "loss": 0.6164, + "step": 409 + }, + { + "epoch": 0.24946759963492546, + "grad_norm": 1.8157694339752197, + "learning_rate": 4.99330532579234e-05, + "loss": 0.5164, + "step": 410 + }, + { + "epoch": 0.25007605719501064, + "grad_norm": 1.4756087064743042, + "learning_rate": 4.993213973349017e-05, + "loss": 0.5389, + "step": 411 + }, + { + "epoch": 0.25068451475509584, + "grad_norm": 1.3147797584533691, + "learning_rate": 4.9931220026925506e-05, + "loss": 0.4488, + "step": 412 + }, + { + "epoch": 0.251292972315181, + "grad_norm": 1.9356815814971924, + "learning_rate": 4.993029413845746e-05, + "loss": 0.5484, + "step": 413 + }, + { + "epoch": 0.2519014298752662, + "grad_norm": 1.7992812395095825, + "learning_rate": 4.992936206831561e-05, + "loss": 0.5857, + "step": 414 + }, + { + "epoch": 0.2525098874353514, + "grad_norm": 1.4807167053222656, + "learning_rate": 4.9928423816731086e-05, + "loss": 0.5957, + "step": 415 + }, + { + "epoch": 0.25311834499543656, + "grad_norm": 1.877353549003601, + "learning_rate": 4.9927479383936516e-05, + "loss": 0.5062, + "step": 416 + }, + { + "epoch": 0.25372680255552177, + "grad_norm": 1.6704474687576294, + "learning_rate": 4.992652877016608e-05, + "loss": 0.6027, + "step": 417 + }, + { + "epoch": 0.2543352601156069, + "grad_norm": 1.7480648756027222, + "learning_rate": 4.992557197565551e-05, + "loss": 0.5166, + "step": 418 + }, + { + "epoch": 0.2549437176756921, + "grad_norm": 1.8488813638687134, + "learning_rate": 4.992460900064203e-05, + "loss": 0.627, + "step": 419 + }, + { + "epoch": 0.2555521752357773, + "grad_norm": 1.7881669998168945, + "learning_rate": 4.992363984536443e-05, + "loss": 0.5289, + "step": 420 + }, + { + "epoch": 0.2561606327958625, + "grad_norm": 1.5146019458770752, + "learning_rate": 4.9922664510063024e-05, + "loss": 0.5334, + "step": 421 + }, + { + "epoch": 0.2567690903559477, + "grad_norm": 3.4007725715637207, + "learning_rate": 4.992168299497964e-05, + "loss": 0.6357, + "step": 422 + }, + { + "epoch": 0.25737754791603284, + "grad_norm": 1.778419017791748, + "learning_rate": 4.9920695300357664e-05, + "loss": 0.5248, + "step": 423 + }, + { + "epoch": 0.25798600547611805, + "grad_norm": 1.3506314754486084, + "learning_rate": 4.9919701426442e-05, + "loss": 0.4993, + "step": 424 + }, + { + "epoch": 0.2585944630362032, + "grad_norm": 1.7571451663970947, + "learning_rate": 4.991870137347908e-05, + "loss": 0.591, + "step": 425 + }, + { + "epoch": 0.2592029205962884, + "grad_norm": 1.8246015310287476, + "learning_rate": 4.9917695141716884e-05, + "loss": 0.5652, + "step": 426 + }, + { + "epoch": 0.2598113781563736, + "grad_norm": 1.585374355316162, + "learning_rate": 4.991668273140492e-05, + "loss": 0.494, + "step": 427 + }, + { + "epoch": 0.26041983571645877, + "grad_norm": 1.5021830797195435, + "learning_rate": 4.991566414279421e-05, + "loss": 0.5471, + "step": 428 + }, + { + "epoch": 0.261028293276544, + "grad_norm": 1.493720293045044, + "learning_rate": 4.991463937613733e-05, + "loss": 0.5409, + "step": 429 + }, + { + "epoch": 0.26163675083662913, + "grad_norm": 1.550927758216858, + "learning_rate": 4.991360843168838e-05, + "loss": 0.5049, + "step": 430 + }, + { + "epoch": 0.26224520839671434, + "grad_norm": 1.793480634689331, + "learning_rate": 4.991257130970299e-05, + "loss": 0.5746, + "step": 431 + }, + { + "epoch": 0.2628536659567995, + "grad_norm": 1.3120859861373901, + "learning_rate": 4.991152801043832e-05, + "loss": 0.48, + "step": 432 + }, + { + "epoch": 0.2634621235168847, + "grad_norm": 1.489817500114441, + "learning_rate": 4.991047853415307e-05, + "loss": 0.5376, + "step": 433 + }, + { + "epoch": 0.2640705810769699, + "grad_norm": 1.668395757675171, + "learning_rate": 4.990942288110746e-05, + "loss": 0.5131, + "step": 434 + }, + { + "epoch": 0.26467903863705505, + "grad_norm": 1.8639039993286133, + "learning_rate": 4.9908361051563244e-05, + "loss": 0.5146, + "step": 435 + }, + { + "epoch": 0.26528749619714026, + "grad_norm": 1.944084644317627, + "learning_rate": 4.990729304578373e-05, + "loss": 0.5916, + "step": 436 + }, + { + "epoch": 0.2658959537572254, + "grad_norm": 1.5414292812347412, + "learning_rate": 4.990621886403373e-05, + "loss": 0.5606, + "step": 437 + }, + { + "epoch": 0.2665044113173106, + "grad_norm": 1.4970581531524658, + "learning_rate": 4.990513850657958e-05, + "loss": 0.5137, + "step": 438 + }, + { + "epoch": 0.2671128688773958, + "grad_norm": 1.5545778274536133, + "learning_rate": 4.990405197368919e-05, + "loss": 0.5935, + "step": 439 + }, + { + "epoch": 0.267721326437481, + "grad_norm": 1.836313009262085, + "learning_rate": 4.9902959265631966e-05, + "loss": 0.4994, + "step": 440 + }, + { + "epoch": 0.2683297839975662, + "grad_norm": 2.459785223007202, + "learning_rate": 4.990186038267884e-05, + "loss": 0.5529, + "step": 441 + }, + { + "epoch": 0.26893824155765134, + "grad_norm": 1.6246469020843506, + "learning_rate": 4.990075532510231e-05, + "loss": 0.5785, + "step": 442 + }, + { + "epoch": 0.26954669911773654, + "grad_norm": 1.563841700553894, + "learning_rate": 4.989964409317637e-05, + "loss": 0.5102, + "step": 443 + }, + { + "epoch": 0.2701551566778217, + "grad_norm": 1.729370355606079, + "learning_rate": 4.9898526687176554e-05, + "loss": 0.6466, + "step": 444 + }, + { + "epoch": 0.2707636142379069, + "grad_norm": 1.5674041509628296, + "learning_rate": 4.989740310737995e-05, + "loss": 0.5578, + "step": 445 + }, + { + "epoch": 0.2713720717979921, + "grad_norm": 2.1116135120391846, + "learning_rate": 4.989627335406515e-05, + "loss": 0.6759, + "step": 446 + }, + { + "epoch": 0.27198052935807726, + "grad_norm": 1.3847877979278564, + "learning_rate": 4.9895137427512284e-05, + "loss": 0.4906, + "step": 447 + }, + { + "epoch": 0.27258898691816247, + "grad_norm": 2.1487720012664795, + "learning_rate": 4.989399532800302e-05, + "loss": 0.4625, + "step": 448 + }, + { + "epoch": 0.2731974444782476, + "grad_norm": 2.71392560005188, + "learning_rate": 4.989284705582055e-05, + "loss": 0.5155, + "step": 449 + }, + { + "epoch": 0.2738059020383328, + "grad_norm": 1.6772319078445435, + "learning_rate": 4.989169261124958e-05, + "loss": 0.5715, + "step": 450 + }, + { + "epoch": 0.27441435959841803, + "grad_norm": 1.6435317993164062, + "learning_rate": 4.9890531994576394e-05, + "loss": 0.5688, + "step": 451 + }, + { + "epoch": 0.2750228171585032, + "grad_norm": 1.4484583139419556, + "learning_rate": 4.9889365206088755e-05, + "loss": 0.5269, + "step": 452 + }, + { + "epoch": 0.2756312747185884, + "grad_norm": 1.4935004711151123, + "learning_rate": 4.9888192246075986e-05, + "loss": 0.5619, + "step": 453 + }, + { + "epoch": 0.27623973227867354, + "grad_norm": 2.1516928672790527, + "learning_rate": 4.988701311482893e-05, + "loss": 0.6126, + "step": 454 + }, + { + "epoch": 0.27684818983875875, + "grad_norm": 2.0537219047546387, + "learning_rate": 4.988582781263997e-05, + "loss": 0.5805, + "step": 455 + }, + { + "epoch": 0.2774566473988439, + "grad_norm": 1.384886622428894, + "learning_rate": 4.9884636339803e-05, + "loss": 0.5707, + "step": 456 + }, + { + "epoch": 0.2780651049589291, + "grad_norm": 1.4588273763656616, + "learning_rate": 4.988343869661346e-05, + "loss": 0.5806, + "step": 457 + }, + { + "epoch": 0.2786735625190143, + "grad_norm": 1.3114267587661743, + "learning_rate": 4.988223488336832e-05, + "loss": 0.5397, + "step": 458 + }, + { + "epoch": 0.27928202007909947, + "grad_norm": 1.5628232955932617, + "learning_rate": 4.988102490036606e-05, + "loss": 0.5486, + "step": 459 + }, + { + "epoch": 0.2798904776391847, + "grad_norm": 1.5347833633422852, + "learning_rate": 4.987980874790673e-05, + "loss": 0.492, + "step": 460 + }, + { + "epoch": 0.2804989351992698, + "grad_norm": 1.5386556386947632, + "learning_rate": 4.9878586426291864e-05, + "loss": 0.5133, + "step": 461 + }, + { + "epoch": 0.28110739275935503, + "grad_norm": 1.603733777999878, + "learning_rate": 4.987735793582456e-05, + "loss": 0.5197, + "step": 462 + }, + { + "epoch": 0.28171585031944024, + "grad_norm": 1.7382800579071045, + "learning_rate": 4.987612327680943e-05, + "loss": 0.6745, + "step": 463 + }, + { + "epoch": 0.2823243078795254, + "grad_norm": 1.6408590078353882, + "learning_rate": 4.987488244955261e-05, + "loss": 0.5331, + "step": 464 + }, + { + "epoch": 0.2829327654396106, + "grad_norm": 1.5231281518936157, + "learning_rate": 4.987363545436178e-05, + "loss": 0.5993, + "step": 465 + }, + { + "epoch": 0.28354122299969575, + "grad_norm": 1.5222764015197754, + "learning_rate": 4.9872382291546136e-05, + "loss": 0.4993, + "step": 466 + }, + { + "epoch": 0.28414968055978096, + "grad_norm": 1.3687183856964111, + "learning_rate": 4.9871122961416417e-05, + "loss": 0.5918, + "step": 467 + }, + { + "epoch": 0.28475813811986617, + "grad_norm": 1.5194501876831055, + "learning_rate": 4.9869857464284885e-05, + "loss": 0.5499, + "step": 468 + }, + { + "epoch": 0.2853665956799513, + "grad_norm": 1.4260061979293823, + "learning_rate": 4.986858580046534e-05, + "loss": 0.5771, + "step": 469 + }, + { + "epoch": 0.2859750532400365, + "grad_norm": 1.58305823802948, + "learning_rate": 4.986730797027307e-05, + "loss": 0.5616, + "step": 470 + }, + { + "epoch": 0.2865835108001217, + "grad_norm": 1.5807416439056396, + "learning_rate": 4.9866023974024954e-05, + "loss": 0.513, + "step": 471 + }, + { + "epoch": 0.2871919683602069, + "grad_norm": 1.3939672708511353, + "learning_rate": 4.986473381203937e-05, + "loss": 0.587, + "step": 472 + }, + { + "epoch": 0.28780042592029204, + "grad_norm": 1.4460794925689697, + "learning_rate": 4.98634374846362e-05, + "loss": 0.5267, + "step": 473 + }, + { + "epoch": 0.28840888348037724, + "grad_norm": 1.4511432647705078, + "learning_rate": 4.986213499213689e-05, + "loss": 0.497, + "step": 474 + }, + { + "epoch": 0.28901734104046245, + "grad_norm": 1.877480149269104, + "learning_rate": 4.986082633486442e-05, + "loss": 0.5801, + "step": 475 + }, + { + "epoch": 0.2896257986005476, + "grad_norm": 1.3986396789550781, + "learning_rate": 4.985951151314326e-05, + "loss": 0.5483, + "step": 476 + }, + { + "epoch": 0.2902342561606328, + "grad_norm": 2.344064474105835, + "learning_rate": 4.985819052729944e-05, + "loss": 0.4837, + "step": 477 + }, + { + "epoch": 0.29084271372071796, + "grad_norm": 1.5389190912246704, + "learning_rate": 4.9856863377660515e-05, + "loss": 0.5699, + "step": 478 + }, + { + "epoch": 0.29145117128080317, + "grad_norm": 1.4740633964538574, + "learning_rate": 4.985553006455556e-05, + "loss": 0.512, + "step": 479 + }, + { + "epoch": 0.2920596288408884, + "grad_norm": 3.1011834144592285, + "learning_rate": 4.985419058831517e-05, + "loss": 0.5975, + "step": 480 + }, + { + "epoch": 0.2926680864009735, + "grad_norm": 1.6393465995788574, + "learning_rate": 4.9852844949271496e-05, + "loss": 0.5588, + "step": 481 + }, + { + "epoch": 0.29327654396105873, + "grad_norm": 1.4237196445465088, + "learning_rate": 4.985149314775818e-05, + "loss": 0.5831, + "step": 482 + }, + { + "epoch": 0.2938850015211439, + "grad_norm": 1.3287136554718018, + "learning_rate": 4.985013518411044e-05, + "loss": 0.4769, + "step": 483 + }, + { + "epoch": 0.2944934590812291, + "grad_norm": 1.4517908096313477, + "learning_rate": 4.984877105866497e-05, + "loss": 0.5007, + "step": 484 + }, + { + "epoch": 0.29510191664131424, + "grad_norm": 1.5900462865829468, + "learning_rate": 4.984740077176002e-05, + "loss": 0.6005, + "step": 485 + }, + { + "epoch": 0.29571037420139945, + "grad_norm": 1.5251234769821167, + "learning_rate": 4.984602432373537e-05, + "loss": 0.5368, + "step": 486 + }, + { + "epoch": 0.29631883176148466, + "grad_norm": 1.551039457321167, + "learning_rate": 4.984464171493233e-05, + "loss": 0.6079, + "step": 487 + }, + { + "epoch": 0.2969272893215698, + "grad_norm": 2.1228880882263184, + "learning_rate": 4.984325294569372e-05, + "loss": 0.6416, + "step": 488 + }, + { + "epoch": 0.297535746881655, + "grad_norm": 1.4991825819015503, + "learning_rate": 4.98418580163639e-05, + "loss": 0.4824, + "step": 489 + }, + { + "epoch": 0.29814420444174017, + "grad_norm": 1.746289610862732, + "learning_rate": 4.9840456927288734e-05, + "loss": 0.5908, + "step": 490 + }, + { + "epoch": 0.2987526620018254, + "grad_norm": 1.5192077159881592, + "learning_rate": 4.983904967881567e-05, + "loss": 0.6048, + "step": 491 + }, + { + "epoch": 0.2993611195619106, + "grad_norm": 1.415953278541565, + "learning_rate": 4.983763627129362e-05, + "loss": 0.5042, + "step": 492 + }, + { + "epoch": 0.29996957712199573, + "grad_norm": 1.255936622619629, + "learning_rate": 4.983621670507306e-05, + "loss": 0.5223, + "step": 493 + }, + { + "epoch": 0.30057803468208094, + "grad_norm": 1.3377435207366943, + "learning_rate": 4.9834790980505985e-05, + "loss": 0.553, + "step": 494 + }, + { + "epoch": 0.3011864922421661, + "grad_norm": 1.7899580001831055, + "learning_rate": 4.983335909794591e-05, + "loss": 0.4885, + "step": 495 + }, + { + "epoch": 0.3017949498022513, + "grad_norm": 1.4705740213394165, + "learning_rate": 4.983192105774788e-05, + "loss": 0.5556, + "step": 496 + }, + { + "epoch": 0.30240340736233645, + "grad_norm": 1.9178767204284668, + "learning_rate": 4.983047686026847e-05, + "loss": 0.4836, + "step": 497 + }, + { + "epoch": 0.30301186492242166, + "grad_norm": 1.54046630859375, + "learning_rate": 4.9829026505865794e-05, + "loss": 0.5707, + "step": 498 + }, + { + "epoch": 0.30362032248250687, + "grad_norm": 1.558304786682129, + "learning_rate": 4.982756999489947e-05, + "loss": 0.5615, + "step": 499 + }, + { + "epoch": 0.304228780042592, + "grad_norm": 1.782384991645813, + "learning_rate": 4.982610732773064e-05, + "loss": 0.4886, + "step": 500 + }, + { + "epoch": 0.3048372376026772, + "grad_norm": 1.3223114013671875, + "learning_rate": 4.9824638504722005e-05, + "loss": 0.517, + "step": 501 + }, + { + "epoch": 0.3054456951627624, + "grad_norm": 1.3240535259246826, + "learning_rate": 4.982316352623776e-05, + "loss": 0.5153, + "step": 502 + }, + { + "epoch": 0.3060541527228476, + "grad_norm": 1.610282301902771, + "learning_rate": 4.982168239264364e-05, + "loss": 0.5231, + "step": 503 + }, + { + "epoch": 0.3066626102829328, + "grad_norm": 1.6229352951049805, + "learning_rate": 4.982019510430691e-05, + "loss": 0.613, + "step": 504 + }, + { + "epoch": 0.30727106784301794, + "grad_norm": 1.3226631879806519, + "learning_rate": 4.981870166159635e-05, + "loss": 0.5142, + "step": 505 + }, + { + "epoch": 0.30787952540310315, + "grad_norm": 1.495801568031311, + "learning_rate": 4.981720206488226e-05, + "loss": 0.5711, + "step": 506 + }, + { + "epoch": 0.3084879829631883, + "grad_norm": 1.7586286067962646, + "learning_rate": 4.9815696314536504e-05, + "loss": 0.5152, + "step": 507 + }, + { + "epoch": 0.3090964405232735, + "grad_norm": 1.5324755907058716, + "learning_rate": 4.981418441093243e-05, + "loss": 0.5501, + "step": 508 + }, + { + "epoch": 0.30970489808335866, + "grad_norm": 1.775768756866455, + "learning_rate": 4.981266635444492e-05, + "loss": 0.6562, + "step": 509 + }, + { + "epoch": 0.31031335564344387, + "grad_norm": 1.3444887399673462, + "learning_rate": 4.98111421454504e-05, + "loss": 0.5188, + "step": 510 + }, + { + "epoch": 0.3109218132035291, + "grad_norm": 1.3959381580352783, + "learning_rate": 4.9809611784326815e-05, + "loss": 0.5093, + "step": 511 + }, + { + "epoch": 0.3115302707636142, + "grad_norm": 3.0742530822753906, + "learning_rate": 4.9808075271453616e-05, + "loss": 0.5801, + "step": 512 + }, + { + "epoch": 0.31213872832369943, + "grad_norm": 1.4250247478485107, + "learning_rate": 4.9806532607211797e-05, + "loss": 0.5579, + "step": 513 + }, + { + "epoch": 0.3127471858837846, + "grad_norm": 2.064180374145508, + "learning_rate": 4.980498379198389e-05, + "loss": 0.5433, + "step": 514 + }, + { + "epoch": 0.3133556434438698, + "grad_norm": 1.5153679847717285, + "learning_rate": 4.980342882615392e-05, + "loss": 0.5676, + "step": 515 + }, + { + "epoch": 0.313964101003955, + "grad_norm": 1.5368945598602295, + "learning_rate": 4.9801867710107454e-05, + "loss": 0.5181, + "step": 516 + }, + { + "epoch": 0.31457255856404015, + "grad_norm": 1.2389271259307861, + "learning_rate": 4.98003004442316e-05, + "loss": 0.4591, + "step": 517 + }, + { + "epoch": 0.31518101612412536, + "grad_norm": 1.1532279253005981, + "learning_rate": 4.979872702891495e-05, + "loss": 0.4422, + "step": 518 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 1.6176403760910034, + "learning_rate": 4.9797147464547664e-05, + "loss": 0.6151, + "step": 519 + }, + { + "epoch": 0.3163979312442957, + "grad_norm": 1.5165481567382812, + "learning_rate": 4.9795561751521405e-05, + "loss": 0.5284, + "step": 520 + }, + { + "epoch": 0.31700638880438087, + "grad_norm": 1.6209813356399536, + "learning_rate": 4.9793969890229364e-05, + "loss": 0.5867, + "step": 521 + }, + { + "epoch": 0.3176148463644661, + "grad_norm": 1.3948253393173218, + "learning_rate": 4.9792371881066245e-05, + "loss": 0.5734, + "step": 522 + }, + { + "epoch": 0.3182233039245513, + "grad_norm": 1.2350413799285889, + "learning_rate": 4.9790767724428304e-05, + "loss": 0.5079, + "step": 523 + }, + { + "epoch": 0.31883176148463643, + "grad_norm": 1.3993268013000488, + "learning_rate": 4.978915742071329e-05, + "loss": 0.5713, + "step": 524 + }, + { + "epoch": 0.31944021904472164, + "grad_norm": 1.3843022584915161, + "learning_rate": 4.978754097032051e-05, + "loss": 0.5258, + "step": 525 + }, + { + "epoch": 0.3200486766048068, + "grad_norm": 1.2739397287368774, + "learning_rate": 4.978591837365076e-05, + "loss": 0.5044, + "step": 526 + }, + { + "epoch": 0.320657134164892, + "grad_norm": 2.009359359741211, + "learning_rate": 4.978428963110638e-05, + "loss": 0.495, + "step": 527 + }, + { + "epoch": 0.3212655917249772, + "grad_norm": 1.5859858989715576, + "learning_rate": 4.978265474309123e-05, + "loss": 0.5793, + "step": 528 + }, + { + "epoch": 0.32187404928506236, + "grad_norm": 1.5144940614700317, + "learning_rate": 4.9781013710010696e-05, + "loss": 0.5305, + "step": 529 + }, + { + "epoch": 0.32248250684514757, + "grad_norm": 1.5362976789474487, + "learning_rate": 4.977936653227169e-05, + "loss": 0.6301, + "step": 530 + }, + { + "epoch": 0.3230909644052327, + "grad_norm": 1.516918420791626, + "learning_rate": 4.9777713210282636e-05, + "loss": 0.581, + "step": 531 + }, + { + "epoch": 0.3236994219653179, + "grad_norm": 1.3938144445419312, + "learning_rate": 4.977605374445349e-05, + "loss": 0.6385, + "step": 532 + }, + { + "epoch": 0.3243078795254031, + "grad_norm": 1.1043601036071777, + "learning_rate": 4.977438813519574e-05, + "loss": 0.4772, + "step": 533 + }, + { + "epoch": 0.3249163370854883, + "grad_norm": 1.2884318828582764, + "learning_rate": 4.977271638292237e-05, + "loss": 0.6186, + "step": 534 + }, + { + "epoch": 0.3255247946455735, + "grad_norm": 1.278027057647705, + "learning_rate": 4.9771038488047915e-05, + "loss": 0.5577, + "step": 535 + }, + { + "epoch": 0.32613325220565864, + "grad_norm": 1.1427042484283447, + "learning_rate": 4.976935445098843e-05, + "loss": 0.4842, + "step": 536 + }, + { + "epoch": 0.32674170976574385, + "grad_norm": 1.1449129581451416, + "learning_rate": 4.9767664272161474e-05, + "loss": 0.4771, + "step": 537 + }, + { + "epoch": 0.327350167325829, + "grad_norm": 1.4872303009033203, + "learning_rate": 4.976596795198615e-05, + "loss": 0.5961, + "step": 538 + }, + { + "epoch": 0.3279586248859142, + "grad_norm": 1.3000589609146118, + "learning_rate": 4.976426549088307e-05, + "loss": 0.5076, + "step": 539 + }, + { + "epoch": 0.3285670824459994, + "grad_norm": 1.8529880046844482, + "learning_rate": 4.976255688927436e-05, + "loss": 0.5203, + "step": 540 + }, + { + "epoch": 0.32917554000608457, + "grad_norm": 2.4158706665039062, + "learning_rate": 4.976084214758371e-05, + "loss": 0.5025, + "step": 541 + }, + { + "epoch": 0.3297839975661698, + "grad_norm": 1.3759514093399048, + "learning_rate": 4.9759121266236286e-05, + "loss": 0.5908, + "step": 542 + }, + { + "epoch": 0.3303924551262549, + "grad_norm": 1.3580955266952515, + "learning_rate": 4.97573942456588e-05, + "loss": 0.4575, + "step": 543 + }, + { + "epoch": 0.33100091268634013, + "grad_norm": 3.9890565872192383, + "learning_rate": 4.975566108627948e-05, + "loss": 0.4681, + "step": 544 + }, + { + "epoch": 0.33160937024642534, + "grad_norm": 1.6312041282653809, + "learning_rate": 4.975392178852808e-05, + "loss": 0.5273, + "step": 545 + }, + { + "epoch": 0.3322178278065105, + "grad_norm": 2.074263572692871, + "learning_rate": 4.9752176352835866e-05, + "loss": 0.5376, + "step": 546 + }, + { + "epoch": 0.3328262853665957, + "grad_norm": 2.1829655170440674, + "learning_rate": 4.975042477963564e-05, + "loss": 0.5093, + "step": 547 + }, + { + "epoch": 0.33343474292668085, + "grad_norm": 1.5592143535614014, + "learning_rate": 4.9748667069361715e-05, + "loss": 0.5319, + "step": 548 + }, + { + "epoch": 0.33404320048676606, + "grad_norm": 1.8820182085037231, + "learning_rate": 4.974690322244994e-05, + "loss": 0.5467, + "step": 549 + }, + { + "epoch": 0.3346516580468512, + "grad_norm": 1.3456025123596191, + "learning_rate": 4.974513323933766e-05, + "loss": 0.5365, + "step": 550 + }, + { + "epoch": 0.3352601156069364, + "grad_norm": 1.2476035356521606, + "learning_rate": 4.974335712046376e-05, + "loss": 0.4927, + "step": 551 + }, + { + "epoch": 0.3358685731670216, + "grad_norm": 1.5756597518920898, + "learning_rate": 4.974157486626866e-05, + "loss": 0.5539, + "step": 552 + }, + { + "epoch": 0.3364770307271068, + "grad_norm": 1.4776535034179688, + "learning_rate": 4.973978647719426e-05, + "loss": 0.5584, + "step": 553 + }, + { + "epoch": 0.337085488287192, + "grad_norm": 2.738008975982666, + "learning_rate": 4.9737991953684024e-05, + "loss": 0.5617, + "step": 554 + }, + { + "epoch": 0.33769394584727713, + "grad_norm": 1.5317890644073486, + "learning_rate": 4.973619129618292e-05, + "loss": 0.5562, + "step": 555 + }, + { + "epoch": 0.33830240340736234, + "grad_norm": 1.49234938621521, + "learning_rate": 4.973438450513743e-05, + "loss": 0.5217, + "step": 556 + }, + { + "epoch": 0.33891086096744755, + "grad_norm": 1.301456093788147, + "learning_rate": 4.973257158099556e-05, + "loss": 0.4828, + "step": 557 + }, + { + "epoch": 0.3395193185275327, + "grad_norm": 2.2302095890045166, + "learning_rate": 4.9730752524206835e-05, + "loss": 0.5493, + "step": 558 + }, + { + "epoch": 0.3401277760876179, + "grad_norm": 1.5827584266662598, + "learning_rate": 4.972892733522232e-05, + "loss": 0.5375, + "step": 559 + }, + { + "epoch": 0.34073623364770306, + "grad_norm": 1.3773258924484253, + "learning_rate": 4.972709601449458e-05, + "loss": 0.5809, + "step": 560 + }, + { + "epoch": 0.34134469120778826, + "grad_norm": 1.2351981401443481, + "learning_rate": 4.972525856247769e-05, + "loss": 0.4553, + "step": 561 + }, + { + "epoch": 0.3419531487678734, + "grad_norm": 1.5786877870559692, + "learning_rate": 4.972341497962729e-05, + "loss": 0.5861, + "step": 562 + }, + { + "epoch": 0.3425616063279586, + "grad_norm": 1.3937610387802124, + "learning_rate": 4.97215652664005e-05, + "loss": 0.5209, + "step": 563 + }, + { + "epoch": 0.34317006388804383, + "grad_norm": 1.723417043685913, + "learning_rate": 4.971970942325597e-05, + "loss": 0.6222, + "step": 564 + }, + { + "epoch": 0.343778521448129, + "grad_norm": 1.3207844495773315, + "learning_rate": 4.971784745065386e-05, + "loss": 0.547, + "step": 565 + }, + { + "epoch": 0.3443869790082142, + "grad_norm": 1.4648035764694214, + "learning_rate": 4.971597934905587e-05, + "loss": 0.4984, + "step": 566 + }, + { + "epoch": 0.34499543656829934, + "grad_norm": 1.2435007095336914, + "learning_rate": 4.971410511892523e-05, + "loss": 0.5244, + "step": 567 + }, + { + "epoch": 0.34560389412838455, + "grad_norm": 1.309339165687561, + "learning_rate": 4.971222476072665e-05, + "loss": 0.5165, + "step": 568 + }, + { + "epoch": 0.34621235168846975, + "grad_norm": 1.554057240486145, + "learning_rate": 4.9710338274926384e-05, + "loss": 0.6061, + "step": 569 + }, + { + "epoch": 0.3468208092485549, + "grad_norm": 1.3458071947097778, + "learning_rate": 4.97084456619922e-05, + "loss": 0.5489, + "step": 570 + }, + { + "epoch": 0.3474292668086401, + "grad_norm": 1.2686628103256226, + "learning_rate": 4.9706546922393396e-05, + "loss": 0.4569, + "step": 571 + }, + { + "epoch": 0.34803772436872527, + "grad_norm": 2.3073008060455322, + "learning_rate": 4.970464205660077e-05, + "loss": 0.4969, + "step": 572 + }, + { + "epoch": 0.3486461819288105, + "grad_norm": 1.3341611623764038, + "learning_rate": 4.970273106508666e-05, + "loss": 0.501, + "step": 573 + }, + { + "epoch": 0.3492546394888956, + "grad_norm": 1.5147910118103027, + "learning_rate": 4.97008139483249e-05, + "loss": 0.5889, + "step": 574 + }, + { + "epoch": 0.34986309704898083, + "grad_norm": 2.7254374027252197, + "learning_rate": 4.9698890706790866e-05, + "loss": 0.598, + "step": 575 + }, + { + "epoch": 0.35047155460906604, + "grad_norm": 1.4066073894500732, + "learning_rate": 4.969696134096143e-05, + "loss": 0.5344, + "step": 576 + }, + { + "epoch": 0.3510800121691512, + "grad_norm": 1.7791637182235718, + "learning_rate": 4.969502585131502e-05, + "loss": 0.4642, + "step": 577 + }, + { + "epoch": 0.3516884697292364, + "grad_norm": 1.9573460817337036, + "learning_rate": 4.969308423833152e-05, + "loss": 0.5322, + "step": 578 + }, + { + "epoch": 0.35229692728932155, + "grad_norm": 1.4854366779327393, + "learning_rate": 4.96911365024924e-05, + "loss": 0.486, + "step": 579 + }, + { + "epoch": 0.35290538484940676, + "grad_norm": 1.8296064138412476, + "learning_rate": 4.968918264428059e-05, + "loss": 0.5649, + "step": 580 + }, + { + "epoch": 0.35351384240949196, + "grad_norm": 1.3343514204025269, + "learning_rate": 4.9687222664180585e-05, + "loss": 0.524, + "step": 581 + }, + { + "epoch": 0.3541222999695771, + "grad_norm": 1.521429419517517, + "learning_rate": 4.968525656267838e-05, + "loss": 0.5018, + "step": 582 + }, + { + "epoch": 0.3547307575296623, + "grad_norm": 2.845484972000122, + "learning_rate": 4.968328434026148e-05, + "loss": 0.6363, + "step": 583 + }, + { + "epoch": 0.3553392150897475, + "grad_norm": 1.4245471954345703, + "learning_rate": 4.9681305997418906e-05, + "loss": 0.5181, + "step": 584 + }, + { + "epoch": 0.3559476726498327, + "grad_norm": 1.27480149269104, + "learning_rate": 4.9679321534641214e-05, + "loss": 0.5343, + "step": 585 + }, + { + "epoch": 0.35655613020991783, + "grad_norm": 1.2429792881011963, + "learning_rate": 4.967733095242047e-05, + "loss": 0.4989, + "step": 586 + }, + { + "epoch": 0.35716458777000304, + "grad_norm": 1.9116991758346558, + "learning_rate": 4.967533425125025e-05, + "loss": 0.611, + "step": 587 + }, + { + "epoch": 0.35777304533008825, + "grad_norm": 1.498067855834961, + "learning_rate": 4.967333143162565e-05, + "loss": 0.5896, + "step": 588 + }, + { + "epoch": 0.3583815028901734, + "grad_norm": 1.4009770154953003, + "learning_rate": 4.967132249404329e-05, + "loss": 0.5002, + "step": 589 + }, + { + "epoch": 0.3589899604502586, + "grad_norm": 1.346002459526062, + "learning_rate": 4.966930743900131e-05, + "loss": 0.5411, + "step": 590 + }, + { + "epoch": 0.35959841801034376, + "grad_norm": 1.3442580699920654, + "learning_rate": 4.9667286266999354e-05, + "loss": 0.5158, + "step": 591 + }, + { + "epoch": 0.36020687557042896, + "grad_norm": 1.6261683702468872, + "learning_rate": 4.966525897853858e-05, + "loss": 0.6546, + "step": 592 + }, + { + "epoch": 0.36081533313051417, + "grad_norm": 1.5490068197250366, + "learning_rate": 4.966322557412168e-05, + "loss": 0.5536, + "step": 593 + }, + { + "epoch": 0.3614237906905993, + "grad_norm": 1.3161338567733765, + "learning_rate": 4.966118605425285e-05, + "loss": 0.5315, + "step": 594 + }, + { + "epoch": 0.36203224825068453, + "grad_norm": 1.6688951253890991, + "learning_rate": 4.965914041943781e-05, + "loss": 0.5392, + "step": 595 + }, + { + "epoch": 0.3626407058107697, + "grad_norm": 1.894654393196106, + "learning_rate": 4.9657088670183794e-05, + "loss": 0.5337, + "step": 596 + }, + { + "epoch": 0.3632491633708549, + "grad_norm": 1.5747034549713135, + "learning_rate": 4.9655030806999534e-05, + "loss": 0.5689, + "step": 597 + }, + { + "epoch": 0.36385762093094004, + "grad_norm": 1.315900206565857, + "learning_rate": 4.965296683039532e-05, + "loss": 0.5897, + "step": 598 + }, + { + "epoch": 0.36446607849102525, + "grad_norm": 1.3753433227539062, + "learning_rate": 4.9650896740882905e-05, + "loss": 0.5475, + "step": 599 + }, + { + "epoch": 0.36507453605111045, + "grad_norm": 1.546858787536621, + "learning_rate": 4.96488205389756e-05, + "loss": 0.5734, + "step": 600 + }, + { + "epoch": 0.3656829936111956, + "grad_norm": 1.2179769277572632, + "learning_rate": 4.9646738225188226e-05, + "loss": 0.4914, + "step": 601 + }, + { + "epoch": 0.3662914511712808, + "grad_norm": 1.370540738105774, + "learning_rate": 4.964464980003709e-05, + "loss": 0.4964, + "step": 602 + }, + { + "epoch": 0.36689990873136596, + "grad_norm": 1.3386831283569336, + "learning_rate": 4.9642555264040046e-05, + "loss": 0.5037, + "step": 603 + }, + { + "epoch": 0.36750836629145117, + "grad_norm": 1.3772310018539429, + "learning_rate": 4.964045461771645e-05, + "loss": 0.4962, + "step": 604 + }, + { + "epoch": 0.3681168238515364, + "grad_norm": 1.5445538759231567, + "learning_rate": 4.963834786158717e-05, + "loss": 0.4728, + "step": 605 + }, + { + "epoch": 0.36872528141162153, + "grad_norm": 1.419505000114441, + "learning_rate": 4.96362349961746e-05, + "loss": 0.502, + "step": 606 + }, + { + "epoch": 0.36933373897170674, + "grad_norm": 1.3720256090164185, + "learning_rate": 4.963411602200264e-05, + "loss": 0.5714, + "step": 607 + }, + { + "epoch": 0.3699421965317919, + "grad_norm": 1.3194488286972046, + "learning_rate": 4.963199093959671e-05, + "loss": 0.5201, + "step": 608 + }, + { + "epoch": 0.3705506540918771, + "grad_norm": 1.3677685260772705, + "learning_rate": 4.962985974948373e-05, + "loss": 0.5827, + "step": 609 + }, + { + "epoch": 0.37115911165196225, + "grad_norm": 1.1780776977539062, + "learning_rate": 4.9627722452192164e-05, + "loss": 0.4458, + "step": 610 + }, + { + "epoch": 0.37176756921204746, + "grad_norm": 1.2468417882919312, + "learning_rate": 4.962557904825196e-05, + "loss": 0.5893, + "step": 611 + }, + { + "epoch": 0.37237602677213266, + "grad_norm": 1.4214433431625366, + "learning_rate": 4.962342953819459e-05, + "loss": 0.5349, + "step": 612 + }, + { + "epoch": 0.3729844843322178, + "grad_norm": 1.3830993175506592, + "learning_rate": 4.9621273922553055e-05, + "loss": 0.4956, + "step": 613 + }, + { + "epoch": 0.373592941892303, + "grad_norm": 1.322393774986267, + "learning_rate": 4.961911220186186e-05, + "loss": 0.528, + "step": 614 + }, + { + "epoch": 0.3742013994523882, + "grad_norm": 1.3154321908950806, + "learning_rate": 4.961694437665701e-05, + "loss": 0.4628, + "step": 615 + }, + { + "epoch": 0.3748098570124734, + "grad_norm": 1.4259490966796875, + "learning_rate": 4.9614770447476037e-05, + "loss": 0.5198, + "step": 616 + }, + { + "epoch": 0.3754183145725586, + "grad_norm": 1.643979549407959, + "learning_rate": 4.961259041485799e-05, + "loss": 0.4984, + "step": 617 + }, + { + "epoch": 0.37602677213264374, + "grad_norm": 1.36431884765625, + "learning_rate": 4.9610404279343415e-05, + "loss": 0.5148, + "step": 618 + }, + { + "epoch": 0.37663522969272895, + "grad_norm": 1.2996978759765625, + "learning_rate": 4.96082120414744e-05, + "loss": 0.5277, + "step": 619 + }, + { + "epoch": 0.3772436872528141, + "grad_norm": 1.7564353942871094, + "learning_rate": 4.960601370179452e-05, + "loss": 0.5884, + "step": 620 + }, + { + "epoch": 0.3778521448128993, + "grad_norm": 1.3980865478515625, + "learning_rate": 4.9603809260848864e-05, + "loss": 0.4982, + "step": 621 + }, + { + "epoch": 0.3784606023729845, + "grad_norm": 1.2997080087661743, + "learning_rate": 4.960159871918405e-05, + "loss": 0.5634, + "step": 622 + }, + { + "epoch": 0.37906905993306966, + "grad_norm": 1.2841782569885254, + "learning_rate": 4.9599382077348205e-05, + "loss": 0.4826, + "step": 623 + }, + { + "epoch": 0.37967751749315487, + "grad_norm": 1.347280502319336, + "learning_rate": 4.959715933589095e-05, + "loss": 0.5076, + "step": 624 + }, + { + "epoch": 0.38028597505324, + "grad_norm": 1.6214102506637573, + "learning_rate": 4.9594930495363445e-05, + "loss": 0.5878, + "step": 625 + }, + { + "epoch": 0.38089443261332523, + "grad_norm": 1.3995033502578735, + "learning_rate": 4.959269555631835e-05, + "loss": 0.476, + "step": 626 + }, + { + "epoch": 0.3815028901734104, + "grad_norm": 1.3252878189086914, + "learning_rate": 4.959045451930982e-05, + "loss": 0.5469, + "step": 627 + }, + { + "epoch": 0.3821113477334956, + "grad_norm": 1.2933142185211182, + "learning_rate": 4.958820738489355e-05, + "loss": 0.535, + "step": 628 + }, + { + "epoch": 0.3827198052935808, + "grad_norm": 1.3921632766723633, + "learning_rate": 4.958595415362675e-05, + "loss": 0.5282, + "step": 629 + }, + { + "epoch": 0.38332826285366595, + "grad_norm": 1.2744975090026855, + "learning_rate": 4.95836948260681e-05, + "loss": 0.5035, + "step": 630 + }, + { + "epoch": 0.38393672041375115, + "grad_norm": 2.0861146450042725, + "learning_rate": 4.9581429402777826e-05, + "loss": 0.5104, + "step": 631 + }, + { + "epoch": 0.3845451779738363, + "grad_norm": 1.7195478677749634, + "learning_rate": 4.957915788431768e-05, + "loss": 0.5618, + "step": 632 + }, + { + "epoch": 0.3851536355339215, + "grad_norm": 1.2929902076721191, + "learning_rate": 4.957688027125088e-05, + "loss": 0.4858, + "step": 633 + }, + { + "epoch": 0.3857620930940067, + "grad_norm": 1.324310064315796, + "learning_rate": 4.957459656414219e-05, + "loss": 0.576, + "step": 634 + }, + { + "epoch": 0.38637055065409187, + "grad_norm": 1.3465945720672607, + "learning_rate": 4.957230676355787e-05, + "loss": 0.5138, + "step": 635 + }, + { + "epoch": 0.3869790082141771, + "grad_norm": 1.2938052415847778, + "learning_rate": 4.95700108700657e-05, + "loss": 0.5101, + "step": 636 + }, + { + "epoch": 0.38758746577426223, + "grad_norm": 1.6239326000213623, + "learning_rate": 4.956770888423495e-05, + "loss": 0.5932, + "step": 637 + }, + { + "epoch": 0.38819592333434744, + "grad_norm": 1.3024382591247559, + "learning_rate": 4.9565400806636447e-05, + "loss": 0.5088, + "step": 638 + }, + { + "epoch": 0.3888043808944326, + "grad_norm": 1.557921051979065, + "learning_rate": 4.956308663784247e-05, + "loss": 0.5358, + "step": 639 + }, + { + "epoch": 0.3894128384545178, + "grad_norm": 1.2118357419967651, + "learning_rate": 4.956076637842685e-05, + "loss": 0.5089, + "step": 640 + }, + { + "epoch": 0.390021296014603, + "grad_norm": 1.4801642894744873, + "learning_rate": 4.9558440028964914e-05, + "loss": 0.4758, + "step": 641 + }, + { + "epoch": 0.39062975357468815, + "grad_norm": 1.3965163230895996, + "learning_rate": 4.95561075900335e-05, + "loss": 0.5498, + "step": 642 + }, + { + "epoch": 0.39123821113477336, + "grad_norm": 1.5353859663009644, + "learning_rate": 4.955376906221094e-05, + "loss": 0.4723, + "step": 643 + }, + { + "epoch": 0.3918466686948585, + "grad_norm": 1.248234748840332, + "learning_rate": 4.955142444607711e-05, + "loss": 0.5461, + "step": 644 + }, + { + "epoch": 0.3924551262549437, + "grad_norm": 1.5625355243682861, + "learning_rate": 4.9549073742213375e-05, + "loss": 0.5561, + "step": 645 + }, + { + "epoch": 0.3930635838150289, + "grad_norm": 1.327852487564087, + "learning_rate": 4.9546716951202606e-05, + "loss": 0.4967, + "step": 646 + }, + { + "epoch": 0.3936720413751141, + "grad_norm": 1.3109062910079956, + "learning_rate": 4.9544354073629186e-05, + "loss": 0.5108, + "step": 647 + }, + { + "epoch": 0.3942804989351993, + "grad_norm": 1.6410901546478271, + "learning_rate": 4.954198511007902e-05, + "loss": 0.5998, + "step": 648 + }, + { + "epoch": 0.39488895649528444, + "grad_norm": 1.4157142639160156, + "learning_rate": 4.9539610061139505e-05, + "loss": 0.54, + "step": 649 + }, + { + "epoch": 0.39549741405536964, + "grad_norm": 1.2379719018936157, + "learning_rate": 4.953722892739956e-05, + "loss": 0.4716, + "step": 650 + }, + { + "epoch": 0.3961058716154548, + "grad_norm": 1.3392927646636963, + "learning_rate": 4.95348417094496e-05, + "loss": 0.4665, + "step": 651 + }, + { + "epoch": 0.39671432917554, + "grad_norm": 1.3489748239517212, + "learning_rate": 4.953244840788156e-05, + "loss": 0.5533, + "step": 652 + }, + { + "epoch": 0.3973227867356252, + "grad_norm": 1.4216614961624146, + "learning_rate": 4.953004902328887e-05, + "loss": 0.5107, + "step": 653 + }, + { + "epoch": 0.39793124429571036, + "grad_norm": 1.2721261978149414, + "learning_rate": 4.9527643556266493e-05, + "loss": 0.5454, + "step": 654 + }, + { + "epoch": 0.39853970185579557, + "grad_norm": 1.292842984199524, + "learning_rate": 4.952523200741088e-05, + "loss": 0.4989, + "step": 655 + }, + { + "epoch": 0.3991481594158807, + "grad_norm": 1.2680416107177734, + "learning_rate": 4.952281437731998e-05, + "loss": 0.4913, + "step": 656 + }, + { + "epoch": 0.39975661697596593, + "grad_norm": 1.3643546104431152, + "learning_rate": 4.9520390666593286e-05, + "loss": 0.5144, + "step": 657 + }, + { + "epoch": 0.40036507453605114, + "grad_norm": 1.310077428817749, + "learning_rate": 4.951796087583176e-05, + "loss": 0.5468, + "step": 658 + }, + { + "epoch": 0.4009735320961363, + "grad_norm": 1.3737618923187256, + "learning_rate": 4.95155250056379e-05, + "loss": 0.5958, + "step": 659 + }, + { + "epoch": 0.4015819896562215, + "grad_norm": 1.2556304931640625, + "learning_rate": 4.9513083056615695e-05, + "loss": 0.5171, + "step": 660 + }, + { + "epoch": 0.40219044721630665, + "grad_norm": 1.2657099962234497, + "learning_rate": 4.9510635029370646e-05, + "loss": 0.5154, + "step": 661 + }, + { + "epoch": 0.40279890477639185, + "grad_norm": 1.3634426593780518, + "learning_rate": 4.9508180924509763e-05, + "loss": 0.4383, + "step": 662 + }, + { + "epoch": 0.403407362336477, + "grad_norm": 1.582513689994812, + "learning_rate": 4.950572074264156e-05, + "loss": 0.4719, + "step": 663 + }, + { + "epoch": 0.4040158198965622, + "grad_norm": 2.247115135192871, + "learning_rate": 4.9503254484376074e-05, + "loss": 0.5479, + "step": 664 + }, + { + "epoch": 0.4046242774566474, + "grad_norm": 1.3867719173431396, + "learning_rate": 4.950078215032481e-05, + "loss": 0.5422, + "step": 665 + }, + { + "epoch": 0.40523273501673257, + "grad_norm": 1.2318755388259888, + "learning_rate": 4.949830374110081e-05, + "loss": 0.5662, + "step": 666 + }, + { + "epoch": 0.4058411925768178, + "grad_norm": 1.5015090703964233, + "learning_rate": 4.9495819257318635e-05, + "loss": 0.5895, + "step": 667 + }, + { + "epoch": 0.40644965013690293, + "grad_norm": 1.2691212892532349, + "learning_rate": 4.949332869959432e-05, + "loss": 0.4953, + "step": 668 + }, + { + "epoch": 0.40705810769698814, + "grad_norm": 1.7610334157943726, + "learning_rate": 4.9490832068545414e-05, + "loss": 0.5327, + "step": 669 + }, + { + "epoch": 0.40766656525707334, + "grad_norm": 1.3113877773284912, + "learning_rate": 4.9488329364790986e-05, + "loss": 0.5369, + "step": 670 + }, + { + "epoch": 0.4082750228171585, + "grad_norm": 1.3471949100494385, + "learning_rate": 4.948582058895159e-05, + "loss": 0.4791, + "step": 671 + }, + { + "epoch": 0.4088834803772437, + "grad_norm": 1.1278780698776245, + "learning_rate": 4.9483305741649324e-05, + "loss": 0.4704, + "step": 672 + }, + { + "epoch": 0.40949193793732885, + "grad_norm": 1.2446277141571045, + "learning_rate": 4.948078482350774e-05, + "loss": 0.4728, + "step": 673 + }, + { + "epoch": 0.41010039549741406, + "grad_norm": 1.507495641708374, + "learning_rate": 4.947825783515193e-05, + "loss": 0.6533, + "step": 674 + }, + { + "epoch": 0.4107088530574992, + "grad_norm": 1.4783514738082886, + "learning_rate": 4.9475724777208474e-05, + "loss": 0.5185, + "step": 675 + }, + { + "epoch": 0.4113173106175844, + "grad_norm": 1.186607003211975, + "learning_rate": 4.947318565030548e-05, + "loss": 0.4684, + "step": 676 + }, + { + "epoch": 0.4119257681776696, + "grad_norm": 1.4391772747039795, + "learning_rate": 4.947064045507253e-05, + "loss": 0.5561, + "step": 677 + }, + { + "epoch": 0.4125342257377548, + "grad_norm": 1.3930890560150146, + "learning_rate": 4.946808919214074e-05, + "loss": 0.5353, + "step": 678 + }, + { + "epoch": 0.41314268329784, + "grad_norm": 1.2394658327102661, + "learning_rate": 4.946553186214271e-05, + "loss": 0.4428, + "step": 679 + }, + { + "epoch": 0.41375114085792514, + "grad_norm": 1.2956104278564453, + "learning_rate": 4.9462968465712555e-05, + "loss": 0.4658, + "step": 680 + }, + { + "epoch": 0.41435959841801034, + "grad_norm": 2.5605669021606445, + "learning_rate": 4.946039900348588e-05, + "loss": 0.5784, + "step": 681 + }, + { + "epoch": 0.41496805597809555, + "grad_norm": 1.3224326372146606, + "learning_rate": 4.945782347609982e-05, + "loss": 0.4152, + "step": 682 + }, + { + "epoch": 0.4155765135381807, + "grad_norm": 1.4174445867538452, + "learning_rate": 4.945524188419298e-05, + "loss": 0.5388, + "step": 683 + }, + { + "epoch": 0.4161849710982659, + "grad_norm": 1.2914178371429443, + "learning_rate": 4.9452654228405506e-05, + "loss": 0.4773, + "step": 684 + }, + { + "epoch": 0.41679342865835106, + "grad_norm": 1.3811676502227783, + "learning_rate": 4.945006050937902e-05, + "loss": 0.551, + "step": 685 + }, + { + "epoch": 0.41740188621843627, + "grad_norm": 1.2903940677642822, + "learning_rate": 4.944746072775665e-05, + "loss": 0.535, + "step": 686 + }, + { + "epoch": 0.4180103437785214, + "grad_norm": 1.8577662706375122, + "learning_rate": 4.9444854884183046e-05, + "loss": 0.533, + "step": 687 + }, + { + "epoch": 0.4186188013386066, + "grad_norm": 1.3438717126846313, + "learning_rate": 4.944224297930434e-05, + "loss": 0.5539, + "step": 688 + }, + { + "epoch": 0.41922725889869183, + "grad_norm": 1.184409260749817, + "learning_rate": 4.943962501376818e-05, + "loss": 0.4907, + "step": 689 + }, + { + "epoch": 0.419835716458777, + "grad_norm": 1.1499260663986206, + "learning_rate": 4.9437000988223705e-05, + "loss": 0.4658, + "step": 690 + }, + { + "epoch": 0.4204441740188622, + "grad_norm": 1.16009521484375, + "learning_rate": 4.9434370903321566e-05, + "loss": 0.5088, + "step": 691 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 1.2988529205322266, + "learning_rate": 4.943173475971393e-05, + "loss": 0.5152, + "step": 692 + }, + { + "epoch": 0.42166108913903255, + "grad_norm": 1.246013879776001, + "learning_rate": 4.942909255805443e-05, + "loss": 0.4863, + "step": 693 + }, + { + "epoch": 0.42226954669911776, + "grad_norm": 1.4207082986831665, + "learning_rate": 4.942644429899824e-05, + "loss": 0.5582, + "step": 694 + }, + { + "epoch": 0.4228780042592029, + "grad_norm": 1.1736639738082886, + "learning_rate": 4.9423789983201994e-05, + "loss": 0.4872, + "step": 695 + }, + { + "epoch": 0.4234864618192881, + "grad_norm": 1.3265244960784912, + "learning_rate": 4.942112961132388e-05, + "loss": 0.4979, + "step": 696 + }, + { + "epoch": 0.42409491937937327, + "grad_norm": 1.4081310033798218, + "learning_rate": 4.941846318402353e-05, + "loss": 0.5683, + "step": 697 + }, + { + "epoch": 0.4247033769394585, + "grad_norm": 1.3100318908691406, + "learning_rate": 4.941579070196214e-05, + "loss": 0.5316, + "step": 698 + }, + { + "epoch": 0.4253118344995437, + "grad_norm": 1.369620442390442, + "learning_rate": 4.9413112165802345e-05, + "loss": 0.5076, + "step": 699 + }, + { + "epoch": 0.42592029205962884, + "grad_norm": 1.268389105796814, + "learning_rate": 4.9410427576208316e-05, + "loss": 0.4792, + "step": 700 + }, + { + "epoch": 0.42652874961971404, + "grad_norm": 1.2050520181655884, + "learning_rate": 4.940773693384574e-05, + "loss": 0.4954, + "step": 701 + }, + { + "epoch": 0.4271372071797992, + "grad_norm": 1.5060069561004639, + "learning_rate": 4.940504023938176e-05, + "loss": 0.5535, + "step": 702 + }, + { + "epoch": 0.4277456647398844, + "grad_norm": 1.380094289779663, + "learning_rate": 4.940233749348505e-05, + "loss": 0.48, + "step": 703 + }, + { + "epoch": 0.42835412229996955, + "grad_norm": 1.2237451076507568, + "learning_rate": 4.9399628696825786e-05, + "loss": 0.4835, + "step": 704 + }, + { + "epoch": 0.42896257986005476, + "grad_norm": 1.639406442642212, + "learning_rate": 4.9396913850075636e-05, + "loss": 0.5267, + "step": 705 + }, + { + "epoch": 0.42957103742013997, + "grad_norm": 1.3455315828323364, + "learning_rate": 4.9394192953907757e-05, + "loss": 0.5586, + "step": 706 + }, + { + "epoch": 0.4301794949802251, + "grad_norm": 1.4164340496063232, + "learning_rate": 4.939146600899683e-05, + "loss": 0.4524, + "step": 707 + }, + { + "epoch": 0.4307879525403103, + "grad_norm": 1.3369954824447632, + "learning_rate": 4.938873301601902e-05, + "loss": 0.5494, + "step": 708 + }, + { + "epoch": 0.4313964101003955, + "grad_norm": 1.3146734237670898, + "learning_rate": 4.938599397565199e-05, + "loss": 0.4943, + "step": 709 + }, + { + "epoch": 0.4320048676604807, + "grad_norm": 1.4245113134384155, + "learning_rate": 4.9383248888574916e-05, + "loss": 0.5151, + "step": 710 + }, + { + "epoch": 0.4326133252205659, + "grad_norm": 1.6800333261489868, + "learning_rate": 4.938049775546846e-05, + "loss": 0.5604, + "step": 711 + }, + { + "epoch": 0.43322178278065104, + "grad_norm": 1.3402103185653687, + "learning_rate": 4.9377740577014784e-05, + "loss": 0.5283, + "step": 712 + }, + { + "epoch": 0.43383024034073625, + "grad_norm": 1.3066250085830688, + "learning_rate": 4.9374977353897566e-05, + "loss": 0.4636, + "step": 713 + }, + { + "epoch": 0.4344386979008214, + "grad_norm": 1.3179736137390137, + "learning_rate": 4.937220808680196e-05, + "loss": 0.471, + "step": 714 + }, + { + "epoch": 0.4350471554609066, + "grad_norm": 1.3768038749694824, + "learning_rate": 4.9369432776414634e-05, + "loss": 0.538, + "step": 715 + }, + { + "epoch": 0.43565561302099176, + "grad_norm": 1.1771548986434937, + "learning_rate": 4.936665142342375e-05, + "loss": 0.4856, + "step": 716 + }, + { + "epoch": 0.43626407058107697, + "grad_norm": 2.29483962059021, + "learning_rate": 4.936386402851896e-05, + "loss": 0.4863, + "step": 717 + }, + { + "epoch": 0.4368725281411622, + "grad_norm": 1.554796576499939, + "learning_rate": 4.936107059239143e-05, + "loss": 0.5158, + "step": 718 + }, + { + "epoch": 0.4374809857012473, + "grad_norm": 1.3302370309829712, + "learning_rate": 4.935827111573381e-05, + "loss": 0.5295, + "step": 719 + }, + { + "epoch": 0.43808944326133253, + "grad_norm": 1.2260332107543945, + "learning_rate": 4.9355465599240265e-05, + "loss": 0.5124, + "step": 720 + }, + { + "epoch": 0.4386979008214177, + "grad_norm": 1.272337555885315, + "learning_rate": 4.935265404360643e-05, + "loss": 0.4962, + "step": 721 + }, + { + "epoch": 0.4393063583815029, + "grad_norm": 1.1549131870269775, + "learning_rate": 4.9349836449529463e-05, + "loss": 0.5208, + "step": 722 + }, + { + "epoch": 0.4399148159415881, + "grad_norm": 1.3710724115371704, + "learning_rate": 4.9347012817708e-05, + "loss": 0.5462, + "step": 723 + }, + { + "epoch": 0.44052327350167325, + "grad_norm": 1.3693798780441284, + "learning_rate": 4.93441831488422e-05, + "loss": 0.5133, + "step": 724 + }, + { + "epoch": 0.44113173106175846, + "grad_norm": 1.2647064924240112, + "learning_rate": 4.934134744363369e-05, + "loss": 0.5089, + "step": 725 + }, + { + "epoch": 0.4417401886218436, + "grad_norm": 1.302121877670288, + "learning_rate": 4.933850570278562e-05, + "loss": 0.4566, + "step": 726 + }, + { + "epoch": 0.4423486461819288, + "grad_norm": 1.633015751838684, + "learning_rate": 4.933565792700261e-05, + "loss": 0.5596, + "step": 727 + }, + { + "epoch": 0.44295710374201397, + "grad_norm": 1.376413345336914, + "learning_rate": 4.9332804116990795e-05, + "loss": 0.5597, + "step": 728 + }, + { + "epoch": 0.4435655613020992, + "grad_norm": 1.3377238512039185, + "learning_rate": 4.9329944273457794e-05, + "loss": 0.4782, + "step": 729 + }, + { + "epoch": 0.4441740188621844, + "grad_norm": 1.3485767841339111, + "learning_rate": 4.932707839711273e-05, + "loss": 0.504, + "step": 730 + }, + { + "epoch": 0.44478247642226953, + "grad_norm": 1.3925731182098389, + "learning_rate": 4.9324206488666244e-05, + "loss": 0.5548, + "step": 731 + }, + { + "epoch": 0.44539093398235474, + "grad_norm": 1.1115320920944214, + "learning_rate": 4.9321328548830426e-05, + "loss": 0.5035, + "step": 732 + }, + { + "epoch": 0.4459993915424399, + "grad_norm": 1.2374329566955566, + "learning_rate": 4.9318444578318886e-05, + "loss": 0.47, + "step": 733 + }, + { + "epoch": 0.4466078491025251, + "grad_norm": 1.378677487373352, + "learning_rate": 4.931555457784674e-05, + "loss": 0.5206, + "step": 734 + }, + { + "epoch": 0.4472163066626103, + "grad_norm": 1.3036342859268188, + "learning_rate": 4.931265854813057e-05, + "loss": 0.4869, + "step": 735 + }, + { + "epoch": 0.44782476422269546, + "grad_norm": 1.1578119993209839, + "learning_rate": 4.930975648988849e-05, + "loss": 0.495, + "step": 736 + }, + { + "epoch": 0.44843322178278067, + "grad_norm": 1.815414309501648, + "learning_rate": 4.930684840384008e-05, + "loss": 0.6301, + "step": 737 + }, + { + "epoch": 0.4490416793428658, + "grad_norm": 1.4656637907028198, + "learning_rate": 4.9303934290706424e-05, + "loss": 0.4809, + "step": 738 + }, + { + "epoch": 0.449650136902951, + "grad_norm": 1.5585298538208008, + "learning_rate": 4.93010141512101e-05, + "loss": 0.5067, + "step": 739 + }, + { + "epoch": 0.4502585944630362, + "grad_norm": 1.466367483139038, + "learning_rate": 4.929808798607518e-05, + "loss": 0.5188, + "step": 740 + }, + { + "epoch": 0.4508670520231214, + "grad_norm": 1.1880073547363281, + "learning_rate": 4.9295155796027244e-05, + "loss": 0.5133, + "step": 741 + }, + { + "epoch": 0.4514755095832066, + "grad_norm": 1.425815224647522, + "learning_rate": 4.929221758179333e-05, + "loss": 0.5826, + "step": 742 + }, + { + "epoch": 0.45208396714329174, + "grad_norm": 1.4366766214370728, + "learning_rate": 4.9289273344102014e-05, + "loss": 0.5136, + "step": 743 + }, + { + "epoch": 0.45269242470337695, + "grad_norm": 1.2157377004623413, + "learning_rate": 4.928632308368334e-05, + "loss": 0.4507, + "step": 744 + }, + { + "epoch": 0.4533008822634621, + "grad_norm": 1.1545383930206299, + "learning_rate": 4.928336680126884e-05, + "loss": 0.4523, + "step": 745 + }, + { + "epoch": 0.4539093398235473, + "grad_norm": 1.2865034341812134, + "learning_rate": 4.9280404497591545e-05, + "loss": 0.4843, + "step": 746 + }, + { + "epoch": 0.4545177973836325, + "grad_norm": 1.1949740648269653, + "learning_rate": 4.9277436173386006e-05, + "loss": 0.4669, + "step": 747 + }, + { + "epoch": 0.45512625494371767, + "grad_norm": 1.2178860902786255, + "learning_rate": 4.927446182938822e-05, + "loss": 0.5262, + "step": 748 + }, + { + "epoch": 0.4557347125038029, + "grad_norm": 1.3532716035842896, + "learning_rate": 4.927148146633571e-05, + "loss": 0.4994, + "step": 749 + }, + { + "epoch": 0.456343170063888, + "grad_norm": 1.2420387268066406, + "learning_rate": 4.9268495084967485e-05, + "loss": 0.4912, + "step": 750 + }, + { + "epoch": 0.45695162762397323, + "grad_norm": 1.242753267288208, + "learning_rate": 4.926550268602404e-05, + "loss": 0.5301, + "step": 751 + }, + { + "epoch": 0.4575600851840584, + "grad_norm": 1.2774479389190674, + "learning_rate": 4.926250427024736e-05, + "loss": 0.4994, + "step": 752 + }, + { + "epoch": 0.4581685427441436, + "grad_norm": 1.3734140396118164, + "learning_rate": 4.925949983838094e-05, + "loss": 0.5812, + "step": 753 + }, + { + "epoch": 0.4587770003042288, + "grad_norm": 1.4266109466552734, + "learning_rate": 4.925648939116974e-05, + "loss": 0.5418, + "step": 754 + }, + { + "epoch": 0.45938545786431395, + "grad_norm": 1.2357332706451416, + "learning_rate": 4.9253472929360235e-05, + "loss": 0.5441, + "step": 755 + }, + { + "epoch": 0.45999391542439916, + "grad_norm": 1.4429142475128174, + "learning_rate": 4.925045045370037e-05, + "loss": 0.5116, + "step": 756 + }, + { + "epoch": 0.4606023729844843, + "grad_norm": 1.2204004526138306, + "learning_rate": 4.924742196493961e-05, + "loss": 0.4969, + "step": 757 + }, + { + "epoch": 0.4612108305445695, + "grad_norm": 1.2525185346603394, + "learning_rate": 4.9244387463828876e-05, + "loss": 0.4672, + "step": 758 + }, + { + "epoch": 0.4618192881046547, + "grad_norm": 1.238554835319519, + "learning_rate": 4.9241346951120616e-05, + "loss": 0.5296, + "step": 759 + }, + { + "epoch": 0.4624277456647399, + "grad_norm": 1.2626805305480957, + "learning_rate": 4.923830042756874e-05, + "loss": 0.5025, + "step": 760 + }, + { + "epoch": 0.4630362032248251, + "grad_norm": 1.3802168369293213, + "learning_rate": 4.923524789392866e-05, + "loss": 0.5273, + "step": 761 + }, + { + "epoch": 0.46364466078491023, + "grad_norm": 1.498776912689209, + "learning_rate": 4.923218935095727e-05, + "loss": 0.4597, + "step": 762 + }, + { + "epoch": 0.46425311834499544, + "grad_norm": 1.449130892753601, + "learning_rate": 4.922912479941297e-05, + "loss": 0.533, + "step": 763 + }, + { + "epoch": 0.4648615759050806, + "grad_norm": 2.3711140155792236, + "learning_rate": 4.922605424005565e-05, + "loss": 0.5551, + "step": 764 + }, + { + "epoch": 0.4654700334651658, + "grad_norm": 1.1763001680374146, + "learning_rate": 4.922297767364666e-05, + "loss": 0.4951, + "step": 765 + }, + { + "epoch": 0.466078491025251, + "grad_norm": 1.5831891298294067, + "learning_rate": 4.921989510094888e-05, + "loss": 0.6159, + "step": 766 + }, + { + "epoch": 0.46668694858533616, + "grad_norm": 1.3769197463989258, + "learning_rate": 4.921680652272665e-05, + "loss": 0.5048, + "step": 767 + }, + { + "epoch": 0.46729540614542137, + "grad_norm": 1.137963056564331, + "learning_rate": 4.9213711939745795e-05, + "loss": 0.4931, + "step": 768 + }, + { + "epoch": 0.4679038637055065, + "grad_norm": 1.7989009618759155, + "learning_rate": 4.921061135277366e-05, + "loss": 0.5065, + "step": 769 + }, + { + "epoch": 0.4685123212655917, + "grad_norm": 1.294255256652832, + "learning_rate": 4.920750476257906e-05, + "loss": 0.4863, + "step": 770 + }, + { + "epoch": 0.46912077882567693, + "grad_norm": 1.5027679204940796, + "learning_rate": 4.92043921699323e-05, + "loss": 0.507, + "step": 771 + }, + { + "epoch": 0.4697292363857621, + "grad_norm": 1.1163376569747925, + "learning_rate": 4.920127357560517e-05, + "loss": 0.4507, + "step": 772 + }, + { + "epoch": 0.4703376939458473, + "grad_norm": 1.1562211513519287, + "learning_rate": 4.919814898037095e-05, + "loss": 0.4933, + "step": 773 + }, + { + "epoch": 0.47094615150593244, + "grad_norm": 1.191632866859436, + "learning_rate": 4.919501838500441e-05, + "loss": 0.4549, + "step": 774 + }, + { + "epoch": 0.47155460906601765, + "grad_norm": 1.2718918323516846, + "learning_rate": 4.9191881790281815e-05, + "loss": 0.5133, + "step": 775 + }, + { + "epoch": 0.47216306662610286, + "grad_norm": 1.677453875541687, + "learning_rate": 4.91887391969809e-05, + "loss": 0.4517, + "step": 776 + }, + { + "epoch": 0.472771524186188, + "grad_norm": 1.3520299196243286, + "learning_rate": 4.91855906058809e-05, + "loss": 0.4373, + "step": 777 + }, + { + "epoch": 0.4733799817462732, + "grad_norm": 1.4283875226974487, + "learning_rate": 4.9182436017762535e-05, + "loss": 0.517, + "step": 778 + }, + { + "epoch": 0.47398843930635837, + "grad_norm": 1.2896888256072998, + "learning_rate": 4.917927543340801e-05, + "loss": 0.5267, + "step": 779 + }, + { + "epoch": 0.4745968968664436, + "grad_norm": 1.275476098060608, + "learning_rate": 4.9176108853601024e-05, + "loss": 0.4542, + "step": 780 + }, + { + "epoch": 0.4752053544265287, + "grad_norm": 1.1607316732406616, + "learning_rate": 4.917293627912675e-05, + "loss": 0.4794, + "step": 781 + }, + { + "epoch": 0.47581381198661393, + "grad_norm": 1.6451760530471802, + "learning_rate": 4.916975771077185e-05, + "loss": 0.5549, + "step": 782 + }, + { + "epoch": 0.47642226954669914, + "grad_norm": 1.3104053735733032, + "learning_rate": 4.9166573149324486e-05, + "loss": 0.4835, + "step": 783 + }, + { + "epoch": 0.4770307271067843, + "grad_norm": 1.5024620294570923, + "learning_rate": 4.916338259557429e-05, + "loss": 0.5095, + "step": 784 + }, + { + "epoch": 0.4776391846668695, + "grad_norm": 1.4474741220474243, + "learning_rate": 4.91601860503124e-05, + "loss": 0.5574, + "step": 785 + }, + { + "epoch": 0.47824764222695465, + "grad_norm": 1.329298734664917, + "learning_rate": 4.915698351433141e-05, + "loss": 0.5496, + "step": 786 + }, + { + "epoch": 0.47885609978703986, + "grad_norm": 1.1795927286148071, + "learning_rate": 4.915377498842542e-05, + "loss": 0.4553, + "step": 787 + }, + { + "epoch": 0.47946455734712506, + "grad_norm": 1.329775094985962, + "learning_rate": 4.915056047339002e-05, + "loss": 0.5445, + "step": 788 + }, + { + "epoch": 0.4800730149072102, + "grad_norm": 1.1533706188201904, + "learning_rate": 4.9147339970022256e-05, + "loss": 0.507, + "step": 789 + }, + { + "epoch": 0.4806814724672954, + "grad_norm": 1.1243101358413696, + "learning_rate": 4.9144113479120695e-05, + "loss": 0.5018, + "step": 790 + }, + { + "epoch": 0.4812899300273806, + "grad_norm": 1.1606624126434326, + "learning_rate": 4.9140881001485374e-05, + "loss": 0.4971, + "step": 791 + }, + { + "epoch": 0.4818983875874658, + "grad_norm": 1.342514991760254, + "learning_rate": 4.91376425379178e-05, + "loss": 0.541, + "step": 792 + }, + { + "epoch": 0.48250684514755093, + "grad_norm": 1.347623586654663, + "learning_rate": 4.913439808922098e-05, + "loss": 0.509, + "step": 793 + }, + { + "epoch": 0.48311530270763614, + "grad_norm": 1.340950608253479, + "learning_rate": 4.91311476561994e-05, + "loss": 0.4773, + "step": 794 + }, + { + "epoch": 0.48372376026772135, + "grad_norm": 1.0880382061004639, + "learning_rate": 4.912789123965905e-05, + "loss": 0.4569, + "step": 795 + }, + { + "epoch": 0.4843322178278065, + "grad_norm": 1.922781229019165, + "learning_rate": 4.912462884040737e-05, + "loss": 0.4752, + "step": 796 + }, + { + "epoch": 0.4849406753878917, + "grad_norm": 1.0886273384094238, + "learning_rate": 4.91213604592533e-05, + "loss": 0.4649, + "step": 797 + }, + { + "epoch": 0.48554913294797686, + "grad_norm": 1.411138892173767, + "learning_rate": 4.911808609700726e-05, + "loss": 0.5271, + "step": 798 + }, + { + "epoch": 0.48615759050806207, + "grad_norm": 0.9868936538696289, + "learning_rate": 4.911480575448116e-05, + "loss": 0.435, + "step": 799 + }, + { + "epoch": 0.4867660480681473, + "grad_norm": 1.06026291847229, + "learning_rate": 4.911151943248839e-05, + "loss": 0.4693, + "step": 800 + }, + { + "epoch": 0.4873745056282324, + "grad_norm": 1.5773180723190308, + "learning_rate": 4.910822713184382e-05, + "loss": 0.5369, + "step": 801 + }, + { + "epoch": 0.48798296318831763, + "grad_norm": 1.2286581993103027, + "learning_rate": 4.910492885336381e-05, + "loss": 0.4814, + "step": 802 + }, + { + "epoch": 0.4885914207484028, + "grad_norm": 1.3670451641082764, + "learning_rate": 4.910162459786617e-05, + "loss": 0.5145, + "step": 803 + }, + { + "epoch": 0.489199878308488, + "grad_norm": 1.6404004096984863, + "learning_rate": 4.9098314366170245e-05, + "loss": 0.5741, + "step": 804 + }, + { + "epoch": 0.48980833586857314, + "grad_norm": 1.1956790685653687, + "learning_rate": 4.909499815909682e-05, + "loss": 0.4935, + "step": 805 + }, + { + "epoch": 0.49041679342865835, + "grad_norm": 1.4000447988510132, + "learning_rate": 4.909167597746819e-05, + "loss": 0.4661, + "step": 806 + }, + { + "epoch": 0.49102525098874356, + "grad_norm": 1.4137040376663208, + "learning_rate": 4.908834782210809e-05, + "loss": 0.5235, + "step": 807 + }, + { + "epoch": 0.4916337085488287, + "grad_norm": 1.3332107067108154, + "learning_rate": 4.90850136938418e-05, + "loss": 0.5003, + "step": 808 + }, + { + "epoch": 0.4922421661089139, + "grad_norm": 1.3718630075454712, + "learning_rate": 4.908167359349601e-05, + "loss": 0.4756, + "step": 809 + }, + { + "epoch": 0.49285062366899907, + "grad_norm": 1.3051707744598389, + "learning_rate": 4.907832752189896e-05, + "loss": 0.5024, + "step": 810 + }, + { + "epoch": 0.4934590812290843, + "grad_norm": 1.2241960763931274, + "learning_rate": 4.907497547988031e-05, + "loss": 0.4989, + "step": 811 + }, + { + "epoch": 0.4940675387891695, + "grad_norm": 1.3939323425292969, + "learning_rate": 4.9071617468271234e-05, + "loss": 0.5231, + "step": 812 + }, + { + "epoch": 0.49467599634925463, + "grad_norm": 1.4326186180114746, + "learning_rate": 4.906825348790438e-05, + "loss": 0.4494, + "step": 813 + }, + { + "epoch": 0.49528445390933984, + "grad_norm": 1.1022182703018188, + "learning_rate": 4.9064883539613884e-05, + "loss": 0.5195, + "step": 814 + }, + { + "epoch": 0.495892911469425, + "grad_norm": 1.2197952270507812, + "learning_rate": 4.9061507624235334e-05, + "loss": 0.4793, + "step": 815 + }, + { + "epoch": 0.4965013690295102, + "grad_norm": 1.329502820968628, + "learning_rate": 4.905812574260583e-05, + "loss": 0.5823, + "step": 816 + }, + { + "epoch": 0.49710982658959535, + "grad_norm": 1.4672378301620483, + "learning_rate": 4.9054737895563935e-05, + "loss": 0.5231, + "step": 817 + }, + { + "epoch": 0.49771828414968056, + "grad_norm": 1.1329609155654907, + "learning_rate": 4.905134408394969e-05, + "loss": 0.501, + "step": 818 + }, + { + "epoch": 0.49832674170976576, + "grad_norm": 1.1195259094238281, + "learning_rate": 4.904794430860462e-05, + "loss": 0.482, + "step": 819 + }, + { + "epoch": 0.4989351992698509, + "grad_norm": 1.258161187171936, + "learning_rate": 4.904453857037173e-05, + "loss": 0.4575, + "step": 820 + }, + { + "epoch": 0.4995436568299361, + "grad_norm": 1.2955543994903564, + "learning_rate": 4.904112687009551e-05, + "loss": 0.563, + "step": 821 + }, + { + "epoch": 0.5001521143900213, + "grad_norm": 1.3455455303192139, + "learning_rate": 4.90377092086219e-05, + "loss": 0.5372, + "step": 822 + }, + { + "epoch": 0.5007605719501065, + "grad_norm": 1.2363775968551636, + "learning_rate": 4.903428558679835e-05, + "loss": 0.4754, + "step": 823 + }, + { + "epoch": 0.5013690295101917, + "grad_norm": 1.1832222938537598, + "learning_rate": 4.9030856005473776e-05, + "loss": 0.4748, + "step": 824 + }, + { + "epoch": 0.5019774870702769, + "grad_norm": 1.2365323305130005, + "learning_rate": 4.902742046549856e-05, + "loss": 0.5218, + "step": 825 + }, + { + "epoch": 0.502585944630362, + "grad_norm": 1.120848298072815, + "learning_rate": 4.902397896772459e-05, + "loss": 0.4514, + "step": 826 + }, + { + "epoch": 0.5031944021904472, + "grad_norm": 1.7753840684890747, + "learning_rate": 4.9020531513005194e-05, + "loss": 0.5859, + "step": 827 + }, + { + "epoch": 0.5038028597505324, + "grad_norm": 1.2528876066207886, + "learning_rate": 4.901707810219522e-05, + "loss": 0.4974, + "step": 828 + }, + { + "epoch": 0.5044113173106176, + "grad_norm": 1.1275379657745361, + "learning_rate": 4.901361873615095e-05, + "loss": 0.4611, + "step": 829 + }, + { + "epoch": 0.5050197748707028, + "grad_norm": 1.2789126634597778, + "learning_rate": 4.901015341573017e-05, + "loss": 0.4913, + "step": 830 + }, + { + "epoch": 0.5056282324307879, + "grad_norm": 1.2548869848251343, + "learning_rate": 4.900668214179214e-05, + "loss": 0.5161, + "step": 831 + }, + { + "epoch": 0.5062366899908731, + "grad_norm": 1.1596348285675049, + "learning_rate": 4.900320491519759e-05, + "loss": 0.5323, + "step": 832 + }, + { + "epoch": 0.5068451475509583, + "grad_norm": 1.0443363189697266, + "learning_rate": 4.8999721736808714e-05, + "loss": 0.4074, + "step": 833 + }, + { + "epoch": 0.5074536051110435, + "grad_norm": 1.1528189182281494, + "learning_rate": 4.899623260748921e-05, + "loss": 0.4656, + "step": 834 + }, + { + "epoch": 0.5080620626711286, + "grad_norm": 1.4290034770965576, + "learning_rate": 4.899273752810423e-05, + "loss": 0.5818, + "step": 835 + }, + { + "epoch": 0.5086705202312138, + "grad_norm": 1.298559546470642, + "learning_rate": 4.898923649952041e-05, + "loss": 0.5283, + "step": 836 + }, + { + "epoch": 0.509278977791299, + "grad_norm": 1.4868043661117554, + "learning_rate": 4.8985729522605864e-05, + "loss": 0.4575, + "step": 837 + }, + { + "epoch": 0.5098874353513843, + "grad_norm": 1.40908944606781, + "learning_rate": 4.898221659823016e-05, + "loss": 0.5026, + "step": 838 + }, + { + "epoch": 0.5104958929114695, + "grad_norm": 1.5016429424285889, + "learning_rate": 4.897869772726438e-05, + "loss": 0.4975, + "step": 839 + }, + { + "epoch": 0.5111043504715546, + "grad_norm": 1.2684623003005981, + "learning_rate": 4.8975172910581033e-05, + "loss": 0.5549, + "step": 840 + }, + { + "epoch": 0.5117128080316398, + "grad_norm": 1.4201849699020386, + "learning_rate": 4.897164214905414e-05, + "loss": 0.5264, + "step": 841 + }, + { + "epoch": 0.512321265591725, + "grad_norm": 1.300970435142517, + "learning_rate": 4.8968105443559194e-05, + "loss": 0.509, + "step": 842 + }, + { + "epoch": 0.5129297231518102, + "grad_norm": 1.3174622058868408, + "learning_rate": 4.896456279497312e-05, + "loss": 0.4918, + "step": 843 + }, + { + "epoch": 0.5135381807118954, + "grad_norm": 1.2107031345367432, + "learning_rate": 4.8961014204174384e-05, + "loss": 0.5297, + "step": 844 + }, + { + "epoch": 0.5141466382719805, + "grad_norm": 1.4777075052261353, + "learning_rate": 4.895745967204286e-05, + "loss": 0.4719, + "step": 845 + }, + { + "epoch": 0.5147550958320657, + "grad_norm": 1.252469539642334, + "learning_rate": 4.895389919945993e-05, + "loss": 0.4674, + "step": 846 + }, + { + "epoch": 0.5153635533921509, + "grad_norm": 1.2273253202438354, + "learning_rate": 4.895033278730845e-05, + "loss": 0.5269, + "step": 847 + }, + { + "epoch": 0.5159720109522361, + "grad_norm": 1.6084344387054443, + "learning_rate": 4.894676043647274e-05, + "loss": 0.4133, + "step": 848 + }, + { + "epoch": 0.5165804685123213, + "grad_norm": 0.9914146065711975, + "learning_rate": 4.894318214783859e-05, + "loss": 0.4391, + "step": 849 + }, + { + "epoch": 0.5171889260724064, + "grad_norm": 1.3645412921905518, + "learning_rate": 4.893959792229327e-05, + "loss": 0.5509, + "step": 850 + }, + { + "epoch": 0.5177973836324916, + "grad_norm": 1.2271888256072998, + "learning_rate": 4.8936007760725514e-05, + "loss": 0.5111, + "step": 851 + }, + { + "epoch": 0.5184058411925768, + "grad_norm": 1.262268304824829, + "learning_rate": 4.893241166402553e-05, + "loss": 0.4809, + "step": 852 + }, + { + "epoch": 0.519014298752662, + "grad_norm": 1.3610913753509521, + "learning_rate": 4.892880963308502e-05, + "loss": 0.5159, + "step": 853 + }, + { + "epoch": 0.5196227563127472, + "grad_norm": 1.3238227367401123, + "learning_rate": 4.8925201668797117e-05, + "loss": 0.5162, + "step": 854 + }, + { + "epoch": 0.5202312138728323, + "grad_norm": 1.2725099325180054, + "learning_rate": 4.8921587772056444e-05, + "loss": 0.4616, + "step": 855 + }, + { + "epoch": 0.5208396714329175, + "grad_norm": 1.4523481130599976, + "learning_rate": 4.8917967943759114e-05, + "loss": 0.6087, + "step": 856 + }, + { + "epoch": 0.5214481289930027, + "grad_norm": 1.192795753479004, + "learning_rate": 4.8914342184802675e-05, + "loss": 0.4805, + "step": 857 + }, + { + "epoch": 0.522056586553088, + "grad_norm": 1.2161285877227783, + "learning_rate": 4.891071049608618e-05, + "loss": 0.481, + "step": 858 + }, + { + "epoch": 0.5226650441131732, + "grad_norm": 1.391660213470459, + "learning_rate": 4.890707287851013e-05, + "loss": 0.4796, + "step": 859 + }, + { + "epoch": 0.5232735016732583, + "grad_norm": 1.189489722251892, + "learning_rate": 4.8903429332976494e-05, + "loss": 0.5197, + "step": 860 + }, + { + "epoch": 0.5238819592333435, + "grad_norm": 1.2670400142669678, + "learning_rate": 4.889977986038874e-05, + "loss": 0.4648, + "step": 861 + }, + { + "epoch": 0.5244904167934287, + "grad_norm": 1.570553183555603, + "learning_rate": 4.889612446165176e-05, + "loss": 0.5102, + "step": 862 + }, + { + "epoch": 0.5250988743535139, + "grad_norm": 1.1414783000946045, + "learning_rate": 4.8892463137671963e-05, + "loss": 0.4489, + "step": 863 + }, + { + "epoch": 0.525707331913599, + "grad_norm": 1.4709701538085938, + "learning_rate": 4.888879588935719e-05, + "loss": 0.5205, + "step": 864 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 1.218973159790039, + "learning_rate": 4.888512271761677e-05, + "loss": 0.466, + "step": 865 + }, + { + "epoch": 0.5269242470337694, + "grad_norm": 1.2933622598648071, + "learning_rate": 4.88814436233615e-05, + "loss": 0.5473, + "step": 866 + }, + { + "epoch": 0.5275327045938546, + "grad_norm": 1.085365891456604, + "learning_rate": 4.887775860750363e-05, + "loss": 0.4572, + "step": 867 + }, + { + "epoch": 0.5281411621539398, + "grad_norm": 1.5779204368591309, + "learning_rate": 4.8874067670956905e-05, + "loss": 0.4428, + "step": 868 + }, + { + "epoch": 0.5287496197140249, + "grad_norm": 1.36058509349823, + "learning_rate": 4.887037081463652e-05, + "loss": 0.5667, + "step": 869 + }, + { + "epoch": 0.5293580772741101, + "grad_norm": 1.1790504455566406, + "learning_rate": 4.886666803945914e-05, + "loss": 0.5067, + "step": 870 + }, + { + "epoch": 0.5299665348341953, + "grad_norm": 1.2920281887054443, + "learning_rate": 4.886295934634289e-05, + "loss": 0.5173, + "step": 871 + }, + { + "epoch": 0.5305749923942805, + "grad_norm": 1.085190773010254, + "learning_rate": 4.8859244736207395e-05, + "loss": 0.4904, + "step": 872 + }, + { + "epoch": 0.5311834499543657, + "grad_norm": 1.1711935997009277, + "learning_rate": 4.885552420997369e-05, + "loss": 0.4416, + "step": 873 + }, + { + "epoch": 0.5317919075144508, + "grad_norm": 1.4009342193603516, + "learning_rate": 4.885179776856435e-05, + "loss": 0.529, + "step": 874 + }, + { + "epoch": 0.532400365074536, + "grad_norm": 1.261432409286499, + "learning_rate": 4.8848065412903335e-05, + "loss": 0.5338, + "step": 875 + }, + { + "epoch": 0.5330088226346212, + "grad_norm": 1.0821967124938965, + "learning_rate": 4.884432714391613e-05, + "loss": 0.4251, + "step": 876 + }, + { + "epoch": 0.5336172801947064, + "grad_norm": 1.4930713176727295, + "learning_rate": 4.884058296252969e-05, + "loss": 0.5063, + "step": 877 + }, + { + "epoch": 0.5342257377547917, + "grad_norm": 1.2942625284194946, + "learning_rate": 4.88368328696724e-05, + "loss": 0.4825, + "step": 878 + }, + { + "epoch": 0.5348341953148767, + "grad_norm": 1.1449693441390991, + "learning_rate": 4.883307686627412e-05, + "loss": 0.4833, + "step": 879 + }, + { + "epoch": 0.535442652874962, + "grad_norm": 1.2848173379898071, + "learning_rate": 4.882931495326619e-05, + "loss": 0.4981, + "step": 880 + }, + { + "epoch": 0.5360511104350472, + "grad_norm": 1.0941145420074463, + "learning_rate": 4.882554713158141e-05, + "loss": 0.4457, + "step": 881 + }, + { + "epoch": 0.5366595679951324, + "grad_norm": 1.2851606607437134, + "learning_rate": 4.8821773402154025e-05, + "loss": 0.4238, + "step": 882 + }, + { + "epoch": 0.5372680255552176, + "grad_norm": 1.5043134689331055, + "learning_rate": 4.881799376591979e-05, + "loss": 0.53, + "step": 883 + }, + { + "epoch": 0.5378764831153027, + "grad_norm": 1.2732433080673218, + "learning_rate": 4.8814208223815886e-05, + "loss": 0.5126, + "step": 884 + }, + { + "epoch": 0.5384849406753879, + "grad_norm": 1.2558897733688354, + "learning_rate": 4.8810416776780956e-05, + "loss": 0.4542, + "step": 885 + }, + { + "epoch": 0.5390933982354731, + "grad_norm": 1.1462931632995605, + "learning_rate": 4.880661942575514e-05, + "loss": 0.4536, + "step": 886 + }, + { + "epoch": 0.5397018557955583, + "grad_norm": 1.2264448404312134, + "learning_rate": 4.880281617168001e-05, + "loss": 0.4497, + "step": 887 + }, + { + "epoch": 0.5403103133556434, + "grad_norm": 1.3015364408493042, + "learning_rate": 4.879900701549863e-05, + "loss": 0.4697, + "step": 888 + }, + { + "epoch": 0.5409187709157286, + "grad_norm": 1.5858356952667236, + "learning_rate": 4.879519195815549e-05, + "loss": 0.4573, + "step": 889 + }, + { + "epoch": 0.5415272284758138, + "grad_norm": 1.2215701341629028, + "learning_rate": 4.8791371000596585e-05, + "loss": 0.5027, + "step": 890 + }, + { + "epoch": 0.542135686035899, + "grad_norm": 1.3186434507369995, + "learning_rate": 4.8787544143769335e-05, + "loss": 0.5387, + "step": 891 + }, + { + "epoch": 0.5427441435959842, + "grad_norm": 1.487970232963562, + "learning_rate": 4.878371138862267e-05, + "loss": 0.5038, + "step": 892 + }, + { + "epoch": 0.5433526011560693, + "grad_norm": 1.4170433282852173, + "learning_rate": 4.8779872736106916e-05, + "loss": 0.4948, + "step": 893 + }, + { + "epoch": 0.5439610587161545, + "grad_norm": 1.167901635169983, + "learning_rate": 4.877602818717393e-05, + "loss": 0.503, + "step": 894 + }, + { + "epoch": 0.5445695162762397, + "grad_norm": 1.3846609592437744, + "learning_rate": 4.877217774277698e-05, + "loss": 0.6564, + "step": 895 + }, + { + "epoch": 0.5451779738363249, + "grad_norm": 1.4311436414718628, + "learning_rate": 4.876832140387082e-05, + "loss": 0.5094, + "step": 896 + }, + { + "epoch": 0.5457864313964101, + "grad_norm": 1.1872178316116333, + "learning_rate": 4.876445917141167e-05, + "loss": 0.4632, + "step": 897 + }, + { + "epoch": 0.5463948889564952, + "grad_norm": 1.4088075160980225, + "learning_rate": 4.8760591046357196e-05, + "loss": 0.4989, + "step": 898 + }, + { + "epoch": 0.5470033465165804, + "grad_norm": 1.2164874076843262, + "learning_rate": 4.875671702966653e-05, + "loss": 0.4258, + "step": 899 + }, + { + "epoch": 0.5476118040766657, + "grad_norm": 1.2215385437011719, + "learning_rate": 4.875283712230027e-05, + "loss": 0.4936, + "step": 900 + }, + { + "epoch": 0.5482202616367509, + "grad_norm": 1.4215302467346191, + "learning_rate": 4.874895132522047e-05, + "loss": 0.425, + "step": 901 + }, + { + "epoch": 0.5488287191968361, + "grad_norm": 1.227898359298706, + "learning_rate": 4.874505963939066e-05, + "loss": 0.4306, + "step": 902 + }, + { + "epoch": 0.5494371767569212, + "grad_norm": 1.2220474481582642, + "learning_rate": 4.874116206577578e-05, + "loss": 0.5173, + "step": 903 + }, + { + "epoch": 0.5500456343170064, + "grad_norm": 1.1359342336654663, + "learning_rate": 4.8737258605342304e-05, + "loss": 0.4496, + "step": 904 + }, + { + "epoch": 0.5506540918770916, + "grad_norm": 1.4966158866882324, + "learning_rate": 4.8733349259058105e-05, + "loss": 0.4165, + "step": 905 + }, + { + "epoch": 0.5512625494371768, + "grad_norm": 1.28777277469635, + "learning_rate": 4.872943402789255e-05, + "loss": 0.5104, + "step": 906 + }, + { + "epoch": 0.551871006997262, + "grad_norm": 1.30458664894104, + "learning_rate": 4.872551291281644e-05, + "loss": 0.4773, + "step": 907 + }, + { + "epoch": 0.5524794645573471, + "grad_norm": 1.116233229637146, + "learning_rate": 4.872158591480206e-05, + "loss": 0.4654, + "step": 908 + }, + { + "epoch": 0.5530879221174323, + "grad_norm": 1.341030240058899, + "learning_rate": 4.871765303482314e-05, + "loss": 0.4994, + "step": 909 + }, + { + "epoch": 0.5536963796775175, + "grad_norm": 1.6074213981628418, + "learning_rate": 4.871371427385486e-05, + "loss": 0.4937, + "step": 910 + }, + { + "epoch": 0.5543048372376027, + "grad_norm": 1.4934123754501343, + "learning_rate": 4.870976963287389e-05, + "loss": 0.4713, + "step": 911 + }, + { + "epoch": 0.5549132947976878, + "grad_norm": 1.2505860328674316, + "learning_rate": 4.8705819112858306e-05, + "loss": 0.5438, + "step": 912 + }, + { + "epoch": 0.555521752357773, + "grad_norm": 1.4453731775283813, + "learning_rate": 4.8701862714787704e-05, + "loss": 0.5272, + "step": 913 + }, + { + "epoch": 0.5561302099178582, + "grad_norm": 1.0299216508865356, + "learning_rate": 4.8697900439643087e-05, + "loss": 0.4406, + "step": 914 + }, + { + "epoch": 0.5567386674779434, + "grad_norm": 1.381816029548645, + "learning_rate": 4.869393228840693e-05, + "loss": 0.4839, + "step": 915 + }, + { + "epoch": 0.5573471250380286, + "grad_norm": 1.1093368530273438, + "learning_rate": 4.8689958262063186e-05, + "loss": 0.4567, + "step": 916 + }, + { + "epoch": 0.5579555825981137, + "grad_norm": 1.302557349205017, + "learning_rate": 4.8685978361597234e-05, + "loss": 0.4838, + "step": 917 + }, + { + "epoch": 0.5585640401581989, + "grad_norm": 1.2300167083740234, + "learning_rate": 4.868199258799593e-05, + "loss": 0.4985, + "step": 918 + }, + { + "epoch": 0.5591724977182841, + "grad_norm": 1.171228051185608, + "learning_rate": 4.867800094224758e-05, + "loss": 0.4577, + "step": 919 + }, + { + "epoch": 0.5597809552783694, + "grad_norm": 1.348331093788147, + "learning_rate": 4.867400342534194e-05, + "loss": 0.4892, + "step": 920 + }, + { + "epoch": 0.5603894128384546, + "grad_norm": 1.2311930656433105, + "learning_rate": 4.8670000038270236e-05, + "loss": 0.4981, + "step": 921 + }, + { + "epoch": 0.5609978703985397, + "grad_norm": 1.3021697998046875, + "learning_rate": 4.866599078202514e-05, + "loss": 0.5145, + "step": 922 + }, + { + "epoch": 0.5616063279586249, + "grad_norm": 1.426529049873352, + "learning_rate": 4.8661975657600765e-05, + "loss": 0.502, + "step": 923 + }, + { + "epoch": 0.5622147855187101, + "grad_norm": 1.096643090248108, + "learning_rate": 4.865795466599272e-05, + "loss": 0.4567, + "step": 924 + }, + { + "epoch": 0.5628232430787953, + "grad_norm": 1.1198285818099976, + "learning_rate": 4.8653927808198024e-05, + "loss": 0.4742, + "step": 925 + }, + { + "epoch": 0.5634317006388805, + "grad_norm": 1.3766636848449707, + "learning_rate": 4.8649895085215177e-05, + "loss": 0.5223, + "step": 926 + }, + { + "epoch": 0.5640401581989656, + "grad_norm": 1.1397812366485596, + "learning_rate": 4.8645856498044125e-05, + "loss": 0.4926, + "step": 927 + }, + { + "epoch": 0.5646486157590508, + "grad_norm": 1.2435989379882812, + "learning_rate": 4.8641812047686266e-05, + "loss": 0.44, + "step": 928 + }, + { + "epoch": 0.565257073319136, + "grad_norm": 1.5014331340789795, + "learning_rate": 4.863776173514446e-05, + "loss": 0.4282, + "step": 929 + }, + { + "epoch": 0.5658655308792212, + "grad_norm": 1.1607805490493774, + "learning_rate": 4.8633705561423014e-05, + "loss": 0.4856, + "step": 930 + }, + { + "epoch": 0.5664739884393064, + "grad_norm": 1.08901846408844, + "learning_rate": 4.862964352752769e-05, + "loss": 0.5092, + "step": 931 + }, + { + "epoch": 0.5670824459993915, + "grad_norm": 1.114229440689087, + "learning_rate": 4.862557563446571e-05, + "loss": 0.4652, + "step": 932 + }, + { + "epoch": 0.5676909035594767, + "grad_norm": 1.2063711881637573, + "learning_rate": 4.862150188324573e-05, + "loss": 0.4485, + "step": 933 + }, + { + "epoch": 0.5682993611195619, + "grad_norm": 1.2078471183776855, + "learning_rate": 4.861742227487788e-05, + "loss": 0.5443, + "step": 934 + }, + { + "epoch": 0.5689078186796471, + "grad_norm": 1.0555641651153564, + "learning_rate": 4.861333681037372e-05, + "loss": 0.4346, + "step": 935 + }, + { + "epoch": 0.5695162762397323, + "grad_norm": 0.9378579258918762, + "learning_rate": 4.8609245490746283e-05, + "loss": 0.4181, + "step": 936 + }, + { + "epoch": 0.5701247337998174, + "grad_norm": 1.4141374826431274, + "learning_rate": 4.8605148317010054e-05, + "loss": 0.4459, + "step": 937 + }, + { + "epoch": 0.5707331913599026, + "grad_norm": 1.106641173362732, + "learning_rate": 4.8601045290180946e-05, + "loss": 0.4694, + "step": 938 + }, + { + "epoch": 0.5713416489199878, + "grad_norm": 1.272030234336853, + "learning_rate": 4.8596936411276354e-05, + "loss": 0.5127, + "step": 939 + }, + { + "epoch": 0.571950106480073, + "grad_norm": 1.3205722570419312, + "learning_rate": 4.8592821681315096e-05, + "loss": 0.4937, + "step": 940 + }, + { + "epoch": 0.5725585640401581, + "grad_norm": 1.1618589162826538, + "learning_rate": 4.858870110131746e-05, + "loss": 0.5017, + "step": 941 + }, + { + "epoch": 0.5731670216002434, + "grad_norm": 1.2044800519943237, + "learning_rate": 4.858457467230517e-05, + "loss": 0.4831, + "step": 942 + }, + { + "epoch": 0.5737754791603286, + "grad_norm": 1.5071766376495361, + "learning_rate": 4.858044239530143e-05, + "loss": 0.5215, + "step": 943 + }, + { + "epoch": 0.5743839367204138, + "grad_norm": 1.1561444997787476, + "learning_rate": 4.857630427133084e-05, + "loss": 0.4897, + "step": 944 + }, + { + "epoch": 0.574992394280499, + "grad_norm": 1.1617779731750488, + "learning_rate": 4.857216030141951e-05, + "loss": 0.4417, + "step": 945 + }, + { + "epoch": 0.5756008518405841, + "grad_norm": 1.083703637123108, + "learning_rate": 4.8568010486594964e-05, + "loss": 0.4986, + "step": 946 + }, + { + "epoch": 0.5762093094006693, + "grad_norm": 1.3023382425308228, + "learning_rate": 4.8563854827886166e-05, + "loss": 0.5696, + "step": 947 + }, + { + "epoch": 0.5768177669607545, + "grad_norm": 1.446385383605957, + "learning_rate": 4.855969332632357e-05, + "loss": 0.49, + "step": 948 + }, + { + "epoch": 0.5774262245208397, + "grad_norm": 1.1349619626998901, + "learning_rate": 4.8555525982939046e-05, + "loss": 0.5179, + "step": 949 + }, + { + "epoch": 0.5780346820809249, + "grad_norm": 1.1926076412200928, + "learning_rate": 4.855135279876592e-05, + "loss": 0.4852, + "step": 950 + }, + { + "epoch": 0.57864313964101, + "grad_norm": 1.0169259309768677, + "learning_rate": 4.8547173774838975e-05, + "loss": 0.4002, + "step": 951 + }, + { + "epoch": 0.5792515972010952, + "grad_norm": 1.2183793783187866, + "learning_rate": 4.854298891219441e-05, + "loss": 0.5109, + "step": 952 + }, + { + "epoch": 0.5798600547611804, + "grad_norm": 1.2902944087982178, + "learning_rate": 4.853879821186993e-05, + "loss": 0.5235, + "step": 953 + }, + { + "epoch": 0.5804685123212656, + "grad_norm": 1.0406612157821655, + "learning_rate": 4.8534601674904635e-05, + "loss": 0.4147, + "step": 954 + }, + { + "epoch": 0.5810769698813508, + "grad_norm": 1.2372276782989502, + "learning_rate": 4.85303993023391e-05, + "loss": 0.4658, + "step": 955 + }, + { + "epoch": 0.5816854274414359, + "grad_norm": 1.1386033296585083, + "learning_rate": 4.852619109521533e-05, + "loss": 0.4555, + "step": 956 + }, + { + "epoch": 0.5822938850015211, + "grad_norm": 1.4579136371612549, + "learning_rate": 4.8521977054576783e-05, + "loss": 0.4944, + "step": 957 + }, + { + "epoch": 0.5829023425616063, + "grad_norm": 1.3394607305526733, + "learning_rate": 4.851775718146838e-05, + "loss": 0.5533, + "step": 958 + }, + { + "epoch": 0.5835108001216915, + "grad_norm": 1.263627290725708, + "learning_rate": 4.851353147693646e-05, + "loss": 0.5234, + "step": 959 + }, + { + "epoch": 0.5841192576817767, + "grad_norm": 1.30045747756958, + "learning_rate": 4.850929994202882e-05, + "loss": 0.5035, + "step": 960 + }, + { + "epoch": 0.5847277152418618, + "grad_norm": 1.3387147188186646, + "learning_rate": 4.8505062577794716e-05, + "loss": 0.4725, + "step": 961 + }, + { + "epoch": 0.585336172801947, + "grad_norm": 1.1547223329544067, + "learning_rate": 4.8500819385284835e-05, + "loss": 0.4985, + "step": 962 + }, + { + "epoch": 0.5859446303620323, + "grad_norm": 1.1327009201049805, + "learning_rate": 4.849657036555131e-05, + "loss": 0.4579, + "step": 963 + }, + { + "epoch": 0.5865530879221175, + "grad_norm": 0.9808380007743835, + "learning_rate": 4.849231551964771e-05, + "loss": 0.4366, + "step": 964 + }, + { + "epoch": 0.5871615454822026, + "grad_norm": 1.2232143878936768, + "learning_rate": 4.848805484862908e-05, + "loss": 0.4611, + "step": 965 + }, + { + "epoch": 0.5877700030422878, + "grad_norm": 1.2399910688400269, + "learning_rate": 4.8483788353551876e-05, + "loss": 0.5725, + "step": 966 + }, + { + "epoch": 0.588378460602373, + "grad_norm": 1.1158905029296875, + "learning_rate": 4.8479516035474003e-05, + "loss": 0.4809, + "step": 967 + }, + { + "epoch": 0.5889869181624582, + "grad_norm": 1.1464722156524658, + "learning_rate": 4.8475237895454833e-05, + "loss": 0.4469, + "step": 968 + }, + { + "epoch": 0.5895953757225434, + "grad_norm": 1.3894585371017456, + "learning_rate": 4.847095393455516e-05, + "loss": 0.4433, + "step": 969 + }, + { + "epoch": 0.5902038332826285, + "grad_norm": 1.2021290063858032, + "learning_rate": 4.846666415383724e-05, + "loss": 0.4745, + "step": 970 + }, + { + "epoch": 0.5908122908427137, + "grad_norm": 1.128505825996399, + "learning_rate": 4.846236855436473e-05, + "loss": 0.3727, + "step": 971 + }, + { + "epoch": 0.5914207484027989, + "grad_norm": 1.2462071180343628, + "learning_rate": 4.845806713720279e-05, + "loss": 0.5052, + "step": 972 + }, + { + "epoch": 0.5920292059628841, + "grad_norm": 1.1397188901901245, + "learning_rate": 4.845375990341798e-05, + "loss": 0.447, + "step": 973 + }, + { + "epoch": 0.5926376635229693, + "grad_norm": 1.1206566095352173, + "learning_rate": 4.844944685407831e-05, + "loss": 0.4872, + "step": 974 + }, + { + "epoch": 0.5932461210830544, + "grad_norm": 1.167952299118042, + "learning_rate": 4.8445127990253244e-05, + "loss": 0.489, + "step": 975 + }, + { + "epoch": 0.5938545786431396, + "grad_norm": 1.4353526830673218, + "learning_rate": 4.844080331301368e-05, + "loss": 0.5701, + "step": 976 + }, + { + "epoch": 0.5944630362032248, + "grad_norm": 1.1396512985229492, + "learning_rate": 4.843647282343195e-05, + "loss": 0.4253, + "step": 977 + }, + { + "epoch": 0.59507149376331, + "grad_norm": 1.1176103353500366, + "learning_rate": 4.843213652258185e-05, + "loss": 0.4273, + "step": 978 + }, + { + "epoch": 0.5956799513233952, + "grad_norm": 0.9949272274971008, + "learning_rate": 4.842779441153858e-05, + "loss": 0.4386, + "step": 979 + }, + { + "epoch": 0.5962884088834803, + "grad_norm": 1.0839688777923584, + "learning_rate": 4.842344649137882e-05, + "loss": 0.4394, + "step": 980 + }, + { + "epoch": 0.5968968664435655, + "grad_norm": 1.2163331508636475, + "learning_rate": 4.8419092763180673e-05, + "loss": 0.4894, + "step": 981 + }, + { + "epoch": 0.5975053240036508, + "grad_norm": 1.1652277708053589, + "learning_rate": 4.841473322802367e-05, + "loss": 0.4745, + "step": 982 + }, + { + "epoch": 0.598113781563736, + "grad_norm": 1.3939238786697388, + "learning_rate": 4.84103678869888e-05, + "loss": 0.5641, + "step": 983 + }, + { + "epoch": 0.5987222391238212, + "grad_norm": 1.0871104001998901, + "learning_rate": 4.840599674115849e-05, + "loss": 0.457, + "step": 984 + }, + { + "epoch": 0.5993306966839063, + "grad_norm": 1.193232536315918, + "learning_rate": 4.8401619791616595e-05, + "loss": 0.5823, + "step": 985 + }, + { + "epoch": 0.5999391542439915, + "grad_norm": 1.1120039224624634, + "learning_rate": 4.839723703944842e-05, + "loss": 0.464, + "step": 986 + }, + { + "epoch": 0.6005476118040767, + "grad_norm": 1.0750576257705688, + "learning_rate": 4.8392848485740706e-05, + "loss": 0.5022, + "step": 987 + }, + { + "epoch": 0.6011560693641619, + "grad_norm": 1.062077522277832, + "learning_rate": 4.838845413158162e-05, + "loss": 0.4077, + "step": 988 + }, + { + "epoch": 0.601764526924247, + "grad_norm": 1.1485848426818848, + "learning_rate": 4.83840539780608e-05, + "loss": 0.4517, + "step": 989 + }, + { + "epoch": 0.6023729844843322, + "grad_norm": 1.139553427696228, + "learning_rate": 4.837964802626929e-05, + "loss": 0.5155, + "step": 990 + }, + { + "epoch": 0.6029814420444174, + "grad_norm": 1.2427302598953247, + "learning_rate": 4.8375236277299575e-05, + "loss": 0.4747, + "step": 991 + }, + { + "epoch": 0.6035898996045026, + "grad_norm": 1.172774076461792, + "learning_rate": 4.837081873224559e-05, + "loss": 0.5702, + "step": 992 + }, + { + "epoch": 0.6041983571645878, + "grad_norm": 1.059998869895935, + "learning_rate": 4.83663953922027e-05, + "loss": 0.4608, + "step": 993 + }, + { + "epoch": 0.6048068147246729, + "grad_norm": 1.0970420837402344, + "learning_rate": 4.836196625826772e-05, + "loss": 0.4902, + "step": 994 + }, + { + "epoch": 0.6054152722847581, + "grad_norm": 1.263503909111023, + "learning_rate": 4.835753133153888e-05, + "loss": 0.4825, + "step": 995 + }, + { + "epoch": 0.6060237298448433, + "grad_norm": 1.0827324390411377, + "learning_rate": 4.8353090613115856e-05, + "loss": 0.4678, + "step": 996 + }, + { + "epoch": 0.6066321874049285, + "grad_norm": 1.2301872968673706, + "learning_rate": 4.8348644104099773e-05, + "loss": 0.4315, + "step": 997 + }, + { + "epoch": 0.6072406449650137, + "grad_norm": 1.1217997074127197, + "learning_rate": 4.834419180559317e-05, + "loss": 0.4979, + "step": 998 + }, + { + "epoch": 0.6078491025250988, + "grad_norm": 0.9536124467849731, + "learning_rate": 4.833973371870003e-05, + "loss": 0.4011, + "step": 999 + }, + { + "epoch": 0.608457560085184, + "grad_norm": 1.1605530977249146, + "learning_rate": 4.833526984452578e-05, + "loss": 0.5327, + "step": 1000 + }, + { + "epoch": 0.6090660176452692, + "grad_norm": 1.1049379110336304, + "learning_rate": 4.833080018417726e-05, + "loss": 0.4454, + "step": 1001 + }, + { + "epoch": 0.6096744752053544, + "grad_norm": 1.0455185174942017, + "learning_rate": 4.8326324738762774e-05, + "loss": 0.4685, + "step": 1002 + }, + { + "epoch": 0.6102829327654397, + "grad_norm": 1.3146376609802246, + "learning_rate": 4.832184350939205e-05, + "loss": 0.465, + "step": 1003 + }, + { + "epoch": 0.6108913903255248, + "grad_norm": 1.5373934507369995, + "learning_rate": 4.831735649717623e-05, + "loss": 0.5181, + "step": 1004 + }, + { + "epoch": 0.61149984788561, + "grad_norm": 1.0241584777832031, + "learning_rate": 4.831286370322792e-05, + "loss": 0.4116, + "step": 1005 + }, + { + "epoch": 0.6121083054456952, + "grad_norm": 1.1920700073242188, + "learning_rate": 4.830836512866113e-05, + "loss": 0.5441, + "step": 1006 + }, + { + "epoch": 0.6127167630057804, + "grad_norm": 1.1300125122070312, + "learning_rate": 4.8303860774591336e-05, + "loss": 0.4439, + "step": 1007 + }, + { + "epoch": 0.6133252205658656, + "grad_norm": 1.3509349822998047, + "learning_rate": 4.8299350642135424e-05, + "loss": 0.5142, + "step": 1008 + }, + { + "epoch": 0.6139336781259507, + "grad_norm": 1.2040926218032837, + "learning_rate": 4.8294834732411714e-05, + "loss": 0.5396, + "step": 1009 + }, + { + "epoch": 0.6145421356860359, + "grad_norm": 1.0937200784683228, + "learning_rate": 4.829031304653997e-05, + "loss": 0.4634, + "step": 1010 + }, + { + "epoch": 0.6151505932461211, + "grad_norm": 1.4639860391616821, + "learning_rate": 4.8285785585641375e-05, + "loss": 0.4316, + "step": 1011 + }, + { + "epoch": 0.6157590508062063, + "grad_norm": 1.1813201904296875, + "learning_rate": 4.8281252350838557e-05, + "loss": 0.4838, + "step": 1012 + }, + { + "epoch": 0.6163675083662915, + "grad_norm": 1.0957978963851929, + "learning_rate": 4.827671334325556e-05, + "loss": 0.4231, + "step": 1013 + }, + { + "epoch": 0.6169759659263766, + "grad_norm": 1.061135172843933, + "learning_rate": 4.827216856401788e-05, + "loss": 0.4508, + "step": 1014 + }, + { + "epoch": 0.6175844234864618, + "grad_norm": 1.2017053365707397, + "learning_rate": 4.826761801425243e-05, + "loss": 0.4579, + "step": 1015 + }, + { + "epoch": 0.618192881046547, + "grad_norm": 1.263437271118164, + "learning_rate": 4.826306169508755e-05, + "loss": 0.511, + "step": 1016 + }, + { + "epoch": 0.6188013386066322, + "grad_norm": 1.6724522113800049, + "learning_rate": 4.825849960765303e-05, + "loss": 0.4669, + "step": 1017 + }, + { + "epoch": 0.6194097961667173, + "grad_norm": 1.3626164197921753, + "learning_rate": 4.825393175308006e-05, + "loss": 0.5212, + "step": 1018 + }, + { + "epoch": 0.6200182537268025, + "grad_norm": 1.1306309700012207, + "learning_rate": 4.824935813250129e-05, + "loss": 0.4786, + "step": 1019 + }, + { + "epoch": 0.6206267112868877, + "grad_norm": 1.1889963150024414, + "learning_rate": 4.824477874705079e-05, + "loss": 0.4653, + "step": 1020 + }, + { + "epoch": 0.6212351688469729, + "grad_norm": 1.1174501180648804, + "learning_rate": 4.8240193597864044e-05, + "loss": 0.4382, + "step": 1021 + }, + { + "epoch": 0.6218436264070581, + "grad_norm": 1.1703723669052124, + "learning_rate": 4.8235602686077986e-05, + "loss": 0.4405, + "step": 1022 + }, + { + "epoch": 0.6224520839671432, + "grad_norm": 1.100953221321106, + "learning_rate": 4.823100601283097e-05, + "loss": 0.417, + "step": 1023 + }, + { + "epoch": 0.6230605415272285, + "grad_norm": 0.9949542880058289, + "learning_rate": 4.822640357926278e-05, + "loss": 0.4109, + "step": 1024 + }, + { + "epoch": 0.6236689990873137, + "grad_norm": 1.0636481046676636, + "learning_rate": 4.8221795386514625e-05, + "loss": 0.4349, + "step": 1025 + }, + { + "epoch": 0.6242774566473989, + "grad_norm": 1.30801522731781, + "learning_rate": 4.821718143572914e-05, + "loss": 0.4903, + "step": 1026 + }, + { + "epoch": 0.6248859142074841, + "grad_norm": 1.0893471240997314, + "learning_rate": 4.821256172805041e-05, + "loss": 0.4154, + "step": 1027 + }, + { + "epoch": 0.6254943717675692, + "grad_norm": 1.0990405082702637, + "learning_rate": 4.820793626462391e-05, + "loss": 0.3986, + "step": 1028 + }, + { + "epoch": 0.6261028293276544, + "grad_norm": 1.1979132890701294, + "learning_rate": 4.8203305046596584e-05, + "loss": 0.5231, + "step": 1029 + }, + { + "epoch": 0.6267112868877396, + "grad_norm": 1.0785170793533325, + "learning_rate": 4.8198668075116754e-05, + "loss": 0.495, + "step": 1030 + }, + { + "epoch": 0.6273197444478248, + "grad_norm": 1.0589004755020142, + "learning_rate": 4.819402535133422e-05, + "loss": 0.4546, + "step": 1031 + }, + { + "epoch": 0.62792820200791, + "grad_norm": 1.189632534980774, + "learning_rate": 4.818937687640016e-05, + "loss": 0.4908, + "step": 1032 + }, + { + "epoch": 0.6285366595679951, + "grad_norm": 1.1523140668869019, + "learning_rate": 4.818472265146722e-05, + "loss": 0.4496, + "step": 1033 + }, + { + "epoch": 0.6291451171280803, + "grad_norm": 1.1302261352539062, + "learning_rate": 4.818006267768945e-05, + "loss": 0.4783, + "step": 1034 + }, + { + "epoch": 0.6297535746881655, + "grad_norm": 1.0538748502731323, + "learning_rate": 4.817539695622234e-05, + "loss": 0.4308, + "step": 1035 + }, + { + "epoch": 0.6303620322482507, + "grad_norm": 1.1539735794067383, + "learning_rate": 4.817072548822277e-05, + "loss": 0.4797, + "step": 1036 + }, + { + "epoch": 0.6309704898083359, + "grad_norm": 1.2327778339385986, + "learning_rate": 4.816604827484908e-05, + "loss": 0.4814, + "step": 1037 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.306960105895996, + "learning_rate": 4.816136531726104e-05, + "loss": 0.5386, + "step": 1038 + }, + { + "epoch": 0.6321874049285062, + "grad_norm": 1.0629810094833374, + "learning_rate": 4.815667661661981e-05, + "loss": 0.4891, + "step": 1039 + }, + { + "epoch": 0.6327958624885914, + "grad_norm": 1.113374948501587, + "learning_rate": 4.8151982174088e-05, + "loss": 0.4601, + "step": 1040 + }, + { + "epoch": 0.6334043200486766, + "grad_norm": 1.1330082416534424, + "learning_rate": 4.814728199082962e-05, + "loss": 0.4374, + "step": 1041 + }, + { + "epoch": 0.6340127776087617, + "grad_norm": 1.10567307472229, + "learning_rate": 4.8142576068010135e-05, + "loss": 0.4555, + "step": 1042 + }, + { + "epoch": 0.6346212351688469, + "grad_norm": 1.0396449565887451, + "learning_rate": 4.813786440679642e-05, + "loss": 0.4372, + "step": 1043 + }, + { + "epoch": 0.6352296927289321, + "grad_norm": 1.0404661893844604, + "learning_rate": 4.813314700835677e-05, + "loss": 0.4632, + "step": 1044 + }, + { + "epoch": 0.6358381502890174, + "grad_norm": 1.0946401357650757, + "learning_rate": 4.8128423873860894e-05, + "loss": 0.4613, + "step": 1045 + }, + { + "epoch": 0.6364466078491026, + "grad_norm": 1.223552942276001, + "learning_rate": 4.812369500447994e-05, + "loss": 0.475, + "step": 1046 + }, + { + "epoch": 0.6370550654091877, + "grad_norm": 1.0299303531646729, + "learning_rate": 4.8118960401386466e-05, + "loss": 0.4937, + "step": 1047 + }, + { + "epoch": 0.6376635229692729, + "grad_norm": 1.256249189376831, + "learning_rate": 4.811422006575446e-05, + "loss": 0.4967, + "step": 1048 + }, + { + "epoch": 0.6382719805293581, + "grad_norm": 1.0132943391799927, + "learning_rate": 4.810947399875933e-05, + "loss": 0.4349, + "step": 1049 + }, + { + "epoch": 0.6388804380894433, + "grad_norm": 1.097622036933899, + "learning_rate": 4.810472220157789e-05, + "loss": 0.4606, + "step": 1050 + }, + { + "epoch": 0.6394888956495285, + "grad_norm": 1.0819436311721802, + "learning_rate": 4.80999646753884e-05, + "loss": 0.4715, + "step": 1051 + }, + { + "epoch": 0.6400973532096136, + "grad_norm": 1.1682971715927124, + "learning_rate": 4.8095201421370515e-05, + "loss": 0.477, + "step": 1052 + }, + { + "epoch": 0.6407058107696988, + "grad_norm": 0.9915129542350769, + "learning_rate": 4.8090432440705344e-05, + "loss": 0.4397, + "step": 1053 + }, + { + "epoch": 0.641314268329784, + "grad_norm": 1.028359293937683, + "learning_rate": 4.8085657734575387e-05, + "loss": 0.4533, + "step": 1054 + }, + { + "epoch": 0.6419227258898692, + "grad_norm": 1.2199668884277344, + "learning_rate": 4.8080877304164564e-05, + "loss": 0.4632, + "step": 1055 + }, + { + "epoch": 0.6425311834499544, + "grad_norm": 1.3181172609329224, + "learning_rate": 4.807609115065823e-05, + "loss": 0.5162, + "step": 1056 + }, + { + "epoch": 0.6431396410100395, + "grad_norm": 1.1091372966766357, + "learning_rate": 4.8071299275243145e-05, + "loss": 0.4368, + "step": 1057 + }, + { + "epoch": 0.6437480985701247, + "grad_norm": 1.1425464153289795, + "learning_rate": 4.80665016791075e-05, + "loss": 0.4589, + "step": 1058 + }, + { + "epoch": 0.6443565561302099, + "grad_norm": 1.0934760570526123, + "learning_rate": 4.80616983634409e-05, + "loss": 0.4216, + "step": 1059 + }, + { + "epoch": 0.6449650136902951, + "grad_norm": 1.2468703985214233, + "learning_rate": 4.805688932943436e-05, + "loss": 0.4859, + "step": 1060 + }, + { + "epoch": 0.6455734712503803, + "grad_norm": 1.0578359365463257, + "learning_rate": 4.805207457828034e-05, + "loss": 0.4568, + "step": 1061 + }, + { + "epoch": 0.6461819288104654, + "grad_norm": 1.2633030414581299, + "learning_rate": 4.8047254111172665e-05, + "loss": 0.4284, + "step": 1062 + }, + { + "epoch": 0.6467903863705506, + "grad_norm": 1.122072458267212, + "learning_rate": 4.804242792930663e-05, + "loss": 0.4531, + "step": 1063 + }, + { + "epoch": 0.6473988439306358, + "grad_norm": 1.2295277118682861, + "learning_rate": 4.803759603387894e-05, + "loss": 0.5087, + "step": 1064 + }, + { + "epoch": 0.648007301490721, + "grad_norm": 1.3162295818328857, + "learning_rate": 4.803275842608767e-05, + "loss": 0.5562, + "step": 1065 + }, + { + "epoch": 0.6486157590508062, + "grad_norm": 1.1886372566223145, + "learning_rate": 4.802791510713237e-05, + "loss": 0.4722, + "step": 1066 + }, + { + "epoch": 0.6492242166108914, + "grad_norm": 1.1018664836883545, + "learning_rate": 4.802306607821398e-05, + "loss": 0.4603, + "step": 1067 + }, + { + "epoch": 0.6498326741709766, + "grad_norm": 1.2671383619308472, + "learning_rate": 4.8018211340534835e-05, + "loss": 0.506, + "step": 1068 + }, + { + "epoch": 0.6504411317310618, + "grad_norm": 1.1520271301269531, + "learning_rate": 4.8013350895298735e-05, + "loss": 0.465, + "step": 1069 + }, + { + "epoch": 0.651049589291147, + "grad_norm": 1.2675180435180664, + "learning_rate": 4.8008484743710854e-05, + "loss": 0.4874, + "step": 1070 + }, + { + "epoch": 0.6516580468512321, + "grad_norm": 1.073280692100525, + "learning_rate": 4.80036128869778e-05, + "loss": 0.439, + "step": 1071 + }, + { + "epoch": 0.6522665044113173, + "grad_norm": 1.365029215812683, + "learning_rate": 4.7998735326307585e-05, + "loss": 0.3928, + "step": 1072 + }, + { + "epoch": 0.6528749619714025, + "grad_norm": 1.06315016746521, + "learning_rate": 4.799385206290965e-05, + "loss": 0.4725, + "step": 1073 + }, + { + "epoch": 0.6534834195314877, + "grad_norm": 1.0730098485946655, + "learning_rate": 4.798896309799483e-05, + "loss": 0.447, + "step": 1074 + }, + { + "epoch": 0.6540918770915729, + "grad_norm": 1.4145798683166504, + "learning_rate": 4.798406843277538e-05, + "loss": 0.5263, + "step": 1075 + }, + { + "epoch": 0.654700334651658, + "grad_norm": 1.3397396802902222, + "learning_rate": 4.7979168068465e-05, + "loss": 0.4561, + "step": 1076 + }, + { + "epoch": 0.6553087922117432, + "grad_norm": 1.0366169214248657, + "learning_rate": 4.7974262006278745e-05, + "loss": 0.3994, + "step": 1077 + }, + { + "epoch": 0.6559172497718284, + "grad_norm": 1.1666359901428223, + "learning_rate": 4.796935024743313e-05, + "loss": 0.4478, + "step": 1078 + }, + { + "epoch": 0.6565257073319136, + "grad_norm": 1.1900277137756348, + "learning_rate": 4.7964432793146065e-05, + "loss": 0.4719, + "step": 1079 + }, + { + "epoch": 0.6571341648919988, + "grad_norm": 1.5251827239990234, + "learning_rate": 4.795950964463687e-05, + "loss": 0.475, + "step": 1080 + }, + { + "epoch": 0.6577426224520839, + "grad_norm": 1.298077940940857, + "learning_rate": 4.795458080312628e-05, + "loss": 0.4526, + "step": 1081 + }, + { + "epoch": 0.6583510800121691, + "grad_norm": 1.0009061098098755, + "learning_rate": 4.794964626983646e-05, + "loss": 0.3941, + "step": 1082 + }, + { + "epoch": 0.6589595375722543, + "grad_norm": 1.320255994796753, + "learning_rate": 4.794470604599093e-05, + "loss": 0.54, + "step": 1083 + }, + { + "epoch": 0.6595679951323395, + "grad_norm": 1.177507758140564, + "learning_rate": 4.79397601328147e-05, + "loss": 0.4851, + "step": 1084 + }, + { + "epoch": 0.6601764526924248, + "grad_norm": 1.2261627912521362, + "learning_rate": 4.793480853153412e-05, + "loss": 0.4907, + "step": 1085 + }, + { + "epoch": 0.6607849102525098, + "grad_norm": 1.0971055030822754, + "learning_rate": 4.792985124337701e-05, + "loss": 0.4562, + "step": 1086 + }, + { + "epoch": 0.6613933678125951, + "grad_norm": 1.3122934103012085, + "learning_rate": 4.7924888269572545e-05, + "loss": 0.5057, + "step": 1087 + }, + { + "epoch": 0.6620018253726803, + "grad_norm": 1.1012256145477295, + "learning_rate": 4.791991961135135e-05, + "loss": 0.4912, + "step": 1088 + }, + { + "epoch": 0.6626102829327655, + "grad_norm": 1.0547481775283813, + "learning_rate": 4.791494526994544e-05, + "loss": 0.4898, + "step": 1089 + }, + { + "epoch": 0.6632187404928507, + "grad_norm": 1.085845708847046, + "learning_rate": 4.790996524658824e-05, + "loss": 0.4328, + "step": 1090 + }, + { + "epoch": 0.6638271980529358, + "grad_norm": 1.1783792972564697, + "learning_rate": 4.790497954251459e-05, + "loss": 0.4246, + "step": 1091 + }, + { + "epoch": 0.664435655613021, + "grad_norm": 1.1917533874511719, + "learning_rate": 4.789998815896075e-05, + "loss": 0.494, + "step": 1092 + }, + { + "epoch": 0.6650441131731062, + "grad_norm": 1.0818932056427002, + "learning_rate": 4.7894991097164366e-05, + "loss": 0.5001, + "step": 1093 + }, + { + "epoch": 0.6656525707331914, + "grad_norm": 1.1833261251449585, + "learning_rate": 4.788998835836449e-05, + "loss": 0.5394, + "step": 1094 + }, + { + "epoch": 0.6662610282932765, + "grad_norm": 1.4886353015899658, + "learning_rate": 4.788497994380162e-05, + "loss": 0.3904, + "step": 1095 + }, + { + "epoch": 0.6668694858533617, + "grad_norm": 1.6799591779708862, + "learning_rate": 4.78799658547176e-05, + "loss": 0.478, + "step": 1096 + }, + { + "epoch": 0.6674779434134469, + "grad_norm": 1.1299816370010376, + "learning_rate": 4.787494609235575e-05, + "loss": 0.5295, + "step": 1097 + }, + { + "epoch": 0.6680864009735321, + "grad_norm": 1.024692177772522, + "learning_rate": 4.786992065796072e-05, + "loss": 0.4684, + "step": 1098 + }, + { + "epoch": 0.6686948585336173, + "grad_norm": 1.1255176067352295, + "learning_rate": 4.786488955277865e-05, + "loss": 0.455, + "step": 1099 + }, + { + "epoch": 0.6693033160937024, + "grad_norm": 1.1396479606628418, + "learning_rate": 4.7859852778057016e-05, + "loss": 0.5006, + "step": 1100 + }, + { + "epoch": 0.6699117736537876, + "grad_norm": 1.0285197496414185, + "learning_rate": 4.7854810335044745e-05, + "loss": 0.4194, + "step": 1101 + }, + { + "epoch": 0.6705202312138728, + "grad_norm": 0.9936107993125916, + "learning_rate": 4.7849762224992144e-05, + "loss": 0.4164, + "step": 1102 + }, + { + "epoch": 0.671128688773958, + "grad_norm": 1.3650137186050415, + "learning_rate": 4.784470844915093e-05, + "loss": 0.4367, + "step": 1103 + }, + { + "epoch": 0.6717371463340432, + "grad_norm": 1.2089672088623047, + "learning_rate": 4.783964900877425e-05, + "loss": 0.491, + "step": 1104 + }, + { + "epoch": 0.6723456038941283, + "grad_norm": 1.2130366563796997, + "learning_rate": 4.78345839051166e-05, + "loss": 0.4589, + "step": 1105 + }, + { + "epoch": 0.6729540614542135, + "grad_norm": 1.3927088975906372, + "learning_rate": 4.782951313943395e-05, + "loss": 0.5312, + "step": 1106 + }, + { + "epoch": 0.6735625190142988, + "grad_norm": 1.0126924514770508, + "learning_rate": 4.782443671298362e-05, + "loss": 0.4162, + "step": 1107 + }, + { + "epoch": 0.674170976574384, + "grad_norm": 1.0403350591659546, + "learning_rate": 4.781935462702435e-05, + "loss": 0.407, + "step": 1108 + }, + { + "epoch": 0.6747794341344692, + "grad_norm": 1.1724456548690796, + "learning_rate": 4.7814266882816296e-05, + "loss": 0.4615, + "step": 1109 + }, + { + "epoch": 0.6753878916945543, + "grad_norm": 1.178318977355957, + "learning_rate": 4.780917348162099e-05, + "loss": 0.5085, + "step": 1110 + }, + { + "epoch": 0.6759963492546395, + "grad_norm": 1.088005542755127, + "learning_rate": 4.7804074424701406e-05, + "loss": 0.4744, + "step": 1111 + }, + { + "epoch": 0.6766048068147247, + "grad_norm": 1.1180678606033325, + "learning_rate": 4.7798969713321874e-05, + "loss": 0.4814, + "step": 1112 + }, + { + "epoch": 0.6772132643748099, + "grad_norm": 1.1669161319732666, + "learning_rate": 4.779385934874817e-05, + "loss": 0.4187, + "step": 1113 + }, + { + "epoch": 0.6778217219348951, + "grad_norm": 1.0521961450576782, + "learning_rate": 4.7788743332247437e-05, + "loss": 0.459, + "step": 1114 + }, + { + "epoch": 0.6784301794949802, + "grad_norm": 1.1008024215698242, + "learning_rate": 4.778362166508824e-05, + "loss": 0.5048, + "step": 1115 + }, + { + "epoch": 0.6790386370550654, + "grad_norm": 1.0038844347000122, + "learning_rate": 4.777849434854054e-05, + "loss": 0.3917, + "step": 1116 + }, + { + "epoch": 0.6796470946151506, + "grad_norm": 1.1488128900527954, + "learning_rate": 4.7773361383875697e-05, + "loss": 0.4351, + "step": 1117 + }, + { + "epoch": 0.6802555521752358, + "grad_norm": 1.1399554014205933, + "learning_rate": 4.7768222772366466e-05, + "loss": 0.4365, + "step": 1118 + }, + { + "epoch": 0.6808640097353209, + "grad_norm": 1.3681772947311401, + "learning_rate": 4.776307851528702e-05, + "loss": 0.5406, + "step": 1119 + }, + { + "epoch": 0.6814724672954061, + "grad_norm": 1.2079730033874512, + "learning_rate": 4.7757928613912914e-05, + "loss": 0.4459, + "step": 1120 + }, + { + "epoch": 0.6820809248554913, + "grad_norm": 1.1010768413543701, + "learning_rate": 4.7752773069521104e-05, + "loss": 0.42, + "step": 1121 + }, + { + "epoch": 0.6826893824155765, + "grad_norm": 1.095807433128357, + "learning_rate": 4.774761188338995e-05, + "loss": 0.4267, + "step": 1122 + }, + { + "epoch": 0.6832978399756617, + "grad_norm": 1.1266393661499023, + "learning_rate": 4.774244505679923e-05, + "loss": 0.4663, + "step": 1123 + }, + { + "epoch": 0.6839062975357468, + "grad_norm": 1.1465984582901, + "learning_rate": 4.773727259103008e-05, + "loss": 0.4362, + "step": 1124 + }, + { + "epoch": 0.684514755095832, + "grad_norm": 1.1885544061660767, + "learning_rate": 4.7732094487365065e-05, + "loss": 0.4638, + "step": 1125 + }, + { + "epoch": 0.6851232126559172, + "grad_norm": 1.164554476737976, + "learning_rate": 4.772691074708814e-05, + "loss": 0.409, + "step": 1126 + }, + { + "epoch": 0.6857316702160025, + "grad_norm": 1.0508933067321777, + "learning_rate": 4.7721721371484654e-05, + "loss": 0.4334, + "step": 1127 + }, + { + "epoch": 0.6863401277760877, + "grad_norm": 1.0762606859207153, + "learning_rate": 4.771652636184135e-05, + "loss": 0.4735, + "step": 1128 + }, + { + "epoch": 0.6869485853361728, + "grad_norm": 1.0883415937423706, + "learning_rate": 4.771132571944639e-05, + "loss": 0.4749, + "step": 1129 + }, + { + "epoch": 0.687557042896258, + "grad_norm": 1.061272144317627, + "learning_rate": 4.770611944558929e-05, + "loss": 0.4797, + "step": 1130 + }, + { + "epoch": 0.6881655004563432, + "grad_norm": 1.0837888717651367, + "learning_rate": 4.770090754156102e-05, + "loss": 0.516, + "step": 1131 + }, + { + "epoch": 0.6887739580164284, + "grad_norm": 1.197726845741272, + "learning_rate": 4.7695690008653896e-05, + "loss": 0.4487, + "step": 1132 + }, + { + "epoch": 0.6893824155765136, + "grad_norm": 0.9252657294273376, + "learning_rate": 4.769046684816165e-05, + "loss": 0.3675, + "step": 1133 + }, + { + "epoch": 0.6899908731365987, + "grad_norm": 1.079782247543335, + "learning_rate": 4.768523806137941e-05, + "loss": 0.4489, + "step": 1134 + }, + { + "epoch": 0.6905993306966839, + "grad_norm": 1.2456952333450317, + "learning_rate": 4.76800036496037e-05, + "loss": 0.5507, + "step": 1135 + }, + { + "epoch": 0.6912077882567691, + "grad_norm": 1.1496522426605225, + "learning_rate": 4.7674763614132434e-05, + "loss": 0.496, + "step": 1136 + }, + { + "epoch": 0.6918162458168543, + "grad_norm": 1.1047946214675903, + "learning_rate": 4.766951795626493e-05, + "loss": 0.4656, + "step": 1137 + }, + { + "epoch": 0.6924247033769395, + "grad_norm": 1.0433160066604614, + "learning_rate": 4.7664266677301874e-05, + "loss": 0.4197, + "step": 1138 + }, + { + "epoch": 0.6930331609370246, + "grad_norm": 1.2909495830535889, + "learning_rate": 4.7659009778545384e-05, + "loss": 0.4733, + "step": 1139 + }, + { + "epoch": 0.6936416184971098, + "grad_norm": 1.110093355178833, + "learning_rate": 4.765374726129893e-05, + "loss": 0.4824, + "step": 1140 + }, + { + "epoch": 0.694250076057195, + "grad_norm": 1.285891056060791, + "learning_rate": 4.764847912686742e-05, + "loss": 0.485, + "step": 1141 + }, + { + "epoch": 0.6948585336172802, + "grad_norm": 1.0111443996429443, + "learning_rate": 4.764320537655712e-05, + "loss": 0.4081, + "step": 1142 + }, + { + "epoch": 0.6954669911773653, + "grad_norm": 1.2149741649627686, + "learning_rate": 4.76379260116757e-05, + "loss": 0.5088, + "step": 1143 + }, + { + "epoch": 0.6960754487374505, + "grad_norm": 1.2215123176574707, + "learning_rate": 4.7632641033532226e-05, + "loss": 0.4491, + "step": 1144 + }, + { + "epoch": 0.6966839062975357, + "grad_norm": 1.1258493661880493, + "learning_rate": 4.762735044343715e-05, + "loss": 0.4473, + "step": 1145 + }, + { + "epoch": 0.697292363857621, + "grad_norm": 1.2326325178146362, + "learning_rate": 4.7622054242702316e-05, + "loss": 0.4819, + "step": 1146 + }, + { + "epoch": 0.6979008214177062, + "grad_norm": 1.06281316280365, + "learning_rate": 4.761675243264097e-05, + "loss": 0.4228, + "step": 1147 + }, + { + "epoch": 0.6985092789777912, + "grad_norm": 1.1261435747146606, + "learning_rate": 4.761144501456773e-05, + "loss": 0.4255, + "step": 1148 + }, + { + "epoch": 0.6991177365378765, + "grad_norm": 1.1780974864959717, + "learning_rate": 4.760613198979862e-05, + "loss": 0.4302, + "step": 1149 + }, + { + "epoch": 0.6997261940979617, + "grad_norm": 1.119859218597412, + "learning_rate": 4.760081335965104e-05, + "loss": 0.4293, + "step": 1150 + }, + { + "epoch": 0.7003346516580469, + "grad_norm": 1.2447001934051514, + "learning_rate": 4.75954891254438e-05, + "loss": 0.4851, + "step": 1151 + }, + { + "epoch": 0.7009431092181321, + "grad_norm": 1.1865140199661255, + "learning_rate": 4.759015928849709e-05, + "loss": 0.4618, + "step": 1152 + }, + { + "epoch": 0.7015515667782172, + "grad_norm": 1.0534874200820923, + "learning_rate": 4.758482385013247e-05, + "loss": 0.4203, + "step": 1153 + }, + { + "epoch": 0.7021600243383024, + "grad_norm": 0.9607533812522888, + "learning_rate": 4.757948281167292e-05, + "loss": 0.368, + "step": 1154 + }, + { + "epoch": 0.7027684818983876, + "grad_norm": 1.1073296070098877, + "learning_rate": 4.75741361744428e-05, + "loss": 0.4375, + "step": 1155 + }, + { + "epoch": 0.7033769394584728, + "grad_norm": 1.2616397142410278, + "learning_rate": 4.756878393976783e-05, + "loss": 0.5214, + "step": 1156 + }, + { + "epoch": 0.703985397018558, + "grad_norm": 1.152910590171814, + "learning_rate": 4.756342610897517e-05, + "loss": 0.5083, + "step": 1157 + }, + { + "epoch": 0.7045938545786431, + "grad_norm": 1.2775417566299438, + "learning_rate": 4.7558062683393314e-05, + "loss": 0.5371, + "step": 1158 + }, + { + "epoch": 0.7052023121387283, + "grad_norm": 1.0952919721603394, + "learning_rate": 4.755269366435219e-05, + "loss": 0.4826, + "step": 1159 + }, + { + "epoch": 0.7058107696988135, + "grad_norm": 1.0538053512573242, + "learning_rate": 4.754731905318307e-05, + "loss": 0.4789, + "step": 1160 + }, + { + "epoch": 0.7064192272588987, + "grad_norm": 1.293050765991211, + "learning_rate": 4.754193885121865e-05, + "loss": 0.5337, + "step": 1161 + }, + { + "epoch": 0.7070276848189839, + "grad_norm": 1.0085028409957886, + "learning_rate": 4.7536553059792984e-05, + "loss": 0.4581, + "step": 1162 + }, + { + "epoch": 0.707636142379069, + "grad_norm": 1.063040852546692, + "learning_rate": 4.753116168024153e-05, + "loss": 0.5102, + "step": 1163 + }, + { + "epoch": 0.7082445999391542, + "grad_norm": 0.9961603283882141, + "learning_rate": 4.752576471390112e-05, + "loss": 0.4818, + "step": 1164 + }, + { + "epoch": 0.7088530574992394, + "grad_norm": 1.0528628826141357, + "learning_rate": 4.7520362162109986e-05, + "loss": 0.4396, + "step": 1165 + }, + { + "epoch": 0.7094615150593246, + "grad_norm": 1.2712112665176392, + "learning_rate": 4.751495402620774e-05, + "loss": 0.496, + "step": 1166 + }, + { + "epoch": 0.7100699726194099, + "grad_norm": 1.0716404914855957, + "learning_rate": 4.750954030753535e-05, + "loss": 0.4065, + "step": 1167 + }, + { + "epoch": 0.710678430179495, + "grad_norm": 1.2594822645187378, + "learning_rate": 4.7504121007435224e-05, + "loss": 0.5146, + "step": 1168 + }, + { + "epoch": 0.7112868877395802, + "grad_norm": 1.2636467218399048, + "learning_rate": 4.749869612725108e-05, + "loss": 0.5225, + "step": 1169 + }, + { + "epoch": 0.7118953452996654, + "grad_norm": 1.1434320211410522, + "learning_rate": 4.749326566832811e-05, + "loss": 0.4127, + "step": 1170 + }, + { + "epoch": 0.7125038028597506, + "grad_norm": 1.0728880167007446, + "learning_rate": 4.7487829632012816e-05, + "loss": 0.4583, + "step": 1171 + }, + { + "epoch": 0.7131122604198357, + "grad_norm": 1.2107247114181519, + "learning_rate": 4.7482388019653114e-05, + "loss": 0.4255, + "step": 1172 + }, + { + "epoch": 0.7137207179799209, + "grad_norm": 1.1021230220794678, + "learning_rate": 4.7476940832598295e-05, + "loss": 0.4829, + "step": 1173 + }, + { + "epoch": 0.7143291755400061, + "grad_norm": 1.2605928182601929, + "learning_rate": 4.747148807219902e-05, + "loss": 0.4636, + "step": 1174 + }, + { + "epoch": 0.7149376331000913, + "grad_norm": 1.0456511974334717, + "learning_rate": 4.746602973980738e-05, + "loss": 0.4248, + "step": 1175 + }, + { + "epoch": 0.7155460906601765, + "grad_norm": 1.1637083292007446, + "learning_rate": 4.746056583677678e-05, + "loss": 0.4544, + "step": 1176 + }, + { + "epoch": 0.7161545482202616, + "grad_norm": 1.1425434350967407, + "learning_rate": 4.745509636446207e-05, + "loss": 0.4557, + "step": 1177 + }, + { + "epoch": 0.7167630057803468, + "grad_norm": 1.1743497848510742, + "learning_rate": 4.744962132421943e-05, + "loss": 0.425, + "step": 1178 + }, + { + "epoch": 0.717371463340432, + "grad_norm": 1.1483440399169922, + "learning_rate": 4.744414071740644e-05, + "loss": 0.4819, + "step": 1179 + }, + { + "epoch": 0.7179799209005172, + "grad_norm": 1.0176396369934082, + "learning_rate": 4.7438654545382076e-05, + "loss": 0.4357, + "step": 1180 + }, + { + "epoch": 0.7185883784606024, + "grad_norm": 1.065658450126648, + "learning_rate": 4.743316280950667e-05, + "loss": 0.4108, + "step": 1181 + }, + { + "epoch": 0.7191968360206875, + "grad_norm": 0.9383304119110107, + "learning_rate": 4.7427665511141955e-05, + "loss": 0.3916, + "step": 1182 + }, + { + "epoch": 0.7198052935807727, + "grad_norm": 1.0549168586730957, + "learning_rate": 4.7422162651651026e-05, + "loss": 0.4165, + "step": 1183 + }, + { + "epoch": 0.7204137511408579, + "grad_norm": 1.241140365600586, + "learning_rate": 4.741665423239835e-05, + "loss": 0.4203, + "step": 1184 + }, + { + "epoch": 0.7210222087009431, + "grad_norm": 1.204376220703125, + "learning_rate": 4.741114025474981e-05, + "loss": 0.4497, + "step": 1185 + }, + { + "epoch": 0.7216306662610283, + "grad_norm": 1.0280909538269043, + "learning_rate": 4.7405620720072616e-05, + "loss": 0.4068, + "step": 1186 + }, + { + "epoch": 0.7222391238211134, + "grad_norm": 1.3876770734786987, + "learning_rate": 4.74000956297354e-05, + "loss": 0.4713, + "step": 1187 + }, + { + "epoch": 0.7228475813811986, + "grad_norm": 1.0588303804397583, + "learning_rate": 4.739456498510815e-05, + "loss": 0.4042, + "step": 1188 + }, + { + "epoch": 0.7234560389412839, + "grad_norm": 1.0324196815490723, + "learning_rate": 4.738902878756224e-05, + "loss": 0.469, + "step": 1189 + }, + { + "epoch": 0.7240644965013691, + "grad_norm": 0.9513891935348511, + "learning_rate": 4.738348703847041e-05, + "loss": 0.4032, + "step": 1190 + }, + { + "epoch": 0.7246729540614543, + "grad_norm": 1.0217005014419556, + "learning_rate": 4.737793973920678e-05, + "loss": 0.4112, + "step": 1191 + }, + { + "epoch": 0.7252814116215394, + "grad_norm": 1.1146938800811768, + "learning_rate": 4.737238689114686e-05, + "loss": 0.4397, + "step": 1192 + }, + { + "epoch": 0.7258898691816246, + "grad_norm": 1.017473816871643, + "learning_rate": 4.736682849566751e-05, + "loss": 0.4661, + "step": 1193 + }, + { + "epoch": 0.7264983267417098, + "grad_norm": 1.3287296295166016, + "learning_rate": 4.736126455414699e-05, + "loss": 0.4159, + "step": 1194 + }, + { + "epoch": 0.727106784301795, + "grad_norm": 1.1023236513137817, + "learning_rate": 4.7355695067964925e-05, + "loss": 0.425, + "step": 1195 + }, + { + "epoch": 0.7277152418618801, + "grad_norm": 1.1281861066818237, + "learning_rate": 4.735012003850232e-05, + "loss": 0.4161, + "step": 1196 + }, + { + "epoch": 0.7283236994219653, + "grad_norm": 1.031778335571289, + "learning_rate": 4.734453946714154e-05, + "loss": 0.4355, + "step": 1197 + }, + { + "epoch": 0.7289321569820505, + "grad_norm": 1.101798176765442, + "learning_rate": 4.733895335526633e-05, + "loss": 0.4299, + "step": 1198 + }, + { + "epoch": 0.7295406145421357, + "grad_norm": 1.27028489112854, + "learning_rate": 4.7333361704261834e-05, + "loss": 0.4291, + "step": 1199 + }, + { + "epoch": 0.7301490721022209, + "grad_norm": 1.2124416828155518, + "learning_rate": 4.732776451551453e-05, + "loss": 0.4873, + "step": 1200 + }, + { + "epoch": 0.730757529662306, + "grad_norm": 1.1454896926879883, + "learning_rate": 4.73221617904123e-05, + "loss": 0.4799, + "step": 1201 + }, + { + "epoch": 0.7313659872223912, + "grad_norm": 1.1555596590042114, + "learning_rate": 4.731655353034437e-05, + "loss": 0.465, + "step": 1202 + }, + { + "epoch": 0.7319744447824764, + "grad_norm": 1.1674630641937256, + "learning_rate": 4.7310939736701364e-05, + "loss": 0.4619, + "step": 1203 + }, + { + "epoch": 0.7325829023425616, + "grad_norm": 1.320896863937378, + "learning_rate": 4.7305320410875277e-05, + "loss": 0.4513, + "step": 1204 + }, + { + "epoch": 0.7331913599026468, + "grad_norm": 3.161492347717285, + "learning_rate": 4.7299695554259455e-05, + "loss": 0.512, + "step": 1205 + }, + { + "epoch": 0.7337998174627319, + "grad_norm": 1.1867998838424683, + "learning_rate": 4.729406516824864e-05, + "loss": 0.4125, + "step": 1206 + }, + { + "epoch": 0.7344082750228171, + "grad_norm": 1.1488641500473022, + "learning_rate": 4.7288429254238917e-05, + "loss": 0.4309, + "step": 1207 + }, + { + "epoch": 0.7350167325829023, + "grad_norm": 1.1211802959442139, + "learning_rate": 4.728278781362777e-05, + "loss": 0.4359, + "step": 1208 + }, + { + "epoch": 0.7356251901429876, + "grad_norm": 0.9918986558914185, + "learning_rate": 4.7277140847814025e-05, + "loss": 0.4088, + "step": 1209 + }, + { + "epoch": 0.7362336477030728, + "grad_norm": 1.1625863313674927, + "learning_rate": 4.7271488358197924e-05, + "loss": 0.424, + "step": 1210 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.432852029800415, + "learning_rate": 4.726583034618103e-05, + "loss": 0.4789, + "step": 1211 + }, + { + "epoch": 0.7374505628232431, + "grad_norm": 1.154141902923584, + "learning_rate": 4.7260166813166285e-05, + "loss": 0.4522, + "step": 1212 + }, + { + "epoch": 0.7380590203833283, + "grad_norm": 1.013265609741211, + "learning_rate": 4.7254497760558024e-05, + "loss": 0.3628, + "step": 1213 + }, + { + "epoch": 0.7386674779434135, + "grad_norm": 1.169954538345337, + "learning_rate": 4.724882318976194e-05, + "loss": 0.4394, + "step": 1214 + }, + { + "epoch": 0.7392759355034987, + "grad_norm": 1.1213183403015137, + "learning_rate": 4.724314310218507e-05, + "loss": 0.4598, + "step": 1215 + }, + { + "epoch": 0.7398843930635838, + "grad_norm": 1.1155545711517334, + "learning_rate": 4.723745749923586e-05, + "loss": 0.4315, + "step": 1216 + }, + { + "epoch": 0.740492850623669, + "grad_norm": 1.197703242301941, + "learning_rate": 4.723176638232408e-05, + "loss": 0.4351, + "step": 1217 + }, + { + "epoch": 0.7411013081837542, + "grad_norm": 1.4502240419387817, + "learning_rate": 4.7226069752860915e-05, + "loss": 0.47, + "step": 1218 + }, + { + "epoch": 0.7417097657438394, + "grad_norm": 1.4130961894989014, + "learning_rate": 4.722036761225888e-05, + "loss": 0.5292, + "step": 1219 + }, + { + "epoch": 0.7423182233039245, + "grad_norm": 1.2829434871673584, + "learning_rate": 4.7214659961931864e-05, + "loss": 0.4497, + "step": 1220 + }, + { + "epoch": 0.7429266808640097, + "grad_norm": 1.1895910501480103, + "learning_rate": 4.720894680329513e-05, + "loss": 0.3774, + "step": 1221 + }, + { + "epoch": 0.7435351384240949, + "grad_norm": 1.1315902471542358, + "learning_rate": 4.720322813776531e-05, + "loss": 0.4519, + "step": 1222 + }, + { + "epoch": 0.7441435959841801, + "grad_norm": 1.225305199623108, + "learning_rate": 4.7197503966760375e-05, + "loss": 0.5347, + "step": 1223 + }, + { + "epoch": 0.7447520535442653, + "grad_norm": 1.2546653747558594, + "learning_rate": 4.7191774291699695e-05, + "loss": 0.4597, + "step": 1224 + }, + { + "epoch": 0.7453605111043504, + "grad_norm": 0.880062997341156, + "learning_rate": 4.7186039114004e-05, + "loss": 0.3922, + "step": 1225 + }, + { + "epoch": 0.7459689686644356, + "grad_norm": 1.0395137071609497, + "learning_rate": 4.718029843509536e-05, + "loss": 0.4635, + "step": 1226 + }, + { + "epoch": 0.7465774262245208, + "grad_norm": 1.2031601667404175, + "learning_rate": 4.717455225639723e-05, + "loss": 0.3788, + "step": 1227 + }, + { + "epoch": 0.747185883784606, + "grad_norm": 0.9248659014701843, + "learning_rate": 4.716880057933441e-05, + "loss": 0.4171, + "step": 1228 + }, + { + "epoch": 0.7477943413446912, + "grad_norm": 0.9494261145591736, + "learning_rate": 4.71630434053331e-05, + "loss": 0.3695, + "step": 1229 + }, + { + "epoch": 0.7484027989047763, + "grad_norm": 1.4120838642120361, + "learning_rate": 4.715728073582082e-05, + "loss": 0.4758, + "step": 1230 + }, + { + "epoch": 0.7490112564648616, + "grad_norm": 1.4075794219970703, + "learning_rate": 4.715151257222649e-05, + "loss": 0.4857, + "step": 1231 + }, + { + "epoch": 0.7496197140249468, + "grad_norm": 1.2198188304901123, + "learning_rate": 4.7145738915980354e-05, + "loss": 0.4428, + "step": 1232 + }, + { + "epoch": 0.750228171585032, + "grad_norm": 1.3420535326004028, + "learning_rate": 4.7139959768514044e-05, + "loss": 0.4027, + "step": 1233 + }, + { + "epoch": 0.7508366291451172, + "grad_norm": 1.2325292825698853, + "learning_rate": 4.713417513126055e-05, + "loss": 0.4469, + "step": 1234 + }, + { + "epoch": 0.7514450867052023, + "grad_norm": 1.3523552417755127, + "learning_rate": 4.712838500565423e-05, + "loss": 0.4269, + "step": 1235 + }, + { + "epoch": 0.7520535442652875, + "grad_norm": 1.099088191986084, + "learning_rate": 4.712258939313078e-05, + "loss": 0.4336, + "step": 1236 + }, + { + "epoch": 0.7526620018253727, + "grad_norm": 1.248347520828247, + "learning_rate": 4.7116788295127275e-05, + "loss": 0.3627, + "step": 1237 + }, + { + "epoch": 0.7532704593854579, + "grad_norm": 1.3930177688598633, + "learning_rate": 4.711098171308214e-05, + "loss": 0.4765, + "step": 1238 + }, + { + "epoch": 0.7538789169455431, + "grad_norm": 0.9983875155448914, + "learning_rate": 4.7105169648435176e-05, + "loss": 0.4179, + "step": 1239 + }, + { + "epoch": 0.7544873745056282, + "grad_norm": 1.0693212747573853, + "learning_rate": 4.7099352102627536e-05, + "loss": 0.4764, + "step": 1240 + }, + { + "epoch": 0.7550958320657134, + "grad_norm": 1.1990723609924316, + "learning_rate": 4.7093529077101714e-05, + "loss": 0.4924, + "step": 1241 + }, + { + "epoch": 0.7557042896257986, + "grad_norm": 1.0372933149337769, + "learning_rate": 4.7087700573301585e-05, + "loss": 0.4129, + "step": 1242 + }, + { + "epoch": 0.7563127471858838, + "grad_norm": 1.0057023763656616, + "learning_rate": 4.7081866592672376e-05, + "loss": 0.4319, + "step": 1243 + }, + { + "epoch": 0.756921204745969, + "grad_norm": 1.0604325532913208, + "learning_rate": 4.7076027136660663e-05, + "loss": 0.4533, + "step": 1244 + }, + { + "epoch": 0.7575296623060541, + "grad_norm": 1.1443201303482056, + "learning_rate": 4.70701822067144e-05, + "loss": 0.4355, + "step": 1245 + }, + { + "epoch": 0.7581381198661393, + "grad_norm": 1.1489462852478027, + "learning_rate": 4.706433180428288e-05, + "loss": 0.4098, + "step": 1246 + }, + { + "epoch": 0.7587465774262245, + "grad_norm": 1.0587549209594727, + "learning_rate": 4.705847593081676e-05, + "loss": 0.3956, + "step": 1247 + }, + { + "epoch": 0.7593550349863097, + "grad_norm": 1.0858579874038696, + "learning_rate": 4.705261458776805e-05, + "loss": 0.4166, + "step": 1248 + }, + { + "epoch": 0.7599634925463948, + "grad_norm": 1.7913252115249634, + "learning_rate": 4.704674777659012e-05, + "loss": 0.4482, + "step": 1249 + }, + { + "epoch": 0.76057195010648, + "grad_norm": 1.3128273487091064, + "learning_rate": 4.70408754987377e-05, + "loss": 0.4596, + "step": 1250 + }, + { + "epoch": 0.7611804076665653, + "grad_norm": 1.0580440759658813, + "learning_rate": 4.703499775566686e-05, + "loss": 0.4517, + "step": 1251 + }, + { + "epoch": 0.7617888652266505, + "grad_norm": 1.076398491859436, + "learning_rate": 4.702911454883504e-05, + "loss": 0.3933, + "step": 1252 + }, + { + "epoch": 0.7623973227867357, + "grad_norm": 1.267801284790039, + "learning_rate": 4.702322587970104e-05, + "loss": 0.4953, + "step": 1253 + }, + { + "epoch": 0.7630057803468208, + "grad_norm": 0.9255053400993347, + "learning_rate": 4.701733174972498e-05, + "loss": 0.3944, + "step": 1254 + }, + { + "epoch": 0.763614237906906, + "grad_norm": 0.9950581192970276, + "learning_rate": 4.7011432160368385e-05, + "loss": 0.4811, + "step": 1255 + }, + { + "epoch": 0.7642226954669912, + "grad_norm": 1.2969601154327393, + "learning_rate": 4.7005527113094094e-05, + "loss": 0.4454, + "step": 1256 + }, + { + "epoch": 0.7648311530270764, + "grad_norm": 1.045771837234497, + "learning_rate": 4.699961660936631e-05, + "loss": 0.4248, + "step": 1257 + }, + { + "epoch": 0.7654396105871616, + "grad_norm": 1.1103192567825317, + "learning_rate": 4.69937006506506e-05, + "loss": 0.402, + "step": 1258 + }, + { + "epoch": 0.7660480681472467, + "grad_norm": 1.0572874546051025, + "learning_rate": 4.698777923841386e-05, + "loss": 0.4319, + "step": 1259 + }, + { + "epoch": 0.7666565257073319, + "grad_norm": 1.1772733926773071, + "learning_rate": 4.6981852374124384e-05, + "loss": 0.4311, + "step": 1260 + }, + { + "epoch": 0.7672649832674171, + "grad_norm": 1.0701271295547485, + "learning_rate": 4.697592005925176e-05, + "loss": 0.4412, + "step": 1261 + }, + { + "epoch": 0.7678734408275023, + "grad_norm": 1.0132157802581787, + "learning_rate": 4.696998229526696e-05, + "loss": 0.3985, + "step": 1262 + }, + { + "epoch": 0.7684818983875875, + "grad_norm": 1.0832394361495972, + "learning_rate": 4.696403908364231e-05, + "loss": 0.4161, + "step": 1263 + }, + { + "epoch": 0.7690903559476726, + "grad_norm": 1.0529204607009888, + "learning_rate": 4.6958090425851465e-05, + "loss": 0.5481, + "step": 1264 + }, + { + "epoch": 0.7696988135077578, + "grad_norm": 1.1204801797866821, + "learning_rate": 4.6952136323369463e-05, + "loss": 0.4214, + "step": 1265 + }, + { + "epoch": 0.770307271067843, + "grad_norm": 1.0171352624893188, + "learning_rate": 4.6946176777672654e-05, + "loss": 0.4146, + "step": 1266 + }, + { + "epoch": 0.7709157286279282, + "grad_norm": 0.9407910704612732, + "learning_rate": 4.694021179023877e-05, + "loss": 0.4224, + "step": 1267 + }, + { + "epoch": 0.7715241861880134, + "grad_norm": 1.1776046752929688, + "learning_rate": 4.6934241362546874e-05, + "loss": 0.5192, + "step": 1268 + }, + { + "epoch": 0.7721326437480985, + "grad_norm": 1.0098623037338257, + "learning_rate": 4.692826549607738e-05, + "loss": 0.4352, + "step": 1269 + }, + { + "epoch": 0.7727411013081837, + "grad_norm": 1.0229098796844482, + "learning_rate": 4.6922284192312074e-05, + "loss": 0.4095, + "step": 1270 + }, + { + "epoch": 0.773349558868269, + "grad_norm": 1.0688673257827759, + "learning_rate": 4.691629745273404e-05, + "loss": 0.4093, + "step": 1271 + }, + { + "epoch": 0.7739580164283542, + "grad_norm": 1.7956994771957397, + "learning_rate": 4.691030527882776e-05, + "loss": 0.4594, + "step": 1272 + }, + { + "epoch": 0.7745664739884393, + "grad_norm": 1.1307945251464844, + "learning_rate": 4.690430767207903e-05, + "loss": 0.4544, + "step": 1273 + }, + { + "epoch": 0.7751749315485245, + "grad_norm": 0.9435398578643799, + "learning_rate": 4.689830463397502e-05, + "loss": 0.3981, + "step": 1274 + }, + { + "epoch": 0.7757833891086097, + "grad_norm": 1.1967155933380127, + "learning_rate": 4.689229616600422e-05, + "loss": 0.4067, + "step": 1275 + }, + { + "epoch": 0.7763918466686949, + "grad_norm": 1.2077351808547974, + "learning_rate": 4.68862822696565e-05, + "loss": 0.4763, + "step": 1276 + }, + { + "epoch": 0.7770003042287801, + "grad_norm": 1.1517752408981323, + "learning_rate": 4.688026294642303e-05, + "loss": 0.4822, + "step": 1277 + }, + { + "epoch": 0.7776087617888652, + "grad_norm": 1.3446024656295776, + "learning_rate": 4.687423819779637e-05, + "loss": 0.3934, + "step": 1278 + }, + { + "epoch": 0.7782172193489504, + "grad_norm": 0.9664615988731384, + "learning_rate": 4.6868208025270396e-05, + "loss": 0.3821, + "step": 1279 + }, + { + "epoch": 0.7788256769090356, + "grad_norm": 1.1611120700836182, + "learning_rate": 4.6862172430340344e-05, + "loss": 0.4274, + "step": 1280 + }, + { + "epoch": 0.7794341344691208, + "grad_norm": 1.4211469888687134, + "learning_rate": 4.6856131414502795e-05, + "loss": 0.5036, + "step": 1281 + }, + { + "epoch": 0.780042592029206, + "grad_norm": 1.0818086862564087, + "learning_rate": 4.685008497925566e-05, + "loss": 0.4481, + "step": 1282 + }, + { + "epoch": 0.7806510495892911, + "grad_norm": 1.3467516899108887, + "learning_rate": 4.6844033126098206e-05, + "loss": 0.4211, + "step": 1283 + }, + { + "epoch": 0.7812595071493763, + "grad_norm": 1.0704116821289062, + "learning_rate": 4.683797585653104e-05, + "loss": 0.4344, + "step": 1284 + }, + { + "epoch": 0.7818679647094615, + "grad_norm": 1.2070298194885254, + "learning_rate": 4.683191317205612e-05, + "loss": 0.4463, + "step": 1285 + }, + { + "epoch": 0.7824764222695467, + "grad_norm": 1.0552793741226196, + "learning_rate": 4.682584507417672e-05, + "loss": 0.4259, + "step": 1286 + }, + { + "epoch": 0.7830848798296319, + "grad_norm": 1.0915594100952148, + "learning_rate": 4.6819771564397496e-05, + "loss": 0.4085, + "step": 1287 + }, + { + "epoch": 0.783693337389717, + "grad_norm": 1.2251918315887451, + "learning_rate": 4.681369264422441e-05, + "loss": 0.4623, + "step": 1288 + }, + { + "epoch": 0.7843017949498022, + "grad_norm": 1.0790926218032837, + "learning_rate": 4.68076083151648e-05, + "loss": 0.454, + "step": 1289 + }, + { + "epoch": 0.7849102525098874, + "grad_norm": 1.0661593675613403, + "learning_rate": 4.680151857872731e-05, + "loss": 0.406, + "step": 1290 + }, + { + "epoch": 0.7855187100699726, + "grad_norm": 0.9886572957038879, + "learning_rate": 4.6795423436421934e-05, + "loss": 0.4058, + "step": 1291 + }, + { + "epoch": 0.7861271676300579, + "grad_norm": 1.0567951202392578, + "learning_rate": 4.678932288976004e-05, + "loss": 0.4134, + "step": 1292 + }, + { + "epoch": 0.786735625190143, + "grad_norm": 1.0652602910995483, + "learning_rate": 4.678321694025428e-05, + "loss": 0.4439, + "step": 1293 + }, + { + "epoch": 0.7873440827502282, + "grad_norm": 1.048807144165039, + "learning_rate": 4.6777105589418695e-05, + "loss": 0.3748, + "step": 1294 + }, + { + "epoch": 0.7879525403103134, + "grad_norm": 1.044026494026184, + "learning_rate": 4.6770988838768634e-05, + "loss": 0.3903, + "step": 1295 + }, + { + "epoch": 0.7885609978703986, + "grad_norm": 0.9846064448356628, + "learning_rate": 4.676486668982081e-05, + "loss": 0.4294, + "step": 1296 + }, + { + "epoch": 0.7891694554304837, + "grad_norm": 1.2005590200424194, + "learning_rate": 4.675873914409324e-05, + "loss": 0.4848, + "step": 1297 + }, + { + "epoch": 0.7897779129905689, + "grad_norm": 1.192406177520752, + "learning_rate": 4.6752606203105314e-05, + "loss": 0.3836, + "step": 1298 + }, + { + "epoch": 0.7903863705506541, + "grad_norm": 0.9216708540916443, + "learning_rate": 4.6746467868377744e-05, + "loss": 0.419, + "step": 1299 + }, + { + "epoch": 0.7909948281107393, + "grad_norm": 1.1610586643218994, + "learning_rate": 4.674032414143258e-05, + "loss": 0.4027, + "step": 1300 + }, + { + "epoch": 0.7916032856708245, + "grad_norm": 1.142449975013733, + "learning_rate": 4.673417502379321e-05, + "loss": 0.4675, + "step": 1301 + }, + { + "epoch": 0.7922117432309096, + "grad_norm": 1.1200268268585205, + "learning_rate": 4.672802051698436e-05, + "loss": 0.4913, + "step": 1302 + }, + { + "epoch": 0.7928202007909948, + "grad_norm": 1.5124874114990234, + "learning_rate": 4.672186062253209e-05, + "loss": 0.5245, + "step": 1303 + }, + { + "epoch": 0.79342865835108, + "grad_norm": 1.1234627962112427, + "learning_rate": 4.671569534196379e-05, + "loss": 0.4452, + "step": 1304 + }, + { + "epoch": 0.7940371159111652, + "grad_norm": 1.069880723953247, + "learning_rate": 4.6709524676808215e-05, + "loss": 0.4368, + "step": 1305 + }, + { + "epoch": 0.7946455734712504, + "grad_norm": 1.0936565399169922, + "learning_rate": 4.670334862859541e-05, + "loss": 0.4527, + "step": 1306 + }, + { + "epoch": 0.7952540310313355, + "grad_norm": 1.0277516841888428, + "learning_rate": 4.669716719885679e-05, + "loss": 0.4186, + "step": 1307 + }, + { + "epoch": 0.7958624885914207, + "grad_norm": 0.9716640114784241, + "learning_rate": 4.6690980389125075e-05, + "loss": 0.3765, + "step": 1308 + }, + { + "epoch": 0.7964709461515059, + "grad_norm": 1.0334478616714478, + "learning_rate": 4.668478820093436e-05, + "loss": 0.416, + "step": 1309 + }, + { + "epoch": 0.7970794037115911, + "grad_norm": 1.0650469064712524, + "learning_rate": 4.667859063582003e-05, + "loss": 0.428, + "step": 1310 + }, + { + "epoch": 0.7976878612716763, + "grad_norm": 1.0338752269744873, + "learning_rate": 4.667238769531883e-05, + "loss": 0.4597, + "step": 1311 + }, + { + "epoch": 0.7982963188317614, + "grad_norm": 1.0277113914489746, + "learning_rate": 4.666617938096884e-05, + "loss": 0.4062, + "step": 1312 + }, + { + "epoch": 0.7989047763918466, + "grad_norm": 1.0234774351119995, + "learning_rate": 4.6659965694309446e-05, + "loss": 0.3839, + "step": 1313 + }, + { + "epoch": 0.7995132339519319, + "grad_norm": 1.0939502716064453, + "learning_rate": 4.66537466368814e-05, + "loss": 0.4286, + "step": 1314 + }, + { + "epoch": 0.8001216915120171, + "grad_norm": 1.0294753313064575, + "learning_rate": 4.664752221022676e-05, + "loss": 0.3898, + "step": 1315 + }, + { + "epoch": 0.8007301490721023, + "grad_norm": 1.0266523361206055, + "learning_rate": 4.6641292415888916e-05, + "loss": 0.3917, + "step": 1316 + }, + { + "epoch": 0.8013386066321874, + "grad_norm": 1.0287213325500488, + "learning_rate": 4.6635057255412606e-05, + "loss": 0.4362, + "step": 1317 + }, + { + "epoch": 0.8019470641922726, + "grad_norm": 1.0150967836380005, + "learning_rate": 4.662881673034389e-05, + "loss": 0.3872, + "step": 1318 + }, + { + "epoch": 0.8025555217523578, + "grad_norm": 1.106278896331787, + "learning_rate": 4.662257084223017e-05, + "loss": 0.4082, + "step": 1319 + }, + { + "epoch": 0.803163979312443, + "grad_norm": 0.9770355224609375, + "learning_rate": 4.661631959262015e-05, + "loss": 0.4392, + "step": 1320 + }, + { + "epoch": 0.8037724368725282, + "grad_norm": 1.2682089805603027, + "learning_rate": 4.661006298306388e-05, + "loss": 0.4548, + "step": 1321 + }, + { + "epoch": 0.8043808944326133, + "grad_norm": 1.000850796699524, + "learning_rate": 4.660380101511275e-05, + "loss": 0.374, + "step": 1322 + }, + { + "epoch": 0.8049893519926985, + "grad_norm": 0.9677361249923706, + "learning_rate": 4.659753369031945e-05, + "loss": 0.3699, + "step": 1323 + }, + { + "epoch": 0.8055978095527837, + "grad_norm": 1.096622109413147, + "learning_rate": 4.659126101023802e-05, + "loss": 0.4508, + "step": 1324 + }, + { + "epoch": 0.8062062671128689, + "grad_norm": 0.9467645883560181, + "learning_rate": 4.658498297642384e-05, + "loss": 0.3777, + "step": 1325 + }, + { + "epoch": 0.806814724672954, + "grad_norm": 0.9743712544441223, + "learning_rate": 4.6578699590433585e-05, + "loss": 0.3962, + "step": 1326 + }, + { + "epoch": 0.8074231822330392, + "grad_norm": 1.0897773504257202, + "learning_rate": 4.657241085382527e-05, + "loss": 0.4304, + "step": 1327 + }, + { + "epoch": 0.8080316397931244, + "grad_norm": 1.0162287950515747, + "learning_rate": 4.6566116768158254e-05, + "loss": 0.4047, + "step": 1328 + }, + { + "epoch": 0.8086400973532096, + "grad_norm": 0.9446073770523071, + "learning_rate": 4.65598173349932e-05, + "loss": 0.3908, + "step": 1329 + }, + { + "epoch": 0.8092485549132948, + "grad_norm": 1.0082128047943115, + "learning_rate": 4.655351255589209e-05, + "loss": 0.3977, + "step": 1330 + }, + { + "epoch": 0.8098570124733799, + "grad_norm": 1.08930242061615, + "learning_rate": 4.6547202432418274e-05, + "loss": 0.4132, + "step": 1331 + }, + { + "epoch": 0.8104654700334651, + "grad_norm": 1.1194206476211548, + "learning_rate": 4.654088696613638e-05, + "loss": 0.3912, + "step": 1332 + }, + { + "epoch": 0.8110739275935503, + "grad_norm": 1.0496968030929565, + "learning_rate": 4.6534566158612395e-05, + "loss": 0.3896, + "step": 1333 + }, + { + "epoch": 0.8116823851536356, + "grad_norm": 1.0310301780700684, + "learning_rate": 4.65282400114136e-05, + "loss": 0.4467, + "step": 1334 + }, + { + "epoch": 0.8122908427137208, + "grad_norm": 1.234102487564087, + "learning_rate": 4.6521908526108624e-05, + "loss": 0.403, + "step": 1335 + }, + { + "epoch": 0.8128993002738059, + "grad_norm": 1.0917807817459106, + "learning_rate": 4.6515571704267414e-05, + "loss": 0.4464, + "step": 1336 + }, + { + "epoch": 0.8135077578338911, + "grad_norm": 1.1464637517929077, + "learning_rate": 4.650922954746123e-05, + "loss": 0.4617, + "step": 1337 + }, + { + "epoch": 0.8141162153939763, + "grad_norm": 1.1391267776489258, + "learning_rate": 4.6502882057262675e-05, + "loss": 0.4209, + "step": 1338 + }, + { + "epoch": 0.8147246729540615, + "grad_norm": 1.1518268585205078, + "learning_rate": 4.6496529235245644e-05, + "loss": 0.4591, + "step": 1339 + }, + { + "epoch": 0.8153331305141467, + "grad_norm": 1.207227349281311, + "learning_rate": 4.649017108298539e-05, + "loss": 0.4252, + "step": 1340 + }, + { + "epoch": 0.8159415880742318, + "grad_norm": 1.172620415687561, + "learning_rate": 4.648380760205846e-05, + "loss": 0.4601, + "step": 1341 + }, + { + "epoch": 0.816550045634317, + "grad_norm": 1.1918513774871826, + "learning_rate": 4.647743879404273e-05, + "loss": 0.4373, + "step": 1342 + }, + { + "epoch": 0.8171585031944022, + "grad_norm": 1.1502752304077148, + "learning_rate": 4.647106466051741e-05, + "loss": 0.4047, + "step": 1343 + }, + { + "epoch": 0.8177669607544874, + "grad_norm": 1.0501556396484375, + "learning_rate": 4.6464685203063005e-05, + "loss": 0.422, + "step": 1344 + }, + { + "epoch": 0.8183754183145726, + "grad_norm": 0.8644611835479736, + "learning_rate": 4.645830042326137e-05, + "loss": 0.3404, + "step": 1345 + }, + { + "epoch": 0.8189838758746577, + "grad_norm": 0.9725202322006226, + "learning_rate": 4.645191032269565e-05, + "loss": 0.4371, + "step": 1346 + }, + { + "epoch": 0.8195923334347429, + "grad_norm": 1.0984727144241333, + "learning_rate": 4.644551490295033e-05, + "loss": 0.396, + "step": 1347 + }, + { + "epoch": 0.8202007909948281, + "grad_norm": 1.0615260601043701, + "learning_rate": 4.643911416561121e-05, + "loss": 0.4147, + "step": 1348 + }, + { + "epoch": 0.8208092485549133, + "grad_norm": 1.1247777938842773, + "learning_rate": 4.6432708112265397e-05, + "loss": 0.3993, + "step": 1349 + }, + { + "epoch": 0.8214177061149984, + "grad_norm": 1.0866373777389526, + "learning_rate": 4.642629674450134e-05, + "loss": 0.439, + "step": 1350 + }, + { + "epoch": 0.8220261636750836, + "grad_norm": 1.415543556213379, + "learning_rate": 4.641988006390877e-05, + "loss": 0.4355, + "step": 1351 + }, + { + "epoch": 0.8226346212351688, + "grad_norm": 3.2268388271331787, + "learning_rate": 4.641345807207879e-05, + "loss": 0.4287, + "step": 1352 + }, + { + "epoch": 0.823243078795254, + "grad_norm": 1.167698860168457, + "learning_rate": 4.640703077060374e-05, + "loss": 0.4728, + "step": 1353 + }, + { + "epoch": 0.8238515363553393, + "grad_norm": 1.2120864391326904, + "learning_rate": 4.640059816107737e-05, + "loss": 0.4648, + "step": 1354 + }, + { + "epoch": 0.8244599939154243, + "grad_norm": 1.0012540817260742, + "learning_rate": 4.639416024509466e-05, + "loss": 0.4621, + "step": 1355 + }, + { + "epoch": 0.8250684514755096, + "grad_norm": 2.5287537574768066, + "learning_rate": 4.638771702425197e-05, + "loss": 0.4767, + "step": 1356 + }, + { + "epoch": 0.8256769090355948, + "grad_norm": 1.1467103958129883, + "learning_rate": 4.638126850014694e-05, + "loss": 0.4715, + "step": 1357 + }, + { + "epoch": 0.82628536659568, + "grad_norm": 1.286732792854309, + "learning_rate": 4.637481467437854e-05, + "loss": 0.404, + "step": 1358 + }, + { + "epoch": 0.8268938241557652, + "grad_norm": 1.0348389148712158, + "learning_rate": 4.6368355548547046e-05, + "loss": 0.4483, + "step": 1359 + }, + { + "epoch": 0.8275022817158503, + "grad_norm": 1.1077744960784912, + "learning_rate": 4.636189112425405e-05, + "loss": 0.4157, + "step": 1360 + }, + { + "epoch": 0.8281107392759355, + "grad_norm": 0.9787223935127258, + "learning_rate": 4.635542140310246e-05, + "loss": 0.3761, + "step": 1361 + }, + { + "epoch": 0.8287191968360207, + "grad_norm": 1.1058927774429321, + "learning_rate": 4.6348946386696506e-05, + "loss": 0.3918, + "step": 1362 + }, + { + "epoch": 0.8293276543961059, + "grad_norm": 1.0943537950515747, + "learning_rate": 4.6342466076641715e-05, + "loss": 0.4403, + "step": 1363 + }, + { + "epoch": 0.8299361119561911, + "grad_norm": 1.1616437435150146, + "learning_rate": 4.633598047454494e-05, + "loss": 0.4496, + "step": 1364 + }, + { + "epoch": 0.8305445695162762, + "grad_norm": 1.026498794555664, + "learning_rate": 4.632948958201432e-05, + "loss": 0.4414, + "step": 1365 + }, + { + "epoch": 0.8311530270763614, + "grad_norm": 1.1366041898727417, + "learning_rate": 4.6322993400659355e-05, + "loss": 0.4226, + "step": 1366 + }, + { + "epoch": 0.8317614846364466, + "grad_norm": 0.9318983554840088, + "learning_rate": 4.631649193209081e-05, + "loss": 0.3743, + "step": 1367 + }, + { + "epoch": 0.8323699421965318, + "grad_norm": 1.0723328590393066, + "learning_rate": 4.6309985177920776e-05, + "loss": 0.3779, + "step": 1368 + }, + { + "epoch": 0.832978399756617, + "grad_norm": 0.9842046499252319, + "learning_rate": 4.630347313976266e-05, + "loss": 0.3871, + "step": 1369 + }, + { + "epoch": 0.8335868573167021, + "grad_norm": 1.232901930809021, + "learning_rate": 4.629695581923118e-05, + "loss": 0.3983, + "step": 1370 + }, + { + "epoch": 0.8341953148767873, + "grad_norm": 1.4811533689498901, + "learning_rate": 4.629043321794237e-05, + "loss": 0.4282, + "step": 1371 + }, + { + "epoch": 0.8348037724368725, + "grad_norm": 0.9871484637260437, + "learning_rate": 4.628390533751353e-05, + "loss": 0.3485, + "step": 1372 + }, + { + "epoch": 0.8354122299969577, + "grad_norm": 1.0894109010696411, + "learning_rate": 4.6277372179563336e-05, + "loss": 0.4226, + "step": 1373 + }, + { + "epoch": 0.8360206875570428, + "grad_norm": 0.9860946536064148, + "learning_rate": 4.627083374571173e-05, + "loss": 0.3852, + "step": 1374 + }, + { + "epoch": 0.836629145117128, + "grad_norm": 0.9105382561683655, + "learning_rate": 4.6264290037579955e-05, + "loss": 0.3875, + "step": 1375 + }, + { + "epoch": 0.8372376026772133, + "grad_norm": 0.9577265977859497, + "learning_rate": 4.625774105679059e-05, + "loss": 0.4193, + "step": 1376 + }, + { + "epoch": 0.8378460602372985, + "grad_norm": 1.2966721057891846, + "learning_rate": 4.625118680496752e-05, + "loss": 0.4104, + "step": 1377 + }, + { + "epoch": 0.8384545177973837, + "grad_norm": 1.9938510656356812, + "learning_rate": 4.624462728373591e-05, + "loss": 0.4041, + "step": 1378 + }, + { + "epoch": 0.8390629753574688, + "grad_norm": 0.9969017505645752, + "learning_rate": 4.6238062494722254e-05, + "loss": 0.4448, + "step": 1379 + }, + { + "epoch": 0.839671432917554, + "grad_norm": 1.0461878776550293, + "learning_rate": 4.623149243955435e-05, + "loss": 0.4411, + "step": 1380 + }, + { + "epoch": 0.8402798904776392, + "grad_norm": 1.1554114818572998, + "learning_rate": 4.6224917119861286e-05, + "loss": 0.4828, + "step": 1381 + }, + { + "epoch": 0.8408883480377244, + "grad_norm": 1.0953601598739624, + "learning_rate": 4.6218336537273476e-05, + "loss": 0.4883, + "step": 1382 + }, + { + "epoch": 0.8414968055978096, + "grad_norm": 1.0780771970748901, + "learning_rate": 4.621175069342263e-05, + "loss": 0.4425, + "step": 1383 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.8909410834312439, + "learning_rate": 4.620515958994176e-05, + "loss": 0.4176, + "step": 1384 + }, + { + "epoch": 0.8427137207179799, + "grad_norm": 1.1288505792617798, + "learning_rate": 4.619856322846518e-05, + "loss": 0.4576, + "step": 1385 + }, + { + "epoch": 0.8433221782780651, + "grad_norm": 1.2266353368759155, + "learning_rate": 4.619196161062854e-05, + "loss": 0.3678, + "step": 1386 + }, + { + "epoch": 0.8439306358381503, + "grad_norm": 1.1094207763671875, + "learning_rate": 4.6185354738068726e-05, + "loss": 0.3636, + "step": 1387 + }, + { + "epoch": 0.8445390933982355, + "grad_norm": 1.0035345554351807, + "learning_rate": 4.617874261242399e-05, + "loss": 0.3561, + "step": 1388 + }, + { + "epoch": 0.8451475509583206, + "grad_norm": 1.1671876907348633, + "learning_rate": 4.617212523533386e-05, + "loss": 0.395, + "step": 1389 + }, + { + "epoch": 0.8457560085184058, + "grad_norm": 1.2926161289215088, + "learning_rate": 4.616550260843917e-05, + "loss": 0.4506, + "step": 1390 + }, + { + "epoch": 0.846364466078491, + "grad_norm": 1.220173954963684, + "learning_rate": 4.6158874733382056e-05, + "loss": 0.4585, + "step": 1391 + }, + { + "epoch": 0.8469729236385762, + "grad_norm": 0.9969469308853149, + "learning_rate": 4.6152241611805956e-05, + "loss": 0.3849, + "step": 1392 + }, + { + "epoch": 0.8475813811986614, + "grad_norm": 0.9798814058303833, + "learning_rate": 4.61456032453556e-05, + "loss": 0.4175, + "step": 1393 + }, + { + "epoch": 0.8481898387587465, + "grad_norm": 1.095361351966858, + "learning_rate": 4.613895963567704e-05, + "loss": 0.4047, + "step": 1394 + }, + { + "epoch": 0.8487982963188317, + "grad_norm": 0.9531521201133728, + "learning_rate": 4.6132310784417595e-05, + "loss": 0.348, + "step": 1395 + }, + { + "epoch": 0.849406753878917, + "grad_norm": 1.015824556350708, + "learning_rate": 4.612565669322592e-05, + "loss": 0.4401, + "step": 1396 + }, + { + "epoch": 0.8500152114390022, + "grad_norm": 1.1718368530273438, + "learning_rate": 4.611899736375194e-05, + "loss": 0.3683, + "step": 1397 + }, + { + "epoch": 0.8506236689990874, + "grad_norm": 1.0078071355819702, + "learning_rate": 4.61123327976469e-05, + "loss": 0.3512, + "step": 1398 + }, + { + "epoch": 0.8512321265591725, + "grad_norm": 1.269882321357727, + "learning_rate": 4.610566299656332e-05, + "loss": 0.425, + "step": 1399 + }, + { + "epoch": 0.8518405841192577, + "grad_norm": 1.0912398099899292, + "learning_rate": 4.609898796215506e-05, + "loss": 0.421, + "step": 1400 + }, + { + "epoch": 0.8524490416793429, + "grad_norm": 1.116542100906372, + "learning_rate": 4.609230769607723e-05, + "loss": 0.4179, + "step": 1401 + }, + { + "epoch": 0.8530574992394281, + "grad_norm": 1.4198042154312134, + "learning_rate": 4.6085622199986266e-05, + "loss": 0.4061, + "step": 1402 + }, + { + "epoch": 0.8536659567995132, + "grad_norm": 1.0006221532821655, + "learning_rate": 4.607893147553989e-05, + "loss": 0.3707, + "step": 1403 + }, + { + "epoch": 0.8542744143595984, + "grad_norm": 1.0920796394348145, + "learning_rate": 4.607223552439711e-05, + "loss": 0.3692, + "step": 1404 + }, + { + "epoch": 0.8548828719196836, + "grad_norm": 0.9862841367721558, + "learning_rate": 4.606553434821826e-05, + "loss": 0.3852, + "step": 1405 + }, + { + "epoch": 0.8554913294797688, + "grad_norm": 1.6820799112319946, + "learning_rate": 4.605882794866495e-05, + "loss": 0.4196, + "step": 1406 + }, + { + "epoch": 0.856099787039854, + "grad_norm": 1.1753647327423096, + "learning_rate": 4.605211632740008e-05, + "loss": 0.4056, + "step": 1407 + }, + { + "epoch": 0.8567082445999391, + "grad_norm": 1.041169285774231, + "learning_rate": 4.6045399486087856e-05, + "loss": 0.4012, + "step": 1408 + }, + { + "epoch": 0.8573167021600243, + "grad_norm": 1.1424866914749146, + "learning_rate": 4.603867742639377e-05, + "loss": 0.4843, + "step": 1409 + }, + { + "epoch": 0.8579251597201095, + "grad_norm": 1.1785975694656372, + "learning_rate": 4.6031950149984624e-05, + "loss": 0.3528, + "step": 1410 + }, + { + "epoch": 0.8585336172801947, + "grad_norm": 1.092089056968689, + "learning_rate": 4.6025217658528497e-05, + "loss": 0.4248, + "step": 1411 + }, + { + "epoch": 0.8591420748402799, + "grad_norm": 1.6886132955551147, + "learning_rate": 4.601847995369477e-05, + "loss": 0.3746, + "step": 1412 + }, + { + "epoch": 0.859750532400365, + "grad_norm": 0.9200694561004639, + "learning_rate": 4.60117370371541e-05, + "loss": 0.3175, + "step": 1413 + }, + { + "epoch": 0.8603589899604502, + "grad_norm": 0.9181839823722839, + "learning_rate": 4.600498891057845e-05, + "loss": 0.3799, + "step": 1414 + }, + { + "epoch": 0.8609674475205354, + "grad_norm": 0.9873498678207397, + "learning_rate": 4.599823557564109e-05, + "loss": 0.3687, + "step": 1415 + }, + { + "epoch": 0.8615759050806207, + "grad_norm": 1.0958657264709473, + "learning_rate": 4.5991477034016564e-05, + "loss": 0.4309, + "step": 1416 + }, + { + "epoch": 0.8621843626407059, + "grad_norm": 0.935565710067749, + "learning_rate": 4.598471328738069e-05, + "loss": 0.3494, + "step": 1417 + }, + { + "epoch": 0.862792820200791, + "grad_norm": 1.0181663036346436, + "learning_rate": 4.597794433741061e-05, + "loss": 0.3567, + "step": 1418 + }, + { + "epoch": 0.8634012777608762, + "grad_norm": 1.1385202407836914, + "learning_rate": 4.597117018578473e-05, + "loss": 0.4259, + "step": 1419 + }, + { + "epoch": 0.8640097353209614, + "grad_norm": 1.041556715965271, + "learning_rate": 4.596439083418278e-05, + "loss": 0.3825, + "step": 1420 + }, + { + "epoch": 0.8646181928810466, + "grad_norm": 0.9178049564361572, + "learning_rate": 4.5957606284285736e-05, + "loss": 0.3587, + "step": 1421 + }, + { + "epoch": 0.8652266504411318, + "grad_norm": 1.0861730575561523, + "learning_rate": 4.595081653777589e-05, + "loss": 0.4345, + "step": 1422 + }, + { + "epoch": 0.8658351080012169, + "grad_norm": 1.1010128259658813, + "learning_rate": 4.594402159633681e-05, + "loss": 0.4007, + "step": 1423 + }, + { + "epoch": 0.8664435655613021, + "grad_norm": 1.000855803489685, + "learning_rate": 4.593722146165337e-05, + "loss": 0.4319, + "step": 1424 + }, + { + "epoch": 0.8670520231213873, + "grad_norm": 1.2340881824493408, + "learning_rate": 4.5930416135411715e-05, + "loss": 0.4397, + "step": 1425 + }, + { + "epoch": 0.8676604806814725, + "grad_norm": 1.198055386543274, + "learning_rate": 4.592360561929928e-05, + "loss": 0.4217, + "step": 1426 + }, + { + "epoch": 0.8682689382415576, + "grad_norm": 1.011428952217102, + "learning_rate": 4.591678991500479e-05, + "loss": 0.3898, + "step": 1427 + }, + { + "epoch": 0.8688773958016428, + "grad_norm": 1.2541236877441406, + "learning_rate": 4.590996902421825e-05, + "loss": 0.485, + "step": 1428 + }, + { + "epoch": 0.869485853361728, + "grad_norm": 1.0267783403396606, + "learning_rate": 4.590314294863097e-05, + "loss": 0.4073, + "step": 1429 + }, + { + "epoch": 0.8700943109218132, + "grad_norm": 1.1068730354309082, + "learning_rate": 4.589631168993552e-05, + "loss": 0.3804, + "step": 1430 + }, + { + "epoch": 0.8707027684818984, + "grad_norm": 0.9509475827217102, + "learning_rate": 4.5889475249825774e-05, + "loss": 0.3519, + "step": 1431 + }, + { + "epoch": 0.8713112260419835, + "grad_norm": 1.0428766012191772, + "learning_rate": 4.5882633629996886e-05, + "loss": 0.3894, + "step": 1432 + }, + { + "epoch": 0.8719196836020687, + "grad_norm": 1.0796087980270386, + "learning_rate": 4.5875786832145287e-05, + "loss": 0.3919, + "step": 1433 + }, + { + "epoch": 0.8725281411621539, + "grad_norm": 1.0344951152801514, + "learning_rate": 4.5868934857968695e-05, + "loss": 0.4074, + "step": 1434 + }, + { + "epoch": 0.8731365987222391, + "grad_norm": 1.127869963645935, + "learning_rate": 4.586207770916612e-05, + "loss": 0.3942, + "step": 1435 + }, + { + "epoch": 0.8737450562823244, + "grad_norm": 1.5959341526031494, + "learning_rate": 4.585521538743785e-05, + "loss": 0.3944, + "step": 1436 + }, + { + "epoch": 0.8743535138424094, + "grad_norm": 0.9701662659645081, + "learning_rate": 4.584834789448544e-05, + "loss": 0.3523, + "step": 1437 + }, + { + "epoch": 0.8749619714024947, + "grad_norm": 1.1680253744125366, + "learning_rate": 4.5841475232011773e-05, + "loss": 0.4448, + "step": 1438 + }, + { + "epoch": 0.8755704289625799, + "grad_norm": 1.0042173862457275, + "learning_rate": 4.5834597401720956e-05, + "loss": 0.3796, + "step": 1439 + }, + { + "epoch": 0.8761788865226651, + "grad_norm": 1.100633978843689, + "learning_rate": 4.582771440531841e-05, + "loss": 0.3943, + "step": 1440 + }, + { + "epoch": 0.8767873440827503, + "grad_norm": 1.1571612358093262, + "learning_rate": 4.582082624451084e-05, + "loss": 0.3959, + "step": 1441 + }, + { + "epoch": 0.8773958016428354, + "grad_norm": 1.4444702863693237, + "learning_rate": 4.581393292100621e-05, + "loss": 0.4962, + "step": 1442 + }, + { + "epoch": 0.8780042592029206, + "grad_norm": 0.991072952747345, + "learning_rate": 4.5807034436513784e-05, + "loss": 0.384, + "step": 1443 + }, + { + "epoch": 0.8786127167630058, + "grad_norm": 1.3088997602462769, + "learning_rate": 4.5800130792744096e-05, + "loss": 0.3756, + "step": 1444 + }, + { + "epoch": 0.879221174323091, + "grad_norm": 1.1207791566848755, + "learning_rate": 4.5793221991408966e-05, + "loss": 0.4245, + "step": 1445 + }, + { + "epoch": 0.8798296318831762, + "grad_norm": 0.9720820188522339, + "learning_rate": 4.578630803422148e-05, + "loss": 0.4084, + "step": 1446 + }, + { + "epoch": 0.8804380894432613, + "grad_norm": 0.9813503623008728, + "learning_rate": 4.577938892289603e-05, + "loss": 0.3907, + "step": 1447 + }, + { + "epoch": 0.8810465470033465, + "grad_norm": 0.9682145714759827, + "learning_rate": 4.577246465914825e-05, + "loss": 0.4057, + "step": 1448 + }, + { + "epoch": 0.8816550045634317, + "grad_norm": 1.0260658264160156, + "learning_rate": 4.576553524469507e-05, + "loss": 0.351, + "step": 1449 + }, + { + "epoch": 0.8822634621235169, + "grad_norm": 1.1361267566680908, + "learning_rate": 4.575860068125471e-05, + "loss": 0.4011, + "step": 1450 + }, + { + "epoch": 0.882871919683602, + "grad_norm": 1.084456443786621, + "learning_rate": 4.575166097054662e-05, + "loss": 0.3984, + "step": 1451 + }, + { + "epoch": 0.8834803772436872, + "grad_norm": 1.2074880599975586, + "learning_rate": 4.57447161142916e-05, + "loss": 0.3984, + "step": 1452 + }, + { + "epoch": 0.8840888348037724, + "grad_norm": 1.044838786125183, + "learning_rate": 4.5737766114211654e-05, + "loss": 0.3779, + "step": 1453 + }, + { + "epoch": 0.8846972923638576, + "grad_norm": 1.0638879537582397, + "learning_rate": 4.5730810972030114e-05, + "loss": 0.3623, + "step": 1454 + }, + { + "epoch": 0.8853057499239428, + "grad_norm": 1.0590424537658691, + "learning_rate": 4.572385068947155e-05, + "loss": 0.3891, + "step": 1455 + }, + { + "epoch": 0.8859142074840279, + "grad_norm": 1.0808945894241333, + "learning_rate": 4.5716885268261834e-05, + "loss": 0.374, + "step": 1456 + }, + { + "epoch": 0.8865226650441131, + "grad_norm": 0.9356719851493835, + "learning_rate": 4.570991471012809e-05, + "loss": 0.3635, + "step": 1457 + }, + { + "epoch": 0.8871311226041984, + "grad_norm": 1.1011765003204346, + "learning_rate": 4.570293901679873e-05, + "loss": 0.3804, + "step": 1458 + }, + { + "epoch": 0.8877395801642836, + "grad_norm": 1.0141364336013794, + "learning_rate": 4.569595819000344e-05, + "loss": 0.4072, + "step": 1459 + }, + { + "epoch": 0.8883480377243688, + "grad_norm": 1.7948359251022339, + "learning_rate": 4.568897223147316e-05, + "loss": 0.3803, + "step": 1460 + }, + { + "epoch": 0.8889564952844539, + "grad_norm": 1.2225985527038574, + "learning_rate": 4.5681981142940126e-05, + "loss": 0.3975, + "step": 1461 + }, + { + "epoch": 0.8895649528445391, + "grad_norm": 0.9955945611000061, + "learning_rate": 4.5674984926137844e-05, + "loss": 0.3601, + "step": 1462 + }, + { + "epoch": 0.8901734104046243, + "grad_norm": 1.1640013456344604, + "learning_rate": 4.5667983582801064e-05, + "loss": 0.3927, + "step": 1463 + }, + { + "epoch": 0.8907818679647095, + "grad_norm": 1.3846951723098755, + "learning_rate": 4.566097711466585e-05, + "loss": 0.3688, + "step": 1464 + }, + { + "epoch": 0.8913903255247947, + "grad_norm": 1.1909358501434326, + "learning_rate": 4.56539655234695e-05, + "loss": 0.4304, + "step": 1465 + }, + { + "epoch": 0.8919987830848798, + "grad_norm": 1.1379756927490234, + "learning_rate": 4.56469488109506e-05, + "loss": 0.4431, + "step": 1466 + }, + { + "epoch": 0.892607240644965, + "grad_norm": 0.9373139142990112, + "learning_rate": 4.5639926978849e-05, + "loss": 0.4213, + "step": 1467 + }, + { + "epoch": 0.8932156982050502, + "grad_norm": 1.0285266637802124, + "learning_rate": 4.563290002890583e-05, + "loss": 0.4201, + "step": 1468 + }, + { + "epoch": 0.8938241557651354, + "grad_norm": 1.1803969144821167, + "learning_rate": 4.5625867962863466e-05, + "loss": 0.4099, + "step": 1469 + }, + { + "epoch": 0.8944326133252206, + "grad_norm": 1.115708589553833, + "learning_rate": 4.5618830782465584e-05, + "loss": 0.4085, + "step": 1470 + }, + { + "epoch": 0.8950410708853057, + "grad_norm": 1.1141424179077148, + "learning_rate": 4.56117884894571e-05, + "loss": 0.4274, + "step": 1471 + }, + { + "epoch": 0.8956495284453909, + "grad_norm": 1.2159873247146606, + "learning_rate": 4.5604741085584215e-05, + "loss": 0.4164, + "step": 1472 + }, + { + "epoch": 0.8962579860054761, + "grad_norm": 1.0636794567108154, + "learning_rate": 4.559768857259438e-05, + "loss": 0.4036, + "step": 1473 + }, + { + "epoch": 0.8968664435655613, + "grad_norm": 1.2936556339263916, + "learning_rate": 4.5590630952236336e-05, + "loss": 0.4509, + "step": 1474 + }, + { + "epoch": 0.8974749011256465, + "grad_norm": 1.1456127166748047, + "learning_rate": 4.558356822626008e-05, + "loss": 0.4016, + "step": 1475 + }, + { + "epoch": 0.8980833586857316, + "grad_norm": 1.0449507236480713, + "learning_rate": 4.557650039641687e-05, + "loss": 0.4636, + "step": 1476 + }, + { + "epoch": 0.8986918162458168, + "grad_norm": 1.048593282699585, + "learning_rate": 4.5569427464459226e-05, + "loss": 0.4334, + "step": 1477 + }, + { + "epoch": 0.899300273805902, + "grad_norm": 0.8729017972946167, + "learning_rate": 4.556234943214095e-05, + "loss": 0.3823, + "step": 1478 + }, + { + "epoch": 0.8999087313659873, + "grad_norm": 1.1806243658065796, + "learning_rate": 4.55552663012171e-05, + "loss": 0.4189, + "step": 1479 + }, + { + "epoch": 0.9005171889260724, + "grad_norm": 1.0129870176315308, + "learning_rate": 4.554817807344399e-05, + "loss": 0.3643, + "step": 1480 + }, + { + "epoch": 0.9011256464861576, + "grad_norm": 1.1537796258926392, + "learning_rate": 4.5541084750579205e-05, + "loss": 0.4925, + "step": 1481 + }, + { + "epoch": 0.9017341040462428, + "grad_norm": 0.9418867230415344, + "learning_rate": 4.55339863343816e-05, + "loss": 0.417, + "step": 1482 + }, + { + "epoch": 0.902342561606328, + "grad_norm": 1.1266993284225464, + "learning_rate": 4.5526882826611285e-05, + "loss": 0.4141, + "step": 1483 + }, + { + "epoch": 0.9029510191664132, + "grad_norm": 1.0497965812683105, + "learning_rate": 4.5519774229029625e-05, + "loss": 0.3798, + "step": 1484 + }, + { + "epoch": 0.9035594767264983, + "grad_norm": 1.159199595451355, + "learning_rate": 4.551266054339927e-05, + "loss": 0.3659, + "step": 1485 + }, + { + "epoch": 0.9041679342865835, + "grad_norm": 0.9190648794174194, + "learning_rate": 4.55055417714841e-05, + "loss": 0.3857, + "step": 1486 + }, + { + "epoch": 0.9047763918466687, + "grad_norm": 1.1719613075256348, + "learning_rate": 4.549841791504929e-05, + "loss": 0.3847, + "step": 1487 + }, + { + "epoch": 0.9053848494067539, + "grad_norm": 0.9929993748664856, + "learning_rate": 4.5491288975861254e-05, + "loss": 0.4189, + "step": 1488 + }, + { + "epoch": 0.9059933069668391, + "grad_norm": 1.0294368267059326, + "learning_rate": 4.548415495568767e-05, + "loss": 0.4341, + "step": 1489 + }, + { + "epoch": 0.9066017645269242, + "grad_norm": 1.082219123840332, + "learning_rate": 4.5477015856297475e-05, + "loss": 0.418, + "step": 1490 + }, + { + "epoch": 0.9072102220870094, + "grad_norm": 1.0178451538085938, + "learning_rate": 4.546987167946088e-05, + "loss": 0.4209, + "step": 1491 + }, + { + "epoch": 0.9078186796470946, + "grad_norm": 0.9833378791809082, + "learning_rate": 4.546272242694933e-05, + "loss": 0.4089, + "step": 1492 + }, + { + "epoch": 0.9084271372071798, + "grad_norm": 1.0812345743179321, + "learning_rate": 4.5455568100535545e-05, + "loss": 0.39, + "step": 1493 + }, + { + "epoch": 0.909035594767265, + "grad_norm": 1.0747591257095337, + "learning_rate": 4.544840870199351e-05, + "loss": 0.4182, + "step": 1494 + }, + { + "epoch": 0.9096440523273501, + "grad_norm": 0.9758956432342529, + "learning_rate": 4.5441244233098434e-05, + "loss": 0.32, + "step": 1495 + }, + { + "epoch": 0.9102525098874353, + "grad_norm": 1.013469934463501, + "learning_rate": 4.5434074695626826e-05, + "loss": 0.4348, + "step": 1496 + }, + { + "epoch": 0.9108609674475205, + "grad_norm": 0.9901841878890991, + "learning_rate": 4.542690009135643e-05, + "loss": 0.3635, + "step": 1497 + }, + { + "epoch": 0.9114694250076057, + "grad_norm": 1.0742686986923218, + "learning_rate": 4.541972042206625e-05, + "loss": 0.4546, + "step": 1498 + }, + { + "epoch": 0.912077882567691, + "grad_norm": 1.0467991828918457, + "learning_rate": 4.541253568953654e-05, + "loss": 0.4002, + "step": 1499 + }, + { + "epoch": 0.912686340127776, + "grad_norm": 1.4334415197372437, + "learning_rate": 4.540534589554881e-05, + "loss": 0.3806, + "step": 1500 + }, + { + "epoch": 0.9132947976878613, + "grad_norm": 1.107809066772461, + "learning_rate": 4.539815104188584e-05, + "loss": 0.4402, + "step": 1501 + }, + { + "epoch": 0.9139032552479465, + "grad_norm": 1.0222909450531006, + "learning_rate": 4.539095113033165e-05, + "loss": 0.4404, + "step": 1502 + }, + { + "epoch": 0.9145117128080317, + "grad_norm": 1.0362690687179565, + "learning_rate": 4.538374616267151e-05, + "loss": 0.385, + "step": 1503 + }, + { + "epoch": 0.9151201703681168, + "grad_norm": 1.0879029035568237, + "learning_rate": 4.537653614069196e-05, + "loss": 0.4255, + "step": 1504 + }, + { + "epoch": 0.915728627928202, + "grad_norm": 1.040208339691162, + "learning_rate": 4.536932106618078e-05, + "loss": 0.3637, + "step": 1505 + }, + { + "epoch": 0.9163370854882872, + "grad_norm": 1.043503999710083, + "learning_rate": 4.536210094092702e-05, + "loss": 0.3691, + "step": 1506 + }, + { + "epoch": 0.9169455430483724, + "grad_norm": 1.0326288938522339, + "learning_rate": 4.535487576672095e-05, + "loss": 0.3499, + "step": 1507 + }, + { + "epoch": 0.9175540006084576, + "grad_norm": 1.1494892835617065, + "learning_rate": 4.5347645545354136e-05, + "loss": 0.3684, + "step": 1508 + }, + { + "epoch": 0.9181624581685427, + "grad_norm": 1.2776802778244019, + "learning_rate": 4.534041027861935e-05, + "loss": 0.4245, + "step": 1509 + }, + { + "epoch": 0.9187709157286279, + "grad_norm": 1.056604266166687, + "learning_rate": 4.533316996831064e-05, + "loss": 0.3585, + "step": 1510 + }, + { + "epoch": 0.9193793732887131, + "grad_norm": 1.12501859664917, + "learning_rate": 4.532592461622331e-05, + "loss": 0.3689, + "step": 1511 + }, + { + "epoch": 0.9199878308487983, + "grad_norm": 1.0672175884246826, + "learning_rate": 4.531867422415391e-05, + "loss": 0.393, + "step": 1512 + }, + { + "epoch": 0.9205962884088835, + "grad_norm": 1.0935568809509277, + "learning_rate": 4.531141879390022e-05, + "loss": 0.3826, + "step": 1513 + }, + { + "epoch": 0.9212047459689686, + "grad_norm": 1.1044657230377197, + "learning_rate": 4.5304158327261294e-05, + "loss": 0.4487, + "step": 1514 + }, + { + "epoch": 0.9218132035290538, + "grad_norm": 1.1453882455825806, + "learning_rate": 4.5296892826037414e-05, + "loss": 0.3947, + "step": 1515 + }, + { + "epoch": 0.922421661089139, + "grad_norm": 0.9416497349739075, + "learning_rate": 4.5289622292030134e-05, + "loss": 0.3573, + "step": 1516 + }, + { + "epoch": 0.9230301186492242, + "grad_norm": 1.0041788816452026, + "learning_rate": 4.528234672704224e-05, + "loss": 0.3347, + "step": 1517 + }, + { + "epoch": 0.9236385762093094, + "grad_norm": 0.9780565500259399, + "learning_rate": 4.527506613287776e-05, + "loss": 0.3597, + "step": 1518 + }, + { + "epoch": 0.9242470337693945, + "grad_norm": 1.2946865558624268, + "learning_rate": 4.526778051134199e-05, + "loss": 0.4323, + "step": 1519 + }, + { + "epoch": 0.9248554913294798, + "grad_norm": 1.030163049697876, + "learning_rate": 4.526048986424146e-05, + "loss": 0.3541, + "step": 1520 + }, + { + "epoch": 0.925463948889565, + "grad_norm": 1.0969247817993164, + "learning_rate": 4.525319419338394e-05, + "loss": 0.415, + "step": 1521 + }, + { + "epoch": 0.9260724064496502, + "grad_norm": 1.0787875652313232, + "learning_rate": 4.5245893500578455e-05, + "loss": 0.4565, + "step": 1522 + }, + { + "epoch": 0.9266808640097354, + "grad_norm": 1.2321113348007202, + "learning_rate": 4.523858778763528e-05, + "loss": 0.4511, + "step": 1523 + }, + { + "epoch": 0.9272893215698205, + "grad_norm": 1.0210340023040771, + "learning_rate": 4.523127705636591e-05, + "loss": 0.4325, + "step": 1524 + }, + { + "epoch": 0.9278977791299057, + "grad_norm": 0.9291285872459412, + "learning_rate": 4.522396130858311e-05, + "loss": 0.3934, + "step": 1525 + }, + { + "epoch": 0.9285062366899909, + "grad_norm": 1.1080670356750488, + "learning_rate": 4.5216640546100884e-05, + "loss": 0.381, + "step": 1526 + }, + { + "epoch": 0.9291146942500761, + "grad_norm": 0.9290838837623596, + "learning_rate": 4.5209314770734475e-05, + "loss": 0.3477, + "step": 1527 + }, + { + "epoch": 0.9297231518101612, + "grad_norm": 1.5479507446289062, + "learning_rate": 4.520198398430037e-05, + "loss": 0.3941, + "step": 1528 + }, + { + "epoch": 0.9303316093702464, + "grad_norm": 1.0913853645324707, + "learning_rate": 4.5194648188616294e-05, + "loss": 0.3909, + "step": 1529 + }, + { + "epoch": 0.9309400669303316, + "grad_norm": 1.1218714714050293, + "learning_rate": 4.518730738550122e-05, + "loss": 0.4052, + "step": 1530 + }, + { + "epoch": 0.9315485244904168, + "grad_norm": 1.0080488920211792, + "learning_rate": 4.517996157677537e-05, + "loss": 0.3412, + "step": 1531 + }, + { + "epoch": 0.932156982050502, + "grad_norm": 1.1207358837127686, + "learning_rate": 4.517261076426018e-05, + "loss": 0.3878, + "step": 1532 + }, + { + "epoch": 0.9327654396105871, + "grad_norm": 1.1212834119796753, + "learning_rate": 4.516525494977837e-05, + "loss": 0.407, + "step": 1533 + }, + { + "epoch": 0.9333738971706723, + "grad_norm": 1.1488605737686157, + "learning_rate": 4.5157894135153845e-05, + "loss": 0.4392, + "step": 1534 + }, + { + "epoch": 0.9339823547307575, + "grad_norm": 1.036495327949524, + "learning_rate": 4.515052832221181e-05, + "loss": 0.3508, + "step": 1535 + }, + { + "epoch": 0.9345908122908427, + "grad_norm": 1.168699860572815, + "learning_rate": 4.514315751277867e-05, + "loss": 0.4334, + "step": 1536 + }, + { + "epoch": 0.9351992698509279, + "grad_norm": 1.0106819868087769, + "learning_rate": 4.513578170868206e-05, + "loss": 0.3479, + "step": 1537 + }, + { + "epoch": 0.935807727411013, + "grad_norm": 0.9819125533103943, + "learning_rate": 4.512840091175089e-05, + "loss": 0.3646, + "step": 1538 + }, + { + "epoch": 0.9364161849710982, + "grad_norm": 0.8794593811035156, + "learning_rate": 4.51210151238153e-05, + "loss": 0.3568, + "step": 1539 + }, + { + "epoch": 0.9370246425311834, + "grad_norm": 0.9659105539321899, + "learning_rate": 4.511362434670663e-05, + "loss": 0.3865, + "step": 1540 + }, + { + "epoch": 0.9376331000912687, + "grad_norm": 1.012877345085144, + "learning_rate": 4.510622858225752e-05, + "loss": 0.4071, + "step": 1541 + }, + { + "epoch": 0.9382415576513539, + "grad_norm": 0.9462942481040955, + "learning_rate": 4.509882783230177e-05, + "loss": 0.345, + "step": 1542 + }, + { + "epoch": 0.938850015211439, + "grad_norm": 1.1898541450500488, + "learning_rate": 4.509142209867448e-05, + "loss": 0.3663, + "step": 1543 + }, + { + "epoch": 0.9394584727715242, + "grad_norm": 1.2533682584762573, + "learning_rate": 4.508401138321196e-05, + "loss": 0.3441, + "step": 1544 + }, + { + "epoch": 0.9400669303316094, + "grad_norm": 1.0154099464416504, + "learning_rate": 4.507659568775177e-05, + "loss": 0.3578, + "step": 1545 + }, + { + "epoch": 0.9406753878916946, + "grad_norm": 1.03611159324646, + "learning_rate": 4.506917501413268e-05, + "loss": 0.3639, + "step": 1546 + }, + { + "epoch": 0.9412838454517798, + "grad_norm": 1.214494228363037, + "learning_rate": 4.506174936419471e-05, + "loss": 0.3985, + "step": 1547 + }, + { + "epoch": 0.9418923030118649, + "grad_norm": 1.0812007188796997, + "learning_rate": 4.505431873977911e-05, + "loss": 0.3469, + "step": 1548 + }, + { + "epoch": 0.9425007605719501, + "grad_norm": 1.1645228862762451, + "learning_rate": 4.504688314272837e-05, + "loss": 0.4381, + "step": 1549 + }, + { + "epoch": 0.9431092181320353, + "grad_norm": 1.1173677444458008, + "learning_rate": 4.5039442574886204e-05, + "loss": 0.3876, + "step": 1550 + }, + { + "epoch": 0.9437176756921205, + "grad_norm": 1.0114907026290894, + "learning_rate": 4.503199703809757e-05, + "loss": 0.4089, + "step": 1551 + }, + { + "epoch": 0.9443261332522057, + "grad_norm": 0.9891692996025085, + "learning_rate": 4.5024546534208645e-05, + "loss": 0.3515, + "step": 1552 + }, + { + "epoch": 0.9449345908122908, + "grad_norm": 1.005385398864746, + "learning_rate": 4.5017091065066837e-05, + "loss": 0.3328, + "step": 1553 + }, + { + "epoch": 0.945543048372376, + "grad_norm": 1.1059056520462036, + "learning_rate": 4.50096306325208e-05, + "loss": 0.4516, + "step": 1554 + }, + { + "epoch": 0.9461515059324612, + "grad_norm": 0.9695335626602173, + "learning_rate": 4.500216523842041e-05, + "loss": 0.3779, + "step": 1555 + }, + { + "epoch": 0.9467599634925464, + "grad_norm": 1.2391901016235352, + "learning_rate": 4.499469488461677e-05, + "loss": 0.3915, + "step": 1556 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.9961259961128235, + "learning_rate": 4.4987219572962224e-05, + "loss": 0.3433, + "step": 1557 + }, + { + "epoch": 0.9479768786127167, + "grad_norm": 1.959693193435669, + "learning_rate": 4.497973930531033e-05, + "loss": 0.3842, + "step": 1558 + }, + { + "epoch": 0.9485853361728019, + "grad_norm": 1.5181488990783691, + "learning_rate": 4.497225408351589e-05, + "loss": 0.4688, + "step": 1559 + }, + { + "epoch": 0.9491937937328871, + "grad_norm": 1.1111069917678833, + "learning_rate": 4.4964763909434914e-05, + "loss": 0.3564, + "step": 1560 + }, + { + "epoch": 0.9498022512929724, + "grad_norm": 1.0508601665496826, + "learning_rate": 4.495726878492465e-05, + "loss": 0.3788, + "step": 1561 + }, + { + "epoch": 0.9504107088530575, + "grad_norm": 1.1294502019882202, + "learning_rate": 4.494976871184361e-05, + "loss": 0.3692, + "step": 1562 + }, + { + "epoch": 0.9510191664131427, + "grad_norm": 1.0184043645858765, + "learning_rate": 4.494226369205147e-05, + "loss": 0.4318, + "step": 1563 + }, + { + "epoch": 0.9516276239732279, + "grad_norm": 1.053466796875, + "learning_rate": 4.493475372740916e-05, + "loss": 0.4011, + "step": 1564 + }, + { + "epoch": 0.9522360815333131, + "grad_norm": 1.0696157217025757, + "learning_rate": 4.492723881977885e-05, + "loss": 0.4152, + "step": 1565 + }, + { + "epoch": 0.9528445390933983, + "grad_norm": 1.0308218002319336, + "learning_rate": 4.4919718971023926e-05, + "loss": 0.3829, + "step": 1566 + }, + { + "epoch": 0.9534529966534834, + "grad_norm": 1.0396000146865845, + "learning_rate": 4.4912194183008994e-05, + "loss": 0.3648, + "step": 1567 + }, + { + "epoch": 0.9540614542135686, + "grad_norm": 1.12269926071167, + "learning_rate": 4.490466445759988e-05, + "loss": 0.4426, + "step": 1568 + }, + { + "epoch": 0.9546699117736538, + "grad_norm": 1.1220544576644897, + "learning_rate": 4.489712979666365e-05, + "loss": 0.3841, + "step": 1569 + }, + { + "epoch": 0.955278369333739, + "grad_norm": 0.9514595866203308, + "learning_rate": 4.4889590202068584e-05, + "loss": 0.384, + "step": 1570 + }, + { + "epoch": 0.9558868268938242, + "grad_norm": 0.9942927956581116, + "learning_rate": 4.4882045675684184e-05, + "loss": 0.3854, + "step": 1571 + }, + { + "epoch": 0.9564952844539093, + "grad_norm": 1.0529226064682007, + "learning_rate": 4.487449621938118e-05, + "loss": 0.3861, + "step": 1572 + }, + { + "epoch": 0.9571037420139945, + "grad_norm": 1.1254863739013672, + "learning_rate": 4.486694183503153e-05, + "loss": 0.4056, + "step": 1573 + }, + { + "epoch": 0.9577121995740797, + "grad_norm": 0.9550037980079651, + "learning_rate": 4.48593825245084e-05, + "loss": 0.3541, + "step": 1574 + }, + { + "epoch": 0.9583206571341649, + "grad_norm": 1.0226643085479736, + "learning_rate": 4.4851818289686175e-05, + "loss": 0.4194, + "step": 1575 + }, + { + "epoch": 0.9589291146942501, + "grad_norm": 1.5648815631866455, + "learning_rate": 4.484424913244049e-05, + "loss": 0.3899, + "step": 1576 + }, + { + "epoch": 0.9595375722543352, + "grad_norm": 1.0767285823822021, + "learning_rate": 4.4836675054648156e-05, + "loss": 0.386, + "step": 1577 + }, + { + "epoch": 0.9601460298144204, + "grad_norm": 1.153304100036621, + "learning_rate": 4.482909605818725e-05, + "loss": 0.4087, + "step": 1578 + }, + { + "epoch": 0.9607544873745056, + "grad_norm": 0.8764035105705261, + "learning_rate": 4.482151214493704e-05, + "loss": 0.3476, + "step": 1579 + }, + { + "epoch": 0.9613629449345908, + "grad_norm": 2.1990628242492676, + "learning_rate": 4.4813923316778014e-05, + "loss": 0.43, + "step": 1580 + }, + { + "epoch": 0.9619714024946759, + "grad_norm": 1.129573106765747, + "learning_rate": 4.48063295755919e-05, + "loss": 0.4173, + "step": 1581 + }, + { + "epoch": 0.9625798600547611, + "grad_norm": 1.4812309741973877, + "learning_rate": 4.4798730923261614e-05, + "loss": 0.4175, + "step": 1582 + }, + { + "epoch": 0.9631883176148464, + "grad_norm": 0.9128961563110352, + "learning_rate": 4.4791127361671304e-05, + "loss": 0.3943, + "step": 1583 + }, + { + "epoch": 0.9637967751749316, + "grad_norm": 1.1068278551101685, + "learning_rate": 4.478351889270635e-05, + "loss": 0.4152, + "step": 1584 + }, + { + "epoch": 0.9644052327350168, + "grad_norm": 0.88494873046875, + "learning_rate": 4.477590551825333e-05, + "loss": 0.3745, + "step": 1585 + }, + { + "epoch": 0.9650136902951019, + "grad_norm": 1.0295628309249878, + "learning_rate": 4.476828724020004e-05, + "loss": 0.4369, + "step": 1586 + }, + { + "epoch": 0.9656221478551871, + "grad_norm": 0.9472355246543884, + "learning_rate": 4.47606640604355e-05, + "loss": 0.3364, + "step": 1587 + }, + { + "epoch": 0.9662306054152723, + "grad_norm": 0.9680747389793396, + "learning_rate": 4.4753035980849935e-05, + "loss": 0.3878, + "step": 1588 + }, + { + "epoch": 0.9668390629753575, + "grad_norm": 0.9909029603004456, + "learning_rate": 4.4745403003334784e-05, + "loss": 0.3365, + "step": 1589 + }, + { + "epoch": 0.9674475205354427, + "grad_norm": 1.3338682651519775, + "learning_rate": 4.4737765129782735e-05, + "loss": 0.3644, + "step": 1590 + }, + { + "epoch": 0.9680559780955278, + "grad_norm": 0.9651134014129639, + "learning_rate": 4.473012236208763e-05, + "loss": 0.3977, + "step": 1591 + }, + { + "epoch": 0.968664435655613, + "grad_norm": 1.1207659244537354, + "learning_rate": 4.472247470214458e-05, + "loss": 0.395, + "step": 1592 + }, + { + "epoch": 0.9692728932156982, + "grad_norm": 1.0144009590148926, + "learning_rate": 4.471482215184988e-05, + "loss": 0.3607, + "step": 1593 + }, + { + "epoch": 0.9698813507757834, + "grad_norm": 0.911418616771698, + "learning_rate": 4.470716471310103e-05, + "loss": 0.3452, + "step": 1594 + }, + { + "epoch": 0.9704898083358686, + "grad_norm": 0.9752874374389648, + "learning_rate": 4.469950238779677e-05, + "loss": 0.3708, + "step": 1595 + }, + { + "epoch": 0.9710982658959537, + "grad_norm": 0.8316762447357178, + "learning_rate": 4.469183517783704e-05, + "loss": 0.3127, + "step": 1596 + }, + { + "epoch": 0.9717067234560389, + "grad_norm": 0.9781742095947266, + "learning_rate": 4.4684163085122976e-05, + "loss": 0.3197, + "step": 1597 + }, + { + "epoch": 0.9723151810161241, + "grad_norm": 1.1461424827575684, + "learning_rate": 4.4676486111556936e-05, + "loss": 0.3754, + "step": 1598 + }, + { + "epoch": 0.9729236385762093, + "grad_norm": 1.0407254695892334, + "learning_rate": 4.466880425904251e-05, + "loss": 0.4198, + "step": 1599 + }, + { + "epoch": 0.9735320961362945, + "grad_norm": 0.9534010887145996, + "learning_rate": 4.466111752948446e-05, + "loss": 0.3313, + "step": 1600 + }, + { + "epoch": 0.9741405536963796, + "grad_norm": 1.008195400238037, + "learning_rate": 4.465342592478878e-05, + "loss": 0.366, + "step": 1601 + }, + { + "epoch": 0.9747490112564648, + "grad_norm": 0.928765058517456, + "learning_rate": 4.464572944686266e-05, + "loss": 0.3674, + "step": 1602 + }, + { + "epoch": 0.97535746881655, + "grad_norm": 1.0775413513183594, + "learning_rate": 4.4638028097614515e-05, + "loss": 0.448, + "step": 1603 + }, + { + "epoch": 0.9759659263766353, + "grad_norm": 0.9224221706390381, + "learning_rate": 4.463032187895395e-05, + "loss": 0.3466, + "step": 1604 + }, + { + "epoch": 0.9765743839367204, + "grad_norm": 1.0811947584152222, + "learning_rate": 4.46226107927918e-05, + "loss": 0.4303, + "step": 1605 + }, + { + "epoch": 0.9771828414968056, + "grad_norm": 1.023656964302063, + "learning_rate": 4.4614894841040076e-05, + "loss": 0.3192, + "step": 1606 + }, + { + "epoch": 0.9777912990568908, + "grad_norm": 1.0087846517562866, + "learning_rate": 4.460717402561203e-05, + "loss": 0.3878, + "step": 1607 + }, + { + "epoch": 0.978399756616976, + "grad_norm": 1.254363775253296, + "learning_rate": 4.4599448348422087e-05, + "loss": 0.4048, + "step": 1608 + }, + { + "epoch": 0.9790082141770612, + "grad_norm": 1.086838722229004, + "learning_rate": 4.45917178113859e-05, + "loss": 0.3949, + "step": 1609 + }, + { + "epoch": 0.9796166717371463, + "grad_norm": 1.0660868883132935, + "learning_rate": 4.458398241642032e-05, + "loss": 0.3275, + "step": 1610 + }, + { + "epoch": 0.9802251292972315, + "grad_norm": 1.0936857461929321, + "learning_rate": 4.4576242165443394e-05, + "loss": 0.4235, + "step": 1611 + }, + { + "epoch": 0.9808335868573167, + "grad_norm": 1.1144509315490723, + "learning_rate": 4.456849706037439e-05, + "loss": 0.4732, + "step": 1612 + }, + { + "epoch": 0.9814420444174019, + "grad_norm": 1.0750304460525513, + "learning_rate": 4.456074710313378e-05, + "loss": 0.3902, + "step": 1613 + }, + { + "epoch": 0.9820505019774871, + "grad_norm": 1.7120141983032227, + "learning_rate": 4.455299229564321e-05, + "loss": 0.5309, + "step": 1614 + }, + { + "epoch": 0.9826589595375722, + "grad_norm": 1.0807467699050903, + "learning_rate": 4.454523263982557e-05, + "loss": 0.4076, + "step": 1615 + }, + { + "epoch": 0.9832674170976574, + "grad_norm": 1.0277988910675049, + "learning_rate": 4.453746813760492e-05, + "loss": 0.3838, + "step": 1616 + }, + { + "epoch": 0.9838758746577426, + "grad_norm": 1.0415362119674683, + "learning_rate": 4.452969879090653e-05, + "loss": 0.3689, + "step": 1617 + }, + { + "epoch": 0.9844843322178278, + "grad_norm": 1.0815787315368652, + "learning_rate": 4.452192460165687e-05, + "loss": 0.337, + "step": 1618 + }, + { + "epoch": 0.985092789777913, + "grad_norm": 1.2625021934509277, + "learning_rate": 4.451414557178363e-05, + "loss": 0.3757, + "step": 1619 + }, + { + "epoch": 0.9857012473379981, + "grad_norm": 1.092826247215271, + "learning_rate": 4.450636170321568e-05, + "loss": 0.3752, + "step": 1620 + }, + { + "epoch": 0.9863097048980833, + "grad_norm": 1.0274910926818848, + "learning_rate": 4.449857299788309e-05, + "loss": 0.3388, + "step": 1621 + }, + { + "epoch": 0.9869181624581685, + "grad_norm": 0.9931478500366211, + "learning_rate": 4.449077945771714e-05, + "loss": 0.366, + "step": 1622 + }, + { + "epoch": 0.9875266200182538, + "grad_norm": 1.035408616065979, + "learning_rate": 4.44829810846503e-05, + "loss": 0.3855, + "step": 1623 + }, + { + "epoch": 0.988135077578339, + "grad_norm": 1.1031928062438965, + "learning_rate": 4.447517788061624e-05, + "loss": 0.4198, + "step": 1624 + }, + { + "epoch": 0.9887435351384241, + "grad_norm": 1.122717261314392, + "learning_rate": 4.446736984754982e-05, + "loss": 0.4499, + "step": 1625 + }, + { + "epoch": 0.9893519926985093, + "grad_norm": 0.972223699092865, + "learning_rate": 4.445955698738714e-05, + "loss": 0.4174, + "step": 1626 + }, + { + "epoch": 0.9899604502585945, + "grad_norm": 0.8995710611343384, + "learning_rate": 4.445173930206543e-05, + "loss": 0.3257, + "step": 1627 + }, + { + "epoch": 0.9905689078186797, + "grad_norm": 0.9216899871826172, + "learning_rate": 4.444391679352315e-05, + "loss": 0.3732, + "step": 1628 + }, + { + "epoch": 0.9911773653787649, + "grad_norm": 1.138875961303711, + "learning_rate": 4.4436089463699984e-05, + "loss": 0.4355, + "step": 1629 + }, + { + "epoch": 0.99178582293885, + "grad_norm": 0.9231753945350647, + "learning_rate": 4.442825731453676e-05, + "loss": 0.3415, + "step": 1630 + }, + { + "epoch": 0.9923942804989352, + "grad_norm": 1.0919394493103027, + "learning_rate": 4.4420420347975535e-05, + "loss": 0.3939, + "step": 1631 + }, + { + "epoch": 0.9930027380590204, + "grad_norm": 1.25301194190979, + "learning_rate": 4.4412578565959554e-05, + "loss": 0.3438, + "step": 1632 + }, + { + "epoch": 0.9936111956191056, + "grad_norm": 1.0430591106414795, + "learning_rate": 4.440473197043323e-05, + "loss": 0.4462, + "step": 1633 + }, + { + "epoch": 0.9942196531791907, + "grad_norm": 0.9053996205329895, + "learning_rate": 4.439688056334221e-05, + "loss": 0.3515, + "step": 1634 + }, + { + "epoch": 0.9948281107392759, + "grad_norm": 1.0166853666305542, + "learning_rate": 4.438902434663331e-05, + "loss": 0.3846, + "step": 1635 + }, + { + "epoch": 0.9954365682993611, + "grad_norm": 1.1809371709823608, + "learning_rate": 4.438116332225456e-05, + "loss": 0.4166, + "step": 1636 + }, + { + "epoch": 0.9960450258594463, + "grad_norm": 0.8665146231651306, + "learning_rate": 4.437329749215514e-05, + "loss": 0.3229, + "step": 1637 + }, + { + "epoch": 0.9966534834195315, + "grad_norm": 0.9558749198913574, + "learning_rate": 4.4365426858285466e-05, + "loss": 0.3278, + "step": 1638 + }, + { + "epoch": 0.9972619409796166, + "grad_norm": 0.9763336777687073, + "learning_rate": 4.435755142259712e-05, + "loss": 0.3675, + "step": 1639 + }, + { + "epoch": 0.9978703985397018, + "grad_norm": 1.078508734703064, + "learning_rate": 4.434967118704289e-05, + "loss": 0.3435, + "step": 1640 + }, + { + "epoch": 0.998478856099787, + "grad_norm": 1.0996962785720825, + "learning_rate": 4.434178615357673e-05, + "loss": 0.4125, + "step": 1641 + }, + { + "epoch": 0.9990873136598722, + "grad_norm": 1.0043129920959473, + "learning_rate": 4.433389632415381e-05, + "loss": 0.4054, + "step": 1642 + }, + { + "epoch": 0.9996957712199575, + "grad_norm": 1.0766171216964722, + "learning_rate": 4.432600170073048e-05, + "loss": 0.3377, + "step": 1643 + }, + { + "epoch": 0.9996957712199575, + "eval_loss": 1.0254058837890625, + "eval_runtime": 108.4457, + "eval_samples_per_second": 7.027, + "eval_steps_per_second": 0.443, + "step": 1643 + }, + { + "epoch": 1.0003042287800425, + "grad_norm": 0.9912607073783875, + "learning_rate": 4.431810228526428e-05, + "loss": 0.3113, + "step": 1644 + }, + { + "epoch": 1.0009126863401279, + "grad_norm": 0.9680461287498474, + "learning_rate": 4.431019807971393e-05, + "loss": 0.3006, + "step": 1645 + }, + { + "epoch": 1.001521143900213, + "grad_norm": 0.9305565357208252, + "learning_rate": 4.430228908603934e-05, + "loss": 0.2665, + "step": 1646 + }, + { + "epoch": 1.002129601460298, + "grad_norm": 0.9174848198890686, + "learning_rate": 4.4294375306201617e-05, + "loss": 0.2765, + "step": 1647 + }, + { + "epoch": 1.0027380590203834, + "grad_norm": 0.8088688254356384, + "learning_rate": 4.4286456742163055e-05, + "loss": 0.2378, + "step": 1648 + }, + { + "epoch": 1.0033465165804685, + "grad_norm": 1.254451870918274, + "learning_rate": 4.427853339588711e-05, + "loss": 0.2796, + "step": 1649 + }, + { + "epoch": 1.0039549741405538, + "grad_norm": 0.9241443872451782, + "learning_rate": 4.427060526933846e-05, + "loss": 0.2814, + "step": 1650 + }, + { + "epoch": 1.0045634317006389, + "grad_norm": 1.0514885187149048, + "learning_rate": 4.4262672364482935e-05, + "loss": 0.2402, + "step": 1651 + }, + { + "epoch": 1.005171889260724, + "grad_norm": 0.931946337223053, + "learning_rate": 4.4254734683287575e-05, + "loss": 0.2502, + "step": 1652 + }, + { + "epoch": 1.0057803468208093, + "grad_norm": 2.174992561340332, + "learning_rate": 4.424679222772059e-05, + "loss": 0.2845, + "step": 1653 + }, + { + "epoch": 1.0063888043808944, + "grad_norm": 0.9909196496009827, + "learning_rate": 4.4238844999751376e-05, + "loss": 0.2355, + "step": 1654 + }, + { + "epoch": 1.0069972619409797, + "grad_norm": 0.9932165741920471, + "learning_rate": 4.423089300135052e-05, + "loss": 0.2375, + "step": 1655 + }, + { + "epoch": 1.0076057195010648, + "grad_norm": 1.1847784519195557, + "learning_rate": 4.422293623448978e-05, + "loss": 0.2185, + "step": 1656 + }, + { + "epoch": 1.00821417706115, + "grad_norm": 0.9104436039924622, + "learning_rate": 4.421497470114211e-05, + "loss": 0.2347, + "step": 1657 + }, + { + "epoch": 1.0088226346212352, + "grad_norm": 1.0744693279266357, + "learning_rate": 4.420700840328162e-05, + "loss": 0.2824, + "step": 1658 + }, + { + "epoch": 1.0094310921813203, + "grad_norm": 1.0241225957870483, + "learning_rate": 4.4199037342883656e-05, + "loss": 0.2483, + "step": 1659 + }, + { + "epoch": 1.0100395497414056, + "grad_norm": 1.3999607563018799, + "learning_rate": 4.419106152192467e-05, + "loss": 0.2848, + "step": 1660 + }, + { + "epoch": 1.0106480073014907, + "grad_norm": 1.127521276473999, + "learning_rate": 4.418308094238235e-05, + "loss": 0.2263, + "step": 1661 + }, + { + "epoch": 1.0112564648615758, + "grad_norm": 1.0944982767105103, + "learning_rate": 4.417509560623555e-05, + "loss": 0.2932, + "step": 1662 + }, + { + "epoch": 1.0118649224216612, + "grad_norm": 0.9151603579521179, + "learning_rate": 4.4167105515464305e-05, + "loss": 0.2578, + "step": 1663 + }, + { + "epoch": 1.0124733799817462, + "grad_norm": 1.0785083770751953, + "learning_rate": 4.415911067204981e-05, + "loss": 0.2092, + "step": 1664 + }, + { + "epoch": 1.0130818375418316, + "grad_norm": 1.1762471199035645, + "learning_rate": 4.415111107797445e-05, + "loss": 0.3176, + "step": 1665 + }, + { + "epoch": 1.0136902951019167, + "grad_norm": 0.9114542007446289, + "learning_rate": 4.414310673522181e-05, + "loss": 0.1926, + "step": 1666 + }, + { + "epoch": 1.0142987526620018, + "grad_norm": 1.0898659229278564, + "learning_rate": 4.413509764577663e-05, + "loss": 0.2444, + "step": 1667 + }, + { + "epoch": 1.014907210222087, + "grad_norm": 1.3012317419052124, + "learning_rate": 4.412708381162481e-05, + "loss": 0.2119, + "step": 1668 + }, + { + "epoch": 1.0155156677821722, + "grad_norm": 0.9848833680152893, + "learning_rate": 4.411906523475347e-05, + "loss": 0.2507, + "step": 1669 + }, + { + "epoch": 1.0161241253422575, + "grad_norm": 1.0226378440856934, + "learning_rate": 4.411104191715087e-05, + "loss": 0.2876, + "step": 1670 + }, + { + "epoch": 1.0167325829023426, + "grad_norm": 0.9476386308670044, + "learning_rate": 4.410301386080646e-05, + "loss": 0.209, + "step": 1671 + }, + { + "epoch": 1.0173410404624277, + "grad_norm": 1.0117278099060059, + "learning_rate": 4.4094981067710864e-05, + "loss": 0.2523, + "step": 1672 + }, + { + "epoch": 1.017949498022513, + "grad_norm": 1.3588906526565552, + "learning_rate": 4.408694353985589e-05, + "loss": 0.2786, + "step": 1673 + }, + { + "epoch": 1.018557955582598, + "grad_norm": 1.4190876483917236, + "learning_rate": 4.40789012792345e-05, + "loss": 0.2771, + "step": 1674 + }, + { + "epoch": 1.0191664131426832, + "grad_norm": 1.1058770418167114, + "learning_rate": 4.4070854287840836e-05, + "loss": 0.2673, + "step": 1675 + }, + { + "epoch": 1.0197748707027685, + "grad_norm": 0.8771906495094299, + "learning_rate": 4.406280256767022e-05, + "loss": 0.2254, + "step": 1676 + }, + { + "epoch": 1.0203833282628536, + "grad_norm": 0.8889464735984802, + "learning_rate": 4.4054746120719145e-05, + "loss": 0.2608, + "step": 1677 + }, + { + "epoch": 1.020991785822939, + "grad_norm": 0.8301272392272949, + "learning_rate": 4.404668494898527e-05, + "loss": 0.2089, + "step": 1678 + }, + { + "epoch": 1.021600243383024, + "grad_norm": 0.9632772207260132, + "learning_rate": 4.403861905446744e-05, + "loss": 0.2448, + "step": 1679 + }, + { + "epoch": 1.0222087009431091, + "grad_norm": 0.8604543209075928, + "learning_rate": 4.4030548439165654e-05, + "loss": 0.254, + "step": 1680 + }, + { + "epoch": 1.0228171585031944, + "grad_norm": 0.9880070090293884, + "learning_rate": 4.402247310508108e-05, + "loss": 0.2649, + "step": 1681 + }, + { + "epoch": 1.0234256160632795, + "grad_norm": 1.0415153503417969, + "learning_rate": 4.4014393054216076e-05, + "loss": 0.243, + "step": 1682 + }, + { + "epoch": 1.0240340736233648, + "grad_norm": 1.1902730464935303, + "learning_rate": 4.400630828857415e-05, + "loss": 0.2617, + "step": 1683 + }, + { + "epoch": 1.02464253118345, + "grad_norm": 0.9809486269950867, + "learning_rate": 4.3998218810159995e-05, + "loss": 0.2415, + "step": 1684 + }, + { + "epoch": 1.025250988743535, + "grad_norm": 1.0021169185638428, + "learning_rate": 4.3990124620979454e-05, + "loss": 0.2443, + "step": 1685 + }, + { + "epoch": 1.0258594463036204, + "grad_norm": 0.8601137399673462, + "learning_rate": 4.3982025723039564e-05, + "loss": 0.2291, + "step": 1686 + }, + { + "epoch": 1.0264679038637055, + "grad_norm": 1.0233848094940186, + "learning_rate": 4.3973922118348497e-05, + "loss": 0.2482, + "step": 1687 + }, + { + "epoch": 1.0270763614237908, + "grad_norm": 1.0119305849075317, + "learning_rate": 4.396581380891562e-05, + "loss": 0.2517, + "step": 1688 + }, + { + "epoch": 1.0276848189838759, + "grad_norm": 0.9090347290039062, + "learning_rate": 4.3957700796751446e-05, + "loss": 0.2357, + "step": 1689 + }, + { + "epoch": 1.028293276543961, + "grad_norm": 0.8543658256530762, + "learning_rate": 4.394958308386768e-05, + "loss": 0.2061, + "step": 1690 + }, + { + "epoch": 1.0289017341040463, + "grad_norm": 1.0883513689041138, + "learning_rate": 4.394146067227716e-05, + "loss": 0.2704, + "step": 1691 + }, + { + "epoch": 1.0295101916641314, + "grad_norm": 0.9570848345756531, + "learning_rate": 4.393333356399391e-05, + "loss": 0.2509, + "step": 1692 + }, + { + "epoch": 1.0301186492242167, + "grad_norm": 0.9206425547599792, + "learning_rate": 4.3925201761033116e-05, + "loss": 0.2453, + "step": 1693 + }, + { + "epoch": 1.0307271067843018, + "grad_norm": 0.9678740501403809, + "learning_rate": 4.391706526541114e-05, + "loss": 0.2438, + "step": 1694 + }, + { + "epoch": 1.031335564344387, + "grad_norm": 0.9064133763313293, + "learning_rate": 4.390892407914547e-05, + "loss": 0.2221, + "step": 1695 + }, + { + "epoch": 1.0319440219044722, + "grad_norm": 1.0082942247390747, + "learning_rate": 4.390077820425479e-05, + "loss": 0.2859, + "step": 1696 + }, + { + "epoch": 1.0325524794645573, + "grad_norm": 0.9536372423171997, + "learning_rate": 4.3892627642758946e-05, + "loss": 0.2616, + "step": 1697 + }, + { + "epoch": 1.0331609370246426, + "grad_norm": 0.9097919464111328, + "learning_rate": 4.388447239667892e-05, + "loss": 0.2647, + "step": 1698 + }, + { + "epoch": 1.0337693945847277, + "grad_norm": 0.9455602765083313, + "learning_rate": 4.387631246803689e-05, + "loss": 0.2271, + "step": 1699 + }, + { + "epoch": 1.0343778521448128, + "grad_norm": 1.0754998922348022, + "learning_rate": 4.386814785885617e-05, + "loss": 0.2671, + "step": 1700 + }, + { + "epoch": 1.0349863097048981, + "grad_norm": 0.8816566467285156, + "learning_rate": 4.3859978571161245e-05, + "loss": 0.2149, + "step": 1701 + }, + { + "epoch": 1.0355947672649832, + "grad_norm": 0.8995217084884644, + "learning_rate": 4.3851804606977756e-05, + "loss": 0.2361, + "step": 1702 + }, + { + "epoch": 1.0362032248250685, + "grad_norm": 1.0205188989639282, + "learning_rate": 4.38436259683325e-05, + "loss": 0.2671, + "step": 1703 + }, + { + "epoch": 1.0368116823851536, + "grad_norm": 0.9358905553817749, + "learning_rate": 4.383544265725346e-05, + "loss": 0.2259, + "step": 1704 + }, + { + "epoch": 1.0374201399452387, + "grad_norm": 1.6457610130310059, + "learning_rate": 4.382725467576973e-05, + "loss": 0.3296, + "step": 1705 + }, + { + "epoch": 1.038028597505324, + "grad_norm": 0.9523285627365112, + "learning_rate": 4.38190620259116e-05, + "loss": 0.2466, + "step": 1706 + }, + { + "epoch": 1.0386370550654092, + "grad_norm": 2.4622488021850586, + "learning_rate": 4.381086470971051e-05, + "loss": 0.2464, + "step": 1707 + }, + { + "epoch": 1.0392455126254945, + "grad_norm": 0.9736045002937317, + "learning_rate": 4.380266272919904e-05, + "loss": 0.226, + "step": 1708 + }, + { + "epoch": 1.0398539701855796, + "grad_norm": 0.9621555209159851, + "learning_rate": 4.379445608641095e-05, + "loss": 0.247, + "step": 1709 + }, + { + "epoch": 1.0404624277456647, + "grad_norm": 0.9200358390808105, + "learning_rate": 4.378624478338115e-05, + "loss": 0.2385, + "step": 1710 + }, + { + "epoch": 1.04107088530575, + "grad_norm": 0.921256422996521, + "learning_rate": 4.377802882214568e-05, + "loss": 0.2536, + "step": 1711 + }, + { + "epoch": 1.041679342865835, + "grad_norm": 1.0362918376922607, + "learning_rate": 4.3769808204741766e-05, + "loss": 0.2505, + "step": 1712 + }, + { + "epoch": 1.0422878004259204, + "grad_norm": 0.9670946002006531, + "learning_rate": 4.37615829332078e-05, + "loss": 0.2311, + "step": 1713 + }, + { + "epoch": 1.0428962579860055, + "grad_norm": 0.9415709376335144, + "learning_rate": 4.3753353009583275e-05, + "loss": 0.2073, + "step": 1714 + }, + { + "epoch": 1.0435047155460906, + "grad_norm": 0.9122322797775269, + "learning_rate": 4.374511843590888e-05, + "loss": 0.2956, + "step": 1715 + }, + { + "epoch": 1.044113173106176, + "grad_norm": 0.8133691549301147, + "learning_rate": 4.373687921422646e-05, + "loss": 0.2302, + "step": 1716 + }, + { + "epoch": 1.044721630666261, + "grad_norm": 0.9611904621124268, + "learning_rate": 4.372863534657897e-05, + "loss": 0.242, + "step": 1717 + }, + { + "epoch": 1.045330088226346, + "grad_norm": 1.105997085571289, + "learning_rate": 4.372038683501057e-05, + "loss": 0.2738, + "step": 1718 + }, + { + "epoch": 1.0459385457864314, + "grad_norm": 0.9072719216346741, + "learning_rate": 4.3712133681566546e-05, + "loss": 0.3021, + "step": 1719 + }, + { + "epoch": 1.0465470033465165, + "grad_norm": 0.9673424959182739, + "learning_rate": 4.370387588829332e-05, + "loss": 0.2756, + "step": 1720 + }, + { + "epoch": 1.0471554609066018, + "grad_norm": 0.8735299706459045, + "learning_rate": 4.369561345723849e-05, + "loss": 0.244, + "step": 1721 + }, + { + "epoch": 1.047763918466687, + "grad_norm": 0.8752166628837585, + "learning_rate": 4.36873463904508e-05, + "loss": 0.2251, + "step": 1722 + }, + { + "epoch": 1.048372376026772, + "grad_norm": 0.845640242099762, + "learning_rate": 4.367907468998013e-05, + "loss": 0.2487, + "step": 1723 + }, + { + "epoch": 1.0489808335868573, + "grad_norm": 0.846122682094574, + "learning_rate": 4.3670798357877515e-05, + "loss": 0.2061, + "step": 1724 + }, + { + "epoch": 1.0495892911469424, + "grad_norm": 0.9644899368286133, + "learning_rate": 4.366251739619515e-05, + "loss": 0.2853, + "step": 1725 + }, + { + "epoch": 1.0501977487070278, + "grad_norm": 1.052946925163269, + "learning_rate": 4.365423180698636e-05, + "loss": 0.2502, + "step": 1726 + }, + { + "epoch": 1.0508062062671129, + "grad_norm": 1.0697331428527832, + "learning_rate": 4.364594159230563e-05, + "loss": 0.2259, + "step": 1727 + }, + { + "epoch": 1.051414663827198, + "grad_norm": 0.9684371948242188, + "learning_rate": 4.363764675420858e-05, + "loss": 0.2298, + "step": 1728 + }, + { + "epoch": 1.0520231213872833, + "grad_norm": 0.9283509254455566, + "learning_rate": 4.3629347294752e-05, + "loss": 0.2221, + "step": 1729 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.9486536383628845, + "learning_rate": 4.3621043215993793e-05, + "loss": 0.2321, + "step": 1730 + }, + { + "epoch": 1.0532400365074537, + "grad_norm": 0.9327476024627686, + "learning_rate": 4.3612734519993035e-05, + "loss": 0.2353, + "step": 1731 + }, + { + "epoch": 1.0538484940675388, + "grad_norm": 0.9721671342849731, + "learning_rate": 4.360442120880994e-05, + "loss": 0.2255, + "step": 1732 + }, + { + "epoch": 1.0544569516276239, + "grad_norm": 1.0690611600875854, + "learning_rate": 4.3596103284505854e-05, + "loss": 0.2511, + "step": 1733 + }, + { + "epoch": 1.0550654091877092, + "grad_norm": 0.9945971965789795, + "learning_rate": 4.358778074914326e-05, + "loss": 0.2729, + "step": 1734 + }, + { + "epoch": 1.0556738667477943, + "grad_norm": 1.0413203239440918, + "learning_rate": 4.357945360478584e-05, + "loss": 0.2918, + "step": 1735 + }, + { + "epoch": 1.0562823243078796, + "grad_norm": 0.8384091854095459, + "learning_rate": 4.357112185349834e-05, + "loss": 0.2324, + "step": 1736 + }, + { + "epoch": 1.0568907818679647, + "grad_norm": 1.018020510673523, + "learning_rate": 4.3562785497346706e-05, + "loss": 0.262, + "step": 1737 + }, + { + "epoch": 1.0574992394280498, + "grad_norm": 0.9685660004615784, + "learning_rate": 4.355444453839801e-05, + "loss": 0.2684, + "step": 1738 + }, + { + "epoch": 1.0581076969881351, + "grad_norm": 1.006584882736206, + "learning_rate": 4.354609897872044e-05, + "loss": 0.2912, + "step": 1739 + }, + { + "epoch": 1.0587161545482202, + "grad_norm": 0.9230665564537048, + "learning_rate": 4.3537748820383386e-05, + "loss": 0.2542, + "step": 1740 + }, + { + "epoch": 1.0593246121083055, + "grad_norm": 0.9952296018600464, + "learning_rate": 4.35293940654573e-05, + "loss": 0.2588, + "step": 1741 + }, + { + "epoch": 1.0599330696683906, + "grad_norm": 0.8689709305763245, + "learning_rate": 4.352103471601383e-05, + "loss": 0.2159, + "step": 1742 + }, + { + "epoch": 1.0605415272284757, + "grad_norm": 0.9874710440635681, + "learning_rate": 4.351267077412575e-05, + "loss": 0.2766, + "step": 1743 + }, + { + "epoch": 1.061149984788561, + "grad_norm": 0.9209951758384705, + "learning_rate": 4.350430224186696e-05, + "loss": 0.2918, + "step": 1744 + }, + { + "epoch": 1.0617584423486461, + "grad_norm": 0.9789804220199585, + "learning_rate": 4.349592912131252e-05, + "loss": 0.26, + "step": 1745 + }, + { + "epoch": 1.0623668999087315, + "grad_norm": 0.963188111782074, + "learning_rate": 4.3487551414538595e-05, + "loss": 0.2295, + "step": 1746 + }, + { + "epoch": 1.0629753574688166, + "grad_norm": 0.8524497151374817, + "learning_rate": 4.347916912362252e-05, + "loss": 0.2094, + "step": 1747 + }, + { + "epoch": 1.0635838150289016, + "grad_norm": 1.0623359680175781, + "learning_rate": 4.347078225064276e-05, + "loss": 0.263, + "step": 1748 + }, + { + "epoch": 1.064192272588987, + "grad_norm": 1.013371467590332, + "learning_rate": 4.34623907976789e-05, + "loss": 0.2404, + "step": 1749 + }, + { + "epoch": 1.064800730149072, + "grad_norm": 0.8995924592018127, + "learning_rate": 4.345399476681167e-05, + "loss": 0.2167, + "step": 1750 + }, + { + "epoch": 1.0654091877091574, + "grad_norm": 1.011319875717163, + "learning_rate": 4.344559416012293e-05, + "loss": 0.2701, + "step": 1751 + }, + { + "epoch": 1.0660176452692425, + "grad_norm": 0.9735434055328369, + "learning_rate": 4.34371889796957e-05, + "loss": 0.2159, + "step": 1752 + }, + { + "epoch": 1.0666261028293276, + "grad_norm": 0.8756011128425598, + "learning_rate": 4.34287792276141e-05, + "loss": 0.2148, + "step": 1753 + }, + { + "epoch": 1.067234560389413, + "grad_norm": 0.8667939305305481, + "learning_rate": 4.34203649059634e-05, + "loss": 0.1809, + "step": 1754 + }, + { + "epoch": 1.067843017949498, + "grad_norm": 0.9129221439361572, + "learning_rate": 4.341194601683e-05, + "loss": 0.2344, + "step": 1755 + }, + { + "epoch": 1.0684514755095833, + "grad_norm": 0.952587902545929, + "learning_rate": 4.340352256230144e-05, + "loss": 0.2158, + "step": 1756 + }, + { + "epoch": 1.0690599330696684, + "grad_norm": 0.932098388671875, + "learning_rate": 4.339509454446637e-05, + "loss": 0.2488, + "step": 1757 + }, + { + "epoch": 1.0696683906297535, + "grad_norm": 1.0526845455169678, + "learning_rate": 4.338666196541461e-05, + "loss": 0.276, + "step": 1758 + }, + { + "epoch": 1.0702768481898388, + "grad_norm": 1.0784953832626343, + "learning_rate": 4.337822482723708e-05, + "loss": 0.2574, + "step": 1759 + }, + { + "epoch": 1.070885305749924, + "grad_norm": 1.0775099992752075, + "learning_rate": 4.336978313202583e-05, + "loss": 0.2513, + "step": 1760 + }, + { + "epoch": 1.0714937633100092, + "grad_norm": 0.990006685256958, + "learning_rate": 4.336133688187405e-05, + "loss": 0.237, + "step": 1761 + }, + { + "epoch": 1.0721022208700943, + "grad_norm": 0.9512050151824951, + "learning_rate": 4.3352886078876065e-05, + "loss": 0.2594, + "step": 1762 + }, + { + "epoch": 1.0727106784301794, + "grad_norm": 1.0135704278945923, + "learning_rate": 4.3344430725127315e-05, + "loss": 0.2292, + "step": 1763 + }, + { + "epoch": 1.0733191359902647, + "grad_norm": 1.0516897439956665, + "learning_rate": 4.333597082272438e-05, + "loss": 0.2289, + "step": 1764 + }, + { + "epoch": 1.0739275935503498, + "grad_norm": 0.9647794365882874, + "learning_rate": 4.332750637376496e-05, + "loss": 0.2488, + "step": 1765 + }, + { + "epoch": 1.0745360511104352, + "grad_norm": 0.84974604845047, + "learning_rate": 4.331903738034789e-05, + "loss": 0.2196, + "step": 1766 + }, + { + "epoch": 1.0751445086705202, + "grad_norm": 0.9360343813896179, + "learning_rate": 4.331056384457313e-05, + "loss": 0.2452, + "step": 1767 + }, + { + "epoch": 1.0757529662306053, + "grad_norm": 0.920875608921051, + "learning_rate": 4.330208576854176e-05, + "loss": 0.2618, + "step": 1768 + }, + { + "epoch": 1.0763614237906907, + "grad_norm": 0.9963610172271729, + "learning_rate": 4.3293603154355976e-05, + "loss": 0.2536, + "step": 1769 + }, + { + "epoch": 1.0769698813507758, + "grad_norm": 0.914340615272522, + "learning_rate": 4.328511600411913e-05, + "loss": 0.2487, + "step": 1770 + }, + { + "epoch": 1.077578338910861, + "grad_norm": 0.8099347949028015, + "learning_rate": 4.327662431993568e-05, + "loss": 0.2186, + "step": 1771 + }, + { + "epoch": 1.0781867964709462, + "grad_norm": 0.8634665608406067, + "learning_rate": 4.32681281039112e-05, + "loss": 0.2381, + "step": 1772 + }, + { + "epoch": 1.0787952540310313, + "grad_norm": 1.6494232416152954, + "learning_rate": 4.325962735815241e-05, + "loss": 0.3326, + "step": 1773 + }, + { + "epoch": 1.0794037115911166, + "grad_norm": 0.9181280732154846, + "learning_rate": 4.3251122084767124e-05, + "loss": 0.2321, + "step": 1774 + }, + { + "epoch": 1.0800121691512017, + "grad_norm": 1.1291999816894531, + "learning_rate": 4.324261228586431e-05, + "loss": 0.2762, + "step": 1775 + }, + { + "epoch": 1.080620626711287, + "grad_norm": 0.8536227941513062, + "learning_rate": 4.323409796355404e-05, + "loss": 0.1981, + "step": 1776 + }, + { + "epoch": 1.081229084271372, + "grad_norm": 0.9988884329795837, + "learning_rate": 4.322557911994751e-05, + "loss": 0.2943, + "step": 1777 + }, + { + "epoch": 1.0818375418314572, + "grad_norm": 0.9455403089523315, + "learning_rate": 4.321705575715703e-05, + "loss": 0.2459, + "step": 1778 + }, + { + "epoch": 1.0824459993915425, + "grad_norm": 0.9276781678199768, + "learning_rate": 4.320852787729606e-05, + "loss": 0.2779, + "step": 1779 + }, + { + "epoch": 1.0830544569516276, + "grad_norm": 1.0638751983642578, + "learning_rate": 4.319999548247914e-05, + "loss": 0.2595, + "step": 1780 + }, + { + "epoch": 1.0836629145117127, + "grad_norm": 0.9427651762962341, + "learning_rate": 4.3191458574821955e-05, + "loss": 0.2628, + "step": 1781 + }, + { + "epoch": 1.084271372071798, + "grad_norm": 0.9555470943450928, + "learning_rate": 4.3182917156441296e-05, + "loss": 0.245, + "step": 1782 + }, + { + "epoch": 1.0848798296318831, + "grad_norm": 0.9582526683807373, + "learning_rate": 4.317437122945508e-05, + "loss": 0.2842, + "step": 1783 + }, + { + "epoch": 1.0854882871919684, + "grad_norm": 0.9005835652351379, + "learning_rate": 4.316582079598235e-05, + "loss": 0.2646, + "step": 1784 + }, + { + "epoch": 1.0860967447520535, + "grad_norm": 0.7882102727890015, + "learning_rate": 4.3157265858143247e-05, + "loss": 0.2415, + "step": 1785 + }, + { + "epoch": 1.0867052023121386, + "grad_norm": 0.8828130960464478, + "learning_rate": 4.3148706418059046e-05, + "loss": 0.2327, + "step": 1786 + }, + { + "epoch": 1.087313659872224, + "grad_norm": 0.823904812335968, + "learning_rate": 4.314014247785214e-05, + "loss": 0.2377, + "step": 1787 + }, + { + "epoch": 1.087922117432309, + "grad_norm": 0.9788227677345276, + "learning_rate": 4.313157403964601e-05, + "loss": 0.2552, + "step": 1788 + }, + { + "epoch": 1.0885305749923944, + "grad_norm": 0.935871422290802, + "learning_rate": 4.312300110556527e-05, + "loss": 0.2474, + "step": 1789 + }, + { + "epoch": 1.0891390325524795, + "grad_norm": 17.13036346435547, + "learning_rate": 4.311442367773567e-05, + "loss": 0.2759, + "step": 1790 + }, + { + "epoch": 1.0897474901125646, + "grad_norm": 1.1542432308197021, + "learning_rate": 4.3105841758284035e-05, + "loss": 0.2905, + "step": 1791 + }, + { + "epoch": 1.0903559476726499, + "grad_norm": 1.1316038370132446, + "learning_rate": 4.3097255349338344e-05, + "loss": 0.2549, + "step": 1792 + }, + { + "epoch": 1.090964405232735, + "grad_norm": 1.0348681211471558, + "learning_rate": 4.308866445302766e-05, + "loss": 0.2649, + "step": 1793 + }, + { + "epoch": 1.0915728627928203, + "grad_norm": 1.0835968255996704, + "learning_rate": 4.308006907148215e-05, + "loss": 0.2424, + "step": 1794 + }, + { + "epoch": 1.0921813203529054, + "grad_norm": 0.9840952754020691, + "learning_rate": 4.307146920683313e-05, + "loss": 0.27, + "step": 1795 + }, + { + "epoch": 1.0927897779129905, + "grad_norm": 0.8699171543121338, + "learning_rate": 4.3062864861213e-05, + "loss": 0.2061, + "step": 1796 + }, + { + "epoch": 1.0933982354730758, + "grad_norm": 0.9339224100112915, + "learning_rate": 4.305425603675529e-05, + "loss": 0.2612, + "step": 1797 + }, + { + "epoch": 1.094006693033161, + "grad_norm": 1.0255519151687622, + "learning_rate": 4.304564273559462e-05, + "loss": 0.2652, + "step": 1798 + }, + { + "epoch": 1.0946151505932462, + "grad_norm": 1.1433568000793457, + "learning_rate": 4.303702495986672e-05, + "loss": 0.2684, + "step": 1799 + }, + { + "epoch": 1.0952236081533313, + "grad_norm": 1.2076388597488403, + "learning_rate": 4.302840271170846e-05, + "loss": 0.2964, + "step": 1800 + }, + { + "epoch": 1.0958320657134164, + "grad_norm": 0.9302932620048523, + "learning_rate": 4.301977599325779e-05, + "loss": 0.2291, + "step": 1801 + }, + { + "epoch": 1.0964405232735017, + "grad_norm": 0.9104456901550293, + "learning_rate": 4.301114480665377e-05, + "loss": 0.235, + "step": 1802 + }, + { + "epoch": 1.0970489808335868, + "grad_norm": 1.0696483850479126, + "learning_rate": 4.3002509154036585e-05, + "loss": 0.3616, + "step": 1803 + }, + { + "epoch": 1.0976574383936721, + "grad_norm": 0.8358986377716064, + "learning_rate": 4.299386903754751e-05, + "loss": 0.197, + "step": 1804 + }, + { + "epoch": 1.0982658959537572, + "grad_norm": 0.949712872505188, + "learning_rate": 4.2985224459328934e-05, + "loss": 0.2621, + "step": 1805 + }, + { + "epoch": 1.0988743535138423, + "grad_norm": 1.1388565301895142, + "learning_rate": 4.297657542152434e-05, + "loss": 0.3019, + "step": 1806 + }, + { + "epoch": 1.0994828110739276, + "grad_norm": 0.9658118486404419, + "learning_rate": 4.296792192627836e-05, + "loss": 0.2406, + "step": 1807 + }, + { + "epoch": 1.1000912686340127, + "grad_norm": 0.9098191261291504, + "learning_rate": 4.2959263975736676e-05, + "loss": 0.2166, + "step": 1808 + }, + { + "epoch": 1.100699726194098, + "grad_norm": 1.0185233354568481, + "learning_rate": 4.29506015720461e-05, + "loss": 0.2498, + "step": 1809 + }, + { + "epoch": 1.1013081837541832, + "grad_norm": 1.152592658996582, + "learning_rate": 4.294193471735456e-05, + "loss": 0.2417, + "step": 1810 + }, + { + "epoch": 1.1019166413142683, + "grad_norm": 0.8422519564628601, + "learning_rate": 4.2933263413811065e-05, + "loss": 0.2361, + "step": 1811 + }, + { + "epoch": 1.1025250988743536, + "grad_norm": 0.9988545179367065, + "learning_rate": 4.292458766356574e-05, + "loss": 0.273, + "step": 1812 + }, + { + "epoch": 1.1031335564344387, + "grad_norm": 1.068097472190857, + "learning_rate": 4.29159074687698e-05, + "loss": 0.2403, + "step": 1813 + }, + { + "epoch": 1.103742013994524, + "grad_norm": 1.0569621324539185, + "learning_rate": 4.290722283157559e-05, + "loss": 0.2446, + "step": 1814 + }, + { + "epoch": 1.104350471554609, + "grad_norm": 0.9275895357131958, + "learning_rate": 4.289853375413652e-05, + "loss": 0.2369, + "step": 1815 + }, + { + "epoch": 1.1049589291146942, + "grad_norm": 1.1227593421936035, + "learning_rate": 4.2889840238607135e-05, + "loss": 0.262, + "step": 1816 + }, + { + "epoch": 1.1055673866747795, + "grad_norm": 0.9638474583625793, + "learning_rate": 4.2881142287143044e-05, + "loss": 0.232, + "step": 1817 + }, + { + "epoch": 1.1061758442348646, + "grad_norm": 0.9380021691322327, + "learning_rate": 4.2872439901901e-05, + "loss": 0.244, + "step": 1818 + }, + { + "epoch": 1.1067843017949497, + "grad_norm": 0.9702509641647339, + "learning_rate": 4.286373308503881e-05, + "loss": 0.2451, + "step": 1819 + }, + { + "epoch": 1.107392759355035, + "grad_norm": 0.9465189576148987, + "learning_rate": 4.285502183871542e-05, + "loss": 0.2479, + "step": 1820 + }, + { + "epoch": 1.10800121691512, + "grad_norm": 0.9717510342597961, + "learning_rate": 4.284630616509084e-05, + "loss": 0.2178, + "step": 1821 + }, + { + "epoch": 1.1086096744752054, + "grad_norm": 0.9717493653297424, + "learning_rate": 4.283758606632621e-05, + "loss": 0.2601, + "step": 1822 + }, + { + "epoch": 1.1092181320352905, + "grad_norm": 1.4151792526245117, + "learning_rate": 4.2828861544583746e-05, + "loss": 0.29, + "step": 1823 + }, + { + "epoch": 1.1098265895953756, + "grad_norm": 1.0413273572921753, + "learning_rate": 4.282013260202675e-05, + "loss": 0.262, + "step": 1824 + }, + { + "epoch": 1.110435047155461, + "grad_norm": 1.031923532485962, + "learning_rate": 4.281139924081966e-05, + "loss": 0.2929, + "step": 1825 + }, + { + "epoch": 1.111043504715546, + "grad_norm": 0.9092714786529541, + "learning_rate": 4.280266146312797e-05, + "loss": 0.2703, + "step": 1826 + }, + { + "epoch": 1.1116519622756313, + "grad_norm": 1.0851198434829712, + "learning_rate": 4.279391927111828e-05, + "loss": 0.3064, + "step": 1827 + }, + { + "epoch": 1.1122604198357164, + "grad_norm": 0.839637279510498, + "learning_rate": 4.2785172666958305e-05, + "loss": 0.2491, + "step": 1828 + }, + { + "epoch": 1.1128688773958015, + "grad_norm": 0.8800391554832458, + "learning_rate": 4.2776421652816834e-05, + "loss": 0.2591, + "step": 1829 + }, + { + "epoch": 1.1134773349558869, + "grad_norm": 0.9919121861457825, + "learning_rate": 4.2767666230863743e-05, + "loss": 0.2876, + "step": 1830 + }, + { + "epoch": 1.114085792515972, + "grad_norm": 1.0813971757888794, + "learning_rate": 4.2758906403270026e-05, + "loss": 0.2622, + "step": 1831 + }, + { + "epoch": 1.1146942500760573, + "grad_norm": 0.9650895595550537, + "learning_rate": 4.275014217220775e-05, + "loss": 0.2185, + "step": 1832 + }, + { + "epoch": 1.1153027076361424, + "grad_norm": 0.9158915281295776, + "learning_rate": 4.2741373539850076e-05, + "loss": 0.2519, + "step": 1833 + }, + { + "epoch": 1.1159111651962275, + "grad_norm": 1.001535415649414, + "learning_rate": 4.273260050837126e-05, + "loss": 0.268, + "step": 1834 + }, + { + "epoch": 1.1165196227563128, + "grad_norm": 0.9505628347396851, + "learning_rate": 4.272382307994665e-05, + "loss": 0.2334, + "step": 1835 + }, + { + "epoch": 1.1171280803163979, + "grad_norm": 1.0092861652374268, + "learning_rate": 4.271504125675269e-05, + "loss": 0.2992, + "step": 1836 + }, + { + "epoch": 1.1177365378764832, + "grad_norm": 0.9263057708740234, + "learning_rate": 4.270625504096688e-05, + "loss": 0.2283, + "step": 1837 + }, + { + "epoch": 1.1183449954365683, + "grad_norm": 1.5665264129638672, + "learning_rate": 4.269746443476787e-05, + "loss": 0.235, + "step": 1838 + }, + { + "epoch": 1.1189534529966534, + "grad_norm": 1.0436450242996216, + "learning_rate": 4.268866944033533e-05, + "loss": 0.26, + "step": 1839 + }, + { + "epoch": 1.1195619105567387, + "grad_norm": 0.8918569684028625, + "learning_rate": 4.267987005985008e-05, + "loss": 0.2506, + "step": 1840 + }, + { + "epoch": 1.1201703681168238, + "grad_norm": 0.9875538349151611, + "learning_rate": 4.267106629549398e-05, + "loss": 0.2407, + "step": 1841 + }, + { + "epoch": 1.1207788256769091, + "grad_norm": 1.0079050064086914, + "learning_rate": 4.266225814945001e-05, + "loss": 0.3127, + "step": 1842 + }, + { + "epoch": 1.1213872832369942, + "grad_norm": 0.8947494029998779, + "learning_rate": 4.265344562390222e-05, + "loss": 0.2271, + "step": 1843 + }, + { + "epoch": 1.1219957407970793, + "grad_norm": 0.9922159314155579, + "learning_rate": 4.264462872103575e-05, + "loss": 0.2515, + "step": 1844 + }, + { + "epoch": 1.1226041983571646, + "grad_norm": 0.8498148918151855, + "learning_rate": 4.263580744303681e-05, + "loss": 0.2378, + "step": 1845 + }, + { + "epoch": 1.1232126559172497, + "grad_norm": 1.0088249444961548, + "learning_rate": 4.2626981792092735e-05, + "loss": 0.2551, + "step": 1846 + }, + { + "epoch": 1.123821113477335, + "grad_norm": 0.899652898311615, + "learning_rate": 4.261815177039189e-05, + "loss": 0.2272, + "step": 1847 + }, + { + "epoch": 1.1244295710374201, + "grad_norm": 1.3068557977676392, + "learning_rate": 4.260931738012378e-05, + "loss": 0.2594, + "step": 1848 + }, + { + "epoch": 1.1250380285975052, + "grad_norm": 1.037555456161499, + "learning_rate": 4.260047862347894e-05, + "loss": 0.2607, + "step": 1849 + }, + { + "epoch": 1.1256464861575906, + "grad_norm": 1.0918004512786865, + "learning_rate": 4.259163550264904e-05, + "loss": 0.2699, + "step": 1850 + }, + { + "epoch": 1.1262549437176756, + "grad_norm": 0.9314645528793335, + "learning_rate": 4.258278801982678e-05, + "loss": 0.2482, + "step": 1851 + }, + { + "epoch": 1.126863401277761, + "grad_norm": 0.8757783770561218, + "learning_rate": 4.257393617720599e-05, + "loss": 0.2579, + "step": 1852 + }, + { + "epoch": 1.127471858837846, + "grad_norm": 1.0207921266555786, + "learning_rate": 4.256507997698152e-05, + "loss": 0.2311, + "step": 1853 + }, + { + "epoch": 1.1280803163979312, + "grad_norm": 0.9114212989807129, + "learning_rate": 4.2556219421349394e-05, + "loss": 0.2494, + "step": 1854 + }, + { + "epoch": 1.1286887739580165, + "grad_norm": 1.038779616355896, + "learning_rate": 4.254735451250661e-05, + "loss": 0.2665, + "step": 1855 + }, + { + "epoch": 1.1292972315181016, + "grad_norm": 0.8652352094650269, + "learning_rate": 4.2538485252651326e-05, + "loss": 0.2155, + "step": 1856 + }, + { + "epoch": 1.129905689078187, + "grad_norm": 0.9267082810401917, + "learning_rate": 4.2529611643982735e-05, + "loss": 0.2499, + "step": 1857 + }, + { + "epoch": 1.130514146638272, + "grad_norm": 1.116803526878357, + "learning_rate": 4.252073368870113e-05, + "loss": 0.2744, + "step": 1858 + }, + { + "epoch": 1.131122604198357, + "grad_norm": 0.9374389052391052, + "learning_rate": 4.251185138900787e-05, + "loss": 0.2698, + "step": 1859 + }, + { + "epoch": 1.1317310617584424, + "grad_norm": 0.8052027821540833, + "learning_rate": 4.25029647471054e-05, + "loss": 0.2098, + "step": 1860 + }, + { + "epoch": 1.1323395193185275, + "grad_norm": 0.9955977201461792, + "learning_rate": 4.249407376519722e-05, + "loss": 0.2417, + "step": 1861 + }, + { + "epoch": 1.1329479768786128, + "grad_norm": 0.8962797522544861, + "learning_rate": 4.248517844548795e-05, + "loss": 0.2428, + "step": 1862 + }, + { + "epoch": 1.133556434438698, + "grad_norm": 1.001209020614624, + "learning_rate": 4.247627879018323e-05, + "loss": 0.2477, + "step": 1863 + }, + { + "epoch": 1.134164891998783, + "grad_norm": 1.0366243124008179, + "learning_rate": 4.246737480148983e-05, + "loss": 0.2382, + "step": 1864 + }, + { + "epoch": 1.1347733495588683, + "grad_norm": 0.9120834469795227, + "learning_rate": 4.245846648161554e-05, + "loss": 0.2555, + "step": 1865 + }, + { + "epoch": 1.1353818071189534, + "grad_norm": 0.9888159036636353, + "learning_rate": 4.2449553832769284e-05, + "loss": 0.2803, + "step": 1866 + }, + { + "epoch": 1.1359902646790387, + "grad_norm": 0.9909419417381287, + "learning_rate": 4.2440636857161e-05, + "loss": 0.27, + "step": 1867 + }, + { + "epoch": 1.1365987222391238, + "grad_norm": 1.0697113275527954, + "learning_rate": 4.243171555700174e-05, + "loss": 0.2664, + "step": 1868 + }, + { + "epoch": 1.137207179799209, + "grad_norm": 0.9328745603561401, + "learning_rate": 4.242278993450361e-05, + "loss": 0.2762, + "step": 1869 + }, + { + "epoch": 1.1378156373592943, + "grad_norm": 0.8406645059585571, + "learning_rate": 4.24138599918798e-05, + "loss": 0.2151, + "step": 1870 + }, + { + "epoch": 1.1384240949193793, + "grad_norm": 0.9748327136039734, + "learning_rate": 4.240492573134455e-05, + "loss": 0.2286, + "step": 1871 + }, + { + "epoch": 1.1390325524794647, + "grad_norm": 0.8779383301734924, + "learning_rate": 4.239598715511319e-05, + "loss": 0.2185, + "step": 1872 + }, + { + "epoch": 1.1396410100395498, + "grad_norm": 0.96523517370224, + "learning_rate": 4.238704426540213e-05, + "loss": 0.2499, + "step": 1873 + }, + { + "epoch": 1.1402494675996349, + "grad_norm": 1.156562328338623, + "learning_rate": 4.23780970644288e-05, + "loss": 0.272, + "step": 1874 + }, + { + "epoch": 1.1408579251597202, + "grad_norm": 1.0613001585006714, + "learning_rate": 4.236914555441177e-05, + "loss": 0.268, + "step": 1875 + }, + { + "epoch": 1.1414663827198053, + "grad_norm": 0.803700864315033, + "learning_rate": 4.236018973757061e-05, + "loss": 0.2026, + "step": 1876 + }, + { + "epoch": 1.1420748402798906, + "grad_norm": 1.021272897720337, + "learning_rate": 4.235122961612602e-05, + "loss": 0.2904, + "step": 1877 + }, + { + "epoch": 1.1426832978399757, + "grad_norm": 0.7920538187026978, + "learning_rate": 4.234226519229971e-05, + "loss": 0.2118, + "step": 1878 + }, + { + "epoch": 1.1432917554000608, + "grad_norm": 1.0034376382827759, + "learning_rate": 4.233329646831449e-05, + "loss": 0.2355, + "step": 1879 + }, + { + "epoch": 1.143900212960146, + "grad_norm": 1.0037422180175781, + "learning_rate": 4.2324323446394244e-05, + "loss": 0.2941, + "step": 1880 + }, + { + "epoch": 1.1445086705202312, + "grad_norm": 0.7395780086517334, + "learning_rate": 4.2315346128763886e-05, + "loss": 0.1974, + "step": 1881 + }, + { + "epoch": 1.1451171280803165, + "grad_norm": 0.9243549108505249, + "learning_rate": 4.2306364517649434e-05, + "loss": 0.2657, + "step": 1882 + }, + { + "epoch": 1.1457255856404016, + "grad_norm": 0.8601450324058533, + "learning_rate": 4.2297378615277935e-05, + "loss": 0.2241, + "step": 1883 + }, + { + "epoch": 1.1463340432004867, + "grad_norm": 0.9598115682601929, + "learning_rate": 4.228838842387755e-05, + "loss": 0.2484, + "step": 1884 + }, + { + "epoch": 1.146942500760572, + "grad_norm": 1.2198432683944702, + "learning_rate": 4.2279393945677437e-05, + "loss": 0.217, + "step": 1885 + }, + { + "epoch": 1.1475509583206571, + "grad_norm": 0.9545729756355286, + "learning_rate": 4.227039518290786e-05, + "loss": 0.2229, + "step": 1886 + }, + { + "epoch": 1.1481594158807424, + "grad_norm": 1.0584890842437744, + "learning_rate": 4.226139213780016e-05, + "loss": 0.2805, + "step": 1887 + }, + { + "epoch": 1.1487678734408275, + "grad_norm": 0.9882286787033081, + "learning_rate": 4.225238481258669e-05, + "loss": 0.2621, + "step": 1888 + }, + { + "epoch": 1.1493763310009126, + "grad_norm": 0.944648265838623, + "learning_rate": 4.22433732095009e-05, + "loss": 0.2358, + "step": 1889 + }, + { + "epoch": 1.149984788560998, + "grad_norm": 0.7821002006530762, + "learning_rate": 4.223435733077731e-05, + "loss": 0.1971, + "step": 1890 + }, + { + "epoch": 1.150593246121083, + "grad_norm": 1.0753979682922363, + "learning_rate": 4.2225337178651444e-05, + "loss": 0.2761, + "step": 1891 + }, + { + "epoch": 1.1512017036811681, + "grad_norm": 0.9069803953170776, + "learning_rate": 4.221631275535996e-05, + "loss": 0.258, + "step": 1892 + }, + { + "epoch": 1.1518101612412535, + "grad_norm": 1.0057759284973145, + "learning_rate": 4.2207284063140514e-05, + "loss": 0.2679, + "step": 1893 + }, + { + "epoch": 1.1524186188013386, + "grad_norm": 1.0301549434661865, + "learning_rate": 4.2198251104231854e-05, + "loss": 0.2995, + "step": 1894 + }, + { + "epoch": 1.1530270763614239, + "grad_norm": 0.9355301260948181, + "learning_rate": 4.218921388087379e-05, + "loss": 0.2179, + "step": 1895 + }, + { + "epoch": 1.153635533921509, + "grad_norm": 1.027159571647644, + "learning_rate": 4.2180172395307156e-05, + "loss": 0.2577, + "step": 1896 + }, + { + "epoch": 1.154243991481594, + "grad_norm": 1.046653389930725, + "learning_rate": 4.217112664977387e-05, + "loss": 0.2326, + "step": 1897 + }, + { + "epoch": 1.1548524490416794, + "grad_norm": 0.9612645506858826, + "learning_rate": 4.216207664651691e-05, + "loss": 0.2137, + "step": 1898 + }, + { + "epoch": 1.1554609066017645, + "grad_norm": 0.9418598413467407, + "learning_rate": 4.21530223877803e-05, + "loss": 0.2772, + "step": 1899 + }, + { + "epoch": 1.1560693641618498, + "grad_norm": 1.0486358404159546, + "learning_rate": 4.2143963875809096e-05, + "loss": 0.2663, + "step": 1900 + }, + { + "epoch": 1.156677821721935, + "grad_norm": 1.084235668182373, + "learning_rate": 4.213490111284945e-05, + "loss": 0.2499, + "step": 1901 + }, + { + "epoch": 1.15728627928202, + "grad_norm": 0.9962263107299805, + "learning_rate": 4.212583410114855e-05, + "loss": 0.2512, + "step": 1902 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.9712923765182495, + "learning_rate": 4.2116762842954625e-05, + "loss": 0.2396, + "step": 1903 + }, + { + "epoch": 1.1585031944021904, + "grad_norm": 0.9264492988586426, + "learning_rate": 4.210768734051699e-05, + "loss": 0.2584, + "step": 1904 + }, + { + "epoch": 1.1591116519622757, + "grad_norm": 1.201596975326538, + "learning_rate": 4.209860759608597e-05, + "loss": 0.287, + "step": 1905 + }, + { + "epoch": 1.1597201095223608, + "grad_norm": 0.9659140110015869, + "learning_rate": 4.2089523611912966e-05, + "loss": 0.2453, + "step": 1906 + }, + { + "epoch": 1.160328567082446, + "grad_norm": 0.8334513306617737, + "learning_rate": 4.208043539025044e-05, + "loss": 0.2573, + "step": 1907 + }, + { + "epoch": 1.1609370246425312, + "grad_norm": 0.9269493222236633, + "learning_rate": 4.2071342933351886e-05, + "loss": 0.2988, + "step": 1908 + }, + { + "epoch": 1.1615454822026163, + "grad_norm": 0.8946632146835327, + "learning_rate": 4.2062246243471846e-05, + "loss": 0.2378, + "step": 1909 + }, + { + "epoch": 1.1621539397627014, + "grad_norm": 0.9380949139595032, + "learning_rate": 4.2053145322865936e-05, + "loss": 0.2316, + "step": 1910 + }, + { + "epoch": 1.1627623973227867, + "grad_norm": 0.8564540147781372, + "learning_rate": 4.204404017379079e-05, + "loss": 0.2255, + "step": 1911 + }, + { + "epoch": 1.1633708548828718, + "grad_norm": 1.0175832509994507, + "learning_rate": 4.2034930798504114e-05, + "loss": 0.2384, + "step": 1912 + }, + { + "epoch": 1.1639793124429572, + "grad_norm": 0.9850523471832275, + "learning_rate": 4.202581719926465e-05, + "loss": 0.2682, + "step": 1913 + }, + { + "epoch": 1.1645877700030423, + "grad_norm": 1.0594969987869263, + "learning_rate": 4.201669937833219e-05, + "loss": 0.2491, + "step": 1914 + }, + { + "epoch": 1.1651962275631274, + "grad_norm": 0.9938765168190002, + "learning_rate": 4.2007577337967574e-05, + "loss": 0.2847, + "step": 1915 + }, + { + "epoch": 1.1658046851232127, + "grad_norm": 0.8707656860351562, + "learning_rate": 4.19984510804327e-05, + "loss": 0.2301, + "step": 1916 + }, + { + "epoch": 1.1664131426832978, + "grad_norm": 0.9624968767166138, + "learning_rate": 4.1989320607990474e-05, + "loss": 0.2459, + "step": 1917 + }, + { + "epoch": 1.167021600243383, + "grad_norm": 1.0143096446990967, + "learning_rate": 4.1980185922904894e-05, + "loss": 0.2169, + "step": 1918 + }, + { + "epoch": 1.1676300578034682, + "grad_norm": 0.8604801893234253, + "learning_rate": 4.197104702744097e-05, + "loss": 0.2162, + "step": 1919 + }, + { + "epoch": 1.1682385153635533, + "grad_norm": 1.1591583490371704, + "learning_rate": 4.1961903923864775e-05, + "loss": 0.2425, + "step": 1920 + }, + { + "epoch": 1.1688469729236386, + "grad_norm": 1.020859718322754, + "learning_rate": 4.1952756614443415e-05, + "loss": 0.2657, + "step": 1921 + }, + { + "epoch": 1.1694554304837237, + "grad_norm": 0.9535967111587524, + "learning_rate": 4.194360510144504e-05, + "loss": 0.2221, + "step": 1922 + }, + { + "epoch": 1.170063888043809, + "grad_norm": 1.1001362800598145, + "learning_rate": 4.1934449387138845e-05, + "loss": 0.3064, + "step": 1923 + }, + { + "epoch": 1.170672345603894, + "grad_norm": 0.851999819278717, + "learning_rate": 4.192528947379506e-05, + "loss": 0.2202, + "step": 1924 + }, + { + "epoch": 1.1712808031639792, + "grad_norm": 0.888411283493042, + "learning_rate": 4.1916125363684965e-05, + "loss": 0.2617, + "step": 1925 + }, + { + "epoch": 1.1718892607240645, + "grad_norm": 0.9610258340835571, + "learning_rate": 4.1906957059080886e-05, + "loss": 0.2411, + "step": 1926 + }, + { + "epoch": 1.1724977182841496, + "grad_norm": 0.9737393856048584, + "learning_rate": 4.189778456225617e-05, + "loss": 0.2612, + "step": 1927 + }, + { + "epoch": 1.173106175844235, + "grad_norm": 0.9638609886169434, + "learning_rate": 4.188860787548522e-05, + "loss": 0.2328, + "step": 1928 + }, + { + "epoch": 1.17371463340432, + "grad_norm": 1.1524028778076172, + "learning_rate": 4.187942700104346e-05, + "loss": 0.2513, + "step": 1929 + }, + { + "epoch": 1.1743230909644051, + "grad_norm": 0.9276042580604553, + "learning_rate": 4.1870241941207375e-05, + "loss": 0.2542, + "step": 1930 + }, + { + "epoch": 1.1749315485244904, + "grad_norm": 0.9600099325180054, + "learning_rate": 4.186105269825449e-05, + "loss": 0.2704, + "step": 1931 + }, + { + "epoch": 1.1755400060845755, + "grad_norm": 0.940571129322052, + "learning_rate": 4.1851859274463326e-05, + "loss": 0.2159, + "step": 1932 + }, + { + "epoch": 1.1761484636446609, + "grad_norm": 0.9095624685287476, + "learning_rate": 4.184266167211348e-05, + "loss": 0.2321, + "step": 1933 + }, + { + "epoch": 1.176756921204746, + "grad_norm": 0.9772641658782959, + "learning_rate": 4.183345989348558e-05, + "loss": 0.2242, + "step": 1934 + }, + { + "epoch": 1.177365378764831, + "grad_norm": 0.8371894359588623, + "learning_rate": 4.182425394086128e-05, + "loss": 0.1883, + "step": 1935 + }, + { + "epoch": 1.1779738363249164, + "grad_norm": 0.9441771507263184, + "learning_rate": 4.181504381652327e-05, + "loss": 0.2521, + "step": 1936 + }, + { + "epoch": 1.1785822938850015, + "grad_norm": 1.1066073179244995, + "learning_rate": 4.180582952275528e-05, + "loss": 0.275, + "step": 1937 + }, + { + "epoch": 1.1791907514450868, + "grad_norm": 0.9698868989944458, + "learning_rate": 4.179661106184207e-05, + "loss": 0.2583, + "step": 1938 + }, + { + "epoch": 1.1797992090051719, + "grad_norm": 0.9682748317718506, + "learning_rate": 4.178738843606943e-05, + "loss": 0.2726, + "step": 1939 + }, + { + "epoch": 1.180407666565257, + "grad_norm": 0.9256848692893982, + "learning_rate": 4.1778161647724203e-05, + "loss": 0.225, + "step": 1940 + }, + { + "epoch": 1.1810161241253423, + "grad_norm": 0.8582398891448975, + "learning_rate": 4.176893069909422e-05, + "loss": 0.2164, + "step": 1941 + }, + { + "epoch": 1.1816245816854274, + "grad_norm": 0.9054145812988281, + "learning_rate": 4.1759695592468395e-05, + "loss": 0.2086, + "step": 1942 + }, + { + "epoch": 1.1822330392455127, + "grad_norm": 0.9626482725143433, + "learning_rate": 4.175045633013665e-05, + "loss": 0.2939, + "step": 1943 + }, + { + "epoch": 1.1828414968055978, + "grad_norm": 0.8653888702392578, + "learning_rate": 4.1741212914389914e-05, + "loss": 0.2459, + "step": 1944 + }, + { + "epoch": 1.183449954365683, + "grad_norm": 0.94068843126297, + "learning_rate": 4.173196534752019e-05, + "loss": 0.2704, + "step": 1945 + }, + { + "epoch": 1.1840584119257682, + "grad_norm": 0.8980154991149902, + "learning_rate": 4.1722713631820485e-05, + "loss": 0.2332, + "step": 1946 + }, + { + "epoch": 1.1846668694858533, + "grad_norm": 0.9154375791549683, + "learning_rate": 4.171345776958483e-05, + "loss": 0.2293, + "step": 1947 + }, + { + "epoch": 1.1852753270459386, + "grad_norm": 0.9358067512512207, + "learning_rate": 4.17041977631083e-05, + "loss": 0.2325, + "step": 1948 + }, + { + "epoch": 1.1858837846060237, + "grad_norm": 0.8783141374588013, + "learning_rate": 4.1694933614686995e-05, + "loss": 0.2125, + "step": 1949 + }, + { + "epoch": 1.1864922421661088, + "grad_norm": 1.0046310424804688, + "learning_rate": 4.168566532661803e-05, + "loss": 0.2796, + "step": 1950 + }, + { + "epoch": 1.1871006997261941, + "grad_norm": 0.9390839338302612, + "learning_rate": 4.167639290119956e-05, + "loss": 0.2875, + "step": 1951 + }, + { + "epoch": 1.1877091572862792, + "grad_norm": 0.9401413202285767, + "learning_rate": 4.166711634073075e-05, + "loss": 0.2614, + "step": 1952 + }, + { + "epoch": 1.1883176148463646, + "grad_norm": 1.0055378675460815, + "learning_rate": 4.1657835647511804e-05, + "loss": 0.2489, + "step": 1953 + }, + { + "epoch": 1.1889260724064497, + "grad_norm": 0.8895730376243591, + "learning_rate": 4.164855082384396e-05, + "loss": 0.2491, + "step": 1954 + }, + { + "epoch": 1.1895345299665347, + "grad_norm": 0.8464339375495911, + "learning_rate": 4.163926187202946e-05, + "loss": 0.199, + "step": 1955 + }, + { + "epoch": 1.19014298752662, + "grad_norm": 0.9686809778213501, + "learning_rate": 4.162996879437156e-05, + "loss": 0.2742, + "step": 1956 + }, + { + "epoch": 1.1907514450867052, + "grad_norm": 1.0053505897521973, + "learning_rate": 4.1620671593174585e-05, + "loss": 0.2875, + "step": 1957 + }, + { + "epoch": 1.1913599026467905, + "grad_norm": 1.0522572994232178, + "learning_rate": 4.1611370270743826e-05, + "loss": 0.2174, + "step": 1958 + }, + { + "epoch": 1.1919683602068756, + "grad_norm": 0.8975366353988647, + "learning_rate": 4.160206482938565e-05, + "loss": 0.2535, + "step": 1959 + }, + { + "epoch": 1.1925768177669607, + "grad_norm": 0.8640589714050293, + "learning_rate": 4.159275527140739e-05, + "loss": 0.2178, + "step": 1960 + }, + { + "epoch": 1.193185275327046, + "grad_norm": 0.9383702874183655, + "learning_rate": 4.1583441599117453e-05, + "loss": 0.2912, + "step": 1961 + }, + { + "epoch": 1.193793732887131, + "grad_norm": 0.8808874487876892, + "learning_rate": 4.1574123814825226e-05, + "loss": 0.2189, + "step": 1962 + }, + { + "epoch": 1.1944021904472164, + "grad_norm": 0.9913263320922852, + "learning_rate": 4.156480192084114e-05, + "loss": 0.2931, + "step": 1963 + }, + { + "epoch": 1.1950106480073015, + "grad_norm": 1.0202916860580444, + "learning_rate": 4.155547591947663e-05, + "loss": 0.2677, + "step": 1964 + }, + { + "epoch": 1.1956191055673866, + "grad_norm": 1.405519962310791, + "learning_rate": 4.154614581304416e-05, + "loss": 0.3883, + "step": 1965 + }, + { + "epoch": 1.196227563127472, + "grad_norm": 0.9594994783401489, + "learning_rate": 4.15368116038572e-05, + "loss": 0.2595, + "step": 1966 + }, + { + "epoch": 1.196836020687557, + "grad_norm": 1.043186902999878, + "learning_rate": 4.1527473294230255e-05, + "loss": 0.2648, + "step": 1967 + }, + { + "epoch": 1.1974444782476423, + "grad_norm": 0.9707902073860168, + "learning_rate": 4.151813088647883e-05, + "loss": 0.2546, + "step": 1968 + }, + { + "epoch": 1.1980529358077274, + "grad_norm": 0.9176664352416992, + "learning_rate": 4.1508784382919466e-05, + "loss": 0.2161, + "step": 1969 + }, + { + "epoch": 1.1986613933678125, + "grad_norm": 0.9123236536979675, + "learning_rate": 4.149943378586968e-05, + "loss": 0.242, + "step": 1970 + }, + { + "epoch": 1.1992698509278978, + "grad_norm": 0.9186065793037415, + "learning_rate": 4.149007909764805e-05, + "loss": 0.2001, + "step": 1971 + }, + { + "epoch": 1.199878308487983, + "grad_norm": 1.0215867757797241, + "learning_rate": 4.148072032057415e-05, + "loss": 0.2529, + "step": 1972 + }, + { + "epoch": 1.2004867660480683, + "grad_norm": 0.9522402882575989, + "learning_rate": 4.1471357456968556e-05, + "loss": 0.2096, + "step": 1973 + }, + { + "epoch": 1.2010952236081534, + "grad_norm": 0.9875971674919128, + "learning_rate": 4.146199050915288e-05, + "loss": 0.2659, + "step": 1974 + }, + { + "epoch": 1.2017036811682384, + "grad_norm": 0.892595112323761, + "learning_rate": 4.1452619479449714e-05, + "loss": 0.2422, + "step": 1975 + }, + { + "epoch": 1.2023121387283238, + "grad_norm": 0.9332039952278137, + "learning_rate": 4.14432443701827e-05, + "loss": 0.1921, + "step": 1976 + }, + { + "epoch": 1.2029205962884089, + "grad_norm": 0.8022421598434448, + "learning_rate": 4.143386518367648e-05, + "loss": 0.1945, + "step": 1977 + }, + { + "epoch": 1.2035290538484942, + "grad_norm": 0.9475464224815369, + "learning_rate": 4.142448192225669e-05, + "loss": 0.2119, + "step": 1978 + }, + { + "epoch": 1.2041375114085793, + "grad_norm": 0.8126545548439026, + "learning_rate": 4.1415094588249975e-05, + "loss": 0.2195, + "step": 1979 + }, + { + "epoch": 1.2047459689686644, + "grad_norm": 1.1211334466934204, + "learning_rate": 4.140570318398403e-05, + "loss": 0.2904, + "step": 1980 + }, + { + "epoch": 1.2053544265287497, + "grad_norm": 1.054579734802246, + "learning_rate": 4.1396307711787516e-05, + "loss": 0.2495, + "step": 1981 + }, + { + "epoch": 1.2059628840888348, + "grad_norm": 0.9004554152488708, + "learning_rate": 4.1386908173990126e-05, + "loss": 0.2597, + "step": 1982 + }, + { + "epoch": 1.20657134164892, + "grad_norm": 0.9053953886032104, + "learning_rate": 4.1377504572922534e-05, + "loss": 0.2532, + "step": 1983 + }, + { + "epoch": 1.2071797992090052, + "grad_norm": 0.8492591381072998, + "learning_rate": 4.136809691091647e-05, + "loss": 0.2266, + "step": 1984 + }, + { + "epoch": 1.2077882567690903, + "grad_norm": 0.9747501611709595, + "learning_rate": 4.135868519030463e-05, + "loss": 0.2336, + "step": 1985 + }, + { + "epoch": 1.2083967143291756, + "grad_norm": 0.907906711101532, + "learning_rate": 4.134926941342071e-05, + "loss": 0.2306, + "step": 1986 + }, + { + "epoch": 1.2090051718892607, + "grad_norm": 1.0222368240356445, + "learning_rate": 4.1339849582599454e-05, + "loss": 0.2196, + "step": 1987 + }, + { + "epoch": 1.209613629449346, + "grad_norm": 1.05088210105896, + "learning_rate": 4.1330425700176586e-05, + "loss": 0.2503, + "step": 1988 + }, + { + "epoch": 1.2102220870094311, + "grad_norm": 1.1211512088775635, + "learning_rate": 4.132099776848882e-05, + "loss": 0.2371, + "step": 1989 + }, + { + "epoch": 1.2108305445695162, + "grad_norm": 0.9060412645339966, + "learning_rate": 4.1311565789873914e-05, + "loss": 0.2377, + "step": 1990 + }, + { + "epoch": 1.2114390021296015, + "grad_norm": 0.8177694082260132, + "learning_rate": 4.1302129766670586e-05, + "loss": 0.2452, + "step": 1991 + }, + { + "epoch": 1.2120474596896866, + "grad_norm": 0.9358994960784912, + "learning_rate": 4.129268970121858e-05, + "loss": 0.2222, + "step": 1992 + }, + { + "epoch": 1.212655917249772, + "grad_norm": 0.8588563203811646, + "learning_rate": 4.128324559585863e-05, + "loss": 0.219, + "step": 1993 + }, + { + "epoch": 1.213264374809857, + "grad_norm": 0.9672694206237793, + "learning_rate": 4.127379745293251e-05, + "loss": 0.2587, + "step": 1994 + }, + { + "epoch": 1.2138728323699421, + "grad_norm": 0.8904327154159546, + "learning_rate": 4.1264345274782937e-05, + "loss": 0.2371, + "step": 1995 + }, + { + "epoch": 1.2144812899300275, + "grad_norm": 1.017808198928833, + "learning_rate": 4.125488906375367e-05, + "loss": 0.2382, + "step": 1996 + }, + { + "epoch": 1.2150897474901126, + "grad_norm": 0.9038395881652832, + "learning_rate": 4.1245428822189444e-05, + "loss": 0.2444, + "step": 1997 + }, + { + "epoch": 1.2156982050501977, + "grad_norm": 1.1696877479553223, + "learning_rate": 4.123596455243603e-05, + "loss": 0.2993, + "step": 1998 + }, + { + "epoch": 1.216306662610283, + "grad_norm": 0.8989875316619873, + "learning_rate": 4.122649625684014e-05, + "loss": 0.1964, + "step": 1999 + }, + { + "epoch": 1.216915120170368, + "grad_norm": 1.0550363063812256, + "learning_rate": 4.1217023937749544e-05, + "loss": 0.2987, + "step": 2000 + }, + { + "epoch": 1.2175235777304534, + "grad_norm": 0.9538779258728027, + "learning_rate": 4.120754759751296e-05, + "loss": 0.2522, + "step": 2001 + }, + { + "epoch": 1.2181320352905385, + "grad_norm": 1.279292106628418, + "learning_rate": 4.1198067238480145e-05, + "loss": 0.2853, + "step": 2002 + }, + { + "epoch": 1.2187404928506236, + "grad_norm": 1.0189933776855469, + "learning_rate": 4.118858286300182e-05, + "loss": 0.2265, + "step": 2003 + }, + { + "epoch": 1.219348950410709, + "grad_norm": 0.8648117184638977, + "learning_rate": 4.117909447342972e-05, + "loss": 0.2193, + "step": 2004 + }, + { + "epoch": 1.219957407970794, + "grad_norm": 0.8088192939758301, + "learning_rate": 4.1169602072116567e-05, + "loss": 0.2087, + "step": 2005 + }, + { + "epoch": 1.2205658655308793, + "grad_norm": 1.034005045890808, + "learning_rate": 4.116010566141608e-05, + "loss": 0.2648, + "step": 2006 + }, + { + "epoch": 1.2211743230909644, + "grad_norm": 1.020052194595337, + "learning_rate": 4.115060524368297e-05, + "loss": 0.2715, + "step": 2007 + }, + { + "epoch": 1.2217827806510495, + "grad_norm": 0.8594374656677246, + "learning_rate": 4.114110082127296e-05, + "loss": 0.2395, + "step": 2008 + }, + { + "epoch": 1.2223912382111348, + "grad_norm": 0.9257277250289917, + "learning_rate": 4.113159239654273e-05, + "loss": 0.2356, + "step": 2009 + }, + { + "epoch": 1.22299969577122, + "grad_norm": 0.9078371524810791, + "learning_rate": 4.1122079971849994e-05, + "loss": 0.2536, + "step": 2010 + }, + { + "epoch": 1.2236081533313052, + "grad_norm": 0.9304254055023193, + "learning_rate": 4.1112563549553427e-05, + "loss": 0.2504, + "step": 2011 + }, + { + "epoch": 1.2242166108913903, + "grad_norm": 0.9377618432044983, + "learning_rate": 4.11030431320127e-05, + "loss": 0.2245, + "step": 2012 + }, + { + "epoch": 1.2248250684514754, + "grad_norm": 0.8383059501647949, + "learning_rate": 4.1093518721588484e-05, + "loss": 0.1871, + "step": 2013 + }, + { + "epoch": 1.2254335260115607, + "grad_norm": 0.853079617023468, + "learning_rate": 4.108399032064243e-05, + "loss": 0.2169, + "step": 2014 + }, + { + "epoch": 1.2260419835716458, + "grad_norm": 0.9059860110282898, + "learning_rate": 4.10744579315372e-05, + "loss": 0.2204, + "step": 2015 + }, + { + "epoch": 1.226650441131731, + "grad_norm": 0.9250550866127014, + "learning_rate": 4.1064921556636413e-05, + "loss": 0.2556, + "step": 2016 + }, + { + "epoch": 1.2272588986918163, + "grad_norm": 0.9633865356445312, + "learning_rate": 4.1055381198304705e-05, + "loss": 0.2426, + "step": 2017 + }, + { + "epoch": 1.2278673562519014, + "grad_norm": 0.865479588508606, + "learning_rate": 4.1045836858907676e-05, + "loss": 0.2093, + "step": 2018 + }, + { + "epoch": 1.2284758138119867, + "grad_norm": 0.8409626483917236, + "learning_rate": 4.1036288540811935e-05, + "loss": 0.1839, + "step": 2019 + }, + { + "epoch": 1.2290842713720718, + "grad_norm": 0.8517163991928101, + "learning_rate": 4.1026736246385055e-05, + "loss": 0.1791, + "step": 2020 + }, + { + "epoch": 1.2296927289321569, + "grad_norm": 1.0229874849319458, + "learning_rate": 4.101717997799562e-05, + "loss": 0.2466, + "step": 2021 + }, + { + "epoch": 1.2303011864922422, + "grad_norm": 1.0059734582901, + "learning_rate": 4.100761973801317e-05, + "loss": 0.2218, + "step": 2022 + }, + { + "epoch": 1.2309096440523273, + "grad_norm": 1.6977488994598389, + "learning_rate": 4.0998055528808266e-05, + "loss": 0.2822, + "step": 2023 + }, + { + "epoch": 1.2315181016124126, + "grad_norm": 1.067182183265686, + "learning_rate": 4.0988487352752414e-05, + "loss": 0.2576, + "step": 2024 + }, + { + "epoch": 1.2321265591724977, + "grad_norm": 0.9440335631370544, + "learning_rate": 4.097891521221814e-05, + "loss": 0.2397, + "step": 2025 + }, + { + "epoch": 1.2327350167325828, + "grad_norm": 0.9076058268547058, + "learning_rate": 4.096933910957892e-05, + "loss": 0.2329, + "step": 2026 + }, + { + "epoch": 1.233343474292668, + "grad_norm": 0.8461357951164246, + "learning_rate": 4.0959759047209234e-05, + "loss": 0.2133, + "step": 2027 + }, + { + "epoch": 1.2339519318527532, + "grad_norm": 0.8329628109931946, + "learning_rate": 4.095017502748455e-05, + "loss": 0.2251, + "step": 2028 + }, + { + "epoch": 1.2345603894128385, + "grad_norm": 0.8861196041107178, + "learning_rate": 4.094058705278129e-05, + "loss": 0.2232, + "step": 2029 + }, + { + "epoch": 1.2351688469729236, + "grad_norm": 0.9130899906158447, + "learning_rate": 4.093099512547687e-05, + "loss": 0.2517, + "step": 2030 + }, + { + "epoch": 1.2357773045330087, + "grad_norm": 0.8920985460281372, + "learning_rate": 4.09213992479497e-05, + "loss": 0.2403, + "step": 2031 + }, + { + "epoch": 1.236385762093094, + "grad_norm": 0.9351683259010315, + "learning_rate": 4.0911799422579155e-05, + "loss": 0.2416, + "step": 2032 + }, + { + "epoch": 1.2369942196531791, + "grad_norm": 0.8944339156150818, + "learning_rate": 4.090219565174559e-05, + "loss": 0.2441, + "step": 2033 + }, + { + "epoch": 1.2376026772132644, + "grad_norm": 0.9203975200653076, + "learning_rate": 4.089258793783034e-05, + "loss": 0.2541, + "step": 2034 + }, + { + "epoch": 1.2382111347733495, + "grad_norm": 0.880241334438324, + "learning_rate": 4.0882976283215714e-05, + "loss": 0.2492, + "step": 2035 + }, + { + "epoch": 1.2388195923334346, + "grad_norm": 0.9325846433639526, + "learning_rate": 4.087336069028501e-05, + "loss": 0.2663, + "step": 2036 + }, + { + "epoch": 1.23942804989352, + "grad_norm": 1.0154072046279907, + "learning_rate": 4.086374116142249e-05, + "loss": 0.2699, + "step": 2037 + }, + { + "epoch": 1.240036507453605, + "grad_norm": 0.9841558933258057, + "learning_rate": 4.0854117699013396e-05, + "loss": 0.2496, + "step": 2038 + }, + { + "epoch": 1.2406449650136904, + "grad_norm": 0.8984540700912476, + "learning_rate": 4.0844490305443934e-05, + "loss": 0.2008, + "step": 2039 + }, + { + "epoch": 1.2412534225737755, + "grad_norm": 0.8929226994514465, + "learning_rate": 4.083485898310131e-05, + "loss": 0.2588, + "step": 2040 + }, + { + "epoch": 1.2418618801338606, + "grad_norm": 0.7902721762657166, + "learning_rate": 4.0825223734373696e-05, + "loss": 0.1924, + "step": 2041 + }, + { + "epoch": 1.2424703376939459, + "grad_norm": 0.8901758790016174, + "learning_rate": 4.081558456165022e-05, + "loss": 0.246, + "step": 2042 + }, + { + "epoch": 1.243078795254031, + "grad_norm": 1.1308690309524536, + "learning_rate": 4.080594146732099e-05, + "loss": 0.286, + "step": 2043 + }, + { + "epoch": 1.2436872528141163, + "grad_norm": 1.1202785968780518, + "learning_rate": 4.079629445377712e-05, + "loss": 0.3143, + "step": 2044 + }, + { + "epoch": 1.2442957103742014, + "grad_norm": 0.8539735078811646, + "learning_rate": 4.078664352341063e-05, + "loss": 0.2171, + "step": 2045 + }, + { + "epoch": 1.2449041679342865, + "grad_norm": 1.0043787956237793, + "learning_rate": 4.077698867861457e-05, + "loss": 0.2746, + "step": 2046 + }, + { + "epoch": 1.2455126254943718, + "grad_norm": 1.0371819734573364, + "learning_rate": 4.076732992178294e-05, + "loss": 0.269, + "step": 2047 + }, + { + "epoch": 1.246121083054457, + "grad_norm": 0.9970436096191406, + "learning_rate": 4.075766725531069e-05, + "loss": 0.2185, + "step": 2048 + }, + { + "epoch": 1.2467295406145422, + "grad_norm": 1.058952808380127, + "learning_rate": 4.074800068159379e-05, + "loss": 0.2503, + "step": 2049 + }, + { + "epoch": 1.2473379981746273, + "grad_norm": 0.8718734383583069, + "learning_rate": 4.073833020302912e-05, + "loss": 0.2422, + "step": 2050 + }, + { + "epoch": 1.2479464557347124, + "grad_norm": 0.8423463106155396, + "learning_rate": 4.0728655822014574e-05, + "loss": 0.2396, + "step": 2051 + }, + { + "epoch": 1.2485549132947977, + "grad_norm": 0.9200191497802734, + "learning_rate": 4.071897754094898e-05, + "loss": 0.201, + "step": 2052 + }, + { + "epoch": 1.2491633708548828, + "grad_norm": 1.0252035856246948, + "learning_rate": 4.0709295362232156e-05, + "loss": 0.2548, + "step": 2053 + }, + { + "epoch": 1.2497718284149681, + "grad_norm": 0.875744640827179, + "learning_rate": 4.069960928826488e-05, + "loss": 0.2319, + "step": 2054 + }, + { + "epoch": 1.2503802859750532, + "grad_norm": 1.0111494064331055, + "learning_rate": 4.0689919321448885e-05, + "loss": 0.2188, + "step": 2055 + }, + { + "epoch": 1.2509887435351383, + "grad_norm": 1.0929096937179565, + "learning_rate": 4.0680225464186895e-05, + "loss": 0.2029, + "step": 2056 + }, + { + "epoch": 1.2515972010952237, + "grad_norm": 0.9133079051971436, + "learning_rate": 4.067052771888257e-05, + "loss": 0.2084, + "step": 2057 + }, + { + "epoch": 1.2522056586553088, + "grad_norm": 0.8972569108009338, + "learning_rate": 4.066082608794055e-05, + "loss": 0.238, + "step": 2058 + }, + { + "epoch": 1.252814116215394, + "grad_norm": 0.9103267788887024, + "learning_rate": 4.0651120573766447e-05, + "loss": 0.2144, + "step": 2059 + }, + { + "epoch": 1.2534225737754792, + "grad_norm": 0.8961523771286011, + "learning_rate": 4.0641411178766795e-05, + "loss": 0.2639, + "step": 2060 + }, + { + "epoch": 1.2540310313355643, + "grad_norm": 0.907996654510498, + "learning_rate": 4.0631697905349144e-05, + "loss": 0.2035, + "step": 2061 + }, + { + "epoch": 1.2546394888956496, + "grad_norm": 1.2141492366790771, + "learning_rate": 4.0621980755921974e-05, + "loss": 0.2882, + "step": 2062 + }, + { + "epoch": 1.2552479464557347, + "grad_norm": 0.889756441116333, + "learning_rate": 4.061225973289473e-05, + "loss": 0.2281, + "step": 2063 + }, + { + "epoch": 1.25585640401582, + "grad_norm": 0.9822306632995605, + "learning_rate": 4.060253483867783e-05, + "loss": 0.2966, + "step": 2064 + }, + { + "epoch": 1.256464861575905, + "grad_norm": 0.9463213682174683, + "learning_rate": 4.059280607568263e-05, + "loss": 0.2358, + "step": 2065 + }, + { + "epoch": 1.2570733191359902, + "grad_norm": 0.9215368032455444, + "learning_rate": 4.058307344632147e-05, + "loss": 0.2188, + "step": 2066 + }, + { + "epoch": 1.2576817766960755, + "grad_norm": 0.9397695064544678, + "learning_rate": 4.057333695300762e-05, + "loss": 0.2423, + "step": 2067 + }, + { + "epoch": 1.2582902342561606, + "grad_norm": 0.9168745875358582, + "learning_rate": 4.056359659815534e-05, + "loss": 0.1766, + "step": 2068 + }, + { + "epoch": 1.258898691816246, + "grad_norm": 0.8033076524734497, + "learning_rate": 4.055385238417984e-05, + "loss": 0.2049, + "step": 2069 + }, + { + "epoch": 1.259507149376331, + "grad_norm": 0.8447003960609436, + "learning_rate": 4.054410431349724e-05, + "loss": 0.2532, + "step": 2070 + }, + { + "epoch": 1.260115606936416, + "grad_norm": 0.9117369651794434, + "learning_rate": 4.053435238852469e-05, + "loss": 0.2495, + "step": 2071 + }, + { + "epoch": 1.2607240644965014, + "grad_norm": 0.838207483291626, + "learning_rate": 4.052459661168025e-05, + "loss": 0.2442, + "step": 2072 + }, + { + "epoch": 1.2613325220565865, + "grad_norm": 0.9510546922683716, + "learning_rate": 4.051483698538295e-05, + "loss": 0.2255, + "step": 2073 + }, + { + "epoch": 1.2619409796166718, + "grad_norm": 1.0252444744110107, + "learning_rate": 4.0505073512052774e-05, + "loss": 0.2537, + "step": 2074 + }, + { + "epoch": 1.262549437176757, + "grad_norm": 0.8936956524848938, + "learning_rate": 4.049530619411065e-05, + "loss": 0.1971, + "step": 2075 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.9542527198791504, + "learning_rate": 4.0485535033978455e-05, + "loss": 0.2105, + "step": 2076 + }, + { + "epoch": 1.2637663522969274, + "grad_norm": 0.9937785863876343, + "learning_rate": 4.047576003407905e-05, + "loss": 0.2452, + "step": 2077 + }, + { + "epoch": 1.2643748098570124, + "grad_norm": 0.9868922233581543, + "learning_rate": 4.046598119683621e-05, + "loss": 0.2215, + "step": 2078 + }, + { + "epoch": 1.2649832674170978, + "grad_norm": 0.9198853373527527, + "learning_rate": 4.045619852467469e-05, + "loss": 0.2079, + "step": 2079 + }, + { + "epoch": 1.2655917249771829, + "grad_norm": 0.9050553441047668, + "learning_rate": 4.0446412020020185e-05, + "loss": 0.2373, + "step": 2080 + }, + { + "epoch": 1.266200182537268, + "grad_norm": 1.0860339403152466, + "learning_rate": 4.0436621685299334e-05, + "loss": 0.2303, + "step": 2081 + }, + { + "epoch": 1.2668086400973533, + "grad_norm": 0.9267223477363586, + "learning_rate": 4.0426827522939735e-05, + "loss": 0.2241, + "step": 2082 + }, + { + "epoch": 1.2674170976574384, + "grad_norm": 0.8770591616630554, + "learning_rate": 4.041702953536994e-05, + "loss": 0.2209, + "step": 2083 + }, + { + "epoch": 1.2680255552175237, + "grad_norm": 0.7461957335472107, + "learning_rate": 4.0407227725019426e-05, + "loss": 0.2294, + "step": 2084 + }, + { + "epoch": 1.2686340127776088, + "grad_norm": 1.0627920627593994, + "learning_rate": 4.039742209431864e-05, + "loss": 0.2419, + "step": 2085 + }, + { + "epoch": 1.2692424703376939, + "grad_norm": 0.9372715353965759, + "learning_rate": 4.0387612645698974e-05, + "loss": 0.2496, + "step": 2086 + }, + { + "epoch": 1.2698509278977792, + "grad_norm": 0.9207741022109985, + "learning_rate": 4.037779938159276e-05, + "loss": 0.2682, + "step": 2087 + }, + { + "epoch": 1.2704593854578643, + "grad_norm": 0.9947006702423096, + "learning_rate": 4.036798230443328e-05, + "loss": 0.241, + "step": 2088 + }, + { + "epoch": 1.2710678430179496, + "grad_norm": 0.9042183756828308, + "learning_rate": 4.035816141665475e-05, + "loss": 0.2357, + "step": 2089 + }, + { + "epoch": 1.2716763005780347, + "grad_norm": 0.9522048830986023, + "learning_rate": 4.0348336720692345e-05, + "loss": 0.1831, + "step": 2090 + }, + { + "epoch": 1.2722847581381198, + "grad_norm": 0.8158754706382751, + "learning_rate": 4.0338508218982197e-05, + "loss": 0.2126, + "step": 2091 + }, + { + "epoch": 1.2728932156982051, + "grad_norm": 1.012651801109314, + "learning_rate": 4.032867591396135e-05, + "loss": 0.2472, + "step": 2092 + }, + { + "epoch": 1.2735016732582902, + "grad_norm": 0.9281013011932373, + "learning_rate": 4.0318839808067796e-05, + "loss": 0.2052, + "step": 2093 + }, + { + "epoch": 1.2741101308183755, + "grad_norm": 1.0591697692871094, + "learning_rate": 4.0308999903740496e-05, + "loss": 0.2503, + "step": 2094 + }, + { + "epoch": 1.2747185883784606, + "grad_norm": 0.852708637714386, + "learning_rate": 4.029915620341933e-05, + "loss": 0.2026, + "step": 2095 + }, + { + "epoch": 1.2753270459385457, + "grad_norm": 0.9270033836364746, + "learning_rate": 4.028930870954512e-05, + "loss": 0.223, + "step": 2096 + }, + { + "epoch": 1.275935503498631, + "grad_norm": 0.8873688578605652, + "learning_rate": 4.0279457424559654e-05, + "loss": 0.2448, + "step": 2097 + }, + { + "epoch": 1.2765439610587161, + "grad_norm": 0.8902617692947388, + "learning_rate": 4.0269602350905615e-05, + "loss": 0.2214, + "step": 2098 + }, + { + "epoch": 1.2771524186188015, + "grad_norm": 0.8026160597801208, + "learning_rate": 4.025974349102667e-05, + "loss": 0.201, + "step": 2099 + }, + { + "epoch": 1.2777608761788866, + "grad_norm": 0.8452394008636475, + "learning_rate": 4.024988084736739e-05, + "loss": 0.2145, + "step": 2100 + }, + { + "epoch": 1.2783693337389717, + "grad_norm": 0.859439492225647, + "learning_rate": 4.0240014422373304e-05, + "loss": 0.2221, + "step": 2101 + }, + { + "epoch": 1.2789777912990568, + "grad_norm": 0.929378867149353, + "learning_rate": 4.023014421849088e-05, + "loss": 0.2458, + "step": 2102 + }, + { + "epoch": 1.279586248859142, + "grad_norm": 0.9007444381713867, + "learning_rate": 4.0220270238167514e-05, + "loss": 0.2225, + "step": 2103 + }, + { + "epoch": 1.2801947064192274, + "grad_norm": 0.821709394454956, + "learning_rate": 4.021039248385154e-05, + "loss": 0.1803, + "step": 2104 + }, + { + "epoch": 1.2808031639793125, + "grad_norm": 1.293845534324646, + "learning_rate": 4.0200510957992234e-05, + "loss": 0.397, + "step": 2105 + }, + { + "epoch": 1.2814116215393976, + "grad_norm": 0.8068326711654663, + "learning_rate": 4.0190625663039796e-05, + "loss": 0.2132, + "step": 2106 + }, + { + "epoch": 1.2820200790994827, + "grad_norm": 0.9025530219078064, + "learning_rate": 4.0180736601445365e-05, + "loss": 0.2425, + "step": 2107 + }, + { + "epoch": 1.282628536659568, + "grad_norm": 0.9707067012786865, + "learning_rate": 4.0170843775661025e-05, + "loss": 0.2608, + "step": 2108 + }, + { + "epoch": 1.2832369942196533, + "grad_norm": 0.9354795217514038, + "learning_rate": 4.0160947188139786e-05, + "loss": 0.2323, + "step": 2109 + }, + { + "epoch": 1.2838454517797384, + "grad_norm": 1.0434750318527222, + "learning_rate": 4.015104684133558e-05, + "loss": 0.2411, + "step": 2110 + }, + { + "epoch": 1.2844539093398235, + "grad_norm": 0.9146111011505127, + "learning_rate": 4.014114273770328e-05, + "loss": 0.247, + "step": 2111 + }, + { + "epoch": 1.2850623668999086, + "grad_norm": 0.9847277402877808, + "learning_rate": 4.01312348796987e-05, + "loss": 0.2576, + "step": 2112 + }, + { + "epoch": 1.285670824459994, + "grad_norm": 0.9638257026672363, + "learning_rate": 4.0121323269778565e-05, + "loss": 0.2398, + "step": 2113 + }, + { + "epoch": 1.286279282020079, + "grad_norm": 0.8207821249961853, + "learning_rate": 4.0111407910400555e-05, + "loss": 0.2134, + "step": 2114 + }, + { + "epoch": 1.2868877395801643, + "grad_norm": 0.9123344421386719, + "learning_rate": 4.010148880402326e-05, + "loss": 0.1845, + "step": 2115 + }, + { + "epoch": 1.2874961971402494, + "grad_norm": 0.9210152626037598, + "learning_rate": 4.009156595310619e-05, + "loss": 0.2483, + "step": 2116 + }, + { + "epoch": 1.2881046547003345, + "grad_norm": 0.9474868178367615, + "learning_rate": 4.0081639360109816e-05, + "loss": 0.2452, + "step": 2117 + }, + { + "epoch": 1.2887131122604198, + "grad_norm": 0.931167721748352, + "learning_rate": 4.007170902749552e-05, + "loss": 0.25, + "step": 2118 + }, + { + "epoch": 1.289321569820505, + "grad_norm": 0.9042659401893616, + "learning_rate": 4.00617749577256e-05, + "loss": 0.2474, + "step": 2119 + }, + { + "epoch": 1.2899300273805903, + "grad_norm": 0.9150558114051819, + "learning_rate": 4.0051837153263296e-05, + "loss": 0.2376, + "step": 2120 + }, + { + "epoch": 1.2905384849406754, + "grad_norm": 0.7283451557159424, + "learning_rate": 4.0041895616572765e-05, + "loss": 0.1869, + "step": 2121 + }, + { + "epoch": 1.2911469425007605, + "grad_norm": 0.8587160110473633, + "learning_rate": 4.0031950350119106e-05, + "loss": 0.2099, + "step": 2122 + }, + { + "epoch": 1.2917554000608458, + "grad_norm": 0.7844583988189697, + "learning_rate": 4.002200135636832e-05, + "loss": 0.1998, + "step": 2123 + }, + { + "epoch": 1.2923638576209309, + "grad_norm": 0.8835726380348206, + "learning_rate": 4.001204863778735e-05, + "loss": 0.2201, + "step": 2124 + }, + { + "epoch": 1.2929723151810162, + "grad_norm": 0.8643476366996765, + "learning_rate": 4.0002092196844046e-05, + "loss": 0.2101, + "step": 2125 + }, + { + "epoch": 1.2935807727411013, + "grad_norm": 0.8908064961433411, + "learning_rate": 3.999213203600719e-05, + "loss": 0.2303, + "step": 2126 + }, + { + "epoch": 1.2941892303011864, + "grad_norm": 1.0389541387557983, + "learning_rate": 3.99821681577465e-05, + "loss": 0.233, + "step": 2127 + }, + { + "epoch": 1.2947976878612717, + "grad_norm": 1.1947957277297974, + "learning_rate": 3.997220056453259e-05, + "loss": 0.2872, + "step": 2128 + }, + { + "epoch": 1.2954061454213568, + "grad_norm": 1.0398141145706177, + "learning_rate": 3.9962229258837013e-05, + "loss": 0.2141, + "step": 2129 + }, + { + "epoch": 1.296014602981442, + "grad_norm": 0.8543111085891724, + "learning_rate": 3.9952254243132243e-05, + "loss": 0.2153, + "step": 2130 + }, + { + "epoch": 1.2966230605415272, + "grad_norm": 1.0683271884918213, + "learning_rate": 3.9942275519891656e-05, + "loss": 0.2487, + "step": 2131 + }, + { + "epoch": 1.2972315181016123, + "grad_norm": 0.8721834421157837, + "learning_rate": 3.993229309158957e-05, + "loss": 0.2295, + "step": 2132 + }, + { + "epoch": 1.2978399756616976, + "grad_norm": 0.8389768004417419, + "learning_rate": 3.9922306960701196e-05, + "loss": 0.2181, + "step": 2133 + }, + { + "epoch": 1.2984484332217827, + "grad_norm": 0.9460014700889587, + "learning_rate": 3.991231712970269e-05, + "loss": 0.2342, + "step": 2134 + }, + { + "epoch": 1.299056890781868, + "grad_norm": 0.9399822950363159, + "learning_rate": 3.990232360107111e-05, + "loss": 0.2097, + "step": 2135 + }, + { + "epoch": 1.2996653483419531, + "grad_norm": 0.8793272376060486, + "learning_rate": 3.989232637728445e-05, + "loss": 0.2349, + "step": 2136 + }, + { + "epoch": 1.3002738059020382, + "grad_norm": 0.9901358485221863, + "learning_rate": 3.988232546082158e-05, + "loss": 0.2422, + "step": 2137 + }, + { + "epoch": 1.3008822634621235, + "grad_norm": 0.8974143862724304, + "learning_rate": 3.9872320854162324e-05, + "loss": 0.2433, + "step": 2138 + }, + { + "epoch": 1.3014907210222086, + "grad_norm": 0.9541013836860657, + "learning_rate": 3.9862312559787404e-05, + "loss": 0.2531, + "step": 2139 + }, + { + "epoch": 1.302099178582294, + "grad_norm": 1.2113054990768433, + "learning_rate": 3.985230058017846e-05, + "loss": 0.2654, + "step": 2140 + }, + { + "epoch": 1.302707636142379, + "grad_norm": 0.8780795931816101, + "learning_rate": 3.984228491781805e-05, + "loss": 0.2094, + "step": 2141 + }, + { + "epoch": 1.3033160937024642, + "grad_norm": 0.9655569791793823, + "learning_rate": 3.9832265575189635e-05, + "loss": 0.2571, + "step": 2142 + }, + { + "epoch": 1.3039245512625495, + "grad_norm": 0.9380378723144531, + "learning_rate": 3.98222425547776e-05, + "loss": 0.2191, + "step": 2143 + }, + { + "epoch": 1.3045330088226346, + "grad_norm": 0.887397825717926, + "learning_rate": 3.981221585906723e-05, + "loss": 0.2585, + "step": 2144 + }, + { + "epoch": 1.3051414663827199, + "grad_norm": 0.8565236926078796, + "learning_rate": 3.980218549054473e-05, + "loss": 0.1939, + "step": 2145 + }, + { + "epoch": 1.305749923942805, + "grad_norm": 0.8174847364425659, + "learning_rate": 3.979215145169721e-05, + "loss": 0.2119, + "step": 2146 + }, + { + "epoch": 1.30635838150289, + "grad_norm": 0.8997043371200562, + "learning_rate": 3.97821137450127e-05, + "loss": 0.2329, + "step": 2147 + }, + { + "epoch": 1.3069668390629754, + "grad_norm": 0.9833238124847412, + "learning_rate": 3.977207237298014e-05, + "loss": 0.2813, + "step": 2148 + }, + { + "epoch": 1.3075752966230605, + "grad_norm": 0.8356313705444336, + "learning_rate": 3.9762027338089356e-05, + "loss": 0.2254, + "step": 2149 + }, + { + "epoch": 1.3081837541831458, + "grad_norm": 1.0737109184265137, + "learning_rate": 3.97519786428311e-05, + "loss": 0.2502, + "step": 2150 + }, + { + "epoch": 1.308792211743231, + "grad_norm": 0.8580512404441833, + "learning_rate": 3.9741926289697036e-05, + "loss": 0.2491, + "step": 2151 + }, + { + "epoch": 1.309400669303316, + "grad_norm": 0.9692168831825256, + "learning_rate": 3.973187028117972e-05, + "loss": 0.2251, + "step": 2152 + }, + { + "epoch": 1.3100091268634013, + "grad_norm": 0.8899025917053223, + "learning_rate": 3.9721810619772636e-05, + "loss": 0.2117, + "step": 2153 + }, + { + "epoch": 1.3106175844234864, + "grad_norm": 0.784062385559082, + "learning_rate": 3.971174730797015e-05, + "loss": 0.2105, + "step": 2154 + }, + { + "epoch": 1.3112260419835717, + "grad_norm": 0.7936700582504272, + "learning_rate": 3.970168034826755e-05, + "loss": 0.1936, + "step": 2155 + }, + { + "epoch": 1.3118344995436568, + "grad_norm": 0.844158947467804, + "learning_rate": 3.9691609743161015e-05, + "loss": 0.2318, + "step": 2156 + }, + { + "epoch": 1.312442957103742, + "grad_norm": 0.9515892267227173, + "learning_rate": 3.968153549514765e-05, + "loss": 0.2188, + "step": 2157 + }, + { + "epoch": 1.3130514146638272, + "grad_norm": 0.8544421792030334, + "learning_rate": 3.967145760672543e-05, + "loss": 0.225, + "step": 2158 + }, + { + "epoch": 1.3136598722239123, + "grad_norm": 0.9000852704048157, + "learning_rate": 3.9661376080393266e-05, + "loss": 0.2036, + "step": 2159 + }, + { + "epoch": 1.3142683297839977, + "grad_norm": 0.9248766899108887, + "learning_rate": 3.965129091865094e-05, + "loss": 0.2235, + "step": 2160 + }, + { + "epoch": 1.3148767873440828, + "grad_norm": 1.002901315689087, + "learning_rate": 3.9641202123999174e-05, + "loss": 0.2248, + "step": 2161 + }, + { + "epoch": 1.3154852449041678, + "grad_norm": 0.8213080763816833, + "learning_rate": 3.963110969893955e-05, + "loss": 0.2088, + "step": 2162 + }, + { + "epoch": 1.3160937024642532, + "grad_norm": 1.0072423219680786, + "learning_rate": 3.9621013645974574e-05, + "loss": 0.2406, + "step": 2163 + }, + { + "epoch": 1.3167021600243383, + "grad_norm": 1.0771087408065796, + "learning_rate": 3.961091396760765e-05, + "loss": 0.2093, + "step": 2164 + }, + { + "epoch": 1.3173106175844236, + "grad_norm": 0.9404158592224121, + "learning_rate": 3.960081066634308e-05, + "loss": 0.2502, + "step": 2165 + }, + { + "epoch": 1.3179190751445087, + "grad_norm": 0.910447359085083, + "learning_rate": 3.959070374468605e-05, + "loss": 0.2086, + "step": 2166 + }, + { + "epoch": 1.3185275327045938, + "grad_norm": 0.7833483219146729, + "learning_rate": 3.958059320514267e-05, + "loss": 0.2102, + "step": 2167 + }, + { + "epoch": 1.319135990264679, + "grad_norm": 1.0158355236053467, + "learning_rate": 3.957047905021991e-05, + "loss": 0.2798, + "step": 2168 + }, + { + "epoch": 1.3197444478247642, + "grad_norm": 0.9133449196815491, + "learning_rate": 3.956036128242568e-05, + "loss": 0.2381, + "step": 2169 + }, + { + "epoch": 1.3203529053848495, + "grad_norm": 0.9535892009735107, + "learning_rate": 3.955023990426876e-05, + "loss": 0.2382, + "step": 2170 + }, + { + "epoch": 1.3209613629449346, + "grad_norm": 0.919695258140564, + "learning_rate": 3.954011491825883e-05, + "loss": 0.2257, + "step": 2171 + }, + { + "epoch": 1.3215698205050197, + "grad_norm": 0.8631134033203125, + "learning_rate": 3.952998632690646e-05, + "loss": 0.1773, + "step": 2172 + }, + { + "epoch": 1.322178278065105, + "grad_norm": 1.0971744060516357, + "learning_rate": 3.951985413272312e-05, + "loss": 0.251, + "step": 2173 + }, + { + "epoch": 1.3227867356251901, + "grad_norm": 0.8867610692977905, + "learning_rate": 3.950971833822117e-05, + "loss": 0.1989, + "step": 2174 + }, + { + "epoch": 1.3233951931852754, + "grad_norm": 0.8583669066429138, + "learning_rate": 3.949957894591387e-05, + "loss": 0.2238, + "step": 2175 + }, + { + "epoch": 1.3240036507453605, + "grad_norm": 0.8695077300071716, + "learning_rate": 3.9489435958315354e-05, + "loss": 0.2755, + "step": 2176 + }, + { + "epoch": 1.3246121083054456, + "grad_norm": 0.9294130206108093, + "learning_rate": 3.947928937794069e-05, + "loss": 0.2271, + "step": 2177 + }, + { + "epoch": 1.325220565865531, + "grad_norm": 0.871168851852417, + "learning_rate": 3.946913920730577e-05, + "loss": 0.215, + "step": 2178 + }, + { + "epoch": 1.325829023425616, + "grad_norm": 0.9917259812355042, + "learning_rate": 3.945898544892744e-05, + "loss": 0.2764, + "step": 2179 + }, + { + "epoch": 1.3264374809857014, + "grad_norm": 0.689329206943512, + "learning_rate": 3.944882810532339e-05, + "loss": 0.1973, + "step": 2180 + }, + { + "epoch": 1.3270459385457865, + "grad_norm": 0.8238869905471802, + "learning_rate": 3.943866717901223e-05, + "loss": 0.2145, + "step": 2181 + }, + { + "epoch": 1.3276543961058715, + "grad_norm": 0.8981241583824158, + "learning_rate": 3.9428502672513446e-05, + "loss": 0.2292, + "step": 2182 + }, + { + "epoch": 1.3282628536659569, + "grad_norm": 0.902793824672699, + "learning_rate": 3.9418334588347406e-05, + "loss": 0.2068, + "step": 2183 + }, + { + "epoch": 1.328871311226042, + "grad_norm": 1.0001530647277832, + "learning_rate": 3.9408162929035375e-05, + "loss": 0.282, + "step": 2184 + }, + { + "epoch": 1.3294797687861273, + "grad_norm": 0.8582153916358948, + "learning_rate": 3.939798769709949e-05, + "loss": 0.2009, + "step": 2185 + }, + { + "epoch": 1.3300882263462124, + "grad_norm": 1.967231035232544, + "learning_rate": 3.93878088950628e-05, + "loss": 0.2382, + "step": 2186 + }, + { + "epoch": 1.3306966839062975, + "grad_norm": 1.2576441764831543, + "learning_rate": 3.937762652544923e-05, + "loss": 0.3125, + "step": 2187 + }, + { + "epoch": 1.3313051414663828, + "grad_norm": 1.0156564712524414, + "learning_rate": 3.9367440590783554e-05, + "loss": 0.2311, + "step": 2188 + }, + { + "epoch": 1.3319135990264679, + "grad_norm": 0.9007154703140259, + "learning_rate": 3.9357251093591485e-05, + "loss": 0.2335, + "step": 2189 + }, + { + "epoch": 1.3325220565865532, + "grad_norm": 0.9337432980537415, + "learning_rate": 3.934705803639959e-05, + "loss": 0.2229, + "step": 2190 + }, + { + "epoch": 1.3331305141466383, + "grad_norm": 0.875648021697998, + "learning_rate": 3.9336861421735305e-05, + "loss": 0.2179, + "step": 2191 + }, + { + "epoch": 1.3337389717067234, + "grad_norm": 0.7988814115524292, + "learning_rate": 3.9326661252126984e-05, + "loss": 0.2233, + "step": 2192 + }, + { + "epoch": 1.3343474292668087, + "grad_norm": 0.8480890989303589, + "learning_rate": 3.931645753010383e-05, + "loss": 0.2495, + "step": 2193 + }, + { + "epoch": 1.3349558868268938, + "grad_norm": 0.9835476279258728, + "learning_rate": 3.9306250258195954e-05, + "loss": 0.2171, + "step": 2194 + }, + { + "epoch": 1.3355643443869791, + "grad_norm": 1.1172250509262085, + "learning_rate": 3.929603943893432e-05, + "loss": 0.2434, + "step": 2195 + }, + { + "epoch": 1.3361728019470642, + "grad_norm": 0.8083821535110474, + "learning_rate": 3.92858250748508e-05, + "loss": 0.2134, + "step": 2196 + }, + { + "epoch": 1.3367812595071493, + "grad_norm": 0.8030064702033997, + "learning_rate": 3.9275607168478126e-05, + "loss": 0.2281, + "step": 2197 + }, + { + "epoch": 1.3373897170672346, + "grad_norm": 0.8596853017807007, + "learning_rate": 3.926538572234991e-05, + "loss": 0.1805, + "step": 2198 + }, + { + "epoch": 1.3379981746273197, + "grad_norm": 1.0698977708816528, + "learning_rate": 3.925516073900064e-05, + "loss": 0.2718, + "step": 2199 + }, + { + "epoch": 1.338606632187405, + "grad_norm": 1.0942227840423584, + "learning_rate": 3.9244932220965704e-05, + "loss": 0.2411, + "step": 2200 + }, + { + "epoch": 1.3392150897474902, + "grad_norm": 0.9584268927574158, + "learning_rate": 3.9234700170781316e-05, + "loss": 0.277, + "step": 2201 + }, + { + "epoch": 1.3398235473075752, + "grad_norm": 1.1233789920806885, + "learning_rate": 3.922446459098463e-05, + "loss": 0.2515, + "step": 2202 + }, + { + "epoch": 1.3404320048676603, + "grad_norm": 1.0286003351211548, + "learning_rate": 3.9214225484113634e-05, + "loss": 0.199, + "step": 2203 + }, + { + "epoch": 1.3410404624277457, + "grad_norm": 0.920686662197113, + "learning_rate": 3.9203982852707184e-05, + "loss": 0.2214, + "step": 2204 + }, + { + "epoch": 1.341648919987831, + "grad_norm": 0.9834322333335876, + "learning_rate": 3.9193736699305044e-05, + "loss": 0.2745, + "step": 2205 + }, + { + "epoch": 1.342257377547916, + "grad_norm": 0.8372621536254883, + "learning_rate": 3.9183487026447824e-05, + "loss": 0.2355, + "step": 2206 + }, + { + "epoch": 1.3428658351080012, + "grad_norm": 0.7708995938301086, + "learning_rate": 3.9173233836677024e-05, + "loss": 0.1949, + "step": 2207 + }, + { + "epoch": 1.3434742926680863, + "grad_norm": 0.9579456448554993, + "learning_rate": 3.9162977132534996e-05, + "loss": 0.2197, + "step": 2208 + }, + { + "epoch": 1.3440827502281716, + "grad_norm": 0.8540910482406616, + "learning_rate": 3.915271691656498e-05, + "loss": 0.2385, + "step": 2209 + }, + { + "epoch": 1.344691207788257, + "grad_norm": 0.8921311497688293, + "learning_rate": 3.914245319131109e-05, + "loss": 0.2211, + "step": 2210 + }, + { + "epoch": 1.345299665348342, + "grad_norm": 0.8396940231323242, + "learning_rate": 3.913218595931829e-05, + "loss": 0.2176, + "step": 2211 + }, + { + "epoch": 1.345908122908427, + "grad_norm": 0.8367871642112732, + "learning_rate": 3.9121915223132436e-05, + "loss": 0.2457, + "step": 2212 + }, + { + "epoch": 1.3465165804685122, + "grad_norm": 0.9021058082580566, + "learning_rate": 3.911164098530023e-05, + "loss": 0.2251, + "step": 2213 + }, + { + "epoch": 1.3471250380285975, + "grad_norm": 0.911104679107666, + "learning_rate": 3.910136324836927e-05, + "loss": 0.2683, + "step": 2214 + }, + { + "epoch": 1.3477334955886826, + "grad_norm": 0.906808614730835, + "learning_rate": 3.909108201488799e-05, + "loss": 0.2472, + "step": 2215 + }, + { + "epoch": 1.348341953148768, + "grad_norm": 0.8513998985290527, + "learning_rate": 3.908079728740571e-05, + "loss": 0.205, + "step": 2216 + }, + { + "epoch": 1.348950410708853, + "grad_norm": 0.9133200645446777, + "learning_rate": 3.9070509068472635e-05, + "loss": 0.2453, + "step": 2217 + }, + { + "epoch": 1.3495588682689381, + "grad_norm": 0.8982611298561096, + "learning_rate": 3.90602173606398e-05, + "loss": 0.1938, + "step": 2218 + }, + { + "epoch": 1.3501673258290234, + "grad_norm": 1.0791575908660889, + "learning_rate": 3.90499221664591e-05, + "loss": 0.2885, + "step": 2219 + }, + { + "epoch": 1.3507757833891085, + "grad_norm": 0.8639183044433594, + "learning_rate": 3.9039623488483346e-05, + "loss": 0.2382, + "step": 2220 + }, + { + "epoch": 1.3513842409491938, + "grad_norm": 1.2875553369522095, + "learning_rate": 3.902932132926616e-05, + "loss": 0.2692, + "step": 2221 + }, + { + "epoch": 1.351992698509279, + "grad_norm": 0.9130270481109619, + "learning_rate": 3.901901569136206e-05, + "loss": 0.2361, + "step": 2222 + }, + { + "epoch": 1.352601156069364, + "grad_norm": 0.832845151424408, + "learning_rate": 3.900870657732641e-05, + "loss": 0.2147, + "step": 2223 + }, + { + "epoch": 1.3532096136294494, + "grad_norm": 1.053515911102295, + "learning_rate": 3.8998393989715434e-05, + "loss": 0.2453, + "step": 2224 + }, + { + "epoch": 1.3538180711895345, + "grad_norm": 0.9339423179626465, + "learning_rate": 3.898807793108624e-05, + "loss": 0.1898, + "step": 2225 + }, + { + "epoch": 1.3544265287496198, + "grad_norm": 0.9041353464126587, + "learning_rate": 3.8977758403996765e-05, + "loss": 0.2065, + "step": 2226 + }, + { + "epoch": 1.3550349863097049, + "grad_norm": 0.8681005239486694, + "learning_rate": 3.896743541100583e-05, + "loss": 0.2186, + "step": 2227 + }, + { + "epoch": 1.35564344386979, + "grad_norm": 0.8641691207885742, + "learning_rate": 3.89571089546731e-05, + "loss": 0.2016, + "step": 2228 + }, + { + "epoch": 1.3562519014298753, + "grad_norm": 0.8914021253585815, + "learning_rate": 3.8946779037559115e-05, + "loss": 0.2332, + "step": 2229 + }, + { + "epoch": 1.3568603589899604, + "grad_norm": 0.9768247604370117, + "learning_rate": 3.8936445662225264e-05, + "loss": 0.1901, + "step": 2230 + }, + { + "epoch": 1.3574688165500457, + "grad_norm": 0.982510507106781, + "learning_rate": 3.892610883123378e-05, + "loss": 0.2356, + "step": 2231 + }, + { + "epoch": 1.3580772741101308, + "grad_norm": 0.9750663638114929, + "learning_rate": 3.891576854714777e-05, + "loss": 0.2241, + "step": 2232 + }, + { + "epoch": 1.358685731670216, + "grad_norm": 0.8455461263656616, + "learning_rate": 3.890542481253121e-05, + "loss": 0.2119, + "step": 2233 + }, + { + "epoch": 1.3592941892303012, + "grad_norm": 1.088861107826233, + "learning_rate": 3.88950776299489e-05, + "loss": 0.2778, + "step": 2234 + }, + { + "epoch": 1.3599026467903863, + "grad_norm": 1.2681231498718262, + "learning_rate": 3.888472700196651e-05, + "loss": 0.2204, + "step": 2235 + }, + { + "epoch": 1.3605111043504716, + "grad_norm": 0.9089130163192749, + "learning_rate": 3.887437293115057e-05, + "loss": 0.2306, + "step": 2236 + }, + { + "epoch": 1.3611195619105567, + "grad_norm": 1.256752610206604, + "learning_rate": 3.8864015420068454e-05, + "loss": 0.2447, + "step": 2237 + }, + { + "epoch": 1.3617280194706418, + "grad_norm": 0.9000508189201355, + "learning_rate": 3.88536544712884e-05, + "loss": 0.2213, + "step": 2238 + }, + { + "epoch": 1.3623364770307271, + "grad_norm": 1.016454815864563, + "learning_rate": 3.884329008737947e-05, + "loss": 0.2309, + "step": 2239 + }, + { + "epoch": 1.3629449345908122, + "grad_norm": 0.9450438618659973, + "learning_rate": 3.883292227091163e-05, + "loss": 0.2443, + "step": 2240 + }, + { + "epoch": 1.3635533921508975, + "grad_norm": 1.0585530996322632, + "learning_rate": 3.882255102445565e-05, + "loss": 0.2413, + "step": 2241 + }, + { + "epoch": 1.3641618497109826, + "grad_norm": 0.9803087115287781, + "learning_rate": 3.8812176350583164e-05, + "loss": 0.219, + "step": 2242 + }, + { + "epoch": 1.3647703072710677, + "grad_norm": 0.93431156873703, + "learning_rate": 3.880179825186667e-05, + "loss": 0.2136, + "step": 2243 + }, + { + "epoch": 1.365378764831153, + "grad_norm": 0.9018591642379761, + "learning_rate": 3.879141673087949e-05, + "loss": 0.2249, + "step": 2244 + }, + { + "epoch": 1.3659872223912382, + "grad_norm": 1.3140918016433716, + "learning_rate": 3.878103179019581e-05, + "loss": 0.2312, + "step": 2245 + }, + { + "epoch": 1.3665956799513235, + "grad_norm": 0.9108927249908447, + "learning_rate": 3.877064343239068e-05, + "loss": 0.2274, + "step": 2246 + }, + { + "epoch": 1.3672041375114086, + "grad_norm": 1.067284107208252, + "learning_rate": 3.8760251660039956e-05, + "loss": 0.2968, + "step": 2247 + }, + { + "epoch": 1.3678125950714937, + "grad_norm": 0.8846755623817444, + "learning_rate": 3.874985647572039e-05, + "loss": 0.2381, + "step": 2248 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 0.9099476933479309, + "learning_rate": 3.8739457882009526e-05, + "loss": 0.1729, + "step": 2249 + }, + { + "epoch": 1.369029510191664, + "grad_norm": 0.866028904914856, + "learning_rate": 3.87290558814858e-05, + "loss": 0.2237, + "step": 2250 + }, + { + "epoch": 1.3696379677517494, + "grad_norm": 0.8966884016990662, + "learning_rate": 3.871865047672848e-05, + "loss": 0.2003, + "step": 2251 + }, + { + "epoch": 1.3702464253118345, + "grad_norm": 0.9942765831947327, + "learning_rate": 3.8708241670317645e-05, + "loss": 0.2374, + "step": 2252 + }, + { + "epoch": 1.3708548828719196, + "grad_norm": 0.9010903239250183, + "learning_rate": 3.869782946483428e-05, + "loss": 0.1948, + "step": 2253 + }, + { + "epoch": 1.371463340432005, + "grad_norm": 0.9701775908470154, + "learning_rate": 3.868741386286016e-05, + "loss": 0.2412, + "step": 2254 + }, + { + "epoch": 1.37207179799209, + "grad_norm": 0.7762331366539001, + "learning_rate": 3.867699486697791e-05, + "loss": 0.1873, + "step": 2255 + }, + { + "epoch": 1.3726802555521753, + "grad_norm": 0.9885874390602112, + "learning_rate": 3.866657247977103e-05, + "loss": 0.2072, + "step": 2256 + }, + { + "epoch": 1.3732887131122604, + "grad_norm": 0.9853124618530273, + "learning_rate": 3.865614670382382e-05, + "loss": 0.2562, + "step": 2257 + }, + { + "epoch": 1.3738971706723455, + "grad_norm": 0.9936604499816895, + "learning_rate": 3.864571754172144e-05, + "loss": 0.2386, + "step": 2258 + }, + { + "epoch": 1.3745056282324308, + "grad_norm": 0.9598672389984131, + "learning_rate": 3.8635284996049904e-05, + "loss": 0.2379, + "step": 2259 + }, + { + "epoch": 1.375114085792516, + "grad_norm": 0.9736387133598328, + "learning_rate": 3.8624849069396024e-05, + "loss": 0.2614, + "step": 2260 + }, + { + "epoch": 1.3757225433526012, + "grad_norm": 0.8407406210899353, + "learning_rate": 3.8614409764347494e-05, + "loss": 0.2115, + "step": 2261 + }, + { + "epoch": 1.3763310009126863, + "grad_norm": 0.8771750330924988, + "learning_rate": 3.860396708349281e-05, + "loss": 0.2091, + "step": 2262 + }, + { + "epoch": 1.3769394584727714, + "grad_norm": 0.9052261114120483, + "learning_rate": 3.859352102942134e-05, + "loss": 0.2031, + "step": 2263 + }, + { + "epoch": 1.3775479160328568, + "grad_norm": 0.8526843190193176, + "learning_rate": 3.8583071604723256e-05, + "loss": 0.1998, + "step": 2264 + }, + { + "epoch": 1.3781563735929419, + "grad_norm": 1.0175981521606445, + "learning_rate": 3.857261881198958e-05, + "loss": 0.209, + "step": 2265 + }, + { + "epoch": 1.3787648311530272, + "grad_norm": 1.1324195861816406, + "learning_rate": 3.856216265381219e-05, + "loss": 0.2031, + "step": 2266 + }, + { + "epoch": 1.3793732887131123, + "grad_norm": 1.1673684120178223, + "learning_rate": 3.8551703132783745e-05, + "loss": 0.2526, + "step": 2267 + }, + { + "epoch": 1.3799817462731974, + "grad_norm": 0.929223895072937, + "learning_rate": 3.854124025149778e-05, + "loss": 0.2443, + "step": 2268 + }, + { + "epoch": 1.3805902038332827, + "grad_norm": 0.9426333904266357, + "learning_rate": 3.8530774012548674e-05, + "loss": 0.2002, + "step": 2269 + }, + { + "epoch": 1.3811986613933678, + "grad_norm": 0.7749611139297485, + "learning_rate": 3.85203044185316e-05, + "loss": 0.2044, + "step": 2270 + }, + { + "epoch": 1.381807118953453, + "grad_norm": 0.8301417231559753, + "learning_rate": 3.8509831472042585e-05, + "loss": 0.2198, + "step": 2271 + }, + { + "epoch": 1.3824155765135382, + "grad_norm": 1.038061499595642, + "learning_rate": 3.849935517567848e-05, + "loss": 0.2578, + "step": 2272 + }, + { + "epoch": 1.3830240340736233, + "grad_norm": 0.9880487322807312, + "learning_rate": 3.8488875532036975e-05, + "loss": 0.2688, + "step": 2273 + }, + { + "epoch": 1.3836324916337086, + "grad_norm": 0.9047033190727234, + "learning_rate": 3.847839254371658e-05, + "loss": 0.2179, + "step": 2274 + }, + { + "epoch": 1.3842409491937937, + "grad_norm": 0.9710778594017029, + "learning_rate": 3.8467906213316636e-05, + "loss": 0.2569, + "step": 2275 + }, + { + "epoch": 1.384849406753879, + "grad_norm": 1.0498100519180298, + "learning_rate": 3.845741654343733e-05, + "loss": 0.2389, + "step": 2276 + }, + { + "epoch": 1.3854578643139641, + "grad_norm": 0.9659872055053711, + "learning_rate": 3.8446923536679644e-05, + "loss": 0.2312, + "step": 2277 + }, + { + "epoch": 1.3860663218740492, + "grad_norm": 0.956907331943512, + "learning_rate": 3.843642719564542e-05, + "loss": 0.2527, + "step": 2278 + }, + { + "epoch": 1.3866747794341345, + "grad_norm": 0.9140104651451111, + "learning_rate": 3.842592752293731e-05, + "loss": 0.2255, + "step": 2279 + }, + { + "epoch": 1.3872832369942196, + "grad_norm": 0.8140145540237427, + "learning_rate": 3.8415424521158804e-05, + "loss": 0.2145, + "step": 2280 + }, + { + "epoch": 1.387891694554305, + "grad_norm": 0.9981208443641663, + "learning_rate": 3.8404918192914184e-05, + "loss": 0.2338, + "step": 2281 + }, + { + "epoch": 1.38850015211439, + "grad_norm": 1.1599704027175903, + "learning_rate": 3.839440854080861e-05, + "loss": 0.3089, + "step": 2282 + }, + { + "epoch": 1.3891086096744751, + "grad_norm": 0.9346492886543274, + "learning_rate": 3.8383895567448015e-05, + "loss": 0.1997, + "step": 2283 + }, + { + "epoch": 1.3897170672345605, + "grad_norm": 0.9026308655738831, + "learning_rate": 3.8373379275439194e-05, + "loss": 0.198, + "step": 2284 + }, + { + "epoch": 1.3903255247946456, + "grad_norm": 0.8270521759986877, + "learning_rate": 3.836285966738974e-05, + "loss": 0.2308, + "step": 2285 + }, + { + "epoch": 1.3909339823547309, + "grad_norm": 0.8808834552764893, + "learning_rate": 3.8352336745908076e-05, + "loss": 0.2238, + "step": 2286 + }, + { + "epoch": 1.391542439914816, + "grad_norm": 0.8379645347595215, + "learning_rate": 3.834181051360346e-05, + "loss": 0.1938, + "step": 2287 + }, + { + "epoch": 1.392150897474901, + "grad_norm": 0.894270658493042, + "learning_rate": 3.833128097308594e-05, + "loss": 0.2172, + "step": 2288 + }, + { + "epoch": 1.3927593550349864, + "grad_norm": 0.8662185668945312, + "learning_rate": 3.832074812696642e-05, + "loss": 0.2332, + "step": 2289 + }, + { + "epoch": 1.3933678125950715, + "grad_norm": 1.1685889959335327, + "learning_rate": 3.8310211977856605e-05, + "loss": 0.254, + "step": 2290 + }, + { + "epoch": 1.3939762701551568, + "grad_norm": 0.9299534559249878, + "learning_rate": 3.8299672528369014e-05, + "loss": 0.2024, + "step": 2291 + }, + { + "epoch": 1.394584727715242, + "grad_norm": 0.7964719533920288, + "learning_rate": 3.828912978111699e-05, + "loss": 0.1907, + "step": 2292 + }, + { + "epoch": 1.395193185275327, + "grad_norm": 0.9740342497825623, + "learning_rate": 3.8278583738714696e-05, + "loss": 0.2479, + "step": 2293 + }, + { + "epoch": 1.3958016428354123, + "grad_norm": 1.016251564025879, + "learning_rate": 3.826803440377712e-05, + "loss": 0.2517, + "step": 2294 + }, + { + "epoch": 1.3964101003954974, + "grad_norm": 1.0677728652954102, + "learning_rate": 3.8257481778920045e-05, + "loss": 0.23, + "step": 2295 + }, + { + "epoch": 1.3970185579555827, + "grad_norm": 0.9394170045852661, + "learning_rate": 3.824692586676009e-05, + "loss": 0.2383, + "step": 2296 + }, + { + "epoch": 1.3976270155156678, + "grad_norm": 0.9719864130020142, + "learning_rate": 3.823636666991468e-05, + "loss": 0.2135, + "step": 2297 + }, + { + "epoch": 1.398235473075753, + "grad_norm": 0.9637436270713806, + "learning_rate": 3.8225804191002054e-05, + "loss": 0.2471, + "step": 2298 + }, + { + "epoch": 1.3988439306358382, + "grad_norm": 1.0839718580245972, + "learning_rate": 3.821523843264127e-05, + "loss": 0.2424, + "step": 2299 + }, + { + "epoch": 1.3994523881959233, + "grad_norm": 0.9015427231788635, + "learning_rate": 3.82046693974522e-05, + "loss": 0.2568, + "step": 2300 + }, + { + "epoch": 1.4000608457560086, + "grad_norm": 0.9065006971359253, + "learning_rate": 3.8194097088055505e-05, + "loss": 0.1932, + "step": 2301 + }, + { + "epoch": 1.4006693033160937, + "grad_norm": 0.876019299030304, + "learning_rate": 3.818352150707269e-05, + "loss": 0.2278, + "step": 2302 + }, + { + "epoch": 1.4012777608761788, + "grad_norm": 0.8508959412574768, + "learning_rate": 3.817294265712606e-05, + "loss": 0.2087, + "step": 2303 + }, + { + "epoch": 1.4018862184362642, + "grad_norm": 0.8045951128005981, + "learning_rate": 3.8162360540838726e-05, + "loss": 0.1957, + "step": 2304 + }, + { + "epoch": 1.4024946759963492, + "grad_norm": 0.893156111240387, + "learning_rate": 3.815177516083461e-05, + "loss": 0.1938, + "step": 2305 + }, + { + "epoch": 1.4031031335564346, + "grad_norm": 0.9133938550949097, + "learning_rate": 3.8141186519738456e-05, + "loss": 0.2445, + "step": 2306 + }, + { + "epoch": 1.4037115911165197, + "grad_norm": 0.9459069967269897, + "learning_rate": 3.8130594620175786e-05, + "loss": 0.2532, + "step": 2307 + }, + { + "epoch": 1.4043200486766048, + "grad_norm": 0.9361640810966492, + "learning_rate": 3.811999946477296e-05, + "loss": 0.2845, + "step": 2308 + }, + { + "epoch": 1.4049285062366899, + "grad_norm": 0.7425569295883179, + "learning_rate": 3.810940105615715e-05, + "loss": 0.1709, + "step": 2309 + }, + { + "epoch": 1.4055369637967752, + "grad_norm": 0.9451609253883362, + "learning_rate": 3.8098799396956284e-05, + "loss": 0.2082, + "step": 2310 + }, + { + "epoch": 1.4061454213568605, + "grad_norm": 0.9120255708694458, + "learning_rate": 3.808819448979917e-05, + "loss": 0.2128, + "step": 2311 + }, + { + "epoch": 1.4067538789169456, + "grad_norm": 0.9225917458534241, + "learning_rate": 3.8077586337315365e-05, + "loss": 0.2095, + "step": 2312 + }, + { + "epoch": 1.4073623364770307, + "grad_norm": 0.9341639876365662, + "learning_rate": 3.8066974942135234e-05, + "loss": 0.2372, + "step": 2313 + }, + { + "epoch": 1.4079707940371158, + "grad_norm": 0.9186847805976868, + "learning_rate": 3.8056360306889985e-05, + "loss": 0.2227, + "step": 2314 + }, + { + "epoch": 1.408579251597201, + "grad_norm": 0.8264140486717224, + "learning_rate": 3.8045742434211595e-05, + "loss": 0.1934, + "step": 2315 + }, + { + "epoch": 1.4091877091572864, + "grad_norm": 0.8795403242111206, + "learning_rate": 3.803512132673286e-05, + "loss": 0.2349, + "step": 2316 + }, + { + "epoch": 1.4097961667173715, + "grad_norm": 0.9693627953529358, + "learning_rate": 3.802449698708736e-05, + "loss": 0.2146, + "step": 2317 + }, + { + "epoch": 1.4104046242774566, + "grad_norm": 0.9687555432319641, + "learning_rate": 3.8013869417909496e-05, + "loss": 0.2455, + "step": 2318 + }, + { + "epoch": 1.4110130818375417, + "grad_norm": 4.701800346374512, + "learning_rate": 3.800323862183446e-05, + "loss": 0.2436, + "step": 2319 + }, + { + "epoch": 1.411621539397627, + "grad_norm": 0.9624002575874329, + "learning_rate": 3.799260460149825e-05, + "loss": 0.2143, + "step": 2320 + }, + { + "epoch": 1.4122299969577121, + "grad_norm": 1.0840182304382324, + "learning_rate": 3.7981967359537656e-05, + "loss": 0.219, + "step": 2321 + }, + { + "epoch": 1.4128384545177974, + "grad_norm": 0.9719953536987305, + "learning_rate": 3.797132689859027e-05, + "loss": 0.2347, + "step": 2322 + }, + { + "epoch": 1.4134469120778825, + "grad_norm": 0.9404840469360352, + "learning_rate": 3.796068322129449e-05, + "loss": 0.2089, + "step": 2323 + }, + { + "epoch": 1.4140553696379676, + "grad_norm": 0.7975339889526367, + "learning_rate": 3.795003633028949e-05, + "loss": 0.2036, + "step": 2324 + }, + { + "epoch": 1.414663827198053, + "grad_norm": 0.9089730978012085, + "learning_rate": 3.793938622821528e-05, + "loss": 0.2158, + "step": 2325 + }, + { + "epoch": 1.415272284758138, + "grad_norm": 1.023104190826416, + "learning_rate": 3.792873291771261e-05, + "loss": 0.2297, + "step": 2326 + }, + { + "epoch": 1.4158807423182234, + "grad_norm": 0.9837586283683777, + "learning_rate": 3.7918076401423076e-05, + "loss": 0.2383, + "step": 2327 + }, + { + "epoch": 1.4164891998783085, + "grad_norm": 0.9490445852279663, + "learning_rate": 3.790741668198906e-05, + "loss": 0.2691, + "step": 2328 + }, + { + "epoch": 1.4170976574383936, + "grad_norm": 1.8430839776992798, + "learning_rate": 3.7896753762053693e-05, + "loss": 0.2033, + "step": 2329 + }, + { + "epoch": 1.4177061149984789, + "grad_norm": 0.8883879780769348, + "learning_rate": 3.788608764426097e-05, + "loss": 0.2133, + "step": 2330 + }, + { + "epoch": 1.418314572558564, + "grad_norm": 0.9225703477859497, + "learning_rate": 3.787541833125563e-05, + "loss": 0.1979, + "step": 2331 + }, + { + "epoch": 1.4189230301186493, + "grad_norm": 0.8261762261390686, + "learning_rate": 3.786474582568321e-05, + "loss": 0.2257, + "step": 2332 + }, + { + "epoch": 1.4195314876787344, + "grad_norm": 0.8989948630332947, + "learning_rate": 3.785407013019006e-05, + "loss": 0.2048, + "step": 2333 + }, + { + "epoch": 1.4201399452388195, + "grad_norm": 0.8330014944076538, + "learning_rate": 3.78433912474233e-05, + "loss": 0.2363, + "step": 2334 + }, + { + "epoch": 1.4207484027989048, + "grad_norm": 0.8741576075553894, + "learning_rate": 3.783270918003085e-05, + "loss": 0.2149, + "step": 2335 + }, + { + "epoch": 1.42135686035899, + "grad_norm": 0.9070691466331482, + "learning_rate": 3.782202393066141e-05, + "loss": 0.2316, + "step": 2336 + }, + { + "epoch": 1.4219653179190752, + "grad_norm": 0.9362895488739014, + "learning_rate": 3.7811335501964495e-05, + "loss": 0.2179, + "step": 2337 + }, + { + "epoch": 1.4225737754791603, + "grad_norm": 0.8994218111038208, + "learning_rate": 3.7800643896590375e-05, + "loss": 0.1861, + "step": 2338 + }, + { + "epoch": 1.4231822330392454, + "grad_norm": 0.9640007019042969, + "learning_rate": 3.778994911719013e-05, + "loss": 0.2578, + "step": 2339 + }, + { + "epoch": 1.4237906905993307, + "grad_norm": 0.8841598629951477, + "learning_rate": 3.7779251166415606e-05, + "loss": 0.2208, + "step": 2340 + }, + { + "epoch": 1.4243991481594158, + "grad_norm": 0.7951266765594482, + "learning_rate": 3.776855004691946e-05, + "loss": 0.1759, + "step": 2341 + }, + { + "epoch": 1.4250076057195011, + "grad_norm": 0.9464953541755676, + "learning_rate": 3.775784576135513e-05, + "loss": 0.2117, + "step": 2342 + }, + { + "epoch": 1.4256160632795862, + "grad_norm": 0.8788123726844788, + "learning_rate": 3.774713831237682e-05, + "loss": 0.2078, + "step": 2343 + }, + { + "epoch": 1.4262245208396713, + "grad_norm": 0.9868049621582031, + "learning_rate": 3.7736427702639526e-05, + "loss": 0.2355, + "step": 2344 + }, + { + "epoch": 1.4268329783997566, + "grad_norm": 1.0244841575622559, + "learning_rate": 3.7725713934799045e-05, + "loss": 0.2093, + "step": 2345 + }, + { + "epoch": 1.4274414359598417, + "grad_norm": 0.9810612797737122, + "learning_rate": 3.7714997011511956e-05, + "loss": 0.1909, + "step": 2346 + }, + { + "epoch": 1.428049893519927, + "grad_norm": 0.8638327717781067, + "learning_rate": 3.770427693543558e-05, + "loss": 0.209, + "step": 2347 + }, + { + "epoch": 1.4286583510800122, + "grad_norm": 0.9180063009262085, + "learning_rate": 3.769355370922807e-05, + "loss": 0.1966, + "step": 2348 + }, + { + "epoch": 1.4292668086400973, + "grad_norm": 2.471264600753784, + "learning_rate": 3.768282733554833e-05, + "loss": 0.189, + "step": 2349 + }, + { + "epoch": 1.4298752662001826, + "grad_norm": 0.8565406799316406, + "learning_rate": 3.7672097817056065e-05, + "loss": 0.2314, + "step": 2350 + }, + { + "epoch": 1.4304837237602677, + "grad_norm": 0.9157159924507141, + "learning_rate": 3.766136515641174e-05, + "loss": 0.2094, + "step": 2351 + }, + { + "epoch": 1.431092181320353, + "grad_norm": 0.7455861568450928, + "learning_rate": 3.76506293562766e-05, + "loss": 0.1656, + "step": 2352 + }, + { + "epoch": 1.431700638880438, + "grad_norm": 0.8483655452728271, + "learning_rate": 3.7639890419312694e-05, + "loss": 0.2195, + "step": 2353 + }, + { + "epoch": 1.4323090964405232, + "grad_norm": 0.9521898627281189, + "learning_rate": 3.7629148348182807e-05, + "loss": 0.1907, + "step": 2354 + }, + { + "epoch": 1.4329175540006085, + "grad_norm": 0.9046710133552551, + "learning_rate": 3.761840314555055e-05, + "loss": 0.1728, + "step": 2355 + }, + { + "epoch": 1.4335260115606936, + "grad_norm": 0.9136427044868469, + "learning_rate": 3.760765481408027e-05, + "loss": 0.2079, + "step": 2356 + }, + { + "epoch": 1.434134469120779, + "grad_norm": 0.7689423561096191, + "learning_rate": 3.759690335643711e-05, + "loss": 0.1898, + "step": 2357 + }, + { + "epoch": 1.434742926680864, + "grad_norm": 0.868561327457428, + "learning_rate": 3.758614877528698e-05, + "loss": 0.1768, + "step": 2358 + }, + { + "epoch": 1.435351384240949, + "grad_norm": 0.889870285987854, + "learning_rate": 3.7575391073296575e-05, + "loss": 0.2147, + "step": 2359 + }, + { + "epoch": 1.4359598418010344, + "grad_norm": 1.013046145439148, + "learning_rate": 3.756463025313335e-05, + "loss": 0.2365, + "step": 2360 + }, + { + "epoch": 1.4365682993611195, + "grad_norm": 1.0037925243377686, + "learning_rate": 3.7553866317465536e-05, + "loss": 0.2229, + "step": 2361 + }, + { + "epoch": 1.4371767569212048, + "grad_norm": 0.910197913646698, + "learning_rate": 3.754309926896215e-05, + "loss": 0.2133, + "step": 2362 + }, + { + "epoch": 1.43778521448129, + "grad_norm": 0.8792757987976074, + "learning_rate": 3.7532329110292966e-05, + "loss": 0.2481, + "step": 2363 + }, + { + "epoch": 1.438393672041375, + "grad_norm": 0.8445896506309509, + "learning_rate": 3.752155584412854e-05, + "loss": 0.2402, + "step": 2364 + }, + { + "epoch": 1.4390021296014603, + "grad_norm": 0.8201121687889099, + "learning_rate": 3.751077947314019e-05, + "loss": 0.1696, + "step": 2365 + }, + { + "epoch": 1.4396105871615454, + "grad_norm": 0.9546027183532715, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.2487, + "step": 2366 + }, + { + "epoch": 1.4402190447216308, + "grad_norm": 0.9001442193984985, + "learning_rate": 3.748921742738084e-05, + "loss": 0.2163, + "step": 2367 + }, + { + "epoch": 1.4408275022817159, + "grad_norm": 0.7833166122436523, + "learning_rate": 3.7478431757956335e-05, + "loss": 0.2015, + "step": 2368 + }, + { + "epoch": 1.441435959841801, + "grad_norm": 0.7708922028541565, + "learning_rate": 3.746764299440087e-05, + "loss": 0.2106, + "step": 2369 + }, + { + "epoch": 1.4420444174018863, + "grad_norm": 0.7704883217811584, + "learning_rate": 3.745685113938963e-05, + "loss": 0.2011, + "step": 2370 + }, + { + "epoch": 1.4426528749619714, + "grad_norm": 0.7907694578170776, + "learning_rate": 3.7446056195598536e-05, + "loss": 0.2023, + "step": 2371 + }, + { + "epoch": 1.4432613325220567, + "grad_norm": 1.1114675998687744, + "learning_rate": 3.743525816570428e-05, + "loss": 0.1887, + "step": 2372 + }, + { + "epoch": 1.4438697900821418, + "grad_norm": 0.7749055624008179, + "learning_rate": 3.742445705238432e-05, + "loss": 0.1806, + "step": 2373 + }, + { + "epoch": 1.4444782476422269, + "grad_norm": 0.8483683466911316, + "learning_rate": 3.74136528583169e-05, + "loss": 0.1959, + "step": 2374 + }, + { + "epoch": 1.4450867052023122, + "grad_norm": 0.9588725566864014, + "learning_rate": 3.740284558618099e-05, + "loss": 0.2355, + "step": 2375 + }, + { + "epoch": 1.4456951627623973, + "grad_norm": 0.9711139798164368, + "learning_rate": 3.739203523865635e-05, + "loss": 0.2419, + "step": 2376 + }, + { + "epoch": 1.4463036203224826, + "grad_norm": 0.8849513530731201, + "learning_rate": 3.738122181842349e-05, + "loss": 0.2023, + "step": 2377 + }, + { + "epoch": 1.4469120778825677, + "grad_norm": 0.8123235702514648, + "learning_rate": 3.73704053281637e-05, + "loss": 0.2084, + "step": 2378 + }, + { + "epoch": 1.4475205354426528, + "grad_norm": 0.9523190259933472, + "learning_rate": 3.7359585770559024e-05, + "loss": 0.2318, + "step": 2379 + }, + { + "epoch": 1.4481289930027381, + "grad_norm": 0.7958305478096008, + "learning_rate": 3.7348763148292236e-05, + "loss": 0.2133, + "step": 2380 + }, + { + "epoch": 1.4487374505628232, + "grad_norm": 0.9427193403244019, + "learning_rate": 3.733793746404692e-05, + "loss": 0.1828, + "step": 2381 + }, + { + "epoch": 1.4493459081229085, + "grad_norm": 0.9071337580680847, + "learning_rate": 3.732710872050737e-05, + "loss": 0.1915, + "step": 2382 + }, + { + "epoch": 1.4499543656829936, + "grad_norm": 0.9505900740623474, + "learning_rate": 3.731627692035869e-05, + "loss": 0.2522, + "step": 2383 + }, + { + "epoch": 1.4505628232430787, + "grad_norm": 0.9712157845497131, + "learning_rate": 3.730544206628669e-05, + "loss": 0.2114, + "step": 2384 + }, + { + "epoch": 1.451171280803164, + "grad_norm": 0.8542957901954651, + "learning_rate": 3.729460416097797e-05, + "loss": 0.1831, + "step": 2385 + }, + { + "epoch": 1.4517797383632491, + "grad_norm": 0.9015089869499207, + "learning_rate": 3.7283763207119894e-05, + "loss": 0.1942, + "step": 2386 + }, + { + "epoch": 1.4523881959233345, + "grad_norm": 0.8781471848487854, + "learning_rate": 3.7272919207400556e-05, + "loss": 0.2114, + "step": 2387 + }, + { + "epoch": 1.4529966534834196, + "grad_norm": 0.9101613163948059, + "learning_rate": 3.72620721645088e-05, + "loss": 0.1945, + "step": 2388 + }, + { + "epoch": 1.4536051110435046, + "grad_norm": 0.9514543414115906, + "learning_rate": 3.725122208113427e-05, + "loss": 0.2435, + "step": 2389 + }, + { + "epoch": 1.45421356860359, + "grad_norm": 0.9269886612892151, + "learning_rate": 3.724036895996732e-05, + "loss": 0.2401, + "step": 2390 + }, + { + "epoch": 1.454822026163675, + "grad_norm": 0.9546543955802917, + "learning_rate": 3.722951280369906e-05, + "loss": 0.2285, + "step": 2391 + }, + { + "epoch": 1.4554304837237604, + "grad_norm": 0.8362632393836975, + "learning_rate": 3.7218653615021395e-05, + "loss": 0.2115, + "step": 2392 + }, + { + "epoch": 1.4560389412838455, + "grad_norm": 0.9519213438034058, + "learning_rate": 3.720779139662691e-05, + "loss": 0.2214, + "step": 2393 + }, + { + "epoch": 1.4566473988439306, + "grad_norm": 0.7873554825782776, + "learning_rate": 3.719692615120902e-05, + "loss": 0.2028, + "step": 2394 + }, + { + "epoch": 1.457255856404016, + "grad_norm": 0.8509390354156494, + "learning_rate": 3.718605788146183e-05, + "loss": 0.2084, + "step": 2395 + }, + { + "epoch": 1.457864313964101, + "grad_norm": 0.9013881087303162, + "learning_rate": 3.717518659008023e-05, + "loss": 0.2139, + "step": 2396 + }, + { + "epoch": 1.4584727715241863, + "grad_norm": 0.8233511447906494, + "learning_rate": 3.7164312279759836e-05, + "loss": 0.1946, + "step": 2397 + }, + { + "epoch": 1.4590812290842714, + "grad_norm": 0.8857746124267578, + "learning_rate": 3.7153434953197044e-05, + "loss": 0.1972, + "step": 2398 + }, + { + "epoch": 1.4596896866443565, + "grad_norm": 0.793010950088501, + "learning_rate": 3.714255461308895e-05, + "loss": 0.184, + "step": 2399 + }, + { + "epoch": 1.4602981442044418, + "grad_norm": 0.9649643301963806, + "learning_rate": 3.7131671262133444e-05, + "loss": 0.2153, + "step": 2400 + }, + { + "epoch": 1.460906601764527, + "grad_norm": 0.9244347810745239, + "learning_rate": 3.712078490302913e-05, + "loss": 0.1921, + "step": 2401 + }, + { + "epoch": 1.4615150593246122, + "grad_norm": 1.1217639446258545, + "learning_rate": 3.7109895538475394e-05, + "loss": 0.2125, + "step": 2402 + }, + { + "epoch": 1.4621235168846973, + "grad_norm": 0.8992077112197876, + "learning_rate": 3.709900317117232e-05, + "loss": 0.1979, + "step": 2403 + }, + { + "epoch": 1.4627319744447824, + "grad_norm": 0.9109466075897217, + "learning_rate": 3.708810780382077e-05, + "loss": 0.1959, + "step": 2404 + }, + { + "epoch": 1.4633404320048677, + "grad_norm": 0.8444501161575317, + "learning_rate": 3.707720943912235e-05, + "loss": 0.1709, + "step": 2405 + }, + { + "epoch": 1.4639488895649528, + "grad_norm": 0.9722615480422974, + "learning_rate": 3.706630807977938e-05, + "loss": 0.2304, + "step": 2406 + }, + { + "epoch": 1.4645573471250382, + "grad_norm": 0.9228814244270325, + "learning_rate": 3.705540372849496e-05, + "loss": 0.2209, + "step": 2407 + }, + { + "epoch": 1.4651658046851233, + "grad_norm": 0.9532406330108643, + "learning_rate": 3.7044496387972914e-05, + "loss": 0.2162, + "step": 2408 + }, + { + "epoch": 1.4657742622452083, + "grad_norm": 0.7384613156318665, + "learning_rate": 3.7033586060917795e-05, + "loss": 0.1676, + "step": 2409 + }, + { + "epoch": 1.4663827198052934, + "grad_norm": 0.804396390914917, + "learning_rate": 3.7022672750034926e-05, + "loss": 0.2018, + "step": 2410 + }, + { + "epoch": 1.4669911773653788, + "grad_norm": 0.8668960332870483, + "learning_rate": 3.701175645803034e-05, + "loss": 0.238, + "step": 2411 + }, + { + "epoch": 1.467599634925464, + "grad_norm": 0.8890330195426941, + "learning_rate": 3.7000837187610826e-05, + "loss": 0.1914, + "step": 2412 + }, + { + "epoch": 1.4682080924855492, + "grad_norm": 1.1121948957443237, + "learning_rate": 3.698991494148391e-05, + "loss": 0.2169, + "step": 2413 + }, + { + "epoch": 1.4688165500456343, + "grad_norm": 1.0296958684921265, + "learning_rate": 3.697898972235785e-05, + "loss": 0.2164, + "step": 2414 + }, + { + "epoch": 1.4694250076057194, + "grad_norm": 0.9091324210166931, + "learning_rate": 3.6968061532941654e-05, + "loss": 0.207, + "step": 2415 + }, + { + "epoch": 1.4700334651658047, + "grad_norm": 0.8409296274185181, + "learning_rate": 3.695713037594505e-05, + "loss": 0.2317, + "step": 2416 + }, + { + "epoch": 1.47064192272589, + "grad_norm": 0.8644183278083801, + "learning_rate": 3.6946196254078515e-05, + "loss": 0.2301, + "step": 2417 + }, + { + "epoch": 1.471250380285975, + "grad_norm": 0.9687103033065796, + "learning_rate": 3.693525917005324e-05, + "loss": 0.2214, + "step": 2418 + }, + { + "epoch": 1.4718588378460602, + "grad_norm": 0.8268604278564453, + "learning_rate": 3.692431912658118e-05, + "loss": 0.2614, + "step": 2419 + }, + { + "epoch": 1.4724672954061453, + "grad_norm": 0.8227600455284119, + "learning_rate": 3.691337612637501e-05, + "loss": 0.1931, + "step": 2420 + }, + { + "epoch": 1.4730757529662306, + "grad_norm": 0.8828481435775757, + "learning_rate": 3.690243017214813e-05, + "loss": 0.1842, + "step": 2421 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.7842615246772766, + "learning_rate": 3.689148126661469e-05, + "loss": 0.2161, + "step": 2422 + }, + { + "epoch": 1.474292668086401, + "grad_norm": 0.9619740843772888, + "learning_rate": 3.688052941248956e-05, + "loss": 0.2421, + "step": 2423 + }, + { + "epoch": 1.4749011256464861, + "grad_norm": 0.8373282551765442, + "learning_rate": 3.686957461248833e-05, + "loss": 0.2045, + "step": 2424 + }, + { + "epoch": 1.4755095832065712, + "grad_norm": 0.9472412467002869, + "learning_rate": 3.685861686932735e-05, + "loss": 0.2319, + "step": 2425 + }, + { + "epoch": 1.4761180407666565, + "grad_norm": 0.9255704283714294, + "learning_rate": 3.6847656185723686e-05, + "loss": 0.2266, + "step": 2426 + }, + { + "epoch": 1.4767264983267416, + "grad_norm": 0.851377010345459, + "learning_rate": 3.683669256439511e-05, + "loss": 0.2215, + "step": 2427 + }, + { + "epoch": 1.477334955886827, + "grad_norm": 0.9094513654708862, + "learning_rate": 3.6825726008060155e-05, + "loss": 0.2713, + "step": 2428 + }, + { + "epoch": 1.477943413446912, + "grad_norm": 0.9373210668563843, + "learning_rate": 3.681475651943809e-05, + "loss": 0.2021, + "step": 2429 + }, + { + "epoch": 1.4785518710069971, + "grad_norm": 0.8862281441688538, + "learning_rate": 3.680378410124885e-05, + "loss": 0.2172, + "step": 2430 + }, + { + "epoch": 1.4791603285670825, + "grad_norm": 0.9415962100028992, + "learning_rate": 3.6792808756213166e-05, + "loss": 0.2334, + "step": 2431 + }, + { + "epoch": 1.4797687861271676, + "grad_norm": 0.8250126838684082, + "learning_rate": 3.678183048705246e-05, + "loss": 0.2086, + "step": 2432 + }, + { + "epoch": 1.4803772436872529, + "grad_norm": 0.8701198697090149, + "learning_rate": 3.6770849296488885e-05, + "loss": 0.218, + "step": 2433 + }, + { + "epoch": 1.480985701247338, + "grad_norm": 0.7581288814544678, + "learning_rate": 3.675986518724532e-05, + "loss": 0.1843, + "step": 2434 + }, + { + "epoch": 1.481594158807423, + "grad_norm": 0.8139165043830872, + "learning_rate": 3.674887816204536e-05, + "loss": 0.1995, + "step": 2435 + }, + { + "epoch": 1.4822026163675084, + "grad_norm": 0.9862119555473328, + "learning_rate": 3.6737888223613323e-05, + "loss": 0.173, + "step": 2436 + }, + { + "epoch": 1.4828110739275935, + "grad_norm": 0.9770458936691284, + "learning_rate": 3.6726895374674286e-05, + "loss": 0.2326, + "step": 2437 + }, + { + "epoch": 1.4834195314876788, + "grad_norm": 0.9238155484199524, + "learning_rate": 3.671589961795399e-05, + "loss": 0.1974, + "step": 2438 + }, + { + "epoch": 1.484027989047764, + "grad_norm": 0.7940313816070557, + "learning_rate": 3.6704900956178924e-05, + "loss": 0.1888, + "step": 2439 + }, + { + "epoch": 1.484636446607849, + "grad_norm": 0.9048517942428589, + "learning_rate": 3.6693899392076306e-05, + "loss": 0.2122, + "step": 2440 + }, + { + "epoch": 1.4852449041679343, + "grad_norm": 0.8240786194801331, + "learning_rate": 3.6682894928374074e-05, + "loss": 0.169, + "step": 2441 + }, + { + "epoch": 1.4858533617280194, + "grad_norm": 0.9464930295944214, + "learning_rate": 3.6671887567800853e-05, + "loss": 0.1947, + "step": 2442 + }, + { + "epoch": 1.4864618192881047, + "grad_norm": 0.8999909162521362, + "learning_rate": 3.666087731308604e-05, + "loss": 0.2237, + "step": 2443 + }, + { + "epoch": 1.4870702768481898, + "grad_norm": 0.8082036972045898, + "learning_rate": 3.664986416695969e-05, + "loss": 0.1476, + "step": 2444 + }, + { + "epoch": 1.487678734408275, + "grad_norm": 0.8836786150932312, + "learning_rate": 3.663884813215263e-05, + "loss": 0.1995, + "step": 2445 + }, + { + "epoch": 1.4882871919683602, + "grad_norm": 0.8656254410743713, + "learning_rate": 3.6627829211396345e-05, + "loss": 0.2678, + "step": 2446 + }, + { + "epoch": 1.4888956495284453, + "grad_norm": 0.7731005549430847, + "learning_rate": 3.66168074074231e-05, + "loss": 0.1776, + "step": 2447 + }, + { + "epoch": 1.4895041070885306, + "grad_norm": 0.8301362991333008, + "learning_rate": 3.6605782722965834e-05, + "loss": 0.1878, + "step": 2448 + }, + { + "epoch": 1.4901125646486157, + "grad_norm": 0.8362447023391724, + "learning_rate": 3.65947551607582e-05, + "loss": 0.2027, + "step": 2449 + }, + { + "epoch": 1.4907210222087008, + "grad_norm": 0.9739522933959961, + "learning_rate": 3.6583724723534574e-05, + "loss": 0.2361, + "step": 2450 + }, + { + "epoch": 1.4913294797687862, + "grad_norm": 0.9524062275886536, + "learning_rate": 3.6572691414030055e-05, + "loss": 0.2252, + "step": 2451 + }, + { + "epoch": 1.4919379373288713, + "grad_norm": 1.0326991081237793, + "learning_rate": 3.656165523498044e-05, + "loss": 0.2235, + "step": 2452 + }, + { + "epoch": 1.4925463948889566, + "grad_norm": 0.9849575161933899, + "learning_rate": 3.655061618912224e-05, + "loss": 0.2279, + "step": 2453 + }, + { + "epoch": 1.4931548524490417, + "grad_norm": 0.8630450367927551, + "learning_rate": 3.653957427919268e-05, + "loss": 0.1915, + "step": 2454 + }, + { + "epoch": 1.4937633100091268, + "grad_norm": 0.845496654510498, + "learning_rate": 3.652852950792969e-05, + "loss": 0.1721, + "step": 2455 + }, + { + "epoch": 1.494371767569212, + "grad_norm": 1.2583898305892944, + "learning_rate": 3.651748187807191e-05, + "loss": 0.2039, + "step": 2456 + }, + { + "epoch": 1.4949802251292972, + "grad_norm": 0.7946431040763855, + "learning_rate": 3.650643139235871e-05, + "loss": 0.1687, + "step": 2457 + }, + { + "epoch": 1.4955886826893825, + "grad_norm": 1.014686107635498, + "learning_rate": 3.649537805353013e-05, + "loss": 0.2319, + "step": 2458 + }, + { + "epoch": 1.4961971402494676, + "grad_norm": 1.0813982486724854, + "learning_rate": 3.648432186432694e-05, + "loss": 0.2508, + "step": 2459 + }, + { + "epoch": 1.4968055978095527, + "grad_norm": 0.8511247038841248, + "learning_rate": 3.6473262827490614e-05, + "loss": 0.1737, + "step": 2460 + }, + { + "epoch": 1.497414055369638, + "grad_norm": 0.963362455368042, + "learning_rate": 3.646220094576334e-05, + "loss": 0.2431, + "step": 2461 + }, + { + "epoch": 1.498022512929723, + "grad_norm": 0.8051499724388123, + "learning_rate": 3.6451136221888005e-05, + "loss": 0.1954, + "step": 2462 + }, + { + "epoch": 1.4986309704898084, + "grad_norm": 0.8649519085884094, + "learning_rate": 3.644006865860819e-05, + "loss": 0.1716, + "step": 2463 + }, + { + "epoch": 1.4992394280498935, + "grad_norm": 0.9137884378433228, + "learning_rate": 3.642899825866819e-05, + "loss": 0.1964, + "step": 2464 + }, + { + "epoch": 1.4998478856099786, + "grad_norm": 0.7866896986961365, + "learning_rate": 3.641792502481301e-05, + "loss": 0.1809, + "step": 2465 + }, + { + "epoch": 1.500456343170064, + "grad_norm": 0.9844456911087036, + "learning_rate": 3.640684895978834e-05, + "loss": 0.2182, + "step": 2466 + }, + { + "epoch": 1.501064800730149, + "grad_norm": 0.9635726809501648, + "learning_rate": 3.6395770066340596e-05, + "loss": 0.2182, + "step": 2467 + }, + { + "epoch": 1.5016732582902343, + "grad_norm": 0.9193302392959595, + "learning_rate": 3.6384688347216875e-05, + "loss": 0.2455, + "step": 2468 + }, + { + "epoch": 1.5022817158503194, + "grad_norm": 1.0144778490066528, + "learning_rate": 3.637360380516498e-05, + "loss": 0.2717, + "step": 2469 + }, + { + "epoch": 1.5028901734104045, + "grad_norm": 0.8575990796089172, + "learning_rate": 3.6362516442933416e-05, + "loss": 0.2333, + "step": 2470 + }, + { + "epoch": 1.5034986309704899, + "grad_norm": 1.1676721572875977, + "learning_rate": 3.635142626327139e-05, + "loss": 0.2948, + "step": 2471 + }, + { + "epoch": 1.504107088530575, + "grad_norm": 0.7690470814704895, + "learning_rate": 3.634033326892879e-05, + "loss": 0.2268, + "step": 2472 + }, + { + "epoch": 1.5047155460906603, + "grad_norm": 0.8740969300270081, + "learning_rate": 3.632923746265623e-05, + "loss": 0.227, + "step": 2473 + }, + { + "epoch": 1.5053240036507454, + "grad_norm": 0.8315359354019165, + "learning_rate": 3.631813884720502e-05, + "loss": 0.2036, + "step": 2474 + }, + { + "epoch": 1.5059324612108305, + "grad_norm": 0.8344742059707642, + "learning_rate": 3.630703742532713e-05, + "loss": 0.2397, + "step": 2475 + }, + { + "epoch": 1.5065409187709158, + "grad_norm": 1.0899635553359985, + "learning_rate": 3.6295933199775265e-05, + "loss": 0.1978, + "step": 2476 + }, + { + "epoch": 1.5071493763310009, + "grad_norm": 0.9321961998939514, + "learning_rate": 3.628482617330279e-05, + "loss": 0.2296, + "step": 2477 + }, + { + "epoch": 1.5077578338910862, + "grad_norm": 0.8498799204826355, + "learning_rate": 3.627371634866381e-05, + "loss": 0.1986, + "step": 2478 + }, + { + "epoch": 1.5083662914511713, + "grad_norm": 0.8041394948959351, + "learning_rate": 3.626260372861308e-05, + "loss": 0.1844, + "step": 2479 + }, + { + "epoch": 1.5089747490112564, + "grad_norm": 0.8486339449882507, + "learning_rate": 3.625148831590608e-05, + "loss": 0.2032, + "step": 2480 + }, + { + "epoch": 1.5095832065713415, + "grad_norm": 0.817248523235321, + "learning_rate": 3.624037011329896e-05, + "loss": 0.1985, + "step": 2481 + }, + { + "epoch": 1.5101916641314268, + "grad_norm": 0.8181208372116089, + "learning_rate": 3.6229249123548574e-05, + "loss": 0.1856, + "step": 2482 + }, + { + "epoch": 1.5108001216915121, + "grad_norm": 0.7917095422744751, + "learning_rate": 3.621812534941246e-05, + "loss": 0.1694, + "step": 2483 + }, + { + "epoch": 1.5114085792515972, + "grad_norm": 0.8646035194396973, + "learning_rate": 3.620699879364886e-05, + "loss": 0.2158, + "step": 2484 + }, + { + "epoch": 1.5120170368116823, + "grad_norm": 0.9511632323265076, + "learning_rate": 3.619586945901669e-05, + "loss": 0.1938, + "step": 2485 + }, + { + "epoch": 1.5126254943717674, + "grad_norm": 0.7660108208656311, + "learning_rate": 3.618473734827556e-05, + "loss": 0.1687, + "step": 2486 + }, + { + "epoch": 1.5132339519318527, + "grad_norm": 0.772885799407959, + "learning_rate": 3.6173602464185765e-05, + "loss": 0.2055, + "step": 2487 + }, + { + "epoch": 1.513842409491938, + "grad_norm": 0.8760683536529541, + "learning_rate": 3.61624648095083e-05, + "loss": 0.1855, + "step": 2488 + }, + { + "epoch": 1.5144508670520231, + "grad_norm": 1.2097331285476685, + "learning_rate": 3.615132438700484e-05, + "loss": 0.2581, + "step": 2489 + }, + { + "epoch": 1.5150593246121082, + "grad_norm": 0.7943812012672424, + "learning_rate": 3.614018119943774e-05, + "loss": 0.2076, + "step": 2490 + }, + { + "epoch": 1.5156677821721933, + "grad_norm": 0.8257327079772949, + "learning_rate": 3.6129035249570053e-05, + "loss": 0.202, + "step": 2491 + }, + { + "epoch": 1.5162762397322787, + "grad_norm": 0.768048107624054, + "learning_rate": 3.6117886540165504e-05, + "loss": 0.1859, + "step": 2492 + }, + { + "epoch": 1.516884697292364, + "grad_norm": 0.8744530081748962, + "learning_rate": 3.6106735073988504e-05, + "loss": 0.1751, + "step": 2493 + }, + { + "epoch": 1.517493154852449, + "grad_norm": 0.9588749408721924, + "learning_rate": 3.6095580853804155e-05, + "loss": 0.2156, + "step": 2494 + }, + { + "epoch": 1.5181016124125342, + "grad_norm": 0.8880223035812378, + "learning_rate": 3.608442388237825e-05, + "loss": 0.2116, + "step": 2495 + }, + { + "epoch": 1.5187100699726193, + "grad_norm": 0.9411167502403259, + "learning_rate": 3.607326416247723e-05, + "loss": 0.217, + "step": 2496 + }, + { + "epoch": 1.5193185275327046, + "grad_norm": 0.9615315198898315, + "learning_rate": 3.606210169686827e-05, + "loss": 0.2129, + "step": 2497 + }, + { + "epoch": 1.51992698509279, + "grad_norm": 0.9808461666107178, + "learning_rate": 3.605093648831917e-05, + "loss": 0.2242, + "step": 2498 + }, + { + "epoch": 1.520535442652875, + "grad_norm": 1.2025115489959717, + "learning_rate": 3.603976853959845e-05, + "loss": 0.2028, + "step": 2499 + }, + { + "epoch": 1.52114390021296, + "grad_norm": 0.8565399646759033, + "learning_rate": 3.602859785347529e-05, + "loss": 0.228, + "step": 2500 + }, + { + "epoch": 1.5217523577730452, + "grad_norm": 0.769661545753479, + "learning_rate": 3.601742443271956e-05, + "loss": 0.1927, + "step": 2501 + }, + { + "epoch": 1.5223608153331305, + "grad_norm": 0.8793697953224182, + "learning_rate": 3.600624828010181e-05, + "loss": 0.199, + "step": 2502 + }, + { + "epoch": 1.5229692728932158, + "grad_norm": 0.9120194315910339, + "learning_rate": 3.599506939839323e-05, + "loss": 0.2185, + "step": 2503 + }, + { + "epoch": 1.523577730453301, + "grad_norm": 0.9638735055923462, + "learning_rate": 3.598388779036575e-05, + "loss": 0.2264, + "step": 2504 + }, + { + "epoch": 1.524186188013386, + "grad_norm": 0.9277864694595337, + "learning_rate": 3.597270345879192e-05, + "loss": 0.2542, + "step": 2505 + }, + { + "epoch": 1.524794645573471, + "grad_norm": 0.8778446316719055, + "learning_rate": 3.5961516406445e-05, + "loss": 0.2076, + "step": 2506 + }, + { + "epoch": 1.5254031031335564, + "grad_norm": 0.8511151671409607, + "learning_rate": 3.595032663609891e-05, + "loss": 0.1923, + "step": 2507 + }, + { + "epoch": 1.5260115606936417, + "grad_norm": 0.7811472415924072, + "learning_rate": 3.593913415052825e-05, + "loss": 0.1589, + "step": 2508 + }, + { + "epoch": 1.5266200182537268, + "grad_norm": 0.8555339574813843, + "learning_rate": 3.5927938952508284e-05, + "loss": 0.1826, + "step": 2509 + }, + { + "epoch": 1.527228475813812, + "grad_norm": 0.8637688159942627, + "learning_rate": 3.591674104481495e-05, + "loss": 0.193, + "step": 2510 + }, + { + "epoch": 1.527836933373897, + "grad_norm": 0.8080950379371643, + "learning_rate": 3.590554043022488e-05, + "loss": 0.1814, + "step": 2511 + }, + { + "epoch": 1.5284453909339824, + "grad_norm": 0.9069305658340454, + "learning_rate": 3.5894337111515344e-05, + "loss": 0.1547, + "step": 2512 + }, + { + "epoch": 1.5290538484940677, + "grad_norm": 0.9532658457756042, + "learning_rate": 3.58831310914643e-05, + "loss": 0.1953, + "step": 2513 + }, + { + "epoch": 1.5296623060541528, + "grad_norm": 1.2767683267593384, + "learning_rate": 3.5871922372850376e-05, + "loss": 0.2019, + "step": 2514 + }, + { + "epoch": 1.5302707636142379, + "grad_norm": 0.8507498502731323, + "learning_rate": 3.586071095845287e-05, + "loss": 0.2115, + "step": 2515 + }, + { + "epoch": 1.530879221174323, + "grad_norm": 1.0665339231491089, + "learning_rate": 3.5849496851051744e-05, + "loss": 0.2373, + "step": 2516 + }, + { + "epoch": 1.5314876787344083, + "grad_norm": 0.9984211325645447, + "learning_rate": 3.583828005342763e-05, + "loss": 0.245, + "step": 2517 + }, + { + "epoch": 1.5320961362944936, + "grad_norm": 0.850425124168396, + "learning_rate": 3.5827060568361817e-05, + "loss": 0.1987, + "step": 2518 + }, + { + "epoch": 1.5327045938545787, + "grad_norm": 1.2310839891433716, + "learning_rate": 3.5815838398636284e-05, + "loss": 0.2151, + "step": 2519 + }, + { + "epoch": 1.5333130514146638, + "grad_norm": 0.8494765162467957, + "learning_rate": 3.580461354703365e-05, + "loss": 0.245, + "step": 2520 + }, + { + "epoch": 1.5339215089747489, + "grad_norm": 0.929420530796051, + "learning_rate": 3.579338601633722e-05, + "loss": 0.24, + "step": 2521 + }, + { + "epoch": 1.5345299665348342, + "grad_norm": 0.7413504719734192, + "learning_rate": 3.578215580933095e-05, + "loss": 0.1886, + "step": 2522 + }, + { + "epoch": 1.5351384240949195, + "grad_norm": 0.9758164286613464, + "learning_rate": 3.577092292879946e-05, + "loss": 0.2134, + "step": 2523 + }, + { + "epoch": 1.5357468816550046, + "grad_norm": 0.8202041983604431, + "learning_rate": 3.5759687377528026e-05, + "loss": 0.1697, + "step": 2524 + }, + { + "epoch": 1.5363553392150897, + "grad_norm": 0.816893994808197, + "learning_rate": 3.574844915830263e-05, + "loss": 0.1809, + "step": 2525 + }, + { + "epoch": 1.5369637967751748, + "grad_norm": 0.8673279285430908, + "learning_rate": 3.573720827390984e-05, + "loss": 0.2501, + "step": 2526 + }, + { + "epoch": 1.5375722543352601, + "grad_norm": 0.8785646557807922, + "learning_rate": 3.572596472713696e-05, + "loss": 0.1991, + "step": 2527 + }, + { + "epoch": 1.5381807118953454, + "grad_norm": 0.8835013508796692, + "learning_rate": 3.5714718520771904e-05, + "loss": 0.1982, + "step": 2528 + }, + { + "epoch": 1.5387891694554305, + "grad_norm": 0.8082259297370911, + "learning_rate": 3.570346965760326e-05, + "loss": 0.174, + "step": 2529 + }, + { + "epoch": 1.5393976270155156, + "grad_norm": 0.869325578212738, + "learning_rate": 3.5692218140420295e-05, + "loss": 0.1844, + "step": 2530 + }, + { + "epoch": 1.5400060845756007, + "grad_norm": 0.9018534421920776, + "learning_rate": 3.5680963972012894e-05, + "loss": 0.1921, + "step": 2531 + }, + { + "epoch": 1.540614542135686, + "grad_norm": 0.8284593820571899, + "learning_rate": 3.566970715517164e-05, + "loss": 0.1818, + "step": 2532 + }, + { + "epoch": 1.5412229996957714, + "grad_norm": 0.7905526161193848, + "learning_rate": 3.565844769268774e-05, + "loss": 0.1917, + "step": 2533 + }, + { + "epoch": 1.5418314572558565, + "grad_norm": 0.7893586158752441, + "learning_rate": 3.564718558735308e-05, + "loss": 0.177, + "step": 2534 + }, + { + "epoch": 1.5424399148159416, + "grad_norm": 0.7778828144073486, + "learning_rate": 3.56359208419602e-05, + "loss": 0.2133, + "step": 2535 + }, + { + "epoch": 1.5430483723760267, + "grad_norm": 0.8154110312461853, + "learning_rate": 3.562465345930227e-05, + "loss": 0.2001, + "step": 2536 + }, + { + "epoch": 1.543656829936112, + "grad_norm": 0.8788914084434509, + "learning_rate": 3.561338344217314e-05, + "loss": 0.1756, + "step": 2537 + }, + { + "epoch": 1.5442652874961973, + "grad_norm": 0.7563880085945129, + "learning_rate": 3.560211079336731e-05, + "loss": 0.1944, + "step": 2538 + }, + { + "epoch": 1.5448737450562824, + "grad_norm": 0.7627280354499817, + "learning_rate": 3.559083551567991e-05, + "loss": 0.1919, + "step": 2539 + }, + { + "epoch": 1.5454822026163675, + "grad_norm": 1.1467633247375488, + "learning_rate": 3.5579557611906755e-05, + "loss": 0.1835, + "step": 2540 + }, + { + "epoch": 1.5460906601764526, + "grad_norm": 0.9292544722557068, + "learning_rate": 3.5568277084844295e-05, + "loss": 0.2053, + "step": 2541 + }, + { + "epoch": 1.546699117736538, + "grad_norm": 0.7936065196990967, + "learning_rate": 3.555699393728962e-05, + "loss": 0.1753, + "step": 2542 + }, + { + "epoch": 1.5473075752966232, + "grad_norm": 0.8932594060897827, + "learning_rate": 3.554570817204048e-05, + "loss": 0.1695, + "step": 2543 + }, + { + "epoch": 1.5479160328567083, + "grad_norm": 1.080686330795288, + "learning_rate": 3.553441979189529e-05, + "loss": 0.259, + "step": 2544 + }, + { + "epoch": 1.5485244904167934, + "grad_norm": 0.7938811182975769, + "learning_rate": 3.5523128799653084e-05, + "loss": 0.1673, + "step": 2545 + }, + { + "epoch": 1.5491329479768785, + "grad_norm": 0.8280820846557617, + "learning_rate": 3.551183519811356e-05, + "loss": 0.1884, + "step": 2546 + }, + { + "epoch": 1.5497414055369638, + "grad_norm": 0.8619610071182251, + "learning_rate": 3.550053899007707e-05, + "loss": 0.1851, + "step": 2547 + }, + { + "epoch": 1.5503498630970491, + "grad_norm": 0.824386477470398, + "learning_rate": 3.548924017834458e-05, + "loss": 0.1824, + "step": 2548 + }, + { + "epoch": 1.5509583206571342, + "grad_norm": 0.8854561448097229, + "learning_rate": 3.547793876571775e-05, + "loss": 0.1681, + "step": 2549 + }, + { + "epoch": 1.5515667782172193, + "grad_norm": 0.7834053635597229, + "learning_rate": 3.546663475499884e-05, + "loss": 0.1574, + "step": 2550 + }, + { + "epoch": 1.5521752357773044, + "grad_norm": 1.0175268650054932, + "learning_rate": 3.5455328148990794e-05, + "loss": 0.2224, + "step": 2551 + }, + { + "epoch": 1.5527836933373897, + "grad_norm": 0.7926445007324219, + "learning_rate": 3.544401895049716e-05, + "loss": 0.1664, + "step": 2552 + }, + { + "epoch": 1.553392150897475, + "grad_norm": 0.8977791666984558, + "learning_rate": 3.543270716232215e-05, + "loss": 0.2121, + "step": 2553 + }, + { + "epoch": 1.5540006084575602, + "grad_norm": 0.7897874116897583, + "learning_rate": 3.542139278727062e-05, + "loss": 0.2033, + "step": 2554 + }, + { + "epoch": 1.5546090660176453, + "grad_norm": 0.8928864598274231, + "learning_rate": 3.541007582814807e-05, + "loss": 0.1999, + "step": 2555 + }, + { + "epoch": 1.5552175235777304, + "grad_norm": 0.8382681608200073, + "learning_rate": 3.539875628776062e-05, + "loss": 0.1996, + "step": 2556 + }, + { + "epoch": 1.5558259811378157, + "grad_norm": 0.7771824598312378, + "learning_rate": 3.5387434168915065e-05, + "loss": 0.1788, + "step": 2557 + }, + { + "epoch": 1.5564344386979008, + "grad_norm": 0.9810817241668701, + "learning_rate": 3.53761094744188e-05, + "loss": 0.1917, + "step": 2558 + }, + { + "epoch": 1.557042896257986, + "grad_norm": 0.8481950163841248, + "learning_rate": 3.5364782207079886e-05, + "loss": 0.1748, + "step": 2559 + }, + { + "epoch": 1.5576513538180712, + "grad_norm": 0.8863506317138672, + "learning_rate": 3.5353452369707e-05, + "loss": 0.1898, + "step": 2560 + }, + { + "epoch": 1.5582598113781563, + "grad_norm": 0.9136548638343811, + "learning_rate": 3.534211996510949e-05, + "loss": 0.2652, + "step": 2561 + }, + { + "epoch": 1.5588682689382416, + "grad_norm": 0.814624011516571, + "learning_rate": 3.53307849960973e-05, + "loss": 0.1837, + "step": 2562 + }, + { + "epoch": 1.5594767264983267, + "grad_norm": 0.9348104000091553, + "learning_rate": 3.531944746548105e-05, + "loss": 0.1614, + "step": 2563 + }, + { + "epoch": 1.560085184058412, + "grad_norm": 0.817783534526825, + "learning_rate": 3.530810737607195e-05, + "loss": 0.1853, + "step": 2564 + }, + { + "epoch": 1.560693641618497, + "grad_norm": 0.8149775266647339, + "learning_rate": 3.529676473068189e-05, + "loss": 0.1745, + "step": 2565 + }, + { + "epoch": 1.5613020991785822, + "grad_norm": 0.8247946500778198, + "learning_rate": 3.5285419532123375e-05, + "loss": 0.1779, + "step": 2566 + }, + { + "epoch": 1.5619105567386675, + "grad_norm": 0.8637406826019287, + "learning_rate": 3.5274071783209525e-05, + "loss": 0.1981, + "step": 2567 + }, + { + "epoch": 1.5625190142987526, + "grad_norm": 0.7747231721878052, + "learning_rate": 3.5262721486754125e-05, + "loss": 0.1827, + "step": 2568 + }, + { + "epoch": 1.563127471858838, + "grad_norm": 0.9045551419258118, + "learning_rate": 3.525136864557156e-05, + "loss": 0.2214, + "step": 2569 + }, + { + "epoch": 1.563735929418923, + "grad_norm": 0.9226981997489929, + "learning_rate": 3.5240013262476866e-05, + "loss": 0.2045, + "step": 2570 + }, + { + "epoch": 1.5643443869790081, + "grad_norm": 0.793237566947937, + "learning_rate": 3.522865534028572e-05, + "loss": 0.1727, + "step": 2571 + }, + { + "epoch": 1.5649528445390934, + "grad_norm": 0.9380844235420227, + "learning_rate": 3.5217294881814386e-05, + "loss": 0.2171, + "step": 2572 + }, + { + "epoch": 1.5655613020991785, + "grad_norm": 0.964013397693634, + "learning_rate": 3.520593188987982e-05, + "loss": 0.2032, + "step": 2573 + }, + { + "epoch": 1.5661697596592639, + "grad_norm": 0.9411283731460571, + "learning_rate": 3.519456636729953e-05, + "loss": 0.1901, + "step": 2574 + }, + { + "epoch": 1.566778217219349, + "grad_norm": 0.8983286023139954, + "learning_rate": 3.518319831689172e-05, + "loss": 0.1837, + "step": 2575 + }, + { + "epoch": 1.567386674779434, + "grad_norm": 0.8221237659454346, + "learning_rate": 3.517182774147518e-05, + "loss": 0.1748, + "step": 2576 + }, + { + "epoch": 1.5679951323395194, + "grad_norm": 0.9628398418426514, + "learning_rate": 3.516045464386935e-05, + "loss": 0.1888, + "step": 2577 + }, + { + "epoch": 1.5686035898996045, + "grad_norm": 0.8376243710517883, + "learning_rate": 3.5149079026894266e-05, + "loss": 0.2096, + "step": 2578 + }, + { + "epoch": 1.5692120474596898, + "grad_norm": 0.8941128253936768, + "learning_rate": 3.513770089337063e-05, + "loss": 0.2027, + "step": 2579 + }, + { + "epoch": 1.5698205050197749, + "grad_norm": 0.9198672771453857, + "learning_rate": 3.512632024611972e-05, + "loss": 0.218, + "step": 2580 + }, + { + "epoch": 1.57042896257986, + "grad_norm": 0.9642953872680664, + "learning_rate": 3.511493708796348e-05, + "loss": 0.2402, + "step": 2581 + }, + { + "epoch": 1.571037420139945, + "grad_norm": 0.8804176449775696, + "learning_rate": 3.5103551421724457e-05, + "loss": 0.2181, + "step": 2582 + }, + { + "epoch": 1.5716458777000304, + "grad_norm": 0.7416863441467285, + "learning_rate": 3.509216325022582e-05, + "loss": 0.192, + "step": 2583 + }, + { + "epoch": 1.5722543352601157, + "grad_norm": 0.8433103561401367, + "learning_rate": 3.5080772576291356e-05, + "loss": 0.2138, + "step": 2584 + }, + { + "epoch": 1.5728627928202008, + "grad_norm": 0.8431623578071594, + "learning_rate": 3.506937940274547e-05, + "loss": 0.1981, + "step": 2585 + }, + { + "epoch": 1.573471250380286, + "grad_norm": 0.8424465656280518, + "learning_rate": 3.5057983732413224e-05, + "loss": 0.1745, + "step": 2586 + }, + { + "epoch": 1.574079707940371, + "grad_norm": 0.8414326310157776, + "learning_rate": 3.504658556812024e-05, + "loss": 0.1687, + "step": 2587 + }, + { + "epoch": 1.5746881655004563, + "grad_norm": 0.8884822130203247, + "learning_rate": 3.503518491269279e-05, + "loss": 0.1771, + "step": 2588 + }, + { + "epoch": 1.5752966230605416, + "grad_norm": 0.7992542386054993, + "learning_rate": 3.502378176895778e-05, + "loss": 0.1782, + "step": 2589 + }, + { + "epoch": 1.5759050806206267, + "grad_norm": 0.9021846055984497, + "learning_rate": 3.50123761397427e-05, + "loss": 0.2075, + "step": 2590 + }, + { + "epoch": 1.5765135381807118, + "grad_norm": 1.1060278415679932, + "learning_rate": 3.500096802787567e-05, + "loss": 0.2263, + "step": 2591 + }, + { + "epoch": 1.577121995740797, + "grad_norm": 1.0591530799865723, + "learning_rate": 3.4989557436185434e-05, + "loss": 0.1848, + "step": 2592 + }, + { + "epoch": 1.5777304533008822, + "grad_norm": 0.945749044418335, + "learning_rate": 3.4978144367501335e-05, + "loss": 0.2026, + "step": 2593 + }, + { + "epoch": 1.5783389108609676, + "grad_norm": 0.8668262958526611, + "learning_rate": 3.496672882465335e-05, + "loss": 0.1701, + "step": 2594 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 0.8319501280784607, + "learning_rate": 3.495531081047204e-05, + "loss": 0.1532, + "step": 2595 + }, + { + "epoch": 1.5795558259811378, + "grad_norm": 0.9786222577095032, + "learning_rate": 3.494389032778862e-05, + "loss": 0.2242, + "step": 2596 + }, + { + "epoch": 1.5801642835412228, + "grad_norm": 0.8528378009796143, + "learning_rate": 3.493246737943487e-05, + "loss": 0.2037, + "step": 2597 + }, + { + "epoch": 1.5807727411013082, + "grad_norm": 1.3592191934585571, + "learning_rate": 3.492104196824322e-05, + "loss": 0.2073, + "step": 2598 + }, + { + "epoch": 1.5813811986613935, + "grad_norm": 0.7618198394775391, + "learning_rate": 3.4909614097046686e-05, + "loss": 0.1418, + "step": 2599 + }, + { + "epoch": 1.5819896562214786, + "grad_norm": 0.8264166712760925, + "learning_rate": 3.489818376867891e-05, + "loss": 0.1669, + "step": 2600 + }, + { + "epoch": 1.5825981137815637, + "grad_norm": 0.8527823090553284, + "learning_rate": 3.4886750985974136e-05, + "loss": 0.1732, + "step": 2601 + }, + { + "epoch": 1.5832065713416488, + "grad_norm": 0.8907186985015869, + "learning_rate": 3.487531575176722e-05, + "loss": 0.2095, + "step": 2602 + }, + { + "epoch": 1.583815028901734, + "grad_norm": 0.8088558912277222, + "learning_rate": 3.4863878068893625e-05, + "loss": 0.1638, + "step": 2603 + }, + { + "epoch": 1.5844234864618194, + "grad_norm": 1.0112056732177734, + "learning_rate": 3.4852437940189414e-05, + "loss": 0.1842, + "step": 2604 + }, + { + "epoch": 1.5850319440219045, + "grad_norm": 0.6868709325790405, + "learning_rate": 3.4840995368491255e-05, + "loss": 0.1572, + "step": 2605 + }, + { + "epoch": 1.5856404015819896, + "grad_norm": 0.8623439073562622, + "learning_rate": 3.4829550356636445e-05, + "loss": 0.1846, + "step": 2606 + }, + { + "epoch": 1.5862488591420747, + "grad_norm": 0.8471298813819885, + "learning_rate": 3.481810290746287e-05, + "loss": 0.1654, + "step": 2607 + }, + { + "epoch": 1.58685731670216, + "grad_norm": 0.8028208017349243, + "learning_rate": 3.4806653023808996e-05, + "loss": 0.1955, + "step": 2608 + }, + { + "epoch": 1.5874657742622453, + "grad_norm": 0.8367049694061279, + "learning_rate": 3.4795200708513945e-05, + "loss": 0.1996, + "step": 2609 + }, + { + "epoch": 1.5880742318223304, + "grad_norm": 0.8771877884864807, + "learning_rate": 3.47837459644174e-05, + "loss": 0.2201, + "step": 2610 + }, + { + "epoch": 1.5886826893824155, + "grad_norm": 0.876565158367157, + "learning_rate": 3.477228879435966e-05, + "loss": 0.1817, + "step": 2611 + }, + { + "epoch": 1.5892911469425006, + "grad_norm": 0.9059187173843384, + "learning_rate": 3.4760829201181635e-05, + "loss": 0.2115, + "step": 2612 + }, + { + "epoch": 1.589899604502586, + "grad_norm": 0.9595020413398743, + "learning_rate": 3.474936718772481e-05, + "loss": 0.212, + "step": 2613 + }, + { + "epoch": 1.5905080620626713, + "grad_norm": 0.8010035753250122, + "learning_rate": 3.47379027568313e-05, + "loss": 0.1616, + "step": 2614 + }, + { + "epoch": 1.5911165196227564, + "grad_norm": 0.9189199805259705, + "learning_rate": 3.4726435911343804e-05, + "loss": 0.1859, + "step": 2615 + }, + { + "epoch": 1.5917249771828414, + "grad_norm": 0.8791665434837341, + "learning_rate": 3.471496665410561e-05, + "loss": 0.2185, + "step": 2616 + }, + { + "epoch": 1.5923334347429265, + "grad_norm": 0.9701746106147766, + "learning_rate": 3.4703494987960637e-05, + "loss": 0.2412, + "step": 2617 + }, + { + "epoch": 1.5929418923030119, + "grad_norm": 0.7525075674057007, + "learning_rate": 3.469202091575337e-05, + "loss": 0.1916, + "step": 2618 + }, + { + "epoch": 1.5935503498630972, + "grad_norm": 0.9566659331321716, + "learning_rate": 3.468054444032889e-05, + "loss": 0.1985, + "step": 2619 + }, + { + "epoch": 1.5941588074231823, + "grad_norm": 0.7912515997886658, + "learning_rate": 3.46690655645329e-05, + "loss": 0.2016, + "step": 2620 + }, + { + "epoch": 1.5947672649832674, + "grad_norm": 0.6893926858901978, + "learning_rate": 3.465758429121168e-05, + "loss": 0.1687, + "step": 2621 + }, + { + "epoch": 1.5953757225433525, + "grad_norm": 0.8185518980026245, + "learning_rate": 3.4646100623212094e-05, + "loss": 0.1726, + "step": 2622 + }, + { + "epoch": 1.5959841801034378, + "grad_norm": 1.019525170326233, + "learning_rate": 3.4634614563381624e-05, + "loss": 0.1899, + "step": 2623 + }, + { + "epoch": 1.596592637663523, + "grad_norm": 0.8567332625389099, + "learning_rate": 3.4623126114568336e-05, + "loss": 0.1901, + "step": 2624 + }, + { + "epoch": 1.5972010952236082, + "grad_norm": 0.7891209125518799, + "learning_rate": 3.4611635279620885e-05, + "loss": 0.1894, + "step": 2625 + }, + { + "epoch": 1.5978095527836933, + "grad_norm": 0.782440721988678, + "learning_rate": 3.460014206138851e-05, + "loss": 0.1761, + "step": 2626 + }, + { + "epoch": 1.5984180103437784, + "grad_norm": 1.1387567520141602, + "learning_rate": 3.458864646272107e-05, + "loss": 0.1906, + "step": 2627 + }, + { + "epoch": 1.5990264679038637, + "grad_norm": 0.766904354095459, + "learning_rate": 3.4577148486468975e-05, + "loss": 0.1905, + "step": 2628 + }, + { + "epoch": 1.599634925463949, + "grad_norm": 0.8891963362693787, + "learning_rate": 3.456564813548325e-05, + "loss": 0.1965, + "step": 2629 + }, + { + "epoch": 1.6002433830240341, + "grad_norm": 0.8071811199188232, + "learning_rate": 3.45541454126155e-05, + "loss": 0.185, + "step": 2630 + }, + { + "epoch": 1.6008518405841192, + "grad_norm": 0.9761980175971985, + "learning_rate": 3.4542640320717926e-05, + "loss": 0.2078, + "step": 2631 + }, + { + "epoch": 1.6014602981442043, + "grad_norm": 1.070326805114746, + "learning_rate": 3.453113286264332e-05, + "loss": 0.2261, + "step": 2632 + }, + { + "epoch": 1.6020687557042896, + "grad_norm": 0.7715917825698853, + "learning_rate": 3.4519623041245026e-05, + "loss": 0.1596, + "step": 2633 + }, + { + "epoch": 1.602677213264375, + "grad_norm": 0.7550219893455505, + "learning_rate": 3.450811085937702e-05, + "loss": 0.1756, + "step": 2634 + }, + { + "epoch": 1.60328567082446, + "grad_norm": 1.0285836458206177, + "learning_rate": 3.4496596319893844e-05, + "loss": 0.1657, + "step": 2635 + }, + { + "epoch": 1.6038941283845451, + "grad_norm": 0.9173053503036499, + "learning_rate": 3.448507942565061e-05, + "loss": 0.1729, + "step": 2636 + }, + { + "epoch": 1.6045025859446302, + "grad_norm": 3.4212629795074463, + "learning_rate": 3.4473560179503045e-05, + "loss": 0.2194, + "step": 2637 + }, + { + "epoch": 1.6051110435047156, + "grad_norm": 0.9600544571876526, + "learning_rate": 3.4462038584307424e-05, + "loss": 0.16, + "step": 2638 + }, + { + "epoch": 1.6057195010648009, + "grad_norm": 0.8442495465278625, + "learning_rate": 3.4450514642920636e-05, + "loss": 0.1872, + "step": 2639 + }, + { + "epoch": 1.606327958624886, + "grad_norm": 0.9204790592193604, + "learning_rate": 3.443898835820014e-05, + "loss": 0.1905, + "step": 2640 + }, + { + "epoch": 1.606936416184971, + "grad_norm": 0.9242935180664062, + "learning_rate": 3.442745973300395e-05, + "loss": 0.1767, + "step": 2641 + }, + { + "epoch": 1.6075448737450562, + "grad_norm": 0.7664668560028076, + "learning_rate": 3.441592877019072e-05, + "loss": 0.1682, + "step": 2642 + }, + { + "epoch": 1.6081533313051415, + "grad_norm": 0.7136707901954651, + "learning_rate": 3.440439547261962e-05, + "loss": 0.1695, + "step": 2643 + }, + { + "epoch": 1.6087617888652268, + "grad_norm": 0.8739480972290039, + "learning_rate": 3.4392859843150435e-05, + "loss": 0.1854, + "step": 2644 + }, + { + "epoch": 1.609370246425312, + "grad_norm": 0.9654523134231567, + "learning_rate": 3.4381321884643534e-05, + "loss": 0.1728, + "step": 2645 + }, + { + "epoch": 1.609978703985397, + "grad_norm": 0.9120699167251587, + "learning_rate": 3.4369781599959825e-05, + "loss": 0.1915, + "step": 2646 + }, + { + "epoch": 1.610587161545482, + "grad_norm": 0.8703420758247375, + "learning_rate": 3.4358238991960837e-05, + "loss": 0.1929, + "step": 2647 + }, + { + "epoch": 1.6111956191055674, + "grad_norm": 1.062511920928955, + "learning_rate": 3.434669406350866e-05, + "loss": 0.2282, + "step": 2648 + }, + { + "epoch": 1.6118040766656527, + "grad_norm": 0.9723806381225586, + "learning_rate": 3.433514681746593e-05, + "loss": 0.2546, + "step": 2649 + }, + { + "epoch": 1.6124125342257378, + "grad_norm": 0.8156774044036865, + "learning_rate": 3.4323597256695906e-05, + "loss": 0.1962, + "step": 2650 + }, + { + "epoch": 1.613020991785823, + "grad_norm": 0.6880114674568176, + "learning_rate": 3.4312045384062386e-05, + "loss": 0.1365, + "step": 2651 + }, + { + "epoch": 1.613629449345908, + "grad_norm": 0.9442213773727417, + "learning_rate": 3.430049120242975e-05, + "loss": 0.2038, + "step": 2652 + }, + { + "epoch": 1.6142379069059933, + "grad_norm": 0.8928872346878052, + "learning_rate": 3.428893471466297e-05, + "loss": 0.2134, + "step": 2653 + }, + { + "epoch": 1.6148463644660787, + "grad_norm": 0.7231988310813904, + "learning_rate": 3.427737592362755e-05, + "loss": 0.1674, + "step": 2654 + }, + { + "epoch": 1.6154548220261638, + "grad_norm": 0.8021813631057739, + "learning_rate": 3.4265814832189614e-05, + "loss": 0.1836, + "step": 2655 + }, + { + "epoch": 1.6160632795862488, + "grad_norm": 0.8998550772666931, + "learning_rate": 3.425425144321581e-05, + "loss": 0.1854, + "step": 2656 + }, + { + "epoch": 1.616671737146334, + "grad_norm": 0.8088403344154358, + "learning_rate": 3.424268575957339e-05, + "loss": 0.1573, + "step": 2657 + }, + { + "epoch": 1.6172801947064193, + "grad_norm": 0.9545161724090576, + "learning_rate": 3.423111778413015e-05, + "loss": 0.229, + "step": 2658 + }, + { + "epoch": 1.6178886522665044, + "grad_norm": 1.0301522016525269, + "learning_rate": 3.4219547519754475e-05, + "loss": 0.2269, + "step": 2659 + }, + { + "epoch": 1.6184971098265897, + "grad_norm": 1.287305474281311, + "learning_rate": 3.42079749693153e-05, + "loss": 0.2085, + "step": 2660 + }, + { + "epoch": 1.6191055673866748, + "grad_norm": 0.7046839594841003, + "learning_rate": 3.419640013568215e-05, + "loss": 0.1573, + "step": 2661 + }, + { + "epoch": 1.6197140249467599, + "grad_norm": 0.8109933137893677, + "learning_rate": 3.418482302172508e-05, + "loss": 0.173, + "step": 2662 + }, + { + "epoch": 1.6203224825068452, + "grad_norm": 0.792901873588562, + "learning_rate": 3.4173243630314754e-05, + "loss": 0.1872, + "step": 2663 + }, + { + "epoch": 1.6209309400669303, + "grad_norm": 0.8485409021377563, + "learning_rate": 3.4161661964322354e-05, + "loss": 0.1863, + "step": 2664 + }, + { + "epoch": 1.6215393976270156, + "grad_norm": 0.7961007356643677, + "learning_rate": 3.415007802661967e-05, + "loss": 0.1448, + "step": 2665 + }, + { + "epoch": 1.6221478551871007, + "grad_norm": 0.7520683407783508, + "learning_rate": 3.413849182007903e-05, + "loss": 0.1682, + "step": 2666 + }, + { + "epoch": 1.6227563127471858, + "grad_norm": 0.8546669483184814, + "learning_rate": 3.412690334757334e-05, + "loss": 0.1731, + "step": 2667 + }, + { + "epoch": 1.623364770307271, + "grad_norm": 0.9142362475395203, + "learning_rate": 3.411531261197603e-05, + "loss": 0.2352, + "step": 2668 + }, + { + "epoch": 1.6239732278673562, + "grad_norm": 0.7282983064651489, + "learning_rate": 3.410371961616114e-05, + "loss": 0.1176, + "step": 2669 + }, + { + "epoch": 1.6245816854274415, + "grad_norm": 0.8929833769798279, + "learning_rate": 3.409212436300326e-05, + "loss": 0.1912, + "step": 2670 + }, + { + "epoch": 1.6251901429875266, + "grad_norm": 0.7896724939346313, + "learning_rate": 3.40805268553775e-05, + "loss": 0.1885, + "step": 2671 + }, + { + "epoch": 1.6257986005476117, + "grad_norm": 0.9245496392250061, + "learning_rate": 3.4068927096159594e-05, + "loss": 0.1935, + "step": 2672 + }, + { + "epoch": 1.626407058107697, + "grad_norm": 0.8307329416275024, + "learning_rate": 3.4057325088225764e-05, + "loss": 0.1696, + "step": 2673 + }, + { + "epoch": 1.6270155156677821, + "grad_norm": 0.9103578925132751, + "learning_rate": 3.404572083445285e-05, + "loss": 0.1489, + "step": 2674 + }, + { + "epoch": 1.6276239732278674, + "grad_norm": 0.868897557258606, + "learning_rate": 3.40341143377182e-05, + "loss": 0.2073, + "step": 2675 + }, + { + "epoch": 1.6282324307879525, + "grad_norm": 0.79030442237854, + "learning_rate": 3.402250560089977e-05, + "loss": 0.195, + "step": 2676 + }, + { + "epoch": 1.6288408883480376, + "grad_norm": 0.7882320284843445, + "learning_rate": 3.401089462687602e-05, + "loss": 0.1669, + "step": 2677 + }, + { + "epoch": 1.629449345908123, + "grad_norm": 0.920220673084259, + "learning_rate": 3.399928141852599e-05, + "loss": 0.2067, + "step": 2678 + }, + { + "epoch": 1.630057803468208, + "grad_norm": 1.1648184061050415, + "learning_rate": 3.398766597872928e-05, + "loss": 0.1998, + "step": 2679 + }, + { + "epoch": 1.6306662610282934, + "grad_norm": 1.0704290866851807, + "learning_rate": 3.397604831036604e-05, + "loss": 0.2699, + "step": 2680 + }, + { + "epoch": 1.6312747185883785, + "grad_norm": 1.071129560470581, + "learning_rate": 3.396442841631695e-05, + "loss": 0.224, + "step": 2681 + }, + { + "epoch": 1.6318831761484636, + "grad_norm": 0.7212443947792053, + "learning_rate": 3.395280629946327e-05, + "loss": 0.1551, + "step": 2682 + }, + { + "epoch": 1.6324916337085489, + "grad_norm": 0.8159400224685669, + "learning_rate": 3.39411819626868e-05, + "loss": 0.1954, + "step": 2683 + }, + { + "epoch": 1.633100091268634, + "grad_norm": 0.8605476021766663, + "learning_rate": 3.3929555408869896e-05, + "loss": 0.2296, + "step": 2684 + }, + { + "epoch": 1.6337085488287193, + "grad_norm": 0.8847449421882629, + "learning_rate": 3.3917926640895445e-05, + "loss": 0.2023, + "step": 2685 + }, + { + "epoch": 1.6343170063888044, + "grad_norm": 0.8677266240119934, + "learning_rate": 3.390629566164691e-05, + "loss": 0.2, + "step": 2686 + }, + { + "epoch": 1.6349254639488895, + "grad_norm": 0.8311256170272827, + "learning_rate": 3.389466247400828e-05, + "loss": 0.2025, + "step": 2687 + }, + { + "epoch": 1.6355339215089746, + "grad_norm": 3.200028896331787, + "learning_rate": 3.38830270808641e-05, + "loss": 0.2398, + "step": 2688 + }, + { + "epoch": 1.63614237906906, + "grad_norm": 0.7760987877845764, + "learning_rate": 3.3871389485099474e-05, + "loss": 0.1651, + "step": 2689 + }, + { + "epoch": 1.6367508366291452, + "grad_norm": 1.227342128753662, + "learning_rate": 3.385974968960003e-05, + "loss": 0.1905, + "step": 2690 + }, + { + "epoch": 1.6373592941892303, + "grad_norm": 0.832114577293396, + "learning_rate": 3.384810769725196e-05, + "loss": 0.1901, + "step": 2691 + }, + { + "epoch": 1.6379677517493154, + "grad_norm": 0.7426611185073853, + "learning_rate": 3.383646351094198e-05, + "loss": 0.1425, + "step": 2692 + }, + { + "epoch": 1.6385762093094005, + "grad_norm": 0.7370221018791199, + "learning_rate": 3.382481713355738e-05, + "loss": 0.1638, + "step": 2693 + }, + { + "epoch": 1.6391846668694858, + "grad_norm": 0.9661339521408081, + "learning_rate": 3.381316856798596e-05, + "loss": 0.1937, + "step": 2694 + }, + { + "epoch": 1.6397931244295711, + "grad_norm": 0.8891693353652954, + "learning_rate": 3.3801517817116094e-05, + "loss": 0.1547, + "step": 2695 + }, + { + "epoch": 1.6404015819896562, + "grad_norm": 1.042301058769226, + "learning_rate": 3.378986488383667e-05, + "loss": 0.2168, + "step": 2696 + }, + { + "epoch": 1.6410100395497413, + "grad_norm": 0.7799018025398254, + "learning_rate": 3.377820977103714e-05, + "loss": 0.1444, + "step": 2697 + }, + { + "epoch": 1.6416184971098264, + "grad_norm": 0.9158018231391907, + "learning_rate": 3.376655248160747e-05, + "loss": 0.2048, + "step": 2698 + }, + { + "epoch": 1.6422269546699118, + "grad_norm": 0.9366563558578491, + "learning_rate": 3.37548930184382e-05, + "loss": 0.2095, + "step": 2699 + }, + { + "epoch": 1.642835412229997, + "grad_norm": 0.7867175340652466, + "learning_rate": 3.3743231384420384e-05, + "loss": 0.1721, + "step": 2700 + }, + { + "epoch": 1.6434438697900822, + "grad_norm": 0.9113629460334778, + "learning_rate": 3.3731567582445615e-05, + "loss": 0.1875, + "step": 2701 + }, + { + "epoch": 1.6440523273501673, + "grad_norm": 0.9016094207763672, + "learning_rate": 3.371990161540603e-05, + "loss": 0.2127, + "step": 2702 + }, + { + "epoch": 1.6446607849102524, + "grad_norm": 0.8337920904159546, + "learning_rate": 3.3708233486194324e-05, + "loss": 0.1788, + "step": 2703 + }, + { + "epoch": 1.6452692424703377, + "grad_norm": 0.8652783036231995, + "learning_rate": 3.369656319770369e-05, + "loss": 0.1558, + "step": 2704 + }, + { + "epoch": 1.645877700030423, + "grad_norm": 0.8067019581794739, + "learning_rate": 3.368489075282786e-05, + "loss": 0.174, + "step": 2705 + }, + { + "epoch": 1.646486157590508, + "grad_norm": 0.7883349657058716, + "learning_rate": 3.367321615446113e-05, + "loss": 0.1485, + "step": 2706 + }, + { + "epoch": 1.6470946151505932, + "grad_norm": 1.0703591108322144, + "learning_rate": 3.366153940549832e-05, + "loss": 0.1884, + "step": 2707 + }, + { + "epoch": 1.6477030727106783, + "grad_norm": 0.6376674175262451, + "learning_rate": 3.364986050883476e-05, + "loss": 0.1306, + "step": 2708 + }, + { + "epoch": 1.6483115302707636, + "grad_norm": 0.9308871030807495, + "learning_rate": 3.363817946736634e-05, + "loss": 0.1994, + "step": 2709 + }, + { + "epoch": 1.648919987830849, + "grad_norm": 0.8675797581672668, + "learning_rate": 3.3626496283989476e-05, + "loss": 0.197, + "step": 2710 + }, + { + "epoch": 1.649528445390934, + "grad_norm": 0.9659071564674377, + "learning_rate": 3.361481096160109e-05, + "loss": 0.2214, + "step": 2711 + }, + { + "epoch": 1.6501369029510191, + "grad_norm": 0.8575699925422668, + "learning_rate": 3.3603123503098675e-05, + "loss": 0.1685, + "step": 2712 + }, + { + "epoch": 1.6507453605111042, + "grad_norm": 0.8685420155525208, + "learning_rate": 3.359143391138023e-05, + "loss": 0.1846, + "step": 2713 + }, + { + "epoch": 1.6513538180711895, + "grad_norm": 0.862989604473114, + "learning_rate": 3.357974218934429e-05, + "loss": 0.2076, + "step": 2714 + }, + { + "epoch": 1.6519622756312748, + "grad_norm": 0.9207594990730286, + "learning_rate": 3.356804833988989e-05, + "loss": 0.1636, + "step": 2715 + }, + { + "epoch": 1.65257073319136, + "grad_norm": 0.9252761602401733, + "learning_rate": 3.3556352365916646e-05, + "loss": 0.2106, + "step": 2716 + }, + { + "epoch": 1.653179190751445, + "grad_norm": 1.0307517051696777, + "learning_rate": 3.354465427032467e-05, + "loss": 0.255, + "step": 2717 + }, + { + "epoch": 1.6537876483115301, + "grad_norm": 0.7410596013069153, + "learning_rate": 3.353295405601457e-05, + "loss": 0.1325, + "step": 2718 + }, + { + "epoch": 1.6543961058716155, + "grad_norm": 0.8445053696632385, + "learning_rate": 3.352125172588756e-05, + "loss": 0.1847, + "step": 2719 + }, + { + "epoch": 1.6550045634317008, + "grad_norm": 0.7926309704780579, + "learning_rate": 3.350954728284529e-05, + "loss": 0.178, + "step": 2720 + }, + { + "epoch": 1.6556130209917859, + "grad_norm": 0.8683445453643799, + "learning_rate": 3.349784072979e-05, + "loss": 0.1496, + "step": 2721 + }, + { + "epoch": 1.656221478551871, + "grad_norm": 0.9647382497787476, + "learning_rate": 3.348613206962441e-05, + "loss": 0.1771, + "step": 2722 + }, + { + "epoch": 1.656829936111956, + "grad_norm": 1.0857597589492798, + "learning_rate": 3.3474421305251785e-05, + "loss": 0.1893, + "step": 2723 + }, + { + "epoch": 1.6574383936720414, + "grad_norm": 0.904272198677063, + "learning_rate": 3.3462708439575916e-05, + "loss": 0.1812, + "step": 2724 + }, + { + "epoch": 1.6580468512321267, + "grad_norm": 0.9223242402076721, + "learning_rate": 3.3450993475501084e-05, + "loss": 0.2006, + "step": 2725 + }, + { + "epoch": 1.6586553087922118, + "grad_norm": 0.8843355178833008, + "learning_rate": 3.343927641593213e-05, + "loss": 0.1732, + "step": 2726 + }, + { + "epoch": 1.6592637663522969, + "grad_norm": 0.8139360547065735, + "learning_rate": 3.3427557263774395e-05, + "loss": 0.1904, + "step": 2727 + }, + { + "epoch": 1.659872223912382, + "grad_norm": 0.8607851266860962, + "learning_rate": 3.341583602193373e-05, + "loss": 0.1773, + "step": 2728 + }, + { + "epoch": 1.6604806814724673, + "grad_norm": 0.7943541407585144, + "learning_rate": 3.3404112693316525e-05, + "loss": 0.1659, + "step": 2729 + }, + { + "epoch": 1.6610891390325526, + "grad_norm": 0.8024083971977234, + "learning_rate": 3.339238728082968e-05, + "loss": 0.1562, + "step": 2730 + }, + { + "epoch": 1.6616975965926377, + "grad_norm": 0.7931082248687744, + "learning_rate": 3.338065978738059e-05, + "loss": 0.1367, + "step": 2731 + }, + { + "epoch": 1.6623060541527228, + "grad_norm": 0.8233804106712341, + "learning_rate": 3.33689302158772e-05, + "loss": 0.1757, + "step": 2732 + }, + { + "epoch": 1.662914511712808, + "grad_norm": 0.9862247705459595, + "learning_rate": 3.3357198569227954e-05, + "loss": 0.1799, + "step": 2733 + }, + { + "epoch": 1.6635229692728932, + "grad_norm": 0.7572802901268005, + "learning_rate": 3.33454648503418e-05, + "loss": 0.1937, + "step": 2734 + }, + { + "epoch": 1.6641314268329785, + "grad_norm": 0.764121949672699, + "learning_rate": 3.333372906212823e-05, + "loss": 0.1609, + "step": 2735 + }, + { + "epoch": 1.6647398843930636, + "grad_norm": 0.9266926646232605, + "learning_rate": 3.332199120749721e-05, + "loss": 0.1643, + "step": 2736 + }, + { + "epoch": 1.6653483419531487, + "grad_norm": 0.7189835906028748, + "learning_rate": 3.331025128935926e-05, + "loss": 0.1632, + "step": 2737 + }, + { + "epoch": 1.6659567995132338, + "grad_norm": 0.9593905806541443, + "learning_rate": 3.3298509310625363e-05, + "loss": 0.1825, + "step": 2738 + }, + { + "epoch": 1.6665652570733192, + "grad_norm": 0.8221538066864014, + "learning_rate": 3.328676527420706e-05, + "loss": 0.184, + "step": 2739 + }, + { + "epoch": 1.6671737146334045, + "grad_norm": 0.8926746249198914, + "learning_rate": 3.3275019183016384e-05, + "loss": 0.1851, + "step": 2740 + }, + { + "epoch": 1.6677821721934896, + "grad_norm": 0.8557993769645691, + "learning_rate": 3.326327103996587e-05, + "loss": 0.1588, + "step": 2741 + }, + { + "epoch": 1.6683906297535747, + "grad_norm": 0.701745867729187, + "learning_rate": 3.3251520847968566e-05, + "loss": 0.1516, + "step": 2742 + }, + { + "epoch": 1.6689990873136598, + "grad_norm": 0.9786536693572998, + "learning_rate": 3.323976860993803e-05, + "loss": 0.1958, + "step": 2743 + }, + { + "epoch": 1.669607544873745, + "grad_norm": 0.7492821216583252, + "learning_rate": 3.322801432878833e-05, + "loss": 0.1302, + "step": 2744 + }, + { + "epoch": 1.6702160024338304, + "grad_norm": 0.8823762536048889, + "learning_rate": 3.321625800743404e-05, + "loss": 0.1697, + "step": 2745 + }, + { + "epoch": 1.6708244599939155, + "grad_norm": 1.0150517225265503, + "learning_rate": 3.320449964879023e-05, + "loss": 0.1984, + "step": 2746 + }, + { + "epoch": 1.6714329175540006, + "grad_norm": 0.9592936635017395, + "learning_rate": 3.31927392557725e-05, + "loss": 0.2018, + "step": 2747 + }, + { + "epoch": 1.6720413751140857, + "grad_norm": 0.9560689926147461, + "learning_rate": 3.318097683129691e-05, + "loss": 0.1886, + "step": 2748 + }, + { + "epoch": 1.672649832674171, + "grad_norm": 1.1188546419143677, + "learning_rate": 3.316921237828007e-05, + "loss": 0.2193, + "step": 2749 + }, + { + "epoch": 1.6732582902342563, + "grad_norm": 0.8860459923744202, + "learning_rate": 3.315744589963907e-05, + "loss": 0.1493, + "step": 2750 + }, + { + "epoch": 1.6738667477943414, + "grad_norm": 0.9111357927322388, + "learning_rate": 3.314567739829151e-05, + "loss": 0.2217, + "step": 2751 + }, + { + "epoch": 1.6744752053544265, + "grad_norm": 0.9053143262863159, + "learning_rate": 3.313390687715548e-05, + "loss": 0.2102, + "step": 2752 + }, + { + "epoch": 1.6750836629145116, + "grad_norm": 0.8748982548713684, + "learning_rate": 3.3122134339149585e-05, + "loss": 0.1728, + "step": 2753 + }, + { + "epoch": 1.675692120474597, + "grad_norm": 0.8217591643333435, + "learning_rate": 3.311035978719292e-05, + "loss": 0.1708, + "step": 2754 + }, + { + "epoch": 1.6763005780346822, + "grad_norm": 0.9044924378395081, + "learning_rate": 3.309858322420508e-05, + "loss": 0.2065, + "step": 2755 + }, + { + "epoch": 1.6769090355947673, + "grad_norm": 0.7668700814247131, + "learning_rate": 3.308680465310617e-05, + "loss": 0.2101, + "step": 2756 + }, + { + "epoch": 1.6775174931548524, + "grad_norm": 0.9722397327423096, + "learning_rate": 3.307502407681678e-05, + "loss": 0.1881, + "step": 2757 + }, + { + "epoch": 1.6781259507149375, + "grad_norm": 0.842012882232666, + "learning_rate": 3.3063241498258e-05, + "loss": 0.1471, + "step": 2758 + }, + { + "epoch": 1.6787344082750228, + "grad_norm": 0.8087152242660522, + "learning_rate": 3.305145692035143e-05, + "loss": 0.1465, + "step": 2759 + }, + { + "epoch": 1.6793428658351082, + "grad_norm": 0.8884347081184387, + "learning_rate": 3.303967034601914e-05, + "loss": 0.1811, + "step": 2760 + }, + { + "epoch": 1.6799513233951933, + "grad_norm": 1.1114667654037476, + "learning_rate": 3.3027881778183715e-05, + "loss": 0.1998, + "step": 2761 + }, + { + "epoch": 1.6805597809552784, + "grad_norm": 0.8144546151161194, + "learning_rate": 3.301609121976822e-05, + "loss": 0.1806, + "step": 2762 + }, + { + "epoch": 1.6811682385153635, + "grad_norm": 0.858739972114563, + "learning_rate": 3.300429867369623e-05, + "loss": 0.2098, + "step": 2763 + }, + { + "epoch": 1.6817766960754488, + "grad_norm": 0.9147648811340332, + "learning_rate": 3.299250414289181e-05, + "loss": 0.1956, + "step": 2764 + }, + { + "epoch": 1.6823851536355339, + "grad_norm": 0.8137851357460022, + "learning_rate": 3.298070763027951e-05, + "loss": 0.2142, + "step": 2765 + }, + { + "epoch": 1.6829936111956192, + "grad_norm": 0.8987473845481873, + "learning_rate": 3.296890913878436e-05, + "loss": 0.1742, + "step": 2766 + }, + { + "epoch": 1.6836020687557043, + "grad_norm": 0.6859608888626099, + "learning_rate": 3.295710867133191e-05, + "loss": 0.1589, + "step": 2767 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.7220624685287476, + "learning_rate": 3.2945306230848185e-05, + "loss": 0.138, + "step": 2768 + }, + { + "epoch": 1.6848189838758747, + "grad_norm": 0.6847586631774902, + "learning_rate": 3.293350182025968e-05, + "loss": 0.1614, + "step": 2769 + }, + { + "epoch": 1.6854274414359598, + "grad_norm": 0.846610426902771, + "learning_rate": 3.292169544249341e-05, + "loss": 0.1999, + "step": 2770 + }, + { + "epoch": 1.6860358989960451, + "grad_norm": 0.7773505449295044, + "learning_rate": 3.290988710047687e-05, + "loss": 0.1724, + "step": 2771 + }, + { + "epoch": 1.6866443565561302, + "grad_norm": 0.7428063750267029, + "learning_rate": 3.289807679713803e-05, + "loss": 0.1307, + "step": 2772 + }, + { + "epoch": 1.6872528141162153, + "grad_norm": 0.8252652883529663, + "learning_rate": 3.288626453540535e-05, + "loss": 0.2018, + "step": 2773 + }, + { + "epoch": 1.6878612716763006, + "grad_norm": 0.8059362173080444, + "learning_rate": 3.287445031820777e-05, + "loss": 0.1656, + "step": 2774 + }, + { + "epoch": 1.6884697292363857, + "grad_norm": 0.9173924326896667, + "learning_rate": 3.2862634148474744e-05, + "loss": 0.1938, + "step": 2775 + }, + { + "epoch": 1.689078186796471, + "grad_norm": 0.8214101791381836, + "learning_rate": 3.285081602913618e-05, + "loss": 0.1551, + "step": 2776 + }, + { + "epoch": 1.6896866443565561, + "grad_norm": 0.9609821438789368, + "learning_rate": 3.283899596312247e-05, + "loss": 0.1968, + "step": 2777 + }, + { + "epoch": 1.6902951019166412, + "grad_norm": 0.8379231095314026, + "learning_rate": 3.2827173953364526e-05, + "loss": 0.1466, + "step": 2778 + }, + { + "epoch": 1.6909035594767265, + "grad_norm": 0.7881366014480591, + "learning_rate": 3.281535000279368e-05, + "loss": 0.132, + "step": 2779 + }, + { + "epoch": 1.6915120170368116, + "grad_norm": 0.8496389389038086, + "learning_rate": 3.2803524114341795e-05, + "loss": 0.2176, + "step": 2780 + }, + { + "epoch": 1.692120474596897, + "grad_norm": 0.7638592720031738, + "learning_rate": 3.2791696290941196e-05, + "loss": 0.1763, + "step": 2781 + }, + { + "epoch": 1.692728932156982, + "grad_norm": 0.9417856335639954, + "learning_rate": 3.27798665355247e-05, + "loss": 0.1721, + "step": 2782 + }, + { + "epoch": 1.6933373897170672, + "grad_norm": 0.788231611251831, + "learning_rate": 3.276803485102557e-05, + "loss": 0.1493, + "step": 2783 + }, + { + "epoch": 1.6939458472771525, + "grad_norm": 0.8815106153488159, + "learning_rate": 3.2756201240377596e-05, + "loss": 0.1762, + "step": 2784 + }, + { + "epoch": 1.6945543048372376, + "grad_norm": 0.8450623750686646, + "learning_rate": 3.2744365706514995e-05, + "loss": 0.2006, + "step": 2785 + }, + { + "epoch": 1.6951627623973229, + "grad_norm": 1.0778264999389648, + "learning_rate": 3.273252825237251e-05, + "loss": 0.2408, + "step": 2786 + }, + { + "epoch": 1.695771219957408, + "grad_norm": 0.7945128083229065, + "learning_rate": 3.2720688880885324e-05, + "loss": 0.1696, + "step": 2787 + }, + { + "epoch": 1.696379677517493, + "grad_norm": 0.7426414489746094, + "learning_rate": 3.270884759498911e-05, + "loss": 0.157, + "step": 2788 + }, + { + "epoch": 1.6969881350775782, + "grad_norm": 0.8385169506072998, + "learning_rate": 3.269700439762001e-05, + "loss": 0.1832, + "step": 2789 + }, + { + "epoch": 1.6975965926376635, + "grad_norm": 1.0522770881652832, + "learning_rate": 3.268515929171465e-05, + "loss": 0.2037, + "step": 2790 + }, + { + "epoch": 1.6982050501977488, + "grad_norm": 0.7473107576370239, + "learning_rate": 3.2673312280210124e-05, + "loss": 0.1878, + "step": 2791 + }, + { + "epoch": 1.698813507757834, + "grad_norm": 0.7824332118034363, + "learning_rate": 3.2661463366043985e-05, + "loss": 0.1556, + "step": 2792 + }, + { + "epoch": 1.699421965317919, + "grad_norm": 0.8999038338661194, + "learning_rate": 3.2649612552154276e-05, + "loss": 0.1974, + "step": 2793 + }, + { + "epoch": 1.700030422878004, + "grad_norm": 0.7992719411849976, + "learning_rate": 3.263775984147951e-05, + "loss": 0.1679, + "step": 2794 + }, + { + "epoch": 1.7006388804380894, + "grad_norm": 0.7823406457901001, + "learning_rate": 3.2625905236958655e-05, + "loss": 0.1676, + "step": 2795 + }, + { + "epoch": 1.7012473379981747, + "grad_norm": 0.8602148294448853, + "learning_rate": 3.2614048741531166e-05, + "loss": 0.1666, + "step": 2796 + }, + { + "epoch": 1.7018557955582598, + "grad_norm": 0.8012514114379883, + "learning_rate": 3.2602190358136965e-05, + "loss": 0.1704, + "step": 2797 + }, + { + "epoch": 1.702464253118345, + "grad_norm": 0.9197515249252319, + "learning_rate": 3.259033008971642e-05, + "loss": 0.1809, + "step": 2798 + }, + { + "epoch": 1.70307271067843, + "grad_norm": 1.046052098274231, + "learning_rate": 3.25784679392104e-05, + "loss": 0.2223, + "step": 2799 + }, + { + "epoch": 1.7036811682385153, + "grad_norm": 0.7376840710639954, + "learning_rate": 3.256660390956022e-05, + "loss": 0.1596, + "step": 2800 + }, + { + "epoch": 1.7042896257986007, + "grad_norm": 0.8339237570762634, + "learning_rate": 3.255473800370765e-05, + "loss": 0.1818, + "step": 2801 + }, + { + "epoch": 1.7048980833586858, + "grad_norm": 0.7850296497344971, + "learning_rate": 3.254287022459496e-05, + "loss": 0.1752, + "step": 2802 + }, + { + "epoch": 1.7055065409187709, + "grad_norm": 0.8697124719619751, + "learning_rate": 3.253100057516486e-05, + "loss": 0.1768, + "step": 2803 + }, + { + "epoch": 1.706114998478856, + "grad_norm": 1.1126950979232788, + "learning_rate": 3.251912905836052e-05, + "loss": 0.2258, + "step": 2804 + }, + { + "epoch": 1.7067234560389413, + "grad_norm": 0.7312096953392029, + "learning_rate": 3.250725567712559e-05, + "loss": 0.1779, + "step": 2805 + }, + { + "epoch": 1.7073319135990266, + "grad_norm": 0.7978745102882385, + "learning_rate": 3.2495380434404167e-05, + "loss": 0.1554, + "step": 2806 + }, + { + "epoch": 1.7079403711591117, + "grad_norm": 0.9084555506706238, + "learning_rate": 3.248350333314082e-05, + "loss": 0.1468, + "step": 2807 + }, + { + "epoch": 1.7085488287191968, + "grad_norm": 0.7953585386276245, + "learning_rate": 3.247162437628057e-05, + "loss": 0.1308, + "step": 2808 + }, + { + "epoch": 1.7091572862792819, + "grad_norm": 0.7833218574523926, + "learning_rate": 3.2459743566768916e-05, + "loss": 0.1697, + "step": 2809 + }, + { + "epoch": 1.7097657438393672, + "grad_norm": 0.7225537896156311, + "learning_rate": 3.2447860907551786e-05, + "loss": 0.1569, + "step": 2810 + }, + { + "epoch": 1.7103742013994525, + "grad_norm": 0.7659318447113037, + "learning_rate": 3.24359764015756e-05, + "loss": 0.1846, + "step": 2811 + }, + { + "epoch": 1.7109826589595376, + "grad_norm": 0.8912885189056396, + "learning_rate": 3.242409005178721e-05, + "loss": 0.1805, + "step": 2812 + }, + { + "epoch": 1.7115911165196227, + "grad_norm": 0.7760403156280518, + "learning_rate": 3.241220186113394e-05, + "loss": 0.1389, + "step": 2813 + }, + { + "epoch": 1.7121995740797078, + "grad_norm": 1.0232988595962524, + "learning_rate": 3.2400311832563563e-05, + "loss": 0.2085, + "step": 2814 + }, + { + "epoch": 1.7128080316397931, + "grad_norm": 0.8213723301887512, + "learning_rate": 3.238841996902431e-05, + "loss": 0.1491, + "step": 2815 + }, + { + "epoch": 1.7134164891998784, + "grad_norm": 0.8206520676612854, + "learning_rate": 3.237652627346487e-05, + "loss": 0.1543, + "step": 2816 + }, + { + "epoch": 1.7140249467599635, + "grad_norm": 0.885313093662262, + "learning_rate": 3.2364630748834385e-05, + "loss": 0.1985, + "step": 2817 + }, + { + "epoch": 1.7146334043200486, + "grad_norm": 0.7833919525146484, + "learning_rate": 3.235273339808245e-05, + "loss": 0.1493, + "step": 2818 + }, + { + "epoch": 1.7152418618801337, + "grad_norm": 0.8376486301422119, + "learning_rate": 3.2340834224159104e-05, + "loss": 0.164, + "step": 2819 + }, + { + "epoch": 1.715850319440219, + "grad_norm": 0.8096402287483215, + "learning_rate": 3.232893323001485e-05, + "loss": 0.1883, + "step": 2820 + }, + { + "epoch": 1.7164587770003044, + "grad_norm": 0.881629467010498, + "learning_rate": 3.2317030418600645e-05, + "loss": 0.1901, + "step": 2821 + }, + { + "epoch": 1.7170672345603895, + "grad_norm": 0.910102128982544, + "learning_rate": 3.2305125792867886e-05, + "loss": 0.1814, + "step": 2822 + }, + { + "epoch": 1.7176756921204746, + "grad_norm": 0.7410189509391785, + "learning_rate": 3.229321935576842e-05, + "loss": 0.1637, + "step": 2823 + }, + { + "epoch": 1.7182841496805596, + "grad_norm": 0.8830031752586365, + "learning_rate": 3.228131111025455e-05, + "loss": 0.1492, + "step": 2824 + }, + { + "epoch": 1.718892607240645, + "grad_norm": 0.9548397660255432, + "learning_rate": 3.226940105927903e-05, + "loss": 0.1643, + "step": 2825 + }, + { + "epoch": 1.7195010648007303, + "grad_norm": 0.829814612865448, + "learning_rate": 3.2257489205795034e-05, + "loss": 0.1411, + "step": 2826 + }, + { + "epoch": 1.7201095223608154, + "grad_norm": 0.9172588586807251, + "learning_rate": 3.224557555275623e-05, + "loss": 0.1536, + "step": 2827 + }, + { + "epoch": 1.7207179799209005, + "grad_norm": 0.8592941164970398, + "learning_rate": 3.223366010311671e-05, + "loss": 0.1777, + "step": 2828 + }, + { + "epoch": 1.7213264374809856, + "grad_norm": 0.8923897743225098, + "learning_rate": 3.2221742859831e-05, + "loss": 0.1696, + "step": 2829 + }, + { + "epoch": 1.721934895041071, + "grad_norm": 1.0709412097930908, + "learning_rate": 3.220982382585406e-05, + "loss": 0.2554, + "step": 2830 + }, + { + "epoch": 1.7225433526011562, + "grad_norm": 0.8038744330406189, + "learning_rate": 3.219790300414134e-05, + "loss": 0.1586, + "step": 2831 + }, + { + "epoch": 1.7231518101612413, + "grad_norm": 0.8866575956344604, + "learning_rate": 3.21859803976487e-05, + "loss": 0.1691, + "step": 2832 + }, + { + "epoch": 1.7237602677213264, + "grad_norm": 0.8635597825050354, + "learning_rate": 3.217405600933245e-05, + "loss": 0.1865, + "step": 2833 + }, + { + "epoch": 1.7243687252814115, + "grad_norm": 0.8194993138313293, + "learning_rate": 3.2162129842149336e-05, + "loss": 0.1614, + "step": 2834 + }, + { + "epoch": 1.7249771828414968, + "grad_norm": 0.8445285558700562, + "learning_rate": 3.215020189905655e-05, + "loss": 0.2077, + "step": 2835 + }, + { + "epoch": 1.7255856404015821, + "grad_norm": 0.9541175365447998, + "learning_rate": 3.213827218301173e-05, + "loss": 0.2071, + "step": 2836 + }, + { + "epoch": 1.7261940979616672, + "grad_norm": 0.7591641545295715, + "learning_rate": 3.212634069697295e-05, + "loss": 0.1616, + "step": 2837 + }, + { + "epoch": 1.7268025555217523, + "grad_norm": 0.7483083605766296, + "learning_rate": 3.211440744389871e-05, + "loss": 0.1756, + "step": 2838 + }, + { + "epoch": 1.7274110130818374, + "grad_norm": 0.7312026619911194, + "learning_rate": 3.2102472426747975e-05, + "loss": 0.1834, + "step": 2839 + }, + { + "epoch": 1.7280194706419227, + "grad_norm": 0.8581993579864502, + "learning_rate": 3.2090535648480126e-05, + "loss": 0.1491, + "step": 2840 + }, + { + "epoch": 1.728627928202008, + "grad_norm": 0.7880068421363831, + "learning_rate": 3.207859711205498e-05, + "loss": 0.1904, + "step": 2841 + }, + { + "epoch": 1.7292363857620932, + "grad_norm": 0.9084392786026001, + "learning_rate": 3.20666568204328e-05, + "loss": 0.1815, + "step": 2842 + }, + { + "epoch": 1.7298448433221782, + "grad_norm": 0.8204681873321533, + "learning_rate": 3.205471477657428e-05, + "loss": 0.1863, + "step": 2843 + }, + { + "epoch": 1.7304533008822633, + "grad_norm": 0.728208065032959, + "learning_rate": 3.204277098344055e-05, + "loss": 0.1384, + "step": 2844 + }, + { + "epoch": 1.7310617584423487, + "grad_norm": 0.9103240966796875, + "learning_rate": 3.203082544399318e-05, + "loss": 0.1875, + "step": 2845 + }, + { + "epoch": 1.731670216002434, + "grad_norm": 0.8469733595848083, + "learning_rate": 3.201887816119416e-05, + "loss": 0.1546, + "step": 2846 + }, + { + "epoch": 1.732278673562519, + "grad_norm": 1.0149333477020264, + "learning_rate": 3.2006929138005905e-05, + "loss": 0.1469, + "step": 2847 + }, + { + "epoch": 1.7328871311226042, + "grad_norm": 0.8808649778366089, + "learning_rate": 3.1994978377391295e-05, + "loss": 0.1811, + "step": 2848 + }, + { + "epoch": 1.7334955886826893, + "grad_norm": 0.77308589220047, + "learning_rate": 3.198302588231361e-05, + "loss": 0.1662, + "step": 2849 + }, + { + "epoch": 1.7341040462427746, + "grad_norm": 0.7700105309486389, + "learning_rate": 3.197107165573657e-05, + "loss": 0.1371, + "step": 2850 + }, + { + "epoch": 1.73471250380286, + "grad_norm": 0.8109223246574402, + "learning_rate": 3.195911570062434e-05, + "loss": 0.138, + "step": 2851 + }, + { + "epoch": 1.735320961362945, + "grad_norm": 0.8431537747383118, + "learning_rate": 3.1947158019941476e-05, + "loss": 0.1787, + "step": 2852 + }, + { + "epoch": 1.73592941892303, + "grad_norm": 0.8387619256973267, + "learning_rate": 3.1935198616652996e-05, + "loss": 0.1893, + "step": 2853 + }, + { + "epoch": 1.7365378764831152, + "grad_norm": 0.8504119515419006, + "learning_rate": 3.192323749372433e-05, + "loss": 0.1783, + "step": 2854 + }, + { + "epoch": 1.7371463340432005, + "grad_norm": 0.8203498125076294, + "learning_rate": 3.1911274654121345e-05, + "loss": 0.1651, + "step": 2855 + }, + { + "epoch": 1.7377547916032858, + "grad_norm": 0.941662073135376, + "learning_rate": 3.1899310100810326e-05, + "loss": 0.193, + "step": 2856 + }, + { + "epoch": 1.738363249163371, + "grad_norm": 0.8915706276893616, + "learning_rate": 3.1887343836757977e-05, + "loss": 0.2059, + "step": 2857 + }, + { + "epoch": 1.738971706723456, + "grad_norm": 0.7377119064331055, + "learning_rate": 3.1875375864931426e-05, + "loss": 0.1607, + "step": 2858 + }, + { + "epoch": 1.7395801642835411, + "grad_norm": 0.8279535174369812, + "learning_rate": 3.186340618829825e-05, + "loss": 0.1356, + "step": 2859 + }, + { + "epoch": 1.7401886218436264, + "grad_norm": 0.9348523020744324, + "learning_rate": 3.185143480982642e-05, + "loss": 0.2069, + "step": 2860 + }, + { + "epoch": 1.7407970794037118, + "grad_norm": 0.7679805159568787, + "learning_rate": 3.183946173248433e-05, + "loss": 0.1833, + "step": 2861 + }, + { + "epoch": 1.7414055369637969, + "grad_norm": 0.8387174010276794, + "learning_rate": 3.182748695924082e-05, + "loss": 0.2116, + "step": 2862 + }, + { + "epoch": 1.742013994523882, + "grad_norm": 0.8004626035690308, + "learning_rate": 3.181551049306513e-05, + "loss": 0.1897, + "step": 2863 + }, + { + "epoch": 1.742622452083967, + "grad_norm": 0.8716647624969482, + "learning_rate": 3.180353233692691e-05, + "loss": 0.2086, + "step": 2864 + }, + { + "epoch": 1.7432309096440524, + "grad_norm": 0.901866614818573, + "learning_rate": 3.179155249379628e-05, + "loss": 0.2046, + "step": 2865 + }, + { + "epoch": 1.7438393672041375, + "grad_norm": 0.7688228487968445, + "learning_rate": 3.17795709666437e-05, + "loss": 0.1335, + "step": 2866 + }, + { + "epoch": 1.7444478247642228, + "grad_norm": 0.7995738983154297, + "learning_rate": 3.1767587758440106e-05, + "loss": 0.1418, + "step": 2867 + }, + { + "epoch": 1.7450562823243079, + "grad_norm": 0.8365558981895447, + "learning_rate": 3.175560287215684e-05, + "loss": 0.1897, + "step": 2868 + }, + { + "epoch": 1.745664739884393, + "grad_norm": 0.8732959032058716, + "learning_rate": 3.1743616310765644e-05, + "loss": 0.1946, + "step": 2869 + }, + { + "epoch": 1.7462731974444783, + "grad_norm": 0.972126841545105, + "learning_rate": 3.1731628077238694e-05, + "loss": 0.1725, + "step": 2870 + }, + { + "epoch": 1.7468816550045634, + "grad_norm": 0.811004102230072, + "learning_rate": 3.171963817454857e-05, + "loss": 0.1305, + "step": 2871 + }, + { + "epoch": 1.7474901125646487, + "grad_norm": 0.9573332667350769, + "learning_rate": 3.170764660566826e-05, + "loss": 0.1681, + "step": 2872 + }, + { + "epoch": 1.7480985701247338, + "grad_norm": 0.7610151767730713, + "learning_rate": 3.1695653373571196e-05, + "loss": 0.154, + "step": 2873 + }, + { + "epoch": 1.748707027684819, + "grad_norm": 1.0020116567611694, + "learning_rate": 3.168365848123117e-05, + "loss": 0.171, + "step": 2874 + }, + { + "epoch": 1.7493154852449042, + "grad_norm": 0.8454048037528992, + "learning_rate": 3.167166193162244e-05, + "loss": 0.1776, + "step": 2875 + }, + { + "epoch": 1.7499239428049893, + "grad_norm": 0.8977103233337402, + "learning_rate": 3.1659663727719625e-05, + "loss": 0.1786, + "step": 2876 + }, + { + "epoch": 1.7505324003650746, + "grad_norm": 0.7743524312973022, + "learning_rate": 3.1647663872497804e-05, + "loss": 0.1425, + "step": 2877 + }, + { + "epoch": 1.7511408579251597, + "grad_norm": 0.9040097594261169, + "learning_rate": 3.1635662368932426e-05, + "loss": 0.2031, + "step": 2878 + }, + { + "epoch": 1.7517493154852448, + "grad_norm": 0.6998404264450073, + "learning_rate": 3.1623659219999374e-05, + "loss": 0.1344, + "step": 2879 + }, + { + "epoch": 1.7523577730453301, + "grad_norm": 0.8501226902008057, + "learning_rate": 3.161165442867492e-05, + "loss": 0.1767, + "step": 2880 + }, + { + "epoch": 1.7529662306054152, + "grad_norm": 0.9239878058433533, + "learning_rate": 3.159964799793575e-05, + "loss": 0.2296, + "step": 2881 + }, + { + "epoch": 1.7535746881655006, + "grad_norm": 0.9915949106216431, + "learning_rate": 3.158763993075897e-05, + "loss": 0.2277, + "step": 2882 + }, + { + "epoch": 1.7541831457255856, + "grad_norm": 0.746919572353363, + "learning_rate": 3.157563023012208e-05, + "loss": 0.1781, + "step": 2883 + }, + { + "epoch": 1.7547916032856707, + "grad_norm": 0.6546300649642944, + "learning_rate": 3.1563618899002965e-05, + "loss": 0.1176, + "step": 2884 + }, + { + "epoch": 1.755400060845756, + "grad_norm": 0.824098527431488, + "learning_rate": 3.1551605940379954e-05, + "loss": 0.1666, + "step": 2885 + }, + { + "epoch": 1.7560085184058412, + "grad_norm": 0.7335473299026489, + "learning_rate": 3.1539591357231755e-05, + "loss": 0.1489, + "step": 2886 + }, + { + "epoch": 1.7566169759659265, + "grad_norm": 0.8762836456298828, + "learning_rate": 3.152757515253748e-05, + "loss": 0.1802, + "step": 2887 + }, + { + "epoch": 1.7572254335260116, + "grad_norm": 0.9030479192733765, + "learning_rate": 3.1515557329276654e-05, + "loss": 0.1571, + "step": 2888 + }, + { + "epoch": 1.7578338910860967, + "grad_norm": 0.8152849674224854, + "learning_rate": 3.150353789042919e-05, + "loss": 0.1772, + "step": 2889 + }, + { + "epoch": 1.7584423486461818, + "grad_norm": 0.8419698476791382, + "learning_rate": 3.149151683897541e-05, + "loss": 0.1603, + "step": 2890 + }, + { + "epoch": 1.759050806206267, + "grad_norm": 1.0735416412353516, + "learning_rate": 3.147949417789604e-05, + "loss": 0.1659, + "step": 2891 + }, + { + "epoch": 1.7596592637663524, + "grad_norm": 0.8551287651062012, + "learning_rate": 3.1467469910172184e-05, + "loss": 0.199, + "step": 2892 + }, + { + "epoch": 1.7602677213264375, + "grad_norm": 1.121510624885559, + "learning_rate": 3.145544403878538e-05, + "loss": 0.1591, + "step": 2893 + }, + { + "epoch": 1.7608761788865226, + "grad_norm": 0.9003729224205017, + "learning_rate": 3.144341656671751e-05, + "loss": 0.1713, + "step": 2894 + }, + { + "epoch": 1.7614846364466077, + "grad_norm": 0.8984287977218628, + "learning_rate": 3.143138749695091e-05, + "loss": 0.2174, + "step": 2895 + }, + { + "epoch": 1.762093094006693, + "grad_norm": 0.8700388669967651, + "learning_rate": 3.14193568324683e-05, + "loss": 0.2276, + "step": 2896 + }, + { + "epoch": 1.7627015515667783, + "grad_norm": 0.7582150101661682, + "learning_rate": 3.140732457625276e-05, + "loss": 0.1604, + "step": 2897 + }, + { + "epoch": 1.7633100091268634, + "grad_norm": 0.8540982007980347, + "learning_rate": 3.13952907312878e-05, + "loss": 0.1728, + "step": 2898 + }, + { + "epoch": 1.7639184666869485, + "grad_norm": 0.8038197159767151, + "learning_rate": 3.1383255300557293e-05, + "loss": 0.171, + "step": 2899 + }, + { + "epoch": 1.7645269242470336, + "grad_norm": 0.8544119000434875, + "learning_rate": 3.137121828704555e-05, + "loss": 0.1521, + "step": 2900 + }, + { + "epoch": 1.765135381807119, + "grad_norm": 0.8337200284004211, + "learning_rate": 3.135917969373724e-05, + "loss": 0.1895, + "step": 2901 + }, + { + "epoch": 1.7657438393672042, + "grad_norm": 0.8394232988357544, + "learning_rate": 3.134713952361742e-05, + "loss": 0.2031, + "step": 2902 + }, + { + "epoch": 1.7663522969272893, + "grad_norm": 0.7780349850654602, + "learning_rate": 3.1335097779671564e-05, + "loss": 0.1559, + "step": 2903 + }, + { + "epoch": 1.7669607544873744, + "grad_norm": 0.7052262425422668, + "learning_rate": 3.132305446488552e-05, + "loss": 0.1905, + "step": 2904 + }, + { + "epoch": 1.7675692120474595, + "grad_norm": 0.7838650941848755, + "learning_rate": 3.1311009582245525e-05, + "loss": 0.1404, + "step": 2905 + }, + { + "epoch": 1.7681776696075449, + "grad_norm": 0.9082717299461365, + "learning_rate": 3.1298963134738214e-05, + "loss": 0.2033, + "step": 2906 + }, + { + "epoch": 1.7687861271676302, + "grad_norm": 0.882872462272644, + "learning_rate": 3.128691512535059e-05, + "loss": 0.186, + "step": 2907 + }, + { + "epoch": 1.7693945847277153, + "grad_norm": 1.0027045011520386, + "learning_rate": 3.127486555707007e-05, + "loss": 0.1539, + "step": 2908 + }, + { + "epoch": 1.7700030422878004, + "grad_norm": 0.9138793349266052, + "learning_rate": 3.126281443288445e-05, + "loss": 0.1769, + "step": 2909 + }, + { + "epoch": 1.7706114998478855, + "grad_norm": 0.8712251782417297, + "learning_rate": 3.125076175578189e-05, + "loss": 0.1787, + "step": 2910 + }, + { + "epoch": 1.7712199574079708, + "grad_norm": 1.6176162958145142, + "learning_rate": 3.123870752875096e-05, + "loss": 0.174, + "step": 2911 + }, + { + "epoch": 1.771828414968056, + "grad_norm": 0.8051137924194336, + "learning_rate": 3.12266517547806e-05, + "loss": 0.177, + "step": 2912 + }, + { + "epoch": 1.7724368725281412, + "grad_norm": 0.8419429063796997, + "learning_rate": 3.121459443686015e-05, + "loss": 0.1795, + "step": 2913 + }, + { + "epoch": 1.7730453300882263, + "grad_norm": 0.8457537889480591, + "learning_rate": 3.120253557797932e-05, + "loss": 0.1616, + "step": 2914 + }, + { + "epoch": 1.7736537876483114, + "grad_norm": 0.7322502732276917, + "learning_rate": 3.1190475181128194e-05, + "loss": 0.1396, + "step": 2915 + }, + { + "epoch": 1.7742622452083967, + "grad_norm": 0.755520761013031, + "learning_rate": 3.1178413249297255e-05, + "loss": 0.1479, + "step": 2916 + }, + { + "epoch": 1.774870702768482, + "grad_norm": 0.8889976739883423, + "learning_rate": 3.116634978547737e-05, + "loss": 0.2115, + "step": 2917 + }, + { + "epoch": 1.7754791603285671, + "grad_norm": 0.8492110967636108, + "learning_rate": 3.115428479265975e-05, + "loss": 0.182, + "step": 2918 + }, + { + "epoch": 1.7760876178886522, + "grad_norm": 0.7823325395584106, + "learning_rate": 3.1142218273836025e-05, + "loss": 0.1563, + "step": 2919 + }, + { + "epoch": 1.7766960754487373, + "grad_norm": 0.774039626121521, + "learning_rate": 3.11301502319982e-05, + "loss": 0.1498, + "step": 2920 + }, + { + "epoch": 1.7773045330088226, + "grad_norm": 0.8219572901725769, + "learning_rate": 3.111808067013863e-05, + "loss": 0.1536, + "step": 2921 + }, + { + "epoch": 1.777912990568908, + "grad_norm": 0.7709328532218933, + "learning_rate": 3.1106009591250066e-05, + "loss": 0.135, + "step": 2922 + }, + { + "epoch": 1.778521448128993, + "grad_norm": 0.7555780410766602, + "learning_rate": 3.109393699832564e-05, + "loss": 0.1303, + "step": 2923 + }, + { + "epoch": 1.7791299056890781, + "grad_norm": 0.58537757396698, + "learning_rate": 3.108186289435884e-05, + "loss": 0.1188, + "step": 2924 + }, + { + "epoch": 1.7797383632491632, + "grad_norm": 0.8996820449829102, + "learning_rate": 3.106978728234354e-05, + "loss": 0.1952, + "step": 2925 + }, + { + "epoch": 1.7803468208092486, + "grad_norm": 1.680888056755066, + "learning_rate": 3.1057710165274004e-05, + "loss": 0.2346, + "step": 2926 + }, + { + "epoch": 1.7809552783693339, + "grad_norm": 0.6698862314224243, + "learning_rate": 3.1045631546144846e-05, + "loss": 0.1375, + "step": 2927 + }, + { + "epoch": 1.781563735929419, + "grad_norm": 0.7749894261360168, + "learning_rate": 3.1033551427951064e-05, + "loss": 0.1948, + "step": 2928 + }, + { + "epoch": 1.782172193489504, + "grad_norm": 0.8020423054695129, + "learning_rate": 3.102146981368801e-05, + "loss": 0.1814, + "step": 2929 + }, + { + "epoch": 1.7827806510495892, + "grad_norm": 0.8180800676345825, + "learning_rate": 3.100938670635143e-05, + "loss": 0.1584, + "step": 2930 + }, + { + "epoch": 1.7833891086096745, + "grad_norm": 1.1050705909729004, + "learning_rate": 3.099730210893743e-05, + "loss": 0.1605, + "step": 2931 + }, + { + "epoch": 1.7839975661697598, + "grad_norm": 0.8110726475715637, + "learning_rate": 3.0985216024442484e-05, + "loss": 0.1342, + "step": 2932 + }, + { + "epoch": 1.784606023729845, + "grad_norm": 0.7474027872085571, + "learning_rate": 3.097312845586345e-05, + "loss": 0.1579, + "step": 2933 + }, + { + "epoch": 1.78521448128993, + "grad_norm": 0.908333957195282, + "learning_rate": 3.096103940619752e-05, + "loss": 0.2047, + "step": 2934 + }, + { + "epoch": 1.785822938850015, + "grad_norm": 1.506729006767273, + "learning_rate": 3.0948948878442293e-05, + "loss": 0.189, + "step": 2935 + }, + { + "epoch": 1.7864313964101004, + "grad_norm": 0.7309430837631226, + "learning_rate": 3.093685687559571e-05, + "loss": 0.1383, + "step": 2936 + }, + { + "epoch": 1.7870398539701857, + "grad_norm": 0.866033136844635, + "learning_rate": 3.092476340065608e-05, + "loss": 0.1515, + "step": 2937 + }, + { + "epoch": 1.7876483115302708, + "grad_norm": 0.8974893689155579, + "learning_rate": 3.091266845662208e-05, + "loss": 0.1721, + "step": 2938 + }, + { + "epoch": 1.788256769090356, + "grad_norm": 0.8083369135856628, + "learning_rate": 3.090057204649276e-05, + "loss": 0.1714, + "step": 2939 + }, + { + "epoch": 1.788865226650441, + "grad_norm": 1.0619298219680786, + "learning_rate": 3.088847417326752e-05, + "loss": 0.172, + "step": 2940 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 0.9113779067993164, + "learning_rate": 3.087637483994612e-05, + "loss": 0.1679, + "step": 2941 + }, + { + "epoch": 1.7900821417706116, + "grad_norm": 0.8694355487823486, + "learning_rate": 3.086427404952871e-05, + "loss": 0.1498, + "step": 2942 + }, + { + "epoch": 1.7906905993306967, + "grad_norm": 0.789814829826355, + "learning_rate": 3.085217180501576e-05, + "loss": 0.1598, + "step": 2943 + }, + { + "epoch": 1.7912990568907818, + "grad_norm": 0.7697128653526306, + "learning_rate": 3.084006810940814e-05, + "loss": 0.1259, + "step": 2944 + }, + { + "epoch": 1.791907514450867, + "grad_norm": 0.8780056834220886, + "learning_rate": 3.082796296570706e-05, + "loss": 0.1966, + "step": 2945 + }, + { + "epoch": 1.7925159720109523, + "grad_norm": 0.790896475315094, + "learning_rate": 3.081585637691407e-05, + "loss": 0.1333, + "step": 2946 + }, + { + "epoch": 1.7931244295710376, + "grad_norm": 0.8985186815261841, + "learning_rate": 3.080374834603113e-05, + "loss": 0.127, + "step": 2947 + }, + { + "epoch": 1.7937328871311227, + "grad_norm": 0.9989727735519409, + "learning_rate": 3.079163887606051e-05, + "loss": 0.1706, + "step": 2948 + }, + { + "epoch": 1.7943413446912078, + "grad_norm": 0.7312899827957153, + "learning_rate": 3.077952797000485e-05, + "loss": 0.1619, + "step": 2949 + }, + { + "epoch": 1.7949498022512929, + "grad_norm": 0.9777790307998657, + "learning_rate": 3.0767415630867165e-05, + "loss": 0.178, + "step": 2950 + }, + { + "epoch": 1.7955582598113782, + "grad_norm": 1.0288338661193848, + "learning_rate": 3.0755301861650794e-05, + "loss": 0.1798, + "step": 2951 + }, + { + "epoch": 1.7961667173714635, + "grad_norm": 0.7798663973808289, + "learning_rate": 3.074318666535946e-05, + "loss": 0.1747, + "step": 2952 + }, + { + "epoch": 1.7967751749315486, + "grad_norm": 0.8773430585861206, + "learning_rate": 3.0731070044997215e-05, + "loss": 0.1644, + "step": 2953 + }, + { + "epoch": 1.7973836324916337, + "grad_norm": 0.828058123588562, + "learning_rate": 3.071895200356848e-05, + "loss": 0.163, + "step": 2954 + }, + { + "epoch": 1.7979920900517188, + "grad_norm": 0.8165954947471619, + "learning_rate": 3.070683254407803e-05, + "loss": 0.1583, + "step": 2955 + }, + { + "epoch": 1.798600547611804, + "grad_norm": 0.6299397349357605, + "learning_rate": 3.069471166953098e-05, + "loss": 0.1258, + "step": 2956 + }, + { + "epoch": 1.7992090051718894, + "grad_norm": 0.7792373895645142, + "learning_rate": 3.068258938293281e-05, + "loss": 0.1431, + "step": 2957 + }, + { + "epoch": 1.7998174627319745, + "grad_norm": 0.6819939613342285, + "learning_rate": 3.0670465687289325e-05, + "loss": 0.1071, + "step": 2958 + }, + { + "epoch": 1.8004259202920596, + "grad_norm": 1.1301064491271973, + "learning_rate": 3.065834058560671e-05, + "loss": 0.1438, + "step": 2959 + }, + { + "epoch": 1.8010343778521447, + "grad_norm": 1.0041139125823975, + "learning_rate": 3.064621408089148e-05, + "loss": 0.1515, + "step": 2960 + }, + { + "epoch": 1.80164283541223, + "grad_norm": 0.7390562295913696, + "learning_rate": 3.0634086176150504e-05, + "loss": 0.1325, + "step": 2961 + }, + { + "epoch": 1.8022512929723153, + "grad_norm": 0.8052049279212952, + "learning_rate": 3.0621956874391e-05, + "loss": 0.1349, + "step": 2962 + }, + { + "epoch": 1.8028597505324004, + "grad_norm": 0.7416874170303345, + "learning_rate": 3.060982617862053e-05, + "loss": 0.1505, + "step": 2963 + }, + { + "epoch": 1.8034682080924855, + "grad_norm": 1.035781979560852, + "learning_rate": 3.0597694091846985e-05, + "loss": 0.1632, + "step": 2964 + }, + { + "epoch": 1.8040766656525706, + "grad_norm": 1.187511682510376, + "learning_rate": 3.058556061707863e-05, + "loss": 0.26, + "step": 2965 + }, + { + "epoch": 1.804685123212656, + "grad_norm": 0.9462476372718811, + "learning_rate": 3.057342575732406e-05, + "loss": 0.1846, + "step": 2966 + }, + { + "epoch": 1.805293580772741, + "grad_norm": 0.9190717339515686, + "learning_rate": 3.0561289515592226e-05, + "loss": 0.1966, + "step": 2967 + }, + { + "epoch": 1.8059020383328264, + "grad_norm": 0.8364342451095581, + "learning_rate": 3.054915189489239e-05, + "loss": 0.1795, + "step": 2968 + }, + { + "epoch": 1.8065104958929115, + "grad_norm": 0.7401247620582581, + "learning_rate": 3.053701289823418e-05, + "loss": 0.1576, + "step": 2969 + }, + { + "epoch": 1.8071189534529966, + "grad_norm": 0.7658497095108032, + "learning_rate": 3.052487252862758e-05, + "loss": 0.1873, + "step": 2970 + }, + { + "epoch": 1.8077274110130819, + "grad_norm": 0.765739381313324, + "learning_rate": 3.0512730789082862e-05, + "loss": 0.1662, + "step": 2971 + }, + { + "epoch": 1.808335868573167, + "grad_norm": 0.7397434711456299, + "learning_rate": 3.0500587682610694e-05, + "loss": 0.154, + "step": 2972 + }, + { + "epoch": 1.8089443261332523, + "grad_norm": 0.8113225102424622, + "learning_rate": 3.0488443212222067e-05, + "loss": 0.1421, + "step": 2973 + }, + { + "epoch": 1.8095527836933374, + "grad_norm": 0.9399985074996948, + "learning_rate": 3.047629738092828e-05, + "loss": 0.1484, + "step": 2974 + }, + { + "epoch": 1.8101612412534225, + "grad_norm": 0.8929994106292725, + "learning_rate": 3.046415019174102e-05, + "loss": 0.1615, + "step": 2975 + }, + { + "epoch": 1.8107696988135078, + "grad_norm": 0.9099370837211609, + "learning_rate": 3.0452001647672256e-05, + "loss": 0.1902, + "step": 2976 + }, + { + "epoch": 1.811378156373593, + "grad_norm": 0.7755023837089539, + "learning_rate": 3.043985175173434e-05, + "loss": 0.1494, + "step": 2977 + }, + { + "epoch": 1.8119866139336782, + "grad_norm": 0.868893027305603, + "learning_rate": 3.042770050693994e-05, + "loss": 0.1345, + "step": 2978 + }, + { + "epoch": 1.8125950714937633, + "grad_norm": 0.9862670302391052, + "learning_rate": 3.0415547916302044e-05, + "loss": 0.1903, + "step": 2979 + }, + { + "epoch": 1.8132035290538484, + "grad_norm": 0.7265908718109131, + "learning_rate": 3.0403393982834e-05, + "loss": 0.1474, + "step": 2980 + }, + { + "epoch": 1.8138119866139337, + "grad_norm": 0.7525795698165894, + "learning_rate": 3.039123870954947e-05, + "loss": 0.1376, + "step": 2981 + }, + { + "epoch": 1.8144204441740188, + "grad_norm": 0.8074904680252075, + "learning_rate": 3.0379082099462454e-05, + "loss": 0.1699, + "step": 2982 + }, + { + "epoch": 1.8150289017341041, + "grad_norm": 0.7635090351104736, + "learning_rate": 3.0366924155587296e-05, + "loss": 0.1865, + "step": 2983 + }, + { + "epoch": 1.8156373592941892, + "grad_norm": 0.8797259330749512, + "learning_rate": 3.0354764880938647e-05, + "loss": 0.1832, + "step": 2984 + }, + { + "epoch": 1.8162458168542743, + "grad_norm": 0.7532181739807129, + "learning_rate": 3.0342604278531512e-05, + "loss": 0.1289, + "step": 2985 + }, + { + "epoch": 1.8168542744143596, + "grad_norm": 1.013737440109253, + "learning_rate": 3.0330442351381198e-05, + "loss": 0.1913, + "step": 2986 + }, + { + "epoch": 1.8174627319744447, + "grad_norm": 0.8055927753448486, + "learning_rate": 3.0318279102503367e-05, + "loss": 0.1773, + "step": 2987 + }, + { + "epoch": 1.81807118953453, + "grad_norm": 0.8605936765670776, + "learning_rate": 3.0306114534913998e-05, + "loss": 0.1499, + "step": 2988 + }, + { + "epoch": 1.8186796470946152, + "grad_norm": 0.8703776001930237, + "learning_rate": 3.0293948651629388e-05, + "loss": 0.2272, + "step": 2989 + }, + { + "epoch": 1.8192881046547003, + "grad_norm": 0.7393494844436646, + "learning_rate": 3.0281781455666182e-05, + "loss": 0.1363, + "step": 2990 + }, + { + "epoch": 1.8198965622147856, + "grad_norm": 0.9417717456817627, + "learning_rate": 3.026961295004133e-05, + "loss": 0.1626, + "step": 2991 + }, + { + "epoch": 1.8205050197748707, + "grad_norm": 0.72954922914505, + "learning_rate": 3.025744313777211e-05, + "loss": 0.1273, + "step": 2992 + }, + { + "epoch": 1.821113477334956, + "grad_norm": 0.8940945863723755, + "learning_rate": 3.0245272021876144e-05, + "loss": 0.1814, + "step": 2993 + }, + { + "epoch": 1.821721934895041, + "grad_norm": 0.8653423190116882, + "learning_rate": 3.023309960537134e-05, + "loss": 0.1743, + "step": 2994 + }, + { + "epoch": 1.8223303924551262, + "grad_norm": 0.9504120945930481, + "learning_rate": 3.0220925891275957e-05, + "loss": 0.18, + "step": 2995 + }, + { + "epoch": 1.8229388500152113, + "grad_norm": 0.9393471479415894, + "learning_rate": 3.0208750882608583e-05, + "loss": 0.1825, + "step": 2996 + }, + { + "epoch": 1.8235473075752966, + "grad_norm": 0.9305369257926941, + "learning_rate": 3.0196574582388095e-05, + "loss": 0.1618, + "step": 2997 + }, + { + "epoch": 1.824155765135382, + "grad_norm": 0.7652149796485901, + "learning_rate": 3.0184396993633718e-05, + "loss": 0.1558, + "step": 2998 + }, + { + "epoch": 1.824764222695467, + "grad_norm": 0.9691605567932129, + "learning_rate": 3.0172218119364975e-05, + "loss": 0.1435, + "step": 2999 + }, + { + "epoch": 1.825372680255552, + "grad_norm": 1.8901022672653198, + "learning_rate": 3.0160037962601727e-05, + "loss": 0.1259, + "step": 3000 + }, + { + "epoch": 1.8259811378156372, + "grad_norm": 0.7600100636482239, + "learning_rate": 3.014785652636416e-05, + "loss": 0.1442, + "step": 3001 + }, + { + "epoch": 1.8265895953757225, + "grad_norm": 0.904800295829773, + "learning_rate": 3.0135673813672734e-05, + "loss": 0.1831, + "step": 3002 + }, + { + "epoch": 1.8271980529358078, + "grad_norm": 0.9065991640090942, + "learning_rate": 3.0123489827548273e-05, + "loss": 0.1632, + "step": 3003 + }, + { + "epoch": 1.827806510495893, + "grad_norm": 0.7399230599403381, + "learning_rate": 3.0111304571011888e-05, + "loss": 0.1456, + "step": 3004 + }, + { + "epoch": 1.828414968055978, + "grad_norm": 0.6830241680145264, + "learning_rate": 3.0099118047085024e-05, + "loss": 0.1322, + "step": 3005 + }, + { + "epoch": 1.8290234256160631, + "grad_norm": 0.8357345461845398, + "learning_rate": 3.0086930258789426e-05, + "loss": 0.1839, + "step": 3006 + }, + { + "epoch": 1.8296318831761484, + "grad_norm": 0.8393163681030273, + "learning_rate": 3.0074741209147157e-05, + "loss": 0.1388, + "step": 3007 + }, + { + "epoch": 1.8302403407362338, + "grad_norm": 0.7641516923904419, + "learning_rate": 3.006255090118059e-05, + "loss": 0.134, + "step": 3008 + }, + { + "epoch": 1.8308487982963189, + "grad_norm": 0.8829279541969299, + "learning_rate": 3.005035933791242e-05, + "loss": 0.165, + "step": 3009 + }, + { + "epoch": 1.831457255856404, + "grad_norm": 0.7792665958404541, + "learning_rate": 3.0038166522365642e-05, + "loss": 0.1351, + "step": 3010 + }, + { + "epoch": 1.832065713416489, + "grad_norm": 0.7733790278434753, + "learning_rate": 3.0025972457563573e-05, + "loss": 0.1098, + "step": 3011 + }, + { + "epoch": 1.8326741709765744, + "grad_norm": 0.7731171250343323, + "learning_rate": 3.001377714652982e-05, + "loss": 0.1458, + "step": 3012 + }, + { + "epoch": 1.8332826285366597, + "grad_norm": 0.7515898942947388, + "learning_rate": 3.000158059228832e-05, + "loss": 0.1682, + "step": 3013 + }, + { + "epoch": 1.8338910860967448, + "grad_norm": 0.7409710884094238, + "learning_rate": 2.9989382797863313e-05, + "loss": 0.1413, + "step": 3014 + }, + { + "epoch": 1.8344995436568299, + "grad_norm": 0.8755896687507629, + "learning_rate": 2.997718376627934e-05, + "loss": 0.1678, + "step": 3015 + }, + { + "epoch": 1.835108001216915, + "grad_norm": 0.7442592978477478, + "learning_rate": 2.996498350056125e-05, + "loss": 0.1285, + "step": 3016 + }, + { + "epoch": 1.8357164587770003, + "grad_norm": 0.7244062423706055, + "learning_rate": 2.9952782003734202e-05, + "loss": 0.1296, + "step": 3017 + }, + { + "epoch": 1.8363249163370856, + "grad_norm": 0.7951245903968811, + "learning_rate": 2.994057927882366e-05, + "loss": 0.1638, + "step": 3018 + }, + { + "epoch": 1.8369333738971707, + "grad_norm": 0.778948962688446, + "learning_rate": 2.9928375328855396e-05, + "loss": 0.1594, + "step": 3019 + }, + { + "epoch": 1.8375418314572558, + "grad_norm": 1.0201570987701416, + "learning_rate": 2.9916170156855467e-05, + "loss": 0.1291, + "step": 3020 + }, + { + "epoch": 1.838150289017341, + "grad_norm": 0.8786885738372803, + "learning_rate": 2.9903963765850263e-05, + "loss": 0.1835, + "step": 3021 + }, + { + "epoch": 1.8387587465774262, + "grad_norm": 0.8748721480369568, + "learning_rate": 2.989175615886644e-05, + "loss": 0.1542, + "step": 3022 + }, + { + "epoch": 1.8393672041375115, + "grad_norm": 0.7832545042037964, + "learning_rate": 2.9879547338930997e-05, + "loss": 0.1622, + "step": 3023 + }, + { + "epoch": 1.8399756616975966, + "grad_norm": 0.902432382106781, + "learning_rate": 2.98673373090712e-05, + "loss": 0.1641, + "step": 3024 + }, + { + "epoch": 1.8405841192576817, + "grad_norm": 0.9937904477119446, + "learning_rate": 2.9855126072314638e-05, + "loss": 0.1468, + "step": 3025 + }, + { + "epoch": 1.8411925768177668, + "grad_norm": 0.8179795145988464, + "learning_rate": 2.984291363168918e-05, + "loss": 0.1675, + "step": 3026 + }, + { + "epoch": 1.8418010343778521, + "grad_norm": 0.7362605929374695, + "learning_rate": 2.9830699990222992e-05, + "loss": 0.1494, + "step": 3027 + }, + { + "epoch": 1.8424094919379375, + "grad_norm": 0.7124605178833008, + "learning_rate": 2.981848515094457e-05, + "loss": 0.117, + "step": 3028 + }, + { + "epoch": 1.8430179494980226, + "grad_norm": 0.9013100266456604, + "learning_rate": 2.9806269116882678e-05, + "loss": 0.1282, + "step": 3029 + }, + { + "epoch": 1.8436264070581077, + "grad_norm": 0.7831732630729675, + "learning_rate": 2.979405189106637e-05, + "loss": 0.1539, + "step": 3030 + }, + { + "epoch": 1.8442348646181927, + "grad_norm": 0.8050282597541809, + "learning_rate": 2.9781833476525022e-05, + "loss": 0.1714, + "step": 3031 + }, + { + "epoch": 1.844843322178278, + "grad_norm": 0.7580960988998413, + "learning_rate": 2.97696138762883e-05, + "loss": 0.133, + "step": 3032 + }, + { + "epoch": 1.8454517797383634, + "grad_norm": 0.5984554290771484, + "learning_rate": 2.9757393093386133e-05, + "loss": 0.1178, + "step": 3033 + }, + { + "epoch": 1.8460602372984485, + "grad_norm": 0.9737577438354492, + "learning_rate": 2.974517113084878e-05, + "loss": 0.2278, + "step": 3034 + }, + { + "epoch": 1.8466686948585336, + "grad_norm": 0.7336621284484863, + "learning_rate": 2.973294799170677e-05, + "loss": 0.1532, + "step": 3035 + }, + { + "epoch": 1.8472771524186187, + "grad_norm": 0.9183807373046875, + "learning_rate": 2.9720723678990943e-05, + "loss": 0.1445, + "step": 3036 + }, + { + "epoch": 1.847885609978704, + "grad_norm": 0.8194828629493713, + "learning_rate": 2.970849819573241e-05, + "loss": 0.1511, + "step": 3037 + }, + { + "epoch": 1.8484940675387893, + "grad_norm": 0.9400119185447693, + "learning_rate": 2.9696271544962583e-05, + "loss": 0.1763, + "step": 3038 + }, + { + "epoch": 1.8491025250988744, + "grad_norm": 0.9202710390090942, + "learning_rate": 2.968404372971316e-05, + "loss": 0.1323, + "step": 3039 + }, + { + "epoch": 1.8497109826589595, + "grad_norm": 1.0977171659469604, + "learning_rate": 2.9671814753016147e-05, + "loss": 0.1417, + "step": 3040 + }, + { + "epoch": 1.8503194402190446, + "grad_norm": 0.8560418486595154, + "learning_rate": 2.9659584617903795e-05, + "loss": 0.16, + "step": 3041 + }, + { + "epoch": 1.85092789777913, + "grad_norm": 0.9262466430664062, + "learning_rate": 2.9647353327408678e-05, + "loss": 0.1722, + "step": 3042 + }, + { + "epoch": 1.8515363553392152, + "grad_norm": 0.9393658638000488, + "learning_rate": 2.9635120884563654e-05, + "loss": 0.1889, + "step": 3043 + }, + { + "epoch": 1.8521448128993003, + "grad_norm": 0.6739933490753174, + "learning_rate": 2.9622887292401847e-05, + "loss": 0.1257, + "step": 3044 + }, + { + "epoch": 1.8527532704593854, + "grad_norm": 0.8402737975120544, + "learning_rate": 2.9610652553956688e-05, + "loss": 0.1763, + "step": 3045 + }, + { + "epoch": 1.8533617280194705, + "grad_norm": 0.7270654439926147, + "learning_rate": 2.959841667226187e-05, + "loss": 0.1321, + "step": 3046 + }, + { + "epoch": 1.8539701855795558, + "grad_norm": 0.8025201559066772, + "learning_rate": 2.9586179650351386e-05, + "loss": 0.1524, + "step": 3047 + }, + { + "epoch": 1.8545786431396412, + "grad_norm": 0.7017185091972351, + "learning_rate": 2.9573941491259523e-05, + "loss": 0.1274, + "step": 3048 + }, + { + "epoch": 1.8551871006997263, + "grad_norm": 0.7935231328010559, + "learning_rate": 2.9561702198020813e-05, + "loss": 0.1738, + "step": 3049 + }, + { + "epoch": 1.8557955582598114, + "grad_norm": 0.830694317817688, + "learning_rate": 2.9549461773670094e-05, + "loss": 0.1483, + "step": 3050 + }, + { + "epoch": 1.8564040158198964, + "grad_norm": 0.8252891302108765, + "learning_rate": 2.9537220221242496e-05, + "loss": 0.2033, + "step": 3051 + }, + { + "epoch": 1.8570124733799818, + "grad_norm": 1.0744190216064453, + "learning_rate": 2.9524977543773397e-05, + "loss": 0.1949, + "step": 3052 + }, + { + "epoch": 1.857620930940067, + "grad_norm": 0.7989039421081543, + "learning_rate": 2.9512733744298482e-05, + "loss": 0.1604, + "step": 3053 + }, + { + "epoch": 1.8582293885001522, + "grad_norm": 0.768959105014801, + "learning_rate": 2.9500488825853702e-05, + "loss": 0.1412, + "step": 3054 + }, + { + "epoch": 1.8588378460602373, + "grad_norm": 0.8371010422706604, + "learning_rate": 2.9488242791475272e-05, + "loss": 0.1422, + "step": 3055 + }, + { + "epoch": 1.8594463036203224, + "grad_norm": 0.8141250610351562, + "learning_rate": 2.947599564419971e-05, + "loss": 0.2141, + "step": 3056 + }, + { + "epoch": 1.8600547611804077, + "grad_norm": 0.7183147072792053, + "learning_rate": 2.9463747387063807e-05, + "loss": 0.1782, + "step": 3057 + }, + { + "epoch": 1.860663218740493, + "grad_norm": 0.7062293887138367, + "learning_rate": 2.94514980231046e-05, + "loss": 0.1377, + "step": 3058 + }, + { + "epoch": 1.861271676300578, + "grad_norm": 0.7800901532173157, + "learning_rate": 2.943924755535944e-05, + "loss": 0.16, + "step": 3059 + }, + { + "epoch": 1.8618801338606632, + "grad_norm": 0.8286892175674438, + "learning_rate": 2.9426995986865918e-05, + "loss": 0.1543, + "step": 3060 + }, + { + "epoch": 1.8624885914207483, + "grad_norm": 0.7443828582763672, + "learning_rate": 2.941474332066192e-05, + "loss": 0.1543, + "step": 3061 + }, + { + "epoch": 1.8630970489808336, + "grad_norm": 0.6537884473800659, + "learning_rate": 2.9402489559785594e-05, + "loss": 0.1595, + "step": 3062 + }, + { + "epoch": 1.863705506540919, + "grad_norm": 0.7740469574928284, + "learning_rate": 2.9390234707275355e-05, + "loss": 0.1529, + "step": 3063 + }, + { + "epoch": 1.864313964101004, + "grad_norm": 0.6976113319396973, + "learning_rate": 2.9377978766169912e-05, + "loss": 0.1339, + "step": 3064 + }, + { + "epoch": 1.8649224216610891, + "grad_norm": 0.7111392021179199, + "learning_rate": 2.9365721739508213e-05, + "loss": 0.142, + "step": 3065 + }, + { + "epoch": 1.8655308792211742, + "grad_norm": 0.671752393245697, + "learning_rate": 2.9353463630329493e-05, + "loss": 0.1487, + "step": 3066 + }, + { + "epoch": 1.8661393367812595, + "grad_norm": 0.8591635823249817, + "learning_rate": 2.9341204441673266e-05, + "loss": 0.1523, + "step": 3067 + }, + { + "epoch": 1.8667477943413449, + "grad_norm": 1.018031358718872, + "learning_rate": 2.932894417657927e-05, + "loss": 0.1422, + "step": 3068 + }, + { + "epoch": 1.86735625190143, + "grad_norm": 1.00792396068573, + "learning_rate": 2.9316682838087565e-05, + "loss": 0.1496, + "step": 3069 + }, + { + "epoch": 1.867964709461515, + "grad_norm": 0.8856237530708313, + "learning_rate": 2.930442042923845e-05, + "loss": 0.1572, + "step": 3070 + }, + { + "epoch": 1.8685731670216001, + "grad_norm": 0.9289006590843201, + "learning_rate": 2.929215695307248e-05, + "loss": 0.1744, + "step": 3071 + }, + { + "epoch": 1.8691816245816855, + "grad_norm": 0.7653868794441223, + "learning_rate": 2.9279892412630493e-05, + "loss": 0.17, + "step": 3072 + }, + { + "epoch": 1.8697900821417706, + "grad_norm": 0.700298011302948, + "learning_rate": 2.9267626810953584e-05, + "loss": 0.1628, + "step": 3073 + }, + { + "epoch": 1.8703985397018559, + "grad_norm": 0.7971607446670532, + "learning_rate": 2.9255360151083107e-05, + "loss": 0.1727, + "step": 3074 + }, + { + "epoch": 1.871006997261941, + "grad_norm": 0.8248562812805176, + "learning_rate": 2.924309243606069e-05, + "loss": 0.1263, + "step": 3075 + }, + { + "epoch": 1.871615454822026, + "grad_norm": 0.8164551854133606, + "learning_rate": 2.9230823668928198e-05, + "loss": 0.1476, + "step": 3076 + }, + { + "epoch": 1.8722239123821114, + "grad_norm": 0.7059839963912964, + "learning_rate": 2.9218553852727794e-05, + "loss": 0.1399, + "step": 3077 + }, + { + "epoch": 1.8728323699421965, + "grad_norm": 0.8123924732208252, + "learning_rate": 2.920628299050187e-05, + "loss": 0.1431, + "step": 3078 + }, + { + "epoch": 1.8734408275022818, + "grad_norm": 0.6520960927009583, + "learning_rate": 2.9194011085293093e-05, + "loss": 0.1423, + "step": 3079 + }, + { + "epoch": 1.874049285062367, + "grad_norm": 0.7921064496040344, + "learning_rate": 2.9181738140144382e-05, + "loss": 0.1541, + "step": 3080 + }, + { + "epoch": 1.874657742622452, + "grad_norm": 0.8729209303855896, + "learning_rate": 2.9169464158098914e-05, + "loss": 0.1544, + "step": 3081 + }, + { + "epoch": 1.8752662001825373, + "grad_norm": 0.745010256767273, + "learning_rate": 2.9157189142200124e-05, + "loss": 0.1548, + "step": 3082 + }, + { + "epoch": 1.8758746577426224, + "grad_norm": 0.7852745056152344, + "learning_rate": 2.914491309549171e-05, + "loss": 0.1595, + "step": 3083 + }, + { + "epoch": 1.8764831153027077, + "grad_norm": 0.7547739744186401, + "learning_rate": 2.9132636021017616e-05, + "loss": 0.14, + "step": 3084 + }, + { + "epoch": 1.8770915728627928, + "grad_norm": 0.9375334978103638, + "learning_rate": 2.912035792182205e-05, + "loss": 0.1149, + "step": 3085 + }, + { + "epoch": 1.877700030422878, + "grad_norm": 0.7835490703582764, + "learning_rate": 2.9108078800949456e-05, + "loss": 0.1577, + "step": 3086 + }, + { + "epoch": 1.8783084879829632, + "grad_norm": 0.7294179797172546, + "learning_rate": 2.9095798661444557e-05, + "loss": 0.1358, + "step": 3087 + }, + { + "epoch": 1.8789169455430483, + "grad_norm": 0.7559580206871033, + "learning_rate": 2.9083517506352315e-05, + "loss": 0.144, + "step": 3088 + }, + { + "epoch": 1.8795254031031337, + "grad_norm": 0.7099955677986145, + "learning_rate": 2.9071235338717935e-05, + "loss": 0.1271, + "step": 3089 + }, + { + "epoch": 1.8801338606632187, + "grad_norm": 0.8515664339065552, + "learning_rate": 2.9058952161586896e-05, + "loss": 0.1777, + "step": 3090 + }, + { + "epoch": 1.8807423182233038, + "grad_norm": 0.8556992411613464, + "learning_rate": 2.90466679780049e-05, + "loss": 0.1626, + "step": 3091 + }, + { + "epoch": 1.8813507757833892, + "grad_norm": 0.872840404510498, + "learning_rate": 2.9034382791017918e-05, + "loss": 0.1781, + "step": 3092 + }, + { + "epoch": 1.8819592333434743, + "grad_norm": 0.6905577182769775, + "learning_rate": 2.9022096603672166e-05, + "loss": 0.171, + "step": 3093 + }, + { + "epoch": 1.8825676909035596, + "grad_norm": 0.7203934788703918, + "learning_rate": 2.9009809419014107e-05, + "loss": 0.1366, + "step": 3094 + }, + { + "epoch": 1.8831761484636447, + "grad_norm": 0.8403582572937012, + "learning_rate": 2.8997521240090448e-05, + "loss": 0.1349, + "step": 3095 + }, + { + "epoch": 1.8837846060237298, + "grad_norm": 0.7579777836799622, + "learning_rate": 2.898523206994815e-05, + "loss": 0.1664, + "step": 3096 + }, + { + "epoch": 1.8843930635838149, + "grad_norm": 0.8160215616226196, + "learning_rate": 2.8972941911634406e-05, + "loss": 0.1475, + "step": 3097 + }, + { + "epoch": 1.8850015211439002, + "grad_norm": 0.7258248925209045, + "learning_rate": 2.8960650768196672e-05, + "loss": 0.1625, + "step": 3098 + }, + { + "epoch": 1.8856099787039855, + "grad_norm": 0.9643956422805786, + "learning_rate": 2.894835864268263e-05, + "loss": 0.1672, + "step": 3099 + }, + { + "epoch": 1.8862184362640706, + "grad_norm": 0.7425087690353394, + "learning_rate": 2.8936065538140228e-05, + "loss": 0.1373, + "step": 3100 + }, + { + "epoch": 1.8868268938241557, + "grad_norm": 0.8233270645141602, + "learning_rate": 2.8923771457617634e-05, + "loss": 0.2127, + "step": 3101 + }, + { + "epoch": 1.8874353513842408, + "grad_norm": 0.7719386219978333, + "learning_rate": 2.891147640416327e-05, + "loss": 0.1492, + "step": 3102 + }, + { + "epoch": 1.888043808944326, + "grad_norm": 0.8687146902084351, + "learning_rate": 2.889918038082579e-05, + "loss": 0.1577, + "step": 3103 + }, + { + "epoch": 1.8886522665044114, + "grad_norm": 0.8466896414756775, + "learning_rate": 2.8886883390654106e-05, + "loss": 0.1688, + "step": 3104 + }, + { + "epoch": 1.8892607240644965, + "grad_norm": 0.7307385206222534, + "learning_rate": 2.8874585436697355e-05, + "loss": 0.0943, + "step": 3105 + }, + { + "epoch": 1.8898691816245816, + "grad_norm": 0.9252493977546692, + "learning_rate": 2.8862286522004916e-05, + "loss": 0.1971, + "step": 3106 + }, + { + "epoch": 1.8904776391846667, + "grad_norm": 0.7721157073974609, + "learning_rate": 2.8849986649626405e-05, + "loss": 0.1684, + "step": 3107 + }, + { + "epoch": 1.891086096744752, + "grad_norm": 0.8666673302650452, + "learning_rate": 2.8837685822611682e-05, + "loss": 0.1636, + "step": 3108 + }, + { + "epoch": 1.8916945543048374, + "grad_norm": 0.8242167234420776, + "learning_rate": 2.882538404401084e-05, + "loss": 0.1706, + "step": 3109 + }, + { + "epoch": 1.8923030118649224, + "grad_norm": 0.8100599050521851, + "learning_rate": 2.8813081316874197e-05, + "loss": 0.1448, + "step": 3110 + }, + { + "epoch": 1.8929114694250075, + "grad_norm": 0.7981263399124146, + "learning_rate": 2.8800777644252336e-05, + "loss": 0.1324, + "step": 3111 + }, + { + "epoch": 1.8935199269850926, + "grad_norm": 0.7660496830940247, + "learning_rate": 2.8788473029196033e-05, + "loss": 0.1506, + "step": 3112 + }, + { + "epoch": 1.894128384545178, + "grad_norm": 0.8053763508796692, + "learning_rate": 2.877616747475634e-05, + "loss": 0.1377, + "step": 3113 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.7818877696990967, + "learning_rate": 2.8763860983984502e-05, + "loss": 0.1491, + "step": 3114 + }, + { + "epoch": 1.8953452996653484, + "grad_norm": 0.8091548085212708, + "learning_rate": 2.8751553559932033e-05, + "loss": 0.1398, + "step": 3115 + }, + { + "epoch": 1.8959537572254335, + "grad_norm": 0.8318586349487305, + "learning_rate": 2.873924520565065e-05, + "loss": 0.1602, + "step": 3116 + }, + { + "epoch": 1.8965622147855186, + "grad_norm": 0.8310206532478333, + "learning_rate": 2.872693592419231e-05, + "loss": 0.1797, + "step": 3117 + }, + { + "epoch": 1.8971706723456039, + "grad_norm": 0.802715003490448, + "learning_rate": 2.8714625718609213e-05, + "loss": 0.156, + "step": 3118 + }, + { + "epoch": 1.8977791299056892, + "grad_norm": 0.7569795846939087, + "learning_rate": 2.8702314591953776e-05, + "loss": 0.1636, + "step": 3119 + }, + { + "epoch": 1.8983875874657743, + "grad_norm": 0.6739256381988525, + "learning_rate": 2.8690002547278633e-05, + "loss": 0.1341, + "step": 3120 + }, + { + "epoch": 1.8989960450258594, + "grad_norm": 0.8993747234344482, + "learning_rate": 2.8677689587636668e-05, + "loss": 0.1541, + "step": 3121 + }, + { + "epoch": 1.8996045025859445, + "grad_norm": 0.7927727699279785, + "learning_rate": 2.866537571608098e-05, + "loss": 0.1603, + "step": 3122 + }, + { + "epoch": 1.9002129601460298, + "grad_norm": 0.7829379439353943, + "learning_rate": 2.8653060935664888e-05, + "loss": 0.1674, + "step": 3123 + }, + { + "epoch": 1.9008214177061151, + "grad_norm": 0.8112666606903076, + "learning_rate": 2.8640745249441958e-05, + "loss": 0.1731, + "step": 3124 + }, + { + "epoch": 1.9014298752662002, + "grad_norm": 0.8178973197937012, + "learning_rate": 2.8628428660465957e-05, + "loss": 0.1408, + "step": 3125 + }, + { + "epoch": 1.9020383328262853, + "grad_norm": 0.7954561114311218, + "learning_rate": 2.8616111171790894e-05, + "loss": 0.1557, + "step": 3126 + }, + { + "epoch": 1.9026467903863704, + "grad_norm": 0.8346177935600281, + "learning_rate": 2.860379278647098e-05, + "loss": 0.1281, + "step": 3127 + }, + { + "epoch": 1.9032552479464557, + "grad_norm": 0.7774602174758911, + "learning_rate": 2.8591473507560667e-05, + "loss": 0.1308, + "step": 3128 + }, + { + "epoch": 1.903863705506541, + "grad_norm": 0.8558622598648071, + "learning_rate": 2.8579153338114635e-05, + "loss": 0.1745, + "step": 3129 + }, + { + "epoch": 1.9044721630666261, + "grad_norm": 0.8205204606056213, + "learning_rate": 2.856683228118775e-05, + "loss": 0.1451, + "step": 3130 + }, + { + "epoch": 1.9050806206267112, + "grad_norm": 0.8976646661758423, + "learning_rate": 2.8554510339835144e-05, + "loss": 0.1594, + "step": 3131 + }, + { + "epoch": 1.9056890781867963, + "grad_norm": 0.6619974970817566, + "learning_rate": 2.8542187517112124e-05, + "loss": 0.1453, + "step": 3132 + }, + { + "epoch": 1.9062975357468817, + "grad_norm": 0.9032138586044312, + "learning_rate": 2.8529863816074244e-05, + "loss": 0.1222, + "step": 3133 + }, + { + "epoch": 1.906905993306967, + "grad_norm": 0.7461523413658142, + "learning_rate": 2.851753923977728e-05, + "loss": 0.129, + "step": 3134 + }, + { + "epoch": 1.907514450867052, + "grad_norm": 0.8470805883407593, + "learning_rate": 2.850521379127719e-05, + "loss": 0.1702, + "step": 3135 + }, + { + "epoch": 1.9081229084271372, + "grad_norm": 0.7833938002586365, + "learning_rate": 2.8492887473630193e-05, + "loss": 0.1323, + "step": 3136 + }, + { + "epoch": 1.9087313659872223, + "grad_norm": 0.8622560501098633, + "learning_rate": 2.848056028989269e-05, + "loss": 0.1663, + "step": 3137 + }, + { + "epoch": 1.9093398235473076, + "grad_norm": 0.7503676414489746, + "learning_rate": 2.8468232243121313e-05, + "loss": 0.1282, + "step": 3138 + }, + { + "epoch": 1.909948281107393, + "grad_norm": 0.7636743187904358, + "learning_rate": 2.8455903336372902e-05, + "loss": 0.1245, + "step": 3139 + }, + { + "epoch": 1.910556738667478, + "grad_norm": 0.8729671239852905, + "learning_rate": 2.844357357270451e-05, + "loss": 0.1747, + "step": 3140 + }, + { + "epoch": 1.911165196227563, + "grad_norm": 0.6713892817497253, + "learning_rate": 2.8431242955173408e-05, + "loss": 0.1449, + "step": 3141 + }, + { + "epoch": 1.9117736537876482, + "grad_norm": 0.7802202105522156, + "learning_rate": 2.841891148683708e-05, + "loss": 0.1552, + "step": 3142 + }, + { + "epoch": 1.9123821113477335, + "grad_norm": 0.8736104369163513, + "learning_rate": 2.8406579170753205e-05, + "loss": 0.1342, + "step": 3143 + }, + { + "epoch": 1.9129905689078188, + "grad_norm": 0.730114758014679, + "learning_rate": 2.8394246009979697e-05, + "loss": 0.1631, + "step": 3144 + }, + { + "epoch": 1.913599026467904, + "grad_norm": 0.7180743217468262, + "learning_rate": 2.8381912007574653e-05, + "loss": 0.124, + "step": 3145 + }, + { + "epoch": 1.914207484027989, + "grad_norm": 0.6744349002838135, + "learning_rate": 2.836957716659639e-05, + "loss": 0.1257, + "step": 3146 + }, + { + "epoch": 1.914815941588074, + "grad_norm": 0.8679302930831909, + "learning_rate": 2.8357241490103447e-05, + "loss": 0.1492, + "step": 3147 + }, + { + "epoch": 1.9154243991481594, + "grad_norm": 0.821484386920929, + "learning_rate": 2.8344904981154548e-05, + "loss": 0.1498, + "step": 3148 + }, + { + "epoch": 1.9160328567082447, + "grad_norm": 0.8600984811782837, + "learning_rate": 2.833256764280864e-05, + "loss": 0.1466, + "step": 3149 + }, + { + "epoch": 1.9166413142683298, + "grad_norm": 0.7836093902587891, + "learning_rate": 2.832022947812486e-05, + "loss": 0.1434, + "step": 3150 + }, + { + "epoch": 1.917249771828415, + "grad_norm": 0.8246343731880188, + "learning_rate": 2.8307890490162564e-05, + "loss": 0.1209, + "step": 3151 + }, + { + "epoch": 1.9178582293885, + "grad_norm": 0.8240684866905212, + "learning_rate": 2.8295550681981314e-05, + "loss": 0.1811, + "step": 3152 + }, + { + "epoch": 1.9184666869485854, + "grad_norm": 0.7745963931083679, + "learning_rate": 2.828321005664085e-05, + "loss": 0.1375, + "step": 3153 + }, + { + "epoch": 1.9190751445086707, + "grad_norm": 0.8456833362579346, + "learning_rate": 2.827086861720115e-05, + "loss": 0.1742, + "step": 3154 + }, + { + "epoch": 1.9196836020687558, + "grad_norm": 0.8081454634666443, + "learning_rate": 2.8258526366722364e-05, + "loss": 0.1807, + "step": 3155 + }, + { + "epoch": 1.9202920596288409, + "grad_norm": 0.7536942362785339, + "learning_rate": 2.8246183308264862e-05, + "loss": 0.144, + "step": 3156 + }, + { + "epoch": 1.920900517188926, + "grad_norm": 0.7224938869476318, + "learning_rate": 2.8233839444889216e-05, + "loss": 0.1139, + "step": 3157 + }, + { + "epoch": 1.9215089747490113, + "grad_norm": 0.6938396692276001, + "learning_rate": 2.822149477965617e-05, + "loss": 0.1324, + "step": 3158 + }, + { + "epoch": 1.9221174323090966, + "grad_norm": 0.8000092506408691, + "learning_rate": 2.8209149315626697e-05, + "loss": 0.1448, + "step": 3159 + }, + { + "epoch": 1.9227258898691817, + "grad_norm": 0.6927624344825745, + "learning_rate": 2.8196803055861964e-05, + "loss": 0.1666, + "step": 3160 + }, + { + "epoch": 1.9233343474292668, + "grad_norm": 0.6752164363861084, + "learning_rate": 2.818445600342332e-05, + "loss": 0.1415, + "step": 3161 + }, + { + "epoch": 1.9239428049893519, + "grad_norm": 0.8532926440238953, + "learning_rate": 2.817210816137232e-05, + "loss": 0.2002, + "step": 3162 + }, + { + "epoch": 1.9245512625494372, + "grad_norm": 0.6797624230384827, + "learning_rate": 2.815975953277072e-05, + "loss": 0.1286, + "step": 3163 + }, + { + "epoch": 1.9251597201095225, + "grad_norm": 0.7706435918807983, + "learning_rate": 2.8147410120680455e-05, + "loss": 0.1423, + "step": 3164 + }, + { + "epoch": 1.9257681776696076, + "grad_norm": 0.7351078391075134, + "learning_rate": 2.8135059928163683e-05, + "loss": 0.1081, + "step": 3165 + }, + { + "epoch": 1.9263766352296927, + "grad_norm": 0.7624043226242065, + "learning_rate": 2.812270895828271e-05, + "loss": 0.1184, + "step": 3166 + }, + { + "epoch": 1.9269850927897778, + "grad_norm": 1.0484083890914917, + "learning_rate": 2.811035721410008e-05, + "loss": 0.169, + "step": 3167 + }, + { + "epoch": 1.9275935503498631, + "grad_norm": 0.7948042750358582, + "learning_rate": 2.8098004698678517e-05, + "loss": 0.1207, + "step": 3168 + }, + { + "epoch": 1.9282020079099484, + "grad_norm": 0.7036427855491638, + "learning_rate": 2.8085651415080903e-05, + "loss": 0.1205, + "step": 3169 + }, + { + "epoch": 1.9288104654700335, + "grad_norm": 0.7674738168716431, + "learning_rate": 2.8073297366370365e-05, + "loss": 0.1141, + "step": 3170 + }, + { + "epoch": 1.9294189230301186, + "grad_norm": 0.932759702205658, + "learning_rate": 2.806094255561018e-05, + "loss": 0.1654, + "step": 3171 + }, + { + "epoch": 1.9300273805902037, + "grad_norm": 2.075066566467285, + "learning_rate": 2.804858698586383e-05, + "loss": 0.1699, + "step": 3172 + }, + { + "epoch": 1.930635838150289, + "grad_norm": 0.8003236055374146, + "learning_rate": 2.8036230660194972e-05, + "loss": 0.1529, + "step": 3173 + }, + { + "epoch": 1.9312442957103741, + "grad_norm": 0.8078186511993408, + "learning_rate": 2.8023873581667476e-05, + "loss": 0.1817, + "step": 3174 + }, + { + "epoch": 1.9318527532704595, + "grad_norm": 0.8531659841537476, + "learning_rate": 2.8011515753345363e-05, + "loss": 0.1293, + "step": 3175 + }, + { + "epoch": 1.9324612108305446, + "grad_norm": 0.642602264881134, + "learning_rate": 2.7999157178292873e-05, + "loss": 0.1312, + "step": 3176 + }, + { + "epoch": 1.9330696683906297, + "grad_norm": 0.780268132686615, + "learning_rate": 2.798679785957442e-05, + "loss": 0.1632, + "step": 3177 + }, + { + "epoch": 1.933678125950715, + "grad_norm": 0.8036219477653503, + "learning_rate": 2.797443780025459e-05, + "loss": 0.1647, + "step": 3178 + }, + { + "epoch": 1.9342865835108, + "grad_norm": 0.6498638391494751, + "learning_rate": 2.7962077003398162e-05, + "loss": 0.1176, + "step": 3179 + }, + { + "epoch": 1.9348950410708854, + "grad_norm": 0.6818913817405701, + "learning_rate": 2.794971547207011e-05, + "loss": 0.1363, + "step": 3180 + }, + { + "epoch": 1.9355034986309705, + "grad_norm": 0.8026784062385559, + "learning_rate": 2.793735320933556e-05, + "loss": 0.1645, + "step": 3181 + }, + { + "epoch": 1.9361119561910556, + "grad_norm": 0.7087329030036926, + "learning_rate": 2.7924990218259862e-05, + "loss": 0.1045, + "step": 3182 + }, + { + "epoch": 1.936720413751141, + "grad_norm": 0.7351903319358826, + "learning_rate": 2.79126265019085e-05, + "loss": 0.1643, + "step": 3183 + }, + { + "epoch": 1.937328871311226, + "grad_norm": 0.7057482004165649, + "learning_rate": 2.7900262063347172e-05, + "loss": 0.1522, + "step": 3184 + }, + { + "epoch": 1.9379373288713113, + "grad_norm": 0.8340151309967041, + "learning_rate": 2.7887896905641746e-05, + "loss": 0.152, + "step": 3185 + }, + { + "epoch": 1.9385457864313964, + "grad_norm": 0.8494538068771362, + "learning_rate": 2.7875531031858253e-05, + "loss": 0.1919, + "step": 3186 + }, + { + "epoch": 1.9391542439914815, + "grad_norm": 0.6910502314567566, + "learning_rate": 2.7863164445062928e-05, + "loss": 0.1377, + "step": 3187 + }, + { + "epoch": 1.9397627015515668, + "grad_norm": 0.7750650644302368, + "learning_rate": 2.785079714832216e-05, + "loss": 0.1633, + "step": 3188 + }, + { + "epoch": 1.940371159111652, + "grad_norm": 0.7941158413887024, + "learning_rate": 2.7838429144702528e-05, + "loss": 0.1317, + "step": 3189 + }, + { + "epoch": 1.9409796166717372, + "grad_norm": 0.7237272262573242, + "learning_rate": 2.7826060437270786e-05, + "loss": 0.1287, + "step": 3190 + }, + { + "epoch": 1.9415880742318223, + "grad_norm": 0.6614869236946106, + "learning_rate": 2.781369102909384e-05, + "loss": 0.0973, + "step": 3191 + }, + { + "epoch": 1.9421965317919074, + "grad_norm": 0.8167053461074829, + "learning_rate": 2.78013209232388e-05, + "loss": 0.1562, + "step": 3192 + }, + { + "epoch": 1.9428049893519928, + "grad_norm": 0.6749504804611206, + "learning_rate": 2.7788950122772944e-05, + "loss": 0.1205, + "step": 3193 + }, + { + "epoch": 1.9434134469120778, + "grad_norm": 0.729088544845581, + "learning_rate": 2.77765786307637e-05, + "loss": 0.1377, + "step": 3194 + }, + { + "epoch": 1.9440219044721632, + "grad_norm": 0.7123382687568665, + "learning_rate": 2.7764206450278697e-05, + "loss": 0.1255, + "step": 3195 + }, + { + "epoch": 1.9446303620322483, + "grad_norm": 0.7942476272583008, + "learning_rate": 2.7751833584385707e-05, + "loss": 0.1207, + "step": 3196 + }, + { + "epoch": 1.9452388195923334, + "grad_norm": 0.6616600751876831, + "learning_rate": 2.7739460036152686e-05, + "loss": 0.1114, + "step": 3197 + }, + { + "epoch": 1.9458472771524185, + "grad_norm": 0.7162274718284607, + "learning_rate": 2.772708580864777e-05, + "loss": 0.1137, + "step": 3198 + }, + { + "epoch": 1.9464557347125038, + "grad_norm": 0.7372970581054688, + "learning_rate": 2.7714710904939238e-05, + "loss": 0.1576, + "step": 3199 + }, + { + "epoch": 1.947064192272589, + "grad_norm": 0.8062869906425476, + "learning_rate": 2.7702335328095562e-05, + "loss": 0.1603, + "step": 3200 + }, + { + "epoch": 1.9476726498326742, + "grad_norm": 0.7464812397956848, + "learning_rate": 2.7689959081185355e-05, + "loss": 0.1139, + "step": 3201 + }, + { + "epoch": 1.9482811073927593, + "grad_norm": 0.7284194231033325, + "learning_rate": 2.7677582167277428e-05, + "loss": 0.1258, + "step": 3202 + }, + { + "epoch": 1.9488895649528444, + "grad_norm": 0.8625699877738953, + "learning_rate": 2.766520458944073e-05, + "loss": 0.164, + "step": 3203 + }, + { + "epoch": 1.9494980225129297, + "grad_norm": 0.8282449841499329, + "learning_rate": 2.7652826350744375e-05, + "loss": 0.136, + "step": 3204 + }, + { + "epoch": 1.950106480073015, + "grad_norm": 0.8043208122253418, + "learning_rate": 2.7640447454257667e-05, + "loss": 0.1447, + "step": 3205 + }, + { + "epoch": 1.9507149376331, + "grad_norm": 0.7022130489349365, + "learning_rate": 2.7628067903050052e-05, + "loss": 0.1197, + "step": 3206 + }, + { + "epoch": 1.9513233951931852, + "grad_norm": 0.7067160606384277, + "learning_rate": 2.761568770019114e-05, + "loss": 0.1394, + "step": 3207 + }, + { + "epoch": 1.9519318527532703, + "grad_norm": 0.779850423336029, + "learning_rate": 2.7603306848750703e-05, + "loss": 0.1669, + "step": 3208 + }, + { + "epoch": 1.9525403103133556, + "grad_norm": 0.765367865562439, + "learning_rate": 2.759092535179868e-05, + "loss": 0.1147, + "step": 3209 + }, + { + "epoch": 1.953148767873441, + "grad_norm": 0.6569228172302246, + "learning_rate": 2.757854321240516e-05, + "loss": 0.1039, + "step": 3210 + }, + { + "epoch": 1.953757225433526, + "grad_norm": 0.7526434063911438, + "learning_rate": 2.756616043364041e-05, + "loss": 0.1418, + "step": 3211 + }, + { + "epoch": 1.9543656829936111, + "grad_norm": 0.7364466786384583, + "learning_rate": 2.7553777018574834e-05, + "loss": 0.1265, + "step": 3212 + }, + { + "epoch": 1.9549741405536962, + "grad_norm": 0.7600389122962952, + "learning_rate": 2.7541392970279e-05, + "loss": 0.1372, + "step": 3213 + }, + { + "epoch": 1.9555825981137815, + "grad_norm": 0.7566266059875488, + "learning_rate": 2.7529008291823642e-05, + "loss": 0.1094, + "step": 3214 + }, + { + "epoch": 1.9561910556738669, + "grad_norm": 0.8381999135017395, + "learning_rate": 2.7516622986279638e-05, + "loss": 0.1618, + "step": 3215 + }, + { + "epoch": 1.956799513233952, + "grad_norm": 0.8576135039329529, + "learning_rate": 2.7504237056718042e-05, + "loss": 0.149, + "step": 3216 + }, + { + "epoch": 1.957407970794037, + "grad_norm": 0.7162174582481384, + "learning_rate": 2.7491850506210027e-05, + "loss": 0.129, + "step": 3217 + }, + { + "epoch": 1.9580164283541222, + "grad_norm": 0.6965352892875671, + "learning_rate": 2.747946333782696e-05, + "loss": 0.0866, + "step": 3218 + }, + { + "epoch": 1.9586248859142075, + "grad_norm": 0.7965853810310364, + "learning_rate": 2.7467075554640326e-05, + "loss": 0.1564, + "step": 3219 + }, + { + "epoch": 1.9592333434742928, + "grad_norm": 0.6897141933441162, + "learning_rate": 2.745468715972179e-05, + "loss": 0.1425, + "step": 3220 + }, + { + "epoch": 1.9598418010343779, + "grad_norm": 0.6696418523788452, + "learning_rate": 2.744229815614316e-05, + "loss": 0.1209, + "step": 3221 + }, + { + "epoch": 1.960450258594463, + "grad_norm": 0.7108597159385681, + "learning_rate": 2.742990854697638e-05, + "loss": 0.1068, + "step": 3222 + }, + { + "epoch": 1.961058716154548, + "grad_norm": 0.716172456741333, + "learning_rate": 2.741751833529357e-05, + "loss": 0.1491, + "step": 3223 + }, + { + "epoch": 1.9616671737146334, + "grad_norm": 0.8014208674430847, + "learning_rate": 2.7405127524166973e-05, + "loss": 0.1468, + "step": 3224 + }, + { + "epoch": 1.9622756312747187, + "grad_norm": 0.7219249606132507, + "learning_rate": 2.7392736116669005e-05, + "loss": 0.123, + "step": 3225 + }, + { + "epoch": 1.9628840888348038, + "grad_norm": 0.8846964240074158, + "learning_rate": 2.738034411587222e-05, + "loss": 0.1553, + "step": 3226 + }, + { + "epoch": 1.963492546394889, + "grad_norm": 0.7260108590126038, + "learning_rate": 2.73679515248493e-05, + "loss": 0.1358, + "step": 3227 + }, + { + "epoch": 1.964101003954974, + "grad_norm": 0.7410035133361816, + "learning_rate": 2.735555834667311e-05, + "loss": 0.1261, + "step": 3228 + }, + { + "epoch": 1.9647094615150593, + "grad_norm": 0.8541549444198608, + "learning_rate": 2.7343164584416637e-05, + "loss": 0.1394, + "step": 3229 + }, + { + "epoch": 1.9653179190751446, + "grad_norm": 0.7482866048812866, + "learning_rate": 2.7330770241153008e-05, + "loss": 0.1007, + "step": 3230 + }, + { + "epoch": 1.9659263766352297, + "grad_norm": 0.9279438853263855, + "learning_rate": 2.7318375319955512e-05, + "loss": 0.1857, + "step": 3231 + }, + { + "epoch": 1.9665348341953148, + "grad_norm": 0.815904438495636, + "learning_rate": 2.7305979823897576e-05, + "loss": 0.1255, + "step": 3232 + }, + { + "epoch": 1.9671432917554, + "grad_norm": 0.8731521368026733, + "learning_rate": 2.7293583756052755e-05, + "loss": 0.1447, + "step": 3233 + }, + { + "epoch": 1.9677517493154852, + "grad_norm": 0.8212949633598328, + "learning_rate": 2.7281187119494773e-05, + "loss": 0.1359, + "step": 3234 + }, + { + "epoch": 1.9683602068755706, + "grad_norm": 0.7952809929847717, + "learning_rate": 2.726878991729746e-05, + "loss": 0.1302, + "step": 3235 + }, + { + "epoch": 1.9689686644356557, + "grad_norm": 0.8877054452896118, + "learning_rate": 2.7256392152534825e-05, + "loss": 0.1527, + "step": 3236 + }, + { + "epoch": 1.9695771219957408, + "grad_norm": 0.844585120677948, + "learning_rate": 2.724399382828098e-05, + "loss": 0.1602, + "step": 3237 + }, + { + "epoch": 1.9701855795558258, + "grad_norm": 0.7585496306419373, + "learning_rate": 2.72315949476102e-05, + "loss": 0.148, + "step": 3238 + }, + { + "epoch": 1.9707940371159112, + "grad_norm": 0.7337778210639954, + "learning_rate": 2.7219195513596894e-05, + "loss": 0.1213, + "step": 3239 + }, + { + "epoch": 1.9714024946759965, + "grad_norm": 0.8415425419807434, + "learning_rate": 2.72067955293156e-05, + "loss": 0.1589, + "step": 3240 + }, + { + "epoch": 1.9720109522360816, + "grad_norm": 0.7917572855949402, + "learning_rate": 2.7194394997841e-05, + "loss": 0.1552, + "step": 3241 + }, + { + "epoch": 1.9726194097961667, + "grad_norm": 0.7449464201927185, + "learning_rate": 2.7181993922247907e-05, + "loss": 0.1232, + "step": 3242 + }, + { + "epoch": 1.9732278673562518, + "grad_norm": 0.956392228603363, + "learning_rate": 2.7169592305611262e-05, + "loss": 0.1336, + "step": 3243 + }, + { + "epoch": 1.973836324916337, + "grad_norm": 0.8234003782272339, + "learning_rate": 2.715719015100617e-05, + "loss": 0.1144, + "step": 3244 + }, + { + "epoch": 1.9744447824764224, + "grad_norm": 0.7959507703781128, + "learning_rate": 2.714478746150783e-05, + "loss": 0.1309, + "step": 3245 + }, + { + "epoch": 1.9750532400365075, + "grad_norm": 0.8472030162811279, + "learning_rate": 2.71323842401916e-05, + "loss": 0.1262, + "step": 3246 + }, + { + "epoch": 1.9756616975965926, + "grad_norm": 0.9578633904457092, + "learning_rate": 2.711998049013297e-05, + "loss": 0.1577, + "step": 3247 + }, + { + "epoch": 1.9762701551566777, + "grad_norm": 0.6737805604934692, + "learning_rate": 2.710757621440753e-05, + "loss": 0.1057, + "step": 3248 + }, + { + "epoch": 1.976878612716763, + "grad_norm": 0.8866187334060669, + "learning_rate": 2.709517141609105e-05, + "loss": 0.1305, + "step": 3249 + }, + { + "epoch": 1.9774870702768483, + "grad_norm": 0.7365625500679016, + "learning_rate": 2.7082766098259377e-05, + "loss": 0.1691, + "step": 3250 + }, + { + "epoch": 1.9780955278369334, + "grad_norm": 0.7187755703926086, + "learning_rate": 2.7070360263988526e-05, + "loss": 0.1036, + "step": 3251 + }, + { + "epoch": 1.9787039853970185, + "grad_norm": 0.7651616334915161, + "learning_rate": 2.7057953916354638e-05, + "loss": 0.1153, + "step": 3252 + }, + { + "epoch": 1.9793124429571036, + "grad_norm": 0.7826346158981323, + "learning_rate": 2.7045547058433953e-05, + "loss": 0.1409, + "step": 3253 + }, + { + "epoch": 1.979920900517189, + "grad_norm": 0.7577517032623291, + "learning_rate": 2.7033139693302854e-05, + "loss": 0.1137, + "step": 3254 + }, + { + "epoch": 1.9805293580772743, + "grad_norm": 0.7163174152374268, + "learning_rate": 2.7020731824037865e-05, + "loss": 0.1258, + "step": 3255 + }, + { + "epoch": 1.9811378156373594, + "grad_norm": 0.7855852246284485, + "learning_rate": 2.7008323453715607e-05, + "loss": 0.1282, + "step": 3256 + }, + { + "epoch": 1.9817462731974445, + "grad_norm": 0.8346313834190369, + "learning_rate": 2.6995914585412847e-05, + "loss": 0.126, + "step": 3257 + }, + { + "epoch": 1.9823547307575295, + "grad_norm": 0.7510027885437012, + "learning_rate": 2.6983505222206456e-05, + "loss": 0.1334, + "step": 3258 + }, + { + "epoch": 1.9829631883176149, + "grad_norm": 0.8980780839920044, + "learning_rate": 2.697109536717346e-05, + "loss": 0.1224, + "step": 3259 + }, + { + "epoch": 1.9835716458777002, + "grad_norm": 0.7538368105888367, + "learning_rate": 2.6958685023390963e-05, + "loss": 0.1239, + "step": 3260 + }, + { + "epoch": 1.9841801034377853, + "grad_norm": 0.9021039605140686, + "learning_rate": 2.6946274193936222e-05, + "loss": 0.1808, + "step": 3261 + }, + { + "epoch": 1.9847885609978704, + "grad_norm": 0.6933613419532776, + "learning_rate": 2.6933862881886607e-05, + "loss": 0.1236, + "step": 3262 + }, + { + "epoch": 1.9853970185579555, + "grad_norm": 0.8434465527534485, + "learning_rate": 2.6921451090319603e-05, + "loss": 0.1461, + "step": 3263 + }, + { + "epoch": 1.9860054761180408, + "grad_norm": 0.692387580871582, + "learning_rate": 2.6909038822312826e-05, + "loss": 0.1007, + "step": 3264 + }, + { + "epoch": 1.986613933678126, + "grad_norm": 0.7423060536384583, + "learning_rate": 2.6896626080943983e-05, + "loss": 0.1531, + "step": 3265 + }, + { + "epoch": 1.9872223912382112, + "grad_norm": 0.7204686403274536, + "learning_rate": 2.6884212869290932e-05, + "loss": 0.132, + "step": 3266 + }, + { + "epoch": 1.9878308487982963, + "grad_norm": 0.7953908443450928, + "learning_rate": 2.6871799190431627e-05, + "loss": 0.1327, + "step": 3267 + }, + { + "epoch": 1.9884393063583814, + "grad_norm": 0.7700750827789307, + "learning_rate": 2.6859385047444146e-05, + "loss": 0.1351, + "step": 3268 + }, + { + "epoch": 1.9890477639184667, + "grad_norm": 0.8604716658592224, + "learning_rate": 2.684697044340667e-05, + "loss": 0.1616, + "step": 3269 + }, + { + "epoch": 1.989656221478552, + "grad_norm": 0.7228001356124878, + "learning_rate": 2.683455538139752e-05, + "loss": 0.1162, + "step": 3270 + }, + { + "epoch": 1.9902646790386371, + "grad_norm": 0.7633928060531616, + "learning_rate": 2.6822139864495092e-05, + "loss": 0.1104, + "step": 3271 + }, + { + "epoch": 1.9908731365987222, + "grad_norm": 0.7890945076942444, + "learning_rate": 2.680972389577794e-05, + "loss": 0.1464, + "step": 3272 + }, + { + "epoch": 1.9914815941588073, + "grad_norm": 0.6925262808799744, + "learning_rate": 2.6797307478324683e-05, + "loss": 0.1416, + "step": 3273 + }, + { + "epoch": 1.9920900517188926, + "grad_norm": 0.7635548710823059, + "learning_rate": 2.678489061521409e-05, + "loss": 0.0874, + "step": 3274 + }, + { + "epoch": 1.9926985092789777, + "grad_norm": 0.7441225051879883, + "learning_rate": 2.6772473309525027e-05, + "loss": 0.1141, + "step": 3275 + }, + { + "epoch": 1.993306966839063, + "grad_norm": 0.7360615730285645, + "learning_rate": 2.6760055564336462e-05, + "loss": 0.1356, + "step": 3276 + }, + { + "epoch": 1.9939154243991482, + "grad_norm": 0.7741050720214844, + "learning_rate": 2.674763738272748e-05, + "loss": 0.1568, + "step": 3277 + }, + { + "epoch": 1.9945238819592332, + "grad_norm": 0.7115604281425476, + "learning_rate": 2.673521876777727e-05, + "loss": 0.1049, + "step": 3278 + }, + { + "epoch": 1.9951323395193186, + "grad_norm": 0.750180721282959, + "learning_rate": 2.6722799722565127e-05, + "loss": 0.1387, + "step": 3279 + }, + { + "epoch": 1.9957407970794037, + "grad_norm": 0.7199980020523071, + "learning_rate": 2.6710380250170476e-05, + "loss": 0.1238, + "step": 3280 + }, + { + "epoch": 1.996349254639489, + "grad_norm": 0.7894657850265503, + "learning_rate": 2.6697960353672808e-05, + "loss": 0.1399, + "step": 3281 + }, + { + "epoch": 1.996957712199574, + "grad_norm": 0.7097627520561218, + "learning_rate": 2.6685540036151752e-05, + "loss": 0.1499, + "step": 3282 + }, + { + "epoch": 1.9975661697596592, + "grad_norm": 0.7155794501304626, + "learning_rate": 2.6673119300687015e-05, + "loss": 0.1142, + "step": 3283 + }, + { + "epoch": 1.9981746273197445, + "grad_norm": 0.6458375453948975, + "learning_rate": 2.6660698150358433e-05, + "loss": 0.0996, + "step": 3284 + }, + { + "epoch": 1.9987830848798296, + "grad_norm": 0.6683632135391235, + "learning_rate": 2.6648276588245935e-05, + "loss": 0.1444, + "step": 3285 + }, + { + "epoch": 1.999391542439915, + "grad_norm": 0.6919201016426086, + "learning_rate": 2.663585461742954e-05, + "loss": 0.1567, + "step": 3286 + }, + { + "epoch": 2.0, + "grad_norm": 0.7347707152366638, + "learning_rate": 2.662343224098939e-05, + "loss": 0.1315, + "step": 3287 + }, + { + "epoch": 2.0, + "eval_loss": 1.1706266403198242, + "eval_runtime": 105.2304, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.456, + "step": 3287 + }, + { + "epoch": 2.000608457560085, + "grad_norm": 0.6429162621498108, + "learning_rate": 2.6611009462005716e-05, + "loss": 0.0639, + "step": 3288 + }, + { + "epoch": 2.00121691512017, + "grad_norm": 0.9474073052406311, + "learning_rate": 2.659858628355884e-05, + "loss": 0.0587, + "step": 3289 + }, + { + "epoch": 2.0018253726802557, + "grad_norm": 0.5293480753898621, + "learning_rate": 2.6586162708729197e-05, + "loss": 0.041, + "step": 3290 + }, + { + "epoch": 2.002433830240341, + "grad_norm": 0.6578330993652344, + "learning_rate": 2.657373874059732e-05, + "loss": 0.0757, + "step": 3291 + }, + { + "epoch": 2.003042287800426, + "grad_norm": 0.4521423876285553, + "learning_rate": 2.6561314382243835e-05, + "loss": 0.0431, + "step": 3292 + }, + { + "epoch": 2.003650745360511, + "grad_norm": 0.5849077701568604, + "learning_rate": 2.654888963674945e-05, + "loss": 0.0604, + "step": 3293 + }, + { + "epoch": 2.004259202920596, + "grad_norm": 0.6320520043373108, + "learning_rate": 2.6536464507195015e-05, + "loss": 0.0662, + "step": 3294 + }, + { + "epoch": 2.0048676604806817, + "grad_norm": 0.6279209852218628, + "learning_rate": 2.652403899666141e-05, + "loss": 0.0622, + "step": 3295 + }, + { + "epoch": 2.0054761180407668, + "grad_norm": 0.7935817837715149, + "learning_rate": 2.651161310822966e-05, + "loss": 0.0558, + "step": 3296 + }, + { + "epoch": 2.006084575600852, + "grad_norm": 0.5588900446891785, + "learning_rate": 2.649918684498087e-05, + "loss": 0.0638, + "step": 3297 + }, + { + "epoch": 2.006693033160937, + "grad_norm": 0.5976481437683105, + "learning_rate": 2.6486760209996238e-05, + "loss": 0.0582, + "step": 3298 + }, + { + "epoch": 2.007301490721022, + "grad_norm": 0.7166075110435486, + "learning_rate": 2.6474333206357038e-05, + "loss": 0.0676, + "step": 3299 + }, + { + "epoch": 2.0079099482811076, + "grad_norm": 0.71381014585495, + "learning_rate": 2.6461905837144656e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 2.0085184058411927, + "grad_norm": 0.626546323299408, + "learning_rate": 2.644947810544056e-05, + "loss": 0.0456, + "step": 3301 + }, + { + "epoch": 2.0091268634012778, + "grad_norm": 0.5405663251876831, + "learning_rate": 2.6437050014326313e-05, + "loss": 0.0539, + "step": 3302 + }, + { + "epoch": 2.009735320961363, + "grad_norm": 0.56814044713974, + "learning_rate": 2.642462156688356e-05, + "loss": 0.0652, + "step": 3303 + }, + { + "epoch": 2.010343778521448, + "grad_norm": 0.6406037211418152, + "learning_rate": 2.6412192766194043e-05, + "loss": 0.0693, + "step": 3304 + }, + { + "epoch": 2.0109522360815335, + "grad_norm": 0.5662345886230469, + "learning_rate": 2.6399763615339583e-05, + "loss": 0.0526, + "step": 3305 + }, + { + "epoch": 2.0115606936416186, + "grad_norm": 0.5635650157928467, + "learning_rate": 2.638733411740209e-05, + "loss": 0.0641, + "step": 3306 + }, + { + "epoch": 2.0121691512017037, + "grad_norm": 0.6141881346702576, + "learning_rate": 2.6374904275463563e-05, + "loss": 0.0482, + "step": 3307 + }, + { + "epoch": 2.012777608761789, + "grad_norm": 0.6459068655967712, + "learning_rate": 2.6362474092606088e-05, + "loss": 0.0558, + "step": 3308 + }, + { + "epoch": 2.013386066321874, + "grad_norm": 0.6503152251243591, + "learning_rate": 2.635004357191182e-05, + "loss": 0.0591, + "step": 3309 + }, + { + "epoch": 2.0139945238819594, + "grad_norm": 0.6401180624961853, + "learning_rate": 2.633761271646302e-05, + "loss": 0.052, + "step": 3310 + }, + { + "epoch": 2.0146029814420445, + "grad_norm": 0.6508215069770813, + "learning_rate": 2.632518152934203e-05, + "loss": 0.0513, + "step": 3311 + }, + { + "epoch": 2.0152114390021296, + "grad_norm": 0.5980373024940491, + "learning_rate": 2.631275001363125e-05, + "loss": 0.0738, + "step": 3312 + }, + { + "epoch": 2.0158198965622147, + "grad_norm": 0.6737512946128845, + "learning_rate": 2.630031817241319e-05, + "loss": 0.0581, + "step": 3313 + }, + { + "epoch": 2.0164283541223, + "grad_norm": 0.5935463905334473, + "learning_rate": 2.6287886008770418e-05, + "loss": 0.0489, + "step": 3314 + }, + { + "epoch": 2.0170368116823854, + "grad_norm": 0.565272331237793, + "learning_rate": 2.62754535257856e-05, + "loss": 0.0601, + "step": 3315 + }, + { + "epoch": 2.0176452692424705, + "grad_norm": 0.5857186317443848, + "learning_rate": 2.626302072654147e-05, + "loss": 0.0419, + "step": 3316 + }, + { + "epoch": 2.0182537268025555, + "grad_norm": 0.6303470134735107, + "learning_rate": 2.625058761412085e-05, + "loss": 0.061, + "step": 3317 + }, + { + "epoch": 2.0188621843626406, + "grad_norm": 0.48001688718795776, + "learning_rate": 2.6238154191606625e-05, + "loss": 0.0497, + "step": 3318 + }, + { + "epoch": 2.0194706419227257, + "grad_norm": 0.6741344332695007, + "learning_rate": 2.6225720462081765e-05, + "loss": 0.0624, + "step": 3319 + }, + { + "epoch": 2.0200790994828113, + "grad_norm": 0.5248076915740967, + "learning_rate": 2.6213286428629324e-05, + "loss": 0.0398, + "step": 3320 + }, + { + "epoch": 2.0206875570428964, + "grad_norm": 0.5384348630905151, + "learning_rate": 2.6200852094332423e-05, + "loss": 0.0336, + "step": 3321 + }, + { + "epoch": 2.0212960146029815, + "grad_norm": 0.527263343334198, + "learning_rate": 2.618841746227425e-05, + "loss": 0.0464, + "step": 3322 + }, + { + "epoch": 2.0219044721630666, + "grad_norm": 0.6149359941482544, + "learning_rate": 2.6175982535538098e-05, + "loss": 0.0427, + "step": 3323 + }, + { + "epoch": 2.0225129297231517, + "grad_norm": 0.691123366355896, + "learning_rate": 2.6163547317207276e-05, + "loss": 0.0704, + "step": 3324 + }, + { + "epoch": 2.023121387283237, + "grad_norm": 0.6679997444152832, + "learning_rate": 2.6151111810365224e-05, + "loss": 0.0534, + "step": 3325 + }, + { + "epoch": 2.0237298448433223, + "grad_norm": 0.5870081186294556, + "learning_rate": 2.613867601809543e-05, + "loss": 0.0486, + "step": 3326 + }, + { + "epoch": 2.0243383024034074, + "grad_norm": 0.5979164838790894, + "learning_rate": 2.612623994348144e-05, + "loss": 0.0516, + "step": 3327 + }, + { + "epoch": 2.0249467599634925, + "grad_norm": 0.6477155685424805, + "learning_rate": 2.6113803589606894e-05, + "loss": 0.0495, + "step": 3328 + }, + { + "epoch": 2.0255552175235776, + "grad_norm": 0.6475479006767273, + "learning_rate": 2.6101366959555478e-05, + "loss": 0.0615, + "step": 3329 + }, + { + "epoch": 2.026163675083663, + "grad_norm": 0.48027321696281433, + "learning_rate": 2.608893005641096e-05, + "loss": 0.0438, + "step": 3330 + }, + { + "epoch": 2.0267721326437482, + "grad_norm": 0.7494072914123535, + "learning_rate": 2.6076492883257187e-05, + "loss": 0.0782, + "step": 3331 + }, + { + "epoch": 2.0273805902038333, + "grad_norm": 0.49906063079833984, + "learning_rate": 2.6064055443178036e-05, + "loss": 0.0474, + "step": 3332 + }, + { + "epoch": 2.0279890477639184, + "grad_norm": 0.6378363370895386, + "learning_rate": 2.6051617739257494e-05, + "loss": 0.0663, + "step": 3333 + }, + { + "epoch": 2.0285975053240035, + "grad_norm": 0.6063147783279419, + "learning_rate": 2.603917977457959e-05, + "loss": 0.0589, + "step": 3334 + }, + { + "epoch": 2.029205962884089, + "grad_norm": 0.575063169002533, + "learning_rate": 2.6026741552228417e-05, + "loss": 0.054, + "step": 3335 + }, + { + "epoch": 2.029814420444174, + "grad_norm": 0.569412112236023, + "learning_rate": 2.601430307528813e-05, + "loss": 0.0419, + "step": 3336 + }, + { + "epoch": 2.0304228780042592, + "grad_norm": 0.5765204429626465, + "learning_rate": 2.6001864346842964e-05, + "loss": 0.0443, + "step": 3337 + }, + { + "epoch": 2.0310313355643443, + "grad_norm": 0.5352804064750671, + "learning_rate": 2.5989425369977195e-05, + "loss": 0.0451, + "step": 3338 + }, + { + "epoch": 2.0316397931244294, + "grad_norm": 0.6071703433990479, + "learning_rate": 2.5976986147775178e-05, + "loss": 0.0492, + "step": 3339 + }, + { + "epoch": 2.032248250684515, + "grad_norm": 1.0402305126190186, + "learning_rate": 2.5964546683321317e-05, + "loss": 0.0945, + "step": 3340 + }, + { + "epoch": 2.0328567082446, + "grad_norm": 0.7192779183387756, + "learning_rate": 2.595210697970009e-05, + "loss": 0.041, + "step": 3341 + }, + { + "epoch": 2.033465165804685, + "grad_norm": 0.623833417892456, + "learning_rate": 2.5939667039996006e-05, + "loss": 0.0514, + "step": 3342 + }, + { + "epoch": 2.0340736233647703, + "grad_norm": 0.7127055525779724, + "learning_rate": 2.592722686729367e-05, + "loss": 0.0609, + "step": 3343 + }, + { + "epoch": 2.0346820809248554, + "grad_norm": 0.7052009701728821, + "learning_rate": 2.5914786464677728e-05, + "loss": 0.047, + "step": 3344 + }, + { + "epoch": 2.0352905384849405, + "grad_norm": 0.5680457949638367, + "learning_rate": 2.590234583523286e-05, + "loss": 0.0566, + "step": 3345 + }, + { + "epoch": 2.035898996045026, + "grad_norm": 0.505352795124054, + "learning_rate": 2.5889904982043845e-05, + "loss": 0.0526, + "step": 3346 + }, + { + "epoch": 2.036507453605111, + "grad_norm": 0.4761469066143036, + "learning_rate": 2.5877463908195475e-05, + "loss": 0.0408, + "step": 3347 + }, + { + "epoch": 2.037115911165196, + "grad_norm": 0.5836338996887207, + "learning_rate": 2.586502261677264e-05, + "loss": 0.0519, + "step": 3348 + }, + { + "epoch": 2.0377243687252813, + "grad_norm": 0.7985496520996094, + "learning_rate": 2.5852581110860253e-05, + "loss": 0.0479, + "step": 3349 + }, + { + "epoch": 2.0383328262853664, + "grad_norm": 0.645341157913208, + "learning_rate": 2.5840139393543285e-05, + "loss": 0.0608, + "step": 3350 + }, + { + "epoch": 2.038941283845452, + "grad_norm": 0.6739196181297302, + "learning_rate": 2.582769746790677e-05, + "loss": 0.0543, + "step": 3351 + }, + { + "epoch": 2.039549741405537, + "grad_norm": 0.6631483435630798, + "learning_rate": 2.5815255337035775e-05, + "loss": 0.0529, + "step": 3352 + }, + { + "epoch": 2.040158198965622, + "grad_norm": 0.596749484539032, + "learning_rate": 2.5802813004015443e-05, + "loss": 0.0584, + "step": 3353 + }, + { + "epoch": 2.040766656525707, + "grad_norm": 0.5741117596626282, + "learning_rate": 2.5790370471930953e-05, + "loss": 0.0559, + "step": 3354 + }, + { + "epoch": 2.0413751140857923, + "grad_norm": 0.5789355635643005, + "learning_rate": 2.577792774386752e-05, + "loss": 0.0371, + "step": 3355 + }, + { + "epoch": 2.041983571645878, + "grad_norm": 0.5400775074958801, + "learning_rate": 2.5765484822910434e-05, + "loss": 0.0524, + "step": 3356 + }, + { + "epoch": 2.042592029205963, + "grad_norm": 0.5711213946342468, + "learning_rate": 2.5753041712145032e-05, + "loss": 0.0506, + "step": 3357 + }, + { + "epoch": 2.043200486766048, + "grad_norm": 0.6942930817604065, + "learning_rate": 2.5740598414656658e-05, + "loss": 0.0607, + "step": 3358 + }, + { + "epoch": 2.043808944326133, + "grad_norm": 0.6652560234069824, + "learning_rate": 2.5728154933530755e-05, + "loss": 0.0663, + "step": 3359 + }, + { + "epoch": 2.0444174018862182, + "grad_norm": 0.6147892475128174, + "learning_rate": 2.5715711271852777e-05, + "loss": 0.0445, + "step": 3360 + }, + { + "epoch": 2.0450258594463038, + "grad_norm": 0.546258270740509, + "learning_rate": 2.570326743270824e-05, + "loss": 0.0399, + "step": 3361 + }, + { + "epoch": 2.045634317006389, + "grad_norm": 0.6004377007484436, + "learning_rate": 2.569082341918269e-05, + "loss": 0.048, + "step": 3362 + }, + { + "epoch": 2.046242774566474, + "grad_norm": 0.6591867208480835, + "learning_rate": 2.5678379234361728e-05, + "loss": 0.0637, + "step": 3363 + }, + { + "epoch": 2.046851232126559, + "grad_norm": 0.7136169075965881, + "learning_rate": 2.566593488133099e-05, + "loss": 0.0634, + "step": 3364 + }, + { + "epoch": 2.047459689686644, + "grad_norm": 0.5792864561080933, + "learning_rate": 2.5653490363176157e-05, + "loss": 0.0515, + "step": 3365 + }, + { + "epoch": 2.0480681472467297, + "grad_norm": 0.6176591515541077, + "learning_rate": 2.5641045682982957e-05, + "loss": 0.0665, + "step": 3366 + }, + { + "epoch": 2.048676604806815, + "grad_norm": 0.5917629599571228, + "learning_rate": 2.5628600843837147e-05, + "loss": 0.0364, + "step": 3367 + }, + { + "epoch": 2.0492850623669, + "grad_norm": 0.6469512581825256, + "learning_rate": 2.561615584882453e-05, + "loss": 0.0625, + "step": 3368 + }, + { + "epoch": 2.049893519926985, + "grad_norm": 0.7131155133247375, + "learning_rate": 2.5603710701030946e-05, + "loss": 0.0567, + "step": 3369 + }, + { + "epoch": 2.05050197748707, + "grad_norm": 0.7157228589057922, + "learning_rate": 2.559126540354227e-05, + "loss": 0.0609, + "step": 3370 + }, + { + "epoch": 2.0511104350471556, + "grad_norm": 0.5784830451011658, + "learning_rate": 2.5578819959444413e-05, + "loss": 0.0495, + "step": 3371 + }, + { + "epoch": 2.0517188926072407, + "grad_norm": 0.6077147126197815, + "learning_rate": 2.5566374371823342e-05, + "loss": 0.0473, + "step": 3372 + }, + { + "epoch": 2.052327350167326, + "grad_norm": 0.47868582606315613, + "learning_rate": 2.555392864376503e-05, + "loss": 0.0446, + "step": 3373 + }, + { + "epoch": 2.052935807727411, + "grad_norm": 0.646273136138916, + "learning_rate": 2.554148277835551e-05, + "loss": 0.0495, + "step": 3374 + }, + { + "epoch": 2.053544265287496, + "grad_norm": 0.5764486193656921, + "learning_rate": 2.552903677868082e-05, + "loss": 0.0493, + "step": 3375 + }, + { + "epoch": 2.0541527228475815, + "grad_norm": 0.5889171957969666, + "learning_rate": 2.551659064782706e-05, + "loss": 0.0579, + "step": 3376 + }, + { + "epoch": 2.0547611804076666, + "grad_norm": 0.6388028860092163, + "learning_rate": 2.5504144388880364e-05, + "loss": 0.0435, + "step": 3377 + }, + { + "epoch": 2.0553696379677517, + "grad_norm": 0.6067814230918884, + "learning_rate": 2.5491698004926862e-05, + "loss": 0.0472, + "step": 3378 + }, + { + "epoch": 2.055978095527837, + "grad_norm": 0.652549684047699, + "learning_rate": 2.5479251499052752e-05, + "loss": 0.0672, + "step": 3379 + }, + { + "epoch": 2.056586553087922, + "grad_norm": 0.6091573238372803, + "learning_rate": 2.5466804874344253e-05, + "loss": 0.0586, + "step": 3380 + }, + { + "epoch": 2.0571950106480075, + "grad_norm": 0.6867235898971558, + "learning_rate": 2.5454358133887594e-05, + "loss": 0.0604, + "step": 3381 + }, + { + "epoch": 2.0578034682080926, + "grad_norm": 0.5185385942459106, + "learning_rate": 2.5441911280769065e-05, + "loss": 0.0516, + "step": 3382 + }, + { + "epoch": 2.0584119257681777, + "grad_norm": 0.5316767692565918, + "learning_rate": 2.5429464318074952e-05, + "loss": 0.0468, + "step": 3383 + }, + { + "epoch": 2.0590203833282628, + "grad_norm": 0.6571846008300781, + "learning_rate": 2.541701724889159e-05, + "loss": 0.0547, + "step": 3384 + }, + { + "epoch": 2.059628840888348, + "grad_norm": 0.6461378335952759, + "learning_rate": 2.5404570076305334e-05, + "loss": 0.0463, + "step": 3385 + }, + { + "epoch": 2.0602372984484334, + "grad_norm": 0.5801909565925598, + "learning_rate": 2.5392122803402562e-05, + "loss": 0.0558, + "step": 3386 + }, + { + "epoch": 2.0608457560085185, + "grad_norm": 0.5066788196563721, + "learning_rate": 2.5379675433269684e-05, + "loss": 0.0393, + "step": 3387 + }, + { + "epoch": 2.0614542135686036, + "grad_norm": 0.6354464292526245, + "learning_rate": 2.5367227968993112e-05, + "loss": 0.0723, + "step": 3388 + }, + { + "epoch": 2.0620626711286887, + "grad_norm": 0.6328490972518921, + "learning_rate": 2.535478041365932e-05, + "loss": 0.0533, + "step": 3389 + }, + { + "epoch": 2.062671128688774, + "grad_norm": 0.6055465936660767, + "learning_rate": 2.5342332770354772e-05, + "loss": 0.0624, + "step": 3390 + }, + { + "epoch": 2.0632795862488593, + "grad_norm": 0.6035982370376587, + "learning_rate": 2.5329885042165963e-05, + "loss": 0.0491, + "step": 3391 + }, + { + "epoch": 2.0638880438089444, + "grad_norm": 0.5577566027641296, + "learning_rate": 2.531743723217942e-05, + "loss": 0.0433, + "step": 3392 + }, + { + "epoch": 2.0644965013690295, + "grad_norm": 0.6334397196769714, + "learning_rate": 2.5304989343481668e-05, + "loss": 0.0557, + "step": 3393 + }, + { + "epoch": 2.0651049589291146, + "grad_norm": 0.6014379262924194, + "learning_rate": 2.5292541379159273e-05, + "loss": 0.0428, + "step": 3394 + }, + { + "epoch": 2.0657134164891997, + "grad_norm": 0.624415934085846, + "learning_rate": 2.5280093342298817e-05, + "loss": 0.0525, + "step": 3395 + }, + { + "epoch": 2.0663218740492852, + "grad_norm": 0.5978030562400818, + "learning_rate": 2.5267645235986874e-05, + "loss": 0.0581, + "step": 3396 + }, + { + "epoch": 2.0669303316093703, + "grad_norm": 0.5354481935501099, + "learning_rate": 2.5255197063310075e-05, + "loss": 0.0519, + "step": 3397 + }, + { + "epoch": 2.0675387891694554, + "grad_norm": 0.6657739877700806, + "learning_rate": 2.5242748827355046e-05, + "loss": 0.0474, + "step": 3398 + }, + { + "epoch": 2.0681472467295405, + "grad_norm": 0.5657456517219543, + "learning_rate": 2.5230300531208417e-05, + "loss": 0.0368, + "step": 3399 + }, + { + "epoch": 2.0687557042896256, + "grad_norm": 0.572913646697998, + "learning_rate": 2.5217852177956862e-05, + "loss": 0.0542, + "step": 3400 + }, + { + "epoch": 2.069364161849711, + "grad_norm": 0.45607224106788635, + "learning_rate": 2.5205403770687048e-05, + "loss": 0.0383, + "step": 3401 + }, + { + "epoch": 2.0699726194097963, + "grad_norm": 0.5699754953384399, + "learning_rate": 2.5192955312485655e-05, + "loss": 0.0553, + "step": 3402 + }, + { + "epoch": 2.0705810769698814, + "grad_norm": 0.6078694462776184, + "learning_rate": 2.51805068064394e-05, + "loss": 0.0538, + "step": 3403 + }, + { + "epoch": 2.0711895345299665, + "grad_norm": 0.6573747396469116, + "learning_rate": 2.5168058255634963e-05, + "loss": 0.0367, + "step": 3404 + }, + { + "epoch": 2.0717979920900516, + "grad_norm": 0.5173442959785461, + "learning_rate": 2.5155609663159098e-05, + "loss": 0.0324, + "step": 3405 + }, + { + "epoch": 2.072406449650137, + "grad_norm": 0.6545349359512329, + "learning_rate": 2.514316103209851e-05, + "loss": 0.0446, + "step": 3406 + }, + { + "epoch": 2.073014907210222, + "grad_norm": 0.6538964509963989, + "learning_rate": 2.513071236553996e-05, + "loss": 0.0494, + "step": 3407 + }, + { + "epoch": 2.0736233647703073, + "grad_norm": 0.636489987373352, + "learning_rate": 2.5118263666570198e-05, + "loss": 0.048, + "step": 3408 + }, + { + "epoch": 2.0742318223303924, + "grad_norm": 0.6040591597557068, + "learning_rate": 2.5105814938275968e-05, + "loss": 0.065, + "step": 3409 + }, + { + "epoch": 2.0748402798904775, + "grad_norm": 0.624994695186615, + "learning_rate": 2.5093366183744045e-05, + "loss": 0.0609, + "step": 3410 + }, + { + "epoch": 2.075448737450563, + "grad_norm": 0.7067580223083496, + "learning_rate": 2.50809174060612e-05, + "loss": 0.0649, + "step": 3411 + }, + { + "epoch": 2.076057195010648, + "grad_norm": 0.6198956370353699, + "learning_rate": 2.5068468608314212e-05, + "loss": 0.0583, + "step": 3412 + }, + { + "epoch": 2.076665652570733, + "grad_norm": 0.6904608607292175, + "learning_rate": 2.5056019793589858e-05, + "loss": 0.0611, + "step": 3413 + }, + { + "epoch": 2.0772741101308183, + "grad_norm": 0.6566850543022156, + "learning_rate": 2.504357096497494e-05, + "loss": 0.0545, + "step": 3414 + }, + { + "epoch": 2.0778825676909034, + "grad_norm": 0.544262707233429, + "learning_rate": 2.5031122125556234e-05, + "loss": 0.0518, + "step": 3415 + }, + { + "epoch": 2.078491025250989, + "grad_norm": 0.5134237408638, + "learning_rate": 2.5018673278420534e-05, + "loss": 0.0523, + "step": 3416 + }, + { + "epoch": 2.079099482811074, + "grad_norm": 0.5637457370758057, + "learning_rate": 2.500622442665465e-05, + "loss": 0.0466, + "step": 3417 + }, + { + "epoch": 2.079707940371159, + "grad_norm": 0.6894789338111877, + "learning_rate": 2.4993775573345356e-05, + "loss": 0.0601, + "step": 3418 + }, + { + "epoch": 2.0803163979312442, + "grad_norm": 0.5236656069755554, + "learning_rate": 2.4981326721579472e-05, + "loss": 0.0439, + "step": 3419 + }, + { + "epoch": 2.0809248554913293, + "grad_norm": 0.5601134300231934, + "learning_rate": 2.496887787444377e-05, + "loss": 0.0547, + "step": 3420 + }, + { + "epoch": 2.081533313051415, + "grad_norm": 0.6053742170333862, + "learning_rate": 2.4956429035025063e-05, + "loss": 0.064, + "step": 3421 + }, + { + "epoch": 2.0821417706115, + "grad_norm": 0.5905669331550598, + "learning_rate": 2.4943980206410144e-05, + "loss": 0.0522, + "step": 3422 + }, + { + "epoch": 2.082750228171585, + "grad_norm": 0.5676148533821106, + "learning_rate": 2.4931531391685794e-05, + "loss": 0.054, + "step": 3423 + }, + { + "epoch": 2.08335868573167, + "grad_norm": 0.6174594759941101, + "learning_rate": 2.4919082593938802e-05, + "loss": 0.0686, + "step": 3424 + }, + { + "epoch": 2.0839671432917553, + "grad_norm": 0.5777302980422974, + "learning_rate": 2.4906633816255964e-05, + "loss": 0.0531, + "step": 3425 + }, + { + "epoch": 2.084575600851841, + "grad_norm": 0.5795081257820129, + "learning_rate": 2.489418506172404e-05, + "loss": 0.0629, + "step": 3426 + }, + { + "epoch": 2.085184058411926, + "grad_norm": 0.6431803107261658, + "learning_rate": 2.4881736333429808e-05, + "loss": 0.0645, + "step": 3427 + }, + { + "epoch": 2.085792515972011, + "grad_norm": 0.5932779312133789, + "learning_rate": 2.4869287634460045e-05, + "loss": 0.0682, + "step": 3428 + }, + { + "epoch": 2.086400973532096, + "grad_norm": 0.4519423246383667, + "learning_rate": 2.4856838967901492e-05, + "loss": 0.0399, + "step": 3429 + }, + { + "epoch": 2.087009431092181, + "grad_norm": 0.5230922698974609, + "learning_rate": 2.4844390336840908e-05, + "loss": 0.0462, + "step": 3430 + }, + { + "epoch": 2.0876178886522667, + "grad_norm": 0.5311405658721924, + "learning_rate": 2.4831941744365043e-05, + "loss": 0.0402, + "step": 3431 + }, + { + "epoch": 2.088226346212352, + "grad_norm": 0.658665657043457, + "learning_rate": 2.4819493193560618e-05, + "loss": 0.0498, + "step": 3432 + }, + { + "epoch": 2.088834803772437, + "grad_norm": 0.5413145422935486, + "learning_rate": 2.4807044687514344e-05, + "loss": 0.0499, + "step": 3433 + }, + { + "epoch": 2.089443261332522, + "grad_norm": 0.6543242931365967, + "learning_rate": 2.4794596229312958e-05, + "loss": 0.0605, + "step": 3434 + }, + { + "epoch": 2.090051718892607, + "grad_norm": 0.5483290553092957, + "learning_rate": 2.4782147822043144e-05, + "loss": 0.0289, + "step": 3435 + }, + { + "epoch": 2.090660176452692, + "grad_norm": 0.5126484036445618, + "learning_rate": 2.476969946879159e-05, + "loss": 0.0396, + "step": 3436 + }, + { + "epoch": 2.0912686340127777, + "grad_norm": 0.5537660121917725, + "learning_rate": 2.475725117264496e-05, + "loss": 0.0515, + "step": 3437 + }, + { + "epoch": 2.091877091572863, + "grad_norm": 0.5816277861595154, + "learning_rate": 2.474480293668993e-05, + "loss": 0.041, + "step": 3438 + }, + { + "epoch": 2.092485549132948, + "grad_norm": 0.6799830794334412, + "learning_rate": 2.473235476401313e-05, + "loss": 0.0589, + "step": 3439 + }, + { + "epoch": 2.093094006693033, + "grad_norm": 0.6604799032211304, + "learning_rate": 2.4719906657701192e-05, + "loss": 0.0506, + "step": 3440 + }, + { + "epoch": 2.093702464253118, + "grad_norm": 0.7943026423454285, + "learning_rate": 2.4707458620840732e-05, + "loss": 0.0622, + "step": 3441 + }, + { + "epoch": 2.0943109218132037, + "grad_norm": 0.5456538200378418, + "learning_rate": 2.469501065651834e-05, + "loss": 0.0451, + "step": 3442 + }, + { + "epoch": 2.0949193793732888, + "grad_norm": 0.6690013408660889, + "learning_rate": 2.4682562767820587e-05, + "loss": 0.0578, + "step": 3443 + }, + { + "epoch": 2.095527836933374, + "grad_norm": 0.645366907119751, + "learning_rate": 2.4670114957834043e-05, + "loss": 0.062, + "step": 3444 + }, + { + "epoch": 2.096136294493459, + "grad_norm": 0.5978085398674011, + "learning_rate": 2.4657667229645237e-05, + "loss": 0.0486, + "step": 3445 + }, + { + "epoch": 2.096744752053544, + "grad_norm": 0.5921434760093689, + "learning_rate": 2.4645219586340683e-05, + "loss": 0.0548, + "step": 3446 + }, + { + "epoch": 2.0973532096136296, + "grad_norm": 0.4595530033111572, + "learning_rate": 2.463277203100689e-05, + "loss": 0.0425, + "step": 3447 + }, + { + "epoch": 2.0979616671737147, + "grad_norm": 0.5539052486419678, + "learning_rate": 2.462032456673033e-05, + "loss": 0.0537, + "step": 3448 + }, + { + "epoch": 2.0985701247338, + "grad_norm": 0.5757131576538086, + "learning_rate": 2.4607877196597437e-05, + "loss": 0.0438, + "step": 3449 + }, + { + "epoch": 2.099178582293885, + "grad_norm": 0.569847047328949, + "learning_rate": 2.4595429923694668e-05, + "loss": 0.0506, + "step": 3450 + }, + { + "epoch": 2.09978703985397, + "grad_norm": 0.5236031413078308, + "learning_rate": 2.4582982751108417e-05, + "loss": 0.0345, + "step": 3451 + }, + { + "epoch": 2.1003954974140555, + "grad_norm": 0.6032043695449829, + "learning_rate": 2.4570535681925047e-05, + "loss": 0.0528, + "step": 3452 + }, + { + "epoch": 2.1010039549741406, + "grad_norm": 0.5206643342971802, + "learning_rate": 2.455808871923094e-05, + "loss": 0.046, + "step": 3453 + }, + { + "epoch": 2.1016124125342257, + "grad_norm": 0.5682435035705566, + "learning_rate": 2.4545641866112408e-05, + "loss": 0.0408, + "step": 3454 + }, + { + "epoch": 2.102220870094311, + "grad_norm": 0.6149486899375916, + "learning_rate": 2.453319512565576e-05, + "loss": 0.0507, + "step": 3455 + }, + { + "epoch": 2.102829327654396, + "grad_norm": 0.5755969882011414, + "learning_rate": 2.4520748500947247e-05, + "loss": 0.0593, + "step": 3456 + }, + { + "epoch": 2.1034377852144814, + "grad_norm": 0.6103445291519165, + "learning_rate": 2.4508301995073144e-05, + "loss": 0.0553, + "step": 3457 + }, + { + "epoch": 2.1040462427745665, + "grad_norm": 0.5596705675125122, + "learning_rate": 2.449585561111965e-05, + "loss": 0.0489, + "step": 3458 + }, + { + "epoch": 2.1046547003346516, + "grad_norm": 0.535492479801178, + "learning_rate": 2.4483409352172936e-05, + "loss": 0.0503, + "step": 3459 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.6022449731826782, + "learning_rate": 2.4470963221319188e-05, + "loss": 0.0457, + "step": 3460 + }, + { + "epoch": 2.105871615454822, + "grad_norm": 0.680794894695282, + "learning_rate": 2.4458517221644507e-05, + "loss": 0.0521, + "step": 3461 + }, + { + "epoch": 2.1064800730149074, + "grad_norm": 0.6097959280014038, + "learning_rate": 2.444607135623497e-05, + "loss": 0.0467, + "step": 3462 + }, + { + "epoch": 2.1070885305749925, + "grad_norm": 0.4896848499774933, + "learning_rate": 2.4433625628176663e-05, + "loss": 0.0311, + "step": 3463 + }, + { + "epoch": 2.1076969881350776, + "grad_norm": 0.6469544172286987, + "learning_rate": 2.442118004055559e-05, + "loss": 0.0737, + "step": 3464 + }, + { + "epoch": 2.1083054456951626, + "grad_norm": 0.5251388549804688, + "learning_rate": 2.4408734596457744e-05, + "loss": 0.0456, + "step": 3465 + }, + { + "epoch": 2.1089139032552477, + "grad_norm": 0.5152940154075623, + "learning_rate": 2.439628929896906e-05, + "loss": 0.0327, + "step": 3466 + }, + { + "epoch": 2.1095223608153333, + "grad_norm": 0.552280068397522, + "learning_rate": 2.4383844151175478e-05, + "loss": 0.0592, + "step": 3467 + }, + { + "epoch": 2.1101308183754184, + "grad_norm": 0.6947097182273865, + "learning_rate": 2.4371399156162862e-05, + "loss": 0.0865, + "step": 3468 + }, + { + "epoch": 2.1107392759355035, + "grad_norm": 0.5415664315223694, + "learning_rate": 2.4358954317017045e-05, + "loss": 0.0453, + "step": 3469 + }, + { + "epoch": 2.1113477334955886, + "grad_norm": 0.5632659196853638, + "learning_rate": 2.434650963682385e-05, + "loss": 0.0497, + "step": 3470 + }, + { + "epoch": 2.1119561910556737, + "grad_norm": 0.5756339430809021, + "learning_rate": 2.433406511866902e-05, + "loss": 0.0493, + "step": 3471 + }, + { + "epoch": 2.112564648615759, + "grad_norm": 0.5048176646232605, + "learning_rate": 2.4321620765638274e-05, + "loss": 0.0509, + "step": 3472 + }, + { + "epoch": 2.1131731061758443, + "grad_norm": 0.6544382572174072, + "learning_rate": 2.4309176580817318e-05, + "loss": 0.0654, + "step": 3473 + }, + { + "epoch": 2.1137815637359294, + "grad_norm": 0.5566495656967163, + "learning_rate": 2.429673256729177e-05, + "loss": 0.0493, + "step": 3474 + }, + { + "epoch": 2.1143900212960145, + "grad_norm": 0.53816819190979, + "learning_rate": 2.428428872814722e-05, + "loss": 0.0438, + "step": 3475 + }, + { + "epoch": 2.1149984788560996, + "grad_norm": 0.5756782293319702, + "learning_rate": 2.4271845066469247e-05, + "loss": 0.0514, + "step": 3476 + }, + { + "epoch": 2.115606936416185, + "grad_norm": 0.6478017568588257, + "learning_rate": 2.4259401585343344e-05, + "loss": 0.0584, + "step": 3477 + }, + { + "epoch": 2.1162153939762702, + "grad_norm": 0.6319558024406433, + "learning_rate": 2.424695828785498e-05, + "loss": 0.0524, + "step": 3478 + }, + { + "epoch": 2.1168238515363553, + "grad_norm": 0.5530511736869812, + "learning_rate": 2.4234515177089562e-05, + "loss": 0.0523, + "step": 3479 + }, + { + "epoch": 2.1174323090964404, + "grad_norm": 0.5840123295783997, + "learning_rate": 2.4222072256132483e-05, + "loss": 0.0473, + "step": 3480 + }, + { + "epoch": 2.1180407666565255, + "grad_norm": 0.5127155184745789, + "learning_rate": 2.4209629528069063e-05, + "loss": 0.0456, + "step": 3481 + }, + { + "epoch": 2.118649224216611, + "grad_norm": 0.6958695650100708, + "learning_rate": 2.419718699598456e-05, + "loss": 0.0579, + "step": 3482 + }, + { + "epoch": 2.119257681776696, + "grad_norm": 0.6823195815086365, + "learning_rate": 2.418474466296423e-05, + "loss": 0.0523, + "step": 3483 + }, + { + "epoch": 2.1198661393367813, + "grad_norm": 0.5722386240959167, + "learning_rate": 2.417230253209324e-05, + "loss": 0.0497, + "step": 3484 + }, + { + "epoch": 2.1204745968968663, + "grad_norm": 1.0104351043701172, + "learning_rate": 2.4159860606456718e-05, + "loss": 0.0758, + "step": 3485 + }, + { + "epoch": 2.1210830544569514, + "grad_norm": 0.5905888080596924, + "learning_rate": 2.414741888913975e-05, + "loss": 0.0535, + "step": 3486 + }, + { + "epoch": 2.121691512017037, + "grad_norm": 0.732254683971405, + "learning_rate": 2.4134977383227364e-05, + "loss": 0.0567, + "step": 3487 + }, + { + "epoch": 2.122299969577122, + "grad_norm": 0.5152473449707031, + "learning_rate": 2.412253609180453e-05, + "loss": 0.0359, + "step": 3488 + }, + { + "epoch": 2.122908427137207, + "grad_norm": 0.6493118405342102, + "learning_rate": 2.4110095017956164e-05, + "loss": 0.0565, + "step": 3489 + }, + { + "epoch": 2.1235168846972923, + "grad_norm": 0.5857681632041931, + "learning_rate": 2.4097654164767148e-05, + "loss": 0.0332, + "step": 3490 + }, + { + "epoch": 2.1241253422573774, + "grad_norm": 0.533673107624054, + "learning_rate": 2.4085213535322288e-05, + "loss": 0.0364, + "step": 3491 + }, + { + "epoch": 2.124733799817463, + "grad_norm": 0.6203796863555908, + "learning_rate": 2.4072773132706326e-05, + "loss": 0.0544, + "step": 3492 + }, + { + "epoch": 2.125342257377548, + "grad_norm": 0.5900121927261353, + "learning_rate": 2.4060332960003996e-05, + "loss": 0.0369, + "step": 3493 + }, + { + "epoch": 2.125950714937633, + "grad_norm": 0.6247245669364929, + "learning_rate": 2.4047893020299922e-05, + "loss": 0.0499, + "step": 3494 + }, + { + "epoch": 2.126559172497718, + "grad_norm": 0.5890246033668518, + "learning_rate": 2.403545331667868e-05, + "loss": 0.0455, + "step": 3495 + }, + { + "epoch": 2.1271676300578033, + "grad_norm": 0.7032641768455505, + "learning_rate": 2.4023013852224828e-05, + "loss": 0.0676, + "step": 3496 + }, + { + "epoch": 2.127776087617889, + "grad_norm": 0.5493380427360535, + "learning_rate": 2.401057463002281e-05, + "loss": 0.0512, + "step": 3497 + }, + { + "epoch": 2.128384545177974, + "grad_norm": 0.590238094329834, + "learning_rate": 2.3998135653157035e-05, + "loss": 0.0602, + "step": 3498 + }, + { + "epoch": 2.128993002738059, + "grad_norm": 0.6478054523468018, + "learning_rate": 2.398569692471187e-05, + "loss": 0.0459, + "step": 3499 + }, + { + "epoch": 2.129601460298144, + "grad_norm": 0.6916689872741699, + "learning_rate": 2.397325844777159e-05, + "loss": 0.0472, + "step": 3500 + }, + { + "epoch": 2.130209917858229, + "grad_norm": 0.5881983637809753, + "learning_rate": 2.3960820225420418e-05, + "loss": 0.0581, + "step": 3501 + }, + { + "epoch": 2.1308183754183148, + "grad_norm": 0.5607914924621582, + "learning_rate": 2.39483822607425e-05, + "loss": 0.0485, + "step": 3502 + }, + { + "epoch": 2.1314268329784, + "grad_norm": 0.601393461227417, + "learning_rate": 2.3935944556821966e-05, + "loss": 0.0679, + "step": 3503 + }, + { + "epoch": 2.132035290538485, + "grad_norm": 0.5792237520217896, + "learning_rate": 2.3923507116742826e-05, + "loss": 0.0596, + "step": 3504 + }, + { + "epoch": 2.13264374809857, + "grad_norm": 0.5309643745422363, + "learning_rate": 2.391106994358904e-05, + "loss": 0.0422, + "step": 3505 + }, + { + "epoch": 2.133252205658655, + "grad_norm": 0.61043381690979, + "learning_rate": 2.3898633040444528e-05, + "loss": 0.0519, + "step": 3506 + }, + { + "epoch": 2.1338606632187407, + "grad_norm": 0.5031585693359375, + "learning_rate": 2.388619641039312e-05, + "loss": 0.0334, + "step": 3507 + }, + { + "epoch": 2.134469120778826, + "grad_norm": 0.5993291139602661, + "learning_rate": 2.387376005651856e-05, + "loss": 0.0476, + "step": 3508 + }, + { + "epoch": 2.135077578338911, + "grad_norm": 0.49983397126197815, + "learning_rate": 2.3861323981904575e-05, + "loss": 0.0476, + "step": 3509 + }, + { + "epoch": 2.135686035898996, + "grad_norm": 0.6096305847167969, + "learning_rate": 2.3848888189634778e-05, + "loss": 0.0449, + "step": 3510 + }, + { + "epoch": 2.136294493459081, + "grad_norm": 0.6761244535446167, + "learning_rate": 2.3836452682792734e-05, + "loss": 0.0474, + "step": 3511 + }, + { + "epoch": 2.1369029510191666, + "grad_norm": 0.6972514390945435, + "learning_rate": 2.382401746446191e-05, + "loss": 0.0652, + "step": 3512 + }, + { + "epoch": 2.1375114085792517, + "grad_norm": 0.6121935844421387, + "learning_rate": 2.3811582537725753e-05, + "loss": 0.0501, + "step": 3513 + }, + { + "epoch": 2.138119866139337, + "grad_norm": 0.6078847646713257, + "learning_rate": 2.379914790566759e-05, + "loss": 0.0568, + "step": 3514 + }, + { + "epoch": 2.138728323699422, + "grad_norm": 0.6310839653015137, + "learning_rate": 2.378671357137068e-05, + "loss": 0.0451, + "step": 3515 + }, + { + "epoch": 2.139336781259507, + "grad_norm": 0.48389577865600586, + "learning_rate": 2.377427953791824e-05, + "loss": 0.0449, + "step": 3516 + }, + { + "epoch": 2.1399452388195925, + "grad_norm": 0.6960240006446838, + "learning_rate": 2.3761845808393388e-05, + "loss": 0.0582, + "step": 3517 + }, + { + "epoch": 2.1405536963796776, + "grad_norm": 0.571576714515686, + "learning_rate": 2.3749412385879154e-05, + "loss": 0.0661, + "step": 3518 + }, + { + "epoch": 2.1411621539397627, + "grad_norm": 0.6685907244682312, + "learning_rate": 2.3736979273458535e-05, + "loss": 0.0555, + "step": 3519 + }, + { + "epoch": 2.141770611499848, + "grad_norm": 0.6551706194877625, + "learning_rate": 2.3724546474214406e-05, + "loss": 0.0392, + "step": 3520 + }, + { + "epoch": 2.142379069059933, + "grad_norm": 0.609714686870575, + "learning_rate": 2.371211399122958e-05, + "loss": 0.0674, + "step": 3521 + }, + { + "epoch": 2.1429875266200185, + "grad_norm": 0.5921576619148254, + "learning_rate": 2.3699681827586813e-05, + "loss": 0.0498, + "step": 3522 + }, + { + "epoch": 2.1435959841801036, + "grad_norm": 0.5523459315299988, + "learning_rate": 2.3687249986368752e-05, + "loss": 0.0502, + "step": 3523 + }, + { + "epoch": 2.1442044417401886, + "grad_norm": 0.6032785177230835, + "learning_rate": 2.3674818470657975e-05, + "loss": 0.0567, + "step": 3524 + }, + { + "epoch": 2.1448128993002737, + "grad_norm": 0.5920435786247253, + "learning_rate": 2.3662387283536978e-05, + "loss": 0.0389, + "step": 3525 + }, + { + "epoch": 2.145421356860359, + "grad_norm": 0.5291022062301636, + "learning_rate": 2.3649956428088187e-05, + "loss": 0.042, + "step": 3526 + }, + { + "epoch": 2.1460298144204444, + "grad_norm": 0.6911409497261047, + "learning_rate": 2.363752590739392e-05, + "loss": 0.0591, + "step": 3527 + }, + { + "epoch": 2.1466382719805295, + "grad_norm": 0.6609728336334229, + "learning_rate": 2.362509572453644e-05, + "loss": 0.0463, + "step": 3528 + }, + { + "epoch": 2.1472467295406146, + "grad_norm": 0.5278157591819763, + "learning_rate": 2.3612665882597915e-05, + "loss": 0.0378, + "step": 3529 + }, + { + "epoch": 2.1478551871006997, + "grad_norm": 0.7173718810081482, + "learning_rate": 2.360023638466042e-05, + "loss": 0.0511, + "step": 3530 + }, + { + "epoch": 2.1484636446607848, + "grad_norm": 0.5184733271598816, + "learning_rate": 2.3587807233805956e-05, + "loss": 0.0448, + "step": 3531 + }, + { + "epoch": 2.1490721022208703, + "grad_norm": 0.7443384528160095, + "learning_rate": 2.3575378433116445e-05, + "loss": 0.0522, + "step": 3532 + }, + { + "epoch": 2.1496805597809554, + "grad_norm": 0.5677869915962219, + "learning_rate": 2.356294998567369e-05, + "loss": 0.0391, + "step": 3533 + }, + { + "epoch": 2.1502890173410405, + "grad_norm": 0.7566191554069519, + "learning_rate": 2.3550521894559446e-05, + "loss": 0.0478, + "step": 3534 + }, + { + "epoch": 2.1508974749011256, + "grad_norm": 0.5744057297706604, + "learning_rate": 2.353809416285535e-05, + "loss": 0.0428, + "step": 3535 + }, + { + "epoch": 2.1515059324612107, + "grad_norm": 0.6628245115280151, + "learning_rate": 2.3525666793642968e-05, + "loss": 0.0547, + "step": 3536 + }, + { + "epoch": 2.1521143900212962, + "grad_norm": 0.5155208706855774, + "learning_rate": 2.351323979000377e-05, + "loss": 0.047, + "step": 3537 + }, + { + "epoch": 2.1527228475813813, + "grad_norm": 0.6521499752998352, + "learning_rate": 2.350081315501913e-05, + "loss": 0.0421, + "step": 3538 + }, + { + "epoch": 2.1533313051414664, + "grad_norm": 0.6431947946548462, + "learning_rate": 2.3488386891770346e-05, + "loss": 0.0441, + "step": 3539 + }, + { + "epoch": 2.1539397627015515, + "grad_norm": 0.6007315516471863, + "learning_rate": 2.3475961003338594e-05, + "loss": 0.052, + "step": 3540 + }, + { + "epoch": 2.1545482202616366, + "grad_norm": 0.5834375023841858, + "learning_rate": 2.3463535492804994e-05, + "loss": 0.056, + "step": 3541 + }, + { + "epoch": 2.155156677821722, + "grad_norm": 0.6027958989143372, + "learning_rate": 2.345111036325055e-05, + "loss": 0.0483, + "step": 3542 + }, + { + "epoch": 2.1557651353818073, + "grad_norm": 0.7618597745895386, + "learning_rate": 2.3438685617756174e-05, + "loss": 0.0605, + "step": 3543 + }, + { + "epoch": 2.1563735929418923, + "grad_norm": 0.6837668418884277, + "learning_rate": 2.342626125940268e-05, + "loss": 0.0551, + "step": 3544 + }, + { + "epoch": 2.1569820505019774, + "grad_norm": 0.5220887064933777, + "learning_rate": 2.341383729127081e-05, + "loss": 0.0433, + "step": 3545 + }, + { + "epoch": 2.1575905080620625, + "grad_norm": 0.7032208442687988, + "learning_rate": 2.3401413716441166e-05, + "loss": 0.066, + "step": 3546 + }, + { + "epoch": 2.158198965622148, + "grad_norm": 0.5578327178955078, + "learning_rate": 2.3388990537994296e-05, + "loss": 0.0519, + "step": 3547 + }, + { + "epoch": 2.158807423182233, + "grad_norm": 0.686423122882843, + "learning_rate": 2.3376567759010614e-05, + "loss": 0.0479, + "step": 3548 + }, + { + "epoch": 2.1594158807423183, + "grad_norm": 0.7522620558738708, + "learning_rate": 2.3364145382570462e-05, + "loss": 0.0713, + "step": 3549 + }, + { + "epoch": 2.1600243383024034, + "grad_norm": 0.5565488934516907, + "learning_rate": 2.3351723411754074e-05, + "loss": 0.0474, + "step": 3550 + }, + { + "epoch": 2.1606327958624885, + "grad_norm": 0.5260975956916809, + "learning_rate": 2.3339301849641573e-05, + "loss": 0.0464, + "step": 3551 + }, + { + "epoch": 2.161241253422574, + "grad_norm": 0.574340283870697, + "learning_rate": 2.332688069931299e-05, + "loss": 0.0425, + "step": 3552 + }, + { + "epoch": 2.161849710982659, + "grad_norm": 0.5723605751991272, + "learning_rate": 2.3314459963848264e-05, + "loss": 0.0519, + "step": 3553 + }, + { + "epoch": 2.162458168542744, + "grad_norm": 0.6394765377044678, + "learning_rate": 2.33020396463272e-05, + "loss": 0.0454, + "step": 3554 + }, + { + "epoch": 2.1630666261028293, + "grad_norm": 0.6276997923851013, + "learning_rate": 2.328961974982953e-05, + "loss": 0.0485, + "step": 3555 + }, + { + "epoch": 2.1636750836629144, + "grad_norm": 0.6199513077735901, + "learning_rate": 2.3277200277434876e-05, + "loss": 0.0526, + "step": 3556 + }, + { + "epoch": 2.164283541223, + "grad_norm": 0.5897301435470581, + "learning_rate": 2.3264781232222742e-05, + "loss": 0.0539, + "step": 3557 + }, + { + "epoch": 2.164891998783085, + "grad_norm": 0.5835275053977966, + "learning_rate": 2.3252362617272527e-05, + "loss": 0.0548, + "step": 3558 + }, + { + "epoch": 2.16550045634317, + "grad_norm": 0.5806435346603394, + "learning_rate": 2.3239944435663547e-05, + "loss": 0.0499, + "step": 3559 + }, + { + "epoch": 2.166108913903255, + "grad_norm": 0.6462282538414001, + "learning_rate": 2.3227526690474982e-05, + "loss": 0.0611, + "step": 3560 + }, + { + "epoch": 2.1667173714633403, + "grad_norm": 0.58769690990448, + "learning_rate": 2.321510938478591e-05, + "loss": 0.0462, + "step": 3561 + }, + { + "epoch": 2.1673258290234254, + "grad_norm": 0.5471524596214294, + "learning_rate": 2.3202692521675323e-05, + "loss": 0.0348, + "step": 3562 + }, + { + "epoch": 2.167934286583511, + "grad_norm": 0.5656521320343018, + "learning_rate": 2.3190276104222073e-05, + "loss": 0.055, + "step": 3563 + }, + { + "epoch": 2.168542744143596, + "grad_norm": 0.6103416085243225, + "learning_rate": 2.3177860135504907e-05, + "loss": 0.0562, + "step": 3564 + }, + { + "epoch": 2.169151201703681, + "grad_norm": 0.6096418499946594, + "learning_rate": 2.316544461860249e-05, + "loss": 0.0513, + "step": 3565 + }, + { + "epoch": 2.1697596592637662, + "grad_norm": 0.5852376818656921, + "learning_rate": 2.3153029556593335e-05, + "loss": 0.0505, + "step": 3566 + }, + { + "epoch": 2.1703681168238513, + "grad_norm": 0.6946520805358887, + "learning_rate": 2.3140614952555856e-05, + "loss": 0.0659, + "step": 3567 + }, + { + "epoch": 2.170976574383937, + "grad_norm": 0.546222448348999, + "learning_rate": 2.3128200809568375e-05, + "loss": 0.057, + "step": 3568 + }, + { + "epoch": 2.171585031944022, + "grad_norm": 0.5653393864631653, + "learning_rate": 2.3115787130709074e-05, + "loss": 0.0415, + "step": 3569 + }, + { + "epoch": 2.172193489504107, + "grad_norm": 0.5774003863334656, + "learning_rate": 2.3103373919056026e-05, + "loss": 0.0561, + "step": 3570 + }, + { + "epoch": 2.172801947064192, + "grad_norm": 0.5911126136779785, + "learning_rate": 2.309096117768718e-05, + "loss": 0.0538, + "step": 3571 + }, + { + "epoch": 2.1734104046242773, + "grad_norm": 0.549182653427124, + "learning_rate": 2.3078548909680403e-05, + "loss": 0.0474, + "step": 3572 + }, + { + "epoch": 2.174018862184363, + "grad_norm": 0.5475966334342957, + "learning_rate": 2.3066137118113405e-05, + "loss": 0.0486, + "step": 3573 + }, + { + "epoch": 2.174627319744448, + "grad_norm": 0.6825101375579834, + "learning_rate": 2.305372580606378e-05, + "loss": 0.0569, + "step": 3574 + }, + { + "epoch": 2.175235777304533, + "grad_norm": 0.5372380614280701, + "learning_rate": 2.3041314976609043e-05, + "loss": 0.0525, + "step": 3575 + }, + { + "epoch": 2.175844234864618, + "grad_norm": 0.5879507064819336, + "learning_rate": 2.3028904632826555e-05, + "loss": 0.0485, + "step": 3576 + }, + { + "epoch": 2.176452692424703, + "grad_norm": 0.6694579124450684, + "learning_rate": 2.301649477779354e-05, + "loss": 0.0516, + "step": 3577 + }, + { + "epoch": 2.1770611499847887, + "grad_norm": 0.48730021715164185, + "learning_rate": 2.300408541458716e-05, + "loss": 0.0444, + "step": 3578 + }, + { + "epoch": 2.177669607544874, + "grad_norm": 0.6008870005607605, + "learning_rate": 2.29916765462844e-05, + "loss": 0.0413, + "step": 3579 + }, + { + "epoch": 2.178278065104959, + "grad_norm": 0.557727038860321, + "learning_rate": 2.2979268175962134e-05, + "loss": 0.0375, + "step": 3580 + }, + { + "epoch": 2.178886522665044, + "grad_norm": 0.5483887195587158, + "learning_rate": 2.2966860306697148e-05, + "loss": 0.0426, + "step": 3581 + }, + { + "epoch": 2.179494980225129, + "grad_norm": 0.6324334144592285, + "learning_rate": 2.2954452941566057e-05, + "loss": 0.0419, + "step": 3582 + }, + { + "epoch": 2.1801034377852146, + "grad_norm": 0.643545925617218, + "learning_rate": 2.2942046083645375e-05, + "loss": 0.0425, + "step": 3583 + }, + { + "epoch": 2.1807118953452997, + "grad_norm": 0.5663473010063171, + "learning_rate": 2.292963973601147e-05, + "loss": 0.0356, + "step": 3584 + }, + { + "epoch": 2.181320352905385, + "grad_norm": 0.6918662786483765, + "learning_rate": 2.291723390174063e-05, + "loss": 0.0698, + "step": 3585 + }, + { + "epoch": 2.18192881046547, + "grad_norm": 0.8219648003578186, + "learning_rate": 2.2904828583908967e-05, + "loss": 0.0711, + "step": 3586 + }, + { + "epoch": 2.182537268025555, + "grad_norm": 0.5029035806655884, + "learning_rate": 2.289242378559247e-05, + "loss": 0.0496, + "step": 3587 + }, + { + "epoch": 2.1831457255856406, + "grad_norm": 0.5251539945602417, + "learning_rate": 2.288001950986704e-05, + "loss": 0.0411, + "step": 3588 + }, + { + "epoch": 2.1837541831457257, + "grad_norm": 0.7362745404243469, + "learning_rate": 2.2867615759808403e-05, + "loss": 0.0523, + "step": 3589 + }, + { + "epoch": 2.1843626407058108, + "grad_norm": 0.5987080335617065, + "learning_rate": 2.2855212538492167e-05, + "loss": 0.0515, + "step": 3590 + }, + { + "epoch": 2.184971098265896, + "grad_norm": 0.5399122834205627, + "learning_rate": 2.2842809848993834e-05, + "loss": 0.0555, + "step": 3591 + }, + { + "epoch": 2.185579555825981, + "grad_norm": 0.6066261529922485, + "learning_rate": 2.2830407694388743e-05, + "loss": 0.0404, + "step": 3592 + }, + { + "epoch": 2.1861880133860665, + "grad_norm": 0.5191651582717896, + "learning_rate": 2.281800607775211e-05, + "loss": 0.0519, + "step": 3593 + }, + { + "epoch": 2.1867964709461516, + "grad_norm": 0.6118345856666565, + "learning_rate": 2.2805605002159007e-05, + "loss": 0.05, + "step": 3594 + }, + { + "epoch": 2.1874049285062367, + "grad_norm": 0.6577736139297485, + "learning_rate": 2.2793204470684406e-05, + "loss": 0.0558, + "step": 3595 + }, + { + "epoch": 2.188013386066322, + "grad_norm": 0.5983335375785828, + "learning_rate": 2.2780804486403115e-05, + "loss": 0.064, + "step": 3596 + }, + { + "epoch": 2.188621843626407, + "grad_norm": 0.6087679266929626, + "learning_rate": 2.2768405052389802e-05, + "loss": 0.0561, + "step": 3597 + }, + { + "epoch": 2.1892303011864924, + "grad_norm": 0.5540795922279358, + "learning_rate": 2.2756006171719027e-05, + "loss": 0.0431, + "step": 3598 + }, + { + "epoch": 2.1898387587465775, + "grad_norm": 0.5119609236717224, + "learning_rate": 2.2743607847465188e-05, + "loss": 0.0466, + "step": 3599 + }, + { + "epoch": 2.1904472163066626, + "grad_norm": 0.5084932446479797, + "learning_rate": 2.273121008270254e-05, + "loss": 0.0535, + "step": 3600 + }, + { + "epoch": 2.1910556738667477, + "grad_norm": 0.5782137513160706, + "learning_rate": 2.2718812880505232e-05, + "loss": 0.0635, + "step": 3601 + }, + { + "epoch": 2.191664131426833, + "grad_norm": 0.6178430318832397, + "learning_rate": 2.2706416243947248e-05, + "loss": 0.0442, + "step": 3602 + }, + { + "epoch": 2.1922725889869183, + "grad_norm": 0.5488747954368591, + "learning_rate": 2.2694020176102427e-05, + "loss": 0.0394, + "step": 3603 + }, + { + "epoch": 2.1928810465470034, + "grad_norm": 0.6640757322311401, + "learning_rate": 2.268162468004449e-05, + "loss": 0.0524, + "step": 3604 + }, + { + "epoch": 2.1934895041070885, + "grad_norm": 0.5369768738746643, + "learning_rate": 2.2669229758846998e-05, + "loss": 0.044, + "step": 3605 + }, + { + "epoch": 2.1940979616671736, + "grad_norm": 0.554397702217102, + "learning_rate": 2.265683541558338e-05, + "loss": 0.0384, + "step": 3606 + }, + { + "epoch": 2.1947064192272587, + "grad_norm": 0.6047532558441162, + "learning_rate": 2.2644441653326892e-05, + "loss": 0.0555, + "step": 3607 + }, + { + "epoch": 2.1953148767873443, + "grad_norm": 0.5616478323936462, + "learning_rate": 2.2632048475150705e-05, + "loss": 0.0516, + "step": 3608 + }, + { + "epoch": 2.1959233343474294, + "grad_norm": 0.5725663900375366, + "learning_rate": 2.2619655884127793e-05, + "loss": 0.0551, + "step": 3609 + }, + { + "epoch": 2.1965317919075145, + "grad_norm": 0.6617404222488403, + "learning_rate": 2.2607263883330994e-05, + "loss": 0.0457, + "step": 3610 + }, + { + "epoch": 2.1971402494675996, + "grad_norm": 1.2666044235229492, + "learning_rate": 2.259487247583303e-05, + "loss": 0.078, + "step": 3611 + }, + { + "epoch": 2.1977487070276847, + "grad_norm": 0.7400673031806946, + "learning_rate": 2.258248166470644e-05, + "loss": 0.0534, + "step": 3612 + }, + { + "epoch": 2.19835716458777, + "grad_norm": 0.6314421892166138, + "learning_rate": 2.257009145302362e-05, + "loss": 0.039, + "step": 3613 + }, + { + "epoch": 2.1989656221478553, + "grad_norm": 0.5742847919464111, + "learning_rate": 2.2557701843856847e-05, + "loss": 0.0468, + "step": 3614 + }, + { + "epoch": 2.1995740797079404, + "grad_norm": 0.66370689868927, + "learning_rate": 2.2545312840278214e-05, + "loss": 0.0853, + "step": 3615 + }, + { + "epoch": 2.2001825372680255, + "grad_norm": 0.5838887095451355, + "learning_rate": 2.2532924445359686e-05, + "loss": 0.0373, + "step": 3616 + }, + { + "epoch": 2.2007909948281106, + "grad_norm": 0.6079072952270508, + "learning_rate": 2.252053666217305e-05, + "loss": 0.0372, + "step": 3617 + }, + { + "epoch": 2.201399452388196, + "grad_norm": 0.7246667742729187, + "learning_rate": 2.250814949378998e-05, + "loss": 0.057, + "step": 3618 + }, + { + "epoch": 2.202007909948281, + "grad_norm": 0.5479733943939209, + "learning_rate": 2.2495762943281974e-05, + "loss": 0.0404, + "step": 3619 + }, + { + "epoch": 2.2026163675083663, + "grad_norm": 0.5117501020431519, + "learning_rate": 2.248337701372036e-05, + "loss": 0.0419, + "step": 3620 + }, + { + "epoch": 2.2032248250684514, + "grad_norm": 0.735734224319458, + "learning_rate": 2.2470991708176364e-05, + "loss": 0.0618, + "step": 3621 + }, + { + "epoch": 2.2038332826285365, + "grad_norm": 0.4875301122665405, + "learning_rate": 2.245860702972101e-05, + "loss": 0.0497, + "step": 3622 + }, + { + "epoch": 2.204441740188622, + "grad_norm": 0.6231479644775391, + "learning_rate": 2.244622298142517e-05, + "loss": 0.0457, + "step": 3623 + }, + { + "epoch": 2.205050197748707, + "grad_norm": 0.5008909106254578, + "learning_rate": 2.2433839566359593e-05, + "loss": 0.0479, + "step": 3624 + }, + { + "epoch": 2.2056586553087922, + "grad_norm": 0.49350735545158386, + "learning_rate": 2.2421456787594845e-05, + "loss": 0.0543, + "step": 3625 + }, + { + "epoch": 2.2062671128688773, + "grad_norm": 0.6941964626312256, + "learning_rate": 2.240907464820132e-05, + "loss": 0.0747, + "step": 3626 + }, + { + "epoch": 2.2068755704289624, + "grad_norm": 0.5441345572471619, + "learning_rate": 2.2396693151249303e-05, + "loss": 0.047, + "step": 3627 + }, + { + "epoch": 2.207484027989048, + "grad_norm": 0.5408318042755127, + "learning_rate": 2.2384312299808868e-05, + "loss": 0.0556, + "step": 3628 + }, + { + "epoch": 2.208092485549133, + "grad_norm": 0.6332383155822754, + "learning_rate": 2.2371932096949957e-05, + "loss": 0.0488, + "step": 3629 + }, + { + "epoch": 2.208700943109218, + "grad_norm": 0.6663236021995544, + "learning_rate": 2.2359552545742332e-05, + "loss": 0.0381, + "step": 3630 + }, + { + "epoch": 2.2093094006693033, + "grad_norm": 0.5764215588569641, + "learning_rate": 2.2347173649255627e-05, + "loss": 0.0576, + "step": 3631 + }, + { + "epoch": 2.2099178582293884, + "grad_norm": 0.4953424632549286, + "learning_rate": 2.2334795410559283e-05, + "loss": 0.0395, + "step": 3632 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.5949841737747192, + "learning_rate": 2.2322417832722574e-05, + "loss": 0.0563, + "step": 3633 + }, + { + "epoch": 2.211134773349559, + "grad_norm": 0.5839807391166687, + "learning_rate": 2.2310040918814647e-05, + "loss": 0.0493, + "step": 3634 + }, + { + "epoch": 2.211743230909644, + "grad_norm": 0.648003101348877, + "learning_rate": 2.2297664671904447e-05, + "loss": 0.0691, + "step": 3635 + }, + { + "epoch": 2.212351688469729, + "grad_norm": 0.582639217376709, + "learning_rate": 2.2285289095060764e-05, + "loss": 0.0542, + "step": 3636 + }, + { + "epoch": 2.2129601460298143, + "grad_norm": 0.6050490140914917, + "learning_rate": 2.2272914191352234e-05, + "loss": 0.0421, + "step": 3637 + }, + { + "epoch": 2.2135686035898994, + "grad_norm": 0.8048490285873413, + "learning_rate": 2.2260539963847317e-05, + "loss": 0.0838, + "step": 3638 + }, + { + "epoch": 2.214177061149985, + "grad_norm": 0.6212171316146851, + "learning_rate": 2.2248166415614305e-05, + "loss": 0.0601, + "step": 3639 + }, + { + "epoch": 2.21478551871007, + "grad_norm": 0.5588234663009644, + "learning_rate": 2.223579354972131e-05, + "loss": 0.0384, + "step": 3640 + }, + { + "epoch": 2.215393976270155, + "grad_norm": 0.5870838761329651, + "learning_rate": 2.2223421369236304e-05, + "loss": 0.0476, + "step": 3641 + }, + { + "epoch": 2.21600243383024, + "grad_norm": 0.6339545845985413, + "learning_rate": 2.2211049877227065e-05, + "loss": 0.0791, + "step": 3642 + }, + { + "epoch": 2.2166108913903253, + "grad_norm": 0.6535237431526184, + "learning_rate": 2.2198679076761196e-05, + "loss": 0.0458, + "step": 3643 + }, + { + "epoch": 2.217219348950411, + "grad_norm": 0.4435776174068451, + "learning_rate": 2.2186308970906166e-05, + "loss": 0.0386, + "step": 3644 + }, + { + "epoch": 2.217827806510496, + "grad_norm": 0.6395978331565857, + "learning_rate": 2.217393956272923e-05, + "loss": 0.0606, + "step": 3645 + }, + { + "epoch": 2.218436264070581, + "grad_norm": 0.6212596297264099, + "learning_rate": 2.2161570855297474e-05, + "loss": 0.0597, + "step": 3646 + }, + { + "epoch": 2.219044721630666, + "grad_norm": 0.49921971559524536, + "learning_rate": 2.2149202851677842e-05, + "loss": 0.041, + "step": 3647 + }, + { + "epoch": 2.2196531791907512, + "grad_norm": 0.45042991638183594, + "learning_rate": 2.213683555493708e-05, + "loss": 0.0454, + "step": 3648 + }, + { + "epoch": 2.2202616367508368, + "grad_norm": 0.5090251564979553, + "learning_rate": 2.2124468968141746e-05, + "loss": 0.0528, + "step": 3649 + }, + { + "epoch": 2.220870094310922, + "grad_norm": 0.6496664881706238, + "learning_rate": 2.211210309435826e-05, + "loss": 0.0516, + "step": 3650 + }, + { + "epoch": 2.221478551871007, + "grad_norm": 0.5032176375389099, + "learning_rate": 2.2099737936652834e-05, + "loss": 0.0365, + "step": 3651 + }, + { + "epoch": 2.222087009431092, + "grad_norm": 0.5386142730712891, + "learning_rate": 2.2087373498091505e-05, + "loss": 0.0489, + "step": 3652 + }, + { + "epoch": 2.222695466991177, + "grad_norm": 0.5866867303848267, + "learning_rate": 2.2075009781740144e-05, + "loss": 0.0507, + "step": 3653 + }, + { + "epoch": 2.2233039245512627, + "grad_norm": 0.6343091726303101, + "learning_rate": 2.2062646790664443e-05, + "loss": 0.05, + "step": 3654 + }, + { + "epoch": 2.223912382111348, + "grad_norm": 0.49765875935554504, + "learning_rate": 2.2050284527929897e-05, + "loss": 0.0349, + "step": 3655 + }, + { + "epoch": 2.224520839671433, + "grad_norm": 0.6474698185920715, + "learning_rate": 2.203792299660184e-05, + "loss": 0.0544, + "step": 3656 + }, + { + "epoch": 2.225129297231518, + "grad_norm": 0.5999079346656799, + "learning_rate": 2.202556219974542e-05, + "loss": 0.0387, + "step": 3657 + }, + { + "epoch": 2.225737754791603, + "grad_norm": 0.5736773014068604, + "learning_rate": 2.2013202140425584e-05, + "loss": 0.0553, + "step": 3658 + }, + { + "epoch": 2.2263462123516886, + "grad_norm": 0.6365640163421631, + "learning_rate": 2.2000842821707122e-05, + "loss": 0.046, + "step": 3659 + }, + { + "epoch": 2.2269546699117737, + "grad_norm": 0.5798011422157288, + "learning_rate": 2.198848424665464e-05, + "loss": 0.0466, + "step": 3660 + }, + { + "epoch": 2.227563127471859, + "grad_norm": 0.599982738494873, + "learning_rate": 2.197612641833253e-05, + "loss": 0.0608, + "step": 3661 + }, + { + "epoch": 2.228171585031944, + "grad_norm": 0.5781305432319641, + "learning_rate": 2.196376933980503e-05, + "loss": 0.0446, + "step": 3662 + }, + { + "epoch": 2.228780042592029, + "grad_norm": 0.6489582061767578, + "learning_rate": 2.1951413014136177e-05, + "loss": 0.0562, + "step": 3663 + }, + { + "epoch": 2.2293885001521145, + "grad_norm": 0.5983100533485413, + "learning_rate": 2.1939057444389822e-05, + "loss": 0.046, + "step": 3664 + }, + { + "epoch": 2.2299969577121996, + "grad_norm": 0.5980080366134644, + "learning_rate": 2.192670263362964e-05, + "loss": 0.0658, + "step": 3665 + }, + { + "epoch": 2.2306054152722847, + "grad_norm": 0.5348019003868103, + "learning_rate": 2.19143485849191e-05, + "loss": 0.0498, + "step": 3666 + }, + { + "epoch": 2.23121387283237, + "grad_norm": 0.5412232875823975, + "learning_rate": 2.1901995301321493e-05, + "loss": 0.0397, + "step": 3667 + }, + { + "epoch": 2.231822330392455, + "grad_norm": 0.5473434925079346, + "learning_rate": 2.1889642785899926e-05, + "loss": 0.0388, + "step": 3668 + }, + { + "epoch": 2.2324307879525405, + "grad_norm": 0.8068600296974182, + "learning_rate": 2.1877291041717294e-05, + "loss": 0.0524, + "step": 3669 + }, + { + "epoch": 2.2330392455126256, + "grad_norm": 0.5853879451751709, + "learning_rate": 2.1864940071836326e-05, + "loss": 0.0582, + "step": 3670 + }, + { + "epoch": 2.2336477030727107, + "grad_norm": 0.5249461531639099, + "learning_rate": 2.1852589879319547e-05, + "loss": 0.048, + "step": 3671 + }, + { + "epoch": 2.2342561606327958, + "grad_norm": 0.5302576422691345, + "learning_rate": 2.1840240467229283e-05, + "loss": 0.0553, + "step": 3672 + }, + { + "epoch": 2.234864618192881, + "grad_norm": 0.5220465660095215, + "learning_rate": 2.1827891838627687e-05, + "loss": 0.0431, + "step": 3673 + }, + { + "epoch": 2.2354730757529664, + "grad_norm": 0.5852304697036743, + "learning_rate": 2.1815543996576688e-05, + "loss": 0.0496, + "step": 3674 + }, + { + "epoch": 2.2360815333130515, + "grad_norm": 0.6456661820411682, + "learning_rate": 2.1803196944138045e-05, + "loss": 0.0519, + "step": 3675 + }, + { + "epoch": 2.2366899908731366, + "grad_norm": 0.5966864824295044, + "learning_rate": 2.1790850684373305e-05, + "loss": 0.0358, + "step": 3676 + }, + { + "epoch": 2.2372984484332217, + "grad_norm": 0.6074363589286804, + "learning_rate": 2.1778505220343836e-05, + "loss": 0.0507, + "step": 3677 + }, + { + "epoch": 2.2379069059933068, + "grad_norm": 0.5367887616157532, + "learning_rate": 2.17661605551108e-05, + "loss": 0.0602, + "step": 3678 + }, + { + "epoch": 2.2385153635533923, + "grad_norm": 0.5965633392333984, + "learning_rate": 2.175381669173514e-05, + "loss": 0.043, + "step": 3679 + }, + { + "epoch": 2.2391238211134774, + "grad_norm": 0.5841217041015625, + "learning_rate": 2.174147363327764e-05, + "loss": 0.0536, + "step": 3680 + }, + { + "epoch": 2.2397322786735625, + "grad_norm": 0.5816332101821899, + "learning_rate": 2.1729131382798858e-05, + "loss": 0.0578, + "step": 3681 + }, + { + "epoch": 2.2403407362336476, + "grad_norm": 0.5089899301528931, + "learning_rate": 2.1716789943359155e-05, + "loss": 0.0424, + "step": 3682 + }, + { + "epoch": 2.2409491937937327, + "grad_norm": 1.4226927757263184, + "learning_rate": 2.1704449318018692e-05, + "loss": 0.0814, + "step": 3683 + }, + { + "epoch": 2.2415576513538182, + "grad_norm": 0.6067140102386475, + "learning_rate": 2.1692109509837442e-05, + "loss": 0.0553, + "step": 3684 + }, + { + "epoch": 2.2421661089139033, + "grad_norm": 0.504880964756012, + "learning_rate": 2.167977052187515e-05, + "loss": 0.0329, + "step": 3685 + }, + { + "epoch": 2.2427745664739884, + "grad_norm": 0.5495906472206116, + "learning_rate": 2.1667432357191364e-05, + "loss": 0.0526, + "step": 3686 + }, + { + "epoch": 2.2433830240340735, + "grad_norm": 0.6342142820358276, + "learning_rate": 2.1655095018845455e-05, + "loss": 0.0491, + "step": 3687 + }, + { + "epoch": 2.2439914815941586, + "grad_norm": 0.5818422436714172, + "learning_rate": 2.1642758509896562e-05, + "loss": 0.0383, + "step": 3688 + }, + { + "epoch": 2.244599939154244, + "grad_norm": 0.6000524163246155, + "learning_rate": 2.1630422833403613e-05, + "loss": 0.0517, + "step": 3689 + }, + { + "epoch": 2.2452083967143293, + "grad_norm": 0.5182064175605774, + "learning_rate": 2.1618087992425356e-05, + "loss": 0.0441, + "step": 3690 + }, + { + "epoch": 2.2458168542744144, + "grad_norm": 0.6751769781112671, + "learning_rate": 2.1605753990020315e-05, + "loss": 0.0608, + "step": 3691 + }, + { + "epoch": 2.2464253118344994, + "grad_norm": 0.6493861079216003, + "learning_rate": 2.1593420829246794e-05, + "loss": 0.0533, + "step": 3692 + }, + { + "epoch": 2.2470337693945845, + "grad_norm": 0.6242604851722717, + "learning_rate": 2.1581088513162923e-05, + "loss": 0.0522, + "step": 3693 + }, + { + "epoch": 2.24764222695467, + "grad_norm": 0.5824303030967712, + "learning_rate": 2.1568757044826595e-05, + "loss": 0.0507, + "step": 3694 + }, + { + "epoch": 2.248250684514755, + "grad_norm": 0.6880918741226196, + "learning_rate": 2.1556426427295488e-05, + "loss": 0.0582, + "step": 3695 + }, + { + "epoch": 2.2488591420748403, + "grad_norm": 0.6002342104911804, + "learning_rate": 2.1544096663627104e-05, + "loss": 0.053, + "step": 3696 + }, + { + "epoch": 2.2494675996349254, + "grad_norm": 0.5104690790176392, + "learning_rate": 2.1531767756878696e-05, + "loss": 0.0394, + "step": 3697 + }, + { + "epoch": 2.2500760571950105, + "grad_norm": 0.5821288228034973, + "learning_rate": 2.151943971010732e-05, + "loss": 0.0422, + "step": 3698 + }, + { + "epoch": 2.250684514755096, + "grad_norm": 0.6568170785903931, + "learning_rate": 2.150711252636981e-05, + "loss": 0.0554, + "step": 3699 + }, + { + "epoch": 2.251292972315181, + "grad_norm": 0.5166933536529541, + "learning_rate": 2.1494786208722814e-05, + "loss": 0.0454, + "step": 3700 + }, + { + "epoch": 2.251901429875266, + "grad_norm": 0.6102219223976135, + "learning_rate": 2.1482460760222733e-05, + "loss": 0.0525, + "step": 3701 + }, + { + "epoch": 2.2525098874353513, + "grad_norm": 0.5432948470115662, + "learning_rate": 2.1470136183925755e-05, + "loss": 0.0538, + "step": 3702 + }, + { + "epoch": 2.2531183449954364, + "grad_norm": 0.5394953489303589, + "learning_rate": 2.1457812482887882e-05, + "loss": 0.0412, + "step": 3703 + }, + { + "epoch": 2.253726802555522, + "grad_norm": 0.5707597136497498, + "learning_rate": 2.1445489660164868e-05, + "loss": 0.0498, + "step": 3704 + }, + { + "epoch": 2.254335260115607, + "grad_norm": 0.5889186263084412, + "learning_rate": 2.1433167718812247e-05, + "loss": 0.0554, + "step": 3705 + }, + { + "epoch": 2.254943717675692, + "grad_norm": 0.5888684391975403, + "learning_rate": 2.142084666188537e-05, + "loss": 0.0512, + "step": 3706 + }, + { + "epoch": 2.2555521752357772, + "grad_norm": 0.4928998053073883, + "learning_rate": 2.1408526492439336e-05, + "loss": 0.0578, + "step": 3707 + }, + { + "epoch": 2.2561606327958623, + "grad_norm": 0.5299960374832153, + "learning_rate": 2.139620721352903e-05, + "loss": 0.0405, + "step": 3708 + }, + { + "epoch": 2.256769090355948, + "grad_norm": 0.5291889905929565, + "learning_rate": 2.138388882820911e-05, + "loss": 0.0555, + "step": 3709 + }, + { + "epoch": 2.257377547916033, + "grad_norm": 0.6831831932067871, + "learning_rate": 2.1371571339534046e-05, + "loss": 0.0863, + "step": 3710 + }, + { + "epoch": 2.257986005476118, + "grad_norm": 0.5145639181137085, + "learning_rate": 2.135925475055805e-05, + "loss": 0.0474, + "step": 3711 + }, + { + "epoch": 2.258594463036203, + "grad_norm": 0.5370884537696838, + "learning_rate": 2.134693906433511e-05, + "loss": 0.0416, + "step": 3712 + }, + { + "epoch": 2.2592029205962882, + "grad_norm": 0.5465013384819031, + "learning_rate": 2.1334624283919026e-05, + "loss": 0.0537, + "step": 3713 + }, + { + "epoch": 2.259811378156374, + "grad_norm": 0.5709042549133301, + "learning_rate": 2.132231041236334e-05, + "loss": 0.0366, + "step": 3714 + }, + { + "epoch": 2.260419835716459, + "grad_norm": 0.6409594416618347, + "learning_rate": 2.1309997452721366e-05, + "loss": 0.0586, + "step": 3715 + }, + { + "epoch": 2.261028293276544, + "grad_norm": 0.634441077709198, + "learning_rate": 2.129768540804623e-05, + "loss": 0.0634, + "step": 3716 + }, + { + "epoch": 2.261636750836629, + "grad_norm": 0.5557528734207153, + "learning_rate": 2.128537428139079e-05, + "loss": 0.0429, + "step": 3717 + }, + { + "epoch": 2.262245208396714, + "grad_norm": 0.678367018699646, + "learning_rate": 2.1273064075807686e-05, + "loss": 0.0686, + "step": 3718 + }, + { + "epoch": 2.2628536659567997, + "grad_norm": 0.5972436666488647, + "learning_rate": 2.1260754794349354e-05, + "loss": 0.0461, + "step": 3719 + }, + { + "epoch": 2.263462123516885, + "grad_norm": 0.664558470249176, + "learning_rate": 2.1248446440067976e-05, + "loss": 0.0467, + "step": 3720 + }, + { + "epoch": 2.26407058107697, + "grad_norm": 0.6116575598716736, + "learning_rate": 2.1236139016015507e-05, + "loss": 0.0517, + "step": 3721 + }, + { + "epoch": 2.264679038637055, + "grad_norm": 0.5580343008041382, + "learning_rate": 2.1223832525243663e-05, + "loss": 0.0533, + "step": 3722 + }, + { + "epoch": 2.26528749619714, + "grad_norm": 0.5420104265213013, + "learning_rate": 2.1211526970803973e-05, + "loss": 0.039, + "step": 3723 + }, + { + "epoch": 2.2658959537572256, + "grad_norm": 1.199730634689331, + "learning_rate": 2.1199222355747674e-05, + "loss": 0.0618, + "step": 3724 + }, + { + "epoch": 2.2665044113173107, + "grad_norm": 0.6017125844955444, + "learning_rate": 2.1186918683125802e-05, + "loss": 0.0559, + "step": 3725 + }, + { + "epoch": 2.267112868877396, + "grad_norm": 0.5252535343170166, + "learning_rate": 2.117461595598917e-05, + "loss": 0.0335, + "step": 3726 + }, + { + "epoch": 2.267721326437481, + "grad_norm": 0.6079027652740479, + "learning_rate": 2.1162314177388327e-05, + "loss": 0.0508, + "step": 3727 + }, + { + "epoch": 2.268329783997566, + "grad_norm": 0.49775218963623047, + "learning_rate": 2.1150013350373594e-05, + "loss": 0.0402, + "step": 3728 + }, + { + "epoch": 2.2689382415576516, + "grad_norm": 0.5613334774971008, + "learning_rate": 2.113771347799509e-05, + "loss": 0.0483, + "step": 3729 + }, + { + "epoch": 2.2695466991177367, + "grad_norm": 0.5892758965492249, + "learning_rate": 2.1125414563302654e-05, + "loss": 0.0589, + "step": 3730 + }, + { + "epoch": 2.2701551566778218, + "grad_norm": 0.6577876806259155, + "learning_rate": 2.1113116609345893e-05, + "loss": 0.0417, + "step": 3731 + }, + { + "epoch": 2.270763614237907, + "grad_norm": 0.6426102519035339, + "learning_rate": 2.110081961917421e-05, + "loss": 0.0514, + "step": 3732 + }, + { + "epoch": 2.271372071797992, + "grad_norm": 0.5062007308006287, + "learning_rate": 2.108852359583674e-05, + "loss": 0.0332, + "step": 3733 + }, + { + "epoch": 2.2719805293580775, + "grad_norm": 0.5272140502929688, + "learning_rate": 2.1076228542382376e-05, + "loss": 0.0393, + "step": 3734 + }, + { + "epoch": 2.2725889869181626, + "grad_norm": 0.6248688697814941, + "learning_rate": 2.1063934461859774e-05, + "loss": 0.0518, + "step": 3735 + }, + { + "epoch": 2.2731974444782477, + "grad_norm": 0.6348571181297302, + "learning_rate": 2.105164135731737e-05, + "loss": 0.0552, + "step": 3736 + }, + { + "epoch": 2.2738059020383328, + "grad_norm": 0.5358805656433105, + "learning_rate": 2.1039349231803337e-05, + "loss": 0.0462, + "step": 3737 + }, + { + "epoch": 2.274414359598418, + "grad_norm": 0.5874552130699158, + "learning_rate": 2.1027058088365593e-05, + "loss": 0.0503, + "step": 3738 + }, + { + "epoch": 2.2750228171585034, + "grad_norm": 0.5456854104995728, + "learning_rate": 2.1014767930051856e-05, + "loss": 0.0462, + "step": 3739 + }, + { + "epoch": 2.2756312747185885, + "grad_norm": 0.5097061991691589, + "learning_rate": 2.1002478759909558e-05, + "loss": 0.0488, + "step": 3740 + }, + { + "epoch": 2.2762397322786736, + "grad_norm": 0.5960326790809631, + "learning_rate": 2.0990190580985892e-05, + "loss": 0.057, + "step": 3741 + }, + { + "epoch": 2.2768481898387587, + "grad_norm": 0.6707984805107117, + "learning_rate": 2.0977903396327837e-05, + "loss": 0.0909, + "step": 3742 + }, + { + "epoch": 2.277456647398844, + "grad_norm": 0.5370257496833801, + "learning_rate": 2.096561720898209e-05, + "loss": 0.0396, + "step": 3743 + }, + { + "epoch": 2.2780651049589293, + "grad_norm": 0.5569398403167725, + "learning_rate": 2.0953332021995114e-05, + "loss": 0.0492, + "step": 3744 + }, + { + "epoch": 2.2786735625190144, + "grad_norm": 0.5827397108078003, + "learning_rate": 2.094104783841311e-05, + "loss": 0.0591, + "step": 3745 + }, + { + "epoch": 2.2792820200790995, + "grad_norm": 0.5374563336372375, + "learning_rate": 2.0928764661282068e-05, + "loss": 0.0438, + "step": 3746 + }, + { + "epoch": 2.2798904776391846, + "grad_norm": 0.41728445887565613, + "learning_rate": 2.0916482493647694e-05, + "loss": 0.0278, + "step": 3747 + }, + { + "epoch": 2.2804989351992697, + "grad_norm": 0.5607496500015259, + "learning_rate": 2.0904201338555442e-05, + "loss": 0.0335, + "step": 3748 + }, + { + "epoch": 2.2811073927593553, + "grad_norm": 0.6181521415710449, + "learning_rate": 2.0891921199050547e-05, + "loss": 0.0428, + "step": 3749 + }, + { + "epoch": 2.2817158503194404, + "grad_norm": 0.6066822409629822, + "learning_rate": 2.087964207817796e-05, + "loss": 0.051, + "step": 3750 + }, + { + "epoch": 2.2823243078795254, + "grad_norm": 0.5879988670349121, + "learning_rate": 2.0867363978982383e-05, + "loss": 0.0372, + "step": 3751 + }, + { + "epoch": 2.2829327654396105, + "grad_norm": 0.6240618228912354, + "learning_rate": 2.085508690450829e-05, + "loss": 0.0432, + "step": 3752 + }, + { + "epoch": 2.2835412229996956, + "grad_norm": 0.5587534308433533, + "learning_rate": 2.084281085779988e-05, + "loss": 0.0415, + "step": 3753 + }, + { + "epoch": 2.284149680559781, + "grad_norm": 0.545318067073822, + "learning_rate": 2.083053584190109e-05, + "loss": 0.0417, + "step": 3754 + }, + { + "epoch": 2.2847581381198663, + "grad_norm": 0.5254773497581482, + "learning_rate": 2.0818261859855624e-05, + "loss": 0.0284, + "step": 3755 + }, + { + "epoch": 2.2853665956799514, + "grad_norm": 0.6514537930488586, + "learning_rate": 2.0805988914706913e-05, + "loss": 0.0714, + "step": 3756 + }, + { + "epoch": 2.2859750532400365, + "grad_norm": 0.5673707127571106, + "learning_rate": 2.079371700949814e-05, + "loss": 0.0426, + "step": 3757 + }, + { + "epoch": 2.2865835108001216, + "grad_norm": 0.8576644062995911, + "learning_rate": 2.078144614727221e-05, + "loss": 0.0707, + "step": 3758 + }, + { + "epoch": 2.287191968360207, + "grad_norm": 0.6915388107299805, + "learning_rate": 2.0769176331071805e-05, + "loss": 0.052, + "step": 3759 + }, + { + "epoch": 2.287800425920292, + "grad_norm": 0.536024808883667, + "learning_rate": 2.0756907563939327e-05, + "loss": 0.041, + "step": 3760 + }, + { + "epoch": 2.2884088834803773, + "grad_norm": 0.5904122591018677, + "learning_rate": 2.0744639848916895e-05, + "loss": 0.047, + "step": 3761 + }, + { + "epoch": 2.2890173410404624, + "grad_norm": 0.49669381976127625, + "learning_rate": 2.073237318904642e-05, + "loss": 0.0417, + "step": 3762 + }, + { + "epoch": 2.2896257986005475, + "grad_norm": 0.540749192237854, + "learning_rate": 2.0720107587369513e-05, + "loss": 0.0603, + "step": 3763 + }, + { + "epoch": 2.290234256160633, + "grad_norm": 0.5298412442207336, + "learning_rate": 2.070784304692752e-05, + "loss": 0.056, + "step": 3764 + }, + { + "epoch": 2.290842713720718, + "grad_norm": 0.623453676700592, + "learning_rate": 2.0695579570761552e-05, + "loss": 0.055, + "step": 3765 + }, + { + "epoch": 2.2914511712808032, + "grad_norm": 0.5402014255523682, + "learning_rate": 2.0683317161912437e-05, + "loss": 0.0448, + "step": 3766 + }, + { + "epoch": 2.2920596288408883, + "grad_norm": 0.5555523633956909, + "learning_rate": 2.067105582342073e-05, + "loss": 0.0432, + "step": 3767 + }, + { + "epoch": 2.2926680864009734, + "grad_norm": 0.4748152792453766, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.0386, + "step": 3768 + }, + { + "epoch": 2.293276543961059, + "grad_norm": 0.5108730792999268, + "learning_rate": 2.064653636967051e-05, + "loss": 0.0442, + "step": 3769 + }, + { + "epoch": 2.293885001521144, + "grad_norm": 0.6018065810203552, + "learning_rate": 2.063427826049179e-05, + "loss": 0.0436, + "step": 3770 + }, + { + "epoch": 2.294493459081229, + "grad_norm": 0.5590872168540955, + "learning_rate": 2.062202123383009e-05, + "loss": 0.0568, + "step": 3771 + }, + { + "epoch": 2.2951019166413142, + "grad_norm": 0.6751280426979065, + "learning_rate": 2.0609765292724647e-05, + "loss": 0.0736, + "step": 3772 + }, + { + "epoch": 2.2957103742013993, + "grad_norm": 0.5929213762283325, + "learning_rate": 2.059751044021441e-05, + "loss": 0.0584, + "step": 3773 + }, + { + "epoch": 2.296318831761485, + "grad_norm": 1.4224424362182617, + "learning_rate": 2.0585256679338083e-05, + "loss": 0.0544, + "step": 3774 + }, + { + "epoch": 2.29692728932157, + "grad_norm": 0.5579968690872192, + "learning_rate": 2.0573004013134088e-05, + "loss": 0.0644, + "step": 3775 + }, + { + "epoch": 2.297535746881655, + "grad_norm": 0.6264181137084961, + "learning_rate": 2.0560752444640567e-05, + "loss": 0.0596, + "step": 3776 + }, + { + "epoch": 2.29814420444174, + "grad_norm": 0.6934359669685364, + "learning_rate": 2.0548501976895395e-05, + "loss": 0.0666, + "step": 3777 + }, + { + "epoch": 2.2987526620018253, + "grad_norm": 0.5914580821990967, + "learning_rate": 2.0536252612936196e-05, + "loss": 0.0455, + "step": 3778 + }, + { + "epoch": 2.299361119561911, + "grad_norm": 0.4668748080730438, + "learning_rate": 2.0524004355800292e-05, + "loss": 0.0424, + "step": 3779 + }, + { + "epoch": 2.299969577121996, + "grad_norm": 0.5458466410636902, + "learning_rate": 2.0511757208524734e-05, + "loss": 0.0447, + "step": 3780 + }, + { + "epoch": 2.300578034682081, + "grad_norm": 0.6712079644203186, + "learning_rate": 2.0499511174146307e-05, + "loss": 0.0512, + "step": 3781 + }, + { + "epoch": 2.301186492242166, + "grad_norm": 0.5523056387901306, + "learning_rate": 2.0487266255701527e-05, + "loss": 0.0471, + "step": 3782 + }, + { + "epoch": 2.301794949802251, + "grad_norm": 0.612053632736206, + "learning_rate": 2.0475022456226606e-05, + "loss": 0.0547, + "step": 3783 + }, + { + "epoch": 2.3024034073623363, + "grad_norm": 0.6411483883857727, + "learning_rate": 2.0462779778757507e-05, + "loss": 0.0698, + "step": 3784 + }, + { + "epoch": 2.303011864922422, + "grad_norm": 0.5178706645965576, + "learning_rate": 2.045053822632991e-05, + "loss": 0.0485, + "step": 3785 + }, + { + "epoch": 2.303620322482507, + "grad_norm": 0.5511070489883423, + "learning_rate": 2.0438297801979196e-05, + "loss": 0.0442, + "step": 3786 + }, + { + "epoch": 2.304228780042592, + "grad_norm": 0.5382661819458008, + "learning_rate": 2.0426058508740483e-05, + "loss": 0.0564, + "step": 3787 + }, + { + "epoch": 2.304837237602677, + "grad_norm": 0.5051350593566895, + "learning_rate": 2.041382034964862e-05, + "loss": 0.0333, + "step": 3788 + }, + { + "epoch": 2.305445695162762, + "grad_norm": 0.6360859870910645, + "learning_rate": 2.040158332773814e-05, + "loss": 0.0551, + "step": 3789 + }, + { + "epoch": 2.3060541527228477, + "grad_norm": 0.651378333568573, + "learning_rate": 2.0389347446043325e-05, + "loss": 0.0489, + "step": 3790 + }, + { + "epoch": 2.306662610282933, + "grad_norm": 0.4725477397441864, + "learning_rate": 2.0377112707598163e-05, + "loss": 0.0325, + "step": 3791 + }, + { + "epoch": 2.307271067843018, + "grad_norm": 0.6588869690895081, + "learning_rate": 2.0364879115436352e-05, + "loss": 0.0461, + "step": 3792 + }, + { + "epoch": 2.307879525403103, + "grad_norm": 0.5729846954345703, + "learning_rate": 2.0352646672591328e-05, + "loss": 0.0658, + "step": 3793 + }, + { + "epoch": 2.308487982963188, + "grad_norm": 0.6155021786689758, + "learning_rate": 2.034041538209621e-05, + "loss": 0.0583, + "step": 3794 + }, + { + "epoch": 2.3090964405232737, + "grad_norm": 0.5684482455253601, + "learning_rate": 2.0328185246983862e-05, + "loss": 0.0488, + "step": 3795 + }, + { + "epoch": 2.3097048980833588, + "grad_norm": 0.5797809362411499, + "learning_rate": 2.031595627028684e-05, + "loss": 0.0394, + "step": 3796 + }, + { + "epoch": 2.310313355643444, + "grad_norm": 0.7263544201850891, + "learning_rate": 2.0303728455037422e-05, + "loss": 0.0685, + "step": 3797 + }, + { + "epoch": 2.310921813203529, + "grad_norm": 0.5464826226234436, + "learning_rate": 2.0291501804267592e-05, + "loss": 0.0483, + "step": 3798 + }, + { + "epoch": 2.311530270763614, + "grad_norm": 0.5334215760231018, + "learning_rate": 2.0279276321009067e-05, + "loss": 0.0416, + "step": 3799 + }, + { + "epoch": 2.3121387283236996, + "grad_norm": 0.5960489511489868, + "learning_rate": 2.026705200829323e-05, + "loss": 0.0465, + "step": 3800 + }, + { + "epoch": 2.3127471858837847, + "grad_norm": 0.5816206932067871, + "learning_rate": 2.0254828869151225e-05, + "loss": 0.0482, + "step": 3801 + }, + { + "epoch": 2.31335564344387, + "grad_norm": 0.5829434990882874, + "learning_rate": 2.0242606906613876e-05, + "loss": 0.0693, + "step": 3802 + }, + { + "epoch": 2.313964101003955, + "grad_norm": 0.5510879158973694, + "learning_rate": 2.0230386123711714e-05, + "loss": 0.0428, + "step": 3803 + }, + { + "epoch": 2.31457255856404, + "grad_norm": 0.6710597276687622, + "learning_rate": 2.0218166523474973e-05, + "loss": 0.0483, + "step": 3804 + }, + { + "epoch": 2.3151810161241255, + "grad_norm": 0.5956816673278809, + "learning_rate": 2.020594810893363e-05, + "loss": 0.0566, + "step": 3805 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.6392737030982971, + "learning_rate": 2.0193730883117335e-05, + "loss": 0.0547, + "step": 3806 + }, + { + "epoch": 2.3163979312442957, + "grad_norm": 0.548627495765686, + "learning_rate": 2.018151484905543e-05, + "loss": 0.0348, + "step": 3807 + }, + { + "epoch": 2.317006388804381, + "grad_norm": 0.560031533241272, + "learning_rate": 2.016930000977701e-05, + "loss": 0.0522, + "step": 3808 + }, + { + "epoch": 2.317614846364466, + "grad_norm": 0.6135550141334534, + "learning_rate": 2.0157086368310836e-05, + "loss": 0.0511, + "step": 3809 + }, + { + "epoch": 2.3182233039245514, + "grad_norm": 0.6668503284454346, + "learning_rate": 2.014487392768537e-05, + "loss": 0.0485, + "step": 3810 + }, + { + "epoch": 2.3188317614846365, + "grad_norm": 0.6775378584861755, + "learning_rate": 2.01326626909288e-05, + "loss": 0.0683, + "step": 3811 + }, + { + "epoch": 2.3194402190447216, + "grad_norm": 0.48050111532211304, + "learning_rate": 2.012045266106901e-05, + "loss": 0.0377, + "step": 3812 + }, + { + "epoch": 2.3200486766048067, + "grad_norm": 0.9371777772903442, + "learning_rate": 2.010824384113357e-05, + "loss": 0.0584, + "step": 3813 + }, + { + "epoch": 2.320657134164892, + "grad_norm": 0.6224126815795898, + "learning_rate": 2.0096036234149746e-05, + "loss": 0.0479, + "step": 3814 + }, + { + "epoch": 2.3212655917249774, + "grad_norm": 0.6318977475166321, + "learning_rate": 2.0083829843144543e-05, + "loss": 0.0657, + "step": 3815 + }, + { + "epoch": 2.3218740492850625, + "grad_norm": 0.5769774913787842, + "learning_rate": 2.0071624671144616e-05, + "loss": 0.0409, + "step": 3816 + }, + { + "epoch": 2.3224825068451476, + "grad_norm": 0.5656179189682007, + "learning_rate": 2.005942072117634e-05, + "loss": 0.0515, + "step": 3817 + }, + { + "epoch": 2.3230909644052327, + "grad_norm": 0.5796024799346924, + "learning_rate": 2.00472179962658e-05, + "loss": 0.0501, + "step": 3818 + }, + { + "epoch": 2.3236994219653178, + "grad_norm": 0.5330328345298767, + "learning_rate": 2.0035016499438758e-05, + "loss": 0.0495, + "step": 3819 + }, + { + "epoch": 2.324307879525403, + "grad_norm": 0.6313185691833496, + "learning_rate": 2.002281623372066e-05, + "loss": 0.045, + "step": 3820 + }, + { + "epoch": 2.3249163370854884, + "grad_norm": 0.5151857137680054, + "learning_rate": 2.001061720213669e-05, + "loss": 0.0319, + "step": 3821 + }, + { + "epoch": 2.3255247946455735, + "grad_norm": 0.6799402236938477, + "learning_rate": 1.9998419407711686e-05, + "loss": 0.0514, + "step": 3822 + }, + { + "epoch": 2.3261332522056586, + "grad_norm": 0.499461829662323, + "learning_rate": 1.998622285347018e-05, + "loss": 0.0368, + "step": 3823 + }, + { + "epoch": 2.3267417097657437, + "grad_norm": 0.5110387206077576, + "learning_rate": 1.9974027542436433e-05, + "loss": 0.0464, + "step": 3824 + }, + { + "epoch": 2.327350167325829, + "grad_norm": 0.7454649806022644, + "learning_rate": 1.996183347763436e-05, + "loss": 0.076, + "step": 3825 + }, + { + "epoch": 2.3279586248859143, + "grad_norm": 0.5358094573020935, + "learning_rate": 1.994964066208759e-05, + "loss": 0.0465, + "step": 3826 + }, + { + "epoch": 2.3285670824459994, + "grad_norm": 0.6061232686042786, + "learning_rate": 1.993744909881941e-05, + "loss": 0.0434, + "step": 3827 + }, + { + "epoch": 2.3291755400060845, + "grad_norm": 0.6150352358818054, + "learning_rate": 1.9925258790852852e-05, + "loss": 0.0528, + "step": 3828 + }, + { + "epoch": 2.3297839975661696, + "grad_norm": 0.5760040283203125, + "learning_rate": 1.9913069741210583e-05, + "loss": 0.0573, + "step": 3829 + }, + { + "epoch": 2.3303924551262547, + "grad_norm": 0.6556773781776428, + "learning_rate": 1.9900881952914975e-05, + "loss": 0.0423, + "step": 3830 + }, + { + "epoch": 2.3310009126863402, + "grad_norm": 0.5123857259750366, + "learning_rate": 1.9888695428988115e-05, + "loss": 0.0432, + "step": 3831 + }, + { + "epoch": 2.3316093702464253, + "grad_norm": 0.651202917098999, + "learning_rate": 1.9876510172451733e-05, + "loss": 0.0423, + "step": 3832 + }, + { + "epoch": 2.3322178278065104, + "grad_norm": 0.4787401258945465, + "learning_rate": 1.9864326186327265e-05, + "loss": 0.0382, + "step": 3833 + }, + { + "epoch": 2.3328262853665955, + "grad_norm": 0.5490178465843201, + "learning_rate": 1.9852143473635847e-05, + "loss": 0.0627, + "step": 3834 + }, + { + "epoch": 2.3334347429266806, + "grad_norm": 0.47695285081863403, + "learning_rate": 1.9839962037398276e-05, + "loss": 0.0515, + "step": 3835 + }, + { + "epoch": 2.334043200486766, + "grad_norm": 0.5950469374656677, + "learning_rate": 1.9827781880635034e-05, + "loss": 0.0443, + "step": 3836 + }, + { + "epoch": 2.3346516580468513, + "grad_norm": 0.5753946304321289, + "learning_rate": 1.981560300636629e-05, + "loss": 0.0452, + "step": 3837 + }, + { + "epoch": 2.3352601156069364, + "grad_norm": 0.5735435485839844, + "learning_rate": 1.9803425417611915e-05, + "loss": 0.0579, + "step": 3838 + }, + { + "epoch": 2.3358685731670215, + "grad_norm": 0.5695839524269104, + "learning_rate": 1.979124911739143e-05, + "loss": 0.0466, + "step": 3839 + }, + { + "epoch": 2.3364770307271066, + "grad_norm": 0.4902591407299042, + "learning_rate": 1.9779074108724042e-05, + "loss": 0.0431, + "step": 3840 + }, + { + "epoch": 2.337085488287192, + "grad_norm": 0.6744665503501892, + "learning_rate": 1.9766900394628668e-05, + "loss": 0.0583, + "step": 3841 + }, + { + "epoch": 2.337693945847277, + "grad_norm": 0.5691081285476685, + "learning_rate": 1.975472797812387e-05, + "loss": 0.0315, + "step": 3842 + }, + { + "epoch": 2.3383024034073623, + "grad_norm": 0.5368900299072266, + "learning_rate": 1.9742556862227888e-05, + "loss": 0.0418, + "step": 3843 + }, + { + "epoch": 2.3389108609674474, + "grad_norm": 0.5534371137619019, + "learning_rate": 1.9730387049958672e-05, + "loss": 0.0415, + "step": 3844 + }, + { + "epoch": 2.3395193185275325, + "grad_norm": 0.5242774486541748, + "learning_rate": 1.971821854433382e-05, + "loss": 0.0447, + "step": 3845 + }, + { + "epoch": 2.340127776087618, + "grad_norm": 0.6565991044044495, + "learning_rate": 1.9706051348370608e-05, + "loss": 0.0448, + "step": 3846 + }, + { + "epoch": 2.340736233647703, + "grad_norm": 0.4980752766132355, + "learning_rate": 1.9693885465086008e-05, + "loss": 0.044, + "step": 3847 + }, + { + "epoch": 2.341344691207788, + "grad_norm": 0.5127741694450378, + "learning_rate": 1.968172089749664e-05, + "loss": 0.0374, + "step": 3848 + }, + { + "epoch": 2.3419531487678733, + "grad_norm": 0.5670256614685059, + "learning_rate": 1.9669557648618815e-05, + "loss": 0.0474, + "step": 3849 + }, + { + "epoch": 2.3425616063279584, + "grad_norm": 0.6457985043525696, + "learning_rate": 1.9657395721468494e-05, + "loss": 0.06, + "step": 3850 + }, + { + "epoch": 2.343170063888044, + "grad_norm": 0.6051198244094849, + "learning_rate": 1.964523511906136e-05, + "loss": 0.0545, + "step": 3851 + }, + { + "epoch": 2.343778521448129, + "grad_norm": 0.6616335511207581, + "learning_rate": 1.9633075844412714e-05, + "loss": 0.0371, + "step": 3852 + }, + { + "epoch": 2.344386979008214, + "grad_norm": 0.5447221398353577, + "learning_rate": 1.9620917900537546e-05, + "loss": 0.0435, + "step": 3853 + }, + { + "epoch": 2.3449954365682992, + "grad_norm": 0.518090546131134, + "learning_rate": 1.960876129045054e-05, + "loss": 0.0454, + "step": 3854 + }, + { + "epoch": 2.3456038941283843, + "grad_norm": 0.5191083550453186, + "learning_rate": 1.959660601716601e-05, + "loss": 0.0364, + "step": 3855 + }, + { + "epoch": 2.34621235168847, + "grad_norm": 0.6573032140731812, + "learning_rate": 1.958445208369796e-05, + "loss": 0.0589, + "step": 3856 + }, + { + "epoch": 2.346820809248555, + "grad_norm": 0.5200658440589905, + "learning_rate": 1.9572299493060067e-05, + "loss": 0.0445, + "step": 3857 + }, + { + "epoch": 2.34742926680864, + "grad_norm": 0.5530250072479248, + "learning_rate": 1.9560148248265662e-05, + "loss": 0.0455, + "step": 3858 + }, + { + "epoch": 2.348037724368725, + "grad_norm": 0.7511923313140869, + "learning_rate": 1.954799835232774e-05, + "loss": 0.0459, + "step": 3859 + }, + { + "epoch": 2.3486461819288103, + "grad_norm": 0.48297080397605896, + "learning_rate": 1.9535849808258985e-05, + "loss": 0.0386, + "step": 3860 + }, + { + "epoch": 2.349254639488896, + "grad_norm": 0.4536675214767456, + "learning_rate": 1.9523702619071722e-05, + "loss": 0.0357, + "step": 3861 + }, + { + "epoch": 2.349863097048981, + "grad_norm": 0.522323727607727, + "learning_rate": 1.9511556787777945e-05, + "loss": 0.0429, + "step": 3862 + }, + { + "epoch": 2.350471554609066, + "grad_norm": 0.5316919088363647, + "learning_rate": 1.9499412317389305e-05, + "loss": 0.0322, + "step": 3863 + }, + { + "epoch": 2.351080012169151, + "grad_norm": 0.5202551484107971, + "learning_rate": 1.9487269210917144e-05, + "loss": 0.0453, + "step": 3864 + }, + { + "epoch": 2.351688469729236, + "grad_norm": 0.6849666833877563, + "learning_rate": 1.9475127471372436e-05, + "loss": 0.0488, + "step": 3865 + }, + { + "epoch": 2.3522969272893217, + "grad_norm": 0.531730592250824, + "learning_rate": 1.946298710176582e-05, + "loss": 0.0435, + "step": 3866 + }, + { + "epoch": 2.352905384849407, + "grad_norm": 0.6236624121665955, + "learning_rate": 1.9450848105107615e-05, + "loss": 0.0723, + "step": 3867 + }, + { + "epoch": 2.353513842409492, + "grad_norm": 0.502086877822876, + "learning_rate": 1.9438710484407786e-05, + "loss": 0.0486, + "step": 3868 + }, + { + "epoch": 2.354122299969577, + "grad_norm": 0.4249696731567383, + "learning_rate": 1.9426574242675932e-05, + "loss": 0.0282, + "step": 3869 + }, + { + "epoch": 2.354730757529662, + "grad_norm": 0.6139441132545471, + "learning_rate": 1.941443938292137e-05, + "loss": 0.0492, + "step": 3870 + }, + { + "epoch": 2.3553392150897476, + "grad_norm": 0.4936158061027527, + "learning_rate": 1.9402305908153025e-05, + "loss": 0.0392, + "step": 3871 + }, + { + "epoch": 2.3559476726498327, + "grad_norm": 0.5353962182998657, + "learning_rate": 1.9390173821379486e-05, + "loss": 0.0347, + "step": 3872 + }, + { + "epoch": 2.356556130209918, + "grad_norm": 0.46766939759254456, + "learning_rate": 1.9378043125609003e-05, + "loss": 0.041, + "step": 3873 + }, + { + "epoch": 2.357164587770003, + "grad_norm": 0.5060580372810364, + "learning_rate": 1.93659138238495e-05, + "loss": 0.0414, + "step": 3874 + }, + { + "epoch": 2.357773045330088, + "grad_norm": 0.5533692240715027, + "learning_rate": 1.935378591910853e-05, + "loss": 0.0365, + "step": 3875 + }, + { + "epoch": 2.3583815028901736, + "grad_norm": 0.5114442110061646, + "learning_rate": 1.934165941439329e-05, + "loss": 0.0452, + "step": 3876 + }, + { + "epoch": 2.3589899604502587, + "grad_norm": 0.563607394695282, + "learning_rate": 1.932953431271068e-05, + "loss": 0.06, + "step": 3877 + }, + { + "epoch": 2.3595984180103438, + "grad_norm": 0.4950112998485565, + "learning_rate": 1.93174106170672e-05, + "loss": 0.047, + "step": 3878 + }, + { + "epoch": 2.360206875570429, + "grad_norm": 0.6233085989952087, + "learning_rate": 1.930528833046902e-05, + "loss": 0.0455, + "step": 3879 + }, + { + "epoch": 2.360815333130514, + "grad_norm": 0.5214982628822327, + "learning_rate": 1.9293167455921972e-05, + "loss": 0.0359, + "step": 3880 + }, + { + "epoch": 2.3614237906905995, + "grad_norm": 0.5291669368743896, + "learning_rate": 1.9281047996431527e-05, + "loss": 0.041, + "step": 3881 + }, + { + "epoch": 2.3620322482506846, + "grad_norm": 0.7979464530944824, + "learning_rate": 1.9268929955002787e-05, + "loss": 0.0629, + "step": 3882 + }, + { + "epoch": 2.3626407058107697, + "grad_norm": 0.5536260008811951, + "learning_rate": 1.9256813334640546e-05, + "loss": 0.05, + "step": 3883 + }, + { + "epoch": 2.3632491633708548, + "grad_norm": 0.5184295177459717, + "learning_rate": 1.9244698138349212e-05, + "loss": 0.0453, + "step": 3884 + }, + { + "epoch": 2.36385762093094, + "grad_norm": 0.632335901260376, + "learning_rate": 1.9232584369132848e-05, + "loss": 0.0579, + "step": 3885 + }, + { + "epoch": 2.3644660784910254, + "grad_norm": 0.611529529094696, + "learning_rate": 1.922047202999515e-05, + "loss": 0.0749, + "step": 3886 + }, + { + "epoch": 2.3650745360511105, + "grad_norm": 0.6848330497741699, + "learning_rate": 1.9208361123939498e-05, + "loss": 0.0487, + "step": 3887 + }, + { + "epoch": 2.3656829936111956, + "grad_norm": 0.5429683923721313, + "learning_rate": 1.9196251653968877e-05, + "loss": 0.0382, + "step": 3888 + }, + { + "epoch": 2.3662914511712807, + "grad_norm": 0.5638923645019531, + "learning_rate": 1.9184143623085926e-05, + "loss": 0.0403, + "step": 3889 + }, + { + "epoch": 2.366899908731366, + "grad_norm": 0.5319681167602539, + "learning_rate": 1.917203703429295e-05, + "loss": 0.0563, + "step": 3890 + }, + { + "epoch": 2.3675083662914513, + "grad_norm": 0.5609192848205566, + "learning_rate": 1.9159931890591865e-05, + "loss": 0.0425, + "step": 3891 + }, + { + "epoch": 2.3681168238515364, + "grad_norm": 0.5878105759620667, + "learning_rate": 1.9147828194984236e-05, + "loss": 0.0424, + "step": 3892 + }, + { + "epoch": 2.3687252814116215, + "grad_norm": 0.5947871208190918, + "learning_rate": 1.91357259504713e-05, + "loss": 0.0574, + "step": 3893 + }, + { + "epoch": 2.3693337389717066, + "grad_norm": 0.4823724031448364, + "learning_rate": 1.9123625160053885e-05, + "loss": 0.0452, + "step": 3894 + }, + { + "epoch": 2.3699421965317917, + "grad_norm": 0.5193595886230469, + "learning_rate": 1.911152582673249e-05, + "loss": 0.0459, + "step": 3895 + }, + { + "epoch": 2.3705506540918773, + "grad_norm": 0.6103157997131348, + "learning_rate": 1.9099427953507245e-05, + "loss": 0.0555, + "step": 3896 + }, + { + "epoch": 2.3711591116519624, + "grad_norm": 0.5717165470123291, + "learning_rate": 1.9087331543377925e-05, + "loss": 0.0421, + "step": 3897 + }, + { + "epoch": 2.3717675692120475, + "grad_norm": 0.5617209076881409, + "learning_rate": 1.9075236599343927e-05, + "loss": 0.0419, + "step": 3898 + }, + { + "epoch": 2.3723760267721326, + "grad_norm": 0.503222644329071, + "learning_rate": 1.9063143124404293e-05, + "loss": 0.054, + "step": 3899 + }, + { + "epoch": 2.3729844843322176, + "grad_norm": 0.6546376943588257, + "learning_rate": 1.9051051121557712e-05, + "loss": 0.0527, + "step": 3900 + }, + { + "epoch": 2.373592941892303, + "grad_norm": 0.5000297427177429, + "learning_rate": 1.903896059380248e-05, + "loss": 0.0388, + "step": 3901 + }, + { + "epoch": 2.3742013994523883, + "grad_norm": 0.6017688512802124, + "learning_rate": 1.9026871544136554e-05, + "loss": 0.0479, + "step": 3902 + }, + { + "epoch": 2.3748098570124734, + "grad_norm": 0.5484517216682434, + "learning_rate": 1.901478397555752e-05, + "loss": 0.0428, + "step": 3903 + }, + { + "epoch": 2.3754183145725585, + "grad_norm": 0.6508259177207947, + "learning_rate": 1.9002697891062572e-05, + "loss": 0.0521, + "step": 3904 + }, + { + "epoch": 2.3760267721326436, + "grad_norm": 0.6614736914634705, + "learning_rate": 1.8990613293648572e-05, + "loss": 0.0626, + "step": 3905 + }, + { + "epoch": 2.376635229692729, + "grad_norm": 0.5715472102165222, + "learning_rate": 1.8978530186312e-05, + "loss": 0.0365, + "step": 3906 + }, + { + "epoch": 2.377243687252814, + "grad_norm": 0.4740070104598999, + "learning_rate": 1.8966448572048946e-05, + "loss": 0.0401, + "step": 3907 + }, + { + "epoch": 2.3778521448128993, + "grad_norm": 0.4682742655277252, + "learning_rate": 1.895436845385516e-05, + "loss": 0.0391, + "step": 3908 + }, + { + "epoch": 2.3784606023729844, + "grad_norm": 0.6115542054176331, + "learning_rate": 1.8942289834725995e-05, + "loss": 0.0662, + "step": 3909 + }, + { + "epoch": 2.3790690599330695, + "grad_norm": 0.45415550470352173, + "learning_rate": 1.8930212717656464e-05, + "loss": 0.0377, + "step": 3910 + }, + { + "epoch": 2.379677517493155, + "grad_norm": 0.6085162162780762, + "learning_rate": 1.891813710564117e-05, + "loss": 0.0598, + "step": 3911 + }, + { + "epoch": 2.38028597505324, + "grad_norm": 0.5516512393951416, + "learning_rate": 1.8906063001674368e-05, + "loss": 0.0526, + "step": 3912 + }, + { + "epoch": 2.3808944326133252, + "grad_norm": 0.5332078337669373, + "learning_rate": 1.8893990408749943e-05, + "loss": 0.0507, + "step": 3913 + }, + { + "epoch": 2.3815028901734103, + "grad_norm": 0.5707116723060608, + "learning_rate": 1.8881919329861377e-05, + "loss": 0.0486, + "step": 3914 + }, + { + "epoch": 2.3821113477334954, + "grad_norm": 0.5801469683647156, + "learning_rate": 1.8869849768001803e-05, + "loss": 0.05, + "step": 3915 + }, + { + "epoch": 2.382719805293581, + "grad_norm": 0.4456758201122284, + "learning_rate": 1.8857781726163977e-05, + "loss": 0.0382, + "step": 3916 + }, + { + "epoch": 2.383328262853666, + "grad_norm": 0.5199500322341919, + "learning_rate": 1.8845715207340254e-05, + "loss": 0.0443, + "step": 3917 + }, + { + "epoch": 2.383936720413751, + "grad_norm": 0.6407058835029602, + "learning_rate": 1.8833650214522643e-05, + "loss": 0.065, + "step": 3918 + }, + { + "epoch": 2.3845451779738362, + "grad_norm": 0.5200806856155396, + "learning_rate": 1.882158675070275e-05, + "loss": 0.0285, + "step": 3919 + }, + { + "epoch": 2.3851536355339213, + "grad_norm": 0.5379014611244202, + "learning_rate": 1.880952481887181e-05, + "loss": 0.0615, + "step": 3920 + }, + { + "epoch": 2.385762093094007, + "grad_norm": 0.46084079146385193, + "learning_rate": 1.879746442202069e-05, + "loss": 0.0379, + "step": 3921 + }, + { + "epoch": 2.386370550654092, + "grad_norm": 0.5246143937110901, + "learning_rate": 1.878540556313986e-05, + "loss": 0.0558, + "step": 3922 + }, + { + "epoch": 2.386979008214177, + "grad_norm": 0.5521069169044495, + "learning_rate": 1.8773348245219403e-05, + "loss": 0.0533, + "step": 3923 + }, + { + "epoch": 2.387587465774262, + "grad_norm": 0.6149189472198486, + "learning_rate": 1.876129247124905e-05, + "loss": 0.0434, + "step": 3924 + }, + { + "epoch": 2.3881959233343473, + "grad_norm": 0.6903012990951538, + "learning_rate": 1.874923824421812e-05, + "loss": 0.052, + "step": 3925 + }, + { + "epoch": 2.388804380894433, + "grad_norm": 0.8194772005081177, + "learning_rate": 1.8737185567115555e-05, + "loss": 0.0505, + "step": 3926 + }, + { + "epoch": 2.389412838454518, + "grad_norm": 0.4386597275733948, + "learning_rate": 1.872513444292993e-05, + "loss": 0.028, + "step": 3927 + }, + { + "epoch": 2.390021296014603, + "grad_norm": 0.4583303928375244, + "learning_rate": 1.871308487464941e-05, + "loss": 0.0417, + "step": 3928 + }, + { + "epoch": 2.390629753574688, + "grad_norm": 0.5294291973114014, + "learning_rate": 1.8701036865261792e-05, + "loss": 0.0372, + "step": 3929 + }, + { + "epoch": 2.391238211134773, + "grad_norm": 0.5232396721839905, + "learning_rate": 1.868899041775448e-05, + "loss": 0.0386, + "step": 3930 + }, + { + "epoch": 2.3918466686948587, + "grad_norm": 0.5549443960189819, + "learning_rate": 1.867694553511449e-05, + "loss": 0.053, + "step": 3931 + }, + { + "epoch": 2.392455126254944, + "grad_norm": 0.6172237396240234, + "learning_rate": 1.8664902220328435e-05, + "loss": 0.0574, + "step": 3932 + }, + { + "epoch": 2.393063583815029, + "grad_norm": 0.7529854774475098, + "learning_rate": 1.8652860476382584e-05, + "loss": 0.0613, + "step": 3933 + }, + { + "epoch": 2.393672041375114, + "grad_norm": 0.5938291549682617, + "learning_rate": 1.8640820306262774e-05, + "loss": 0.0592, + "step": 3934 + }, + { + "epoch": 2.394280498935199, + "grad_norm": 0.6293975114822388, + "learning_rate": 1.862878171295445e-05, + "loss": 0.0491, + "step": 3935 + }, + { + "epoch": 2.3948889564952847, + "grad_norm": 0.42942139506340027, + "learning_rate": 1.8616744699442706e-05, + "loss": 0.0368, + "step": 3936 + }, + { + "epoch": 2.3954974140553698, + "grad_norm": 0.5470993518829346, + "learning_rate": 1.8604709268712213e-05, + "loss": 0.0506, + "step": 3937 + }, + { + "epoch": 2.396105871615455, + "grad_norm": 1.0791350603103638, + "learning_rate": 1.859267542374724e-05, + "loss": 0.0415, + "step": 3938 + }, + { + "epoch": 2.39671432917554, + "grad_norm": 0.6190977096557617, + "learning_rate": 1.8580643167531703e-05, + "loss": 0.0691, + "step": 3939 + }, + { + "epoch": 2.397322786735625, + "grad_norm": 0.5365692973136902, + "learning_rate": 1.8568612503049086e-05, + "loss": 0.0547, + "step": 3940 + }, + { + "epoch": 2.3979312442957106, + "grad_norm": 0.5777300596237183, + "learning_rate": 1.8556583433282497e-05, + "loss": 0.0521, + "step": 3941 + }, + { + "epoch": 2.3985397018557957, + "grad_norm": 0.612249493598938, + "learning_rate": 1.8544555961214634e-05, + "loss": 0.0507, + "step": 3942 + }, + { + "epoch": 2.3991481594158808, + "grad_norm": 0.5814237594604492, + "learning_rate": 1.853253008982782e-05, + "loss": 0.0476, + "step": 3943 + }, + { + "epoch": 2.399756616975966, + "grad_norm": 0.5139836072921753, + "learning_rate": 1.8520505822103972e-05, + "loss": 0.033, + "step": 3944 + }, + { + "epoch": 2.400365074536051, + "grad_norm": 0.5720403790473938, + "learning_rate": 1.8508483161024586e-05, + "loss": 0.0567, + "step": 3945 + }, + { + "epoch": 2.4009735320961365, + "grad_norm": 0.6230379343032837, + "learning_rate": 1.849646210957081e-05, + "loss": 0.0538, + "step": 3946 + }, + { + "epoch": 2.4015819896562216, + "grad_norm": 0.4990738034248352, + "learning_rate": 1.848444267072335e-05, + "loss": 0.0483, + "step": 3947 + }, + { + "epoch": 2.4021904472163067, + "grad_norm": 0.600247323513031, + "learning_rate": 1.8472424847462517e-05, + "loss": 0.0729, + "step": 3948 + }, + { + "epoch": 2.402798904776392, + "grad_norm": 0.535139799118042, + "learning_rate": 1.8460408642768244e-05, + "loss": 0.0502, + "step": 3949 + }, + { + "epoch": 2.403407362336477, + "grad_norm": 0.4909554421901703, + "learning_rate": 1.844839405962005e-05, + "loss": 0.0368, + "step": 3950 + }, + { + "epoch": 2.4040158198965624, + "grad_norm": 0.5839510560035706, + "learning_rate": 1.8436381100997034e-05, + "loss": 0.0523, + "step": 3951 + }, + { + "epoch": 2.4046242774566475, + "grad_norm": 0.7578384280204773, + "learning_rate": 1.8424369769877927e-05, + "loss": 0.0429, + "step": 3952 + }, + { + "epoch": 2.4052327350167326, + "grad_norm": 0.5987670421600342, + "learning_rate": 1.8412360069241034e-05, + "loss": 0.0529, + "step": 3953 + }, + { + "epoch": 2.4058411925768177, + "grad_norm": 0.5466412901878357, + "learning_rate": 1.8400352002064253e-05, + "loss": 0.0491, + "step": 3954 + }, + { + "epoch": 2.406449650136903, + "grad_norm": 1.5153589248657227, + "learning_rate": 1.8388345571325083e-05, + "loss": 0.0481, + "step": 3955 + }, + { + "epoch": 2.4070581076969884, + "grad_norm": 0.5942850708961487, + "learning_rate": 1.8376340780000635e-05, + "loss": 0.0641, + "step": 3956 + }, + { + "epoch": 2.4076665652570735, + "grad_norm": 0.5600593090057373, + "learning_rate": 1.836433763106758e-05, + "loss": 0.0534, + "step": 3957 + }, + { + "epoch": 2.4082750228171586, + "grad_norm": 0.5812585353851318, + "learning_rate": 1.8352336127502198e-05, + "loss": 0.0532, + "step": 3958 + }, + { + "epoch": 2.4088834803772436, + "grad_norm": 0.4886872172355652, + "learning_rate": 1.8340336272280377e-05, + "loss": 0.0524, + "step": 3959 + }, + { + "epoch": 2.4094919379373287, + "grad_norm": 0.6744169592857361, + "learning_rate": 1.8328338068377578e-05, + "loss": 0.0497, + "step": 3960 + }, + { + "epoch": 2.4101003954974143, + "grad_norm": 0.6337938904762268, + "learning_rate": 1.831634151876883e-05, + "loss": 0.0431, + "step": 3961 + }, + { + "epoch": 2.4107088530574994, + "grad_norm": 0.6581575870513916, + "learning_rate": 1.8304346626428813e-05, + "loss": 0.0637, + "step": 3962 + }, + { + "epoch": 2.4113173106175845, + "grad_norm": 1.0695427656173706, + "learning_rate": 1.829235339433174e-05, + "loss": 0.0482, + "step": 3963 + }, + { + "epoch": 2.4119257681776696, + "grad_norm": 0.6076899766921997, + "learning_rate": 1.828036182545144e-05, + "loss": 0.0375, + "step": 3964 + }, + { + "epoch": 2.4125342257377547, + "grad_norm": 0.6036027669906616, + "learning_rate": 1.8268371922761308e-05, + "loss": 0.0442, + "step": 3965 + }, + { + "epoch": 2.41314268329784, + "grad_norm": 0.5249053239822388, + "learning_rate": 1.8256383689234362e-05, + "loss": 0.0415, + "step": 3966 + }, + { + "epoch": 2.4137511408579253, + "grad_norm": 0.5730521082878113, + "learning_rate": 1.824439712784317e-05, + "loss": 0.0438, + "step": 3967 + }, + { + "epoch": 2.4143595984180104, + "grad_norm": 0.5469090342521667, + "learning_rate": 1.8232412241559896e-05, + "loss": 0.0265, + "step": 3968 + }, + { + "epoch": 2.4149680559780955, + "grad_norm": 0.5718511939048767, + "learning_rate": 1.8220429033356305e-05, + "loss": 0.0395, + "step": 3969 + }, + { + "epoch": 2.4155765135381806, + "grad_norm": 0.5090388655662537, + "learning_rate": 1.8208447506203734e-05, + "loss": 0.0359, + "step": 3970 + }, + { + "epoch": 2.416184971098266, + "grad_norm": 0.4648432731628418, + "learning_rate": 1.819646766307308e-05, + "loss": 0.0351, + "step": 3971 + }, + { + "epoch": 2.4167934286583512, + "grad_norm": 0.5751369595527649, + "learning_rate": 1.8184489506934875e-05, + "loss": 0.035, + "step": 3972 + }, + { + "epoch": 2.4174018862184363, + "grad_norm": 0.607879638671875, + "learning_rate": 1.8172513040759183e-05, + "loss": 0.0605, + "step": 3973 + }, + { + "epoch": 2.4180103437785214, + "grad_norm": 0.5648092031478882, + "learning_rate": 1.8160538267515665e-05, + "loss": 0.0417, + "step": 3974 + }, + { + "epoch": 2.4186188013386065, + "grad_norm": 0.6005630493164062, + "learning_rate": 1.8148565190173586e-05, + "loss": 0.0441, + "step": 3975 + }, + { + "epoch": 2.419227258898692, + "grad_norm": 0.524438202381134, + "learning_rate": 1.813659381170176e-05, + "loss": 0.0344, + "step": 3976 + }, + { + "epoch": 2.419835716458777, + "grad_norm": 0.4712424874305725, + "learning_rate": 1.812462413506858e-05, + "loss": 0.0264, + "step": 3977 + }, + { + "epoch": 2.4204441740188622, + "grad_norm": 0.561529278755188, + "learning_rate": 1.811265616324203e-05, + "loss": 0.0522, + "step": 3978 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.6183802485466003, + "learning_rate": 1.8100689899189683e-05, + "loss": 0.0509, + "step": 3979 + }, + { + "epoch": 2.4216610891390324, + "grad_norm": 0.5042179226875305, + "learning_rate": 1.808872534587866e-05, + "loss": 0.0501, + "step": 3980 + }, + { + "epoch": 2.422269546699118, + "grad_norm": 0.6235750317573547, + "learning_rate": 1.8076762506275667e-05, + "loss": 0.0617, + "step": 3981 + }, + { + "epoch": 2.422878004259203, + "grad_norm": 0.5784009099006653, + "learning_rate": 1.8064801383347006e-05, + "loss": 0.0542, + "step": 3982 + }, + { + "epoch": 2.423486461819288, + "grad_norm": 0.6061785221099854, + "learning_rate": 1.8052841980058533e-05, + "loss": 0.0477, + "step": 3983 + }, + { + "epoch": 2.4240949193793733, + "grad_norm": 0.5976025462150574, + "learning_rate": 1.8040884299375665e-05, + "loss": 0.0554, + "step": 3984 + }, + { + "epoch": 2.4247033769394584, + "grad_norm": 0.5100542306900024, + "learning_rate": 1.802892834426343e-05, + "loss": 0.0437, + "step": 3985 + }, + { + "epoch": 2.425311834499544, + "grad_norm": 0.6174180507659912, + "learning_rate": 1.8016974117686398e-05, + "loss": 0.0661, + "step": 3986 + }, + { + "epoch": 2.425920292059629, + "grad_norm": 0.6207003593444824, + "learning_rate": 1.8005021622608717e-05, + "loss": 0.0463, + "step": 3987 + }, + { + "epoch": 2.426528749619714, + "grad_norm": 0.5053902864456177, + "learning_rate": 1.79930708619941e-05, + "loss": 0.043, + "step": 3988 + }, + { + "epoch": 2.427137207179799, + "grad_norm": 0.5556389689445496, + "learning_rate": 1.7981121838805852e-05, + "loss": 0.0522, + "step": 3989 + }, + { + "epoch": 2.4277456647398843, + "grad_norm": 0.522250235080719, + "learning_rate": 1.7969174556006832e-05, + "loss": 0.0527, + "step": 3990 + }, + { + "epoch": 2.4283541222999694, + "grad_norm": 0.5426846742630005, + "learning_rate": 1.795722901655945e-05, + "loss": 0.0481, + "step": 3991 + }, + { + "epoch": 2.428962579860055, + "grad_norm": 0.5758845806121826, + "learning_rate": 1.7945285223425724e-05, + "loss": 0.0414, + "step": 3992 + }, + { + "epoch": 2.42957103742014, + "grad_norm": 0.556006669998169, + "learning_rate": 1.7933343179567208e-05, + "loss": 0.0647, + "step": 3993 + }, + { + "epoch": 2.430179494980225, + "grad_norm": 0.6288806796073914, + "learning_rate": 1.792140288794502e-05, + "loss": 0.0633, + "step": 3994 + }, + { + "epoch": 2.43078795254031, + "grad_norm": 0.5981294512748718, + "learning_rate": 1.790946435151988e-05, + "loss": 0.0627, + "step": 3995 + }, + { + "epoch": 2.4313964101003953, + "grad_norm": 0.5165243148803711, + "learning_rate": 1.789752757325203e-05, + "loss": 0.0411, + "step": 3996 + }, + { + "epoch": 2.432004867660481, + "grad_norm": 0.464417964220047, + "learning_rate": 1.7885592556101282e-05, + "loss": 0.0387, + "step": 3997 + }, + { + "epoch": 2.432613325220566, + "grad_norm": 0.4882068932056427, + "learning_rate": 1.7873659303027052e-05, + "loss": 0.0376, + "step": 3998 + }, + { + "epoch": 2.433221782780651, + "grad_norm": 0.5661106109619141, + "learning_rate": 1.7861727816988275e-05, + "loss": 0.0457, + "step": 3999 + }, + { + "epoch": 2.433830240340736, + "grad_norm": 0.5687078833580017, + "learning_rate": 1.784979810094346e-05, + "loss": 0.0454, + "step": 4000 + }, + { + "epoch": 2.4344386979008212, + "grad_norm": 0.6012535691261292, + "learning_rate": 1.783787015785067e-05, + "loss": 0.0612, + "step": 4001 + }, + { + "epoch": 2.4350471554609068, + "grad_norm": 0.5028715133666992, + "learning_rate": 1.782594399066756e-05, + "loss": 0.0442, + "step": 4002 + }, + { + "epoch": 2.435655613020992, + "grad_norm": 0.5349405407905579, + "learning_rate": 1.781401960235131e-05, + "loss": 0.06, + "step": 4003 + }, + { + "epoch": 2.436264070581077, + "grad_norm": 0.5278841257095337, + "learning_rate": 1.7802096995858658e-05, + "loss": 0.051, + "step": 4004 + }, + { + "epoch": 2.436872528141162, + "grad_norm": 0.6222814917564392, + "learning_rate": 1.7790176174145944e-05, + "loss": 0.0548, + "step": 4005 + }, + { + "epoch": 2.437480985701247, + "grad_norm": 0.47187790274620056, + "learning_rate": 1.7778257140169016e-05, + "loss": 0.0439, + "step": 4006 + }, + { + "epoch": 2.4380894432613327, + "grad_norm": 0.48313429951667786, + "learning_rate": 1.7766339896883292e-05, + "loss": 0.0319, + "step": 4007 + }, + { + "epoch": 2.438697900821418, + "grad_norm": 0.5558401346206665, + "learning_rate": 1.7754424447243767e-05, + "loss": 0.0484, + "step": 4008 + }, + { + "epoch": 2.439306358381503, + "grad_norm": 0.45081377029418945, + "learning_rate": 1.774251079420497e-05, + "loss": 0.0323, + "step": 4009 + }, + { + "epoch": 2.439914815941588, + "grad_norm": 0.4787575900554657, + "learning_rate": 1.773059894072098e-05, + "loss": 0.0441, + "step": 4010 + }, + { + "epoch": 2.440523273501673, + "grad_norm": 0.5224798321723938, + "learning_rate": 1.7718688889745455e-05, + "loss": 0.0532, + "step": 4011 + }, + { + "epoch": 2.4411317310617586, + "grad_norm": 0.40973082184791565, + "learning_rate": 1.7706780644231592e-05, + "loss": 0.037, + "step": 4012 + }, + { + "epoch": 2.4417401886218437, + "grad_norm": 0.4948725998401642, + "learning_rate": 1.7694874207132127e-05, + "loss": 0.0422, + "step": 4013 + }, + { + "epoch": 2.442348646181929, + "grad_norm": 0.5685492753982544, + "learning_rate": 1.7682969581399358e-05, + "loss": 0.0348, + "step": 4014 + }, + { + "epoch": 2.442957103742014, + "grad_norm": 0.4990760385990143, + "learning_rate": 1.7671066769985155e-05, + "loss": 0.0444, + "step": 4015 + }, + { + "epoch": 2.443565561302099, + "grad_norm": 0.5178315043449402, + "learning_rate": 1.765916577584091e-05, + "loss": 0.0404, + "step": 4016 + }, + { + "epoch": 2.4441740188621845, + "grad_norm": 0.7539467811584473, + "learning_rate": 1.7647266601917557e-05, + "loss": 0.0713, + "step": 4017 + }, + { + "epoch": 2.4447824764222696, + "grad_norm": 0.4690307676792145, + "learning_rate": 1.7635369251165617e-05, + "loss": 0.0527, + "step": 4018 + }, + { + "epoch": 2.4453909339823547, + "grad_norm": 0.37490659952163696, + "learning_rate": 1.7623473726535136e-05, + "loss": 0.0346, + "step": 4019 + }, + { + "epoch": 2.44599939154244, + "grad_norm": 0.5802814960479736, + "learning_rate": 1.7611580030975688e-05, + "loss": 0.0545, + "step": 4020 + }, + { + "epoch": 2.446607849102525, + "grad_norm": 0.5390932559967041, + "learning_rate": 1.7599688167436442e-05, + "loss": 0.0484, + "step": 4021 + }, + { + "epoch": 2.4472163066626105, + "grad_norm": 0.5329564809799194, + "learning_rate": 1.7587798138866067e-05, + "loss": 0.0373, + "step": 4022 + }, + { + "epoch": 2.4478247642226956, + "grad_norm": 0.4844886064529419, + "learning_rate": 1.7575909948212795e-05, + "loss": 0.0382, + "step": 4023 + }, + { + "epoch": 2.4484332217827807, + "grad_norm": 0.4697389602661133, + "learning_rate": 1.7564023598424403e-05, + "loss": 0.0416, + "step": 4024 + }, + { + "epoch": 2.4490416793428658, + "grad_norm": 0.46831750869750977, + "learning_rate": 1.7552139092448216e-05, + "loss": 0.0292, + "step": 4025 + }, + { + "epoch": 2.449650136902951, + "grad_norm": 0.4629685878753662, + "learning_rate": 1.754025643323109e-05, + "loss": 0.0332, + "step": 4026 + }, + { + "epoch": 2.450258594463036, + "grad_norm": 0.6923487782478333, + "learning_rate": 1.7528375623719427e-05, + "loss": 0.0602, + "step": 4027 + }, + { + "epoch": 2.4508670520231215, + "grad_norm": 0.5200254917144775, + "learning_rate": 1.751649666685919e-05, + "loss": 0.0419, + "step": 4028 + }, + { + "epoch": 2.4514755095832066, + "grad_norm": 0.635124683380127, + "learning_rate": 1.7504619565595836e-05, + "loss": 0.0439, + "step": 4029 + }, + { + "epoch": 2.4520839671432917, + "grad_norm": 0.47341227531433105, + "learning_rate": 1.7492744322874414e-05, + "loss": 0.0356, + "step": 4030 + }, + { + "epoch": 2.452692424703377, + "grad_norm": 0.5724462270736694, + "learning_rate": 1.7480870941639485e-05, + "loss": 0.0399, + "step": 4031 + }, + { + "epoch": 2.453300882263462, + "grad_norm": 0.5328457355499268, + "learning_rate": 1.7468999424835144e-05, + "loss": 0.0526, + "step": 4032 + }, + { + "epoch": 2.4539093398235474, + "grad_norm": 0.5413589477539062, + "learning_rate": 1.745712977540504e-05, + "loss": 0.0466, + "step": 4033 + }, + { + "epoch": 2.4545177973836325, + "grad_norm": 0.5428217053413391, + "learning_rate": 1.744526199629235e-05, + "loss": 0.0372, + "step": 4034 + }, + { + "epoch": 2.4551262549437176, + "grad_norm": 0.5191711187362671, + "learning_rate": 1.743339609043979e-05, + "loss": 0.0486, + "step": 4035 + }, + { + "epoch": 2.4557347125038027, + "grad_norm": 0.5395163297653198, + "learning_rate": 1.742153206078961e-05, + "loss": 0.0505, + "step": 4036 + }, + { + "epoch": 2.456343170063888, + "grad_norm": 0.5724096894264221, + "learning_rate": 1.7409669910283583e-05, + "loss": 0.0428, + "step": 4037 + }, + { + "epoch": 2.4569516276239733, + "grad_norm": 0.6358472108840942, + "learning_rate": 1.739780964186304e-05, + "loss": 0.0344, + "step": 4038 + }, + { + "epoch": 2.4575600851840584, + "grad_norm": 0.6139354109764099, + "learning_rate": 1.738595125846884e-05, + "loss": 0.082, + "step": 4039 + }, + { + "epoch": 2.4581685427441435, + "grad_norm": 0.5228933095932007, + "learning_rate": 1.7374094763041347e-05, + "loss": 0.0266, + "step": 4040 + }, + { + "epoch": 2.4587770003042286, + "grad_norm": 0.5466329455375671, + "learning_rate": 1.73622401585205e-05, + "loss": 0.0438, + "step": 4041 + }, + { + "epoch": 2.4593854578643137, + "grad_norm": 0.5522369742393494, + "learning_rate": 1.735038744784573e-05, + "loss": 0.0568, + "step": 4042 + }, + { + "epoch": 2.4599939154243993, + "grad_norm": 0.7028734087944031, + "learning_rate": 1.733853663395602e-05, + "loss": 0.0511, + "step": 4043 + }, + { + "epoch": 2.4606023729844844, + "grad_norm": 0.48779624700546265, + "learning_rate": 1.732668771978988e-05, + "loss": 0.0265, + "step": 4044 + }, + { + "epoch": 2.4612108305445695, + "grad_norm": 0.5411667823791504, + "learning_rate": 1.7314840708285354e-05, + "loss": 0.0484, + "step": 4045 + }, + { + "epoch": 2.4618192881046546, + "grad_norm": 0.5587241649627686, + "learning_rate": 1.7302995602379996e-05, + "loss": 0.0461, + "step": 4046 + }, + { + "epoch": 2.4624277456647397, + "grad_norm": 0.49550849199295044, + "learning_rate": 1.7291152405010898e-05, + "loss": 0.051, + "step": 4047 + }, + { + "epoch": 2.463036203224825, + "grad_norm": 0.5094623565673828, + "learning_rate": 1.727931111911468e-05, + "loss": 0.0483, + "step": 4048 + }, + { + "epoch": 2.4636446607849103, + "grad_norm": 0.5672603249549866, + "learning_rate": 1.72674717476275e-05, + "loss": 0.0535, + "step": 4049 + }, + { + "epoch": 2.4642531183449954, + "grad_norm": 0.4778454005718231, + "learning_rate": 1.725563429348501e-05, + "loss": 0.0443, + "step": 4050 + }, + { + "epoch": 2.4648615759050805, + "grad_norm": 0.5189485549926758, + "learning_rate": 1.7243798759622414e-05, + "loss": 0.0483, + "step": 4051 + }, + { + "epoch": 2.4654700334651656, + "grad_norm": 0.5488086938858032, + "learning_rate": 1.7231965148974438e-05, + "loss": 0.0477, + "step": 4052 + }, + { + "epoch": 2.466078491025251, + "grad_norm": 0.5553414225578308, + "learning_rate": 1.7220133464475312e-05, + "loss": 0.0434, + "step": 4053 + }, + { + "epoch": 2.466686948585336, + "grad_norm": 0.5483329892158508, + "learning_rate": 1.7208303709058806e-05, + "loss": 0.0504, + "step": 4054 + }, + { + "epoch": 2.4672954061454213, + "grad_norm": 0.4734504818916321, + "learning_rate": 1.7196475885658214e-05, + "loss": 0.0363, + "step": 4055 + }, + { + "epoch": 2.4679038637055064, + "grad_norm": 0.4984531104564667, + "learning_rate": 1.7184649997206324e-05, + "loss": 0.039, + "step": 4056 + }, + { + "epoch": 2.4685123212655915, + "grad_norm": 0.5191166996955872, + "learning_rate": 1.7172826046635483e-05, + "loss": 0.044, + "step": 4057 + }, + { + "epoch": 2.469120778825677, + "grad_norm": 0.5484380722045898, + "learning_rate": 1.7161004036877526e-05, + "loss": 0.0433, + "step": 4058 + }, + { + "epoch": 2.469729236385762, + "grad_norm": 0.570693850517273, + "learning_rate": 1.714918397086383e-05, + "loss": 0.0468, + "step": 4059 + }, + { + "epoch": 2.4703376939458472, + "grad_norm": 0.5350715517997742, + "learning_rate": 1.7137365851525255e-05, + "loss": 0.0406, + "step": 4060 + }, + { + "epoch": 2.4709461515059323, + "grad_norm": 0.5469467639923096, + "learning_rate": 1.7125549681792233e-05, + "loss": 0.0499, + "step": 4061 + }, + { + "epoch": 2.4715546090660174, + "grad_norm": 0.5633114576339722, + "learning_rate": 1.7113735464594665e-05, + "loss": 0.0371, + "step": 4062 + }, + { + "epoch": 2.472163066626103, + "grad_norm": 0.6275270581245422, + "learning_rate": 1.7101923202861974e-05, + "loss": 0.0482, + "step": 4063 + }, + { + "epoch": 2.472771524186188, + "grad_norm": 0.6021205186843872, + "learning_rate": 1.7090112899523132e-05, + "loss": 0.0524, + "step": 4064 + }, + { + "epoch": 2.473379981746273, + "grad_norm": 0.6164666414260864, + "learning_rate": 1.7078304557506593e-05, + "loss": 0.0459, + "step": 4065 + }, + { + "epoch": 2.4739884393063583, + "grad_norm": 0.5763289332389832, + "learning_rate": 1.7066498179740318e-05, + "loss": 0.046, + "step": 4066 + }, + { + "epoch": 2.4745968968664434, + "grad_norm": 0.6294301152229309, + "learning_rate": 1.705469376915182e-05, + "loss": 0.0465, + "step": 4067 + }, + { + "epoch": 2.475205354426529, + "grad_norm": 0.409706711769104, + "learning_rate": 1.7042891328668094e-05, + "loss": 0.0301, + "step": 4068 + }, + { + "epoch": 2.475813811986614, + "grad_norm": 0.42503032088279724, + "learning_rate": 1.7031090861215646e-05, + "loss": 0.0307, + "step": 4069 + }, + { + "epoch": 2.476422269546699, + "grad_norm": 0.5593361258506775, + "learning_rate": 1.7019292369720493e-05, + "loss": 0.0366, + "step": 4070 + }, + { + "epoch": 2.477030727106784, + "grad_norm": 0.5310861468315125, + "learning_rate": 1.700749585710819e-05, + "loss": 0.0367, + "step": 4071 + }, + { + "epoch": 2.4776391846668693, + "grad_norm": 0.5321909189224243, + "learning_rate": 1.6995701326303776e-05, + "loss": 0.031, + "step": 4072 + }, + { + "epoch": 2.478247642226955, + "grad_norm": 0.5682234764099121, + "learning_rate": 1.6983908780231782e-05, + "loss": 0.0455, + "step": 4073 + }, + { + "epoch": 2.47885609978704, + "grad_norm": 0.46487629413604736, + "learning_rate": 1.6972118221816298e-05, + "loss": 0.032, + "step": 4074 + }, + { + "epoch": 2.479464557347125, + "grad_norm": 0.616710364818573, + "learning_rate": 1.696032965398087e-05, + "loss": 0.0516, + "step": 4075 + }, + { + "epoch": 2.48007301490721, + "grad_norm": 0.5947301387786865, + "learning_rate": 1.6948543079648575e-05, + "loss": 0.0376, + "step": 4076 + }, + { + "epoch": 2.480681472467295, + "grad_norm": 0.5795893669128418, + "learning_rate": 1.6936758501742e-05, + "loss": 0.0487, + "step": 4077 + }, + { + "epoch": 2.4812899300273807, + "grad_norm": 0.5290836095809937, + "learning_rate": 1.6924975923183228e-05, + "loss": 0.0292, + "step": 4078 + }, + { + "epoch": 2.481898387587466, + "grad_norm": 0.4861297309398651, + "learning_rate": 1.6913195346893828e-05, + "loss": 0.0336, + "step": 4079 + }, + { + "epoch": 2.482506845147551, + "grad_norm": 0.5103831887245178, + "learning_rate": 1.690141677579492e-05, + "loss": 0.0422, + "step": 4080 + }, + { + "epoch": 2.483115302707636, + "grad_norm": 0.42127537727355957, + "learning_rate": 1.688964021280709e-05, + "loss": 0.0317, + "step": 4081 + }, + { + "epoch": 2.483723760267721, + "grad_norm": 0.7771899104118347, + "learning_rate": 1.6877865660850427e-05, + "loss": 0.0537, + "step": 4082 + }, + { + "epoch": 2.4843322178278067, + "grad_norm": 0.5473512411117554, + "learning_rate": 1.6866093122844523e-05, + "loss": 0.0322, + "step": 4083 + }, + { + "epoch": 2.4849406753878918, + "grad_norm": 0.5636205077171326, + "learning_rate": 1.6854322601708495e-05, + "loss": 0.0538, + "step": 4084 + }, + { + "epoch": 2.485549132947977, + "grad_norm": 0.5534929037094116, + "learning_rate": 1.6842554100360937e-05, + "loss": 0.0604, + "step": 4085 + }, + { + "epoch": 2.486157590508062, + "grad_norm": 0.5635712742805481, + "learning_rate": 1.683078762171993e-05, + "loss": 0.0418, + "step": 4086 + }, + { + "epoch": 2.486766048068147, + "grad_norm": 0.5449815392494202, + "learning_rate": 1.6819023168703094e-05, + "loss": 0.0422, + "step": 4087 + }, + { + "epoch": 2.4873745056282326, + "grad_norm": 0.5356243252754211, + "learning_rate": 1.6807260744227513e-05, + "loss": 0.052, + "step": 4088 + }, + { + "epoch": 2.4879829631883177, + "grad_norm": 0.525938093662262, + "learning_rate": 1.6795500351209766e-05, + "loss": 0.0457, + "step": 4089 + }, + { + "epoch": 2.488591420748403, + "grad_norm": 0.5305033922195435, + "learning_rate": 1.6783741992565963e-05, + "loss": 0.0469, + "step": 4090 + }, + { + "epoch": 2.489199878308488, + "grad_norm": 0.47671419382095337, + "learning_rate": 1.6771985671211673e-05, + "loss": 0.0426, + "step": 4091 + }, + { + "epoch": 2.489808335868573, + "grad_norm": 0.6762577891349792, + "learning_rate": 1.676023139006198e-05, + "loss": 0.0523, + "step": 4092 + }, + { + "epoch": 2.4904167934286585, + "grad_norm": 0.5468366146087646, + "learning_rate": 1.6748479152031436e-05, + "loss": 0.0502, + "step": 4093 + }, + { + "epoch": 2.4910252509887436, + "grad_norm": 0.5539106130599976, + "learning_rate": 1.6736728960034137e-05, + "loss": 0.0457, + "step": 4094 + }, + { + "epoch": 2.4916337085488287, + "grad_norm": 0.5105188488960266, + "learning_rate": 1.6724980816983625e-05, + "loss": 0.0392, + "step": 4095 + }, + { + "epoch": 2.492242166108914, + "grad_norm": 0.5564658045768738, + "learning_rate": 1.6713234725792938e-05, + "loss": 0.0382, + "step": 4096 + }, + { + "epoch": 2.492850623668999, + "grad_norm": 0.6725486516952515, + "learning_rate": 1.6701490689374642e-05, + "loss": 0.0393, + "step": 4097 + }, + { + "epoch": 2.4934590812290844, + "grad_norm": 0.46235400438308716, + "learning_rate": 1.6689748710640756e-05, + "loss": 0.0401, + "step": 4098 + }, + { + "epoch": 2.4940675387891695, + "grad_norm": 0.5110651254653931, + "learning_rate": 1.6678008792502793e-05, + "loss": 0.035, + "step": 4099 + }, + { + "epoch": 2.4946759963492546, + "grad_norm": 0.6853166818618774, + "learning_rate": 1.6666270937871774e-05, + "loss": 0.0396, + "step": 4100 + }, + { + "epoch": 2.4952844539093397, + "grad_norm": 0.4850734770298004, + "learning_rate": 1.6654535149658203e-05, + "loss": 0.0599, + "step": 4101 + }, + { + "epoch": 2.495892911469425, + "grad_norm": 0.5870274305343628, + "learning_rate": 1.6642801430772048e-05, + "loss": 0.0399, + "step": 4102 + }, + { + "epoch": 2.4965013690295104, + "grad_norm": 0.561387300491333, + "learning_rate": 1.6631069784122803e-05, + "loss": 0.033, + "step": 4103 + }, + { + "epoch": 2.4971098265895955, + "grad_norm": 0.5187516212463379, + "learning_rate": 1.6619340212619417e-05, + "loss": 0.0425, + "step": 4104 + }, + { + "epoch": 2.4977182841496806, + "grad_norm": 0.4592403173446655, + "learning_rate": 1.660761271917033e-05, + "loss": 0.0334, + "step": 4105 + }, + { + "epoch": 2.4983267417097657, + "grad_norm": 0.5504740476608276, + "learning_rate": 1.6595887306683474e-05, + "loss": 0.0474, + "step": 4106 + }, + { + "epoch": 2.4989351992698507, + "grad_norm": 0.5082219243049622, + "learning_rate": 1.658416397806627e-05, + "loss": 0.0448, + "step": 4107 + }, + { + "epoch": 2.4995436568299363, + "grad_norm": 0.6115595102310181, + "learning_rate": 1.6572442736225614e-05, + "loss": 0.0633, + "step": 4108 + }, + { + "epoch": 2.5001521143900214, + "grad_norm": 0.4949551224708557, + "learning_rate": 1.656072358406787e-05, + "loss": 0.0471, + "step": 4109 + }, + { + "epoch": 2.5007605719501065, + "grad_norm": 0.6002260446548462, + "learning_rate": 1.654900652449892e-05, + "loss": 0.0593, + "step": 4110 + }, + { + "epoch": 2.5013690295101916, + "grad_norm": 0.5908356308937073, + "learning_rate": 1.6537291560424097e-05, + "loss": 0.0536, + "step": 4111 + }, + { + "epoch": 2.5019774870702767, + "grad_norm": 0.6577576398849487, + "learning_rate": 1.6525578694748217e-05, + "loss": 0.0594, + "step": 4112 + }, + { + "epoch": 2.502585944630362, + "grad_norm": 0.6429257392883301, + "learning_rate": 1.6513867930375596e-05, + "loss": 0.0613, + "step": 4113 + }, + { + "epoch": 2.5031944021904473, + "grad_norm": 0.5084056854248047, + "learning_rate": 1.650215927021001e-05, + "loss": 0.0447, + "step": 4114 + }, + { + "epoch": 2.5038028597505324, + "grad_norm": 0.6193863749504089, + "learning_rate": 1.649045271715472e-05, + "loss": 0.0486, + "step": 4115 + }, + { + "epoch": 2.5044113173106175, + "grad_norm": 0.5566953420639038, + "learning_rate": 1.6478748274112445e-05, + "loss": 0.0441, + "step": 4116 + }, + { + "epoch": 2.5050197748707026, + "grad_norm": 0.5336138606071472, + "learning_rate": 1.646704594398543e-05, + "loss": 0.0477, + "step": 4117 + }, + { + "epoch": 2.505628232430788, + "grad_norm": 0.544262170791626, + "learning_rate": 1.6455345729675348e-05, + "loss": 0.0542, + "step": 4118 + }, + { + "epoch": 2.5062366899908732, + "grad_norm": 0.46165311336517334, + "learning_rate": 1.6443647634083353e-05, + "loss": 0.0326, + "step": 4119 + }, + { + "epoch": 2.5068451475509583, + "grad_norm": 0.4707409143447876, + "learning_rate": 1.6431951660110113e-05, + "loss": 0.0401, + "step": 4120 + }, + { + "epoch": 2.5074536051110434, + "grad_norm": 0.5299643874168396, + "learning_rate": 1.6420257810655727e-05, + "loss": 0.0488, + "step": 4121 + }, + { + "epoch": 2.5080620626711285, + "grad_norm": 0.9495867490768433, + "learning_rate": 1.640856608861977e-05, + "loss": 0.0375, + "step": 4122 + }, + { + "epoch": 2.508670520231214, + "grad_norm": 0.5048344135284424, + "learning_rate": 1.6396876496901324e-05, + "loss": 0.0285, + "step": 4123 + }, + { + "epoch": 2.509278977791299, + "grad_norm": 0.6059793829917908, + "learning_rate": 1.6385189038398914e-05, + "loss": 0.0442, + "step": 4124 + }, + { + "epoch": 2.5098874353513843, + "grad_norm": 0.5671916604042053, + "learning_rate": 1.637350371601053e-05, + "loss": 0.0442, + "step": 4125 + }, + { + "epoch": 2.5104958929114694, + "grad_norm": 0.49703407287597656, + "learning_rate": 1.6361820532633665e-05, + "loss": 0.0431, + "step": 4126 + }, + { + "epoch": 2.5111043504715544, + "grad_norm": 0.5402058959007263, + "learning_rate": 1.6350139491165246e-05, + "loss": 0.0349, + "step": 4127 + }, + { + "epoch": 2.51171280803164, + "grad_norm": 0.5467166900634766, + "learning_rate": 1.6338460594501692e-05, + "loss": 0.0577, + "step": 4128 + }, + { + "epoch": 2.512321265591725, + "grad_norm": 0.5586373805999756, + "learning_rate": 1.632678384553887e-05, + "loss": 0.0426, + "step": 4129 + }, + { + "epoch": 2.51292972315181, + "grad_norm": 0.613847017288208, + "learning_rate": 1.6315109247172146e-05, + "loss": 0.0535, + "step": 4130 + }, + { + "epoch": 2.5135381807118953, + "grad_norm": 0.47474873065948486, + "learning_rate": 1.6303436802296325e-05, + "loss": 0.0426, + "step": 4131 + }, + { + "epoch": 2.5141466382719804, + "grad_norm": 0.5153751373291016, + "learning_rate": 1.629176651380568e-05, + "loss": 0.0478, + "step": 4132 + }, + { + "epoch": 2.514755095832066, + "grad_norm": 0.7585486769676208, + "learning_rate": 1.6280098384593966e-05, + "loss": 0.0642, + "step": 4133 + }, + { + "epoch": 2.515363553392151, + "grad_norm": 0.5414904952049255, + "learning_rate": 1.626843241755439e-05, + "loss": 0.0553, + "step": 4134 + }, + { + "epoch": 2.515972010952236, + "grad_norm": 0.8704773783683777, + "learning_rate": 1.625676861557962e-05, + "loss": 0.0435, + "step": 4135 + }, + { + "epoch": 2.516580468512321, + "grad_norm": 0.551547110080719, + "learning_rate": 1.6245106981561804e-05, + "loss": 0.0475, + "step": 4136 + }, + { + "epoch": 2.5171889260724063, + "grad_norm": 0.614238977432251, + "learning_rate": 1.6233447518392537e-05, + "loss": 0.0593, + "step": 4137 + }, + { + "epoch": 2.517797383632492, + "grad_norm": 0.7122321724891663, + "learning_rate": 1.622179022896287e-05, + "loss": 0.0735, + "step": 4138 + }, + { + "epoch": 2.518405841192577, + "grad_norm": 0.5139821767807007, + "learning_rate": 1.6210135116163333e-05, + "loss": 0.0405, + "step": 4139 + }, + { + "epoch": 2.519014298752662, + "grad_norm": 0.6365357637405396, + "learning_rate": 1.6198482182883912e-05, + "loss": 0.0634, + "step": 4140 + }, + { + "epoch": 2.519622756312747, + "grad_norm": 0.5459882616996765, + "learning_rate": 1.618683143201404e-05, + "loss": 0.0341, + "step": 4141 + }, + { + "epoch": 2.520231213872832, + "grad_norm": 0.45921745896339417, + "learning_rate": 1.6175182866442624e-05, + "loss": 0.0335, + "step": 4142 + }, + { + "epoch": 2.5208396714329178, + "grad_norm": 0.524821400642395, + "learning_rate": 1.6163536489058023e-05, + "loss": 0.0544, + "step": 4143 + }, + { + "epoch": 2.521448128993003, + "grad_norm": 0.5286942720413208, + "learning_rate": 1.6151892302748046e-05, + "loss": 0.046, + "step": 4144 + }, + { + "epoch": 2.522056586553088, + "grad_norm": 0.5625891089439392, + "learning_rate": 1.614025031039997e-05, + "loss": 0.0543, + "step": 4145 + }, + { + "epoch": 2.522665044113173, + "grad_norm": 0.5208843350410461, + "learning_rate": 1.612861051490053e-05, + "loss": 0.0391, + "step": 4146 + }, + { + "epoch": 2.523273501673258, + "grad_norm": 0.6042157411575317, + "learning_rate": 1.6116972919135907e-05, + "loss": 0.0726, + "step": 4147 + }, + { + "epoch": 2.5238819592333437, + "grad_norm": 0.43940305709838867, + "learning_rate": 1.6105337525991723e-05, + "loss": 0.0372, + "step": 4148 + }, + { + "epoch": 2.524490416793429, + "grad_norm": 0.4963323771953583, + "learning_rate": 1.6093704338353097e-05, + "loss": 0.0383, + "step": 4149 + }, + { + "epoch": 2.525098874353514, + "grad_norm": 0.4309125244617462, + "learning_rate": 1.6082073359104564e-05, + "loss": 0.0448, + "step": 4150 + }, + { + "epoch": 2.525707331913599, + "grad_norm": 0.4299990236759186, + "learning_rate": 1.6070444591130113e-05, + "loss": 0.0361, + "step": 4151 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.5838400721549988, + "learning_rate": 1.60588180373132e-05, + "loss": 0.0494, + "step": 4152 + }, + { + "epoch": 2.5269242470337696, + "grad_norm": 0.4594270586967468, + "learning_rate": 1.6047193700536734e-05, + "loss": 0.0425, + "step": 4153 + }, + { + "epoch": 2.5275327045938547, + "grad_norm": 0.6414621472358704, + "learning_rate": 1.6035571583683055e-05, + "loss": 0.0606, + "step": 4154 + }, + { + "epoch": 2.52814116215394, + "grad_norm": 0.4599505364894867, + "learning_rate": 1.6023951689633965e-05, + "loss": 0.0304, + "step": 4155 + }, + { + "epoch": 2.528749619714025, + "grad_norm": 0.4809854030609131, + "learning_rate": 1.601233402127072e-05, + "loss": 0.0463, + "step": 4156 + }, + { + "epoch": 2.52935807727411, + "grad_norm": 0.5729118585586548, + "learning_rate": 1.600071858147401e-05, + "loss": 0.0412, + "step": 4157 + }, + { + "epoch": 2.5299665348341955, + "grad_norm": 0.5415656566619873, + "learning_rate": 1.5989105373123986e-05, + "loss": 0.0426, + "step": 4158 + }, + { + "epoch": 2.5305749923942806, + "grad_norm": 0.46047234535217285, + "learning_rate": 1.5977494399100242e-05, + "loss": 0.0414, + "step": 4159 + }, + { + "epoch": 2.5311834499543657, + "grad_norm": 0.5674901604652405, + "learning_rate": 1.59658856622818e-05, + "loss": 0.061, + "step": 4160 + }, + { + "epoch": 2.531791907514451, + "grad_norm": 0.539481520652771, + "learning_rate": 1.5954279165547158e-05, + "loss": 0.0422, + "step": 4161 + }, + { + "epoch": 2.532400365074536, + "grad_norm": 0.4912717342376709, + "learning_rate": 1.594267491177424e-05, + "loss": 0.0409, + "step": 4162 + }, + { + "epoch": 2.5330088226346215, + "grad_norm": 0.5007967948913574, + "learning_rate": 1.5931072903840415e-05, + "loss": 0.045, + "step": 4163 + }, + { + "epoch": 2.5336172801947066, + "grad_norm": 0.5245926976203918, + "learning_rate": 1.5919473144622502e-05, + "loss": 0.0438, + "step": 4164 + }, + { + "epoch": 2.5342257377547917, + "grad_norm": 0.5035905241966248, + "learning_rate": 1.5907875636996748e-05, + "loss": 0.0442, + "step": 4165 + }, + { + "epoch": 2.5348341953148767, + "grad_norm": 0.573156476020813, + "learning_rate": 1.5896280383838856e-05, + "loss": 0.0642, + "step": 4166 + }, + { + "epoch": 2.535442652874962, + "grad_norm": 0.5619865655899048, + "learning_rate": 1.5884687388023975e-05, + "loss": 0.0432, + "step": 4167 + }, + { + "epoch": 2.5360511104350474, + "grad_norm": 0.4712476432323456, + "learning_rate": 1.587309665242667e-05, + "loss": 0.0344, + "step": 4168 + }, + { + "epoch": 2.5366595679951325, + "grad_norm": 0.5480785965919495, + "learning_rate": 1.586150817992097e-05, + "loss": 0.0429, + "step": 4169 + }, + { + "epoch": 2.5372680255552176, + "grad_norm": 0.602826714515686, + "learning_rate": 1.5849921973380332e-05, + "loss": 0.047, + "step": 4170 + }, + { + "epoch": 2.5378764831153027, + "grad_norm": 0.5343915820121765, + "learning_rate": 1.5838338035677645e-05, + "loss": 0.0414, + "step": 4171 + }, + { + "epoch": 2.5384849406753878, + "grad_norm": 0.5871137380599976, + "learning_rate": 1.582675636968525e-05, + "loss": 0.0423, + "step": 4172 + }, + { + "epoch": 2.5390933982354733, + "grad_norm": 0.47411757707595825, + "learning_rate": 1.5815176978274924e-05, + "loss": 0.0368, + "step": 4173 + }, + { + "epoch": 2.5397018557955584, + "grad_norm": 0.5202276110649109, + "learning_rate": 1.580359986431786e-05, + "loss": 0.0423, + "step": 4174 + }, + { + "epoch": 2.5403103133556435, + "grad_norm": 0.5111712217330933, + "learning_rate": 1.57920250306847e-05, + "loss": 0.0434, + "step": 4175 + }, + { + "epoch": 2.5409187709157286, + "grad_norm": 0.5714934468269348, + "learning_rate": 1.578045248024553e-05, + "loss": 0.0529, + "step": 4176 + }, + { + "epoch": 2.5415272284758137, + "grad_norm": 0.5358994603157043, + "learning_rate": 1.5768882215869858e-05, + "loss": 0.043, + "step": 4177 + }, + { + "epoch": 2.5421356860358992, + "grad_norm": 0.5511855483055115, + "learning_rate": 1.5757314240426613e-05, + "loss": 0.0419, + "step": 4178 + }, + { + "epoch": 2.5427441435959843, + "grad_norm": 0.6131108999252319, + "learning_rate": 1.5745748556784194e-05, + "loss": 0.0716, + "step": 4179 + }, + { + "epoch": 2.5433526011560694, + "grad_norm": 0.39662912487983704, + "learning_rate": 1.5734185167810395e-05, + "loss": 0.0278, + "step": 4180 + }, + { + "epoch": 2.5439610587161545, + "grad_norm": 0.5008838772773743, + "learning_rate": 1.572262407637245e-05, + "loss": 0.0401, + "step": 4181 + }, + { + "epoch": 2.5445695162762396, + "grad_norm": 0.6066216230392456, + "learning_rate": 1.5711065285337035e-05, + "loss": 0.0686, + "step": 4182 + }, + { + "epoch": 2.545177973836325, + "grad_norm": 0.5468981862068176, + "learning_rate": 1.5699508797570255e-05, + "loss": 0.028, + "step": 4183 + }, + { + "epoch": 2.5457864313964103, + "grad_norm": 0.5890219807624817, + "learning_rate": 1.5687954615937623e-05, + "loss": 0.0497, + "step": 4184 + }, + { + "epoch": 2.5463948889564954, + "grad_norm": 0.489978164434433, + "learning_rate": 1.56764027433041e-05, + "loss": 0.0387, + "step": 4185 + }, + { + "epoch": 2.5470033465165804, + "grad_norm": 0.5345786809921265, + "learning_rate": 1.5664853182534077e-05, + "loss": 0.0441, + "step": 4186 + }, + { + "epoch": 2.5476118040766655, + "grad_norm": 0.5965332984924316, + "learning_rate": 1.565330593649135e-05, + "loss": 0.0456, + "step": 4187 + }, + { + "epoch": 2.548220261636751, + "grad_norm": 0.5223818421363831, + "learning_rate": 1.564176100803916e-05, + "loss": 0.0447, + "step": 4188 + }, + { + "epoch": 2.548828719196836, + "grad_norm": 0.6047852635383606, + "learning_rate": 1.5630218400040174e-05, + "loss": 0.049, + "step": 4189 + }, + { + "epoch": 2.5494371767569213, + "grad_norm": 0.6201773285865784, + "learning_rate": 1.561867811535648e-05, + "loss": 0.0598, + "step": 4190 + }, + { + "epoch": 2.5500456343170064, + "grad_norm": 0.4544050693511963, + "learning_rate": 1.5607140156849564e-05, + "loss": 0.0313, + "step": 4191 + }, + { + "epoch": 2.5506540918770915, + "grad_norm": 0.5224777460098267, + "learning_rate": 1.5595604527380387e-05, + "loss": 0.0538, + "step": 4192 + }, + { + "epoch": 2.551262549437177, + "grad_norm": 0.57136070728302, + "learning_rate": 1.5584071229809294e-05, + "loss": 0.0452, + "step": 4193 + }, + { + "epoch": 2.551871006997262, + "grad_norm": 0.45948007702827454, + "learning_rate": 1.5572540266996047e-05, + "loss": 0.0289, + "step": 4194 + }, + { + "epoch": 2.552479464557347, + "grad_norm": 0.5531154274940491, + "learning_rate": 1.5561011641799872e-05, + "loss": 0.0417, + "step": 4195 + }, + { + "epoch": 2.5530879221174323, + "grad_norm": 0.6279348134994507, + "learning_rate": 1.5549485357079373e-05, + "loss": 0.0619, + "step": 4196 + }, + { + "epoch": 2.5536963796775174, + "grad_norm": 0.5489982962608337, + "learning_rate": 1.5537961415692585e-05, + "loss": 0.0395, + "step": 4197 + }, + { + "epoch": 2.554304837237603, + "grad_norm": 0.4770684540271759, + "learning_rate": 1.5526439820496965e-05, + "loss": 0.0422, + "step": 4198 + }, + { + "epoch": 2.5549132947976876, + "grad_norm": 0.5624024868011475, + "learning_rate": 1.5514920574349397e-05, + "loss": 0.0552, + "step": 4199 + }, + { + "epoch": 2.555521752357773, + "grad_norm": 0.5154407620429993, + "learning_rate": 1.5503403680106168e-05, + "loss": 0.0474, + "step": 4200 + }, + { + "epoch": 2.556130209917858, + "grad_norm": 0.48511430621147156, + "learning_rate": 1.549188914062298e-05, + "loss": 0.0311, + "step": 4201 + }, + { + "epoch": 2.5567386674779433, + "grad_norm": 0.4834255874156952, + "learning_rate": 1.5480376958754976e-05, + "loss": 0.0426, + "step": 4202 + }, + { + "epoch": 2.557347125038029, + "grad_norm": 0.5208700299263, + "learning_rate": 1.5468867137356696e-05, + "loss": 0.0357, + "step": 4203 + }, + { + "epoch": 2.5579555825981135, + "grad_norm": 0.4725053906440735, + "learning_rate": 1.545735967928207e-05, + "loss": 0.0489, + "step": 4204 + }, + { + "epoch": 2.558564040158199, + "grad_norm": 0.4903971552848816, + "learning_rate": 1.54458545873845e-05, + "loss": 0.048, + "step": 4205 + }, + { + "epoch": 2.559172497718284, + "grad_norm": 0.5080806612968445, + "learning_rate": 1.543435186451676e-05, + "loss": 0.0525, + "step": 4206 + }, + { + "epoch": 2.5597809552783692, + "grad_norm": 0.608734130859375, + "learning_rate": 1.5422851513531028e-05, + "loss": 0.0495, + "step": 4207 + }, + { + "epoch": 2.560389412838455, + "grad_norm": 0.4380703270435333, + "learning_rate": 1.5411353537278935e-05, + "loss": 0.0282, + "step": 4208 + }, + { + "epoch": 2.5609978703985394, + "grad_norm": 0.4923262894153595, + "learning_rate": 1.539985793861149e-05, + "loss": 0.0473, + "step": 4209 + }, + { + "epoch": 2.561606327958625, + "grad_norm": 0.45295250415802, + "learning_rate": 1.5388364720379124e-05, + "loss": 0.041, + "step": 4210 + }, + { + "epoch": 2.56221478551871, + "grad_norm": 0.568891704082489, + "learning_rate": 1.5376873885431663e-05, + "loss": 0.0386, + "step": 4211 + }, + { + "epoch": 2.562823243078795, + "grad_norm": 0.4208332598209381, + "learning_rate": 1.536538543661838e-05, + "loss": 0.0389, + "step": 4212 + }, + { + "epoch": 2.5634317006388807, + "grad_norm": 0.4591726064682007, + "learning_rate": 1.535389937678792e-05, + "loss": 0.0361, + "step": 4213 + }, + { + "epoch": 2.5640401581989654, + "grad_norm": 0.48485690355300903, + "learning_rate": 1.5342415708788326e-05, + "loss": 0.0322, + "step": 4214 + }, + { + "epoch": 2.564648615759051, + "grad_norm": 0.546795129776001, + "learning_rate": 1.5330934435467104e-05, + "loss": 0.0465, + "step": 4215 + }, + { + "epoch": 2.565257073319136, + "grad_norm": 0.5020440816879272, + "learning_rate": 1.5319455559671116e-05, + "loss": 0.0341, + "step": 4216 + }, + { + "epoch": 2.565865530879221, + "grad_norm": 0.592922031879425, + "learning_rate": 1.530797908424663e-05, + "loss": 0.063, + "step": 4217 + }, + { + "epoch": 2.5664739884393066, + "grad_norm": 0.5598524808883667, + "learning_rate": 1.5296505012039362e-05, + "loss": 0.0321, + "step": 4218 + }, + { + "epoch": 2.5670824459993913, + "grad_norm": 0.5240581631660461, + "learning_rate": 1.5285033345894392e-05, + "loss": 0.0521, + "step": 4219 + }, + { + "epoch": 2.567690903559477, + "grad_norm": 0.4785360097885132, + "learning_rate": 1.5273564088656208e-05, + "loss": 0.0447, + "step": 4220 + }, + { + "epoch": 2.568299361119562, + "grad_norm": 0.502627432346344, + "learning_rate": 1.5262097243168705e-05, + "loss": 0.0438, + "step": 4221 + }, + { + "epoch": 2.568907818679647, + "grad_norm": 0.5592759251594543, + "learning_rate": 1.5250632812275194e-05, + "loss": 0.0488, + "step": 4222 + }, + { + "epoch": 2.5695162762397326, + "grad_norm": 0.528427004814148, + "learning_rate": 1.5239170798818381e-05, + "loss": 0.0451, + "step": 4223 + }, + { + "epoch": 2.570124733799817, + "grad_norm": 0.5380241870880127, + "learning_rate": 1.5227711205640341e-05, + "loss": 0.0442, + "step": 4224 + }, + { + "epoch": 2.5707331913599027, + "grad_norm": 0.4866541624069214, + "learning_rate": 1.5216254035582605e-05, + "loss": 0.0364, + "step": 4225 + }, + { + "epoch": 2.571341648919988, + "grad_norm": 0.6870383620262146, + "learning_rate": 1.5204799291486063e-05, + "loss": 0.0662, + "step": 4226 + }, + { + "epoch": 2.571950106480073, + "grad_norm": 0.5109632611274719, + "learning_rate": 1.5193346976191003e-05, + "loss": 0.0363, + "step": 4227 + }, + { + "epoch": 2.572558564040158, + "grad_norm": 0.4471718370914459, + "learning_rate": 1.518189709253714e-05, + "loss": 0.028, + "step": 4228 + }, + { + "epoch": 2.573167021600243, + "grad_norm": 0.5469757318496704, + "learning_rate": 1.517044964336356e-05, + "loss": 0.0474, + "step": 4229 + }, + { + "epoch": 2.5737754791603287, + "grad_norm": 0.5387968420982361, + "learning_rate": 1.5159004631508739e-05, + "loss": 0.0518, + "step": 4230 + }, + { + "epoch": 2.5743839367204138, + "grad_norm": 0.4985266327857971, + "learning_rate": 1.514756205981059e-05, + "loss": 0.0372, + "step": 4231 + }, + { + "epoch": 2.574992394280499, + "grad_norm": 0.8584986925125122, + "learning_rate": 1.5136121931106378e-05, + "loss": 0.0439, + "step": 4232 + }, + { + "epoch": 2.575600851840584, + "grad_norm": 0.4893166720867157, + "learning_rate": 1.5124684248232784e-05, + "loss": 0.0351, + "step": 4233 + }, + { + "epoch": 2.576209309400669, + "grad_norm": 0.5629805326461792, + "learning_rate": 1.511324901402586e-05, + "loss": 0.046, + "step": 4234 + }, + { + "epoch": 2.5768177669607546, + "grad_norm": 0.593343198299408, + "learning_rate": 1.5101816231321092e-05, + "loss": 0.0604, + "step": 4235 + }, + { + "epoch": 2.5774262245208397, + "grad_norm": 0.45182177424430847, + "learning_rate": 1.5090385902953325e-05, + "loss": 0.0377, + "step": 4236 + }, + { + "epoch": 2.578034682080925, + "grad_norm": 0.4370990991592407, + "learning_rate": 1.5078958031756784e-05, + "loss": 0.0393, + "step": 4237 + }, + { + "epoch": 2.57864313964101, + "grad_norm": 0.5261332392692566, + "learning_rate": 1.506753262056514e-05, + "loss": 0.0346, + "step": 4238 + }, + { + "epoch": 2.579251597201095, + "grad_norm": 0.5731834173202515, + "learning_rate": 1.5056109672211394e-05, + "loss": 0.0374, + "step": 4239 + }, + { + "epoch": 2.5798600547611805, + "grad_norm": 0.4817114770412445, + "learning_rate": 1.5044689189527957e-05, + "loss": 0.0364, + "step": 4240 + }, + { + "epoch": 2.5804685123212656, + "grad_norm": 0.6388393044471741, + "learning_rate": 1.5033271175346658e-05, + "loss": 0.0571, + "step": 4241 + }, + { + "epoch": 2.5810769698813507, + "grad_norm": 0.4850732088088989, + "learning_rate": 1.5021855632498671e-05, + "loss": 0.0417, + "step": 4242 + }, + { + "epoch": 2.581685427441436, + "grad_norm": 0.5559534430503845, + "learning_rate": 1.5010442563814575e-05, + "loss": 0.0567, + "step": 4243 + }, + { + "epoch": 2.582293885001521, + "grad_norm": 0.4856293201446533, + "learning_rate": 1.4999031972124333e-05, + "loss": 0.0494, + "step": 4244 + }, + { + "epoch": 2.5829023425616064, + "grad_norm": 0.8523834347724915, + "learning_rate": 1.498762386025731e-05, + "loss": 0.0328, + "step": 4245 + }, + { + "epoch": 2.5835108001216915, + "grad_norm": 0.6239566802978516, + "learning_rate": 1.4976218231042233e-05, + "loss": 0.0505, + "step": 4246 + }, + { + "epoch": 2.5841192576817766, + "grad_norm": 0.5066648125648499, + "learning_rate": 1.496481508730721e-05, + "loss": 0.0503, + "step": 4247 + }, + { + "epoch": 2.5847277152418617, + "grad_norm": 0.6433850526809692, + "learning_rate": 1.495341443187977e-05, + "loss": 0.0472, + "step": 4248 + }, + { + "epoch": 2.585336172801947, + "grad_norm": 0.5411747097969055, + "learning_rate": 1.4942016267586789e-05, + "loss": 0.0525, + "step": 4249 + }, + { + "epoch": 2.5859446303620324, + "grad_norm": 0.5121768116950989, + "learning_rate": 1.4930620597254524e-05, + "loss": 0.0443, + "step": 4250 + }, + { + "epoch": 2.5865530879221175, + "grad_norm": 0.40693843364715576, + "learning_rate": 1.4919227423708653e-05, + "loss": 0.0301, + "step": 4251 + }, + { + "epoch": 2.5871615454822026, + "grad_norm": 0.49290603399276733, + "learning_rate": 1.490783674977419e-05, + "loss": 0.0389, + "step": 4252 + }, + { + "epoch": 2.5877700030422877, + "grad_norm": 0.5338709354400635, + "learning_rate": 1.489644857827554e-05, + "loss": 0.0364, + "step": 4253 + }, + { + "epoch": 2.5883784606023728, + "grad_norm": 0.5304934978485107, + "learning_rate": 1.4885062912036517e-05, + "loss": 0.0427, + "step": 4254 + }, + { + "epoch": 2.5889869181624583, + "grad_norm": 0.47712579369544983, + "learning_rate": 1.4873679753880284e-05, + "loss": 0.0442, + "step": 4255 + }, + { + "epoch": 2.5895953757225434, + "grad_norm": 0.5361647605895996, + "learning_rate": 1.4862299106629385e-05, + "loss": 0.0473, + "step": 4256 + }, + { + "epoch": 2.5902038332826285, + "grad_norm": 0.49737563729286194, + "learning_rate": 1.4850920973105736e-05, + "loss": 0.0344, + "step": 4257 + }, + { + "epoch": 2.5908122908427136, + "grad_norm": 0.5141322612762451, + "learning_rate": 1.483954535613066e-05, + "loss": 0.0588, + "step": 4258 + }, + { + "epoch": 2.5914207484027987, + "grad_norm": 0.4289071559906006, + "learning_rate": 1.4828172258524828e-05, + "loss": 0.0318, + "step": 4259 + }, + { + "epoch": 2.592029205962884, + "grad_norm": 0.4381569027900696, + "learning_rate": 1.4816801683108283e-05, + "loss": 0.0371, + "step": 4260 + }, + { + "epoch": 2.5926376635229693, + "grad_norm": 0.6164323091506958, + "learning_rate": 1.4805433632700475e-05, + "loss": 0.0378, + "step": 4261 + }, + { + "epoch": 2.5932461210830544, + "grad_norm": 0.4577103555202484, + "learning_rate": 1.4794068110120196e-05, + "loss": 0.0393, + "step": 4262 + }, + { + "epoch": 2.5938545786431395, + "grad_norm": 0.42026767134666443, + "learning_rate": 1.4782705118185608e-05, + "loss": 0.0397, + "step": 4263 + }, + { + "epoch": 2.5944630362032246, + "grad_norm": 0.4118916690349579, + "learning_rate": 1.4771344659714287e-05, + "loss": 0.0307, + "step": 4264 + }, + { + "epoch": 2.59507149376331, + "grad_norm": 0.4349444508552551, + "learning_rate": 1.4759986737523135e-05, + "loss": 0.0329, + "step": 4265 + }, + { + "epoch": 2.5956799513233952, + "grad_norm": 0.6133854389190674, + "learning_rate": 1.4748631354428444e-05, + "loss": 0.0446, + "step": 4266 + }, + { + "epoch": 2.5962884088834803, + "grad_norm": 0.5277345776557922, + "learning_rate": 1.473727851324588e-05, + "loss": 0.0432, + "step": 4267 + }, + { + "epoch": 2.5968968664435654, + "grad_norm": 0.6331156492233276, + "learning_rate": 1.472592821679048e-05, + "loss": 0.0598, + "step": 4268 + }, + { + "epoch": 2.5975053240036505, + "grad_norm": 0.5062826871871948, + "learning_rate": 1.4714580467876633e-05, + "loss": 0.0414, + "step": 4269 + }, + { + "epoch": 2.598113781563736, + "grad_norm": 0.43930584192276, + "learning_rate": 1.4703235269318107e-05, + "loss": 0.0425, + "step": 4270 + }, + { + "epoch": 2.598722239123821, + "grad_norm": 0.36331331729888916, + "learning_rate": 1.4691892623928052e-05, + "loss": 0.0169, + "step": 4271 + }, + { + "epoch": 2.5993306966839063, + "grad_norm": 0.49878576397895813, + "learning_rate": 1.468055253451896e-05, + "loss": 0.0363, + "step": 4272 + }, + { + "epoch": 2.5999391542439914, + "grad_norm": 0.5917057991027832, + "learning_rate": 1.4669215003902704e-05, + "loss": 0.063, + "step": 4273 + }, + { + "epoch": 2.6005476118040765, + "grad_norm": 0.6224356293678284, + "learning_rate": 1.465788003489052e-05, + "loss": 0.045, + "step": 4274 + }, + { + "epoch": 2.601156069364162, + "grad_norm": 0.6835285425186157, + "learning_rate": 1.4646547630293003e-05, + "loss": 0.0451, + "step": 4275 + }, + { + "epoch": 2.601764526924247, + "grad_norm": 0.5661000609397888, + "learning_rate": 1.463521779292012e-05, + "loss": 0.0451, + "step": 4276 + }, + { + "epoch": 2.602372984484332, + "grad_norm": 0.4806481897830963, + "learning_rate": 1.4623890525581204e-05, + "loss": 0.0418, + "step": 4277 + }, + { + "epoch": 2.6029814420444173, + "grad_norm": 0.5451545119285583, + "learning_rate": 1.4612565831084937e-05, + "loss": 0.0359, + "step": 4278 + }, + { + "epoch": 2.6035898996045024, + "grad_norm": 0.6044538021087646, + "learning_rate": 1.4601243712239376e-05, + "loss": 0.0408, + "step": 4279 + }, + { + "epoch": 2.604198357164588, + "grad_norm": 0.5169386863708496, + "learning_rate": 1.4589924171851926e-05, + "loss": 0.0313, + "step": 4280 + }, + { + "epoch": 2.604806814724673, + "grad_norm": 0.5409014821052551, + "learning_rate": 1.4578607212729384e-05, + "loss": 0.0353, + "step": 4281 + }, + { + "epoch": 2.605415272284758, + "grad_norm": 1.0071747303009033, + "learning_rate": 1.4567292837677855e-05, + "loss": 0.1323, + "step": 4282 + }, + { + "epoch": 2.606023729844843, + "grad_norm": 0.5074636936187744, + "learning_rate": 1.4555981049502849e-05, + "loss": 0.0353, + "step": 4283 + }, + { + "epoch": 2.6066321874049283, + "grad_norm": 0.4935401976108551, + "learning_rate": 1.4544671851009212e-05, + "loss": 0.0389, + "step": 4284 + }, + { + "epoch": 2.607240644965014, + "grad_norm": 0.46502718329429626, + "learning_rate": 1.4533365245001168e-05, + "loss": 0.0413, + "step": 4285 + }, + { + "epoch": 2.607849102525099, + "grad_norm": 0.49090376496315, + "learning_rate": 1.452206123428225e-05, + "loss": 0.0323, + "step": 4286 + }, + { + "epoch": 2.608457560085184, + "grad_norm": 0.3921135365962982, + "learning_rate": 1.4510759821655423e-05, + "loss": 0.0423, + "step": 4287 + }, + { + "epoch": 2.609066017645269, + "grad_norm": 0.6183372139930725, + "learning_rate": 1.449946100992294e-05, + "loss": 0.0389, + "step": 4288 + }, + { + "epoch": 2.6096744752053542, + "grad_norm": 0.4686357080936432, + "learning_rate": 1.4488164801886453e-05, + "loss": 0.0434, + "step": 4289 + }, + { + "epoch": 2.6102829327654398, + "grad_norm": 0.49797457456588745, + "learning_rate": 1.447687120034692e-05, + "loss": 0.0394, + "step": 4290 + }, + { + "epoch": 2.610891390325525, + "grad_norm": 0.5921012163162231, + "learning_rate": 1.4465580208104722e-05, + "loss": 0.0566, + "step": 4291 + }, + { + "epoch": 2.61149984788561, + "grad_norm": 0.5064262747764587, + "learning_rate": 1.4454291827959526e-05, + "loss": 0.0424, + "step": 4292 + }, + { + "epoch": 2.612108305445695, + "grad_norm": 0.45030635595321655, + "learning_rate": 1.4443006062710391e-05, + "loss": 0.0361, + "step": 4293 + }, + { + "epoch": 2.61271676300578, + "grad_norm": 0.8079550266265869, + "learning_rate": 1.4431722915155716e-05, + "loss": 0.045, + "step": 4294 + }, + { + "epoch": 2.6133252205658657, + "grad_norm": 0.5597800612449646, + "learning_rate": 1.4420442388093258e-05, + "loss": 0.0415, + "step": 4295 + }, + { + "epoch": 2.613933678125951, + "grad_norm": 0.5742923617362976, + "learning_rate": 1.4409164484320092e-05, + "loss": 0.059, + "step": 4296 + }, + { + "epoch": 2.614542135686036, + "grad_norm": 0.4569978713989258, + "learning_rate": 1.4397889206632706e-05, + "loss": 0.0445, + "step": 4297 + }, + { + "epoch": 2.615150593246121, + "grad_norm": 0.4496994614601135, + "learning_rate": 1.4386616557826869e-05, + "loss": 0.0386, + "step": 4298 + }, + { + "epoch": 2.615759050806206, + "grad_norm": 0.531541645526886, + "learning_rate": 1.4375346540697738e-05, + "loss": 0.0294, + "step": 4299 + }, + { + "epoch": 2.6163675083662916, + "grad_norm": 0.4035661816596985, + "learning_rate": 1.4364079158039807e-05, + "loss": 0.0337, + "step": 4300 + }, + { + "epoch": 2.6169759659263767, + "grad_norm": 0.42909929156303406, + "learning_rate": 1.4352814412646926e-05, + "loss": 0.0295, + "step": 4301 + }, + { + "epoch": 2.617584423486462, + "grad_norm": 0.6985564231872559, + "learning_rate": 1.4341552307312266e-05, + "loss": 0.0477, + "step": 4302 + }, + { + "epoch": 2.618192881046547, + "grad_norm": 0.5794458985328674, + "learning_rate": 1.4330292844828368e-05, + "loss": 0.0554, + "step": 4303 + }, + { + "epoch": 2.618801338606632, + "grad_norm": 0.6460904479026794, + "learning_rate": 1.431903602798711e-05, + "loss": 0.0393, + "step": 4304 + }, + { + "epoch": 2.6194097961667175, + "grad_norm": 0.5019935965538025, + "learning_rate": 1.4307781859579721e-05, + "loss": 0.0309, + "step": 4305 + }, + { + "epoch": 2.6200182537268026, + "grad_norm": 0.5265465974807739, + "learning_rate": 1.4296530342396741e-05, + "loss": 0.0407, + "step": 4306 + }, + { + "epoch": 2.6206267112868877, + "grad_norm": 0.6341969966888428, + "learning_rate": 1.4285281479228107e-05, + "loss": 0.0558, + "step": 4307 + }, + { + "epoch": 2.621235168846973, + "grad_norm": 0.5350863337516785, + "learning_rate": 1.4274035272863051e-05, + "loss": 0.0376, + "step": 4308 + }, + { + "epoch": 2.621843626407058, + "grad_norm": 0.5693925023078918, + "learning_rate": 1.4262791726090163e-05, + "loss": 0.0418, + "step": 4309 + }, + { + "epoch": 2.6224520839671435, + "grad_norm": 0.4471037983894348, + "learning_rate": 1.425155084169738e-05, + "loss": 0.0379, + "step": 4310 + }, + { + "epoch": 2.6230605415272286, + "grad_norm": 0.428018718957901, + "learning_rate": 1.424031262247198e-05, + "loss": 0.0334, + "step": 4311 + }, + { + "epoch": 2.6236689990873137, + "grad_norm": 0.5358444452285767, + "learning_rate": 1.4229077071200542e-05, + "loss": 0.0471, + "step": 4312 + }, + { + "epoch": 2.6242774566473988, + "grad_norm": 0.515167772769928, + "learning_rate": 1.4217844190669058e-05, + "loss": 0.0456, + "step": 4313 + }, + { + "epoch": 2.624885914207484, + "grad_norm": 0.5426524877548218, + "learning_rate": 1.4206613983662781e-05, + "loss": 0.0412, + "step": 4314 + }, + { + "epoch": 2.6254943717675694, + "grad_norm": 0.6369853019714355, + "learning_rate": 1.4195386452966359e-05, + "loss": 0.0569, + "step": 4315 + }, + { + "epoch": 2.6261028293276545, + "grad_norm": 0.5484540462493896, + "learning_rate": 1.4184161601363716e-05, + "loss": 0.0377, + "step": 4316 + }, + { + "epoch": 2.6267112868877396, + "grad_norm": 0.4859999418258667, + "learning_rate": 1.4172939431638188e-05, + "loss": 0.0353, + "step": 4317 + }, + { + "epoch": 2.6273197444478247, + "grad_norm": 0.5194180011749268, + "learning_rate": 1.4161719946572377e-05, + "loss": 0.0401, + "step": 4318 + }, + { + "epoch": 2.6279282020079098, + "grad_norm": 0.6942944526672363, + "learning_rate": 1.415050314894826e-05, + "loss": 0.0614, + "step": 4319 + }, + { + "epoch": 2.6285366595679953, + "grad_norm": 0.4476464092731476, + "learning_rate": 1.4139289041547132e-05, + "loss": 0.0307, + "step": 4320 + }, + { + "epoch": 2.6291451171280804, + "grad_norm": 0.5009614825248718, + "learning_rate": 1.4128077627149633e-05, + "loss": 0.0404, + "step": 4321 + }, + { + "epoch": 2.6297535746881655, + "grad_norm": 0.47980043292045593, + "learning_rate": 1.4116868908535702e-05, + "loss": 0.0377, + "step": 4322 + }, + { + "epoch": 2.6303620322482506, + "grad_norm": 0.5018118619918823, + "learning_rate": 1.4105662888484667e-05, + "loss": 0.0424, + "step": 4323 + }, + { + "epoch": 2.6309704898083357, + "grad_norm": 0.602103590965271, + "learning_rate": 1.4094459569775128e-05, + "loss": 0.0571, + "step": 4324 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.5462225675582886, + "learning_rate": 1.4083258955185053e-05, + "loss": 0.059, + "step": 4325 + }, + { + "epoch": 2.6321874049285063, + "grad_norm": 0.4753000736236572, + "learning_rate": 1.4072061047491721e-05, + "loss": 0.0439, + "step": 4326 + }, + { + "epoch": 2.6327958624885914, + "grad_norm": 0.45958274602890015, + "learning_rate": 1.4060865849471764e-05, + "loss": 0.0353, + "step": 4327 + }, + { + "epoch": 2.6334043200486765, + "grad_norm": 0.5752948522567749, + "learning_rate": 1.4049673363901097e-05, + "loss": 0.0462, + "step": 4328 + }, + { + "epoch": 2.6340127776087616, + "grad_norm": 0.4886283874511719, + "learning_rate": 1.4038483593555007e-05, + "loss": 0.0483, + "step": 4329 + }, + { + "epoch": 2.634621235168847, + "grad_norm": 0.4635407030582428, + "learning_rate": 1.4027296541208084e-05, + "loss": 0.038, + "step": 4330 + }, + { + "epoch": 2.6352296927289323, + "grad_norm": 0.4888119399547577, + "learning_rate": 1.4016112209634258e-05, + "loss": 0.0394, + "step": 4331 + }, + { + "epoch": 2.6358381502890174, + "grad_norm": 0.52603679895401, + "learning_rate": 1.400493060160677e-05, + "loss": 0.0452, + "step": 4332 + }, + { + "epoch": 2.6364466078491025, + "grad_norm": 0.5692835450172424, + "learning_rate": 1.3993751719898207e-05, + "loss": 0.0364, + "step": 4333 + }, + { + "epoch": 2.6370550654091875, + "grad_norm": 0.5845534205436707, + "learning_rate": 1.3982575567280442e-05, + "loss": 0.0503, + "step": 4334 + }, + { + "epoch": 2.637663522969273, + "grad_norm": 0.4657052755355835, + "learning_rate": 1.397140214652471e-05, + "loss": 0.0328, + "step": 4335 + }, + { + "epoch": 2.638271980529358, + "grad_norm": 0.4424096941947937, + "learning_rate": 1.3960231460401552e-05, + "loss": 0.0287, + "step": 4336 + }, + { + "epoch": 2.6388804380894433, + "grad_norm": 0.4316795766353607, + "learning_rate": 1.3949063511680837e-05, + "loss": 0.0282, + "step": 4337 + }, + { + "epoch": 2.6394888956495284, + "grad_norm": 0.5137184262275696, + "learning_rate": 1.3937898303131742e-05, + "loss": 0.0395, + "step": 4338 + }, + { + "epoch": 2.6400973532096135, + "grad_norm": 0.5804075002670288, + "learning_rate": 1.392673583752277e-05, + "loss": 0.0472, + "step": 4339 + }, + { + "epoch": 2.640705810769699, + "grad_norm": 0.43918368220329285, + "learning_rate": 1.3915576117621758e-05, + "loss": 0.0335, + "step": 4340 + }, + { + "epoch": 2.641314268329784, + "grad_norm": 0.4672797918319702, + "learning_rate": 1.3904419146195846e-05, + "loss": 0.0429, + "step": 4341 + }, + { + "epoch": 2.641922725889869, + "grad_norm": 0.515037477016449, + "learning_rate": 1.3893264926011502e-05, + "loss": 0.0467, + "step": 4342 + }, + { + "epoch": 2.6425311834499543, + "grad_norm": 0.5160783529281616, + "learning_rate": 1.3882113459834512e-05, + "loss": 0.0379, + "step": 4343 + }, + { + "epoch": 2.6431396410100394, + "grad_norm": 0.5049687623977661, + "learning_rate": 1.3870964750429954e-05, + "loss": 0.0434, + "step": 4344 + }, + { + "epoch": 2.643748098570125, + "grad_norm": 0.5025326013565063, + "learning_rate": 1.3859818800562263e-05, + "loss": 0.051, + "step": 4345 + }, + { + "epoch": 2.64435655613021, + "grad_norm": 0.5900479555130005, + "learning_rate": 1.384867561299516e-05, + "loss": 0.0468, + "step": 4346 + }, + { + "epoch": 2.644965013690295, + "grad_norm": 0.48302316665649414, + "learning_rate": 1.3837535190491696e-05, + "loss": 0.041, + "step": 4347 + }, + { + "epoch": 2.6455734712503802, + "grad_norm": 0.5139932632446289, + "learning_rate": 1.3826397535814242e-05, + "loss": 0.0532, + "step": 4348 + }, + { + "epoch": 2.6461819288104653, + "grad_norm": 0.40311169624328613, + "learning_rate": 1.3815262651724448e-05, + "loss": 0.0332, + "step": 4349 + }, + { + "epoch": 2.646790386370551, + "grad_norm": 0.512266218662262, + "learning_rate": 1.3804130540983317e-05, + "loss": 0.0447, + "step": 4350 + }, + { + "epoch": 2.647398843930636, + "grad_norm": 0.48747456073760986, + "learning_rate": 1.3793001206351142e-05, + "loss": 0.0312, + "step": 4351 + }, + { + "epoch": 2.648007301490721, + "grad_norm": 0.3904288709163666, + "learning_rate": 1.3781874650587536e-05, + "loss": 0.0294, + "step": 4352 + }, + { + "epoch": 2.648615759050806, + "grad_norm": 0.6491031050682068, + "learning_rate": 1.3770750876451427e-05, + "loss": 0.046, + "step": 4353 + }, + { + "epoch": 2.6492242166108912, + "grad_norm": 0.4241449534893036, + "learning_rate": 1.3759629886701047e-05, + "loss": 0.0422, + "step": 4354 + }, + { + "epoch": 2.649832674170977, + "grad_norm": 0.48629888892173767, + "learning_rate": 1.3748511684093926e-05, + "loss": 0.0308, + "step": 4355 + }, + { + "epoch": 2.650441131731062, + "grad_norm": 0.5005238056182861, + "learning_rate": 1.3737396271386921e-05, + "loss": 0.0566, + "step": 4356 + }, + { + "epoch": 2.651049589291147, + "grad_norm": 0.6130560040473938, + "learning_rate": 1.3726283651336194e-05, + "loss": 0.064, + "step": 4357 + }, + { + "epoch": 2.651658046851232, + "grad_norm": 0.578168511390686, + "learning_rate": 1.3715173826697209e-05, + "loss": 0.0479, + "step": 4358 + }, + { + "epoch": 2.652266504411317, + "grad_norm": 0.5515162944793701, + "learning_rate": 1.3704066800224741e-05, + "loss": 0.0452, + "step": 4359 + }, + { + "epoch": 2.6528749619714027, + "grad_norm": 0.5804712772369385, + "learning_rate": 1.369296257467288e-05, + "loss": 0.0467, + "step": 4360 + }, + { + "epoch": 2.653483419531488, + "grad_norm": 0.45583152770996094, + "learning_rate": 1.3681861152794984e-05, + "loss": 0.0318, + "step": 4361 + }, + { + "epoch": 2.654091877091573, + "grad_norm": 0.8064979910850525, + "learning_rate": 1.3670762537343765e-05, + "loss": 0.0576, + "step": 4362 + }, + { + "epoch": 2.654700334651658, + "grad_norm": 0.48464739322662354, + "learning_rate": 1.3659666731071207e-05, + "loss": 0.0436, + "step": 4363 + }, + { + "epoch": 2.655308792211743, + "grad_norm": 0.48261159658432007, + "learning_rate": 1.3648573736728627e-05, + "loss": 0.03, + "step": 4364 + }, + { + "epoch": 2.6559172497718286, + "grad_norm": 0.5928403735160828, + "learning_rate": 1.3637483557066583e-05, + "loss": 0.0423, + "step": 4365 + }, + { + "epoch": 2.6565257073319137, + "grad_norm": 0.521212637424469, + "learning_rate": 1.3626396194835026e-05, + "loss": 0.0518, + "step": 4366 + }, + { + "epoch": 2.657134164891999, + "grad_norm": 0.47702714800834656, + "learning_rate": 1.3615311652783127e-05, + "loss": 0.0327, + "step": 4367 + }, + { + "epoch": 2.657742622452084, + "grad_norm": 0.5482526421546936, + "learning_rate": 1.3604229933659402e-05, + "loss": 0.0494, + "step": 4368 + }, + { + "epoch": 2.658351080012169, + "grad_norm": 0.45253828167915344, + "learning_rate": 1.3593151040211654e-05, + "loss": 0.0375, + "step": 4369 + }, + { + "epoch": 2.6589595375722546, + "grad_norm": 0.5163220167160034, + "learning_rate": 1.3582074975186998e-05, + "loss": 0.034, + "step": 4370 + }, + { + "epoch": 2.6595679951323397, + "grad_norm": 0.4799509346485138, + "learning_rate": 1.3571001741331815e-05, + "loss": 0.035, + "step": 4371 + }, + { + "epoch": 2.6601764526924248, + "grad_norm": 3.0906639099121094, + "learning_rate": 1.3559931341391815e-05, + "loss": 0.0468, + "step": 4372 + }, + { + "epoch": 2.66078491025251, + "grad_norm": 0.5727887153625488, + "learning_rate": 1.3548863778111998e-05, + "loss": 0.0482, + "step": 4373 + }, + { + "epoch": 2.661393367812595, + "grad_norm": 0.5182603597640991, + "learning_rate": 1.3537799054236666e-05, + "loss": 0.0403, + "step": 4374 + }, + { + "epoch": 2.6620018253726805, + "grad_norm": 0.4546807110309601, + "learning_rate": 1.3526737172509383e-05, + "loss": 0.0396, + "step": 4375 + }, + { + "epoch": 2.6626102829327656, + "grad_norm": 0.5297713279724121, + "learning_rate": 1.3515678135673072e-05, + "loss": 0.0366, + "step": 4376 + }, + { + "epoch": 2.6632187404928507, + "grad_norm": 0.4167693853378296, + "learning_rate": 1.350462194646988e-05, + "loss": 0.0333, + "step": 4377 + }, + { + "epoch": 2.6638271980529358, + "grad_norm": 0.4427233040332794, + "learning_rate": 1.3493568607641294e-05, + "loss": 0.0403, + "step": 4378 + }, + { + "epoch": 2.664435655613021, + "grad_norm": 0.4933980405330658, + "learning_rate": 1.3482518121928083e-05, + "loss": 0.0355, + "step": 4379 + }, + { + "epoch": 2.6650441131731064, + "grad_norm": 0.6081718802452087, + "learning_rate": 1.3471470492070315e-05, + "loss": 0.0516, + "step": 4380 + }, + { + "epoch": 2.6656525707331915, + "grad_norm": 0.5365004539489746, + "learning_rate": 1.3460425720807316e-05, + "loss": 0.0451, + "step": 4381 + }, + { + "epoch": 2.6662610282932766, + "grad_norm": 0.4933543801307678, + "learning_rate": 1.3449383810877761e-05, + "loss": 0.0492, + "step": 4382 + }, + { + "epoch": 2.6668694858533617, + "grad_norm": 0.48126405477523804, + "learning_rate": 1.3438344765019558e-05, + "loss": 0.0424, + "step": 4383 + }, + { + "epoch": 2.667477943413447, + "grad_norm": 0.5277286171913147, + "learning_rate": 1.342730858596995e-05, + "loss": 0.0428, + "step": 4384 + }, + { + "epoch": 2.6680864009735323, + "grad_norm": 0.45332902669906616, + "learning_rate": 1.341627527646542e-05, + "loss": 0.037, + "step": 4385 + }, + { + "epoch": 2.6686948585336174, + "grad_norm": 0.4423314034938812, + "learning_rate": 1.340524483924181e-05, + "loss": 0.0283, + "step": 4386 + }, + { + "epoch": 2.6693033160937025, + "grad_norm": 0.6126822829246521, + "learning_rate": 1.3394217277034171e-05, + "loss": 0.0286, + "step": 4387 + }, + { + "epoch": 2.6699117736537876, + "grad_norm": 0.5217598080635071, + "learning_rate": 1.3383192592576898e-05, + "loss": 0.0424, + "step": 4388 + }, + { + "epoch": 2.6705202312138727, + "grad_norm": 0.5640128254890442, + "learning_rate": 1.3372170788603649e-05, + "loss": 0.0493, + "step": 4389 + }, + { + "epoch": 2.6711286887739583, + "grad_norm": 0.504953145980835, + "learning_rate": 1.3361151867847382e-05, + "loss": 0.0329, + "step": 4390 + }, + { + "epoch": 2.6717371463340434, + "grad_norm": 0.4337262511253357, + "learning_rate": 1.3350135833040305e-05, + "loss": 0.0376, + "step": 4391 + }, + { + "epoch": 2.6723456038941285, + "grad_norm": 0.5187790989875793, + "learning_rate": 1.3339122686913968e-05, + "loss": 0.0401, + "step": 4392 + }, + { + "epoch": 2.6729540614542135, + "grad_norm": 0.48012837767601013, + "learning_rate": 1.3328112432199144e-05, + "loss": 0.039, + "step": 4393 + }, + { + "epoch": 2.6735625190142986, + "grad_norm": 0.5445698499679565, + "learning_rate": 1.331710507162594e-05, + "loss": 0.062, + "step": 4394 + }, + { + "epoch": 2.674170976574384, + "grad_norm": 0.45752736926078796, + "learning_rate": 1.3306100607923687e-05, + "loss": 0.04, + "step": 4395 + }, + { + "epoch": 2.6747794341344693, + "grad_norm": 0.5074790120124817, + "learning_rate": 1.3295099043821085e-05, + "loss": 0.0531, + "step": 4396 + }, + { + "epoch": 2.6753878916945544, + "grad_norm": 0.46912896633148193, + "learning_rate": 1.3284100382046022e-05, + "loss": 0.0366, + "step": 4397 + }, + { + "epoch": 2.6759963492546395, + "grad_norm": 0.4760879576206207, + "learning_rate": 1.3273104625325722e-05, + "loss": 0.0374, + "step": 4398 + }, + { + "epoch": 2.6766048068147246, + "grad_norm": 0.6422132253646851, + "learning_rate": 1.326211177638667e-05, + "loss": 0.0607, + "step": 4399 + }, + { + "epoch": 2.67721326437481, + "grad_norm": 0.47899577021598816, + "learning_rate": 1.3251121837954655e-05, + "loss": 0.0411, + "step": 4400 + }, + { + "epoch": 2.677821721934895, + "grad_norm": 0.6358346939086914, + "learning_rate": 1.3240134812754681e-05, + "loss": 0.0489, + "step": 4401 + }, + { + "epoch": 2.6784301794949803, + "grad_norm": 0.5330213308334351, + "learning_rate": 1.3229150703511122e-05, + "loss": 0.0398, + "step": 4402 + }, + { + "epoch": 2.6790386370550654, + "grad_norm": 0.4606907367706299, + "learning_rate": 1.3218169512947542e-05, + "loss": 0.0269, + "step": 4403 + }, + { + "epoch": 2.6796470946151505, + "grad_norm": 0.5010808110237122, + "learning_rate": 1.3207191243786834e-05, + "loss": 0.0487, + "step": 4404 + }, + { + "epoch": 2.680255552175236, + "grad_norm": 0.4977697432041168, + "learning_rate": 1.319621589875115e-05, + "loss": 0.0378, + "step": 4405 + }, + { + "epoch": 2.6808640097353207, + "grad_norm": 0.41439104080200195, + "learning_rate": 1.3185243480561926e-05, + "loss": 0.0436, + "step": 4406 + }, + { + "epoch": 2.6814724672954062, + "grad_norm": 0.35358157753944397, + "learning_rate": 1.3174273991939845e-05, + "loss": 0.0269, + "step": 4407 + }, + { + "epoch": 2.6820809248554913, + "grad_norm": 0.4562048017978668, + "learning_rate": 1.3163307435604893e-05, + "loss": 0.0383, + "step": 4408 + }, + { + "epoch": 2.6826893824155764, + "grad_norm": 0.5168052911758423, + "learning_rate": 1.3152343814276318e-05, + "loss": 0.046, + "step": 4409 + }, + { + "epoch": 2.683297839975662, + "grad_norm": 0.5310634970664978, + "learning_rate": 1.3141383130672658e-05, + "loss": 0.0405, + "step": 4410 + }, + { + "epoch": 2.6839062975357466, + "grad_norm": 0.5165497660636902, + "learning_rate": 1.3130425387511667e-05, + "loss": 0.0516, + "step": 4411 + }, + { + "epoch": 2.684514755095832, + "grad_norm": 0.4916205406188965, + "learning_rate": 1.3119470587510451e-05, + "loss": 0.0372, + "step": 4412 + }, + { + "epoch": 2.6851232126559172, + "grad_norm": 0.40354472398757935, + "learning_rate": 1.3108518733385314e-05, + "loss": 0.0213, + "step": 4413 + }, + { + "epoch": 2.6857316702160023, + "grad_norm": 0.4055176079273224, + "learning_rate": 1.309756982785187e-05, + "loss": 0.0283, + "step": 4414 + }, + { + "epoch": 2.686340127776088, + "grad_norm": 0.5336512923240662, + "learning_rate": 1.3086623873624992e-05, + "loss": 0.041, + "step": 4415 + }, + { + "epoch": 2.6869485853361725, + "grad_norm": 0.4223940670490265, + "learning_rate": 1.3075680873418828e-05, + "loss": 0.0254, + "step": 4416 + }, + { + "epoch": 2.687557042896258, + "grad_norm": 0.6376833319664001, + "learning_rate": 1.306474082994677e-05, + "loss": 0.0409, + "step": 4417 + }, + { + "epoch": 2.688165500456343, + "grad_norm": 0.4131200611591339, + "learning_rate": 1.3053803745921498e-05, + "loss": 0.0309, + "step": 4418 + }, + { + "epoch": 2.6887739580164283, + "grad_norm": 0.5285510420799255, + "learning_rate": 1.3042869624054955e-05, + "loss": 0.0427, + "step": 4419 + }, + { + "epoch": 2.689382415576514, + "grad_norm": 0.4723327159881592, + "learning_rate": 1.3031938467058358e-05, + "loss": 0.0292, + "step": 4420 + }, + { + "epoch": 2.6899908731365985, + "grad_norm": 0.5857917070388794, + "learning_rate": 1.3021010277642145e-05, + "loss": 0.0493, + "step": 4421 + }, + { + "epoch": 2.690599330696684, + "grad_norm": 0.5390817523002625, + "learning_rate": 1.3010085058516097e-05, + "loss": 0.043, + "step": 4422 + }, + { + "epoch": 2.691207788256769, + "grad_norm": 0.45317864418029785, + "learning_rate": 1.2999162812389181e-05, + "loss": 0.029, + "step": 4423 + }, + { + "epoch": 2.691816245816854, + "grad_norm": 0.4660782217979431, + "learning_rate": 1.2988243541969667e-05, + "loss": 0.0258, + "step": 4424 + }, + { + "epoch": 2.6924247033769397, + "grad_norm": 0.5454357266426086, + "learning_rate": 1.297732724996508e-05, + "loss": 0.0432, + "step": 4425 + }, + { + "epoch": 2.6930331609370244, + "grad_norm": 0.4717608094215393, + "learning_rate": 1.2966413939082214e-05, + "loss": 0.0356, + "step": 4426 + }, + { + "epoch": 2.69364161849711, + "grad_norm": 0.43902650475502014, + "learning_rate": 1.2955503612027086e-05, + "loss": 0.0321, + "step": 4427 + }, + { + "epoch": 2.694250076057195, + "grad_norm": 0.5439276695251465, + "learning_rate": 1.2944596271505046e-05, + "loss": 0.0477, + "step": 4428 + }, + { + "epoch": 2.69485853361728, + "grad_norm": 0.5469129085540771, + "learning_rate": 1.2933691920220626e-05, + "loss": 0.0429, + "step": 4429 + }, + { + "epoch": 2.695466991177365, + "grad_norm": 0.6971737742424011, + "learning_rate": 1.2922790560877667e-05, + "loss": 0.067, + "step": 4430 + }, + { + "epoch": 2.6960754487374503, + "grad_norm": 0.5088085532188416, + "learning_rate": 1.2911892196179231e-05, + "loss": 0.0499, + "step": 4431 + }, + { + "epoch": 2.696683906297536, + "grad_norm": 0.502051591873169, + "learning_rate": 1.2900996828827693e-05, + "loss": 0.0361, + "step": 4432 + }, + { + "epoch": 2.697292363857621, + "grad_norm": 0.4716339707374573, + "learning_rate": 1.2890104461524619e-05, + "loss": 0.032, + "step": 4433 + }, + { + "epoch": 2.697900821417706, + "grad_norm": 0.4593643546104431, + "learning_rate": 1.2879215096970871e-05, + "loss": 0.0321, + "step": 4434 + }, + { + "epoch": 2.698509278977791, + "grad_norm": 0.42600390315055847, + "learning_rate": 1.2868328737866563e-05, + "loss": 0.049, + "step": 4435 + }, + { + "epoch": 2.6991177365378762, + "grad_norm": 0.5230850577354431, + "learning_rate": 1.2857445386911064e-05, + "loss": 0.0362, + "step": 4436 + }, + { + "epoch": 2.6997261940979618, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.284656504680296e-05, + "loss": 0.0493, + "step": 4437 + }, + { + "epoch": 2.700334651658047, + "grad_norm": 0.5182242393493652, + "learning_rate": 1.283568772024017e-05, + "loss": 0.0354, + "step": 4438 + }, + { + "epoch": 2.700943109218132, + "grad_norm": 0.4881172776222229, + "learning_rate": 1.2824813409919777e-05, + "loss": 0.0473, + "step": 4439 + }, + { + "epoch": 2.701551566778217, + "grad_norm": 0.5372771620750427, + "learning_rate": 1.2813942118538181e-05, + "loss": 0.0472, + "step": 4440 + }, + { + "epoch": 2.702160024338302, + "grad_norm": 0.31403908133506775, + "learning_rate": 1.2803073848790983e-05, + "loss": 0.0213, + "step": 4441 + }, + { + "epoch": 2.7027684818983877, + "grad_norm": 0.5783568024635315, + "learning_rate": 1.2792208603373096e-05, + "loss": 0.046, + "step": 4442 + }, + { + "epoch": 2.703376939458473, + "grad_norm": 0.48002150654792786, + "learning_rate": 1.278134638497862e-05, + "loss": 0.0315, + "step": 4443 + }, + { + "epoch": 2.703985397018558, + "grad_norm": 0.4475983679294586, + "learning_rate": 1.277048719630094e-05, + "loss": 0.0338, + "step": 4444 + }, + { + "epoch": 2.704593854578643, + "grad_norm": 0.4856289029121399, + "learning_rate": 1.2759631040032688e-05, + "loss": 0.0384, + "step": 4445 + }, + { + "epoch": 2.705202312138728, + "grad_norm": 0.34708070755004883, + "learning_rate": 1.274877791886574e-05, + "loss": 0.0199, + "step": 4446 + }, + { + "epoch": 2.7058107696988136, + "grad_norm": 0.4791733920574188, + "learning_rate": 1.2737927835491196e-05, + "loss": 0.0384, + "step": 4447 + }, + { + "epoch": 2.7064192272588987, + "grad_norm": 0.4502567648887634, + "learning_rate": 1.2727080792599455e-05, + "loss": 0.0389, + "step": 4448 + }, + { + "epoch": 2.707027684818984, + "grad_norm": 0.583976149559021, + "learning_rate": 1.2716236792880112e-05, + "loss": 0.0401, + "step": 4449 + }, + { + "epoch": 2.707636142379069, + "grad_norm": 0.5464617013931274, + "learning_rate": 1.270539583902203e-05, + "loss": 0.0389, + "step": 4450 + }, + { + "epoch": 2.708244599939154, + "grad_norm": 0.4903790354728699, + "learning_rate": 1.2694557933713316e-05, + "loss": 0.0391, + "step": 4451 + }, + { + "epoch": 2.7088530574992395, + "grad_norm": 0.42102527618408203, + "learning_rate": 1.2683723079641329e-05, + "loss": 0.0348, + "step": 4452 + }, + { + "epoch": 2.7094615150593246, + "grad_norm": 0.44486379623413086, + "learning_rate": 1.2672891279492638e-05, + "loss": 0.0285, + "step": 4453 + }, + { + "epoch": 2.7100699726194097, + "grad_norm": 0.5711926221847534, + "learning_rate": 1.2662062535953095e-05, + "loss": 0.0396, + "step": 4454 + }, + { + "epoch": 2.710678430179495, + "grad_norm": 0.5700972676277161, + "learning_rate": 1.2651236851707768e-05, + "loss": 0.0475, + "step": 4455 + }, + { + "epoch": 2.71128688773958, + "grad_norm": 0.4372463822364807, + "learning_rate": 1.2640414229440983e-05, + "loss": 0.0364, + "step": 4456 + }, + { + "epoch": 2.7118953452996655, + "grad_norm": 0.5272347927093506, + "learning_rate": 1.2629594671836292e-05, + "loss": 0.0545, + "step": 4457 + }, + { + "epoch": 2.7125038028597506, + "grad_norm": 0.49635735154151917, + "learning_rate": 1.2618778181576513e-05, + "loss": 0.0401, + "step": 4458 + }, + { + "epoch": 2.7131122604198357, + "grad_norm": 0.430007666349411, + "learning_rate": 1.260796476134366e-05, + "loss": 0.0336, + "step": 4459 + }, + { + "epoch": 2.7137207179799208, + "grad_norm": 0.6025989055633545, + "learning_rate": 1.2597154413819018e-05, + "loss": 0.0432, + "step": 4460 + }, + { + "epoch": 2.714329175540006, + "grad_norm": 0.5454067587852478, + "learning_rate": 1.2586347141683108e-05, + "loss": 0.0383, + "step": 4461 + }, + { + "epoch": 2.7149376331000914, + "grad_norm": 0.5201361775398254, + "learning_rate": 1.2575542947615675e-05, + "loss": 0.0388, + "step": 4462 + }, + { + "epoch": 2.7155460906601765, + "grad_norm": 0.5323150753974915, + "learning_rate": 1.256474183429572e-05, + "loss": 0.0497, + "step": 4463 + }, + { + "epoch": 2.7161545482202616, + "grad_norm": 0.4000251889228821, + "learning_rate": 1.2553943804401472e-05, + "loss": 0.0306, + "step": 4464 + }, + { + "epoch": 2.7167630057803467, + "grad_norm": 0.47391611337661743, + "learning_rate": 1.254314886061037e-05, + "loss": 0.0445, + "step": 4465 + }, + { + "epoch": 2.717371463340432, + "grad_norm": 0.3792179524898529, + "learning_rate": 1.2532357005599126e-05, + "loss": 0.0361, + "step": 4466 + }, + { + "epoch": 2.7179799209005173, + "grad_norm": 0.4722985029220581, + "learning_rate": 1.2521568242043669e-05, + "loss": 0.0487, + "step": 4467 + }, + { + "epoch": 2.7185883784606024, + "grad_norm": 0.4448596239089966, + "learning_rate": 1.251078257261916e-05, + "loss": 0.0373, + "step": 4468 + }, + { + "epoch": 2.7191968360206875, + "grad_norm": 0.482346773147583, + "learning_rate": 1.2500000000000006e-05, + "loss": 0.0407, + "step": 4469 + }, + { + "epoch": 2.7198052935807726, + "grad_norm": 0.5089819431304932, + "learning_rate": 1.2489220526859816e-05, + "loss": 0.0476, + "step": 4470 + }, + { + "epoch": 2.7204137511408577, + "grad_norm": 0.3887785077095032, + "learning_rate": 1.247844415587146e-05, + "loss": 0.0293, + "step": 4471 + }, + { + "epoch": 2.7210222087009432, + "grad_norm": 0.5263569951057434, + "learning_rate": 1.2467670889707032e-05, + "loss": 0.0463, + "step": 4472 + }, + { + "epoch": 2.7216306662610283, + "grad_norm": 0.7234339714050293, + "learning_rate": 1.2456900731037849e-05, + "loss": 0.0493, + "step": 4473 + }, + { + "epoch": 2.7222391238211134, + "grad_norm": 0.565403938293457, + "learning_rate": 1.2446133682534473e-05, + "loss": 0.0406, + "step": 4474 + }, + { + "epoch": 2.7228475813811985, + "grad_norm": 0.476296067237854, + "learning_rate": 1.243536974686666e-05, + "loss": 0.0374, + "step": 4475 + }, + { + "epoch": 2.7234560389412836, + "grad_norm": 0.49702188372612, + "learning_rate": 1.2424608926703433e-05, + "loss": 0.0248, + "step": 4476 + }, + { + "epoch": 2.724064496501369, + "grad_norm": 0.4639529585838318, + "learning_rate": 1.2413851224713022e-05, + "loss": 0.0491, + "step": 4477 + }, + { + "epoch": 2.7246729540614543, + "grad_norm": 0.5371832251548767, + "learning_rate": 1.2403096643562891e-05, + "loss": 0.0396, + "step": 4478 + }, + { + "epoch": 2.7252814116215394, + "grad_norm": 0.7577241659164429, + "learning_rate": 1.2392345185919737e-05, + "loss": 0.0627, + "step": 4479 + }, + { + "epoch": 2.7258898691816245, + "grad_norm": 0.46722671389579773, + "learning_rate": 1.2381596854449457e-05, + "loss": 0.0446, + "step": 4480 + }, + { + "epoch": 2.7264983267417096, + "grad_norm": 0.39545291662216187, + "learning_rate": 1.2370851651817194e-05, + "loss": 0.0299, + "step": 4481 + }, + { + "epoch": 2.727106784301795, + "grad_norm": 0.4299526810646057, + "learning_rate": 1.2360109580687313e-05, + "loss": 0.0343, + "step": 4482 + }, + { + "epoch": 2.72771524186188, + "grad_norm": 0.3463142514228821, + "learning_rate": 1.2349370643723399e-05, + "loss": 0.0296, + "step": 4483 + }, + { + "epoch": 2.7283236994219653, + "grad_norm": 0.42831888794898987, + "learning_rate": 1.2338634843588263e-05, + "loss": 0.0388, + "step": 4484 + }, + { + "epoch": 2.7289321569820504, + "grad_norm": 0.5531302690505981, + "learning_rate": 1.232790218294394e-05, + "loss": 0.0563, + "step": 4485 + }, + { + "epoch": 2.7295406145421355, + "grad_norm": 0.537441611289978, + "learning_rate": 1.231717266445167e-05, + "loss": 0.0507, + "step": 4486 + }, + { + "epoch": 2.730149072102221, + "grad_norm": 0.5240452885627747, + "learning_rate": 1.2306446290771934e-05, + "loss": 0.0478, + "step": 4487 + }, + { + "epoch": 2.730757529662306, + "grad_norm": 0.5667134523391724, + "learning_rate": 1.2295723064564422e-05, + "loss": 0.0346, + "step": 4488 + }, + { + "epoch": 2.731365987222391, + "grad_norm": 0.6184573769569397, + "learning_rate": 1.228500298848806e-05, + "loss": 0.0501, + "step": 4489 + }, + { + "epoch": 2.7319744447824763, + "grad_norm": 0.47468021512031555, + "learning_rate": 1.227428606520095e-05, + "loss": 0.0299, + "step": 4490 + }, + { + "epoch": 2.7325829023425614, + "grad_norm": 0.534231424331665, + "learning_rate": 1.2263572297360478e-05, + "loss": 0.0534, + "step": 4491 + }, + { + "epoch": 2.733191359902647, + "grad_norm": 0.3944413959980011, + "learning_rate": 1.225286168762319e-05, + "loss": 0.0295, + "step": 4492 + }, + { + "epoch": 2.733799817462732, + "grad_norm": 0.45191413164138794, + "learning_rate": 1.2242154238644879e-05, + "loss": 0.0385, + "step": 4493 + }, + { + "epoch": 2.734408275022817, + "grad_norm": 0.5744283199310303, + "learning_rate": 1.223144995308054e-05, + "loss": 0.0453, + "step": 4494 + }, + { + "epoch": 2.7350167325829022, + "grad_norm": 0.542323887348175, + "learning_rate": 1.2220748833584403e-05, + "loss": 0.0428, + "step": 4495 + }, + { + "epoch": 2.7356251901429873, + "grad_norm": 0.4557442367076874, + "learning_rate": 1.221005088280987e-05, + "loss": 0.0253, + "step": 4496 + }, + { + "epoch": 2.736233647703073, + "grad_norm": 0.5451676845550537, + "learning_rate": 1.219935610340963e-05, + "loss": 0.0469, + "step": 4497 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.568136990070343, + "learning_rate": 1.2188664498035504e-05, + "loss": 0.0496, + "step": 4498 + }, + { + "epoch": 2.737450562823243, + "grad_norm": 0.451236754655838, + "learning_rate": 1.2177976069338592e-05, + "loss": 0.0302, + "step": 4499 + }, + { + "epoch": 2.738059020383328, + "grad_norm": 0.5220772624015808, + "learning_rate": 1.2167290819969149e-05, + "loss": 0.0521, + "step": 4500 + }, + { + "epoch": 2.7386674779434133, + "grad_norm": 0.508506715297699, + "learning_rate": 1.2156608752576707e-05, + "loss": 0.0526, + "step": 4501 + }, + { + "epoch": 2.739275935503499, + "grad_norm": 0.49221986532211304, + "learning_rate": 1.2145929869809944e-05, + "loss": 0.0402, + "step": 4502 + }, + { + "epoch": 2.739884393063584, + "grad_norm": 0.5821833610534668, + "learning_rate": 1.213525417431679e-05, + "loss": 0.0393, + "step": 4503 + }, + { + "epoch": 2.740492850623669, + "grad_norm": 0.4816054701805115, + "learning_rate": 1.2124581668744372e-05, + "loss": 0.0305, + "step": 4504 + }, + { + "epoch": 2.741101308183754, + "grad_norm": 0.5656616687774658, + "learning_rate": 1.2113912355739037e-05, + "loss": 0.0549, + "step": 4505 + }, + { + "epoch": 2.741709765743839, + "grad_norm": 0.5027511715888977, + "learning_rate": 1.21032462379463e-05, + "loss": 0.042, + "step": 4506 + }, + { + "epoch": 2.7423182233039247, + "grad_norm": 1.0891318321228027, + "learning_rate": 1.209258331801095e-05, + "loss": 0.1, + "step": 4507 + }, + { + "epoch": 2.74292668086401, + "grad_norm": 0.5207118988037109, + "learning_rate": 1.2081923598576921e-05, + "loss": 0.0473, + "step": 4508 + }, + { + "epoch": 2.743535138424095, + "grad_norm": 0.5301030278205872, + "learning_rate": 1.2071267082287388e-05, + "loss": 0.0409, + "step": 4509 + }, + { + "epoch": 2.74414359598418, + "grad_norm": 0.577142596244812, + "learning_rate": 1.2060613771784724e-05, + "loss": 0.0391, + "step": 4510 + }, + { + "epoch": 2.744752053544265, + "grad_norm": 0.4636692702770233, + "learning_rate": 1.204996366971051e-05, + "loss": 0.0381, + "step": 4511 + }, + { + "epoch": 2.7453605111043506, + "grad_norm": 0.43843039870262146, + "learning_rate": 1.2039316778705514e-05, + "loss": 0.0349, + "step": 4512 + }, + { + "epoch": 2.7459689686644357, + "grad_norm": 0.5346546769142151, + "learning_rate": 1.2028673101409729e-05, + "loss": 0.0466, + "step": 4513 + }, + { + "epoch": 2.746577426224521, + "grad_norm": 0.5091422200202942, + "learning_rate": 1.2018032640462345e-05, + "loss": 0.0382, + "step": 4514 + }, + { + "epoch": 2.747185883784606, + "grad_norm": 0.4113479256629944, + "learning_rate": 1.2007395398501759e-05, + "loss": 0.0386, + "step": 4515 + }, + { + "epoch": 2.747794341344691, + "grad_norm": 0.44784247875213623, + "learning_rate": 1.1996761378165535e-05, + "loss": 0.0397, + "step": 4516 + }, + { + "epoch": 2.7484027989047766, + "grad_norm": 0.44350409507751465, + "learning_rate": 1.198613058209051e-05, + "loss": 0.0327, + "step": 4517 + }, + { + "epoch": 2.7490112564648617, + "grad_norm": 0.4770455062389374, + "learning_rate": 1.1975503012912645e-05, + "loss": 0.0396, + "step": 4518 + }, + { + "epoch": 2.7496197140249468, + "grad_norm": 0.4313164949417114, + "learning_rate": 1.1964878673267143e-05, + "loss": 0.0279, + "step": 4519 + }, + { + "epoch": 2.750228171585032, + "grad_norm": 0.5598992705345154, + "learning_rate": 1.1954257565788402e-05, + "loss": 0.0391, + "step": 4520 + }, + { + "epoch": 2.750836629145117, + "grad_norm": 0.44698452949523926, + "learning_rate": 1.194363969311002e-05, + "loss": 0.0492, + "step": 4521 + }, + { + "epoch": 2.7514450867052025, + "grad_norm": 0.4371298551559448, + "learning_rate": 1.1933025057864769e-05, + "loss": 0.0342, + "step": 4522 + }, + { + "epoch": 2.7520535442652876, + "grad_norm": 0.4278777539730072, + "learning_rate": 1.1922413662684644e-05, + "loss": 0.0251, + "step": 4523 + }, + { + "epoch": 2.7526620018253727, + "grad_norm": 0.6282891631126404, + "learning_rate": 1.1911805510200833e-05, + "loss": 0.0435, + "step": 4524 + }, + { + "epoch": 2.753270459385458, + "grad_norm": 0.6077399253845215, + "learning_rate": 1.1901200603043718e-05, + "loss": 0.0501, + "step": 4525 + }, + { + "epoch": 2.753878916945543, + "grad_norm": 0.4395311772823334, + "learning_rate": 1.1890598943842854e-05, + "loss": 0.0423, + "step": 4526 + }, + { + "epoch": 2.7544873745056284, + "grad_norm": 0.5030261874198914, + "learning_rate": 1.188000053522704e-05, + "loss": 0.0456, + "step": 4527 + }, + { + "epoch": 2.7550958320657135, + "grad_norm": 0.49082061648368835, + "learning_rate": 1.186940537982422e-05, + "loss": 0.0404, + "step": 4528 + }, + { + "epoch": 2.7557042896257986, + "grad_norm": 0.476769357919693, + "learning_rate": 1.1858813480261552e-05, + "loss": 0.0286, + "step": 4529 + }, + { + "epoch": 2.7563127471858837, + "grad_norm": 0.4839482307434082, + "learning_rate": 1.1848224839165389e-05, + "loss": 0.0287, + "step": 4530 + }, + { + "epoch": 2.756921204745969, + "grad_norm": 0.5627169609069824, + "learning_rate": 1.1837639459161284e-05, + "loss": 0.0645, + "step": 4531 + }, + { + "epoch": 2.7575296623060543, + "grad_norm": 0.5222127437591553, + "learning_rate": 1.182705734287394e-05, + "loss": 0.0388, + "step": 4532 + }, + { + "epoch": 2.7581381198661394, + "grad_norm": 0.4021141827106476, + "learning_rate": 1.1816478492927316e-05, + "loss": 0.0307, + "step": 4533 + }, + { + "epoch": 2.7587465774262245, + "grad_norm": 0.41538554430007935, + "learning_rate": 1.1805902911944503e-05, + "loss": 0.032, + "step": 4534 + }, + { + "epoch": 2.7593550349863096, + "grad_norm": 0.517659604549408, + "learning_rate": 1.179533060254782e-05, + "loss": 0.0451, + "step": 4535 + }, + { + "epoch": 2.7599634925463947, + "grad_norm": 0.5641751885414124, + "learning_rate": 1.1784761567358729e-05, + "loss": 0.0513, + "step": 4536 + }, + { + "epoch": 2.7605719501064803, + "grad_norm": 0.5050565600395203, + "learning_rate": 1.177419580899795e-05, + "loss": 0.0318, + "step": 4537 + }, + { + "epoch": 2.7611804076665654, + "grad_norm": 0.5235201716423035, + "learning_rate": 1.1763633330085325e-05, + "loss": 0.0241, + "step": 4538 + }, + { + "epoch": 2.7617888652266505, + "grad_norm": 0.6112051010131836, + "learning_rate": 1.1753074133239914e-05, + "loss": 0.0513, + "step": 4539 + }, + { + "epoch": 2.7623973227867356, + "grad_norm": 0.3812270164489746, + "learning_rate": 1.1742518221079957e-05, + "loss": 0.0267, + "step": 4540 + }, + { + "epoch": 2.7630057803468207, + "grad_norm": 0.4638807773590088, + "learning_rate": 1.1731965596222893e-05, + "loss": 0.0362, + "step": 4541 + }, + { + "epoch": 2.763614237906906, + "grad_norm": 0.47099676728248596, + "learning_rate": 1.1721416261285303e-05, + "loss": 0.0516, + "step": 4542 + }, + { + "epoch": 2.7642226954669913, + "grad_norm": 0.6157107353210449, + "learning_rate": 1.1710870218883022e-05, + "loss": 0.0583, + "step": 4543 + }, + { + "epoch": 2.7648311530270764, + "grad_norm": 0.4588291049003601, + "learning_rate": 1.1700327471630995e-05, + "loss": 0.03, + "step": 4544 + }, + { + "epoch": 2.7654396105871615, + "grad_norm": 0.54620760679245, + "learning_rate": 1.1689788022143411e-05, + "loss": 0.0413, + "step": 4545 + }, + { + "epoch": 2.7660480681472466, + "grad_norm": 0.5352728366851807, + "learning_rate": 1.167925187303358e-05, + "loss": 0.0469, + "step": 4546 + }, + { + "epoch": 2.766656525707332, + "grad_norm": 0.560219407081604, + "learning_rate": 1.1668719026914068e-05, + "loss": 0.032, + "step": 4547 + }, + { + "epoch": 2.767264983267417, + "grad_norm": 0.5410435199737549, + "learning_rate": 1.165818948639655e-05, + "loss": 0.0457, + "step": 4548 + }, + { + "epoch": 2.7678734408275023, + "grad_norm": 0.46575361490249634, + "learning_rate": 1.1647663254091928e-05, + "loss": 0.0321, + "step": 4549 + }, + { + "epoch": 2.7684818983875874, + "grad_norm": 0.571999728679657, + "learning_rate": 1.1637140332610267e-05, + "loss": 0.05, + "step": 4550 + }, + { + "epoch": 2.7690903559476725, + "grad_norm": 0.4646073281764984, + "learning_rate": 1.1626620724560819e-05, + "loss": 0.04, + "step": 4551 + }, + { + "epoch": 2.769698813507758, + "grad_norm": 0.44628095626831055, + "learning_rate": 1.1616104432551982e-05, + "loss": 0.0328, + "step": 4552 + }, + { + "epoch": 2.770307271067843, + "grad_norm": 0.49966830015182495, + "learning_rate": 1.16055914591914e-05, + "loss": 0.0392, + "step": 4553 + }, + { + "epoch": 2.7709157286279282, + "grad_norm": 0.4285971522331238, + "learning_rate": 1.1595081807085816e-05, + "loss": 0.0387, + "step": 4554 + }, + { + "epoch": 2.7715241861880133, + "grad_norm": 0.46280503273010254, + "learning_rate": 1.1584575478841204e-05, + "loss": 0.0361, + "step": 4555 + }, + { + "epoch": 2.7721326437480984, + "grad_norm": 0.48611634969711304, + "learning_rate": 1.1574072477062686e-05, + "loss": 0.0432, + "step": 4556 + }, + { + "epoch": 2.772741101308184, + "grad_norm": 0.4622981548309326, + "learning_rate": 1.1563572804354586e-05, + "loss": 0.0429, + "step": 4557 + }, + { + "epoch": 2.773349558868269, + "grad_norm": 0.500038206577301, + "learning_rate": 1.1553076463320363e-05, + "loss": 0.0648, + "step": 4558 + }, + { + "epoch": 2.773958016428354, + "grad_norm": 0.5276514291763306, + "learning_rate": 1.154258345656268e-05, + "loss": 0.0452, + "step": 4559 + }, + { + "epoch": 2.7745664739884393, + "grad_norm": 0.6001972556114197, + "learning_rate": 1.1532093786683368e-05, + "loss": 0.0578, + "step": 4560 + }, + { + "epoch": 2.7751749315485243, + "grad_norm": 0.5367801785469055, + "learning_rate": 1.1521607456283437e-05, + "loss": 0.0388, + "step": 4561 + }, + { + "epoch": 2.77578338910861, + "grad_norm": 0.45866864919662476, + "learning_rate": 1.151112446796303e-05, + "loss": 0.0373, + "step": 4562 + }, + { + "epoch": 2.776391846668695, + "grad_norm": 0.549251139163971, + "learning_rate": 1.1500644824321529e-05, + "loss": 0.05, + "step": 4563 + }, + { + "epoch": 2.77700030422878, + "grad_norm": 0.6965852379798889, + "learning_rate": 1.1490168527957423e-05, + "loss": 0.0401, + "step": 4564 + }, + { + "epoch": 2.777608761788865, + "grad_norm": 0.4115385413169861, + "learning_rate": 1.1479695581468405e-05, + "loss": 0.0365, + "step": 4565 + }, + { + "epoch": 2.7782172193489503, + "grad_norm": 0.4784400761127472, + "learning_rate": 1.1469225987451327e-05, + "loss": 0.0356, + "step": 4566 + }, + { + "epoch": 2.778825676909036, + "grad_norm": 0.49439680576324463, + "learning_rate": 1.1458759748502223e-05, + "loss": 0.0499, + "step": 4567 + }, + { + "epoch": 2.779434134469121, + "grad_norm": 0.39347532391548157, + "learning_rate": 1.1448296867216268e-05, + "loss": 0.0287, + "step": 4568 + }, + { + "epoch": 2.780042592029206, + "grad_norm": 0.5476490259170532, + "learning_rate": 1.1437837346187824e-05, + "loss": 0.0491, + "step": 4569 + }, + { + "epoch": 2.780651049589291, + "grad_norm": 0.5295094847679138, + "learning_rate": 1.142738118801042e-05, + "loss": 0.0493, + "step": 4570 + }, + { + "epoch": 2.781259507149376, + "grad_norm": 0.5610222816467285, + "learning_rate": 1.1416928395276757e-05, + "loss": 0.0442, + "step": 4571 + }, + { + "epoch": 2.7818679647094617, + "grad_norm": 0.443450003862381, + "learning_rate": 1.1406478970578662e-05, + "loss": 0.0362, + "step": 4572 + }, + { + "epoch": 2.782476422269547, + "grad_norm": 0.5407084226608276, + "learning_rate": 1.1396032916507196e-05, + "loss": 0.046, + "step": 4573 + }, + { + "epoch": 2.783084879829632, + "grad_norm": 0.5253666043281555, + "learning_rate": 1.1385590235652515e-05, + "loss": 0.0456, + "step": 4574 + }, + { + "epoch": 2.783693337389717, + "grad_norm": 0.5159477591514587, + "learning_rate": 1.1375150930603982e-05, + "loss": 0.0368, + "step": 4575 + }, + { + "epoch": 2.784301794949802, + "grad_norm": 0.49028316140174866, + "learning_rate": 1.1364715003950102e-05, + "loss": 0.0431, + "step": 4576 + }, + { + "epoch": 2.7849102525098877, + "grad_norm": 0.4681493937969208, + "learning_rate": 1.1354282458278567e-05, + "loss": 0.0318, + "step": 4577 + }, + { + "epoch": 2.7855187100699728, + "grad_norm": 0.44730204343795776, + "learning_rate": 1.134385329617618e-05, + "loss": 0.0361, + "step": 4578 + }, + { + "epoch": 2.786127167630058, + "grad_norm": 0.47637778520584106, + "learning_rate": 1.1333427520228979e-05, + "loss": 0.037, + "step": 4579 + }, + { + "epoch": 2.786735625190143, + "grad_norm": 0.42056208848953247, + "learning_rate": 1.1323005133022095e-05, + "loss": 0.0391, + "step": 4580 + }, + { + "epoch": 2.787344082750228, + "grad_norm": 0.4972124695777893, + "learning_rate": 1.131258613713985e-05, + "loss": 0.0455, + "step": 4581 + }, + { + "epoch": 2.7879525403103136, + "grad_norm": 0.5506184697151184, + "learning_rate": 1.130217053516572e-05, + "loss": 0.0345, + "step": 4582 + }, + { + "epoch": 2.7885609978703987, + "grad_norm": 0.5120692253112793, + "learning_rate": 1.1291758329682358e-05, + "loss": 0.0371, + "step": 4583 + }, + { + "epoch": 2.789169455430484, + "grad_norm": 0.4274442791938782, + "learning_rate": 1.1281349523271534e-05, + "loss": 0.0344, + "step": 4584 + }, + { + "epoch": 2.789777912990569, + "grad_norm": 0.5016206502914429, + "learning_rate": 1.1270944118514203e-05, + "loss": 0.0366, + "step": 4585 + }, + { + "epoch": 2.790386370550654, + "grad_norm": 0.5108194351196289, + "learning_rate": 1.1260542117990478e-05, + "loss": 0.0465, + "step": 4586 + }, + { + "epoch": 2.7909948281107395, + "grad_norm": 0.5079597234725952, + "learning_rate": 1.1250143524279618e-05, + "loss": 0.0422, + "step": 4587 + }, + { + "epoch": 2.7916032856708246, + "grad_norm": 0.42185690999031067, + "learning_rate": 1.1239748339960043e-05, + "loss": 0.0339, + "step": 4588 + }, + { + "epoch": 2.7922117432309097, + "grad_norm": 0.5816838145256042, + "learning_rate": 1.122935656760933e-05, + "loss": 0.065, + "step": 4589 + }, + { + "epoch": 2.792820200790995, + "grad_norm": 0.5395743250846863, + "learning_rate": 1.1218968209804192e-05, + "loss": 0.0482, + "step": 4590 + }, + { + "epoch": 2.79342865835108, + "grad_norm": 0.5038939118385315, + "learning_rate": 1.1208583269120517e-05, + "loss": 0.0371, + "step": 4591 + }, + { + "epoch": 2.7940371159111654, + "grad_norm": 0.43629151582717896, + "learning_rate": 1.1198201748133338e-05, + "loss": 0.0306, + "step": 4592 + }, + { + "epoch": 2.7946455734712505, + "grad_norm": 0.3945944607257843, + "learning_rate": 1.1187823649416836e-05, + "loss": 0.0291, + "step": 4593 + }, + { + "epoch": 2.7952540310313356, + "grad_norm": 0.3942541778087616, + "learning_rate": 1.1177448975544361e-05, + "loss": 0.0294, + "step": 4594 + }, + { + "epoch": 2.7958624885914207, + "grad_norm": 0.4381770193576813, + "learning_rate": 1.1167077729088374e-05, + "loss": 0.0294, + "step": 4595 + }, + { + "epoch": 2.796470946151506, + "grad_norm": 0.7765640616416931, + "learning_rate": 1.1156709912620529e-05, + "loss": 0.0846, + "step": 4596 + }, + { + "epoch": 2.7970794037115914, + "grad_norm": 0.38119301199913025, + "learning_rate": 1.1146345528711608e-05, + "loss": 0.0215, + "step": 4597 + }, + { + "epoch": 2.7976878612716765, + "grad_norm": 0.4524082541465759, + "learning_rate": 1.1135984579931547e-05, + "loss": 0.0351, + "step": 4598 + }, + { + "epoch": 2.7982963188317616, + "grad_norm": 0.4416946768760681, + "learning_rate": 1.1125627068849428e-05, + "loss": 0.037, + "step": 4599 + }, + { + "epoch": 2.7989047763918466, + "grad_norm": 0.5243279933929443, + "learning_rate": 1.1115272998033496e-05, + "loss": 0.037, + "step": 4600 + }, + { + "epoch": 2.7995132339519317, + "grad_norm": 0.5025190711021423, + "learning_rate": 1.1104922370051105e-05, + "loss": 0.0333, + "step": 4601 + }, + { + "epoch": 2.8001216915120173, + "grad_norm": 0.5427147746086121, + "learning_rate": 1.1094575187468792e-05, + "loss": 0.0279, + "step": 4602 + }, + { + "epoch": 2.8007301490721024, + "grad_norm": 0.369625985622406, + "learning_rate": 1.1084231452852226e-05, + "loss": 0.029, + "step": 4603 + }, + { + "epoch": 2.8013386066321875, + "grad_norm": 0.39575597643852234, + "learning_rate": 1.1073891168766229e-05, + "loss": 0.0311, + "step": 4604 + }, + { + "epoch": 2.8019470641922726, + "grad_norm": 0.6142522692680359, + "learning_rate": 1.1063554337774745e-05, + "loss": 0.0341, + "step": 4605 + }, + { + "epoch": 2.8025555217523577, + "grad_norm": 0.4445866048336029, + "learning_rate": 1.1053220962440889e-05, + "loss": 0.0396, + "step": 4606 + }, + { + "epoch": 2.803163979312443, + "grad_norm": 0.5825973749160767, + "learning_rate": 1.10428910453269e-05, + "loss": 0.033, + "step": 4607 + }, + { + "epoch": 2.8037724368725283, + "grad_norm": 0.4755668342113495, + "learning_rate": 1.103256458899417e-05, + "loss": 0.0303, + "step": 4608 + }, + { + "epoch": 2.8043808944326134, + "grad_norm": 0.5290636420249939, + "learning_rate": 1.1022241596003232e-05, + "loss": 0.045, + "step": 4609 + }, + { + "epoch": 2.8049893519926985, + "grad_norm": 0.44851988554000854, + "learning_rate": 1.101192206891377e-05, + "loss": 0.0318, + "step": 4610 + }, + { + "epoch": 2.8055978095527836, + "grad_norm": 0.5043997764587402, + "learning_rate": 1.1001606010284569e-05, + "loss": 0.0462, + "step": 4611 + }, + { + "epoch": 2.806206267112869, + "grad_norm": 0.41543683409690857, + "learning_rate": 1.0991293422673596e-05, + "loss": 0.0359, + "step": 4612 + }, + { + "epoch": 2.806814724672954, + "grad_norm": 0.5744189620018005, + "learning_rate": 1.0980984308637944e-05, + "loss": 0.0454, + "step": 4613 + }, + { + "epoch": 2.8074231822330393, + "grad_norm": 0.4493009150028229, + "learning_rate": 1.0970678670733839e-05, + "loss": 0.031, + "step": 4614 + }, + { + "epoch": 2.8080316397931244, + "grad_norm": 0.466262549161911, + "learning_rate": 1.0960376511516655e-05, + "loss": 0.0362, + "step": 4615 + }, + { + "epoch": 2.8086400973532095, + "grad_norm": 0.5800133943557739, + "learning_rate": 1.0950077833540906e-05, + "loss": 0.0544, + "step": 4616 + }, + { + "epoch": 2.809248554913295, + "grad_norm": 0.32791540026664734, + "learning_rate": 1.0939782639360214e-05, + "loss": 0.0188, + "step": 4617 + }, + { + "epoch": 2.8098570124733797, + "grad_norm": 0.45963072776794434, + "learning_rate": 1.0929490931527369e-05, + "loss": 0.0387, + "step": 4618 + }, + { + "epoch": 2.8104654700334653, + "grad_norm": 0.46646299958229065, + "learning_rate": 1.0919202712594284e-05, + "loss": 0.0344, + "step": 4619 + }, + { + "epoch": 2.8110739275935503, + "grad_norm": 0.39890867471694946, + "learning_rate": 1.0908917985112021e-05, + "loss": 0.0238, + "step": 4620 + }, + { + "epoch": 2.8116823851536354, + "grad_norm": 0.439327210187912, + "learning_rate": 1.0898636751630733e-05, + "loss": 0.0415, + "step": 4621 + }, + { + "epoch": 2.812290842713721, + "grad_norm": 0.3781556785106659, + "learning_rate": 1.0888359014699776e-05, + "loss": 0.0332, + "step": 4622 + }, + { + "epoch": 2.8128993002738056, + "grad_norm": 0.42909082770347595, + "learning_rate": 1.087808477686757e-05, + "loss": 0.0419, + "step": 4623 + }, + { + "epoch": 2.813507757833891, + "grad_norm": 0.4752819538116455, + "learning_rate": 1.0867814040681711e-05, + "loss": 0.0308, + "step": 4624 + }, + { + "epoch": 2.8141162153939763, + "grad_norm": 0.5056931376457214, + "learning_rate": 1.0857546808688912e-05, + "loss": 0.0384, + "step": 4625 + }, + { + "epoch": 2.8147246729540614, + "grad_norm": 0.9717746376991272, + "learning_rate": 1.0847283083435026e-05, + "loss": 0.0731, + "step": 4626 + }, + { + "epoch": 2.815333130514147, + "grad_norm": 0.4587456285953522, + "learning_rate": 1.083702286746501e-05, + "loss": 0.0435, + "step": 4627 + }, + { + "epoch": 2.8159415880742316, + "grad_norm": 0.5184073448181152, + "learning_rate": 1.0826766163322982e-05, + "loss": 0.0337, + "step": 4628 + }, + { + "epoch": 2.816550045634317, + "grad_norm": 0.5532636046409607, + "learning_rate": 1.0816512973552178e-05, + "loss": 0.0415, + "step": 4629 + }, + { + "epoch": 2.817158503194402, + "grad_norm": 0.5456773638725281, + "learning_rate": 1.0806263300694966e-05, + "loss": 0.0372, + "step": 4630 + }, + { + "epoch": 2.8177669607544873, + "grad_norm": 0.5305153727531433, + "learning_rate": 1.0796017147292817e-05, + "loss": 0.0458, + "step": 4631 + }, + { + "epoch": 2.818375418314573, + "grad_norm": 0.4857366681098938, + "learning_rate": 1.0785774515886379e-05, + "loss": 0.0372, + "step": 4632 + }, + { + "epoch": 2.8189838758746575, + "grad_norm": 0.5140689611434937, + "learning_rate": 1.0775535409015374e-05, + "loss": 0.0521, + "step": 4633 + }, + { + "epoch": 2.819592333434743, + "grad_norm": 0.5720868110656738, + "learning_rate": 1.0765299829218683e-05, + "loss": 0.0484, + "step": 4634 + }, + { + "epoch": 2.820200790994828, + "grad_norm": 0.4893862009048462, + "learning_rate": 1.0755067779034302e-05, + "loss": 0.0446, + "step": 4635 + }, + { + "epoch": 2.820809248554913, + "grad_norm": 0.4779762923717499, + "learning_rate": 1.0744839260999368e-05, + "loss": 0.0327, + "step": 4636 + }, + { + "epoch": 2.8214177061149983, + "grad_norm": 0.4866357743740082, + "learning_rate": 1.073461427765009e-05, + "loss": 0.0258, + "step": 4637 + }, + { + "epoch": 2.8220261636750834, + "grad_norm": 0.42479321360588074, + "learning_rate": 1.0724392831521878e-05, + "loss": 0.0353, + "step": 4638 + }, + { + "epoch": 2.822634621235169, + "grad_norm": 0.4593687951564789, + "learning_rate": 1.0714174925149201e-05, + "loss": 0.0423, + "step": 4639 + }, + { + "epoch": 2.823243078795254, + "grad_norm": 0.5180348753929138, + "learning_rate": 1.0703960561065688e-05, + "loss": 0.0448, + "step": 4640 + }, + { + "epoch": 2.823851536355339, + "grad_norm": 0.5316763520240784, + "learning_rate": 1.0693749741804048e-05, + "loss": 0.038, + "step": 4641 + }, + { + "epoch": 2.8244599939154242, + "grad_norm": 0.28423795104026794, + "learning_rate": 1.068354246989618e-05, + "loss": 0.0152, + "step": 4642 + }, + { + "epoch": 2.8250684514755093, + "grad_norm": 0.3734150230884552, + "learning_rate": 1.0673338747873027e-05, + "loss": 0.0258, + "step": 4643 + }, + { + "epoch": 2.825676909035595, + "grad_norm": 0.4261896312236786, + "learning_rate": 1.06631385782647e-05, + "loss": 0.0279, + "step": 4644 + }, + { + "epoch": 2.82628536659568, + "grad_norm": 0.35567089915275574, + "learning_rate": 1.0652941963600418e-05, + "loss": 0.027, + "step": 4645 + }, + { + "epoch": 2.826893824155765, + "grad_norm": 0.39518773555755615, + "learning_rate": 1.0642748906408522e-05, + "loss": 0.0355, + "step": 4646 + }, + { + "epoch": 2.82750228171585, + "grad_norm": 0.5049468874931335, + "learning_rate": 1.0632559409216442e-05, + "loss": 0.0393, + "step": 4647 + }, + { + "epoch": 2.8281107392759353, + "grad_norm": 0.4476812481880188, + "learning_rate": 1.062237347455078e-05, + "loss": 0.0258, + "step": 4648 + }, + { + "epoch": 2.828719196836021, + "grad_norm": 0.41204380989074707, + "learning_rate": 1.0612191104937198e-05, + "loss": 0.0345, + "step": 4649 + }, + { + "epoch": 2.829327654396106, + "grad_norm": 0.5308571457862854, + "learning_rate": 1.0602012302900516e-05, + "loss": 0.0422, + "step": 4650 + }, + { + "epoch": 2.829936111956191, + "grad_norm": 0.43992185592651367, + "learning_rate": 1.059183707096463e-05, + "loss": 0.0342, + "step": 4651 + }, + { + "epoch": 2.830544569516276, + "grad_norm": 0.482450395822525, + "learning_rate": 1.0581665411652605e-05, + "loss": 0.0282, + "step": 4652 + }, + { + "epoch": 2.831153027076361, + "grad_norm": 0.41237378120422363, + "learning_rate": 1.0571497327486563e-05, + "loss": 0.033, + "step": 4653 + }, + { + "epoch": 2.8317614846364467, + "grad_norm": 0.5276917219161987, + "learning_rate": 1.0561332820987774e-05, + "loss": 0.0362, + "step": 4654 + }, + { + "epoch": 2.832369942196532, + "grad_norm": 0.4543791115283966, + "learning_rate": 1.0551171894676615e-05, + "loss": 0.0247, + "step": 4655 + }, + { + "epoch": 2.832978399756617, + "grad_norm": 0.4912313222885132, + "learning_rate": 1.0541014551072576e-05, + "loss": 0.0258, + "step": 4656 + }, + { + "epoch": 2.833586857316702, + "grad_norm": 0.46352067589759827, + "learning_rate": 1.053086079269423e-05, + "loss": 0.0371, + "step": 4657 + }, + { + "epoch": 2.834195314876787, + "grad_norm": 0.5429676175117493, + "learning_rate": 1.052071062205932e-05, + "loss": 0.0499, + "step": 4658 + }, + { + "epoch": 2.8348037724368726, + "grad_norm": 0.4813297688961029, + "learning_rate": 1.0510564041684645e-05, + "loss": 0.0556, + "step": 4659 + }, + { + "epoch": 2.8354122299969577, + "grad_norm": 0.5711151957511902, + "learning_rate": 1.0500421054086135e-05, + "loss": 0.0396, + "step": 4660 + }, + { + "epoch": 2.836020687557043, + "grad_norm": 0.48414355516433716, + "learning_rate": 1.049028166177883e-05, + "loss": 0.0347, + "step": 4661 + }, + { + "epoch": 2.836629145117128, + "grad_norm": 0.5011805295944214, + "learning_rate": 1.0480145867276892e-05, + "loss": 0.0439, + "step": 4662 + }, + { + "epoch": 2.837237602677213, + "grad_norm": 0.49640247225761414, + "learning_rate": 1.0470013673093548e-05, + "loss": 0.0395, + "step": 4663 + }, + { + "epoch": 2.8378460602372986, + "grad_norm": 0.48717397451400757, + "learning_rate": 1.0459885081741175e-05, + "loss": 0.0382, + "step": 4664 + }, + { + "epoch": 2.8384545177973837, + "grad_norm": 0.4565880596637726, + "learning_rate": 1.044976009573124e-05, + "loss": 0.0269, + "step": 4665 + }, + { + "epoch": 2.8390629753574688, + "grad_norm": 0.6080870032310486, + "learning_rate": 1.0439638717574327e-05, + "loss": 0.0385, + "step": 4666 + }, + { + "epoch": 2.839671432917554, + "grad_norm": 0.6103970408439636, + "learning_rate": 1.0429520949780085e-05, + "loss": 0.0523, + "step": 4667 + }, + { + "epoch": 2.840279890477639, + "grad_norm": 0.45440056920051575, + "learning_rate": 1.0419406794857343e-05, + "loss": 0.0385, + "step": 4668 + }, + { + "epoch": 2.8408883480377245, + "grad_norm": 0.3543758690357208, + "learning_rate": 1.0409296255313955e-05, + "loss": 0.0286, + "step": 4669 + }, + { + "epoch": 2.8414968055978096, + "grad_norm": 0.5107575058937073, + "learning_rate": 1.0399189333656925e-05, + "loss": 0.0439, + "step": 4670 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.45144620537757874, + "learning_rate": 1.0389086032392348e-05, + "loss": 0.0318, + "step": 4671 + }, + { + "epoch": 2.84271372071798, + "grad_norm": 0.41293710470199585, + "learning_rate": 1.0378986354025433e-05, + "loss": 0.0224, + "step": 4672 + }, + { + "epoch": 2.843322178278065, + "grad_norm": 0.4346982538700104, + "learning_rate": 1.0368890301060457e-05, + "loss": 0.0418, + "step": 4673 + }, + { + "epoch": 2.8439306358381504, + "grad_norm": 0.47238776087760925, + "learning_rate": 1.0358797876000834e-05, + "loss": 0.0314, + "step": 4674 + }, + { + "epoch": 2.8445390933982355, + "grad_norm": 0.4769528806209564, + "learning_rate": 1.0348709081349062e-05, + "loss": 0.0416, + "step": 4675 + }, + { + "epoch": 2.8451475509583206, + "grad_norm": 0.4458237886428833, + "learning_rate": 1.033862391960675e-05, + "loss": 0.0395, + "step": 4676 + }, + { + "epoch": 2.8457560085184057, + "grad_norm": 0.3304067850112915, + "learning_rate": 1.0328542393274571e-05, + "loss": 0.0189, + "step": 4677 + }, + { + "epoch": 2.846364466078491, + "grad_norm": 0.3914128243923187, + "learning_rate": 1.031846450485236e-05, + "loss": 0.0229, + "step": 4678 + }, + { + "epoch": 2.8469729236385763, + "grad_norm": 0.4684884250164032, + "learning_rate": 1.0308390256838987e-05, + "loss": 0.0423, + "step": 4679 + }, + { + "epoch": 2.8475813811986614, + "grad_norm": 0.5104598999023438, + "learning_rate": 1.0298319651732455e-05, + "loss": 0.0385, + "step": 4680 + }, + { + "epoch": 2.8481898387587465, + "grad_norm": 0.5490165948867798, + "learning_rate": 1.0288252692029851e-05, + "loss": 0.0506, + "step": 4681 + }, + { + "epoch": 2.8487982963188316, + "grad_norm": 0.3741062581539154, + "learning_rate": 1.0278189380227373e-05, + "loss": 0.0251, + "step": 4682 + }, + { + "epoch": 2.8494067538789167, + "grad_norm": 0.5128927230834961, + "learning_rate": 1.0268129718820277e-05, + "loss": 0.0526, + "step": 4683 + }, + { + "epoch": 2.8500152114390023, + "grad_norm": 0.43258655071258545, + "learning_rate": 1.0258073710302973e-05, + "loss": 0.0312, + "step": 4684 + }, + { + "epoch": 2.8506236689990874, + "grad_norm": 0.5292765498161316, + "learning_rate": 1.0248021357168908e-05, + "loss": 0.0539, + "step": 4685 + }, + { + "epoch": 2.8512321265591725, + "grad_norm": 0.5508760809898376, + "learning_rate": 1.023797266191066e-05, + "loss": 0.0453, + "step": 4686 + }, + { + "epoch": 2.8518405841192576, + "grad_norm": 0.45886969566345215, + "learning_rate": 1.0227927627019862e-05, + "loss": 0.0382, + "step": 4687 + }, + { + "epoch": 2.8524490416793427, + "grad_norm": 0.587009847164154, + "learning_rate": 1.0217886254987303e-05, + "loss": 0.0589, + "step": 4688 + }, + { + "epoch": 2.853057499239428, + "grad_norm": 0.5119481682777405, + "learning_rate": 1.0207848548302793e-05, + "loss": 0.0305, + "step": 4689 + }, + { + "epoch": 2.8536659567995133, + "grad_norm": 0.4871687889099121, + "learning_rate": 1.0197814509455275e-05, + "loss": 0.0551, + "step": 4690 + }, + { + "epoch": 2.8542744143595984, + "grad_norm": 0.5183733701705933, + "learning_rate": 1.0187784140932774e-05, + "loss": 0.0435, + "step": 4691 + }, + { + "epoch": 2.8548828719196835, + "grad_norm": 0.46334466338157654, + "learning_rate": 1.0177757445222411e-05, + "loss": 0.0406, + "step": 4692 + }, + { + "epoch": 2.8554913294797686, + "grad_norm": 0.42779427766799927, + "learning_rate": 1.0167734424810363e-05, + "loss": 0.024, + "step": 4693 + }, + { + "epoch": 2.856099787039854, + "grad_norm": 0.5750960111618042, + "learning_rate": 1.0157715082181956e-05, + "loss": 0.0467, + "step": 4694 + }, + { + "epoch": 2.856708244599939, + "grad_norm": 0.6426092982292175, + "learning_rate": 1.0147699419821543e-05, + "loss": 0.0576, + "step": 4695 + }, + { + "epoch": 2.8573167021600243, + "grad_norm": 0.39237314462661743, + "learning_rate": 1.0137687440212598e-05, + "loss": 0.0315, + "step": 4696 + }, + { + "epoch": 2.8579251597201094, + "grad_norm": 0.4717332124710083, + "learning_rate": 1.012767914583768e-05, + "loss": 0.029, + "step": 4697 + }, + { + "epoch": 2.8585336172801945, + "grad_norm": 0.4430489242076874, + "learning_rate": 1.0117674539178428e-05, + "loss": 0.0343, + "step": 4698 + }, + { + "epoch": 2.85914207484028, + "grad_norm": 0.42511722445487976, + "learning_rate": 1.010767362271556e-05, + "loss": 0.0306, + "step": 4699 + }, + { + "epoch": 2.859750532400365, + "grad_norm": 0.5086604952812195, + "learning_rate": 1.009767639892889e-05, + "loss": 0.0396, + "step": 4700 + }, + { + "epoch": 2.8603589899604502, + "grad_norm": 0.4445130527019501, + "learning_rate": 1.0087682870297313e-05, + "loss": 0.0335, + "step": 4701 + }, + { + "epoch": 2.8609674475205353, + "grad_norm": 0.4963395595550537, + "learning_rate": 1.0077693039298808e-05, + "loss": 0.0442, + "step": 4702 + }, + { + "epoch": 2.8615759050806204, + "grad_norm": 0.5074029564857483, + "learning_rate": 1.0067706908410437e-05, + "loss": 0.0469, + "step": 4703 + }, + { + "epoch": 2.862184362640706, + "grad_norm": 0.4995264708995819, + "learning_rate": 1.0057724480108352e-05, + "loss": 0.0369, + "step": 4704 + }, + { + "epoch": 2.862792820200791, + "grad_norm": 0.41249075531959534, + "learning_rate": 1.0047745756867762e-05, + "loss": 0.0245, + "step": 4705 + }, + { + "epoch": 2.863401277760876, + "grad_norm": 0.4585972726345062, + "learning_rate": 1.0037770741162986e-05, + "loss": 0.0362, + "step": 4706 + }, + { + "epoch": 2.8640097353209613, + "grad_norm": 0.4985556900501251, + "learning_rate": 1.0027799435467409e-05, + "loss": 0.0433, + "step": 4707 + }, + { + "epoch": 2.8646181928810464, + "grad_norm": 0.46208369731903076, + "learning_rate": 1.0017831842253509e-05, + "loss": 0.0252, + "step": 4708 + }, + { + "epoch": 2.865226650441132, + "grad_norm": 0.5662164688110352, + "learning_rate": 1.0007867963992814e-05, + "loss": 0.0602, + "step": 4709 + }, + { + "epoch": 2.865835108001217, + "grad_norm": 0.5789265036582947, + "learning_rate": 9.997907803155962e-06, + "loss": 0.051, + "step": 4710 + }, + { + "epoch": 2.866443565561302, + "grad_norm": 0.4753771126270294, + "learning_rate": 9.987951362212658e-06, + "loss": 0.0505, + "step": 4711 + }, + { + "epoch": 2.867052023121387, + "grad_norm": 0.5585101842880249, + "learning_rate": 9.977998643631684e-06, + "loss": 0.0621, + "step": 4712 + }, + { + "epoch": 2.8676604806814723, + "grad_norm": 0.396291583776474, + "learning_rate": 9.968049649880895e-06, + "loss": 0.031, + "step": 4713 + }, + { + "epoch": 2.868268938241558, + "grad_norm": 0.5255879163742065, + "learning_rate": 9.95810438342724e-06, + "loss": 0.0365, + "step": 4714 + }, + { + "epoch": 2.868877395801643, + "grad_norm": 0.5379154086112976, + "learning_rate": 9.948162846736715e-06, + "loss": 0.0466, + "step": 4715 + }, + { + "epoch": 2.869485853361728, + "grad_norm": 0.5413723587989807, + "learning_rate": 9.93822504227441e-06, + "loss": 0.0422, + "step": 4716 + }, + { + "epoch": 2.870094310921813, + "grad_norm": 0.6132776141166687, + "learning_rate": 9.928290972504489e-06, + "loss": 0.044, + "step": 4717 + }, + { + "epoch": 2.870702768481898, + "grad_norm": 0.5400335192680359, + "learning_rate": 9.918360639890187e-06, + "loss": 0.0462, + "step": 4718 + }, + { + "epoch": 2.8713112260419837, + "grad_norm": 0.37978994846343994, + "learning_rate": 9.908434046893822e-06, + "loss": 0.0303, + "step": 4719 + }, + { + "epoch": 2.871919683602069, + "grad_norm": 0.49378716945648193, + "learning_rate": 9.898511195976756e-06, + "loss": 0.0331, + "step": 4720 + }, + { + "epoch": 2.872528141162154, + "grad_norm": 0.44092321395874023, + "learning_rate": 9.888592089599452e-06, + "loss": 0.0358, + "step": 4721 + }, + { + "epoch": 2.873136598722239, + "grad_norm": 0.4932500123977661, + "learning_rate": 9.878676730221437e-06, + "loss": 0.0459, + "step": 4722 + }, + { + "epoch": 2.873745056282324, + "grad_norm": 0.36311423778533936, + "learning_rate": 9.868765120301305e-06, + "loss": 0.0274, + "step": 4723 + }, + { + "epoch": 2.8743535138424097, + "grad_norm": 0.4552887976169586, + "learning_rate": 9.858857262296719e-06, + "loss": 0.0377, + "step": 4724 + }, + { + "epoch": 2.8749619714024948, + "grad_norm": 0.4563083052635193, + "learning_rate": 9.848953158664431e-06, + "loss": 0.0407, + "step": 4725 + }, + { + "epoch": 2.87557042896258, + "grad_norm": 0.5411676168441772, + "learning_rate": 9.839052811860222e-06, + "loss": 0.04, + "step": 4726 + }, + { + "epoch": 2.876178886522665, + "grad_norm": 0.34208500385284424, + "learning_rate": 9.829156224338975e-06, + "loss": 0.0262, + "step": 4727 + }, + { + "epoch": 2.87678734408275, + "grad_norm": 0.4771437644958496, + "learning_rate": 9.819263398554634e-06, + "loss": 0.0411, + "step": 4728 + }, + { + "epoch": 2.8773958016428356, + "grad_norm": 0.43631598353385925, + "learning_rate": 9.809374336960208e-06, + "loss": 0.0348, + "step": 4729 + }, + { + "epoch": 2.8780042592029207, + "grad_norm": 0.457810640335083, + "learning_rate": 9.799489042007767e-06, + "loss": 0.0462, + "step": 4730 + }, + { + "epoch": 2.878612716763006, + "grad_norm": 0.5080879926681519, + "learning_rate": 9.789607516148467e-06, + "loss": 0.0379, + "step": 4731 + }, + { + "epoch": 2.879221174323091, + "grad_norm": 0.5162711143493652, + "learning_rate": 9.77972976183249e-06, + "loss": 0.0444, + "step": 4732 + }, + { + "epoch": 2.879829631883176, + "grad_norm": 0.4640320837497711, + "learning_rate": 9.769855781509121e-06, + "loss": 0.0301, + "step": 4733 + }, + { + "epoch": 2.8804380894432615, + "grad_norm": 0.457856148481369, + "learning_rate": 9.759985577626695e-06, + "loss": 0.0409, + "step": 4734 + }, + { + "epoch": 2.8810465470033466, + "grad_norm": 0.43068334460258484, + "learning_rate": 9.75011915263262e-06, + "loss": 0.0367, + "step": 4735 + }, + { + "epoch": 2.8816550045634317, + "grad_norm": 0.5189123153686523, + "learning_rate": 9.74025650897333e-06, + "loss": 0.0427, + "step": 4736 + }, + { + "epoch": 2.882263462123517, + "grad_norm": 0.4327934682369232, + "learning_rate": 9.730397649094386e-06, + "loss": 0.0368, + "step": 4737 + }, + { + "epoch": 2.882871919683602, + "grad_norm": 0.3771323263645172, + "learning_rate": 9.72054257544035e-06, + "loss": 0.0311, + "step": 4738 + }, + { + "epoch": 2.8834803772436874, + "grad_norm": 0.5253776907920837, + "learning_rate": 9.710691290454874e-06, + "loss": 0.0475, + "step": 4739 + }, + { + "epoch": 2.8840888348037725, + "grad_norm": 0.47808972001075745, + "learning_rate": 9.70084379658067e-06, + "loss": 0.0314, + "step": 4740 + }, + { + "epoch": 2.8846972923638576, + "grad_norm": 0.4568268656730652, + "learning_rate": 9.691000096259512e-06, + "loss": 0.0294, + "step": 4741 + }, + { + "epoch": 2.8853057499239427, + "grad_norm": 0.417096346616745, + "learning_rate": 9.68116019193221e-06, + "loss": 0.0287, + "step": 4742 + }, + { + "epoch": 2.885914207484028, + "grad_norm": 0.4551639258861542, + "learning_rate": 9.671324086038658e-06, + "loss": 0.0252, + "step": 4743 + }, + { + "epoch": 2.8865226650441134, + "grad_norm": 0.4030291736125946, + "learning_rate": 9.661491781017806e-06, + "loss": 0.039, + "step": 4744 + }, + { + "epoch": 2.8871311226041985, + "grad_norm": 0.49422821402549744, + "learning_rate": 9.651663279307657e-06, + "loss": 0.0379, + "step": 4745 + }, + { + "epoch": 2.8877395801642836, + "grad_norm": 0.4428713321685791, + "learning_rate": 9.64183858334525e-06, + "loss": 0.0335, + "step": 4746 + }, + { + "epoch": 2.8883480377243687, + "grad_norm": 0.4136061370372772, + "learning_rate": 9.632017695566731e-06, + "loss": 0.0354, + "step": 4747 + }, + { + "epoch": 2.8889564952844538, + "grad_norm": 0.44465282559394836, + "learning_rate": 9.622200618407246e-06, + "loss": 0.0353, + "step": 4748 + }, + { + "epoch": 2.8895649528445393, + "grad_norm": 0.42466622591018677, + "learning_rate": 9.612387354301028e-06, + "loss": 0.0322, + "step": 4749 + }, + { + "epoch": 2.8901734104046244, + "grad_norm": 0.5076179504394531, + "learning_rate": 9.602577905681358e-06, + "loss": 0.0531, + "step": 4750 + }, + { + "epoch": 2.8907818679647095, + "grad_norm": 0.42435580492019653, + "learning_rate": 9.592772274980582e-06, + "loss": 0.0338, + "step": 4751 + }, + { + "epoch": 2.8913903255247946, + "grad_norm": 0.43116191029548645, + "learning_rate": 9.58297046463006e-06, + "loss": 0.0269, + "step": 4752 + }, + { + "epoch": 2.8919987830848797, + "grad_norm": 0.4200409948825836, + "learning_rate": 9.573172477060266e-06, + "loss": 0.0255, + "step": 4753 + }, + { + "epoch": 2.892607240644965, + "grad_norm": 0.44302016496658325, + "learning_rate": 9.563378314700665e-06, + "loss": 0.036, + "step": 4754 + }, + { + "epoch": 2.8932156982050503, + "grad_norm": 0.4883388876914978, + "learning_rate": 9.553587979979826e-06, + "loss": 0.0334, + "step": 4755 + }, + { + "epoch": 2.8938241557651354, + "grad_norm": 0.5191654562950134, + "learning_rate": 9.543801475325306e-06, + "loss": 0.0383, + "step": 4756 + }, + { + "epoch": 2.8944326133252205, + "grad_norm": 0.47728797793388367, + "learning_rate": 9.534018803163794e-06, + "loss": 0.031, + "step": 4757 + }, + { + "epoch": 2.8950410708853056, + "grad_norm": 0.42437100410461426, + "learning_rate": 9.524239965920958e-06, + "loss": 0.0288, + "step": 4758 + }, + { + "epoch": 2.895649528445391, + "grad_norm": 0.46275144815444946, + "learning_rate": 9.514464966021547e-06, + "loss": 0.0391, + "step": 4759 + }, + { + "epoch": 2.8962579860054762, + "grad_norm": 0.5176787376403809, + "learning_rate": 9.504693805889355e-06, + "loss": 0.0275, + "step": 4760 + }, + { + "epoch": 2.8968664435655613, + "grad_norm": 0.39485782384872437, + "learning_rate": 9.494926487947233e-06, + "loss": 0.0326, + "step": 4761 + }, + { + "epoch": 2.8974749011256464, + "grad_norm": 1.0066802501678467, + "learning_rate": 9.485163014617043e-06, + "loss": 0.0387, + "step": 4762 + }, + { + "epoch": 2.8980833586857315, + "grad_norm": 0.38768336176872253, + "learning_rate": 9.475403388319752e-06, + "loss": 0.029, + "step": 4763 + }, + { + "epoch": 2.898691816245817, + "grad_norm": 0.3965464234352112, + "learning_rate": 9.465647611475312e-06, + "loss": 0.0288, + "step": 4764 + }, + { + "epoch": 2.899300273805902, + "grad_norm": 0.5666676163673401, + "learning_rate": 9.455895686502762e-06, + "loss": 0.0437, + "step": 4765 + }, + { + "epoch": 2.8999087313659873, + "grad_norm": 0.49036771059036255, + "learning_rate": 9.446147615820169e-06, + "loss": 0.0332, + "step": 4766 + }, + { + "epoch": 2.9005171889260724, + "grad_norm": 0.49707627296447754, + "learning_rate": 9.436403401844663e-06, + "loss": 0.0317, + "step": 4767 + }, + { + "epoch": 2.9011256464861574, + "grad_norm": 0.5176650881767273, + "learning_rate": 9.426663046992381e-06, + "loss": 0.0347, + "step": 4768 + }, + { + "epoch": 2.901734104046243, + "grad_norm": 0.536285936832428, + "learning_rate": 9.416926553678535e-06, + "loss": 0.0404, + "step": 4769 + }, + { + "epoch": 2.902342561606328, + "grad_norm": 0.5301802158355713, + "learning_rate": 9.40719392431737e-06, + "loss": 0.0418, + "step": 4770 + }, + { + "epoch": 2.902951019166413, + "grad_norm": 0.5603205561637878, + "learning_rate": 9.397465161322177e-06, + "loss": 0.0421, + "step": 4771 + }, + { + "epoch": 2.9035594767264983, + "grad_norm": 0.4085746109485626, + "learning_rate": 9.387740267105263e-06, + "loss": 0.0362, + "step": 4772 + }, + { + "epoch": 2.9041679342865834, + "grad_norm": 0.4638700485229492, + "learning_rate": 9.378019244078028e-06, + "loss": 0.0435, + "step": 4773 + }, + { + "epoch": 2.904776391846669, + "grad_norm": 0.5153172612190247, + "learning_rate": 9.368302094650858e-06, + "loss": 0.0505, + "step": 4774 + }, + { + "epoch": 2.905384849406754, + "grad_norm": 0.3367726802825928, + "learning_rate": 9.358588821233207e-06, + "loss": 0.0292, + "step": 4775 + }, + { + "epoch": 2.905993306966839, + "grad_norm": 0.4512867033481598, + "learning_rate": 9.348879426233561e-06, + "loss": 0.0333, + "step": 4776 + }, + { + "epoch": 2.906601764526924, + "grad_norm": 0.4421890676021576, + "learning_rate": 9.339173912059455e-06, + "loss": 0.0347, + "step": 4777 + }, + { + "epoch": 2.9072102220870093, + "grad_norm": 0.379062294960022, + "learning_rate": 9.329472281117434e-06, + "loss": 0.0256, + "step": 4778 + }, + { + "epoch": 2.907818679647095, + "grad_norm": 0.4036319851875305, + "learning_rate": 9.319774535813109e-06, + "loss": 0.0392, + "step": 4779 + }, + { + "epoch": 2.90842713720718, + "grad_norm": 0.47172126173973083, + "learning_rate": 9.310080678551114e-06, + "loss": 0.0408, + "step": 4780 + }, + { + "epoch": 2.909035594767265, + "grad_norm": 0.494945228099823, + "learning_rate": 9.300390711735132e-06, + "loss": 0.0257, + "step": 4781 + }, + { + "epoch": 2.90964405232735, + "grad_norm": 0.4051055610179901, + "learning_rate": 9.290704637767843e-06, + "loss": 0.0298, + "step": 4782 + }, + { + "epoch": 2.9102525098874352, + "grad_norm": 0.5722257494926453, + "learning_rate": 9.281022459051029e-06, + "loss": 0.0371, + "step": 4783 + }, + { + "epoch": 2.9108609674475208, + "grad_norm": 0.4758877754211426, + "learning_rate": 9.271344177985433e-06, + "loss": 0.045, + "step": 4784 + }, + { + "epoch": 2.911469425007606, + "grad_norm": 0.5374777913093567, + "learning_rate": 9.261669796970881e-06, + "loss": 0.0507, + "step": 4785 + }, + { + "epoch": 2.912077882567691, + "grad_norm": 0.35278376936912537, + "learning_rate": 9.251999318406213e-06, + "loss": 0.0239, + "step": 4786 + }, + { + "epoch": 2.912686340127776, + "grad_norm": 0.40819981694221497, + "learning_rate": 9.242332744689314e-06, + "loss": 0.026, + "step": 4787 + }, + { + "epoch": 2.913294797687861, + "grad_norm": 0.5325673222541809, + "learning_rate": 9.232670078217062e-06, + "loss": 0.0409, + "step": 4788 + }, + { + "epoch": 2.9139032552479467, + "grad_norm": 0.44101467728614807, + "learning_rate": 9.223011321385435e-06, + "loss": 0.0378, + "step": 4789 + }, + { + "epoch": 2.914511712808032, + "grad_norm": 0.47359776496887207, + "learning_rate": 9.213356476589375e-06, + "loss": 0.0324, + "step": 4790 + }, + { + "epoch": 2.915120170368117, + "grad_norm": 0.3451775908470154, + "learning_rate": 9.2037055462229e-06, + "loss": 0.0211, + "step": 4791 + }, + { + "epoch": 2.915728627928202, + "grad_norm": 0.3723897635936737, + "learning_rate": 9.194058532679006e-06, + "loss": 0.0358, + "step": 4792 + }, + { + "epoch": 2.916337085488287, + "grad_norm": 0.5562166571617126, + "learning_rate": 9.184415438349788e-06, + "loss": 0.0396, + "step": 4793 + }, + { + "epoch": 2.9169455430483726, + "grad_norm": 0.41815292835235596, + "learning_rate": 9.174776265626312e-06, + "loss": 0.0194, + "step": 4794 + }, + { + "epoch": 2.9175540006084577, + "grad_norm": 0.5843579173088074, + "learning_rate": 9.16514101689869e-06, + "loss": 0.0494, + "step": 4795 + }, + { + "epoch": 2.918162458168543, + "grad_norm": 0.42992332577705383, + "learning_rate": 9.15550969455607e-06, + "loss": 0.033, + "step": 4796 + }, + { + "epoch": 2.918770915728628, + "grad_norm": 0.4272845387458801, + "learning_rate": 9.14588230098662e-06, + "loss": 0.0286, + "step": 4797 + }, + { + "epoch": 2.919379373288713, + "grad_norm": 0.4098367691040039, + "learning_rate": 9.136258838577511e-06, + "loss": 0.0226, + "step": 4798 + }, + { + "epoch": 2.9199878308487985, + "grad_norm": 0.47262877225875854, + "learning_rate": 9.126639309714997e-06, + "loss": 0.0416, + "step": 4799 + }, + { + "epoch": 2.9205962884088836, + "grad_norm": 0.5105498433113098, + "learning_rate": 9.117023716784287e-06, + "loss": 0.043, + "step": 4800 + }, + { + "epoch": 2.9212047459689687, + "grad_norm": 0.4782066345214844, + "learning_rate": 9.10741206216967e-06, + "loss": 0.0292, + "step": 4801 + }, + { + "epoch": 2.921813203529054, + "grad_norm": 0.5072314143180847, + "learning_rate": 9.09780434825441e-06, + "loss": 0.0364, + "step": 4802 + }, + { + "epoch": 2.922421661089139, + "grad_norm": 0.46522530913352966, + "learning_rate": 9.08820057742085e-06, + "loss": 0.0315, + "step": 4803 + }, + { + "epoch": 2.9230301186492245, + "grad_norm": 0.5069283246994019, + "learning_rate": 9.078600752050304e-06, + "loss": 0.0451, + "step": 4804 + }, + { + "epoch": 2.9236385762093096, + "grad_norm": 0.3636358678340912, + "learning_rate": 9.069004874523136e-06, + "loss": 0.0215, + "step": 4805 + }, + { + "epoch": 2.9242470337693947, + "grad_norm": 0.43121346831321716, + "learning_rate": 9.059412947218718e-06, + "loss": 0.0287, + "step": 4806 + }, + { + "epoch": 2.9248554913294798, + "grad_norm": 0.35692986845970154, + "learning_rate": 9.049824972515464e-06, + "loss": 0.0224, + "step": 4807 + }, + { + "epoch": 2.925463948889565, + "grad_norm": 0.4458438456058502, + "learning_rate": 9.040240952790765e-06, + "loss": 0.0279, + "step": 4808 + }, + { + "epoch": 2.9260724064496504, + "grad_norm": 0.4637523591518402, + "learning_rate": 9.030660890421089e-06, + "loss": 0.0411, + "step": 4809 + }, + { + "epoch": 2.9266808640097355, + "grad_norm": 0.4921475648880005, + "learning_rate": 9.02108478778187e-06, + "loss": 0.0346, + "step": 4810 + }, + { + "epoch": 2.9272893215698206, + "grad_norm": 0.4898071587085724, + "learning_rate": 9.011512647247588e-06, + "loss": 0.0455, + "step": 4811 + }, + { + "epoch": 2.9278977791299057, + "grad_norm": 0.3920941650867462, + "learning_rate": 9.00194447119174e-06, + "loss": 0.0236, + "step": 4812 + }, + { + "epoch": 2.9285062366899908, + "grad_norm": 0.4192269444465637, + "learning_rate": 8.992380261986837e-06, + "loss": 0.0303, + "step": 4813 + }, + { + "epoch": 2.9291146942500763, + "grad_norm": 0.44664058089256287, + "learning_rate": 8.98282002200439e-06, + "loss": 0.0363, + "step": 4814 + }, + { + "epoch": 2.929723151810161, + "grad_norm": 0.5392205715179443, + "learning_rate": 8.973263753614949e-06, + "loss": 0.0398, + "step": 4815 + }, + { + "epoch": 2.9303316093702465, + "grad_norm": 0.43230360746383667, + "learning_rate": 8.96371145918807e-06, + "loss": 0.0439, + "step": 4816 + }, + { + "epoch": 2.9309400669303316, + "grad_norm": 0.3646244704723358, + "learning_rate": 8.954163141092333e-06, + "loss": 0.0249, + "step": 4817 + }, + { + "epoch": 2.9315485244904167, + "grad_norm": 0.5149335861206055, + "learning_rate": 8.944618801695295e-06, + "loss": 0.0292, + "step": 4818 + }, + { + "epoch": 2.9321569820505022, + "grad_norm": 0.3785509765148163, + "learning_rate": 8.935078443363592e-06, + "loss": 0.0376, + "step": 4819 + }, + { + "epoch": 2.932765439610587, + "grad_norm": 0.5124600529670715, + "learning_rate": 8.925542068462806e-06, + "loss": 0.0407, + "step": 4820 + }, + { + "epoch": 2.9333738971706724, + "grad_norm": 0.4565979838371277, + "learning_rate": 8.91600967935757e-06, + "loss": 0.0334, + "step": 4821 + }, + { + "epoch": 2.9339823547307575, + "grad_norm": 0.5674571394920349, + "learning_rate": 8.906481278411522e-06, + "loss": 0.0486, + "step": 4822 + }, + { + "epoch": 2.9345908122908426, + "grad_norm": 0.5254626870155334, + "learning_rate": 8.896956867987313e-06, + "loss": 0.0281, + "step": 4823 + }, + { + "epoch": 2.935199269850928, + "grad_norm": 0.380880743265152, + "learning_rate": 8.887436450446584e-06, + "loss": 0.0277, + "step": 4824 + }, + { + "epoch": 2.935807727411013, + "grad_norm": 0.41296547651290894, + "learning_rate": 8.877920028150013e-06, + "loss": 0.028, + "step": 4825 + }, + { + "epoch": 2.9364161849710984, + "grad_norm": 0.5368117094039917, + "learning_rate": 8.868407603457272e-06, + "loss": 0.0386, + "step": 4826 + }, + { + "epoch": 2.9370246425311834, + "grad_norm": 0.5776300430297852, + "learning_rate": 8.858899178727045e-06, + "loss": 0.0416, + "step": 4827 + }, + { + "epoch": 2.9376331000912685, + "grad_norm": 0.38237324357032776, + "learning_rate": 8.84939475631703e-06, + "loss": 0.0324, + "step": 4828 + }, + { + "epoch": 2.938241557651354, + "grad_norm": 0.42977848649024963, + "learning_rate": 8.839894338583935e-06, + "loss": 0.0297, + "step": 4829 + }, + { + "epoch": 2.9388500152114387, + "grad_norm": 0.5375818610191345, + "learning_rate": 8.830397927883446e-06, + "loss": 0.0374, + "step": 4830 + }, + { + "epoch": 2.9394584727715243, + "grad_norm": 0.6935356259346008, + "learning_rate": 8.820905526570289e-06, + "loss": 0.0601, + "step": 4831 + }, + { + "epoch": 2.9400669303316094, + "grad_norm": 0.5644581317901611, + "learning_rate": 8.811417136998184e-06, + "loss": 0.0488, + "step": 4832 + }, + { + "epoch": 2.9406753878916945, + "grad_norm": 0.4846365451812744, + "learning_rate": 8.801932761519857e-06, + "loss": 0.0472, + "step": 4833 + }, + { + "epoch": 2.94128384545178, + "grad_norm": 0.4495841860771179, + "learning_rate": 8.792452402487037e-06, + "loss": 0.033, + "step": 4834 + }, + { + "epoch": 2.9418923030118647, + "grad_norm": 0.42976799607276917, + "learning_rate": 8.782976062250464e-06, + "loss": 0.0334, + "step": 4835 + }, + { + "epoch": 2.94250076057195, + "grad_norm": 0.5387213230133057, + "learning_rate": 8.77350374315986e-06, + "loss": 0.0398, + "step": 4836 + }, + { + "epoch": 2.9431092181320353, + "grad_norm": 0.46354934573173523, + "learning_rate": 8.764035447563976e-06, + "loss": 0.0363, + "step": 4837 + }, + { + "epoch": 2.9437176756921204, + "grad_norm": 0.471663236618042, + "learning_rate": 8.75457117781055e-06, + "loss": 0.0491, + "step": 4838 + }, + { + "epoch": 2.944326133252206, + "grad_norm": 0.584274172782898, + "learning_rate": 8.745110936246331e-06, + "loss": 0.0437, + "step": 4839 + }, + { + "epoch": 2.9449345908122906, + "grad_norm": 0.5374387502670288, + "learning_rate": 8.73565472521707e-06, + "loss": 0.0475, + "step": 4840 + }, + { + "epoch": 2.945543048372376, + "grad_norm": 0.46591848134994507, + "learning_rate": 8.726202547067496e-06, + "loss": 0.0427, + "step": 4841 + }, + { + "epoch": 2.9461515059324612, + "grad_norm": 0.3446124792098999, + "learning_rate": 8.716754404141368e-06, + "loss": 0.0233, + "step": 4842 + }, + { + "epoch": 2.9467599634925463, + "grad_norm": 0.43566611409187317, + "learning_rate": 8.707310298781427e-06, + "loss": 0.0277, + "step": 4843 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.38527342677116394, + "learning_rate": 8.69787023332942e-06, + "loss": 0.0224, + "step": 4844 + }, + { + "epoch": 2.9479768786127165, + "grad_norm": 0.36706897616386414, + "learning_rate": 8.688434210126099e-06, + "loss": 0.0239, + "step": 4845 + }, + { + "epoch": 2.948585336172802, + "grad_norm": 0.43865934014320374, + "learning_rate": 8.679002231511182e-06, + "loss": 0.0322, + "step": 4846 + }, + { + "epoch": 2.949193793732887, + "grad_norm": 0.5478273034095764, + "learning_rate": 8.669574299823422e-06, + "loss": 0.0374, + "step": 4847 + }, + { + "epoch": 2.9498022512929722, + "grad_norm": 0.4504847824573517, + "learning_rate": 8.660150417400547e-06, + "loss": 0.0363, + "step": 4848 + }, + { + "epoch": 2.9504107088530573, + "grad_norm": 0.4088916778564453, + "learning_rate": 8.650730586579292e-06, + "loss": 0.036, + "step": 4849 + }, + { + "epoch": 2.9510191664131424, + "grad_norm": 0.4998932480812073, + "learning_rate": 8.641314809695389e-06, + "loss": 0.0482, + "step": 4850 + }, + { + "epoch": 2.951627623973228, + "grad_norm": 0.506369411945343, + "learning_rate": 8.63190308908354e-06, + "loss": 0.0313, + "step": 4851 + }, + { + "epoch": 2.952236081533313, + "grad_norm": 0.4469977915287018, + "learning_rate": 8.622495427077468e-06, + "loss": 0.0239, + "step": 4852 + }, + { + "epoch": 2.952844539093398, + "grad_norm": 0.4590071439743042, + "learning_rate": 8.613091826009884e-06, + "loss": 0.0423, + "step": 4853 + }, + { + "epoch": 2.9534529966534833, + "grad_norm": 0.614018440246582, + "learning_rate": 8.603692288212486e-06, + "loss": 0.0576, + "step": 4854 + }, + { + "epoch": 2.9540614542135684, + "grad_norm": 0.45783132314682007, + "learning_rate": 8.59429681601597e-06, + "loss": 0.0325, + "step": 4855 + }, + { + "epoch": 2.954669911773654, + "grad_norm": 0.4062897562980652, + "learning_rate": 8.58490541175003e-06, + "loss": 0.0286, + "step": 4856 + }, + { + "epoch": 2.955278369333739, + "grad_norm": 0.561907947063446, + "learning_rate": 8.575518077743322e-06, + "loss": 0.0356, + "step": 4857 + }, + { + "epoch": 2.955886826893824, + "grad_norm": 0.42090898752212524, + "learning_rate": 8.566134816323526e-06, + "loss": 0.0326, + "step": 4858 + }, + { + "epoch": 2.956495284453909, + "grad_norm": 0.45067688822746277, + "learning_rate": 8.556755629817295e-06, + "loss": 0.0378, + "step": 4859 + }, + { + "epoch": 2.9571037420139943, + "grad_norm": 0.4070775806903839, + "learning_rate": 8.54738052055029e-06, + "loss": 0.0318, + "step": 4860 + }, + { + "epoch": 2.95771219957408, + "grad_norm": 0.40776872634887695, + "learning_rate": 8.538009490847124e-06, + "loss": 0.0308, + "step": 4861 + }, + { + "epoch": 2.958320657134165, + "grad_norm": 0.4585787355899811, + "learning_rate": 8.528642543031448e-06, + "loss": 0.0288, + "step": 4862 + }, + { + "epoch": 2.95892911469425, + "grad_norm": 0.4308152496814728, + "learning_rate": 8.519279679425851e-06, + "loss": 0.0528, + "step": 4863 + }, + { + "epoch": 2.959537572254335, + "grad_norm": 0.3569679856300354, + "learning_rate": 8.509920902351948e-06, + "loss": 0.0242, + "step": 4864 + }, + { + "epoch": 2.96014602981442, + "grad_norm": 0.38262954354286194, + "learning_rate": 8.500566214130317e-06, + "loss": 0.0322, + "step": 4865 + }, + { + "epoch": 2.9607544873745058, + "grad_norm": 0.4387529492378235, + "learning_rate": 8.491215617080545e-06, + "loss": 0.0275, + "step": 4866 + }, + { + "epoch": 2.961362944934591, + "grad_norm": 0.41847920417785645, + "learning_rate": 8.481869113521163e-06, + "loss": 0.0294, + "step": 4867 + }, + { + "epoch": 2.961971402494676, + "grad_norm": 0.41633474826812744, + "learning_rate": 8.472526705769746e-06, + "loss": 0.0315, + "step": 4868 + }, + { + "epoch": 2.962579860054761, + "grad_norm": 0.5311760902404785, + "learning_rate": 8.463188396142799e-06, + "loss": 0.0422, + "step": 4869 + }, + { + "epoch": 2.963188317614846, + "grad_norm": 0.4289592504501343, + "learning_rate": 8.453854186955851e-06, + "loss": 0.0289, + "step": 4870 + }, + { + "epoch": 2.9637967751749317, + "grad_norm": 0.6336865425109863, + "learning_rate": 8.444524080523373e-06, + "loss": 0.0491, + "step": 4871 + }, + { + "epoch": 2.9644052327350168, + "grad_norm": 0.3597281873226166, + "learning_rate": 8.435198079158867e-06, + "loss": 0.0251, + "step": 4872 + }, + { + "epoch": 2.965013690295102, + "grad_norm": 0.5553586483001709, + "learning_rate": 8.42587618517478e-06, + "loss": 0.0345, + "step": 4873 + }, + { + "epoch": 2.965622147855187, + "grad_norm": 0.5425621271133423, + "learning_rate": 8.416558400882552e-06, + "loss": 0.0432, + "step": 4874 + }, + { + "epoch": 2.966230605415272, + "grad_norm": 0.3684383034706116, + "learning_rate": 8.407244728592612e-06, + "loss": 0.0214, + "step": 4875 + }, + { + "epoch": 2.9668390629753576, + "grad_norm": 0.42099565267562866, + "learning_rate": 8.397935170614366e-06, + "loss": 0.0292, + "step": 4876 + }, + { + "epoch": 2.9674475205354427, + "grad_norm": 0.48442554473876953, + "learning_rate": 8.38862972925617e-06, + "loss": 0.0449, + "step": 4877 + }, + { + "epoch": 2.968055978095528, + "grad_norm": 0.491372287273407, + "learning_rate": 8.379328406825426e-06, + "loss": 0.0444, + "step": 4878 + }, + { + "epoch": 2.968664435655613, + "grad_norm": 0.4108073115348816, + "learning_rate": 8.370031205628442e-06, + "loss": 0.0385, + "step": 4879 + }, + { + "epoch": 2.969272893215698, + "grad_norm": 0.4929574429988861, + "learning_rate": 8.36073812797055e-06, + "loss": 0.0421, + "step": 4880 + }, + { + "epoch": 2.9698813507757835, + "grad_norm": 0.4480864405632019, + "learning_rate": 8.351449176156043e-06, + "loss": 0.0321, + "step": 4881 + }, + { + "epoch": 2.9704898083358686, + "grad_norm": 0.39888888597488403, + "learning_rate": 8.342164352488202e-06, + "loss": 0.0195, + "step": 4882 + }, + { + "epoch": 2.9710982658959537, + "grad_norm": 0.475404292345047, + "learning_rate": 8.332883659269261e-06, + "loss": 0.031, + "step": 4883 + }, + { + "epoch": 2.971706723456039, + "grad_norm": 0.509152352809906, + "learning_rate": 8.323607098800454e-06, + "loss": 0.0327, + "step": 4884 + }, + { + "epoch": 2.972315181016124, + "grad_norm": 0.4782547652721405, + "learning_rate": 8.314334673381976e-06, + "loss": 0.0375, + "step": 4885 + }, + { + "epoch": 2.9729236385762094, + "grad_norm": 0.3744230568408966, + "learning_rate": 8.305066385313021e-06, + "loss": 0.0262, + "step": 4886 + }, + { + "epoch": 2.9735320961362945, + "grad_norm": 0.5393217206001282, + "learning_rate": 8.2958022368917e-06, + "loss": 0.0503, + "step": 4887 + }, + { + "epoch": 2.9741405536963796, + "grad_norm": 0.44529324769973755, + "learning_rate": 8.286542230415182e-06, + "loss": 0.0344, + "step": 4888 + }, + { + "epoch": 2.9747490112564647, + "grad_norm": 0.46004465222358704, + "learning_rate": 8.277286368179526e-06, + "loss": 0.0374, + "step": 4889 + }, + { + "epoch": 2.97535746881655, + "grad_norm": 0.46514490246772766, + "learning_rate": 8.268034652479817e-06, + "loss": 0.0374, + "step": 4890 + }, + { + "epoch": 2.9759659263766354, + "grad_norm": 0.4334418475627899, + "learning_rate": 8.25878708561009e-06, + "loss": 0.031, + "step": 4891 + }, + { + "epoch": 2.9765743839367205, + "grad_norm": 0.3778439164161682, + "learning_rate": 8.249543669863366e-06, + "loss": 0.0354, + "step": 4892 + }, + { + "epoch": 2.9771828414968056, + "grad_norm": 0.4430623948574066, + "learning_rate": 8.240304407531602e-06, + "loss": 0.0393, + "step": 4893 + }, + { + "epoch": 2.9777912990568907, + "grad_norm": 0.3321574628353119, + "learning_rate": 8.231069300905783e-06, + "loss": 0.0243, + "step": 4894 + }, + { + "epoch": 2.9783997566169758, + "grad_norm": 0.6121447086334229, + "learning_rate": 8.221838352275807e-06, + "loss": 0.0376, + "step": 4895 + }, + { + "epoch": 2.9790082141770613, + "grad_norm": 0.48445889353752136, + "learning_rate": 8.212611563930577e-06, + "loss": 0.0425, + "step": 4896 + }, + { + "epoch": 2.9796166717371464, + "grad_norm": 0.43182381987571716, + "learning_rate": 8.20338893815793e-06, + "loss": 0.0318, + "step": 4897 + }, + { + "epoch": 2.9802251292972315, + "grad_norm": 0.4070221185684204, + "learning_rate": 8.194170477244729e-06, + "loss": 0.0291, + "step": 4898 + }, + { + "epoch": 2.9808335868573166, + "grad_norm": 0.4428864121437073, + "learning_rate": 8.184956183476735e-06, + "loss": 0.0336, + "step": 4899 + }, + { + "epoch": 2.9814420444174017, + "grad_norm": 0.4261319637298584, + "learning_rate": 8.175746059138725e-06, + "loss": 0.03, + "step": 4900 + }, + { + "epoch": 2.9820505019774872, + "grad_norm": 0.45288318395614624, + "learning_rate": 8.166540106514422e-06, + "loss": 0.0374, + "step": 4901 + }, + { + "epoch": 2.9826589595375723, + "grad_norm": 0.4366372227668762, + "learning_rate": 8.15733832788653e-06, + "loss": 0.037, + "step": 4902 + }, + { + "epoch": 2.9832674170976574, + "grad_norm": 0.5225448608398438, + "learning_rate": 8.148140725536676e-06, + "loss": 0.0371, + "step": 4903 + }, + { + "epoch": 2.9838758746577425, + "grad_norm": 0.47045180201530457, + "learning_rate": 8.138947301745519e-06, + "loss": 0.0357, + "step": 4904 + }, + { + "epoch": 2.9844843322178276, + "grad_norm": 0.5777865648269653, + "learning_rate": 8.129758058792623e-06, + "loss": 0.0439, + "step": 4905 + }, + { + "epoch": 2.985092789777913, + "grad_norm": 0.5593705773353577, + "learning_rate": 8.120572998956546e-06, + "loss": 0.0355, + "step": 4906 + }, + { + "epoch": 2.9857012473379982, + "grad_norm": 0.41948768496513367, + "learning_rate": 8.111392124514783e-06, + "loss": 0.0446, + "step": 4907 + }, + { + "epoch": 2.9863097048980833, + "grad_norm": 0.44301876425743103, + "learning_rate": 8.102215437743835e-06, + "loss": 0.0342, + "step": 4908 + }, + { + "epoch": 2.9869181624581684, + "grad_norm": 0.4251943826675415, + "learning_rate": 8.093042940919118e-06, + "loss": 0.0345, + "step": 4909 + }, + { + "epoch": 2.9875266200182535, + "grad_norm": 0.28501009941101074, + "learning_rate": 8.083874636315034e-06, + "loss": 0.0155, + "step": 4910 + }, + { + "epoch": 2.988135077578339, + "grad_norm": 0.4958946108818054, + "learning_rate": 8.07471052620494e-06, + "loss": 0.0373, + "step": 4911 + }, + { + "epoch": 2.988743535138424, + "grad_norm": 0.48140478134155273, + "learning_rate": 8.065550612861164e-06, + "loss": 0.0397, + "step": 4912 + }, + { + "epoch": 2.9893519926985093, + "grad_norm": 0.556651771068573, + "learning_rate": 8.05639489855496e-06, + "loss": 0.0472, + "step": 4913 + }, + { + "epoch": 2.9899604502585944, + "grad_norm": 0.4993399381637573, + "learning_rate": 8.047243385556588e-06, + "loss": 0.0437, + "step": 4914 + }, + { + "epoch": 2.9905689078186795, + "grad_norm": 0.33873289823532104, + "learning_rate": 8.038096076135227e-06, + "loss": 0.0172, + "step": 4915 + }, + { + "epoch": 2.991177365378765, + "grad_norm": 0.624120831489563, + "learning_rate": 8.028952972559028e-06, + "loss": 0.0467, + "step": 4916 + }, + { + "epoch": 2.99178582293885, + "grad_norm": 0.4880430996417999, + "learning_rate": 8.019814077095107e-06, + "loss": 0.0364, + "step": 4917 + }, + { + "epoch": 2.992394280498935, + "grad_norm": 0.40606868267059326, + "learning_rate": 8.010679392009532e-06, + "loss": 0.0334, + "step": 4918 + }, + { + "epoch": 2.9930027380590203, + "grad_norm": 0.6444923281669617, + "learning_rate": 8.001548919567312e-06, + "loss": 0.0422, + "step": 4919 + }, + { + "epoch": 2.9936111956191054, + "grad_norm": 0.45829200744628906, + "learning_rate": 7.992422662032429e-06, + "loss": 0.0284, + "step": 4920 + }, + { + "epoch": 2.994219653179191, + "grad_norm": 0.38635167479515076, + "learning_rate": 7.983300621667814e-06, + "loss": 0.0256, + "step": 4921 + }, + { + "epoch": 2.994828110739276, + "grad_norm": 0.47939708828926086, + "learning_rate": 7.974182800735361e-06, + "loss": 0.041, + "step": 4922 + }, + { + "epoch": 2.995436568299361, + "grad_norm": 0.343191534280777, + "learning_rate": 7.965069201495887e-06, + "loss": 0.0212, + "step": 4923 + }, + { + "epoch": 2.996045025859446, + "grad_norm": 0.40407371520996094, + "learning_rate": 7.955959826209217e-06, + "loss": 0.0233, + "step": 4924 + }, + { + "epoch": 2.9966534834195313, + "grad_norm": 0.5077594518661499, + "learning_rate": 7.946854677134072e-06, + "loss": 0.0448, + "step": 4925 + }, + { + "epoch": 2.997261940979617, + "grad_norm": 0.41432538628578186, + "learning_rate": 7.937753756528155e-06, + "loss": 0.0365, + "step": 4926 + }, + { + "epoch": 2.997870398539702, + "grad_norm": 0.3649181127548218, + "learning_rate": 7.928657066648118e-06, + "loss": 0.0347, + "step": 4927 + }, + { + "epoch": 2.998478856099787, + "grad_norm": 0.4499366283416748, + "learning_rate": 7.919564609749568e-06, + "loss": 0.0343, + "step": 4928 + }, + { + "epoch": 2.999087313659872, + "grad_norm": 0.4373056888580322, + "learning_rate": 7.910476388087038e-06, + "loss": 0.035, + "step": 4929 + }, + { + "epoch": 2.9996957712199572, + "grad_norm": 0.3516384959220886, + "learning_rate": 7.90139240391404e-06, + "loss": 0.0229, + "step": 4930 + }, + { + "epoch": 2.9996957712199572, + "eval_loss": 1.429209589958191, + "eval_runtime": 105.3655, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.456, + "step": 4930 + }, + { + "epoch": 3.0003042287800428, + "grad_norm": 0.4027365744113922, + "learning_rate": 7.892312659483017e-06, + "loss": 0.0212, + "step": 4931 + }, + { + "epoch": 3.000912686340128, + "grad_norm": 0.18064260482788086, + "learning_rate": 7.88323715704538e-06, + "loss": 0.0077, + "step": 4932 + }, + { + "epoch": 3.001521143900213, + "grad_norm": 0.20326943695545197, + "learning_rate": 7.874165898851454e-06, + "loss": 0.0088, + "step": 4933 + }, + { + "epoch": 3.002129601460298, + "grad_norm": 0.21207283437252045, + "learning_rate": 7.865098887150557e-06, + "loss": 0.0098, + "step": 4934 + }, + { + "epoch": 3.002738059020383, + "grad_norm": 0.25013598799705505, + "learning_rate": 7.856036124190913e-06, + "loss": 0.0099, + "step": 4935 + }, + { + "epoch": 3.0033465165804687, + "grad_norm": 0.16713546216487885, + "learning_rate": 7.84697761221971e-06, + "loss": 0.0071, + "step": 4936 + }, + { + "epoch": 3.003954974140554, + "grad_norm": 0.20086368918418884, + "learning_rate": 7.83792335348309e-06, + "loss": 0.0092, + "step": 4937 + }, + { + "epoch": 3.004563431700639, + "grad_norm": 0.19693416357040405, + "learning_rate": 7.828873350226135e-06, + "loss": 0.0084, + "step": 4938 + }, + { + "epoch": 3.005171889260724, + "grad_norm": 0.22012776136398315, + "learning_rate": 7.819827604692845e-06, + "loss": 0.0081, + "step": 4939 + }, + { + "epoch": 3.005780346820809, + "grad_norm": 0.15234141051769257, + "learning_rate": 7.81078611912622e-06, + "loss": 0.0062, + "step": 4940 + }, + { + "epoch": 3.0063888043808946, + "grad_norm": 0.23004913330078125, + "learning_rate": 7.801748895768148e-06, + "loss": 0.0081, + "step": 4941 + }, + { + "epoch": 3.0069972619409797, + "grad_norm": 0.32940471172332764, + "learning_rate": 7.79271593685949e-06, + "loss": 0.0068, + "step": 4942 + }, + { + "epoch": 3.007605719501065, + "grad_norm": 0.19095928966999054, + "learning_rate": 7.783687244640048e-06, + "loss": 0.0077, + "step": 4943 + }, + { + "epoch": 3.00821417706115, + "grad_norm": 0.25689762830734253, + "learning_rate": 7.774662821348563e-06, + "loss": 0.0107, + "step": 4944 + }, + { + "epoch": 3.008822634621235, + "grad_norm": 0.22656317055225372, + "learning_rate": 7.765642669222706e-06, + "loss": 0.0056, + "step": 4945 + }, + { + "epoch": 3.0094310921813205, + "grad_norm": 0.20489272475242615, + "learning_rate": 7.7566267904991e-06, + "loss": 0.0086, + "step": 4946 + }, + { + "epoch": 3.0100395497414056, + "grad_norm": 0.24489440023899078, + "learning_rate": 7.747615187413312e-06, + "loss": 0.009, + "step": 4947 + }, + { + "epoch": 3.0106480073014907, + "grad_norm": 0.32784104347229004, + "learning_rate": 7.738607862199851e-06, + "loss": 0.0107, + "step": 4948 + }, + { + "epoch": 3.011256464861576, + "grad_norm": 0.25375160574913025, + "learning_rate": 7.729604817092134e-06, + "loss": 0.0066, + "step": 4949 + }, + { + "epoch": 3.011864922421661, + "grad_norm": 0.33918920159339905, + "learning_rate": 7.720606054322574e-06, + "loss": 0.0123, + "step": 4950 + }, + { + "epoch": 3.0124733799817465, + "grad_norm": 0.28510206937789917, + "learning_rate": 7.71161157612246e-06, + "loss": 0.0111, + "step": 4951 + }, + { + "epoch": 3.0130818375418316, + "grad_norm": 0.18262362480163574, + "learning_rate": 7.702621384722064e-06, + "loss": 0.0048, + "step": 4952 + }, + { + "epoch": 3.0136902951019167, + "grad_norm": 0.3342544138431549, + "learning_rate": 7.693635482350572e-06, + "loss": 0.013, + "step": 4953 + }, + { + "epoch": 3.0142987526620018, + "grad_norm": 0.2429402768611908, + "learning_rate": 7.684653871236125e-06, + "loss": 0.0059, + "step": 4954 + }, + { + "epoch": 3.014907210222087, + "grad_norm": 0.30693817138671875, + "learning_rate": 7.675676553605768e-06, + "loss": 0.007, + "step": 4955 + }, + { + "epoch": 3.0155156677821724, + "grad_norm": 0.27962803840637207, + "learning_rate": 7.666703531685516e-06, + "loss": 0.0086, + "step": 4956 + }, + { + "epoch": 3.0161241253422575, + "grad_norm": 0.5548810958862305, + "learning_rate": 7.657734807700297e-06, + "loss": 0.0059, + "step": 4957 + }, + { + "epoch": 3.0167325829023426, + "grad_norm": 0.20625406503677368, + "learning_rate": 7.648770383873988e-06, + "loss": 0.0054, + "step": 4958 + }, + { + "epoch": 3.0173410404624277, + "grad_norm": 0.19295968115329742, + "learning_rate": 7.639810262429386e-06, + "loss": 0.0042, + "step": 4959 + }, + { + "epoch": 3.0179494980225128, + "grad_norm": 0.231733500957489, + "learning_rate": 7.630854445588239e-06, + "loss": 0.0065, + "step": 4960 + }, + { + "epoch": 3.0185579555825983, + "grad_norm": 0.42650073766708374, + "learning_rate": 7.621902935571201e-06, + "loss": 0.0063, + "step": 4961 + }, + { + "epoch": 3.0191664131426834, + "grad_norm": 0.21923278272151947, + "learning_rate": 7.612955734597879e-06, + "loss": 0.0045, + "step": 4962 + }, + { + "epoch": 3.0197748707027685, + "grad_norm": 0.3130083978176117, + "learning_rate": 7.6040128448868096e-06, + "loss": 0.0065, + "step": 4963 + }, + { + "epoch": 3.0203833282628536, + "grad_norm": 0.2901938557624817, + "learning_rate": 7.5950742686554525e-06, + "loss": 0.0072, + "step": 4964 + }, + { + "epoch": 3.0209917858229387, + "grad_norm": 0.2919492721557617, + "learning_rate": 7.58614000812021e-06, + "loss": 0.0072, + "step": 4965 + }, + { + "epoch": 3.0216002433830242, + "grad_norm": 0.2188805788755417, + "learning_rate": 7.577210065496396e-06, + "loss": 0.0032, + "step": 4966 + }, + { + "epoch": 3.0222087009431093, + "grad_norm": 0.24292194843292236, + "learning_rate": 7.568284442998263e-06, + "loss": 0.0065, + "step": 4967 + }, + { + "epoch": 3.0228171585031944, + "grad_norm": 0.26626601815223694, + "learning_rate": 7.559363142839002e-06, + "loss": 0.011, + "step": 4968 + }, + { + "epoch": 3.0234256160632795, + "grad_norm": 0.2926938235759735, + "learning_rate": 7.550446167230718e-06, + "loss": 0.0076, + "step": 4969 + }, + { + "epoch": 3.0240340736233646, + "grad_norm": 0.26158931851387024, + "learning_rate": 7.5415335183844525e-06, + "loss": 0.0093, + "step": 4970 + }, + { + "epoch": 3.02464253118345, + "grad_norm": 0.2527156472206116, + "learning_rate": 7.53262519851018e-06, + "loss": 0.0066, + "step": 4971 + }, + { + "epoch": 3.0252509887435353, + "grad_norm": 0.21547862887382507, + "learning_rate": 7.52372120981677e-06, + "loss": 0.0039, + "step": 4972 + }, + { + "epoch": 3.0258594463036204, + "grad_norm": 0.1877271980047226, + "learning_rate": 7.514821554512056e-06, + "loss": 0.0046, + "step": 4973 + }, + { + "epoch": 3.0264679038637055, + "grad_norm": 0.31582483649253845, + "learning_rate": 7.505926234802777e-06, + "loss": 0.0115, + "step": 4974 + }, + { + "epoch": 3.0270763614237906, + "grad_norm": 0.2469736635684967, + "learning_rate": 7.49703525289461e-06, + "loss": 0.0073, + "step": 4975 + }, + { + "epoch": 3.027684818983876, + "grad_norm": 0.2651013731956482, + "learning_rate": 7.488148610992127e-06, + "loss": 0.0059, + "step": 4976 + }, + { + "epoch": 3.028293276543961, + "grad_norm": 0.3526837229728699, + "learning_rate": 7.479266311298871e-06, + "loss": 0.0103, + "step": 4977 + }, + { + "epoch": 3.0289017341040463, + "grad_norm": 0.4271092712879181, + "learning_rate": 7.470388356017266e-06, + "loss": 0.0092, + "step": 4978 + }, + { + "epoch": 3.0295101916641314, + "grad_norm": 0.1828882098197937, + "learning_rate": 7.461514747348674e-06, + "loss": 0.0029, + "step": 4979 + }, + { + "epoch": 3.0301186492242165, + "grad_norm": 0.2836543917655945, + "learning_rate": 7.452645487493387e-06, + "loss": 0.008, + "step": 4980 + }, + { + "epoch": 3.030727106784302, + "grad_norm": 0.5397449731826782, + "learning_rate": 7.4437805786506164e-06, + "loss": 0.0153, + "step": 4981 + }, + { + "epoch": 3.031335564344387, + "grad_norm": 0.24427630007266998, + "learning_rate": 7.434920023018476e-06, + "loss": 0.004, + "step": 4982 + }, + { + "epoch": 3.031944021904472, + "grad_norm": 0.320932000875473, + "learning_rate": 7.426063822794022e-06, + "loss": 0.0092, + "step": 4983 + }, + { + "epoch": 3.0325524794645573, + "grad_norm": 0.280251145362854, + "learning_rate": 7.417211980173222e-06, + "loss": 0.0093, + "step": 4984 + }, + { + "epoch": 3.0331609370246424, + "grad_norm": 0.19520291686058044, + "learning_rate": 7.408364497350964e-06, + "loss": 0.0055, + "step": 4985 + }, + { + "epoch": 3.0337693945847275, + "grad_norm": 0.28560855984687805, + "learning_rate": 7.3995213765210545e-06, + "loss": 0.0072, + "step": 4986 + }, + { + "epoch": 3.034377852144813, + "grad_norm": 0.226197749376297, + "learning_rate": 7.390682619876227e-06, + "loss": 0.0038, + "step": 4987 + }, + { + "epoch": 3.034986309704898, + "grad_norm": 0.3306165039539337, + "learning_rate": 7.38184822960811e-06, + "loss": 0.0087, + "step": 4988 + }, + { + "epoch": 3.0355947672649832, + "grad_norm": 0.20668867230415344, + "learning_rate": 7.37301820790727e-06, + "loss": 0.0043, + "step": 4989 + }, + { + "epoch": 3.0362032248250683, + "grad_norm": 0.28162097930908203, + "learning_rate": 7.364192556963187e-06, + "loss": 0.0085, + "step": 4990 + }, + { + "epoch": 3.0368116823851534, + "grad_norm": 0.2704124450683594, + "learning_rate": 7.35537127896426e-06, + "loss": 0.0047, + "step": 4991 + }, + { + "epoch": 3.037420139945239, + "grad_norm": 0.18822601437568665, + "learning_rate": 7.346554376097778e-06, + "loss": 0.0044, + "step": 4992 + }, + { + "epoch": 3.038028597505324, + "grad_norm": 0.28500285744667053, + "learning_rate": 7.337741850549992e-06, + "loss": 0.0073, + "step": 4993 + }, + { + "epoch": 3.038637055065409, + "grad_norm": 0.394612193107605, + "learning_rate": 7.328933704506022e-06, + "loss": 0.0124, + "step": 4994 + }, + { + "epoch": 3.0392455126254942, + "grad_norm": 0.2029077261686325, + "learning_rate": 7.320129940149925e-06, + "loss": 0.0046, + "step": 4995 + }, + { + "epoch": 3.0398539701855793, + "grad_norm": 0.3149600625038147, + "learning_rate": 7.311330559664673e-06, + "loss": 0.0061, + "step": 4996 + }, + { + "epoch": 3.040462427745665, + "grad_norm": 0.18582724034786224, + "learning_rate": 7.3025355652321464e-06, + "loss": 0.0046, + "step": 4997 + }, + { + "epoch": 3.04107088530575, + "grad_norm": 0.2304268330335617, + "learning_rate": 7.293744959033124e-06, + "loss": 0.0065, + "step": 4998 + }, + { + "epoch": 3.041679342865835, + "grad_norm": 0.3009764850139618, + "learning_rate": 7.284958743247322e-06, + "loss": 0.0064, + "step": 4999 + }, + { + "epoch": 3.04228780042592, + "grad_norm": 0.2738197147846222, + "learning_rate": 7.276176920053351e-06, + "loss": 0.0107, + "step": 5000 + }, + { + "epoch": 3.0428962579860053, + "grad_norm": 0.28810760378837585, + "learning_rate": 7.267399491628748e-06, + "loss": 0.0055, + "step": 5001 + }, + { + "epoch": 3.043504715546091, + "grad_norm": 0.3494376838207245, + "learning_rate": 7.258626460149922e-06, + "loss": 0.0112, + "step": 5002 + }, + { + "epoch": 3.044113173106176, + "grad_norm": 0.20463795959949493, + "learning_rate": 7.249857827792253e-06, + "loss": 0.0053, + "step": 5003 + }, + { + "epoch": 3.044721630666261, + "grad_norm": 0.22482581436634064, + "learning_rate": 7.241093596729976e-06, + "loss": 0.0088, + "step": 5004 + }, + { + "epoch": 3.045330088226346, + "grad_norm": 0.29432442784309387, + "learning_rate": 7.232333769136254e-06, + "loss": 0.0081, + "step": 5005 + }, + { + "epoch": 3.045938545786431, + "grad_norm": 0.273941308259964, + "learning_rate": 7.223578347183166e-06, + "loss": 0.0092, + "step": 5006 + }, + { + "epoch": 3.0465470033465167, + "grad_norm": 0.3165578842163086, + "learning_rate": 7.2148273330416985e-06, + "loss": 0.0134, + "step": 5007 + }, + { + "epoch": 3.047155460906602, + "grad_norm": 0.293646901845932, + "learning_rate": 7.206080728881715e-06, + "loss": 0.009, + "step": 5008 + }, + { + "epoch": 3.047763918466687, + "grad_norm": 0.24793358147144318, + "learning_rate": 7.197338536872039e-06, + "loss": 0.008, + "step": 5009 + }, + { + "epoch": 3.048372376026772, + "grad_norm": 0.23065130412578583, + "learning_rate": 7.188600759180347e-06, + "loss": 0.0052, + "step": 5010 + }, + { + "epoch": 3.048980833586857, + "grad_norm": 0.32581770420074463, + "learning_rate": 7.1798673979732585e-06, + "loss": 0.0089, + "step": 5011 + }, + { + "epoch": 3.0495892911469427, + "grad_norm": 0.2617538571357727, + "learning_rate": 7.17113845541626e-06, + "loss": 0.0074, + "step": 5012 + }, + { + "epoch": 3.0501977487070278, + "grad_norm": 0.2555639445781708, + "learning_rate": 7.162413933673795e-06, + "loss": 0.008, + "step": 5013 + }, + { + "epoch": 3.050806206267113, + "grad_norm": 0.1875854879617691, + "learning_rate": 7.153693834909161e-06, + "loss": 0.005, + "step": 5014 + }, + { + "epoch": 3.051414663827198, + "grad_norm": 0.19386368989944458, + "learning_rate": 7.144978161284585e-06, + "loss": 0.0039, + "step": 5015 + }, + { + "epoch": 3.052023121387283, + "grad_norm": 0.23817914724349976, + "learning_rate": 7.13626691496119e-06, + "loss": 0.0068, + "step": 5016 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.215720534324646, + "learning_rate": 7.127560098099012e-06, + "loss": 0.006, + "step": 5017 + }, + { + "epoch": 3.0532400365074537, + "grad_norm": 0.35492417216300964, + "learning_rate": 7.118857712856952e-06, + "loss": 0.0065, + "step": 5018 + }, + { + "epoch": 3.0538484940675388, + "grad_norm": 0.30515995621681213, + "learning_rate": 7.110159761392876e-06, + "loss": 0.0106, + "step": 5019 + }, + { + "epoch": 3.054456951627624, + "grad_norm": 0.1642477661371231, + "learning_rate": 7.101466245863483e-06, + "loss": 0.0038, + "step": 5020 + }, + { + "epoch": 3.055065409187709, + "grad_norm": 0.3018817901611328, + "learning_rate": 7.092777168424422e-06, + "loss": 0.0085, + "step": 5021 + }, + { + "epoch": 3.0556738667477945, + "grad_norm": 0.4562574625015259, + "learning_rate": 7.084092531230196e-06, + "loss": 0.0108, + "step": 5022 + }, + { + "epoch": 3.0562823243078796, + "grad_norm": 0.3128783702850342, + "learning_rate": 7.07541233643427e-06, + "loss": 0.007, + "step": 5023 + }, + { + "epoch": 3.0568907818679647, + "grad_norm": 0.2551473379135132, + "learning_rate": 7.066736586188941e-06, + "loss": 0.0075, + "step": 5024 + }, + { + "epoch": 3.05749923942805, + "grad_norm": 0.20833851397037506, + "learning_rate": 7.058065282645443e-06, + "loss": 0.0061, + "step": 5025 + }, + { + "epoch": 3.058107696988135, + "grad_norm": 0.2601064443588257, + "learning_rate": 7.049398427953899e-06, + "loss": 0.005, + "step": 5026 + }, + { + "epoch": 3.0587161545482204, + "grad_norm": 0.2717060148715973, + "learning_rate": 7.040736024263334e-06, + "loss": 0.007, + "step": 5027 + }, + { + "epoch": 3.0593246121083055, + "grad_norm": 0.1747554987668991, + "learning_rate": 7.03207807372164e-06, + "loss": 0.0047, + "step": 5028 + }, + { + "epoch": 3.0599330696683906, + "grad_norm": 0.3020252585411072, + "learning_rate": 7.023424578475659e-06, + "loss": 0.008, + "step": 5029 + }, + { + "epoch": 3.0605415272284757, + "grad_norm": 0.20203709602355957, + "learning_rate": 7.014775540671076e-06, + "loss": 0.0062, + "step": 5030 + }, + { + "epoch": 3.061149984788561, + "grad_norm": 0.30242085456848145, + "learning_rate": 7.006130962452498e-06, + "loss": 0.0091, + "step": 5031 + }, + { + "epoch": 3.0617584423486464, + "grad_norm": 0.25199273228645325, + "learning_rate": 6.997490845963417e-06, + "loss": 0.0062, + "step": 5032 + }, + { + "epoch": 3.0623668999087315, + "grad_norm": 0.2626577615737915, + "learning_rate": 6.988855193346236e-06, + "loss": 0.0067, + "step": 5033 + }, + { + "epoch": 3.0629753574688166, + "grad_norm": 0.21209199726581573, + "learning_rate": 6.980224006742214e-06, + "loss": 0.0036, + "step": 5034 + }, + { + "epoch": 3.0635838150289016, + "grad_norm": 0.14695894718170166, + "learning_rate": 6.97159728829154e-06, + "loss": 0.0028, + "step": 5035 + }, + { + "epoch": 3.0641922725889867, + "grad_norm": 0.28200700879096985, + "learning_rate": 6.9629750401332766e-06, + "loss": 0.0076, + "step": 5036 + }, + { + "epoch": 3.0648007301490723, + "grad_norm": 0.26668545603752136, + "learning_rate": 6.954357264405392e-06, + "loss": 0.0058, + "step": 5037 + }, + { + "epoch": 3.0654091877091574, + "grad_norm": 0.2839964032173157, + "learning_rate": 6.945743963244711e-06, + "loss": 0.0102, + "step": 5038 + }, + { + "epoch": 3.0660176452692425, + "grad_norm": 0.29744213819503784, + "learning_rate": 6.937135138787001e-06, + "loss": 0.0084, + "step": 5039 + }, + { + "epoch": 3.0666261028293276, + "grad_norm": 0.3763432800769806, + "learning_rate": 6.928530793166874e-06, + "loss": 0.0099, + "step": 5040 + }, + { + "epoch": 3.0672345603894127, + "grad_norm": 0.23103450238704681, + "learning_rate": 6.919930928517854e-06, + "loss": 0.0071, + "step": 5041 + }, + { + "epoch": 3.067843017949498, + "grad_norm": 0.3754606246948242, + "learning_rate": 6.91133554697235e-06, + "loss": 0.0078, + "step": 5042 + }, + { + "epoch": 3.0684514755095833, + "grad_norm": 0.31299889087677, + "learning_rate": 6.902744650661663e-06, + "loss": 0.0066, + "step": 5043 + }, + { + "epoch": 3.0690599330696684, + "grad_norm": 0.26163333654403687, + "learning_rate": 6.894158241715959e-06, + "loss": 0.0059, + "step": 5044 + }, + { + "epoch": 3.0696683906297535, + "grad_norm": 0.25679513812065125, + "learning_rate": 6.885576322264336e-06, + "loss": 0.0043, + "step": 5045 + }, + { + "epoch": 3.0702768481898386, + "grad_norm": 0.38195693492889404, + "learning_rate": 6.87699889443473e-06, + "loss": 0.0088, + "step": 5046 + }, + { + "epoch": 3.070885305749924, + "grad_norm": 0.3322720229625702, + "learning_rate": 6.868425960354005e-06, + "loss": 0.0103, + "step": 5047 + }, + { + "epoch": 3.0714937633100092, + "grad_norm": 0.3915663957595825, + "learning_rate": 6.859857522147864e-06, + "loss": 0.0069, + "step": 5048 + }, + { + "epoch": 3.0721022208700943, + "grad_norm": 0.26933425664901733, + "learning_rate": 6.851293581940954e-06, + "loss": 0.0077, + "step": 5049 + }, + { + "epoch": 3.0727106784301794, + "grad_norm": 0.24823108315467834, + "learning_rate": 6.842734141856755e-06, + "loss": 0.0065, + "step": 5050 + }, + { + "epoch": 3.0733191359902645, + "grad_norm": 0.31463244557380676, + "learning_rate": 6.834179204017655e-06, + "loss": 0.0049, + "step": 5051 + }, + { + "epoch": 3.07392759355035, + "grad_norm": 0.23442064225673676, + "learning_rate": 6.82562877054492e-06, + "loss": 0.0064, + "step": 5052 + }, + { + "epoch": 3.074536051110435, + "grad_norm": 0.36819660663604736, + "learning_rate": 6.817082843558717e-06, + "loss": 0.0045, + "step": 5053 + }, + { + "epoch": 3.0751445086705202, + "grad_norm": 0.24585528671741486, + "learning_rate": 6.80854142517805e-06, + "loss": 0.0048, + "step": 5054 + }, + { + "epoch": 3.0757529662306053, + "grad_norm": 0.18307526409626007, + "learning_rate": 6.80000451752087e-06, + "loss": 0.0048, + "step": 5055 + }, + { + "epoch": 3.0763614237906904, + "grad_norm": 0.20536406338214874, + "learning_rate": 6.791472122703946e-06, + "loss": 0.0045, + "step": 5056 + }, + { + "epoch": 3.076969881350776, + "grad_norm": 0.3933330774307251, + "learning_rate": 6.782944242842976e-06, + "loss": 0.0126, + "step": 5057 + }, + { + "epoch": 3.077578338910861, + "grad_norm": 0.2311299741268158, + "learning_rate": 6.774420880052496e-06, + "loss": 0.0061, + "step": 5058 + }, + { + "epoch": 3.078186796470946, + "grad_norm": 0.2359190732240677, + "learning_rate": 6.7659020364459705e-06, + "loss": 0.0067, + "step": 5059 + }, + { + "epoch": 3.0787952540310313, + "grad_norm": 0.2867051959037781, + "learning_rate": 6.757387714135696e-06, + "loss": 0.0055, + "step": 5060 + }, + { + "epoch": 3.0794037115911164, + "grad_norm": 0.2318498194217682, + "learning_rate": 6.748877915232882e-06, + "loss": 0.0062, + "step": 5061 + }, + { + "epoch": 3.080012169151202, + "grad_norm": 0.15246573090553284, + "learning_rate": 6.7403726418476005e-06, + "loss": 0.003, + "step": 5062 + }, + { + "epoch": 3.080620626711287, + "grad_norm": 0.30910444259643555, + "learning_rate": 6.731871896088812e-06, + "loss": 0.009, + "step": 5063 + }, + { + "epoch": 3.081229084271372, + "grad_norm": 0.32887697219848633, + "learning_rate": 6.723375680064325e-06, + "loss": 0.007, + "step": 5064 + }, + { + "epoch": 3.081837541831457, + "grad_norm": 0.38914939761161804, + "learning_rate": 6.714883995880877e-06, + "loss": 0.0124, + "step": 5065 + }, + { + "epoch": 3.0824459993915423, + "grad_norm": 0.3048250079154968, + "learning_rate": 6.706396845644031e-06, + "loss": 0.0073, + "step": 5066 + }, + { + "epoch": 3.083054456951628, + "grad_norm": 0.28059664368629456, + "learning_rate": 6.697914231458249e-06, + "loss": 0.0043, + "step": 5067 + }, + { + "epoch": 3.083662914511713, + "grad_norm": 0.23873847723007202, + "learning_rate": 6.689436155426873e-06, + "loss": 0.0053, + "step": 5068 + }, + { + "epoch": 3.084271372071798, + "grad_norm": 0.4285385012626648, + "learning_rate": 6.680962619652115e-06, + "loss": 0.0055, + "step": 5069 + }, + { + "epoch": 3.084879829631883, + "grad_norm": 0.2138291299343109, + "learning_rate": 6.672493626235044e-06, + "loss": 0.0059, + "step": 5070 + }, + { + "epoch": 3.085488287191968, + "grad_norm": 0.591597318649292, + "learning_rate": 6.664029177275624e-06, + "loss": 0.0073, + "step": 5071 + }, + { + "epoch": 3.0860967447520538, + "grad_norm": 0.301170289516449, + "learning_rate": 6.655569274872689e-06, + "loss": 0.0088, + "step": 5072 + }, + { + "epoch": 3.086705202312139, + "grad_norm": 0.309881329536438, + "learning_rate": 6.647113921123941e-06, + "loss": 0.0096, + "step": 5073 + }, + { + "epoch": 3.087313659872224, + "grad_norm": 0.17016667127609253, + "learning_rate": 6.638663118125951e-06, + "loss": 0.0056, + "step": 5074 + }, + { + "epoch": 3.087922117432309, + "grad_norm": 0.18052910268306732, + "learning_rate": 6.6302168679741785e-06, + "loss": 0.0032, + "step": 5075 + }, + { + "epoch": 3.088530574992394, + "grad_norm": 0.22513878345489502, + "learning_rate": 6.6217751727629285e-06, + "loss": 0.0041, + "step": 5076 + }, + { + "epoch": 3.0891390325524792, + "grad_norm": 0.2399200052022934, + "learning_rate": 6.61333803458539e-06, + "loss": 0.0082, + "step": 5077 + }, + { + "epoch": 3.0897474901125648, + "grad_norm": 0.20230963826179504, + "learning_rate": 6.604905455533625e-06, + "loss": 0.0073, + "step": 5078 + }, + { + "epoch": 3.09035594767265, + "grad_norm": 0.24099519848823547, + "learning_rate": 6.596477437698565e-06, + "loss": 0.0054, + "step": 5079 + }, + { + "epoch": 3.090964405232735, + "grad_norm": 0.16781140863895416, + "learning_rate": 6.588053983170006e-06, + "loss": 0.0036, + "step": 5080 + }, + { + "epoch": 3.09157286279282, + "grad_norm": 0.24010391533374786, + "learning_rate": 6.579635094036607e-06, + "loss": 0.0067, + "step": 5081 + }, + { + "epoch": 3.092181320352905, + "grad_norm": 0.3611644208431244, + "learning_rate": 6.571220772385905e-06, + "loss": 0.009, + "step": 5082 + }, + { + "epoch": 3.0927897779129907, + "grad_norm": 0.2840390205383301, + "learning_rate": 6.562811020304305e-06, + "loss": 0.008, + "step": 5083 + }, + { + "epoch": 3.093398235473076, + "grad_norm": 0.29138410091400146, + "learning_rate": 6.55440583987707e-06, + "loss": 0.0054, + "step": 5084 + }, + { + "epoch": 3.094006693033161, + "grad_norm": 0.23507152497768402, + "learning_rate": 6.546005233188343e-06, + "loss": 0.0051, + "step": 5085 + }, + { + "epoch": 3.094615150593246, + "grad_norm": 0.2997680902481079, + "learning_rate": 6.537609202321113e-06, + "loss": 0.007, + "step": 5086 + }, + { + "epoch": 3.095223608153331, + "grad_norm": 0.2853991389274597, + "learning_rate": 6.529217749357247e-06, + "loss": 0.0089, + "step": 5087 + }, + { + "epoch": 3.0958320657134166, + "grad_norm": 0.18508751690387726, + "learning_rate": 6.520830876377482e-06, + "loss": 0.0029, + "step": 5088 + }, + { + "epoch": 3.0964405232735017, + "grad_norm": 0.1906099170446396, + "learning_rate": 6.5124485854614086e-06, + "loss": 0.0041, + "step": 5089 + }, + { + "epoch": 3.097048980833587, + "grad_norm": 0.3323763906955719, + "learning_rate": 6.504070878687485e-06, + "loss": 0.0109, + "step": 5090 + }, + { + "epoch": 3.097657438393672, + "grad_norm": 0.2999829947948456, + "learning_rate": 6.495697758133046e-06, + "loss": 0.0077, + "step": 5091 + }, + { + "epoch": 3.098265895953757, + "grad_norm": 0.3580150604248047, + "learning_rate": 6.487329225874256e-06, + "loss": 0.0135, + "step": 5092 + }, + { + "epoch": 3.0988743535138426, + "grad_norm": 0.17444658279418945, + "learning_rate": 6.478965283986174e-06, + "loss": 0.0035, + "step": 5093 + }, + { + "epoch": 3.0994828110739276, + "grad_norm": 0.20913676917552948, + "learning_rate": 6.470605934542703e-06, + "loss": 0.0045, + "step": 5094 + }, + { + "epoch": 3.1000912686340127, + "grad_norm": 0.2408442199230194, + "learning_rate": 6.462251179616621e-06, + "loss": 0.0065, + "step": 5095 + }, + { + "epoch": 3.100699726194098, + "grad_norm": 0.18831227719783783, + "learning_rate": 6.453901021279559e-06, + "loss": 0.0035, + "step": 5096 + }, + { + "epoch": 3.101308183754183, + "grad_norm": 0.3297005295753479, + "learning_rate": 6.445555461602001e-06, + "loss": 0.0078, + "step": 5097 + }, + { + "epoch": 3.1019166413142685, + "grad_norm": 0.21692611277103424, + "learning_rate": 6.437214502653299e-06, + "loss": 0.0051, + "step": 5098 + }, + { + "epoch": 3.1025250988743536, + "grad_norm": 0.24288775026798248, + "learning_rate": 6.428878146501666e-06, + "loss": 0.0051, + "step": 5099 + }, + { + "epoch": 3.1031335564344387, + "grad_norm": 0.17630305886268616, + "learning_rate": 6.420546395214167e-06, + "loss": 0.0034, + "step": 5100 + }, + { + "epoch": 3.1037420139945238, + "grad_norm": 0.2515610158443451, + "learning_rate": 6.412219250856735e-06, + "loss": 0.006, + "step": 5101 + }, + { + "epoch": 3.104350471554609, + "grad_norm": 0.3058002293109894, + "learning_rate": 6.403896715494159e-06, + "loss": 0.01, + "step": 5102 + }, + { + "epoch": 3.1049589291146944, + "grad_norm": 0.29932624101638794, + "learning_rate": 6.395578791190066e-06, + "loss": 0.0046, + "step": 5103 + }, + { + "epoch": 3.1055673866747795, + "grad_norm": 0.2611561119556427, + "learning_rate": 6.3872654800069626e-06, + "loss": 0.0097, + "step": 5104 + }, + { + "epoch": 3.1061758442348646, + "grad_norm": 0.2513970136642456, + "learning_rate": 6.378956784006204e-06, + "loss": 0.006, + "step": 5105 + }, + { + "epoch": 3.1067843017949497, + "grad_norm": 0.2295328974723816, + "learning_rate": 6.370652705248007e-06, + "loss": 0.0055, + "step": 5106 + }, + { + "epoch": 3.107392759355035, + "grad_norm": 0.3369624614715576, + "learning_rate": 6.362353245791411e-06, + "loss": 0.013, + "step": 5107 + }, + { + "epoch": 3.1080012169151203, + "grad_norm": 0.21619094908237457, + "learning_rate": 6.354058407694374e-06, + "loss": 0.0059, + "step": 5108 + }, + { + "epoch": 3.1086096744752054, + "grad_norm": 0.2147800624370575, + "learning_rate": 6.34576819301364e-06, + "loss": 0.0071, + "step": 5109 + }, + { + "epoch": 3.1092181320352905, + "grad_norm": 0.18836335837841034, + "learning_rate": 6.337482603804851e-06, + "loss": 0.0051, + "step": 5110 + }, + { + "epoch": 3.1098265895953756, + "grad_norm": 0.15471284091472626, + "learning_rate": 6.329201642122481e-06, + "loss": 0.0046, + "step": 5111 + }, + { + "epoch": 3.1104350471554607, + "grad_norm": 0.2526698708534241, + "learning_rate": 6.320925310019876e-06, + "loss": 0.0071, + "step": 5112 + }, + { + "epoch": 3.1110435047155462, + "grad_norm": 0.2389310896396637, + "learning_rate": 6.312653609549196e-06, + "loss": 0.0065, + "step": 5113 + }, + { + "epoch": 3.1116519622756313, + "grad_norm": 0.22375918924808502, + "learning_rate": 6.304386542761509e-06, + "loss": 0.0057, + "step": 5114 + }, + { + "epoch": 3.1122604198357164, + "grad_norm": 0.29425686597824097, + "learning_rate": 6.29612411170668e-06, + "loss": 0.005, + "step": 5115 + }, + { + "epoch": 3.1128688773958015, + "grad_norm": 0.37902581691741943, + "learning_rate": 6.287866318433464e-06, + "loss": 0.0065, + "step": 5116 + }, + { + "epoch": 3.1134773349558866, + "grad_norm": 0.3976466655731201, + "learning_rate": 6.279613164989426e-06, + "loss": 0.0142, + "step": 5117 + }, + { + "epoch": 3.114085792515972, + "grad_norm": 0.2854953706264496, + "learning_rate": 6.27136465342103e-06, + "loss": 0.0062, + "step": 5118 + }, + { + "epoch": 3.1146942500760573, + "grad_norm": 0.6086744070053101, + "learning_rate": 6.263120785773549e-06, + "loss": 0.0085, + "step": 5119 + }, + { + "epoch": 3.1153027076361424, + "grad_norm": 0.2866113483905792, + "learning_rate": 6.254881564091119e-06, + "loss": 0.0042, + "step": 5120 + }, + { + "epoch": 3.1159111651962275, + "grad_norm": 0.20252281427383423, + "learning_rate": 6.246646990416727e-06, + "loss": 0.0056, + "step": 5121 + }, + { + "epoch": 3.1165196227563126, + "grad_norm": 0.21693319082260132, + "learning_rate": 6.238417066792212e-06, + "loss": 0.0052, + "step": 5122 + }, + { + "epoch": 3.117128080316398, + "grad_norm": 0.2629375159740448, + "learning_rate": 6.230191795258228e-06, + "loss": 0.0105, + "step": 5123 + }, + { + "epoch": 3.117736537876483, + "grad_norm": 0.18073906004428864, + "learning_rate": 6.2219711778543274e-06, + "loss": 0.0055, + "step": 5124 + }, + { + "epoch": 3.1183449954365683, + "grad_norm": 0.3107531666755676, + "learning_rate": 6.213755216618861e-06, + "loss": 0.0077, + "step": 5125 + }, + { + "epoch": 3.1189534529966534, + "grad_norm": 0.3121819496154785, + "learning_rate": 6.205543913589059e-06, + "loss": 0.0042, + "step": 5126 + }, + { + "epoch": 3.1195619105567385, + "grad_norm": 0.3623504638671875, + "learning_rate": 6.19733727080096e-06, + "loss": 0.0076, + "step": 5127 + }, + { + "epoch": 3.120170368116824, + "grad_norm": 0.3374268412590027, + "learning_rate": 6.189135290289499e-06, + "loss": 0.0076, + "step": 5128 + }, + { + "epoch": 3.120778825676909, + "grad_norm": 0.2626723051071167, + "learning_rate": 6.180937974088405e-06, + "loss": 0.0045, + "step": 5129 + }, + { + "epoch": 3.121387283236994, + "grad_norm": 0.39012977480888367, + "learning_rate": 6.172745324230275e-06, + "loss": 0.0123, + "step": 5130 + }, + { + "epoch": 3.1219957407970793, + "grad_norm": 0.1949702948331833, + "learning_rate": 6.164557342746547e-06, + "loss": 0.0064, + "step": 5131 + }, + { + "epoch": 3.1226041983571644, + "grad_norm": 0.3511587679386139, + "learning_rate": 6.156374031667503e-06, + "loss": 0.0094, + "step": 5132 + }, + { + "epoch": 3.12321265591725, + "grad_norm": 0.2506071627140045, + "learning_rate": 6.1481953930222435e-06, + "loss": 0.0068, + "step": 5133 + }, + { + "epoch": 3.123821113477335, + "grad_norm": 0.2543926537036896, + "learning_rate": 6.140021428838761e-06, + "loss": 0.0035, + "step": 5134 + }, + { + "epoch": 3.12442957103742, + "grad_norm": 0.2524166703224182, + "learning_rate": 6.131852141143834e-06, + "loss": 0.0059, + "step": 5135 + }, + { + "epoch": 3.1250380285975052, + "grad_norm": 0.3138072192668915, + "learning_rate": 6.123687531963113e-06, + "loss": 0.0061, + "step": 5136 + }, + { + "epoch": 3.1256464861575903, + "grad_norm": 0.32244062423706055, + "learning_rate": 6.115527603321081e-06, + "loss": 0.0081, + "step": 5137 + }, + { + "epoch": 3.126254943717676, + "grad_norm": 0.21048331260681152, + "learning_rate": 6.1073723572410645e-06, + "loss": 0.0037, + "step": 5138 + }, + { + "epoch": 3.126863401277761, + "grad_norm": 0.255831778049469, + "learning_rate": 6.099221795745213e-06, + "loss": 0.0074, + "step": 5139 + }, + { + "epoch": 3.127471858837846, + "grad_norm": 0.3076886236667633, + "learning_rate": 6.091075920854536e-06, + "loss": 0.0058, + "step": 5140 + }, + { + "epoch": 3.128080316397931, + "grad_norm": 0.21628208458423615, + "learning_rate": 6.082934734588866e-06, + "loss": 0.0051, + "step": 5141 + }, + { + "epoch": 3.1286887739580163, + "grad_norm": 0.2794947326183319, + "learning_rate": 6.074798238966886e-06, + "loss": 0.0078, + "step": 5142 + }, + { + "epoch": 3.129297231518102, + "grad_norm": 0.38834357261657715, + "learning_rate": 6.066666436006088e-06, + "loss": 0.0055, + "step": 5143 + }, + { + "epoch": 3.129905689078187, + "grad_norm": 0.28329214453697205, + "learning_rate": 6.058539327722848e-06, + "loss": 0.0069, + "step": 5144 + }, + { + "epoch": 3.130514146638272, + "grad_norm": 0.2482929527759552, + "learning_rate": 6.050416916132329e-06, + "loss": 0.0069, + "step": 5145 + }, + { + "epoch": 3.131122604198357, + "grad_norm": 0.22554363310337067, + "learning_rate": 6.042299203248555e-06, + "loss": 0.0045, + "step": 5146 + }, + { + "epoch": 3.131731061758442, + "grad_norm": 0.3571288287639618, + "learning_rate": 6.034186191084384e-06, + "loss": 0.0082, + "step": 5147 + }, + { + "epoch": 3.1323395193185277, + "grad_norm": 0.24375028908252716, + "learning_rate": 6.026077881651513e-06, + "loss": 0.0037, + "step": 5148 + }, + { + "epoch": 3.132947976878613, + "grad_norm": 0.33513811230659485, + "learning_rate": 6.017974276960445e-06, + "loss": 0.0047, + "step": 5149 + }, + { + "epoch": 3.133556434438698, + "grad_norm": 0.32763129472732544, + "learning_rate": 6.0098753790205494e-06, + "loss": 0.0091, + "step": 5150 + }, + { + "epoch": 3.134164891998783, + "grad_norm": 0.18572641909122467, + "learning_rate": 6.001781189840011e-06, + "loss": 0.0065, + "step": 5151 + }, + { + "epoch": 3.134773349558868, + "grad_norm": 0.24770891666412354, + "learning_rate": 5.9936917114258585e-06, + "loss": 0.007, + "step": 5152 + }, + { + "epoch": 3.1353818071189536, + "grad_norm": 0.38206419348716736, + "learning_rate": 5.985606945783926e-06, + "loss": 0.005, + "step": 5153 + }, + { + "epoch": 3.1359902646790387, + "grad_norm": 0.24334634840488434, + "learning_rate": 5.977526894918928e-06, + "loss": 0.0062, + "step": 5154 + }, + { + "epoch": 3.136598722239124, + "grad_norm": 0.23436123132705688, + "learning_rate": 5.969451560834355e-06, + "loss": 0.0062, + "step": 5155 + }, + { + "epoch": 3.137207179799209, + "grad_norm": 0.20244354009628296, + "learning_rate": 5.9613809455325635e-06, + "loss": 0.0046, + "step": 5156 + }, + { + "epoch": 3.137815637359294, + "grad_norm": 0.20892736315727234, + "learning_rate": 5.95331505101473e-06, + "loss": 0.0037, + "step": 5157 + }, + { + "epoch": 3.1384240949193796, + "grad_norm": 0.32044070959091187, + "learning_rate": 5.945253879280862e-06, + "loss": 0.0097, + "step": 5158 + }, + { + "epoch": 3.1390325524794647, + "grad_norm": 0.27256110310554504, + "learning_rate": 5.93719743232978e-06, + "loss": 0.0057, + "step": 5159 + }, + { + "epoch": 3.1396410100395498, + "grad_norm": 0.22417287528514862, + "learning_rate": 5.929145712159173e-06, + "loss": 0.0039, + "step": 5160 + }, + { + "epoch": 3.140249467599635, + "grad_norm": 0.28702032566070557, + "learning_rate": 5.921098720765508e-06, + "loss": 0.0038, + "step": 5161 + }, + { + "epoch": 3.14085792515972, + "grad_norm": 0.3003734350204468, + "learning_rate": 5.91305646014412e-06, + "loss": 0.0079, + "step": 5162 + }, + { + "epoch": 3.1414663827198055, + "grad_norm": 0.3371279835700989, + "learning_rate": 5.905018932289133e-06, + "loss": 0.0085, + "step": 5163 + }, + { + "epoch": 3.1420748402798906, + "grad_norm": 0.24667906761169434, + "learning_rate": 5.896986139193547e-06, + "loss": 0.0089, + "step": 5164 + }, + { + "epoch": 3.1426832978399757, + "grad_norm": 0.20292513072490692, + "learning_rate": 5.888958082849139e-06, + "loss": 0.0047, + "step": 5165 + }, + { + "epoch": 3.143291755400061, + "grad_norm": 0.1873752474784851, + "learning_rate": 5.880934765246537e-06, + "loss": 0.0056, + "step": 5166 + }, + { + "epoch": 3.143900212960146, + "grad_norm": 0.2573238015174866, + "learning_rate": 5.872916188375194e-06, + "loss": 0.0035, + "step": 5167 + }, + { + "epoch": 3.1445086705202314, + "grad_norm": 0.22613008320331573, + "learning_rate": 5.864902354223384e-06, + "loss": 0.0063, + "step": 5168 + }, + { + "epoch": 3.1451171280803165, + "grad_norm": 0.2717379927635193, + "learning_rate": 5.856893264778188e-06, + "loss": 0.0072, + "step": 5169 + }, + { + "epoch": 3.1457255856404016, + "grad_norm": 0.18429331481456757, + "learning_rate": 5.848888922025553e-06, + "loss": 0.005, + "step": 5170 + }, + { + "epoch": 3.1463340432004867, + "grad_norm": 0.2671307921409607, + "learning_rate": 5.8408893279502e-06, + "loss": 0.0073, + "step": 5171 + }, + { + "epoch": 3.146942500760572, + "grad_norm": 0.263640820980072, + "learning_rate": 5.832894484535709e-06, + "loss": 0.0094, + "step": 5172 + }, + { + "epoch": 3.1475509583206573, + "grad_norm": 0.17915846407413483, + "learning_rate": 5.8249043937644465e-06, + "loss": 0.0039, + "step": 5173 + }, + { + "epoch": 3.1481594158807424, + "grad_norm": 0.30745455622673035, + "learning_rate": 5.816919057617653e-06, + "loss": 0.0067, + "step": 5174 + }, + { + "epoch": 3.1487678734408275, + "grad_norm": 0.2966357171535492, + "learning_rate": 5.808938478075335e-06, + "loss": 0.0063, + "step": 5175 + }, + { + "epoch": 3.1493763310009126, + "grad_norm": 0.3174609839916229, + "learning_rate": 5.800962657116351e-06, + "loss": 0.0088, + "step": 5176 + }, + { + "epoch": 3.1499847885609977, + "grad_norm": 0.25037750601768494, + "learning_rate": 5.792991596718375e-06, + "loss": 0.0063, + "step": 5177 + }, + { + "epoch": 3.1505932461210833, + "grad_norm": 0.30678263306617737, + "learning_rate": 5.785025298857902e-06, + "loss": 0.0043, + "step": 5178 + }, + { + "epoch": 3.1512017036811684, + "grad_norm": 0.25345203280448914, + "learning_rate": 5.777063765510219e-06, + "loss": 0.0061, + "step": 5179 + }, + { + "epoch": 3.1518101612412535, + "grad_norm": 0.5578262209892273, + "learning_rate": 5.769106998649488e-06, + "loss": 0.0109, + "step": 5180 + }, + { + "epoch": 3.1524186188013386, + "grad_norm": 0.2420710027217865, + "learning_rate": 5.761155000248627e-06, + "loss": 0.007, + "step": 5181 + }, + { + "epoch": 3.1530270763614237, + "grad_norm": 0.21901202201843262, + "learning_rate": 5.753207772279415e-06, + "loss": 0.0049, + "step": 5182 + }, + { + "epoch": 3.153635533921509, + "grad_norm": 0.26763221621513367, + "learning_rate": 5.745265316712428e-06, + "loss": 0.0065, + "step": 5183 + }, + { + "epoch": 3.1542439914815943, + "grad_norm": 0.23685556650161743, + "learning_rate": 5.737327635517073e-06, + "loss": 0.007, + "step": 5184 + }, + { + "epoch": 3.1548524490416794, + "grad_norm": 0.318765789270401, + "learning_rate": 5.729394730661547e-06, + "loss": 0.0065, + "step": 5185 + }, + { + "epoch": 3.1554609066017645, + "grad_norm": 0.289823055267334, + "learning_rate": 5.721466604112893e-06, + "loss": 0.0079, + "step": 5186 + }, + { + "epoch": 3.1560693641618496, + "grad_norm": 0.23716124892234802, + "learning_rate": 5.713543257836951e-06, + "loss": 0.0055, + "step": 5187 + }, + { + "epoch": 3.156677821721935, + "grad_norm": 0.25619250535964966, + "learning_rate": 5.705624693798389e-06, + "loss": 0.0063, + "step": 5188 + }, + { + "epoch": 3.15728627928202, + "grad_norm": 0.21789373457431793, + "learning_rate": 5.6977109139606605e-06, + "loss": 0.0036, + "step": 5189 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.2993996739387512, + "learning_rate": 5.689801920286078e-06, + "loss": 0.0066, + "step": 5190 + }, + { + "epoch": 3.1585031944021904, + "grad_norm": 0.3557785749435425, + "learning_rate": 5.681897714735726e-06, + "loss": 0.0102, + "step": 5191 + }, + { + "epoch": 3.1591116519622755, + "grad_norm": 0.21176862716674805, + "learning_rate": 5.673998299269523e-06, + "loss": 0.0035, + "step": 5192 + }, + { + "epoch": 3.159720109522361, + "grad_norm": 0.23229624330997467, + "learning_rate": 5.666103675846191e-06, + "loss": 0.0055, + "step": 5193 + }, + { + "epoch": 3.160328567082446, + "grad_norm": 0.1171480268239975, + "learning_rate": 5.65821384642328e-06, + "loss": 0.0017, + "step": 5194 + }, + { + "epoch": 3.1609370246425312, + "grad_norm": 0.20375606417655945, + "learning_rate": 5.650328812957115e-06, + "loss": 0.0049, + "step": 5195 + }, + { + "epoch": 3.1615454822026163, + "grad_norm": 0.3008398711681366, + "learning_rate": 5.642448577402887e-06, + "loss": 0.0067, + "step": 5196 + }, + { + "epoch": 3.1621539397627014, + "grad_norm": 0.3283615708351135, + "learning_rate": 5.6345731417145385e-06, + "loss": 0.005, + "step": 5197 + }, + { + "epoch": 3.162762397322787, + "grad_norm": 0.2205916792154312, + "learning_rate": 5.6267025078448634e-06, + "loss": 0.0053, + "step": 5198 + }, + { + "epoch": 3.163370854882872, + "grad_norm": 0.23091734945774078, + "learning_rate": 5.618836677745445e-06, + "loss": 0.0067, + "step": 5199 + }, + { + "epoch": 3.163979312442957, + "grad_norm": 0.3182157278060913, + "learning_rate": 5.610975653366693e-06, + "loss": 0.0087, + "step": 5200 + }, + { + "epoch": 3.1645877700030423, + "grad_norm": 0.3846273422241211, + "learning_rate": 5.603119436657794e-06, + "loss": 0.0171, + "step": 5201 + }, + { + "epoch": 3.1651962275631274, + "grad_norm": 0.26661422848701477, + "learning_rate": 5.595268029566777e-06, + "loss": 0.0062, + "step": 5202 + }, + { + "epoch": 3.165804685123213, + "grad_norm": 0.4423938989639282, + "learning_rate": 5.587421434040457e-06, + "loss": 0.0079, + "step": 5203 + }, + { + "epoch": 3.166413142683298, + "grad_norm": 0.3375749886035919, + "learning_rate": 5.579579652024467e-06, + "loss": 0.0085, + "step": 5204 + }, + { + "epoch": 3.167021600243383, + "grad_norm": 0.2887289822101593, + "learning_rate": 5.571742685463238e-06, + "loss": 0.0104, + "step": 5205 + }, + { + "epoch": 3.167630057803468, + "grad_norm": 0.23182101547718048, + "learning_rate": 5.563910536300021e-06, + "loss": 0.005, + "step": 5206 + }, + { + "epoch": 3.1682385153635533, + "grad_norm": 0.252333402633667, + "learning_rate": 5.55608320647685e-06, + "loss": 0.0059, + "step": 5207 + }, + { + "epoch": 3.1688469729236384, + "grad_norm": 0.22592368721961975, + "learning_rate": 5.548260697934579e-06, + "loss": 0.0045, + "step": 5208 + }, + { + "epoch": 3.169455430483724, + "grad_norm": 0.2711806893348694, + "learning_rate": 5.540443012612867e-06, + "loss": 0.0061, + "step": 5209 + }, + { + "epoch": 3.170063888043809, + "grad_norm": 0.23822571337223053, + "learning_rate": 5.532630152450175e-06, + "loss": 0.0053, + "step": 5210 + }, + { + "epoch": 3.170672345603894, + "grad_norm": 0.3533017039299011, + "learning_rate": 5.5248221193837715e-06, + "loss": 0.0102, + "step": 5211 + }, + { + "epoch": 3.171280803163979, + "grad_norm": 0.2815355360507965, + "learning_rate": 5.5170189153497085e-06, + "loss": 0.0073, + "step": 5212 + }, + { + "epoch": 3.1718892607240643, + "grad_norm": 0.3320218622684479, + "learning_rate": 5.509220542282864e-06, + "loss": 0.0076, + "step": 5213 + }, + { + "epoch": 3.17249771828415, + "grad_norm": 0.2814382314682007, + "learning_rate": 5.501427002116913e-06, + "loss": 0.0074, + "step": 5214 + }, + { + "epoch": 3.173106175844235, + "grad_norm": 0.29092541337013245, + "learning_rate": 5.4936382967843206e-06, + "loss": 0.0057, + "step": 5215 + }, + { + "epoch": 3.17371463340432, + "grad_norm": 0.1667691469192505, + "learning_rate": 5.485854428216375e-06, + "loss": 0.0039, + "step": 5216 + }, + { + "epoch": 3.174323090964405, + "grad_norm": 0.2939755618572235, + "learning_rate": 5.478075398343133e-06, + "loss": 0.0082, + "step": 5217 + }, + { + "epoch": 3.17493154852449, + "grad_norm": 0.21566243469715118, + "learning_rate": 5.470301209093478e-06, + "loss": 0.0053, + "step": 5218 + }, + { + "epoch": 3.1755400060845758, + "grad_norm": 0.26064324378967285, + "learning_rate": 5.462531862395087e-06, + "loss": 0.0046, + "step": 5219 + }, + { + "epoch": 3.176148463644661, + "grad_norm": 0.22674903273582458, + "learning_rate": 5.454767360174431e-06, + "loss": 0.0043, + "step": 5220 + }, + { + "epoch": 3.176756921204746, + "grad_norm": 0.2666117548942566, + "learning_rate": 5.447007704356791e-06, + "loss": 0.0051, + "step": 5221 + }, + { + "epoch": 3.177365378764831, + "grad_norm": 0.27889886498451233, + "learning_rate": 5.439252896866226e-06, + "loss": 0.0053, + "step": 5222 + }, + { + "epoch": 3.177973836324916, + "grad_norm": 0.273966908454895, + "learning_rate": 5.431502939625608e-06, + "loss": 0.0053, + "step": 5223 + }, + { + "epoch": 3.1785822938850017, + "grad_norm": 0.29631203413009644, + "learning_rate": 5.423757834556606e-06, + "loss": 0.0079, + "step": 5224 + }, + { + "epoch": 3.179190751445087, + "grad_norm": 0.3811946213245392, + "learning_rate": 5.416017583579686e-06, + "loss": 0.0081, + "step": 5225 + }, + { + "epoch": 3.179799209005172, + "grad_norm": 0.285285621881485, + "learning_rate": 5.408282188614103e-06, + "loss": 0.0083, + "step": 5226 + }, + { + "epoch": 3.180407666565257, + "grad_norm": 0.2812410593032837, + "learning_rate": 5.40055165157792e-06, + "loss": 0.0051, + "step": 5227 + }, + { + "epoch": 3.181016124125342, + "grad_norm": 0.3686572015285492, + "learning_rate": 5.392825974387977e-06, + "loss": 0.0078, + "step": 5228 + }, + { + "epoch": 3.1816245816854276, + "grad_norm": 0.34885480999946594, + "learning_rate": 5.385105158959924e-06, + "loss": 0.0081, + "step": 5229 + }, + { + "epoch": 3.1822330392455127, + "grad_norm": 0.2072272151708603, + "learning_rate": 5.377389207208203e-06, + "loss": 0.0027, + "step": 5230 + }, + { + "epoch": 3.182841496805598, + "grad_norm": 0.22844795882701874, + "learning_rate": 5.369678121046054e-06, + "loss": 0.0068, + "step": 5231 + }, + { + "epoch": 3.183449954365683, + "grad_norm": 0.3148641884326935, + "learning_rate": 5.361971902385485e-06, + "loss": 0.0111, + "step": 5232 + }, + { + "epoch": 3.184058411925768, + "grad_norm": 0.2549585700035095, + "learning_rate": 5.354270553137347e-06, + "loss": 0.008, + "step": 5233 + }, + { + "epoch": 3.1846668694858535, + "grad_norm": 0.26395851373672485, + "learning_rate": 5.346574075211228e-06, + "loss": 0.0077, + "step": 5234 + }, + { + "epoch": 3.1852753270459386, + "grad_norm": 0.24896180629730225, + "learning_rate": 5.338882470515544e-06, + "loss": 0.0076, + "step": 5235 + }, + { + "epoch": 3.1858837846060237, + "grad_norm": 0.240103617310524, + "learning_rate": 5.331195740957493e-06, + "loss": 0.0065, + "step": 5236 + }, + { + "epoch": 3.186492242166109, + "grad_norm": 0.1803087294101715, + "learning_rate": 5.3235138884430655e-06, + "loss": 0.0052, + "step": 5237 + }, + { + "epoch": 3.187100699726194, + "grad_norm": 0.25235310196876526, + "learning_rate": 5.315836914877026e-06, + "loss": 0.0046, + "step": 5238 + }, + { + "epoch": 3.1877091572862795, + "grad_norm": 0.2217434048652649, + "learning_rate": 5.30816482216297e-06, + "loss": 0.0073, + "step": 5239 + }, + { + "epoch": 3.1883176148463646, + "grad_norm": 0.1752319633960724, + "learning_rate": 5.300497612203231e-06, + "loss": 0.0029, + "step": 5240 + }, + { + "epoch": 3.1889260724064497, + "grad_norm": 0.2559622824192047, + "learning_rate": 5.292835286898973e-06, + "loss": 0.006, + "step": 5241 + }, + { + "epoch": 3.1895345299665347, + "grad_norm": 0.30886584520339966, + "learning_rate": 5.285177848150127e-06, + "loss": 0.0065, + "step": 5242 + }, + { + "epoch": 3.19014298752662, + "grad_norm": 0.32938501238822937, + "learning_rate": 5.277525297855427e-06, + "loss": 0.0087, + "step": 5243 + }, + { + "epoch": 3.1907514450867054, + "grad_norm": 0.30601125955581665, + "learning_rate": 5.269877637912374e-06, + "loss": 0.0065, + "step": 5244 + }, + { + "epoch": 3.1913599026467905, + "grad_norm": 0.1772543340921402, + "learning_rate": 5.262234870217272e-06, + "loss": 0.0038, + "step": 5245 + }, + { + "epoch": 3.1919683602068756, + "grad_norm": 0.31185612082481384, + "learning_rate": 5.254596996665215e-06, + "loss": 0.0116, + "step": 5246 + }, + { + "epoch": 3.1925768177669607, + "grad_norm": 0.2804357409477234, + "learning_rate": 5.246964019150077e-06, + "loss": 0.0092, + "step": 5247 + }, + { + "epoch": 3.1931852753270458, + "grad_norm": 0.31903064250946045, + "learning_rate": 5.239335939564505e-06, + "loss": 0.0092, + "step": 5248 + }, + { + "epoch": 3.1937937328871313, + "grad_norm": 0.27632802724838257, + "learning_rate": 5.231712759799967e-06, + "loss": 0.0074, + "step": 5249 + }, + { + "epoch": 3.1944021904472164, + "grad_norm": 0.15155087411403656, + "learning_rate": 5.224094481746675e-06, + "loss": 0.0031, + "step": 5250 + }, + { + "epoch": 3.1950106480073015, + "grad_norm": 0.322252094745636, + "learning_rate": 5.216481107293653e-06, + "loss": 0.0045, + "step": 5251 + }, + { + "epoch": 3.1956191055673866, + "grad_norm": 0.12668286263942719, + "learning_rate": 5.2088726383286965e-06, + "loss": 0.0028, + "step": 5252 + }, + { + "epoch": 3.1962275631274717, + "grad_norm": 0.3014906048774719, + "learning_rate": 5.2012690767383964e-06, + "loss": 0.008, + "step": 5253 + }, + { + "epoch": 3.1968360206875572, + "grad_norm": 0.21465936303138733, + "learning_rate": 5.193670424408109e-06, + "loss": 0.0053, + "step": 5254 + }, + { + "epoch": 3.1974444782476423, + "grad_norm": 0.24435143172740936, + "learning_rate": 5.186076683221988e-06, + "loss": 0.0062, + "step": 5255 + }, + { + "epoch": 3.1980529358077274, + "grad_norm": 0.28407108783721924, + "learning_rate": 5.1784878550629635e-06, + "loss": 0.0073, + "step": 5256 + }, + { + "epoch": 3.1986613933678125, + "grad_norm": 0.3360306918621063, + "learning_rate": 5.1709039418127575e-06, + "loss": 0.0102, + "step": 5257 + }, + { + "epoch": 3.1992698509278976, + "grad_norm": 0.15760943293571472, + "learning_rate": 5.163324945351841e-06, + "loss": 0.0029, + "step": 5258 + }, + { + "epoch": 3.199878308487983, + "grad_norm": 0.31353893876075745, + "learning_rate": 5.15575086755952e-06, + "loss": 0.0061, + "step": 5259 + }, + { + "epoch": 3.2004867660480683, + "grad_norm": 0.2500174641609192, + "learning_rate": 5.148181710313827e-06, + "loss": 0.0055, + "step": 5260 + }, + { + "epoch": 3.2010952236081534, + "grad_norm": 0.1809082180261612, + "learning_rate": 5.140617475491605e-06, + "loss": 0.0049, + "step": 5261 + }, + { + "epoch": 3.2017036811682384, + "grad_norm": 0.1834014654159546, + "learning_rate": 5.1330581649684715e-06, + "loss": 0.0044, + "step": 5262 + }, + { + "epoch": 3.2023121387283235, + "grad_norm": 0.3861825466156006, + "learning_rate": 5.125503780618824e-06, + "loss": 0.01, + "step": 5263 + }, + { + "epoch": 3.202920596288409, + "grad_norm": 0.33081695437431335, + "learning_rate": 5.117954324315813e-06, + "loss": 0.0065, + "step": 5264 + }, + { + "epoch": 3.203529053848494, + "grad_norm": 0.1698094755411148, + "learning_rate": 5.11040979793142e-06, + "loss": 0.0031, + "step": 5265 + }, + { + "epoch": 3.2041375114085793, + "grad_norm": 0.2472512573003769, + "learning_rate": 5.102870203336352e-06, + "loss": 0.0062, + "step": 5266 + }, + { + "epoch": 3.2047459689686644, + "grad_norm": 0.19104048609733582, + "learning_rate": 5.095335542400129e-06, + "loss": 0.0045, + "step": 5267 + }, + { + "epoch": 3.2053544265287495, + "grad_norm": 0.23127706348896027, + "learning_rate": 5.087805816991006e-06, + "loss": 0.0056, + "step": 5268 + }, + { + "epoch": 3.205962884088835, + "grad_norm": 0.28482139110565186, + "learning_rate": 5.080281028976078e-06, + "loss": 0.0062, + "step": 5269 + }, + { + "epoch": 3.20657134164892, + "grad_norm": 0.3393114507198334, + "learning_rate": 5.07276118022115e-06, + "loss": 0.0099, + "step": 5270 + }, + { + "epoch": 3.207179799209005, + "grad_norm": 0.25069156289100647, + "learning_rate": 5.06524627259084e-06, + "loss": 0.0043, + "step": 5271 + }, + { + "epoch": 3.2077882567690903, + "grad_norm": 0.28203946352005005, + "learning_rate": 5.057736307948535e-06, + "loss": 0.0068, + "step": 5272 + }, + { + "epoch": 3.2083967143291754, + "grad_norm": 0.2758210301399231, + "learning_rate": 5.050231288156398e-06, + "loss": 0.0066, + "step": 5273 + }, + { + "epoch": 3.2090051718892605, + "grad_norm": 0.2686399519443512, + "learning_rate": 5.042731215075341e-06, + "loss": 0.0049, + "step": 5274 + }, + { + "epoch": 3.209613629449346, + "grad_norm": 0.25653237104415894, + "learning_rate": 5.035236090565093e-06, + "loss": 0.0062, + "step": 5275 + }, + { + "epoch": 3.210222087009431, + "grad_norm": 0.21267692744731903, + "learning_rate": 5.027745916484119e-06, + "loss": 0.0056, + "step": 5276 + }, + { + "epoch": 3.210830544569516, + "grad_norm": 0.3413097858428955, + "learning_rate": 5.02026069468968e-06, + "loss": 0.0055, + "step": 5277 + }, + { + "epoch": 3.2114390021296013, + "grad_norm": 0.4562237560749054, + "learning_rate": 5.012780427037775e-06, + "loss": 0.0205, + "step": 5278 + }, + { + "epoch": 3.2120474596896864, + "grad_norm": 0.28480085730552673, + "learning_rate": 5.005305115383233e-06, + "loss": 0.0078, + "step": 5279 + }, + { + "epoch": 3.212655917249772, + "grad_norm": 0.24477575719356537, + "learning_rate": 4.997834761579595e-06, + "loss": 0.0043, + "step": 5280 + }, + { + "epoch": 3.213264374809857, + "grad_norm": 0.27294448018074036, + "learning_rate": 4.990369367479203e-06, + "loss": 0.0061, + "step": 5281 + }, + { + "epoch": 3.213872832369942, + "grad_norm": 0.21413196623325348, + "learning_rate": 4.982908934933167e-06, + "loss": 0.0047, + "step": 5282 + }, + { + "epoch": 3.2144812899300272, + "grad_norm": 0.31827402114868164, + "learning_rate": 4.975453465791366e-06, + "loss": 0.0074, + "step": 5283 + }, + { + "epoch": 3.2150897474901123, + "grad_norm": 0.27179375290870667, + "learning_rate": 4.9680029619024295e-06, + "loss": 0.0051, + "step": 5284 + }, + { + "epoch": 3.215698205050198, + "grad_norm": 0.16175347566604614, + "learning_rate": 4.9605574251137985e-06, + "loss": 0.0041, + "step": 5285 + }, + { + "epoch": 3.216306662610283, + "grad_norm": 0.3044746220111847, + "learning_rate": 4.953116857271634e-06, + "loss": 0.0106, + "step": 5286 + }, + { + "epoch": 3.216915120170368, + "grad_norm": 0.263638436794281, + "learning_rate": 4.945681260220891e-06, + "loss": 0.0108, + "step": 5287 + }, + { + "epoch": 3.217523577730453, + "grad_norm": 0.26071223616600037, + "learning_rate": 4.9382506358052916e-06, + "loss": 0.0084, + "step": 5288 + }, + { + "epoch": 3.2181320352905383, + "grad_norm": 0.20165440440177917, + "learning_rate": 4.930824985867328e-06, + "loss": 0.0038, + "step": 5289 + }, + { + "epoch": 3.218740492850624, + "grad_norm": 0.29097187519073486, + "learning_rate": 4.923404312248234e-06, + "loss": 0.0078, + "step": 5290 + }, + { + "epoch": 3.219348950410709, + "grad_norm": 0.21615852415561676, + "learning_rate": 4.915988616788039e-06, + "loss": 0.0036, + "step": 5291 + }, + { + "epoch": 3.219957407970794, + "grad_norm": 0.2644842565059662, + "learning_rate": 4.9085779013255225e-06, + "loss": 0.0066, + "step": 5292 + }, + { + "epoch": 3.220565865530879, + "grad_norm": 0.330758273601532, + "learning_rate": 4.901172167698242e-06, + "loss": 0.0079, + "step": 5293 + }, + { + "epoch": 3.221174323090964, + "grad_norm": 0.2616957128047943, + "learning_rate": 4.89377141774249e-06, + "loss": 0.0041, + "step": 5294 + }, + { + "epoch": 3.2217827806510497, + "grad_norm": 0.38702672719955444, + "learning_rate": 4.886375653293371e-06, + "loss": 0.0084, + "step": 5295 + }, + { + "epoch": 3.222391238211135, + "grad_norm": 0.3087538778781891, + "learning_rate": 4.878984876184706e-06, + "loss": 0.0064, + "step": 5296 + }, + { + "epoch": 3.22299969577122, + "grad_norm": 0.2493748515844345, + "learning_rate": 4.871599088249107e-06, + "loss": 0.0052, + "step": 5297 + }, + { + "epoch": 3.223608153331305, + "grad_norm": 0.3203872740268707, + "learning_rate": 4.86421829131794e-06, + "loss": 0.0077, + "step": 5298 + }, + { + "epoch": 3.22421661089139, + "grad_norm": 0.21354947984218597, + "learning_rate": 4.856842487221344e-06, + "loss": 0.0052, + "step": 5299 + }, + { + "epoch": 3.2248250684514757, + "grad_norm": 0.41666483879089355, + "learning_rate": 4.849471677788195e-06, + "loss": 0.0138, + "step": 5300 + }, + { + "epoch": 3.2254335260115607, + "grad_norm": 0.1607670783996582, + "learning_rate": 4.842105864846155e-06, + "loss": 0.0021, + "step": 5301 + }, + { + "epoch": 3.226041983571646, + "grad_norm": 0.25365200638771057, + "learning_rate": 4.83474505022164e-06, + "loss": 0.008, + "step": 5302 + }, + { + "epoch": 3.226650441131731, + "grad_norm": 0.23420965671539307, + "learning_rate": 4.82738923573983e-06, + "loss": 0.0039, + "step": 5303 + }, + { + "epoch": 3.227258898691816, + "grad_norm": 0.24002118408679962, + "learning_rate": 4.820038423224638e-06, + "loss": 0.0039, + "step": 5304 + }, + { + "epoch": 3.2278673562519016, + "grad_norm": 0.17119701206684113, + "learning_rate": 4.812692614498787e-06, + "loss": 0.0022, + "step": 5305 + }, + { + "epoch": 3.2284758138119867, + "grad_norm": 0.16908611357212067, + "learning_rate": 4.805351811383716e-06, + "loss": 0.0036, + "step": 5306 + }, + { + "epoch": 3.2290842713720718, + "grad_norm": 0.24062961339950562, + "learning_rate": 4.798016015699638e-06, + "loss": 0.0045, + "step": 5307 + }, + { + "epoch": 3.229692728932157, + "grad_norm": 0.26717016100883484, + "learning_rate": 4.790685229265529e-06, + "loss": 0.0064, + "step": 5308 + }, + { + "epoch": 3.230301186492242, + "grad_norm": 0.30753692984580994, + "learning_rate": 4.783359453899125e-06, + "loss": 0.0083, + "step": 5309 + }, + { + "epoch": 3.2309096440523275, + "grad_norm": 0.2237798273563385, + "learning_rate": 4.776038691416892e-06, + "loss": 0.0078, + "step": 5310 + }, + { + "epoch": 3.2315181016124126, + "grad_norm": 0.24201908707618713, + "learning_rate": 4.768722943634099e-06, + "loss": 0.0034, + "step": 5311 + }, + { + "epoch": 3.2321265591724977, + "grad_norm": 0.14591459929943085, + "learning_rate": 4.7614122123647295e-06, + "loss": 0.002, + "step": 5312 + }, + { + "epoch": 3.232735016732583, + "grad_norm": 0.1915680468082428, + "learning_rate": 4.754106499421545e-06, + "loss": 0.0033, + "step": 5313 + }, + { + "epoch": 3.233343474292668, + "grad_norm": 0.24156653881072998, + "learning_rate": 4.746805806616059e-06, + "loss": 0.0066, + "step": 5314 + }, + { + "epoch": 3.2339519318527534, + "grad_norm": 0.2683356702327728, + "learning_rate": 4.739510135758546e-06, + "loss": 0.0054, + "step": 5315 + }, + { + "epoch": 3.2345603894128385, + "grad_norm": 0.29896220564842224, + "learning_rate": 4.7322194886580105e-06, + "loss": 0.0082, + "step": 5316 + }, + { + "epoch": 3.2351688469729236, + "grad_norm": 0.2597740590572357, + "learning_rate": 4.724933867122242e-06, + "loss": 0.0076, + "step": 5317 + }, + { + "epoch": 3.2357773045330087, + "grad_norm": 0.17904557287693024, + "learning_rate": 4.717653272957767e-06, + "loss": 0.0035, + "step": 5318 + }, + { + "epoch": 3.236385762093094, + "grad_norm": 0.26095980405807495, + "learning_rate": 4.710377707969876e-06, + "loss": 0.0066, + "step": 5319 + }, + { + "epoch": 3.2369942196531793, + "grad_norm": 0.14280088245868683, + "learning_rate": 4.703107173962587e-06, + "loss": 0.0027, + "step": 5320 + }, + { + "epoch": 3.2376026772132644, + "grad_norm": 0.2902020812034607, + "learning_rate": 4.695841672738718e-06, + "loss": 0.0059, + "step": 5321 + }, + { + "epoch": 3.2382111347733495, + "grad_norm": 0.34895089268684387, + "learning_rate": 4.688581206099787e-06, + "loss": 0.009, + "step": 5322 + }, + { + "epoch": 3.2388195923334346, + "grad_norm": 0.21605385839939117, + "learning_rate": 4.681325775846096e-06, + "loss": 0.0052, + "step": 5323 + }, + { + "epoch": 3.2394280498935197, + "grad_norm": 0.21634212136268616, + "learning_rate": 4.674075383776689e-06, + "loss": 0.0058, + "step": 5324 + }, + { + "epoch": 3.2400365074536053, + "grad_norm": 0.25203219056129456, + "learning_rate": 4.666830031689365e-06, + "loss": 0.0048, + "step": 5325 + }, + { + "epoch": 3.2406449650136904, + "grad_norm": 0.26320701837539673, + "learning_rate": 4.659589721380661e-06, + "loss": 0.0049, + "step": 5326 + }, + { + "epoch": 3.2412534225737755, + "grad_norm": 0.268195241689682, + "learning_rate": 4.652354454645874e-06, + "loss": 0.0058, + "step": 5327 + }, + { + "epoch": 3.2418618801338606, + "grad_norm": 0.2631436288356781, + "learning_rate": 4.64512423327905e-06, + "loss": 0.0049, + "step": 5328 + }, + { + "epoch": 3.2424703376939457, + "grad_norm": 0.3505512773990631, + "learning_rate": 4.637899059072984e-06, + "loss": 0.0039, + "step": 5329 + }, + { + "epoch": 3.243078795254031, + "grad_norm": 0.2612294554710388, + "learning_rate": 4.630678933819218e-06, + "loss": 0.0065, + "step": 5330 + }, + { + "epoch": 3.2436872528141163, + "grad_norm": 0.2038453072309494, + "learning_rate": 4.623463859308047e-06, + "loss": 0.0045, + "step": 5331 + }, + { + "epoch": 3.2442957103742014, + "grad_norm": 0.26115256547927856, + "learning_rate": 4.616253837328497e-06, + "loss": 0.0074, + "step": 5332 + }, + { + "epoch": 3.2449041679342865, + "grad_norm": 0.31222108006477356, + "learning_rate": 4.6090488696683585e-06, + "loss": 0.008, + "step": 5333 + }, + { + "epoch": 3.2455126254943716, + "grad_norm": 0.2225833237171173, + "learning_rate": 4.601848958114164e-06, + "loss": 0.0064, + "step": 5334 + }, + { + "epoch": 3.246121083054457, + "grad_norm": 0.2462644726037979, + "learning_rate": 4.5946541044511905e-06, + "loss": 0.0049, + "step": 5335 + }, + { + "epoch": 3.246729540614542, + "grad_norm": 0.27759093046188354, + "learning_rate": 4.5874643104634685e-06, + "loss": 0.0099, + "step": 5336 + }, + { + "epoch": 3.2473379981746273, + "grad_norm": 0.24177397787570953, + "learning_rate": 4.580279577933755e-06, + "loss": 0.0059, + "step": 5337 + }, + { + "epoch": 3.2479464557347124, + "grad_norm": 0.2759784162044525, + "learning_rate": 4.573099908643572e-06, + "loss": 0.0062, + "step": 5338 + }, + { + "epoch": 3.2485549132947975, + "grad_norm": 0.400174617767334, + "learning_rate": 4.565925304373176e-06, + "loss": 0.0064, + "step": 5339 + }, + { + "epoch": 3.249163370854883, + "grad_norm": 0.3525528609752655, + "learning_rate": 4.558755766901568e-06, + "loss": 0.0097, + "step": 5340 + }, + { + "epoch": 3.249771828414968, + "grad_norm": 0.41769731044769287, + "learning_rate": 4.551591298006497e-06, + "loss": 0.0096, + "step": 5341 + }, + { + "epoch": 3.2503802859750532, + "grad_norm": 0.2166384756565094, + "learning_rate": 4.54443189946446e-06, + "loss": 0.0031, + "step": 5342 + }, + { + "epoch": 3.2509887435351383, + "grad_norm": 0.2720005214214325, + "learning_rate": 4.537277573050674e-06, + "loss": 0.0067, + "step": 5343 + }, + { + "epoch": 3.2515972010952234, + "grad_norm": 0.2838521897792816, + "learning_rate": 4.530128320539126e-06, + "loss": 0.01, + "step": 5344 + }, + { + "epoch": 3.252205658655309, + "grad_norm": 0.22926031053066254, + "learning_rate": 4.522984143702524e-06, + "loss": 0.0049, + "step": 5345 + }, + { + "epoch": 3.252814116215394, + "grad_norm": 0.19992421567440033, + "learning_rate": 4.515845044312331e-06, + "loss": 0.0041, + "step": 5346 + }, + { + "epoch": 3.253422573775479, + "grad_norm": 0.28706929087638855, + "learning_rate": 4.508711024138746e-06, + "loss": 0.0085, + "step": 5347 + }, + { + "epoch": 3.2540310313355643, + "grad_norm": 0.1090521439909935, + "learning_rate": 4.501582084950715e-06, + "loss": 0.0021, + "step": 5348 + }, + { + "epoch": 3.2546394888956494, + "grad_norm": 0.18653200566768646, + "learning_rate": 4.494458228515902e-06, + "loss": 0.0045, + "step": 5349 + }, + { + "epoch": 3.255247946455735, + "grad_norm": 0.2562299370765686, + "learning_rate": 4.487339456600736e-06, + "loss": 0.0074, + "step": 5350 + }, + { + "epoch": 3.25585640401582, + "grad_norm": 0.21422119438648224, + "learning_rate": 4.480225770970378e-06, + "loss": 0.0036, + "step": 5351 + }, + { + "epoch": 3.256464861575905, + "grad_norm": 0.16707159578800201, + "learning_rate": 4.4731171733887245e-06, + "loss": 0.0038, + "step": 5352 + }, + { + "epoch": 3.25707331913599, + "grad_norm": 0.27651339769363403, + "learning_rate": 4.466013665618407e-06, + "loss": 0.0049, + "step": 5353 + }, + { + "epoch": 3.2576817766960753, + "grad_norm": 0.18352994322776794, + "learning_rate": 4.458915249420798e-06, + "loss": 0.0034, + "step": 5354 + }, + { + "epoch": 3.258290234256161, + "grad_norm": 0.24244019389152527, + "learning_rate": 4.451821926556016e-06, + "loss": 0.006, + "step": 5355 + }, + { + "epoch": 3.258898691816246, + "grad_norm": 0.2059207558631897, + "learning_rate": 4.444733698782902e-06, + "loss": 0.0051, + "step": 5356 + }, + { + "epoch": 3.259507149376331, + "grad_norm": 0.32662034034729004, + "learning_rate": 4.437650567859047e-06, + "loss": 0.0062, + "step": 5357 + }, + { + "epoch": 3.260115606936416, + "grad_norm": 0.2572704553604126, + "learning_rate": 4.430572535540778e-06, + "loss": 0.0055, + "step": 5358 + }, + { + "epoch": 3.260724064496501, + "grad_norm": 0.15180638432502747, + "learning_rate": 4.423499603583137e-06, + "loss": 0.0031, + "step": 5359 + }, + { + "epoch": 3.2613325220565867, + "grad_norm": 0.20388486981391907, + "learning_rate": 4.416431773739924e-06, + "loss": 0.004, + "step": 5360 + }, + { + "epoch": 3.261940979616672, + "grad_norm": 0.25693532824516296, + "learning_rate": 4.409369047763664e-06, + "loss": 0.0053, + "step": 5361 + }, + { + "epoch": 3.262549437176757, + "grad_norm": 0.311781644821167, + "learning_rate": 4.402311427405628e-06, + "loss": 0.0063, + "step": 5362 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.23537062108516693, + "learning_rate": 4.39525891441579e-06, + "loss": 0.0053, + "step": 5363 + }, + { + "epoch": 3.263766352296927, + "grad_norm": 0.3553119897842407, + "learning_rate": 4.388211510542906e-06, + "loss": 0.0094, + "step": 5364 + }, + { + "epoch": 3.2643748098570127, + "grad_norm": 0.20130811631679535, + "learning_rate": 4.38116921753442e-06, + "loss": 0.0049, + "step": 5365 + }, + { + "epoch": 3.2649832674170978, + "grad_norm": 0.24931548535823822, + "learning_rate": 4.374132037136533e-06, + "loss": 0.004, + "step": 5366 + }, + { + "epoch": 3.265591724977183, + "grad_norm": 0.29390352964401245, + "learning_rate": 4.367099971094174e-06, + "loss": 0.008, + "step": 5367 + }, + { + "epoch": 3.266200182537268, + "grad_norm": 0.3790845274925232, + "learning_rate": 4.360073021151004e-06, + "loss": 0.0104, + "step": 5368 + }, + { + "epoch": 3.266808640097353, + "grad_norm": 0.2828552722930908, + "learning_rate": 4.353051189049398e-06, + "loss": 0.0077, + "step": 5369 + }, + { + "epoch": 3.2674170976574386, + "grad_norm": 0.13647259771823883, + "learning_rate": 4.346034476530503e-06, + "loss": 0.0035, + "step": 5370 + }, + { + "epoch": 3.2680255552175237, + "grad_norm": 0.28553807735443115, + "learning_rate": 4.339022885334154e-06, + "loss": 0.0087, + "step": 5371 + }, + { + "epoch": 3.268634012777609, + "grad_norm": 0.37478142976760864, + "learning_rate": 4.332016417198942e-06, + "loss": 0.0084, + "step": 5372 + }, + { + "epoch": 3.269242470337694, + "grad_norm": 0.2366034984588623, + "learning_rate": 4.32501507386216e-06, + "loss": 0.0061, + "step": 5373 + }, + { + "epoch": 3.269850927897779, + "grad_norm": 0.25971001386642456, + "learning_rate": 4.318018857059878e-06, + "loss": 0.0082, + "step": 5374 + }, + { + "epoch": 3.2704593854578645, + "grad_norm": 0.24216106534004211, + "learning_rate": 4.311027768526846e-06, + "loss": 0.0056, + "step": 5375 + }, + { + "epoch": 3.2710678430179496, + "grad_norm": 0.32058781385421753, + "learning_rate": 4.3040418099965674e-06, + "loss": 0.0085, + "step": 5376 + }, + { + "epoch": 3.2716763005780347, + "grad_norm": 0.25957608222961426, + "learning_rate": 4.297060983201273e-06, + "loss": 0.005, + "step": 5377 + }, + { + "epoch": 3.27228475813812, + "grad_norm": 0.2595154047012329, + "learning_rate": 4.290085289871917e-06, + "loss": 0.005, + "step": 5378 + }, + { + "epoch": 3.272893215698205, + "grad_norm": 0.25397470593452454, + "learning_rate": 4.283114731738166e-06, + "loss": 0.0034, + "step": 5379 + }, + { + "epoch": 3.2735016732582904, + "grad_norm": 0.2799084782600403, + "learning_rate": 4.276149310528452e-06, + "loss": 0.0068, + "step": 5380 + }, + { + "epoch": 3.2741101308183755, + "grad_norm": 0.27302873134613037, + "learning_rate": 4.269189027969889e-06, + "loss": 0.0046, + "step": 5381 + }, + { + "epoch": 3.2747185883784606, + "grad_norm": 0.27299267053604126, + "learning_rate": 4.26223388578835e-06, + "loss": 0.0083, + "step": 5382 + }, + { + "epoch": 3.2753270459385457, + "grad_norm": 0.3280734717845917, + "learning_rate": 4.255283885708403e-06, + "loss": 0.0097, + "step": 5383 + }, + { + "epoch": 3.275935503498631, + "grad_norm": 0.2109125405550003, + "learning_rate": 4.2483390294533825e-06, + "loss": 0.005, + "step": 5384 + }, + { + "epoch": 3.2765439610587164, + "grad_norm": 0.3015410304069519, + "learning_rate": 4.2413993187453035e-06, + "loss": 0.005, + "step": 5385 + }, + { + "epoch": 3.2771524186188015, + "grad_norm": 0.239542618393898, + "learning_rate": 4.234464755304934e-06, + "loss": 0.0064, + "step": 5386 + }, + { + "epoch": 3.2777608761788866, + "grad_norm": 0.3143784999847412, + "learning_rate": 4.227535340851754e-06, + "loss": 0.0056, + "step": 5387 + }, + { + "epoch": 3.2783693337389717, + "grad_norm": 0.3585420250892639, + "learning_rate": 4.220611077103978e-06, + "loss": 0.0104, + "step": 5388 + }, + { + "epoch": 3.2789777912990568, + "grad_norm": 0.3050104081630707, + "learning_rate": 4.213691965778516e-06, + "loss": 0.0094, + "step": 5389 + }, + { + "epoch": 3.2795862488591423, + "grad_norm": 0.22966350615024567, + "learning_rate": 4.206778008591039e-06, + "loss": 0.0047, + "step": 5390 + }, + { + "epoch": 3.2801947064192274, + "grad_norm": 0.2953983247280121, + "learning_rate": 4.199869207255907e-06, + "loss": 0.0051, + "step": 5391 + }, + { + "epoch": 3.2808031639793125, + "grad_norm": 0.22054730355739594, + "learning_rate": 4.192965563486221e-06, + "loss": 0.0047, + "step": 5392 + }, + { + "epoch": 3.2814116215393976, + "grad_norm": 0.21199162304401398, + "learning_rate": 4.186067078993794e-06, + "loss": 0.0051, + "step": 5393 + }, + { + "epoch": 3.2820200790994827, + "grad_norm": 0.184243842959404, + "learning_rate": 4.179173755489171e-06, + "loss": 0.0028, + "step": 5394 + }, + { + "epoch": 3.282628536659568, + "grad_norm": 0.15772341191768646, + "learning_rate": 4.172285594681594e-06, + "loss": 0.0034, + "step": 5395 + }, + { + "epoch": 3.2832369942196533, + "grad_norm": 0.2756187617778778, + "learning_rate": 4.165402598279048e-06, + "loss": 0.005, + "step": 5396 + }, + { + "epoch": 3.2838454517797384, + "grad_norm": 0.2757441997528076, + "learning_rate": 4.1585247679882315e-06, + "loss": 0.0049, + "step": 5397 + }, + { + "epoch": 3.2844539093398235, + "grad_norm": 0.2898223400115967, + "learning_rate": 4.151652105514559e-06, + "loss": 0.0065, + "step": 5398 + }, + { + "epoch": 3.2850623668999086, + "grad_norm": 1.003989815711975, + "learning_rate": 4.144784612562152e-06, + "loss": 0.0069, + "step": 5399 + }, + { + "epoch": 3.285670824459994, + "grad_norm": 0.230075865983963, + "learning_rate": 4.1379222908338844e-06, + "loss": 0.0046, + "step": 5400 + }, + { + "epoch": 3.2862792820200792, + "grad_norm": 0.24226723611354828, + "learning_rate": 4.13106514203131e-06, + "loss": 0.0061, + "step": 5401 + }, + { + "epoch": 3.2868877395801643, + "grad_norm": 0.2802073657512665, + "learning_rate": 4.124213167854721e-06, + "loss": 0.0073, + "step": 5402 + }, + { + "epoch": 3.2874961971402494, + "grad_norm": 0.2725411355495453, + "learning_rate": 4.1173663700031174e-06, + "loss": 0.0034, + "step": 5403 + }, + { + "epoch": 3.2881046547003345, + "grad_norm": 0.20064030587673187, + "learning_rate": 4.11052475017423e-06, + "loss": 0.0034, + "step": 5404 + }, + { + "epoch": 3.28871311226042, + "grad_norm": 0.2537406086921692, + "learning_rate": 4.103688310064485e-06, + "loss": 0.0069, + "step": 5405 + }, + { + "epoch": 3.289321569820505, + "grad_norm": 0.22151683270931244, + "learning_rate": 4.096857051369035e-06, + "loss": 0.006, + "step": 5406 + }, + { + "epoch": 3.2899300273805903, + "grad_norm": 0.14316684007644653, + "learning_rate": 4.090030975781753e-06, + "loss": 0.003, + "step": 5407 + }, + { + "epoch": 3.2905384849406754, + "grad_norm": 0.3060914874076843, + "learning_rate": 4.08321008499522e-06, + "loss": 0.0093, + "step": 5408 + }, + { + "epoch": 3.2911469425007605, + "grad_norm": 0.23609232902526855, + "learning_rate": 4.076394380700724e-06, + "loss": 0.0054, + "step": 5409 + }, + { + "epoch": 3.291755400060846, + "grad_norm": 0.34055647253990173, + "learning_rate": 4.069583864588291e-06, + "loss": 0.007, + "step": 5410 + }, + { + "epoch": 3.292363857620931, + "grad_norm": 0.20988178253173828, + "learning_rate": 4.062778538346634e-06, + "loss": 0.0051, + "step": 5411 + }, + { + "epoch": 3.292972315181016, + "grad_norm": 0.23799017071723938, + "learning_rate": 4.055978403663191e-06, + "loss": 0.0048, + "step": 5412 + }, + { + "epoch": 3.2935807727411013, + "grad_norm": 0.28818562626838684, + "learning_rate": 4.049183462224115e-06, + "loss": 0.0076, + "step": 5413 + }, + { + "epoch": 3.2941892303011864, + "grad_norm": 0.2099044919013977, + "learning_rate": 4.042393715714274e-06, + "loss": 0.0033, + "step": 5414 + }, + { + "epoch": 3.294797687861272, + "grad_norm": 0.27117833495140076, + "learning_rate": 4.0356091658172225e-06, + "loss": 0.0066, + "step": 5415 + }, + { + "epoch": 3.295406145421357, + "grad_norm": 0.3128974437713623, + "learning_rate": 4.028829814215271e-06, + "loss": 0.0082, + "step": 5416 + }, + { + "epoch": 3.296014602981442, + "grad_norm": 0.20908896625041962, + "learning_rate": 4.022055662589397e-06, + "loss": 0.0045, + "step": 5417 + }, + { + "epoch": 3.296623060541527, + "grad_norm": 0.2693098783493042, + "learning_rate": 4.01528671261932e-06, + "loss": 0.0045, + "step": 5418 + }, + { + "epoch": 3.2972315181016123, + "grad_norm": 0.25489383935928345, + "learning_rate": 4.008522965983444e-06, + "loss": 0.0049, + "step": 5419 + }, + { + "epoch": 3.297839975661698, + "grad_norm": 0.27955323457717896, + "learning_rate": 4.001764424358914e-06, + "loss": 0.0091, + "step": 5420 + }, + { + "epoch": 3.298448433221783, + "grad_norm": 0.1617559790611267, + "learning_rate": 3.995011089421552e-06, + "loss": 0.0024, + "step": 5421 + }, + { + "epoch": 3.299056890781868, + "grad_norm": 0.1988738626241684, + "learning_rate": 3.9882629628459095e-06, + "loss": 0.0048, + "step": 5422 + }, + { + "epoch": 3.299665348341953, + "grad_norm": 0.32020819187164307, + "learning_rate": 3.98152004630524e-06, + "loss": 0.0103, + "step": 5423 + }, + { + "epoch": 3.3002738059020382, + "grad_norm": 0.2752763628959656, + "learning_rate": 3.974782341471508e-06, + "loss": 0.0072, + "step": 5424 + }, + { + "epoch": 3.3008822634621233, + "grad_norm": 0.177403524518013, + "learning_rate": 3.968049850015374e-06, + "loss": 0.0036, + "step": 5425 + }, + { + "epoch": 3.301490721022209, + "grad_norm": 0.2594372630119324, + "learning_rate": 3.96132257360623e-06, + "loss": 0.0054, + "step": 5426 + }, + { + "epoch": 3.302099178582294, + "grad_norm": 0.24807630479335785, + "learning_rate": 3.954600513912149e-06, + "loss": 0.0048, + "step": 5427 + }, + { + "epoch": 3.302707636142379, + "grad_norm": 0.24896809458732605, + "learning_rate": 3.9478836725999306e-06, + "loss": 0.0062, + "step": 5428 + }, + { + "epoch": 3.303316093702464, + "grad_norm": 0.28816282749176025, + "learning_rate": 3.941172051335054e-06, + "loss": 0.0076, + "step": 5429 + }, + { + "epoch": 3.3039245512625492, + "grad_norm": 0.27258211374282837, + "learning_rate": 3.934465651781746e-06, + "loss": 0.0078, + "step": 5430 + }, + { + "epoch": 3.304533008822635, + "grad_norm": 0.2567967176437378, + "learning_rate": 3.927764475602893e-06, + "loss": 0.0085, + "step": 5431 + }, + { + "epoch": 3.30514146638272, + "grad_norm": 0.2778639793395996, + "learning_rate": 3.9210685244601195e-06, + "loss": 0.0084, + "step": 5432 + }, + { + "epoch": 3.305749923942805, + "grad_norm": 0.2634139060974121, + "learning_rate": 3.914377800013738e-06, + "loss": 0.0034, + "step": 5433 + }, + { + "epoch": 3.30635838150289, + "grad_norm": 0.3199003338813782, + "learning_rate": 3.907692303922775e-06, + "loss": 0.0084, + "step": 5434 + }, + { + "epoch": 3.306966839062975, + "grad_norm": 0.22238844633102417, + "learning_rate": 3.9010120378449365e-06, + "loss": 0.004, + "step": 5435 + }, + { + "epoch": 3.3075752966230607, + "grad_norm": 0.23715293407440186, + "learning_rate": 3.894337003436679e-06, + "loss": 0.0041, + "step": 5436 + }, + { + "epoch": 3.308183754183146, + "grad_norm": 0.2940605580806732, + "learning_rate": 3.88766720235311e-06, + "loss": 0.007, + "step": 5437 + }, + { + "epoch": 3.308792211743231, + "grad_norm": 0.3147020637989044, + "learning_rate": 3.8810026362480684e-06, + "loss": 0.0059, + "step": 5438 + }, + { + "epoch": 3.309400669303316, + "grad_norm": 0.19217897951602936, + "learning_rate": 3.8743433067740895e-06, + "loss": 0.0067, + "step": 5439 + }, + { + "epoch": 3.310009126863401, + "grad_norm": 0.36218491196632385, + "learning_rate": 3.867689215582415e-06, + "loss": 0.0147, + "step": 5440 + }, + { + "epoch": 3.3106175844234866, + "grad_norm": 0.23334696888923645, + "learning_rate": 3.861040364322974e-06, + "loss": 0.0067, + "step": 5441 + }, + { + "epoch": 3.3112260419835717, + "grad_norm": 0.3586207926273346, + "learning_rate": 3.854396754644405e-06, + "loss": 0.0062, + "step": 5442 + }, + { + "epoch": 3.311834499543657, + "grad_norm": 0.3108196556568146, + "learning_rate": 3.84775838819405e-06, + "loss": 0.0105, + "step": 5443 + }, + { + "epoch": 3.312442957103742, + "grad_norm": 0.21571455895900726, + "learning_rate": 3.841125266617946e-06, + "loss": 0.0057, + "step": 5444 + }, + { + "epoch": 3.313051414663827, + "grad_norm": 0.2929924726486206, + "learning_rate": 3.834497391560829e-06, + "loss": 0.0084, + "step": 5445 + }, + { + "epoch": 3.3136598722239126, + "grad_norm": 0.1458842009305954, + "learning_rate": 3.827874764666145e-06, + "loss": 0.0021, + "step": 5446 + }, + { + "epoch": 3.3142683297839977, + "grad_norm": 0.25384876132011414, + "learning_rate": 3.821257387576014e-06, + "loss": 0.0074, + "step": 5447 + }, + { + "epoch": 3.3148767873440828, + "grad_norm": 0.29817652702331543, + "learning_rate": 3.8146452619312768e-06, + "loss": 0.0078, + "step": 5448 + }, + { + "epoch": 3.315485244904168, + "grad_norm": 0.2839571535587311, + "learning_rate": 3.8080383893714684e-06, + "loss": 0.006, + "step": 5449 + }, + { + "epoch": 3.316093702464253, + "grad_norm": 0.3559333384037018, + "learning_rate": 3.801436771534814e-06, + "loss": 0.0114, + "step": 5450 + }, + { + "epoch": 3.3167021600243385, + "grad_norm": 0.46538999676704407, + "learning_rate": 3.7948404100582453e-06, + "loss": 0.0078, + "step": 5451 + }, + { + "epoch": 3.3173106175844236, + "grad_norm": 0.41122740507125854, + "learning_rate": 3.788249306577374e-06, + "loss": 0.0127, + "step": 5452 + }, + { + "epoch": 3.3179190751445087, + "grad_norm": 0.22749930620193481, + "learning_rate": 3.7816634627265285e-06, + "loss": 0.0075, + "step": 5453 + }, + { + "epoch": 3.3185275327045938, + "grad_norm": 0.257730633020401, + "learning_rate": 3.7750828801387167e-06, + "loss": 0.0062, + "step": 5454 + }, + { + "epoch": 3.319135990264679, + "grad_norm": 0.27139922976493835, + "learning_rate": 3.7685075604456553e-06, + "loss": 0.0042, + "step": 5455 + }, + { + "epoch": 3.3197444478247644, + "grad_norm": 0.23728352785110474, + "learning_rate": 3.7619375052777518e-06, + "loss": 0.0036, + "step": 5456 + }, + { + "epoch": 3.3203529053848495, + "grad_norm": 0.35973915457725525, + "learning_rate": 3.755372716264094e-06, + "loss": 0.0105, + "step": 5457 + }, + { + "epoch": 3.3209613629449346, + "grad_norm": 0.15002618730068207, + "learning_rate": 3.748813195032483e-06, + "loss": 0.0033, + "step": 5458 + }, + { + "epoch": 3.3215698205050197, + "grad_norm": 0.14451484382152557, + "learning_rate": 3.7422589432094064e-06, + "loss": 0.0016, + "step": 5459 + }, + { + "epoch": 3.322178278065105, + "grad_norm": 0.19004569947719574, + "learning_rate": 3.7357099624200475e-06, + "loss": 0.0041, + "step": 5460 + }, + { + "epoch": 3.3227867356251903, + "grad_norm": 0.27472901344299316, + "learning_rate": 3.729166254288277e-06, + "loss": 0.006, + "step": 5461 + }, + { + "epoch": 3.3233951931852754, + "grad_norm": 0.19888129830360413, + "learning_rate": 3.7226278204366695e-06, + "loss": 0.0047, + "step": 5462 + }, + { + "epoch": 3.3240036507453605, + "grad_norm": 0.267270565032959, + "learning_rate": 3.7160946624864716e-06, + "loss": 0.0077, + "step": 5463 + }, + { + "epoch": 3.3246121083054456, + "grad_norm": 0.24350838363170624, + "learning_rate": 3.7095667820576415e-06, + "loss": 0.0058, + "step": 5464 + }, + { + "epoch": 3.3252205658655307, + "grad_norm": 0.28811752796173096, + "learning_rate": 3.7030441807688193e-06, + "loss": 0.007, + "step": 5465 + }, + { + "epoch": 3.325829023425616, + "grad_norm": 0.1940048485994339, + "learning_rate": 3.69652686023734e-06, + "loss": 0.0038, + "step": 5466 + }, + { + "epoch": 3.3264374809857014, + "grad_norm": 0.2174157351255417, + "learning_rate": 3.6900148220792323e-06, + "loss": 0.0034, + "step": 5467 + }, + { + "epoch": 3.3270459385457865, + "grad_norm": 0.18492846190929413, + "learning_rate": 3.6835080679091998e-06, + "loss": 0.0027, + "step": 5468 + }, + { + "epoch": 3.3276543961058715, + "grad_norm": 0.30942097306251526, + "learning_rate": 3.6770065993406517e-06, + "loss": 0.0077, + "step": 5469 + }, + { + "epoch": 3.3282628536659566, + "grad_norm": 0.3265366852283478, + "learning_rate": 3.6705104179856785e-06, + "loss": 0.0071, + "step": 5470 + }, + { + "epoch": 3.3288713112260417, + "grad_norm": 0.2290954440832138, + "learning_rate": 3.6640195254550676e-06, + "loss": 0.0061, + "step": 5471 + }, + { + "epoch": 3.3294797687861273, + "grad_norm": 0.21861323714256287, + "learning_rate": 3.657533923358286e-06, + "loss": 0.0048, + "step": 5472 + }, + { + "epoch": 3.3300882263462124, + "grad_norm": 0.25013089179992676, + "learning_rate": 3.6510536133034985e-06, + "loss": 0.0059, + "step": 5473 + }, + { + "epoch": 3.3306966839062975, + "grad_norm": 0.26969480514526367, + "learning_rate": 3.644578596897541e-06, + "loss": 0.0078, + "step": 5474 + }, + { + "epoch": 3.3313051414663826, + "grad_norm": 0.23772546648979187, + "learning_rate": 3.638108875745955e-06, + "loss": 0.0055, + "step": 5475 + }, + { + "epoch": 3.3319135990264677, + "grad_norm": 0.245829239487648, + "learning_rate": 3.631644451452959e-06, + "loss": 0.007, + "step": 5476 + }, + { + "epoch": 3.332522056586553, + "grad_norm": 0.23014789819717407, + "learning_rate": 3.625185325621469e-06, + "loss": 0.0052, + "step": 5477 + }, + { + "epoch": 3.3331305141466383, + "grad_norm": 0.2805229425430298, + "learning_rate": 3.618731499853059e-06, + "loss": 0.0076, + "step": 5478 + }, + { + "epoch": 3.3337389717067234, + "grad_norm": 0.23805426061153412, + "learning_rate": 3.6122829757480354e-06, + "loss": 0.0048, + "step": 5479 + }, + { + "epoch": 3.3343474292668085, + "grad_norm": 0.3036494553089142, + "learning_rate": 3.605839754905341e-06, + "loss": 0.0084, + "step": 5480 + }, + { + "epoch": 3.3349558868268936, + "grad_norm": 0.2554338872432709, + "learning_rate": 3.5994018389226365e-06, + "loss": 0.0045, + "step": 5481 + }, + { + "epoch": 3.335564344386979, + "grad_norm": 0.26599669456481934, + "learning_rate": 3.5929692293962562e-06, + "loss": 0.0069, + "step": 5482 + }, + { + "epoch": 3.3361728019470642, + "grad_norm": 0.1806708723306656, + "learning_rate": 3.586541927921222e-06, + "loss": 0.003, + "step": 5483 + }, + { + "epoch": 3.3367812595071493, + "grad_norm": 0.21414224803447723, + "learning_rate": 3.5801199360912226e-06, + "loss": 0.0046, + "step": 5484 + }, + { + "epoch": 3.3373897170672344, + "grad_norm": 0.23333413898944855, + "learning_rate": 3.573703255498667e-06, + "loss": 0.0036, + "step": 5485 + }, + { + "epoch": 3.3379981746273195, + "grad_norm": 0.19560351967811584, + "learning_rate": 3.567291887734603e-06, + "loss": 0.0052, + "step": 5486 + }, + { + "epoch": 3.338606632187405, + "grad_norm": 0.48690447211265564, + "learning_rate": 3.5608858343887997e-06, + "loss": 0.0215, + "step": 5487 + }, + { + "epoch": 3.33921508974749, + "grad_norm": 0.27765342593193054, + "learning_rate": 3.55448509704967e-06, + "loss": 0.0068, + "step": 5488 + }, + { + "epoch": 3.3398235473075752, + "grad_norm": 0.2087118774652481, + "learning_rate": 3.548089677304356e-06, + "loss": 0.0043, + "step": 5489 + }, + { + "epoch": 3.3404320048676603, + "grad_norm": 0.3580057919025421, + "learning_rate": 3.5416995767386357e-06, + "loss": 0.0066, + "step": 5490 + }, + { + "epoch": 3.3410404624277454, + "grad_norm": 0.2551713287830353, + "learning_rate": 3.5353147969369948e-06, + "loss": 0.0061, + "step": 5491 + }, + { + "epoch": 3.341648919987831, + "grad_norm": 0.26948314905166626, + "learning_rate": 3.5289353394825947e-06, + "loss": 0.0082, + "step": 5492 + }, + { + "epoch": 3.342257377547916, + "grad_norm": 0.438873827457428, + "learning_rate": 3.5225612059572727e-06, + "loss": 0.0112, + "step": 5493 + }, + { + "epoch": 3.342865835108001, + "grad_norm": 0.18292073905467987, + "learning_rate": 3.5161923979415395e-06, + "loss": 0.0039, + "step": 5494 + }, + { + "epoch": 3.3434742926680863, + "grad_norm": 0.2990265488624573, + "learning_rate": 3.509828917014615e-06, + "loss": 0.0097, + "step": 5495 + }, + { + "epoch": 3.3440827502281714, + "grad_norm": 0.23416587710380554, + "learning_rate": 3.5034707647543576e-06, + "loss": 0.0061, + "step": 5496 + }, + { + "epoch": 3.344691207788257, + "grad_norm": 0.28342893719673157, + "learning_rate": 3.4971179427373295e-06, + "loss": 0.0072, + "step": 5497 + }, + { + "epoch": 3.345299665348342, + "grad_norm": 0.22702716290950775, + "learning_rate": 3.490770452538769e-06, + "loss": 0.0053, + "step": 5498 + }, + { + "epoch": 3.345908122908427, + "grad_norm": 0.24197997152805328, + "learning_rate": 3.484428295732592e-06, + "loss": 0.0071, + "step": 5499 + }, + { + "epoch": 3.346516580468512, + "grad_norm": 0.16110782325267792, + "learning_rate": 3.4780914738913816e-06, + "loss": 0.0035, + "step": 5500 + }, + { + "epoch": 3.3471250380285973, + "grad_norm": 0.3154269754886627, + "learning_rate": 3.471759988586404e-06, + "loss": 0.0057, + "step": 5501 + }, + { + "epoch": 3.347733495588683, + "grad_norm": 0.24822941422462463, + "learning_rate": 3.465433841387611e-06, + "loss": 0.0036, + "step": 5502 + }, + { + "epoch": 3.348341953148768, + "grad_norm": 0.26176613569259644, + "learning_rate": 3.4591130338636257e-06, + "loss": 0.0049, + "step": 5503 + }, + { + "epoch": 3.348950410708853, + "grad_norm": 0.31120842695236206, + "learning_rate": 3.4527975675817282e-06, + "loss": 0.0095, + "step": 5504 + }, + { + "epoch": 3.349558868268938, + "grad_norm": 0.20220082998275757, + "learning_rate": 3.4464874441079126e-06, + "loss": 0.005, + "step": 5505 + }, + { + "epoch": 3.350167325829023, + "grad_norm": 0.29147812724113464, + "learning_rate": 3.440182665006811e-06, + "loss": 0.0047, + "step": 5506 + }, + { + "epoch": 3.3507757833891088, + "grad_norm": 0.3253108561038971, + "learning_rate": 3.43388323184175e-06, + "loss": 0.01, + "step": 5507 + }, + { + "epoch": 3.351384240949194, + "grad_norm": 0.3161683976650238, + "learning_rate": 3.4275891461747283e-06, + "loss": 0.007, + "step": 5508 + }, + { + "epoch": 3.351992698509279, + "grad_norm": 0.3239922821521759, + "learning_rate": 3.421300409566422e-06, + "loss": 0.0054, + "step": 5509 + }, + { + "epoch": 3.352601156069364, + "grad_norm": 0.30600473284721375, + "learning_rate": 3.415017023576164e-06, + "loss": 0.0044, + "step": 5510 + }, + { + "epoch": 3.353209613629449, + "grad_norm": 0.2611445486545563, + "learning_rate": 3.408738989761978e-06, + "loss": 0.0062, + "step": 5511 + }, + { + "epoch": 3.3538180711895347, + "grad_norm": 0.1755179911851883, + "learning_rate": 3.4024663096805547e-06, + "loss": 0.0036, + "step": 5512 + }, + { + "epoch": 3.3544265287496198, + "grad_norm": 0.2571423649787903, + "learning_rate": 3.396198984887261e-06, + "loss": 0.0089, + "step": 5513 + }, + { + "epoch": 3.355034986309705, + "grad_norm": 0.23700016736984253, + "learning_rate": 3.389937016936118e-06, + "loss": 0.0037, + "step": 5514 + }, + { + "epoch": 3.35564344386979, + "grad_norm": 0.31217527389526367, + "learning_rate": 3.383680407379855e-06, + "loss": 0.0048, + "step": 5515 + }, + { + "epoch": 3.356251901429875, + "grad_norm": 0.2647400200366974, + "learning_rate": 3.377429157769832e-06, + "loss": 0.0099, + "step": 5516 + }, + { + "epoch": 3.3568603589899606, + "grad_norm": 0.199229896068573, + "learning_rate": 3.3711832696561056e-06, + "loss": 0.0034, + "step": 5517 + }, + { + "epoch": 3.3574688165500457, + "grad_norm": 0.24624145030975342, + "learning_rate": 3.3649427445873934e-06, + "loss": 0.0058, + "step": 5518 + }, + { + "epoch": 3.358077274110131, + "grad_norm": 0.21135321259498596, + "learning_rate": 3.3587075841110927e-06, + "loss": 0.0047, + "step": 5519 + }, + { + "epoch": 3.358685731670216, + "grad_norm": 0.31276053190231323, + "learning_rate": 3.3524777897732452e-06, + "loss": 0.0068, + "step": 5520 + }, + { + "epoch": 3.359294189230301, + "grad_norm": 0.20705646276474, + "learning_rate": 3.3462533631186066e-06, + "loss": 0.0041, + "step": 5521 + }, + { + "epoch": 3.3599026467903865, + "grad_norm": 0.373995840549469, + "learning_rate": 3.340034305690554e-06, + "loss": 0.0137, + "step": 5522 + }, + { + "epoch": 3.3605111043504716, + "grad_norm": 0.1533001810312271, + "learning_rate": 3.3338206190311667e-06, + "loss": 0.0022, + "step": 5523 + }, + { + "epoch": 3.3611195619105567, + "grad_norm": 0.3650040328502655, + "learning_rate": 3.327612304681166e-06, + "loss": 0.0084, + "step": 5524 + }, + { + "epoch": 3.361728019470642, + "grad_norm": 0.2500038146972656, + "learning_rate": 3.321409364179975e-06, + "loss": 0.0052, + "step": 5525 + }, + { + "epoch": 3.362336477030727, + "grad_norm": 0.23246996104717255, + "learning_rate": 3.3152117990656456e-06, + "loss": 0.0054, + "step": 5526 + }, + { + "epoch": 3.3629449345908125, + "grad_norm": 0.21729208528995514, + "learning_rate": 3.309019610874925e-06, + "loss": 0.0031, + "step": 5527 + }, + { + "epoch": 3.3635533921508975, + "grad_norm": 0.29215723276138306, + "learning_rate": 3.3028328011432157e-06, + "loss": 0.0071, + "step": 5528 + }, + { + "epoch": 3.3641618497109826, + "grad_norm": 0.21962468326091766, + "learning_rate": 3.2966513714045967e-06, + "loss": 0.0037, + "step": 5529 + }, + { + "epoch": 3.3647703072710677, + "grad_norm": 0.24007774889469147, + "learning_rate": 3.2904753231917857e-06, + "loss": 0.0048, + "step": 5530 + }, + { + "epoch": 3.365378764831153, + "grad_norm": 0.2580210864543915, + "learning_rate": 3.284304658036208e-06, + "loss": 0.0049, + "step": 5531 + }, + { + "epoch": 3.3659872223912384, + "grad_norm": 0.1776011735200882, + "learning_rate": 3.278139377467912e-06, + "loss": 0.0027, + "step": 5532 + }, + { + "epoch": 3.3665956799513235, + "grad_norm": 0.22608719766139984, + "learning_rate": 3.271979483015647e-06, + "loss": 0.0057, + "step": 5533 + }, + { + "epoch": 3.3672041375114086, + "grad_norm": 0.29510682821273804, + "learning_rate": 3.265824976206791e-06, + "loss": 0.0056, + "step": 5534 + }, + { + "epoch": 3.3678125950714937, + "grad_norm": 0.2816297113895416, + "learning_rate": 3.2596758585674238e-06, + "loss": 0.0058, + "step": 5535 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.21519798040390015, + "learning_rate": 3.253532131622261e-06, + "loss": 0.0077, + "step": 5536 + }, + { + "epoch": 3.3690295101916643, + "grad_norm": 0.1678488701581955, + "learning_rate": 3.247393796894688e-06, + "loss": 0.003, + "step": 5537 + }, + { + "epoch": 3.3696379677517494, + "grad_norm": 0.26176977157592773, + "learning_rate": 3.2412608559067638e-06, + "loss": 0.0042, + "step": 5538 + }, + { + "epoch": 3.3702464253118345, + "grad_norm": 0.2218218296766281, + "learning_rate": 3.2351333101792005e-06, + "loss": 0.0036, + "step": 5539 + }, + { + "epoch": 3.3708548828719196, + "grad_norm": 0.2444075644016266, + "learning_rate": 3.229011161231363e-06, + "loss": 0.0051, + "step": 5540 + }, + { + "epoch": 3.3714633404320047, + "grad_norm": 0.18922793865203857, + "learning_rate": 3.2228944105813093e-06, + "loss": 0.0036, + "step": 5541 + }, + { + "epoch": 3.3720717979920902, + "grad_norm": 0.24314537644386292, + "learning_rate": 3.2167830597457205e-06, + "loss": 0.0046, + "step": 5542 + }, + { + "epoch": 3.3726802555521753, + "grad_norm": 0.12830045819282532, + "learning_rate": 3.2106771102399625e-06, + "loss": 0.0019, + "step": 5543 + }, + { + "epoch": 3.3732887131122604, + "grad_norm": 0.26987224817276, + "learning_rate": 3.20457656357806e-06, + "loss": 0.0062, + "step": 5544 + }, + { + "epoch": 3.3738971706723455, + "grad_norm": 0.21220239996910095, + "learning_rate": 3.198481421272698e-06, + "loss": 0.0038, + "step": 5545 + }, + { + "epoch": 3.3745056282324306, + "grad_norm": 0.2044731080532074, + "learning_rate": 3.1923916848352025e-06, + "loss": 0.0044, + "step": 5546 + }, + { + "epoch": 3.375114085792516, + "grad_norm": 0.3579948842525482, + "learning_rate": 3.186307355775586e-06, + "loss": 0.0061, + "step": 5547 + }, + { + "epoch": 3.3757225433526012, + "grad_norm": 0.6677601337432861, + "learning_rate": 3.180228435602503e-06, + "loss": 0.0061, + "step": 5548 + }, + { + "epoch": 3.3763310009126863, + "grad_norm": 0.21300187706947327, + "learning_rate": 3.1741549258232833e-06, + "loss": 0.005, + "step": 5549 + }, + { + "epoch": 3.3769394584727714, + "grad_norm": 0.24835187196731567, + "learning_rate": 3.1680868279438835e-06, + "loss": 0.0057, + "step": 5550 + }, + { + "epoch": 3.3775479160328565, + "grad_norm": 0.4124807119369507, + "learning_rate": 3.162024143468964e-06, + "loss": 0.0088, + "step": 5551 + }, + { + "epoch": 3.378156373592942, + "grad_norm": 0.2930738925933838, + "learning_rate": 3.1559668739017974e-06, + "loss": 0.0062, + "step": 5552 + }, + { + "epoch": 3.378764831153027, + "grad_norm": 0.18829555809497833, + "learning_rate": 3.149915020744343e-06, + "loss": 0.0042, + "step": 5553 + }, + { + "epoch": 3.3793732887131123, + "grad_norm": 0.24336494505405426, + "learning_rate": 3.143868585497206e-06, + "loss": 0.0039, + "step": 5554 + }, + { + "epoch": 3.3799817462731974, + "grad_norm": 0.2603733241558075, + "learning_rate": 3.1378275696596597e-06, + "loss": 0.0061, + "step": 5555 + }, + { + "epoch": 3.3805902038332825, + "grad_norm": 0.17050331830978394, + "learning_rate": 3.131791974729609e-06, + "loss": 0.0035, + "step": 5556 + }, + { + "epoch": 3.381198661393368, + "grad_norm": 0.263378381729126, + "learning_rate": 3.125761802203636e-06, + "loss": 0.0063, + "step": 5557 + }, + { + "epoch": 3.381807118953453, + "grad_norm": 0.3855268359184265, + "learning_rate": 3.119737053576971e-06, + "loss": 0.0063, + "step": 5558 + }, + { + "epoch": 3.382415576513538, + "grad_norm": 0.17356686294078827, + "learning_rate": 3.11371773034351e-06, + "loss": 0.0025, + "step": 5559 + }, + { + "epoch": 3.3830240340736233, + "grad_norm": 0.2114478200674057, + "learning_rate": 3.107703833995776e-06, + "loss": 0.0054, + "step": 5560 + }, + { + "epoch": 3.3836324916337084, + "grad_norm": 0.31965020298957825, + "learning_rate": 3.101695366024987e-06, + "loss": 0.0067, + "step": 5561 + }, + { + "epoch": 3.384240949193794, + "grad_norm": 0.2662268579006195, + "learning_rate": 3.095692327920974e-06, + "loss": 0.0069, + "step": 5562 + }, + { + "epoch": 3.384849406753879, + "grad_norm": 0.204771026968956, + "learning_rate": 3.0896947211722484e-06, + "loss": 0.004, + "step": 5563 + }, + { + "epoch": 3.385457864313964, + "grad_norm": 0.22940324246883392, + "learning_rate": 3.083702547265965e-06, + "loss": 0.0058, + "step": 5564 + }, + { + "epoch": 3.386066321874049, + "grad_norm": 0.2345285266637802, + "learning_rate": 3.077715807687939e-06, + "loss": 0.0132, + "step": 5565 + }, + { + "epoch": 3.3866747794341343, + "grad_norm": 1.2732864618301392, + "learning_rate": 3.071734503922616e-06, + "loss": 0.0262, + "step": 5566 + }, + { + "epoch": 3.38728323699422, + "grad_norm": 0.24260850250720978, + "learning_rate": 3.065758637453131e-06, + "loss": 0.0033, + "step": 5567 + }, + { + "epoch": 3.387891694554305, + "grad_norm": 0.28996798396110535, + "learning_rate": 3.059788209761233e-06, + "loss": 0.0037, + "step": 5568 + }, + { + "epoch": 3.38850015211439, + "grad_norm": 0.2517380714416504, + "learning_rate": 3.0538232223273482e-06, + "loss": 0.0056, + "step": 5569 + }, + { + "epoch": 3.389108609674475, + "grad_norm": 0.2352709025144577, + "learning_rate": 3.047863676630541e-06, + "loss": 0.0056, + "step": 5570 + }, + { + "epoch": 3.3897170672345602, + "grad_norm": 0.24700261652469635, + "learning_rate": 3.041909574148538e-06, + "loss": 0.0038, + "step": 5571 + }, + { + "epoch": 3.3903255247946458, + "grad_norm": 0.20761002600193024, + "learning_rate": 3.035960916357697e-06, + "loss": 0.0046, + "step": 5572 + }, + { + "epoch": 3.390933982354731, + "grad_norm": 0.24791277945041656, + "learning_rate": 3.030017704733043e-06, + "loss": 0.0048, + "step": 5573 + }, + { + "epoch": 3.391542439914816, + "grad_norm": 0.20745399594306946, + "learning_rate": 3.0240799407482452e-06, + "loss": 0.0044, + "step": 5574 + }, + { + "epoch": 3.392150897474901, + "grad_norm": 0.14504899084568024, + "learning_rate": 3.018147625875617e-06, + "loss": 0.002, + "step": 5575 + }, + { + "epoch": 3.392759355034986, + "grad_norm": 0.3044413626194, + "learning_rate": 3.012220761586132e-06, + "loss": 0.0064, + "step": 5576 + }, + { + "epoch": 3.3933678125950717, + "grad_norm": 0.2477070391178131, + "learning_rate": 3.006299349349406e-06, + "loss": 0.0065, + "step": 5577 + }, + { + "epoch": 3.393976270155157, + "grad_norm": 0.28918352723121643, + "learning_rate": 3.000383390633696e-06, + "loss": 0.0062, + "step": 5578 + }, + { + "epoch": 3.394584727715242, + "grad_norm": 0.2834755480289459, + "learning_rate": 2.9944728869059136e-06, + "loss": 0.0063, + "step": 5579 + }, + { + "epoch": 3.395193185275327, + "grad_norm": 0.2524814009666443, + "learning_rate": 2.98856783963162e-06, + "loss": 0.0038, + "step": 5580 + }, + { + "epoch": 3.395801642835412, + "grad_norm": 0.2619141638278961, + "learning_rate": 2.98266825027502e-06, + "loss": 0.0057, + "step": 5581 + }, + { + "epoch": 3.3964101003954976, + "grad_norm": 0.2823928892612457, + "learning_rate": 2.9767741202989723e-06, + "loss": 0.0061, + "step": 5582 + }, + { + "epoch": 3.3970185579555827, + "grad_norm": 0.20528404414653778, + "learning_rate": 2.970885451164965e-06, + "loss": 0.0046, + "step": 5583 + }, + { + "epoch": 3.397627015515668, + "grad_norm": 0.2534896433353424, + "learning_rate": 2.9650022443331453e-06, + "loss": 0.0045, + "step": 5584 + }, + { + "epoch": 3.398235473075753, + "grad_norm": 0.26259493827819824, + "learning_rate": 2.9591245012623058e-06, + "loss": 0.0071, + "step": 5585 + }, + { + "epoch": 3.398843930635838, + "grad_norm": 0.3380330204963684, + "learning_rate": 2.9532522234098803e-06, + "loss": 0.0089, + "step": 5586 + }, + { + "epoch": 3.3994523881959235, + "grad_norm": 0.18706291913986206, + "learning_rate": 2.947385412231951e-06, + "loss": 0.0031, + "step": 5587 + }, + { + "epoch": 3.4000608457560086, + "grad_norm": 0.2913658916950226, + "learning_rate": 2.9415240691832463e-06, + "loss": 0.0085, + "step": 5588 + }, + { + "epoch": 3.4006693033160937, + "grad_norm": 0.3028022050857544, + "learning_rate": 2.9356681957171227e-06, + "loss": 0.0066, + "step": 5589 + }, + { + "epoch": 3.401277760876179, + "grad_norm": 0.2161863148212433, + "learning_rate": 2.9298177932856025e-06, + "loss": 0.0045, + "step": 5590 + }, + { + "epoch": 3.401886218436264, + "grad_norm": 0.25855207443237305, + "learning_rate": 2.923972863339336e-06, + "loss": 0.007, + "step": 5591 + }, + { + "epoch": 3.4024946759963495, + "grad_norm": 0.2582375109195709, + "learning_rate": 2.9181334073276334e-06, + "loss": 0.0055, + "step": 5592 + }, + { + "epoch": 3.4031031335564346, + "grad_norm": 0.24739745259284973, + "learning_rate": 2.9122994266984226e-06, + "loss": 0.0048, + "step": 5593 + }, + { + "epoch": 3.4037115911165197, + "grad_norm": 0.2663595676422119, + "learning_rate": 2.906470922898291e-06, + "loss": 0.0044, + "step": 5594 + }, + { + "epoch": 3.4043200486766048, + "grad_norm": 0.21414734423160553, + "learning_rate": 2.900647897372469e-06, + "loss": 0.0041, + "step": 5595 + }, + { + "epoch": 3.40492850623669, + "grad_norm": 0.33169928193092346, + "learning_rate": 2.894830351564823e-06, + "loss": 0.0088, + "step": 5596 + }, + { + "epoch": 3.4055369637967754, + "grad_norm": 0.3833865523338318, + "learning_rate": 2.889018286917858e-06, + "loss": 0.0072, + "step": 5597 + }, + { + "epoch": 3.4061454213568605, + "grad_norm": 0.30657726526260376, + "learning_rate": 2.8832117048727348e-06, + "loss": 0.009, + "step": 5598 + }, + { + "epoch": 3.4067538789169456, + "grad_norm": 0.1277196705341339, + "learning_rate": 2.877410606869227e-06, + "loss": 0.0017, + "step": 5599 + }, + { + "epoch": 3.4073623364770307, + "grad_norm": 0.29806557297706604, + "learning_rate": 2.8716149943457755e-06, + "loss": 0.0072, + "step": 5600 + }, + { + "epoch": 3.407970794037116, + "grad_norm": 0.24409450590610504, + "learning_rate": 2.86582486873945e-06, + "loss": 0.0052, + "step": 5601 + }, + { + "epoch": 3.4085792515972013, + "grad_norm": 0.23363740742206573, + "learning_rate": 2.8600402314859636e-06, + "loss": 0.0072, + "step": 5602 + }, + { + "epoch": 3.4091877091572864, + "grad_norm": 0.23178331553936005, + "learning_rate": 2.8542610840196493e-06, + "loss": 0.0055, + "step": 5603 + }, + { + "epoch": 3.4097961667173715, + "grad_norm": 0.2069847732782364, + "learning_rate": 2.848487427773519e-06, + "loss": 0.0039, + "step": 5604 + }, + { + "epoch": 3.4104046242774566, + "grad_norm": 0.17575199902057648, + "learning_rate": 2.842719264179178e-06, + "loss": 0.0028, + "step": 5605 + }, + { + "epoch": 3.4110130818375417, + "grad_norm": 0.1965407133102417, + "learning_rate": 2.836956594666901e-06, + "loss": 0.0036, + "step": 5606 + }, + { + "epoch": 3.4116215393976272, + "grad_norm": 0.25646457076072693, + "learning_rate": 2.8311994206655867e-06, + "loss": 0.0067, + "step": 5607 + }, + { + "epoch": 3.4122299969577123, + "grad_norm": 0.2868242859840393, + "learning_rate": 2.8254477436027797e-06, + "loss": 0.0063, + "step": 5608 + }, + { + "epoch": 3.4128384545177974, + "grad_norm": 0.277547150850296, + "learning_rate": 2.8197015649046393e-06, + "loss": 0.0067, + "step": 5609 + }, + { + "epoch": 3.4134469120778825, + "grad_norm": 0.20469379425048828, + "learning_rate": 2.813960885996003e-06, + "loss": 0.0029, + "step": 5610 + }, + { + "epoch": 3.4140553696379676, + "grad_norm": 0.39242902398109436, + "learning_rate": 2.808225708300302e-06, + "loss": 0.004, + "step": 5611 + }, + { + "epoch": 3.414663827198053, + "grad_norm": 0.2715418338775635, + "learning_rate": 2.8024960332396266e-06, + "loss": 0.0084, + "step": 5612 + }, + { + "epoch": 3.4152722847581383, + "grad_norm": 0.1898062378168106, + "learning_rate": 2.7967718622346957e-06, + "loss": 0.0042, + "step": 5613 + }, + { + "epoch": 3.4158807423182234, + "grad_norm": 0.2265051305294037, + "learning_rate": 2.7910531967048736e-06, + "loss": 0.0029, + "step": 5614 + }, + { + "epoch": 3.4164891998783085, + "grad_norm": 0.3068977892398834, + "learning_rate": 2.7853400380681404e-06, + "loss": 0.0095, + "step": 5615 + }, + { + "epoch": 3.4170976574383936, + "grad_norm": 0.22495239973068237, + "learning_rate": 2.779632387741121e-06, + "loss": 0.0059, + "step": 5616 + }, + { + "epoch": 3.417706114998479, + "grad_norm": 0.2805343568325043, + "learning_rate": 2.7739302471390836e-06, + "loss": 0.0073, + "step": 5617 + }, + { + "epoch": 3.418314572558564, + "grad_norm": 0.2405698299407959, + "learning_rate": 2.7682336176759195e-06, + "loss": 0.0081, + "step": 5618 + }, + { + "epoch": 3.4189230301186493, + "grad_norm": 0.3347167372703552, + "learning_rate": 2.7625425007641425e-06, + "loss": 0.0061, + "step": 5619 + }, + { + "epoch": 3.4195314876787344, + "grad_norm": 0.25292569398880005, + "learning_rate": 2.756856897814933e-06, + "loss": 0.0041, + "step": 5620 + }, + { + "epoch": 3.4201399452388195, + "grad_norm": 0.1868637651205063, + "learning_rate": 2.7511768102380654e-06, + "loss": 0.0028, + "step": 5621 + }, + { + "epoch": 3.420748402798905, + "grad_norm": 0.175969198346138, + "learning_rate": 2.7455022394419746e-06, + "loss": 0.0043, + "step": 5622 + }, + { + "epoch": 3.42135686035899, + "grad_norm": 0.31663379073143005, + "learning_rate": 2.739833186833715e-06, + "loss": 0.0037, + "step": 5623 + }, + { + "epoch": 3.421965317919075, + "grad_norm": 0.3171955347061157, + "learning_rate": 2.73416965381898e-06, + "loss": 0.0114, + "step": 5624 + }, + { + "epoch": 3.4225737754791603, + "grad_norm": 0.22550025582313538, + "learning_rate": 2.728511641802076e-06, + "loss": 0.0033, + "step": 5625 + }, + { + "epoch": 3.4231822330392454, + "grad_norm": 0.16939371824264526, + "learning_rate": 2.722859152185972e-06, + "loss": 0.0042, + "step": 5626 + }, + { + "epoch": 3.423790690599331, + "grad_norm": 0.3074353337287903, + "learning_rate": 2.7172121863722366e-06, + "loss": 0.0072, + "step": 5627 + }, + { + "epoch": 3.424399148159416, + "grad_norm": 0.3095559775829315, + "learning_rate": 2.711570745761094e-06, + "loss": 0.0081, + "step": 5628 + }, + { + "epoch": 3.425007605719501, + "grad_norm": 0.24687674641609192, + "learning_rate": 2.7059348317513665e-06, + "loss": 0.0073, + "step": 5629 + }, + { + "epoch": 3.4256160632795862, + "grad_norm": 0.28120920062065125, + "learning_rate": 2.700304445740551e-06, + "loss": 0.0062, + "step": 5630 + }, + { + "epoch": 3.4262245208396713, + "grad_norm": 0.18883898854255676, + "learning_rate": 2.6946795891247266e-06, + "loss": 0.0037, + "step": 5631 + }, + { + "epoch": 3.4268329783997564, + "grad_norm": 0.21660275757312775, + "learning_rate": 2.6890602632986373e-06, + "loss": 0.0056, + "step": 5632 + }, + { + "epoch": 3.427441435959842, + "grad_norm": 0.2389414757490158, + "learning_rate": 2.6834464696556343e-06, + "loss": 0.0043, + "step": 5633 + }, + { + "epoch": 3.428049893519927, + "grad_norm": 0.23681369423866272, + "learning_rate": 2.6778382095877104e-06, + "loss": 0.0051, + "step": 5634 + }, + { + "epoch": 3.428658351080012, + "grad_norm": 0.251058965921402, + "learning_rate": 2.6722354844854693e-06, + "loss": 0.0031, + "step": 5635 + }, + { + "epoch": 3.4292668086400973, + "grad_norm": 0.27185845375061035, + "learning_rate": 2.666638295738169e-06, + "loss": 0.0058, + "step": 5636 + }, + { + "epoch": 3.4298752662001823, + "grad_norm": 0.2628857493400574, + "learning_rate": 2.6610466447336697e-06, + "loss": 0.0045, + "step": 5637 + }, + { + "epoch": 3.430483723760268, + "grad_norm": 0.17281438410282135, + "learning_rate": 2.65546053285847e-06, + "loss": 0.0036, + "step": 5638 + }, + { + "epoch": 3.431092181320353, + "grad_norm": 0.16873584687709808, + "learning_rate": 2.649879961497684e-06, + "loss": 0.0028, + "step": 5639 + }, + { + "epoch": 3.431700638880438, + "grad_norm": 0.23468784987926483, + "learning_rate": 2.64430493203508e-06, + "loss": 0.0062, + "step": 5640 + }, + { + "epoch": 3.432309096440523, + "grad_norm": 0.20407533645629883, + "learning_rate": 2.6387354458530134e-06, + "loss": 0.0034, + "step": 5641 + }, + { + "epoch": 3.4329175540006083, + "grad_norm": 0.17050844430923462, + "learning_rate": 2.6331715043324935e-06, + "loss": 0.0037, + "step": 5642 + }, + { + "epoch": 3.433526011560694, + "grad_norm": 0.26065853238105774, + "learning_rate": 2.627613108853147e-06, + "loss": 0.004, + "step": 5643 + }, + { + "epoch": 3.434134469120779, + "grad_norm": 0.31893685460090637, + "learning_rate": 2.622060260793227e-06, + "loss": 0.0089, + "step": 5644 + }, + { + "epoch": 3.434742926680864, + "grad_norm": 0.16491414606571198, + "learning_rate": 2.616512961529591e-06, + "loss": 0.0041, + "step": 5645 + }, + { + "epoch": 3.435351384240949, + "grad_norm": 0.30626043677330017, + "learning_rate": 2.610971212437763e-06, + "loss": 0.0042, + "step": 5646 + }, + { + "epoch": 3.435959841801034, + "grad_norm": 0.3586437702178955, + "learning_rate": 2.6054350148918493e-06, + "loss": 0.0068, + "step": 5647 + }, + { + "epoch": 3.4365682993611197, + "grad_norm": 0.2317257821559906, + "learning_rate": 2.5999043702646e-06, + "loss": 0.0054, + "step": 5648 + }, + { + "epoch": 3.437176756921205, + "grad_norm": 0.2033744603395462, + "learning_rate": 2.5943792799273838e-06, + "loss": 0.0042, + "step": 5649 + }, + { + "epoch": 3.43778521448129, + "grad_norm": 0.18324805796146393, + "learning_rate": 2.5888597452501994e-06, + "loss": 0.0049, + "step": 5650 + }, + { + "epoch": 3.438393672041375, + "grad_norm": 0.2906474769115448, + "learning_rate": 2.5833457676016526e-06, + "loss": 0.0073, + "step": 5651 + }, + { + "epoch": 3.43900212960146, + "grad_norm": 0.16437779366970062, + "learning_rate": 2.5778373483489827e-06, + "loss": 0.0042, + "step": 5652 + }, + { + "epoch": 3.4396105871615457, + "grad_norm": 0.2783378064632416, + "learning_rate": 2.572334488858047e-06, + "loss": 0.009, + "step": 5653 + }, + { + "epoch": 3.4402190447216308, + "grad_norm": 0.21060001850128174, + "learning_rate": 2.566837190493332e-06, + "loss": 0.0041, + "step": 5654 + }, + { + "epoch": 3.440827502281716, + "grad_norm": 0.2840571403503418, + "learning_rate": 2.561345454617925e-06, + "loss": 0.0058, + "step": 5655 + }, + { + "epoch": 3.441435959841801, + "grad_norm": 0.20874862372875214, + "learning_rate": 2.5558592825935645e-06, + "loss": 0.0034, + "step": 5656 + }, + { + "epoch": 3.442044417401886, + "grad_norm": 0.237037792801857, + "learning_rate": 2.5503786757805794e-06, + "loss": 0.0045, + "step": 5657 + }, + { + "epoch": 3.4426528749619716, + "grad_norm": 0.25760436058044434, + "learning_rate": 2.5449036355379347e-06, + "loss": 0.0053, + "step": 5658 + }, + { + "epoch": 3.4432613325220567, + "grad_norm": 0.2317677140235901, + "learning_rate": 2.539434163223217e-06, + "loss": 0.0052, + "step": 5659 + }, + { + "epoch": 3.443869790082142, + "grad_norm": 0.3090670704841614, + "learning_rate": 2.533970260192628e-06, + "loss": 0.0103, + "step": 5660 + }, + { + "epoch": 3.444478247642227, + "grad_norm": 0.23244629800319672, + "learning_rate": 2.528511927800978e-06, + "loss": 0.0052, + "step": 5661 + }, + { + "epoch": 3.445086705202312, + "grad_norm": 0.20796987414360046, + "learning_rate": 2.5230591674017145e-06, + "loss": 0.0032, + "step": 5662 + }, + { + "epoch": 3.4456951627623975, + "grad_norm": 0.2811738848686218, + "learning_rate": 2.51761198034689e-06, + "loss": 0.007, + "step": 5663 + }, + { + "epoch": 3.4463036203224826, + "grad_norm": 0.2699826955795288, + "learning_rate": 2.5121703679871907e-06, + "loss": 0.0047, + "step": 5664 + }, + { + "epoch": 3.4469120778825677, + "grad_norm": 0.21873120963573456, + "learning_rate": 2.5067343316718866e-06, + "loss": 0.0041, + "step": 5665 + }, + { + "epoch": 3.447520535442653, + "grad_norm": 0.21390405297279358, + "learning_rate": 2.501303872748917e-06, + "loss": 0.0045, + "step": 5666 + }, + { + "epoch": 3.448128993002738, + "grad_norm": 0.18419447541236877, + "learning_rate": 2.4958789925647873e-06, + "loss": 0.0044, + "step": 5667 + }, + { + "epoch": 3.4487374505628234, + "grad_norm": 0.19194208085536957, + "learning_rate": 2.49045969246465e-06, + "loss": 0.0049, + "step": 5668 + }, + { + "epoch": 3.4493459081229085, + "grad_norm": 0.18368539214134216, + "learning_rate": 2.485045973792266e-06, + "loss": 0.0046, + "step": 5669 + }, + { + "epoch": 3.4499543656829936, + "grad_norm": 0.33034926652908325, + "learning_rate": 2.4796378378900142e-06, + "loss": 0.0104, + "step": 5670 + }, + { + "epoch": 3.4505628232430787, + "grad_norm": 0.23672623932361603, + "learning_rate": 2.474235286098878e-06, + "loss": 0.0038, + "step": 5671 + }, + { + "epoch": 3.451171280803164, + "grad_norm": 0.23982059955596924, + "learning_rate": 2.4688383197584762e-06, + "loss": 0.0046, + "step": 5672 + }, + { + "epoch": 3.451779738363249, + "grad_norm": 0.3084208369255066, + "learning_rate": 2.4634469402070233e-06, + "loss": 0.0091, + "step": 5673 + }, + { + "epoch": 3.4523881959233345, + "grad_norm": 0.14161497354507446, + "learning_rate": 2.458061148781363e-06, + "loss": 0.0026, + "step": 5674 + }, + { + "epoch": 3.4529966534834196, + "grad_norm": 0.3343401253223419, + "learning_rate": 2.452680946816932e-06, + "loss": 0.0078, + "step": 5675 + }, + { + "epoch": 3.4536051110435046, + "grad_norm": 0.23120488226413727, + "learning_rate": 2.4473063356478198e-06, + "loss": 0.0054, + "step": 5676 + }, + { + "epoch": 3.4542135686035897, + "grad_norm": 0.2343619167804718, + "learning_rate": 2.4419373166066904e-06, + "loss": 0.0056, + "step": 5677 + }, + { + "epoch": 3.454822026163675, + "grad_norm": 0.37773486971855164, + "learning_rate": 2.4365738910248375e-06, + "loss": 0.0102, + "step": 5678 + }, + { + "epoch": 3.4554304837237604, + "grad_norm": 0.19855354726314545, + "learning_rate": 2.431216060232169e-06, + "loss": 0.0056, + "step": 5679 + }, + { + "epoch": 3.4560389412838455, + "grad_norm": 0.2364191859960556, + "learning_rate": 2.425863825557212e-06, + "loss": 0.0064, + "step": 5680 + }, + { + "epoch": 3.4566473988439306, + "grad_norm": 0.4233652353286743, + "learning_rate": 2.420517188327079e-06, + "loss": 0.0106, + "step": 5681 + }, + { + "epoch": 3.4572558564040157, + "grad_norm": 0.2906593084335327, + "learning_rate": 2.4151761498675345e-06, + "loss": 0.0053, + "step": 5682 + }, + { + "epoch": 3.4578643139641008, + "grad_norm": 0.2784557044506073, + "learning_rate": 2.409840711502917e-06, + "loss": 0.0079, + "step": 5683 + }, + { + "epoch": 3.4584727715241863, + "grad_norm": 0.2974449396133423, + "learning_rate": 2.4045108745561985e-06, + "loss": 0.0071, + "step": 5684 + }, + { + "epoch": 3.4590812290842714, + "grad_norm": 0.3297038674354553, + "learning_rate": 2.3991866403489577e-06, + "loss": 0.0073, + "step": 5685 + }, + { + "epoch": 3.4596896866443565, + "grad_norm": 0.2977392077445984, + "learning_rate": 2.393868010201386e-06, + "loss": 0.0072, + "step": 5686 + }, + { + "epoch": 3.4602981442044416, + "grad_norm": 0.3237118721008301, + "learning_rate": 2.388554985432273e-06, + "loss": 0.005, + "step": 5687 + }, + { + "epoch": 3.4609066017645267, + "grad_norm": 0.23049971461296082, + "learning_rate": 2.3832475673590316e-06, + "loss": 0.0063, + "step": 5688 + }, + { + "epoch": 3.4615150593246122, + "grad_norm": 0.2660450339317322, + "learning_rate": 2.377945757297681e-06, + "loss": 0.0061, + "step": 5689 + }, + { + "epoch": 3.4621235168846973, + "grad_norm": 0.2178175449371338, + "learning_rate": 2.3726495565628506e-06, + "loss": 0.0058, + "step": 5690 + }, + { + "epoch": 3.4627319744447824, + "grad_norm": 0.47303879261016846, + "learning_rate": 2.3673589664677727e-06, + "loss": 0.0134, + "step": 5691 + }, + { + "epoch": 3.4633404320048675, + "grad_norm": 0.2600957453250885, + "learning_rate": 2.362073988324304e-06, + "loss": 0.0049, + "step": 5692 + }, + { + "epoch": 3.4639488895649526, + "grad_norm": 0.2990921139717102, + "learning_rate": 2.3567946234428844e-06, + "loss": 0.0068, + "step": 5693 + }, + { + "epoch": 3.464557347125038, + "grad_norm": 0.19195446372032166, + "learning_rate": 2.3515208731325815e-06, + "loss": 0.004, + "step": 5694 + }, + { + "epoch": 3.4651658046851233, + "grad_norm": 0.3154228627681732, + "learning_rate": 2.346252738701071e-06, + "loss": 0.005, + "step": 5695 + }, + { + "epoch": 3.4657742622452083, + "grad_norm": 0.19232860207557678, + "learning_rate": 2.340990221454628e-06, + "loss": 0.0035, + "step": 5696 + }, + { + "epoch": 3.4663827198052934, + "grad_norm": 0.28526800870895386, + "learning_rate": 2.3357333226981333e-06, + "loss": 0.0065, + "step": 5697 + }, + { + "epoch": 3.4669911773653785, + "grad_norm": 0.20558756589889526, + "learning_rate": 2.33048204373508e-06, + "loss": 0.0026, + "step": 5698 + }, + { + "epoch": 3.467599634925464, + "grad_norm": 0.21182219684123993, + "learning_rate": 2.3252363858675684e-06, + "loss": 0.0044, + "step": 5699 + }, + { + "epoch": 3.468208092485549, + "grad_norm": 0.22109881043434143, + "learning_rate": 2.3199963503963e-06, + "loss": 0.0043, + "step": 5700 + }, + { + "epoch": 3.4688165500456343, + "grad_norm": 0.26969975233078003, + "learning_rate": 2.314761938620591e-06, + "loss": 0.0073, + "step": 5701 + }, + { + "epoch": 3.4694250076057194, + "grad_norm": 0.20506982505321503, + "learning_rate": 2.3095331518383582e-06, + "loss": 0.0061, + "step": 5702 + }, + { + "epoch": 3.4700334651658045, + "grad_norm": 0.20657236874103546, + "learning_rate": 2.3043099913461125e-06, + "loss": 0.0033, + "step": 5703 + }, + { + "epoch": 3.47064192272589, + "grad_norm": 0.22096757590770721, + "learning_rate": 2.2990924584389868e-06, + "loss": 0.0039, + "step": 5704 + }, + { + "epoch": 3.471250380285975, + "grad_norm": 0.221493199467659, + "learning_rate": 2.29388055441071e-06, + "loss": 0.0034, + "step": 5705 + }, + { + "epoch": 3.47185883784606, + "grad_norm": 0.2772558331489563, + "learning_rate": 2.2886742805536183e-06, + "loss": 0.0058, + "step": 5706 + }, + { + "epoch": 3.4724672954061453, + "grad_norm": 0.26506444811820984, + "learning_rate": 2.283473638158656e-06, + "loss": 0.0064, + "step": 5707 + }, + { + "epoch": 3.4730757529662304, + "grad_norm": 0.18250389397144318, + "learning_rate": 2.278278628515354e-06, + "loss": 0.0037, + "step": 5708 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.21926113963127136, + "learning_rate": 2.2730892529118643e-06, + "loss": 0.0046, + "step": 5709 + }, + { + "epoch": 3.474292668086401, + "grad_norm": 0.2898429334163666, + "learning_rate": 2.267905512634935e-06, + "loss": 0.0055, + "step": 5710 + }, + { + "epoch": 3.474901125646486, + "grad_norm": 0.21994440257549286, + "learning_rate": 2.2627274089699195e-06, + "loss": 0.005, + "step": 5711 + }, + { + "epoch": 3.475509583206571, + "grad_norm": 0.3623565137386322, + "learning_rate": 2.257554943200771e-06, + "loss": 0.0111, + "step": 5712 + }, + { + "epoch": 3.4761180407666563, + "grad_norm": 0.2396956980228424, + "learning_rate": 2.252388116610046e-06, + "loss": 0.0043, + "step": 5713 + }, + { + "epoch": 3.476726498326742, + "grad_norm": 0.25912052392959595, + "learning_rate": 2.247226930478899e-06, + "loss": 0.0062, + "step": 5714 + }, + { + "epoch": 3.477334955886827, + "grad_norm": 0.22874920070171356, + "learning_rate": 2.2420713860870914e-06, + "loss": 0.0055, + "step": 5715 + }, + { + "epoch": 3.477943413446912, + "grad_norm": 0.3113996386528015, + "learning_rate": 2.2369214847129812e-06, + "loss": 0.0056, + "step": 5716 + }, + { + "epoch": 3.478551871006997, + "grad_norm": 0.21554897725582123, + "learning_rate": 2.231777227633533e-06, + "loss": 0.0053, + "step": 5717 + }, + { + "epoch": 3.4791603285670822, + "grad_norm": 0.29633817076683044, + "learning_rate": 2.2266386161243045e-06, + "loss": 0.0057, + "step": 5718 + }, + { + "epoch": 3.479768786127168, + "grad_norm": 0.14219973981380463, + "learning_rate": 2.221505651459463e-06, + "loss": 0.0035, + "step": 5719 + }, + { + "epoch": 3.480377243687253, + "grad_norm": 0.2941674590110779, + "learning_rate": 2.2163783349117616e-06, + "loss": 0.0053, + "step": 5720 + }, + { + "epoch": 3.480985701247338, + "grad_norm": 0.30135074257850647, + "learning_rate": 2.211256667752565e-06, + "loss": 0.0099, + "step": 5721 + }, + { + "epoch": 3.481594158807423, + "grad_norm": 0.3033702075481415, + "learning_rate": 2.206140651251834e-06, + "loss": 0.0064, + "step": 5722 + }, + { + "epoch": 3.482202616367508, + "grad_norm": 0.26715901494026184, + "learning_rate": 2.2010302866781296e-06, + "loss": 0.0059, + "step": 5723 + }, + { + "epoch": 3.4828110739275937, + "grad_norm": 0.3949749767780304, + "learning_rate": 2.1959255752986017e-06, + "loss": 0.0104, + "step": 5724 + }, + { + "epoch": 3.483419531487679, + "grad_norm": 0.2797585129737854, + "learning_rate": 2.1908265183790105e-06, + "loss": 0.0073, + "step": 5725 + }, + { + "epoch": 3.484027989047764, + "grad_norm": 0.23626194894313812, + "learning_rate": 2.1857331171837107e-06, + "loss": 0.009, + "step": 5726 + }, + { + "epoch": 3.484636446607849, + "grad_norm": 0.23918910324573517, + "learning_rate": 2.180645372975651e-06, + "loss": 0.0059, + "step": 5727 + }, + { + "epoch": 3.485244904167934, + "grad_norm": 0.16909535229206085, + "learning_rate": 2.1755632870163828e-06, + "loss": 0.0032, + "step": 5728 + }, + { + "epoch": 3.4858533617280196, + "grad_norm": 0.2599836587905884, + "learning_rate": 2.170486860566054e-06, + "loss": 0.008, + "step": 5729 + }, + { + "epoch": 3.4864618192881047, + "grad_norm": 0.3136194944381714, + "learning_rate": 2.1654160948833963e-06, + "loss": 0.0058, + "step": 5730 + }, + { + "epoch": 3.48707027684819, + "grad_norm": 0.2240346372127533, + "learning_rate": 2.160350991225757e-06, + "loss": 0.0049, + "step": 5731 + }, + { + "epoch": 3.487678734408275, + "grad_norm": 0.34254634380340576, + "learning_rate": 2.1552915508490675e-06, + "loss": 0.0119, + "step": 5732 + }, + { + "epoch": 3.48828719196836, + "grad_norm": 0.22509586811065674, + "learning_rate": 2.1502377750078627e-06, + "loss": 0.0043, + "step": 5733 + }, + { + "epoch": 3.4888956495284456, + "grad_norm": 0.19744735956192017, + "learning_rate": 2.145189664955258e-06, + "loss": 0.0055, + "step": 5734 + }, + { + "epoch": 3.4895041070885306, + "grad_norm": 0.24436576664447784, + "learning_rate": 2.1401472219429867e-06, + "loss": 0.0053, + "step": 5735 + }, + { + "epoch": 3.4901125646486157, + "grad_norm": 0.29822880029678345, + "learning_rate": 2.1351104472213585e-06, + "loss": 0.0054, + "step": 5736 + }, + { + "epoch": 3.490721022208701, + "grad_norm": 0.33873575925827026, + "learning_rate": 2.1300793420392815e-06, + "loss": 0.0053, + "step": 5737 + }, + { + "epoch": 3.491329479768786, + "grad_norm": 0.3289222717285156, + "learning_rate": 2.1250539076442617e-06, + "loss": 0.0073, + "step": 5738 + }, + { + "epoch": 3.4919379373288715, + "grad_norm": 0.23619408905506134, + "learning_rate": 2.120034145282404e-06, + "loss": 0.0044, + "step": 5739 + }, + { + "epoch": 3.4925463948889566, + "grad_norm": 0.2359532117843628, + "learning_rate": 2.115020056198383e-06, + "loss": 0.0052, + "step": 5740 + }, + { + "epoch": 3.4931548524490417, + "grad_norm": 0.20343980193138123, + "learning_rate": 2.1100116416355063e-06, + "loss": 0.0051, + "step": 5741 + }, + { + "epoch": 3.4937633100091268, + "grad_norm": 0.2578510642051697, + "learning_rate": 2.1050089028356366e-06, + "loss": 0.0055, + "step": 5742 + }, + { + "epoch": 3.494371767569212, + "grad_norm": 0.2600227892398834, + "learning_rate": 2.100011841039251e-06, + "loss": 0.0072, + "step": 5743 + }, + { + "epoch": 3.4949802251292974, + "grad_norm": 0.23431195318698883, + "learning_rate": 2.0950204574854027e-06, + "loss": 0.0045, + "step": 5744 + }, + { + "epoch": 3.4955886826893825, + "grad_norm": 0.265465646982193, + "learning_rate": 2.0900347534117627e-06, + "loss": 0.007, + "step": 5745 + }, + { + "epoch": 3.4961971402494676, + "grad_norm": 0.22313670814037323, + "learning_rate": 2.0850547300545668e-06, + "loss": 0.0041, + "step": 5746 + }, + { + "epoch": 3.4968055978095527, + "grad_norm": 0.3893153965473175, + "learning_rate": 2.080080388648653e-06, + "loss": 0.0111, + "step": 5747 + }, + { + "epoch": 3.497414055369638, + "grad_norm": 0.31730809807777405, + "learning_rate": 2.0751117304274563e-06, + "loss": 0.0099, + "step": 5748 + }, + { + "epoch": 3.4980225129297233, + "grad_norm": 0.30250468850135803, + "learning_rate": 2.070148756622997e-06, + "loss": 0.0057, + "step": 5749 + }, + { + "epoch": 3.4986309704898084, + "grad_norm": 0.30746808648109436, + "learning_rate": 2.065191468465874e-06, + "loss": 0.0084, + "step": 5750 + }, + { + "epoch": 3.4992394280498935, + "grad_norm": 0.1614643782377243, + "learning_rate": 2.060239867185304e-06, + "loss": 0.0021, + "step": 5751 + }, + { + "epoch": 3.4998478856099786, + "grad_norm": 0.2963274121284485, + "learning_rate": 2.0552939540090687e-06, + "loss": 0.006, + "step": 5752 + }, + { + "epoch": 3.5004563431700637, + "grad_norm": 0.20009563863277435, + "learning_rate": 2.050353730163554e-06, + "loss": 0.0052, + "step": 5753 + }, + { + "epoch": 3.5010648007301493, + "grad_norm": 0.20086035132408142, + "learning_rate": 2.045419196873716e-06, + "loss": 0.0028, + "step": 5754 + }, + { + "epoch": 3.5016732582902343, + "grad_norm": 0.22441764175891876, + "learning_rate": 2.0404903553631337e-06, + "loss": 0.0052, + "step": 5755 + }, + { + "epoch": 3.5022817158503194, + "grad_norm": 0.2553858160972595, + "learning_rate": 2.0355672068539387e-06, + "loss": 0.0045, + "step": 5756 + }, + { + "epoch": 3.5028901734104045, + "grad_norm": 0.32011979818344116, + "learning_rate": 2.030649752566871e-06, + "loss": 0.0076, + "step": 5757 + }, + { + "epoch": 3.5034986309704896, + "grad_norm": 0.21500691771507263, + "learning_rate": 2.025737993721255e-06, + "loss": 0.0053, + "step": 5758 + }, + { + "epoch": 3.504107088530575, + "grad_norm": 0.32461220026016235, + "learning_rate": 2.020831931535008e-06, + "loss": 0.0096, + "step": 5759 + }, + { + "epoch": 3.5047155460906603, + "grad_norm": 0.28721633553504944, + "learning_rate": 2.015931567224613e-06, + "loss": 0.0088, + "step": 5760 + }, + { + "epoch": 3.5053240036507454, + "grad_norm": 0.17419946193695068, + "learning_rate": 2.0110369020051755e-06, + "loss": 0.0027, + "step": 5761 + }, + { + "epoch": 3.5059324612108305, + "grad_norm": 0.33820995688438416, + "learning_rate": 2.006147937090355e-06, + "loss": 0.0107, + "step": 5762 + }, + { + "epoch": 3.5065409187709156, + "grad_norm": 0.3115573227405548, + "learning_rate": 2.001264673692413e-06, + "loss": 0.0061, + "step": 5763 + }, + { + "epoch": 3.507149376331001, + "grad_norm": 0.2463875263929367, + "learning_rate": 1.9963871130221997e-06, + "loss": 0.0061, + "step": 5764 + }, + { + "epoch": 3.507757833891086, + "grad_norm": 0.19313430786132812, + "learning_rate": 1.9915152562891476e-06, + "loss": 0.0046, + "step": 5765 + }, + { + "epoch": 3.5083662914511713, + "grad_norm": 0.27370426058769226, + "learning_rate": 1.9866491047012687e-06, + "loss": 0.0052, + "step": 5766 + }, + { + "epoch": 3.5089747490112564, + "grad_norm": 0.22294797003269196, + "learning_rate": 1.981788659465164e-06, + "loss": 0.0045, + "step": 5767 + }, + { + "epoch": 3.5095832065713415, + "grad_norm": 0.24368692934513092, + "learning_rate": 1.976933921786028e-06, + "loss": 0.0047, + "step": 5768 + }, + { + "epoch": 3.510191664131427, + "grad_norm": 0.29862114787101746, + "learning_rate": 1.972084892867637e-06, + "loss": 0.0081, + "step": 5769 + }, + { + "epoch": 3.510800121691512, + "grad_norm": 0.16069160401821136, + "learning_rate": 1.967241573912329e-06, + "loss": 0.0023, + "step": 5770 + }, + { + "epoch": 3.511408579251597, + "grad_norm": 0.20247617363929749, + "learning_rate": 1.962403966121071e-06, + "loss": 0.0043, + "step": 5771 + }, + { + "epoch": 3.5120170368116823, + "grad_norm": 0.2329401671886444, + "learning_rate": 1.957572070693367e-06, + "loss": 0.0039, + "step": 5772 + }, + { + "epoch": 3.5126254943717674, + "grad_norm": 0.2965979278087616, + "learning_rate": 1.952745888827337e-06, + "loss": 0.0107, + "step": 5773 + }, + { + "epoch": 3.513233951931853, + "grad_norm": 0.10749952495098114, + "learning_rate": 1.947925421719668e-06, + "loss": 0.0021, + "step": 5774 + }, + { + "epoch": 3.513842409491938, + "grad_norm": 0.2879178822040558, + "learning_rate": 1.9431106705656397e-06, + "loss": 0.0084, + "step": 5775 + }, + { + "epoch": 3.514450867052023, + "grad_norm": 0.34235483407974243, + "learning_rate": 1.938301636559098e-06, + "loss": 0.0071, + "step": 5776 + }, + { + "epoch": 3.5150593246121082, + "grad_norm": 0.21877804398536682, + "learning_rate": 1.9334983208925017e-06, + "loss": 0.007, + "step": 5777 + }, + { + "epoch": 3.5156677821721933, + "grad_norm": 0.36894896626472473, + "learning_rate": 1.9287007247568573e-06, + "loss": 0.007, + "step": 5778 + }, + { + "epoch": 3.516276239732279, + "grad_norm": 0.3058571517467499, + "learning_rate": 1.9239088493417796e-06, + "loss": 0.0046, + "step": 5779 + }, + { + "epoch": 3.516884697292364, + "grad_norm": 0.24490618705749512, + "learning_rate": 1.9191226958354403e-06, + "loss": 0.0057, + "step": 5780 + }, + { + "epoch": 3.517493154852449, + "grad_norm": 0.304394006729126, + "learning_rate": 1.9143422654246205e-06, + "loss": 0.0098, + "step": 5781 + }, + { + "epoch": 3.518101612412534, + "grad_norm": 0.28492802381515503, + "learning_rate": 1.9095675592946587e-06, + "loss": 0.0061, + "step": 5782 + }, + { + "epoch": 3.5187100699726193, + "grad_norm": 0.2946596145629883, + "learning_rate": 1.9047985786294853e-06, + "loss": 0.0061, + "step": 5783 + }, + { + "epoch": 3.519318527532705, + "grad_norm": 0.18398457765579224, + "learning_rate": 1.900035324611607e-06, + "loss": 0.0037, + "step": 5784 + }, + { + "epoch": 3.51992698509279, + "grad_norm": 0.23536744713783264, + "learning_rate": 1.895277798422121e-06, + "loss": 0.004, + "step": 5785 + }, + { + "epoch": 3.520535442652875, + "grad_norm": 0.24130214750766754, + "learning_rate": 1.8905260012406778e-06, + "loss": 0.0049, + "step": 5786 + }, + { + "epoch": 3.52114390021296, + "grad_norm": 0.2898615300655365, + "learning_rate": 1.8857799342455462e-06, + "loss": 0.0057, + "step": 5787 + }, + { + "epoch": 3.521752357773045, + "grad_norm": 0.25997620820999146, + "learning_rate": 1.8810395986135377e-06, + "loss": 0.0057, + "step": 5788 + }, + { + "epoch": 3.5223608153331307, + "grad_norm": 0.2553774416446686, + "learning_rate": 1.8763049955200674e-06, + "loss": 0.0057, + "step": 5789 + }, + { + "epoch": 3.522969272893216, + "grad_norm": 0.26607316732406616, + "learning_rate": 1.8715761261391074e-06, + "loss": 0.0045, + "step": 5790 + }, + { + "epoch": 3.523577730453301, + "grad_norm": 0.24370121955871582, + "learning_rate": 1.8668529916432365e-06, + "loss": 0.0047, + "step": 5791 + }, + { + "epoch": 3.524186188013386, + "grad_norm": 0.2677232325077057, + "learning_rate": 1.8621355932035788e-06, + "loss": 0.0061, + "step": 5792 + }, + { + "epoch": 3.524794645573471, + "grad_norm": 0.29431116580963135, + "learning_rate": 1.8574239319898657e-06, + "loss": 0.0068, + "step": 5793 + }, + { + "epoch": 3.5254031031335566, + "grad_norm": 0.2646505832672119, + "learning_rate": 1.8527180091703843e-06, + "loss": 0.005, + "step": 5794 + }, + { + "epoch": 3.5260115606936417, + "grad_norm": 0.2067060023546219, + "learning_rate": 1.848017825912013e-06, + "loss": 0.0051, + "step": 5795 + }, + { + "epoch": 3.526620018253727, + "grad_norm": 0.19585838913917542, + "learning_rate": 1.843323383380194e-06, + "loss": 0.0028, + "step": 5796 + }, + { + "epoch": 3.527228475813812, + "grad_norm": 0.27853307127952576, + "learning_rate": 1.8386346827389629e-06, + "loss": 0.006, + "step": 5797 + }, + { + "epoch": 3.527836933373897, + "grad_norm": 0.260459840297699, + "learning_rate": 1.8339517251509146e-06, + "loss": 0.0058, + "step": 5798 + }, + { + "epoch": 3.5284453909339826, + "grad_norm": 0.21536771953105927, + "learning_rate": 1.829274511777232e-06, + "loss": 0.006, + "step": 5799 + }, + { + "epoch": 3.5290538484940677, + "grad_norm": 0.3227105736732483, + "learning_rate": 1.8246030437776645e-06, + "loss": 0.0062, + "step": 5800 + }, + { + "epoch": 3.5296623060541528, + "grad_norm": 0.21509003639221191, + "learning_rate": 1.8199373223105498e-06, + "loss": 0.0041, + "step": 5801 + }, + { + "epoch": 3.530270763614238, + "grad_norm": 0.3301815092563629, + "learning_rate": 1.8152773485327818e-06, + "loss": 0.0088, + "step": 5802 + }, + { + "epoch": 3.530879221174323, + "grad_norm": 0.13643626868724823, + "learning_rate": 1.8106231235998444e-06, + "loss": 0.0032, + "step": 5803 + }, + { + "epoch": 3.5314876787344085, + "grad_norm": 0.3046422600746155, + "learning_rate": 1.8059746486657896e-06, + "loss": 0.0091, + "step": 5804 + }, + { + "epoch": 3.5320961362944936, + "grad_norm": 0.2640552222728729, + "learning_rate": 1.8013319248832538e-06, + "loss": 0.0067, + "step": 5805 + }, + { + "epoch": 3.5327045938545787, + "grad_norm": 0.32940080761909485, + "learning_rate": 1.7966949534034243e-06, + "loss": 0.0086, + "step": 5806 + }, + { + "epoch": 3.533313051414664, + "grad_norm": 0.2538890242576599, + "learning_rate": 1.7920637353760928e-06, + "loss": 0.0057, + "step": 5807 + }, + { + "epoch": 3.533921508974749, + "grad_norm": 0.6640288233757019, + "learning_rate": 1.7874382719495958e-06, + "loss": 0.0576, + "step": 5808 + }, + { + "epoch": 3.5345299665348344, + "grad_norm": 0.3605937063694, + "learning_rate": 1.7828185642708605e-06, + "loss": 0.0078, + "step": 5809 + }, + { + "epoch": 3.5351384240949195, + "grad_norm": 0.2806709408760071, + "learning_rate": 1.7782046134853792e-06, + "loss": 0.0088, + "step": 5810 + }, + { + "epoch": 3.5357468816550046, + "grad_norm": 0.17481489479541779, + "learning_rate": 1.773596420737228e-06, + "loss": 0.0039, + "step": 5811 + }, + { + "epoch": 3.5363553392150897, + "grad_norm": 0.15608619153499603, + "learning_rate": 1.7689939871690375e-06, + "loss": 0.0043, + "step": 5812 + }, + { + "epoch": 3.536963796775175, + "grad_norm": 0.2716093361377716, + "learning_rate": 1.7643973139220198e-06, + "loss": 0.0101, + "step": 5813 + }, + { + "epoch": 3.5375722543352603, + "grad_norm": 0.13356530666351318, + "learning_rate": 1.7598064021359607e-06, + "loss": 0.0015, + "step": 5814 + }, + { + "epoch": 3.5381807118953454, + "grad_norm": 0.17770147323608398, + "learning_rate": 1.7552212529492158e-06, + "loss": 0.0029, + "step": 5815 + }, + { + "epoch": 3.5387891694554305, + "grad_norm": 0.1851194053888321, + "learning_rate": 1.750641867498709e-06, + "loss": 0.0034, + "step": 5816 + }, + { + "epoch": 3.5393976270155156, + "grad_norm": 0.3669975697994232, + "learning_rate": 1.7460682469199435e-06, + "loss": 0.0077, + "step": 5817 + }, + { + "epoch": 3.5400060845756007, + "grad_norm": 0.24136383831501007, + "learning_rate": 1.7415003923469787e-06, + "loss": 0.0053, + "step": 5818 + }, + { + "epoch": 3.5406145421356863, + "grad_norm": 0.2976721227169037, + "learning_rate": 1.7369383049124498e-06, + "loss": 0.0061, + "step": 5819 + }, + { + "epoch": 3.5412229996957714, + "grad_norm": 0.20041820406913757, + "learning_rate": 1.7323819857475721e-06, + "loss": 0.0051, + "step": 5820 + }, + { + "epoch": 3.5418314572558565, + "grad_norm": 0.2120663970708847, + "learning_rate": 1.727831435982119e-06, + "loss": 0.0047, + "step": 5821 + }, + { + "epoch": 3.5424399148159416, + "grad_norm": 0.18316033482551575, + "learning_rate": 1.7232866567444384e-06, + "loss": 0.0035, + "step": 5822 + }, + { + "epoch": 3.5430483723760267, + "grad_norm": 0.22955255210399628, + "learning_rate": 1.7187476491614507e-06, + "loss": 0.0058, + "step": 5823 + }, + { + "epoch": 3.543656829936112, + "grad_norm": 0.22870229184627533, + "learning_rate": 1.7142144143586308e-06, + "loss": 0.0033, + "step": 5824 + }, + { + "epoch": 3.5442652874961973, + "grad_norm": 0.24868161976337433, + "learning_rate": 1.7096869534600352e-06, + "loss": 0.0076, + "step": 5825 + }, + { + "epoch": 3.5448737450562824, + "grad_norm": 0.32703396677970886, + "learning_rate": 1.7051652675882878e-06, + "loss": 0.0066, + "step": 5826 + }, + { + "epoch": 3.5454822026163675, + "grad_norm": 0.28158140182495117, + "learning_rate": 1.7006493578645838e-06, + "loss": 0.0064, + "step": 5827 + }, + { + "epoch": 3.5460906601764526, + "grad_norm": 0.2724444270133972, + "learning_rate": 1.6961392254086689e-06, + "loss": 0.0065, + "step": 5828 + }, + { + "epoch": 3.546699117736538, + "grad_norm": 0.312369704246521, + "learning_rate": 1.6916348713388708e-06, + "loss": 0.0063, + "step": 5829 + }, + { + "epoch": 3.547307575296623, + "grad_norm": 0.2701030373573303, + "learning_rate": 1.6871362967720878e-06, + "loss": 0.006, + "step": 5830 + }, + { + "epoch": 3.5479160328567083, + "grad_norm": 0.16680403053760529, + "learning_rate": 1.682643502823772e-06, + "loss": 0.0028, + "step": 5831 + }, + { + "epoch": 3.5485244904167934, + "grad_norm": 0.25249308347702026, + "learning_rate": 1.6781564906079545e-06, + "loss": 0.0045, + "step": 5832 + }, + { + "epoch": 3.5491329479768785, + "grad_norm": 0.1920885592699051, + "learning_rate": 1.6736752612372286e-06, + "loss": 0.0041, + "step": 5833 + }, + { + "epoch": 3.549741405536964, + "grad_norm": 0.28763115406036377, + "learning_rate": 1.6691998158227446e-06, + "loss": 0.007, + "step": 5834 + }, + { + "epoch": 3.550349863097049, + "grad_norm": 0.479855477809906, + "learning_rate": 1.6647301554742312e-06, + "loss": 0.0151, + "step": 5835 + }, + { + "epoch": 3.5509583206571342, + "grad_norm": 0.4631776213645935, + "learning_rate": 1.6602662812999742e-06, + "loss": 0.0091, + "step": 5836 + }, + { + "epoch": 3.5515667782172193, + "grad_norm": 0.3424180746078491, + "learning_rate": 1.6558081944068354e-06, + "loss": 0.0108, + "step": 5837 + }, + { + "epoch": 3.5521752357773044, + "grad_norm": 0.24957185983657837, + "learning_rate": 1.6513558959002334e-06, + "loss": 0.0073, + "step": 5838 + }, + { + "epoch": 3.55278369333739, + "grad_norm": 0.23904968798160553, + "learning_rate": 1.6469093868841434e-06, + "loss": 0.0071, + "step": 5839 + }, + { + "epoch": 3.553392150897475, + "grad_norm": 0.1692766696214676, + "learning_rate": 1.6424686684611224e-06, + "loss": 0.0027, + "step": 5840 + }, + { + "epoch": 3.55400060845756, + "grad_norm": 0.28650224208831787, + "learning_rate": 1.638033741732281e-06, + "loss": 0.0097, + "step": 5841 + }, + { + "epoch": 3.5546090660176453, + "grad_norm": 0.14089111983776093, + "learning_rate": 1.6336046077972983e-06, + "loss": 0.0019, + "step": 5842 + }, + { + "epoch": 3.5552175235777304, + "grad_norm": 0.148437961935997, + "learning_rate": 1.6291812677544121e-06, + "loss": 0.0024, + "step": 5843 + }, + { + "epoch": 3.555825981137816, + "grad_norm": 0.26019591093063354, + "learning_rate": 1.6247637227004342e-06, + "loss": 0.007, + "step": 5844 + }, + { + "epoch": 3.5564344386979005, + "grad_norm": 0.2771880626678467, + "learning_rate": 1.6203519737307187e-06, + "loss": 0.0042, + "step": 5845 + }, + { + "epoch": 3.557042896257986, + "grad_norm": 0.27632075548171997, + "learning_rate": 1.615946021939202e-06, + "loss": 0.0059, + "step": 5846 + }, + { + "epoch": 3.557651353818071, + "grad_norm": 0.2640374004840851, + "learning_rate": 1.6115458684183793e-06, + "loss": 0.0031, + "step": 5847 + }, + { + "epoch": 3.5582598113781563, + "grad_norm": 0.23111113905906677, + "learning_rate": 1.607151514259303e-06, + "loss": 0.0044, + "step": 5848 + }, + { + "epoch": 3.558868268938242, + "grad_norm": 0.40425482392311096, + "learning_rate": 1.602762960551582e-06, + "loss": 0.0084, + "step": 5849 + }, + { + "epoch": 3.5594767264983265, + "grad_norm": 0.17525078356266022, + "learning_rate": 1.5983802083834126e-06, + "loss": 0.0032, + "step": 5850 + }, + { + "epoch": 3.560085184058412, + "grad_norm": 0.2451610118150711, + "learning_rate": 1.5940032588415171e-06, + "loss": 0.0077, + "step": 5851 + }, + { + "epoch": 3.560693641618497, + "grad_norm": 0.2035626322031021, + "learning_rate": 1.5896321130112023e-06, + "loss": 0.0035, + "step": 5852 + }, + { + "epoch": 3.561302099178582, + "grad_norm": 0.16917142271995544, + "learning_rate": 1.5852667719763348e-06, + "loss": 0.0026, + "step": 5853 + }, + { + "epoch": 3.5619105567386677, + "grad_norm": 0.40496107935905457, + "learning_rate": 1.5809072368193345e-06, + "loss": 0.027, + "step": 5854 + }, + { + "epoch": 3.5625190142987524, + "grad_norm": 0.2576707899570465, + "learning_rate": 1.5765535086211786e-06, + "loss": 0.0059, + "step": 5855 + }, + { + "epoch": 3.563127471858838, + "grad_norm": 0.27489739656448364, + "learning_rate": 1.57220558846142e-06, + "loss": 0.0037, + "step": 5856 + }, + { + "epoch": 3.563735929418923, + "grad_norm": 0.16968370974063873, + "learning_rate": 1.567863477418155e-06, + "loss": 0.0036, + "step": 5857 + }, + { + "epoch": 3.564344386979008, + "grad_norm": 0.271075040102005, + "learning_rate": 1.5635271765680525e-06, + "loss": 0.0051, + "step": 5858 + }, + { + "epoch": 3.5649528445390937, + "grad_norm": 0.27260100841522217, + "learning_rate": 1.5591966869863196e-06, + "loss": 0.0062, + "step": 5859 + }, + { + "epoch": 3.5655613020991783, + "grad_norm": 0.15648119151592255, + "learning_rate": 1.554872009746758e-06, + "loss": 0.0032, + "step": 5860 + }, + { + "epoch": 3.566169759659264, + "grad_norm": 0.23940731585025787, + "learning_rate": 1.5505531459216904e-06, + "loss": 0.004, + "step": 5861 + }, + { + "epoch": 3.566778217219349, + "grad_norm": 0.3243446350097656, + "learning_rate": 1.5462400965820218e-06, + "loss": 0.0109, + "step": 5862 + }, + { + "epoch": 3.567386674779434, + "grad_norm": 0.14726978540420532, + "learning_rate": 1.5419328627972103e-06, + "loss": 0.0021, + "step": 5863 + }, + { + "epoch": 3.5679951323395196, + "grad_norm": 0.2737961411476135, + "learning_rate": 1.5376314456352708e-06, + "loss": 0.0065, + "step": 5864 + }, + { + "epoch": 3.5686035898996042, + "grad_norm": 0.2394101470708847, + "learning_rate": 1.5333358461627673e-06, + "loss": 0.0036, + "step": 5865 + }, + { + "epoch": 3.56921204745969, + "grad_norm": 0.20798607170581818, + "learning_rate": 1.5290460654448418e-06, + "loss": 0.0051, + "step": 5866 + }, + { + "epoch": 3.569820505019775, + "grad_norm": 0.21632260084152222, + "learning_rate": 1.5247621045451688e-06, + "loss": 0.0044, + "step": 5867 + }, + { + "epoch": 3.57042896257986, + "grad_norm": 0.28413519263267517, + "learning_rate": 1.5204839645259983e-06, + "loss": 0.005, + "step": 5868 + }, + { + "epoch": 3.571037420139945, + "grad_norm": 0.28667712211608887, + "learning_rate": 1.5162116464481318e-06, + "loss": 0.0077, + "step": 5869 + }, + { + "epoch": 3.57164587770003, + "grad_norm": 0.25631603598594666, + "learning_rate": 1.5119451513709277e-06, + "loss": 0.0039, + "step": 5870 + }, + { + "epoch": 3.5722543352601157, + "grad_norm": 0.19717690348625183, + "learning_rate": 1.5076844803522922e-06, + "loss": 0.0024, + "step": 5871 + }, + { + "epoch": 3.572862792820201, + "grad_norm": 0.28270480036735535, + "learning_rate": 1.503429634448697e-06, + "loss": 0.0071, + "step": 5872 + }, + { + "epoch": 3.573471250380286, + "grad_norm": 0.23655082285404205, + "learning_rate": 1.4991806147151677e-06, + "loss": 0.0069, + "step": 5873 + }, + { + "epoch": 3.574079707940371, + "grad_norm": 0.22452490031719208, + "learning_rate": 1.4949374222052864e-06, + "loss": 0.0042, + "step": 5874 + }, + { + "epoch": 3.574688165500456, + "grad_norm": 0.21213477849960327, + "learning_rate": 1.4907000579711782e-06, + "loss": 0.0043, + "step": 5875 + }, + { + "epoch": 3.5752966230605416, + "grad_norm": 0.27884024381637573, + "learning_rate": 1.4864685230635473e-06, + "loss": 0.0057, + "step": 5876 + }, + { + "epoch": 3.5759050806206267, + "grad_norm": 0.23877516388893127, + "learning_rate": 1.4822428185316261e-06, + "loss": 0.0067, + "step": 5877 + }, + { + "epoch": 3.576513538180712, + "grad_norm": 0.20616334676742554, + "learning_rate": 1.4780229454232158e-06, + "loss": 0.0053, + "step": 5878 + }, + { + "epoch": 3.577121995740797, + "grad_norm": 0.18893709778785706, + "learning_rate": 1.4738089047846736e-06, + "loss": 0.0038, + "step": 5879 + }, + { + "epoch": 3.577730453300882, + "grad_norm": 0.2203339785337448, + "learning_rate": 1.4696006976609057e-06, + "loss": 0.0054, + "step": 5880 + }, + { + "epoch": 3.5783389108609676, + "grad_norm": 0.15522873401641846, + "learning_rate": 1.4653983250953657e-06, + "loss": 0.0029, + "step": 5881 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.14982964098453522, + "learning_rate": 1.4612017881300704e-06, + "loss": 0.0032, + "step": 5882 + }, + { + "epoch": 3.5795558259811378, + "grad_norm": 0.26596924662590027, + "learning_rate": 1.4570110878055877e-06, + "loss": 0.0044, + "step": 5883 + }, + { + "epoch": 3.580164283541223, + "grad_norm": 0.2501273453235626, + "learning_rate": 1.4528262251610358e-06, + "loss": 0.007, + "step": 5884 + }, + { + "epoch": 3.580772741101308, + "grad_norm": 0.19661864638328552, + "learning_rate": 1.4486472012340824e-06, + "loss": 0.0042, + "step": 5885 + }, + { + "epoch": 3.5813811986613935, + "grad_norm": 0.22375038266181946, + "learning_rate": 1.4444740170609567e-06, + "loss": 0.0061, + "step": 5886 + }, + { + "epoch": 3.5819896562214786, + "grad_norm": 0.28383755683898926, + "learning_rate": 1.440306673676431e-06, + "loss": 0.0056, + "step": 5887 + }, + { + "epoch": 3.5825981137815637, + "grad_norm": 0.17903096973896027, + "learning_rate": 1.436145172113834e-06, + "loss": 0.0037, + "step": 5888 + }, + { + "epoch": 3.5832065713416488, + "grad_norm": 0.24099615216255188, + "learning_rate": 1.4319895134050437e-06, + "loss": 0.0047, + "step": 5889 + }, + { + "epoch": 3.583815028901734, + "grad_norm": 0.20468921959400177, + "learning_rate": 1.4278396985804966e-06, + "loss": 0.005, + "step": 5890 + }, + { + "epoch": 3.5844234864618194, + "grad_norm": 0.26146969199180603, + "learning_rate": 1.4236957286691581e-06, + "loss": 0.0064, + "step": 5891 + }, + { + "epoch": 3.5850319440219045, + "grad_norm": 0.328022837638855, + "learning_rate": 1.4195576046985793e-06, + "loss": 0.0064, + "step": 5892 + }, + { + "epoch": 3.5856404015819896, + "grad_norm": 0.26720380783081055, + "learning_rate": 1.4154253276948276e-06, + "loss": 0.0044, + "step": 5893 + }, + { + "epoch": 3.5862488591420747, + "grad_norm": 0.20053601264953613, + "learning_rate": 1.4112988986825476e-06, + "loss": 0.0033, + "step": 5894 + }, + { + "epoch": 3.58685731670216, + "grad_norm": 0.251700758934021, + "learning_rate": 1.407178318684907e-06, + "loss": 0.0063, + "step": 5895 + }, + { + "epoch": 3.5874657742622453, + "grad_norm": 0.4364267587661743, + "learning_rate": 1.403063588723652e-06, + "loss": 0.0114, + "step": 5896 + }, + { + "epoch": 3.5880742318223304, + "grad_norm": 0.25533613562583923, + "learning_rate": 1.3989547098190559e-06, + "loss": 0.0049, + "step": 5897 + }, + { + "epoch": 3.5886826893824155, + "grad_norm": 0.22121703624725342, + "learning_rate": 1.3948516829899505e-06, + "loss": 0.0056, + "step": 5898 + }, + { + "epoch": 3.5892911469425006, + "grad_norm": 0.16689860820770264, + "learning_rate": 1.3907545092537166e-06, + "loss": 0.0029, + "step": 5899 + }, + { + "epoch": 3.5898996045025857, + "grad_norm": 0.22792737185955048, + "learning_rate": 1.386663189626286e-06, + "loss": 0.0051, + "step": 5900 + }, + { + "epoch": 3.5905080620626713, + "grad_norm": 0.275244802236557, + "learning_rate": 1.3825777251221278e-06, + "loss": 0.0072, + "step": 5901 + }, + { + "epoch": 3.5911165196227564, + "grad_norm": 0.34632372856140137, + "learning_rate": 1.378498116754276e-06, + "loss": 0.0078, + "step": 5902 + }, + { + "epoch": 3.5917249771828414, + "grad_norm": 0.20081204175949097, + "learning_rate": 1.3744243655342937e-06, + "loss": 0.006, + "step": 5903 + }, + { + "epoch": 3.5923334347429265, + "grad_norm": 0.11412859708070755, + "learning_rate": 1.3703564724723116e-06, + "loss": 0.0016, + "step": 5904 + }, + { + "epoch": 3.5929418923030116, + "grad_norm": 0.22430390119552612, + "learning_rate": 1.3662944385769843e-06, + "loss": 0.004, + "step": 5905 + }, + { + "epoch": 3.593550349863097, + "grad_norm": 0.25405508279800415, + "learning_rate": 1.362238264855542e-06, + "loss": 0.007, + "step": 5906 + }, + { + "epoch": 3.5941588074231823, + "grad_norm": 0.447488397359848, + "learning_rate": 1.3581879523137386e-06, + "loss": 0.0064, + "step": 5907 + }, + { + "epoch": 3.5947672649832674, + "grad_norm": 0.2718401253223419, + "learning_rate": 1.354143501955879e-06, + "loss": 0.0067, + "step": 5908 + }, + { + "epoch": 3.5953757225433525, + "grad_norm": 0.18165600299835205, + "learning_rate": 1.3501049147848277e-06, + "loss": 0.0047, + "step": 5909 + }, + { + "epoch": 3.5959841801034376, + "grad_norm": 0.2794153392314911, + "learning_rate": 1.346072191801981e-06, + "loss": 0.0047, + "step": 5910 + }, + { + "epoch": 3.596592637663523, + "grad_norm": 0.21536363661289215, + "learning_rate": 1.3420453340072832e-06, + "loss": 0.0067, + "step": 5911 + }, + { + "epoch": 3.597201095223608, + "grad_norm": 0.2198522537946701, + "learning_rate": 1.3380243423992328e-06, + "loss": 0.0053, + "step": 5912 + }, + { + "epoch": 3.5978095527836933, + "grad_norm": 0.17715135216712952, + "learning_rate": 1.3340092179748658e-06, + "loss": 0.004, + "step": 5913 + }, + { + "epoch": 3.5984180103437784, + "grad_norm": 0.2602989375591278, + "learning_rate": 1.3299999617297637e-06, + "loss": 0.0056, + "step": 5914 + }, + { + "epoch": 3.5990264679038635, + "grad_norm": 0.20000648498535156, + "learning_rate": 1.3259965746580588e-06, + "loss": 0.0018, + "step": 5915 + }, + { + "epoch": 3.599634925463949, + "grad_norm": 0.25586432218551636, + "learning_rate": 1.3219990577524239e-06, + "loss": 0.008, + "step": 5916 + }, + { + "epoch": 3.600243383024034, + "grad_norm": 0.2492225617170334, + "learning_rate": 1.3180074120040741e-06, + "loss": 0.0054, + "step": 5917 + }, + { + "epoch": 3.6008518405841192, + "grad_norm": 0.21238954365253448, + "learning_rate": 1.3140216384027682e-06, + "loss": 0.0044, + "step": 5918 + }, + { + "epoch": 3.6014602981442043, + "grad_norm": 0.13737817108631134, + "learning_rate": 1.3100417379368179e-06, + "loss": 0.0033, + "step": 5919 + }, + { + "epoch": 3.6020687557042894, + "grad_norm": 0.34668809175491333, + "learning_rate": 1.306067711593076e-06, + "loss": 0.0071, + "step": 5920 + }, + { + "epoch": 3.602677213264375, + "grad_norm": 0.2114226520061493, + "learning_rate": 1.3020995603569203e-06, + "loss": 0.0044, + "step": 5921 + }, + { + "epoch": 3.60328567082446, + "grad_norm": 0.3277231454849243, + "learning_rate": 1.298137285212303e-06, + "loss": 0.008, + "step": 5922 + }, + { + "epoch": 3.603894128384545, + "grad_norm": 0.2284201830625534, + "learning_rate": 1.2941808871416938e-06, + "loss": 0.0035, + "step": 5923 + }, + { + "epoch": 3.6045025859446302, + "grad_norm": 0.2673366069793701, + "learning_rate": 1.290230367126119e-06, + "loss": 0.005, + "step": 5924 + }, + { + "epoch": 3.6051110435047153, + "grad_norm": 0.21040508151054382, + "learning_rate": 1.2862857261451395e-06, + "loss": 0.0037, + "step": 5925 + }, + { + "epoch": 3.605719501064801, + "grad_norm": 0.2350999116897583, + "learning_rate": 1.282346965176867e-06, + "loss": 0.0021, + "step": 5926 + }, + { + "epoch": 3.606327958624886, + "grad_norm": 0.33141762018203735, + "learning_rate": 1.2784140851979404e-06, + "loss": 0.0068, + "step": 5927 + }, + { + "epoch": 3.606936416184971, + "grad_norm": 0.2573280334472656, + "learning_rate": 1.2744870871835623e-06, + "loss": 0.0072, + "step": 5928 + }, + { + "epoch": 3.607544873745056, + "grad_norm": 0.21989892423152924, + "learning_rate": 1.270565972107457e-06, + "loss": 0.0088, + "step": 5929 + }, + { + "epoch": 3.6081533313051413, + "grad_norm": 0.2301366925239563, + "learning_rate": 1.2666507409418993e-06, + "loss": 0.0065, + "step": 5930 + }, + { + "epoch": 3.608761788865227, + "grad_norm": 0.18148985505104065, + "learning_rate": 1.2627413946576989e-06, + "loss": 0.0043, + "step": 5931 + }, + { + "epoch": 3.609370246425312, + "grad_norm": 0.24122391641139984, + "learning_rate": 1.2588379342242218e-06, + "loss": 0.0051, + "step": 5932 + }, + { + "epoch": 3.609978703985397, + "grad_norm": 0.4258638322353363, + "learning_rate": 1.2549403606093525e-06, + "loss": 0.0056, + "step": 5933 + }, + { + "epoch": 3.610587161545482, + "grad_norm": 0.2916209399700165, + "learning_rate": 1.2510486747795286e-06, + "loss": 0.006, + "step": 5934 + }, + { + "epoch": 3.611195619105567, + "grad_norm": 0.2829422056674957, + "learning_rate": 1.2471628776997312e-06, + "loss": 0.0039, + "step": 5935 + }, + { + "epoch": 3.6118040766656527, + "grad_norm": 0.2982374429702759, + "learning_rate": 1.2432829703334759e-06, + "loss": 0.006, + "step": 5936 + }, + { + "epoch": 3.612412534225738, + "grad_norm": 0.19474712014198303, + "learning_rate": 1.2394089536428067e-06, + "loss": 0.006, + "step": 5937 + }, + { + "epoch": 3.613020991785823, + "grad_norm": 0.2293567806482315, + "learning_rate": 1.2355408285883357e-06, + "loss": 0.0076, + "step": 5938 + }, + { + "epoch": 3.613629449345908, + "grad_norm": 0.26767081022262573, + "learning_rate": 1.2316785961291849e-06, + "loss": 0.0035, + "step": 5939 + }, + { + "epoch": 3.614237906905993, + "grad_norm": 0.2060486376285553, + "learning_rate": 1.2278222572230268e-06, + "loss": 0.0043, + "step": 5940 + }, + { + "epoch": 3.6148463644660787, + "grad_norm": 0.21085241436958313, + "learning_rate": 1.2239718128260774e-06, + "loss": 0.0062, + "step": 5941 + }, + { + "epoch": 3.6154548220261638, + "grad_norm": 0.24178573489189148, + "learning_rate": 1.2201272638930894e-06, + "loss": 0.0058, + "step": 5942 + }, + { + "epoch": 3.616063279586249, + "grad_norm": 0.20811717212200165, + "learning_rate": 1.216288611377342e-06, + "loss": 0.0049, + "step": 5943 + }, + { + "epoch": 3.616671737146334, + "grad_norm": 0.3802126348018646, + "learning_rate": 1.2124558562306625e-06, + "loss": 0.0064, + "step": 5944 + }, + { + "epoch": 3.617280194706419, + "grad_norm": 0.2511650323867798, + "learning_rate": 1.2086289994034217e-06, + "loss": 0.0051, + "step": 5945 + }, + { + "epoch": 3.6178886522665046, + "grad_norm": 0.27000531554222107, + "learning_rate": 1.204808041844513e-06, + "loss": 0.0072, + "step": 5946 + }, + { + "epoch": 3.6184971098265897, + "grad_norm": 0.26983216404914856, + "learning_rate": 1.2009929845013757e-06, + "loss": 0.0072, + "step": 5947 + }, + { + "epoch": 3.6191055673866748, + "grad_norm": 0.19793999195098877, + "learning_rate": 1.197183828319992e-06, + "loss": 0.0045, + "step": 5948 + }, + { + "epoch": 3.61971402494676, + "grad_norm": 0.238411545753479, + "learning_rate": 1.193380574244865e-06, + "loss": 0.0068, + "step": 5949 + }, + { + "epoch": 3.620322482506845, + "grad_norm": 0.1789894700050354, + "learning_rate": 1.1895832232190485e-06, + "loss": 0.0026, + "step": 5950 + }, + { + "epoch": 3.6209309400669305, + "grad_norm": 0.27357521653175354, + "learning_rate": 1.1857917761841225e-06, + "loss": 0.0085, + "step": 5951 + }, + { + "epoch": 3.6215393976270156, + "grad_norm": 0.31104499101638794, + "learning_rate": 1.182006234080213e-06, + "loss": 0.0063, + "step": 5952 + }, + { + "epoch": 3.6221478551871007, + "grad_norm": 0.23851986229419708, + "learning_rate": 1.1782265978459771e-06, + "loss": 0.0035, + "step": 5953 + }, + { + "epoch": 3.622756312747186, + "grad_norm": 0.48092883825302124, + "learning_rate": 1.1744528684186018e-06, + "loss": 0.0056, + "step": 5954 + }, + { + "epoch": 3.623364770307271, + "grad_norm": 0.19673019647598267, + "learning_rate": 1.170685046733816e-06, + "loss": 0.0052, + "step": 5955 + }, + { + "epoch": 3.6239732278673564, + "grad_norm": 0.24021019041538239, + "learning_rate": 1.1669231337258862e-06, + "loss": 0.0064, + "step": 5956 + }, + { + "epoch": 3.6245816854274415, + "grad_norm": 0.2394445836544037, + "learning_rate": 1.1631671303276054e-06, + "loss": 0.0043, + "step": 5957 + }, + { + "epoch": 3.6251901429875266, + "grad_norm": 0.24804948270320892, + "learning_rate": 1.1594170374703088e-06, + "loss": 0.0048, + "step": 5958 + }, + { + "epoch": 3.6257986005476117, + "grad_norm": 0.3146321475505829, + "learning_rate": 1.155672856083867e-06, + "loss": 0.0046, + "step": 5959 + }, + { + "epoch": 3.626407058107697, + "grad_norm": 0.2864423990249634, + "learning_rate": 1.1519345870966703e-06, + "loss": 0.0097, + "step": 5960 + }, + { + "epoch": 3.6270155156677824, + "grad_norm": 0.26554256677627563, + "learning_rate": 1.1482022314356606e-06, + "loss": 0.007, + "step": 5961 + }, + { + "epoch": 3.6276239732278674, + "grad_norm": 0.2531816363334656, + "learning_rate": 1.144475790026306e-06, + "loss": 0.0054, + "step": 5962 + }, + { + "epoch": 3.6282324307879525, + "grad_norm": 0.2108616828918457, + "learning_rate": 1.1407552637926117e-06, + "loss": 0.0055, + "step": 5963 + }, + { + "epoch": 3.6288408883480376, + "grad_norm": 0.19454440474510193, + "learning_rate": 1.1370406536571066e-06, + "loss": 0.0038, + "step": 5964 + }, + { + "epoch": 3.6294493459081227, + "grad_norm": 0.2602161169052124, + "learning_rate": 1.1333319605408622e-06, + "loss": 0.0079, + "step": 5965 + }, + { + "epoch": 3.6300578034682083, + "grad_norm": 0.22308175265789032, + "learning_rate": 1.1296291853634816e-06, + "loss": 0.0068, + "step": 5966 + }, + { + "epoch": 3.6306662610282934, + "grad_norm": 0.19663815200328827, + "learning_rate": 1.1259323290430944e-06, + "loss": 0.0054, + "step": 5967 + }, + { + "epoch": 3.6312747185883785, + "grad_norm": 0.23746468126773834, + "learning_rate": 1.1222413924963705e-06, + "loss": 0.0043, + "step": 5968 + }, + { + "epoch": 3.6318831761484636, + "grad_norm": 0.19152231514453888, + "learning_rate": 1.1185563766385077e-06, + "loss": 0.0048, + "step": 5969 + }, + { + "epoch": 3.6324916337085487, + "grad_norm": 0.21245577931404114, + "learning_rate": 1.1148772823832365e-06, + "loss": 0.0034, + "step": 5970 + }, + { + "epoch": 3.633100091268634, + "grad_norm": 0.29071682691574097, + "learning_rate": 1.1112041106428162e-06, + "loss": 0.008, + "step": 5971 + }, + { + "epoch": 3.6337085488287193, + "grad_norm": 0.23585431277751923, + "learning_rate": 1.107536862328043e-06, + "loss": 0.0071, + "step": 5972 + }, + { + "epoch": 3.6343170063888044, + "grad_norm": 0.29736191034317017, + "learning_rate": 1.1038755383482397e-06, + "loss": 0.0092, + "step": 5973 + }, + { + "epoch": 3.6349254639488895, + "grad_norm": 0.24729853868484497, + "learning_rate": 1.100220139611266e-06, + "loss": 0.0049, + "step": 5974 + }, + { + "epoch": 3.6355339215089746, + "grad_norm": 0.15655024349689484, + "learning_rate": 1.0965706670235081e-06, + "loss": 0.0027, + "step": 5975 + }, + { + "epoch": 3.63614237906906, + "grad_norm": 0.15523763000965118, + "learning_rate": 1.0929271214898755e-06, + "loss": 0.0044, + "step": 5976 + }, + { + "epoch": 3.6367508366291452, + "grad_norm": 0.33551907539367676, + "learning_rate": 1.0892895039138234e-06, + "loss": 0.0069, + "step": 5977 + }, + { + "epoch": 3.6373592941892303, + "grad_norm": 0.2048521190881729, + "learning_rate": 1.0856578151973246e-06, + "loss": 0.0038, + "step": 5978 + }, + { + "epoch": 3.6379677517493154, + "grad_norm": 0.4035876393318176, + "learning_rate": 1.0820320562408947e-06, + "loss": 0.013, + "step": 5979 + }, + { + "epoch": 3.6385762093094005, + "grad_norm": 0.3847244083881378, + "learning_rate": 1.0784122279435565e-06, + "loss": 0.0125, + "step": 5980 + }, + { + "epoch": 3.639184666869486, + "grad_norm": 0.2535593807697296, + "learning_rate": 1.074798331202892e-06, + "loss": 0.0073, + "step": 5981 + }, + { + "epoch": 3.639793124429571, + "grad_norm": 0.3089277148246765, + "learning_rate": 1.0711903669149843e-06, + "loss": 0.0079, + "step": 5982 + }, + { + "epoch": 3.6404015819896562, + "grad_norm": 0.3164043724536896, + "learning_rate": 1.067588335974465e-06, + "loss": 0.0064, + "step": 5983 + }, + { + "epoch": 3.6410100395497413, + "grad_norm": 0.30803948640823364, + "learning_rate": 1.0639922392744889e-06, + "loss": 0.0103, + "step": 5984 + }, + { + "epoch": 3.6416184971098264, + "grad_norm": 0.32882362604141235, + "learning_rate": 1.0604020777067347e-06, + "loss": 0.0082, + "step": 5985 + }, + { + "epoch": 3.642226954669912, + "grad_norm": 0.9143747091293335, + "learning_rate": 1.0568178521614125e-06, + "loss": 0.0232, + "step": 5986 + }, + { + "epoch": 3.642835412229997, + "grad_norm": 0.43392911553382874, + "learning_rate": 1.0532395635272613e-06, + "loss": 0.0092, + "step": 5987 + }, + { + "epoch": 3.643443869790082, + "grad_norm": 0.18668395280838013, + "learning_rate": 1.0496672126915492e-06, + "loss": 0.0029, + "step": 5988 + }, + { + "epoch": 3.6440523273501673, + "grad_norm": 0.2678108513355255, + "learning_rate": 1.046100800540073e-06, + "loss": 0.0064, + "step": 5989 + }, + { + "epoch": 3.6446607849102524, + "grad_norm": 0.30396801233291626, + "learning_rate": 1.0425403279571422e-06, + "loss": 0.0081, + "step": 5990 + }, + { + "epoch": 3.645269242470338, + "grad_norm": 0.17665399610996246, + "learning_rate": 1.0389857958256227e-06, + "loss": 0.0039, + "step": 5991 + }, + { + "epoch": 3.645877700030423, + "grad_norm": 0.24349285662174225, + "learning_rate": 1.0354372050268762e-06, + "loss": 0.003, + "step": 5992 + }, + { + "epoch": 3.646486157590508, + "grad_norm": 0.22671709954738617, + "learning_rate": 1.03189455644081e-06, + "loss": 0.0066, + "step": 5993 + }, + { + "epoch": 3.647094615150593, + "grad_norm": 0.2378721684217453, + "learning_rate": 1.0283578509458548e-06, + "loss": 0.0059, + "step": 5994 + }, + { + "epoch": 3.6477030727106783, + "grad_norm": 0.3115555942058563, + "learning_rate": 1.0248270894189698e-06, + "loss": 0.0051, + "step": 5995 + }, + { + "epoch": 3.648311530270764, + "grad_norm": 0.3361762464046478, + "learning_rate": 1.0213022727356247e-06, + "loss": 0.0077, + "step": 5996 + }, + { + "epoch": 3.648919987830849, + "grad_norm": 0.27221810817718506, + "learning_rate": 1.0177834017698423e-06, + "loss": 0.0068, + "step": 5997 + }, + { + "epoch": 3.649528445390934, + "grad_norm": 0.2501876652240753, + "learning_rate": 1.0142704773941443e-06, + "loss": 0.0063, + "step": 5998 + }, + { + "epoch": 3.650136902951019, + "grad_norm": 0.23100760579109192, + "learning_rate": 1.0107635004795946e-06, + "loss": 0.0052, + "step": 5999 + }, + { + "epoch": 3.650745360511104, + "grad_norm": 0.18547479808330536, + "learning_rate": 1.007262471895773e-06, + "loss": 0.004, + "step": 6000 + }, + { + "epoch": 3.6513538180711897, + "grad_norm": 0.22204366326332092, + "learning_rate": 1.003767392510796e-06, + "loss": 0.0041, + "step": 6001 + }, + { + "epoch": 3.651962275631275, + "grad_norm": 0.1914484053850174, + "learning_rate": 1.00027826319129e-06, + "loss": 0.0057, + "step": 6002 + }, + { + "epoch": 3.65257073319136, + "grad_norm": 0.18928459286689758, + "learning_rate": 9.967950848024183e-07, + "loss": 0.0041, + "step": 6003 + }, + { + "epoch": 3.653179190751445, + "grad_norm": 0.2991024851799011, + "learning_rate": 9.933178582078624e-07, + "loss": 0.005, + "step": 6004 + }, + { + "epoch": 3.65378764831153, + "grad_norm": 0.12466610223054886, + "learning_rate": 9.898465842698323e-07, + "loss": 0.0017, + "step": 6005 + }, + { + "epoch": 3.6543961058716157, + "grad_norm": 0.5989731550216675, + "learning_rate": 9.863812638490511e-07, + "loss": 0.016, + "step": 6006 + }, + { + "epoch": 3.6550045634317008, + "grad_norm": 0.22727476060390472, + "learning_rate": 9.829218978047839e-07, + "loss": 0.0059, + "step": 6007 + }, + { + "epoch": 3.655613020991786, + "grad_norm": 0.26156729459762573, + "learning_rate": 9.794684869948056e-07, + "loss": 0.0066, + "step": 6008 + }, + { + "epoch": 3.656221478551871, + "grad_norm": 0.24984483420848846, + "learning_rate": 9.760210322754175e-07, + "loss": 0.0052, + "step": 6009 + }, + { + "epoch": 3.656829936111956, + "grad_norm": 0.1444278508424759, + "learning_rate": 9.725795345014387e-07, + "loss": 0.0034, + "step": 6010 + }, + { + "epoch": 3.6574383936720416, + "grad_norm": 0.2517237663269043, + "learning_rate": 9.69143994526231e-07, + "loss": 0.0065, + "step": 6011 + }, + { + "epoch": 3.6580468512321267, + "grad_norm": 0.2072015106678009, + "learning_rate": 9.657144132016517e-07, + "loss": 0.0028, + "step": 6012 + }, + { + "epoch": 3.658655308792212, + "grad_norm": 0.21760626137256622, + "learning_rate": 9.62290791378101e-07, + "loss": 0.0055, + "step": 6013 + }, + { + "epoch": 3.659263766352297, + "grad_norm": 0.22900134325027466, + "learning_rate": 9.588731299044945e-07, + "loss": 0.0089, + "step": 6014 + }, + { + "epoch": 3.659872223912382, + "grad_norm": 0.23601651191711426, + "learning_rate": 9.554614296282682e-07, + "loss": 0.008, + "step": 6015 + }, + { + "epoch": 3.6604806814724675, + "grad_norm": 0.20876143872737885, + "learning_rate": 9.52055691395376e-07, + "loss": 0.0056, + "step": 6016 + }, + { + "epoch": 3.6610891390325526, + "grad_norm": 0.20409031212329865, + "learning_rate": 9.486559160503117e-07, + "loss": 0.0051, + "step": 6017 + }, + { + "epoch": 3.6616975965926377, + "grad_norm": 0.17437663674354553, + "learning_rate": 9.452621044360676e-07, + "loss": 0.0045, + "step": 6018 + }, + { + "epoch": 3.662306054152723, + "grad_norm": 0.7772207260131836, + "learning_rate": 9.418742573941707e-07, + "loss": 0.042, + "step": 6019 + }, + { + "epoch": 3.662914511712808, + "grad_norm": 0.21668609976768494, + "learning_rate": 9.384923757646657e-07, + "loss": 0.0037, + "step": 6020 + }, + { + "epoch": 3.6635229692728934, + "grad_norm": 0.22653992474079132, + "learning_rate": 9.351164603861234e-07, + "loss": 0.0067, + "step": 6021 + }, + { + "epoch": 3.6641314268329785, + "grad_norm": 0.19564014673233032, + "learning_rate": 9.317465120956215e-07, + "loss": 0.0054, + "step": 6022 + }, + { + "epoch": 3.6647398843930636, + "grad_norm": 0.17543290555477142, + "learning_rate": 9.283825317287692e-07, + "loss": 0.0033, + "step": 6023 + }, + { + "epoch": 3.6653483419531487, + "grad_norm": 0.16604088246822357, + "learning_rate": 9.250245201196938e-07, + "loss": 0.0054, + "step": 6024 + }, + { + "epoch": 3.665956799513234, + "grad_norm": 0.25378942489624023, + "learning_rate": 9.216724781010461e-07, + "loss": 0.005, + "step": 6025 + }, + { + "epoch": 3.6665652570733194, + "grad_norm": 0.2088557928800583, + "learning_rate": 9.183264065039859e-07, + "loss": 0.0033, + "step": 6026 + }, + { + "epoch": 3.6671737146334045, + "grad_norm": 0.22516481578350067, + "learning_rate": 9.149863061582053e-07, + "loss": 0.0092, + "step": 6027 + }, + { + "epoch": 3.6677821721934896, + "grad_norm": 0.31587862968444824, + "learning_rate": 9.116521778919085e-07, + "loss": 0.0089, + "step": 6028 + }, + { + "epoch": 3.6683906297535747, + "grad_norm": 0.34137919545173645, + "learning_rate": 9.083240225318202e-07, + "loss": 0.0135, + "step": 6029 + }, + { + "epoch": 3.6689990873136598, + "grad_norm": 0.21958407759666443, + "learning_rate": 9.050018409031801e-07, + "loss": 0.0044, + "step": 6030 + }, + { + "epoch": 3.6696075448737453, + "grad_norm": 0.32528388500213623, + "learning_rate": 9.016856338297602e-07, + "loss": 0.0076, + "step": 6031 + }, + { + "epoch": 3.6702160024338304, + "grad_norm": 0.21904724836349487, + "learning_rate": 8.983754021338331e-07, + "loss": 0.0048, + "step": 6032 + }, + { + "epoch": 3.6708244599939155, + "grad_norm": 0.3020154535770416, + "learning_rate": 8.950711466362005e-07, + "loss": 0.006, + "step": 6033 + }, + { + "epoch": 3.6714329175540006, + "grad_norm": 0.2894566059112549, + "learning_rate": 8.917728681561793e-07, + "loss": 0.0057, + "step": 6034 + }, + { + "epoch": 3.6720413751140857, + "grad_norm": 0.2588036060333252, + "learning_rate": 8.884805675116098e-07, + "loss": 0.0056, + "step": 6035 + }, + { + "epoch": 3.672649832674171, + "grad_norm": 0.2685299813747406, + "learning_rate": 8.851942455188362e-07, + "loss": 0.007, + "step": 6036 + }, + { + "epoch": 3.6732582902342563, + "grad_norm": 0.19193175435066223, + "learning_rate": 8.819139029927425e-07, + "loss": 0.0032, + "step": 6037 + }, + { + "epoch": 3.6738667477943414, + "grad_norm": 0.20610147714614868, + "learning_rate": 8.786395407467062e-07, + "loss": 0.0036, + "step": 6038 + }, + { + "epoch": 3.6744752053544265, + "grad_norm": 0.17886589467525482, + "learning_rate": 8.753711595926334e-07, + "loss": 0.0046, + "step": 6039 + }, + { + "epoch": 3.6750836629145116, + "grad_norm": 0.2853025794029236, + "learning_rate": 8.721087603409506e-07, + "loss": 0.0057, + "step": 6040 + }, + { + "epoch": 3.675692120474597, + "grad_norm": 0.22553856670856476, + "learning_rate": 8.688523438005996e-07, + "loss": 0.0043, + "step": 6041 + }, + { + "epoch": 3.6763005780346822, + "grad_norm": 0.20292532444000244, + "learning_rate": 8.656019107790237e-07, + "loss": 0.0061, + "step": 6042 + }, + { + "epoch": 3.6769090355947673, + "grad_norm": 0.24954308569431305, + "learning_rate": 8.623574620822083e-07, + "loss": 0.0065, + "step": 6043 + }, + { + "epoch": 3.6775174931548524, + "grad_norm": 0.36442530155181885, + "learning_rate": 8.591189985146352e-07, + "loss": 0.0074, + "step": 6044 + }, + { + "epoch": 3.6781259507149375, + "grad_norm": 0.26411545276641846, + "learning_rate": 8.558865208793093e-07, + "loss": 0.0049, + "step": 6045 + }, + { + "epoch": 3.678734408275023, + "grad_norm": 0.2690979838371277, + "learning_rate": 8.526600299777448e-07, + "loss": 0.0055, + "step": 6046 + }, + { + "epoch": 3.679342865835108, + "grad_norm": 0.14458657801151276, + "learning_rate": 8.49439526609988e-07, + "loss": 0.0029, + "step": 6047 + }, + { + "epoch": 3.6799513233951933, + "grad_norm": 0.2396114468574524, + "learning_rate": 8.462250115745807e-07, + "loss": 0.0076, + "step": 6048 + }, + { + "epoch": 3.6805597809552784, + "grad_norm": 0.22277654707431793, + "learning_rate": 8.430164856685935e-07, + "loss": 0.0046, + "step": 6049 + }, + { + "epoch": 3.6811682385153635, + "grad_norm": 0.22810159623622894, + "learning_rate": 8.398139496876012e-07, + "loss": 0.0072, + "step": 6050 + }, + { + "epoch": 3.681776696075449, + "grad_norm": 0.21732278168201447, + "learning_rate": 8.366174044257103e-07, + "loss": 0.0048, + "step": 6051 + }, + { + "epoch": 3.6823851536355336, + "grad_norm": 0.22662782669067383, + "learning_rate": 8.334268506755144e-07, + "loss": 0.0038, + "step": 6052 + }, + { + "epoch": 3.682993611195619, + "grad_norm": 0.26345643401145935, + "learning_rate": 8.302422892281558e-07, + "loss": 0.0067, + "step": 6053 + }, + { + "epoch": 3.6836020687557043, + "grad_norm": 0.24986715614795685, + "learning_rate": 8.270637208732585e-07, + "loss": 0.0056, + "step": 6054 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.31368815898895264, + "learning_rate": 8.238911463989835e-07, + "loss": 0.0067, + "step": 6055 + }, + { + "epoch": 3.684818983875875, + "grad_norm": 0.6407575607299805, + "learning_rate": 8.207245665919932e-07, + "loss": 0.0278, + "step": 6056 + }, + { + "epoch": 3.6854274414359596, + "grad_norm": 0.259106308221817, + "learning_rate": 8.175639822374709e-07, + "loss": 0.0082, + "step": 6057 + }, + { + "epoch": 3.686035898996045, + "grad_norm": 0.1873266100883484, + "learning_rate": 8.144093941191061e-07, + "loss": 0.0042, + "step": 6058 + }, + { + "epoch": 3.68664435655613, + "grad_norm": 0.3827762007713318, + "learning_rate": 8.112608030191037e-07, + "loss": 0.0117, + "step": 6059 + }, + { + "epoch": 3.6872528141162153, + "grad_norm": 0.3754812479019165, + "learning_rate": 8.081182097181894e-07, + "loss": 0.0145, + "step": 6060 + }, + { + "epoch": 3.687861271676301, + "grad_norm": 0.26298126578330994, + "learning_rate": 8.049816149955896e-07, + "loss": 0.0056, + "step": 6061 + }, + { + "epoch": 3.6884697292363855, + "grad_norm": 0.1487468034029007, + "learning_rate": 8.018510196290519e-07, + "loss": 0.0029, + "step": 6062 + }, + { + "epoch": 3.689078186796471, + "grad_norm": 0.30363762378692627, + "learning_rate": 7.987264243948356e-07, + "loss": 0.0086, + "step": 6063 + }, + { + "epoch": 3.689686644356556, + "grad_norm": 0.1642022281885147, + "learning_rate": 7.956078300677045e-07, + "loss": 0.0029, + "step": 6064 + }, + { + "epoch": 3.6902951019166412, + "grad_norm": 0.2749357521533966, + "learning_rate": 7.924952374209399e-07, + "loss": 0.008, + "step": 6065 + }, + { + "epoch": 3.6909035594767268, + "grad_norm": 0.14298266172409058, + "learning_rate": 7.893886472263412e-07, + "loss": 0.0026, + "step": 6066 + }, + { + "epoch": 3.6915120170368114, + "grad_norm": 0.2198246717453003, + "learning_rate": 7.862880602542116e-07, + "loss": 0.005, + "step": 6067 + }, + { + "epoch": 3.692120474596897, + "grad_norm": 0.21324285864830017, + "learning_rate": 7.83193477273364e-07, + "loss": 0.0043, + "step": 6068 + }, + { + "epoch": 3.692728932156982, + "grad_norm": 0.3092917203903198, + "learning_rate": 7.801048990511262e-07, + "loss": 0.0179, + "step": 6069 + }, + { + "epoch": 3.693337389717067, + "grad_norm": 0.24335362017154694, + "learning_rate": 7.770223263533411e-07, + "loss": 0.0066, + "step": 6070 + }, + { + "epoch": 3.6939458472771527, + "grad_norm": 0.27647829055786133, + "learning_rate": 7.739457599443528e-07, + "loss": 0.0069, + "step": 6071 + }, + { + "epoch": 3.6945543048372373, + "grad_norm": 0.27333924174308777, + "learning_rate": 7.708752005870263e-07, + "loss": 0.0076, + "step": 6072 + }, + { + "epoch": 3.695162762397323, + "grad_norm": 0.21789668500423431, + "learning_rate": 7.6781064904273e-07, + "loss": 0.0057, + "step": 6073 + }, + { + "epoch": 3.695771219957408, + "grad_norm": 0.25622355937957764, + "learning_rate": 7.647521060713452e-07, + "loss": 0.0054, + "step": 6074 + }, + { + "epoch": 3.696379677517493, + "grad_norm": 0.28123655915260315, + "learning_rate": 7.616995724312626e-07, + "loss": 0.0085, + "step": 6075 + }, + { + "epoch": 3.696988135077578, + "grad_norm": 0.2564314305782318, + "learning_rate": 7.586530488793847e-07, + "loss": 0.0069, + "step": 6076 + }, + { + "epoch": 3.6975965926376633, + "grad_norm": 0.30161136388778687, + "learning_rate": 7.556125361711214e-07, + "loss": 0.0059, + "step": 6077 + }, + { + "epoch": 3.698205050197749, + "grad_norm": 0.17149339616298676, + "learning_rate": 7.525780350603917e-07, + "loss": 0.0035, + "step": 6078 + }, + { + "epoch": 3.698813507757834, + "grad_norm": 0.2607922852039337, + "learning_rate": 7.495495462996327e-07, + "loss": 0.0156, + "step": 6079 + }, + { + "epoch": 3.699421965317919, + "grad_norm": 0.30373069643974304, + "learning_rate": 7.465270706397714e-07, + "loss": 0.0071, + "step": 6080 + }, + { + "epoch": 3.700030422878004, + "grad_norm": 0.17537471652030945, + "learning_rate": 7.43510608830264e-07, + "loss": 0.0047, + "step": 6081 + }, + { + "epoch": 3.700638880438089, + "grad_norm": 0.2823267877101898, + "learning_rate": 7.405001616190649e-07, + "loss": 0.0044, + "step": 6082 + }, + { + "epoch": 3.7012473379981747, + "grad_norm": 0.3912118375301361, + "learning_rate": 7.374957297526408e-07, + "loss": 0.006, + "step": 6083 + }, + { + "epoch": 3.70185579555826, + "grad_norm": 0.2731781303882599, + "learning_rate": 7.344973139759654e-07, + "loss": 0.0076, + "step": 6084 + }, + { + "epoch": 3.702464253118345, + "grad_norm": 0.23557397723197937, + "learning_rate": 7.315049150325187e-07, + "loss": 0.0049, + "step": 6085 + }, + { + "epoch": 3.70307271067843, + "grad_norm": 0.19650277495384216, + "learning_rate": 7.285185336642908e-07, + "loss": 0.0042, + "step": 6086 + }, + { + "epoch": 3.703681168238515, + "grad_norm": 0.18600913882255554, + "learning_rate": 7.255381706117837e-07, + "loss": 0.0031, + "step": 6087 + }, + { + "epoch": 3.7042896257986007, + "grad_norm": 0.21114131808280945, + "learning_rate": 7.225638266140006e-07, + "loss": 0.0051, + "step": 6088 + }, + { + "epoch": 3.7048980833586858, + "grad_norm": 0.2112278938293457, + "learning_rate": 7.195955024084544e-07, + "loss": 0.0045, + "step": 6089 + }, + { + "epoch": 3.705506540918771, + "grad_norm": 0.16948889195919037, + "learning_rate": 7.166331987311675e-07, + "loss": 0.0044, + "step": 6090 + }, + { + "epoch": 3.706114998478856, + "grad_norm": 0.19342589378356934, + "learning_rate": 7.136769163166662e-07, + "loss": 0.0037, + "step": 6091 + }, + { + "epoch": 3.706723456038941, + "grad_norm": 0.2661781311035156, + "learning_rate": 7.107266558979864e-07, + "loss": 0.0043, + "step": 6092 + }, + { + "epoch": 3.7073319135990266, + "grad_norm": 0.23141947388648987, + "learning_rate": 7.077824182066678e-07, + "loss": 0.0044, + "step": 6093 + }, + { + "epoch": 3.7079403711591117, + "grad_norm": 0.25309503078460693, + "learning_rate": 7.048442039727627e-07, + "loss": 0.004, + "step": 6094 + }, + { + "epoch": 3.7085488287191968, + "grad_norm": 0.2142305076122284, + "learning_rate": 7.019120139248187e-07, + "loss": 0.005, + "step": 6095 + }, + { + "epoch": 3.709157286279282, + "grad_norm": 0.2936217188835144, + "learning_rate": 6.989858487899043e-07, + "loss": 0.0058, + "step": 6096 + }, + { + "epoch": 3.709765743839367, + "grad_norm": 0.22409573197364807, + "learning_rate": 6.960657092935807e-07, + "loss": 0.0069, + "step": 6097 + }, + { + "epoch": 3.7103742013994525, + "grad_norm": 0.21282601356506348, + "learning_rate": 6.931515961599244e-07, + "loss": 0.0028, + "step": 6098 + }, + { + "epoch": 3.7109826589595376, + "grad_norm": 0.22673866152763367, + "learning_rate": 6.902435101115129e-07, + "loss": 0.0061, + "step": 6099 + }, + { + "epoch": 3.7115911165196227, + "grad_norm": 0.181802436709404, + "learning_rate": 6.873414518694332e-07, + "loss": 0.0035, + "step": 6100 + }, + { + "epoch": 3.712199574079708, + "grad_norm": 0.24081139266490936, + "learning_rate": 6.844454221532682e-07, + "loss": 0.0051, + "step": 6101 + }, + { + "epoch": 3.712808031639793, + "grad_norm": 0.19030770659446716, + "learning_rate": 6.815554216811182e-07, + "loss": 0.0031, + "step": 6102 + }, + { + "epoch": 3.7134164891998784, + "grad_norm": 0.2784750163555145, + "learning_rate": 6.786714511695796e-07, + "loss": 0.0089, + "step": 6103 + }, + { + "epoch": 3.7140249467599635, + "grad_norm": 0.26827266812324524, + "learning_rate": 6.75793511333761e-07, + "loss": 0.0047, + "step": 6104 + }, + { + "epoch": 3.7146334043200486, + "grad_norm": 0.2427385449409485, + "learning_rate": 6.729216028872637e-07, + "loss": 0.0047, + "step": 6105 + }, + { + "epoch": 3.7152418618801337, + "grad_norm": 0.19240151345729828, + "learning_rate": 6.700557265422097e-07, + "loss": 0.0039, + "step": 6106 + }, + { + "epoch": 3.715850319440219, + "grad_norm": 0.27845215797424316, + "learning_rate": 6.67195883009214e-07, + "loss": 0.0066, + "step": 6107 + }, + { + "epoch": 3.7164587770003044, + "grad_norm": 0.23421838879585266, + "learning_rate": 6.643420729973954e-07, + "loss": 0.0073, + "step": 6108 + }, + { + "epoch": 3.7170672345603895, + "grad_norm": 0.23832927644252777, + "learning_rate": 6.614942972143822e-07, + "loss": 0.0045, + "step": 6109 + }, + { + "epoch": 3.7176756921204746, + "grad_norm": 0.47682875394821167, + "learning_rate": 6.586525563663099e-07, + "loss": 0.0115, + "step": 6110 + }, + { + "epoch": 3.7182841496805596, + "grad_norm": 0.2529081702232361, + "learning_rate": 6.558168511577978e-07, + "loss": 0.0061, + "step": 6111 + }, + { + "epoch": 3.7188926072406447, + "grad_norm": 0.3189617693424225, + "learning_rate": 6.529871822919975e-07, + "loss": 0.0086, + "step": 6112 + }, + { + "epoch": 3.7195010648007303, + "grad_norm": 0.21395058929920197, + "learning_rate": 6.501635504705422e-07, + "loss": 0.0042, + "step": 6113 + }, + { + "epoch": 3.7201095223608154, + "grad_norm": 0.1643410176038742, + "learning_rate": 6.473459563935747e-07, + "loss": 0.0024, + "step": 6114 + }, + { + "epoch": 3.7207179799209005, + "grad_norm": 0.15923701226711273, + "learning_rate": 6.445344007597387e-07, + "loss": 0.0026, + "step": 6115 + }, + { + "epoch": 3.7213264374809856, + "grad_norm": 0.2772526741027832, + "learning_rate": 6.417288842661878e-07, + "loss": 0.0062, + "step": 6116 + }, + { + "epoch": 3.7219348950410707, + "grad_norm": 0.21529936790466309, + "learning_rate": 6.389294076085684e-07, + "loss": 0.0061, + "step": 6117 + }, + { + "epoch": 3.722543352601156, + "grad_norm": 0.23362010717391968, + "learning_rate": 6.361359714810389e-07, + "loss": 0.0059, + "step": 6118 + }, + { + "epoch": 3.7231518101612413, + "grad_norm": 0.23200243711471558, + "learning_rate": 6.333485765762509e-07, + "loss": 0.0065, + "step": 6119 + }, + { + "epoch": 3.7237602677213264, + "grad_norm": 0.22773078083992004, + "learning_rate": 6.305672235853682e-07, + "loss": 0.0042, + "step": 6120 + }, + { + "epoch": 3.7243687252814115, + "grad_norm": 0.29405245184898376, + "learning_rate": 6.277919131980392e-07, + "loss": 0.0076, + "step": 6121 + }, + { + "epoch": 3.7249771828414966, + "grad_norm": 0.16285215318202972, + "learning_rate": 6.250226461024383e-07, + "loss": 0.0032, + "step": 6122 + }, + { + "epoch": 3.725585640401582, + "grad_norm": 0.2497759461402893, + "learning_rate": 6.222594229852163e-07, + "loss": 0.0056, + "step": 6123 + }, + { + "epoch": 3.7261940979616672, + "grad_norm": 0.18337193131446838, + "learning_rate": 6.195022445315474e-07, + "loss": 0.0032, + "step": 6124 + }, + { + "epoch": 3.7268025555217523, + "grad_norm": 0.2235439568758011, + "learning_rate": 6.167511114250901e-07, + "loss": 0.0057, + "step": 6125 + }, + { + "epoch": 3.7274110130818374, + "grad_norm": 0.26169970631599426, + "learning_rate": 6.140060243480156e-07, + "loss": 0.0074, + "step": 6126 + }, + { + "epoch": 3.7280194706419225, + "grad_norm": 0.41065341234207153, + "learning_rate": 6.112669839809876e-07, + "loss": 0.0077, + "step": 6127 + }, + { + "epoch": 3.728627928202008, + "grad_norm": 0.22760555148124695, + "learning_rate": 6.08533991003174e-07, + "loss": 0.0051, + "step": 6128 + }, + { + "epoch": 3.729236385762093, + "grad_norm": 0.3023330867290497, + "learning_rate": 6.058070460922466e-07, + "loss": 0.0043, + "step": 6129 + }, + { + "epoch": 3.7298448433221782, + "grad_norm": 0.17906133830547333, + "learning_rate": 6.030861499243701e-07, + "loss": 0.0027, + "step": 6130 + }, + { + "epoch": 3.7304533008822633, + "grad_norm": 0.18436305224895477, + "learning_rate": 6.003713031742131e-07, + "loss": 0.0043, + "step": 6131 + }, + { + "epoch": 3.7310617584423484, + "grad_norm": 0.16411884129047394, + "learning_rate": 5.97662506514951e-07, + "loss": 0.003, + "step": 6132 + }, + { + "epoch": 3.731670216002434, + "grad_norm": 0.28377214074134827, + "learning_rate": 5.949597606182439e-07, + "loss": 0.0052, + "step": 6133 + }, + { + "epoch": 3.732278673562519, + "grad_norm": 0.24661727249622345, + "learning_rate": 5.922630661542639e-07, + "loss": 0.0072, + "step": 6134 + }, + { + "epoch": 3.732887131122604, + "grad_norm": 0.23685060441493988, + "learning_rate": 5.895724237916816e-07, + "loss": 0.0057, + "step": 6135 + }, + { + "epoch": 3.7334955886826893, + "grad_norm": 0.24538807570934296, + "learning_rate": 5.868878341976608e-07, + "loss": 0.006, + "step": 6136 + }, + { + "epoch": 3.7341040462427744, + "grad_norm": 0.23321305215358734, + "learning_rate": 5.842092980378688e-07, + "loss": 0.0036, + "step": 6137 + }, + { + "epoch": 3.73471250380286, + "grad_norm": 0.2515392005443573, + "learning_rate": 5.815368159764689e-07, + "loss": 0.0044, + "step": 6138 + }, + { + "epoch": 3.735320961362945, + "grad_norm": 0.3265810012817383, + "learning_rate": 5.788703886761254e-07, + "loss": 0.0064, + "step": 6139 + }, + { + "epoch": 3.73592941892303, + "grad_norm": 0.2389630675315857, + "learning_rate": 5.762100167980067e-07, + "loss": 0.0038, + "step": 6140 + }, + { + "epoch": 3.736537876483115, + "grad_norm": 0.26586848497390747, + "learning_rate": 5.735557010017656e-07, + "loss": 0.0071, + "step": 6141 + }, + { + "epoch": 3.7371463340432003, + "grad_norm": 0.1601967066526413, + "learning_rate": 5.709074419455701e-07, + "loss": 0.0019, + "step": 6142 + }, + { + "epoch": 3.737754791603286, + "grad_norm": 0.29823827743530273, + "learning_rate": 5.682652402860727e-07, + "loss": 0.0091, + "step": 6143 + }, + { + "epoch": 3.738363249163371, + "grad_norm": 0.24698449671268463, + "learning_rate": 5.6562909667843e-07, + "loss": 0.0061, + "step": 6144 + }, + { + "epoch": 3.738971706723456, + "grad_norm": 0.23663544654846191, + "learning_rate": 5.629990117762968e-07, + "loss": 0.0065, + "step": 6145 + }, + { + "epoch": 3.739580164283541, + "grad_norm": 0.300060898065567, + "learning_rate": 5.603749862318292e-07, + "loss": 0.0047, + "step": 6146 + }, + { + "epoch": 3.740188621843626, + "grad_norm": 0.13921256363391876, + "learning_rate": 5.577570206956623e-07, + "loss": 0.0032, + "step": 6147 + }, + { + "epoch": 3.7407970794037118, + "grad_norm": 0.15815310180187225, + "learning_rate": 5.551451158169602e-07, + "loss": 0.0035, + "step": 6148 + }, + { + "epoch": 3.741405536963797, + "grad_norm": 0.21306031942367554, + "learning_rate": 5.52539272243352e-07, + "loss": 0.0043, + "step": 6149 + }, + { + "epoch": 3.742013994523882, + "grad_norm": 0.2478276789188385, + "learning_rate": 5.499394906209876e-07, + "loss": 0.0076, + "step": 6150 + }, + { + "epoch": 3.742622452083967, + "grad_norm": 0.18793390691280365, + "learning_rate": 5.473457715944957e-07, + "loss": 0.0035, + "step": 6151 + }, + { + "epoch": 3.743230909644052, + "grad_norm": 0.2907355725765228, + "learning_rate": 5.447581158070203e-07, + "loss": 0.0041, + "step": 6152 + }, + { + "epoch": 3.7438393672041377, + "grad_norm": 0.31171515583992004, + "learning_rate": 5.42176523900187e-07, + "loss": 0.0065, + "step": 6153 + }, + { + "epoch": 3.7444478247642228, + "grad_norm": 0.27889809012413025, + "learning_rate": 5.396009965141197e-07, + "loss": 0.0062, + "step": 6154 + }, + { + "epoch": 3.745056282324308, + "grad_norm": 0.2419702708721161, + "learning_rate": 5.370315342874494e-07, + "loss": 0.0047, + "step": 6155 + }, + { + "epoch": 3.745664739884393, + "grad_norm": 0.22070972621440887, + "learning_rate": 5.344681378572913e-07, + "loss": 0.0048, + "step": 6156 + }, + { + "epoch": 3.746273197444478, + "grad_norm": 0.2477099597454071, + "learning_rate": 5.319108078592567e-07, + "loss": 0.0039, + "step": 6157 + }, + { + "epoch": 3.7468816550045636, + "grad_norm": 0.1542275846004486, + "learning_rate": 5.293595449274685e-07, + "loss": 0.0033, + "step": 6158 + }, + { + "epoch": 3.7474901125646487, + "grad_norm": 0.17303107678890228, + "learning_rate": 5.268143496945239e-07, + "loss": 0.0031, + "step": 6159 + }, + { + "epoch": 3.748098570124734, + "grad_norm": 0.26074641942977905, + "learning_rate": 5.242752227915287e-07, + "loss": 0.0071, + "step": 6160 + }, + { + "epoch": 3.748707027684819, + "grad_norm": 0.20203956961631775, + "learning_rate": 5.217421648480769e-07, + "loss": 0.0051, + "step": 6161 + }, + { + "epoch": 3.749315485244904, + "grad_norm": 0.18085290491580963, + "learning_rate": 5.192151764922659e-07, + "loss": 0.0027, + "step": 6162 + }, + { + "epoch": 3.7499239428049895, + "grad_norm": 0.31731289625167847, + "learning_rate": 5.166942583506806e-07, + "loss": 0.0055, + "step": 6163 + }, + { + "epoch": 3.7505324003650746, + "grad_norm": 0.1756841391324997, + "learning_rate": 5.141794110484071e-07, + "loss": 0.0023, + "step": 6164 + }, + { + "epoch": 3.7511408579251597, + "grad_norm": 0.17156068980693817, + "learning_rate": 5.116706352090189e-07, + "loss": 0.003, + "step": 6165 + }, + { + "epoch": 3.751749315485245, + "grad_norm": 0.2008720338344574, + "learning_rate": 5.091679314545905e-07, + "loss": 0.0039, + "step": 6166 + }, + { + "epoch": 3.75235777304533, + "grad_norm": 0.1687059849500656, + "learning_rate": 5.066713004056839e-07, + "loss": 0.0052, + "step": 6167 + }, + { + "epoch": 3.7529662306054155, + "grad_norm": 0.2529749572277069, + "learning_rate": 5.041807426813649e-07, + "loss": 0.0085, + "step": 6168 + }, + { + "epoch": 3.7535746881655006, + "grad_norm": 0.2726878225803375, + "learning_rate": 5.01696258899187e-07, + "loss": 0.0089, + "step": 6169 + }, + { + "epoch": 3.7541831457255856, + "grad_norm": 0.14040397107601166, + "learning_rate": 4.992178496751931e-07, + "loss": 0.0023, + "step": 6170 + }, + { + "epoch": 3.7547916032856707, + "grad_norm": 0.2372034788131714, + "learning_rate": 4.967455156239337e-07, + "loss": 0.0057, + "step": 6171 + }, + { + "epoch": 3.755400060845756, + "grad_norm": 0.22318537533283234, + "learning_rate": 4.942792573584404e-07, + "loss": 0.0033, + "step": 6172 + }, + { + "epoch": 3.7560085184058414, + "grad_norm": 0.22246554493904114, + "learning_rate": 4.918190754902408e-07, + "loss": 0.0038, + "step": 6173 + }, + { + "epoch": 3.7566169759659265, + "grad_norm": 0.18072238564491272, + "learning_rate": 4.89364970629358e-07, + "loss": 0.0038, + "step": 6174 + }, + { + "epoch": 3.7572254335260116, + "grad_norm": 0.2487388253211975, + "learning_rate": 4.86916943384308e-07, + "loss": 0.0067, + "step": 6175 + }, + { + "epoch": 3.7578338910860967, + "grad_norm": 0.1718432605266571, + "learning_rate": 4.844749943621052e-07, + "loss": 0.0029, + "step": 6176 + }, + { + "epoch": 3.7584423486461818, + "grad_norm": 0.20204098522663116, + "learning_rate": 4.820391241682404e-07, + "loss": 0.0034, + "step": 6177 + }, + { + "epoch": 3.7590508062062673, + "grad_norm": 0.20857053995132446, + "learning_rate": 4.796093334067192e-07, + "loss": 0.0042, + "step": 6178 + }, + { + "epoch": 3.7596592637663524, + "grad_norm": 0.2744029760360718, + "learning_rate": 4.771856226800209e-07, + "loss": 0.0074, + "step": 6179 + }, + { + "epoch": 3.7602677213264375, + "grad_norm": 0.33221253752708435, + "learning_rate": 4.7476799258912574e-07, + "loss": 0.0068, + "step": 6180 + }, + { + "epoch": 3.7608761788865226, + "grad_norm": 0.3299937844276428, + "learning_rate": 4.7235644373350707e-07, + "loss": 0.0058, + "step": 6181 + }, + { + "epoch": 3.7614846364466077, + "grad_norm": 0.33729061484336853, + "learning_rate": 4.69950976711131e-07, + "loss": 0.0079, + "step": 6182 + }, + { + "epoch": 3.7620930940066932, + "grad_norm": 0.26489993929862976, + "learning_rate": 4.675515921184481e-07, + "loss": 0.0064, + "step": 6183 + }, + { + "epoch": 3.7627015515667783, + "grad_norm": 0.14743492007255554, + "learning_rate": 4.651582905504048e-07, + "loss": 0.0022, + "step": 6184 + }, + { + "epoch": 3.7633100091268634, + "grad_norm": 0.21888674795627594, + "learning_rate": 4.627710726004458e-07, + "loss": 0.0074, + "step": 6185 + }, + { + "epoch": 3.7639184666869485, + "grad_norm": 0.21569287776947021, + "learning_rate": 4.6038993886049484e-07, + "loss": 0.005, + "step": 6186 + }, + { + "epoch": 3.7645269242470336, + "grad_norm": 0.22505217790603638, + "learning_rate": 4.5801488992098243e-07, + "loss": 0.0038, + "step": 6187 + }, + { + "epoch": 3.765135381807119, + "grad_norm": 0.196213498711586, + "learning_rate": 4.5564592637081517e-07, + "loss": 0.0047, + "step": 6188 + }, + { + "epoch": 3.7657438393672042, + "grad_norm": 0.26138564944267273, + "learning_rate": 4.5328304879739823e-07, + "loss": 0.0047, + "step": 6189 + }, + { + "epoch": 3.7663522969272893, + "grad_norm": 0.30381667613983154, + "learning_rate": 4.509262577866269e-07, + "loss": 0.0102, + "step": 6190 + }, + { + "epoch": 3.7669607544873744, + "grad_norm": 0.17448806762695312, + "learning_rate": 4.4857555392288917e-07, + "loss": 0.0023, + "step": 6191 + }, + { + "epoch": 3.7675692120474595, + "grad_norm": 0.2946946620941162, + "learning_rate": 4.4623093778906053e-07, + "loss": 0.0049, + "step": 6192 + }, + { + "epoch": 3.768177669607545, + "grad_norm": 0.23686204850673676, + "learning_rate": 4.438924099665065e-07, + "loss": 0.0048, + "step": 6193 + }, + { + "epoch": 3.76878612716763, + "grad_norm": 0.2854866087436676, + "learning_rate": 4.4155997103508817e-07, + "loss": 0.0049, + "step": 6194 + }, + { + "epoch": 3.7693945847277153, + "grad_norm": 0.22638092935085297, + "learning_rate": 4.392336215731513e-07, + "loss": 0.0056, + "step": 6195 + }, + { + "epoch": 3.7700030422878004, + "grad_norm": 0.1855926811695099, + "learning_rate": 4.369133621575289e-07, + "loss": 0.0034, + "step": 6196 + }, + { + "epoch": 3.7706114998478855, + "grad_norm": 0.21199901401996613, + "learning_rate": 4.3459919336355514e-07, + "loss": 0.0035, + "step": 6197 + }, + { + "epoch": 3.771219957407971, + "grad_norm": 0.1068754643201828, + "learning_rate": 4.322911157650433e-07, + "loss": 0.0015, + "step": 6198 + }, + { + "epoch": 3.771828414968056, + "grad_norm": 0.24557361006736755, + "learning_rate": 4.2998912993430785e-07, + "loss": 0.0054, + "step": 6199 + }, + { + "epoch": 3.772436872528141, + "grad_norm": 0.3791668117046356, + "learning_rate": 4.2769323644213375e-07, + "loss": 0.0127, + "step": 6200 + }, + { + "epoch": 3.7730453300882263, + "grad_norm": 0.19211210310459137, + "learning_rate": 4.2540343585781573e-07, + "loss": 0.0034, + "step": 6201 + }, + { + "epoch": 3.7736537876483114, + "grad_norm": 0.23425185680389404, + "learning_rate": 4.2311972874912453e-07, + "loss": 0.0062, + "step": 6202 + }, + { + "epoch": 3.774262245208397, + "grad_norm": 0.2918088138103485, + "learning_rate": 4.208421156823239e-07, + "loss": 0.005, + "step": 6203 + }, + { + "epoch": 3.774870702768482, + "grad_norm": 0.17967435717582703, + "learning_rate": 4.1857059722217316e-07, + "loss": 0.0033, + "step": 6204 + }, + { + "epoch": 3.775479160328567, + "grad_norm": 0.2613130211830139, + "learning_rate": 4.1630517393190794e-07, + "loss": 0.0055, + "step": 6205 + }, + { + "epoch": 3.776087617888652, + "grad_norm": 0.3404596149921417, + "learning_rate": 4.1404584637325936e-07, + "loss": 0.0088, + "step": 6206 + }, + { + "epoch": 3.7766960754487373, + "grad_norm": 0.286954402923584, + "learning_rate": 4.117926151064488e-07, + "loss": 0.0082, + "step": 6207 + }, + { + "epoch": 3.777304533008823, + "grad_norm": 0.19479084014892578, + "learning_rate": 4.0954548069018217e-07, + "loss": 0.0041, + "step": 6208 + }, + { + "epoch": 3.777912990568908, + "grad_norm": 0.20791840553283691, + "learning_rate": 4.073044436816581e-07, + "loss": 0.0048, + "step": 6209 + }, + { + "epoch": 3.778521448128993, + "grad_norm": 0.2315748929977417, + "learning_rate": 4.0506950463655713e-07, + "loss": 0.0048, + "step": 6210 + }, + { + "epoch": 3.779129905689078, + "grad_norm": 0.2659534811973572, + "learning_rate": 4.028406641090499e-07, + "loss": 0.004, + "step": 6211 + }, + { + "epoch": 3.7797383632491632, + "grad_norm": 0.2190115749835968, + "learning_rate": 4.0061792265179696e-07, + "loss": 0.0029, + "step": 6212 + }, + { + "epoch": 3.7803468208092488, + "grad_norm": 0.20221631228923798, + "learning_rate": 3.984012808159493e-07, + "loss": 0.0021, + "step": 6213 + }, + { + "epoch": 3.780955278369334, + "grad_norm": 0.2616676390171051, + "learning_rate": 3.961907391511366e-07, + "loss": 0.0075, + "step": 6214 + }, + { + "epoch": 3.781563735929419, + "grad_norm": 0.27306365966796875, + "learning_rate": 3.9398629820548703e-07, + "loss": 0.0063, + "step": 6215 + }, + { + "epoch": 3.782172193489504, + "grad_norm": 0.18841589987277985, + "learning_rate": 3.9178795852560236e-07, + "loss": 0.0023, + "step": 6216 + }, + { + "epoch": 3.782780651049589, + "grad_norm": 0.27202504873275757, + "learning_rate": 3.8959572065658535e-07, + "loss": 0.0057, + "step": 6217 + }, + { + "epoch": 3.7833891086096747, + "grad_norm": 0.24065257608890533, + "learning_rate": 3.874095851420151e-07, + "loss": 0.0048, + "step": 6218 + }, + { + "epoch": 3.78399756616976, + "grad_norm": 0.31770059466362, + "learning_rate": 3.852295525239663e-07, + "loss": 0.0073, + "step": 6219 + }, + { + "epoch": 3.784606023729845, + "grad_norm": 0.20511983335018158, + "learning_rate": 3.830556233429927e-07, + "loss": 0.0039, + "step": 6220 + }, + { + "epoch": 3.78521448128993, + "grad_norm": 0.33700552582740784, + "learning_rate": 3.808877981381437e-07, + "loss": 0.0049, + "step": 6221 + }, + { + "epoch": 3.785822938850015, + "grad_norm": 0.2216031849384308, + "learning_rate": 3.78726077446942e-07, + "loss": 0.0034, + "step": 6222 + }, + { + "epoch": 3.7864313964101006, + "grad_norm": 0.1690044105052948, + "learning_rate": 3.7657046180540887e-07, + "loss": 0.0042, + "step": 6223 + }, + { + "epoch": 3.7870398539701857, + "grad_norm": 0.3780827224254608, + "learning_rate": 3.744209517480446e-07, + "loss": 0.0068, + "step": 6224 + }, + { + "epoch": 3.787648311530271, + "grad_norm": 0.2709617018699646, + "learning_rate": 3.722775478078422e-07, + "loss": 0.0064, + "step": 6225 + }, + { + "epoch": 3.788256769090356, + "grad_norm": 0.24110420048236847, + "learning_rate": 3.70140250516271e-07, + "loss": 0.0058, + "step": 6226 + }, + { + "epoch": 3.788865226650441, + "grad_norm": 0.31499508023262024, + "learning_rate": 3.6800906040329595e-07, + "loss": 0.0075, + "step": 6227 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.2634466588497162, + "learning_rate": 3.658839779973611e-07, + "loss": 0.0054, + "step": 6228 + }, + { + "epoch": 3.7900821417706116, + "grad_norm": 0.20416542887687683, + "learning_rate": 3.637650038254004e-07, + "loss": 0.0045, + "step": 6229 + }, + { + "epoch": 3.7906905993306967, + "grad_norm": 0.3133479356765747, + "learning_rate": 3.6165213841282966e-07, + "loss": 0.0248, + "step": 6230 + }, + { + "epoch": 3.791299056890782, + "grad_norm": 0.16795231401920319, + "learning_rate": 3.5954538228355205e-07, + "loss": 0.0035, + "step": 6231 + }, + { + "epoch": 3.791907514450867, + "grad_norm": 0.26573193073272705, + "learning_rate": 3.5744473595995533e-07, + "loss": 0.0077, + "step": 6232 + }, + { + "epoch": 3.7925159720109525, + "grad_norm": 0.3238775432109833, + "learning_rate": 3.5535019996290885e-07, + "loss": 0.0108, + "step": 6233 + }, + { + "epoch": 3.7931244295710376, + "grad_norm": 0.1755254566669464, + "learning_rate": 3.5326177481177505e-07, + "loss": 0.0038, + "step": 6234 + }, + { + "epoch": 3.7937328871311227, + "grad_norm": 0.2531315088272095, + "learning_rate": 3.5117946102439513e-07, + "loss": 0.0048, + "step": 6235 + }, + { + "epoch": 3.7943413446912078, + "grad_norm": 0.28363898396492004, + "learning_rate": 3.491032591170951e-07, + "loss": 0.0049, + "step": 6236 + }, + { + "epoch": 3.794949802251293, + "grad_norm": 0.24325834214687347, + "learning_rate": 3.4703316960468524e-07, + "loss": 0.0048, + "step": 6237 + }, + { + "epoch": 3.7955582598113784, + "grad_norm": 0.22880126535892487, + "learning_rate": 3.4496919300046617e-07, + "loss": 0.0046, + "step": 6238 + }, + { + "epoch": 3.7961667173714635, + "grad_norm": 0.30837124586105347, + "learning_rate": 3.4291132981621174e-07, + "loss": 0.002, + "step": 6239 + }, + { + "epoch": 3.7967751749315486, + "grad_norm": 0.16037946939468384, + "learning_rate": 3.408595805621889e-07, + "loss": 0.0026, + "step": 6240 + }, + { + "epoch": 3.7973836324916337, + "grad_norm": 0.1504233032464981, + "learning_rate": 3.3881394574715174e-07, + "loss": 0.0035, + "step": 6241 + }, + { + "epoch": 3.797992090051719, + "grad_norm": 0.25224924087524414, + "learning_rate": 3.367744258783223e-07, + "loss": 0.006, + "step": 6242 + }, + { + "epoch": 3.7986005476118043, + "grad_norm": 0.25352349877357483, + "learning_rate": 3.347410214614211e-07, + "loss": 0.0061, + "step": 6243 + }, + { + "epoch": 3.7992090051718894, + "grad_norm": 0.27149248123168945, + "learning_rate": 3.327137330006502e-07, + "loss": 0.0158, + "step": 6244 + }, + { + "epoch": 3.7998174627319745, + "grad_norm": 0.23779939115047455, + "learning_rate": 3.3069256099869105e-07, + "loss": 0.0074, + "step": 6245 + }, + { + "epoch": 3.8004259202920596, + "grad_norm": 0.16755706071853638, + "learning_rate": 3.2867750595670657e-07, + "loss": 0.0032, + "step": 6246 + }, + { + "epoch": 3.8010343778521447, + "grad_norm": 0.19651807844638824, + "learning_rate": 3.266685683743498e-07, + "loss": 0.0047, + "step": 6247 + }, + { + "epoch": 3.8016428354122302, + "grad_norm": 0.277131587266922, + "learning_rate": 3.2466574874975565e-07, + "loss": 0.0041, + "step": 6248 + }, + { + "epoch": 3.8022512929723153, + "grad_norm": 0.27917802333831787, + "learning_rate": 3.226690475795324e-07, + "loss": 0.0071, + "step": 6249 + }, + { + "epoch": 3.8028597505324004, + "grad_norm": 0.2759116291999817, + "learning_rate": 3.206784653587869e-07, + "loss": 0.0063, + "step": 6250 + }, + { + "epoch": 3.8034682080924855, + "grad_norm": 0.37202462553977966, + "learning_rate": 3.186940025810992e-07, + "loss": 0.0067, + "step": 6251 + }, + { + "epoch": 3.8040766656525706, + "grad_norm": 0.1423821896314621, + "learning_rate": 3.1671565973852567e-07, + "loss": 0.0034, + "step": 6252 + }, + { + "epoch": 3.804685123212656, + "grad_norm": 0.6763217449188232, + "learning_rate": 3.14743437321624e-07, + "loss": 0.0252, + "step": 6253 + }, + { + "epoch": 3.805293580772741, + "grad_norm": 0.34884288907051086, + "learning_rate": 3.1277733581941416e-07, + "loss": 0.0078, + "step": 6254 + }, + { + "epoch": 3.8059020383328264, + "grad_norm": 0.24357537925243378, + "learning_rate": 3.1081735571941437e-07, + "loss": 0.0061, + "step": 6255 + }, + { + "epoch": 3.8065104958929115, + "grad_norm": 0.27209481596946716, + "learning_rate": 3.088634975076082e-07, + "loss": 0.0054, + "step": 6256 + }, + { + "epoch": 3.8071189534529966, + "grad_norm": 0.24121397733688354, + "learning_rate": 3.0691576166848314e-07, + "loss": 0.0054, + "step": 6257 + }, + { + "epoch": 3.807727411013082, + "grad_norm": 0.24442483484745026, + "learning_rate": 3.0497414868498884e-07, + "loss": 0.0058, + "step": 6258 + }, + { + "epoch": 3.8083358685731667, + "grad_norm": 0.24496260285377502, + "learning_rate": 3.030386590385653e-07, + "loss": 0.0083, + "step": 6259 + }, + { + "epoch": 3.8089443261332523, + "grad_norm": 0.14519323408603668, + "learning_rate": 3.0110929320913694e-07, + "loss": 0.0029, + "step": 6260 + }, + { + "epoch": 3.8095527836933374, + "grad_norm": 0.32007092237472534, + "learning_rate": 2.991860516751016e-07, + "loss": 0.0097, + "step": 6261 + }, + { + "epoch": 3.8101612412534225, + "grad_norm": 0.3110872209072113, + "learning_rate": 2.972689349133417e-07, + "loss": 0.0085, + "step": 6262 + }, + { + "epoch": 3.810769698813508, + "grad_norm": 0.27134284377098083, + "learning_rate": 2.9535794339922984e-07, + "loss": 0.0026, + "step": 6263 + }, + { + "epoch": 3.8113781563735927, + "grad_norm": 0.2500753700733185, + "learning_rate": 2.9345307760660637e-07, + "loss": 0.0059, + "step": 6264 + }, + { + "epoch": 3.811986613933678, + "grad_norm": 0.3057078421115875, + "learning_rate": 2.9155433800780176e-07, + "loss": 0.0049, + "step": 6265 + }, + { + "epoch": 3.8125950714937633, + "grad_norm": 0.2372872531414032, + "learning_rate": 2.8966172507362e-07, + "loss": 0.0049, + "step": 6266 + }, + { + "epoch": 3.8132035290538484, + "grad_norm": 0.2749864161014557, + "learning_rate": 2.8777523927335515e-07, + "loss": 0.0056, + "step": 6267 + }, + { + "epoch": 3.813811986613934, + "grad_norm": 0.26591286063194275, + "learning_rate": 2.8589488107477194e-07, + "loss": 0.0046, + "step": 6268 + }, + { + "epoch": 3.8144204441740186, + "grad_norm": 0.21569810807704926, + "learning_rate": 2.840206509441251e-07, + "loss": 0.0049, + "step": 6269 + }, + { + "epoch": 3.815028901734104, + "grad_norm": 0.19832342863082886, + "learning_rate": 2.82152549346143e-07, + "loss": 0.0056, + "step": 6270 + }, + { + "epoch": 3.8156373592941892, + "grad_norm": 0.3168319761753082, + "learning_rate": 2.8029057674404115e-07, + "loss": 0.0104, + "step": 6271 + }, + { + "epoch": 3.8162458168542743, + "grad_norm": 0.1959853619337082, + "learning_rate": 2.7843473359950303e-07, + "loss": 0.0045, + "step": 6272 + }, + { + "epoch": 3.81685427441436, + "grad_norm": 0.3243808150291443, + "learning_rate": 2.7658502037270774e-07, + "loss": 0.0067, + "step": 6273 + }, + { + "epoch": 3.8174627319744445, + "grad_norm": 0.2744379937648773, + "learning_rate": 2.747414375223051e-07, + "loss": 0.0057, + "step": 6274 + }, + { + "epoch": 3.81807118953453, + "grad_norm": 0.26081326603889465, + "learning_rate": 2.7290398550542664e-07, + "loss": 0.0047, + "step": 6275 + }, + { + "epoch": 3.818679647094615, + "grad_norm": 0.24908572435379028, + "learning_rate": 2.710726647776829e-07, + "loss": 0.0062, + "step": 6276 + }, + { + "epoch": 3.8192881046547003, + "grad_norm": 0.23935918509960175, + "learning_rate": 2.692474757931662e-07, + "loss": 0.0075, + "step": 6277 + }, + { + "epoch": 3.819896562214786, + "grad_norm": 0.1932537704706192, + "learning_rate": 2.674284190044479e-07, + "loss": 0.0034, + "step": 6278 + }, + { + "epoch": 3.8205050197748704, + "grad_norm": 0.3207075893878937, + "learning_rate": 2.6561549486257556e-07, + "loss": 0.0078, + "step": 6279 + }, + { + "epoch": 3.821113477334956, + "grad_norm": 0.2624523639678955, + "learning_rate": 2.638087038170811e-07, + "loss": 0.0082, + "step": 6280 + }, + { + "epoch": 3.821721934895041, + "grad_norm": 0.16647395491600037, + "learning_rate": 2.62008046315973e-07, + "loss": 0.003, + "step": 6281 + }, + { + "epoch": 3.822330392455126, + "grad_norm": 0.2986607253551483, + "learning_rate": 2.602135228057384e-07, + "loss": 0.0101, + "step": 6282 + }, + { + "epoch": 3.8229388500152113, + "grad_norm": 0.35330304503440857, + "learning_rate": 2.5842513373134645e-07, + "loss": 0.0103, + "step": 6283 + }, + { + "epoch": 3.8235473075752964, + "grad_norm": 0.2447318136692047, + "learning_rate": 2.566428795362397e-07, + "loss": 0.0059, + "step": 6284 + }, + { + "epoch": 3.824155765135382, + "grad_norm": 0.16872382164001465, + "learning_rate": 2.5486676066234504e-07, + "loss": 0.0039, + "step": 6285 + }, + { + "epoch": 3.824764222695467, + "grad_norm": 0.2803313434123993, + "learning_rate": 2.5309677755006867e-07, + "loss": 0.0081, + "step": 6286 + }, + { + "epoch": 3.825372680255552, + "grad_norm": 0.36526891589164734, + "learning_rate": 2.5133293063828724e-07, + "loss": 0.0137, + "step": 6287 + }, + { + "epoch": 3.825981137815637, + "grad_norm": 0.2731226086616516, + "learning_rate": 2.4957522036436474e-07, + "loss": 0.0092, + "step": 6288 + }, + { + "epoch": 3.8265895953757223, + "grad_norm": 0.19423212110996246, + "learning_rate": 2.4782364716413873e-07, + "loss": 0.0049, + "step": 6289 + }, + { + "epoch": 3.827198052935808, + "grad_norm": 0.16490328311920166, + "learning_rate": 2.460782114719257e-07, + "loss": 0.0032, + "step": 6290 + }, + { + "epoch": 3.827806510495893, + "grad_norm": 0.30284324288368225, + "learning_rate": 2.4433891372052376e-07, + "loss": 0.0055, + "step": 6291 + }, + { + "epoch": 3.828414968055978, + "grad_norm": 0.18749664723873138, + "learning_rate": 2.426057543412019e-07, + "loss": 0.0038, + "step": 6292 + }, + { + "epoch": 3.829023425616063, + "grad_norm": 0.2562442719936371, + "learning_rate": 2.408787337637164e-07, + "loss": 0.0059, + "step": 6293 + }, + { + "epoch": 3.829631883176148, + "grad_norm": 0.1709728091955185, + "learning_rate": 2.3915785241629406e-07, + "loss": 0.0035, + "step": 6294 + }, + { + "epoch": 3.8302403407362338, + "grad_norm": 0.3805026113986969, + "learning_rate": 2.3744311072563808e-07, + "loss": 0.0078, + "step": 6295 + }, + { + "epoch": 3.830848798296319, + "grad_norm": 0.20816834270954132, + "learning_rate": 2.3573450911693883e-07, + "loss": 0.0035, + "step": 6296 + }, + { + "epoch": 3.831457255856404, + "grad_norm": 0.16338828206062317, + "learning_rate": 2.3403204801385746e-07, + "loss": 0.0023, + "step": 6297 + }, + { + "epoch": 3.832065713416489, + "grad_norm": 0.21396136283874512, + "learning_rate": 2.3233572783852854e-07, + "loss": 0.0054, + "step": 6298 + }, + { + "epoch": 3.832674170976574, + "grad_norm": 0.20282062888145447, + "learning_rate": 2.3064554901157388e-07, + "loss": 0.0051, + "step": 6299 + }, + { + "epoch": 3.8332826285366597, + "grad_norm": 0.2224971354007721, + "learning_rate": 2.2896151195208603e-07, + "loss": 0.004, + "step": 6300 + }, + { + "epoch": 3.833891086096745, + "grad_norm": 0.3189062476158142, + "learning_rate": 2.272836170776338e-07, + "loss": 0.0039, + "step": 6301 + }, + { + "epoch": 3.83449954365683, + "grad_norm": 0.14660929143428802, + "learning_rate": 2.2561186480426766e-07, + "loss": 0.0035, + "step": 6302 + }, + { + "epoch": 3.835108001216915, + "grad_norm": 0.30386772751808167, + "learning_rate": 2.239462555465116e-07, + "loss": 0.0065, + "step": 6303 + }, + { + "epoch": 3.835716458777, + "grad_norm": 0.18635998666286469, + "learning_rate": 2.222867897173686e-07, + "loss": 0.0032, + "step": 6304 + }, + { + "epoch": 3.8363249163370856, + "grad_norm": 0.12038879841566086, + "learning_rate": 2.206334677283123e-07, + "loss": 0.0022, + "step": 6305 + }, + { + "epoch": 3.8369333738971707, + "grad_norm": 0.14553184807300568, + "learning_rate": 2.189862899893036e-07, + "loss": 0.0035, + "step": 6306 + }, + { + "epoch": 3.837541831457256, + "grad_norm": 0.21486513316631317, + "learning_rate": 2.173452569087714e-07, + "loss": 0.0051, + "step": 6307 + }, + { + "epoch": 3.838150289017341, + "grad_norm": 0.25689902901649475, + "learning_rate": 2.157103688936235e-07, + "loss": 0.0056, + "step": 6308 + }, + { + "epoch": 3.838758746577426, + "grad_norm": 0.22431597113609314, + "learning_rate": 2.14081626349244e-07, + "loss": 0.0025, + "step": 6309 + }, + { + "epoch": 3.8393672041375115, + "grad_norm": 0.23348523676395416, + "learning_rate": 2.1245902967949315e-07, + "loss": 0.0043, + "step": 6310 + }, + { + "epoch": 3.8399756616975966, + "grad_norm": 0.21763131022453308, + "learning_rate": 2.1084257928670748e-07, + "loss": 0.0034, + "step": 6311 + }, + { + "epoch": 3.8405841192576817, + "grad_norm": 0.2803143560886383, + "learning_rate": 2.092322755716969e-07, + "loss": 0.015, + "step": 6312 + }, + { + "epoch": 3.841192576817767, + "grad_norm": 0.1908082216978073, + "learning_rate": 2.0762811893375588e-07, + "loss": 0.0037, + "step": 6313 + }, + { + "epoch": 3.841801034377852, + "grad_norm": 0.24294763803482056, + "learning_rate": 2.060301097706413e-07, + "loss": 0.0044, + "step": 6314 + }, + { + "epoch": 3.8424094919379375, + "grad_norm": 0.2627767026424408, + "learning_rate": 2.0443824847859727e-07, + "loss": 0.0064, + "step": 6315 + }, + { + "epoch": 3.8430179494980226, + "grad_norm": 0.24739083647727966, + "learning_rate": 2.0285253545233585e-07, + "loss": 0.0071, + "step": 6316 + }, + { + "epoch": 3.8436264070581077, + "grad_norm": 0.31518417596817017, + "learning_rate": 2.0127297108505082e-07, + "loss": 0.0102, + "step": 6317 + }, + { + "epoch": 3.8442348646181927, + "grad_norm": 0.40057793259620667, + "learning_rate": 1.996995557684067e-07, + "loss": 0.006, + "step": 6318 + }, + { + "epoch": 3.844843322178278, + "grad_norm": 0.1834147572517395, + "learning_rate": 1.9813228989254695e-07, + "loss": 0.0039, + "step": 6319 + }, + { + "epoch": 3.8454517797383634, + "grad_norm": 0.2752745449542999, + "learning_rate": 1.9657117384608569e-07, + "loss": 0.0065, + "step": 6320 + }, + { + "epoch": 3.8460602372984485, + "grad_norm": 0.3888533413410187, + "learning_rate": 1.950162080161161e-07, + "loss": 0.0083, + "step": 6321 + }, + { + "epoch": 3.8466686948585336, + "grad_norm": 0.3024943172931671, + "learning_rate": 1.9346739278820192e-07, + "loss": 0.0117, + "step": 6322 + }, + { + "epoch": 3.8472771524186187, + "grad_norm": 0.2485586553812027, + "learning_rate": 1.9192472854638875e-07, + "loss": 0.0073, + "step": 6323 + }, + { + "epoch": 3.8478856099787038, + "grad_norm": 0.19068674743175507, + "learning_rate": 1.9038821567319286e-07, + "loss": 0.0028, + "step": 6324 + }, + { + "epoch": 3.8484940675387893, + "grad_norm": 0.18191272020339966, + "learning_rate": 1.8885785454960115e-07, + "loss": 0.0039, + "step": 6325 + }, + { + "epoch": 3.8491025250988744, + "grad_norm": 0.28522956371307373, + "learning_rate": 1.8733364555508225e-07, + "loss": 0.0054, + "step": 6326 + }, + { + "epoch": 3.8497109826589595, + "grad_norm": 0.24018339812755585, + "learning_rate": 1.8581558906757557e-07, + "loss": 0.0064, + "step": 6327 + }, + { + "epoch": 3.8503194402190446, + "grad_norm": 0.3272731304168701, + "learning_rate": 1.8430368546349942e-07, + "loss": 0.0079, + "step": 6328 + }, + { + "epoch": 3.8509278977791297, + "grad_norm": 0.30089452862739563, + "learning_rate": 1.8279793511774e-07, + "loss": 0.0038, + "step": 6329 + }, + { + "epoch": 3.8515363553392152, + "grad_norm": 0.2161916345357895, + "learning_rate": 1.8129833840365985e-07, + "loss": 0.0054, + "step": 6330 + }, + { + "epoch": 3.8521448128993003, + "grad_norm": 0.2529414892196655, + "learning_rate": 1.7980489569309755e-07, + "loss": 0.005, + "step": 6331 + }, + { + "epoch": 3.8527532704593854, + "grad_norm": 0.3203285336494446, + "learning_rate": 1.7831760735636248e-07, + "loss": 0.0057, + "step": 6332 + }, + { + "epoch": 3.8533617280194705, + "grad_norm": 0.20392923057079315, + "learning_rate": 1.76836473762243e-07, + "loss": 0.004, + "step": 6333 + }, + { + "epoch": 3.8539701855795556, + "grad_norm": 0.3112758994102478, + "learning_rate": 1.7536149527800082e-07, + "loss": 0.0069, + "step": 6334 + }, + { + "epoch": 3.854578643139641, + "grad_norm": 0.2692670226097107, + "learning_rate": 1.738926722693629e-07, + "loss": 0.0065, + "step": 6335 + }, + { + "epoch": 3.8551871006997263, + "grad_norm": 0.1982191503047943, + "learning_rate": 1.7243000510053787e-07, + "loss": 0.0045, + "step": 6336 + }, + { + "epoch": 3.8557955582598114, + "grad_norm": 0.21474218368530273, + "learning_rate": 1.7097349413420781e-07, + "loss": 0.0056, + "step": 6337 + }, + { + "epoch": 3.8564040158198964, + "grad_norm": 0.16877877712249756, + "learning_rate": 1.6952313973152834e-07, + "loss": 0.0047, + "step": 6338 + }, + { + "epoch": 3.8570124733799815, + "grad_norm": 0.35990428924560547, + "learning_rate": 1.6807894225212283e-07, + "loss": 0.0087, + "step": 6339 + }, + { + "epoch": 3.857620930940067, + "grad_norm": 0.3100360333919525, + "learning_rate": 1.6664090205409656e-07, + "loss": 0.0051, + "step": 6340 + }, + { + "epoch": 3.858229388500152, + "grad_norm": 0.1680872142314911, + "learning_rate": 1.652090194940198e-07, + "loss": 0.0032, + "step": 6341 + }, + { + "epoch": 3.8588378460602373, + "grad_norm": 0.22448216378688812, + "learning_rate": 1.637832949269419e-07, + "loss": 0.0045, + "step": 6342 + }, + { + "epoch": 3.8594463036203224, + "grad_norm": 0.20747099816799164, + "learning_rate": 1.6236372870638284e-07, + "loss": 0.0045, + "step": 6343 + }, + { + "epoch": 3.8600547611804075, + "grad_norm": 0.23858143389225006, + "learning_rate": 1.609503211843333e-07, + "loss": 0.0066, + "step": 6344 + }, + { + "epoch": 3.860663218740493, + "grad_norm": 0.2025892287492752, + "learning_rate": 1.595430727112629e-07, + "loss": 0.0042, + "step": 6345 + }, + { + "epoch": 3.861271676300578, + "grad_norm": 0.3395010530948639, + "learning_rate": 1.581419836361092e-07, + "loss": 0.0097, + "step": 6346 + }, + { + "epoch": 3.861880133860663, + "grad_norm": 0.23754116892814636, + "learning_rate": 1.5674705430628323e-07, + "loss": 0.0032, + "step": 6347 + }, + { + "epoch": 3.8624885914207483, + "grad_norm": 0.21957284212112427, + "learning_rate": 1.5535828506766936e-07, + "loss": 0.0063, + "step": 6348 + }, + { + "epoch": 3.8630970489808334, + "grad_norm": 0.18481026589870453, + "learning_rate": 1.539756762646255e-07, + "loss": 0.0039, + "step": 6349 + }, + { + "epoch": 3.863705506540919, + "grad_norm": 0.3242833912372589, + "learning_rate": 1.5259922823998297e-07, + "loss": 0.0067, + "step": 6350 + }, + { + "epoch": 3.864313964101004, + "grad_norm": 0.24175108969211578, + "learning_rate": 1.5122894133503817e-07, + "loss": 0.0048, + "step": 6351 + }, + { + "epoch": 3.864922421661089, + "grad_norm": 0.27556535601615906, + "learning_rate": 1.4986481588956934e-07, + "loss": 0.0043, + "step": 6352 + }, + { + "epoch": 3.865530879221174, + "grad_norm": 0.27273163199424744, + "learning_rate": 1.485068522418226e-07, + "loss": 0.0042, + "step": 6353 + }, + { + "epoch": 3.8661393367812593, + "grad_norm": 0.277964323759079, + "learning_rate": 1.4715505072851188e-07, + "loss": 0.015, + "step": 6354 + }, + { + "epoch": 3.866747794341345, + "grad_norm": 0.3170606791973114, + "learning_rate": 1.4580941168483298e-07, + "loss": 0.0103, + "step": 6355 + }, + { + "epoch": 3.86735625190143, + "grad_norm": 0.23001722991466522, + "learning_rate": 1.4446993544444954e-07, + "loss": 0.0063, + "step": 6356 + }, + { + "epoch": 3.867964709461515, + "grad_norm": 0.240625262260437, + "learning_rate": 1.4313662233948755e-07, + "loss": 0.0072, + "step": 6357 + }, + { + "epoch": 3.8685731670216, + "grad_norm": 0.22342710196971893, + "learning_rate": 1.4180947270056032e-07, + "loss": 0.0043, + "step": 6358 + }, + { + "epoch": 3.8691816245816852, + "grad_norm": 0.2697957754135132, + "learning_rate": 1.4048848685674354e-07, + "loss": 0.0061, + "step": 6359 + }, + { + "epoch": 3.869790082141771, + "grad_norm": 0.21805325150489807, + "learning_rate": 1.3917366513558629e-07, + "loss": 0.004, + "step": 6360 + }, + { + "epoch": 3.870398539701856, + "grad_norm": 0.23698726296424866, + "learning_rate": 1.378650078631083e-07, + "loss": 0.0069, + "step": 6361 + }, + { + "epoch": 3.871006997261941, + "grad_norm": 0.2618488669395447, + "learning_rate": 1.365625153638056e-07, + "loss": 0.0074, + "step": 6362 + }, + { + "epoch": 3.871615454822026, + "grad_norm": 0.2968907654285431, + "learning_rate": 1.352661879606393e-07, + "loss": 0.0097, + "step": 6363 + }, + { + "epoch": 3.872223912382111, + "grad_norm": 0.31761443614959717, + "learning_rate": 1.339760259750439e-07, + "loss": 0.0071, + "step": 6364 + }, + { + "epoch": 3.8728323699421967, + "grad_norm": 0.18307559192180634, + "learning_rate": 1.3269202972692741e-07, + "loss": 0.0043, + "step": 6365 + }, + { + "epoch": 3.873440827502282, + "grad_norm": 0.2085932344198227, + "learning_rate": 1.314141995346685e-07, + "loss": 0.0047, + "step": 6366 + }, + { + "epoch": 3.874049285062367, + "grad_norm": 0.21864819526672363, + "learning_rate": 1.301425357151137e-07, + "loss": 0.0072, + "step": 6367 + }, + { + "epoch": 3.874657742622452, + "grad_norm": 0.15939606726169586, + "learning_rate": 1.2887703858358302e-07, + "loss": 0.0028, + "step": 6368 + }, + { + "epoch": 3.875266200182537, + "grad_norm": 0.21072454750537872, + "learning_rate": 1.2761770845386712e-07, + "loss": 0.0034, + "step": 6369 + }, + { + "epoch": 3.8758746577426226, + "grad_norm": 0.3073256313800812, + "learning_rate": 1.2636454563823009e-07, + "loss": 0.0075, + "step": 6370 + }, + { + "epoch": 3.8764831153027077, + "grad_norm": 0.2551986873149872, + "learning_rate": 1.2511755044739836e-07, + "loss": 0.0072, + "step": 6371 + }, + { + "epoch": 3.877091572862793, + "grad_norm": 0.2689383327960968, + "learning_rate": 1.238767231905774e-07, + "loss": 0.0041, + "step": 6372 + }, + { + "epoch": 3.877700030422878, + "grad_norm": 0.2862478792667389, + "learning_rate": 1.2264206417544333e-07, + "loss": 0.0051, + "step": 6373 + }, + { + "epoch": 3.878308487982963, + "grad_norm": 0.34870097041130066, + "learning_rate": 1.2141357370813732e-07, + "loss": 0.0063, + "step": 6374 + }, + { + "epoch": 3.8789169455430486, + "grad_norm": 0.27742713689804077, + "learning_rate": 1.2019125209327409e-07, + "loss": 0.0047, + "step": 6375 + }, + { + "epoch": 3.8795254031031337, + "grad_norm": 0.19228945672512054, + "learning_rate": 1.1897509963394171e-07, + "loss": 0.0043, + "step": 6376 + }, + { + "epoch": 3.8801338606632187, + "grad_norm": 0.2762080430984497, + "learning_rate": 1.1776511663168788e-07, + "loss": 0.0081, + "step": 6377 + }, + { + "epoch": 3.880742318223304, + "grad_norm": 0.3993409276008606, + "learning_rate": 1.165613033865448e-07, + "loss": 0.0087, + "step": 6378 + }, + { + "epoch": 3.881350775783389, + "grad_norm": 0.26734060049057007, + "learning_rate": 1.1536366019700428e-07, + "loss": 0.0054, + "step": 6379 + }, + { + "epoch": 3.8819592333434745, + "grad_norm": 0.2520960867404938, + "learning_rate": 1.1417218736003432e-07, + "loss": 0.0048, + "step": 6380 + }, + { + "epoch": 3.8825676909035596, + "grad_norm": 0.24928855895996094, + "learning_rate": 1.1298688517107081e-07, + "loss": 0.0046, + "step": 6381 + }, + { + "epoch": 3.8831761484636447, + "grad_norm": 0.20136448740959167, + "learning_rate": 1.1180775392401754e-07, + "loss": 0.0037, + "step": 6382 + }, + { + "epoch": 3.8837846060237298, + "grad_norm": 0.2751845121383667, + "learning_rate": 1.1063479391124898e-07, + "loss": 0.0036, + "step": 6383 + }, + { + "epoch": 3.884393063583815, + "grad_norm": 0.25994813442230225, + "learning_rate": 1.0946800542361025e-07, + "loss": 0.0059, + "step": 6384 + }, + { + "epoch": 3.8850015211439004, + "grad_norm": 0.2973096966743469, + "learning_rate": 1.083073887504199e-07, + "loss": 0.0103, + "step": 6385 + }, + { + "epoch": 3.8856099787039855, + "grad_norm": 0.1909116506576538, + "learning_rate": 1.0715294417946164e-07, + "loss": 0.0021, + "step": 6386 + }, + { + "epoch": 3.8862184362640706, + "grad_norm": 0.17645008862018585, + "learning_rate": 1.0600467199698427e-07, + "loss": 0.0044, + "step": 6387 + }, + { + "epoch": 3.8868268938241557, + "grad_norm": 0.24159935116767883, + "learning_rate": 1.0486257248771835e-07, + "loss": 0.0088, + "step": 6388 + }, + { + "epoch": 3.887435351384241, + "grad_norm": 0.30770307779312134, + "learning_rate": 1.0372664593485403e-07, + "loss": 0.0055, + "step": 6389 + }, + { + "epoch": 3.8880438089443263, + "grad_norm": 0.7140611410140991, + "learning_rate": 1.0259689262005212e-07, + "loss": 0.0058, + "step": 6390 + }, + { + "epoch": 3.8886522665044114, + "grad_norm": 0.249837726354599, + "learning_rate": 1.0147331282344686e-07, + "loss": 0.0047, + "step": 6391 + }, + { + "epoch": 3.8892607240644965, + "grad_norm": 0.1296958178281784, + "learning_rate": 1.003559068236376e-07, + "loss": 0.0019, + "step": 6392 + }, + { + "epoch": 3.8898691816245816, + "grad_norm": 0.31125199794769287, + "learning_rate": 9.924467489769717e-08, + "loss": 0.0084, + "step": 6393 + }, + { + "epoch": 3.8904776391846667, + "grad_norm": 0.25288596749305725, + "learning_rate": 9.813961732116073e-08, + "loss": 0.0079, + "step": 6394 + }, + { + "epoch": 3.8910860967447523, + "grad_norm": 0.21936556696891785, + "learning_rate": 9.704073436803685e-08, + "loss": 0.0032, + "step": 6395 + }, + { + "epoch": 3.8916945543048374, + "grad_norm": 0.3556055724620819, + "learning_rate": 9.594802631080756e-08, + "loss": 0.0119, + "step": 6396 + }, + { + "epoch": 3.8923030118649224, + "grad_norm": 0.2905397117137909, + "learning_rate": 9.48614934204145e-08, + "loss": 0.007, + "step": 6397 + }, + { + "epoch": 3.8929114694250075, + "grad_norm": 0.27061349153518677, + "learning_rate": 9.378113596627546e-08, + "loss": 0.0089, + "step": 6398 + }, + { + "epoch": 3.8935199269850926, + "grad_norm": 0.182336688041687, + "learning_rate": 9.270695421626784e-08, + "loss": 0.0036, + "step": 6399 + }, + { + "epoch": 3.894128384545178, + "grad_norm": 0.07943737506866455, + "learning_rate": 9.163894843675357e-08, + "loss": 0.0007, + "step": 6400 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.24037736654281616, + "learning_rate": 9.057711889254584e-08, + "loss": 0.0051, + "step": 6401 + }, + { + "epoch": 3.8953452996653484, + "grad_norm": 0.28533557057380676, + "learning_rate": 8.952146584693677e-08, + "loss": 0.0073, + "step": 6402 + }, + { + "epoch": 3.8959537572254335, + "grad_norm": 0.1930028647184372, + "learning_rate": 8.847198956168368e-08, + "loss": 0.0045, + "step": 6403 + }, + { + "epoch": 3.8965622147855186, + "grad_norm": 0.24128195643424988, + "learning_rate": 8.742869029701451e-08, + "loss": 0.0045, + "step": 6404 + }, + { + "epoch": 3.897170672345604, + "grad_norm": 0.2623744308948517, + "learning_rate": 8.639156831162231e-08, + "loss": 0.0079, + "step": 6405 + }, + { + "epoch": 3.897779129905689, + "grad_norm": 0.31534165143966675, + "learning_rate": 8.536062386267362e-08, + "loss": 0.0061, + "step": 6406 + }, + { + "epoch": 3.8983875874657743, + "grad_norm": 0.22992461919784546, + "learning_rate": 8.433585720579173e-08, + "loss": 0.0066, + "step": 6407 + }, + { + "epoch": 3.8989960450258594, + "grad_norm": 0.17727993428707123, + "learning_rate": 8.331726859508726e-08, + "loss": 0.0031, + "step": 6408 + }, + { + "epoch": 3.8996045025859445, + "grad_norm": 0.2012277990579605, + "learning_rate": 8.230485828311651e-08, + "loss": 0.0044, + "step": 6409 + }, + { + "epoch": 3.90021296014603, + "grad_norm": 0.253017783164978, + "learning_rate": 8.129862652092313e-08, + "loss": 0.0045, + "step": 6410 + }, + { + "epoch": 3.900821417706115, + "grad_norm": 0.19332891702651978, + "learning_rate": 8.029857355800475e-08, + "loss": 0.0044, + "step": 6411 + }, + { + "epoch": 3.9014298752662, + "grad_norm": 0.27358150482177734, + "learning_rate": 7.930469964234078e-08, + "loss": 0.0062, + "step": 6412 + }, + { + "epoch": 3.9020383328262853, + "grad_norm": 0.23901762068271637, + "learning_rate": 7.83170050203591e-08, + "loss": 0.0046, + "step": 6413 + }, + { + "epoch": 3.9026467903863704, + "grad_norm": 0.23956961929798126, + "learning_rate": 7.73354899369777e-08, + "loss": 0.0065, + "step": 6414 + }, + { + "epoch": 3.903255247946456, + "grad_norm": 0.25745102763175964, + "learning_rate": 7.636015463556578e-08, + "loss": 0.0074, + "step": 6415 + }, + { + "epoch": 3.903863705506541, + "grad_norm": 0.23999325931072235, + "learning_rate": 7.539099935796879e-08, + "loss": 0.0059, + "step": 6416 + }, + { + "epoch": 3.904472163066626, + "grad_norm": 0.18391796946525574, + "learning_rate": 7.442802434449169e-08, + "loss": 0.0043, + "step": 6417 + }, + { + "epoch": 3.9050806206267112, + "grad_norm": 0.2279115468263626, + "learning_rate": 7.347122983391851e-08, + "loss": 0.0061, + "step": 6418 + }, + { + "epoch": 3.9056890781867963, + "grad_norm": 0.3626938760280609, + "learning_rate": 7.252061606349003e-08, + "loss": 0.0105, + "step": 6419 + }, + { + "epoch": 3.906297535746882, + "grad_norm": 0.26662662625312805, + "learning_rate": 7.157618326892046e-08, + "loss": 0.0039, + "step": 6420 + }, + { + "epoch": 3.906905993306967, + "grad_norm": 0.23431222140789032, + "learning_rate": 7.063793168438915e-08, + "loss": 0.0097, + "step": 6421 + }, + { + "epoch": 3.907514450867052, + "grad_norm": 0.23029586672782898, + "learning_rate": 6.970586154254333e-08, + "loss": 0.0044, + "step": 6422 + }, + { + "epoch": 3.908122908427137, + "grad_norm": 0.30431026220321655, + "learning_rate": 6.877997307449813e-08, + "loss": 0.003, + "step": 6423 + }, + { + "epoch": 3.9087313659872223, + "grad_norm": 0.31623929738998413, + "learning_rate": 6.786026650983657e-08, + "loss": 0.003, + "step": 6424 + }, + { + "epoch": 3.909339823547308, + "grad_norm": 0.2245815098285675, + "learning_rate": 6.6946742076604e-08, + "loss": 0.0021, + "step": 6425 + }, + { + "epoch": 3.909948281107393, + "grad_norm": 0.20678408443927765, + "learning_rate": 6.603940000132203e-08, + "loss": 0.0039, + "step": 6426 + }, + { + "epoch": 3.910556738667478, + "grad_norm": 0.32013562321662903, + "learning_rate": 6.513824050896622e-08, + "loss": 0.0073, + "step": 6427 + }, + { + "epoch": 3.911165196227563, + "grad_norm": 0.26314496994018555, + "learning_rate": 6.424326382299394e-08, + "loss": 0.0059, + "step": 6428 + }, + { + "epoch": 3.911773653787648, + "grad_norm": 0.2609120011329651, + "learning_rate": 6.33544701653166e-08, + "loss": 0.0074, + "step": 6429 + }, + { + "epoch": 3.9123821113477337, + "grad_norm": 0.22066128253936768, + "learning_rate": 6.247185975631897e-08, + "loss": 0.0059, + "step": 6430 + }, + { + "epoch": 3.912990568907819, + "grad_norm": 0.24796698987483978, + "learning_rate": 6.15954328148538e-08, + "loss": 0.0056, + "step": 6431 + }, + { + "epoch": 3.913599026467904, + "grad_norm": 0.32350313663482666, + "learning_rate": 6.072518955823891e-08, + "loss": 0.0097, + "step": 6432 + }, + { + "epoch": 3.914207484027989, + "grad_norm": 0.24313661456108093, + "learning_rate": 5.986113020225448e-08, + "loss": 0.0052, + "step": 6433 + }, + { + "epoch": 3.914815941588074, + "grad_norm": 0.15304477512836456, + "learning_rate": 5.900325496115411e-08, + "loss": 0.0037, + "step": 6434 + }, + { + "epoch": 3.9154243991481597, + "grad_norm": 0.189406618475914, + "learning_rate": 5.815156404765654e-08, + "loss": 0.003, + "step": 6435 + }, + { + "epoch": 3.9160328567082447, + "grad_norm": 0.23600012063980103, + "learning_rate": 5.7306057672942833e-08, + "loss": 0.0029, + "step": 6436 + }, + { + "epoch": 3.91664131426833, + "grad_norm": 0.20945847034454346, + "learning_rate": 5.6466736046661974e-08, + "loss": 0.0038, + "step": 6437 + }, + { + "epoch": 3.917249771828415, + "grad_norm": 0.2792292535305023, + "learning_rate": 5.5633599376936353e-08, + "loss": 0.0071, + "step": 6438 + }, + { + "epoch": 3.9178582293885, + "grad_norm": 0.1970946043729782, + "learning_rate": 5.48066478703424e-08, + "loss": 0.0035, + "step": 6439 + }, + { + "epoch": 3.9184666869485856, + "grad_norm": 0.23822718858718872, + "learning_rate": 5.398588173193275e-08, + "loss": 0.0058, + "step": 6440 + }, + { + "epoch": 3.9190751445086707, + "grad_norm": 0.30124393105506897, + "learning_rate": 5.317130116522517e-08, + "loss": 0.0082, + "step": 6441 + }, + { + "epoch": 3.9196836020687558, + "grad_norm": 0.22608311474323273, + "learning_rate": 5.2362906372199764e-08, + "loss": 0.0037, + "step": 6442 + }, + { + "epoch": 3.920292059628841, + "grad_norm": 0.23782384395599365, + "learning_rate": 5.156069755330451e-08, + "loss": 0.0057, + "step": 6443 + }, + { + "epoch": 3.920900517188926, + "grad_norm": 0.20344187319278717, + "learning_rate": 5.0764674907452516e-08, + "loss": 0.0058, + "step": 6444 + }, + { + "epoch": 3.9215089747490115, + "grad_norm": 0.2951229214668274, + "learning_rate": 4.997483863202757e-08, + "loss": 0.0071, + "step": 6445 + }, + { + "epoch": 3.9221174323090966, + "grad_norm": 0.18063601851463318, + "learning_rate": 4.9191188922875775e-08, + "loss": 0.0046, + "step": 6446 + }, + { + "epoch": 3.9227258898691817, + "grad_norm": 0.2033570259809494, + "learning_rate": 4.8413725974305604e-08, + "loss": 0.0046, + "step": 6447 + }, + { + "epoch": 3.923334347429267, + "grad_norm": 0.3676776587963104, + "learning_rate": 4.764244997909895e-08, + "loss": 0.0053, + "step": 6448 + }, + { + "epoch": 3.923942804989352, + "grad_norm": 0.1831228882074356, + "learning_rate": 4.6877361128497276e-08, + "loss": 0.0039, + "step": 6449 + }, + { + "epoch": 3.9245512625494374, + "grad_norm": 0.17482344806194305, + "learning_rate": 4.61184596122155e-08, + "loss": 0.0048, + "step": 6450 + }, + { + "epoch": 3.9251597201095225, + "grad_norm": 0.25266796350479126, + "learning_rate": 4.5365745618425325e-08, + "loss": 0.007, + "step": 6451 + }, + { + "epoch": 3.9257681776696076, + "grad_norm": 0.25007006525993347, + "learning_rate": 4.4619219333769115e-08, + "loss": 0.0068, + "step": 6452 + }, + { + "epoch": 3.9263766352296927, + "grad_norm": 0.2621901035308838, + "learning_rate": 4.3878880943357124e-08, + "loss": 0.0078, + "step": 6453 + }, + { + "epoch": 3.926985092789778, + "grad_norm": 0.31776395440101624, + "learning_rate": 4.314473063075919e-08, + "loss": 0.0099, + "step": 6454 + }, + { + "epoch": 3.9275935503498633, + "grad_norm": 0.19145816564559937, + "learning_rate": 4.24167685780158e-08, + "loss": 0.0034, + "step": 6455 + }, + { + "epoch": 3.9282020079099484, + "grad_norm": 0.24003374576568604, + "learning_rate": 4.169499496562701e-08, + "loss": 0.004, + "step": 6456 + }, + { + "epoch": 3.9288104654700335, + "grad_norm": 0.19230146706104279, + "learning_rate": 4.097940997256911e-08, + "loss": 0.005, + "step": 6457 + }, + { + "epoch": 3.9294189230301186, + "grad_norm": 0.22203390300273895, + "learning_rate": 4.0270013776275153e-08, + "loss": 0.005, + "step": 6458 + }, + { + "epoch": 3.9300273805902037, + "grad_norm": 0.28375643491744995, + "learning_rate": 3.9566806552643335e-08, + "loss": 0.0063, + "step": 6459 + }, + { + "epoch": 3.9306358381502893, + "grad_norm": 0.20378822088241577, + "learning_rate": 3.8869788476039725e-08, + "loss": 0.0035, + "step": 6460 + }, + { + "epoch": 3.931244295710374, + "grad_norm": 0.611138641834259, + "learning_rate": 3.817895971930108e-08, + "loss": 0.0211, + "step": 6461 + }, + { + "epoch": 3.9318527532704595, + "grad_norm": 0.20204846560955048, + "learning_rate": 3.749432045371815e-08, + "loss": 0.005, + "step": 6462 + }, + { + "epoch": 3.9324612108305446, + "grad_norm": 0.2756820619106293, + "learning_rate": 3.6815870849055134e-08, + "loss": 0.0047, + "step": 6463 + }, + { + "epoch": 3.9330696683906297, + "grad_norm": 0.26881980895996094, + "learning_rate": 3.614361107354136e-08, + "loss": 0.004, + "step": 6464 + }, + { + "epoch": 3.933678125950715, + "grad_norm": 0.30591699481010437, + "learning_rate": 3.547754129386571e-08, + "loss": 0.0054, + "step": 6465 + }, + { + "epoch": 3.9342865835108, + "grad_norm": 0.3070509433746338, + "learning_rate": 3.481766167518774e-08, + "loss": 0.0074, + "step": 6466 + }, + { + "epoch": 3.9348950410708854, + "grad_norm": 0.21430547535419464, + "learning_rate": 3.416397238112934e-08, + "loss": 0.0054, + "step": 6467 + }, + { + "epoch": 3.9355034986309705, + "grad_norm": 0.15337048470973969, + "learning_rate": 3.351647357377752e-08, + "loss": 0.0031, + "step": 6468 + }, + { + "epoch": 3.9361119561910556, + "grad_norm": 0.17072342336177826, + "learning_rate": 3.2875165413687184e-08, + "loss": 0.0035, + "step": 6469 + }, + { + "epoch": 3.936720413751141, + "grad_norm": 0.22262074053287506, + "learning_rate": 3.2240048059872793e-08, + "loss": 0.0051, + "step": 6470 + }, + { + "epoch": 3.9373288713112258, + "grad_norm": 0.22603394091129303, + "learning_rate": 3.161112166982227e-08, + "loss": 0.0052, + "step": 6471 + }, + { + "epoch": 3.9379373288713113, + "grad_norm": 0.20640049874782562, + "learning_rate": 3.098838639947754e-08, + "loss": 0.0047, + "step": 6472 + }, + { + "epoch": 3.9385457864313964, + "grad_norm": 0.17780143022537231, + "learning_rate": 3.037184240325397e-08, + "loss": 0.0027, + "step": 6473 + }, + { + "epoch": 3.9391542439914815, + "grad_norm": 0.1945071816444397, + "learning_rate": 2.976148983402649e-08, + "loss": 0.0056, + "step": 6474 + }, + { + "epoch": 3.939762701551567, + "grad_norm": 0.2443944811820984, + "learning_rate": 2.9157328843140708e-08, + "loss": 0.005, + "step": 6475 + }, + { + "epoch": 3.9403711591116517, + "grad_norm": 0.1970926970243454, + "learning_rate": 2.855935958040179e-08, + "loss": 0.0036, + "step": 6476 + }, + { + "epoch": 3.9409796166717372, + "grad_norm": 0.2412329912185669, + "learning_rate": 2.7967582194080022e-08, + "loss": 0.0042, + "step": 6477 + }, + { + "epoch": 3.9415880742318223, + "grad_norm": 0.15232165157794952, + "learning_rate": 2.7381996830910805e-08, + "loss": 0.0041, + "step": 6478 + }, + { + "epoch": 3.9421965317919074, + "grad_norm": 0.2518268823623657, + "learning_rate": 2.6802603636097435e-08, + "loss": 0.0074, + "step": 6479 + }, + { + "epoch": 3.942804989351993, + "grad_norm": 0.25513625144958496, + "learning_rate": 2.6229402753305544e-08, + "loss": 0.0088, + "step": 6480 + }, + { + "epoch": 3.9434134469120776, + "grad_norm": 0.2241678684949875, + "learning_rate": 2.5662394324663108e-08, + "loss": 0.0053, + "step": 6481 + }, + { + "epoch": 3.944021904472163, + "grad_norm": 0.1274229884147644, + "learning_rate": 2.5101578490763223e-08, + "loss": 0.0015, + "step": 6482 + }, + { + "epoch": 3.9446303620322483, + "grad_norm": 0.20924502611160278, + "learning_rate": 2.4546955390669645e-08, + "loss": 0.003, + "step": 6483 + }, + { + "epoch": 3.9452388195923334, + "grad_norm": 0.23410069942474365, + "learning_rate": 2.3998525161900153e-08, + "loss": 0.0072, + "step": 6484 + }, + { + "epoch": 3.9458472771524185, + "grad_norm": 0.18146799504756927, + "learning_rate": 2.345628794044874e-08, + "loss": 0.0023, + "step": 6485 + }, + { + "epoch": 3.9464557347125035, + "grad_norm": 0.3447299003601074, + "learning_rate": 2.292024386076064e-08, + "loss": 0.0028, + "step": 6486 + }, + { + "epoch": 3.947064192272589, + "grad_norm": 0.2687872052192688, + "learning_rate": 2.2390393055757297e-08, + "loss": 0.0052, + "step": 6487 + }, + { + "epoch": 3.947672649832674, + "grad_norm": 0.20712141692638397, + "learning_rate": 2.1866735656819736e-08, + "loss": 0.0062, + "step": 6488 + }, + { + "epoch": 3.9482811073927593, + "grad_norm": 0.31326937675476074, + "learning_rate": 2.1349271793791313e-08, + "loss": 0.0079, + "step": 6489 + }, + { + "epoch": 3.9488895649528444, + "grad_norm": 0.3461199104785919, + "learning_rate": 2.0838001594980504e-08, + "loss": 0.009, + "step": 6490 + }, + { + "epoch": 3.9494980225129295, + "grad_norm": 0.18858326971530914, + "learning_rate": 2.0332925187163676e-08, + "loss": 0.0052, + "step": 6491 + }, + { + "epoch": 3.950106480073015, + "grad_norm": 0.26478853821754456, + "learning_rate": 1.983404269557676e-08, + "loss": 0.0061, + "step": 6492 + }, + { + "epoch": 3.9507149376331, + "grad_norm": 0.2934277057647705, + "learning_rate": 1.9341354243923583e-08, + "loss": 0.0034, + "step": 6493 + }, + { + "epoch": 3.951323395193185, + "grad_norm": 0.18972255289554596, + "learning_rate": 1.8854859954370306e-08, + "loss": 0.0042, + "step": 6494 + }, + { + "epoch": 3.9519318527532703, + "grad_norm": 0.3544655740261078, + "learning_rate": 1.8374559947545446e-08, + "loss": 0.0206, + "step": 6495 + }, + { + "epoch": 3.9525403103133554, + "grad_norm": 0.1487325131893158, + "learning_rate": 1.7900454342542616e-08, + "loss": 0.0032, + "step": 6496 + }, + { + "epoch": 3.953148767873441, + "grad_norm": 0.3513227701187134, + "learning_rate": 1.743254325692334e-08, + "loss": 0.0078, + "step": 6497 + }, + { + "epoch": 3.953757225433526, + "grad_norm": 0.19835789501667023, + "learning_rate": 1.6970826806708694e-08, + "loss": 0.004, + "step": 6498 + }, + { + "epoch": 3.954365682993611, + "grad_norm": 0.16036944091320038, + "learning_rate": 1.651530510638488e-08, + "loss": 0.0027, + "step": 6499 + }, + { + "epoch": 3.9549741405536962, + "grad_norm": 0.2660086750984192, + "learning_rate": 1.6065978268903215e-08, + "loss": 0.0068, + "step": 6500 + }, + { + "epoch": 3.9555825981137813, + "grad_norm": 0.18971866369247437, + "learning_rate": 1.562284640567735e-08, + "loss": 0.0053, + "step": 6501 + }, + { + "epoch": 3.956191055673867, + "grad_norm": 0.22135283052921295, + "learning_rate": 1.5185909626583284e-08, + "loss": 0.0057, + "step": 6502 + }, + { + "epoch": 3.956799513233952, + "grad_norm": 0.33586928248405457, + "learning_rate": 1.4755168039967682e-08, + "loss": 0.0154, + "step": 6503 + }, + { + "epoch": 3.957407970794037, + "grad_norm": 0.21343040466308594, + "learning_rate": 1.4330621752631224e-08, + "loss": 0.0041, + "step": 6504 + }, + { + "epoch": 3.958016428354122, + "grad_norm": 0.15883713960647583, + "learning_rate": 1.3912270869848032e-08, + "loss": 0.0037, + "step": 6505 + }, + { + "epoch": 3.9586248859142072, + "grad_norm": 0.33996376395225525, + "learning_rate": 1.3500115495351795e-08, + "loss": 0.0054, + "step": 6506 + }, + { + "epoch": 3.959233343474293, + "grad_norm": 0.22483274340629578, + "learning_rate": 1.3094155731335767e-08, + "loss": 0.0047, + "step": 6507 + }, + { + "epoch": 3.959841801034378, + "grad_norm": 0.3512939214706421, + "learning_rate": 1.2694391678463868e-08, + "loss": 0.0082, + "step": 6508 + }, + { + "epoch": 3.960450258594463, + "grad_norm": 0.3100382685661316, + "learning_rate": 1.2300823435862363e-08, + "loss": 0.0046, + "step": 6509 + }, + { + "epoch": 3.961058716154548, + "grad_norm": 0.15239042043685913, + "learning_rate": 1.191345110111708e-08, + "loss": 0.0034, + "step": 6510 + }, + { + "epoch": 3.961667173714633, + "grad_norm": 0.2930486798286438, + "learning_rate": 1.1532274770281737e-08, + "loss": 0.0059, + "step": 6511 + }, + { + "epoch": 3.9622756312747187, + "grad_norm": 0.18088679015636444, + "learning_rate": 1.1157294537869622e-08, + "loss": 0.0063, + "step": 6512 + }, + { + "epoch": 3.962884088834804, + "grad_norm": 0.2748885452747345, + "learning_rate": 1.078851049686469e-08, + "loss": 0.0051, + "step": 6513 + }, + { + "epoch": 3.963492546394889, + "grad_norm": 0.25067272782325745, + "learning_rate": 1.0425922738704908e-08, + "loss": 0.0065, + "step": 6514 + }, + { + "epoch": 3.964101003954974, + "grad_norm": 0.2923562228679657, + "learning_rate": 1.0069531353301687e-08, + "loss": 0.0112, + "step": 6515 + }, + { + "epoch": 3.964709461515059, + "grad_norm": 0.23911620676517487, + "learning_rate": 9.719336429023229e-09, + "loss": 0.0069, + "step": 6516 + }, + { + "epoch": 3.9653179190751446, + "grad_norm": 0.23181907832622528, + "learning_rate": 9.375338052702853e-09, + "loss": 0.0054, + "step": 6517 + }, + { + "epoch": 3.9659263766352297, + "grad_norm": 0.25822049379348755, + "learning_rate": 9.037536309636218e-09, + "loss": 0.0059, + "step": 6518 + }, + { + "epoch": 3.966534834195315, + "grad_norm": 0.29777318239212036, + "learning_rate": 8.705931283586877e-09, + "loss": 0.008, + "step": 6519 + }, + { + "epoch": 3.9671432917554, + "grad_norm": 0.440320760011673, + "learning_rate": 8.380523056777944e-09, + "loss": 0.008, + "step": 6520 + }, + { + "epoch": 3.967751749315485, + "grad_norm": 0.2474939078092575, + "learning_rate": 8.061311709897656e-09, + "loss": 0.0038, + "step": 6521 + }, + { + "epoch": 3.9683602068755706, + "grad_norm": 0.4338011145591736, + "learning_rate": 7.748297322096586e-09, + "loss": 0.007, + "step": 6522 + }, + { + "epoch": 3.9689686644356557, + "grad_norm": 0.20323798060417175, + "learning_rate": 7.441479970990428e-09, + "loss": 0.0051, + "step": 6523 + }, + { + "epoch": 3.9695771219957408, + "grad_norm": 0.21181131899356842, + "learning_rate": 7.140859732654437e-09, + "loss": 0.0047, + "step": 6524 + }, + { + "epoch": 3.970185579555826, + "grad_norm": 0.25651806592941284, + "learning_rate": 6.846436681631763e-09, + "loss": 0.006, + "step": 6525 + }, + { + "epoch": 3.970794037115911, + "grad_norm": 0.20691846311092377, + "learning_rate": 6.558210890927896e-09, + "loss": 0.0041, + "step": 6526 + }, + { + "epoch": 3.9714024946759965, + "grad_norm": 0.21363694965839386, + "learning_rate": 6.2761824320106684e-09, + "loss": 0.0032, + "step": 6527 + }, + { + "epoch": 3.9720109522360816, + "grad_norm": 0.22716128826141357, + "learning_rate": 6.000351374807478e-09, + "loss": 0.005, + "step": 6528 + }, + { + "epoch": 3.9726194097961667, + "grad_norm": 0.19671691954135895, + "learning_rate": 5.730717787716389e-09, + "loss": 0.0042, + "step": 6529 + }, + { + "epoch": 3.9732278673562518, + "grad_norm": 0.27143892645835876, + "learning_rate": 5.467281737597807e-09, + "loss": 0.0053, + "step": 6530 + }, + { + "epoch": 3.973836324916337, + "grad_norm": 0.20458009839057922, + "learning_rate": 5.2100432897661535e-09, + "loss": 0.0049, + "step": 6531 + }, + { + "epoch": 3.9744447824764224, + "grad_norm": 0.2738465964794159, + "learning_rate": 4.959002508012067e-09, + "loss": 0.0075, + "step": 6532 + }, + { + "epoch": 3.9750532400365075, + "grad_norm": 0.2310987263917923, + "learning_rate": 4.714159454580203e-09, + "loss": 0.0029, + "step": 6533 + }, + { + "epoch": 3.9756616975965926, + "grad_norm": 0.33232930302619934, + "learning_rate": 4.47551419018033e-09, + "loss": 0.0072, + "step": 6534 + }, + { + "epoch": 3.9762701551566777, + "grad_norm": 0.21468819677829742, + "learning_rate": 4.243066773990112e-09, + "loss": 0.005, + "step": 6535 + }, + { + "epoch": 3.976878612716763, + "grad_norm": 0.21161320805549622, + "learning_rate": 4.016817263644002e-09, + "loss": 0.0057, + "step": 6536 + }, + { + "epoch": 3.9774870702768483, + "grad_norm": 0.22753821313381195, + "learning_rate": 3.796765715244344e-09, + "loss": 0.0048, + "step": 6537 + }, + { + "epoch": 3.9780955278369334, + "grad_norm": 0.229328915476799, + "learning_rate": 3.5829121833530488e-09, + "loss": 0.0063, + "step": 6538 + }, + { + "epoch": 3.9787039853970185, + "grad_norm": 0.25419679284095764, + "learning_rate": 3.3752567209971444e-09, + "loss": 0.0048, + "step": 6539 + }, + { + "epoch": 3.9793124429571036, + "grad_norm": 0.2747920751571655, + "learning_rate": 3.173799379665998e-09, + "loss": 0.0061, + "step": 6540 + }, + { + "epoch": 3.9799209005171887, + "grad_norm": 0.24330097436904907, + "learning_rate": 2.978540209314096e-09, + "loss": 0.0035, + "step": 6541 + }, + { + "epoch": 3.9805293580772743, + "grad_norm": 0.20687943696975708, + "learning_rate": 2.789479258358263e-09, + "loss": 0.0051, + "step": 6542 + }, + { + "epoch": 3.9811378156373594, + "grad_norm": 0.22026467323303223, + "learning_rate": 2.6066165736748914e-09, + "loss": 0.0068, + "step": 6543 + }, + { + "epoch": 3.9817462731974445, + "grad_norm": 0.22411870956420898, + "learning_rate": 2.429952200611041e-09, + "loss": 0.0042, + "step": 6544 + }, + { + "epoch": 3.9823547307575295, + "grad_norm": 0.250068336725235, + "learning_rate": 2.2594861829650094e-09, + "loss": 0.0051, + "step": 6545 + }, + { + "epoch": 3.9829631883176146, + "grad_norm": 0.2509680688381195, + "learning_rate": 2.095218563011314e-09, + "loss": 0.0048, + "step": 6546 + }, + { + "epoch": 3.9835716458777, + "grad_norm": 0.20843267440795898, + "learning_rate": 1.9371493814784867e-09, + "loss": 0.0041, + "step": 6547 + }, + { + "epoch": 3.9841801034377853, + "grad_norm": 0.12238210439682007, + "learning_rate": 1.7852786775629516e-09, + "loss": 0.0014, + "step": 6548 + }, + { + "epoch": 3.9847885609978704, + "grad_norm": 0.22475853562355042, + "learning_rate": 1.6396064889206974e-09, + "loss": 0.0046, + "step": 6549 + }, + { + "epoch": 3.9853970185579555, + "grad_norm": 0.1801108866930008, + "learning_rate": 1.5001328516728309e-09, + "loss": 0.0037, + "step": 6550 + }, + { + "epoch": 3.9860054761180406, + "grad_norm": 0.2936258912086487, + "learning_rate": 1.3668578004027988e-09, + "loss": 0.0074, + "step": 6551 + }, + { + "epoch": 3.986613933678126, + "grad_norm": 0.27133995294570923, + "learning_rate": 1.2397813681591653e-09, + "loss": 0.0061, + "step": 6552 + }, + { + "epoch": 3.987222391238211, + "grad_norm": 0.29440489411354065, + "learning_rate": 1.1189035864500597e-09, + "loss": 0.006, + "step": 6553 + }, + { + "epoch": 3.9878308487982963, + "grad_norm": 0.24190589785575867, + "learning_rate": 1.0042244852487281e-09, + "loss": 0.0042, + "step": 6554 + }, + { + "epoch": 3.9884393063583814, + "grad_norm": 0.2079675942659378, + "learning_rate": 8.957440929879823e-10, + "loss": 0.0042, + "step": 6555 + }, + { + "epoch": 3.9890477639184665, + "grad_norm": 0.2172488123178482, + "learning_rate": 7.934624365685261e-10, + "loss": 0.0041, + "step": 6556 + }, + { + "epoch": 3.989656221478552, + "grad_norm": 0.23664279282093048, + "learning_rate": 6.973795413534045e-10, + "loss": 0.0086, + "step": 6557 + }, + { + "epoch": 3.990264679038637, + "grad_norm": 0.12852723896503448, + "learning_rate": 6.074954311652281e-10, + "loss": 0.0013, + "step": 6558 + }, + { + "epoch": 3.9908731365987222, + "grad_norm": 0.24331684410572052, + "learning_rate": 5.238101282944996e-10, + "loss": 0.0056, + "step": 6559 + }, + { + "epoch": 3.9914815941588073, + "grad_norm": 0.49298709630966187, + "learning_rate": 4.4632365348573625e-10, + "loss": 0.0224, + "step": 6560 + }, + { + "epoch": 3.9920900517188924, + "grad_norm": 0.31049343943595886, + "learning_rate": 3.750360259596741e-10, + "loss": 0.0094, + "step": 6561 + }, + { + "epoch": 3.992698509278978, + "grad_norm": 0.2852802276611328, + "learning_rate": 3.099472633855127e-10, + "loss": 0.0075, + "step": 6562 + }, + { + "epoch": 3.993306966839063, + "grad_norm": 0.20569032430648804, + "learning_rate": 2.5105738190867033e-10, + "loss": 0.0051, + "step": 6563 + }, + { + "epoch": 3.993915424399148, + "grad_norm": 0.23459145426750183, + "learning_rate": 1.9836639612580422e-10, + "loss": 0.0032, + "step": 6564 + }, + { + "epoch": 3.9945238819592332, + "grad_norm": 0.24636982381343842, + "learning_rate": 1.518743191070149e-10, + "loss": 0.0067, + "step": 6565 + }, + { + "epoch": 3.9951323395193183, + "grad_norm": 0.21381396055221558, + "learning_rate": 1.1158116237919292e-10, + "loss": 0.0054, + "step": 6566 + }, + { + "epoch": 3.995740797079404, + "grad_norm": 0.2777492105960846, + "learning_rate": 7.748693593156997e-11, + "loss": 0.0209, + "step": 6567 + }, + { + "epoch": 3.996349254639489, + "grad_norm": 0.229709655046463, + "learning_rate": 4.959164821849438e-11, + "loss": 0.0044, + "step": 6568 + }, + { + "epoch": 3.996957712199574, + "grad_norm": 0.21392680704593658, + "learning_rate": 2.7895306156655587e-11, + "loss": 0.0033, + "step": 6569 + }, + { + "epoch": 3.997566169759659, + "grad_norm": 0.23358656466007233, + "learning_rate": 1.2397915127859705e-11, + "loss": 0.0094, + "step": 6570 + }, + { + "epoch": 3.9981746273197443, + "grad_norm": 0.23164047300815582, + "learning_rate": 3.099478973478398e-12, + "loss": 0.0039, + "step": 6571 + }, + { + "epoch": 3.99878308487983, + "grad_norm": 0.16882868111133575, + "learning_rate": 0.0, + "loss": 0.002, + "step": 6572 + }, + { + "epoch": 3.99878308487983, + "eval_loss": 1.711881399154663, + "eval_runtime": 104.2004, + "eval_samples_per_second": 7.313, + "eval_steps_per_second": 0.461, + "step": 6572 + }, + { + "epoch": 3.99878308487983, + "step": 6572, + "total_flos": 9.69717152698545e+18, + "train_loss": 0.1857661268678992, + "train_runtime": 120983.7455, + "train_samples_per_second": 1.739, + "train_steps_per_second": 0.054 + } + ], + "logging_steps": 1.0, + "max_steps": 6572, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 9.69717152698545e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}