{ "best_global_step": 28000, "best_metric": 2.3673107624053955, "best_model_checkpoint": "nllb_good_result__goodcheese\\checkpoint-28000", "epoch": 8.0, "eval_steps": 500, "global_step": 28232, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0056673278549164065, "grad_norm": 0.29186156392097473, "learning_rate": 1.2747875354107649e-06, "loss": 5.6321, "step": 20 }, { "epoch": 0.011334655709832813, "grad_norm": 0.267672061920166, "learning_rate": 2.691218130311615e-06, "loss": 5.5306, "step": 40 }, { "epoch": 0.01700198356474922, "grad_norm": 0.4487878382205963, "learning_rate": 4.036827195467422e-06, "loss": 5.8556, "step": 60 }, { "epoch": 0.022669311419665626, "grad_norm": 0.2811218500137329, "learning_rate": 5.453257790368272e-06, "loss": 5.0984, "step": 80 }, { "epoch": 0.028336639274582034, "grad_norm": 0.3103780746459961, "learning_rate": 6.869688385269123e-06, "loss": 5.8487, "step": 100 }, { "epoch": 0.03400396712949844, "grad_norm": 0.4519574046134949, "learning_rate": 8.286118980169973e-06, "loss": 5.1579, "step": 120 }, { "epoch": 0.03967129498441485, "grad_norm": 0.34206947684288025, "learning_rate": 9.63172804532578e-06, "loss": 5.7137, "step": 140 }, { "epoch": 0.04533862283933125, "grad_norm": 0.6143743991851807, "learning_rate": 1.1048158640226628e-05, "loss": 5.8383, "step": 160 }, { "epoch": 0.05100595069424766, "grad_norm": 0.3132596015930176, "learning_rate": 1.2464589235127479e-05, "loss": 5.6122, "step": 180 }, { "epoch": 0.05667327854916407, "grad_norm": 0.33682480454444885, "learning_rate": 1.388101983002833e-05, "loss": 5.2599, "step": 200 }, { "epoch": 0.06234060640408048, "grad_norm": 0.5733115673065186, "learning_rate": 1.529745042492918e-05, "loss": 6.2053, "step": 220 }, { "epoch": 0.06800793425899689, "grad_norm": 0.5600686073303223, "learning_rate": 1.671388101983003e-05, "loss": 5.337, "step": 240 }, { "epoch": 0.07367526211391329, "grad_norm": 1.2261295318603516, "learning_rate": 1.813031161473088e-05, "loss": 6.0524, "step": 260 }, { "epoch": 0.0793425899688297, "grad_norm": 1.14884614944458, "learning_rate": 1.954674220963173e-05, "loss": 5.3696, "step": 280 }, { "epoch": 0.0850099178237461, "grad_norm": 0.7089225053787231, "learning_rate": 2.096317280453258e-05, "loss": 5.7349, "step": 300 }, { "epoch": 0.0906772456786625, "grad_norm": 1.6177756786346436, "learning_rate": 2.237960339943343e-05, "loss": 5.2647, "step": 320 }, { "epoch": 0.09634457353357892, "grad_norm": 0.9251440763473511, "learning_rate": 2.3796033994334278e-05, "loss": 5.5348, "step": 340 }, { "epoch": 0.10201190138849532, "grad_norm": 1.6010820865631104, "learning_rate": 2.521246458923513e-05, "loss": 4.6075, "step": 360 }, { "epoch": 0.10767922924341174, "grad_norm": 1.859062910079956, "learning_rate": 2.6628895184135975e-05, "loss": 4.8889, "step": 380 }, { "epoch": 0.11334655709832814, "grad_norm": 0.9227429032325745, "learning_rate": 2.804532577903683e-05, "loss": 5.3685, "step": 400 }, { "epoch": 0.11901388495324454, "grad_norm": 1.8194032907485962, "learning_rate": 2.9461756373937676e-05, "loss": 5.1806, "step": 420 }, { "epoch": 0.12468121280816095, "grad_norm": 1.7711089849472046, "learning_rate": 3.087818696883853e-05, "loss": 4.473, "step": 440 }, { "epoch": 0.13034854066307736, "grad_norm": 2.442990303039551, "learning_rate": 3.229461756373938e-05, "loss": 4.8542, "step": 460 }, { "epoch": 0.13601586851799377, "grad_norm": 1.6828886270523071, "learning_rate": 3.371104815864023e-05, "loss": 4.2847, "step": 480 }, { "epoch": 0.1416831963729102, "grad_norm": 1.1208703517913818, "learning_rate": 3.5127478753541074e-05, "loss": 4.8895, "step": 500 }, { "epoch": 0.1416831963729102, "eval_loss": 5.187112808227539, "eval_runtime": 2.8266, "eval_samples_per_second": 35.379, "eval_steps_per_second": 8.845, "step": 500 }, { "epoch": 0.14735052422782657, "grad_norm": 1.8706556558609009, "learning_rate": 3.654390934844193e-05, "loss": 4.2952, "step": 520 }, { "epoch": 0.153017852082743, "grad_norm": 0.8823001980781555, "learning_rate": 3.796033994334278e-05, "loss": 4.2056, "step": 540 }, { "epoch": 0.1586851799376594, "grad_norm": 1.4135098457336426, "learning_rate": 3.937677053824363e-05, "loss": 4.058, "step": 560 }, { "epoch": 0.1643525077925758, "grad_norm": 1.7615550756454468, "learning_rate": 4.0793201133144476e-05, "loss": 3.8887, "step": 580 }, { "epoch": 0.1700198356474922, "grad_norm": 0.9159285426139832, "learning_rate": 4.220963172804533e-05, "loss": 3.633, "step": 600 }, { "epoch": 0.17568716350240862, "grad_norm": 0.9417641758918762, "learning_rate": 4.362606232294617e-05, "loss": 3.475, "step": 620 }, { "epoch": 0.181354491357325, "grad_norm": 2.368506908416748, "learning_rate": 4.504249291784703e-05, "loss": 3.9381, "step": 640 }, { "epoch": 0.18702181921224142, "grad_norm": 1.4000766277313232, "learning_rate": 4.645892351274788e-05, "loss": 3.7003, "step": 660 }, { "epoch": 0.19268914706715784, "grad_norm": 1.962949275970459, "learning_rate": 4.7875354107648726e-05, "loss": 3.2681, "step": 680 }, { "epoch": 0.19835647492207426, "grad_norm": 1.7445436716079712, "learning_rate": 4.9291784702549575e-05, "loss": 3.4537, "step": 700 }, { "epoch": 0.20402380277699064, "grad_norm": 1.899295449256897, "learning_rate": 5.070821529745042e-05, "loss": 3.4228, "step": 720 }, { "epoch": 0.20969113063190706, "grad_norm": 0.866439700126648, "learning_rate": 5.212464589235128e-05, "loss": 3.5248, "step": 740 }, { "epoch": 0.21535845848682347, "grad_norm": 0.894516110420227, "learning_rate": 5.354107648725213e-05, "loss": 3.042, "step": 760 }, { "epoch": 0.22102578634173986, "grad_norm": 1.376686453819275, "learning_rate": 5.495750708215298e-05, "loss": 3.1676, "step": 780 }, { "epoch": 0.22669311419665628, "grad_norm": 0.9934491515159607, "learning_rate": 5.6373937677053825e-05, "loss": 3.1438, "step": 800 }, { "epoch": 0.2323604420515727, "grad_norm": 2.115955352783203, "learning_rate": 5.7790368271954674e-05, "loss": 3.3913, "step": 820 }, { "epoch": 0.23802776990648908, "grad_norm": 1.3446273803710938, "learning_rate": 5.920679886685553e-05, "loss": 3.0604, "step": 840 }, { "epoch": 0.2436950977614055, "grad_norm": 2.301786422729492, "learning_rate": 6.0623229461756384e-05, "loss": 3.2805, "step": 860 }, { "epoch": 0.2493624256163219, "grad_norm": 1.1904113292694092, "learning_rate": 6.203966005665722e-05, "loss": 3.1956, "step": 880 }, { "epoch": 0.2550297534712383, "grad_norm": 1.6670551300048828, "learning_rate": 6.345609065155808e-05, "loss": 3.4157, "step": 900 }, { "epoch": 0.2606970813261547, "grad_norm": 1.5608646869659424, "learning_rate": 6.487252124645893e-05, "loss": 3.359, "step": 920 }, { "epoch": 0.2663644091810711, "grad_norm": 1.198926568031311, "learning_rate": 6.628895184135979e-05, "loss": 2.738, "step": 940 }, { "epoch": 0.27203173703598754, "grad_norm": 1.5176031589508057, "learning_rate": 6.770538243626063e-05, "loss": 3.01, "step": 960 }, { "epoch": 0.27769906489090396, "grad_norm": 1.9348669052124023, "learning_rate": 6.912181303116147e-05, "loss": 3.0666, "step": 980 }, { "epoch": 0.2833663927458204, "grad_norm": 1.7650913000106812, "learning_rate": 7.053824362606233e-05, "loss": 3.1139, "step": 1000 }, { "epoch": 0.2833663927458204, "eval_loss": 3.923851251602173, "eval_runtime": 2.837, "eval_samples_per_second": 35.248, "eval_steps_per_second": 8.812, "step": 1000 }, { "epoch": 0.28903372060073673, "grad_norm": 1.2686957120895386, "learning_rate": 7.195467422096318e-05, "loss": 3.0188, "step": 1020 }, { "epoch": 0.29470104845565315, "grad_norm": 1.4178904294967651, "learning_rate": 7.337110481586402e-05, "loss": 3.1594, "step": 1040 }, { "epoch": 0.30036837631056956, "grad_norm": 1.3633654117584229, "learning_rate": 7.478753541076488e-05, "loss": 2.8664, "step": 1060 }, { "epoch": 0.306035704165486, "grad_norm": 1.7646857500076294, "learning_rate": 7.620396600566573e-05, "loss": 3.072, "step": 1080 }, { "epoch": 0.3117030320204024, "grad_norm": 1.4823541641235352, "learning_rate": 7.762039660056658e-05, "loss": 3.2188, "step": 1100 }, { "epoch": 0.3173703598753188, "grad_norm": 2.0483739376068115, "learning_rate": 7.903682719546742e-05, "loss": 2.9581, "step": 1120 }, { "epoch": 0.32303768773023517, "grad_norm": 2.436291217803955, "learning_rate": 8.045325779036827e-05, "loss": 3.1419, "step": 1140 }, { "epoch": 0.3287050155851516, "grad_norm": 1.544060468673706, "learning_rate": 8.186968838526913e-05, "loss": 2.8801, "step": 1160 }, { "epoch": 0.334372343440068, "grad_norm": 1.2832497358322144, "learning_rate": 8.328611898016998e-05, "loss": 3.0091, "step": 1180 }, { "epoch": 0.3400396712949844, "grad_norm": 1.899031639099121, "learning_rate": 8.470254957507083e-05, "loss": 3.0946, "step": 1200 }, { "epoch": 0.34570699914990083, "grad_norm": 2.1353347301483154, "learning_rate": 8.611898016997168e-05, "loss": 3.1204, "step": 1220 }, { "epoch": 0.35137432700481724, "grad_norm": 2.685692548751831, "learning_rate": 8.753541076487252e-05, "loss": 3.3401, "step": 1240 }, { "epoch": 0.35704165485973366, "grad_norm": 1.8317824602127075, "learning_rate": 8.895184135977338e-05, "loss": 2.8771, "step": 1260 }, { "epoch": 0.36270898271465, "grad_norm": 1.755120873451233, "learning_rate": 9.036827195467422e-05, "loss": 2.9278, "step": 1280 }, { "epoch": 0.36837631056956643, "grad_norm": 1.3528960943222046, "learning_rate": 9.178470254957508e-05, "loss": 3.2048, "step": 1300 }, { "epoch": 0.37404363842448285, "grad_norm": 2.8512563705444336, "learning_rate": 9.320113314447593e-05, "loss": 2.9606, "step": 1320 }, { "epoch": 0.37971096627939926, "grad_norm": 2.7526137828826904, "learning_rate": 9.461756373937679e-05, "loss": 3.1115, "step": 1340 }, { "epoch": 0.3853782941343157, "grad_norm": 1.5162757635116577, "learning_rate": 9.603399433427761e-05, "loss": 2.7891, "step": 1360 }, { "epoch": 0.3910456219892321, "grad_norm": 1.395861029624939, "learning_rate": 9.745042492917847e-05, "loss": 2.8312, "step": 1380 }, { "epoch": 0.3967129498441485, "grad_norm": 2.0118582248687744, "learning_rate": 9.886685552407933e-05, "loss": 2.9076, "step": 1400 }, { "epoch": 0.40238027769906487, "grad_norm": 1.3605271577835083, "learning_rate": 0.00010028328611898017, "loss": 2.7994, "step": 1420 }, { "epoch": 0.4080476055539813, "grad_norm": 1.5742707252502441, "learning_rate": 0.00010169971671388104, "loss": 2.6921, "step": 1440 }, { "epoch": 0.4137149334088977, "grad_norm": 1.3698663711547852, "learning_rate": 0.00010311614730878188, "loss": 3.1178, "step": 1460 }, { "epoch": 0.4193822612638141, "grad_norm": 1.5976741313934326, "learning_rate": 0.00010453257790368272, "loss": 2.8757, "step": 1480 }, { "epoch": 0.42504958911873053, "grad_norm": 2.3305740356445312, "learning_rate": 0.00010594900849858358, "loss": 2.8722, "step": 1500 }, { "epoch": 0.42504958911873053, "eval_loss": 3.5559470653533936, "eval_runtime": 2.8729, "eval_samples_per_second": 34.808, "eval_steps_per_second": 8.702, "step": 1500 }, { "epoch": 0.43071691697364695, "grad_norm": 2.873483657836914, "learning_rate": 0.00010736543909348442, "loss": 2.9398, "step": 1520 }, { "epoch": 0.4363842448285633, "grad_norm": 2.218794584274292, "learning_rate": 0.00010878186968838529, "loss": 2.8192, "step": 1540 }, { "epoch": 0.4420515726834797, "grad_norm": 2.125523805618286, "learning_rate": 0.00011019830028328613, "loss": 2.9393, "step": 1560 }, { "epoch": 0.44771890053839614, "grad_norm": 1.6263399124145508, "learning_rate": 0.00011161473087818697, "loss": 2.9762, "step": 1580 }, { "epoch": 0.45338622839331255, "grad_norm": 2.533090591430664, "learning_rate": 0.00011303116147308783, "loss": 2.8722, "step": 1600 }, { "epoch": 0.45905355624822897, "grad_norm": 1.4439283609390259, "learning_rate": 0.00011444759206798867, "loss": 2.8747, "step": 1620 }, { "epoch": 0.4647208841031454, "grad_norm": 2.7112388610839844, "learning_rate": 0.00011586402266288951, "loss": 2.8902, "step": 1640 }, { "epoch": 0.4703882119580618, "grad_norm": 2.0887794494628906, "learning_rate": 0.00011728045325779038, "loss": 2.9718, "step": 1660 }, { "epoch": 0.47605553981297816, "grad_norm": 2.5012853145599365, "learning_rate": 0.00011869688385269122, "loss": 2.7605, "step": 1680 }, { "epoch": 0.48172286766789457, "grad_norm": 1.0283865928649902, "learning_rate": 0.00012011331444759209, "loss": 2.8882, "step": 1700 }, { "epoch": 0.487390195522811, "grad_norm": 1.7189027070999146, "learning_rate": 0.00012152974504249293, "loss": 2.6467, "step": 1720 }, { "epoch": 0.4930575233777274, "grad_norm": 1.732700228691101, "learning_rate": 0.00012294617563739376, "loss": 3.0613, "step": 1740 }, { "epoch": 0.4987248512326438, "grad_norm": 2.5366084575653076, "learning_rate": 0.00012436260623229463, "loss": 2.8189, "step": 1760 }, { "epoch": 0.5043921790875602, "grad_norm": 3.092099189758301, "learning_rate": 0.00012577903682719547, "loss": 2.899, "step": 1780 }, { "epoch": 0.5100595069424766, "grad_norm": 2.3340187072753906, "learning_rate": 0.0001271954674220963, "loss": 2.7357, "step": 1800 }, { "epoch": 0.515726834797393, "grad_norm": 1.4133331775665283, "learning_rate": 0.00012861189801699718, "loss": 2.6591, "step": 1820 }, { "epoch": 0.5213941626523094, "grad_norm": 2.181704044342041, "learning_rate": 0.00013002832861189802, "loss": 2.7129, "step": 1840 }, { "epoch": 0.5270614905072258, "grad_norm": 1.4123858213424683, "learning_rate": 0.0001314447592067989, "loss": 2.6462, "step": 1860 }, { "epoch": 0.5327288183621423, "grad_norm": 2.365875482559204, "learning_rate": 0.00013286118980169973, "loss": 2.7759, "step": 1880 }, { "epoch": 0.5383961462170587, "grad_norm": 1.6475216150283813, "learning_rate": 0.00013427762039660058, "loss": 2.8526, "step": 1900 }, { "epoch": 0.5440634740719751, "grad_norm": 3.0361626148223877, "learning_rate": 0.00013569405099150142, "loss": 3.0276, "step": 1920 }, { "epoch": 0.5497308019268915, "grad_norm": 2.4113242626190186, "learning_rate": 0.00013711048158640226, "loss": 2.9711, "step": 1940 }, { "epoch": 0.5553981297818079, "grad_norm": 3.097050666809082, "learning_rate": 0.0001385269121813031, "loss": 2.9887, "step": 1960 }, { "epoch": 0.5610654576367243, "grad_norm": 2.7279372215270996, "learning_rate": 0.00013994334277620397, "loss": 2.8313, "step": 1980 }, { "epoch": 0.5667327854916407, "grad_norm": 2.4262983798980713, "learning_rate": 0.0001413597733711048, "loss": 2.9281, "step": 2000 }, { "epoch": 0.5667327854916407, "eval_loss": 3.397294044494629, "eval_runtime": 2.8357, "eval_samples_per_second": 35.265, "eval_steps_per_second": 8.816, "step": 2000 }, { "epoch": 0.572400113346557, "grad_norm": 2.5475411415100098, "learning_rate": 0.00014277620396600566, "loss": 2.625, "step": 2020 }, { "epoch": 0.5780674412014735, "grad_norm": 3.0202033519744873, "learning_rate": 0.00014419263456090652, "loss": 2.674, "step": 2040 }, { "epoch": 0.5837347690563899, "grad_norm": 1.454906940460205, "learning_rate": 0.00014560906515580737, "loss": 2.9691, "step": 2060 }, { "epoch": 0.5894020969113063, "grad_norm": 2.606339693069458, "learning_rate": 0.00014702549575070824, "loss": 2.876, "step": 2080 }, { "epoch": 0.5950694247662227, "grad_norm": 2.5433061122894287, "learning_rate": 0.00014844192634560908, "loss": 3.0493, "step": 2100 }, { "epoch": 0.6007367526211391, "grad_norm": 1.5245689153671265, "learning_rate": 0.00014985835694050992, "loss": 2.6003, "step": 2120 }, { "epoch": 0.6064040804760555, "grad_norm": 3.1535794734954834, "learning_rate": 0.0001512747875354108, "loss": 2.8714, "step": 2140 }, { "epoch": 0.612071408330972, "grad_norm": 1.9213694334030151, "learning_rate": 0.00015269121813031163, "loss": 2.7927, "step": 2160 }, { "epoch": 0.6177387361858884, "grad_norm": 1.5795044898986816, "learning_rate": 0.00015410764872521247, "loss": 2.4769, "step": 2180 }, { "epoch": 0.6234060640408048, "grad_norm": 1.8888667821884155, "learning_rate": 0.00015552407932011331, "loss": 2.8295, "step": 2200 }, { "epoch": 0.6290733918957212, "grad_norm": 2.111983060836792, "learning_rate": 0.00015694050991501416, "loss": 2.8305, "step": 2220 }, { "epoch": 0.6347407197506376, "grad_norm": 3.1099205017089844, "learning_rate": 0.00015835694050991502, "loss": 2.5189, "step": 2240 }, { "epoch": 0.640408047605554, "grad_norm": 1.6185884475708008, "learning_rate": 0.00015977337110481587, "loss": 2.7794, "step": 2260 }, { "epoch": 0.6460753754604703, "grad_norm": 2.0744504928588867, "learning_rate": 0.0001611898016997167, "loss": 2.6947, "step": 2280 }, { "epoch": 0.6517427033153868, "grad_norm": 1.8444247245788574, "learning_rate": 0.00016260623229461758, "loss": 2.6664, "step": 2300 }, { "epoch": 0.6574100311703032, "grad_norm": 1.7946594953536987, "learning_rate": 0.00016402266288951842, "loss": 2.5474, "step": 2320 }, { "epoch": 0.6630773590252196, "grad_norm": 3.6139602661132812, "learning_rate": 0.00016543909348441926, "loss": 2.7576, "step": 2340 }, { "epoch": 0.668744686880136, "grad_norm": 2.6122708320617676, "learning_rate": 0.00016685552407932013, "loss": 2.7103, "step": 2360 }, { "epoch": 0.6744120147350524, "grad_norm": 1.617784857749939, "learning_rate": 0.00016827195467422097, "loss": 2.5282, "step": 2380 }, { "epoch": 0.6800793425899688, "grad_norm": 2.7513091564178467, "learning_rate": 0.00016968838526912184, "loss": 2.3312, "step": 2400 }, { "epoch": 0.6857466704448852, "grad_norm": 2.7580323219299316, "learning_rate": 0.00017110481586402268, "loss": 2.7012, "step": 2420 }, { "epoch": 0.6914139982998017, "grad_norm": 3.184919595718384, "learning_rate": 0.00017252124645892352, "loss": 2.8081, "step": 2440 }, { "epoch": 0.6970813261547181, "grad_norm": 2.8753271102905273, "learning_rate": 0.00017393767705382437, "loss": 2.5758, "step": 2460 }, { "epoch": 0.7027486540096345, "grad_norm": 2.1138670444488525, "learning_rate": 0.0001753541076487252, "loss": 2.6938, "step": 2480 }, { "epoch": 0.7084159818645509, "grad_norm": 1.2899034023284912, "learning_rate": 0.00017677053824362605, "loss": 2.6271, "step": 2500 }, { "epoch": 0.7084159818645509, "eval_loss": 3.2992186546325684, "eval_runtime": 2.8177, "eval_samples_per_second": 35.49, "eval_steps_per_second": 8.872, "step": 2500 }, { "epoch": 0.7140833097194673, "grad_norm": 2.5916409492492676, "learning_rate": 0.00017818696883852692, "loss": 2.6379, "step": 2520 }, { "epoch": 0.7197506375743836, "grad_norm": 2.547750949859619, "learning_rate": 0.00017960339943342776, "loss": 2.9224, "step": 2540 }, { "epoch": 0.7254179654293, "grad_norm": 1.6308832168579102, "learning_rate": 0.00018101983002832863, "loss": 2.8481, "step": 2560 }, { "epoch": 0.7310852932842165, "grad_norm": 2.038508892059326, "learning_rate": 0.00018243626062322947, "loss": 2.8581, "step": 2580 }, { "epoch": 0.7367526211391329, "grad_norm": 2.5229544639587402, "learning_rate": 0.00018385269121813031, "loss": 2.9051, "step": 2600 }, { "epoch": 0.7424199489940493, "grad_norm": 1.5403872728347778, "learning_rate": 0.00018526912181303118, "loss": 2.1375, "step": 2620 }, { "epoch": 0.7480872768489657, "grad_norm": 1.7945910692214966, "learning_rate": 0.00018668555240793203, "loss": 2.5773, "step": 2640 }, { "epoch": 0.7537546047038821, "grad_norm": 3.048215389251709, "learning_rate": 0.00018810198300283287, "loss": 2.9564, "step": 2660 }, { "epoch": 0.7594219325587985, "grad_norm": 2.5712270736694336, "learning_rate": 0.00018951841359773374, "loss": 2.5898, "step": 2680 }, { "epoch": 0.7650892604137149, "grad_norm": 2.2132937908172607, "learning_rate": 0.00019093484419263458, "loss": 2.8016, "step": 2700 }, { "epoch": 0.7707565882686314, "grad_norm": 2.118088483810425, "learning_rate": 0.00019235127478753542, "loss": 2.6996, "step": 2720 }, { "epoch": 0.7764239161235478, "grad_norm": 2.9355812072753906, "learning_rate": 0.00019376770538243626, "loss": 2.5367, "step": 2740 }, { "epoch": 0.7820912439784642, "grad_norm": 2.3586180210113525, "learning_rate": 0.0001951841359773371, "loss": 2.5093, "step": 2760 }, { "epoch": 0.7877585718333806, "grad_norm": 1.9104572534561157, "learning_rate": 0.00019660056657223797, "loss": 2.7953, "step": 2780 }, { "epoch": 0.793425899688297, "grad_norm": 2.2142138481140137, "learning_rate": 0.00019801699716713881, "loss": 2.8754, "step": 2800 }, { "epoch": 0.7990932275432133, "grad_norm": 1.410037875175476, "learning_rate": 0.00019943342776203966, "loss": 2.5827, "step": 2820 }, { "epoch": 0.8047605553981297, "grad_norm": 2.351311445236206, "learning_rate": 0.00019990554156171285, "loss": 2.6927, "step": 2840 }, { "epoch": 0.8104278832530462, "grad_norm": 1.872078537940979, "learning_rate": 0.00019974811083123427, "loss": 2.683, "step": 2860 }, { "epoch": 0.8160952111079626, "grad_norm": 2.698965072631836, "learning_rate": 0.00019959068010075567, "loss": 2.5791, "step": 2880 }, { "epoch": 0.821762538962879, "grad_norm": 3.219263792037964, "learning_rate": 0.00019943324937027709, "loss": 2.7265, "step": 2900 }, { "epoch": 0.8274298668177954, "grad_norm": 1.9284050464630127, "learning_rate": 0.0001992758186397985, "loss": 2.6708, "step": 2920 }, { "epoch": 0.8330971946727118, "grad_norm": 1.8650238513946533, "learning_rate": 0.00019911838790931993, "loss": 2.5391, "step": 2940 }, { "epoch": 0.8387645225276282, "grad_norm": 2.5646023750305176, "learning_rate": 0.00019896095717884132, "loss": 2.5239, "step": 2960 }, { "epoch": 0.8444318503825446, "grad_norm": 2.720078706741333, "learning_rate": 0.00019880352644836274, "loss": 2.8554, "step": 2980 }, { "epoch": 0.8500991782374611, "grad_norm": 1.6769999265670776, "learning_rate": 0.00019864609571788416, "loss": 2.6715, "step": 3000 }, { "epoch": 0.8500991782374611, "eval_loss": 3.206998825073242, "eval_runtime": 3.1674, "eval_samples_per_second": 31.571, "eval_steps_per_second": 7.893, "step": 3000 }, { "epoch": 0.8557665060923775, "grad_norm": 2.186739921569824, "learning_rate": 0.00019848866498740556, "loss": 2.2278, "step": 3020 }, { "epoch": 0.8614338339472939, "grad_norm": 2.36682391166687, "learning_rate": 0.00019833123425692698, "loss": 2.5699, "step": 3040 }, { "epoch": 0.8671011618022103, "grad_norm": 2.753265619277954, "learning_rate": 0.00019817380352644837, "loss": 2.8219, "step": 3060 }, { "epoch": 0.8727684896571266, "grad_norm": 3.0686233043670654, "learning_rate": 0.0001980163727959698, "loss": 2.5103, "step": 3080 }, { "epoch": 0.878435817512043, "grad_norm": 2.590656280517578, "learning_rate": 0.0001978589420654912, "loss": 2.7593, "step": 3100 }, { "epoch": 0.8841031453669594, "grad_norm": 1.642221212387085, "learning_rate": 0.0001977015113350126, "loss": 2.5822, "step": 3120 }, { "epoch": 0.8897704732218759, "grad_norm": 1.97707998752594, "learning_rate": 0.00019754408060453402, "loss": 2.7995, "step": 3140 }, { "epoch": 0.8954378010767923, "grad_norm": 1.8260983228683472, "learning_rate": 0.00019738664987405542, "loss": 2.6908, "step": 3160 }, { "epoch": 0.9011051289317087, "grad_norm": 1.9206418991088867, "learning_rate": 0.00019722921914357684, "loss": 2.7466, "step": 3180 }, { "epoch": 0.9067724567866251, "grad_norm": 2.0730175971984863, "learning_rate": 0.00019707178841309826, "loss": 2.7414, "step": 3200 }, { "epoch": 0.9124397846415415, "grad_norm": 2.5340662002563477, "learning_rate": 0.00019691435768261965, "loss": 2.4917, "step": 3220 }, { "epoch": 0.9181071124964579, "grad_norm": 0.9857118725776672, "learning_rate": 0.00019675692695214107, "loss": 2.7394, "step": 3240 }, { "epoch": 0.9237744403513743, "grad_norm": 2.7024550437927246, "learning_rate": 0.00019659949622166247, "loss": 2.8429, "step": 3260 }, { "epoch": 0.9294417682062908, "grad_norm": 2.9400439262390137, "learning_rate": 0.0001964420654911839, "loss": 2.7125, "step": 3280 }, { "epoch": 0.9351090960612072, "grad_norm": 2.304840564727783, "learning_rate": 0.00019628463476070528, "loss": 2.7783, "step": 3300 }, { "epoch": 0.9407764239161236, "grad_norm": 2.6463370323181152, "learning_rate": 0.0001961272040302267, "loss": 2.7213, "step": 3320 }, { "epoch": 0.94644375177104, "grad_norm": 1.4277743101119995, "learning_rate": 0.00019596977329974812, "loss": 2.6145, "step": 3340 }, { "epoch": 0.9521110796259563, "grad_norm": 3.241283416748047, "learning_rate": 0.00019581234256926952, "loss": 2.4275, "step": 3360 }, { "epoch": 0.9577784074808727, "grad_norm": 2.0646631717681885, "learning_rate": 0.00019565491183879094, "loss": 2.629, "step": 3380 }, { "epoch": 0.9634457353357891, "grad_norm": 2.0683557987213135, "learning_rate": 0.00019549748110831233, "loss": 2.5786, "step": 3400 }, { "epoch": 0.9691130631907056, "grad_norm": 3.0151731967926025, "learning_rate": 0.00019534005037783375, "loss": 2.5653, "step": 3420 }, { "epoch": 0.974780391045622, "grad_norm": 2.5758419036865234, "learning_rate": 0.00019518261964735517, "loss": 2.7547, "step": 3440 }, { "epoch": 0.9804477189005384, "grad_norm": 2.408783435821533, "learning_rate": 0.00019502518891687656, "loss": 2.6636, "step": 3460 }, { "epoch": 0.9861150467554548, "grad_norm": 1.4672608375549316, "learning_rate": 0.00019486775818639799, "loss": 2.6821, "step": 3480 }, { "epoch": 0.9917823746103712, "grad_norm": 2.063401937484741, "learning_rate": 0.0001947103274559194, "loss": 2.568, "step": 3500 }, { "epoch": 0.9917823746103712, "eval_loss": 3.140152931213379, "eval_runtime": 3.2907, "eval_samples_per_second": 30.389, "eval_steps_per_second": 7.597, "step": 3500 }, { "epoch": 0.9974497024652876, "grad_norm": 2.9170329570770264, "learning_rate": 0.00019455289672544083, "loss": 2.303, "step": 3520 }, { "epoch": 1.003117030320204, "grad_norm": 2.1850082874298096, "learning_rate": 0.00019439546599496222, "loss": 2.41, "step": 3540 }, { "epoch": 1.0087843581751204, "grad_norm": 3.325085163116455, "learning_rate": 0.00019423803526448364, "loss": 2.5818, "step": 3560 }, { "epoch": 1.0144516860300368, "grad_norm": 2.6982274055480957, "learning_rate": 0.00019408060453400506, "loss": 2.6028, "step": 3580 }, { "epoch": 1.0201190138849532, "grad_norm": 1.3922134637832642, "learning_rate": 0.00019392317380352645, "loss": 2.4387, "step": 3600 }, { "epoch": 1.0257863417398696, "grad_norm": 1.5238608121871948, "learning_rate": 0.00019376574307304788, "loss": 2.1849, "step": 3620 }, { "epoch": 1.031453669594786, "grad_norm": 3.7361652851104736, "learning_rate": 0.0001936083123425693, "loss": 2.5195, "step": 3640 }, { "epoch": 1.0371209974497024, "grad_norm": 2.168543577194214, "learning_rate": 0.0001934508816120907, "loss": 2.5481, "step": 3660 }, { "epoch": 1.0427883253046188, "grad_norm": 3.507930040359497, "learning_rate": 0.0001932934508816121, "loss": 2.408, "step": 3680 }, { "epoch": 1.0484556531595353, "grad_norm": 2.137735366821289, "learning_rate": 0.0001931360201511335, "loss": 2.3248, "step": 3700 }, { "epoch": 1.0541229810144517, "grad_norm": 3.4405417442321777, "learning_rate": 0.00019297858942065492, "loss": 2.4033, "step": 3720 }, { "epoch": 1.059790308869368, "grad_norm": 1.8242233991622925, "learning_rate": 0.00019282115869017634, "loss": 2.5319, "step": 3740 }, { "epoch": 1.0654576367242845, "grad_norm": 2.793949604034424, "learning_rate": 0.00019266372795969774, "loss": 2.6715, "step": 3760 }, { "epoch": 1.071124964579201, "grad_norm": 2.859201192855835, "learning_rate": 0.00019250629722921916, "loss": 2.8222, "step": 3780 }, { "epoch": 1.0767922924341173, "grad_norm": 2.847801923751831, "learning_rate": 0.00019234886649874055, "loss": 2.7445, "step": 3800 }, { "epoch": 1.0824596202890338, "grad_norm": 2.263092517852783, "learning_rate": 0.00019219143576826197, "loss": 2.7047, "step": 3820 }, { "epoch": 1.0881269481439502, "grad_norm": 2.0762503147125244, "learning_rate": 0.0001920340050377834, "loss": 2.6632, "step": 3840 }, { "epoch": 1.0937942759988666, "grad_norm": 2.307384967803955, "learning_rate": 0.0001918765743073048, "loss": 2.2797, "step": 3860 }, { "epoch": 1.099461603853783, "grad_norm": 1.720149040222168, "learning_rate": 0.0001917191435768262, "loss": 2.5853, "step": 3880 }, { "epoch": 1.1051289317086994, "grad_norm": 3.2709243297576904, "learning_rate": 0.0001915617128463476, "loss": 2.8081, "step": 3900 }, { "epoch": 1.1107962595636158, "grad_norm": 2.0977277755737305, "learning_rate": 0.00019140428211586902, "loss": 2.4242, "step": 3920 }, { "epoch": 1.1164635874185322, "grad_norm": 2.695814371109009, "learning_rate": 0.00019124685138539044, "loss": 2.8308, "step": 3940 }, { "epoch": 1.1221309152734487, "grad_norm": 1.8322988748550415, "learning_rate": 0.00019108942065491184, "loss": 2.7928, "step": 3960 }, { "epoch": 1.1277982431283649, "grad_norm": 2.3744421005249023, "learning_rate": 0.00019093198992443326, "loss": 2.6975, "step": 3980 }, { "epoch": 1.1334655709832813, "grad_norm": 2.833454132080078, "learning_rate": 0.00019077455919395465, "loss": 2.5879, "step": 4000 }, { "epoch": 1.1334655709832813, "eval_loss": 3.082501173019409, "eval_runtime": 2.8091, "eval_samples_per_second": 35.599, "eval_steps_per_second": 8.9, "step": 4000 }, { "epoch": 1.1391328988381977, "grad_norm": 3.0977234840393066, "learning_rate": 0.00019061712846347607, "loss": 2.6304, "step": 4020 }, { "epoch": 1.144800226693114, "grad_norm": 2.1840627193450928, "learning_rate": 0.0001904596977329975, "loss": 2.4813, "step": 4040 }, { "epoch": 1.1504675545480305, "grad_norm": 2.9770548343658447, "learning_rate": 0.00019030226700251889, "loss": 2.6723, "step": 4060 }, { "epoch": 1.156134882402947, "grad_norm": 2.6563708782196045, "learning_rate": 0.0001901448362720403, "loss": 2.596, "step": 4080 }, { "epoch": 1.1618022102578633, "grad_norm": 3.5191991329193115, "learning_rate": 0.00018998740554156173, "loss": 2.5523, "step": 4100 }, { "epoch": 1.1674695381127798, "grad_norm": 3.183046817779541, "learning_rate": 0.00018982997481108312, "loss": 2.5057, "step": 4120 }, { "epoch": 1.1731368659676962, "grad_norm": 1.4318444728851318, "learning_rate": 0.00018967254408060454, "loss": 2.1762, "step": 4140 }, { "epoch": 1.1788041938226126, "grad_norm": 1.7054232358932495, "learning_rate": 0.00018951511335012596, "loss": 2.417, "step": 4160 }, { "epoch": 1.184471521677529, "grad_norm": 2.96807861328125, "learning_rate": 0.00018935768261964738, "loss": 2.7116, "step": 4180 }, { "epoch": 1.1901388495324454, "grad_norm": 1.3166216611862183, "learning_rate": 0.00018920025188916878, "loss": 1.9508, "step": 4200 }, { "epoch": 1.1958061773873618, "grad_norm": 1.5125080347061157, "learning_rate": 0.0001890428211586902, "loss": 2.5453, "step": 4220 }, { "epoch": 1.2014735052422783, "grad_norm": 3.00634765625, "learning_rate": 0.00018888539042821162, "loss": 2.4392, "step": 4240 }, { "epoch": 1.2071408330971947, "grad_norm": 3.5963969230651855, "learning_rate": 0.000188727959697733, "loss": 2.4767, "step": 4260 }, { "epoch": 1.212808160952111, "grad_norm": 2.389190673828125, "learning_rate": 0.00018857052896725443, "loss": 2.5892, "step": 4280 }, { "epoch": 1.2184754888070275, "grad_norm": 2.9884426593780518, "learning_rate": 0.00018841309823677582, "loss": 2.4698, "step": 4300 }, { "epoch": 1.224142816661944, "grad_norm": 2.3109359741210938, "learning_rate": 0.00018825566750629724, "loss": 2.2529, "step": 4320 }, { "epoch": 1.2298101445168603, "grad_norm": 2.0564658641815186, "learning_rate": 0.00018809823677581867, "loss": 2.361, "step": 4340 }, { "epoch": 1.2354774723717767, "grad_norm": 2.6753528118133545, "learning_rate": 0.00018794080604534006, "loss": 2.6806, "step": 4360 }, { "epoch": 1.2411448002266932, "grad_norm": 2.4879562854766846, "learning_rate": 0.00018778337531486148, "loss": 2.5959, "step": 4380 }, { "epoch": 1.2468121280816096, "grad_norm": 3.1283600330352783, "learning_rate": 0.00018762594458438287, "loss": 2.5376, "step": 4400 }, { "epoch": 1.252479455936526, "grad_norm": 2.438445568084717, "learning_rate": 0.0001874685138539043, "loss": 2.1858, "step": 4420 }, { "epoch": 1.2581467837914424, "grad_norm": 3.1226162910461426, "learning_rate": 0.00018731108312342571, "loss": 2.6253, "step": 4440 }, { "epoch": 1.2638141116463588, "grad_norm": 2.275535821914673, "learning_rate": 0.0001871536523929471, "loss": 2.3781, "step": 4460 }, { "epoch": 1.2694814395012752, "grad_norm": 1.8061625957489014, "learning_rate": 0.00018699622166246853, "loss": 2.6929, "step": 4480 }, { "epoch": 1.2751487673561916, "grad_norm": 1.3811522722244263, "learning_rate": 0.00018683879093198992, "loss": 2.5407, "step": 4500 }, { "epoch": 1.2751487673561916, "eval_loss": 3.0039620399475098, "eval_runtime": 2.811, "eval_samples_per_second": 35.575, "eval_steps_per_second": 8.894, "step": 4500 }, { "epoch": 1.280816095211108, "grad_norm": 2.472416877746582, "learning_rate": 0.00018668136020151134, "loss": 2.755, "step": 4520 }, { "epoch": 1.2864834230660245, "grad_norm": 2.005464553833008, "learning_rate": 0.00018652392947103276, "loss": 2.3747, "step": 4540 }, { "epoch": 1.292150750920941, "grad_norm": 2.9778473377227783, "learning_rate": 0.00018636649874055416, "loss": 2.5926, "step": 4560 }, { "epoch": 1.2978180787758573, "grad_norm": 1.713280200958252, "learning_rate": 0.00018620906801007558, "loss": 2.2695, "step": 4580 }, { "epoch": 1.3034854066307737, "grad_norm": 1.985217571258545, "learning_rate": 0.00018605163727959697, "loss": 2.308, "step": 4600 }, { "epoch": 1.30915273448569, "grad_norm": 2.0672178268432617, "learning_rate": 0.0001858942065491184, "loss": 2.6654, "step": 4620 }, { "epoch": 1.3148200623406063, "grad_norm": 2.3022706508636475, "learning_rate": 0.00018573677581863978, "loss": 2.3706, "step": 4640 }, { "epoch": 1.3204873901955227, "grad_norm": 2.4930179119110107, "learning_rate": 0.0001855793450881612, "loss": 2.4536, "step": 4660 }, { "epoch": 1.3261547180504392, "grad_norm": 3.0795810222625732, "learning_rate": 0.00018542191435768263, "loss": 2.2975, "step": 4680 }, { "epoch": 1.3318220459053556, "grad_norm": 2.407158851623535, "learning_rate": 0.00018526448362720405, "loss": 2.2909, "step": 4700 }, { "epoch": 1.337489373760272, "grad_norm": 2.1117422580718994, "learning_rate": 0.00018510705289672544, "loss": 2.2089, "step": 4720 }, { "epoch": 1.3431567016151884, "grad_norm": 1.7431004047393799, "learning_rate": 0.00018494962216624686, "loss": 2.76, "step": 4740 }, { "epoch": 1.3488240294701048, "grad_norm": 1.764187216758728, "learning_rate": 0.00018479219143576828, "loss": 2.3117, "step": 4760 }, { "epoch": 1.3544913573250212, "grad_norm": 2.6031908988952637, "learning_rate": 0.0001846347607052897, "loss": 2.563, "step": 4780 }, { "epoch": 1.3601586851799377, "grad_norm": 2.339071273803711, "learning_rate": 0.0001844773299748111, "loss": 2.6606, "step": 4800 }, { "epoch": 1.365826013034854, "grad_norm": 3.6041977405548096, "learning_rate": 0.00018431989924433252, "loss": 2.3476, "step": 4820 }, { "epoch": 1.3714933408897705, "grad_norm": 2.6601414680480957, "learning_rate": 0.0001841624685138539, "loss": 2.7594, "step": 4840 }, { "epoch": 1.377160668744687, "grad_norm": 2.275899648666382, "learning_rate": 0.00018400503778337533, "loss": 2.5455, "step": 4860 }, { "epoch": 1.3828279965996033, "grad_norm": 2.0911641120910645, "learning_rate": 0.00018384760705289675, "loss": 2.3002, "step": 4880 }, { "epoch": 1.3884953244545197, "grad_norm": 2.887432336807251, "learning_rate": 0.00018369017632241814, "loss": 2.5323, "step": 4900 }, { "epoch": 1.3941626523094361, "grad_norm": 2.071936845779419, "learning_rate": 0.00018353274559193956, "loss": 2.3508, "step": 4920 }, { "epoch": 1.3998299801643526, "grad_norm": 2.114422082901001, "learning_rate": 0.00018337531486146096, "loss": 2.9467, "step": 4940 }, { "epoch": 1.405497308019269, "grad_norm": 2.6056251525878906, "learning_rate": 0.00018321788413098238, "loss": 2.3436, "step": 4960 }, { "epoch": 1.4111646358741854, "grad_norm": 1.4599838256835938, "learning_rate": 0.0001830604534005038, "loss": 2.5206, "step": 4980 }, { "epoch": 1.4168319637291018, "grad_norm": 1.7601889371871948, "learning_rate": 0.0001829030226700252, "loss": 2.2828, "step": 5000 }, { "epoch": 1.4168319637291018, "eval_loss": 2.9546473026275635, "eval_runtime": 2.812, "eval_samples_per_second": 35.562, "eval_steps_per_second": 8.891, "step": 5000 }, { "epoch": 1.422499291584018, "grad_norm": 2.730172872543335, "learning_rate": 0.00018274559193954661, "loss": 2.4088, "step": 5020 }, { "epoch": 1.4281666194389344, "grad_norm": 2.8962032794952393, "learning_rate": 0.000182588161209068, "loss": 2.2587, "step": 5040 }, { "epoch": 1.4338339472938508, "grad_norm": 1.6010209321975708, "learning_rate": 0.00018243073047858943, "loss": 2.3285, "step": 5060 }, { "epoch": 1.4395012751487672, "grad_norm": 3.1424672603607178, "learning_rate": 0.00018227329974811085, "loss": 2.5568, "step": 5080 }, { "epoch": 1.4451686030036837, "grad_norm": 1.256752848625183, "learning_rate": 0.00018211586901763224, "loss": 2.3939, "step": 5100 }, { "epoch": 1.4508359308586, "grad_norm": 2.554417610168457, "learning_rate": 0.00018195843828715366, "loss": 2.6174, "step": 5120 }, { "epoch": 1.4565032587135165, "grad_norm": 4.178256988525391, "learning_rate": 0.00018180100755667506, "loss": 2.1799, "step": 5140 }, { "epoch": 1.462170586568433, "grad_norm": 2.5059776306152344, "learning_rate": 0.00018164357682619648, "loss": 2.6181, "step": 5160 }, { "epoch": 1.4678379144233493, "grad_norm": 2.0199661254882812, "learning_rate": 0.0001814861460957179, "loss": 2.3318, "step": 5180 }, { "epoch": 1.4735052422782657, "grad_norm": 2.8674845695495605, "learning_rate": 0.0001813287153652393, "loss": 2.6139, "step": 5200 }, { "epoch": 1.4791725701331822, "grad_norm": 1.975320816040039, "learning_rate": 0.0001811712846347607, "loss": 2.4629, "step": 5220 }, { "epoch": 1.4848398979880986, "grad_norm": 2.204007387161255, "learning_rate": 0.0001810138539042821, "loss": 2.2157, "step": 5240 }, { "epoch": 1.490507225843015, "grad_norm": 1.6304585933685303, "learning_rate": 0.00018085642317380353, "loss": 2.6917, "step": 5260 }, { "epoch": 1.4961745536979314, "grad_norm": 2.1824429035186768, "learning_rate": 0.00018069899244332495, "loss": 2.135, "step": 5280 }, { "epoch": 1.5018418815528478, "grad_norm": 1.1885401010513306, "learning_rate": 0.00018054156171284634, "loss": 2.333, "step": 5300 }, { "epoch": 1.5075092094077642, "grad_norm": 2.4040002822875977, "learning_rate": 0.00018038413098236776, "loss": 2.4598, "step": 5320 }, { "epoch": 1.5131765372626806, "grad_norm": 2.6888046264648438, "learning_rate": 0.00018022670025188918, "loss": 2.3256, "step": 5340 }, { "epoch": 1.518843865117597, "grad_norm": 1.4562530517578125, "learning_rate": 0.0001800692695214106, "loss": 2.26, "step": 5360 }, { "epoch": 1.5245111929725135, "grad_norm": 1.6198939085006714, "learning_rate": 0.000179911838790932, "loss": 2.4967, "step": 5380 }, { "epoch": 1.5301785208274299, "grad_norm": 2.4001379013061523, "learning_rate": 0.00017975440806045342, "loss": 2.2851, "step": 5400 }, { "epoch": 1.5358458486823463, "grad_norm": 2.653404951095581, "learning_rate": 0.00017959697732997484, "loss": 2.5699, "step": 5420 }, { "epoch": 1.5415131765372627, "grad_norm": 1.9571356773376465, "learning_rate": 0.00017943954659949623, "loss": 2.8018, "step": 5440 }, { "epoch": 1.5471805043921791, "grad_norm": 2.1998708248138428, "learning_rate": 0.00017928211586901765, "loss": 2.6489, "step": 5460 }, { "epoch": 1.5528478322470955, "grad_norm": 2.253659963607788, "learning_rate": 0.00017912468513853907, "loss": 2.4334, "step": 5480 }, { "epoch": 1.558515160102012, "grad_norm": 2.565180778503418, "learning_rate": 0.00017896725440806046, "loss": 2.7018, "step": 5500 }, { "epoch": 1.558515160102012, "eval_loss": 2.929788827896118, "eval_runtime": 2.7986, "eval_samples_per_second": 35.732, "eval_steps_per_second": 8.933, "step": 5500 }, { "epoch": 1.5641824879569284, "grad_norm": 1.656478762626648, "learning_rate": 0.00017880982367758189, "loss": 2.4188, "step": 5520 }, { "epoch": 1.5698498158118448, "grad_norm": 2.0417423248291016, "learning_rate": 0.00017865239294710328, "loss": 2.8783, "step": 5540 }, { "epoch": 1.5755171436667612, "grad_norm": 2.0403895378112793, "learning_rate": 0.0001784949622166247, "loss": 2.2929, "step": 5560 }, { "epoch": 1.5811844715216776, "grad_norm": 2.2081973552703857, "learning_rate": 0.00017833753148614612, "loss": 2.6353, "step": 5580 }, { "epoch": 1.586851799376594, "grad_norm": 2.2200376987457275, "learning_rate": 0.0001781801007556675, "loss": 2.7092, "step": 5600 }, { "epoch": 1.5925191272315105, "grad_norm": 3.160665273666382, "learning_rate": 0.00017802267002518893, "loss": 2.4224, "step": 5620 }, { "epoch": 1.5981864550864269, "grad_norm": 2.6303188800811768, "learning_rate": 0.00017786523929471033, "loss": 1.985, "step": 5640 }, { "epoch": 1.6038537829413433, "grad_norm": 2.074103355407715, "learning_rate": 0.00017770780856423175, "loss": 2.7368, "step": 5660 }, { "epoch": 1.6095211107962597, "grad_norm": 1.835572361946106, "learning_rate": 0.00017755037783375317, "loss": 2.6459, "step": 5680 }, { "epoch": 1.6151884386511761, "grad_norm": 2.1183230876922607, "learning_rate": 0.00017739294710327456, "loss": 2.7257, "step": 5700 }, { "epoch": 1.6208557665060925, "grad_norm": 2.378448486328125, "learning_rate": 0.00017723551637279598, "loss": 2.4204, "step": 5720 }, { "epoch": 1.626523094361009, "grad_norm": 2.3793344497680664, "learning_rate": 0.00017707808564231738, "loss": 2.2684, "step": 5740 }, { "epoch": 1.6321904222159251, "grad_norm": 2.266275644302368, "learning_rate": 0.0001769206549118388, "loss": 2.4719, "step": 5760 }, { "epoch": 1.6378577500708416, "grad_norm": 1.5334393978118896, "learning_rate": 0.00017676322418136022, "loss": 2.6475, "step": 5780 }, { "epoch": 1.643525077925758, "grad_norm": 2.405388593673706, "learning_rate": 0.0001766057934508816, "loss": 2.2758, "step": 5800 }, { "epoch": 1.6491924057806744, "grad_norm": 2.902848482131958, "learning_rate": 0.00017644836272040303, "loss": 2.2127, "step": 5820 }, { "epoch": 1.6548597336355908, "grad_norm": 2.4124269485473633, "learning_rate": 0.00017629093198992443, "loss": 2.5793, "step": 5840 }, { "epoch": 1.6605270614905072, "grad_norm": 2.064995527267456, "learning_rate": 0.00017613350125944585, "loss": 2.1336, "step": 5860 }, { "epoch": 1.6661943893454236, "grad_norm": 1.3856558799743652, "learning_rate": 0.00017597607052896724, "loss": 2.2451, "step": 5880 }, { "epoch": 1.67186171720034, "grad_norm": 1.9192875623703003, "learning_rate": 0.00017581863979848866, "loss": 2.4114, "step": 5900 }, { "epoch": 1.6775290450552565, "grad_norm": 2.169050693511963, "learning_rate": 0.00017566120906801008, "loss": 2.4051, "step": 5920 }, { "epoch": 1.6831963729101729, "grad_norm": 2.903210401535034, "learning_rate": 0.0001755037783375315, "loss": 2.5251, "step": 5940 }, { "epoch": 1.6888637007650893, "grad_norm": 2.5201339721679688, "learning_rate": 0.0001753463476070529, "loss": 2.5304, "step": 5960 }, { "epoch": 1.6945310286200057, "grad_norm": 1.8286657333374023, "learning_rate": 0.00017518891687657432, "loss": 2.4993, "step": 5980 }, { "epoch": 1.7001983564749221, "grad_norm": 1.8457602262496948, "learning_rate": 0.00017503148614609574, "loss": 2.3005, "step": 6000 }, { "epoch": 1.7001983564749221, "eval_loss": 2.894773006439209, "eval_runtime": 3.0659, "eval_samples_per_second": 32.617, "eval_steps_per_second": 8.154, "step": 6000 }, { "epoch": 1.7058656843298383, "grad_norm": 2.7101848125457764, "learning_rate": 0.00017487405541561716, "loss": 2.2347, "step": 6020 }, { "epoch": 1.7115330121847547, "grad_norm": 2.9463462829589844, "learning_rate": 0.00017471662468513855, "loss": 2.4693, "step": 6040 }, { "epoch": 1.7172003400396711, "grad_norm": 2.214611530303955, "learning_rate": 0.00017455919395465997, "loss": 2.543, "step": 6060 }, { "epoch": 1.7228676678945876, "grad_norm": 1.800302505493164, "learning_rate": 0.00017440176322418136, "loss": 2.3988, "step": 6080 }, { "epoch": 1.728534995749504, "grad_norm": 1.6168558597564697, "learning_rate": 0.00017424433249370278, "loss": 2.3067, "step": 6100 }, { "epoch": 1.7342023236044204, "grad_norm": 1.792503833770752, "learning_rate": 0.0001740869017632242, "loss": 2.4324, "step": 6120 }, { "epoch": 1.7398696514593368, "grad_norm": 3.236666440963745, "learning_rate": 0.0001739294710327456, "loss": 2.614, "step": 6140 }, { "epoch": 1.7455369793142532, "grad_norm": 2.673731803894043, "learning_rate": 0.00017377204030226702, "loss": 2.6046, "step": 6160 }, { "epoch": 1.7512043071691696, "grad_norm": 2.067209243774414, "learning_rate": 0.0001736146095717884, "loss": 2.4048, "step": 6180 }, { "epoch": 1.756871635024086, "grad_norm": 2.9088408946990967, "learning_rate": 0.00017345717884130983, "loss": 2.4252, "step": 6200 }, { "epoch": 1.7625389628790025, "grad_norm": 2.570803165435791, "learning_rate": 0.00017329974811083125, "loss": 2.4, "step": 6220 }, { "epoch": 1.7682062907339189, "grad_norm": 1.6601548194885254, "learning_rate": 0.00017314231738035265, "loss": 2.3388, "step": 6240 }, { "epoch": 1.7738736185888353, "grad_norm": 2.261650800704956, "learning_rate": 0.00017298488664987407, "loss": 2.2604, "step": 6260 }, { "epoch": 1.7795409464437517, "grad_norm": 2.1470768451690674, "learning_rate": 0.00017282745591939546, "loss": 2.4691, "step": 6280 }, { "epoch": 1.7852082742986681, "grad_norm": 2.5933737754821777, "learning_rate": 0.00017267002518891688, "loss": 2.4179, "step": 6300 }, { "epoch": 1.7908756021535845, "grad_norm": 2.708608627319336, "learning_rate": 0.0001725125944584383, "loss": 2.5905, "step": 6320 }, { "epoch": 1.796542930008501, "grad_norm": 2.0187718868255615, "learning_rate": 0.0001723551637279597, "loss": 2.3824, "step": 6340 }, { "epoch": 1.8022102578634174, "grad_norm": 3.0485661029815674, "learning_rate": 0.00017219773299748112, "loss": 2.3222, "step": 6360 }, { "epoch": 1.8078775857183338, "grad_norm": 1.4683558940887451, "learning_rate": 0.0001720403022670025, "loss": 2.4677, "step": 6380 }, { "epoch": 1.8135449135732502, "grad_norm": 2.09863018989563, "learning_rate": 0.00017188287153652393, "loss": 2.5618, "step": 6400 }, { "epoch": 1.8192122414281666, "grad_norm": 3.32715106010437, "learning_rate": 0.00017172544080604535, "loss": 2.4582, "step": 6420 }, { "epoch": 1.824879569283083, "grad_norm": 1.9082072973251343, "learning_rate": 0.00017156801007556675, "loss": 2.2556, "step": 6440 }, { "epoch": 1.8305468971379995, "grad_norm": 2.095249652862549, "learning_rate": 0.00017141057934508817, "loss": 2.4534, "step": 6460 }, { "epoch": 1.8362142249929159, "grad_norm": 2.7830159664154053, "learning_rate": 0.00017125314861460956, "loss": 2.5573, "step": 6480 }, { "epoch": 1.8418815528478323, "grad_norm": 2.5036823749542236, "learning_rate": 0.00017109571788413098, "loss": 2.7014, "step": 6500 }, { "epoch": 1.8418815528478323, "eval_loss": 2.901240825653076, "eval_runtime": 3.1036, "eval_samples_per_second": 32.221, "eval_steps_per_second": 8.055, "step": 6500 }, { "epoch": 1.8475488807027487, "grad_norm": 2.890636444091797, "learning_rate": 0.0001709382871536524, "loss": 2.4916, "step": 6520 }, { "epoch": 1.8532162085576651, "grad_norm": 2.786834239959717, "learning_rate": 0.0001707808564231738, "loss": 2.5189, "step": 6540 }, { "epoch": 1.8588835364125815, "grad_norm": 2.5919430255889893, "learning_rate": 0.00017062342569269521, "loss": 2.3139, "step": 6560 }, { "epoch": 1.864550864267498, "grad_norm": 1.7164280414581299, "learning_rate": 0.00017046599496221664, "loss": 2.0663, "step": 6580 }, { "epoch": 1.8702181921224144, "grad_norm": 2.5058701038360596, "learning_rate": 0.00017030856423173806, "loss": 2.5168, "step": 6600 }, { "epoch": 1.8758855199773308, "grad_norm": 1.7800853252410889, "learning_rate": 0.00017015113350125948, "loss": 2.4688, "step": 6620 }, { "epoch": 1.8815528478322472, "grad_norm": 2.9929161071777344, "learning_rate": 0.00016999370277078087, "loss": 2.4036, "step": 6640 }, { "epoch": 1.8872201756871636, "grad_norm": 1.7226547002792358, "learning_rate": 0.0001698362720403023, "loss": 2.2338, "step": 6660 }, { "epoch": 1.89288750354208, "grad_norm": 2.4759228229522705, "learning_rate": 0.00016967884130982368, "loss": 2.4128, "step": 6680 }, { "epoch": 1.8985548313969964, "grad_norm": 2.6356420516967773, "learning_rate": 0.0001695214105793451, "loss": 2.4733, "step": 6700 }, { "epoch": 1.9042221592519128, "grad_norm": 2.586310386657715, "learning_rate": 0.00016936397984886653, "loss": 2.3177, "step": 6720 }, { "epoch": 1.9098894871068293, "grad_norm": 1.6923245191574097, "learning_rate": 0.00016920654911838792, "loss": 2.4701, "step": 6740 }, { "epoch": 1.9155568149617457, "grad_norm": 1.8547301292419434, "learning_rate": 0.00016904911838790934, "loss": 2.4794, "step": 6760 }, { "epoch": 1.921224142816662, "grad_norm": 1.9403104782104492, "learning_rate": 0.00016889168765743073, "loss": 2.612, "step": 6780 }, { "epoch": 1.9268914706715785, "grad_norm": 2.039132833480835, "learning_rate": 0.00016873425692695215, "loss": 2.32, "step": 6800 }, { "epoch": 1.9325587985264947, "grad_norm": 2.709059238433838, "learning_rate": 0.00016857682619647357, "loss": 2.6412, "step": 6820 }, { "epoch": 1.9382261263814111, "grad_norm": 2.2215821743011475, "learning_rate": 0.00016841939546599497, "loss": 2.5025, "step": 6840 }, { "epoch": 1.9438934542363275, "grad_norm": 2.7609920501708984, "learning_rate": 0.0001682619647355164, "loss": 2.5107, "step": 6860 }, { "epoch": 1.949560782091244, "grad_norm": 2.312913656234741, "learning_rate": 0.00016810453400503778, "loss": 2.2898, "step": 6880 }, { "epoch": 1.9552281099461604, "grad_norm": 1.9355868101119995, "learning_rate": 0.0001679471032745592, "loss": 2.6632, "step": 6900 }, { "epoch": 1.9608954378010768, "grad_norm": 2.7121925354003906, "learning_rate": 0.00016778967254408062, "loss": 2.2489, "step": 6920 }, { "epoch": 1.9665627656559932, "grad_norm": 2.4646923542022705, "learning_rate": 0.00016763224181360202, "loss": 2.3332, "step": 6940 }, { "epoch": 1.9722300935109096, "grad_norm": 2.530965805053711, "learning_rate": 0.00016747481108312344, "loss": 2.5333, "step": 6960 }, { "epoch": 1.977897421365826, "grad_norm": 2.32324481010437, "learning_rate": 0.00016731738035264483, "loss": 2.5205, "step": 6980 }, { "epoch": 1.9835647492207424, "grad_norm": 2.5765111446380615, "learning_rate": 0.00016715994962216625, "loss": 2.2317, "step": 7000 }, { "epoch": 1.9835647492207424, "eval_loss": 2.843167781829834, "eval_runtime": 6.5825, "eval_samples_per_second": 15.192, "eval_steps_per_second": 3.798, "step": 7000 }, { "epoch": 1.9892320770756589, "grad_norm": 2.2172975540161133, "learning_rate": 0.00016700251889168767, "loss": 2.281, "step": 7020 }, { "epoch": 1.9948994049305753, "grad_norm": 2.260226011276245, "learning_rate": 0.00016684508816120907, "loss": 2.1909, "step": 7040 }, { "epoch": 2.0005667327854915, "grad_norm": 2.8310775756835938, "learning_rate": 0.00016668765743073049, "loss": 2.3458, "step": 7060 }, { "epoch": 2.006234060640408, "grad_norm": 2.1876490116119385, "learning_rate": 0.00016653022670025188, "loss": 2.4859, "step": 7080 }, { "epoch": 2.0119013884953243, "grad_norm": 2.004753351211548, "learning_rate": 0.0001663727959697733, "loss": 2.5462, "step": 7100 }, { "epoch": 2.0175687163502407, "grad_norm": 2.12479829788208, "learning_rate": 0.00016621536523929472, "loss": 2.2201, "step": 7120 }, { "epoch": 2.023236044205157, "grad_norm": 1.9358830451965332, "learning_rate": 0.00016605793450881611, "loss": 2.3316, "step": 7140 }, { "epoch": 2.0289033720600735, "grad_norm": 3.6592369079589844, "learning_rate": 0.00016590050377833753, "loss": 2.2719, "step": 7160 }, { "epoch": 2.03457069991499, "grad_norm": 3.23899507522583, "learning_rate": 0.00016574307304785896, "loss": 2.4928, "step": 7180 }, { "epoch": 2.0402380277699064, "grad_norm": 2.8704543113708496, "learning_rate": 0.00016558564231738038, "loss": 2.4024, "step": 7200 }, { "epoch": 2.045905355624823, "grad_norm": 3.071497917175293, "learning_rate": 0.00016542821158690177, "loss": 2.419, "step": 7220 }, { "epoch": 2.051572683479739, "grad_norm": 1.6562724113464355, "learning_rate": 0.0001652707808564232, "loss": 2.5883, "step": 7240 }, { "epoch": 2.0572400113346556, "grad_norm": 3.095526695251465, "learning_rate": 0.0001651133501259446, "loss": 2.3377, "step": 7260 }, { "epoch": 2.062907339189572, "grad_norm": 1.4811360836029053, "learning_rate": 0.000164955919395466, "loss": 2.3614, "step": 7280 }, { "epoch": 2.0685746670444884, "grad_norm": 2.6386337280273438, "learning_rate": 0.00016479848866498743, "loss": 2.4001, "step": 7300 }, { "epoch": 2.074241994899405, "grad_norm": 3.289548873901367, "learning_rate": 0.00016464105793450882, "loss": 2.0404, "step": 7320 }, { "epoch": 2.0799093227543213, "grad_norm": 2.309859037399292, "learning_rate": 0.00016448362720403024, "loss": 2.3378, "step": 7340 }, { "epoch": 2.0855766506092377, "grad_norm": 2.5595717430114746, "learning_rate": 0.00016432619647355166, "loss": 2.4865, "step": 7360 }, { "epoch": 2.091243978464154, "grad_norm": 2.9636921882629395, "learning_rate": 0.00016416876574307305, "loss": 2.4002, "step": 7380 }, { "epoch": 2.0969113063190705, "grad_norm": 1.472091794013977, "learning_rate": 0.00016401133501259447, "loss": 2.2417, "step": 7400 }, { "epoch": 2.102578634173987, "grad_norm": 2.373563051223755, "learning_rate": 0.00016385390428211587, "loss": 2.3798, "step": 7420 }, { "epoch": 2.1082459620289034, "grad_norm": 1.8071825504302979, "learning_rate": 0.0001636964735516373, "loss": 2.2917, "step": 7440 }, { "epoch": 2.1139132898838198, "grad_norm": 2.305704116821289, "learning_rate": 0.0001635390428211587, "loss": 2.5257, "step": 7460 }, { "epoch": 2.119580617738736, "grad_norm": 2.4652457237243652, "learning_rate": 0.0001633816120906801, "loss": 2.3234, "step": 7480 }, { "epoch": 2.1252479455936526, "grad_norm": 3.374314069747925, "learning_rate": 0.00016322418136020152, "loss": 2.427, "step": 7500 }, { "epoch": 2.1252479455936526, "eval_loss": 2.80009126663208, "eval_runtime": 3.5894, "eval_samples_per_second": 27.86, "eval_steps_per_second": 6.965, "step": 7500 }, { "epoch": 2.130915273448569, "grad_norm": 3.960911989212036, "learning_rate": 0.00016306675062972292, "loss": 2.4475, "step": 7520 }, { "epoch": 2.1365826013034854, "grad_norm": 3.5533838272094727, "learning_rate": 0.00016290931989924434, "loss": 2.267, "step": 7540 }, { "epoch": 2.142249929158402, "grad_norm": 2.5927839279174805, "learning_rate": 0.00016275188916876576, "loss": 2.2356, "step": 7560 }, { "epoch": 2.1479172570133183, "grad_norm": 2.005831003189087, "learning_rate": 0.00016259445843828715, "loss": 2.5994, "step": 7580 }, { "epoch": 2.1535845848682347, "grad_norm": 1.8276090621948242, "learning_rate": 0.00016243702770780857, "loss": 2.1273, "step": 7600 }, { "epoch": 2.159251912723151, "grad_norm": 1.7224339246749878, "learning_rate": 0.00016227959697732997, "loss": 2.4834, "step": 7620 }, { "epoch": 2.1649192405780675, "grad_norm": 2.510697603225708, "learning_rate": 0.00016212216624685139, "loss": 2.2681, "step": 7640 }, { "epoch": 2.170586568432984, "grad_norm": 3.134523868560791, "learning_rate": 0.0001619647355163728, "loss": 2.4116, "step": 7660 }, { "epoch": 2.1762538962879003, "grad_norm": 2.632368803024292, "learning_rate": 0.0001618073047858942, "loss": 2.0942, "step": 7680 }, { "epoch": 2.1819212241428168, "grad_norm": 1.838720679283142, "learning_rate": 0.00016164987405541562, "loss": 2.0806, "step": 7700 }, { "epoch": 2.187588551997733, "grad_norm": 2.647280216217041, "learning_rate": 0.00016149244332493701, "loss": 2.4678, "step": 7720 }, { "epoch": 2.1932558798526496, "grad_norm": 1.5983269214630127, "learning_rate": 0.00016133501259445843, "loss": 2.3756, "step": 7740 }, { "epoch": 2.198923207707566, "grad_norm": 2.021974802017212, "learning_rate": 0.00016117758186397986, "loss": 2.1492, "step": 7760 }, { "epoch": 2.2045905355624824, "grad_norm": 2.020425319671631, "learning_rate": 0.00016102015113350128, "loss": 2.2573, "step": 7780 }, { "epoch": 2.210257863417399, "grad_norm": 2.171834945678711, "learning_rate": 0.00016086272040302267, "loss": 2.3155, "step": 7800 }, { "epoch": 2.2159251912723152, "grad_norm": 1.1585094928741455, "learning_rate": 0.0001607052896725441, "loss": 2.1415, "step": 7820 }, { "epoch": 2.2215925191272317, "grad_norm": 2.547248601913452, "learning_rate": 0.0001605478589420655, "loss": 2.4409, "step": 7840 }, { "epoch": 2.227259846982148, "grad_norm": 3.079226493835449, "learning_rate": 0.00016039042821158693, "loss": 2.3609, "step": 7860 }, { "epoch": 2.2329271748370645, "grad_norm": 2.7051501274108887, "learning_rate": 0.00016023299748110832, "loss": 2.2932, "step": 7880 }, { "epoch": 2.238594502691981, "grad_norm": 1.9966986179351807, "learning_rate": 0.00016007556675062975, "loss": 2.2879, "step": 7900 }, { "epoch": 2.2442618305468973, "grad_norm": 3.7688839435577393, "learning_rate": 0.00015991813602015114, "loss": 2.4959, "step": 7920 }, { "epoch": 2.2499291584018137, "grad_norm": 3.3540143966674805, "learning_rate": 0.00015976070528967256, "loss": 2.3904, "step": 7940 }, { "epoch": 2.2555964862567297, "grad_norm": 1.2728419303894043, "learning_rate": 0.00015960327455919398, "loss": 2.0898, "step": 7960 }, { "epoch": 2.261263814111646, "grad_norm": 2.7562403678894043, "learning_rate": 0.00015944584382871537, "loss": 2.5521, "step": 7980 }, { "epoch": 2.2669311419665625, "grad_norm": 2.8068790435791016, "learning_rate": 0.0001592884130982368, "loss": 2.4855, "step": 8000 }, { "epoch": 2.2669311419665625, "eval_loss": 2.8120086193084717, "eval_runtime": 4.3219, "eval_samples_per_second": 23.138, "eval_steps_per_second": 5.785, "step": 8000 }, { "epoch": 2.272598469821479, "grad_norm": 4.780170917510986, "learning_rate": 0.0001591309823677582, "loss": 2.3433, "step": 8020 }, { "epoch": 2.2782657976763954, "grad_norm": 2.329892635345459, "learning_rate": 0.0001589735516372796, "loss": 2.2164, "step": 8040 }, { "epoch": 2.283933125531312, "grad_norm": 1.371971845626831, "learning_rate": 0.00015881612090680103, "loss": 2.4624, "step": 8060 }, { "epoch": 2.289600453386228, "grad_norm": 1.9944554567337036, "learning_rate": 0.00015865869017632242, "loss": 2.2497, "step": 8080 }, { "epoch": 2.2952677812411446, "grad_norm": 1.928144097328186, "learning_rate": 0.00015850125944584384, "loss": 2.2425, "step": 8100 }, { "epoch": 2.300935109096061, "grad_norm": 1.9011304378509521, "learning_rate": 0.00015834382871536524, "loss": 2.3302, "step": 8120 }, { "epoch": 2.3066024369509774, "grad_norm": 1.8293368816375732, "learning_rate": 0.00015818639798488666, "loss": 2.1843, "step": 8140 }, { "epoch": 2.312269764805894, "grad_norm": 2.58994460105896, "learning_rate": 0.00015802896725440808, "loss": 2.5307, "step": 8160 }, { "epoch": 2.3179370926608103, "grad_norm": 3.1964333057403564, "learning_rate": 0.0001578794080604534, "loss": 1.8921, "step": 8180 }, { "epoch": 2.3236044205157267, "grad_norm": 3.868084192276001, "learning_rate": 0.00015772197732997483, "loss": 2.0937, "step": 8200 }, { "epoch": 2.329271748370643, "grad_norm": 2.8976247310638428, "learning_rate": 0.00015756454659949623, "loss": 2.32, "step": 8220 }, { "epoch": 2.3349390762255595, "grad_norm": 1.8084183931350708, "learning_rate": 0.00015740711586901765, "loss": 2.1776, "step": 8240 }, { "epoch": 2.340606404080476, "grad_norm": 3.527627468109131, "learning_rate": 0.00015724968513853907, "loss": 2.4144, "step": 8260 }, { "epoch": 2.3462737319353923, "grad_norm": 3.086888313293457, "learning_rate": 0.00015709225440806046, "loss": 2.2749, "step": 8280 }, { "epoch": 2.3519410597903088, "grad_norm": 2.2123515605926514, "learning_rate": 0.00015693482367758188, "loss": 2.3915, "step": 8300 }, { "epoch": 2.357608387645225, "grad_norm": 3.4009289741516113, "learning_rate": 0.00015677739294710328, "loss": 2.4896, "step": 8320 }, { "epoch": 2.3632757155001416, "grad_norm": 2.6163032054901123, "learning_rate": 0.0001566199622166247, "loss": 2.5139, "step": 8340 }, { "epoch": 2.368943043355058, "grad_norm": 2.6936097145080566, "learning_rate": 0.00015646253148614612, "loss": 2.3655, "step": 8360 }, { "epoch": 2.3746103712099744, "grad_norm": 1.2325189113616943, "learning_rate": 0.00015631297229219145, "loss": 2.3071, "step": 8380 }, { "epoch": 2.380277699064891, "grad_norm": 2.385713577270508, "learning_rate": 0.00015615554156171287, "loss": 2.7953, "step": 8400 }, { "epoch": 2.3859450269198073, "grad_norm": 2.69376277923584, "learning_rate": 0.00015599811083123427, "loss": 2.3252, "step": 8420 }, { "epoch": 2.3916123547747237, "grad_norm": 2.358503580093384, "learning_rate": 0.00015584068010075569, "loss": 2.3178, "step": 8440 }, { "epoch": 2.39727968262964, "grad_norm": 2.2966880798339844, "learning_rate": 0.0001556832493702771, "loss": 2.339, "step": 8460 }, { "epoch": 2.4029470104845565, "grad_norm": 3.3230462074279785, "learning_rate": 0.0001555258186397985, "loss": 2.2583, "step": 8480 }, { "epoch": 2.408614338339473, "grad_norm": 2.6733531951904297, "learning_rate": 0.00015536838790931992, "loss": 2.2783, "step": 8500 }, { "epoch": 2.408614338339473, "eval_loss": 2.772428035736084, "eval_runtime": 7.4295, "eval_samples_per_second": 13.46, "eval_steps_per_second": 3.365, "step": 8500 }, { "epoch": 2.4142816661943893, "grad_norm": 2.7924015522003174, "learning_rate": 0.00015521095717884131, "loss": 2.428, "step": 8520 }, { "epoch": 2.4199489940493057, "grad_norm": 2.683727979660034, "learning_rate": 0.00015505352644836273, "loss": 2.2268, "step": 8540 }, { "epoch": 2.425616321904222, "grad_norm": 3.0768203735351562, "learning_rate": 0.00015490396725440807, "loss": 2.2842, "step": 8560 }, { "epoch": 2.4312836497591386, "grad_norm": 1.3317536115646362, "learning_rate": 0.00015474653652392946, "loss": 2.0874, "step": 8580 }, { "epoch": 2.436950977614055, "grad_norm": 1.4555238485336304, "learning_rate": 0.00015458910579345088, "loss": 2.2251, "step": 8600 }, { "epoch": 2.4426183054689714, "grad_norm": 1.6957465410232544, "learning_rate": 0.0001544316750629723, "loss": 2.2342, "step": 8620 }, { "epoch": 2.448285633323888, "grad_norm": 2.5949954986572266, "learning_rate": 0.00015427424433249372, "loss": 2.3822, "step": 8640 }, { "epoch": 2.4539529611788042, "grad_norm": 2.510690927505493, "learning_rate": 0.00015411681360201512, "loss": 2.2773, "step": 8660 }, { "epoch": 2.4596202890337207, "grad_norm": 1.623098611831665, "learning_rate": 0.00015395938287153654, "loss": 2.294, "step": 8680 }, { "epoch": 2.465287616888637, "grad_norm": 3.3008878231048584, "learning_rate": 0.00015380195214105796, "loss": 2.5224, "step": 8700 }, { "epoch": 2.4709549447435535, "grad_norm": 3.122823715209961, "learning_rate": 0.00015364452141057935, "loss": 2.0819, "step": 8720 }, { "epoch": 2.47662227259847, "grad_norm": 1.7081055641174316, "learning_rate": 0.00015348709068010077, "loss": 2.077, "step": 8740 }, { "epoch": 2.4822896004533863, "grad_norm": 2.4683761596679688, "learning_rate": 0.0001533296599496222, "loss": 2.5073, "step": 8760 }, { "epoch": 2.4879569283083027, "grad_norm": 2.476719617843628, "learning_rate": 0.0001531722292191436, "loss": 2.3643, "step": 8780 }, { "epoch": 2.493624256163219, "grad_norm": 3.2902543544769287, "learning_rate": 0.000153014798488665, "loss": 2.2025, "step": 8800 }, { "epoch": 2.4992915840181356, "grad_norm": 3.064850091934204, "learning_rate": 0.0001528573677581864, "loss": 2.2284, "step": 8820 }, { "epoch": 2.504958911873052, "grad_norm": 2.7081470489501953, "learning_rate": 0.00015269993702770782, "loss": 2.116, "step": 8840 }, { "epoch": 2.5106262397279684, "grad_norm": 3.4181480407714844, "learning_rate": 0.00015254250629722924, "loss": 2.4685, "step": 8860 }, { "epoch": 2.516293567582885, "grad_norm": 2.770113229751587, "learning_rate": 0.00015238507556675064, "loss": 2.5762, "step": 8880 }, { "epoch": 2.521960895437801, "grad_norm": 2.072331666946411, "learning_rate": 0.00015222764483627206, "loss": 2.3684, "step": 8900 }, { "epoch": 2.5276282232927176, "grad_norm": 2.7252044677734375, "learning_rate": 0.00015207021410579345, "loss": 2.7392, "step": 8920 }, { "epoch": 2.533295551147634, "grad_norm": 1.9399724006652832, "learning_rate": 0.00015191278337531487, "loss": 2.453, "step": 8940 }, { "epoch": 2.5389628790025505, "grad_norm": 1.663861632347107, "learning_rate": 0.00015175535264483626, "loss": 2.2369, "step": 8960 }, { "epoch": 2.544630206857467, "grad_norm": 1.6849429607391357, "learning_rate": 0.00015159792191435769, "loss": 2.349, "step": 8980 }, { "epoch": 2.5502975347123833, "grad_norm": 2.5210835933685303, "learning_rate": 0.0001514404911838791, "loss": 2.3276, "step": 9000 }, { "epoch": 2.5502975347123833, "eval_loss": 2.7718639373779297, "eval_runtime": 3.1522, "eval_samples_per_second": 31.724, "eval_steps_per_second": 7.931, "step": 9000 }, { "epoch": 2.5559648625672997, "grad_norm": 2.964818000793457, "learning_rate": 0.0001512830604534005, "loss": 2.339, "step": 9020 }, { "epoch": 2.561632190422216, "grad_norm": 1.9607985019683838, "learning_rate": 0.00015112562972292192, "loss": 2.1497, "step": 9040 }, { "epoch": 2.5672995182771325, "grad_norm": 2.008519411087036, "learning_rate": 0.00015096819899244331, "loss": 2.3334, "step": 9060 }, { "epoch": 2.572966846132049, "grad_norm": 2.5395472049713135, "learning_rate": 0.00015081076826196473, "loss": 2.158, "step": 9080 }, { "epoch": 2.5786341739869654, "grad_norm": 3.0644032955169678, "learning_rate": 0.00015065333753148616, "loss": 2.2819, "step": 9100 }, { "epoch": 2.584301501841882, "grad_norm": 2.361692428588867, "learning_rate": 0.00015049590680100755, "loss": 2.7494, "step": 9120 }, { "epoch": 2.589968829696798, "grad_norm": 2.7539889812469482, "learning_rate": 0.00015033847607052897, "loss": 2.3862, "step": 9140 }, { "epoch": 2.5956361575517146, "grad_norm": 2.18387508392334, "learning_rate": 0.00015018104534005036, "loss": 2.1919, "step": 9160 }, { "epoch": 2.601303485406631, "grad_norm": 2.041923999786377, "learning_rate": 0.00015002361460957178, "loss": 2.3131, "step": 9180 }, { "epoch": 2.6069708132615474, "grad_norm": 3.2729411125183105, "learning_rate": 0.0001498661838790932, "loss": 2.4391, "step": 9200 }, { "epoch": 2.612638141116464, "grad_norm": 2.0193276405334473, "learning_rate": 0.00014970875314861462, "loss": 2.2879, "step": 9220 }, { "epoch": 2.61830546897138, "grad_norm": 2.7329158782958984, "learning_rate": 0.00014955132241813602, "loss": 2.3964, "step": 9240 }, { "epoch": 2.6239727968262962, "grad_norm": 2.459596633911133, "learning_rate": 0.00014939389168765744, "loss": 2.3116, "step": 9260 }, { "epoch": 2.6296401246812127, "grad_norm": 3.155575752258301, "learning_rate": 0.00014923646095717886, "loss": 2.5256, "step": 9280 }, { "epoch": 2.635307452536129, "grad_norm": 2.145146131515503, "learning_rate": 0.00014907903022670028, "loss": 1.9398, "step": 9300 }, { "epoch": 2.6409747803910455, "grad_norm": 2.078341484069824, "learning_rate": 0.00014892159949622167, "loss": 2.2754, "step": 9320 }, { "epoch": 2.646642108245962, "grad_norm": 2.3469126224517822, "learning_rate": 0.0001487641687657431, "loss": 2.1669, "step": 9340 }, { "epoch": 2.6523094361008783, "grad_norm": 3.001664876937866, "learning_rate": 0.0001486067380352645, "loss": 2.4066, "step": 9360 }, { "epoch": 2.6579767639557947, "grad_norm": 1.9174195528030396, "learning_rate": 0.0001484493073047859, "loss": 2.5258, "step": 9380 }, { "epoch": 2.663644091810711, "grad_norm": 2.324711799621582, "learning_rate": 0.00014829187657430733, "loss": 2.2431, "step": 9400 }, { "epoch": 2.6693114196656276, "grad_norm": 2.3054568767547607, "learning_rate": 0.00014813444584382872, "loss": 2.0425, "step": 9420 }, { "epoch": 2.674978747520544, "grad_norm": 1.9291499853134155, "learning_rate": 0.00014797701511335014, "loss": 2.1825, "step": 9440 }, { "epoch": 2.6806460753754604, "grad_norm": 2.4812333583831787, "learning_rate": 0.00014781958438287154, "loss": 2.6085, "step": 9460 }, { "epoch": 2.686313403230377, "grad_norm": 2.806016683578491, "learning_rate": 0.00014766215365239296, "loss": 2.0694, "step": 9480 }, { "epoch": 2.6919807310852932, "grad_norm": 1.7358734607696533, "learning_rate": 0.00014750472292191438, "loss": 2.2858, "step": 9500 }, { "epoch": 2.6919807310852932, "eval_loss": 2.7128820419311523, "eval_runtime": 4.0958, "eval_samples_per_second": 24.415, "eval_steps_per_second": 6.104, "step": 9500 }, { "epoch": 2.6976480589402096, "grad_norm": 1.27308988571167, "learning_rate": 0.00014734729219143577, "loss": 2.2132, "step": 9520 }, { "epoch": 2.703315386795126, "grad_norm": 1.3460731506347656, "learning_rate": 0.0001471898614609572, "loss": 2.4391, "step": 9540 }, { "epoch": 2.7089827146500425, "grad_norm": 3.0129804611206055, "learning_rate": 0.00014704030226700253, "loss": 2.1847, "step": 9560 }, { "epoch": 2.714650042504959, "grad_norm": 2.450838804244995, "learning_rate": 0.00014688287153652395, "loss": 2.5963, "step": 9580 }, { "epoch": 2.7203173703598753, "grad_norm": 1.848183274269104, "learning_rate": 0.00014672544080604537, "loss": 2.1117, "step": 9600 }, { "epoch": 2.7259846982147917, "grad_norm": 2.603017807006836, "learning_rate": 0.00014656801007556676, "loss": 2.5114, "step": 9620 }, { "epoch": 2.731652026069708, "grad_norm": 2.8565850257873535, "learning_rate": 0.00014641057934508818, "loss": 2.259, "step": 9640 }, { "epoch": 2.7373193539246246, "grad_norm": 1.9264154434204102, "learning_rate": 0.00014625314861460958, "loss": 2.2323, "step": 9660 }, { "epoch": 2.742986681779541, "grad_norm": 1.793988823890686, "learning_rate": 0.000146095717884131, "loss": 2.099, "step": 9680 }, { "epoch": 2.7486540096344574, "grad_norm": 2.8806650638580322, "learning_rate": 0.00014593828715365242, "loss": 2.0755, "step": 9700 }, { "epoch": 2.754321337489374, "grad_norm": 2.6137123107910156, "learning_rate": 0.0001457808564231738, "loss": 2.098, "step": 9720 }, { "epoch": 2.75998866534429, "grad_norm": 2.005281925201416, "learning_rate": 0.00014562342569269523, "loss": 2.1729, "step": 9740 }, { "epoch": 2.7656559931992066, "grad_norm": 2.695561170578003, "learning_rate": 0.00014546599496221662, "loss": 2.2028, "step": 9760 }, { "epoch": 2.771323321054123, "grad_norm": 1.731743335723877, "learning_rate": 0.00014530856423173804, "loss": 2.2763, "step": 9780 }, { "epoch": 2.7769906489090395, "grad_norm": 2.4486076831817627, "learning_rate": 0.00014515113350125947, "loss": 2.2865, "step": 9800 }, { "epoch": 2.782657976763956, "grad_norm": 2.0332183837890625, "learning_rate": 0.00014499370277078086, "loss": 2.3494, "step": 9820 }, { "epoch": 2.7883253046188723, "grad_norm": 1.668196439743042, "learning_rate": 0.00014483627204030228, "loss": 2.3017, "step": 9840 }, { "epoch": 2.7939926324737887, "grad_norm": 1.4477896690368652, "learning_rate": 0.00014467884130982367, "loss": 1.9731, "step": 9860 }, { "epoch": 2.799659960328705, "grad_norm": 2.6575543880462646, "learning_rate": 0.0001445214105793451, "loss": 2.0661, "step": 9880 }, { "epoch": 2.8053272881836215, "grad_norm": 1.7935446500778198, "learning_rate": 0.00014436397984886651, "loss": 2.3243, "step": 9900 }, { "epoch": 2.810994616038538, "grad_norm": 3.4458916187286377, "learning_rate": 0.0001442065491183879, "loss": 2.3467, "step": 9920 }, { "epoch": 2.8166619438934544, "grad_norm": 2.8462555408477783, "learning_rate": 0.00014404911838790933, "loss": 2.0653, "step": 9940 }, { "epoch": 2.822329271748371, "grad_norm": 1.1852829456329346, "learning_rate": 0.00014389168765743072, "loss": 2.3831, "step": 9960 }, { "epoch": 2.827996599603287, "grad_norm": 1.437692403793335, "learning_rate": 0.00014373425692695214, "loss": 2.2512, "step": 9980 }, { "epoch": 2.8336639274582036, "grad_norm": 1.7776933908462524, "learning_rate": 0.00014357682619647356, "loss": 2.2861, "step": 10000 }, { "epoch": 2.8336639274582036, "eval_loss": 2.6975317001342773, "eval_runtime": 10.8128, "eval_samples_per_second": 9.248, "eval_steps_per_second": 2.312, "step": 10000 }, { "epoch": 2.83933125531312, "grad_norm": 1.8666365146636963, "learning_rate": 0.00014341939546599496, "loss": 2.3028, "step": 10020 }, { "epoch": 2.844998583168036, "grad_norm": 2.9218027591705322, "learning_rate": 0.00014326196473551638, "loss": 2.4184, "step": 10040 }, { "epoch": 2.8506659110229524, "grad_norm": 3.0971028804779053, "learning_rate": 0.00014310453400503777, "loss": 2.1829, "step": 10060 }, { "epoch": 2.856333238877869, "grad_norm": 3.2618703842163086, "learning_rate": 0.0001429471032745592, "loss": 2.2488, "step": 10080 }, { "epoch": 2.8620005667327852, "grad_norm": 3.1495819091796875, "learning_rate": 0.0001427896725440806, "loss": 2.2476, "step": 10100 }, { "epoch": 2.8676678945877017, "grad_norm": 2.2311253547668457, "learning_rate": 0.00014263224181360203, "loss": 2.2312, "step": 10120 }, { "epoch": 2.873335222442618, "grad_norm": 2.300086736679077, "learning_rate": 0.00014247481108312343, "loss": 2.1752, "step": 10140 }, { "epoch": 2.8790025502975345, "grad_norm": 2.2071080207824707, "learning_rate": 0.00014231738035264485, "loss": 2.0546, "step": 10160 }, { "epoch": 2.884669878152451, "grad_norm": 2.9841291904449463, "learning_rate": 0.00014215994962216627, "loss": 2.2974, "step": 10180 }, { "epoch": 2.8903372060073673, "grad_norm": 1.2842730283737183, "learning_rate": 0.0001420025188916877, "loss": 1.9927, "step": 10200 }, { "epoch": 2.8960045338622837, "grad_norm": 3.4435982704162598, "learning_rate": 0.00014184508816120908, "loss": 2.4676, "step": 10220 }, { "epoch": 2.9016718617172, "grad_norm": 2.5336999893188477, "learning_rate": 0.0001416876574307305, "loss": 2.0274, "step": 10240 }, { "epoch": 2.9073391895721166, "grad_norm": 2.202472448348999, "learning_rate": 0.0001415302267002519, "loss": 2.2237, "step": 10260 }, { "epoch": 2.913006517427033, "grad_norm": 1.7741717100143433, "learning_rate": 0.00014137279596977332, "loss": 2.2521, "step": 10280 }, { "epoch": 2.9186738452819494, "grad_norm": 1.7316256761550903, "learning_rate": 0.00014121536523929474, "loss": 1.9795, "step": 10300 }, { "epoch": 2.924341173136866, "grad_norm": 2.07324481010437, "learning_rate": 0.00014105793450881613, "loss": 2.6921, "step": 10320 }, { "epoch": 2.9300085009917822, "grad_norm": 2.8377280235290527, "learning_rate": 0.00014090050377833755, "loss": 2.189, "step": 10340 }, { "epoch": 2.9356758288466986, "grad_norm": 2.1482396125793457, "learning_rate": 0.00014074307304785894, "loss": 2.4991, "step": 10360 }, { "epoch": 2.941343156701615, "grad_norm": 3.158545732498169, "learning_rate": 0.00014058564231738036, "loss": 2.4547, "step": 10380 }, { "epoch": 2.9470104845565315, "grad_norm": 1.7537274360656738, "learning_rate": 0.00014042821158690176, "loss": 2.3203, "step": 10400 }, { "epoch": 2.952677812411448, "grad_norm": 5.593268394470215, "learning_rate": 0.00014027078085642318, "loss": 2.4789, "step": 10420 }, { "epoch": 2.9583451402663643, "grad_norm": 2.5846259593963623, "learning_rate": 0.0001401133501259446, "loss": 2.1533, "step": 10440 }, { "epoch": 2.9640124681212807, "grad_norm": 2.0195910930633545, "learning_rate": 0.000139955919395466, "loss": 2.242, "step": 10460 }, { "epoch": 2.969679795976197, "grad_norm": 2.2268331050872803, "learning_rate": 0.00013979848866498741, "loss": 2.2801, "step": 10480 }, { "epoch": 2.9753471238311135, "grad_norm": 2.5797696113586426, "learning_rate": 0.0001396410579345088, "loss": 2.2072, "step": 10500 }, { "epoch": 2.9753471238311135, "eval_loss": 2.7052924633026123, "eval_runtime": 3.5621, "eval_samples_per_second": 28.074, "eval_steps_per_second": 7.018, "step": 10500 }, { "epoch": 2.98101445168603, "grad_norm": 2.7205538749694824, "learning_rate": 0.00013948362720403023, "loss": 2.0927, "step": 10520 }, { "epoch": 2.9866817795409464, "grad_norm": 4.232125282287598, "learning_rate": 0.00013932619647355165, "loss": 2.2611, "step": 10540 }, { "epoch": 2.992349107395863, "grad_norm": 2.1806387901306152, "learning_rate": 0.00013916876574307304, "loss": 2.2259, "step": 10560 }, { "epoch": 2.998016435250779, "grad_norm": 2.7484304904937744, "learning_rate": 0.00013901133501259446, "loss": 2.18, "step": 10580 }, { "epoch": 3.0036837631056956, "grad_norm": 2.8674259185791016, "learning_rate": 0.00013885390428211586, "loss": 2.1953, "step": 10600 }, { "epoch": 3.009351090960612, "grad_norm": 2.3233556747436523, "learning_rate": 0.00013869647355163728, "loss": 2.2801, "step": 10620 }, { "epoch": 3.0150184188155285, "grad_norm": 1.2857000827789307, "learning_rate": 0.0001385390428211587, "loss": 2.2572, "step": 10640 }, { "epoch": 3.020685746670445, "grad_norm": 2.294248580932617, "learning_rate": 0.0001383816120906801, "loss": 2.3019, "step": 10660 }, { "epoch": 3.0263530745253613, "grad_norm": 2.3173625469207764, "learning_rate": 0.0001382241813602015, "loss": 2.1457, "step": 10680 }, { "epoch": 3.0320204023802777, "grad_norm": 2.690378189086914, "learning_rate": 0.00013806675062972293, "loss": 2.3286, "step": 10700 }, { "epoch": 3.037687730235194, "grad_norm": 2.656442403793335, "learning_rate": 0.00013790931989924433, "loss": 2.201, "step": 10720 }, { "epoch": 3.0433550580901105, "grad_norm": 1.2951269149780273, "learning_rate": 0.00013775188916876575, "loss": 2.1014, "step": 10740 }, { "epoch": 3.049022385945027, "grad_norm": 2.503994941711426, "learning_rate": 0.00013759445843828717, "loss": 2.2086, "step": 10760 }, { "epoch": 3.0546897137999434, "grad_norm": 2.891786813735962, "learning_rate": 0.0001374370277078086, "loss": 2.0352, "step": 10780 }, { "epoch": 3.0603570416548598, "grad_norm": 3.4484000205993652, "learning_rate": 0.00013727959697732998, "loss": 2.1159, "step": 10800 }, { "epoch": 3.066024369509776, "grad_norm": 1.8439151048660278, "learning_rate": 0.0001371221662468514, "loss": 2.2591, "step": 10820 }, { "epoch": 3.0716916973646926, "grad_norm": 3.0267269611358643, "learning_rate": 0.00013696473551637282, "loss": 2.0574, "step": 10840 }, { "epoch": 3.077359025219609, "grad_norm": 2.702254056930542, "learning_rate": 0.00013680730478589422, "loss": 2.0493, "step": 10860 }, { "epoch": 3.0830263530745254, "grad_norm": 2.5154123306274414, "learning_rate": 0.00013664987405541564, "loss": 2.1315, "step": 10880 }, { "epoch": 3.088693680929442, "grad_norm": 1.9899142980575562, "learning_rate": 0.00013649244332493703, "loss": 2.0117, "step": 10900 }, { "epoch": 3.0943610087843583, "grad_norm": 2.5216870307922363, "learning_rate": 0.00013633501259445845, "loss": 2.1419, "step": 10920 }, { "epoch": 3.1000283366392747, "grad_norm": 3.4561798572540283, "learning_rate": 0.00013617758186397987, "loss": 2.2051, "step": 10940 }, { "epoch": 3.105695664494191, "grad_norm": 2.064423084259033, "learning_rate": 0.00013602015113350126, "loss": 2.365, "step": 10960 }, { "epoch": 3.1113629923491075, "grad_norm": 3.3535256385803223, "learning_rate": 0.00013586272040302269, "loss": 2.268, "step": 10980 }, { "epoch": 3.117030320204024, "grad_norm": 3.017845869064331, "learning_rate": 0.00013570528967254408, "loss": 2.2347, "step": 11000 }, { "epoch": 3.117030320204024, "eval_loss": 2.670203924179077, "eval_runtime": 3.0918, "eval_samples_per_second": 32.344, "eval_steps_per_second": 8.086, "step": 11000 }, { "epoch": 3.1226976480589403, "grad_norm": 1.8534648418426514, "learning_rate": 0.0001355478589420655, "loss": 2.2692, "step": 11020 }, { "epoch": 3.1283649759138568, "grad_norm": 3.547301769256592, "learning_rate": 0.00013539042821158692, "loss": 2.4455, "step": 11040 }, { "epoch": 3.134032303768773, "grad_norm": 3.364941358566284, "learning_rate": 0.0001352329974811083, "loss": 2.0867, "step": 11060 }, { "epoch": 3.1396996316236896, "grad_norm": 3.1527462005615234, "learning_rate": 0.00013507556675062973, "loss": 1.9736, "step": 11080 }, { "epoch": 3.145366959478606, "grad_norm": 3.0659615993499756, "learning_rate": 0.00013491813602015113, "loss": 2.1167, "step": 11100 }, { "epoch": 3.1510342873335224, "grad_norm": 1.6643054485321045, "learning_rate": 0.00013476070528967255, "loss": 2.2127, "step": 11120 }, { "epoch": 3.156701615188439, "grad_norm": 2.6552040576934814, "learning_rate": 0.00013460327455919397, "loss": 2.2658, "step": 11140 }, { "epoch": 3.1623689430433553, "grad_norm": 2.5503036975860596, "learning_rate": 0.00013444584382871536, "loss": 2.1136, "step": 11160 }, { "epoch": 3.1680362708982717, "grad_norm": 3.4892609119415283, "learning_rate": 0.00013428841309823678, "loss": 2.1339, "step": 11180 }, { "epoch": 3.173703598753188, "grad_norm": 2.805483341217041, "learning_rate": 0.00013413098236775818, "loss": 2.3843, "step": 11200 }, { "epoch": 3.1793709266081045, "grad_norm": 2.721018075942993, "learning_rate": 0.0001339735516372796, "loss": 2.2013, "step": 11220 }, { "epoch": 3.185038254463021, "grad_norm": 3.563188076019287, "learning_rate": 0.00013381612090680102, "loss": 2.4074, "step": 11240 }, { "epoch": 3.1907055823179373, "grad_norm": 3.232921838760376, "learning_rate": 0.0001336586901763224, "loss": 2.3289, "step": 11260 }, { "epoch": 3.1963729101728533, "grad_norm": 3.2730469703674316, "learning_rate": 0.00013350125944584383, "loss": 2.2811, "step": 11280 }, { "epoch": 3.2020402380277697, "grad_norm": 2.766502857208252, "learning_rate": 0.00013334382871536523, "loss": 2.1264, "step": 11300 }, { "epoch": 3.207707565882686, "grad_norm": 3.2688405513763428, "learning_rate": 0.00013318639798488665, "loss": 1.7657, "step": 11320 }, { "epoch": 3.2133748937376025, "grad_norm": 3.611823320388794, "learning_rate": 0.00013302896725440807, "loss": 2.2011, "step": 11340 }, { "epoch": 3.219042221592519, "grad_norm": 4.2690043449401855, "learning_rate": 0.0001328715365239295, "loss": 2.1803, "step": 11360 }, { "epoch": 3.2247095494474354, "grad_norm": 3.7969610691070557, "learning_rate": 0.00013271410579345088, "loss": 2.3218, "step": 11380 }, { "epoch": 3.230376877302352, "grad_norm": 3.2149288654327393, "learning_rate": 0.0001325566750629723, "loss": 2.2775, "step": 11400 }, { "epoch": 3.236044205157268, "grad_norm": 2.155052423477173, "learning_rate": 0.00013239924433249372, "loss": 2.0636, "step": 11420 }, { "epoch": 3.2417115330121846, "grad_norm": 1.4378539323806763, "learning_rate": 0.00013224181360201514, "loss": 2.1442, "step": 11440 }, { "epoch": 3.247378860867101, "grad_norm": 2.4278650283813477, "learning_rate": 0.00013208438287153654, "loss": 2.2445, "step": 11460 }, { "epoch": 3.2530461887220175, "grad_norm": 1.5889227390289307, "learning_rate": 0.00013192695214105796, "loss": 2.4813, "step": 11480 }, { "epoch": 3.258713516576934, "grad_norm": 1.5460145473480225, "learning_rate": 0.00013176952141057935, "loss": 2.2345, "step": 11500 }, { "epoch": 3.258713516576934, "eval_loss": 2.663754940032959, "eval_runtime": 3.0942, "eval_samples_per_second": 32.318, "eval_steps_per_second": 8.08, "step": 11500 }, { "epoch": 3.2643808444318503, "grad_norm": 2.8473448753356934, "learning_rate": 0.00013161209068010077, "loss": 2.395, "step": 11520 }, { "epoch": 3.2700481722867667, "grad_norm": 3.2274537086486816, "learning_rate": 0.0001314546599496222, "loss": 2.1963, "step": 11540 }, { "epoch": 3.275715500141683, "grad_norm": 1.9147582054138184, "learning_rate": 0.00013129722921914358, "loss": 1.918, "step": 11560 }, { "epoch": 3.2813828279965995, "grad_norm": 2.901381731033325, "learning_rate": 0.000131139798488665, "loss": 2.0896, "step": 11580 }, { "epoch": 3.287050155851516, "grad_norm": 2.4441018104553223, "learning_rate": 0.0001309823677581864, "loss": 2.201, "step": 11600 }, { "epoch": 3.2927174837064324, "grad_norm": 2.505458116531372, "learning_rate": 0.00013082493702770782, "loss": 2.2869, "step": 11620 }, { "epoch": 3.2983848115613488, "grad_norm": 1.8611371517181396, "learning_rate": 0.00013066750629722924, "loss": 2.1395, "step": 11640 }, { "epoch": 3.304052139416265, "grad_norm": 2.0075771808624268, "learning_rate": 0.00013051007556675063, "loss": 2.3082, "step": 11660 }, { "epoch": 3.3097194672711816, "grad_norm": 2.1490814685821533, "learning_rate": 0.00013035264483627205, "loss": 2.1601, "step": 11680 }, { "epoch": 3.315386795126098, "grad_norm": 2.2391486167907715, "learning_rate": 0.00013019521410579345, "loss": 2.1604, "step": 11700 }, { "epoch": 3.3210541229810144, "grad_norm": 3.044732093811035, "learning_rate": 0.00013003778337531487, "loss": 2.0647, "step": 11720 }, { "epoch": 3.326721450835931, "grad_norm": 3.3200325965881348, "learning_rate": 0.00012988035264483626, "loss": 2.4201, "step": 11740 }, { "epoch": 3.3323887786908473, "grad_norm": 2.375093698501587, "learning_rate": 0.00012972292191435768, "loss": 2.1771, "step": 11760 }, { "epoch": 3.3380561065457637, "grad_norm": 2.0740296840667725, "learning_rate": 0.0001295654911838791, "loss": 2.4032, "step": 11780 }, { "epoch": 3.34372343440068, "grad_norm": 3.456479549407959, "learning_rate": 0.0001294080604534005, "loss": 2.0992, "step": 11800 }, { "epoch": 3.3493907622555965, "grad_norm": 1.6590676307678223, "learning_rate": 0.00012925062972292192, "loss": 2.1044, "step": 11820 }, { "epoch": 3.355058090110513, "grad_norm": 2.4259135723114014, "learning_rate": 0.0001290931989924433, "loss": 2.3095, "step": 11840 }, { "epoch": 3.3607254179654293, "grad_norm": 1.905967116355896, "learning_rate": 0.00012893576826196473, "loss": 2.0684, "step": 11860 }, { "epoch": 3.3663927458203458, "grad_norm": 2.3308262825012207, "learning_rate": 0.00012877833753148615, "loss": 2.0896, "step": 11880 }, { "epoch": 3.372060073675262, "grad_norm": 2.282622814178467, "learning_rate": 0.00012862090680100755, "loss": 2.057, "step": 11900 }, { "epoch": 3.3777274015301786, "grad_norm": 3.104706048965454, "learning_rate": 0.00012846347607052897, "loss": 2.2346, "step": 11920 }, { "epoch": 3.383394729385095, "grad_norm": 2.967670202255249, "learning_rate": 0.0001283060453400504, "loss": 2.3376, "step": 11940 }, { "epoch": 3.3890620572400114, "grad_norm": 2.635925054550171, "learning_rate": 0.00012814861460957178, "loss": 1.9609, "step": 11960 }, { "epoch": 3.394729385094928, "grad_norm": 1.7973943948745728, "learning_rate": 0.0001279911838790932, "loss": 2.3591, "step": 11980 }, { "epoch": 3.4003967129498442, "grad_norm": 2.018777370452881, "learning_rate": 0.00012783375314861462, "loss": 2.2779, "step": 12000 }, { "epoch": 3.4003967129498442, "eval_loss": 2.6338918209075928, "eval_runtime": 3.2055, "eval_samples_per_second": 31.197, "eval_steps_per_second": 7.799, "step": 12000 }, { "epoch": 3.4060640408047607, "grad_norm": 2.2112374305725098, "learning_rate": 0.00012767632241813604, "loss": 2.3438, "step": 12020 }, { "epoch": 3.411731368659677, "grad_norm": 2.0568792819976807, "learning_rate": 0.00012751889168765744, "loss": 2.2832, "step": 12040 }, { "epoch": 3.4173986965145935, "grad_norm": 2.10998272895813, "learning_rate": 0.00012736146095717886, "loss": 2.2511, "step": 12060 }, { "epoch": 3.42306602436951, "grad_norm": 2.375075578689575, "learning_rate": 0.00012720403022670028, "loss": 1.9322, "step": 12080 }, { "epoch": 3.4287333522244263, "grad_norm": 2.3977253437042236, "learning_rate": 0.00012704659949622167, "loss": 2.4655, "step": 12100 }, { "epoch": 3.4344006800793427, "grad_norm": 2.7915639877319336, "learning_rate": 0.0001268891687657431, "loss": 2.1597, "step": 12120 }, { "epoch": 3.440068007934259, "grad_norm": 4.2611083984375, "learning_rate": 0.00012673173803526448, "loss": 2.146, "step": 12140 }, { "epoch": 3.4457353357891756, "grad_norm": 2.905327081680298, "learning_rate": 0.0001265743073047859, "loss": 2.455, "step": 12160 }, { "epoch": 3.451402663644092, "grad_norm": 2.084442377090454, "learning_rate": 0.00012641687657430733, "loss": 2.1691, "step": 12180 }, { "epoch": 3.4570699914990084, "grad_norm": 1.7104185819625854, "learning_rate": 0.00012625944584382872, "loss": 2.1144, "step": 12200 }, { "epoch": 3.462737319353925, "grad_norm": 3.050873041152954, "learning_rate": 0.00012610201511335014, "loss": 2.5539, "step": 12220 }, { "epoch": 3.468404647208841, "grad_norm": 1.6890558004379272, "learning_rate": 0.00012594458438287153, "loss": 2.2011, "step": 12240 }, { "epoch": 3.474071975063757, "grad_norm": 2.142890453338623, "learning_rate": 0.00012578715365239295, "loss": 2.0144, "step": 12260 }, { "epoch": 3.4797393029186736, "grad_norm": 2.4429149627685547, "learning_rate": 0.00012562972292191437, "loss": 2.3213, "step": 12280 }, { "epoch": 3.48540663077359, "grad_norm": 2.7614898681640625, "learning_rate": 0.00012547229219143577, "loss": 2.2432, "step": 12300 }, { "epoch": 3.4910739586285064, "grad_norm": 3.28320050239563, "learning_rate": 0.0001253148614609572, "loss": 2.5542, "step": 12320 }, { "epoch": 3.496741286483423, "grad_norm": 3.7955667972564697, "learning_rate": 0.00012515743073047858, "loss": 2.3167, "step": 12340 }, { "epoch": 3.5024086143383393, "grad_norm": 2.541165590286255, "learning_rate": 0.000125, "loss": 1.951, "step": 12360 }, { "epoch": 3.5080759421932557, "grad_norm": 2.374260663986206, "learning_rate": 0.00012484256926952142, "loss": 2.3104, "step": 12380 }, { "epoch": 3.513743270048172, "grad_norm": 1.950270175933838, "learning_rate": 0.00012468513853904282, "loss": 1.9754, "step": 12400 }, { "epoch": 3.5194105979030885, "grad_norm": 5.4043803215026855, "learning_rate": 0.00012452770780856424, "loss": 2.0379, "step": 12420 }, { "epoch": 3.525077925758005, "grad_norm": 1.534547209739685, "learning_rate": 0.00012437027707808563, "loss": 2.5231, "step": 12440 }, { "epoch": 3.5307452536129214, "grad_norm": 3.965202569961548, "learning_rate": 0.00012421284634760705, "loss": 2.5186, "step": 12460 }, { "epoch": 3.5364125814678378, "grad_norm": 2.628424882888794, "learning_rate": 0.00012405541561712847, "loss": 2.2714, "step": 12480 }, { "epoch": 3.542079909322754, "grad_norm": 2.8518669605255127, "learning_rate": 0.00012389798488664987, "loss": 2.3317, "step": 12500 }, { "epoch": 3.542079909322754, "eval_loss": 2.6055610179901123, "eval_runtime": 3.0536, "eval_samples_per_second": 32.748, "eval_steps_per_second": 8.187, "step": 12500 }, { "epoch": 3.5477472371776706, "grad_norm": 3.049640417098999, "learning_rate": 0.00012374055415617129, "loss": 1.9312, "step": 12520 }, { "epoch": 3.553414565032587, "grad_norm": 2.924109697341919, "learning_rate": 0.00012358312342569268, "loss": 2.3946, "step": 12540 }, { "epoch": 3.5590818928875034, "grad_norm": 2.830845594406128, "learning_rate": 0.0001234256926952141, "loss": 2.2212, "step": 12560 }, { "epoch": 3.56474922074242, "grad_norm": 3.538621187210083, "learning_rate": 0.00012326826196473552, "loss": 2.4107, "step": 12580 }, { "epoch": 3.5704165485973363, "grad_norm": 3.16609263420105, "learning_rate": 0.00012311083123425694, "loss": 2.1467, "step": 12600 }, { "epoch": 3.5760838764522527, "grad_norm": 2.3178296089172363, "learning_rate": 0.00012295340050377836, "loss": 2.336, "step": 12620 }, { "epoch": 3.581751204307169, "grad_norm": 2.5632126331329346, "learning_rate": 0.00012279596977329976, "loss": 2.3049, "step": 12640 }, { "epoch": 3.5874185321620855, "grad_norm": 2.838292121887207, "learning_rate": 0.00012263853904282118, "loss": 2.1883, "step": 12660 }, { "epoch": 3.593085860017002, "grad_norm": 1.6278738975524902, "learning_rate": 0.0001224811083123426, "loss": 2.2006, "step": 12680 }, { "epoch": 3.5987531878719183, "grad_norm": 2.538614511489868, "learning_rate": 0.000122323677581864, "loss": 2.3342, "step": 12700 }, { "epoch": 3.6044205157268348, "grad_norm": 2.8967220783233643, "learning_rate": 0.0001221662468513854, "loss": 2.3723, "step": 12720 }, { "epoch": 3.610087843581751, "grad_norm": 2.9534173011779785, "learning_rate": 0.0001220088161209068, "loss": 2.2272, "step": 12740 }, { "epoch": 3.6157551714366676, "grad_norm": 2.1892776489257812, "learning_rate": 0.00012185138539042823, "loss": 1.9266, "step": 12760 }, { "epoch": 3.621422499291584, "grad_norm": 2.1670548915863037, "learning_rate": 0.00012169395465994963, "loss": 2.2557, "step": 12780 }, { "epoch": 3.6270898271465004, "grad_norm": 2.3003838062286377, "learning_rate": 0.00012153652392947104, "loss": 2.1665, "step": 12800 }, { "epoch": 3.632757155001417, "grad_norm": 2.6965646743774414, "learning_rate": 0.00012137909319899246, "loss": 2.505, "step": 12820 }, { "epoch": 3.6384244828563332, "grad_norm": 2.6316702365875244, "learning_rate": 0.00012122166246851385, "loss": 1.9523, "step": 12840 }, { "epoch": 3.6440918107112497, "grad_norm": 2.9587371349334717, "learning_rate": 0.00012106423173803527, "loss": 2.2881, "step": 12860 }, { "epoch": 3.649759138566166, "grad_norm": 2.0558106899261475, "learning_rate": 0.0001209068010075567, "loss": 2.3469, "step": 12880 }, { "epoch": 3.6554264664210825, "grad_norm": 1.7000762224197388, "learning_rate": 0.00012074937027707809, "loss": 2.2403, "step": 12900 }, { "epoch": 3.661093794275999, "grad_norm": 3.5249218940734863, "learning_rate": 0.00012059193954659951, "loss": 2.119, "step": 12920 }, { "epoch": 3.6667611221309153, "grad_norm": 2.5795509815216064, "learning_rate": 0.0001204345088161209, "loss": 2.419, "step": 12940 }, { "epoch": 3.6724284499858317, "grad_norm": 1.8098112344741821, "learning_rate": 0.00012027707808564232, "loss": 2.1803, "step": 12960 }, { "epoch": 3.678095777840748, "grad_norm": 3.674870252609253, "learning_rate": 0.00012011964735516374, "loss": 2.4885, "step": 12980 }, { "epoch": 3.6837631056956646, "grad_norm": 4.349696636199951, "learning_rate": 0.00011996221662468514, "loss": 2.4022, "step": 13000 }, { "epoch": 3.6837631056956646, "eval_loss": 2.5916082859039307, "eval_runtime": 3.2225, "eval_samples_per_second": 31.032, "eval_steps_per_second": 7.758, "step": 13000 }, { "epoch": 3.689430433550581, "grad_norm": 2.5092554092407227, "learning_rate": 0.00011980478589420656, "loss": 2.0367, "step": 13020 }, { "epoch": 3.6950977614054974, "grad_norm": 2.2241082191467285, "learning_rate": 0.00011964735516372796, "loss": 2.1224, "step": 13040 }, { "epoch": 3.700765089260414, "grad_norm": 2.800952196121216, "learning_rate": 0.00011948992443324937, "loss": 2.1977, "step": 13060 }, { "epoch": 3.7064324171153302, "grad_norm": 1.674391746520996, "learning_rate": 0.00011933249370277078, "loss": 2.1479, "step": 13080 }, { "epoch": 3.7120997449702466, "grad_norm": 2.520673990249634, "learning_rate": 0.0001191750629722922, "loss": 2.1496, "step": 13100 }, { "epoch": 3.717767072825163, "grad_norm": 1.8463191986083984, "learning_rate": 0.00011901763224181362, "loss": 2.2631, "step": 13120 }, { "epoch": 3.7234344006800795, "grad_norm": 1.7727094888687134, "learning_rate": 0.00011886020151133501, "loss": 2.0955, "step": 13140 }, { "epoch": 3.729101728534996, "grad_norm": 2.716247081756592, "learning_rate": 0.00011870277078085643, "loss": 2.0533, "step": 13160 }, { "epoch": 3.7347690563899123, "grad_norm": 1.6097841262817383, "learning_rate": 0.00011854534005037783, "loss": 1.9599, "step": 13180 }, { "epoch": 3.7404363842448287, "grad_norm": 1.8846412897109985, "learning_rate": 0.00011838790931989925, "loss": 2.314, "step": 13200 }, { "epoch": 3.746103712099745, "grad_norm": 2.424431085586548, "learning_rate": 0.00011823047858942067, "loss": 2.3608, "step": 13220 }, { "epoch": 3.7517710399546615, "grad_norm": 2.026026964187622, "learning_rate": 0.00011807304785894206, "loss": 2.0349, "step": 13240 }, { "epoch": 3.757438367809578, "grad_norm": 2.674293279647827, "learning_rate": 0.00011791561712846348, "loss": 2.3979, "step": 13260 }, { "epoch": 3.7631056956644944, "grad_norm": 2.256223201751709, "learning_rate": 0.00011775818639798488, "loss": 2.0402, "step": 13280 }, { "epoch": 3.768773023519411, "grad_norm": 2.7279646396636963, "learning_rate": 0.0001176007556675063, "loss": 2.0954, "step": 13300 }, { "epoch": 3.774440351374327, "grad_norm": 2.0750999450683594, "learning_rate": 0.00011744332493702772, "loss": 2.1507, "step": 13320 }, { "epoch": 3.7801076792292436, "grad_norm": 2.897549629211426, "learning_rate": 0.00011728589420654912, "loss": 2.2075, "step": 13340 }, { "epoch": 3.78577500708416, "grad_norm": 2.530906915664673, "learning_rate": 0.00011712846347607053, "loss": 2.1464, "step": 13360 }, { "epoch": 3.7914423349390765, "grad_norm": 2.195884943008423, "learning_rate": 0.00011697103274559194, "loss": 2.084, "step": 13380 }, { "epoch": 3.797109662793993, "grad_norm": 2.567448139190674, "learning_rate": 0.00011681360201511336, "loss": 2.2047, "step": 13400 }, { "epoch": 3.8027769906489093, "grad_norm": 2.219639778137207, "learning_rate": 0.00011665617128463478, "loss": 1.911, "step": 13420 }, { "epoch": 3.8084443185038257, "grad_norm": 2.55180025100708, "learning_rate": 0.00011649874055415617, "loss": 2.1766, "step": 13440 }, { "epoch": 3.814111646358742, "grad_norm": 3.532160997390747, "learning_rate": 0.0001163413098236776, "loss": 2.4511, "step": 13460 }, { "epoch": 3.8197789742136585, "grad_norm": 2.2325427532196045, "learning_rate": 0.00011618387909319899, "loss": 2.0139, "step": 13480 }, { "epoch": 3.825446302068575, "grad_norm": 2.796725273132324, "learning_rate": 0.00011602644836272041, "loss": 2.5501, "step": 13500 }, { "epoch": 3.825446302068575, "eval_loss": 2.5852210521698, "eval_runtime": 3.2181, "eval_samples_per_second": 31.074, "eval_steps_per_second": 7.769, "step": 13500 }, { "epoch": 3.831113629923491, "grad_norm": 2.608856201171875, "learning_rate": 0.00011586901763224183, "loss": 2.0729, "step": 13520 }, { "epoch": 3.8367809577784073, "grad_norm": 2.823665142059326, "learning_rate": 0.00011571158690176322, "loss": 2.1232, "step": 13540 }, { "epoch": 3.8424482856333237, "grad_norm": 3.031323194503784, "learning_rate": 0.00011555415617128464, "loss": 1.8223, "step": 13560 }, { "epoch": 3.84811561348824, "grad_norm": 3.018430233001709, "learning_rate": 0.00011539672544080604, "loss": 2.1297, "step": 13580 }, { "epoch": 3.8537829413431566, "grad_norm": 2.795938491821289, "learning_rate": 0.00011523929471032746, "loss": 2.3744, "step": 13600 }, { "epoch": 3.859450269198073, "grad_norm": 1.897302269935608, "learning_rate": 0.00011508186397984888, "loss": 2.2067, "step": 13620 }, { "epoch": 3.8651175970529894, "grad_norm": 1.2017441987991333, "learning_rate": 0.00011492443324937027, "loss": 2.1947, "step": 13640 }, { "epoch": 3.870784924907906, "grad_norm": 2.6210947036743164, "learning_rate": 0.00011476700251889169, "loss": 2.2942, "step": 13660 }, { "epoch": 3.8764522527628222, "grad_norm": 3.785167932510376, "learning_rate": 0.0001146095717884131, "loss": 2.0592, "step": 13680 }, { "epoch": 3.8821195806177387, "grad_norm": 1.547645926475525, "learning_rate": 0.00011445214105793452, "loss": 2.3021, "step": 13700 }, { "epoch": 3.887786908472655, "grad_norm": 2.4754045009613037, "learning_rate": 0.00011429471032745594, "loss": 2.3234, "step": 13720 }, { "epoch": 3.8934542363275715, "grad_norm": 2.3843698501586914, "learning_rate": 0.00011413727959697733, "loss": 2.2701, "step": 13740 }, { "epoch": 3.899121564182488, "grad_norm": 1.6965466737747192, "learning_rate": 0.00011397984886649875, "loss": 2.1126, "step": 13760 }, { "epoch": 3.9047888920374043, "grad_norm": 2.7540078163146973, "learning_rate": 0.00011382241813602015, "loss": 2.1373, "step": 13780 }, { "epoch": 3.9104562198923207, "grad_norm": 2.600942611694336, "learning_rate": 0.00011366498740554157, "loss": 2.355, "step": 13800 }, { "epoch": 3.916123547747237, "grad_norm": 2.2522218227386475, "learning_rate": 0.00011350755667506299, "loss": 2.2866, "step": 13820 }, { "epoch": 3.9217908756021536, "grad_norm": 3.0471324920654297, "learning_rate": 0.00011335012594458438, "loss": 2.1658, "step": 13840 }, { "epoch": 3.92745820345707, "grad_norm": 2.80242657661438, "learning_rate": 0.0001131926952141058, "loss": 2.2615, "step": 13860 }, { "epoch": 3.9331255313119864, "grad_norm": 1.6693689823150635, "learning_rate": 0.0001130352644836272, "loss": 1.9832, "step": 13880 }, { "epoch": 3.938792859166903, "grad_norm": 1.7925853729248047, "learning_rate": 0.00011287783375314862, "loss": 1.9924, "step": 13900 }, { "epoch": 3.944460187021819, "grad_norm": 3.3592286109924316, "learning_rate": 0.00011272040302267004, "loss": 2.3421, "step": 13920 }, { "epoch": 3.9501275148767356, "grad_norm": 2.5693514347076416, "learning_rate": 0.00011256297229219143, "loss": 2.1088, "step": 13940 }, { "epoch": 3.955794842731652, "grad_norm": 2.4678328037261963, "learning_rate": 0.00011240554156171285, "loss": 1.9663, "step": 13960 }, { "epoch": 3.9614621705865685, "grad_norm": 3.800464153289795, "learning_rate": 0.00011224811083123426, "loss": 2.4601, "step": 13980 }, { "epoch": 3.967129498441485, "grad_norm": 4.0479512214660645, "learning_rate": 0.00011209068010075568, "loss": 1.9925, "step": 14000 }, { "epoch": 3.967129498441485, "eval_loss": 2.601236581802368, "eval_runtime": 2.808, "eval_samples_per_second": 35.612, "eval_steps_per_second": 8.903, "step": 14000 }, { "epoch": 3.9727968262964013, "grad_norm": 1.7064104080200195, "learning_rate": 0.00011193324937027709, "loss": 2.1534, "step": 14020 }, { "epoch": 3.9784641541513177, "grad_norm": 3.1176583766937256, "learning_rate": 0.0001117758186397985, "loss": 2.3398, "step": 14040 }, { "epoch": 3.984131482006234, "grad_norm": 1.70388662815094, "learning_rate": 0.00011161838790931991, "loss": 2.3162, "step": 14060 }, { "epoch": 3.9897988098611505, "grad_norm": 2.730586051940918, "learning_rate": 0.00011146095717884131, "loss": 2.0569, "step": 14080 }, { "epoch": 3.995466137716067, "grad_norm": 2.4882378578186035, "learning_rate": 0.00011130352644836273, "loss": 2.3949, "step": 14100 }, { "epoch": 4.001133465570983, "grad_norm": 2.998526096343994, "learning_rate": 0.00011114609571788415, "loss": 2.2472, "step": 14120 }, { "epoch": 4.006800793425899, "grad_norm": 3.1528983116149902, "learning_rate": 0.00011098866498740554, "loss": 2.0055, "step": 14140 }, { "epoch": 4.012468121280816, "grad_norm": 1.7269772291183472, "learning_rate": 0.00011083123425692696, "loss": 1.8636, "step": 14160 }, { "epoch": 4.018135449135732, "grad_norm": 3.6996374130249023, "learning_rate": 0.00011067380352644836, "loss": 2.0252, "step": 14180 }, { "epoch": 4.023802776990649, "grad_norm": 1.9313174486160278, "learning_rate": 0.00011051637279596978, "loss": 2.3717, "step": 14200 }, { "epoch": 4.029470104845565, "grad_norm": 2.2295897006988525, "learning_rate": 0.0001103589420654912, "loss": 2.3739, "step": 14220 }, { "epoch": 4.035137432700481, "grad_norm": 3.148862361907959, "learning_rate": 0.00011020151133501259, "loss": 2.1986, "step": 14240 }, { "epoch": 4.040804760555398, "grad_norm": 3.488157033920288, "learning_rate": 0.00011004408060453401, "loss": 2.0557, "step": 14260 }, { "epoch": 4.046472088410314, "grad_norm": 3.348540782928467, "learning_rate": 0.00010988664987405542, "loss": 2.0662, "step": 14280 }, { "epoch": 4.052139416265231, "grad_norm": 2.0953595638275146, "learning_rate": 0.00010972921914357684, "loss": 2.0572, "step": 14300 }, { "epoch": 4.057806744120147, "grad_norm": 2.483224868774414, "learning_rate": 0.00010957178841309825, "loss": 2.1758, "step": 14320 }, { "epoch": 4.0634740719750635, "grad_norm": 3.2123374938964844, "learning_rate": 0.00010941435768261965, "loss": 2.5713, "step": 14340 }, { "epoch": 4.06914139982998, "grad_norm": 1.8340033292770386, "learning_rate": 0.00010925692695214107, "loss": 2.14, "step": 14360 }, { "epoch": 4.074808727684896, "grad_norm": 2.345903158187866, "learning_rate": 0.00010909949622166247, "loss": 2.4102, "step": 14380 }, { "epoch": 4.080476055539813, "grad_norm": 1.5778138637542725, "learning_rate": 0.00010894206549118389, "loss": 2.0536, "step": 14400 }, { "epoch": 4.086143383394729, "grad_norm": 2.3162434101104736, "learning_rate": 0.00010878463476070528, "loss": 2.0864, "step": 14420 }, { "epoch": 4.091810711249646, "grad_norm": 2.309699296951294, "learning_rate": 0.0001086272040302267, "loss": 2.2958, "step": 14440 }, { "epoch": 4.097478039104562, "grad_norm": 2.873891830444336, "learning_rate": 0.00010846977329974812, "loss": 2.0398, "step": 14460 }, { "epoch": 4.103145366959478, "grad_norm": 1.8570784330368042, "learning_rate": 0.00010831234256926952, "loss": 2.0934, "step": 14480 }, { "epoch": 4.108812694814395, "grad_norm": 1.9899300336837769, "learning_rate": 0.00010815491183879094, "loss": 2.2918, "step": 14500 }, { "epoch": 4.108812694814395, "eval_loss": 2.5631062984466553, "eval_runtime": 2.8102, "eval_samples_per_second": 35.585, "eval_steps_per_second": 8.896, "step": 14500 }, { "epoch": 4.114480022669311, "grad_norm": 3.250046968460083, "learning_rate": 0.00010799748110831233, "loss": 2.1929, "step": 14520 }, { "epoch": 4.120147350524228, "grad_norm": 1.8071410655975342, "learning_rate": 0.00010784005037783375, "loss": 2.1085, "step": 14540 }, { "epoch": 4.125814678379144, "grad_norm": 2.1138498783111572, "learning_rate": 0.00010768261964735517, "loss": 2.214, "step": 14560 }, { "epoch": 4.1314820062340605, "grad_norm": 2.439495325088501, "learning_rate": 0.00010752518891687658, "loss": 2.0742, "step": 14580 }, { "epoch": 4.137149334088977, "grad_norm": 2.979523181915283, "learning_rate": 0.00010736775818639799, "loss": 2.097, "step": 14600 }, { "epoch": 4.142816661943893, "grad_norm": 3.031501531600952, "learning_rate": 0.0001072103274559194, "loss": 2.268, "step": 14620 }, { "epoch": 4.14848398979881, "grad_norm": 2.7768356800079346, "learning_rate": 0.00010705289672544081, "loss": 2.1908, "step": 14640 }, { "epoch": 4.154151317653726, "grad_norm": 2.93528151512146, "learning_rate": 0.00010689546599496223, "loss": 1.9598, "step": 14660 }, { "epoch": 4.1598186455086426, "grad_norm": 1.6406769752502441, "learning_rate": 0.00010673803526448363, "loss": 2.0483, "step": 14680 }, { "epoch": 4.165485973363559, "grad_norm": 2.0956027507781982, "learning_rate": 0.00010658060453400505, "loss": 2.3424, "step": 14700 }, { "epoch": 4.171153301218475, "grad_norm": 2.5103437900543213, "learning_rate": 0.00010642317380352644, "loss": 2.3318, "step": 14720 }, { "epoch": 4.176820629073392, "grad_norm": 1.6188454627990723, "learning_rate": 0.00010626574307304786, "loss": 2.2598, "step": 14740 }, { "epoch": 4.182487956928308, "grad_norm": 3.7336795330047607, "learning_rate": 0.00010610831234256928, "loss": 1.9651, "step": 14760 }, { "epoch": 4.188155284783225, "grad_norm": 3.0738677978515625, "learning_rate": 0.00010595088161209068, "loss": 2.099, "step": 14780 }, { "epoch": 4.193822612638141, "grad_norm": 3.5832650661468506, "learning_rate": 0.0001057934508816121, "loss": 2.0592, "step": 14800 }, { "epoch": 4.1994899404930575, "grad_norm": 1.7851639986038208, "learning_rate": 0.00010563602015113349, "loss": 2.0938, "step": 14820 }, { "epoch": 4.205157268347974, "grad_norm": 2.519808530807495, "learning_rate": 0.00010547858942065491, "loss": 2.285, "step": 14840 }, { "epoch": 4.21082459620289, "grad_norm": 3.5694098472595215, "learning_rate": 0.00010532115869017633, "loss": 2.1338, "step": 14860 }, { "epoch": 4.216491924057807, "grad_norm": 3.621523857116699, "learning_rate": 0.00010516372795969774, "loss": 2.4946, "step": 14880 }, { "epoch": 4.222159251912723, "grad_norm": 1.7120813131332397, "learning_rate": 0.00010500629722921915, "loss": 2.3706, "step": 14900 }, { "epoch": 4.2278265797676395, "grad_norm": 1.2210742235183716, "learning_rate": 0.00010484886649874055, "loss": 1.9529, "step": 14920 }, { "epoch": 4.233493907622556, "grad_norm": 4.060178279876709, "learning_rate": 0.00010469143576826197, "loss": 2.4666, "step": 14940 }, { "epoch": 4.239161235477472, "grad_norm": 1.6368330717086792, "learning_rate": 0.0001045340050377834, "loss": 1.8443, "step": 14960 }, { "epoch": 4.244828563332389, "grad_norm": 3.3006865978240967, "learning_rate": 0.00010437657430730479, "loss": 2.273, "step": 14980 }, { "epoch": 4.250495891187305, "grad_norm": 3.4050724506378174, "learning_rate": 0.00010421914357682621, "loss": 1.9171, "step": 15000 }, { "epoch": 4.250495891187305, "eval_loss": 2.5422470569610596, "eval_runtime": 2.8329, "eval_samples_per_second": 35.3, "eval_steps_per_second": 8.825, "step": 15000 }, { "epoch": 4.256163219042222, "grad_norm": 1.6130422353744507, "learning_rate": 0.0001040617128463476, "loss": 2.1511, "step": 15020 }, { "epoch": 4.261830546897138, "grad_norm": 2.3339264392852783, "learning_rate": 0.00010390428211586902, "loss": 1.9022, "step": 15040 }, { "epoch": 4.267497874752054, "grad_norm": 1.8212581872940063, "learning_rate": 0.00010375472292191437, "loss": 2.2455, "step": 15060 }, { "epoch": 4.273165202606971, "grad_norm": 2.0960988998413086, "learning_rate": 0.00010359729219143576, "loss": 2.0909, "step": 15080 }, { "epoch": 4.278832530461887, "grad_norm": 1.7981733083724976, "learning_rate": 0.00010343986146095719, "loss": 2.2837, "step": 15100 }, { "epoch": 4.284499858316804, "grad_norm": 2.68420147895813, "learning_rate": 0.00010328243073047858, "loss": 1.9365, "step": 15120 }, { "epoch": 4.29016718617172, "grad_norm": 3.0869174003601074, "learning_rate": 0.000103125, "loss": 1.903, "step": 15140 }, { "epoch": 4.2958345140266365, "grad_norm": 2.100395917892456, "learning_rate": 0.00010296756926952142, "loss": 2.1622, "step": 15160 }, { "epoch": 4.301501841881553, "grad_norm": 2.8661584854125977, "learning_rate": 0.00010281013853904283, "loss": 2.1253, "step": 15180 }, { "epoch": 4.307169169736469, "grad_norm": 4.167464733123779, "learning_rate": 0.00010265270780856425, "loss": 2.5276, "step": 15200 }, { "epoch": 4.312836497591386, "grad_norm": 2.6079812049865723, "learning_rate": 0.00010249527707808564, "loss": 2.0512, "step": 15220 }, { "epoch": 4.318503825446302, "grad_norm": 2.2207114696502686, "learning_rate": 0.00010233784634760706, "loss": 2.1751, "step": 15240 }, { "epoch": 4.324171153301219, "grad_norm": 3.689655303955078, "learning_rate": 0.00010218041561712848, "loss": 2.2271, "step": 15260 }, { "epoch": 4.329838481156135, "grad_norm": 2.514122486114502, "learning_rate": 0.00010202298488664988, "loss": 1.9297, "step": 15280 }, { "epoch": 4.335505809011051, "grad_norm": 1.611823320388794, "learning_rate": 0.0001018655541561713, "loss": 2.1275, "step": 15300 }, { "epoch": 4.341173136865968, "grad_norm": 2.0419039726257324, "learning_rate": 0.00010170812342569269, "loss": 2.2593, "step": 15320 }, { "epoch": 4.346840464720884, "grad_norm": 2.6001248359680176, "learning_rate": 0.00010155069269521411, "loss": 1.9687, "step": 15340 }, { "epoch": 4.352507792575801, "grad_norm": 2.370124578475952, "learning_rate": 0.00010139326196473553, "loss": 1.8149, "step": 15360 }, { "epoch": 4.358175120430717, "grad_norm": 2.405198097229004, "learning_rate": 0.00010123583123425692, "loss": 2.0178, "step": 15380 }, { "epoch": 4.3638424482856335, "grad_norm": 2.226900100708008, "learning_rate": 0.00010107840050377835, "loss": 1.765, "step": 15400 }, { "epoch": 4.36950977614055, "grad_norm": 2.6988940238952637, "learning_rate": 0.00010092096977329974, "loss": 2.1023, "step": 15420 }, { "epoch": 4.375177103995466, "grad_norm": 3.127578020095825, "learning_rate": 0.00010076353904282116, "loss": 2.5809, "step": 15440 }, { "epoch": 4.380844431850383, "grad_norm": 2.7449769973754883, "learning_rate": 0.00010060610831234258, "loss": 2.0326, "step": 15460 }, { "epoch": 4.386511759705299, "grad_norm": 1.7547941207885742, "learning_rate": 0.00010044867758186399, "loss": 2.0147, "step": 15480 }, { "epoch": 4.392179087560216, "grad_norm": 3.1960136890411377, "learning_rate": 0.0001002912468513854, "loss": 2.1133, "step": 15500 }, { "epoch": 4.392179087560216, "eval_loss": 2.541494846343994, "eval_runtime": 2.8928, "eval_samples_per_second": 34.568, "eval_steps_per_second": 8.642, "step": 15500 }, { "epoch": 4.397846415415132, "grad_norm": 2.6480014324188232, "learning_rate": 0.0001001338161209068, "loss": 2.37, "step": 15520 }, { "epoch": 4.403513743270048, "grad_norm": 2.3555309772491455, "learning_rate": 9.997638539042822e-05, "loss": 2.2901, "step": 15540 }, { "epoch": 4.409181071124965, "grad_norm": 2.043233633041382, "learning_rate": 9.981895465994963e-05, "loss": 1.9961, "step": 15560 }, { "epoch": 4.414848398979881, "grad_norm": 3.2146637439727783, "learning_rate": 9.966152392947104e-05, "loss": 2.3091, "step": 15580 }, { "epoch": 4.420515726834798, "grad_norm": 2.1864705085754395, "learning_rate": 9.950409319899244e-05, "loss": 2.1505, "step": 15600 }, { "epoch": 4.426183054689714, "grad_norm": 2.7555770874023438, "learning_rate": 9.934666246851386e-05, "loss": 1.9678, "step": 15620 }, { "epoch": 4.4318503825446305, "grad_norm": 2.243762493133545, "learning_rate": 9.918923173803527e-05, "loss": 1.9281, "step": 15640 }, { "epoch": 4.437517710399547, "grad_norm": 4.0090837478637695, "learning_rate": 9.903180100755668e-05, "loss": 2.3638, "step": 15660 }, { "epoch": 4.443185038254463, "grad_norm": 2.334876537322998, "learning_rate": 9.887437027707808e-05, "loss": 2.3428, "step": 15680 }, { "epoch": 4.44885236610938, "grad_norm": 2.348623037338257, "learning_rate": 9.871693954659949e-05, "loss": 2.2133, "step": 15700 }, { "epoch": 4.454519693964296, "grad_norm": 1.9974579811096191, "learning_rate": 9.855950881612091e-05, "loss": 2.0664, "step": 15720 }, { "epoch": 4.460187021819213, "grad_norm": 4.7677507400512695, "learning_rate": 9.840207808564232e-05, "loss": 2.48, "step": 15740 }, { "epoch": 4.465854349674129, "grad_norm": 2.237412214279175, "learning_rate": 9.824464735516373e-05, "loss": 2.0361, "step": 15760 }, { "epoch": 4.471521677529045, "grad_norm": 2.7374372482299805, "learning_rate": 9.808721662468515e-05, "loss": 2.1954, "step": 15780 }, { "epoch": 4.477189005383962, "grad_norm": 1.5552740097045898, "learning_rate": 9.792978589420655e-05, "loss": 1.9873, "step": 15800 }, { "epoch": 4.482856333238878, "grad_norm": 3.449444532394409, "learning_rate": 9.777235516372797e-05, "loss": 2.1522, "step": 15820 }, { "epoch": 4.488523661093795, "grad_norm": 1.8778547048568726, "learning_rate": 9.761492443324938e-05, "loss": 2.2383, "step": 15840 }, { "epoch": 4.494190988948711, "grad_norm": 1.7599525451660156, "learning_rate": 9.745749370277079e-05, "loss": 2.1141, "step": 15860 }, { "epoch": 4.4998583168036275, "grad_norm": 2.067941665649414, "learning_rate": 9.73000629722922e-05, "loss": 1.9909, "step": 15880 }, { "epoch": 4.505525644658544, "grad_norm": 3.1372246742248535, "learning_rate": 9.71426322418136e-05, "loss": 1.9778, "step": 15900 }, { "epoch": 4.511192972513459, "grad_norm": 2.4991812705993652, "learning_rate": 9.698520151133502e-05, "loss": 2.1443, "step": 15920 }, { "epoch": 4.516860300368377, "grad_norm": 2.872926712036133, "learning_rate": 9.682777078085643e-05, "loss": 2.2738, "step": 15940 }, { "epoch": 4.522527628223292, "grad_norm": 2.526963710784912, "learning_rate": 9.667034005037784e-05, "loss": 2.1809, "step": 15960 }, { "epoch": 4.5281949560782095, "grad_norm": 1.9566162824630737, "learning_rate": 9.651290931989925e-05, "loss": 1.9099, "step": 15980 }, { "epoch": 4.533862283933125, "grad_norm": 1.6572496891021729, "learning_rate": 9.635547858942065e-05, "loss": 2.3153, "step": 16000 }, { "epoch": 4.533862283933125, "eval_loss": 2.5328502655029297, "eval_runtime": 3.0596, "eval_samples_per_second": 32.684, "eval_steps_per_second": 8.171, "step": 16000 }, { "epoch": 4.539529611788042, "grad_norm": 2.3872873783111572, "learning_rate": 9.619804785894207e-05, "loss": 2.0104, "step": 16020 }, { "epoch": 4.545196939642958, "grad_norm": 3.8765885829925537, "learning_rate": 9.604061712846348e-05, "loss": 2.1942, "step": 16040 }, { "epoch": 4.550864267497875, "grad_norm": 3.2206783294677734, "learning_rate": 9.588318639798489e-05, "loss": 1.9295, "step": 16060 }, { "epoch": 4.556531595352791, "grad_norm": 2.432044506072998, "learning_rate": 9.57257556675063e-05, "loss": 1.8958, "step": 16080 }, { "epoch": 4.562198923207708, "grad_norm": 1.8008333444595337, "learning_rate": 9.556832493702771e-05, "loss": 1.9504, "step": 16100 }, { "epoch": 4.567866251062624, "grad_norm": 2.082988739013672, "learning_rate": 9.541089420654914e-05, "loss": 2.2839, "step": 16120 }, { "epoch": 4.573533578917541, "grad_norm": 2.634587287902832, "learning_rate": 9.525346347607054e-05, "loss": 2.186, "step": 16140 }, { "epoch": 4.579200906772456, "grad_norm": 2.4398605823516846, "learning_rate": 9.509603274559195e-05, "loss": 1.9551, "step": 16160 }, { "epoch": 4.584868234627373, "grad_norm": 1.6370044946670532, "learning_rate": 9.493860201511336e-05, "loss": 2.1232, "step": 16180 }, { "epoch": 4.590535562482289, "grad_norm": 2.542977809906006, "learning_rate": 9.478117128463476e-05, "loss": 1.8248, "step": 16200 }, { "epoch": 4.596202890337206, "grad_norm": 2.9373440742492676, "learning_rate": 9.462374055415617e-05, "loss": 2.1155, "step": 16220 }, { "epoch": 4.601870218192122, "grad_norm": 3.5269792079925537, "learning_rate": 9.446630982367759e-05, "loss": 2.1598, "step": 16240 }, { "epoch": 4.6075375460470385, "grad_norm": 2.108086347579956, "learning_rate": 9.4308879093199e-05, "loss": 2.3274, "step": 16260 }, { "epoch": 4.613204873901955, "grad_norm": 3.0127644538879395, "learning_rate": 9.41514483627204e-05, "loss": 2.1938, "step": 16280 }, { "epoch": 4.618872201756871, "grad_norm": 2.236767053604126, "learning_rate": 9.399401763224181e-05, "loss": 2.0618, "step": 16300 }, { "epoch": 4.624539529611788, "grad_norm": 1.9350080490112305, "learning_rate": 9.383658690176322e-05, "loss": 2.1375, "step": 16320 }, { "epoch": 4.630206857466704, "grad_norm": 2.762411594390869, "learning_rate": 9.367915617128464e-05, "loss": 2.1662, "step": 16340 }, { "epoch": 4.6358741853216205, "grad_norm": 3.462433099746704, "learning_rate": 9.352172544080605e-05, "loss": 2.4224, "step": 16360 }, { "epoch": 4.641541513176537, "grad_norm": 3.176011562347412, "learning_rate": 9.336429471032745e-05, "loss": 2.1278, "step": 16380 }, { "epoch": 4.647208841031453, "grad_norm": 2.2641067504882812, "learning_rate": 9.320686397984887e-05, "loss": 2.2379, "step": 16400 }, { "epoch": 4.65287616888637, "grad_norm": 1.9422305822372437, "learning_rate": 9.304943324937028e-05, "loss": 2.0263, "step": 16420 }, { "epoch": 4.658543496741286, "grad_norm": 2.3394930362701416, "learning_rate": 9.28920025188917e-05, "loss": 2.116, "step": 16440 }, { "epoch": 4.664210824596203, "grad_norm": 2.1810262203216553, "learning_rate": 9.273457178841311e-05, "loss": 2.25, "step": 16460 }, { "epoch": 4.669878152451119, "grad_norm": 1.5272624492645264, "learning_rate": 9.257714105793452e-05, "loss": 2.2981, "step": 16480 }, { "epoch": 4.6755454803060355, "grad_norm": 3.039471387863159, "learning_rate": 9.241971032745592e-05, "loss": 2.0452, "step": 16500 }, { "epoch": 4.6755454803060355, "eval_loss": 2.534147262573242, "eval_runtime": 3.1991, "eval_samples_per_second": 31.259, "eval_steps_per_second": 7.815, "step": 16500 }, { "epoch": 4.681212808160952, "grad_norm": 4.542686462402344, "learning_rate": 9.226227959697733e-05, "loss": 2.1744, "step": 16520 }, { "epoch": 4.686880136015868, "grad_norm": 2.112548351287842, "learning_rate": 9.210484886649875e-05, "loss": 1.7862, "step": 16540 }, { "epoch": 4.692547463870785, "grad_norm": 3.265399932861328, "learning_rate": 9.194741813602016e-05, "loss": 2.0769, "step": 16560 }, { "epoch": 4.698214791725701, "grad_norm": 2.765065908432007, "learning_rate": 9.178998740554157e-05, "loss": 2.3969, "step": 16580 }, { "epoch": 4.7038821195806175, "grad_norm": 3.0396904945373535, "learning_rate": 9.163255667506297e-05, "loss": 2.1543, "step": 16600 }, { "epoch": 4.709549447435534, "grad_norm": 2.1735658645629883, "learning_rate": 9.147512594458438e-05, "loss": 1.9209, "step": 16620 }, { "epoch": 4.71521677529045, "grad_norm": 3.2876620292663574, "learning_rate": 9.13176952141058e-05, "loss": 2.2637, "step": 16640 }, { "epoch": 4.720884103145367, "grad_norm": 2.4522008895874023, "learning_rate": 9.116026448362721e-05, "loss": 2.4159, "step": 16660 }, { "epoch": 4.726551431000283, "grad_norm": 3.569902181625366, "learning_rate": 9.100283375314861e-05, "loss": 2.3154, "step": 16680 }, { "epoch": 4.7322187588552, "grad_norm": 3.2628724575042725, "learning_rate": 9.084540302267003e-05, "loss": 1.8932, "step": 16700 }, { "epoch": 4.737886086710116, "grad_norm": 2.4736580848693848, "learning_rate": 9.068797229219144e-05, "loss": 2.2045, "step": 16720 }, { "epoch": 4.743553414565032, "grad_norm": 1.5378949642181396, "learning_rate": 9.053054156171286e-05, "loss": 2.27, "step": 16740 }, { "epoch": 4.749220742419949, "grad_norm": 3.041412830352783, "learning_rate": 9.037311083123427e-05, "loss": 2.5436, "step": 16760 }, { "epoch": 4.754888070274865, "grad_norm": 1.4869428873062134, "learning_rate": 9.021568010075568e-05, "loss": 2.0487, "step": 16780 }, { "epoch": 4.760555398129782, "grad_norm": 3.12021541595459, "learning_rate": 9.005824937027708e-05, "loss": 1.9661, "step": 16800 }, { "epoch": 4.766222725984698, "grad_norm": 2.5641088485717773, "learning_rate": 8.990081863979849e-05, "loss": 2.1215, "step": 16820 }, { "epoch": 4.7718900538396145, "grad_norm": 1.415110468864441, "learning_rate": 8.97433879093199e-05, "loss": 2.1518, "step": 16840 }, { "epoch": 4.777557381694531, "grad_norm": 2.5956015586853027, "learning_rate": 8.958595717884132e-05, "loss": 2.1249, "step": 16860 }, { "epoch": 4.783224709549447, "grad_norm": 2.1703569889068604, "learning_rate": 8.942852644836273e-05, "loss": 2.1381, "step": 16880 }, { "epoch": 4.788892037404364, "grad_norm": 1.8806556463241577, "learning_rate": 8.927109571788413e-05, "loss": 1.875, "step": 16900 }, { "epoch": 4.79455936525928, "grad_norm": 2.0450947284698486, "learning_rate": 8.911366498740554e-05, "loss": 2.0044, "step": 16920 }, { "epoch": 4.800226693114197, "grad_norm": 3.5895731449127197, "learning_rate": 8.895623425692695e-05, "loss": 1.8282, "step": 16940 }, { "epoch": 4.805894020969113, "grad_norm": 2.6889166831970215, "learning_rate": 8.879880352644837e-05, "loss": 2.4293, "step": 16960 }, { "epoch": 4.811561348824029, "grad_norm": 3.2479629516601562, "learning_rate": 8.864137279596977e-05, "loss": 1.9721, "step": 16980 }, { "epoch": 4.817228676678946, "grad_norm": 2.463063955307007, "learning_rate": 8.848394206549118e-05, "loss": 1.9849, "step": 17000 }, { "epoch": 4.817228676678946, "eval_loss": 2.5216333866119385, "eval_runtime": 7.0851, "eval_samples_per_second": 14.114, "eval_steps_per_second": 3.529, "step": 17000 }, { "epoch": 4.822896004533862, "grad_norm": 2.2834837436676025, "learning_rate": 8.83265113350126e-05, "loss": 1.9826, "step": 17020 }, { "epoch": 4.828563332388779, "grad_norm": 2.934311628341675, "learning_rate": 8.816908060453401e-05, "loss": 2.1014, "step": 17040 }, { "epoch": 4.834230660243695, "grad_norm": 1.8655563592910767, "learning_rate": 8.801164987405543e-05, "loss": 2.0185, "step": 17060 }, { "epoch": 4.8398979880986115, "grad_norm": 2.9462952613830566, "learning_rate": 8.785421914357684e-05, "loss": 1.9417, "step": 17080 }, { "epoch": 4.845565315953528, "grad_norm": 3.295435905456543, "learning_rate": 8.769678841309824e-05, "loss": 2.0193, "step": 17100 }, { "epoch": 4.851232643808444, "grad_norm": 2.2571659088134766, "learning_rate": 8.753935768261965e-05, "loss": 2.3968, "step": 17120 }, { "epoch": 4.856899971663361, "grad_norm": 2.744328737258911, "learning_rate": 8.738192695214106e-05, "loss": 1.9792, "step": 17140 }, { "epoch": 4.862567299518277, "grad_norm": 3.059037208557129, "learning_rate": 8.722449622166248e-05, "loss": 2.1146, "step": 17160 }, { "epoch": 4.868234627373194, "grad_norm": 2.130732536315918, "learning_rate": 8.706706549118389e-05, "loss": 1.7818, "step": 17180 }, { "epoch": 4.87390195522811, "grad_norm": 2.1071839332580566, "learning_rate": 8.690963476070529e-05, "loss": 2.3757, "step": 17200 }, { "epoch": 4.879569283083026, "grad_norm": 1.973744511604309, "learning_rate": 8.67522040302267e-05, "loss": 2.1915, "step": 17220 }, { "epoch": 4.885236610937943, "grad_norm": 3.144542694091797, "learning_rate": 8.65947732997481e-05, "loss": 2.0589, "step": 17240 }, { "epoch": 4.890903938792859, "grad_norm": 2.6524555683135986, "learning_rate": 8.643734256926953e-05, "loss": 2.1756, "step": 17260 }, { "epoch": 4.896571266647776, "grad_norm": 2.012974500656128, "learning_rate": 8.627991183879093e-05, "loss": 2.4634, "step": 17280 }, { "epoch": 4.902238594502692, "grad_norm": 3.2002832889556885, "learning_rate": 8.612248110831234e-05, "loss": 2.2074, "step": 17300 }, { "epoch": 4.9079059223576085, "grad_norm": 4.350805759429932, "learning_rate": 8.596505037783376e-05, "loss": 2.1941, "step": 17320 }, { "epoch": 4.913573250212525, "grad_norm": 2.39760684967041, "learning_rate": 8.580761964735517e-05, "loss": 2.0816, "step": 17340 }, { "epoch": 4.919240578067441, "grad_norm": 3.081848621368408, "learning_rate": 8.565018891687659e-05, "loss": 2.0291, "step": 17360 }, { "epoch": 4.924907905922358, "grad_norm": 2.5015769004821777, "learning_rate": 8.5492758186398e-05, "loss": 2.2585, "step": 17380 }, { "epoch": 4.930575233777274, "grad_norm": 2.269577741622925, "learning_rate": 8.53353274559194e-05, "loss": 2.1733, "step": 17400 }, { "epoch": 4.9362425616321906, "grad_norm": 2.219484567642212, "learning_rate": 8.517789672544081e-05, "loss": 2.1854, "step": 17420 }, { "epoch": 4.941909889487107, "grad_norm": 2.9977378845214844, "learning_rate": 8.502046599496222e-05, "loss": 1.9393, "step": 17440 }, { "epoch": 4.947577217342023, "grad_norm": 2.9846885204315186, "learning_rate": 8.486303526448362e-05, "loss": 1.8715, "step": 17460 }, { "epoch": 4.95324454519694, "grad_norm": 2.544022560119629, "learning_rate": 8.470560453400505e-05, "loss": 2.2167, "step": 17480 }, { "epoch": 4.958911873051856, "grad_norm": 2.009578227996826, "learning_rate": 8.454817380352645e-05, "loss": 2.1464, "step": 17500 }, { "epoch": 4.958911873051856, "eval_loss": 2.4881093502044678, "eval_runtime": 3.2171, "eval_samples_per_second": 31.084, "eval_steps_per_second": 7.771, "step": 17500 }, { "epoch": 4.964579200906773, "grad_norm": 2.7923951148986816, "learning_rate": 8.439074307304786e-05, "loss": 2.0785, "step": 17520 }, { "epoch": 4.970246528761689, "grad_norm": 2.5019516944885254, "learning_rate": 8.423331234256927e-05, "loss": 2.0215, "step": 17540 }, { "epoch": 4.9759138566166055, "grad_norm": 1.902367115020752, "learning_rate": 8.407588161209067e-05, "loss": 2.3362, "step": 17560 }, { "epoch": 4.981581184471522, "grad_norm": 2.441772699356079, "learning_rate": 8.39184508816121e-05, "loss": 2.1467, "step": 17580 }, { "epoch": 4.987248512326438, "grad_norm": 2.7760977745056152, "learning_rate": 8.37610201511335e-05, "loss": 2.1818, "step": 17600 }, { "epoch": 4.992915840181355, "grad_norm": 2.6653623580932617, "learning_rate": 8.360358942065491e-05, "loss": 2.2526, "step": 17620 }, { "epoch": 4.998583168036271, "grad_norm": 1.611898422241211, "learning_rate": 8.344615869017633e-05, "loss": 1.9495, "step": 17640 }, { "epoch": 5.0042504958911875, "grad_norm": 2.12345814704895, "learning_rate": 8.328872795969774e-05, "loss": 2.2312, "step": 17660 }, { "epoch": 5.009917823746104, "grad_norm": 2.4139115810394287, "learning_rate": 8.313129722921916e-05, "loss": 1.9948, "step": 17680 }, { "epoch": 5.01558515160102, "grad_norm": 1.9104679822921753, "learning_rate": 8.297386649874056e-05, "loss": 1.9979, "step": 17700 }, { "epoch": 5.021252479455937, "grad_norm": 1.6763135194778442, "learning_rate": 8.281643576826197e-05, "loss": 1.9741, "step": 17720 }, { "epoch": 5.026919807310853, "grad_norm": 2.2822425365448, "learning_rate": 8.265900503778338e-05, "loss": 2.1104, "step": 17740 }, { "epoch": 5.03258713516577, "grad_norm": 2.0670816898345947, "learning_rate": 8.250157430730479e-05, "loss": 2.1315, "step": 17760 }, { "epoch": 5.038254463020686, "grad_norm": 4.752333641052246, "learning_rate": 8.23441435768262e-05, "loss": 2.0401, "step": 17780 }, { "epoch": 5.043921790875602, "grad_norm": 1.6980493068695068, "learning_rate": 8.218671284634761e-05, "loss": 2.0819, "step": 17800 }, { "epoch": 5.049589118730519, "grad_norm": 2.580793619155884, "learning_rate": 8.202928211586902e-05, "loss": 1.9222, "step": 17820 }, { "epoch": 5.055256446585435, "grad_norm": 2.5627386569976807, "learning_rate": 8.187185138539043e-05, "loss": 1.83, "step": 17840 }, { "epoch": 5.060923774440352, "grad_norm": 3.1464080810546875, "learning_rate": 8.171442065491183e-05, "loss": 2.0895, "step": 17860 }, { "epoch": 5.066591102295268, "grad_norm": 3.343013286590576, "learning_rate": 8.155698992443325e-05, "loss": 2.1006, "step": 17880 }, { "epoch": 5.0722584301501845, "grad_norm": 2.9297568798065186, "learning_rate": 8.139955919395466e-05, "loss": 2.0575, "step": 17900 }, { "epoch": 5.077925758005101, "grad_norm": 1.5253390073776245, "learning_rate": 8.124212846347607e-05, "loss": 2.1604, "step": 17920 }, { "epoch": 5.083593085860017, "grad_norm": 2.1921324729919434, "learning_rate": 8.108469773299749e-05, "loss": 2.077, "step": 17940 }, { "epoch": 5.089260413714934, "grad_norm": 2.963470220565796, "learning_rate": 8.09272670025189e-05, "loss": 2.2282, "step": 17960 }, { "epoch": 5.09492774156985, "grad_norm": 2.120961904525757, "learning_rate": 8.076983627204032e-05, "loss": 1.7986, "step": 17980 }, { "epoch": 5.100595069424767, "grad_norm": 3.170531988143921, "learning_rate": 8.061240554156172e-05, "loss": 1.9734, "step": 18000 }, { "epoch": 5.100595069424767, "eval_loss": 2.4896302223205566, "eval_runtime": 3.3342, "eval_samples_per_second": 29.992, "eval_steps_per_second": 7.498, "step": 18000 }, { "epoch": 5.106262397279683, "grad_norm": 2.576862096786499, "learning_rate": 8.045497481108313e-05, "loss": 2.1119, "step": 18020 }, { "epoch": 5.111929725134599, "grad_norm": 3.0261478424072266, "learning_rate": 8.029754408060454e-05, "loss": 1.9213, "step": 18040 }, { "epoch": 5.117597052989516, "grad_norm": 2.046865940093994, "learning_rate": 8.014011335012595e-05, "loss": 2.2662, "step": 18060 }, { "epoch": 5.123264380844432, "grad_norm": 2.9750986099243164, "learning_rate": 7.998268261964737e-05, "loss": 2.137, "step": 18080 }, { "epoch": 5.128931708699349, "grad_norm": 2.1197729110717773, "learning_rate": 7.982525188916877e-05, "loss": 1.9736, "step": 18100 }, { "epoch": 5.134599036554265, "grad_norm": 3.0674076080322266, "learning_rate": 7.966782115869018e-05, "loss": 2.1976, "step": 18120 }, { "epoch": 5.1402663644091815, "grad_norm": 2.932227849960327, "learning_rate": 7.951039042821159e-05, "loss": 2.1513, "step": 18140 }, { "epoch": 5.145933692264098, "grad_norm": 2.8038082122802734, "learning_rate": 7.9352959697733e-05, "loss": 2.1578, "step": 18160 }, { "epoch": 5.151601020119013, "grad_norm": 2.9535889625549316, "learning_rate": 7.91955289672544e-05, "loss": 2.2502, "step": 18180 }, { "epoch": 5.157268347973931, "grad_norm": 1.309472918510437, "learning_rate": 7.903809823677582e-05, "loss": 2.0498, "step": 18200 }, { "epoch": 5.162935675828846, "grad_norm": 3.4494869709014893, "learning_rate": 7.888066750629723e-05, "loss": 2.0391, "step": 18220 }, { "epoch": 5.168603003683763, "grad_norm": 2.674025297164917, "learning_rate": 7.872323677581865e-05, "loss": 2.4361, "step": 18240 }, { "epoch": 5.174270331538679, "grad_norm": 3.0136802196502686, "learning_rate": 7.856580604534006e-05, "loss": 1.8106, "step": 18260 }, { "epoch": 5.1799376593935955, "grad_norm": 2.919110059738159, "learning_rate": 7.840837531486146e-05, "loss": 2.2182, "step": 18280 }, { "epoch": 5.185604987248512, "grad_norm": 1.638126015663147, "learning_rate": 7.825094458438288e-05, "loss": 2.1145, "step": 18300 }, { "epoch": 5.191272315103428, "grad_norm": 2.200085401535034, "learning_rate": 7.809351385390429e-05, "loss": 2.1709, "step": 18320 }, { "epoch": 5.196939642958345, "grad_norm": 3.3994224071502686, "learning_rate": 7.79360831234257e-05, "loss": 2.4144, "step": 18340 }, { "epoch": 5.202606970813261, "grad_norm": 2.244265556335449, "learning_rate": 7.77786523929471e-05, "loss": 2.1646, "step": 18360 }, { "epoch": 5.208274298668178, "grad_norm": 2.0081467628479004, "learning_rate": 7.762122166246851e-05, "loss": 1.9374, "step": 18380 }, { "epoch": 5.213941626523094, "grad_norm": 2.2320468425750732, "learning_rate": 7.746379093198993e-05, "loss": 2.4236, "step": 18400 }, { "epoch": 5.21960895437801, "grad_norm": 2.356590986251831, "learning_rate": 7.730636020151134e-05, "loss": 2.0714, "step": 18420 }, { "epoch": 5.225276282232927, "grad_norm": 2.8124611377716064, "learning_rate": 7.714892947103275e-05, "loss": 2.3284, "step": 18440 }, { "epoch": 5.230943610087843, "grad_norm": 2.513658046722412, "learning_rate": 7.699149874055415e-05, "loss": 2.3014, "step": 18460 }, { "epoch": 5.23661093794276, "grad_norm": 2.454583168029785, "learning_rate": 7.683406801007556e-05, "loss": 2.1609, "step": 18480 }, { "epoch": 5.242278265797676, "grad_norm": 3.3914637565612793, "learning_rate": 7.667663727959698e-05, "loss": 1.9801, "step": 18500 }, { "epoch": 5.242278265797676, "eval_loss": 2.4703454971313477, "eval_runtime": 3.2381, "eval_samples_per_second": 30.882, "eval_steps_per_second": 7.72, "step": 18500 }, { "epoch": 5.2479455936525925, "grad_norm": 1.6943174600601196, "learning_rate": 7.651920654911839e-05, "loss": 2.0503, "step": 18520 }, { "epoch": 5.253612921507509, "grad_norm": 3.948223352432251, "learning_rate": 7.63617758186398e-05, "loss": 1.9716, "step": 18540 }, { "epoch": 5.259280249362425, "grad_norm": 2.835549831390381, "learning_rate": 7.620434508816122e-05, "loss": 2.0251, "step": 18560 }, { "epoch": 5.264947577217342, "grad_norm": 1.6352922916412354, "learning_rate": 7.604691435768262e-05, "loss": 1.8663, "step": 18580 }, { "epoch": 5.270614905072258, "grad_norm": 2.668067455291748, "learning_rate": 7.588948362720404e-05, "loss": 2.1502, "step": 18600 }, { "epoch": 5.276282232927175, "grad_norm": 2.842921495437622, "learning_rate": 7.573205289672545e-05, "loss": 2.163, "step": 18620 }, { "epoch": 5.281949560782091, "grad_norm": 2.735417604446411, "learning_rate": 7.557462216624686e-05, "loss": 1.8118, "step": 18640 }, { "epoch": 5.287616888637007, "grad_norm": 2.9332432746887207, "learning_rate": 7.541719143576827e-05, "loss": 1.7266, "step": 18660 }, { "epoch": 5.293284216491924, "grad_norm": 2.903895854949951, "learning_rate": 7.525976070528967e-05, "loss": 2.1408, "step": 18680 }, { "epoch": 5.29895154434684, "grad_norm": 3.239797353744507, "learning_rate": 7.510232997481109e-05, "loss": 2.17, "step": 18700 }, { "epoch": 5.304618872201757, "grad_norm": 2.382883071899414, "learning_rate": 7.49448992443325e-05, "loss": 1.7877, "step": 18720 }, { "epoch": 5.310286200056673, "grad_norm": 1.8095910549163818, "learning_rate": 7.478746851385391e-05, "loss": 2.078, "step": 18740 }, { "epoch": 5.3159535279115895, "grad_norm": 2.318617105484009, "learning_rate": 7.463003778337531e-05, "loss": 2.2546, "step": 18760 }, { "epoch": 5.321620855766506, "grad_norm": 3.298072338104248, "learning_rate": 7.447260705289672e-05, "loss": 1.9098, "step": 18780 }, { "epoch": 5.327288183621422, "grad_norm": 3.113771438598633, "learning_rate": 7.431517632241813e-05, "loss": 2.0452, "step": 18800 }, { "epoch": 5.332955511476339, "grad_norm": 1.7504597902297974, "learning_rate": 7.415774559193955e-05, "loss": 2.2694, "step": 18820 }, { "epoch": 5.338622839331255, "grad_norm": 2.813023328781128, "learning_rate": 7.400031486146096e-05, "loss": 2.0498, "step": 18840 }, { "epoch": 5.344290167186172, "grad_norm": 2.136150360107422, "learning_rate": 7.384288413098238e-05, "loss": 1.9964, "step": 18860 }, { "epoch": 5.349957495041088, "grad_norm": 3.94606876373291, "learning_rate": 7.368545340050378e-05, "loss": 2.4806, "step": 18880 }, { "epoch": 5.355624822896004, "grad_norm": 3.3203959465026855, "learning_rate": 7.352802267002519e-05, "loss": 2.0108, "step": 18900 }, { "epoch": 5.361292150750921, "grad_norm": 1.8811283111572266, "learning_rate": 7.337059193954661e-05, "loss": 2.1235, "step": 18920 }, { "epoch": 5.366959478605837, "grad_norm": 1.4347679615020752, "learning_rate": 7.321316120906802e-05, "loss": 1.7695, "step": 18940 }, { "epoch": 5.372626806460754, "grad_norm": 1.6307075023651123, "learning_rate": 7.305573047858943e-05, "loss": 2.0888, "step": 18960 }, { "epoch": 5.37829413431567, "grad_norm": 2.198254108428955, "learning_rate": 7.289829974811083e-05, "loss": 1.9819, "step": 18980 }, { "epoch": 5.3839614621705865, "grad_norm": 3.646446466445923, "learning_rate": 7.274086901763224e-05, "loss": 2.0792, "step": 19000 }, { "epoch": 5.3839614621705865, "eval_loss": 2.47479248046875, "eval_runtime": 3.217, "eval_samples_per_second": 31.085, "eval_steps_per_second": 7.771, "step": 19000 }, { "epoch": 5.389628790025503, "grad_norm": 2.636549949645996, "learning_rate": 7.258343828715366e-05, "loss": 1.9472, "step": 19020 }, { "epoch": 5.395296117880419, "grad_norm": 2.653043508529663, "learning_rate": 7.242600755667507e-05, "loss": 1.9029, "step": 19040 }, { "epoch": 5.400963445735336, "grad_norm": 2.073880434036255, "learning_rate": 7.226857682619647e-05, "loss": 2.1926, "step": 19060 }, { "epoch": 5.406630773590252, "grad_norm": 2.969294786453247, "learning_rate": 7.211114609571788e-05, "loss": 2.1444, "step": 19080 }, { "epoch": 5.4122981014451685, "grad_norm": 1.7169913053512573, "learning_rate": 7.195371536523929e-05, "loss": 1.7829, "step": 19100 }, { "epoch": 5.417965429300085, "grad_norm": 3.421771764755249, "learning_rate": 7.179628463476071e-05, "loss": 2.1297, "step": 19120 }, { "epoch": 5.423632757155001, "grad_norm": 2.9925830364227295, "learning_rate": 7.163885390428212e-05, "loss": 2.3765, "step": 19140 }, { "epoch": 5.429300085009918, "grad_norm": 2.109009265899658, "learning_rate": 7.148142317380354e-05, "loss": 2.0536, "step": 19160 }, { "epoch": 5.434967412864834, "grad_norm": 1.455603003501892, "learning_rate": 7.132399244332494e-05, "loss": 2.0764, "step": 19180 }, { "epoch": 5.440634740719751, "grad_norm": 1.9426006078720093, "learning_rate": 7.116656171284635e-05, "loss": 2.0462, "step": 19200 }, { "epoch": 5.446302068574667, "grad_norm": 3.297316074371338, "learning_rate": 7.100913098236777e-05, "loss": 2.1608, "step": 19220 }, { "epoch": 5.4519693964295834, "grad_norm": 1.747969388961792, "learning_rate": 7.085170025188918e-05, "loss": 2.0496, "step": 19240 }, { "epoch": 5.4576367242845, "grad_norm": 2.3945157527923584, "learning_rate": 7.070214105793451e-05, "loss": 2.2217, "step": 19260 }, { "epoch": 5.463304052139416, "grad_norm": 2.8052563667297363, "learning_rate": 7.054471032745592e-05, "loss": 2.0864, "step": 19280 }, { "epoch": 5.468971379994333, "grad_norm": 2.766470193862915, "learning_rate": 7.038727959697733e-05, "loss": 1.9465, "step": 19300 }, { "epoch": 5.474638707849249, "grad_norm": 2.9683549404144287, "learning_rate": 7.022984886649875e-05, "loss": 2.118, "step": 19320 }, { "epoch": 5.4803060357041655, "grad_norm": 1.7682409286499023, "learning_rate": 7.007241813602016e-05, "loss": 2.2897, "step": 19340 }, { "epoch": 5.485973363559082, "grad_norm": 1.847961187362671, "learning_rate": 6.991498740554156e-05, "loss": 2.2154, "step": 19360 }, { "epoch": 5.491640691413998, "grad_norm": 3.157636880874634, "learning_rate": 6.975755667506297e-05, "loss": 1.9751, "step": 19380 }, { "epoch": 5.497308019268915, "grad_norm": 2.6534993648529053, "learning_rate": 6.960012594458438e-05, "loss": 2.4059, "step": 19400 }, { "epoch": 5.502975347123831, "grad_norm": 2.820960760116577, "learning_rate": 6.94426952141058e-05, "loss": 2.0831, "step": 19420 }, { "epoch": 5.508642674978748, "grad_norm": 2.5798096656799316, "learning_rate": 6.92852644836272e-05, "loss": 2.1112, "step": 19440 }, { "epoch": 5.514310002833664, "grad_norm": 3.199784994125366, "learning_rate": 6.912783375314862e-05, "loss": 1.978, "step": 19460 }, { "epoch": 5.51997733068858, "grad_norm": 2.0286285877227783, "learning_rate": 6.897040302267003e-05, "loss": 1.8653, "step": 19480 }, { "epoch": 5.525644658543497, "grad_norm": 2.598414421081543, "learning_rate": 6.881297229219144e-05, "loss": 1.9011, "step": 19500 }, { "epoch": 5.525644658543497, "eval_loss": 2.4556543827056885, "eval_runtime": 4.793, "eval_samples_per_second": 20.864, "eval_steps_per_second": 5.216, "step": 19500 }, { "epoch": 5.531311986398413, "grad_norm": 1.8829158544540405, "learning_rate": 6.865554156171286e-05, "loss": 2.1263, "step": 19520 }, { "epoch": 5.53697931425333, "grad_norm": 3.8471462726593018, "learning_rate": 6.849811083123427e-05, "loss": 2.1645, "step": 19540 }, { "epoch": 5.542646642108246, "grad_norm": 4.183299541473389, "learning_rate": 6.834068010075567e-05, "loss": 1.8249, "step": 19560 }, { "epoch": 5.5483139699631625, "grad_norm": 3.22953200340271, "learning_rate": 6.818324937027708e-05, "loss": 2.2605, "step": 19580 }, { "epoch": 5.553981297818079, "grad_norm": 3.213674783706665, "learning_rate": 6.802581863979849e-05, "loss": 1.962, "step": 19600 }, { "epoch": 5.559648625672995, "grad_norm": 3.2854604721069336, "learning_rate": 6.78683879093199e-05, "loss": 2.1806, "step": 19620 }, { "epoch": 5.565315953527912, "grad_norm": 3.4986965656280518, "learning_rate": 6.771095717884132e-05, "loss": 2.1129, "step": 19640 }, { "epoch": 5.570983281382828, "grad_norm": 2.272303342819214, "learning_rate": 6.755352644836272e-05, "loss": 2.1014, "step": 19660 }, { "epoch": 5.576650609237745, "grad_norm": 2.313539743423462, "learning_rate": 6.739609571788413e-05, "loss": 1.6756, "step": 19680 }, { "epoch": 5.582317937092661, "grad_norm": 3.0942163467407227, "learning_rate": 6.723866498740554e-05, "loss": 2.1026, "step": 19700 }, { "epoch": 5.587985264947577, "grad_norm": 2.0947153568267822, "learning_rate": 6.708123425692696e-05, "loss": 2.002, "step": 19720 }, { "epoch": 5.593652592802494, "grad_norm": 3.4495205879211426, "learning_rate": 6.692380352644836e-05, "loss": 2.0464, "step": 19740 }, { "epoch": 5.59931992065741, "grad_norm": 1.86062490940094, "learning_rate": 6.676637279596978e-05, "loss": 1.8995, "step": 19760 }, { "epoch": 5.604987248512327, "grad_norm": 2.924778699874878, "learning_rate": 6.660894206549119e-05, "loss": 2.1324, "step": 19780 }, { "epoch": 5.610654576367243, "grad_norm": 1.6769875288009644, "learning_rate": 6.64515113350126e-05, "loss": 2.1029, "step": 19800 }, { "epoch": 5.6163219042221595, "grad_norm": 2.417696714401245, "learning_rate": 6.6294080604534e-05, "loss": 2.1541, "step": 19820 }, { "epoch": 5.621989232077076, "grad_norm": 3.1510069370269775, "learning_rate": 6.613664987405543e-05, "loss": 1.763, "step": 19840 }, { "epoch": 5.627656559931992, "grad_norm": 2.213958740234375, "learning_rate": 6.597921914357683e-05, "loss": 2.313, "step": 19860 }, { "epoch": 5.633323887786909, "grad_norm": 1.6285980939865112, "learning_rate": 6.582178841309824e-05, "loss": 2.3212, "step": 19880 }, { "epoch": 5.638991215641825, "grad_norm": 1.8207205533981323, "learning_rate": 6.566435768261965e-05, "loss": 2.1429, "step": 19900 }, { "epoch": 5.644658543496742, "grad_norm": 2.8851072788238525, "learning_rate": 6.550692695214105e-05, "loss": 2.2691, "step": 19920 }, { "epoch": 5.650325871351658, "grad_norm": 2.3610246181488037, "learning_rate": 6.534949622166248e-05, "loss": 2.3964, "step": 19940 }, { "epoch": 5.655993199206574, "grad_norm": 2.2136218547821045, "learning_rate": 6.519206549118388e-05, "loss": 2.1798, "step": 19960 }, { "epoch": 5.661660527061491, "grad_norm": 3.2735633850097656, "learning_rate": 6.503463476070529e-05, "loss": 1.8563, "step": 19980 }, { "epoch": 5.667327854916407, "grad_norm": 2.2639482021331787, "learning_rate": 6.48772040302267e-05, "loss": 2.1416, "step": 20000 }, { "epoch": 5.667327854916407, "eval_loss": 2.4388110637664795, "eval_runtime": 3.2548, "eval_samples_per_second": 30.723, "eval_steps_per_second": 7.681, "step": 20000 }, { "epoch": 5.672995182771324, "grad_norm": 2.9988245964050293, "learning_rate": 6.47197732997481e-05, "loss": 2.1224, "step": 20020 }, { "epoch": 5.67866251062624, "grad_norm": 1.75840425491333, "learning_rate": 6.456234256926952e-05, "loss": 1.926, "step": 20040 }, { "epoch": 5.6843298384811565, "grad_norm": 1.7853156328201294, "learning_rate": 6.440491183879093e-05, "loss": 1.9675, "step": 20060 }, { "epoch": 5.689997166336073, "grad_norm": 2.8419766426086426, "learning_rate": 6.424748110831235e-05, "loss": 2.2042, "step": 20080 }, { "epoch": 5.695664494190989, "grad_norm": 2.0991363525390625, "learning_rate": 6.409005037783376e-05, "loss": 2.1646, "step": 20100 }, { "epoch": 5.701331822045906, "grad_norm": 2.547128915786743, "learning_rate": 6.393261964735517e-05, "loss": 2.1156, "step": 20120 }, { "epoch": 5.706999149900822, "grad_norm": 2.9477779865264893, "learning_rate": 6.377518891687659e-05, "loss": 2.2143, "step": 20140 }, { "epoch": 5.7126664777557385, "grad_norm": 2.9247043132781982, "learning_rate": 6.3617758186398e-05, "loss": 2.2349, "step": 20160 }, { "epoch": 5.718333805610655, "grad_norm": 1.541329026222229, "learning_rate": 6.34603274559194e-05, "loss": 1.944, "step": 20180 }, { "epoch": 5.7240011334655705, "grad_norm": 3.0165555477142334, "learning_rate": 6.330289672544081e-05, "loss": 2.1221, "step": 20200 }, { "epoch": 5.729668461320488, "grad_norm": 2.2763922214508057, "learning_rate": 6.314546599496221e-05, "loss": 1.9722, "step": 20220 }, { "epoch": 5.735335789175403, "grad_norm": 1.970175862312317, "learning_rate": 6.298803526448362e-05, "loss": 2.0935, "step": 20240 }, { "epoch": 5.741003117030321, "grad_norm": 3.0834615230560303, "learning_rate": 6.283060453400504e-05, "loss": 1.8963, "step": 20260 }, { "epoch": 5.746670444885236, "grad_norm": 3.1697397232055664, "learning_rate": 6.267317380352645e-05, "loss": 1.7621, "step": 20280 }, { "epoch": 5.7523377727401535, "grad_norm": 1.9718819856643677, "learning_rate": 6.251574307304786e-05, "loss": 2.0025, "step": 20300 }, { "epoch": 5.758005100595069, "grad_norm": 2.329387903213501, "learning_rate": 6.235831234256926e-05, "loss": 2.2388, "step": 20320 }, { "epoch": 5.763672428449986, "grad_norm": 2.304124593734741, "learning_rate": 6.220088161209068e-05, "loss": 2.0768, "step": 20340 }, { "epoch": 5.769339756304902, "grad_norm": 2.7573482990264893, "learning_rate": 6.204345088161209e-05, "loss": 2.0539, "step": 20360 }, { "epoch": 5.775007084159819, "grad_norm": 2.4950790405273438, "learning_rate": 6.188602015113351e-05, "loss": 2.0682, "step": 20380 }, { "epoch": 5.780674412014735, "grad_norm": 1.5820170640945435, "learning_rate": 6.172858942065492e-05, "loss": 2.1376, "step": 20400 }, { "epoch": 5.786341739869652, "grad_norm": 1.5796340703964233, "learning_rate": 6.157115869017633e-05, "loss": 1.8318, "step": 20420 }, { "epoch": 5.7920090677245675, "grad_norm": 2.9279041290283203, "learning_rate": 6.141372795969773e-05, "loss": 2.1727, "step": 20440 }, { "epoch": 5.797676395579484, "grad_norm": 2.1237261295318604, "learning_rate": 6.125629722921915e-05, "loss": 2.0071, "step": 20460 }, { "epoch": 5.8033437234344, "grad_norm": 3.1060469150543213, "learning_rate": 6.109886649874056e-05, "loss": 2.0069, "step": 20480 }, { "epoch": 5.809011051289317, "grad_norm": 3.726361036300659, "learning_rate": 6.094143576826197e-05, "loss": 2.149, "step": 20500 }, { "epoch": 5.809011051289317, "eval_loss": 2.4449756145477295, "eval_runtime": 4.5326, "eval_samples_per_second": 22.062, "eval_steps_per_second": 5.516, "step": 20500 }, { "epoch": 5.814678379144233, "grad_norm": 3.7346291542053223, "learning_rate": 6.0784005037783375e-05, "loss": 1.9595, "step": 20520 }, { "epoch": 5.8203457069991495, "grad_norm": 2.049481153488159, "learning_rate": 6.062657430730478e-05, "loss": 1.9061, "step": 20540 }, { "epoch": 5.826013034854066, "grad_norm": 1.9789921045303345, "learning_rate": 6.04691435768262e-05, "loss": 1.9476, "step": 20560 }, { "epoch": 5.831680362708982, "grad_norm": 3.6475727558135986, "learning_rate": 6.031171284634761e-05, "loss": 2.1275, "step": 20580 }, { "epoch": 5.837347690563899, "grad_norm": 3.771057367324829, "learning_rate": 6.0154282115869023e-05, "loss": 2.1838, "step": 20600 }, { "epoch": 5.843015018418815, "grad_norm": 2.299187421798706, "learning_rate": 5.999685138539043e-05, "loss": 2.1614, "step": 20620 }, { "epoch": 5.848682346273732, "grad_norm": 2.180260419845581, "learning_rate": 5.983942065491184e-05, "loss": 2.201, "step": 20640 }, { "epoch": 5.854349674128648, "grad_norm": 1.945426344871521, "learning_rate": 5.968198992443326e-05, "loss": 1.8653, "step": 20660 }, { "epoch": 5.8600170019835645, "grad_norm": 2.7463645935058594, "learning_rate": 5.9524559193954665e-05, "loss": 1.8955, "step": 20680 }, { "epoch": 5.865684329838481, "grad_norm": 3.5293266773223877, "learning_rate": 5.936712846347607e-05, "loss": 2.0823, "step": 20700 }, { "epoch": 5.871351657693397, "grad_norm": 2.8452839851379395, "learning_rate": 5.920969773299748e-05, "loss": 2.0841, "step": 20720 }, { "epoch": 5.877018985548314, "grad_norm": 2.2791764736175537, "learning_rate": 5.905226700251889e-05, "loss": 2.1591, "step": 20740 }, { "epoch": 5.88268631340323, "grad_norm": 1.83878755569458, "learning_rate": 5.889483627204031e-05, "loss": 2.1694, "step": 20760 }, { "epoch": 5.8883536412581465, "grad_norm": 2.8884003162384033, "learning_rate": 5.873740554156172e-05, "loss": 2.0612, "step": 20780 }, { "epoch": 5.894020969113063, "grad_norm": 2.4525411128997803, "learning_rate": 5.857997481108313e-05, "loss": 1.9347, "step": 20800 }, { "epoch": 5.899688296967979, "grad_norm": 2.8223187923431396, "learning_rate": 5.8422544080604535e-05, "loss": 1.8984, "step": 20820 }, { "epoch": 5.905355624822896, "grad_norm": 2.578632116317749, "learning_rate": 5.826511335012594e-05, "loss": 2.0802, "step": 20840 }, { "epoch": 5.911022952677812, "grad_norm": 2.3425028324127197, "learning_rate": 5.810768261964736e-05, "loss": 2.059, "step": 20860 }, { "epoch": 5.916690280532729, "grad_norm": 2.963291883468628, "learning_rate": 5.795025188916877e-05, "loss": 2.1417, "step": 20880 }, { "epoch": 5.922357608387645, "grad_norm": 2.0896551609039307, "learning_rate": 5.7792821158690184e-05, "loss": 2.2094, "step": 20900 }, { "epoch": 5.928024936242561, "grad_norm": 2.2109158039093018, "learning_rate": 5.763539042821159e-05, "loss": 2.1535, "step": 20920 }, { "epoch": 5.933692264097478, "grad_norm": 2.881401777267456, "learning_rate": 5.7477959697733e-05, "loss": 2.3234, "step": 20940 }, { "epoch": 5.939359591952394, "grad_norm": 1.7358540296554565, "learning_rate": 5.7320528967254405e-05, "loss": 1.9131, "step": 20960 }, { "epoch": 5.945026919807311, "grad_norm": 3.1998605728149414, "learning_rate": 5.7163098236775825e-05, "loss": 1.9332, "step": 20980 }, { "epoch": 5.950694247662227, "grad_norm": 3.824021816253662, "learning_rate": 5.700566750629723e-05, "loss": 2.0365, "step": 21000 }, { "epoch": 5.950694247662227, "eval_loss": 2.425581216812134, "eval_runtime": 4.3456, "eval_samples_per_second": 23.012, "eval_steps_per_second": 5.753, "step": 21000 }, { "epoch": 5.9563615755171435, "grad_norm": 2.502293586730957, "learning_rate": 5.684823677581864e-05, "loss": 2.2499, "step": 21020 }, { "epoch": 5.96202890337206, "grad_norm": 3.378965139389038, "learning_rate": 5.669080604534005e-05, "loss": 2.0264, "step": 21040 }, { "epoch": 5.967696231226976, "grad_norm": 3.0274112224578857, "learning_rate": 5.653337531486146e-05, "loss": 2.2812, "step": 21060 }, { "epoch": 5.973363559081893, "grad_norm": 2.1198225021362305, "learning_rate": 5.637594458438288e-05, "loss": 2.0689, "step": 21080 }, { "epoch": 5.979030886936809, "grad_norm": 2.6119089126586914, "learning_rate": 5.621851385390429e-05, "loss": 2.3771, "step": 21100 }, { "epoch": 5.984698214791726, "grad_norm": 2.908874988555908, "learning_rate": 5.6061083123425695e-05, "loss": 2.1518, "step": 21120 }, { "epoch": 5.990365542646642, "grad_norm": 2.877047300338745, "learning_rate": 5.59036523929471e-05, "loss": 2.1545, "step": 21140 }, { "epoch": 5.996032870501558, "grad_norm": 2.6687257289886475, "learning_rate": 5.574622166246851e-05, "loss": 2.1517, "step": 21160 }, { "epoch": 6.001700198356475, "grad_norm": 2.606386423110962, "learning_rate": 5.558879093198993e-05, "loss": 2.0432, "step": 21180 }, { "epoch": 6.007367526211391, "grad_norm": 2.192491054534912, "learning_rate": 5.543136020151134e-05, "loss": 1.8915, "step": 21200 }, { "epoch": 6.013034854066308, "grad_norm": 2.129635810852051, "learning_rate": 5.527392947103275e-05, "loss": 2.1884, "step": 21220 }, { "epoch": 6.018702181921224, "grad_norm": 1.8217124938964844, "learning_rate": 5.511649874055416e-05, "loss": 1.9591, "step": 21240 }, { "epoch": 6.0243695097761405, "grad_norm": 2.8712050914764404, "learning_rate": 5.4959068010075565e-05, "loss": 1.9833, "step": 21260 }, { "epoch": 6.030036837631057, "grad_norm": 2.6022608280181885, "learning_rate": 5.4801637279596985e-05, "loss": 1.8921, "step": 21280 }, { "epoch": 6.035704165485973, "grad_norm": 1.53400719165802, "learning_rate": 5.464420654911839e-05, "loss": 1.9038, "step": 21300 }, { "epoch": 6.04137149334089, "grad_norm": 2.845515489578247, "learning_rate": 5.44867758186398e-05, "loss": 2.0025, "step": 21320 }, { "epoch": 6.047038821195806, "grad_norm": 2.194345474243164, "learning_rate": 5.4329345088161207e-05, "loss": 2.0643, "step": 21340 }, { "epoch": 6.052706149050723, "grad_norm": 2.9751715660095215, "learning_rate": 5.417191435768262e-05, "loss": 1.9796, "step": 21360 }, { "epoch": 6.058373476905639, "grad_norm": 2.94996976852417, "learning_rate": 5.401448362720404e-05, "loss": 2.031, "step": 21380 }, { "epoch": 6.064040804760555, "grad_norm": 3.1982829570770264, "learning_rate": 5.385705289672545e-05, "loss": 1.7422, "step": 21400 }, { "epoch": 6.069708132615472, "grad_norm": 3.71414852142334, "learning_rate": 5.3699622166246855e-05, "loss": 2.252, "step": 21420 }, { "epoch": 6.075375460470388, "grad_norm": 2.8651375770568848, "learning_rate": 5.354219143576826e-05, "loss": 2.1711, "step": 21440 }, { "epoch": 6.081042788325305, "grad_norm": 2.4566287994384766, "learning_rate": 5.338476070528967e-05, "loss": 2.0057, "step": 21460 }, { "epoch": 6.086710116180221, "grad_norm": 4.084034442901611, "learning_rate": 5.322732997481109e-05, "loss": 2.0664, "step": 21480 }, { "epoch": 6.0923774440351375, "grad_norm": 2.8006837368011475, "learning_rate": 5.30698992443325e-05, "loss": 1.9168, "step": 21500 }, { "epoch": 6.0923774440351375, "eval_loss": 2.417039394378662, "eval_runtime": 3.2893, "eval_samples_per_second": 30.401, "eval_steps_per_second": 7.6, "step": 21500 }, { "epoch": 6.098044771890054, "grad_norm": 1.4964312314987183, "learning_rate": 5.291246851385391e-05, "loss": 2.049, "step": 21520 }, { "epoch": 6.10371209974497, "grad_norm": 3.0628747940063477, "learning_rate": 5.275503778337532e-05, "loss": 2.0734, "step": 21540 }, { "epoch": 6.109379427599887, "grad_norm": 3.056234121322632, "learning_rate": 5.2597607052896725e-05, "loss": 2.0086, "step": 21560 }, { "epoch": 6.115046755454803, "grad_norm": 2.1585164070129395, "learning_rate": 5.244017632241813e-05, "loss": 1.8124, "step": 21580 }, { "epoch": 6.1207140833097196, "grad_norm": 2.578914165496826, "learning_rate": 5.228274559193955e-05, "loss": 1.9807, "step": 21600 }, { "epoch": 6.126381411164636, "grad_norm": 2.3548150062561035, "learning_rate": 5.212531486146096e-05, "loss": 1.7482, "step": 21620 }, { "epoch": 6.132048739019552, "grad_norm": 2.7067956924438477, "learning_rate": 5.196788413098237e-05, "loss": 2.0196, "step": 21640 }, { "epoch": 6.137716066874469, "grad_norm": 3.80324125289917, "learning_rate": 5.181045340050378e-05, "loss": 2.0129, "step": 21660 }, { "epoch": 6.143383394729385, "grad_norm": 2.598961353302002, "learning_rate": 5.165302267002519e-05, "loss": 1.9477, "step": 21680 }, { "epoch": 6.149050722584302, "grad_norm": 2.536165237426758, "learning_rate": 5.149559193954661e-05, "loss": 1.8108, "step": 21700 }, { "epoch": 6.154718050439218, "grad_norm": 2.657949209213257, "learning_rate": 5.1338161209068015e-05, "loss": 1.9341, "step": 21720 }, { "epoch": 6.1603853782941345, "grad_norm": 1.8108060359954834, "learning_rate": 5.118073047858942e-05, "loss": 2.1234, "step": 21740 }, { "epoch": 6.166052706149051, "grad_norm": 2.3113696575164795, "learning_rate": 5.102329974811083e-05, "loss": 1.6216, "step": 21760 }, { "epoch": 6.171720034003967, "grad_norm": 2.1336653232574463, "learning_rate": 5.0865869017632236e-05, "loss": 1.9406, "step": 21780 }, { "epoch": 6.177387361858884, "grad_norm": 3.085634708404541, "learning_rate": 5.070843828715366e-05, "loss": 2.0823, "step": 21800 }, { "epoch": 6.1830546897138, "grad_norm": 3.2689034938812256, "learning_rate": 5.0551007556675064e-05, "loss": 2.0532, "step": 21820 }, { "epoch": 6.1887220175687165, "grad_norm": 3.206812858581543, "learning_rate": 5.039357682619648e-05, "loss": 2.1314, "step": 21840 }, { "epoch": 6.194389345423633, "grad_norm": 3.4216408729553223, "learning_rate": 5.0236146095717885e-05, "loss": 2.1431, "step": 21860 }, { "epoch": 6.200056673278549, "grad_norm": 2.8522963523864746, "learning_rate": 5.007871536523929e-05, "loss": 1.9482, "step": 21880 }, { "epoch": 6.205724001133466, "grad_norm": 2.8254451751708984, "learning_rate": 4.9921284634760706e-05, "loss": 1.7052, "step": 21900 }, { "epoch": 6.211391328988382, "grad_norm": 1.5718283653259277, "learning_rate": 4.976385390428212e-05, "loss": 1.855, "step": 21920 }, { "epoch": 6.217058656843299, "grad_norm": 1.634379267692566, "learning_rate": 4.960642317380353e-05, "loss": 2.0327, "step": 21940 }, { "epoch": 6.222725984698215, "grad_norm": 2.30794620513916, "learning_rate": 4.944899244332494e-05, "loss": 2.1794, "step": 21960 }, { "epoch": 6.2283933125531314, "grad_norm": 2.097869396209717, "learning_rate": 4.9291561712846354e-05, "loss": 1.7293, "step": 21980 }, { "epoch": 6.234060640408048, "grad_norm": 1.5178961753845215, "learning_rate": 4.913413098236776e-05, "loss": 2.1555, "step": 22000 }, { "epoch": 6.234060640408048, "eval_loss": 2.419494867324829, "eval_runtime": 3.4225, "eval_samples_per_second": 29.219, "eval_steps_per_second": 7.305, "step": 22000 }, { "epoch": 6.239727968262964, "grad_norm": 2.101348876953125, "learning_rate": 4.8976700251889175e-05, "loss": 2.1382, "step": 22020 }, { "epoch": 6.245395296117881, "grad_norm": 2.0564188957214355, "learning_rate": 4.881926952141058e-05, "loss": 2.1028, "step": 22040 }, { "epoch": 6.251062623972797, "grad_norm": 2.433161973953247, "learning_rate": 4.866183879093199e-05, "loss": 2.1561, "step": 22060 }, { "epoch": 6.2567299518277135, "grad_norm": 1.7992278337478638, "learning_rate": 4.85044080604534e-05, "loss": 1.9856, "step": 22080 }, { "epoch": 6.26239727968263, "grad_norm": 3.732327699661255, "learning_rate": 4.834697732997481e-05, "loss": 2.1425, "step": 22100 }, { "epoch": 6.268064607537546, "grad_norm": 2.8461897373199463, "learning_rate": 4.8189546599496224e-05, "loss": 1.9051, "step": 22120 }, { "epoch": 6.273731935392463, "grad_norm": 4.2314982414245605, "learning_rate": 4.803211586901764e-05, "loss": 2.0748, "step": 22140 }, { "epoch": 6.279399263247379, "grad_norm": 2.8581454753875732, "learning_rate": 4.7874685138539045e-05, "loss": 2.1782, "step": 22160 }, { "epoch": 6.285066591102296, "grad_norm": 2.3766987323760986, "learning_rate": 4.771725440806046e-05, "loss": 2.3411, "step": 22180 }, { "epoch": 6.290733918957212, "grad_norm": 3.0754947662353516, "learning_rate": 4.7559823677581866e-05, "loss": 1.8517, "step": 22200 }, { "epoch": 6.296401246812128, "grad_norm": 1.697813630104065, "learning_rate": 4.740239294710327e-05, "loss": 2.0749, "step": 22220 }, { "epoch": 6.302068574667045, "grad_norm": 2.6863694190979004, "learning_rate": 4.724496221662469e-05, "loss": 2.0784, "step": 22240 }, { "epoch": 6.307735902521961, "grad_norm": 3.288961410522461, "learning_rate": 4.7087531486146094e-05, "loss": 2.0988, "step": 22260 }, { "epoch": 6.313403230376878, "grad_norm": 3.573582887649536, "learning_rate": 4.693010075566751e-05, "loss": 1.7269, "step": 22280 }, { "epoch": 6.319070558231794, "grad_norm": 4.229494094848633, "learning_rate": 4.677267002518892e-05, "loss": 1.8333, "step": 22300 }, { "epoch": 6.3247378860867105, "grad_norm": 2.7735462188720703, "learning_rate": 4.661523929471033e-05, "loss": 2.1231, "step": 22320 }, { "epoch": 6.330405213941627, "grad_norm": 2.1033589839935303, "learning_rate": 4.645780856423174e-05, "loss": 1.936, "step": 22340 }, { "epoch": 6.336072541796543, "grad_norm": 2.011934280395508, "learning_rate": 4.630037783375315e-05, "loss": 2.1851, "step": 22360 }, { "epoch": 6.34173986965146, "grad_norm": 4.051826477050781, "learning_rate": 4.6142947103274563e-05, "loss": 2.0794, "step": 22380 }, { "epoch": 6.347407197506376, "grad_norm": 3.3374674320220947, "learning_rate": 4.598551637279597e-05, "loss": 1.9463, "step": 22400 }, { "epoch": 6.353074525361292, "grad_norm": 3.2879014015197754, "learning_rate": 4.5828085642317384e-05, "loss": 2.1256, "step": 22420 }, { "epoch": 6.358741853216209, "grad_norm": 2.4399218559265137, "learning_rate": 4.56706549118388e-05, "loss": 1.9873, "step": 22440 }, { "epoch": 6.3644091810711245, "grad_norm": 3.563060760498047, "learning_rate": 4.5513224181360205e-05, "loss": 2.3416, "step": 22460 }, { "epoch": 6.370076508926042, "grad_norm": 3.1526973247528076, "learning_rate": 4.535579345088161e-05, "loss": 2.154, "step": 22480 }, { "epoch": 6.375743836780957, "grad_norm": 2.7281911373138428, "learning_rate": 4.5198362720403026e-05, "loss": 2.0652, "step": 22500 }, { "epoch": 6.375743836780957, "eval_loss": 2.4025583267211914, "eval_runtime": 3.1355, "eval_samples_per_second": 31.893, "eval_steps_per_second": 7.973, "step": 22500 }, { "epoch": 6.381411164635875, "grad_norm": 3.6995720863342285, "learning_rate": 4.504093198992443e-05, "loss": 2.1297, "step": 22520 }, { "epoch": 6.38707849249079, "grad_norm": 2.3185689449310303, "learning_rate": 4.488350125944585e-05, "loss": 2.3145, "step": 22540 }, { "epoch": 6.392745820345707, "grad_norm": 2.000162363052368, "learning_rate": 4.4726070528967254e-05, "loss": 1.9953, "step": 22560 }, { "epoch": 6.398413148200623, "grad_norm": 2.200204372406006, "learning_rate": 4.456863979848867e-05, "loss": 2.191, "step": 22580 }, { "epoch": 6.404080476055539, "grad_norm": 1.8371127843856812, "learning_rate": 4.441120906801008e-05, "loss": 1.9409, "step": 22600 }, { "epoch": 6.409747803910456, "grad_norm": 1.8733766078948975, "learning_rate": 4.425377833753149e-05, "loss": 2.1325, "step": 22620 }, { "epoch": 6.415415131765372, "grad_norm": 2.3495383262634277, "learning_rate": 4.40963476070529e-05, "loss": 2.0162, "step": 22640 }, { "epoch": 6.421082459620289, "grad_norm": 2.6760189533233643, "learning_rate": 4.393891687657431e-05, "loss": 1.8172, "step": 22660 }, { "epoch": 6.426749787475205, "grad_norm": 2.134878635406494, "learning_rate": 4.378148614609572e-05, "loss": 2.1924, "step": 22680 }, { "epoch": 6.4324171153301215, "grad_norm": 3.227609872817993, "learning_rate": 4.362405541561713e-05, "loss": 2.3214, "step": 22700 }, { "epoch": 6.438084443185038, "grad_norm": 4.216457843780518, "learning_rate": 4.346662468513854e-05, "loss": 2.1921, "step": 22720 }, { "epoch": 6.443751771039954, "grad_norm": 2.988720178604126, "learning_rate": 4.330919395465995e-05, "loss": 1.9442, "step": 22740 }, { "epoch": 6.449419098894871, "grad_norm": 3.348069906234741, "learning_rate": 4.3151763224181365e-05, "loss": 2.132, "step": 22760 }, { "epoch": 6.455086426749787, "grad_norm": 3.3704237937927246, "learning_rate": 4.299433249370277e-05, "loss": 2.1069, "step": 22780 }, { "epoch": 6.460753754604704, "grad_norm": 3.2031679153442383, "learning_rate": 4.2836901763224186e-05, "loss": 2.1234, "step": 22800 }, { "epoch": 6.46642108245962, "grad_norm": 4.010575294494629, "learning_rate": 4.267947103274559e-05, "loss": 2.1434, "step": 22820 }, { "epoch": 6.472088410314536, "grad_norm": 4.687906265258789, "learning_rate": 4.2522040302267e-05, "loss": 1.7897, "step": 22840 }, { "epoch": 6.477755738169453, "grad_norm": 3.302692174911499, "learning_rate": 4.2364609571788414e-05, "loss": 2.2382, "step": 22860 }, { "epoch": 6.483423066024369, "grad_norm": 2.3680787086486816, "learning_rate": 4.220717884130982e-05, "loss": 1.9439, "step": 22880 }, { "epoch": 6.489090393879286, "grad_norm": 3.0420186519622803, "learning_rate": 4.204974811083124e-05, "loss": 2.026, "step": 22900 }, { "epoch": 6.494757721734202, "grad_norm": 3.7600269317626953, "learning_rate": 4.189231738035265e-05, "loss": 1.8427, "step": 22920 }, { "epoch": 6.5004250495891185, "grad_norm": 3.5169217586517334, "learning_rate": 4.1734886649874056e-05, "loss": 2.3191, "step": 22940 }, { "epoch": 6.506092377444035, "grad_norm": 1.6750386953353882, "learning_rate": 4.157745591939547e-05, "loss": 2.2457, "step": 22960 }, { "epoch": 6.511759705298951, "grad_norm": 3.4677793979644775, "learning_rate": 4.142002518891688e-05, "loss": 2.3593, "step": 22980 }, { "epoch": 6.517427033153868, "grad_norm": 2.082308053970337, "learning_rate": 4.126259445843829e-05, "loss": 1.9169, "step": 23000 }, { "epoch": 6.517427033153868, "eval_loss": 2.400475263595581, "eval_runtime": 7.3557, "eval_samples_per_second": 13.595, "eval_steps_per_second": 3.399, "step": 23000 }, { "epoch": 6.523094361008784, "grad_norm": 1.8431912660598755, "learning_rate": 4.11051637279597e-05, "loss": 2.016, "step": 23020 }, { "epoch": 6.528761688863701, "grad_norm": 2.5485291481018066, "learning_rate": 4.094773299748111e-05, "loss": 1.8217, "step": 23040 }, { "epoch": 6.534429016718617, "grad_norm": 2.9542415142059326, "learning_rate": 4.0790302267002525e-05, "loss": 2.0637, "step": 23060 }, { "epoch": 6.540096344573533, "grad_norm": 3.4550628662109375, "learning_rate": 4.063287153652393e-05, "loss": 2.1011, "step": 23080 }, { "epoch": 6.54576367242845, "grad_norm": 3.640573024749756, "learning_rate": 4.047544080604534e-05, "loss": 2.1174, "step": 23100 }, { "epoch": 6.551431000283366, "grad_norm": 2.1184628009796143, "learning_rate": 4.0318010075566753e-05, "loss": 2.1483, "step": 23120 }, { "epoch": 6.557098328138283, "grad_norm": 2.6833693981170654, "learning_rate": 4.016057934508816e-05, "loss": 2.0994, "step": 23140 }, { "epoch": 6.562765655993199, "grad_norm": 3.4621665477752686, "learning_rate": 4.0003148614609574e-05, "loss": 2.0037, "step": 23160 }, { "epoch": 6.5684329838481155, "grad_norm": 3.1389856338500977, "learning_rate": 3.984571788413098e-05, "loss": 2.0209, "step": 23180 }, { "epoch": 6.574100311703032, "grad_norm": 3.0767643451690674, "learning_rate": 3.9688287153652395e-05, "loss": 2.2451, "step": 23200 }, { "epoch": 6.579767639557948, "grad_norm": 2.2055115699768066, "learning_rate": 3.953085642317381e-05, "loss": 1.876, "step": 23220 }, { "epoch": 6.585434967412865, "grad_norm": 2.7509560585021973, "learning_rate": 3.9373425692695216e-05, "loss": 2.1596, "step": 23240 }, { "epoch": 6.591102295267781, "grad_norm": 2.9830451011657715, "learning_rate": 3.922386649874056e-05, "loss": 1.9632, "step": 23260 }, { "epoch": 6.5967696231226975, "grad_norm": 2.0715694427490234, "learning_rate": 3.9066435768261965e-05, "loss": 2.1354, "step": 23280 }, { "epoch": 6.602436950977614, "grad_norm": 2.890753984451294, "learning_rate": 3.890900503778337e-05, "loss": 2.1789, "step": 23300 }, { "epoch": 6.60810427883253, "grad_norm": 2.357600688934326, "learning_rate": 3.8751574307304786e-05, "loss": 2.2615, "step": 23320 }, { "epoch": 6.613771606687447, "grad_norm": 2.641329526901245, "learning_rate": 3.85941435768262e-05, "loss": 2.1198, "step": 23340 }, { "epoch": 6.619438934542363, "grad_norm": 4.01620626449585, "learning_rate": 3.843671284634761e-05, "loss": 2.0736, "step": 23360 }, { "epoch": 6.62510626239728, "grad_norm": 2.8899033069610596, "learning_rate": 3.827928211586902e-05, "loss": 2.2633, "step": 23380 }, { "epoch": 6.630773590252196, "grad_norm": 2.8832154273986816, "learning_rate": 3.812185138539043e-05, "loss": 2.0364, "step": 23400 }, { "epoch": 6.6364409181071125, "grad_norm": 3.7245821952819824, "learning_rate": 3.796442065491184e-05, "loss": 2.0257, "step": 23420 }, { "epoch": 6.642108245962029, "grad_norm": 2.1374807357788086, "learning_rate": 3.780698992443325e-05, "loss": 2.2586, "step": 23440 }, { "epoch": 6.647775573816945, "grad_norm": 2.7670538425445557, "learning_rate": 3.764955919395466e-05, "loss": 2.1452, "step": 23460 }, { "epoch": 6.653442901671862, "grad_norm": 2.1952292919158936, "learning_rate": 3.7492128463476076e-05, "loss": 2.0716, "step": 23480 }, { "epoch": 6.659110229526778, "grad_norm": 2.0418331623077393, "learning_rate": 3.733469773299748e-05, "loss": 2.0375, "step": 23500 }, { "epoch": 6.659110229526778, "eval_loss": 2.3889718055725098, "eval_runtime": 3.1647, "eval_samples_per_second": 31.599, "eval_steps_per_second": 7.9, "step": 23500 }, { "epoch": 6.6647775573816945, "grad_norm": 4.044829845428467, "learning_rate": 3.71772670025189e-05, "loss": 2.044, "step": 23520 }, { "epoch": 6.670444885236611, "grad_norm": 3.628307819366455, "learning_rate": 3.7019836272040304e-05, "loss": 2.1963, "step": 23540 }, { "epoch": 6.676112213091527, "grad_norm": 3.4848392009735107, "learning_rate": 3.686240554156171e-05, "loss": 2.3557, "step": 23560 }, { "epoch": 6.681779540946444, "grad_norm": 2.013045310974121, "learning_rate": 3.6704974811083125e-05, "loss": 2.0788, "step": 23580 }, { "epoch": 6.68744686880136, "grad_norm": 2.592710494995117, "learning_rate": 3.654754408060453e-05, "loss": 1.8563, "step": 23600 }, { "epoch": 6.693114196656277, "grad_norm": 1.6886849403381348, "learning_rate": 3.6390113350125946e-05, "loss": 1.8757, "step": 23620 }, { "epoch": 6.698781524511193, "grad_norm": 1.5967741012573242, "learning_rate": 3.623268261964736e-05, "loss": 1.955, "step": 23640 }, { "epoch": 6.704448852366109, "grad_norm": 1.3829079866409302, "learning_rate": 3.6075251889168767e-05, "loss": 1.7081, "step": 23660 }, { "epoch": 6.710116180221026, "grad_norm": 2.1749722957611084, "learning_rate": 3.591782115869018e-05, "loss": 2.1558, "step": 23680 }, { "epoch": 6.715783508075942, "grad_norm": 2.3803274631500244, "learning_rate": 3.576039042821159e-05, "loss": 2.2897, "step": 23700 }, { "epoch": 6.721450835930859, "grad_norm": 3.562082052230835, "learning_rate": 3.5602959697733e-05, "loss": 1.9846, "step": 23720 }, { "epoch": 6.727118163785775, "grad_norm": 3.09842586517334, "learning_rate": 3.544552896725441e-05, "loss": 2.0617, "step": 23740 }, { "epoch": 6.7327854916406915, "grad_norm": 2.9970357418060303, "learning_rate": 3.529596977329975e-05, "loss": 2.0955, "step": 23760 }, { "epoch": 6.738452819495608, "grad_norm": 2.6218104362487793, "learning_rate": 3.5138539042821164e-05, "loss": 1.8631, "step": 23780 }, { "epoch": 6.744120147350524, "grad_norm": 2.9563047885894775, "learning_rate": 3.498110831234257e-05, "loss": 1.9011, "step": 23800 }, { "epoch": 6.749787475205441, "grad_norm": 2.3660473823547363, "learning_rate": 3.4823677581863984e-05, "loss": 1.7559, "step": 23820 }, { "epoch": 6.755454803060357, "grad_norm": 2.2849748134613037, "learning_rate": 3.466624685138539e-05, "loss": 2.1006, "step": 23840 }, { "epoch": 6.761122130915274, "grad_norm": 1.2344627380371094, "learning_rate": 3.45088161209068e-05, "loss": 2.2032, "step": 23860 }, { "epoch": 6.76678945877019, "grad_norm": 2.2480275630950928, "learning_rate": 3.435138539042821e-05, "loss": 2.0365, "step": 23880 }, { "epoch": 6.772456786625106, "grad_norm": 3.194091558456421, "learning_rate": 3.4193954659949626e-05, "loss": 2.0627, "step": 23900 }, { "epoch": 6.778124114480023, "grad_norm": 2.006216049194336, "learning_rate": 3.403652392947104e-05, "loss": 1.9802, "step": 23920 }, { "epoch": 6.783791442334939, "grad_norm": 3.065403938293457, "learning_rate": 3.387909319899245e-05, "loss": 1.7741, "step": 23940 }, { "epoch": 6.789458770189856, "grad_norm": 4.049249649047852, "learning_rate": 3.3721662468513854e-05, "loss": 2.0386, "step": 23960 }, { "epoch": 6.795126098044772, "grad_norm": 3.015514612197876, "learning_rate": 3.356423173803527e-05, "loss": 2.0937, "step": 23980 }, { "epoch": 6.8007934258996885, "grad_norm": 4.187402248382568, "learning_rate": 3.3406801007556675e-05, "loss": 2.3612, "step": 24000 }, { "epoch": 6.8007934258996885, "eval_loss": 2.3968515396118164, "eval_runtime": 6.1844, "eval_samples_per_second": 16.17, "eval_steps_per_second": 4.042, "step": 24000 }, { "epoch": 6.806460753754605, "grad_norm": 1.5942758321762085, "learning_rate": 3.324937027707808e-05, "loss": 2.2706, "step": 24020 }, { "epoch": 6.812128081609521, "grad_norm": 2.6515204906463623, "learning_rate": 3.3091939546599496e-05, "loss": 1.8543, "step": 24040 }, { "epoch": 6.817795409464438, "grad_norm": 2.6869423389434814, "learning_rate": 3.293450881612091e-05, "loss": 1.6832, "step": 24060 }, { "epoch": 6.823462737319354, "grad_norm": 4.126588821411133, "learning_rate": 3.2777078085642324e-05, "loss": 2.1869, "step": 24080 }, { "epoch": 6.829130065174271, "grad_norm": 2.4013831615448, "learning_rate": 3.261964735516373e-05, "loss": 1.9895, "step": 24100 }, { "epoch": 6.834797393029187, "grad_norm": 2.9623374938964844, "learning_rate": 3.246221662468514e-05, "loss": 2.2721, "step": 24120 }, { "epoch": 6.840464720884103, "grad_norm": 3.5471584796905518, "learning_rate": 3.230478589420655e-05, "loss": 2.0904, "step": 24140 }, { "epoch": 6.84613204873902, "grad_norm": 2.671138286590576, "learning_rate": 3.214735516372796e-05, "loss": 1.9944, "step": 24160 }, { "epoch": 6.851799376593936, "grad_norm": 3.6440911293029785, "learning_rate": 3.198992443324937e-05, "loss": 2.0819, "step": 24180 }, { "epoch": 6.857466704448853, "grad_norm": 2.443720817565918, "learning_rate": 3.183249370277078e-05, "loss": 1.9835, "step": 24200 }, { "epoch": 6.863134032303769, "grad_norm": 3.235928773880005, "learning_rate": 3.1675062972292193e-05, "loss": 2.0605, "step": 24220 }, { "epoch": 6.8688013601586855, "grad_norm": 3.3158023357391357, "learning_rate": 3.151763224181361e-05, "loss": 2.1016, "step": 24240 }, { "epoch": 6.874468688013602, "grad_norm": 2.968862533569336, "learning_rate": 3.1360201511335014e-05, "loss": 2.0398, "step": 24260 }, { "epoch": 6.880136015868518, "grad_norm": 2.610361337661743, "learning_rate": 3.120277078085643e-05, "loss": 2.0344, "step": 24280 }, { "epoch": 6.885803343723435, "grad_norm": 3.4181952476501465, "learning_rate": 3.1045340050377835e-05, "loss": 2.0948, "step": 24300 }, { "epoch": 6.891470671578351, "grad_norm": 2.736668586730957, "learning_rate": 3.088790931989924e-05, "loss": 1.8148, "step": 24320 }, { "epoch": 6.8971379994332676, "grad_norm": 2.8229641914367676, "learning_rate": 3.0730478589420656e-05, "loss": 2.1848, "step": 24340 }, { "epoch": 6.902805327288184, "grad_norm": 3.065139055252075, "learning_rate": 3.057304785894206e-05, "loss": 2.036, "step": 24360 }, { "epoch": 6.9084726551431, "grad_norm": 2.5566601753234863, "learning_rate": 3.0415617128463474e-05, "loss": 1.896, "step": 24380 }, { "epoch": 6.914139982998017, "grad_norm": 2.5570871829986572, "learning_rate": 3.0258186397984887e-05, "loss": 2.1756, "step": 24400 }, { "epoch": 6.919807310852933, "grad_norm": 3.557962656021118, "learning_rate": 3.0100755667506298e-05, "loss": 2.1778, "step": 24420 }, { "epoch": 6.92547463870785, "grad_norm": 1.821282982826233, "learning_rate": 2.9943324937027712e-05, "loss": 1.8592, "step": 24440 }, { "epoch": 6.931141966562766, "grad_norm": 1.7148374319076538, "learning_rate": 2.978589420654912e-05, "loss": 2.1349, "step": 24460 }, { "epoch": 6.936809294417682, "grad_norm": 2.7279791831970215, "learning_rate": 2.962846347607053e-05, "loss": 1.8614, "step": 24480 }, { "epoch": 6.942476622272599, "grad_norm": 2.6080451011657715, "learning_rate": 2.9471032745591943e-05, "loss": 1.8428, "step": 24500 }, { "epoch": 6.942476622272599, "eval_loss": 2.3951735496520996, "eval_runtime": 3.4215, "eval_samples_per_second": 29.227, "eval_steps_per_second": 7.307, "step": 24500 }, { "epoch": 6.948143950127514, "grad_norm": 1.9061074256896973, "learning_rate": 2.931360201511335e-05, "loss": 2.1611, "step": 24520 }, { "epoch": 6.953811277982432, "grad_norm": 2.8317666053771973, "learning_rate": 2.9156171284634764e-05, "loss": 2.1525, "step": 24540 }, { "epoch": 6.959478605837347, "grad_norm": 2.732363700866699, "learning_rate": 2.8998740554156174e-05, "loss": 2.0487, "step": 24560 }, { "epoch": 6.9651459336922645, "grad_norm": 3.5036561489105225, "learning_rate": 2.884130982367758e-05, "loss": 2.0664, "step": 24580 }, { "epoch": 6.97081326154718, "grad_norm": 3.8730990886688232, "learning_rate": 2.8683879093198995e-05, "loss": 2.1937, "step": 24600 }, { "epoch": 6.976480589402097, "grad_norm": 2.440481185913086, "learning_rate": 2.8526448362720402e-05, "loss": 2.057, "step": 24620 }, { "epoch": 6.982147917257013, "grad_norm": 3.511258840560913, "learning_rate": 2.8369017632241813e-05, "loss": 1.8321, "step": 24640 }, { "epoch": 6.98781524511193, "grad_norm": 3.724647283554077, "learning_rate": 2.8211586901763227e-05, "loss": 2.1684, "step": 24660 }, { "epoch": 6.993482572966846, "grad_norm": 1.7989954948425293, "learning_rate": 2.8054156171284634e-05, "loss": 1.865, "step": 24680 }, { "epoch": 6.999149900821763, "grad_norm": 2.582520008087158, "learning_rate": 2.7896725440806048e-05, "loss": 1.9257, "step": 24700 }, { "epoch": 7.0048172286766786, "grad_norm": 1.3620442152023315, "learning_rate": 2.7739294710327458e-05, "loss": 1.8418, "step": 24720 }, { "epoch": 7.010484556531595, "grad_norm": 2.6091196537017822, "learning_rate": 2.7581863979848865e-05, "loss": 1.7354, "step": 24740 }, { "epoch": 7.016151884386511, "grad_norm": 2.133687734603882, "learning_rate": 2.742443324937028e-05, "loss": 1.8988, "step": 24760 }, { "epoch": 7.021819212241428, "grad_norm": 3.614654779434204, "learning_rate": 2.726700251889169e-05, "loss": 2.03, "step": 24780 }, { "epoch": 7.027486540096344, "grad_norm": 4.559241771697998, "learning_rate": 2.7109571788413103e-05, "loss": 1.9967, "step": 24800 }, { "epoch": 7.033153867951261, "grad_norm": 2.8040339946746826, "learning_rate": 2.695214105793451e-05, "loss": 2.3464, "step": 24820 }, { "epoch": 7.038821195806177, "grad_norm": 2.2723660469055176, "learning_rate": 2.6794710327455917e-05, "loss": 2.2527, "step": 24840 }, { "epoch": 7.0444885236610935, "grad_norm": 2.98667311668396, "learning_rate": 2.663727959697733e-05, "loss": 1.9577, "step": 24860 }, { "epoch": 7.05015585151601, "grad_norm": 3.7956717014312744, "learning_rate": 2.647984886649874e-05, "loss": 1.942, "step": 24880 }, { "epoch": 7.055823179370926, "grad_norm": 2.5600507259368896, "learning_rate": 2.6322418136020155e-05, "loss": 1.9527, "step": 24900 }, { "epoch": 7.061490507225843, "grad_norm": 1.7360310554504395, "learning_rate": 2.6164987405541563e-05, "loss": 1.9508, "step": 24920 }, { "epoch": 7.067157835080759, "grad_norm": 1.9320721626281738, "learning_rate": 2.6007556675062973e-05, "loss": 1.9526, "step": 24940 }, { "epoch": 7.0728251629356755, "grad_norm": 2.426555871963501, "learning_rate": 2.5850125944584387e-05, "loss": 2.0816, "step": 24960 }, { "epoch": 7.078492490790592, "grad_norm": 1.4982030391693115, "learning_rate": 2.5692695214105794e-05, "loss": 1.8849, "step": 24980 }, { "epoch": 7.084159818645508, "grad_norm": 2.4452595710754395, "learning_rate": 2.55352644836272e-05, "loss": 1.9936, "step": 25000 }, { "epoch": 7.084159818645508, "eval_loss": 2.3867971897125244, "eval_runtime": 8.937, "eval_samples_per_second": 11.189, "eval_steps_per_second": 2.797, "step": 25000 }, { "epoch": 7.089827146500425, "grad_norm": 2.866413116455078, "learning_rate": 2.5377833753148618e-05, "loss": 2.0369, "step": 25020 }, { "epoch": 7.095494474355341, "grad_norm": 1.967036247253418, "learning_rate": 2.5220403022670025e-05, "loss": 1.9904, "step": 25040 }, { "epoch": 7.101161802210258, "grad_norm": 3.90358829498291, "learning_rate": 2.506297229219144e-05, "loss": 1.8382, "step": 25060 }, { "epoch": 7.106829130065174, "grad_norm": 1.7922592163085938, "learning_rate": 2.4905541561712846e-05, "loss": 2.075, "step": 25080 }, { "epoch": 7.11249645792009, "grad_norm": 2.6908645629882812, "learning_rate": 2.474811083123426e-05, "loss": 1.959, "step": 25100 }, { "epoch": 7.118163785775007, "grad_norm": 3.5005877017974854, "learning_rate": 2.459068010075567e-05, "loss": 1.9681, "step": 25120 }, { "epoch": 7.123831113629923, "grad_norm": 3.3342626094818115, "learning_rate": 2.4433249370277077e-05, "loss": 2.292, "step": 25140 }, { "epoch": 7.12949844148484, "grad_norm": 1.8842560052871704, "learning_rate": 2.4275818639798488e-05, "loss": 2.0008, "step": 25160 }, { "epoch": 7.135165769339756, "grad_norm": 2.9695844650268555, "learning_rate": 2.4118387909319902e-05, "loss": 1.9698, "step": 25180 }, { "epoch": 7.1408330971946725, "grad_norm": 2.8094005584716797, "learning_rate": 2.3960957178841312e-05, "loss": 2.0449, "step": 25200 }, { "epoch": 7.146500425049589, "grad_norm": 3.9919629096984863, "learning_rate": 2.380352644836272e-05, "loss": 1.9487, "step": 25220 }, { "epoch": 7.152167752904505, "grad_norm": 2.9651360511779785, "learning_rate": 2.364609571788413e-05, "loss": 1.9119, "step": 25240 }, { "epoch": 7.157835080759422, "grad_norm": 2.6570241451263428, "learning_rate": 2.3488664987405544e-05, "loss": 1.9471, "step": 25260 }, { "epoch": 7.163502408614338, "grad_norm": 2.5290753841400146, "learning_rate": 2.3331234256926954e-05, "loss": 2.048, "step": 25280 }, { "epoch": 7.169169736469255, "grad_norm": 5.138999938964844, "learning_rate": 2.3173803526448364e-05, "loss": 2.0486, "step": 25300 }, { "epoch": 7.174837064324171, "grad_norm": 1.6156316995620728, "learning_rate": 2.3016372795969775e-05, "loss": 1.8327, "step": 25320 }, { "epoch": 7.180504392179087, "grad_norm": 3.0746636390686035, "learning_rate": 2.2858942065491185e-05, "loss": 1.9014, "step": 25340 }, { "epoch": 7.186171720034004, "grad_norm": 1.6470799446105957, "learning_rate": 2.2701511335012596e-05, "loss": 1.9614, "step": 25360 }, { "epoch": 7.19183904788892, "grad_norm": 3.0337748527526855, "learning_rate": 2.2544080604534006e-05, "loss": 1.8401, "step": 25380 }, { "epoch": 7.197506375743837, "grad_norm": 2.4902961254119873, "learning_rate": 2.2386649874055417e-05, "loss": 2.1432, "step": 25400 }, { "epoch": 7.203173703598753, "grad_norm": 1.9536110162734985, "learning_rate": 2.2229219143576827e-05, "loss": 1.8499, "step": 25420 }, { "epoch": 7.2088410314536695, "grad_norm": 2.526289939880371, "learning_rate": 2.2071788413098238e-05, "loss": 2.1137, "step": 25440 }, { "epoch": 7.214508359308586, "grad_norm": 3.2735888957977295, "learning_rate": 2.1914357682619648e-05, "loss": 1.9548, "step": 25460 }, { "epoch": 7.220175687163502, "grad_norm": 4.508800983428955, "learning_rate": 2.175692695214106e-05, "loss": 1.8679, "step": 25480 }, { "epoch": 7.225843015018419, "grad_norm": 2.418177843093872, "learning_rate": 2.159949622166247e-05, "loss": 2.006, "step": 25500 }, { "epoch": 7.225843015018419, "eval_loss": 2.3841259479522705, "eval_runtime": 3.8208, "eval_samples_per_second": 26.173, "eval_steps_per_second": 6.543, "step": 25500 }, { "epoch": 7.231510342873335, "grad_norm": 1.7876240015029907, "learning_rate": 2.144206549118388e-05, "loss": 1.7158, "step": 25520 }, { "epoch": 7.237177670728252, "grad_norm": 2.6282968521118164, "learning_rate": 2.128463476070529e-05, "loss": 1.9768, "step": 25540 }, { "epoch": 7.242844998583168, "grad_norm": 3.6761908531188965, "learning_rate": 2.1127204030226704e-05, "loss": 1.9188, "step": 25560 }, { "epoch": 7.248512326438084, "grad_norm": 2.5201611518859863, "learning_rate": 2.096977329974811e-05, "loss": 1.973, "step": 25580 }, { "epoch": 7.254179654293001, "grad_norm": 2.5935451984405518, "learning_rate": 2.081234256926952e-05, "loss": 2.18, "step": 25600 }, { "epoch": 7.259846982147917, "grad_norm": 3.3094747066497803, "learning_rate": 2.065491183879093e-05, "loss": 2.0573, "step": 25620 }, { "epoch": 7.265514310002834, "grad_norm": 2.945183515548706, "learning_rate": 2.0497481108312345e-05, "loss": 2.1475, "step": 25640 }, { "epoch": 7.27118163785775, "grad_norm": 3.0024197101593018, "learning_rate": 2.0340050377833756e-05, "loss": 2.1518, "step": 25660 }, { "epoch": 7.2768489657126665, "grad_norm": 2.6679654121398926, "learning_rate": 2.0182619647355163e-05, "loss": 1.8458, "step": 25680 }, { "epoch": 7.282516293567583, "grad_norm": 2.976282835006714, "learning_rate": 2.0025188916876573e-05, "loss": 1.7687, "step": 25700 }, { "epoch": 7.288183621422499, "grad_norm": 4.596390247344971, "learning_rate": 1.9867758186397987e-05, "loss": 1.7693, "step": 25720 }, { "epoch": 7.293850949277416, "grad_norm": 3.1779825687408447, "learning_rate": 1.9710327455919398e-05, "loss": 2.0868, "step": 25740 }, { "epoch": 7.299518277132332, "grad_norm": 1.7006670236587524, "learning_rate": 1.9552896725440805e-05, "loss": 1.8665, "step": 25760 }, { "epoch": 7.305185604987249, "grad_norm": 2.236936092376709, "learning_rate": 1.939546599496222e-05, "loss": 1.9015, "step": 25780 }, { "epoch": 7.310852932842165, "grad_norm": 4.092790126800537, "learning_rate": 1.923803526448363e-05, "loss": 2.0726, "step": 25800 }, { "epoch": 7.316520260697081, "grad_norm": 1.5189704895019531, "learning_rate": 1.908060453400504e-05, "loss": 1.8324, "step": 25820 }, { "epoch": 7.322187588551998, "grad_norm": 2.8459370136260986, "learning_rate": 1.892317380352645e-05, "loss": 2.1461, "step": 25840 }, { "epoch": 7.327854916406914, "grad_norm": 2.986793279647827, "learning_rate": 1.8773614609571788e-05, "loss": 2.2858, "step": 25860 }, { "epoch": 7.333522244261831, "grad_norm": 2.3430516719818115, "learning_rate": 1.86161838790932e-05, "loss": 2.0121, "step": 25880 }, { "epoch": 7.339189572116747, "grad_norm": 2.472294807434082, "learning_rate": 1.845875314861461e-05, "loss": 1.7777, "step": 25900 }, { "epoch": 7.3448568999716635, "grad_norm": 3.1519088745117188, "learning_rate": 1.8301322418136023e-05, "loss": 2.0337, "step": 25920 }, { "epoch": 7.35052422782658, "grad_norm": 2.6729047298431396, "learning_rate": 1.8143891687657433e-05, "loss": 2.0588, "step": 25940 }, { "epoch": 7.356191555681496, "grad_norm": 2.5368478298187256, "learning_rate": 1.798646095717884e-05, "loss": 1.849, "step": 25960 }, { "epoch": 7.361858883536413, "grad_norm": 3.50793719291687, "learning_rate": 1.7829030226700254e-05, "loss": 2.0308, "step": 25980 }, { "epoch": 7.367526211391329, "grad_norm": 2.8010456562042236, "learning_rate": 1.7671599496221665e-05, "loss": 2.084, "step": 26000 }, { "epoch": 7.367526211391329, "eval_loss": 2.3769593238830566, "eval_runtime": 4.9499, "eval_samples_per_second": 20.203, "eval_steps_per_second": 5.051, "step": 26000 }, { "epoch": 7.3731935392462455, "grad_norm": 3.8831539154052734, "learning_rate": 1.7514168765743075e-05, "loss": 2.4735, "step": 26020 }, { "epoch": 7.378860867101162, "grad_norm": 1.814440131187439, "learning_rate": 1.7356738035264482e-05, "loss": 1.8304, "step": 26040 }, { "epoch": 7.384528194956078, "grad_norm": 1.8853788375854492, "learning_rate": 1.7199307304785896e-05, "loss": 2.1085, "step": 26060 }, { "epoch": 7.390195522810995, "grad_norm": 3.1752564907073975, "learning_rate": 1.7041876574307306e-05, "loss": 2.0172, "step": 26080 }, { "epoch": 7.395862850665911, "grad_norm": 2.3268773555755615, "learning_rate": 1.6884445843828717e-05, "loss": 1.9396, "step": 26100 }, { "epoch": 7.401530178520828, "grad_norm": 1.713739037513733, "learning_rate": 1.6727015113350127e-05, "loss": 2.1052, "step": 26120 }, { "epoch": 7.407197506375744, "grad_norm": 3.885432720184326, "learning_rate": 1.6569584382871538e-05, "loss": 1.992, "step": 26140 }, { "epoch": 7.4128648342306604, "grad_norm": 2.1733057498931885, "learning_rate": 1.6412153652392948e-05, "loss": 2.1815, "step": 26160 }, { "epoch": 7.418532162085577, "grad_norm": 1.871437430381775, "learning_rate": 1.625472292191436e-05, "loss": 1.8882, "step": 26180 }, { "epoch": 7.424199489940493, "grad_norm": 2.591052293777466, "learning_rate": 1.609729219143577e-05, "loss": 2.1055, "step": 26200 }, { "epoch": 7.42986681779541, "grad_norm": 2.1060853004455566, "learning_rate": 1.593986146095718e-05, "loss": 1.9374, "step": 26220 }, { "epoch": 7.435534145650326, "grad_norm": 3.4129979610443115, "learning_rate": 1.578243073047859e-05, "loss": 2.3222, "step": 26240 }, { "epoch": 7.4412014735052425, "grad_norm": 2.7728374004364014, "learning_rate": 1.5625e-05, "loss": 1.761, "step": 26260 }, { "epoch": 7.446868801360159, "grad_norm": 4.181581497192383, "learning_rate": 1.546756926952141e-05, "loss": 1.8585, "step": 26280 }, { "epoch": 7.452536129215075, "grad_norm": 2.5631537437438965, "learning_rate": 1.5310138539042825e-05, "loss": 2.0883, "step": 26300 }, { "epoch": 7.458203457069992, "grad_norm": 2.149087429046631, "learning_rate": 1.5152707808564232e-05, "loss": 1.9059, "step": 26320 }, { "epoch": 7.463870784924908, "grad_norm": 2.467003107070923, "learning_rate": 1.4995277078085642e-05, "loss": 2.0777, "step": 26340 }, { "epoch": 7.469538112779825, "grad_norm": 3.57696533203125, "learning_rate": 1.4837846347607054e-05, "loss": 1.7543, "step": 26360 }, { "epoch": 7.475205440634741, "grad_norm": 2.5763039588928223, "learning_rate": 1.4680415617128465e-05, "loss": 2.0823, "step": 26380 }, { "epoch": 7.480872768489657, "grad_norm": 4.462738990783691, "learning_rate": 1.4522984886649873e-05, "loss": 2.0033, "step": 26400 }, { "epoch": 7.486540096344574, "grad_norm": 2.816843271255493, "learning_rate": 1.4365554156171284e-05, "loss": 1.9993, "step": 26420 }, { "epoch": 7.49220742419949, "grad_norm": 2.765244960784912, "learning_rate": 1.4208123425692696e-05, "loss": 2.139, "step": 26440 }, { "epoch": 7.497874752054407, "grad_norm": 3.8238930702209473, "learning_rate": 1.4050692695214106e-05, "loss": 2.1185, "step": 26460 }, { "epoch": 7.503542079909323, "grad_norm": 1.5666344165802002, "learning_rate": 1.3893261964735519e-05, "loss": 2.0555, "step": 26480 }, { "epoch": 7.5092094077642395, "grad_norm": 3.4180991649627686, "learning_rate": 1.3735831234256927e-05, "loss": 2.1076, "step": 26500 }, { "epoch": 7.5092094077642395, "eval_loss": 2.3740596771240234, "eval_runtime": 3.5451, "eval_samples_per_second": 28.208, "eval_steps_per_second": 7.052, "step": 26500 }, { "epoch": 7.514876735619156, "grad_norm": 1.9294134378433228, "learning_rate": 1.3578400503778338e-05, "loss": 2.2266, "step": 26520 }, { "epoch": 7.520544063474072, "grad_norm": 2.653122901916504, "learning_rate": 1.3420969773299748e-05, "loss": 2.2558, "step": 26540 }, { "epoch": 7.526211391328989, "grad_norm": 1.7390453815460205, "learning_rate": 1.326353904282116e-05, "loss": 1.9482, "step": 26560 }, { "epoch": 7.531878719183905, "grad_norm": 2.0288381576538086, "learning_rate": 1.310610831234257e-05, "loss": 1.8109, "step": 26580 }, { "epoch": 7.537546047038822, "grad_norm": 1.9242277145385742, "learning_rate": 1.294867758186398e-05, "loss": 1.9985, "step": 26600 }, { "epoch": 7.543213374893737, "grad_norm": 3.5614235401153564, "learning_rate": 1.2791246851385392e-05, "loss": 2.065, "step": 26620 }, { "epoch": 7.548880702748654, "grad_norm": 3.0668246746063232, "learning_rate": 1.2633816120906802e-05, "loss": 1.4825, "step": 26640 }, { "epoch": 7.55454803060357, "grad_norm": 3.087691307067871, "learning_rate": 1.2476385390428213e-05, "loss": 2.157, "step": 26660 }, { "epoch": 7.560215358458487, "grad_norm": 3.1645305156707764, "learning_rate": 1.2318954659949623e-05, "loss": 2.2624, "step": 26680 }, { "epoch": 7.565882686313403, "grad_norm": 2.095198392868042, "learning_rate": 1.2161523929471034e-05, "loss": 2.2205, "step": 26700 }, { "epoch": 7.57155001416832, "grad_norm": 2.346905469894409, "learning_rate": 1.2004093198992444e-05, "loss": 1.9881, "step": 26720 }, { "epoch": 7.577217342023236, "grad_norm": 2.6021840572357178, "learning_rate": 1.1846662468513854e-05, "loss": 1.9818, "step": 26740 }, { "epoch": 7.582884669878153, "grad_norm": 3.23764705657959, "learning_rate": 1.1689231738035265e-05, "loss": 2.0944, "step": 26760 }, { "epoch": 7.588551997733068, "grad_norm": 2.8694934844970703, "learning_rate": 1.1531801007556675e-05, "loss": 2.0339, "step": 26780 }, { "epoch": 7.594219325587986, "grad_norm": 3.3995590209960938, "learning_rate": 1.1374370277078086e-05, "loss": 2.06, "step": 26800 }, { "epoch": 7.599886653442901, "grad_norm": 2.7044217586517334, "learning_rate": 1.1216939546599496e-05, "loss": 1.8922, "step": 26820 }, { "epoch": 7.605553981297819, "grad_norm": 2.680511713027954, "learning_rate": 1.1059508816120907e-05, "loss": 2.0628, "step": 26840 }, { "epoch": 7.611221309152734, "grad_norm": 2.34586763381958, "learning_rate": 1.0902078085642319e-05, "loss": 1.9287, "step": 26860 }, { "epoch": 7.6168886370076505, "grad_norm": 2.327878475189209, "learning_rate": 1.0744647355163728e-05, "loss": 1.9162, "step": 26880 }, { "epoch": 7.622555964862567, "grad_norm": 2.8339929580688477, "learning_rate": 1.058721662468514e-05, "loss": 1.7912, "step": 26900 }, { "epoch": 7.628223292717483, "grad_norm": 3.706899881362915, "learning_rate": 1.0429785894206548e-05, "loss": 1.8399, "step": 26920 }, { "epoch": 7.6338906205724, "grad_norm": 2.9267466068267822, "learning_rate": 1.027235516372796e-05, "loss": 1.9652, "step": 26940 }, { "epoch": 7.639557948427316, "grad_norm": 2.8386104106903076, "learning_rate": 1.0114924433249371e-05, "loss": 1.834, "step": 26960 }, { "epoch": 7.645225276282233, "grad_norm": 3.28200364112854, "learning_rate": 9.957493702770782e-06, "loss": 1.8674, "step": 26980 }, { "epoch": 7.650892604137149, "grad_norm": 3.9935638904571533, "learning_rate": 9.800062972292192e-06, "loss": 2.1818, "step": 27000 }, { "epoch": 7.650892604137149, "eval_loss": 2.368424892425537, "eval_runtime": 2.903, "eval_samples_per_second": 34.447, "eval_steps_per_second": 8.612, "step": 27000 }, { "epoch": 7.656559931992065, "grad_norm": 3.2098357677459717, "learning_rate": 9.642632241813602e-06, "loss": 2.0547, "step": 27020 }, { "epoch": 7.662227259846982, "grad_norm": 1.9178589582443237, "learning_rate": 9.485201511335013e-06, "loss": 2.0096, "step": 27040 }, { "epoch": 7.667894587701898, "grad_norm": 3.7093241214752197, "learning_rate": 9.327770780856423e-06, "loss": 1.9539, "step": 27060 }, { "epoch": 7.673561915556815, "grad_norm": 2.6212518215179443, "learning_rate": 9.170340050377835e-06, "loss": 2.0852, "step": 27080 }, { "epoch": 7.679229243411731, "grad_norm": 1.7110230922698975, "learning_rate": 9.012909319899244e-06, "loss": 1.9441, "step": 27100 }, { "epoch": 7.6848965712666475, "grad_norm": 2.8453385829925537, "learning_rate": 8.855478589420656e-06, "loss": 2.112, "step": 27120 }, { "epoch": 7.690563899121564, "grad_norm": 2.7949252128601074, "learning_rate": 8.698047858942065e-06, "loss": 2.1292, "step": 27140 }, { "epoch": 7.69623122697648, "grad_norm": 3.3399200439453125, "learning_rate": 8.540617128463477e-06, "loss": 1.9792, "step": 27160 }, { "epoch": 7.701898554831397, "grad_norm": 3.616973400115967, "learning_rate": 8.383186397984886e-06, "loss": 2.0986, "step": 27180 }, { "epoch": 7.707565882686313, "grad_norm": 3.230170488357544, "learning_rate": 8.225755667506298e-06, "loss": 2.1876, "step": 27200 }, { "epoch": 7.71323321054123, "grad_norm": 3.1382851600646973, "learning_rate": 8.068324937027707e-06, "loss": 1.8828, "step": 27220 }, { "epoch": 7.718900538396146, "grad_norm": 1.8755038976669312, "learning_rate": 7.910894206549119e-06, "loss": 1.8399, "step": 27240 }, { "epoch": 7.724567866251062, "grad_norm": 1.9809577465057373, "learning_rate": 7.75346347607053e-06, "loss": 2.0855, "step": 27260 }, { "epoch": 7.730235194105979, "grad_norm": 2.0653018951416016, "learning_rate": 7.59603274559194e-06, "loss": 1.8152, "step": 27280 }, { "epoch": 7.735902521960895, "grad_norm": 3.033128499984741, "learning_rate": 7.438602015113351e-06, "loss": 1.9498, "step": 27300 }, { "epoch": 7.741569849815812, "grad_norm": 2.2902092933654785, "learning_rate": 7.281171284634761e-06, "loss": 2.0473, "step": 27320 }, { "epoch": 7.747237177670728, "grad_norm": 2.7275230884552, "learning_rate": 7.123740554156172e-06, "loss": 1.8492, "step": 27340 }, { "epoch": 7.7529045055256445, "grad_norm": 3.267713785171509, "learning_rate": 6.966309823677582e-06, "loss": 2.0397, "step": 27360 }, { "epoch": 7.758571833380561, "grad_norm": 2.124990940093994, "learning_rate": 6.808879093198993e-06, "loss": 1.934, "step": 27380 }, { "epoch": 7.764239161235477, "grad_norm": 2.8369481563568115, "learning_rate": 6.651448362720403e-06, "loss": 1.8555, "step": 27400 }, { "epoch": 7.769906489090394, "grad_norm": 3.1722116470336914, "learning_rate": 6.494017632241814e-06, "loss": 2.0719, "step": 27420 }, { "epoch": 7.77557381694531, "grad_norm": 2.1115689277648926, "learning_rate": 6.336586901763225e-06, "loss": 2.0339, "step": 27440 }, { "epoch": 7.7812411448002266, "grad_norm": 3.4126553535461426, "learning_rate": 6.179156171284635e-06, "loss": 2.0145, "step": 27460 }, { "epoch": 7.786908472655143, "grad_norm": 3.5279576778411865, "learning_rate": 6.021725440806045e-06, "loss": 2.0005, "step": 27480 }, { "epoch": 7.792575800510059, "grad_norm": 3.6004292964935303, "learning_rate": 5.864294710327456e-06, "loss": 2.0808, "step": 27500 }, { "epoch": 7.792575800510059, "eval_loss": 2.3696117401123047, "eval_runtime": 2.9603, "eval_samples_per_second": 33.781, "eval_steps_per_second": 8.445, "step": 27500 }, { "epoch": 7.798243128364976, "grad_norm": 3.784318447113037, "learning_rate": 5.706863979848867e-06, "loss": 2.2164, "step": 27520 }, { "epoch": 7.803910456219892, "grad_norm": 2.641849994659424, "learning_rate": 5.5494332493702775e-06, "loss": 2.0545, "step": 27540 }, { "epoch": 7.809577784074809, "grad_norm": 3.3937368392944336, "learning_rate": 5.392002518891688e-06, "loss": 2.1323, "step": 27560 }, { "epoch": 7.815245111929725, "grad_norm": 3.1644203662872314, "learning_rate": 5.234571788413098e-06, "loss": 2.1824, "step": 27580 }, { "epoch": 7.8209124397846415, "grad_norm": 2.923328399658203, "learning_rate": 5.077141057934509e-06, "loss": 1.9908, "step": 27600 }, { "epoch": 7.826579767639558, "grad_norm": 3.554110527038574, "learning_rate": 4.919710327455919e-06, "loss": 1.7967, "step": 27620 }, { "epoch": 7.832247095494474, "grad_norm": 2.7161154747009277, "learning_rate": 4.76227959697733e-06, "loss": 2.2407, "step": 27640 }, { "epoch": 7.837914423349391, "grad_norm": 2.154963254928589, "learning_rate": 4.604848866498741e-06, "loss": 1.9958, "step": 27660 }, { "epoch": 7.843581751204307, "grad_norm": 1.7623778581619263, "learning_rate": 4.4474181360201515e-06, "loss": 2.1773, "step": 27680 }, { "epoch": 7.8492490790592235, "grad_norm": 1.8625316619873047, "learning_rate": 4.289987405541562e-06, "loss": 1.9857, "step": 27700 }, { "epoch": 7.85491640691414, "grad_norm": 2.0632588863372803, "learning_rate": 4.132556675062973e-06, "loss": 2.1102, "step": 27720 }, { "epoch": 7.860583734769056, "grad_norm": 2.225883722305298, "learning_rate": 3.975125944584384e-06, "loss": 2.0646, "step": 27740 }, { "epoch": 7.866251062623973, "grad_norm": 3.7089357376098633, "learning_rate": 3.817695214105794e-06, "loss": 1.9633, "step": 27760 }, { "epoch": 7.871918390478889, "grad_norm": 1.9237614870071411, "learning_rate": 3.660264483627204e-06, "loss": 2.1341, "step": 27780 }, { "epoch": 7.877585718333806, "grad_norm": 2.2526156902313232, "learning_rate": 3.502833753148615e-06, "loss": 1.9827, "step": 27800 }, { "epoch": 7.883253046188722, "grad_norm": 2.474177837371826, "learning_rate": 3.3454030226700254e-06, "loss": 2.1657, "step": 27820 }, { "epoch": 7.888920374043638, "grad_norm": 2.4541993141174316, "learning_rate": 3.187972292191436e-06, "loss": 1.6961, "step": 27840 }, { "epoch": 7.894587701898555, "grad_norm": 2.2030463218688965, "learning_rate": 3.0305415617128468e-06, "loss": 2.1446, "step": 27860 }, { "epoch": 7.900255029753471, "grad_norm": 3.2580044269561768, "learning_rate": 2.873110831234257e-06, "loss": 2.0016, "step": 27880 }, { "epoch": 7.905922357608388, "grad_norm": 3.227149724960327, "learning_rate": 2.7156801007556677e-06, "loss": 2.0002, "step": 27900 }, { "epoch": 7.911589685463304, "grad_norm": 1.398563265800476, "learning_rate": 2.558249370277078e-06, "loss": 1.9468, "step": 27920 }, { "epoch": 7.9172570133182205, "grad_norm": 3.6059467792510986, "learning_rate": 2.4008186397984885e-06, "loss": 2.1506, "step": 27940 }, { "epoch": 7.922924341173137, "grad_norm": 3.4954187870025635, "learning_rate": 2.2433879093198994e-06, "loss": 2.1436, "step": 27960 }, { "epoch": 7.928591669028053, "grad_norm": 2.484344959259033, "learning_rate": 2.08595717884131e-06, "loss": 2.2101, "step": 27980 }, { "epoch": 7.93425899688297, "grad_norm": 3.057554006576538, "learning_rate": 1.9285264483627203e-06, "loss": 2.0305, "step": 28000 }, { "epoch": 7.93425899688297, "eval_loss": 2.3673107624053955, "eval_runtime": 2.8034, "eval_samples_per_second": 35.671, "eval_steps_per_second": 8.918, "step": 28000 }, { "epoch": 7.939926324737886, "grad_norm": 2.4007439613342285, "learning_rate": 1.771095717884131e-06, "loss": 2.31, "step": 28020 }, { "epoch": 7.945593652592803, "grad_norm": 3.128981113433838, "learning_rate": 1.6136649874055414e-06, "loss": 1.8868, "step": 28040 }, { "epoch": 7.951260980447719, "grad_norm": 2.767014265060425, "learning_rate": 1.456234256926952e-06, "loss": 2.1838, "step": 28060 }, { "epoch": 7.956928308302635, "grad_norm": 3.261197328567505, "learning_rate": 1.298803526448363e-06, "loss": 2.1653, "step": 28080 }, { "epoch": 7.962595636157552, "grad_norm": 3.967350959777832, "learning_rate": 1.1413727959697734e-06, "loss": 1.9867, "step": 28100 }, { "epoch": 7.968262964012468, "grad_norm": 2.4714808464050293, "learning_rate": 9.839420654911839e-07, "loss": 2.2218, "step": 28120 }, { "epoch": 7.973930291867385, "grad_norm": 2.0417537689208984, "learning_rate": 8.265113350125945e-07, "loss": 2.0529, "step": 28140 }, { "epoch": 7.979597619722301, "grad_norm": 4.137369155883789, "learning_rate": 6.769521410579346e-07, "loss": 2.0835, "step": 28160 }, { "epoch": 7.9852649475772175, "grad_norm": 2.3702034950256348, "learning_rate": 5.19521410579345e-07, "loss": 1.8677, "step": 28180 }, { "epoch": 7.990932275432134, "grad_norm": 2.399726629257202, "learning_rate": 3.620906801007557e-07, "loss": 1.9967, "step": 28200 }, { "epoch": 7.99659960328705, "grad_norm": 3.284844398498535, "learning_rate": 2.0465994962216625e-07, "loss": 2.0803, "step": 28220 } ], "logging_steps": 20, "max_steps": 28232, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.158220809522381e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }