DINOCOMPANION / checkpoint-13300 / trainer_state.json
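The JSON below is the Trainer state saved with this checkpoint; its "log_history" array holds one entry per logging step (loss, gradient norm, learning rate). As a minimal sketch of how these entries can be inspected, the following Python snippet loads the file and plots loss and learning rate against step. It assumes Python 3 with matplotlib installed; the file path mirrors the repository layout shown above and should be adjusted to wherever the checkpoint is downloaded.

import json

import matplotlib.pyplot as plt

# Load the Trainer state for this checkpoint (path is an assumption based on the repo layout).
with open("DINOCOMPANION/checkpoint-13300/trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step training entries that carry both a loss and a learning rate.
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

# Plot training loss and the learning-rate schedule over the logged steps.
fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning_rate")
ax_lr.set_xlabel("step")
plt.show()
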
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 19.97088108209656,
"eval_steps": 500,
"global_step": 13300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015029118917903438,
"grad_norm": 0.2293534129858017,
"learning_rate": 6.766917293233083e-07,
"loss": 1.5634,
"step": 10
},
{
"epoch": 0.030058237835806877,
"grad_norm": 0.2535412907600403,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.5043,
"step": 20
},
{
"epoch": 0.04508735675371031,
"grad_norm": 0.3165118992328644,
"learning_rate": 2.1804511278195492e-06,
"loss": 1.5571,
"step": 30
},
{
"epoch": 0.06011647567161375,
"grad_norm": 0.27761849761009216,
"learning_rate": 2.9323308270676694e-06,
"loss": 1.5064,
"step": 40
},
{
"epoch": 0.0751455945895172,
"grad_norm": 0.34336039423942566,
"learning_rate": 3.6842105263157892e-06,
"loss": 1.5299,
"step": 50
},
{
"epoch": 0.09017471350742062,
"grad_norm": 0.4327663481235504,
"learning_rate": 4.436090225563911e-06,
"loss": 1.5351,
"step": 60
},
{
"epoch": 0.10520383242532406,
"grad_norm": 0.4244738221168518,
"learning_rate": 5.187969924812031e-06,
"loss": 1.4876,
"step": 70
},
{
"epoch": 0.1202329513432275,
"grad_norm": 0.39235126972198486,
"learning_rate": 5.939849624060151e-06,
"loss": 1.4138,
"step": 80
},
{
"epoch": 0.13526207026113093,
"grad_norm": 0.36149346828460693,
"learning_rate": 6.691729323308271e-06,
"loss": 1.3901,
"step": 90
},
{
"epoch": 0.1502911891790344,
"grad_norm": 0.2174796313047409,
"learning_rate": 7.4436090225563915e-06,
"loss": 1.284,
"step": 100
},
{
"epoch": 0.16532030809693782,
"grad_norm": 0.19376038014888763,
"learning_rate": 8.195488721804512e-06,
"loss": 1.2713,
"step": 110
},
{
"epoch": 0.18034942701484125,
"grad_norm": 0.18585975468158722,
"learning_rate": 8.947368421052632e-06,
"loss": 1.2301,
"step": 120
},
{
"epoch": 0.1953785459327447,
"grad_norm": 0.18462727963924408,
"learning_rate": 9.699248120300752e-06,
"loss": 1.231,
"step": 130
},
{
"epoch": 0.21040766485064813,
"grad_norm": 0.16348238289356232,
"learning_rate": 1.0451127819548872e-05,
"loss": 1.2167,
"step": 140
},
{
"epoch": 0.22543678376855156,
"grad_norm": 0.17574988305568695,
"learning_rate": 1.1203007518796992e-05,
"loss": 1.1999,
"step": 150
},
{
"epoch": 0.240465902686455,
"grad_norm": 0.14682741463184357,
"learning_rate": 1.1954887218045113e-05,
"loss": 1.2491,
"step": 160
},
{
"epoch": 0.25549502160435844,
"grad_norm": 0.1753804236650467,
"learning_rate": 1.2706766917293233e-05,
"loss": 1.2036,
"step": 170
},
{
"epoch": 0.27052414052226187,
"grad_norm": 0.17857442796230316,
"learning_rate": 1.3458646616541353e-05,
"loss": 1.1822,
"step": 180
},
{
"epoch": 0.2855532594401653,
"grad_norm": 0.18367990851402283,
"learning_rate": 1.4210526315789475e-05,
"loss": 1.1679,
"step": 190
},
{
"epoch": 0.3005823783580688,
"grad_norm": 0.20284640789031982,
"learning_rate": 1.4962406015037595e-05,
"loss": 1.1337,
"step": 200
},
{
"epoch": 0.3156114972759722,
"grad_norm": 0.16659210622310638,
"learning_rate": 1.5714285714285715e-05,
"loss": 1.181,
"step": 210
},
{
"epoch": 0.33064061619387564,
"grad_norm": 0.1798979490995407,
"learning_rate": 1.6466165413533834e-05,
"loss": 1.1785,
"step": 220
},
{
"epoch": 0.34566973511177906,
"grad_norm": 0.1689957082271576,
"learning_rate": 1.7218045112781956e-05,
"loss": 1.1489,
"step": 230
},
{
"epoch": 0.3606988540296825,
"grad_norm": 0.199369415640831,
"learning_rate": 1.7969924812030074e-05,
"loss": 1.1677,
"step": 240
},
{
"epoch": 0.375727972947586,
"grad_norm": 0.23965977132320404,
"learning_rate": 1.8721804511278196e-05,
"loss": 1.1516,
"step": 250
},
{
"epoch": 0.3907570918654894,
"grad_norm": 0.18958410620689392,
"learning_rate": 1.9473684210526315e-05,
"loss": 1.1278,
"step": 260
},
{
"epoch": 0.40578621078339283,
"grad_norm": 0.20659048855304718,
"learning_rate": 2.0225563909774437e-05,
"loss": 1.1613,
"step": 270
},
{
"epoch": 0.42081532970129626,
"grad_norm": 0.22374583780765533,
"learning_rate": 2.097744360902256e-05,
"loss": 1.1348,
"step": 280
},
{
"epoch": 0.4358444486191997,
"grad_norm": 0.22938427329063416,
"learning_rate": 2.1729323308270677e-05,
"loss": 1.157,
"step": 290
},
{
"epoch": 0.4508735675371031,
"grad_norm": 0.2688145935535431,
"learning_rate": 2.24812030075188e-05,
"loss": 1.1329,
"step": 300
},
{
"epoch": 0.4659026864550066,
"grad_norm": 0.21894283592700958,
"learning_rate": 2.3233082706766917e-05,
"loss": 1.1197,
"step": 310
},
{
"epoch": 0.48093180537291,
"grad_norm": 0.2249055653810501,
"learning_rate": 2.398496240601504e-05,
"loss": 1.127,
"step": 320
},
{
"epoch": 0.49596092429081345,
"grad_norm": 0.2487722635269165,
"learning_rate": 2.4736842105263158e-05,
"loss": 1.1331,
"step": 330
},
{
"epoch": 0.5109900432087169,
"grad_norm": 0.2143404483795166,
"learning_rate": 2.548872180451128e-05,
"loss": 1.1342,
"step": 340
},
{
"epoch": 0.5260191621266204,
"grad_norm": 0.27003639936447144,
"learning_rate": 2.6240601503759398e-05,
"loss": 1.133,
"step": 350
},
{
"epoch": 0.5410482810445237,
"grad_norm": 0.25785332918167114,
"learning_rate": 2.699248120300752e-05,
"loss": 1.1284,
"step": 360
},
{
"epoch": 0.5560773999624272,
"grad_norm": 0.24334581196308136,
"learning_rate": 2.774436090225564e-05,
"loss": 1.1149,
"step": 370
},
{
"epoch": 0.5711065188803306,
"grad_norm": 0.23162595927715302,
"learning_rate": 2.849624060150376e-05,
"loss": 1.144,
"step": 380
},
{
"epoch": 0.5861356377982341,
"grad_norm": 0.2650283873081207,
"learning_rate": 2.924812030075188e-05,
"loss": 1.1431,
"step": 390
},
{
"epoch": 0.6011647567161376,
"grad_norm": 0.2596570551395416,
"learning_rate": 3e-05,
"loss": 1.1211,
"step": 400
},
{
"epoch": 0.6161938756340409,
"grad_norm": 0.25908759236335754,
"learning_rate": 3.075187969924812e-05,
"loss": 1.1287,
"step": 410
},
{
"epoch": 0.6312229945519444,
"grad_norm": 0.24592378735542297,
"learning_rate": 3.150375939849624e-05,
"loss": 1.0913,
"step": 420
},
{
"epoch": 0.6462521134698478,
"grad_norm": 0.2371867448091507,
"learning_rate": 3.225563909774436e-05,
"loss": 1.1435,
"step": 430
},
{
"epoch": 0.6612812323877513,
"grad_norm": 0.25050684809684753,
"learning_rate": 3.300751879699248e-05,
"loss": 1.1009,
"step": 440
},
{
"epoch": 0.6763103513056548,
"grad_norm": 0.26998868584632874,
"learning_rate": 3.3759398496240603e-05,
"loss": 1.1018,
"step": 450
},
{
"epoch": 0.6913394702235581,
"grad_norm": 0.255825012922287,
"learning_rate": 3.451127819548872e-05,
"loss": 1.096,
"step": 460
},
{
"epoch": 0.7063685891414616,
"grad_norm": 0.2328484058380127,
"learning_rate": 3.526315789473684e-05,
"loss": 1.1083,
"step": 470
},
{
"epoch": 0.721397708059365,
"grad_norm": 0.2772115170955658,
"learning_rate": 3.6015037593984966e-05,
"loss": 1.1243,
"step": 480
},
{
"epoch": 0.7364268269772685,
"grad_norm": 0.2620559334754944,
"learning_rate": 3.6766917293233084e-05,
"loss": 1.1357,
"step": 490
},
{
"epoch": 0.751455945895172,
"grad_norm": 0.2600899934768677,
"learning_rate": 3.75187969924812e-05,
"loss": 1.1044,
"step": 500
},
{
"epoch": 0.7664850648130753,
"grad_norm": 0.23489312827587128,
"learning_rate": 3.827067669172932e-05,
"loss": 1.1028,
"step": 510
},
{
"epoch": 0.7815141837309788,
"grad_norm": 0.2843015491962433,
"learning_rate": 3.9022556390977447e-05,
"loss": 1.1234,
"step": 520
},
{
"epoch": 0.7965433026488822,
"grad_norm": 0.27744609117507935,
"learning_rate": 3.9774436090225565e-05,
"loss": 1.0939,
"step": 530
},
{
"epoch": 0.8115724215667857,
"grad_norm": 0.2344987690448761,
"learning_rate": 4.0526315789473684e-05,
"loss": 1.094,
"step": 540
},
{
"epoch": 0.826601540484689,
"grad_norm": 0.2847677171230316,
"learning_rate": 4.12781954887218e-05,
"loss": 1.1,
"step": 550
},
{
"epoch": 0.8416306594025925,
"grad_norm": 0.3026759922504425,
"learning_rate": 4.203007518796993e-05,
"loss": 1.1191,
"step": 560
},
{
"epoch": 0.856659778320496,
"grad_norm": 0.38774576783180237,
"learning_rate": 4.2781954887218046e-05,
"loss": 1.1273,
"step": 570
},
{
"epoch": 0.8716888972383994,
"grad_norm": 0.28009462356567383,
"learning_rate": 4.3533834586466164e-05,
"loss": 1.081,
"step": 580
},
{
"epoch": 0.8867180161563029,
"grad_norm": 0.2575189471244812,
"learning_rate": 4.428571428571428e-05,
"loss": 1.1077,
"step": 590
},
{
"epoch": 0.9017471350742062,
"grad_norm": 0.2847520112991333,
"learning_rate": 4.503759398496241e-05,
"loss": 1.1041,
"step": 600
},
{
"epoch": 0.9167762539921097,
"grad_norm": 0.31493791937828064,
"learning_rate": 4.5789473684210527e-05,
"loss": 1.1406,
"step": 610
},
{
"epoch": 0.9318053729100132,
"grad_norm": 0.2649036645889282,
"learning_rate": 4.6541353383458645e-05,
"loss": 1.0949,
"step": 620
},
{
"epoch": 0.9468344918279166,
"grad_norm": 0.29710251092910767,
"learning_rate": 4.729323308270677e-05,
"loss": 1.0853,
"step": 630
},
{
"epoch": 0.96186361074582,
"grad_norm": 0.26907584071159363,
"learning_rate": 4.804511278195489e-05,
"loss": 1.1092,
"step": 640
},
{
"epoch": 0.9768927296637234,
"grad_norm": 0.3357178568840027,
"learning_rate": 4.879699248120301e-05,
"loss": 1.1179,
"step": 650
},
{
"epoch": 0.9919218485816269,
"grad_norm": 0.2772427201271057,
"learning_rate": 4.9548872180451126e-05,
"loss": 1.0903,
"step": 660
},
{
"epoch": 1.0060116475671614,
"grad_norm": 0.26991239190101624,
"learning_rate": 5.030075187969925e-05,
"loss": 1.1113,
"step": 670
},
{
"epoch": 1.021040766485065,
"grad_norm": 0.24760043621063232,
"learning_rate": 5.1052631578947376e-05,
"loss": 1.068,
"step": 680
},
{
"epoch": 1.0360698854029682,
"grad_norm": 0.28557974100112915,
"learning_rate": 5.180451127819549e-05,
"loss": 1.0954,
"step": 690
},
{
"epoch": 1.0510990043208717,
"grad_norm": 0.3007003962993622,
"learning_rate": 5.2556390977443613e-05,
"loss": 1.0944,
"step": 700
},
{
"epoch": 1.0661281232387751,
"grad_norm": 0.30276528000831604,
"learning_rate": 5.330827067669173e-05,
"loss": 1.0945,
"step": 710
},
{
"epoch": 1.0811572421566786,
"grad_norm": 0.26913130283355713,
"learning_rate": 5.406015037593986e-05,
"loss": 1.112,
"step": 720
},
{
"epoch": 1.0961863610745821,
"grad_norm": 0.289982408285141,
"learning_rate": 5.481203007518797e-05,
"loss": 1.0891,
"step": 730
},
{
"epoch": 1.1112154799924854,
"grad_norm": 0.28320783376693726,
"learning_rate": 5.5563909774436094e-05,
"loss": 1.094,
"step": 740
},
{
"epoch": 1.1262445989103889,
"grad_norm": 0.31406116485595703,
"learning_rate": 5.631578947368421e-05,
"loss": 1.0853,
"step": 750
},
{
"epoch": 1.1412737178282923,
"grad_norm": 0.299730122089386,
"learning_rate": 5.706766917293234e-05,
"loss": 1.1048,
"step": 760
},
{
"epoch": 1.1563028367461958,
"grad_norm": 0.30774202942848206,
"learning_rate": 5.781954887218045e-05,
"loss": 1.0549,
"step": 770
},
{
"epoch": 1.1713319556640993,
"grad_norm": 0.325926810503006,
"learning_rate": 5.8571428571428575e-05,
"loss": 1.0823,
"step": 780
},
{
"epoch": 1.1863610745820026,
"grad_norm": 0.31851741671562195,
"learning_rate": 5.9323308270676694e-05,
"loss": 1.0989,
"step": 790
},
{
"epoch": 1.201390193499906,
"grad_norm": 0.3333583474159241,
"learning_rate": 6.007518796992482e-05,
"loss": 1.0625,
"step": 800
},
{
"epoch": 1.2164193124178095,
"grad_norm": 0.3349563479423523,
"learning_rate": 6.082706766917293e-05,
"loss": 1.1002,
"step": 810
},
{
"epoch": 1.231448431335713,
"grad_norm": 0.3039754629135132,
"learning_rate": 6.157894736842106e-05,
"loss": 1.0927,
"step": 820
},
{
"epoch": 1.2464775502536165,
"grad_norm": 0.3020300269126892,
"learning_rate": 6.233082706766917e-05,
"loss": 1.0983,
"step": 830
},
{
"epoch": 1.2615066691715198,
"grad_norm": 0.31834477186203003,
"learning_rate": 6.308270676691729e-05,
"loss": 1.0628,
"step": 840
},
{
"epoch": 1.2765357880894233,
"grad_norm": 0.3013087809085846,
"learning_rate": 6.383458646616541e-05,
"loss": 1.0683,
"step": 850
},
{
"epoch": 1.2915649070073267,
"grad_norm": 0.3001497983932495,
"learning_rate": 6.458646616541354e-05,
"loss": 1.0858,
"step": 860
},
{
"epoch": 1.30659402592523,
"grad_norm": 0.32003313302993774,
"learning_rate": 6.533834586466165e-05,
"loss": 1.0747,
"step": 870
},
{
"epoch": 1.3216231448431337,
"grad_norm": 0.3063625693321228,
"learning_rate": 6.609022556390978e-05,
"loss": 1.1008,
"step": 880
},
{
"epoch": 1.336652263761037,
"grad_norm": 0.27760475873947144,
"learning_rate": 6.68421052631579e-05,
"loss": 1.0903,
"step": 890
},
{
"epoch": 1.3516813826789404,
"grad_norm": 0.25132644176483154,
"learning_rate": 6.759398496240602e-05,
"loss": 1.0808,
"step": 900
},
{
"epoch": 1.366710501596844,
"grad_norm": 0.2900444567203522,
"learning_rate": 6.834586466165414e-05,
"loss": 1.0755,
"step": 910
},
{
"epoch": 1.3817396205147472,
"grad_norm": 0.2900155484676361,
"learning_rate": 6.909774436090227e-05,
"loss": 1.0797,
"step": 920
},
{
"epoch": 1.396768739432651,
"grad_norm": 0.31477174162864685,
"learning_rate": 6.984962406015037e-05,
"loss": 1.076,
"step": 930
},
{
"epoch": 1.4117978583505542,
"grad_norm": 0.3233202397823334,
"learning_rate": 7.06015037593985e-05,
"loss": 1.0968,
"step": 940
},
{
"epoch": 1.4268269772684576,
"grad_norm": 0.30731186270713806,
"learning_rate": 7.135338345864661e-05,
"loss": 1.0976,
"step": 950
},
{
"epoch": 1.4418560961863611,
"grad_norm": 0.24933114647865295,
"learning_rate": 7.210526315789474e-05,
"loss": 1.0713,
"step": 960
},
{
"epoch": 1.4568852151042644,
"grad_norm": 0.2990662753582001,
"learning_rate": 7.285714285714286e-05,
"loss": 1.0988,
"step": 970
},
{
"epoch": 1.4719143340221679,
"grad_norm": 0.25678712129592896,
"learning_rate": 7.360902255639098e-05,
"loss": 1.0874,
"step": 980
},
{
"epoch": 1.4869434529400714,
"grad_norm": 0.3273868262767792,
"learning_rate": 7.43609022556391e-05,
"loss": 1.1036,
"step": 990
},
{
"epoch": 1.5019725718579748,
"grad_norm": 0.26454275846481323,
"learning_rate": 7.511278195488723e-05,
"loss": 1.0713,
"step": 1000
},
{
"epoch": 1.5170016907758783,
"grad_norm": 0.2492770105600357,
"learning_rate": 7.586466165413533e-05,
"loss": 1.063,
"step": 1010
},
{
"epoch": 1.5320308096937816,
"grad_norm": 0.28998205065727234,
"learning_rate": 7.661654135338347e-05,
"loss": 1.0866,
"step": 1020
},
{
"epoch": 1.5470599286116853,
"grad_norm": 0.26011377573013306,
"learning_rate": 7.736842105263159e-05,
"loss": 1.0615,
"step": 1030
},
{
"epoch": 1.5620890475295885,
"grad_norm": 0.25039157271385193,
"learning_rate": 7.81203007518797e-05,
"loss": 1.0613,
"step": 1040
},
{
"epoch": 1.577118166447492,
"grad_norm": 0.26238375902175903,
"learning_rate": 7.887218045112782e-05,
"loss": 1.0927,
"step": 1050
},
{
"epoch": 1.5921472853653955,
"grad_norm": 0.23926205933094025,
"learning_rate": 7.962406015037594e-05,
"loss": 1.0568,
"step": 1060
},
{
"epoch": 1.6071764042832988,
"grad_norm": 0.24725791811943054,
"learning_rate": 8.037593984962406e-05,
"loss": 1.0772,
"step": 1070
},
{
"epoch": 1.6222055232012025,
"grad_norm": 0.25732311606407166,
"learning_rate": 8.112781954887219e-05,
"loss": 1.1058,
"step": 1080
},
{
"epoch": 1.6372346421191057,
"grad_norm": 0.2595824897289276,
"learning_rate": 8.18796992481203e-05,
"loss": 1.1056,
"step": 1090
},
{
"epoch": 1.6522637610370092,
"grad_norm": 0.25049930810928345,
"learning_rate": 8.263157894736843e-05,
"loss": 1.0818,
"step": 1100
},
{
"epoch": 1.6672928799549127,
"grad_norm": 0.2525707185268402,
"learning_rate": 8.338345864661655e-05,
"loss": 1.1147,
"step": 1110
},
{
"epoch": 1.682321998872816,
"grad_norm": 0.25421109795570374,
"learning_rate": 8.413533834586467e-05,
"loss": 1.0959,
"step": 1120
},
{
"epoch": 1.6973511177907197,
"grad_norm": 0.2396637499332428,
"learning_rate": 8.488721804511278e-05,
"loss": 1.1012,
"step": 1130
},
{
"epoch": 1.712380236708623,
"grad_norm": 0.24933594465255737,
"learning_rate": 8.56390977443609e-05,
"loss": 1.0931,
"step": 1140
},
{
"epoch": 1.7274093556265264,
"grad_norm": 0.2631904184818268,
"learning_rate": 8.639097744360902e-05,
"loss": 1.1116,
"step": 1150
},
{
"epoch": 1.74243847454443,
"grad_norm": 0.25884145498275757,
"learning_rate": 8.714285714285715e-05,
"loss": 1.0957,
"step": 1160
},
{
"epoch": 1.7574675934623332,
"grad_norm": 0.23709504306316376,
"learning_rate": 8.789473684210526e-05,
"loss": 1.0804,
"step": 1170
},
{
"epoch": 1.7724967123802369,
"grad_norm": 0.25201550126075745,
"learning_rate": 8.864661654135339e-05,
"loss": 1.0887,
"step": 1180
},
{
"epoch": 1.7875258312981401,
"grad_norm": 0.2535940110683441,
"learning_rate": 8.939849624060151e-05,
"loss": 1.0748,
"step": 1190
},
{
"epoch": 1.8025549502160436,
"grad_norm": 0.2509770691394806,
"learning_rate": 9.015037593984963e-05,
"loss": 1.1021,
"step": 1200
},
{
"epoch": 1.817584069133947,
"grad_norm": 0.23271974921226501,
"learning_rate": 9.090225563909775e-05,
"loss": 1.0516,
"step": 1210
},
{
"epoch": 1.8326131880518504,
"grad_norm": 0.249566912651062,
"learning_rate": 9.165413533834586e-05,
"loss": 1.0766,
"step": 1220
},
{
"epoch": 1.8476423069697538,
"grad_norm": 0.22922058403491974,
"learning_rate": 9.240601503759398e-05,
"loss": 1.1056,
"step": 1230
},
{
"epoch": 1.8626714258876573,
"grad_norm": 0.24767987430095673,
"learning_rate": 9.315789473684211e-05,
"loss": 1.0934,
"step": 1240
},
{
"epoch": 1.8777005448055608,
"grad_norm": 0.23084762692451477,
"learning_rate": 9.390977443609022e-05,
"loss": 1.0894,
"step": 1250
},
{
"epoch": 1.8927296637234643,
"grad_norm": 0.24973560869693756,
"learning_rate": 9.466165413533835e-05,
"loss": 1.0788,
"step": 1260
},
{
"epoch": 1.9077587826413676,
"grad_norm": 0.248574361205101,
"learning_rate": 9.541353383458647e-05,
"loss": 1.0829,
"step": 1270
},
{
"epoch": 1.922787901559271,
"grad_norm": 0.24072329699993134,
"learning_rate": 9.616541353383459e-05,
"loss": 1.1161,
"step": 1280
},
{
"epoch": 1.9378170204771745,
"grad_norm": 0.2310166209936142,
"learning_rate": 9.69172932330827e-05,
"loss": 1.0682,
"step": 1290
},
{
"epoch": 1.952846139395078,
"grad_norm": 0.23928825557231903,
"learning_rate": 9.766917293233084e-05,
"loss": 1.1194,
"step": 1300
},
{
"epoch": 1.9678752583129815,
"grad_norm": 0.2643069624900818,
"learning_rate": 9.842105263157894e-05,
"loss": 1.0712,
"step": 1310
},
{
"epoch": 1.9829043772308848,
"grad_norm": 0.2541036307811737,
"learning_rate": 9.917293233082708e-05,
"loss": 1.0847,
"step": 1320
},
{
"epoch": 1.9979334961487882,
"grad_norm": 0.2341761291027069,
"learning_rate": 9.99248120300752e-05,
"loss": 1.0847,
"step": 1330
},
{
"epoch": 2.012023295134323,
"grad_norm": 0.2271430492401123,
"learning_rate": 9.999986051218537e-05,
"loss": 1.0459,
"step": 1340
},
{
"epoch": 2.027052414052226,
"grad_norm": 0.2847868800163269,
"learning_rate": 9.999937833308459e-05,
"loss": 1.0499,
"step": 1350
},
{
"epoch": 2.04208153297013,
"grad_norm": 0.283787339925766,
"learning_rate": 9.999855174394648e-05,
"loss": 1.0434,
"step": 1360
},
{
"epoch": 2.057110651888033,
"grad_norm": 0.3147590756416321,
"learning_rate": 9.999738075046483e-05,
"loss": 1.053,
"step": 1370
},
{
"epoch": 2.0721397708059364,
"grad_norm": 0.26797565817832947,
"learning_rate": 9.999586536070575e-05,
"loss": 1.0599,
"step": 1380
},
{
"epoch": 2.08716888972384,
"grad_norm": 0.3145821988582611,
"learning_rate": 9.99940055851077e-05,
"loss": 1.053,
"step": 1390
},
{
"epoch": 2.1021980086417433,
"grad_norm": 0.2934500277042389,
"learning_rate": 9.999180143648135e-05,
"loss": 1.0613,
"step": 1400
},
{
"epoch": 2.117227127559647,
"grad_norm": 0.26865336298942566,
"learning_rate": 9.998925293000949e-05,
"loss": 1.0548,
"step": 1410
},
{
"epoch": 2.1322562464775503,
"grad_norm": 0.3006330132484436,
"learning_rate": 9.998636008324698e-05,
"loss": 1.0362,
"step": 1420
},
{
"epoch": 2.1472853653954536,
"grad_norm": 0.3416139483451843,
"learning_rate": 9.998312291612057e-05,
"loss": 1.0588,
"step": 1430
},
{
"epoch": 2.1623144843133573,
"grad_norm": 0.3035484552383423,
"learning_rate": 9.997954145092878e-05,
"loss": 1.0675,
"step": 1440
},
{
"epoch": 2.1773436032312605,
"grad_norm": 0.2740626335144043,
"learning_rate": 9.997561571234179e-05,
"loss": 1.0435,
"step": 1450
},
{
"epoch": 2.1923727221491642,
"grad_norm": 0.2556332051753998,
"learning_rate": 9.997134572740121e-05,
"loss": 1.0803,
"step": 1460
},
{
"epoch": 2.2074018410670675,
"grad_norm": 0.30163928866386414,
"learning_rate": 9.996673152551991e-05,
"loss": 1.0734,
"step": 1470
},
{
"epoch": 2.2224309599849708,
"grad_norm": 0.3375592529773712,
"learning_rate": 9.996177313848184e-05,
"loss": 1.0906,
"step": 1480
},
{
"epoch": 2.2374600789028745,
"grad_norm": 0.2721370756626129,
"learning_rate": 9.995647060044177e-05,
"loss": 1.0335,
"step": 1490
},
{
"epoch": 2.2524891978207777,
"grad_norm": 0.26590871810913086,
"learning_rate": 9.995082394792514e-05,
"loss": 1.0448,
"step": 1500
},
{
"epoch": 2.2675183167386814,
"grad_norm": 0.31041955947875977,
"learning_rate": 9.994483321982768e-05,
"loss": 1.0715,
"step": 1510
},
{
"epoch": 2.2825474356565847,
"grad_norm": 0.2897711396217346,
"learning_rate": 9.993849845741524e-05,
"loss": 1.0564,
"step": 1520
},
{
"epoch": 2.297576554574488,
"grad_norm": 0.3064815402030945,
"learning_rate": 9.993181970432349e-05,
"loss": 1.0634,
"step": 1530
},
{
"epoch": 2.3126056734923917,
"grad_norm": 0.28484266996383667,
"learning_rate": 9.99247970065576e-05,
"loss": 1.0742,
"step": 1540
},
{
"epoch": 2.327634792410295,
"grad_norm": 0.2922673523426056,
"learning_rate": 9.99174304124919e-05,
"loss": 1.0851,
"step": 1550
},
{
"epoch": 2.3426639113281986,
"grad_norm": 0.3106658160686493,
"learning_rate": 9.990971997286961e-05,
"loss": 1.1097,
"step": 1560
},
{
"epoch": 2.357693030246102,
"grad_norm": 0.30149292945861816,
"learning_rate": 9.990166574080246e-05,
"loss": 1.048,
"step": 1570
},
{
"epoch": 2.372722149164005,
"grad_norm": 0.2597978115081787,
"learning_rate": 9.989326777177028e-05,
"loss": 1.029,
"step": 1580
},
{
"epoch": 2.387751268081909,
"grad_norm": 0.24886192381381989,
"learning_rate": 9.988452612362071e-05,
"loss": 1.054,
"step": 1590
},
{
"epoch": 2.402780386999812,
"grad_norm": 0.3196369707584381,
"learning_rate": 9.987544085656873e-05,
"loss": 1.0715,
"step": 1600
},
{
"epoch": 2.417809505917716,
"grad_norm": 0.28219732642173767,
"learning_rate": 9.986601203319623e-05,
"loss": 1.0631,
"step": 1610
},
{
"epoch": 2.432838624835619,
"grad_norm": 0.2625892162322998,
"learning_rate": 9.985623971845169e-05,
"loss": 1.0699,
"step": 1620
},
{
"epoch": 2.4478677437535223,
"grad_norm": 0.26191845536231995,
"learning_rate": 9.984612397964956e-05,
"loss": 1.0536,
"step": 1630
},
{
"epoch": 2.462896862671426,
"grad_norm": 0.27230942249298096,
"learning_rate": 9.983566488646999e-05,
"loss": 1.0924,
"step": 1640
},
{
"epoch": 2.4779259815893293,
"grad_norm": 0.2692161500453949,
"learning_rate": 9.982486251095817e-05,
"loss": 1.0414,
"step": 1650
},
{
"epoch": 2.492955100507233,
"grad_norm": 0.2909376323223114,
"learning_rate": 9.981371692752401e-05,
"loss": 1.0797,
"step": 1660
},
{
"epoch": 2.5079842194251363,
"grad_norm": 0.3020433783531189,
"learning_rate": 9.980222821294143e-05,
"loss": 1.0637,
"step": 1670
},
{
"epoch": 2.5230133383430395,
"grad_norm": 0.2783840596675873,
"learning_rate": 9.979039644634802e-05,
"loss": 1.0617,
"step": 1680
},
{
"epoch": 2.5380424572609432,
"grad_norm": 0.27026644349098206,
"learning_rate": 9.977822170924434e-05,
"loss": 1.0515,
"step": 1690
},
{
"epoch": 2.5530715761788465,
"grad_norm": 0.2597585618495941,
"learning_rate": 9.97657040854935e-05,
"loss": 1.0541,
"step": 1700
},
{
"epoch": 2.56810069509675,
"grad_norm": 0.2972753345966339,
"learning_rate": 9.975284366132047e-05,
"loss": 1.0541,
"step": 1710
},
{
"epoch": 2.5831298140146535,
"grad_norm": 0.25682052969932556,
"learning_rate": 9.973964052531154e-05,
"loss": 1.0533,
"step": 1720
},
{
"epoch": 2.5981589329325567,
"grad_norm": 0.2819693684577942,
"learning_rate": 9.972609476841367e-05,
"loss": 1.0458,
"step": 1730
},
{
"epoch": 2.61318805185046,
"grad_norm": 0.28979477286338806,
"learning_rate": 9.971220648393394e-05,
"loss": 1.0747,
"step": 1740
},
{
"epoch": 2.6282171707683637,
"grad_norm": 0.2849046289920807,
"learning_rate": 9.96979757675388e-05,
"loss": 1.05,
"step": 1750
},
{
"epoch": 2.6432462896862674,
"grad_norm": 0.28079524636268616,
"learning_rate": 9.968340271725352e-05,
"loss": 1.0755,
"step": 1760
},
{
"epoch": 2.6582754086041707,
"grad_norm": 0.27980852127075195,
"learning_rate": 9.966848743346144e-05,
"loss": 1.0874,
"step": 1770
},
{
"epoch": 2.673304527522074,
"grad_norm": 0.25519728660583496,
"learning_rate": 9.965323001890331e-05,
"loss": 1.0319,
"step": 1780
},
{
"epoch": 2.688333646439977,
"grad_norm": 0.25402480363845825,
"learning_rate": 9.963763057867656e-05,
"loss": 1.0268,
"step": 1790
},
{
"epoch": 2.703362765357881,
"grad_norm": 0.25798556208610535,
"learning_rate": 9.962168922023462e-05,
"loss": 1.0365,
"step": 1800
},
{
"epoch": 2.7183918842757846,
"grad_norm": 0.2535860538482666,
"learning_rate": 9.960540605338613e-05,
"loss": 1.0543,
"step": 1810
},
{
"epoch": 2.733421003193688,
"grad_norm": 0.26214438676834106,
"learning_rate": 9.958878119029418e-05,
"loss": 1.0336,
"step": 1820
},
{
"epoch": 2.748450122111591,
"grad_norm": 0.27087315917015076,
"learning_rate": 9.957181474547563e-05,
"loss": 1.0457,
"step": 1830
},
{
"epoch": 2.7634792410294944,
"grad_norm": 0.27433788776397705,
"learning_rate": 9.955450683580018e-05,
"loss": 1.07,
"step": 1840
},
{
"epoch": 2.778508359947398,
"grad_norm": 0.2705138027667999,
"learning_rate": 9.953685758048967e-05,
"loss": 1.0403,
"step": 1850
},
{
"epoch": 2.793537478865302,
"grad_norm": 0.2626933157444,
"learning_rate": 9.951886710111723e-05,
"loss": 1.0464,
"step": 1860
},
{
"epoch": 2.808566597783205,
"grad_norm": 0.27033478021621704,
"learning_rate": 9.950053552160644e-05,
"loss": 1.0653,
"step": 1870
},
{
"epoch": 2.8235957167011083,
"grad_norm": 0.2985825836658478,
"learning_rate": 9.948186296823048e-05,
"loss": 1.0417,
"step": 1880
},
{
"epoch": 2.8386248356190116,
"grad_norm": 0.2883852422237396,
"learning_rate": 9.94628495696112e-05,
"loss": 1.0503,
"step": 1890
},
{
"epoch": 2.8536539545369153,
"grad_norm": 0.25887343287467957,
"learning_rate": 9.94434954567184e-05,
"loss": 1.0526,
"step": 1900
},
{
"epoch": 2.868683073454819,
"grad_norm": 0.26801565289497375,
"learning_rate": 9.94238007628687e-05,
"loss": 1.0917,
"step": 1910
},
{
"epoch": 2.8837121923727222,
"grad_norm": 0.2502713203430176,
"learning_rate": 9.940376562372482e-05,
"loss": 1.0638,
"step": 1920
},
{
"epoch": 2.8987413112906255,
"grad_norm": 0.2549043297767639,
"learning_rate": 9.93833901772945e-05,
"loss": 1.0438,
"step": 1930
},
{
"epoch": 2.9137704302085288,
"grad_norm": 0.26013997197151184,
"learning_rate": 9.936267456392971e-05,
"loss": 1.0759,
"step": 1940
},
{
"epoch": 2.9287995491264325,
"grad_norm": 0.29080161452293396,
"learning_rate": 9.934161892632547e-05,
"loss": 1.0387,
"step": 1950
},
{
"epoch": 2.9438286680443357,
"grad_norm": 0.27860552072525024,
"learning_rate": 9.932022340951909e-05,
"loss": 1.0339,
"step": 1960
},
{
"epoch": 2.9588577869622394,
"grad_norm": 0.25391969084739685,
"learning_rate": 9.929848816088897e-05,
"loss": 1.0503,
"step": 1970
},
{
"epoch": 2.9738869058801427,
"grad_norm": 0.2683584690093994,
"learning_rate": 9.927641333015377e-05,
"loss": 1.0617,
"step": 1980
},
{
"epoch": 2.988916024798046,
"grad_norm": 0.29328426718711853,
"learning_rate": 9.925399906937123e-05,
"loss": 1.068,
"step": 1990
},
{
"epoch": 3.003005823783581,
"grad_norm": 0.26925235986709595,
"learning_rate": 9.923124553293718e-05,
"loss": 1.0641,
"step": 2000
},
{
"epoch": 3.018034942701484,
"grad_norm": 0.2933187186717987,
"learning_rate": 9.920815287758451e-05,
"loss": 1.0264,
"step": 2010
},
{
"epoch": 3.0330640616193874,
"grad_norm": 0.30965468287467957,
"learning_rate": 9.918472126238206e-05,
"loss": 1.0154,
"step": 2020
},
{
"epoch": 3.048093180537291,
"grad_norm": 0.3275061547756195,
"learning_rate": 9.916095084873347e-05,
"loss": 0.9905,
"step": 2030
},
{
"epoch": 3.0631222994551943,
"grad_norm": 0.40177953243255615,
"learning_rate": 9.913684180037619e-05,
"loss": 1.0066,
"step": 2040
},
{
"epoch": 3.078151418373098,
"grad_norm": 0.389649361371994,
"learning_rate": 9.911239428338023e-05,
"loss": 1.0424,
"step": 2050
},
{
"epoch": 3.0931805372910013,
"grad_norm": 0.3205302953720093,
"learning_rate": 9.908760846614709e-05,
"loss": 1.0234,
"step": 2060
},
{
"epoch": 3.1082096562089045,
"grad_norm": 0.3212546408176422,
"learning_rate": 9.906248451940861e-05,
"loss": 1.0075,
"step": 2070
},
{
"epoch": 3.1232387751268083,
"grad_norm": 0.33269983530044556,
"learning_rate": 9.903702261622567e-05,
"loss": 1.0039,
"step": 2080
},
{
"epoch": 3.1382678940447115,
"grad_norm": 0.34872928261756897,
"learning_rate": 9.901122293198719e-05,
"loss": 0.9952,
"step": 2090
},
{
"epoch": 3.153297012962615,
"grad_norm": 0.348037987947464,
"learning_rate": 9.898508564440879e-05,
"loss": 1.0133,
"step": 2100
},
{
"epoch": 3.1683261318805185,
"grad_norm": 0.3966461420059204,
"learning_rate": 9.895861093353158e-05,
"loss": 1.0049,
"step": 2110
},
{
"epoch": 3.1833552507984217,
"grad_norm": 0.3553076684474945,
"learning_rate": 9.893179898172095e-05,
"loss": 0.9789,
"step": 2120
},
{
"epoch": 3.1983843697163254,
"grad_norm": 0.38464319705963135,
"learning_rate": 9.890464997366529e-05,
"loss": 1.0062,
"step": 2130
},
{
"epoch": 3.2134134886342287,
"grad_norm": 0.3749645948410034,
"learning_rate": 9.887716409637478e-05,
"loss": 1.0364,
"step": 2140
},
{
"epoch": 3.2284426075521324,
"grad_norm": 0.3553982675075531,
"learning_rate": 9.884934153917997e-05,
"loss": 0.9896,
"step": 2150
},
{
"epoch": 3.2434717264700357,
"grad_norm": 0.34840455651283264,
"learning_rate": 9.882118249373063e-05,
"loss": 0.9954,
"step": 2160
},
{
"epoch": 3.258500845387939,
"grad_norm": 0.34040772914886475,
"learning_rate": 9.879268715399432e-05,
"loss": 1.0224,
"step": 2170
},
{
"epoch": 3.2735299643058426,
"grad_norm": 0.37151041626930237,
"learning_rate": 9.87638557162551e-05,
"loss": 0.9864,
"step": 2180
},
{
"epoch": 3.288559083223746,
"grad_norm": 0.34764307737350464,
"learning_rate": 9.87346883791122e-05,
"loss": 1.0121,
"step": 2190
},
{
"epoch": 3.3035882021416496,
"grad_norm": 0.3537833094596863,
"learning_rate": 9.870518534347853e-05,
"loss": 0.9952,
"step": 2200
},
{
"epoch": 3.318617321059553,
"grad_norm": 0.3364524245262146,
"learning_rate": 9.867534681257951e-05,
"loss": 1.0383,
"step": 2210
},
{
"epoch": 3.333646439977456,
"grad_norm": 0.33494752645492554,
"learning_rate": 9.864517299195144e-05,
"loss": 1.0318,
"step": 2220
},
{
"epoch": 3.34867555889536,
"grad_norm": 0.31135261058807373,
"learning_rate": 9.861466408944027e-05,
"loss": 0.9749,
"step": 2230
},
{
"epoch": 3.363704677813263,
"grad_norm": 0.36317843198776245,
"learning_rate": 9.858382031520005e-05,
"loss": 1.0232,
"step": 2240
},
{
"epoch": 3.378733796731167,
"grad_norm": 0.346181720495224,
"learning_rate": 9.855264188169152e-05,
"loss": 1.0099,
"step": 2250
},
{
"epoch": 3.39376291564907,
"grad_norm": 0.35162779688835144,
"learning_rate": 9.852112900368066e-05,
"loss": 1.0128,
"step": 2260
},
{
"epoch": 3.4087920345669733,
"grad_norm": 0.3490872383117676,
"learning_rate": 9.848928189823723e-05,
"loss": 1.0,
"step": 2270
},
{
"epoch": 3.423821153484877,
"grad_norm": 0.3363298177719116,
"learning_rate": 9.845710078473316e-05,
"loss": 1.0171,
"step": 2280
},
{
"epoch": 3.4388502724027803,
"grad_norm": 0.323453813791275,
"learning_rate": 9.842458588484123e-05,
"loss": 0.9908,
"step": 2290
},
{
"epoch": 3.453879391320684,
"grad_norm": 0.3421192765235901,
"learning_rate": 9.839173742253334e-05,
"loss": 1.0134,
"step": 2300
},
{
"epoch": 3.4689085102385873,
"grad_norm": 0.33773696422576904,
"learning_rate": 9.835855562407912e-05,
"loss": 0.9938,
"step": 2310
},
{
"epoch": 3.4839376291564905,
"grad_norm": 0.34854745864868164,
"learning_rate": 9.83250407180443e-05,
"loss": 0.9922,
"step": 2320
},
{
"epoch": 3.4989667480743942,
"grad_norm": 0.35300213098526,
"learning_rate": 9.829119293528916e-05,
"loss": 1.0067,
"step": 2330
},
{
"epoch": 3.5139958669922975,
"grad_norm": 0.34796491265296936,
"learning_rate": 9.82570125089669e-05,
"loss": 1.0133,
"step": 2340
},
{
"epoch": 3.529024985910201,
"grad_norm": 0.35767292976379395,
"learning_rate": 9.822249967452213e-05,
"loss": 1.0187,
"step": 2350
},
{
"epoch": 3.5440541048281045,
"grad_norm": 0.3610760569572449,
"learning_rate": 9.818765466968909e-05,
"loss": 1.0044,
"step": 2360
},
{
"epoch": 3.5590832237460077,
"grad_norm": 0.3299923241138458,
"learning_rate": 9.815247773449018e-05,
"loss": 0.9999,
"step": 2370
},
{
"epoch": 3.5741123426639114,
"grad_norm": 0.27984675765037537,
"learning_rate": 9.81169691112342e-05,
"loss": 0.9758,
"step": 2380
},
{
"epoch": 3.5891414615818147,
"grad_norm": 0.30341655015945435,
"learning_rate": 9.80811290445147e-05,
"loss": 1.0024,
"step": 2390
},
{
"epoch": 3.6041705804997184,
"grad_norm": 0.33460941910743713,
"learning_rate": 9.804495778120833e-05,
"loss": 1.0167,
"step": 2400
},
{
"epoch": 3.6191996994176217,
"grad_norm": 0.33041292428970337,
"learning_rate": 9.800845557047314e-05,
"loss": 1.0108,
"step": 2410
},
{
"epoch": 3.634228818335525,
"grad_norm": 0.304404079914093,
"learning_rate": 9.797162266374676e-05,
"loss": 1.0052,
"step": 2420
},
{
"epoch": 3.6492579372534286,
"grad_norm": 0.3226507008075714,
"learning_rate": 9.793445931474485e-05,
"loss": 1.0087,
"step": 2430
},
{
"epoch": 3.664287056171332,
"grad_norm": 0.3016469180583954,
"learning_rate": 9.789696577945917e-05,
"loss": 1.0068,
"step": 2440
},
{
"epoch": 3.6793161750892356,
"grad_norm": 0.317958265542984,
"learning_rate": 9.785914231615594e-05,
"loss": 1.0256,
"step": 2450
},
{
"epoch": 3.694345294007139,
"grad_norm": 0.3319275677204132,
"learning_rate": 9.782098918537399e-05,
"loss": 0.9882,
"step": 2460
},
{
"epoch": 3.709374412925042,
"grad_norm": 0.34686529636383057,
"learning_rate": 9.778250664992304e-05,
"loss": 1.0071,
"step": 2470
},
{
"epoch": 3.724403531842946,
"grad_norm": 0.36334285140037537,
"learning_rate": 9.77436949748818e-05,
"loss": 1.0086,
"step": 2480
},
{
"epoch": 3.739432650760849,
"grad_norm": 0.36445969343185425,
"learning_rate": 9.770455442759621e-05,
"loss": 1.0285,
"step": 2490
},
{
"epoch": 3.754461769678753,
"grad_norm": 0.32181107997894287,
"learning_rate": 9.766508527767757e-05,
"loss": 1.0374,
"step": 2500
},
{
"epoch": 3.769490888596656,
"grad_norm": 0.371354341506958,
"learning_rate": 9.762528779700067e-05,
"loss": 1.0192,
"step": 2510
},
{
"epoch": 3.7845200075145593,
"grad_norm": 0.3308964669704437,
"learning_rate": 9.758516225970198e-05,
"loss": 1.0117,
"step": 2520
},
{
"epoch": 3.799549126432463,
"grad_norm": 0.35072851181030273,
"learning_rate": 9.754470894217767e-05,
"loss": 1.02,
"step": 2530
},
{
"epoch": 3.8145782453503663,
"grad_norm": 0.3249657452106476,
"learning_rate": 9.750392812308178e-05,
"loss": 1.0205,
"step": 2540
},
{
"epoch": 3.82960736426827,
"grad_norm": 0.3178282380104065,
"learning_rate": 9.74628200833243e-05,
"loss": 1.0244,
"step": 2550
},
{
"epoch": 3.8446364831861732,
"grad_norm": 0.3914138674736023,
"learning_rate": 9.742138510606915e-05,
"loss": 1.0201,
"step": 2560
},
{
"epoch": 3.8596656021040765,
"grad_norm": 0.3437259793281555,
"learning_rate": 9.737962347673231e-05,
"loss": 1.0067,
"step": 2570
},
{
"epoch": 3.87469472102198,
"grad_norm": 0.3310168385505676,
"learning_rate": 9.733753548297988e-05,
"loss": 1.0215,
"step": 2580
},
{
"epoch": 3.8897238399398835,
"grad_norm": 0.35641738772392273,
"learning_rate": 9.729512141472599e-05,
"loss": 1.0181,
"step": 2590
},
{
"epoch": 3.904752958857787,
"grad_norm": 0.36426904797554016,
"learning_rate": 9.725238156413089e-05,
"loss": 1.0174,
"step": 2600
},
{
"epoch": 3.9197820777756904,
"grad_norm": 0.3366813659667969,
"learning_rate": 9.720931622559893e-05,
"loss": 1.0126,
"step": 2610
},
{
"epoch": 3.9348111966935937,
"grad_norm": 0.3486657440662384,
"learning_rate": 9.716592569577646e-05,
"loss": 1.0161,
"step": 2620
},
{
"epoch": 3.9498403156114974,
"grad_norm": 0.3317498564720154,
"learning_rate": 9.712221027354991e-05,
"loss": 1.0171,
"step": 2630
},
{
"epoch": 3.9648694345294007,
"grad_norm": 0.3477359712123871,
"learning_rate": 9.707817026004362e-05,
"loss": 1.0195,
"step": 2640
},
{
"epoch": 3.9798985534473044,
"grad_norm": 0.30774736404418945,
"learning_rate": 9.70338059586178e-05,
"loss": 1.0261,
"step": 2650
},
{
"epoch": 3.9949276723652076,
"grad_norm": 0.38554686307907104,
"learning_rate": 9.698911767486649e-05,
"loss": 1.0376,
"step": 2660
},
{
"epoch": 4.0090174713507425,
"grad_norm": 0.40208327770233154,
"learning_rate": 9.694410571661537e-05,
"loss": 0.9654,
"step": 2670
},
{
"epoch": 4.024046590268646,
"grad_norm": 0.4230579733848572,
"learning_rate": 9.689877039391968e-05,
"loss": 0.9452,
"step": 2680
},
{
"epoch": 4.039075709186549,
"grad_norm": 0.4582759439945221,
"learning_rate": 9.685311201906215e-05,
"loss": 0.9308,
"step": 2690
},
{
"epoch": 4.054104828104452,
"grad_norm": 0.4000380337238312,
"learning_rate": 9.680713090655072e-05,
"loss": 0.9203,
"step": 2700
},
{
"epoch": 4.0691339470223555,
"grad_norm": 0.3987461030483246,
"learning_rate": 9.676082737311645e-05,
"loss": 0.9427,
"step": 2710
},
{
"epoch": 4.08416306594026,
"grad_norm": 0.4363115429878235,
"learning_rate": 9.671420173771136e-05,
"loss": 0.9249,
"step": 2720
},
{
"epoch": 4.099192184858163,
"grad_norm": 0.39811596274375916,
"learning_rate": 9.666725432150616e-05,
"loss": 0.9205,
"step": 2730
},
{
"epoch": 4.114221303776066,
"grad_norm": 0.4178659915924072,
"learning_rate": 9.661998544788813e-05,
"loss": 0.927,
"step": 2740
},
{
"epoch": 4.1292504226939695,
"grad_norm": 0.43525931239128113,
"learning_rate": 9.657239544245876e-05,
"loss": 0.9172,
"step": 2750
},
{
"epoch": 4.144279541611873,
"grad_norm": 0.38502469658851624,
"learning_rate": 9.652448463303168e-05,
"loss": 0.9331,
"step": 2760
},
{
"epoch": 4.159308660529776,
"grad_norm": 0.50247722864151,
"learning_rate": 9.647625334963024e-05,
"loss": 0.9558,
"step": 2770
},
{
"epoch": 4.17433777944768,
"grad_norm": 0.4176265597343445,
"learning_rate": 9.642770192448536e-05,
"loss": 0.9374,
"step": 2780
},
{
"epoch": 4.189366898365583,
"grad_norm": 0.4144188463687897,
"learning_rate": 9.637883069203314e-05,
"loss": 0.9119,
"step": 2790
},
{
"epoch": 4.204396017283487,
"grad_norm": 0.4362613558769226,
"learning_rate": 9.632963998891262e-05,
"loss": 0.928,
"step": 2800
},
{
"epoch": 4.21942513620139,
"grad_norm": 0.45967820286750793,
"learning_rate": 9.628013015396346e-05,
"loss": 0.9398,
"step": 2810
},
{
"epoch": 4.234454255119294,
"grad_norm": 0.4533185660839081,
"learning_rate": 9.62303015282236e-05,
"loss": 0.9586,
"step": 2820
},
{
"epoch": 4.249483374037197,
"grad_norm": 0.438513845205307,
"learning_rate": 9.618015445492688e-05,
"loss": 0.9469,
"step": 2830
},
{
"epoch": 4.264512492955101,
"grad_norm": 0.45950812101364136,
"learning_rate": 9.612968927950065e-05,
"loss": 0.9438,
"step": 2840
},
{
"epoch": 4.279541611873004,
"grad_norm": 0.42663341760635376,
"learning_rate": 9.607890634956355e-05,
"loss": 0.9461,
"step": 2850
},
{
"epoch": 4.294570730790907,
"grad_norm": 0.4346635043621063,
"learning_rate": 9.602780601492294e-05,
"loss": 0.9323,
"step": 2860
},
{
"epoch": 4.30959984970881,
"grad_norm": 0.4921177327632904,
"learning_rate": 9.597638862757255e-05,
"loss": 0.9337,
"step": 2870
},
{
"epoch": 4.3246289686267145,
"grad_norm": 0.39174574613571167,
"learning_rate": 9.592465454169004e-05,
"loss": 0.938,
"step": 2880
},
{
"epoch": 4.339658087544618,
"grad_norm": 0.40984979271888733,
"learning_rate": 9.587260411363465e-05,
"loss": 0.9461,
"step": 2890
},
{
"epoch": 4.354687206462521,
"grad_norm": 0.37494781613349915,
"learning_rate": 9.582023770194461e-05,
"loss": 0.9407,
"step": 2900
},
{
"epoch": 4.369716325380424,
"grad_norm": 0.35851216316223145,
"learning_rate": 9.57675556673348e-05,
"loss": 0.9285,
"step": 2910
},
{
"epoch": 4.3847454442983285,
"grad_norm": 0.37766364216804504,
"learning_rate": 9.571455837269411e-05,
"loss": 0.9268,
"step": 2920
},
{
"epoch": 4.399774563216232,
"grad_norm": 0.45168834924697876,
"learning_rate": 9.566124618308312e-05,
"loss": 0.9593,
"step": 2930
},
{
"epoch": 4.414803682134135,
"grad_norm": 0.43097320199012756,
"learning_rate": 9.560761946573143e-05,
"loss": 0.9537,
"step": 2940
},
{
"epoch": 4.429832801052038,
"grad_norm": 0.415606826543808,
"learning_rate": 9.555367859003525e-05,
"loss": 0.929,
"step": 2950
},
{
"epoch": 4.4448619199699415,
"grad_norm": 0.3891099989414215,
"learning_rate": 9.54994239275548e-05,
"loss": 0.9103,
"step": 2960
},
{
"epoch": 4.459891038887845,
"grad_norm": 0.3769884705543518,
"learning_rate": 9.544485585201169e-05,
"loss": 0.9234,
"step": 2970
},
{
"epoch": 4.474920157805749,
"grad_norm": 0.46022331714630127,
"learning_rate": 9.538997473928647e-05,
"loss": 0.9734,
"step": 2980
},
{
"epoch": 4.489949276723652,
"grad_norm": 0.36743420362472534,
"learning_rate": 9.533478096741597e-05,
"loss": 0.9025,
"step": 2990
},
{
"epoch": 4.5049783956415554,
"grad_norm": 0.4562210738658905,
"learning_rate": 9.527927491659068e-05,
"loss": 0.9444,
"step": 3000
},
{
"epoch": 4.520007514559459,
"grad_norm": 0.4317024052143097,
"learning_rate": 9.522345696915218e-05,
"loss": 0.9301,
"step": 3010
},
{
"epoch": 4.535036633477363,
"grad_norm": 0.43993476033210754,
"learning_rate": 9.51673275095905e-05,
"loss": 0.9425,
"step": 3020
},
{
"epoch": 4.550065752395266,
"grad_norm": 0.34426409006118774,
"learning_rate": 9.51108869245414e-05,
"loss": 0.9348,
"step": 3030
},
{
"epoch": 4.565094871313169,
"grad_norm": 0.44477733969688416,
"learning_rate": 9.505413560278382e-05,
"loss": 0.9295,
"step": 3040
},
{
"epoch": 4.580123990231073,
"grad_norm": 0.4211689829826355,
"learning_rate": 9.49970739352371e-05,
"loss": 0.933,
"step": 3050
},
{
"epoch": 4.595153109148976,
"grad_norm": 0.45019835233688354,
"learning_rate": 9.493970231495835e-05,
"loss": 0.9471,
"step": 3060
},
{
"epoch": 4.610182228066879,
"grad_norm": 0.42713072896003723,
"learning_rate": 9.488202113713973e-05,
"loss": 0.953,
"step": 3070
},
{
"epoch": 4.625211346984783,
"grad_norm": 0.41138195991516113,
"learning_rate": 9.482403079910571e-05,
"loss": 0.9398,
"step": 3080
},
{
"epoch": 4.640240465902687,
"grad_norm": 0.42336663603782654,
"learning_rate": 9.476573170031035e-05,
"loss": 0.9342,
"step": 3090
},
{
"epoch": 4.65526958482059,
"grad_norm": 0.4236120581626892,
"learning_rate": 9.470712424233452e-05,
"loss": 0.9306,
"step": 3100
},
{
"epoch": 4.670298703738493,
"grad_norm": 0.47870710492134094,
"learning_rate": 9.464820882888319e-05,
"loss": 0.9763,
"step": 3110
},
{
"epoch": 4.685327822656397,
"grad_norm": 0.44699183106422424,
"learning_rate": 9.45889858657826e-05,
"loss": 0.9479,
"step": 3120
},
{
"epoch": 4.7003569415743005,
"grad_norm": 0.41658318042755127,
"learning_rate": 9.452945576097748e-05,
"loss": 0.9381,
"step": 3130
},
{
"epoch": 4.715386060492204,
"grad_norm": 0.42650163173675537,
"learning_rate": 9.446961892452824e-05,
"loss": 0.9333,
"step": 3140
},
{
"epoch": 4.730415179410107,
"grad_norm": 0.4480834901332855,
"learning_rate": 9.440947576860814e-05,
"loss": 0.9349,
"step": 3150
},
{
"epoch": 4.74544429832801,
"grad_norm": 0.41825857758522034,
"learning_rate": 9.434902670750047e-05,
"loss": 0.9768,
"step": 3160
},
{
"epoch": 4.7604734172459136,
"grad_norm": 0.38604798913002014,
"learning_rate": 9.428827215759568e-05,
"loss": 0.9374,
"step": 3170
},
{
"epoch": 4.775502536163818,
"grad_norm": 0.43158042430877686,
"learning_rate": 9.42272125373885e-05,
"loss": 0.942,
"step": 3180
},
{
"epoch": 4.790531655081721,
"grad_norm": 0.4181406497955322,
"learning_rate": 9.416584826747509e-05,
"loss": 0.9427,
"step": 3190
},
{
"epoch": 4.805560773999624,
"grad_norm": 0.42289501428604126,
"learning_rate": 9.410417977055011e-05,
"loss": 0.9731,
"step": 3200
},
{
"epoch": 4.8205898929175275,
"grad_norm": 0.42214304208755493,
"learning_rate": 9.404220747140382e-05,
"loss": 0.9236,
"step": 3210
},
{
"epoch": 4.835619011835432,
"grad_norm": 0.4040350019931793,
"learning_rate": 9.397993179691917e-05,
"loss": 0.9478,
"step": 3220
},
{
"epoch": 4.850648130753335,
"grad_norm": 0.40848028659820557,
"learning_rate": 9.391735317606885e-05,
"loss": 0.955,
"step": 3230
},
{
"epoch": 4.865677249671238,
"grad_norm": 0.46537673473358154,
"learning_rate": 9.385447203991231e-05,
"loss": 0.9618,
"step": 3240
},
{
"epoch": 4.880706368589141,
"grad_norm": 0.419888973236084,
"learning_rate": 9.379128882159283e-05,
"loss": 0.9686,
"step": 3250
},
{
"epoch": 4.895735487507045,
"grad_norm": 0.3668920397758484,
"learning_rate": 9.372780395633451e-05,
"loss": 0.9389,
"step": 3260
},
{
"epoch": 4.910764606424948,
"grad_norm": 0.3719962239265442,
"learning_rate": 9.36640178814393e-05,
"loss": 0.9546,
"step": 3270
},
{
"epoch": 4.925793725342852,
"grad_norm": 0.3528194725513458,
"learning_rate": 9.359993103628393e-05,
"loss": 0.9492,
"step": 3280
},
{
"epoch": 4.940822844260755,
"grad_norm": 0.4485328495502472,
"learning_rate": 9.353554386231695e-05,
"loss": 0.9555,
"step": 3290
},
{
"epoch": 4.955851963178659,
"grad_norm": 0.4136585593223572,
"learning_rate": 9.347085680305565e-05,
"loss": 0.9383,
"step": 3300
},
{
"epoch": 4.970881082096562,
"grad_norm": 0.4350145757198334,
"learning_rate": 9.340587030408304e-05,
"loss": 0.9432,
"step": 3310
},
{
"epoch": 4.985910201014466,
"grad_norm": 0.5096591114997864,
"learning_rate": 9.334058481304471e-05,
"loss": 0.9451,
"step": 3320
},
{
"epoch": 5.0,
"grad_norm": 0.6608612537384033,
"learning_rate": 9.327500077964584e-05,
"loss": 0.935,
"step": 3330
},
{
"epoch": 5.015029118917903,
"grad_norm": 0.4970506429672241,
"learning_rate": 9.320911865564802e-05,
"loss": 0.8215,
"step": 3340
},
{
"epoch": 5.0300582378358065,
"grad_norm": 0.4373551607131958,
"learning_rate": 9.314293889486619e-05,
"loss": 0.8335,
"step": 3350
},
{
"epoch": 5.045087356753711,
"grad_norm": 0.47342097759246826,
"learning_rate": 9.30764619531655e-05,
"loss": 0.8232,
"step": 3360
},
{
"epoch": 5.060116475671614,
"grad_norm": 0.4043892025947571,
"learning_rate": 9.300968828845817e-05,
"loss": 0.8394,
"step": 3370
},
{
"epoch": 5.075145594589517,
"grad_norm": 0.5077358484268188,
"learning_rate": 9.294261836070032e-05,
"loss": 0.8202,
"step": 3380
},
{
"epoch": 5.0901747135074205,
"grad_norm": 0.5389407277107239,
"learning_rate": 9.28752526318888e-05,
"loss": 0.812,
"step": 3390
},
{
"epoch": 5.105203832425324,
"grad_norm": 0.5698477625846863,
"learning_rate": 9.28075915660581e-05,
"loss": 0.8424,
"step": 3400
},
{
"epoch": 5.120232951343228,
"grad_norm": 0.47804853320121765,
"learning_rate": 9.273963562927695e-05,
"loss": 0.8513,
"step": 3410
},
{
"epoch": 5.135262070261131,
"grad_norm": 0.5664450526237488,
"learning_rate": 9.267138528964536e-05,
"loss": 0.8276,
"step": 3420
},
{
"epoch": 5.150291189179034,
"grad_norm": 0.5398600697517395,
"learning_rate": 9.260284101729116e-05,
"loss": 0.8398,
"step": 3430
},
{
"epoch": 5.165320308096938,
"grad_norm": 0.5055420398712158,
"learning_rate": 9.253400328436699e-05,
"loss": 0.8297,
"step": 3440
},
{
"epoch": 5.180349427014841,
"grad_norm": 0.4511585831642151,
"learning_rate": 9.246487256504682e-05,
"loss": 0.8141,
"step": 3450
},
{
"epoch": 5.195378545932745,
"grad_norm": 0.5470993518829346,
"learning_rate": 9.239544933552286e-05,
"loss": 0.8434,
"step": 3460
},
{
"epoch": 5.210407664850648,
"grad_norm": 0.4637773036956787,
"learning_rate": 9.232573407400221e-05,
"loss": 0.8497,
"step": 3470
},
{
"epoch": 5.225436783768552,
"grad_norm": 0.4901561141014099,
"learning_rate": 9.225572726070354e-05,
"loss": 0.8361,
"step": 3480
},
{
"epoch": 5.240465902686455,
"grad_norm": 0.531245231628418,
"learning_rate": 9.218542937785384e-05,
"loss": 0.8506,
"step": 3490
},
{
"epoch": 5.255495021604358,
"grad_norm": 0.5206908583641052,
"learning_rate": 9.211484090968506e-05,
"loss": 0.8347,
"step": 3500
},
{
"epoch": 5.270524140522262,
"grad_norm": 0.5049258470535278,
"learning_rate": 9.204396234243076e-05,
"loss": 0.8383,
"step": 3510
},
{
"epoch": 5.2855532594401655,
"grad_norm": 0.5462550520896912,
"learning_rate": 9.197279416432284e-05,
"loss": 0.8301,
"step": 3520
},
{
"epoch": 5.300582378358069,
"grad_norm": 0.5243920683860779,
"learning_rate": 9.190133686558808e-05,
"loss": 0.8392,
"step": 3530
},
{
"epoch": 5.315611497275972,
"grad_norm": 0.5010761618614197,
"learning_rate": 9.182959093844483e-05,
"loss": 0.8215,
"step": 3540
},
{
"epoch": 5.330640616193875,
"grad_norm": 0.5377451181411743,
"learning_rate": 9.175755687709956e-05,
"loss": 0.8311,
"step": 3550
},
{
"epoch": 5.3456697351117795,
"grad_norm": 0.5271348357200623,
"learning_rate": 9.168523517774356e-05,
"loss": 0.8266,
"step": 3560
},
{
"epoch": 5.360698854029683,
"grad_norm": 0.48982876539230347,
"learning_rate": 9.161262633854935e-05,
"loss": 0.8571,
"step": 3570
},
{
"epoch": 5.375727972947586,
"grad_norm": 0.5555334687232971,
"learning_rate": 9.153973085966746e-05,
"loss": 0.8414,
"step": 3580
},
{
"epoch": 5.390757091865489,
"grad_norm": 0.5088291764259338,
"learning_rate": 9.146654924322277e-05,
"loss": 0.8541,
"step": 3590
},
{
"epoch": 5.4057862107833925,
"grad_norm": 0.6044062376022339,
"learning_rate": 9.139308199331125e-05,
"loss": 0.8553,
"step": 3600
},
{
"epoch": 5.420815329701297,
"grad_norm": 0.549253523349762,
"learning_rate": 9.131932961599636e-05,
"loss": 0.8303,
"step": 3610
},
{
"epoch": 5.4358444486192,
"grad_norm": 0.5907899737358093,
"learning_rate": 9.124529261930559e-05,
"loss": 0.8264,
"step": 3620
},
{
"epoch": 5.450873567537103,
"grad_norm": 0.5540890097618103,
"learning_rate": 9.117097151322697e-05,
"loss": 0.8292,
"step": 3630
},
{
"epoch": 5.465902686455006,
"grad_norm": 0.5545858144760132,
"learning_rate": 9.109636680970557e-05,
"loss": 0.8382,
"step": 3640
},
{
"epoch": 5.48093180537291,
"grad_norm": 0.5407220721244812,
"learning_rate": 9.102147902263995e-05,
"loss": 0.863,
"step": 3650
},
{
"epoch": 5.495960924290814,
"grad_norm": 0.5022987723350525,
"learning_rate": 9.094630866787863e-05,
"loss": 0.8624,
"step": 3660
},
{
"epoch": 5.510990043208717,
"grad_norm": 0.5069270730018616,
"learning_rate": 9.087085626321657e-05,
"loss": 0.8494,
"step": 3670
},
{
"epoch": 5.52601916212662,
"grad_norm": 0.586992621421814,
"learning_rate": 9.07951223283915e-05,
"loss": 0.8708,
"step": 3680
},
{
"epoch": 5.541048281044524,
"grad_norm": 0.48386263847351074,
"learning_rate": 9.071910738508048e-05,
"loss": 0.8327,
"step": 3690
},
{
"epoch": 5.556077399962427,
"grad_norm": 0.5556206703186035,
"learning_rate": 9.064281195689621e-05,
"loss": 0.8506,
"step": 3700
},
{
"epoch": 5.571106518880331,
"grad_norm": 0.4873793423175812,
"learning_rate": 9.056623656938344e-05,
"loss": 0.8314,
"step": 3710
},
{
"epoch": 5.586135637798234,
"grad_norm": 0.5752863883972168,
"learning_rate": 9.048938175001535e-05,
"loss": 0.8559,
"step": 3720
},
{
"epoch": 5.601164756716138,
"grad_norm": 0.5001512765884399,
"learning_rate": 9.041224802818999e-05,
"loss": 0.8517,
"step": 3730
},
{
"epoch": 5.616193875634041,
"grad_norm": 0.5640326142311096,
"learning_rate": 9.033483593522651e-05,
"loss": 0.8471,
"step": 3740
},
{
"epoch": 5.631222994551944,
"grad_norm": 0.544611930847168,
"learning_rate": 9.025714600436157e-05,
"loss": 0.8314,
"step": 3750
},
{
"epoch": 5.646252113469847,
"grad_norm": 0.5598495602607727,
"learning_rate": 9.017917877074565e-05,
"loss": 0.8454,
"step": 3760
},
{
"epoch": 5.6612812323877515,
"grad_norm": 0.6049039959907532,
"learning_rate": 9.010093477143942e-05,
"loss": 0.8376,
"step": 3770
},
{
"epoch": 5.676310351305655,
"grad_norm": 0.5953666567802429,
"learning_rate": 9.002241454540992e-05,
"loss": 0.8655,
"step": 3780
},
{
"epoch": 5.691339470223558,
"grad_norm": 0.5012089610099792,
"learning_rate": 8.994361863352696e-05,
"loss": 0.8556,
"step": 3790
},
{
"epoch": 5.706368589141461,
"grad_norm": 0.5770487189292908,
"learning_rate": 8.986454757855938e-05,
"loss": 0.8613,
"step": 3800
},
{
"epoch": 5.721397708059365,
"grad_norm": 0.5475596189498901,
"learning_rate": 8.978520192517121e-05,
"loss": 0.8689,
"step": 3810
},
{
"epoch": 5.736426826977269,
"grad_norm": 0.4748040437698364,
"learning_rate": 8.970558221991807e-05,
"loss": 0.8444,
"step": 3820
},
{
"epoch": 5.751455945895172,
"grad_norm": 0.5324169993400574,
"learning_rate": 8.962568901124327e-05,
"loss": 0.8642,
"step": 3830
},
{
"epoch": 5.766485064813075,
"grad_norm": 0.5375658869743347,
"learning_rate": 8.954552284947411e-05,
"loss": 0.8528,
"step": 3840
},
{
"epoch": 5.7815141837309785,
"grad_norm": 0.5448617339134216,
"learning_rate": 8.946508428681807e-05,
"loss": 0.8394,
"step": 3850
},
{
"epoch": 5.796543302648882,
"grad_norm": 0.5199793577194214,
"learning_rate": 8.938437387735903e-05,
"loss": 0.8615,
"step": 3860
},
{
"epoch": 5.811572421566786,
"grad_norm": 0.5268539190292358,
"learning_rate": 8.930339217705337e-05,
"loss": 0.8661,
"step": 3870
},
{
"epoch": 5.826601540484689,
"grad_norm": 0.5181281566619873,
"learning_rate": 8.922213974372628e-05,
"loss": 0.8643,
"step": 3880
},
{
"epoch": 5.841630659402592,
"grad_norm": 0.5384554862976074,
"learning_rate": 8.914061713706776e-05,
"loss": 0.8355,
"step": 3890
},
{
"epoch": 5.856659778320496,
"grad_norm": 0.5838069319725037,
"learning_rate": 8.905882491862888e-05,
"loss": 0.8723,
"step": 3900
},
{
"epoch": 5.8716888972384,
"grad_norm": 0.5165135860443115,
"learning_rate": 8.897676365181784e-05,
"loss": 0.8298,
"step": 3910
},
{
"epoch": 5.886718016156303,
"grad_norm": 0.5289579033851624,
"learning_rate": 8.889443390189618e-05,
"loss": 0.8664,
"step": 3920
},
{
"epoch": 5.901747135074206,
"grad_norm": 0.4891420304775238,
"learning_rate": 8.88118362359748e-05,
"loss": 0.8503,
"step": 3930
},
{
"epoch": 5.91677625399211,
"grad_norm": 0.49529027938842773,
"learning_rate": 8.872897122301004e-05,
"loss": 0.8497,
"step": 3940
},
{
"epoch": 5.931805372910013,
"grad_norm": 0.6124776601791382,
"learning_rate": 8.864583943379987e-05,
"loss": 0.8829,
"step": 3950
},
{
"epoch": 5.946834491827916,
"grad_norm": 0.5730892419815063,
"learning_rate": 8.856244144097988e-05,
"loss": 0.8372,
"step": 3960
},
{
"epoch": 5.96186361074582,
"grad_norm": 0.5806572437286377,
"learning_rate": 8.847877781901928e-05,
"loss": 0.8661,
"step": 3970
},
{
"epoch": 5.9768927296637235,
"grad_norm": 0.5184414386749268,
"learning_rate": 8.83948491442171e-05,
"loss": 0.8747,
"step": 3980
},
{
"epoch": 5.991921848581627,
"grad_norm": 0.5810568332672119,
"learning_rate": 8.831065599469806e-05,
"loss": 0.8747,
"step": 3990
},
{
"epoch": 6.006011647567162,
"grad_norm": 0.5326306819915771,
"learning_rate": 8.822619895040868e-05,
"loss": 0.7988,
"step": 4000
},
{
"epoch": 6.021040766485065,
"grad_norm": 0.5372363924980164,
"learning_rate": 8.814147859311332e-05,
"loss": 0.712,
"step": 4010
},
{
"epoch": 6.036069885402968,
"grad_norm": 0.6200835108757019,
"learning_rate": 8.805649550639004e-05,
"loss": 0.7213,
"step": 4020
},
{
"epoch": 6.051099004320871,
"grad_norm": 0.5874983072280884,
"learning_rate": 8.797125027562665e-05,
"loss": 0.7096,
"step": 4030
},
{
"epoch": 6.066128123238775,
"grad_norm": 0.6422827243804932,
"learning_rate": 8.788574348801675e-05,
"loss": 0.7223,
"step": 4040
},
{
"epoch": 6.081157242156679,
"grad_norm": 0.641160786151886,
"learning_rate": 8.779997573255553e-05,
"loss": 0.7231,
"step": 4050
},
{
"epoch": 6.096186361074582,
"grad_norm": 0.7293818593025208,
"learning_rate": 8.771394760003593e-05,
"loss": 0.7092,
"step": 4060
},
{
"epoch": 6.111215479992485,
"grad_norm": 0.60944664478302,
"learning_rate": 8.762765968304431e-05,
"loss": 0.7203,
"step": 4070
},
{
"epoch": 6.126244598910389,
"grad_norm": 0.6189725399017334,
"learning_rate": 8.754111257595657e-05,
"loss": 0.7136,
"step": 4080
},
{
"epoch": 6.141273717828292,
"grad_norm": 0.6322532296180725,
"learning_rate": 8.745430687493396e-05,
"loss": 0.7382,
"step": 4090
},
{
"epoch": 6.156302836746196,
"grad_norm": 0.6236686706542969,
"learning_rate": 8.736724317791902e-05,
"loss": 0.7221,
"step": 4100
},
{
"epoch": 6.171331955664099,
"grad_norm": 0.5708134174346924,
"learning_rate": 8.727992208463143e-05,
"loss": 0.7205,
"step": 4110
},
{
"epoch": 6.186361074582003,
"grad_norm": 0.6412458419799805,
"learning_rate": 8.719234419656387e-05,
"loss": 0.7306,
"step": 4120
},
{
"epoch": 6.201390193499906,
"grad_norm": 0.6535741686820984,
"learning_rate": 8.710451011697793e-05,
"loss": 0.7169,
"step": 4130
},
{
"epoch": 6.216419312417809,
"grad_norm": 0.6490382552146912,
"learning_rate": 8.701642045089992e-05,
"loss": 0.7145,
"step": 4140
},
{
"epoch": 6.231448431335713,
"grad_norm": 0.7014051079750061,
"learning_rate": 8.692807580511667e-05,
"loss": 0.7569,
"step": 4150
},
{
"epoch": 6.2464775502536165,
"grad_norm": 0.7195674180984497,
"learning_rate": 8.683947678817139e-05,
"loss": 0.7244,
"step": 4160
},
{
"epoch": 6.26150666917152,
"grad_norm": 0.6836762428283691,
"learning_rate": 8.675062401035952e-05,
"loss": 0.7303,
"step": 4170
},
{
"epoch": 6.276535788089423,
"grad_norm": 0.6135929822921753,
"learning_rate": 8.666151808372439e-05,
"loss": 0.7179,
"step": 4180
},
{
"epoch": 6.291564907007326,
"grad_norm": 0.6589913368225098,
"learning_rate": 8.657215962205319e-05,
"loss": 0.7455,
"step": 4190
},
{
"epoch": 6.30659402592523,
"grad_norm": 0.6406304836273193,
"learning_rate": 8.648254924087254e-05,
"loss": 0.7496,
"step": 4200
},
{
"epoch": 6.321623144843134,
"grad_norm": 0.6410109400749207,
"learning_rate": 8.639268755744447e-05,
"loss": 0.7355,
"step": 4210
},
{
"epoch": 6.336652263761037,
"grad_norm": 0.6654278039932251,
"learning_rate": 8.630257519076196e-05,
"loss": 0.7367,
"step": 4220
},
{
"epoch": 6.35168138267894,
"grad_norm": 0.588206946849823,
"learning_rate": 8.621221276154481e-05,
"loss": 0.7255,
"step": 4230
},
{
"epoch": 6.3667105015968435,
"grad_norm": 0.633627712726593,
"learning_rate": 8.612160089223529e-05,
"loss": 0.7248,
"step": 4240
},
{
"epoch": 6.381739620514748,
"grad_norm": 0.6771560311317444,
"learning_rate": 8.603074020699393e-05,
"loss": 0.7393,
"step": 4250
},
{
"epoch": 6.396768739432651,
"grad_norm": 0.682534396648407,
"learning_rate": 8.593963133169514e-05,
"loss": 0.7406,
"step": 4260
},
{
"epoch": 6.411797858350554,
"grad_norm": 0.6308305859565735,
"learning_rate": 8.584827489392293e-05,
"loss": 0.751,
"step": 4270
},
{
"epoch": 6.426826977268457,
"grad_norm": 0.7026039958000183,
"learning_rate": 8.575667152296665e-05,
"loss": 0.7335,
"step": 4280
},
{
"epoch": 6.441856096186361,
"grad_norm": 0.6078832149505615,
"learning_rate": 8.566482184981651e-05,
"loss": 0.752,
"step": 4290
},
{
"epoch": 6.456885215104265,
"grad_norm": 0.6271105408668518,
"learning_rate": 8.557272650715939e-05,
"loss": 0.7436,
"step": 4300
},
{
"epoch": 6.471914334022168,
"grad_norm": 0.7435263991355896,
"learning_rate": 8.54803861293744e-05,
"loss": 0.7516,
"step": 4310
},
{
"epoch": 6.486943452940071,
"grad_norm": 0.6983492970466614,
"learning_rate": 8.538780135252844e-05,
"loss": 0.7369,
"step": 4320
},
{
"epoch": 6.501972571857975,
"grad_norm": 0.6141520738601685,
"learning_rate": 8.529497281437204e-05,
"loss": 0.7415,
"step": 4330
},
{
"epoch": 6.517001690775878,
"grad_norm": 0.580833375453949,
"learning_rate": 8.520190115433473e-05,
"loss": 0.7542,
"step": 4340
},
{
"epoch": 6.532030809693782,
"grad_norm": 0.6651113033294678,
"learning_rate": 8.510858701352076e-05,
"loss": 0.7251,
"step": 4350
},
{
"epoch": 6.547059928611685,
"grad_norm": 0.676468551158905,
"learning_rate": 8.501503103470466e-05,
"loss": 0.7377,
"step": 4360
},
{
"epoch": 6.5620890475295885,
"grad_norm": 0.6262651085853577,
"learning_rate": 8.492123386232677e-05,
"loss": 0.7158,
"step": 4370
},
{
"epoch": 6.577118166447492,
"grad_norm": 0.7301998138427734,
"learning_rate": 8.482719614248894e-05,
"loss": 0.7483,
"step": 4380
},
{
"epoch": 6.592147285365395,
"grad_norm": 0.602796733379364,
"learning_rate": 8.473291852294987e-05,
"loss": 0.7332,
"step": 4390
},
{
"epoch": 6.607176404283299,
"grad_norm": 0.6329184770584106,
"learning_rate": 8.463840165312082e-05,
"loss": 0.7518,
"step": 4400
},
{
"epoch": 6.6222055232012025,
"grad_norm": 0.7019734382629395,
"learning_rate": 8.454364618406106e-05,
"loss": 0.7702,
"step": 4410
},
{
"epoch": 6.637234642119106,
"grad_norm": 0.6546521782875061,
"learning_rate": 8.444865276847338e-05,
"loss": 0.751,
"step": 4420
},
{
"epoch": 6.652263761037009,
"grad_norm": 0.7014687657356262,
"learning_rate": 8.435342206069965e-05,
"loss": 0.7662,
"step": 4430
},
{
"epoch": 6.667292879954912,
"grad_norm": 0.6677362322807312,
"learning_rate": 8.425795471671625e-05,
"loss": 0.74,
"step": 4440
},
{
"epoch": 6.682321998872816,
"grad_norm": 0.6421080231666565,
"learning_rate": 8.416225139412959e-05,
"loss": 0.7491,
"step": 4450
},
{
"epoch": 6.69735111779072,
"grad_norm": 0.6495652794837952,
"learning_rate": 8.406631275217156e-05,
"loss": 0.7612,
"step": 4460
},
{
"epoch": 6.712380236708623,
"grad_norm": 0.7310630679130554,
"learning_rate": 8.397013945169501e-05,
"loss": 0.7475,
"step": 4470
},
{
"epoch": 6.727409355626526,
"grad_norm": 0.6594589948654175,
"learning_rate": 8.387373215516918e-05,
"loss": 0.7295,
"step": 4480
},
{
"epoch": 6.7424384745444295,
"grad_norm": 0.6998351216316223,
"learning_rate": 8.377709152667512e-05,
"loss": 0.756,
"step": 4490
},
{
"epoch": 6.757467593462334,
"grad_norm": 0.6579599380493164,
"learning_rate": 8.368021823190116e-05,
"loss": 0.7256,
"step": 4500
},
{
"epoch": 6.772496712380237,
"grad_norm": 0.6116402745246887,
"learning_rate": 8.358311293813832e-05,
"loss": 0.7358,
"step": 4510
},
{
"epoch": 6.78752583129814,
"grad_norm": 0.6876879930496216,
"learning_rate": 8.348577631427566e-05,
"loss": 0.7568,
"step": 4520
},
{
"epoch": 6.802554950216043,
"grad_norm": 0.6426005363464355,
"learning_rate": 8.33882090307957e-05,
"loss": 0.7563,
"step": 4530
},
{
"epoch": 6.817584069133947,
"grad_norm": 0.6187247633934021,
"learning_rate": 8.329041175976987e-05,
"loss": 0.7367,
"step": 4540
},
{
"epoch": 6.832613188051851,
"grad_norm": 0.6543039679527283,
"learning_rate": 8.319238517485375e-05,
"loss": 0.7577,
"step": 4550
},
{
"epoch": 6.847642306969754,
"grad_norm": 0.6411317586898804,
"learning_rate": 8.309412995128256e-05,
"loss": 0.7614,
"step": 4560
},
{
"epoch": 6.862671425887657,
"grad_norm": 0.7125687599182129,
"learning_rate": 8.299564676586638e-05,
"loss": 0.7572,
"step": 4570
},
{
"epoch": 6.877700544805561,
"grad_norm": 0.7412214875221252,
"learning_rate": 8.289693629698564e-05,
"loss": 0.7724,
"step": 4580
},
{
"epoch": 6.892729663723464,
"grad_norm": 0.6838482022285461,
"learning_rate": 8.279799922458629e-05,
"loss": 0.7428,
"step": 4590
},
{
"epoch": 6.907758782641368,
"grad_norm": 0.6079447269439697,
"learning_rate": 8.269883623017522e-05,
"loss": 0.7515,
"step": 4600
},
{
"epoch": 6.922787901559271,
"grad_norm": 0.7181859612464905,
"learning_rate": 8.259944799681555e-05,
"loss": 0.7472,
"step": 4610
},
{
"epoch": 6.9378170204771745,
"grad_norm": 0.7185594439506531,
"learning_rate": 8.249983520912187e-05,
"loss": 0.7582,
"step": 4620
},
{
"epoch": 6.952846139395078,
"grad_norm": 0.7397907972335815,
"learning_rate": 8.239999855325563e-05,
"loss": 0.7578,
"step": 4630
},
{
"epoch": 6.967875258312981,
"grad_norm": 0.6544892191886902,
"learning_rate": 8.229993871692028e-05,
"loss": 0.7511,
"step": 4640
},
{
"epoch": 6.982904377230885,
"grad_norm": 0.7269999384880066,
"learning_rate": 8.219965638935662e-05,
"loss": 0.7557,
"step": 4650
},
{
"epoch": 6.9979334961487885,
"grad_norm": 0.7143056392669678,
"learning_rate": 8.209915226133807e-05,
"loss": 0.7603,
"step": 4660
},
{
"epoch": 7.012023295134322,
"grad_norm": 0.740738034248352,
"learning_rate": 8.199842702516583e-05,
"loss": 0.6384,
"step": 4670
},
{
"epoch": 7.027052414052227,
"grad_norm": 0.7142441868782043,
"learning_rate": 8.189748137466417e-05,
"loss": 0.6018,
"step": 4680
},
{
"epoch": 7.04208153297013,
"grad_norm": 0.8026095628738403,
"learning_rate": 8.179631600517565e-05,
"loss": 0.6187,
"step": 4690
},
{
"epoch": 7.057110651888033,
"grad_norm": 0.8209463953971863,
"learning_rate": 8.169493161355633e-05,
"loss": 0.6178,
"step": 4700
},
{
"epoch": 7.072139770805936,
"grad_norm": 0.7156078219413757,
"learning_rate": 8.159332889817088e-05,
"loss": 0.6223,
"step": 4710
},
{
"epoch": 7.08716888972384,
"grad_norm": 0.7837380170822144,
"learning_rate": 8.149150855888794e-05,
"loss": 0.603,
"step": 4720
},
{
"epoch": 7.102198008641744,
"grad_norm": 0.7317357063293457,
"learning_rate": 8.138947129707517e-05,
"loss": 0.6183,
"step": 4730
},
{
"epoch": 7.117227127559647,
"grad_norm": 0.6778579950332642,
"learning_rate": 8.128721781559443e-05,
"loss": 0.6123,
"step": 4740
},
{
"epoch": 7.13225624647755,
"grad_norm": 0.6829363703727722,
"learning_rate": 8.118474881879701e-05,
"loss": 0.6111,
"step": 4750
},
{
"epoch": 7.147285365395454,
"grad_norm": 0.7064921855926514,
"learning_rate": 8.108206501251866e-05,
"loss": 0.6142,
"step": 4760
},
{
"epoch": 7.162314484313357,
"grad_norm": 0.7147718071937561,
"learning_rate": 8.097916710407492e-05,
"loss": 0.6128,
"step": 4770
},
{
"epoch": 7.177343603231261,
"grad_norm": 0.7428337335586548,
"learning_rate": 8.0876055802256e-05,
"loss": 0.6087,
"step": 4780
},
{
"epoch": 7.192372722149164,
"grad_norm": 0.7002803087234497,
"learning_rate": 8.077273181732207e-05,
"loss": 0.6421,
"step": 4790
},
{
"epoch": 7.2074018410670675,
"grad_norm": 0.7221034169197083,
"learning_rate": 8.066919586099834e-05,
"loss": 0.6159,
"step": 4800
},
{
"epoch": 7.222430959984971,
"grad_norm": 0.7155001759529114,
"learning_rate": 8.056544864647015e-05,
"loss": 0.6227,
"step": 4810
},
{
"epoch": 7.237460078902874,
"grad_norm": 0.828462541103363,
"learning_rate": 8.046149088837802e-05,
"loss": 0.6249,
"step": 4820
},
{
"epoch": 7.252489197820778,
"grad_norm": 0.7177339792251587,
"learning_rate": 8.035732330281273e-05,
"loss": 0.6205,
"step": 4830
},
{
"epoch": 7.267518316738681,
"grad_norm": 0.7466073632240295,
"learning_rate": 8.025294660731048e-05,
"loss": 0.6225,
"step": 4840
},
{
"epoch": 7.282547435656585,
"grad_norm": 0.7658254504203796,
"learning_rate": 8.014836152084784e-05,
"loss": 0.6259,
"step": 4850
},
{
"epoch": 7.297576554574488,
"grad_norm": 0.7269898653030396,
"learning_rate": 8.00435687638368e-05,
"loss": 0.6228,
"step": 4860
},
{
"epoch": 7.312605673492391,
"grad_norm": 0.8240427374839783,
"learning_rate": 7.993856905811991e-05,
"loss": 0.6242,
"step": 4870
},
{
"epoch": 7.327634792410295,
"grad_norm": 0.7971922755241394,
"learning_rate": 7.983336312696522e-05,
"loss": 0.6272,
"step": 4880
},
{
"epoch": 7.342663911328199,
"grad_norm": 0.7452378869056702,
"learning_rate": 7.972795169506129e-05,
"loss": 0.6214,
"step": 4890
},
{
"epoch": 7.357693030246102,
"grad_norm": 0.7922284603118896,
"learning_rate": 7.962233548851227e-05,
"loss": 0.6257,
"step": 4900
},
{
"epoch": 7.372722149164005,
"grad_norm": 0.8231662511825562,
"learning_rate": 7.951651523483283e-05,
"loss": 0.6288,
"step": 4910
},
{
"epoch": 7.387751268081908,
"grad_norm": 0.7604002952575684,
"learning_rate": 7.941049166294319e-05,
"loss": 0.6416,
"step": 4920
},
{
"epoch": 7.402780386999812,
"grad_norm": 0.7322626709938049,
"learning_rate": 7.930426550316406e-05,
"loss": 0.628,
"step": 4930
},
{
"epoch": 7.417809505917716,
"grad_norm": 0.7688371539115906,
"learning_rate": 7.919783748721168e-05,
"loss": 0.6245,
"step": 4940
},
{
"epoch": 7.432838624835619,
"grad_norm": 0.8524195551872253,
"learning_rate": 7.909120834819268e-05,
"loss": 0.6431,
"step": 4950
},
{
"epoch": 7.447867743753522,
"grad_norm": 0.8562901020050049,
"learning_rate": 7.898437882059913e-05,
"loss": 0.6291,
"step": 4960
},
{
"epoch": 7.462896862671426,
"grad_norm": 0.7663971185684204,
"learning_rate": 7.887734964030337e-05,
"loss": 0.6361,
"step": 4970
},
{
"epoch": 7.47792598158933,
"grad_norm": 0.7779290676116943,
"learning_rate": 7.87701215445531e-05,
"loss": 0.6321,
"step": 4980
},
{
"epoch": 7.492955100507233,
"grad_norm": 0.8450044393539429,
"learning_rate": 7.86626952719661e-05,
"loss": 0.6554,
"step": 4990
},
{
"epoch": 7.507984219425136,
"grad_norm": 0.7660729885101318,
"learning_rate": 7.855507156252535e-05,
"loss": 0.6546,
"step": 5000
},
{
"epoch": 7.5230133383430395,
"grad_norm": 0.9639895558357239,
"learning_rate": 7.844725115757375e-05,
"loss": 0.6388,
"step": 5010
},
{
"epoch": 7.538042457260943,
"grad_norm": 0.8670216798782349,
"learning_rate": 7.833923479980914e-05,
"loss": 0.6489,
"step": 5020
},
{
"epoch": 7.553071576178846,
"grad_norm": 0.7850314974784851,
"learning_rate": 7.823102323327911e-05,
"loss": 0.6397,
"step": 5030
},
{
"epoch": 7.56810069509675,
"grad_norm": 0.7203473448753357,
"learning_rate": 7.812261720337594e-05,
"loss": 0.6466,
"step": 5040
},
{
"epoch": 7.5831298140146535,
"grad_norm": 0.7159662246704102,
"learning_rate": 7.801401745683143e-05,
"loss": 0.6336,
"step": 5050
},
{
"epoch": 7.598158932932557,
"grad_norm": 0.8092458844184875,
"learning_rate": 7.79052247417117e-05,
"loss": 0.6415,
"step": 5060
},
{
"epoch": 7.61318805185046,
"grad_norm": 0.7300180196762085,
"learning_rate": 7.779623980741214e-05,
"loss": 0.6469,
"step": 5070
},
{
"epoch": 7.628217170768364,
"grad_norm": 0.8448249697685242,
"learning_rate": 7.768706340465219e-05,
"loss": 0.6281,
"step": 5080
},
{
"epoch": 7.643246289686267,
"grad_norm": 0.7753276824951172,
"learning_rate": 7.757769628547018e-05,
"loss": 0.644,
"step": 5090
},
{
"epoch": 7.658275408604171,
"grad_norm": 0.7004479765892029,
"learning_rate": 7.746813920321816e-05,
"loss": 0.6349,
"step": 5100
},
{
"epoch": 7.673304527522074,
"grad_norm": 0.7119005918502808,
"learning_rate": 7.735839291255667e-05,
"loss": 0.6477,
"step": 5110
},
{
"epoch": 7.688333646439977,
"grad_norm": 0.8026734590530396,
"learning_rate": 7.724845816944961e-05,
"loss": 0.6302,
"step": 5120
},
{
"epoch": 7.7033627653578804,
"grad_norm": 0.7971638441085815,
"learning_rate": 7.713833573115894e-05,
"loss": 0.642,
"step": 5130
},
{
"epoch": 7.718391884275785,
"grad_norm": 0.7363801598548889,
"learning_rate": 7.70280263562396e-05,
"loss": 0.6509,
"step": 5140
},
{
"epoch": 7.733421003193688,
"grad_norm": 0.7832568883895874,
"learning_rate": 7.691753080453412e-05,
"loss": 0.6517,
"step": 5150
},
{
"epoch": 7.748450122111591,
"grad_norm": 0.7115653157234192,
"learning_rate": 7.680684983716753e-05,
"loss": 0.6484,
"step": 5160
},
{
"epoch": 7.763479241029494,
"grad_norm": 0.7662774324417114,
"learning_rate": 7.6695984216542e-05,
"loss": 0.6496,
"step": 5170
},
{
"epoch": 7.7785083599473985,
"grad_norm": 0.7544398307800293,
"learning_rate": 7.658493470633173e-05,
"loss": 0.6394,
"step": 5180
},
{
"epoch": 7.793537478865302,
"grad_norm": 0.7812057733535767,
"learning_rate": 7.647370207147748e-05,
"loss": 0.6494,
"step": 5190
},
{
"epoch": 7.808566597783205,
"grad_norm": 0.7722028493881226,
"learning_rate": 7.636228707818154e-05,
"loss": 0.6395,
"step": 5200
},
{
"epoch": 7.823595716701108,
"grad_norm": 0.776189923286438,
"learning_rate": 7.625069049390227e-05,
"loss": 0.6474,
"step": 5210
},
{
"epoch": 7.838624835619012,
"grad_norm": 0.6927589178085327,
"learning_rate": 7.613891308734894e-05,
"loss": 0.6419,
"step": 5220
},
{
"epoch": 7.853653954536915,
"grad_norm": 0.8120152354240417,
"learning_rate": 7.60269556284763e-05,
"loss": 0.6638,
"step": 5230
},
{
"epoch": 7.868683073454819,
"grad_norm": 0.8518467545509338,
"learning_rate": 7.59148188884794e-05,
"loss": 0.6546,
"step": 5240
},
{
"epoch": 7.883712192372722,
"grad_norm": 0.8371894359588623,
"learning_rate": 7.580250363978824e-05,
"loss": 0.6567,
"step": 5250
},
{
"epoch": 7.8987413112906255,
"grad_norm": 0.8003565669059753,
"learning_rate": 7.569001065606238e-05,
"loss": 0.6443,
"step": 5260
},
{
"epoch": 7.913770430208529,
"grad_norm": 0.8672810196876526,
"learning_rate": 7.557734071218576e-05,
"loss": 0.6559,
"step": 5270
},
{
"epoch": 7.928799549126433,
"grad_norm": 0.7518348097801208,
"learning_rate": 7.546449458426117e-05,
"loss": 0.6579,
"step": 5280
},
{
"epoch": 7.943828668044336,
"grad_norm": 0.8424391150474548,
"learning_rate": 7.535147304960508e-05,
"loss": 0.6588,
"step": 5290
},
{
"epoch": 7.9588577869622394,
"grad_norm": 0.7776015996932983,
"learning_rate": 7.52382768867422e-05,
"loss": 0.6516,
"step": 5300
},
{
"epoch": 7.973886905880143,
"grad_norm": 0.8192471861839294,
"learning_rate": 7.512490687540009e-05,
"loss": 0.6686,
"step": 5310
},
{
"epoch": 7.988916024798046,
"grad_norm": 0.7316805720329285,
"learning_rate": 7.501136379650388e-05,
"loss": 0.6505,
"step": 5320
},
{
"epoch": 8.00300582378358,
"grad_norm": 0.8020321726799011,
"learning_rate": 7.489764843217082e-05,
"loss": 0.6468,
"step": 5330
},
{
"epoch": 8.018034942701485,
"grad_norm": 0.7429752349853516,
"learning_rate": 7.478376156570489e-05,
"loss": 0.5209,
"step": 5340
},
{
"epoch": 8.033064061619388,
"grad_norm": 0.7338524460792542,
"learning_rate": 7.466970398159145e-05,
"loss": 0.5215,
"step": 5350
},
{
"epoch": 8.048093180537292,
"grad_norm": 0.7771674990653992,
"learning_rate": 7.45554764654918e-05,
"loss": 0.5066,
"step": 5360
},
{
"epoch": 8.063122299455195,
"grad_norm": 0.7496100068092346,
"learning_rate": 7.444107980423778e-05,
"loss": 0.5101,
"step": 5370
},
{
"epoch": 8.078151418373098,
"grad_norm": 0.8719698786735535,
"learning_rate": 7.432651478582636e-05,
"loss": 0.513,
"step": 5380
},
{
"epoch": 8.093180537291001,
"grad_norm": 0.706078052520752,
"learning_rate": 7.42117821994142e-05,
"loss": 0.5185,
"step": 5390
},
{
"epoch": 8.108209656208905,
"grad_norm": 0.7622345685958862,
"learning_rate": 7.409688283531222e-05,
"loss": 0.5162,
"step": 5400
},
{
"epoch": 8.123238775126808,
"grad_norm": 0.7656405568122864,
"learning_rate": 7.398181748498015e-05,
"loss": 0.5137,
"step": 5410
},
{
"epoch": 8.138267894044711,
"grad_norm": 0.8089895248413086,
"learning_rate": 7.386658694102103e-05,
"loss": 0.5006,
"step": 5420
},
{
"epoch": 8.153297012962614,
"grad_norm": 0.7622844576835632,
"learning_rate": 7.375119199717591e-05,
"loss": 0.5224,
"step": 5430
},
{
"epoch": 8.16832613188052,
"grad_norm": 0.8785136342048645,
"learning_rate": 7.363563344831818e-05,
"loss": 0.5277,
"step": 5440
},
{
"epoch": 8.183355250798423,
"grad_norm": 0.8507887721061707,
"learning_rate": 7.351991209044821e-05,
"loss": 0.5203,
"step": 5450
},
{
"epoch": 8.198384369716326,
"grad_norm": 0.9602698683738708,
"learning_rate": 7.340402872068789e-05,
"loss": 0.5186,
"step": 5460
},
{
"epoch": 8.21341348863423,
"grad_norm": 0.8880749344825745,
"learning_rate": 7.328798413727503e-05,
"loss": 0.5175,
"step": 5470
},
{
"epoch": 8.228442607552132,
"grad_norm": 0.8679527640342712,
"learning_rate": 7.317177913955795e-05,
"loss": 0.513,
"step": 5480
},
{
"epoch": 8.243471726470036,
"grad_norm": 0.7859882116317749,
"learning_rate": 7.305541452798997e-05,
"loss": 0.5252,
"step": 5490
},
{
"epoch": 8.258500845387939,
"grad_norm": 0.8226519227027893,
"learning_rate": 7.293889110412387e-05,
"loss": 0.5211,
"step": 5500
},
{
"epoch": 8.273529964305842,
"grad_norm": 0.8628718256950378,
"learning_rate": 7.282220967060633e-05,
"loss": 0.5294,
"step": 5510
},
{
"epoch": 8.288559083223745,
"grad_norm": 0.9453558325767517,
"learning_rate": 7.270537103117252e-05,
"loss": 0.5238,
"step": 5520
},
{
"epoch": 8.303588202141649,
"grad_norm": 0.9046574831008911,
"learning_rate": 7.258837599064043e-05,
"loss": 0.5186,
"step": 5530
},
{
"epoch": 8.318617321059552,
"grad_norm": 0.9415176510810852,
"learning_rate": 7.24712253549054e-05,
"loss": 0.5282,
"step": 5540
},
{
"epoch": 8.333646439977457,
"grad_norm": 0.8018948435783386,
"learning_rate": 7.235391993093456e-05,
"loss": 0.5264,
"step": 5550
},
{
"epoch": 8.34867555889536,
"grad_norm": 0.818480908870697,
"learning_rate": 7.22364605267613e-05,
"loss": 0.5272,
"step": 5560
},
{
"epoch": 8.363704677813264,
"grad_norm": 0.8961235284805298,
"learning_rate": 7.211884795147958e-05,
"loss": 0.5373,
"step": 5570
},
{
"epoch": 8.378733796731167,
"grad_norm": 0.8245147466659546,
"learning_rate": 7.200108301523854e-05,
"loss": 0.5423,
"step": 5580
},
{
"epoch": 8.39376291564907,
"grad_norm": 0.8225317001342773,
"learning_rate": 7.188316652923677e-05,
"loss": 0.5374,
"step": 5590
},
{
"epoch": 8.408792034566973,
"grad_norm": 0.9353516697883606,
"learning_rate": 7.176509930571682e-05,
"loss": 0.5418,
"step": 5600
},
{
"epoch": 8.423821153484877,
"grad_norm": 0.9062713384628296,
"learning_rate": 7.16468821579595e-05,
"loss": 0.5508,
"step": 5610
},
{
"epoch": 8.43885027240278,
"grad_norm": 0.8618881106376648,
"learning_rate": 7.152851590027843e-05,
"loss": 0.5424,
"step": 5620
},
{
"epoch": 8.453879391320683,
"grad_norm": 0.8350569009780884,
"learning_rate": 7.141000134801425e-05,
"loss": 0.5433,
"step": 5630
},
{
"epoch": 8.468908510238588,
"grad_norm": 0.8575078845024109,
"learning_rate": 7.129133931752914e-05,
"loss": 0.5459,
"step": 5640
},
{
"epoch": 8.483937629156491,
"grad_norm": 0.869219183921814,
"learning_rate": 7.117253062620118e-05,
"loss": 0.5397,
"step": 5650
},
{
"epoch": 8.498966748074395,
"grad_norm": 0.900360643863678,
"learning_rate": 7.105357609241863e-05,
"loss": 0.5435,
"step": 5660
},
{
"epoch": 8.513995866992298,
"grad_norm": 0.9262248277664185,
"learning_rate": 7.093447653557441e-05,
"loss": 0.5462,
"step": 5670
},
{
"epoch": 8.529024985910201,
"grad_norm": 0.9586583971977234,
"learning_rate": 7.081523277606035e-05,
"loss": 0.5386,
"step": 5680
},
{
"epoch": 8.544054104828104,
"grad_norm": 0.8671521544456482,
"learning_rate": 7.069584563526166e-05,
"loss": 0.539,
"step": 5690
},
{
"epoch": 8.559083223746008,
"grad_norm": 0.8206884860992432,
"learning_rate": 7.057631593555111e-05,
"loss": 0.5389,
"step": 5700
},
{
"epoch": 8.574112342663911,
"grad_norm": 0.8640275597572327,
"learning_rate": 7.045664450028352e-05,
"loss": 0.5443,
"step": 5710
},
{
"epoch": 8.589141461581814,
"grad_norm": 0.8697555661201477,
"learning_rate": 7.033683215379002e-05,
"loss": 0.5488,
"step": 5720
},
{
"epoch": 8.604170580499718,
"grad_norm": 0.9721740484237671,
"learning_rate": 7.021687972137235e-05,
"loss": 0.5474,
"step": 5730
},
{
"epoch": 8.61919969941762,
"grad_norm": 0.895819902420044,
"learning_rate": 7.009678802929724e-05,
"loss": 0.5504,
"step": 5740
},
{
"epoch": 8.634228818335526,
"grad_norm": 1.060189962387085,
"learning_rate": 6.997655790479061e-05,
"loss": 0.5469,
"step": 5750
},
{
"epoch": 8.649257937253429,
"grad_norm": 0.955331563949585,
"learning_rate": 6.985619017603207e-05,
"loss": 0.5491,
"step": 5760
},
{
"epoch": 8.664287056171332,
"grad_norm": 0.9543823599815369,
"learning_rate": 6.973568567214894e-05,
"loss": 0.5549,
"step": 5770
},
{
"epoch": 8.679316175089236,
"grad_norm": 0.8880019187927246,
"learning_rate": 6.961504522321076e-05,
"loss": 0.5466,
"step": 5780
},
{
"epoch": 8.694345294007139,
"grad_norm": 0.8980219960212708,
"learning_rate": 6.949426966022354e-05,
"loss": 0.5321,
"step": 5790
},
{
"epoch": 8.709374412925042,
"grad_norm": 0.9821533560752869,
"learning_rate": 6.937335981512389e-05,
"loss": 0.5466,
"step": 5800
},
{
"epoch": 8.724403531842945,
"grad_norm": 0.9177353978157043,
"learning_rate": 6.925231652077348e-05,
"loss": 0.5568,
"step": 5810
},
{
"epoch": 8.739432650760849,
"grad_norm": 0.9436571002006531,
"learning_rate": 6.913114061095319e-05,
"loss": 0.5537,
"step": 5820
},
{
"epoch": 8.754461769678752,
"grad_norm": 0.8605087995529175,
"learning_rate": 6.900983292035739e-05,
"loss": 0.5456,
"step": 5830
},
{
"epoch": 8.769490888596657,
"grad_norm": 0.9178728461265564,
"learning_rate": 6.888839428458818e-05,
"loss": 0.5522,
"step": 5840
},
{
"epoch": 8.78452000751456,
"grad_norm": 0.8443792462348938,
"learning_rate": 6.876682554014967e-05,
"loss": 0.5465,
"step": 5850
},
{
"epoch": 8.799549126432463,
"grad_norm": 0.8694719076156616,
"learning_rate": 6.86451275244422e-05,
"loss": 0.5516,
"step": 5860
},
{
"epoch": 8.814578245350367,
"grad_norm": 0.8430178165435791,
"learning_rate": 6.852330107575652e-05,
"loss": 0.549,
"step": 5870
},
{
"epoch": 8.82960736426827,
"grad_norm": 0.8651490211486816,
"learning_rate": 6.840134703326815e-05,
"loss": 0.5525,
"step": 5880
},
{
"epoch": 8.844636483186173,
"grad_norm": 0.7867377400398254,
"learning_rate": 6.827926623703142e-05,
"loss": 0.5594,
"step": 5890
},
{
"epoch": 8.859665602104076,
"grad_norm": 0.9743750691413879,
"learning_rate": 6.815705952797382e-05,
"loss": 0.5617,
"step": 5900
},
{
"epoch": 8.87469472102198,
"grad_norm": 0.8857339024543762,
"learning_rate": 6.80347277478902e-05,
"loss": 0.5559,
"step": 5910
},
{
"epoch": 8.889723839939883,
"grad_norm": 0.9169685244560242,
"learning_rate": 6.791227173943684e-05,
"loss": 0.5473,
"step": 5920
},
{
"epoch": 8.904752958857786,
"grad_norm": 1.0672627687454224,
"learning_rate": 6.778969234612584e-05,
"loss": 0.5532,
"step": 5930
},
{
"epoch": 8.91978207777569,
"grad_norm": 0.9694510698318481,
"learning_rate": 6.766699041231913e-05,
"loss": 0.5541,
"step": 5940
},
{
"epoch": 8.934811196693595,
"grad_norm": 0.940804123878479,
"learning_rate": 6.754416678322281e-05,
"loss": 0.5569,
"step": 5950
},
{
"epoch": 8.949840315611498,
"grad_norm": 0.9347053170204163,
"learning_rate": 6.74212223048812e-05,
"loss": 0.5614,
"step": 5960
},
{
"epoch": 8.964869434529401,
"grad_norm": 0.8529021739959717,
"learning_rate": 6.729815782417105e-05,
"loss": 0.5438,
"step": 5970
},
{
"epoch": 8.979898553447304,
"grad_norm": 0.9158792495727539,
"learning_rate": 6.717497418879579e-05,
"loss": 0.5687,
"step": 5980
},
{
"epoch": 8.994927672365208,
"grad_norm": 0.8642351627349854,
"learning_rate": 6.705167224727955e-05,
"loss": 0.5508,
"step": 5990
},
{
"epoch": 9.009017471350742,
"grad_norm": 1.036657452583313,
"learning_rate": 6.692825284896142e-05,
"loss": 0.496,
"step": 6000
},
{
"epoch": 9.024046590268645,
"grad_norm": 1.0688594579696655,
"learning_rate": 6.680471684398957e-05,
"loss": 0.4279,
"step": 6010
},
{
"epoch": 9.039075709186548,
"grad_norm": 0.9282298684120178,
"learning_rate": 6.668106508331539e-05,
"loss": 0.4258,
"step": 6020
},
{
"epoch": 9.054104828104453,
"grad_norm": 0.8562738299369812,
"learning_rate": 6.655729841868758e-05,
"loss": 0.4266,
"step": 6030
},
{
"epoch": 9.069133947022356,
"grad_norm": 0.9267016649246216,
"learning_rate": 6.643341770264642e-05,
"loss": 0.4253,
"step": 6040
},
{
"epoch": 9.08416306594026,
"grad_norm": 0.838796079158783,
"learning_rate": 6.630942378851774e-05,
"loss": 0.4209,
"step": 6050
},
{
"epoch": 9.099192184858163,
"grad_norm": 1.0836501121520996,
"learning_rate": 6.618531753040712e-05,
"loss": 0.4319,
"step": 6060
},
{
"epoch": 9.114221303776066,
"grad_norm": 0.912151038646698,
"learning_rate": 6.606109978319404e-05,
"loss": 0.4242,
"step": 6070
},
{
"epoch": 9.12925042269397,
"grad_norm": 0.9484944939613342,
"learning_rate": 6.593677140252588e-05,
"loss": 0.4275,
"step": 6080
},
{
"epoch": 9.144279541611873,
"grad_norm": 0.8877925276756287,
"learning_rate": 6.581233324481216e-05,
"loss": 0.4372,
"step": 6090
},
{
"epoch": 9.159308660529776,
"grad_norm": 0.9061231017112732,
"learning_rate": 6.568778616721853e-05,
"loss": 0.4309,
"step": 6100
},
{
"epoch": 9.17433777944768,
"grad_norm": 0.9550976753234863,
"learning_rate": 6.556313102766094e-05,
"loss": 0.4344,
"step": 6110
},
{
"epoch": 9.189366898365583,
"grad_norm": 0.9908791780471802,
"learning_rate": 6.543836868479968e-05,
"loss": 0.4366,
"step": 6120
},
{
"epoch": 9.204396017283488,
"grad_norm": 1.0337473154067993,
"learning_rate": 6.531349999803353e-05,
"loss": 0.4357,
"step": 6130
},
{
"epoch": 9.21942513620139,
"grad_norm": 0.9019971489906311,
"learning_rate": 6.518852582749373e-05,
"loss": 0.439,
"step": 6140
},
{
"epoch": 9.234454255119294,
"grad_norm": 0.9498554468154907,
"learning_rate": 6.506344703403819e-05,
"loss": 0.4348,
"step": 6150
},
{
"epoch": 9.249483374037197,
"grad_norm": 0.9589983820915222,
"learning_rate": 6.493826447924541e-05,
"loss": 0.4512,
"step": 6160
},
{
"epoch": 9.2645124929551,
"grad_norm": 0.9420648217201233,
"learning_rate": 6.481297902540875e-05,
"loss": 0.4415,
"step": 6170
},
{
"epoch": 9.279541611873004,
"grad_norm": 0.8353439569473267,
"learning_rate": 6.468759153553022e-05,
"loss": 0.4482,
"step": 6180
},
{
"epoch": 9.294570730790907,
"grad_norm": 0.9372383952140808,
"learning_rate": 6.456210287331483e-05,
"loss": 0.4401,
"step": 6190
},
{
"epoch": 9.30959984970881,
"grad_norm": 1.0183303356170654,
"learning_rate": 6.443651390316437e-05,
"loss": 0.4387,
"step": 6200
},
{
"epoch": 9.324628968626714,
"grad_norm": 0.9157505035400391,
"learning_rate": 6.431082549017166e-05,
"loss": 0.4364,
"step": 6210
},
{
"epoch": 9.339658087544617,
"grad_norm": 0.9424082040786743,
"learning_rate": 6.41850385001145e-05,
"loss": 0.4456,
"step": 6220
},
{
"epoch": 9.354687206462522,
"grad_norm": 0.987912654876709,
"learning_rate": 6.405915379944966e-05,
"loss": 0.4427,
"step": 6230
},
{
"epoch": 9.369716325380425,
"grad_norm": 0.9018827676773071,
"learning_rate": 6.393317225530706e-05,
"loss": 0.4545,
"step": 6240
},
{
"epoch": 9.384745444298328,
"grad_norm": 0.8961259722709656,
"learning_rate": 6.380709473548361e-05,
"loss": 0.4524,
"step": 6250
},
{
"epoch": 9.399774563216232,
"grad_norm": 0.939476728439331,
"learning_rate": 6.368092210843739e-05,
"loss": 0.4465,
"step": 6260
},
{
"epoch": 9.414803682134135,
"grad_norm": 0.9325003623962402,
"learning_rate": 6.35546552432816e-05,
"loss": 0.4562,
"step": 6270
},
{
"epoch": 9.429832801052038,
"grad_norm": 1.0927010774612427,
"learning_rate": 6.342829500977856e-05,
"loss": 0.4499,
"step": 6280
},
{
"epoch": 9.444861919969942,
"grad_norm": 0.9243865013122559,
"learning_rate": 6.330184227833376e-05,
"loss": 0.4469,
"step": 6290
},
{
"epoch": 9.459891038887845,
"grad_norm": 0.9676965475082397,
"learning_rate": 6.31752979199898e-05,
"loss": 0.4475,
"step": 6300
},
{
"epoch": 9.474920157805748,
"grad_norm": 1.0749905109405518,
"learning_rate": 6.30486628064205e-05,
"loss": 0.4644,
"step": 6310
},
{
"epoch": 9.489949276723651,
"grad_norm": 1.0174274444580078,
"learning_rate": 6.292193780992474e-05,
"loss": 0.4657,
"step": 6320
},
{
"epoch": 9.504978395641556,
"grad_norm": 0.9137683510780334,
"learning_rate": 6.279512380342065e-05,
"loss": 0.4574,
"step": 6330
},
{
"epoch": 9.52000751455946,
"grad_norm": 0.8929033279418945,
"learning_rate": 6.266822166043937e-05,
"loss": 0.4571,
"step": 6340
},
{
"epoch": 9.535036633477363,
"grad_norm": 1.0599805116653442,
"learning_rate": 6.254123225511923e-05,
"loss": 0.4606,
"step": 6350
},
{
"epoch": 9.550065752395266,
"grad_norm": 1.183914065361023,
"learning_rate": 6.241415646219963e-05,
"loss": 0.459,
"step": 6360
},
{
"epoch": 9.56509487131317,
"grad_norm": 1.0352977514266968,
"learning_rate": 6.228699515701501e-05,
"loss": 0.4593,
"step": 6370
},
{
"epoch": 9.580123990231073,
"grad_norm": 0.8676705956459045,
"learning_rate": 6.215974921548887e-05,
"loss": 0.4546,
"step": 6380
},
{
"epoch": 9.595153109148976,
"grad_norm": 1.03312087059021,
"learning_rate": 6.203241951412767e-05,
"loss": 0.4495,
"step": 6390
},
{
"epoch": 9.61018222806688,
"grad_norm": 0.9865357279777527,
"learning_rate": 6.19050069300149e-05,
"loss": 0.4533,
"step": 6400
},
{
"epoch": 9.625211346984782,
"grad_norm": 1.0788352489471436,
"learning_rate": 6.177751234080491e-05,
"loss": 0.4515,
"step": 6410
},
{
"epoch": 9.640240465902686,
"grad_norm": 1.049320936203003,
"learning_rate": 6.164993662471692e-05,
"loss": 0.4568,
"step": 6420
},
{
"epoch": 9.65526958482059,
"grad_norm": 0.9056411981582642,
"learning_rate": 6.152228066052904e-05,
"loss": 0.4648,
"step": 6430
},
{
"epoch": 9.670298703738494,
"grad_norm": 0.9347831010818481,
"learning_rate": 6.139454532757208e-05,
"loss": 0.4622,
"step": 6440
},
{
"epoch": 9.685327822656397,
"grad_norm": 0.9340201020240784,
"learning_rate": 6.126673150572362e-05,
"loss": 0.4537,
"step": 6450
},
{
"epoch": 9.7003569415743,
"grad_norm": 0.9909615516662598,
"learning_rate": 6.113884007540184e-05,
"loss": 0.4704,
"step": 6460
},
{
"epoch": 9.715386060492204,
"grad_norm": 1.0939775705337524,
"learning_rate": 6.1010871917559576e-05,
"loss": 0.4596,
"step": 6470
},
{
"epoch": 9.730415179410107,
"grad_norm": 0.9341562986373901,
"learning_rate": 6.088282791367812e-05,
"loss": 0.46,
"step": 6480
},
{
"epoch": 9.74544429832801,
"grad_norm": 0.9412760734558105,
"learning_rate": 6.075470894576124e-05,
"loss": 0.4701,
"step": 6490
},
{
"epoch": 9.760473417245914,
"grad_norm": 1.0007338523864746,
"learning_rate": 6.062651589632911e-05,
"loss": 0.4652,
"step": 6500
},
{
"epoch": 9.775502536163817,
"grad_norm": 1.0357065200805664,
"learning_rate": 6.0498249648412134e-05,
"loss": 0.4684,
"step": 6510
},
{
"epoch": 9.79053165508172,
"grad_norm": 0.8514649868011475,
"learning_rate": 6.036991108554497e-05,
"loss": 0.454,
"step": 6520
},
{
"epoch": 9.805560773999623,
"grad_norm": 0.9953536987304688,
"learning_rate": 6.02415010917604e-05,
"loss": 0.4579,
"step": 6530
},
{
"epoch": 9.820589892917528,
"grad_norm": 0.9308024644851685,
"learning_rate": 6.011302055158324e-05,
"loss": 0.4631,
"step": 6540
},
{
"epoch": 9.835619011835432,
"grad_norm": 0.9298855662345886,
"learning_rate": 5.9984470350024256e-05,
"loss": 0.4544,
"step": 6550
},
{
"epoch": 9.850648130753335,
"grad_norm": 0.9751214385032654,
"learning_rate": 5.985585137257401e-05,
"loss": 0.4571,
"step": 6560
},
{
"epoch": 9.865677249671238,
"grad_norm": 0.9474308490753174,
"learning_rate": 5.9727164505196905e-05,
"loss": 0.4658,
"step": 6570
},
{
"epoch": 9.880706368589141,
"grad_norm": 1.0583529472351074,
"learning_rate": 5.95984106343249e-05,
"loss": 0.4561,
"step": 6580
},
{
"epoch": 9.895735487507045,
"grad_norm": 1.0418837070465088,
"learning_rate": 5.946959064685156e-05,
"loss": 0.4637,
"step": 6590
},
{
"epoch": 9.910764606424948,
"grad_norm": 1.0113483667373657,
"learning_rate": 5.934070543012582e-05,
"loss": 0.4705,
"step": 6600
},
{
"epoch": 9.925793725342851,
"grad_norm": 1.046410083770752,
"learning_rate": 5.921175587194601e-05,
"loss": 0.4884,
"step": 6610
},
{
"epoch": 9.940822844260754,
"grad_norm": 0.9872678518295288,
"learning_rate": 5.9082742860553576e-05,
"loss": 0.4744,
"step": 6620
},
{
"epoch": 9.95585196317866,
"grad_norm": 1.0428500175476074,
"learning_rate": 5.895366728462709e-05,
"loss": 0.4704,
"step": 6630
},
{
"epoch": 9.970881082096563,
"grad_norm": 0.922476053237915,
"learning_rate": 5.882453003327612e-05,
"loss": 0.465,
"step": 6640
},
{
"epoch": 9.985910201014466,
"grad_norm": 1.03745698928833,
"learning_rate": 5.8695331996034986e-05,
"loss": 0.4674,
"step": 6650
},
{
"epoch": 10.0,
"grad_norm": 1.6415784358978271,
"learning_rate": 5.8566074062856815e-05,
"loss": 0.4717,
"step": 6660
},
{
"epoch": 10.015029118917903,
"grad_norm": 0.9536633491516113,
"learning_rate": 5.8436757124107245e-05,
"loss": 0.361,
"step": 6670
},
{
"epoch": 10.030058237835807,
"grad_norm": 0.8403608202934265,
"learning_rate": 5.83073820705584e-05,
"loss": 0.3593,
"step": 6680
},
{
"epoch": 10.04508735675371,
"grad_norm": 1.0014981031417847,
"learning_rate": 5.8177949793382705e-05,
"loss": 0.3669,
"step": 6690
},
{
"epoch": 10.060116475671613,
"grad_norm": 0.9928374290466309,
"learning_rate": 5.804846118414671e-05,
"loss": 0.3584,
"step": 6700
},
{
"epoch": 10.075145594589518,
"grad_norm": 0.9604836106300354,
"learning_rate": 5.7918917134805096e-05,
"loss": 0.3467,
"step": 6710
},
{
"epoch": 10.090174713507421,
"grad_norm": 1.0535321235656738,
"learning_rate": 5.7789318537694335e-05,
"loss": 0.3623,
"step": 6720
},
{
"epoch": 10.105203832425325,
"grad_norm": 1.0338060855865479,
"learning_rate": 5.76596662855267e-05,
"loss": 0.3504,
"step": 6730
},
{
"epoch": 10.120232951343228,
"grad_norm": 0.9590771794319153,
"learning_rate": 5.752996127138404e-05,
"loss": 0.3571,
"step": 6740
},
{
"epoch": 10.135262070261131,
"grad_norm": 0.939929187297821,
"learning_rate": 5.740020438871162e-05,
"loss": 0.3709,
"step": 6750
},
{
"epoch": 10.150291189179034,
"grad_norm": 1.0055979490280151,
"learning_rate": 5.727039653131202e-05,
"loss": 0.3646,
"step": 6760
},
{
"epoch": 10.165320308096938,
"grad_norm": 1.0767991542816162,
"learning_rate": 5.714053859333893e-05,
"loss": 0.3626,
"step": 6770
},
{
"epoch": 10.180349427014841,
"grad_norm": 0.9774537682533264,
"learning_rate": 5.701063146929103e-05,
"loss": 0.3691,
"step": 6780
},
{
"epoch": 10.195378545932744,
"grad_norm": 1.1948145627975464,
"learning_rate": 5.688067605400579e-05,
"loss": 0.3707,
"step": 6790
},
{
"epoch": 10.210407664850647,
"grad_norm": 1.1181336641311646,
"learning_rate": 5.675067324265332e-05,
"loss": 0.3637,
"step": 6800
},
{
"epoch": 10.22543678376855,
"grad_norm": 0.9550219774246216,
"learning_rate": 5.662062393073022e-05,
"loss": 0.3625,
"step": 6810
},
{
"epoch": 10.240465902686456,
"grad_norm": 0.9461958408355713,
"learning_rate": 5.6490529014053405e-05,
"loss": 0.3719,
"step": 6820
},
{
"epoch": 10.255495021604359,
"grad_norm": 0.9581360816955566,
"learning_rate": 5.636038938875391e-05,
"loss": 0.3711,
"step": 6830
},
{
"epoch": 10.270524140522262,
"grad_norm": 0.9395859837532043,
"learning_rate": 5.623020595127073e-05,
"loss": 0.3624,
"step": 6840
},
{
"epoch": 10.285553259440166,
"grad_norm": 1.146485447883606,
"learning_rate": 5.609997959834471e-05,
"loss": 0.3684,
"step": 6850
},
{
"epoch": 10.300582378358069,
"grad_norm": 0.9923917055130005,
"learning_rate": 5.596971122701221e-05,
"loss": 0.3695,
"step": 6860
},
{
"epoch": 10.315611497275972,
"grad_norm": 0.9672958850860596,
"learning_rate": 5.583940173459913e-05,
"loss": 0.3735,
"step": 6870
},
{
"epoch": 10.330640616193875,
"grad_norm": 0.9627594947814941,
"learning_rate": 5.5709052018714536e-05,
"loss": 0.3585,
"step": 6880
},
{
"epoch": 10.345669735111779,
"grad_norm": 1.0451908111572266,
"learning_rate": 5.5578662977244625e-05,
"loss": 0.3726,
"step": 6890
},
{
"epoch": 10.360698854029682,
"grad_norm": 1.0388795137405396,
"learning_rate": 5.5448235508346435e-05,
"loss": 0.3778,
"step": 6900
},
{
"epoch": 10.375727972947587,
"grad_norm": 0.9968121647834778,
"learning_rate": 5.5317770510441745e-05,
"loss": 0.3837,
"step": 6910
},
{
"epoch": 10.39075709186549,
"grad_norm": 1.104638934135437,
"learning_rate": 5.518726888221082e-05,
"loss": 0.3719,
"step": 6920
},
{
"epoch": 10.405786210783393,
"grad_norm": 1.006320595741272,
"learning_rate": 5.5056731522586236e-05,
"loss": 0.3664,
"step": 6930
},
{
"epoch": 10.420815329701297,
"grad_norm": 1.1039286851882935,
"learning_rate": 5.492615933074673e-05,
"loss": 0.3768,
"step": 6940
},
{
"epoch": 10.4358444486192,
"grad_norm": 0.9026983380317688,
"learning_rate": 5.479555320611094e-05,
"loss": 0.3661,
"step": 6950
},
{
"epoch": 10.450873567537103,
"grad_norm": 1.0680197477340698,
"learning_rate": 5.466491404833127e-05,
"loss": 0.375,
"step": 6960
},
{
"epoch": 10.465902686455006,
"grad_norm": 1.079924464225769,
"learning_rate": 5.4534242757287643e-05,
"loss": 0.3865,
"step": 6970
},
{
"epoch": 10.48093180537291,
"grad_norm": 1.037091851234436,
"learning_rate": 5.440354023308134e-05,
"loss": 0.3861,
"step": 6980
},
{
"epoch": 10.495960924290813,
"grad_norm": 1.0389127731323242,
"learning_rate": 5.4272807376028777e-05,
"loss": 0.3701,
"step": 6990
},
{
"epoch": 10.510990043208716,
"grad_norm": 1.079481840133667,
"learning_rate": 5.41420450866553e-05,
"loss": 0.3775,
"step": 7000
},
{
"epoch": 10.52601916212662,
"grad_norm": 1.3485366106033325,
"learning_rate": 5.401125426568904e-05,
"loss": 0.3722,
"step": 7010
},
{
"epoch": 10.541048281044525,
"grad_norm": 1.0112107992172241,
"learning_rate": 5.388043581405461e-05,
"loss": 0.3712,
"step": 7020
},
{
"epoch": 10.556077399962428,
"grad_norm": 0.9727371335029602,
"learning_rate": 5.374959063286695e-05,
"loss": 0.3732,
"step": 7030
},
{
"epoch": 10.571106518880331,
"grad_norm": 0.9836901426315308,
"learning_rate": 5.361871962342518e-05,
"loss": 0.3787,
"step": 7040
},
{
"epoch": 10.586135637798234,
"grad_norm": 1.0882790088653564,
"learning_rate": 5.348782368720626e-05,
"loss": 0.3816,
"step": 7050
},
{
"epoch": 10.601164756716138,
"grad_norm": 0.9604332447052002,
"learning_rate": 5.335690372585892e-05,
"loss": 0.3765,
"step": 7060
},
{
"epoch": 10.61619387563404,
"grad_norm": 0.9835896492004395,
"learning_rate": 5.322596064119731e-05,
"loss": 0.3808,
"step": 7070
},
{
"epoch": 10.631222994551944,
"grad_norm": 0.9179807901382446,
"learning_rate": 5.309499533519493e-05,
"loss": 0.378,
"step": 7080
},
{
"epoch": 10.646252113469847,
"grad_norm": 1.0876275300979614,
"learning_rate": 5.2964008709978305e-05,
"loss": 0.3752,
"step": 7090
},
{
"epoch": 10.66128123238775,
"grad_norm": 0.9817517995834351,
"learning_rate": 5.2833001667820816e-05,
"loss": 0.3856,
"step": 7100
},
{
"epoch": 10.676310351305656,
"grad_norm": 1.0658329725265503,
"learning_rate": 5.270197511113649e-05,
"loss": 0.3747,
"step": 7110
},
{
"epoch": 10.691339470223559,
"grad_norm": 1.0060932636260986,
"learning_rate": 5.257092994247377e-05,
"loss": 0.3867,
"step": 7120
},
{
"epoch": 10.706368589141462,
"grad_norm": 1.1070188283920288,
"learning_rate": 5.243986706450933e-05,
"loss": 0.3765,
"step": 7130
},
{
"epoch": 10.721397708059365,
"grad_norm": 0.9768523573875427,
"learning_rate": 5.2308787380041777e-05,
"loss": 0.3852,
"step": 7140
},
{
"epoch": 10.736426826977269,
"grad_norm": 0.9963809847831726,
"learning_rate": 5.217769179198555e-05,
"loss": 0.3924,
"step": 7150
},
{
"epoch": 10.751455945895172,
"grad_norm": 0.9897161722183228,
"learning_rate": 5.2046581203364586e-05,
"loss": 0.3871,
"step": 7160
},
{
"epoch": 10.766485064813075,
"grad_norm": 1.0196555852890015,
"learning_rate": 5.191545651730616e-05,
"loss": 0.3766,
"step": 7170
},
{
"epoch": 10.781514183730978,
"grad_norm": 0.8715333342552185,
"learning_rate": 5.1784318637034676e-05,
"loss": 0.3878,
"step": 7180
},
{
"epoch": 10.796543302648882,
"grad_norm": 1.0659235715866089,
"learning_rate": 5.165316846586541e-05,
"loss": 0.387,
"step": 7190
},
{
"epoch": 10.811572421566785,
"grad_norm": 1.0283163785934448,
"learning_rate": 5.15220069071983e-05,
"loss": 0.3899,
"step": 7200
},
{
"epoch": 10.826601540484688,
"grad_norm": 0.972322404384613,
"learning_rate": 5.139083486451172e-05,
"loss": 0.3916,
"step": 7210
},
{
"epoch": 10.841630659402593,
"grad_norm": 1.1113601922988892,
"learning_rate": 5.1259653241356276e-05,
"loss": 0.3832,
"step": 7220
},
{
"epoch": 10.856659778320497,
"grad_norm": 1.1082892417907715,
"learning_rate": 5.1128462941348554e-05,
"loss": 0.3863,
"step": 7230
},
{
"epoch": 10.8716888972384,
"grad_norm": 1.0528475046157837,
"learning_rate": 5.0997264868164903e-05,
"loss": 0.393,
"step": 7240
},
{
"epoch": 10.886718016156303,
"grad_norm": 0.9899016618728638,
"learning_rate": 5.0866059925535234e-05,
"loss": 0.39,
"step": 7250
},
{
"epoch": 10.901747135074206,
"grad_norm": 1.1150156259536743,
"learning_rate": 5.073484901723676e-05,
"loss": 0.3806,
"step": 7260
},
{
"epoch": 10.91677625399211,
"grad_norm": 1.0797758102416992,
"learning_rate": 5.0603633047087817e-05,
"loss": 0.3953,
"step": 7270
},
{
"epoch": 10.931805372910013,
"grad_norm": 1.122441291809082,
"learning_rate": 5.047241291894156e-05,
"loss": 0.386,
"step": 7280
},
{
"epoch": 10.946834491827916,
"grad_norm": 0.8962685465812683,
"learning_rate": 5.034118953667982e-05,
"loss": 0.3914,
"step": 7290
},
{
"epoch": 10.96186361074582,
"grad_norm": 1.1607177257537842,
"learning_rate": 5.020996380420685e-05,
"loss": 0.3995,
"step": 7300
},
{
"epoch": 10.976892729663723,
"grad_norm": 1.0731902122497559,
"learning_rate": 5.0078736625443054e-05,
"loss": 0.3836,
"step": 7310
},
{
"epoch": 10.991921848581628,
"grad_norm": 1.0019197463989258,
"learning_rate": 4.994750890431884e-05,
"loss": 0.3845,
"step": 7320
},
{
"epoch": 11.006011647567162,
"grad_norm": 0.9175123572349548,
"learning_rate": 4.9816281544768326e-05,
"loss": 0.3611,
"step": 7330
},
{
"epoch": 11.021040766485065,
"grad_norm": 0.8413906097412109,
"learning_rate": 4.968505545072313e-05,
"loss": 0.3021,
"step": 7340
},
{
"epoch": 11.036069885402968,
"grad_norm": 1.0692964792251587,
"learning_rate": 4.955383152610621e-05,
"loss": 0.2892,
"step": 7350
},
{
"epoch": 11.051099004320871,
"grad_norm": 1.0013508796691895,
"learning_rate": 4.9422610674825495e-05,
"loss": 0.2979,
"step": 7360
},
{
"epoch": 11.066128123238775,
"grad_norm": 1.0104172229766846,
"learning_rate": 4.929139380076783e-05,
"loss": 0.2995,
"step": 7370
},
{
"epoch": 11.081157242156678,
"grad_norm": 1.0872989892959595,
"learning_rate": 4.9160181807792586e-05,
"loss": 0.2909,
"step": 7380
},
{
"epoch": 11.096186361074581,
"grad_norm": 1.1095547676086426,
"learning_rate": 4.90289755997256e-05,
"loss": 0.29,
"step": 7390
},
{
"epoch": 11.111215479992486,
"grad_norm": 1.0950359106063843,
"learning_rate": 4.889777608035273e-05,
"loss": 0.3107,
"step": 7400
},
{
"epoch": 11.12624459891039,
"grad_norm": 1.060843586921692,
"learning_rate": 4.876658415341393e-05,
"loss": 0.3128,
"step": 7410
},
{
"epoch": 11.141273717828293,
"grad_norm": 1.0450581312179565,
"learning_rate": 4.863540072259668e-05,
"loss": 0.3099,
"step": 7420
},
{
"epoch": 11.156302836746196,
"grad_norm": 0.9836236238479614,
"learning_rate": 4.850422669153009e-05,
"loss": 0.3038,
"step": 7430
},
{
"epoch": 11.1713319556641,
"grad_norm": 0.9338634610176086,
"learning_rate": 4.837306296377841e-05,
"loss": 0.2983,
"step": 7440
},
{
"epoch": 11.186361074582003,
"grad_norm": 0.9969077706336975,
"learning_rate": 4.824191044283498e-05,
"loss": 0.3041,
"step": 7450
},
{
"epoch": 11.201390193499906,
"grad_norm": 1.1370275020599365,
"learning_rate": 4.811077003211592e-05,
"loss": 0.3124,
"step": 7460
},
{
"epoch": 11.216419312417809,
"grad_norm": 1.122521162033081,
"learning_rate": 4.797964263495394e-05,
"loss": 0.3077,
"step": 7470
},
{
"epoch": 11.231448431335712,
"grad_norm": 1.1988801956176758,
"learning_rate": 4.78485291545921e-05,
"loss": 0.3154,
"step": 7480
},
{
"epoch": 11.246477550253616,
"grad_norm": 1.1286782026290894,
"learning_rate": 4.771743049417761e-05,
"loss": 0.2994,
"step": 7490
},
{
"epoch": 11.26150666917152,
"grad_norm": 1.0577936172485352,
"learning_rate": 4.7586347556755573e-05,
"loss": 0.3036,
"step": 7500
},
{
"epoch": 11.276535788089424,
"grad_norm": 1.0209895372390747,
"learning_rate": 4.745528124526282e-05,
"loss": 0.3043,
"step": 7510
},
{
"epoch": 11.291564907007327,
"grad_norm": 0.9786052107810974,
"learning_rate": 4.7324232462521634e-05,
"loss": 0.3089,
"step": 7520
},
{
"epoch": 11.30659402592523,
"grad_norm": 1.1310527324676514,
"learning_rate": 4.719320211123358e-05,
"loss": 0.3016,
"step": 7530
},
{
"epoch": 11.321623144843134,
"grad_norm": 0.9561529755592346,
"learning_rate": 4.706219109397319e-05,
"loss": 0.3154,
"step": 7540
},
{
"epoch": 11.336652263761037,
"grad_norm": 0.9974495768547058,
"learning_rate": 4.6931200313181944e-05,
"loss": 0.3208,
"step": 7550
},
{
"epoch": 11.35168138267894,
"grad_norm": 0.9916987419128418,
"learning_rate": 4.6800230671161784e-05,
"loss": 0.3069,
"step": 7560
},
{
"epoch": 11.366710501596843,
"grad_norm": 1.231939435005188,
"learning_rate": 4.666928307006918e-05,
"loss": 0.3063,
"step": 7570
},
{
"epoch": 11.381739620514747,
"grad_norm": 1.0125497579574585,
"learning_rate": 4.6538358411908646e-05,
"loss": 0.318,
"step": 7580
},
{
"epoch": 11.39676873943265,
"grad_norm": 1.0557286739349365,
"learning_rate": 4.640745759852677e-05,
"loss": 0.3112,
"step": 7590
},
{
"epoch": 11.411797858350555,
"grad_norm": 1.0968514680862427,
"learning_rate": 4.6276581531605824e-05,
"loss": 0.3163,
"step": 7600
},
{
"epoch": 11.426826977268458,
"grad_norm": 1.0451496839523315,
"learning_rate": 4.6145731112657644e-05,
"loss": 0.3096,
"step": 7610
},
{
"epoch": 11.441856096186362,
"grad_norm": 1.1789813041687012,
"learning_rate": 4.601490724301738e-05,
"loss": 0.3024,
"step": 7620
},
{
"epoch": 11.456885215104265,
"grad_norm": 1.1728602647781372,
"learning_rate": 4.5884110823837334e-05,
"loss": 0.3052,
"step": 7630
},
{
"epoch": 11.471914334022168,
"grad_norm": 1.032285451889038,
"learning_rate": 4.5753342756080666e-05,
"loss": 0.3108,
"step": 7640
},
{
"epoch": 11.486943452940071,
"grad_norm": 1.1014740467071533,
"learning_rate": 4.5622603940515326e-05,
"loss": 0.3049,
"step": 7650
},
{
"epoch": 11.501972571857975,
"grad_norm": 1.2548887729644775,
"learning_rate": 4.549189527770767e-05,
"loss": 0.3204,
"step": 7660
},
{
"epoch": 11.517001690775878,
"grad_norm": 1.0855730772018433,
"learning_rate": 4.5361217668016446e-05,
"loss": 0.3136,
"step": 7670
},
{
"epoch": 11.532030809693781,
"grad_norm": 0.9988487362861633,
"learning_rate": 4.52305720115864e-05,
"loss": 0.3173,
"step": 7680
},
{
"epoch": 11.547059928611684,
"grad_norm": 1.1315146684646606,
"learning_rate": 4.509995920834229e-05,
"loss": 0.3138,
"step": 7690
},
{
"epoch": 11.56208904752959,
"grad_norm": 0.9927186965942383,
"learning_rate": 4.496938015798246e-05,
"loss": 0.3079,
"step": 7700
},
{
"epoch": 11.577118166447493,
"grad_norm": 1.1122972965240479,
"learning_rate": 4.483883575997284e-05,
"loss": 0.3179,
"step": 7710
},
{
"epoch": 11.592147285365396,
"grad_norm": 1.007947564125061,
"learning_rate": 4.47083269135406e-05,
"loss": 0.3276,
"step": 7720
},
{
"epoch": 11.6071764042833,
"grad_norm": 1.00367271900177,
"learning_rate": 4.4577854517668075e-05,
"loss": 0.3202,
"step": 7730
},
{
"epoch": 11.622205523201202,
"grad_norm": 1.1806467771530151,
"learning_rate": 4.4447419471086484e-05,
"loss": 0.3203,
"step": 7740
},
{
"epoch": 11.637234642119106,
"grad_norm": 1.2128424644470215,
"learning_rate": 4.431702267226979e-05,
"loss": 0.3188,
"step": 7750
},
{
"epoch": 11.652263761037009,
"grad_norm": 1.2076245546340942,
"learning_rate": 4.418666501942848e-05,
"loss": 0.3093,
"step": 7760
},
{
"epoch": 11.667292879954912,
"grad_norm": 1.1673307418823242,
"learning_rate": 4.4056347410503414e-05,
"loss": 0.3204,
"step": 7770
},
{
"epoch": 11.682321998872816,
"grad_norm": 0.9249235987663269,
"learning_rate": 4.392607074315957e-05,
"loss": 0.3167,
"step": 7780
},
{
"epoch": 11.697351117790719,
"grad_norm": 1.0417946577072144,
"learning_rate": 4.379583591477999e-05,
"loss": 0.3157,
"step": 7790
},
{
"epoch": 11.712380236708622,
"grad_norm": 1.1642825603485107,
"learning_rate": 4.366564382245943e-05,
"loss": 0.3145,
"step": 7800
},
{
"epoch": 11.727409355626527,
"grad_norm": 1.1535450220108032,
"learning_rate": 4.353549536299835e-05,
"loss": 0.3144,
"step": 7810
},
{
"epoch": 11.74243847454443,
"grad_norm": 0.992770254611969,
"learning_rate": 4.3405391432896555e-05,
"loss": 0.3084,
"step": 7820
},
{
"epoch": 11.757467593462334,
"grad_norm": 1.064002275466919,
"learning_rate": 4.327533292834723e-05,
"loss": 0.3186,
"step": 7830
},
{
"epoch": 11.772496712380237,
"grad_norm": 1.1059247255325317,
"learning_rate": 4.314532074523057e-05,
"loss": 0.3233,
"step": 7840
},
{
"epoch": 11.78752583129814,
"grad_norm": 1.1188381910324097,
"learning_rate": 4.3015355779107734e-05,
"loss": 0.3361,
"step": 7850
},
{
"epoch": 11.802554950216043,
"grad_norm": 1.0294090509414673,
"learning_rate": 4.288543892521463e-05,
"loss": 0.3144,
"step": 7860
},
{
"epoch": 11.817584069133947,
"grad_norm": 1.265080451965332,
"learning_rate": 4.275557107845576e-05,
"loss": 0.3171,
"step": 7870
},
{
"epoch": 11.83261318805185,
"grad_norm": 1.3412435054779053,
"learning_rate": 4.262575313339803e-05,
"loss": 0.3249,
"step": 7880
},
{
"epoch": 11.847642306969753,
"grad_norm": 1.074264407157898,
"learning_rate": 4.249598598426465e-05,
"loss": 0.3241,
"step": 7890
},
{
"epoch": 11.862671425887658,
"grad_norm": 1.2046911716461182,
"learning_rate": 4.236627052492889e-05,
"loss": 0.3202,
"step": 7900
},
{
"epoch": 11.877700544805561,
"grad_norm": 1.1616815328598022,
"learning_rate": 4.2236607648907984e-05,
"loss": 0.3185,
"step": 7910
},
{
"epoch": 11.892729663723465,
"grad_norm": 1.1158292293548584,
"learning_rate": 4.210699824935695e-05,
"loss": 0.3209,
"step": 7920
},
{
"epoch": 11.907758782641368,
"grad_norm": 1.0398184061050415,
"learning_rate": 4.197744321906247e-05,
"loss": 0.3124,
"step": 7930
},
{
"epoch": 11.922787901559271,
"grad_norm": 1.1969057321548462,
"learning_rate": 4.1847943450436686e-05,
"loss": 0.3432,
"step": 7940
},
{
"epoch": 11.937817020477175,
"grad_norm": 1.1535173654556274,
"learning_rate": 4.17184998355111e-05,
"loss": 0.3143,
"step": 7950
},
{
"epoch": 11.952846139395078,
"grad_norm": 1.0445293188095093,
"learning_rate": 4.158911326593037e-05,
"loss": 0.3222,
"step": 7960
},
{
"epoch": 11.967875258312981,
"grad_norm": 1.1093374490737915,
"learning_rate": 4.14597846329463e-05,
"loss": 0.3311,
"step": 7970
},
{
"epoch": 11.982904377230884,
"grad_norm": 1.1024218797683716,
"learning_rate": 4.133051482741149e-05,
"loss": 0.3153,
"step": 7980
},
{
"epoch": 11.997933496148788,
"grad_norm": 1.0923748016357422,
"learning_rate": 4.120130473977343e-05,
"loss": 0.3194,
"step": 7990
},
{
"epoch": 12.012023295134323,
"grad_norm": 1.1858222484588623,
"learning_rate": 4.107215526006817e-05,
"loss": 0.2696,
"step": 8000
},
{
"epoch": 12.027052414052227,
"grad_norm": 0.9616860151290894,
"learning_rate": 4.094306727791436e-05,
"loss": 0.2594,
"step": 8010
},
{
"epoch": 12.04208153297013,
"grad_norm": 0.9500885009765625,
"learning_rate": 4.081404168250694e-05,
"loss": 0.2461,
"step": 8020
},
{
"epoch": 12.057110651888033,
"grad_norm": 1.0713434219360352,
"learning_rate": 4.0685079362611204e-05,
"loss": 0.2645,
"step": 8030
},
{
"epoch": 12.072139770805936,
"grad_norm": 1.0027638673782349,
"learning_rate": 4.055618120655652e-05,
"loss": 0.2624,
"step": 8040
},
{
"epoch": 12.08716888972384,
"grad_norm": 1.0205668210983276,
"learning_rate": 4.0427348102230314e-05,
"loss": 0.2464,
"step": 8050
},
{
"epoch": 12.102198008641743,
"grad_norm": 0.970747172832489,
"learning_rate": 4.029858093707189e-05,
"loss": 0.2406,
"step": 8060
},
{
"epoch": 12.117227127559646,
"grad_norm": 1.1178600788116455,
"learning_rate": 4.01698805980664e-05,
"loss": 0.2533,
"step": 8070
},
{
"epoch": 12.13225624647755,
"grad_norm": 1.0586788654327393,
"learning_rate": 4.004124797173857e-05,
"loss": 0.2549,
"step": 8080
},
{
"epoch": 12.147285365395454,
"grad_norm": 1.0152502059936523,
"learning_rate": 3.991268394414685e-05,
"loss": 0.2499,
"step": 8090
},
{
"epoch": 12.162314484313358,
"grad_norm": 1.0560377836227417,
"learning_rate": 3.9784189400877005e-05,
"loss": 0.2591,
"step": 8100
},
{
"epoch": 12.177343603231261,
"grad_norm": 1.1126878261566162,
"learning_rate": 3.965576522703631e-05,
"loss": 0.2593,
"step": 8110
},
{
"epoch": 12.192372722149164,
"grad_norm": 0.9110709428787231,
"learning_rate": 3.9527412307247205e-05,
"loss": 0.2623,
"step": 8120
},
{
"epoch": 12.207401841067067,
"grad_norm": 1.153400182723999,
"learning_rate": 3.9399131525641405e-05,
"loss": 0.2598,
"step": 8130
},
{
"epoch": 12.22243095998497,
"grad_norm": 0.8933331966400146,
"learning_rate": 3.927092376585363e-05,
"loss": 0.2529,
"step": 8140
},
{
"epoch": 12.237460078902874,
"grad_norm": 1.031607747077942,
"learning_rate": 3.914278991101568e-05,
"loss": 0.2554,
"step": 8150
},
{
"epoch": 12.252489197820777,
"grad_norm": 1.1537200212478638,
"learning_rate": 3.901473084375023e-05,
"loss": 0.2474,
"step": 8160
},
{
"epoch": 12.26751831673868,
"grad_norm": 1.024788498878479,
"learning_rate": 3.88867474461648e-05,
"loss": 0.2475,
"step": 8170
},
{
"epoch": 12.282547435656584,
"grad_norm": 1.087825059890747,
"learning_rate": 3.875884059984571e-05,
"loss": 0.2568,
"step": 8180
},
{
"epoch": 12.297576554574489,
"grad_norm": 1.000375509262085,
"learning_rate": 3.863101118585194e-05,
"loss": 0.259,
"step": 8190
},
{
"epoch": 12.312605673492392,
"grad_norm": 1.0344016551971436,
"learning_rate": 3.850326008470908e-05,
"loss": 0.2553,
"step": 8200
},
{
"epoch": 12.327634792410295,
"grad_norm": 0.9918733835220337,
"learning_rate": 3.8375588176403345e-05,
"loss": 0.2597,
"step": 8210
},
{
"epoch": 12.342663911328199,
"grad_norm": 1.0089991092681885,
"learning_rate": 3.8247996340375344e-05,
"loss": 0.2477,
"step": 8220
},
{
"epoch": 12.357693030246102,
"grad_norm": 1.012367606163025,
"learning_rate": 3.812048545551426e-05,
"loss": 0.2585,
"step": 8230
},
{
"epoch": 12.372722149164005,
"grad_norm": 1.1676548719406128,
"learning_rate": 3.799305640015152e-05,
"loss": 0.2534,
"step": 8240
},
{
"epoch": 12.387751268081908,
"grad_norm": 1.1742953062057495,
"learning_rate": 3.786571005205498e-05,
"loss": 0.2577,
"step": 8250
},
{
"epoch": 12.402780386999812,
"grad_norm": 1.2898715734481812,
"learning_rate": 3.773844728842275e-05,
"loss": 0.2534,
"step": 8260
},
{
"epoch": 12.417809505917715,
"grad_norm": 1.093583583831787,
"learning_rate": 3.7611268985877215e-05,
"loss": 0.259,
"step": 8270
},
{
"epoch": 12.432838624835618,
"grad_norm": 0.9623090624809265,
"learning_rate": 3.7484176020458906e-05,
"loss": 0.2647,
"step": 8280
},
{
"epoch": 12.447867743753523,
"grad_norm": 1.0669386386871338,
"learning_rate": 3.735716926762059e-05,
"loss": 0.2628,
"step": 8290
},
{
"epoch": 12.462896862671426,
"grad_norm": 1.136635184288025,
"learning_rate": 3.723024960222116e-05,
"loss": 0.264,
"step": 8300
},
{
"epoch": 12.47792598158933,
"grad_norm": 1.2198032140731812,
"learning_rate": 3.710341789851962e-05,
"loss": 0.2575,
"step": 8310
},
{
"epoch": 12.492955100507233,
"grad_norm": 1.1004136800765991,
"learning_rate": 3.697667503016904e-05,
"loss": 0.2573,
"step": 8320
},
{
"epoch": 12.507984219425136,
"grad_norm": 0.9815653562545776,
"learning_rate": 3.685002187021064e-05,
"loss": 0.2693,
"step": 8330
},
{
"epoch": 12.52301333834304,
"grad_norm": 1.23141348361969,
"learning_rate": 3.6723459291067615e-05,
"loss": 0.2632,
"step": 8340
},
{
"epoch": 12.538042457260943,
"grad_norm": 1.0357614755630493,
"learning_rate": 3.65969881645393e-05,
"loss": 0.2582,
"step": 8350
},
{
"epoch": 12.553071576178846,
"grad_norm": 1.283329963684082,
"learning_rate": 3.647060936179497e-05,
"loss": 0.2654,
"step": 8360
},
{
"epoch": 12.56810069509675,
"grad_norm": 1.062829613685608,
"learning_rate": 3.63443237533681e-05,
"loss": 0.2652,
"step": 8370
},
{
"epoch": 12.583129814014653,
"grad_norm": 1.0494091510772705,
"learning_rate": 3.6218132209150045e-05,
"loss": 0.2664,
"step": 8380
},
{
"epoch": 12.598158932932558,
"grad_norm": 1.1577351093292236,
"learning_rate": 3.6092035598384354e-05,
"loss": 0.2765,
"step": 8390
},
{
"epoch": 12.61318805185046,
"grad_norm": 1.1229662895202637,
"learning_rate": 3.5966034789660574e-05,
"loss": 0.2658,
"step": 8400
},
{
"epoch": 12.628217170768364,
"grad_norm": 1.1747732162475586,
"learning_rate": 3.584013065090837e-05,
"loss": 0.2631,
"step": 8410
},
{
"epoch": 12.643246289686267,
"grad_norm": 1.2156236171722412,
"learning_rate": 3.571432404939149e-05,
"loss": 0.2618,
"step": 8420
},
{
"epoch": 12.65827540860417,
"grad_norm": 1.2369886636734009,
"learning_rate": 3.5588615851701855e-05,
"loss": 0.2637,
"step": 8430
},
{
"epoch": 12.673304527522074,
"grad_norm": 0.9820154905319214,
"learning_rate": 3.546300692375352e-05,
"loss": 0.2675,
"step": 8440
},
{
"epoch": 12.688333646439977,
"grad_norm": 1.0225483179092407,
"learning_rate": 3.533749813077677e-05,
"loss": 0.2634,
"step": 8450
},
{
"epoch": 12.70336276535788,
"grad_norm": 0.9450991153717041,
"learning_rate": 3.5212090337312095e-05,
"loss": 0.2713,
"step": 8460
},
{
"epoch": 12.718391884275784,
"grad_norm": 1.1000279188156128,
"learning_rate": 3.508678440720431e-05,
"loss": 0.2728,
"step": 8470
},
{
"epoch": 12.733421003193687,
"grad_norm": 1.1958969831466675,
"learning_rate": 3.496158120359653e-05,
"loss": 0.2546,
"step": 8480
},
{
"epoch": 12.748450122111592,
"grad_norm": 1.0161027908325195,
"learning_rate": 3.483648158892431e-05,
"loss": 0.265,
"step": 8490
},
{
"epoch": 12.763479241029495,
"grad_norm": 1.069886326789856,
"learning_rate": 3.471148642490957e-05,
"loss": 0.2605,
"step": 8500
},
{
"epoch": 12.778508359947399,
"grad_norm": 1.082297444343567,
"learning_rate": 3.4586596572554856e-05,
"loss": 0.2739,
"step": 8510
},
{
"epoch": 12.793537478865302,
"grad_norm": 1.0885424613952637,
"learning_rate": 3.4461812892137196e-05,
"loss": 0.2708,
"step": 8520
},
{
"epoch": 12.808566597783205,
"grad_norm": 1.0391422510147095,
"learning_rate": 3.433713624320234e-05,
"loss": 0.2655,
"step": 8530
},
{
"epoch": 12.823595716701108,
"grad_norm": 1.225851058959961,
"learning_rate": 3.421256748455873e-05,
"loss": 0.2542,
"step": 8540
},
{
"epoch": 12.838624835619012,
"grad_norm": 0.993791401386261,
"learning_rate": 3.408810747427169e-05,
"loss": 0.2697,
"step": 8550
},
{
"epoch": 12.853653954536915,
"grad_norm": 1.0382951498031616,
"learning_rate": 3.396375706965738e-05,
"loss": 0.2706,
"step": 8560
},
{
"epoch": 12.868683073454818,
"grad_norm": 1.0424343347549438,
"learning_rate": 3.383951712727701e-05,
"loss": 0.2755,
"step": 8570
},
{
"epoch": 12.883712192372721,
"grad_norm": 1.1532506942749023,
"learning_rate": 3.371538850293088e-05,
"loss": 0.2628,
"step": 8580
},
{
"epoch": 12.898741311290626,
"grad_norm": 1.1272519826889038,
"learning_rate": 3.359137205165251e-05,
"loss": 0.2699,
"step": 8590
},
{
"epoch": 12.91377043020853,
"grad_norm": 1.073285698890686,
"learning_rate": 3.3467468627702734e-05,
"loss": 0.2677,
"step": 8600
},
{
"epoch": 12.928799549126433,
"grad_norm": 1.2244044542312622,
"learning_rate": 3.334367908456384e-05,
"loss": 0.2673,
"step": 8610
},
{
"epoch": 12.943828668044336,
"grad_norm": 1.1868269443511963,
"learning_rate": 3.32200042749336e-05,
"loss": 0.2671,
"step": 8620
},
{
"epoch": 12.95885778696224,
"grad_norm": 1.1779018640518188,
"learning_rate": 3.309644505071959e-05,
"loss": 0.2744,
"step": 8630
},
{
"epoch": 12.973886905880143,
"grad_norm": 1.1692800521850586,
"learning_rate": 3.297300226303306e-05,
"loss": 0.2741,
"step": 8640
},
{
"epoch": 12.988916024798046,
"grad_norm": 1.0709041357040405,
"learning_rate": 3.284967676218336e-05,
"loss": 0.2672,
"step": 8650
},
{
"epoch": 13.00300582378358,
"grad_norm": 0.9654292464256287,
"learning_rate": 3.272646939767179e-05,
"loss": 0.255,
"step": 8660
},
{
"epoch": 13.018034942701485,
"grad_norm": 0.9214917421340942,
"learning_rate": 3.2603381018186016e-05,
"loss": 0.2085,
"step": 8670
},
{
"epoch": 13.033064061619388,
"grad_norm": 0.9971623420715332,
"learning_rate": 3.248041247159401e-05,
"loss": 0.2158,
"step": 8680
},
{
"epoch": 13.048093180537292,
"grad_norm": 0.8868154287338257,
"learning_rate": 3.235756460493836e-05,
"loss": 0.2225,
"step": 8690
},
{
"epoch": 13.063122299455195,
"grad_norm": 0.9371384382247925,
"learning_rate": 3.2234838264430346e-05,
"loss": 0.2194,
"step": 8700
},
{
"epoch": 13.078151418373098,
"grad_norm": 0.933928370475769,
"learning_rate": 3.211223429544415e-05,
"loss": 0.2087,
"step": 8710
},
{
"epoch": 13.093180537291001,
"grad_norm": 1.1291043758392334,
"learning_rate": 3.198975354251101e-05,
"loss": 0.214,
"step": 8720
},
{
"epoch": 13.108209656208905,
"grad_norm": 0.9412780404090881,
"learning_rate": 3.1867396849313466e-05,
"loss": 0.2059,
"step": 8730
},
{
"epoch": 13.123238775126808,
"grad_norm": 0.9674059748649597,
"learning_rate": 3.174516505867943e-05,
"loss": 0.2118,
"step": 8740
},
{
"epoch": 13.138267894044711,
"grad_norm": 1.1346533298492432,
"learning_rate": 3.16230590125765e-05,
"loss": 0.2191,
"step": 8750
},
{
"epoch": 13.153297012962614,
"grad_norm": 0.9253365993499756,
"learning_rate": 3.150107955210606e-05,
"loss": 0.2137,
"step": 8760
},
{
"epoch": 13.16832613188052,
"grad_norm": 1.0744667053222656,
"learning_rate": 3.137922751749762e-05,
"loss": 0.2194,
"step": 8770
},
{
"epoch": 13.183355250798423,
"grad_norm": 0.9793460965156555,
"learning_rate": 3.125750374810283e-05,
"loss": 0.2131,
"step": 8780
},
{
"epoch": 13.198384369716326,
"grad_norm": 0.923272430896759,
"learning_rate": 3.113590908238994e-05,
"loss": 0.228,
"step": 8790
},
{
"epoch": 13.21341348863423,
"grad_norm": 1.0247244834899902,
"learning_rate": 3.101444435793777e-05,
"loss": 0.2104,
"step": 8800
},
{
"epoch": 13.228442607552132,
"grad_norm": 1.0090657472610474,
"learning_rate": 3.089311041143017e-05,
"loss": 0.2161,
"step": 8810
},
{
"epoch": 13.243471726470036,
"grad_norm": 0.9428199529647827,
"learning_rate": 3.077190807865009e-05,
"loss": 0.2165,
"step": 8820
},
{
"epoch": 13.258500845387939,
"grad_norm": 1.083084225654602,
"learning_rate": 3.065083819447393e-05,
"loss": 0.2135,
"step": 8830
},
{
"epoch": 13.273529964305842,
"grad_norm": 1.0958205461502075,
"learning_rate": 3.0529901592865705e-05,
"loss": 0.2128,
"step": 8840
},
{
"epoch": 13.288559083223745,
"grad_norm": 0.9356290698051453,
"learning_rate": 3.0409099106871374e-05,
"loss": 0.2136,
"step": 8850
},
{
"epoch": 13.303588202141649,
"grad_norm": 1.1614493131637573,
"learning_rate": 3.0288431568613053e-05,
"loss": 0.2256,
"step": 8860
},
{
"epoch": 13.318617321059552,
"grad_norm": 1.0191394090652466,
"learning_rate": 3.0167899809283308e-05,
"loss": 0.2183,
"step": 8870
},
{
"epoch": 13.333646439977457,
"grad_norm": 1.0032422542572021,
"learning_rate": 3.0047504659139404e-05,
"loss": 0.214,
"step": 8880
},
{
"epoch": 13.34867555889536,
"grad_norm": 0.9819022417068481,
"learning_rate": 2.9927246947497644e-05,
"loss": 0.2169,
"step": 8890
},
{
"epoch": 13.363704677813264,
"grad_norm": 1.050058364868164,
"learning_rate": 2.9807127502727537e-05,
"loss": 0.2249,
"step": 8900
},
{
"epoch": 13.378733796731167,
"grad_norm": 0.9431155920028687,
"learning_rate": 2.9687147152246276e-05,
"loss": 0.2148,
"step": 8910
},
{
"epoch": 13.39376291564907,
"grad_norm": 0.8861021399497986,
"learning_rate": 2.9567306722512833e-05,
"loss": 0.2202,
"step": 8920
},
{
"epoch": 13.408792034566973,
"grad_norm": 1.0134702920913696,
"learning_rate": 2.944760703902244e-05,
"loss": 0.2214,
"step": 8930
},
{
"epoch": 13.423821153484877,
"grad_norm": 1.1062716245651245,
"learning_rate": 2.9328048926300766e-05,
"loss": 0.2238,
"step": 8940
},
{
"epoch": 13.43885027240278,
"grad_norm": 1.0837918519973755,
"learning_rate": 2.9208633207898372e-05,
"loss": 0.2142,
"step": 8950
},
{
"epoch": 13.453879391320683,
"grad_norm": 1.1653366088867188,
"learning_rate": 2.908936070638487e-05,
"loss": 0.2172,
"step": 8960
},
{
"epoch": 13.468908510238588,
"grad_norm": 1.0416685342788696,
"learning_rate": 2.8970232243343482e-05,
"loss": 0.2185,
"step": 8970
},
{
"epoch": 13.483937629156491,
"grad_norm": 1.0021854639053345,
"learning_rate": 2.8851248639365114e-05,
"loss": 0.2166,
"step": 8980
},
{
"epoch": 13.498966748074395,
"grad_norm": 1.0365519523620605,
"learning_rate": 2.8732410714042957e-05,
"loss": 0.2209,
"step": 8990
},
{
"epoch": 13.513995866992298,
"grad_norm": 1.008899211883545,
"learning_rate": 2.8613719285966623e-05,
"loss": 0.2254,
"step": 9000
},
{
"epoch": 13.529024985910201,
"grad_norm": 0.8905879855155945,
"learning_rate": 2.8495175172716692e-05,
"loss": 0.2204,
"step": 9010
},
{
"epoch": 13.544054104828104,
"grad_norm": 1.0459271669387817,
"learning_rate": 2.837677919085896e-05,
"loss": 0.217,
"step": 9020
},
{
"epoch": 13.559083223746008,
"grad_norm": 1.0746241807937622,
"learning_rate": 2.8258532155938875e-05,
"loss": 0.2154,
"step": 9030
},
{
"epoch": 13.574112342663911,
"grad_norm": 1.0592225790023804,
"learning_rate": 2.8140434882475847e-05,
"loss": 0.2232,
"step": 9040
},
{
"epoch": 13.589141461581814,
"grad_norm": 0.9885957837104797,
"learning_rate": 2.802248818395773e-05,
"loss": 0.2158,
"step": 9050
},
{
"epoch": 13.604170580499718,
"grad_norm": 1.1569939851760864,
"learning_rate": 2.790469287283517e-05,
"loss": 0.2218,
"step": 9060
},
{
"epoch": 13.61919969941762,
"grad_norm": 1.135467529296875,
"learning_rate": 2.7787049760516013e-05,
"loss": 0.2214,
"step": 9070
},
{
"epoch": 13.634228818335526,
"grad_norm": 1.140293002128601,
"learning_rate": 2.766955965735968e-05,
"loss": 0.2174,
"step": 9080
},
{
"epoch": 13.649257937253429,
"grad_norm": 1.062946081161499,
"learning_rate": 2.755222337267168e-05,
"loss": 0.2245,
"step": 9090
},
{
"epoch": 13.664287056171332,
"grad_norm": 1.142333984375,
"learning_rate": 2.74350417146979e-05,
"loss": 0.2159,
"step": 9100
},
{
"epoch": 13.679316175089236,
"grad_norm": 1.206817388534546,
"learning_rate": 2.731801549061923e-05,
"loss": 0.2213,
"step": 9110
},
{
"epoch": 13.694345294007139,
"grad_norm": 1.0265262126922607,
"learning_rate": 2.7201145506545756e-05,
"loss": 0.2307,
"step": 9120
},
{
"epoch": 13.709374412925042,
"grad_norm": 1.2109159231185913,
"learning_rate": 2.7084432567511443e-05,
"loss": 0.2188,
"step": 9130
},
{
"epoch": 13.724403531842945,
"grad_norm": 1.3201031684875488,
"learning_rate": 2.6967877477468397e-05,
"loss": 0.2243,
"step": 9140
},
{
"epoch": 13.739432650760849,
"grad_norm": 1.1013463735580444,
"learning_rate": 2.6851481039281478e-05,
"loss": 0.2285,
"step": 9150
},
{
"epoch": 13.754461769678752,
"grad_norm": 1.1080180406570435,
"learning_rate": 2.6735244054722697e-05,
"loss": 0.2289,
"step": 9160
},
{
"epoch": 13.769490888596657,
"grad_norm": 1.0649311542510986,
"learning_rate": 2.66191673244657e-05,
"loss": 0.2243,
"step": 9170
},
{
"epoch": 13.78452000751456,
"grad_norm": 1.1212127208709717,
"learning_rate": 2.6503251648080212e-05,
"loss": 0.217,
"step": 9180
},
{
"epoch": 13.799549126432463,
"grad_norm": 1.0007354021072388,
"learning_rate": 2.6387497824026637e-05,
"loss": 0.2213,
"step": 9190
},
{
"epoch": 13.814578245350367,
"grad_norm": 0.9835550785064697,
"learning_rate": 2.6271906649650457e-05,
"loss": 0.2206,
"step": 9200
},
{
"epoch": 13.82960736426827,
"grad_norm": 1.1858932971954346,
"learning_rate": 2.6156478921176807e-05,
"loss": 0.2285,
"step": 9210
},
{
"epoch": 13.844636483186173,
"grad_norm": 1.2049376964569092,
"learning_rate": 2.6041215433704903e-05,
"loss": 0.2236,
"step": 9220
},
{
"epoch": 13.859665602104076,
"grad_norm": 0.9520084261894226,
"learning_rate": 2.5926116981202688e-05,
"loss": 0.233,
"step": 9230
},
{
"epoch": 13.87469472102198,
"grad_norm": 1.0784698724746704,
"learning_rate": 2.581118435650121e-05,
"loss": 0.2284,
"step": 9240
},
{
"epoch": 13.889723839939883,
"grad_norm": 1.1517982482910156,
"learning_rate": 2.5696418351289387e-05,
"loss": 0.2209,
"step": 9250
},
{
"epoch": 13.904752958857786,
"grad_norm": 1.0725606679916382,
"learning_rate": 2.558181975610827e-05,
"loss": 0.2179,
"step": 9260
},
{
"epoch": 13.91978207777569,
"grad_norm": 1.0226749181747437,
"learning_rate": 2.546738936034585e-05,
"loss": 0.2247,
"step": 9270
},
{
"epoch": 13.934811196693595,
"grad_norm": 1.1553442478179932,
"learning_rate": 2.5353127952231404e-05,
"loss": 0.2179,
"step": 9280
},
{
"epoch": 13.949840315611498,
"grad_norm": 1.0485488176345825,
"learning_rate": 2.5239036318830278e-05,
"loss": 0.2179,
"step": 9290
},
{
"epoch": 13.964869434529401,
"grad_norm": 1.2220666408538818,
"learning_rate": 2.51251152460383e-05,
"loss": 0.2247,
"step": 9300
},
{
"epoch": 13.979898553447304,
"grad_norm": 1.1536996364593506,
"learning_rate": 2.5011365518576467e-05,
"loss": 0.2331,
"step": 9310
},
{
"epoch": 13.994927672365208,
"grad_norm": 1.0037457942962646,
"learning_rate": 2.4897787919985454e-05,
"loss": 0.2266,
"step": 9320
},
{
"epoch": 14.009017471350742,
"grad_norm": 0.900565505027771,
"learning_rate": 2.4784383232620295e-05,
"loss": 0.1914,
"step": 9330
},
{
"epoch": 14.024046590268645,
"grad_norm": 0.9061153531074524,
"learning_rate": 2.467115223764495e-05,
"loss": 0.1753,
"step": 9340
},
{
"epoch": 14.039075709186548,
"grad_norm": 0.8884809613227844,
"learning_rate": 2.4558095715026973e-05,
"loss": 0.1721,
"step": 9350
},
{
"epoch": 14.054104828104453,
"grad_norm": 0.9852058291435242,
"learning_rate": 2.4445214443532027e-05,
"loss": 0.1734,
"step": 9360
},
{
"epoch": 14.069133947022356,
"grad_norm": 0.8632417321205139,
"learning_rate": 2.4332509200718673e-05,
"loss": 0.1898,
"step": 9370
},
{
"epoch": 14.08416306594026,
"grad_norm": 0.9666391015052795,
"learning_rate": 2.421998076293285e-05,
"loss": 0.1835,
"step": 9380
},
{
"epoch": 14.099192184858163,
"grad_norm": 0.8072938919067383,
"learning_rate": 2.4107629905302738e-05,
"loss": 0.1845,
"step": 9390
},
{
"epoch": 14.114221303776066,
"grad_norm": 1.2991918325424194,
"learning_rate": 2.3995457401733158e-05,
"loss": 0.1809,
"step": 9400
},
{
"epoch": 14.12925042269397,
"grad_norm": 0.8927931785583496,
"learning_rate": 2.3883464024900482e-05,
"loss": 0.1743,
"step": 9410
},
{
"epoch": 14.144279541611873,
"grad_norm": 0.9115880727767944,
"learning_rate": 2.3771650546247128e-05,
"loss": 0.1742,
"step": 9420
},
{
"epoch": 14.159308660529776,
"grad_norm": 0.904136061668396,
"learning_rate": 2.3660017735976374e-05,
"loss": 0.1873,
"step": 9430
},
{
"epoch": 14.17433777944768,
"grad_norm": 0.9878782629966736,
"learning_rate": 2.3548566363046992e-05,
"loss": 0.1839,
"step": 9440
},
{
"epoch": 14.189366898365583,
"grad_norm": 1.261094093322754,
"learning_rate": 2.343729719516798e-05,
"loss": 0.1722,
"step": 9450
},
{
"epoch": 14.204396017283488,
"grad_norm": 0.959791362285614,
"learning_rate": 2.332621099879318e-05,
"loss": 0.1797,
"step": 9460
},
{
"epoch": 14.21942513620139,
"grad_norm": 1.0712839365005493,
"learning_rate": 2.321530853911616e-05,
"loss": 0.1779,
"step": 9470
},
{
"epoch": 14.234454255119294,
"grad_norm": 0.9205087423324585,
"learning_rate": 2.3104590580064823e-05,
"loss": 0.1978,
"step": 9480
},
{
"epoch": 14.249483374037197,
"grad_norm": 0.9004307985305786,
"learning_rate": 2.299405788429619e-05,
"loss": 0.1792,
"step": 9490
},
{
"epoch": 14.2645124929551,
"grad_norm": 0.9223144054412842,
"learning_rate": 2.288371121319109e-05,
"loss": 0.1795,
"step": 9500
},
{
"epoch": 14.279541611873004,
"grad_norm": 0.8646677732467651,
"learning_rate": 2.2773551326849036e-05,
"loss": 0.1778,
"step": 9510
},
{
"epoch": 14.294570730790907,
"grad_norm": 1.060955286026001,
"learning_rate": 2.266357898408282e-05,
"loss": 0.1864,
"step": 9520
},
{
"epoch": 14.30959984970881,
"grad_norm": 0.9104660153388977,
"learning_rate": 2.2553794942413503e-05,
"loss": 0.1825,
"step": 9530
},
{
"epoch": 14.324628968626714,
"grad_norm": 0.945350170135498,
"learning_rate": 2.2444199958064955e-05,
"loss": 0.1836,
"step": 9540
},
{
"epoch": 14.339658087544617,
"grad_norm": 1.2413114309310913,
"learning_rate": 2.2334794785958845e-05,
"loss": 0.1769,
"step": 9550
},
{
"epoch": 14.354687206462522,
"grad_norm": 0.9645456671714783,
"learning_rate": 2.2225580179709303e-05,
"loss": 0.1845,
"step": 9560
},
{
"epoch": 14.369716325380425,
"grad_norm": 0.9362895488739014,
"learning_rate": 2.2116556891617825e-05,
"loss": 0.1813,
"step": 9570
},
{
"epoch": 14.384745444298328,
"grad_norm": 1.0554242134094238,
"learning_rate": 2.200772567266805e-05,
"loss": 0.1932,
"step": 9580
},
{
"epoch": 14.399774563216232,
"grad_norm": 1.0449492931365967,
"learning_rate": 2.1899087272520595e-05,
"loss": 0.1882,
"step": 9590
},
{
"epoch": 14.414803682134135,
"grad_norm": 1.107164978981018,
"learning_rate": 2.179064243950784e-05,
"loss": 0.1878,
"step": 9600
},
{
"epoch": 14.429832801052038,
"grad_norm": 1.010380506515503,
"learning_rate": 2.1682391920628868e-05,
"loss": 0.1784,
"step": 9610
},
{
"epoch": 14.444861919969942,
"grad_norm": 1.1067860126495361,
"learning_rate": 2.1574336461544258e-05,
"loss": 0.1823,
"step": 9620
},
{
"epoch": 14.459891038887845,
"grad_norm": 1.0193742513656616,
"learning_rate": 2.1466476806570972e-05,
"loss": 0.1887,
"step": 9630
},
{
"epoch": 14.474920157805748,
"grad_norm": 0.9946687817573547,
"learning_rate": 2.1358813698677178e-05,
"loss": 0.1956,
"step": 9640
},
{
"epoch": 14.489949276723651,
"grad_norm": 1.2227554321289062,
"learning_rate": 2.125134787947722e-05,
"loss": 0.1815,
"step": 9650
},
{
"epoch": 14.504978395641556,
"grad_norm": 1.002421259880066,
"learning_rate": 2.114408008922639e-05,
"loss": 0.1851,
"step": 9660
},
{
"epoch": 14.52000751455946,
"grad_norm": 1.0360831022262573,
"learning_rate": 2.103701106681602e-05,
"loss": 0.1838,
"step": 9670
},
{
"epoch": 14.535036633477363,
"grad_norm": 0.9968597292900085,
"learning_rate": 2.0930141549768144e-05,
"loss": 0.1842,
"step": 9680
},
{
"epoch": 14.550065752395266,
"grad_norm": 1.0610520839691162,
"learning_rate": 2.082347227423064e-05,
"loss": 0.1844,
"step": 9690
},
{
"epoch": 14.56509487131317,
"grad_norm": 0.9733484983444214,
"learning_rate": 2.071700397497199e-05,
"loss": 0.1877,
"step": 9700
},
{
"epoch": 14.580123990231073,
"grad_norm": 1.059486746788025,
"learning_rate": 2.061073738537635e-05,
"loss": 0.1917,
"step": 9710
},
{
"epoch": 14.595153109148976,
"grad_norm": 1.0647083520889282,
"learning_rate": 2.0504673237438422e-05,
"loss": 0.1935,
"step": 9720
},
{
"epoch": 14.61018222806688,
"grad_norm": 1.005767583847046,
"learning_rate": 2.0398812261758444e-05,
"loss": 0.1868,
"step": 9730
},
{
"epoch": 14.625211346984782,
"grad_norm": 1.0666831731796265,
"learning_rate": 2.029315518753711e-05,
"loss": 0.1863,
"step": 9740
},
{
"epoch": 14.640240465902686,
"grad_norm": 1.0782824754714966,
"learning_rate": 2.018770274257062e-05,
"loss": 0.2028,
"step": 9750
},
{
"epoch": 14.65526958482059,
"grad_norm": 0.9997120499610901,
"learning_rate": 2.0082455653245612e-05,
"loss": 0.1945,
"step": 9760
},
{
"epoch": 14.670298703738494,
"grad_norm": 1.096117615699768,
"learning_rate": 1.9977414644534205e-05,
"loss": 0.1876,
"step": 9770
},
{
"epoch": 14.685327822656397,
"grad_norm": 0.9982436895370483,
"learning_rate": 1.98725804399889e-05,
"loss": 0.1847,
"step": 9780
},
{
"epoch": 14.7003569415743,
"grad_norm": 1.2439534664154053,
"learning_rate": 1.9767953761737772e-05,
"loss": 0.189,
"step": 9790
},
{
"epoch": 14.715386060492204,
"grad_norm": 1.0233805179595947,
"learning_rate": 1.9663535330479305e-05,
"loss": 0.1905,
"step": 9800
},
{
"epoch": 14.730415179410107,
"grad_norm": 0.9537500739097595,
"learning_rate": 1.9559325865477573e-05,
"loss": 0.1757,
"step": 9810
},
{
"epoch": 14.74544429832801,
"grad_norm": 1.0633177757263184,
"learning_rate": 1.9455326084557213e-05,
"loss": 0.1926,
"step": 9820
},
{
"epoch": 14.760473417245914,
"grad_norm": 0.9927921295166016,
"learning_rate": 1.9351536704098527e-05,
"loss": 0.1907,
"step": 9830
},
{
"epoch": 14.775502536163817,
"grad_norm": 1.0007320642471313,
"learning_rate": 1.9247958439032448e-05,
"loss": 0.189,
"step": 9840
},
{
"epoch": 14.79053165508172,
"grad_norm": 1.1696594953536987,
"learning_rate": 1.9144592002835756e-05,
"loss": 0.1894,
"step": 9850
},
{
"epoch": 14.805560773999623,
"grad_norm": 4.139706611633301,
"learning_rate": 1.9041438107526056e-05,
"loss": 0.1839,
"step": 9860
},
{
"epoch": 14.820589892917528,
"grad_norm": 0.9341458678245544,
"learning_rate": 1.8938497463656945e-05,
"loss": 0.1991,
"step": 9870
},
{
"epoch": 14.835619011835432,
"grad_norm": 1.1703625917434692,
"learning_rate": 1.8835770780313027e-05,
"loss": 0.1837,
"step": 9880
},
{
"epoch": 14.850648130753335,
"grad_norm": 0.9725760221481323,
"learning_rate": 1.8733258765105126e-05,
"loss": 0.1831,
"step": 9890
},
{
"epoch": 14.865677249671238,
"grad_norm": 0.9153964519500732,
"learning_rate": 1.8630962124165375e-05,
"loss": 0.1955,
"step": 9900
},
{
"epoch": 14.880706368589141,
"grad_norm": 1.1788238286972046,
"learning_rate": 1.852888156214233e-05,
"loss": 0.1869,
"step": 9910
},
{
"epoch": 14.895735487507045,
"grad_norm": 0.9835808873176575,
"learning_rate": 1.8427017782196127e-05,
"loss": 0.1915,
"step": 9920
},
{
"epoch": 14.910764606424948,
"grad_norm": 1.1048306226730347,
"learning_rate": 1.832537148599367e-05,
"loss": 0.1851,
"step": 9930
},
{
"epoch": 14.925793725342851,
"grad_norm": 1.847183108329773,
"learning_rate": 1.8223943373703734e-05,
"loss": 0.1848,
"step": 9940
},
{
"epoch": 14.940822844260754,
"grad_norm": 0.9361986517906189,
"learning_rate": 1.8122734143992214e-05,
"loss": 0.1946,
"step": 9950
},
{
"epoch": 14.95585196317866,
"grad_norm": 1.007897973060608,
"learning_rate": 1.8021744494017283e-05,
"loss": 0.1917,
"step": 9960
},
{
"epoch": 14.970881082096563,
"grad_norm": 1.0453609228134155,
"learning_rate": 1.7920975119424576e-05,
"loss": 0.1956,
"step": 9970
},
{
"epoch": 14.985910201014466,
"grad_norm": 1.3399736881256104,
"learning_rate": 1.7820426714342374e-05,
"loss": 0.1963,
"step": 9980
},
{
"epoch": 15.0,
"grad_norm": 1.1934865713119507,
"learning_rate": 1.7720099971376907e-05,
"loss": 0.192,
"step": 9990
},
{
"epoch": 15.015029118917903,
"grad_norm": 0.9646713733673096,
"learning_rate": 1.7619995581607516e-05,
"loss": 0.1614,
"step": 10000
},
{
"epoch": 15.030058237835807,
"grad_norm": 0.815608561038971,
"learning_rate": 1.7520114234581912e-05,
"loss": 0.1628,
"step": 10010
},
{
"epoch": 15.04508735675371,
"grad_norm": 0.9114384055137634,
"learning_rate": 1.7420456618311405e-05,
"loss": 0.1567,
"step": 10020
},
{
"epoch": 15.060116475671613,
"grad_norm": 0.9106918573379517,
"learning_rate": 1.7321023419266193e-05,
"loss": 0.1582,
"step": 10030
},
{
"epoch": 15.075145594589518,
"grad_norm": 0.7602341771125793,
"learning_rate": 1.7221815322370632e-05,
"loss": 0.1563,
"step": 10040
},
{
"epoch": 15.090174713507421,
"grad_norm": 0.7736881971359253,
"learning_rate": 1.7122833010998535e-05,
"loss": 0.1533,
"step": 10050
},
{
"epoch": 15.105203832425325,
"grad_norm": 0.9630312919616699,
"learning_rate": 1.702407716696836e-05,
"loss": 0.1533,
"step": 10060
},
{
"epoch": 15.120232951343228,
"grad_norm": 0.8553804755210876,
"learning_rate": 1.6925548470538695e-05,
"loss": 0.1629,
"step": 10070
},
{
"epoch": 15.135262070261131,
"grad_norm": 1.0749071836471558,
"learning_rate": 1.6827247600403366e-05,
"loss": 0.1605,
"step": 10080
},
{
"epoch": 15.150291189179034,
"grad_norm": 0.8994390964508057,
"learning_rate": 1.6729175233686955e-05,
"loss": 0.1506,
"step": 10090
},
{
"epoch": 15.165320308096938,
"grad_norm": 1.0106632709503174,
"learning_rate": 1.6631332045939996e-05,
"loss": 0.1652,
"step": 10100
},
{
"epoch": 15.180349427014841,
"grad_norm": 1.0532327890396118,
"learning_rate": 1.6533718711134412e-05,
"loss": 0.1603,
"step": 10110
},
{
"epoch": 15.195378545932744,
"grad_norm": 0.821412205696106,
"learning_rate": 1.6436335901658766e-05,
"loss": 0.1511,
"step": 10120
},
{
"epoch": 15.210407664850647,
"grad_norm": 0.8959778547286987,
"learning_rate": 1.633918428831377e-05,
"loss": 0.1609,
"step": 10130
},
{
"epoch": 15.22543678376855,
"grad_norm": 0.8607751131057739,
"learning_rate": 1.6242264540307552e-05,
"loss": 0.1579,
"step": 10140
},
{
"epoch": 15.240465902686456,
"grad_norm": 0.8581548929214478,
"learning_rate": 1.614557732525111e-05,
"loss": 0.1563,
"step": 10150
},
{
"epoch": 15.255495021604359,
"grad_norm": 0.8387672901153564,
"learning_rate": 1.604912330915364e-05,
"loss": 0.1576,
"step": 10160
},
{
"epoch": 15.270524140522262,
"grad_norm": 0.871376097202301,
"learning_rate": 1.595290315641806e-05,
"loss": 0.1621,
"step": 10170
},
{
"epoch": 15.285553259440166,
"grad_norm": 1.072432279586792,
"learning_rate": 1.585691752983629e-05,
"loss": 0.153,
"step": 10180
},
{
"epoch": 15.300582378358069,
"grad_norm": 0.9539718627929688,
"learning_rate": 1.5761167090584882e-05,
"loss": 0.1551,
"step": 10190
},
{
"epoch": 15.315611497275972,
"grad_norm": 0.9477748274803162,
"learning_rate": 1.5665652498220236e-05,
"loss": 0.1596,
"step": 10200
},
{
"epoch": 15.330640616193875,
"grad_norm": 1.0767313241958618,
"learning_rate": 1.5570374410674243e-05,
"loss": 0.1597,
"step": 10210
},
{
"epoch": 15.345669735111779,
"grad_norm": 0.8535225987434387,
"learning_rate": 1.547533348424963e-05,
"loss": 0.1653,
"step": 10220
},
{
"epoch": 15.360698854029682,
"grad_norm": 0.92160964012146,
"learning_rate": 1.5380530373615542e-05,
"loss": 0.1487,
"step": 10230
},
{
"epoch": 15.375727972947587,
"grad_norm": 0.840239942073822,
"learning_rate": 1.5285965731802944e-05,
"loss": 0.1545,
"step": 10240
},
{
"epoch": 15.39075709186549,
"grad_norm": 1.0626702308654785,
"learning_rate": 1.5191640210200187e-05,
"loss": 0.1559,
"step": 10250
},
{
"epoch": 15.405786210783393,
"grad_norm": 0.9364585280418396,
"learning_rate": 1.5097554458548452e-05,
"loss": 0.1646,
"step": 10260
},
{
"epoch": 15.420815329701297,
"grad_norm": 1.0330567359924316,
"learning_rate": 1.5003709124937354e-05,
"loss": 0.1625,
"step": 10270
},
{
"epoch": 15.4358444486192,
"grad_norm": 0.9339507818222046,
"learning_rate": 1.4910104855800427e-05,
"loss": 0.1515,
"step": 10280
},
{
"epoch": 15.450873567537103,
"grad_norm": 0.7912824153900146,
"learning_rate": 1.4816742295910708e-05,
"loss": 0.162,
"step": 10290
},
{
"epoch": 15.465902686455006,
"grad_norm": 0.9348452687263489,
"learning_rate": 1.4723622088376205e-05,
"loss": 0.1572,
"step": 10300
},
{
"epoch": 15.48093180537291,
"grad_norm": 0.8750469088554382,
"learning_rate": 1.463074487463561e-05,
"loss": 0.1485,
"step": 10310
},
{
"epoch": 15.495960924290813,
"grad_norm": 0.9709532260894775,
"learning_rate": 1.4538111294453732e-05,
"loss": 0.1583,
"step": 10320
},
{
"epoch": 15.510990043208716,
"grad_norm": 0.9631896018981934,
"learning_rate": 1.4445721985917254e-05,
"loss": 0.1606,
"step": 10330
},
{
"epoch": 15.52601916212662,
"grad_norm": 0.8176620006561279,
"learning_rate": 1.435357758543015e-05,
"loss": 0.1583,
"step": 10340
},
{
"epoch": 15.541048281044525,
"grad_norm": 0.8556742668151855,
"learning_rate": 1.426167872770947e-05,
"loss": 0.1593,
"step": 10350
},
{
"epoch": 15.556077399962428,
"grad_norm": 1.2856311798095703,
"learning_rate": 1.4170026045780832e-05,
"loss": 0.169,
"step": 10360
},
{
"epoch": 15.571106518880331,
"grad_norm": 1.07082200050354,
"learning_rate": 1.4078620170974177e-05,
"loss": 0.1581,
"step": 10370
},
{
"epoch": 15.586135637798234,
"grad_norm": 0.9026190042495728,
"learning_rate": 1.3987461732919343e-05,
"loss": 0.1704,
"step": 10380
},
{
"epoch": 15.601164756716138,
"grad_norm": 0.9147086143493652,
"learning_rate": 1.3896551359541782e-05,
"loss": 0.1566,
"step": 10390
},
{
"epoch": 15.61619387563404,
"grad_norm": 0.9676672220230103,
"learning_rate": 1.3805889677058149e-05,
"loss": 0.1668,
"step": 10400
},
{
"epoch": 15.631222994551944,
"grad_norm": 0.9647960066795349,
"learning_rate": 1.3715477309972086e-05,
"loss": 0.1603,
"step": 10410
},
{
"epoch": 15.646252113469847,
"grad_norm": 0.9588443636894226,
"learning_rate": 1.3625314881069873e-05,
"loss": 0.1614,
"step": 10420
},
{
"epoch": 15.66128123238775,
"grad_norm": 0.921419084072113,
"learning_rate": 1.3535403011416158e-05,
"loss": 0.1574,
"step": 10430
},
{
"epoch": 15.676310351305656,
"grad_norm": 0.9163838624954224,
"learning_rate": 1.3445742320349625e-05,
"loss": 0.1521,
"step": 10440
},
{
"epoch": 15.691339470223559,
"grad_norm": 0.9288631081581116,
"learning_rate": 1.3356333425478817e-05,
"loss": 0.159,
"step": 10450
},
{
"epoch": 15.706368589141462,
"grad_norm": 0.9103051424026489,
"learning_rate": 1.3267176942677761e-05,
"loss": 0.1648,
"step": 10460
},
{
"epoch": 15.721397708059365,
"grad_norm": 0.8684786558151245,
"learning_rate": 1.317827348608191e-05,
"loss": 0.1598,
"step": 10470
},
{
"epoch": 15.736426826977269,
"grad_norm": 1.129595160484314,
"learning_rate": 1.3089623668083683e-05,
"loss": 0.1595,
"step": 10480
},
{
"epoch": 15.751455945895172,
"grad_norm": 0.8634871244430542,
"learning_rate": 1.3001228099328443e-05,
"loss": 0.1642,
"step": 10490
},
{
"epoch": 15.766485064813075,
"grad_norm": 0.932549774646759,
"learning_rate": 1.2913087388710165e-05,
"loss": 0.1541,
"step": 10500
},
{
"epoch": 15.781514183730978,
"grad_norm": 0.9329362511634827,
"learning_rate": 1.282520214336731e-05,
"loss": 0.1523,
"step": 10510
},
{
"epoch": 15.796543302648882,
"grad_norm": 0.9856179356575012,
"learning_rate": 1.2737572968678623e-05,
"loss": 0.1597,
"step": 10520
},
{
"epoch": 15.811572421566785,
"grad_norm": 0.9236768484115601,
"learning_rate": 1.2650200468258966e-05,
"loss": 0.161,
"step": 10530
},
{
"epoch": 15.826601540484688,
"grad_norm": 1.0709694623947144,
"learning_rate": 1.256308524395512e-05,
"loss": 0.1641,
"step": 10540
},
{
"epoch": 15.841630659402593,
"grad_norm": 0.8838292956352234,
"learning_rate": 1.2476227895841713e-05,
"loss": 0.1683,
"step": 10550
},
{
"epoch": 15.856659778320497,
"grad_norm": 1.0665549039840698,
"learning_rate": 1.238962902221703e-05,
"loss": 0.165,
"step": 10560
},
{
"epoch": 15.8716888972384,
"grad_norm": 0.876946210861206,
"learning_rate": 1.2303289219598934e-05,
"loss": 0.1645,
"step": 10570
},
{
"epoch": 15.886718016156303,
"grad_norm": 0.8602812886238098,
"learning_rate": 1.2217209082720677e-05,
"loss": 0.1648,
"step": 10580
},
{
"epoch": 15.901747135074206,
"grad_norm": 0.9444336295127869,
"learning_rate": 1.2131389204526927e-05,
"loss": 0.1531,
"step": 10590
},
{
"epoch": 15.91677625399211,
"grad_norm": 0.8952954411506653,
"learning_rate": 1.2045830176169542e-05,
"loss": 0.1653,
"step": 10600
},
{
"epoch": 15.931805372910013,
"grad_norm": 0.9685820937156677,
"learning_rate": 1.1960532587003664e-05,
"loss": 0.1683,
"step": 10610
},
{
"epoch": 15.946834491827916,
"grad_norm": 0.9807755351066589,
"learning_rate": 1.1875497024583476e-05,
"loss": 0.1588,
"step": 10620
},
{
"epoch": 15.96186361074582,
"grad_norm": 0.986831784248352,
"learning_rate": 1.1790724074658315e-05,
"loss": 0.1734,
"step": 10630
},
{
"epoch": 15.976892729663723,
"grad_norm": 0.932146430015564,
"learning_rate": 1.1706214321168513e-05,
"loss": 0.1581,
"step": 10640
},
{
"epoch": 15.991921848581628,
"grad_norm": 0.9639928936958313,
"learning_rate": 1.1621968346241457e-05,
"loss": 0.1595,
"step": 10650
},
{
"epoch": 16.00601164756716,
"grad_norm": 0.7162834405899048,
"learning_rate": 1.1537986730187566e-05,
"loss": 0.1529,
"step": 10660
},
{
"epoch": 16.021040766485065,
"grad_norm": 0.9273526072502136,
"learning_rate": 1.1454270051496264e-05,
"loss": 0.1424,
"step": 10670
},
{
"epoch": 16.03606988540297,
"grad_norm": 0.7194620370864868,
"learning_rate": 1.1370818886831985e-05,
"loss": 0.147,
"step": 10680
},
{
"epoch": 16.05109900432087,
"grad_norm": 0.8820509910583496,
"learning_rate": 1.1287633811030268e-05,
"loss": 0.1394,
"step": 10690
},
{
"epoch": 16.066128123238776,
"grad_norm": 0.9373833537101746,
"learning_rate": 1.1204715397093734e-05,
"loss": 0.1347,
"step": 10700
},
{
"epoch": 16.081157242156678,
"grad_norm": 0.7921836376190186,
"learning_rate": 1.1122064216188183e-05,
"loss": 0.1368,
"step": 10710
},
{
"epoch": 16.096186361074583,
"grad_norm": 0.7020202875137329,
"learning_rate": 1.1039680837638594e-05,
"loss": 0.1403,
"step": 10720
},
{
"epoch": 16.111215479992484,
"grad_norm": 0.7879025340080261,
"learning_rate": 1.0957565828925293e-05,
"loss": 0.1319,
"step": 10730
},
{
"epoch": 16.12624459891039,
"grad_norm": 0.7713704705238342,
"learning_rate": 1.0875719755679936e-05,
"loss": 0.1335,
"step": 10740
},
{
"epoch": 16.14127371782829,
"grad_norm": 0.8271151185035706,
"learning_rate": 1.0794143181681782e-05,
"loss": 0.1357,
"step": 10750
},
{
"epoch": 16.156302836746196,
"grad_norm": 0.7664535641670227,
"learning_rate": 1.0712836668853582e-05,
"loss": 0.137,
"step": 10760
},
{
"epoch": 16.171331955664098,
"grad_norm": 0.8511399626731873,
"learning_rate": 1.063180077725791e-05,
"loss": 0.151,
"step": 10770
},
{
"epoch": 16.186361074582003,
"grad_norm": 0.8683989644050598,
"learning_rate": 1.0551036065093172e-05,
"loss": 0.1416,
"step": 10780
},
{
"epoch": 16.201390193499908,
"grad_norm": 0.8145375847816467,
"learning_rate": 1.0470543088689855e-05,
"loss": 0.1364,
"step": 10790
},
{
"epoch": 16.21641931241781,
"grad_norm": 0.9890855550765991,
"learning_rate": 1.0390322402506619e-05,
"loss": 0.1312,
"step": 10800
},
{
"epoch": 16.231448431335714,
"grad_norm": 0.7960677742958069,
"learning_rate": 1.0310374559126551e-05,
"loss": 0.1259,
"step": 10810
},
{
"epoch": 16.246477550253616,
"grad_norm": 0.7810579538345337,
"learning_rate": 1.0230700109253256e-05,
"loss": 0.1476,
"step": 10820
},
{
"epoch": 16.26150666917152,
"grad_norm": 0.7869362235069275,
"learning_rate": 1.0151299601707187e-05,
"loss": 0.1326,
"step": 10830
},
{
"epoch": 16.276535788089422,
"grad_norm": 0.7896257042884827,
"learning_rate": 1.0072173583421769e-05,
"loss": 0.1414,
"step": 10840
},
{
"epoch": 16.291564907007327,
"grad_norm": 0.8226996660232544,
"learning_rate": 9.993322599439692e-06,
"loss": 0.1437,
"step": 10850
},
{
"epoch": 16.30659402592523,
"grad_norm": 0.8732724785804749,
"learning_rate": 9.914747192909096e-06,
"loss": 0.1286,
"step": 10860
},
{
"epoch": 16.321623144843134,
"grad_norm": 0.8967133164405823,
"learning_rate": 9.836447905079905e-06,
"loss": 0.1476,
"step": 10870
},
{
"epoch": 16.33665226376104,
"grad_norm": 0.8874047994613647,
"learning_rate": 9.758425275299999e-06,
"loss": 0.1301,
"step": 10880
},
{
"epoch": 16.35168138267894,
"grad_norm": 0.7454355359077454,
"learning_rate": 9.680679841011652e-06,
"loss": 0.1466,
"step": 10890
},
{
"epoch": 16.366710501596845,
"grad_norm": 0.9600047469139099,
"learning_rate": 9.603212137747641e-06,
"loss": 0.1384,
"step": 10900
},
{
"epoch": 16.381739620514747,
"grad_norm": 1.0687470436096191,
"learning_rate": 9.526022699127718e-06,
"loss": 0.1337,
"step": 10910
},
{
"epoch": 16.396768739432652,
"grad_norm": 0.7660526633262634,
"learning_rate": 9.449112056854813e-06,
"loss": 0.1372,
"step": 10920
},
{
"epoch": 16.411797858350553,
"grad_norm": 0.7811424136161804,
"learning_rate": 9.372480740711475e-06,
"loss": 0.1368,
"step": 10930
},
{
"epoch": 16.42682697726846,
"grad_norm": 0.9468358159065247,
"learning_rate": 9.296129278556155e-06,
"loss": 0.1399,
"step": 10940
},
{
"epoch": 16.44185609618636,
"grad_norm": 0.799017071723938,
"learning_rate": 9.220058196319598e-06,
"loss": 0.1439,
"step": 10950
},
{
"epoch": 16.456885215104265,
"grad_norm": 0.811414361000061,
"learning_rate": 9.144268018001184e-06,
"loss": 0.1445,
"step": 10960
},
{
"epoch": 16.471914334022166,
"grad_norm": 0.8114548325538635,
"learning_rate": 9.068759265665384e-06,
"loss": 0.1478,
"step": 10970
},
{
"epoch": 16.48694345294007,
"grad_norm": 0.753917932510376,
"learning_rate": 8.993532459438098e-06,
"loss": 0.1432,
"step": 10980
},
{
"epoch": 16.501972571857976,
"grad_norm": 0.8858105540275574,
"learning_rate": 8.91858811750313e-06,
"loss": 0.1367,
"step": 10990
},
{
"epoch": 16.517001690775878,
"grad_norm": 0.7127811312675476,
"learning_rate": 8.843926756098547e-06,
"loss": 0.1342,
"step": 11000
},
{
"epoch": 16.532030809693783,
"grad_norm": 0.8266831636428833,
"learning_rate": 8.769548889513212e-06,
"loss": 0.1492,
"step": 11010
},
{
"epoch": 16.547059928611684,
"grad_norm": 0.9057301878929138,
"learning_rate": 8.695455030083144e-06,
"loss": 0.1474,
"step": 11020
},
{
"epoch": 16.56208904752959,
"grad_norm": 0.7918298840522766,
"learning_rate": 8.621645688188085e-06,
"loss": 0.1388,
"step": 11030
},
{
"epoch": 16.57711816644749,
"grad_norm": 0.8264976739883423,
"learning_rate": 8.548121372247918e-06,
"loss": 0.1449,
"step": 11040
},
{
"epoch": 16.592147285365396,
"grad_norm": 0.9591594934463501,
"learning_rate": 8.474882588719196e-06,
"loss": 0.1436,
"step": 11050
},
{
"epoch": 16.607176404283297,
"grad_norm": 0.8288829326629639,
"learning_rate": 8.401929842091616e-06,
"loss": 0.1291,
"step": 11060
},
{
"epoch": 16.622205523201202,
"grad_norm": 0.865283191204071,
"learning_rate": 8.329263634884598e-06,
"loss": 0.1443,
"step": 11070
},
{
"epoch": 16.637234642119104,
"grad_norm": 0.8038478493690491,
"learning_rate": 8.256884467643788e-06,
"loss": 0.1409,
"step": 11080
},
{
"epoch": 16.65226376103701,
"grad_norm": 0.7755337357521057,
"learning_rate": 8.184792838937633e-06,
"loss": 0.1378,
"step": 11090
},
{
"epoch": 16.667292879954914,
"grad_norm": 0.7843419313430786,
"learning_rate": 8.112989245353896e-06,
"loss": 0.1532,
"step": 11100
},
{
"epoch": 16.682321998872816,
"grad_norm": 0.7573866248130798,
"learning_rate": 8.0414741814963e-06,
"loss": 0.1451,
"step": 11110
},
{
"epoch": 16.69735111779072,
"grad_norm": 0.8233633637428284,
"learning_rate": 7.97024813998109e-06,
"loss": 0.1364,
"step": 11120
},
{
"epoch": 16.712380236708622,
"grad_norm": 0.8834894895553589,
"learning_rate": 7.899311611433646e-06,
"loss": 0.1431,
"step": 11130
},
{
"epoch": 16.727409355626527,
"grad_norm": 0.8282538056373596,
"learning_rate": 7.828665084485076e-06,
"loss": 0.1316,
"step": 11140
},
{
"epoch": 16.74243847454443,
"grad_norm": 0.7527298927307129,
"learning_rate": 7.758309045768908e-06,
"loss": 0.1465,
"step": 11150
},
{
"epoch": 16.757467593462334,
"grad_norm": 0.7522730827331543,
"learning_rate": 7.688243979917664e-06,
"loss": 0.1386,
"step": 11160
},
{
"epoch": 16.772496712380235,
"grad_norm": 0.949739933013916,
"learning_rate": 7.6184703695595936e-06,
"loss": 0.1317,
"step": 11170
},
{
"epoch": 16.78752583129814,
"grad_norm": 0.8552820086479187,
"learning_rate": 7.5489886953153125e-06,
"loss": 0.1313,
"step": 11180
},
{
"epoch": 16.802554950216045,
"grad_norm": 0.7522038817405701,
"learning_rate": 7.479799435794499e-06,
"loss": 0.1399,
"step": 11190
},
{
"epoch": 16.817584069133947,
"grad_norm": 0.8218302726745605,
"learning_rate": 7.410903067592562e-06,
"loss": 0.139,
"step": 11200
},
{
"epoch": 16.83261318805185,
"grad_norm": 0.7487614154815674,
"learning_rate": 7.342300065287439e-06,
"loss": 0.1462,
"step": 11210
},
{
"epoch": 16.847642306969753,
"grad_norm": 0.8830420970916748,
"learning_rate": 7.273990901436245e-06,
"loss": 0.1466,
"step": 11220
},
{
"epoch": 16.862671425887658,
"grad_norm": 1.094682216644287,
"learning_rate": 7.2059760465720825e-06,
"loss": 0.1473,
"step": 11230
},
{
"epoch": 16.87770054480556,
"grad_norm": 0.7629777789115906,
"learning_rate": 7.1382559692007245e-06,
"loss": 0.1385,
"step": 11240
},
{
"epoch": 16.892729663723465,
"grad_norm": 0.7562497854232788,
"learning_rate": 7.070831135797473e-06,
"loss": 0.1454,
"step": 11250
},
{
"epoch": 16.907758782641366,
"grad_norm": 0.8945866823196411,
"learning_rate": 7.003702010803892e-06,
"loss": 0.1405,
"step": 11260
},
{
"epoch": 16.92278790155927,
"grad_norm": 0.7205698490142822,
"learning_rate": 6.936869056624623e-06,
"loss": 0.1475,
"step": 11270
},
{
"epoch": 16.937817020477176,
"grad_norm": 0.8356210589408875,
"learning_rate": 6.870332733624174e-06,
"loss": 0.1431,
"step": 11280
},
{
"epoch": 16.952846139395078,
"grad_norm": 0.8396646976470947,
"learning_rate": 6.8040935001238256e-06,
"loss": 0.1426,
"step": 11290
},
{
"epoch": 16.967875258312983,
"grad_norm": 0.9201752543449402,
"learning_rate": 6.738151812398352e-06,
"loss": 0.1434,
"step": 11300
},
{
"epoch": 16.982904377230884,
"grad_norm": 0.9603893756866455,
"learning_rate": 6.67250812467301e-06,
"loss": 0.142,
"step": 11310
},
{
"epoch": 16.99793349614879,
"grad_norm": 0.7966869473457336,
"learning_rate": 6.607162889120305e-06,
"loss": 0.155,
"step": 11320
},
{
"epoch": 17.012023295134323,
"grad_norm": 0.5946935415267944,
"learning_rate": 6.542116555856953e-06,
"loss": 0.1274,
"step": 11330
},
{
"epoch": 17.027052414052225,
"grad_norm": 0.774712324142456,
"learning_rate": 6.477369572940706e-06,
"loss": 0.1221,
"step": 11340
},
{
"epoch": 17.04208153297013,
"grad_norm": 0.7754786610603333,
"learning_rate": 6.412922386367332e-06,
"loss": 0.1317,
"step": 11350
},
{
"epoch": 17.05711065188803,
"grad_norm": 0.6870192885398865,
"learning_rate": 6.348775440067506e-06,
"loss": 0.1174,
"step": 11360
},
{
"epoch": 17.072139770805936,
"grad_norm": 0.8024049401283264,
"learning_rate": 6.284929175903786e-06,
"loss": 0.127,
"step": 11370
},
{
"epoch": 17.08716888972384,
"grad_norm": 0.752888023853302,
"learning_rate": 6.2213840336674936e-06,
"loss": 0.1207,
"step": 11380
},
{
"epoch": 17.102198008641743,
"grad_norm": 0.7125491499900818,
"learning_rate": 6.158140451075795e-06,
"loss": 0.1351,
"step": 11390
},
{
"epoch": 17.117227127559648,
"grad_norm": 0.7468791007995605,
"learning_rate": 6.095198863768564e-06,
"loss": 0.131,
"step": 11400
},
{
"epoch": 17.13225624647755,
"grad_norm": 0.8037786483764648,
"learning_rate": 6.032559705305523e-06,
"loss": 0.1308,
"step": 11410
},
{
"epoch": 17.147285365395454,
"grad_norm": 0.7919206023216248,
"learning_rate": 5.9702234071631e-06,
"loss": 0.1234,
"step": 11420
},
{
"epoch": 17.162314484313356,
"grad_norm": 0.7676987051963806,
"learning_rate": 5.9081903987316e-06,
"loss": 0.1197,
"step": 11430
},
{
"epoch": 17.17734360323126,
"grad_norm": 1.1687105894088745,
"learning_rate": 5.8464611073121235e-06,
"loss": 0.1241,
"step": 11440
},
{
"epoch": 17.192372722149162,
"grad_norm": 0.7436251044273376,
"learning_rate": 5.785035958113716e-06,
"loss": 0.1288,
"step": 11450
},
{
"epoch": 17.207401841067067,
"grad_norm": 0.656187117099762,
"learning_rate": 5.7239153742503995e-06,
"loss": 0.1187,
"step": 11460
},
{
"epoch": 17.222430959984973,
"grad_norm": 0.6904690265655518,
"learning_rate": 5.663099776738273e-06,
"loss": 0.1366,
"step": 11470
},
{
"epoch": 17.237460078902874,
"grad_norm": 0.8284912109375,
"learning_rate": 5.602589584492562e-06,
"loss": 0.1242,
"step": 11480
},
{
"epoch": 17.25248919782078,
"grad_norm": 0.8081623911857605,
"learning_rate": 5.542385214324819e-06,
"loss": 0.1234,
"step": 11490
},
{
"epoch": 17.26751831673868,
"grad_norm": 1.1938631534576416,
"learning_rate": 5.48248708093998e-06,
"loss": 0.1326,
"step": 11500
},
{
"epoch": 17.282547435656586,
"grad_norm": 0.6938109993934631,
"learning_rate": 5.422895596933558e-06,
"loss": 0.1305,
"step": 11510
},
{
"epoch": 17.297576554574487,
"grad_norm": 0.7339420914649963,
"learning_rate": 5.36361117278874e-06,
"loss": 0.1206,
"step": 11520
},
{
"epoch": 17.312605673492392,
"grad_norm": 0.7437239289283752,
"learning_rate": 5.304634216873633e-06,
"loss": 0.1205,
"step": 11530
},
{
"epoch": 17.327634792410294,
"grad_norm": 0.7222012281417847,
"learning_rate": 5.24596513543838e-06,
"loss": 0.1219,
"step": 11540
},
{
"epoch": 17.3426639113282,
"grad_norm": 0.8264778852462769,
"learning_rate": 5.187604332612445e-06,
"loss": 0.1318,
"step": 11550
},
{
"epoch": 17.3576930302461,
"grad_norm": 0.7213618159294128,
"learning_rate": 5.129552210401728e-06,
"loss": 0.1203,
"step": 11560
},
{
"epoch": 17.372722149164005,
"grad_norm": 0.7722398638725281,
"learning_rate": 5.071809168685887e-06,
"loss": 0.1266,
"step": 11570
},
{
"epoch": 17.38775126808191,
"grad_norm": 0.8326044678688049,
"learning_rate": 5.014375605215521e-06,
"loss": 0.1267,
"step": 11580
},
{
"epoch": 17.40278038699981,
"grad_norm": 0.886371374130249,
"learning_rate": 4.957251915609462e-06,
"loss": 0.119,
"step": 11590
},
{
"epoch": 17.417809505917717,
"grad_norm": 0.7517515420913696,
"learning_rate": 4.900438493352055e-06,
"loss": 0.1291,
"step": 11600
},
{
"epoch": 17.432838624835618,
"grad_norm": 0.8436376452445984,
"learning_rate": 4.843935729790422e-06,
"loss": 0.1336,
"step": 11610
},
{
"epoch": 17.447867743753523,
"grad_norm": 0.8188118934631348,
"learning_rate": 4.7877440141317675e-06,
"loss": 0.1276,
"step": 11620
},
{
"epoch": 17.462896862671425,
"grad_norm": 0.7850053310394287,
"learning_rate": 4.731863733440733e-06,
"loss": 0.1263,
"step": 11630
},
{
"epoch": 17.47792598158933,
"grad_norm": 0.7156862616539001,
"learning_rate": 4.676295272636688e-06,
"loss": 0.1371,
"step": 11640
},
{
"epoch": 17.49295510050723,
"grad_norm": 0.9043847322463989,
"learning_rate": 4.621039014491119e-06,
"loss": 0.136,
"step": 11650
},
{
"epoch": 17.507984219425136,
"grad_norm": 0.7520122528076172,
"learning_rate": 4.566095339624943e-06,
"loss": 0.1278,
"step": 11660
},
{
"epoch": 17.52301333834304,
"grad_norm": 0.8322932124137878,
"learning_rate": 4.511464626505935e-06,
"loss": 0.1178,
"step": 11670
},
{
"epoch": 17.538042457260943,
"grad_norm": 0.7075957655906677,
"learning_rate": 4.457147251446075e-06,
"loss": 0.1295,
"step": 11680
},
{
"epoch": 17.553071576178848,
"grad_norm": 0.7323919534683228,
"learning_rate": 4.403143588599029e-06,
"loss": 0.1272,
"step": 11690
},
{
"epoch": 17.56810069509675,
"grad_norm": 0.9109891653060913,
"learning_rate": 4.349454009957471e-06,
"loss": 0.1236,
"step": 11700
},
{
"epoch": 17.583129814014654,
"grad_norm": 0.8152607679367065,
"learning_rate": 4.296078885350607e-06,
"loss": 0.1267,
"step": 11710
},
{
"epoch": 17.598158932932556,
"grad_norm": 0.7224797606468201,
"learning_rate": 4.2430185824415715e-06,
"loss": 0.1355,
"step": 11720
},
{
"epoch": 17.61318805185046,
"grad_norm": 0.7984783053398132,
"learning_rate": 4.190273466724925e-06,
"loss": 0.1364,
"step": 11730
},
{
"epoch": 17.628217170768362,
"grad_norm": 0.9017600417137146,
"learning_rate": 4.137843901524141e-06,
"loss": 0.1281,
"step": 11740
},
{
"epoch": 17.643246289686267,
"grad_norm": 0.7681065797805786,
"learning_rate": 4.085730247989078e-06,
"loss": 0.1234,
"step": 11750
},
{
"epoch": 17.65827540860417,
"grad_norm": 0.7442010045051575,
"learning_rate": 4.033932865093499e-06,
"loss": 0.1331,
"step": 11760
},
{
"epoch": 17.673304527522074,
"grad_norm": 0.7311212420463562,
"learning_rate": 3.982452109632617e-06,
"loss": 0.1336,
"step": 11770
},
{
"epoch": 17.68833364643998,
"grad_norm": 0.7073860764503479,
"learning_rate": 3.931288336220617e-06,
"loss": 0.1263,
"step": 11780
},
{
"epoch": 17.70336276535788,
"grad_norm": 0.6838569641113281,
"learning_rate": 3.880441897288234e-06,
"loss": 0.1299,
"step": 11790
},
{
"epoch": 17.718391884275785,
"grad_norm": 0.9706346988677979,
"learning_rate": 3.829913143080283e-06,
"loss": 0.1276,
"step": 11800
},
{
"epoch": 17.733421003193687,
"grad_norm": 0.7603088617324829,
"learning_rate": 3.7797024216533138e-06,
"loss": 0.1263,
"step": 11810
},
{
"epoch": 17.748450122111592,
"grad_norm": 0.7066922187805176,
"learning_rate": 3.729810078873125e-06,
"loss": 0.1284,
"step": 11820
},
{
"epoch": 17.763479241029493,
"grad_norm": 0.7454369068145752,
"learning_rate": 3.6802364584124947e-06,
"loss": 0.124,
"step": 11830
},
{
"epoch": 17.7785083599474,
"grad_norm": 0.7552350759506226,
"learning_rate": 3.6309819017487034e-06,
"loss": 0.1259,
"step": 11840
},
{
"epoch": 17.7935374788653,
"grad_norm": 0.8061559200286865,
"learning_rate": 3.5820467481612496e-06,
"loss": 0.126,
"step": 11850
},
{
"epoch": 17.808566597783205,
"grad_norm": 0.6990138292312622,
"learning_rate": 3.5334313347294757e-06,
"loss": 0.1271,
"step": 11860
},
{
"epoch": 17.82359571670111,
"grad_norm": 0.7601016163825989,
"learning_rate": 3.4851359963302798e-06,
"loss": 0.1397,
"step": 11870
},
{
"epoch": 17.83862483561901,
"grad_norm": 0.7683603167533875,
"learning_rate": 3.43716106563578e-06,
"loss": 0.1376,
"step": 11880
},
{
"epoch": 17.853653954536917,
"grad_norm": 0.8137221932411194,
"learning_rate": 3.3895068731110534e-06,
"loss": 0.122,
"step": 11890
},
{
"epoch": 17.868683073454818,
"grad_norm": 0.8366261124610901,
"learning_rate": 3.342173747011801e-06,
"loss": 0.1273,
"step": 11900
},
{
"epoch": 17.883712192372723,
"grad_norm": 0.8289967179298401,
"learning_rate": 3.295162013382164e-06,
"loss": 0.1274,
"step": 11910
},
{
"epoch": 17.898741311290625,
"grad_norm": 0.6871482133865356,
"learning_rate": 3.248471996052432e-06,
"loss": 0.1357,
"step": 11920
},
{
"epoch": 17.91377043020853,
"grad_norm": 0.7140630483627319,
"learning_rate": 3.202104016636814e-06,
"loss": 0.1247,
"step": 11930
},
{
"epoch": 17.92879954912643,
"grad_norm": 0.7578158974647522,
"learning_rate": 3.156058394531225e-06,
"loss": 0.1285,
"step": 11940
},
{
"epoch": 17.943828668044336,
"grad_norm": 0.718285858631134,
"learning_rate": 3.1103354469111056e-06,
"loss": 0.1285,
"step": 11950
},
{
"epoch": 17.958857786962238,
"grad_norm": 0.7415304780006409,
"learning_rate": 3.0649354887291925e-06,
"loss": 0.1259,
"step": 11960
},
{
"epoch": 17.973886905880143,
"grad_norm": 0.7331326007843018,
"learning_rate": 3.019858832713435e-06,
"loss": 0.1264,
"step": 11970
},
{
"epoch": 17.988916024798048,
"grad_norm": 0.7621225714683533,
"learning_rate": 2.9751057893647237e-06,
"loss": 0.1306,
"step": 11980
},
{
"epoch": 18.00300582378358,
"grad_norm": 0.6445237994194031,
"learning_rate": 2.930676666954846e-06,
"loss": 0.1289,
"step": 11990
},
{
"epoch": 18.018034942701483,
"grad_norm": 0.6551523208618164,
"learning_rate": 2.8865717715243212e-06,
"loss": 0.123,
"step": 12000
},
{
"epoch": 18.03306406161939,
"grad_norm": 0.6718552708625793,
"learning_rate": 2.842791406880291e-06,
"loss": 0.1254,
"step": 12010
},
{
"epoch": 18.04809318053729,
"grad_norm": 0.653846263885498,
"learning_rate": 2.7993358745944608e-06,
"loss": 0.1237,
"step": 12020
},
{
"epoch": 18.063122299455195,
"grad_norm": 0.7196510434150696,
"learning_rate": 2.756205474000978e-06,
"loss": 0.1162,
"step": 12030
},
{
"epoch": 18.078151418373096,
"grad_norm": 0.6618478894233704,
"learning_rate": 2.7134005021943852e-06,
"loss": 0.117,
"step": 12040
},
{
"epoch": 18.093180537291,
"grad_norm": 0.8368316292762756,
"learning_rate": 2.670921254027592e-06,
"loss": 0.1205,
"step": 12050
},
{
"epoch": 18.108209656208906,
"grad_norm": 0.6879215836524963,
"learning_rate": 2.6287680221098233e-06,
"loss": 0.1171,
"step": 12060
},
{
"epoch": 18.123238775126808,
"grad_norm": 0.7069093585014343,
"learning_rate": 2.5869410968046294e-06,
"loss": 0.1235,
"step": 12070
},
{
"epoch": 18.138267894044713,
"grad_norm": 0.6723190546035767,
"learning_rate": 2.5454407662278244e-06,
"loss": 0.1085,
"step": 12080
},
{
"epoch": 18.153297012962614,
"grad_norm": 0.6698660850524902,
"learning_rate": 2.5042673162455954e-06,
"loss": 0.1195,
"step": 12090
},
{
"epoch": 18.16832613188052,
"grad_norm": 0.6730449795722961,
"learning_rate": 2.463421030472429e-06,
"loss": 0.1139,
"step": 12100
},
{
"epoch": 18.18335525079842,
"grad_norm": 0.805294394493103,
"learning_rate": 2.422902190269266e-06,
"loss": 0.1242,
"step": 12110
},
{
"epoch": 18.198384369716326,
"grad_norm": 1.0811830759048462,
"learning_rate": 2.3827110747414785e-06,
"loss": 0.1195,
"step": 12120
},
{
"epoch": 18.213413488634227,
"grad_norm": 0.6854028105735779,
"learning_rate": 2.342847960736966e-06,
"loss": 0.119,
"step": 12130
},
{
"epoch": 18.228442607552132,
"grad_norm": 0.6735851764678955,
"learning_rate": 2.303313122844286e-06,
"loss": 0.1321,
"step": 12140
},
{
"epoch": 18.243471726470037,
"grad_norm": 0.7301083207130432,
"learning_rate": 2.264106833390722e-06,
"loss": 0.1204,
"step": 12150
},
{
"epoch": 18.25850084538794,
"grad_norm": 0.7372903823852539,
"learning_rate": 2.2252293624404176e-06,
"loss": 0.1201,
"step": 12160
},
{
"epoch": 18.273529964305844,
"grad_norm": 0.6305893659591675,
"learning_rate": 2.1866809777925324e-06,
"loss": 0.1128,
"step": 12170
},
{
"epoch": 18.288559083223745,
"grad_norm": 0.7112670540809631,
"learning_rate": 2.148461944979385e-06,
"loss": 0.1172,
"step": 12180
},
{
"epoch": 18.30358820214165,
"grad_norm": 0.6915646195411682,
"learning_rate": 2.1105725272646094e-06,
"loss": 0.1197,
"step": 12190
},
{
"epoch": 18.318617321059552,
"grad_norm": 0.6650305986404419,
"learning_rate": 2.0730129856413707e-06,
"loss": 0.121,
"step": 12200
},
{
"epoch": 18.333646439977457,
"grad_norm": 0.6500080823898315,
"learning_rate": 2.0357835788305467e-06,
"loss": 0.1209,
"step": 12210
},
{
"epoch": 18.34867555889536,
"grad_norm": 0.7032843828201294,
"learning_rate": 1.998884563278963e-06,
"loss": 0.1194,
"step": 12220
},
{
"epoch": 18.363704677813264,
"grad_norm": 0.6876169443130493,
"learning_rate": 1.962316193157593e-06,
"loss": 0.117,
"step": 12230
},
{
"epoch": 18.378733796731165,
"grad_norm": 0.6640487909317017,
"learning_rate": 1.926078720359853e-06,
"loss": 0.1246,
"step": 12240
},
{
"epoch": 18.39376291564907,
"grad_norm": 0.7534406185150146,
"learning_rate": 1.8901723944998118e-06,
"loss": 0.1175,
"step": 12250
},
{
"epoch": 18.408792034566975,
"grad_norm": 0.7041878700256348,
"learning_rate": 1.8545974629105622e-06,
"loss": 0.1191,
"step": 12260
},
{
"epoch": 18.423821153484877,
"grad_norm": 0.6589450240135193,
"learning_rate": 1.81935417064239e-06,
"loss": 0.1155,
"step": 12270
},
{
"epoch": 18.43885027240278,
"grad_norm": 0.6730456352233887,
"learning_rate": 1.7844427604612024e-06,
"loss": 0.1283,
"step": 12280
},
{
"epoch": 18.453879391320683,
"grad_norm": 0.7545807361602783,
"learning_rate": 1.74986347284678e-06,
"loss": 0.114,
"step": 12290
},
{
"epoch": 18.468908510238588,
"grad_norm": 0.720689058303833,
"learning_rate": 1.7156165459911665e-06,
"loss": 0.1228,
"step": 12300
},
{
"epoch": 18.48393762915649,
"grad_norm": 0.6629992723464966,
"learning_rate": 1.6817022157970042e-06,
"loss": 0.1171,
"step": 12310
},
{
"epoch": 18.498966748074395,
"grad_norm": 0.6659217476844788,
"learning_rate": 1.648120715875906e-06,
"loss": 0.1133,
"step": 12320
},
{
"epoch": 18.513995866992296,
"grad_norm": 0.6609564423561096,
"learning_rate": 1.6148722775468639e-06,
"loss": 0.1343,
"step": 12330
},
{
"epoch": 18.5290249859102,
"grad_norm": 0.6903553009033203,
"learning_rate": 1.581957129834638e-06,
"loss": 0.1182,
"step": 12340
},
{
"epoch": 18.544054104828106,
"grad_norm": 0.7767003178596497,
"learning_rate": 1.5493754994681976e-06,
"loss": 0.122,
"step": 12350
},
{
"epoch": 18.559083223746008,
"grad_norm": 0.6776891350746155,
"learning_rate": 1.5171276108791544e-06,
"loss": 0.1129,
"step": 12360
},
{
"epoch": 18.574112342663913,
"grad_norm": 0.6937426924705505,
"learning_rate": 1.4852136862001764e-06,
"loss": 0.1136,
"step": 12370
},
{
"epoch": 18.589141461581814,
"grad_norm": 0.7074488401412964,
"learning_rate": 1.4536339452635384e-06,
"loss": 0.1126,
"step": 12380
},
{
"epoch": 18.60417058049972,
"grad_norm": 0.6760552525520325,
"learning_rate": 1.4223886055995172e-06,
"loss": 0.1227,
"step": 12390
},
{
"epoch": 18.61919969941762,
"grad_norm": 0.7237436175346375,
"learning_rate": 1.3914778824349884e-06,
"loss": 0.1208,
"step": 12400
},
{
"epoch": 18.634228818335526,
"grad_norm": 0.6534668803215027,
"learning_rate": 1.3609019886918427e-06,
"loss": 0.1171,
"step": 12410
},
{
"epoch": 18.649257937253427,
"grad_norm": 0.6551641225814819,
"learning_rate": 1.3306611349856112e-06,
"loss": 0.1184,
"step": 12420
},
{
"epoch": 18.664287056171332,
"grad_norm": 0.681528627872467,
"learning_rate": 1.300755529623937e-06,
"loss": 0.1203,
"step": 12430
},
{
"epoch": 18.679316175089234,
"grad_norm": 0.7110047340393066,
"learning_rate": 1.2711853786052109e-06,
"loss": 0.1227,
"step": 12440
},
{
"epoch": 18.69434529400714,
"grad_norm": 0.7127984166145325,
"learning_rate": 1.241950885617088e-06,
"loss": 0.1192,
"step": 12450
},
{
"epoch": 18.709374412925044,
"grad_norm": 0.9400015473365784,
"learning_rate": 1.2130522520351405e-06,
"loss": 0.1206,
"step": 12460
},
{
"epoch": 18.724403531842945,
"grad_norm": 0.640738844871521,
"learning_rate": 1.1844896769214186e-06,
"loss": 0.125,
"step": 12470
},
{
"epoch": 18.73943265076085,
"grad_norm": 0.6960272789001465,
"learning_rate": 1.1562633570231352e-06,
"loss": 0.1181,
"step": 12480
},
{
"epoch": 18.754461769678752,
"grad_norm": 0.7713277339935303,
"learning_rate": 1.128373486771256e-06,
"loss": 0.1183,
"step": 12490
},
{
"epoch": 18.769490888596657,
"grad_norm": 0.6949428915977478,
"learning_rate": 1.1008202582792004e-06,
"loss": 0.1308,
"step": 12500
},
{
"epoch": 18.78452000751456,
"grad_norm": 0.6489851474761963,
"learning_rate": 1.0736038613414878e-06,
"loss": 0.1288,
"step": 12510
},
{
"epoch": 18.799549126432463,
"grad_norm": 0.7511118054389954,
"learning_rate": 1.0467244834324707e-06,
"loss": 0.1098,
"step": 12520
},
{
"epoch": 18.814578245350365,
"grad_norm": 0.7278922200202942,
"learning_rate": 1.0201823097049812e-06,
"loss": 0.1248,
"step": 12530
},
{
"epoch": 18.82960736426827,
"grad_norm": 0.7048822641372681,
"learning_rate": 9.939775229891313e-07,
"loss": 0.1201,
"step": 12540
},
{
"epoch": 18.84463648318617,
"grad_norm": 0.7828486561775208,
"learning_rate": 9.681103037909866e-07,
"loss": 0.1271,
"step": 12550
},
{
"epoch": 18.859665602104076,
"grad_norm": 0.6916821002960205,
"learning_rate": 9.42580830291373e-07,
"loss": 0.1151,
"step": 12560
},
{
"epoch": 18.87469472102198,
"grad_norm": 0.7299247980117798,
"learning_rate": 9.173892783445992e-07,
"loss": 0.1287,
"step": 12570
},
{
"epoch": 18.889723839939883,
"grad_norm": 0.8514544367790222,
"learning_rate": 8.925358214772972e-07,
"loss": 0.1261,
"step": 12580
},
{
"epoch": 18.904752958857788,
"grad_norm": 0.6913233995437622,
"learning_rate": 8.680206308871952e-07,
"loss": 0.1091,
"step": 12590
},
{
"epoch": 18.91978207777569,
"grad_norm": 0.7069427967071533,
"learning_rate": 8.43843875441952e-07,
"loss": 0.1242,
"step": 12600
},
{
"epoch": 18.934811196693595,
"grad_norm": 0.6860793232917786,
"learning_rate": 8.2000572167798e-07,
"loss": 0.1245,
"step": 12610
},
{
"epoch": 18.949840315611496,
"grad_norm": 0.6952442526817322,
"learning_rate": 7.965063337993017e-07,
"loss": 0.1194,
"step": 12620
},
{
"epoch": 18.9648694345294,
"grad_norm": 0.7195196747779846,
"learning_rate": 7.733458736764398e-07,
"loss": 0.1266,
"step": 12630
},
{
"epoch": 18.979898553447303,
"grad_norm": 0.685310959815979,
"learning_rate": 7.505245008452788e-07,
"loss": 0.1153,
"step": 12640
},
{
"epoch": 18.994927672365208,
"grad_norm": 0.6967130899429321,
"learning_rate": 7.280423725059604e-07,
"loss": 0.1331,
"step": 12650
},
{
"epoch": 19.00901747135074,
"grad_norm": 0.5955845713615417,
"learning_rate": 7.058996435218346e-07,
"loss": 0.1032,
"step": 12660
},
{
"epoch": 19.024046590268647,
"grad_norm": 0.6826702356338501,
"learning_rate": 6.840964664183436e-07,
"loss": 0.1116,
"step": 12670
},
{
"epoch": 19.039075709186548,
"grad_norm": 0.6504730582237244,
"learning_rate": 6.626329913820339e-07,
"loss": 0.1218,
"step": 12680
},
{
"epoch": 19.054104828104453,
"grad_norm": 0.6690040230751038,
"learning_rate": 6.415093662594629e-07,
"loss": 0.1218,
"step": 12690
},
{
"epoch": 19.069133947022355,
"grad_norm": 0.7162594199180603,
"learning_rate": 6.207257365562047e-07,
"loss": 0.1148,
"step": 12700
},
{
"epoch": 19.08416306594026,
"grad_norm": 0.6570801734924316,
"learning_rate": 6.00282245435857e-07,
"loss": 0.1138,
"step": 12710
},
{
"epoch": 19.09919218485816,
"grad_norm": 0.6705721616744995,
"learning_rate": 5.80179033719036e-07,
"loss": 0.1241,
"step": 12720
},
{
"epoch": 19.114221303776066,
"grad_norm": 0.7230423092842102,
"learning_rate": 5.604162398824275e-07,
"loss": 0.1122,
"step": 12730
},
{
"epoch": 19.12925042269397,
"grad_norm": 0.6463306546211243,
"learning_rate": 5.409940000578206e-07,
"loss": 0.1085,
"step": 12740
},
{
"epoch": 19.144279541611873,
"grad_norm": 0.7528629302978516,
"learning_rate": 5.219124480311532e-07,
"loss": 0.1186,
"step": 12750
},
{
"epoch": 19.159308660529778,
"grad_norm": 1.4888911247253418,
"learning_rate": 5.031717152416238e-07,
"loss": 0.1158,
"step": 12760
},
{
"epoch": 19.17433777944768,
"grad_norm": 0.6441943645477295,
"learning_rate": 4.847719307807752e-07,
"loss": 0.1197,
"step": 12770
},
{
"epoch": 19.189366898365584,
"grad_norm": 0.6627583503723145,
"learning_rate": 4.6671322139158477e-07,
"loss": 0.1168,
"step": 12780
},
{
"epoch": 19.204396017283486,
"grad_norm": 0.6732495427131653,
"learning_rate": 4.4899571146761467e-07,
"loss": 0.1104,
"step": 12790
},
{
"epoch": 19.21942513620139,
"grad_norm": 0.6743932366371155,
"learning_rate": 4.3161952305215136e-07,
"loss": 0.1185,
"step": 12800
},
{
"epoch": 19.234454255119292,
"grad_norm": 0.7038917541503906,
"learning_rate": 4.145847758373511e-07,
"loss": 0.1216,
"step": 12810
},
{
"epoch": 19.249483374037197,
"grad_norm": 0.6505002975463867,
"learning_rate": 3.9789158716343475e-07,
"loss": 0.1247,
"step": 12820
},
{
"epoch": 19.2645124929551,
"grad_norm": 0.6234051585197449,
"learning_rate": 3.815400720178719e-07,
"loss": 0.1122,
"step": 12830
},
{
"epoch": 19.279541611873004,
"grad_norm": 0.6669496297836304,
"learning_rate": 3.6553034303457577e-07,
"loss": 0.1127,
"step": 12840
},
{
"epoch": 19.29457073079091,
"grad_norm": 0.7005789279937744,
"learning_rate": 3.49862510493143e-07,
"loss": 0.1135,
"step": 12850
},
{
"epoch": 19.30959984970881,
"grad_norm": 0.7209417223930359,
"learning_rate": 3.3453668231809286e-07,
"loss": 0.115,
"step": 12860
},
{
"epoch": 19.324628968626715,
"grad_norm": 0.670708179473877,
"learning_rate": 3.1955296407811807e-07,
"loss": 0.1147,
"step": 12870
},
{
"epoch": 19.339658087544617,
"grad_norm": 0.6531425714492798,
"learning_rate": 3.0491145898536856e-07,
"loss": 0.1153,
"step": 12880
},
{
"epoch": 19.354687206462522,
"grad_norm": 0.6748098134994507,
"learning_rate": 2.9061226789471873e-07,
"loss": 0.1098,
"step": 12890
},
{
"epoch": 19.369716325380423,
"grad_norm": 0.7407058477401733,
"learning_rate": 2.7665548930308484e-07,
"loss": 0.1186,
"step": 12900
},
{
"epoch": 19.38474544429833,
"grad_norm": 0.7474448680877686,
"learning_rate": 2.6304121934876966e-07,
"loss": 0.1167,
"step": 12910
},
{
"epoch": 19.39977456321623,
"grad_norm": 0.710455596446991,
"learning_rate": 2.497695518107579e-07,
"loss": 0.1256,
"step": 12920
},
{
"epoch": 19.414803682134135,
"grad_norm": 0.674196183681488,
"learning_rate": 2.3684057810808847e-07,
"loss": 0.1199,
"step": 12930
},
{
"epoch": 19.42983280105204,
"grad_norm": 0.6443490982055664,
"learning_rate": 2.2425438729924419e-07,
"loss": 0.1134,
"step": 12940
},
{
"epoch": 19.44486191996994,
"grad_norm": 0.6689858436584473,
"learning_rate": 2.120110660815078e-07,
"loss": 0.1213,
"step": 12950
},
{
"epoch": 19.459891038887847,
"grad_norm": 0.6597970128059387,
"learning_rate": 2.0011069879038447e-07,
"loss": 0.127,
"step": 12960
},
{
"epoch": 19.474920157805748,
"grad_norm": 0.6606748104095459,
"learning_rate": 1.8855336739901363e-07,
"loss": 0.1184,
"step": 12970
},
{
"epoch": 19.489949276723653,
"grad_norm": 0.6770042181015015,
"learning_rate": 1.773391515176026e-07,
"loss": 0.1199,
"step": 12980
},
{
"epoch": 19.504978395641555,
"grad_norm": 0.6483029723167419,
"learning_rate": 1.6646812839287706e-07,
"loss": 0.1094,
"step": 12990
},
{
"epoch": 19.52000751455946,
"grad_norm": 0.6776772737503052,
"learning_rate": 1.5594037290755925e-07,
"loss": 0.115,
"step": 13000
},
{
"epoch": 19.53503663347736,
"grad_norm": 0.6734815835952759,
"learning_rate": 1.4575595757985173e-07,
"loss": 0.1176,
"step": 13010
},
{
"epoch": 19.550065752395266,
"grad_norm": 0.671363353729248,
"learning_rate": 1.3591495256291554e-07,
"loss": 0.1158,
"step": 13020
},
{
"epoch": 19.565094871313168,
"grad_norm": 0.7096564769744873,
"learning_rate": 1.2641742564441506e-07,
"loss": 0.1178,
"step": 13030
},
{
"epoch": 19.580123990231073,
"grad_norm": 0.7112547755241394,
"learning_rate": 1.1726344224603502e-07,
"loss": 0.1186,
"step": 13040
},
{
"epoch": 19.595153109148978,
"grad_norm": 0.9371479153633118,
"learning_rate": 1.0845306542303645e-07,
"loss": 0.1158,
"step": 13050
},
{
"epoch": 19.61018222806688,
"grad_norm": 0.666856050491333,
"learning_rate": 9.998635586381255e-08,
"loss": 0.1151,
"step": 13060
},
{
"epoch": 19.625211346984784,
"grad_norm": 0.6255350708961487,
"learning_rate": 9.186337188949457e-08,
"loss": 0.1287,
"step": 13070
},
{
"epoch": 19.640240465902686,
"grad_norm": 0.6888746619224548,
"learning_rate": 8.408416945351328e-08,
"loss": 0.119,
"step": 13080
},
{
"epoch": 19.65526958482059,
"grad_norm": 0.6902468204498291,
"learning_rate": 7.664880214123815e-08,
"loss": 0.1199,
"step": 13090
},
{
"epoch": 19.670298703738492,
"grad_norm": 0.6694928407669067,
"learning_rate": 6.95573211696221e-08,
"loss": 0.1262,
"step": 13100
},
{
"epoch": 19.685327822656397,
"grad_norm": 0.6304376125335693,
"learning_rate": 6.280977538681288e-08,
"loss": 0.1196,
"step": 13110
},
{
"epoch": 19.7003569415743,
"grad_norm": 0.7109536528587341,
"learning_rate": 5.64062112718311e-08,
"loss": 0.1158,
"step": 13120
},
{
"epoch": 19.715386060492204,
"grad_norm": 0.6978461146354675,
"learning_rate": 5.0346672934270534e-08,
"loss": 0.1139,
"step": 13130
},
{
"epoch": 19.73041517941011,
"grad_norm": 0.6379060745239258,
"learning_rate": 4.4631202113953886e-08,
"loss": 0.1157,
"step": 13140
},
{
"epoch": 19.74544429832801,
"grad_norm": 0.6268938779830933,
"learning_rate": 3.925983818069412e-08,
"loss": 0.1086,
"step": 13150
},
{
"epoch": 19.760473417245915,
"grad_norm": 0.7297201156616211,
"learning_rate": 3.4232618133978044e-08,
"loss": 0.1132,
"step": 13160
},
{
"epoch": 19.775502536163817,
"grad_norm": 0.6648380756378174,
"learning_rate": 2.9549576602733164e-08,
"loss": 0.1124,
"step": 13170
},
{
"epoch": 19.790531655081722,
"grad_norm": 0.7137235999107361,
"learning_rate": 2.5210745845100082e-08,
"loss": 0.1165,
"step": 13180
},
{
"epoch": 19.805560773999623,
"grad_norm": 0.6801294684410095,
"learning_rate": 2.1216155748182696e-08,
"loss": 0.1155,
"step": 13190
},
{
"epoch": 19.82058989291753,
"grad_norm": 0.719840407371521,
"learning_rate": 1.756583382785948e-08,
"loss": 0.1261,
"step": 13200
},
{
"epoch": 19.83561901183543,
"grad_norm": 0.6777321696281433,
"learning_rate": 1.4259805228594713e-08,
"loss": 0.1172,
"step": 13210
},
{
"epoch": 19.850648130753335,
"grad_norm": 0.6588504314422607,
"learning_rate": 1.129809272326643e-08,
"loss": 0.1151,
"step": 13220
},
{
"epoch": 19.865677249671236,
"grad_norm": 0.6828821897506714,
"learning_rate": 8.680716712988756e-09,
"loss": 0.1176,
"step": 13230
},
{
"epoch": 19.88070636858914,
"grad_norm": 0.7881568670272827,
"learning_rate": 6.40769522700091e-09,
"loss": 0.1212,
"step": 13240
},
{
"epoch": 19.895735487507046,
"grad_norm": 0.6444976329803467,
"learning_rate": 4.479043922528403e-09,
"loss": 0.1141,
"step": 13250
},
{
"epoch": 19.910764606424948,
"grad_norm": 0.6598045825958252,
"learning_rate": 2.894776084672035e-09,
"loss": 0.1181,
"step": 13260
},
{
"epoch": 19.925793725342853,
"grad_norm": 0.6139656901359558,
"learning_rate": 1.654902626324617e-09,
"loss": 0.1222,
"step": 13270
},
{
"epoch": 19.940822844260754,
"grad_norm": 0.6389946341514587,
"learning_rate": 7.594320880821571e-10,
"loss": 0.1218,
"step": 13280
},
{
"epoch": 19.95585196317866,
"grad_norm": 0.6922657489776611,
"learning_rate": 2.0837063821055326e-10,
"loss": 0.1139,
"step": 13290
},
{
"epoch": 19.97088108209656,
"grad_norm": 0.6712486743927002,
"learning_rate": 1.7220725789801607e-12,
"loss": 0.1172,
"step": 13300
}
],
"logging_steps": 10,
"max_steps": 13300,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.732273085924172e+20,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}