RealQA-models / realqa /trainer_state.json
MingxingLi's picture
Upload folder using huggingface_hub
e28f08b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9998204829009962,
"eval_steps": 500,
"global_step": 5570,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003590341980073602,
"grad_norm": 6.1201090812683105,
"learning_rate": 1.7953321364452425e-06,
"loss": 2.9926,
"step": 10
},
{
"epoch": 0.007180683960147204,
"grad_norm": 4.0763983726501465,
"learning_rate": 3.590664272890485e-06,
"loss": 2.7866,
"step": 20
},
{
"epoch": 0.010771025940220806,
"grad_norm": 3.385845184326172,
"learning_rate": 5.385996409335727e-06,
"loss": 2.0378,
"step": 30
},
{
"epoch": 0.014361367920294408,
"grad_norm": 5.195909023284912,
"learning_rate": 7.18132854578097e-06,
"loss": 1.2251,
"step": 40
},
{
"epoch": 0.01795170990036801,
"grad_norm": 2.219606637954712,
"learning_rate": 8.976660682226211e-06,
"loss": 0.6834,
"step": 50
},
{
"epoch": 0.02154205188044161,
"grad_norm": 16.839906692504883,
"learning_rate": 1.0771992818671454e-05,
"loss": 0.4754,
"step": 60
},
{
"epoch": 0.025132393860515214,
"grad_norm": 25.55668067932129,
"learning_rate": 1.2567324955116697e-05,
"loss": 0.3818,
"step": 70
},
{
"epoch": 0.028722735840588817,
"grad_norm": 1.359479546546936,
"learning_rate": 1.436265709156194e-05,
"loss": 0.3797,
"step": 80
},
{
"epoch": 0.032313077820662416,
"grad_norm": 1.2728756666183472,
"learning_rate": 1.615798922800718e-05,
"loss": 0.3712,
"step": 90
},
{
"epoch": 0.03590341980073602,
"grad_norm": 1.9393813610076904,
"learning_rate": 1.7953321364452423e-05,
"loss": 0.3564,
"step": 100
},
{
"epoch": 0.03949376178080962,
"grad_norm": 1.4643720388412476,
"learning_rate": 1.9748653500897668e-05,
"loss": 0.3438,
"step": 110
},
{
"epoch": 0.04308410376088322,
"grad_norm": 1.4880571365356445,
"learning_rate": 2.154398563734291e-05,
"loss": 0.321,
"step": 120
},
{
"epoch": 0.046674445740956826,
"grad_norm": 1.239957571029663,
"learning_rate": 2.3339317773788153e-05,
"loss": 0.319,
"step": 130
},
{
"epoch": 0.05026478772103043,
"grad_norm": 1.2627112865447998,
"learning_rate": 2.5134649910233395e-05,
"loss": 0.3128,
"step": 140
},
{
"epoch": 0.05385512970110403,
"grad_norm": 1.1520243883132935,
"learning_rate": 2.6929982046678636e-05,
"loss": 0.31,
"step": 150
},
{
"epoch": 0.05744547168117763,
"grad_norm": 1.8554497957229614,
"learning_rate": 2.872531418312388e-05,
"loss": 0.3167,
"step": 160
},
{
"epoch": 0.061035813661251236,
"grad_norm": 1.8501205444335938,
"learning_rate": 3.0520646319569125e-05,
"loss": 0.3177,
"step": 170
},
{
"epoch": 0.06462615564132483,
"grad_norm": 1.249617099761963,
"learning_rate": 3.231597845601436e-05,
"loss": 0.3081,
"step": 180
},
{
"epoch": 0.06821649762139843,
"grad_norm": 1.1702481508255005,
"learning_rate": 3.411131059245961e-05,
"loss": 0.3122,
"step": 190
},
{
"epoch": 0.07180683960147204,
"grad_norm": 1.0217711925506592,
"learning_rate": 3.5906642728904846e-05,
"loss": 0.3047,
"step": 200
},
{
"epoch": 0.07539718158154564,
"grad_norm": 0.8885968923568726,
"learning_rate": 3.770197486535009e-05,
"loss": 0.3042,
"step": 210
},
{
"epoch": 0.07898752356161924,
"grad_norm": 1.0739161968231201,
"learning_rate": 3.9497307001795335e-05,
"loss": 0.2957,
"step": 220
},
{
"epoch": 0.08257786554169284,
"grad_norm": 1.963072419166565,
"learning_rate": 4.129263913824057e-05,
"loss": 0.2967,
"step": 230
},
{
"epoch": 0.08616820752176645,
"grad_norm": 0.9546407461166382,
"learning_rate": 4.308797127468582e-05,
"loss": 0.2834,
"step": 240
},
{
"epoch": 0.08975854950184005,
"grad_norm": 1.82941734790802,
"learning_rate": 4.488330341113106e-05,
"loss": 0.2864,
"step": 250
},
{
"epoch": 0.09334889148191365,
"grad_norm": 1.4494279623031616,
"learning_rate": 4.667863554757631e-05,
"loss": 0.2891,
"step": 260
},
{
"epoch": 0.09693923346198725,
"grad_norm": 1.195784330368042,
"learning_rate": 4.847396768402155e-05,
"loss": 0.2904,
"step": 270
},
{
"epoch": 0.10052957544206086,
"grad_norm": 1.0053528547286987,
"learning_rate": 5.026929982046679e-05,
"loss": 0.2804,
"step": 280
},
{
"epoch": 0.10411991742213446,
"grad_norm": 4.148128986358643,
"learning_rate": 5.2064631956912034e-05,
"loss": 0.3165,
"step": 290
},
{
"epoch": 0.10771025940220806,
"grad_norm": 1.4303346872329712,
"learning_rate": 5.385996409335727e-05,
"loss": 0.2747,
"step": 300
},
{
"epoch": 0.11130060138228166,
"grad_norm": 1.128341794013977,
"learning_rate": 5.565529622980251e-05,
"loss": 0.297,
"step": 310
},
{
"epoch": 0.11489094336235527,
"grad_norm": 1.2280890941619873,
"learning_rate": 5.745062836624776e-05,
"loss": 0.2821,
"step": 320
},
{
"epoch": 0.11848128534242887,
"grad_norm": 1.4685401916503906,
"learning_rate": 5.9245960502693e-05,
"loss": 0.2815,
"step": 330
},
{
"epoch": 0.12207162732250247,
"grad_norm": 2.4324777126312256,
"learning_rate": 6.104129263913825e-05,
"loss": 0.291,
"step": 340
},
{
"epoch": 0.12566196930257606,
"grad_norm": 1.2875359058380127,
"learning_rate": 6.283662477558349e-05,
"loss": 0.2852,
"step": 350
},
{
"epoch": 0.12925231128264966,
"grad_norm": 2.257322072982788,
"learning_rate": 6.463195691202873e-05,
"loss": 0.2804,
"step": 360
},
{
"epoch": 0.13284265326272326,
"grad_norm": 1.3770567178726196,
"learning_rate": 6.642728904847398e-05,
"loss": 0.2873,
"step": 370
},
{
"epoch": 0.13643299524279687,
"grad_norm": 1.6921864748001099,
"learning_rate": 6.822262118491922e-05,
"loss": 0.2974,
"step": 380
},
{
"epoch": 0.14002333722287047,
"grad_norm": 0.9520618915557861,
"learning_rate": 7.001795332136445e-05,
"loss": 0.2939,
"step": 390
},
{
"epoch": 0.14361367920294407,
"grad_norm": 0.812728762626648,
"learning_rate": 7.181328545780969e-05,
"loss": 0.2702,
"step": 400
},
{
"epoch": 0.14720402118301767,
"grad_norm": 1.7924541234970093,
"learning_rate": 7.360861759425493e-05,
"loss": 0.2969,
"step": 410
},
{
"epoch": 0.15079436316309128,
"grad_norm": 2.439558982849121,
"learning_rate": 7.540394973070018e-05,
"loss": 0.2893,
"step": 420
},
{
"epoch": 0.15438470514316488,
"grad_norm": 0.8057828545570374,
"learning_rate": 7.719928186714542e-05,
"loss": 0.2808,
"step": 430
},
{
"epoch": 0.15797504712323848,
"grad_norm": 1.2622177600860596,
"learning_rate": 7.899461400359067e-05,
"loss": 0.282,
"step": 440
},
{
"epoch": 0.16156538910331208,
"grad_norm": 1.1095036268234253,
"learning_rate": 8.078994614003591e-05,
"loss": 0.2691,
"step": 450
},
{
"epoch": 0.1651557310833857,
"grad_norm": 0.7493880987167358,
"learning_rate": 8.258527827648115e-05,
"loss": 0.2748,
"step": 460
},
{
"epoch": 0.1687460730634593,
"grad_norm": 0.7199195623397827,
"learning_rate": 8.43806104129264e-05,
"loss": 0.2876,
"step": 470
},
{
"epoch": 0.1723364150435329,
"grad_norm": 0.9257749915122986,
"learning_rate": 8.617594254937164e-05,
"loss": 0.2801,
"step": 480
},
{
"epoch": 0.1759267570236065,
"grad_norm": 0.9219655394554138,
"learning_rate": 8.797127468581689e-05,
"loss": 0.2717,
"step": 490
},
{
"epoch": 0.1795170990036801,
"grad_norm": 1.5916101932525635,
"learning_rate": 8.976660682226212e-05,
"loss": 0.275,
"step": 500
},
{
"epoch": 0.1831074409837537,
"grad_norm": 1.1832544803619385,
"learning_rate": 9.156193895870736e-05,
"loss": 0.2774,
"step": 510
},
{
"epoch": 0.1866977829638273,
"grad_norm": 0.8959478735923767,
"learning_rate": 9.335727109515261e-05,
"loss": 0.3003,
"step": 520
},
{
"epoch": 0.1902881249439009,
"grad_norm": 0.7720569968223572,
"learning_rate": 9.515260323159785e-05,
"loss": 0.2771,
"step": 530
},
{
"epoch": 0.1938784669239745,
"grad_norm": 1.263458013534546,
"learning_rate": 9.69479353680431e-05,
"loss": 0.2737,
"step": 540
},
{
"epoch": 0.1974688089040481,
"grad_norm": 1.6316909790039062,
"learning_rate": 9.874326750448834e-05,
"loss": 0.2899,
"step": 550
},
{
"epoch": 0.2010591508841217,
"grad_norm": 0.8948745131492615,
"learning_rate": 9.999991163368873e-05,
"loss": 0.2703,
"step": 560
},
{
"epoch": 0.20464949286419531,
"grad_norm": 1.680094599723816,
"learning_rate": 9.999834068573299e-05,
"loss": 0.2828,
"step": 570
},
{
"epoch": 0.20823983484426892,
"grad_norm": 1.1262023448944092,
"learning_rate": 9.999480611298721e-05,
"loss": 0.2651,
"step": 580
},
{
"epoch": 0.21183017682434252,
"grad_norm": 1.2514327764511108,
"learning_rate": 9.998930805426751e-05,
"loss": 0.2828,
"step": 590
},
{
"epoch": 0.21542051880441612,
"grad_norm": 0.8650713562965393,
"learning_rate": 9.998184672550354e-05,
"loss": 0.2641,
"step": 600
},
{
"epoch": 0.21901086078448972,
"grad_norm": 1.3188605308532715,
"learning_rate": 9.997242241973004e-05,
"loss": 0.2791,
"step": 610
},
{
"epoch": 0.22260120276456333,
"grad_norm": 1.677878737449646,
"learning_rate": 9.996103550707527e-05,
"loss": 0.2803,
"step": 620
},
{
"epoch": 0.22619154474463693,
"grad_norm": 1.9317690134048462,
"learning_rate": 9.994768643474658e-05,
"loss": 0.263,
"step": 630
},
{
"epoch": 0.22978188672471053,
"grad_norm": 0.8656140565872192,
"learning_rate": 9.993237572701274e-05,
"loss": 0.2723,
"step": 640
},
{
"epoch": 0.23337222870478413,
"grad_norm": 0.7631008625030518,
"learning_rate": 9.991510398518341e-05,
"loss": 0.2958,
"step": 650
},
{
"epoch": 0.23696257068485774,
"grad_norm": 0.6852580308914185,
"learning_rate": 9.989587188758552e-05,
"loss": 0.2612,
"step": 660
},
{
"epoch": 0.24055291266493134,
"grad_norm": 0.6097802519798279,
"learning_rate": 9.987468018953661e-05,
"loss": 0.2607,
"step": 670
},
{
"epoch": 0.24414325464500494,
"grad_norm": 1.254186987876892,
"learning_rate": 9.985152972331516e-05,
"loss": 0.2662,
"step": 680
},
{
"epoch": 0.24773359662507854,
"grad_norm": 0.8868479132652283,
"learning_rate": 9.982642139812793e-05,
"loss": 0.2705,
"step": 690
},
{
"epoch": 0.2513239386051521,
"grad_norm": 1.5867512226104736,
"learning_rate": 9.979935620007424e-05,
"loss": 0.2735,
"step": 700
},
{
"epoch": 0.25491428058522575,
"grad_norm": 0.7384280562400818,
"learning_rate": 9.977033519210725e-05,
"loss": 0.2676,
"step": 710
},
{
"epoch": 0.2585046225652993,
"grad_norm": 0.7617084383964539,
"learning_rate": 9.97393595139922e-05,
"loss": 0.2655,
"step": 720
},
{
"epoch": 0.26209496454537295,
"grad_norm": 0.6475211381912231,
"learning_rate": 9.970643038226166e-05,
"loss": 0.2629,
"step": 730
},
{
"epoch": 0.26568530652544653,
"grad_norm": 1.3059916496276855,
"learning_rate": 9.967154909016772e-05,
"loss": 0.2548,
"step": 740
},
{
"epoch": 0.26927564850552016,
"grad_norm": 1.1138116121292114,
"learning_rate": 9.963471700763123e-05,
"loss": 0.2525,
"step": 750
},
{
"epoch": 0.27286599048559373,
"grad_norm": 1.0550082921981812,
"learning_rate": 9.959593558118803e-05,
"loss": 0.2622,
"step": 760
},
{
"epoch": 0.27645633246566736,
"grad_norm": 0.8017902374267578,
"learning_rate": 9.955520633393205e-05,
"loss": 0.2649,
"step": 770
},
{
"epoch": 0.28004667444574094,
"grad_norm": 1.235143780708313,
"learning_rate": 9.951253086545558e-05,
"loss": 0.2747,
"step": 780
},
{
"epoch": 0.28363701642581457,
"grad_norm": 0.7427018284797668,
"learning_rate": 9.946791085178639e-05,
"loss": 0.242,
"step": 790
},
{
"epoch": 0.28722735840588814,
"grad_norm": 0.6972371935844421,
"learning_rate": 9.942134804532193e-05,
"loss": 0.2423,
"step": 800
},
{
"epoch": 0.2908177003859618,
"grad_norm": 0.9071277976036072,
"learning_rate": 9.937284427476052e-05,
"loss": 0.2425,
"step": 810
},
{
"epoch": 0.29440804236603535,
"grad_norm": 0.8345310688018799,
"learning_rate": 9.932240144502952e-05,
"loss": 0.2864,
"step": 820
},
{
"epoch": 0.297998384346109,
"grad_norm": 1.1392581462860107,
"learning_rate": 9.927002153721044e-05,
"loss": 0.2366,
"step": 830
},
{
"epoch": 0.30158872632618255,
"grad_norm": 0.9356684684753418,
"learning_rate": 9.921570660846131e-05,
"loss": 0.2464,
"step": 840
},
{
"epoch": 0.3051790683062562,
"grad_norm": 1.5248229503631592,
"learning_rate": 9.915945879193571e-05,
"loss": 0.2809,
"step": 850
},
{
"epoch": 0.30876941028632976,
"grad_norm": 1.0663933753967285,
"learning_rate": 9.91012802966991e-05,
"loss": 0.2779,
"step": 860
},
{
"epoch": 0.3123597522664034,
"grad_norm": 0.9292562007904053,
"learning_rate": 9.904117340764201e-05,
"loss": 0.2465,
"step": 870
},
{
"epoch": 0.31595009424647696,
"grad_norm": 0.7365911602973938,
"learning_rate": 9.897914048539032e-05,
"loss": 0.2688,
"step": 880
},
{
"epoch": 0.3195404362265506,
"grad_norm": 1.0190156698226929,
"learning_rate": 9.891518396621258e-05,
"loss": 0.2471,
"step": 890
},
{
"epoch": 0.32313077820662417,
"grad_norm": 1.167611837387085,
"learning_rate": 9.884930636192426e-05,
"loss": 0.2468,
"step": 900
},
{
"epoch": 0.3267211201866978,
"grad_norm": 1.1509454250335693,
"learning_rate": 9.878151025978918e-05,
"loss": 0.2528,
"step": 910
},
{
"epoch": 0.3303114621667714,
"grad_norm": 1.0654162168502808,
"learning_rate": 9.871179832241781e-05,
"loss": 0.2669,
"step": 920
},
{
"epoch": 0.333901804146845,
"grad_norm": 0.9040902853012085,
"learning_rate": 9.86401732876628e-05,
"loss": 0.2513,
"step": 930
},
{
"epoch": 0.3374921461269186,
"grad_norm": 2.8603482246398926,
"learning_rate": 9.856663796851137e-05,
"loss": 0.2526,
"step": 940
},
{
"epoch": 0.3410824881069922,
"grad_norm": 0.7283102869987488,
"learning_rate": 9.849119525297488e-05,
"loss": 0.2565,
"step": 950
},
{
"epoch": 0.3446728300870658,
"grad_norm": 1.1231544017791748,
"learning_rate": 9.841384810397538e-05,
"loss": 0.2591,
"step": 960
},
{
"epoch": 0.3482631720671394,
"grad_norm": 1.3341351747512817,
"learning_rate": 9.833459955922926e-05,
"loss": 0.2426,
"step": 970
},
{
"epoch": 0.351853514047213,
"grad_norm": 0.7382979393005371,
"learning_rate": 9.825345273112796e-05,
"loss": 0.2404,
"step": 980
},
{
"epoch": 0.3554438560272866,
"grad_norm": 0.9196600914001465,
"learning_rate": 9.817041080661571e-05,
"loss": 0.269,
"step": 990
},
{
"epoch": 0.3590341980073602,
"grad_norm": 4.254228115081787,
"learning_rate": 9.808547704706437e-05,
"loss": 0.2498,
"step": 1000
},
{
"epoch": 0.3626245399874338,
"grad_norm": 0.6999326348304749,
"learning_rate": 9.799865478814535e-05,
"loss": 0.242,
"step": 1010
},
{
"epoch": 0.3662148819675074,
"grad_norm": 1.5552287101745605,
"learning_rate": 9.790994743969864e-05,
"loss": 0.2663,
"step": 1020
},
{
"epoch": 0.36980522394758103,
"grad_norm": 0.6971444487571716,
"learning_rate": 9.781935848559878e-05,
"loss": 0.2549,
"step": 1030
},
{
"epoch": 0.3733955659276546,
"grad_norm": 1.180908441543579,
"learning_rate": 9.772689148361817e-05,
"loss": 0.2313,
"step": 1040
},
{
"epoch": 0.37698590790772823,
"grad_norm": 0.633343517780304,
"learning_rate": 9.763255006528731e-05,
"loss": 0.2395,
"step": 1050
},
{
"epoch": 0.3805762498878018,
"grad_norm": 0.9181081056594849,
"learning_rate": 9.753633793575206e-05,
"loss": 0.2512,
"step": 1060
},
{
"epoch": 0.38416659186787544,
"grad_norm": 1.1254559755325317,
"learning_rate": 9.743825887362832e-05,
"loss": 0.2467,
"step": 1070
},
{
"epoch": 0.387756933847949,
"grad_norm": 0.8145197629928589,
"learning_rate": 9.733831673085344e-05,
"loss": 0.2421,
"step": 1080
},
{
"epoch": 0.39134727582802264,
"grad_norm": 0.5483050346374512,
"learning_rate": 9.723651543253509e-05,
"loss": 0.2578,
"step": 1090
},
{
"epoch": 0.3949376178080962,
"grad_norm": 0.7891978621482849,
"learning_rate": 9.713285897679699e-05,
"loss": 0.2339,
"step": 1100
},
{
"epoch": 0.39852795978816985,
"grad_norm": 0.6310613751411438,
"learning_rate": 9.702735143462198e-05,
"loss": 0.2379,
"step": 1110
},
{
"epoch": 0.4021183017682434,
"grad_norm": 0.8631925582885742,
"learning_rate": 9.691999694969208e-05,
"loss": 0.2413,
"step": 1120
},
{
"epoch": 0.40570864374831705,
"grad_norm": 0.7224175930023193,
"learning_rate": 9.681079973822576e-05,
"loss": 0.2343,
"step": 1130
},
{
"epoch": 0.40929898572839063,
"grad_norm": 0.8189213871955872,
"learning_rate": 9.669976408881238e-05,
"loss": 0.2513,
"step": 1140
},
{
"epoch": 0.4128893277084642,
"grad_norm": 0.8129417300224304,
"learning_rate": 9.658689436224373e-05,
"loss": 0.2547,
"step": 1150
},
{
"epoch": 0.41647966968853783,
"grad_norm": 1.1440197229385376,
"learning_rate": 9.647219499134277e-05,
"loss": 0.2427,
"step": 1160
},
{
"epoch": 0.4200700116686114,
"grad_norm": 0.9682267308235168,
"learning_rate": 9.635567048078958e-05,
"loss": 0.2411,
"step": 1170
},
{
"epoch": 0.42366035364868504,
"grad_norm": 0.7513495683670044,
"learning_rate": 9.623732540694437e-05,
"loss": 0.252,
"step": 1180
},
{
"epoch": 0.4272506956287586,
"grad_norm": 3.1498029232025146,
"learning_rate": 9.61171644176678e-05,
"loss": 0.2486,
"step": 1190
},
{
"epoch": 0.43084103760883224,
"grad_norm": 0.6250784397125244,
"learning_rate": 9.599519223213842e-05,
"loss": 0.2459,
"step": 1200
},
{
"epoch": 0.4344313795889058,
"grad_norm": 0.548052966594696,
"learning_rate": 9.587141364066736e-05,
"loss": 0.2334,
"step": 1210
},
{
"epoch": 0.43802172156897945,
"grad_norm": 0.6549167037010193,
"learning_rate": 9.574583350451016e-05,
"loss": 0.2399,
"step": 1220
},
{
"epoch": 0.441612063549053,
"grad_norm": 0.7177796363830566,
"learning_rate": 9.561845675567586e-05,
"loss": 0.2574,
"step": 1230
},
{
"epoch": 0.44520240552912665,
"grad_norm": 1.0265281200408936,
"learning_rate": 9.548928839673334e-05,
"loss": 0.2285,
"step": 1240
},
{
"epoch": 0.44879274750920023,
"grad_norm": 1.3221251964569092,
"learning_rate": 9.535833350061473e-05,
"loss": 0.2293,
"step": 1250
},
{
"epoch": 0.45238308948927386,
"grad_norm": 0.9542430639266968,
"learning_rate": 9.522559721041636e-05,
"loss": 0.2367,
"step": 1260
},
{
"epoch": 0.45597343146934743,
"grad_norm": 2.0089797973632812,
"learning_rate": 9.509108473919662e-05,
"loss": 0.2166,
"step": 1270
},
{
"epoch": 0.45956377344942106,
"grad_norm": 1.2323672771453857,
"learning_rate": 9.495480136977127e-05,
"loss": 0.2253,
"step": 1280
},
{
"epoch": 0.46315411542949464,
"grad_norm": 1.155745506286621,
"learning_rate": 9.4816752454506e-05,
"loss": 0.2236,
"step": 1290
},
{
"epoch": 0.46674445740956827,
"grad_norm": 0.5866098403930664,
"learning_rate": 9.46769434151062e-05,
"loss": 0.2346,
"step": 1300
},
{
"epoch": 0.47033479938964184,
"grad_norm": 0.8677975535392761,
"learning_rate": 9.4535379742404e-05,
"loss": 0.2229,
"step": 1310
},
{
"epoch": 0.4739251413697155,
"grad_norm": 0.8805405497550964,
"learning_rate": 9.439206699614263e-05,
"loss": 0.2279,
"step": 1320
},
{
"epoch": 0.47751548334978905,
"grad_norm": 0.5903385877609253,
"learning_rate": 9.424701080475811e-05,
"loss": 0.2454,
"step": 1330
},
{
"epoch": 0.4811058253298627,
"grad_norm": 0.9364457726478577,
"learning_rate": 9.410021686515815e-05,
"loss": 0.2454,
"step": 1340
},
{
"epoch": 0.48469616730993625,
"grad_norm": 1.4409586191177368,
"learning_rate": 9.39516909424985e-05,
"loss": 0.2417,
"step": 1350
},
{
"epoch": 0.4882865092900099,
"grad_norm": 0.705747663974762,
"learning_rate": 9.380143886995636e-05,
"loss": 0.2253,
"step": 1360
},
{
"epoch": 0.49187685127008346,
"grad_norm": 1.2557168006896973,
"learning_rate": 9.364946654850148e-05,
"loss": 0.2332,
"step": 1370
},
{
"epoch": 0.4954671932501571,
"grad_norm": 1.4732472896575928,
"learning_rate": 9.349577994666427e-05,
"loss": 0.2202,
"step": 1380
},
{
"epoch": 0.49905753523023066,
"grad_norm": 1.1212490797042847,
"learning_rate": 9.33403851003015e-05,
"loss": 0.2064,
"step": 1390
},
{
"epoch": 0.5026478772103042,
"grad_norm": 0.825175404548645,
"learning_rate": 9.31832881123591e-05,
"loss": 0.2148,
"step": 1400
},
{
"epoch": 0.5062382191903779,
"grad_norm": 0.8229523301124573,
"learning_rate": 9.302449515263268e-05,
"loss": 0.2307,
"step": 1410
},
{
"epoch": 0.5098285611704515,
"grad_norm": 0.8145741820335388,
"learning_rate": 9.286401245752501e-05,
"loss": 0.2405,
"step": 1420
},
{
"epoch": 0.5134189031505251,
"grad_norm": 0.7511823177337646,
"learning_rate": 9.270184632980121e-05,
"loss": 0.2311,
"step": 1430
},
{
"epoch": 0.5170092451305986,
"grad_norm": 0.7575204968452454,
"learning_rate": 9.253800313834127e-05,
"loss": 0.2068,
"step": 1440
},
{
"epoch": 0.5205995871106723,
"grad_norm": 0.6711773872375488,
"learning_rate": 9.237248931788972e-05,
"loss": 0.2336,
"step": 1450
},
{
"epoch": 0.5241899290907459,
"grad_norm": 0.7057952880859375,
"learning_rate": 9.220531136880314e-05,
"loss": 0.2332,
"step": 1460
},
{
"epoch": 0.5277802710708195,
"grad_norm": 0.7404478788375854,
"learning_rate": 9.203647585679471e-05,
"loss": 0.2204,
"step": 1470
},
{
"epoch": 0.5313706130508931,
"grad_norm": 0.6271808743476868,
"learning_rate": 9.186598941267642e-05,
"loss": 0.207,
"step": 1480
},
{
"epoch": 0.5349609550309667,
"grad_norm": 0.7089178562164307,
"learning_rate": 9.169385873209863e-05,
"loss": 0.2259,
"step": 1490
},
{
"epoch": 0.5385512970110403,
"grad_norm": 0.949642539024353,
"learning_rate": 9.152009057528714e-05,
"loss": 0.229,
"step": 1500
},
{
"epoch": 0.5421416389911139,
"grad_norm": 0.7554659247398376,
"learning_rate": 9.134469176677762e-05,
"loss": 0.2208,
"step": 1510
},
{
"epoch": 0.5457319809711875,
"grad_norm": 0.713874340057373,
"learning_rate": 9.116766919514765e-05,
"loss": 0.2177,
"step": 1520
},
{
"epoch": 0.5493223229512612,
"grad_norm": 0.6753556728363037,
"learning_rate": 9.098902981274615e-05,
"loss": 0.2202,
"step": 1530
},
{
"epoch": 0.5529126649313347,
"grad_norm": 1.2491189241409302,
"learning_rate": 9.080878063542035e-05,
"loss": 0.2118,
"step": 1540
},
{
"epoch": 0.5565030069114083,
"grad_norm": 0.6264563798904419,
"learning_rate": 9.062692874224024e-05,
"loss": 0.2211,
"step": 1550
},
{
"epoch": 0.5600933488914819,
"grad_norm": 0.4661034941673279,
"learning_rate": 9.044348127522054e-05,
"loss": 0.2168,
"step": 1560
},
{
"epoch": 0.5636836908715556,
"grad_norm": 0.6062325835227966,
"learning_rate": 9.025844543904022e-05,
"loss": 0.214,
"step": 1570
},
{
"epoch": 0.5672740328516291,
"grad_norm": 0.6374778747558594,
"learning_rate": 9.007182850075956e-05,
"loss": 0.2083,
"step": 1580
},
{
"epoch": 0.5708643748317027,
"grad_norm": 1.131443738937378,
"learning_rate": 8.98836377895347e-05,
"loss": 0.2005,
"step": 1590
},
{
"epoch": 0.5744547168117763,
"grad_norm": 0.6167281866073608,
"learning_rate": 8.969388069632987e-05,
"loss": 0.2122,
"step": 1600
},
{
"epoch": 0.57804505879185,
"grad_norm": 0.9362030625343323,
"learning_rate": 8.950256467362699e-05,
"loss": 0.2275,
"step": 1610
},
{
"epoch": 0.5816354007719235,
"grad_norm": 0.9304684996604919,
"learning_rate": 8.930969723513312e-05,
"loss": 0.2027,
"step": 1620
},
{
"epoch": 0.5852257427519971,
"grad_norm": 0.62895268201828,
"learning_rate": 8.911528595548533e-05,
"loss": 0.2266,
"step": 1630
},
{
"epoch": 0.5888160847320707,
"grad_norm": 1.480999231338501,
"learning_rate": 8.891933846995312e-05,
"loss": 0.2052,
"step": 1640
},
{
"epoch": 0.5924064267121444,
"grad_norm": 1.3081512451171875,
"learning_rate": 8.872186247413874e-05,
"loss": 0.212,
"step": 1650
},
{
"epoch": 0.595996768692218,
"grad_norm": 2.765312671661377,
"learning_rate": 8.852286572367476e-05,
"loss": 0.2233,
"step": 1660
},
{
"epoch": 0.5995871106722915,
"grad_norm": 1.2033319473266602,
"learning_rate": 8.832235603391958e-05,
"loss": 0.2199,
"step": 1670
},
{
"epoch": 0.6031774526523651,
"grad_norm": 1.092360496520996,
"learning_rate": 8.812034127965048e-05,
"loss": 0.1994,
"step": 1680
},
{
"epoch": 0.6067677946324388,
"grad_norm": 1.0622711181640625,
"learning_rate": 8.791682939475438e-05,
"loss": 0.2117,
"step": 1690
},
{
"epoch": 0.6103581366125124,
"grad_norm": 0.722064733505249,
"learning_rate": 8.771182837191613e-05,
"loss": 0.2219,
"step": 1700
},
{
"epoch": 0.6139484785925859,
"grad_norm": 0.602187991142273,
"learning_rate": 8.750534626230475e-05,
"loss": 0.2159,
"step": 1710
},
{
"epoch": 0.6175388205726595,
"grad_norm": 0.7628340721130371,
"learning_rate": 8.729739117525715e-05,
"loss": 0.2088,
"step": 1720
},
{
"epoch": 0.6211291625527331,
"grad_norm": 0.5262313485145569,
"learning_rate": 8.708797127795963e-05,
"loss": 0.2285,
"step": 1730
},
{
"epoch": 0.6247195045328068,
"grad_norm": 0.6427643299102783,
"learning_rate": 8.68770947951272e-05,
"loss": 0.2094,
"step": 1740
},
{
"epoch": 0.6283098465128804,
"grad_norm": 0.5874310731887817,
"learning_rate": 8.666477000868046e-05,
"loss": 0.2263,
"step": 1750
},
{
"epoch": 0.6319001884929539,
"grad_norm": 0.561213493347168,
"learning_rate": 8.645100525742042e-05,
"loss": 0.2025,
"step": 1760
},
{
"epoch": 0.6354905304730275,
"grad_norm": 0.7805958390235901,
"learning_rate": 8.623580893670105e-05,
"loss": 0.2171,
"step": 1770
},
{
"epoch": 0.6390808724531012,
"grad_norm": 0.5806890726089478,
"learning_rate": 8.601918949809937e-05,
"loss": 0.2103,
"step": 1780
},
{
"epoch": 0.6426712144331748,
"grad_norm": 0.581363320350647,
"learning_rate": 8.580115544908374e-05,
"loss": 0.2129,
"step": 1790
},
{
"epoch": 0.6462615564132483,
"grad_norm": 0.4736599326133728,
"learning_rate": 8.558171535267958e-05,
"loss": 0.1993,
"step": 1800
},
{
"epoch": 0.6498518983933219,
"grad_norm": 0.6482508778572083,
"learning_rate": 8.536087782713318e-05,
"loss": 0.193,
"step": 1810
},
{
"epoch": 0.6534422403733956,
"grad_norm": 0.7920377850532532,
"learning_rate": 8.513865154557315e-05,
"loss": 0.1989,
"step": 1820
},
{
"epoch": 0.6570325823534692,
"grad_norm": 0.7527133226394653,
"learning_rate": 8.491504523566985e-05,
"loss": 0.215,
"step": 1830
},
{
"epoch": 0.6606229243335427,
"grad_norm": 0.8890761733055115,
"learning_rate": 8.46900676792926e-05,
"loss": 0.1972,
"step": 1840
},
{
"epoch": 0.6642132663136163,
"grad_norm": 1.100785732269287,
"learning_rate": 8.44637277121647e-05,
"loss": 0.1958,
"step": 1850
},
{
"epoch": 0.66780360829369,
"grad_norm": 0.6120195388793945,
"learning_rate": 8.423603422351665e-05,
"loss": 0.21,
"step": 1860
},
{
"epoch": 0.6713939502737636,
"grad_norm": 0.9138973951339722,
"learning_rate": 8.400699615573671e-05,
"loss": 0.2144,
"step": 1870
},
{
"epoch": 0.6749842922538372,
"grad_norm": 0.6855999827384949,
"learning_rate": 8.377662250402e-05,
"loss": 0.1949,
"step": 1880
},
{
"epoch": 0.6785746342339107,
"grad_norm": 0.8468754291534424,
"learning_rate": 8.354492231601505e-05,
"loss": 0.207,
"step": 1890
},
{
"epoch": 0.6821649762139844,
"grad_norm": 0.650043249130249,
"learning_rate": 8.331190469146848e-05,
"loss": 0.2029,
"step": 1900
},
{
"epoch": 0.685755318194058,
"grad_norm": 0.7149790525436401,
"learning_rate": 8.307757878186767e-05,
"loss": 0.1891,
"step": 1910
},
{
"epoch": 0.6893456601741316,
"grad_norm": 0.5650553703308105,
"learning_rate": 8.284195379008137e-05,
"loss": 0.2034,
"step": 1920
},
{
"epoch": 0.6929360021542051,
"grad_norm": 0.8220282793045044,
"learning_rate": 8.260503896999814e-05,
"loss": 0.2004,
"step": 1930
},
{
"epoch": 0.6965263441342788,
"grad_norm": 0.9552260041236877,
"learning_rate": 8.236684362616307e-05,
"loss": 0.2052,
"step": 1940
},
{
"epoch": 0.7001166861143524,
"grad_norm": 0.643084704875946,
"learning_rate": 8.212737711341223e-05,
"loss": 0.2072,
"step": 1950
},
{
"epoch": 0.703707028094426,
"grad_norm": 0.6681669354438782,
"learning_rate": 8.188664883650537e-05,
"loss": 0.1969,
"step": 1960
},
{
"epoch": 0.7072973700744996,
"grad_norm": 1.1286799907684326,
"learning_rate": 8.164466824975647e-05,
"loss": 0.1964,
"step": 1970
},
{
"epoch": 0.7108877120545732,
"grad_norm": 0.7001319527626038,
"learning_rate": 8.14014448566625e-05,
"loss": 0.1728,
"step": 1980
},
{
"epoch": 0.7144780540346468,
"grad_norm": 0.8087079524993896,
"learning_rate": 8.115698820953012e-05,
"loss": 0.1879,
"step": 1990
},
{
"epoch": 0.7180683960147204,
"grad_norm": 0.5888068079948425,
"learning_rate": 8.091130790910065e-05,
"loss": 0.2017,
"step": 2000
},
{
"epoch": 0.721658737994794,
"grad_norm": 0.868241012096405,
"learning_rate": 8.066441360417283e-05,
"loss": 0.2002,
"step": 2010
},
{
"epoch": 0.7252490799748676,
"grad_norm": 0.9173946976661682,
"learning_rate": 8.041631499122399e-05,
"loss": 0.1822,
"step": 2020
},
{
"epoch": 0.7288394219549412,
"grad_norm": 0.7348050475120544,
"learning_rate": 8.016702181402925e-05,
"loss": 0.1822,
"step": 2030
},
{
"epoch": 0.7324297639350148,
"grad_norm": 0.5974103808403015,
"learning_rate": 7.991654386327877e-05,
"loss": 0.1894,
"step": 2040
},
{
"epoch": 0.7360201059150884,
"grad_norm": 1.2631843090057373,
"learning_rate": 7.966489097619327e-05,
"loss": 0.2005,
"step": 2050
},
{
"epoch": 0.7396104478951621,
"grad_norm": 0.9306305050849915,
"learning_rate": 7.941207303613773e-05,
"loss": 0.2077,
"step": 2060
},
{
"epoch": 0.7432007898752356,
"grad_norm": 0.6469571590423584,
"learning_rate": 7.915809997223312e-05,
"loss": 0.1893,
"step": 2070
},
{
"epoch": 0.7467911318553092,
"grad_norm": 0.6804335713386536,
"learning_rate": 7.89029817589665e-05,
"loss": 0.1985,
"step": 2080
},
{
"epoch": 0.7503814738353828,
"grad_norm": 0.6059459447860718,
"learning_rate": 7.864672841579944e-05,
"loss": 0.1856,
"step": 2090
},
{
"epoch": 0.7539718158154565,
"grad_norm": 0.6755326390266418,
"learning_rate": 7.838935000677419e-05,
"loss": 0.1816,
"step": 2100
},
{
"epoch": 0.75756215779553,
"grad_norm": 0.5813919901847839,
"learning_rate": 7.813085664011873e-05,
"loss": 0.1796,
"step": 2110
},
{
"epoch": 0.7611524997756036,
"grad_norm": 0.9791029691696167,
"learning_rate": 7.78712584678496e-05,
"loss": 0.204,
"step": 2120
},
{
"epoch": 0.7647428417556772,
"grad_norm": 0.6557776927947998,
"learning_rate": 7.76105656853733e-05,
"loss": 0.1897,
"step": 2130
},
{
"epoch": 0.7683331837357509,
"grad_norm": 0.5696374177932739,
"learning_rate": 7.73487885310858e-05,
"loss": 0.1882,
"step": 2140
},
{
"epoch": 0.7719235257158245,
"grad_norm": 0.704799473285675,
"learning_rate": 7.708593728597046e-05,
"loss": 0.186,
"step": 2150
},
{
"epoch": 0.775513867695898,
"grad_norm": 0.9005138874053955,
"learning_rate": 7.682202227319433e-05,
"loss": 0.1938,
"step": 2160
},
{
"epoch": 0.7791042096759716,
"grad_norm": 0.7679111957550049,
"learning_rate": 7.655705385770258e-05,
"loss": 0.182,
"step": 2170
},
{
"epoch": 0.7826945516560453,
"grad_norm": 0.7027627229690552,
"learning_rate": 7.629104244581156e-05,
"loss": 0.1859,
"step": 2180
},
{
"epoch": 0.7862848936361189,
"grad_norm": 0.8638216853141785,
"learning_rate": 7.602399848480002e-05,
"loss": 0.1945,
"step": 2190
},
{
"epoch": 0.7898752356161924,
"grad_norm": 0.6846340894699097,
"learning_rate": 7.575593246249885e-05,
"loss": 0.1899,
"step": 2200
},
{
"epoch": 0.793465577596266,
"grad_norm": 0.7671458721160889,
"learning_rate": 7.548685490687919e-05,
"loss": 0.1835,
"step": 2210
},
{
"epoch": 0.7970559195763397,
"grad_norm": 1.7174897193908691,
"learning_rate": 7.521677638563889e-05,
"loss": 0.1742,
"step": 2220
},
{
"epoch": 0.8006462615564133,
"grad_norm": 1.024430751800537,
"learning_rate": 7.494570750578757e-05,
"loss": 0.1827,
"step": 2230
},
{
"epoch": 0.8042366035364868,
"grad_norm": 0.8393763303756714,
"learning_rate": 7.467365891322995e-05,
"loss": 0.1726,
"step": 2240
},
{
"epoch": 0.8078269455165604,
"grad_norm": 3.184171438217163,
"learning_rate": 7.440064129234783e-05,
"loss": 0.1855,
"step": 2250
},
{
"epoch": 0.8114172874966341,
"grad_norm": 0.7078256011009216,
"learning_rate": 7.412666536558041e-05,
"loss": 0.1783,
"step": 2260
},
{
"epoch": 0.8150076294767077,
"grad_norm": 0.7265491485595703,
"learning_rate": 7.385174189300323e-05,
"loss": 0.19,
"step": 2270
},
{
"epoch": 0.8185979714567813,
"grad_norm": 0.8136366605758667,
"learning_rate": 7.35758816719055e-05,
"loss": 0.1685,
"step": 2280
},
{
"epoch": 0.8221883134368548,
"grad_norm": 1.0148855447769165,
"learning_rate": 7.329909553636618e-05,
"loss": 0.1781,
"step": 2290
},
{
"epoch": 0.8257786554169284,
"grad_norm": 0.9568372964859009,
"learning_rate": 7.302139435682831e-05,
"loss": 0.1702,
"step": 2300
},
{
"epoch": 0.8293689973970021,
"grad_norm": 1.8222324848175049,
"learning_rate": 7.274278903967229e-05,
"loss": 0.1823,
"step": 2310
},
{
"epoch": 0.8329593393770757,
"grad_norm": 0.6024855375289917,
"learning_rate": 7.246329052678736e-05,
"loss": 0.1741,
"step": 2320
},
{
"epoch": 0.8365496813571492,
"grad_norm": 0.9722542762756348,
"learning_rate": 7.218290979514202e-05,
"loss": 0.1757,
"step": 2330
},
{
"epoch": 0.8401400233372228,
"grad_norm": 2.1216533184051514,
"learning_rate": 7.190165785635273e-05,
"loss": 0.1748,
"step": 2340
},
{
"epoch": 0.8437303653172965,
"grad_norm": 0.6482483148574829,
"learning_rate": 7.161954575625172e-05,
"loss": 0.1799,
"step": 2350
},
{
"epoch": 0.8473207072973701,
"grad_norm": 2.2838494777679443,
"learning_rate": 7.133658457445291e-05,
"loss": 0.1616,
"step": 2360
},
{
"epoch": 0.8509110492774437,
"grad_norm": 0.6801573634147644,
"learning_rate": 7.105278542391695e-05,
"loss": 0.1806,
"step": 2370
},
{
"epoch": 0.8545013912575172,
"grad_norm": 0.8442283272743225,
"learning_rate": 7.076815945051465e-05,
"loss": 0.1821,
"step": 2380
},
{
"epoch": 0.8580917332375909,
"grad_norm": 1.1653680801391602,
"learning_rate": 7.048271783258936e-05,
"loss": 0.1773,
"step": 2390
},
{
"epoch": 0.8616820752176645,
"grad_norm": 0.6987717151641846,
"learning_rate": 7.019647178051779e-05,
"loss": 0.1693,
"step": 2400
},
{
"epoch": 0.8652724171977381,
"grad_norm": 0.6374627351760864,
"learning_rate": 6.990943253626994e-05,
"loss": 0.194,
"step": 2410
},
{
"epoch": 0.8688627591778116,
"grad_norm": 0.6507960557937622,
"learning_rate": 6.962161137296743e-05,
"loss": 0.1568,
"step": 2420
},
{
"epoch": 0.8724531011578853,
"grad_norm": 0.6699422597885132,
"learning_rate": 6.933301959444082e-05,
"loss": 0.1759,
"step": 2430
},
{
"epoch": 0.8760434431379589,
"grad_norm": 0.48265889286994934,
"learning_rate": 6.904366853478567e-05,
"loss": 0.1735,
"step": 2440
},
{
"epoch": 0.8796337851180325,
"grad_norm": 0.8710943460464478,
"learning_rate": 6.875356955791735e-05,
"loss": 0.1807,
"step": 2450
},
{
"epoch": 0.883224127098106,
"grad_norm": 0.7356705069541931,
"learning_rate": 6.846273405712483e-05,
"loss": 0.1751,
"step": 2460
},
{
"epoch": 0.8868144690781797,
"grad_norm": 0.6466989517211914,
"learning_rate": 6.817117345462316e-05,
"loss": 0.1599,
"step": 2470
},
{
"epoch": 0.8904048110582533,
"grad_norm": 0.5134007334709167,
"learning_rate": 6.787889920110488e-05,
"loss": 0.1666,
"step": 2480
},
{
"epoch": 0.8939951530383269,
"grad_norm": 0.471064954996109,
"learning_rate": 6.75859227752903e-05,
"loss": 0.1624,
"step": 2490
},
{
"epoch": 0.8975854950184005,
"grad_norm": 0.606399655342102,
"learning_rate": 6.729225568347677e-05,
"loss": 0.1696,
"step": 2500
},
{
"epoch": 0.9011758369984741,
"grad_norm": 0.6752104759216309,
"learning_rate": 6.699790945908662e-05,
"loss": 0.1607,
"step": 2510
},
{
"epoch": 0.9047661789785477,
"grad_norm": 0.8237718939781189,
"learning_rate": 6.670289566221437e-05,
"loss": 0.1601,
"step": 2520
},
{
"epoch": 0.9083565209586213,
"grad_norm": 0.7542670965194702,
"learning_rate": 6.640722587917263e-05,
"loss": 0.1608,
"step": 2530
},
{
"epoch": 0.9119468629386949,
"grad_norm": 0.609646737575531,
"learning_rate": 6.611091172203708e-05,
"loss": 0.1586,
"step": 2540
},
{
"epoch": 0.9155372049187686,
"grad_norm": 0.7793768644332886,
"learning_rate": 6.581396482819038e-05,
"loss": 0.1601,
"step": 2550
},
{
"epoch": 0.9191275468988421,
"grad_norm": 0.9071997404098511,
"learning_rate": 6.551639685986524e-05,
"loss": 0.166,
"step": 2560
},
{
"epoch": 0.9227178888789157,
"grad_norm": 1.0000146627426147,
"learning_rate": 6.521821950368625e-05,
"loss": 0.1702,
"step": 2570
},
{
"epoch": 0.9263082308589893,
"grad_norm": 0.8889328241348267,
"learning_rate": 6.491944447021102e-05,
"loss": 0.1669,
"step": 2580
},
{
"epoch": 0.929898572839063,
"grad_norm": 0.6329061985015869,
"learning_rate": 6.462008349347022e-05,
"loss": 0.1641,
"step": 2590
},
{
"epoch": 0.9334889148191365,
"grad_norm": 0.7821244597434998,
"learning_rate": 6.43201483305067e-05,
"loss": 0.1643,
"step": 2600
},
{
"epoch": 0.9370792567992101,
"grad_norm": 1.3463133573532104,
"learning_rate": 6.401965076091382e-05,
"loss": 0.1603,
"step": 2610
},
{
"epoch": 0.9406695987792837,
"grad_norm": 2.534256935119629,
"learning_rate": 6.371860258637278e-05,
"loss": 0.1577,
"step": 2620
},
{
"epoch": 0.9442599407593574,
"grad_norm": 0.9502484202384949,
"learning_rate": 6.341701563018913e-05,
"loss": 0.1529,
"step": 2630
},
{
"epoch": 0.947850282739431,
"grad_norm": 0.5928242206573486,
"learning_rate": 6.311490173682839e-05,
"loss": 0.1633,
"step": 2640
},
{
"epoch": 0.9514406247195045,
"grad_norm": 1.3390663862228394,
"learning_rate": 6.281227277145093e-05,
"loss": 0.1609,
"step": 2650
},
{
"epoch": 0.9550309666995781,
"grad_norm": 0.8307391405105591,
"learning_rate": 6.250914061944597e-05,
"loss": 0.1654,
"step": 2660
},
{
"epoch": 0.9586213086796518,
"grad_norm": 0.6453768610954285,
"learning_rate": 6.220551718596477e-05,
"loss": 0.1504,
"step": 2670
},
{
"epoch": 0.9622116506597254,
"grad_norm": 0.9472678899765015,
"learning_rate": 6.190141439545304e-05,
"loss": 0.1441,
"step": 2680
},
{
"epoch": 0.9658019926397989,
"grad_norm": 1.077405571937561,
"learning_rate": 6.159684419118274e-05,
"loss": 0.1574,
"step": 2690
},
{
"epoch": 0.9693923346198725,
"grad_norm": 1.373565673828125,
"learning_rate": 6.129181853478285e-05,
"loss": 0.1557,
"step": 2700
},
{
"epoch": 0.9729826765999462,
"grad_norm": 0.7159507274627686,
"learning_rate": 6.0986349405769795e-05,
"loss": 0.148,
"step": 2710
},
{
"epoch": 0.9765730185800198,
"grad_norm": 0.7065421342849731,
"learning_rate": 6.068044880107675e-05,
"loss": 0.1481,
"step": 2720
},
{
"epoch": 0.9801633605600933,
"grad_norm": 1.0575318336486816,
"learning_rate": 6.0374128734582634e-05,
"loss": 0.1546,
"step": 2730
},
{
"epoch": 0.9837537025401669,
"grad_norm": 1.3331146240234375,
"learning_rate": 6.006740123664022e-05,
"loss": 0.1685,
"step": 2740
},
{
"epoch": 0.9873440445202406,
"grad_norm": 0.712989091873169,
"learning_rate": 5.976027835360366e-05,
"loss": 0.1443,
"step": 2750
},
{
"epoch": 0.9909343865003142,
"grad_norm": 0.9985840320587158,
"learning_rate": 5.945277214735537e-05,
"loss": 0.1381,
"step": 2760
},
{
"epoch": 0.9945247284803878,
"grad_norm": 0.6109340786933899,
"learning_rate": 5.914489469483234e-05,
"loss": 0.1506,
"step": 2770
},
{
"epoch": 0.9981150704604613,
"grad_norm": 0.5232493281364441,
"learning_rate": 5.883665808755179e-05,
"loss": 0.1527,
"step": 2780
},
{
"epoch": 1.001705412440535,
"grad_norm": 1.120089054107666,
"learning_rate": 5.852807443113635e-05,
"loss": 0.1397,
"step": 2790
},
{
"epoch": 1.0052957544206085,
"grad_norm": 0.9276136755943298,
"learning_rate": 5.821915584483853e-05,
"loss": 0.1155,
"step": 2800
},
{
"epoch": 1.008886096400682,
"grad_norm": 0.6816973686218262,
"learning_rate": 5.790991446106487e-05,
"loss": 0.1111,
"step": 2810
},
{
"epoch": 1.0124764383807558,
"grad_norm": 0.8138614296913147,
"learning_rate": 5.7600362424899354e-05,
"loss": 0.1107,
"step": 2820
},
{
"epoch": 1.0160667803608294,
"grad_norm": 0.5443429350852966,
"learning_rate": 5.729051189362649e-05,
"loss": 0.1122,
"step": 2830
},
{
"epoch": 1.019657122340903,
"grad_norm": 0.6204805970191956,
"learning_rate": 5.698037503625379e-05,
"loss": 0.1147,
"step": 2840
},
{
"epoch": 1.0232474643209766,
"grad_norm": 0.5502025485038757,
"learning_rate": 5.6669964033033905e-05,
"loss": 0.1135,
"step": 2850
},
{
"epoch": 1.0268378063010501,
"grad_norm": 0.6541283130645752,
"learning_rate": 5.6359291074986244e-05,
"loss": 0.1225,
"step": 2860
},
{
"epoch": 1.0304281482811237,
"grad_norm": 0.6311090588569641,
"learning_rate": 5.604836836341816e-05,
"loss": 0.1063,
"step": 2870
},
{
"epoch": 1.0340184902611973,
"grad_norm": 0.9657145738601685,
"learning_rate": 5.573720810944575e-05,
"loss": 0.1171,
"step": 2880
},
{
"epoch": 1.037608832241271,
"grad_norm": 0.53743577003479,
"learning_rate": 5.542582253351438e-05,
"loss": 0.1128,
"step": 2890
},
{
"epoch": 1.0411991742213447,
"grad_norm": 0.7501124739646912,
"learning_rate": 5.511422386491858e-05,
"loss": 0.1117,
"step": 2900
},
{
"epoch": 1.0447895162014182,
"grad_norm": 0.7120064496994019,
"learning_rate": 5.480242434132191e-05,
"loss": 0.1049,
"step": 2910
},
{
"epoch": 1.0483798581814918,
"grad_norm": 0.5755088329315186,
"learning_rate": 5.4490436208276194e-05,
"loss": 0.1047,
"step": 2920
},
{
"epoch": 1.0519702001615654,
"grad_norm": 0.8773960471153259,
"learning_rate": 5.4178271718740744e-05,
"loss": 0.1119,
"step": 2930
},
{
"epoch": 1.055560542141639,
"grad_norm": 0.5922686457633972,
"learning_rate": 5.3865943132601e-05,
"loss": 0.1092,
"step": 2940
},
{
"epoch": 1.0591508841217125,
"grad_norm": 0.7486307621002197,
"learning_rate": 5.355346271618715e-05,
"loss": 0.1068,
"step": 2950
},
{
"epoch": 1.0627412261017861,
"grad_norm": 0.8534032702445984,
"learning_rate": 5.324084274179228e-05,
"loss": 0.1072,
"step": 2960
},
{
"epoch": 1.0663315680818597,
"grad_norm": 0.7270232439041138,
"learning_rate": 5.292809548719049e-05,
"loss": 0.1101,
"step": 2970
},
{
"epoch": 1.0699219100619335,
"grad_norm": 0.5195777416229248,
"learning_rate": 5.2615233235154616e-05,
"loss": 0.1084,
"step": 2980
},
{
"epoch": 1.073512252042007,
"grad_norm": 0.5684207081794739,
"learning_rate": 5.230226827297395e-05,
"loss": 0.1026,
"step": 2990
},
{
"epoch": 1.0771025940220806,
"grad_norm": 1.3543568849563599,
"learning_rate": 5.198921289197153e-05,
"loss": 0.1026,
"step": 3000
},
{
"epoch": 1.0806929360021542,
"grad_norm": 0.7514908313751221,
"learning_rate": 5.167607938702154e-05,
"loss": 0.1085,
"step": 3010
},
{
"epoch": 1.0842832779822278,
"grad_norm": 0.6683730483055115,
"learning_rate": 5.136288005606631e-05,
"loss": 0.1012,
"step": 3020
},
{
"epoch": 1.0878736199623014,
"grad_norm": 0.5652278065681458,
"learning_rate": 5.1049627199633496e-05,
"loss": 0.119,
"step": 3030
},
{
"epoch": 1.091463961942375,
"grad_norm": 0.7017742395401001,
"learning_rate": 5.073633312035287e-05,
"loss": 0.1057,
"step": 3040
},
{
"epoch": 1.0950543039224485,
"grad_norm": 0.5066478848457336,
"learning_rate": 5.042301012247317e-05,
"loss": 0.1127,
"step": 3050
},
{
"epoch": 1.0986446459025223,
"grad_norm": 0.535321056842804,
"learning_rate": 5.010967051137887e-05,
"loss": 0.1102,
"step": 3060
},
{
"epoch": 1.1022349878825959,
"grad_norm": 0.6270662546157837,
"learning_rate": 4.979632659310695e-05,
"loss": 0.1008,
"step": 3070
},
{
"epoch": 1.1058253298626695,
"grad_norm": 0.748859703540802,
"learning_rate": 4.9482990673863485e-05,
"loss": 0.0995,
"step": 3080
},
{
"epoch": 1.109415671842743,
"grad_norm": 0.500746488571167,
"learning_rate": 4.916967505954046e-05,
"loss": 0.1056,
"step": 3090
},
{
"epoch": 1.1130060138228166,
"grad_norm": 0.5748748183250427,
"learning_rate": 4.885639205523239e-05,
"loss": 0.106,
"step": 3100
},
{
"epoch": 1.1165963558028902,
"grad_norm": 0.593147337436676,
"learning_rate": 4.854315396475304e-05,
"loss": 0.1086,
"step": 3110
},
{
"epoch": 1.1201866977829638,
"grad_norm": 0.6119722127914429,
"learning_rate": 4.822997309015226e-05,
"loss": 0.1035,
"step": 3120
},
{
"epoch": 1.1237770397630373,
"grad_norm": 0.5296047925949097,
"learning_rate": 4.7916861731232846e-05,
"loss": 0.1083,
"step": 3130
},
{
"epoch": 1.127367381743111,
"grad_norm": 0.7060047388076782,
"learning_rate": 4.7603832185067416e-05,
"loss": 0.1,
"step": 3140
},
{
"epoch": 1.1309577237231847,
"grad_norm": 0.4993881583213806,
"learning_rate": 4.729089674551547e-05,
"loss": 0.1057,
"step": 3150
},
{
"epoch": 1.1345480657032583,
"grad_norm": 0.7866911888122559,
"learning_rate": 4.697806770274062e-05,
"loss": 0.0997,
"step": 3160
},
{
"epoch": 1.1381384076833319,
"grad_norm": 0.642524242401123,
"learning_rate": 4.6665357342727865e-05,
"loss": 0.1051,
"step": 3170
},
{
"epoch": 1.1417287496634054,
"grad_norm": 0.5228136777877808,
"learning_rate": 4.6352777946801094e-05,
"loss": 0.1002,
"step": 3180
},
{
"epoch": 1.145319091643479,
"grad_norm": 0.9493293762207031,
"learning_rate": 4.604034179114067e-05,
"loss": 0.1019,
"step": 3190
},
{
"epoch": 1.1489094336235526,
"grad_norm": 0.5647363662719727,
"learning_rate": 4.5728061146301476e-05,
"loss": 0.0915,
"step": 3200
},
{
"epoch": 1.1524997756036262,
"grad_norm": 0.6017284989356995,
"learning_rate": 4.5415948276730805e-05,
"loss": 0.1098,
"step": 3210
},
{
"epoch": 1.1560901175837,
"grad_norm": 0.46670928597450256,
"learning_rate": 4.5104015440286826e-05,
"loss": 0.1056,
"step": 3220
},
{
"epoch": 1.1596804595637735,
"grad_norm": 0.6661453247070312,
"learning_rate": 4.479227488775707e-05,
"loss": 0.0964,
"step": 3230
},
{
"epoch": 1.163270801543847,
"grad_norm": 0.642352819442749,
"learning_rate": 4.4480738862377444e-05,
"loss": 0.0907,
"step": 3240
},
{
"epoch": 1.1668611435239207,
"grad_norm": 2.4927215576171875,
"learning_rate": 4.4169419599351186e-05,
"loss": 0.0969,
"step": 3250
},
{
"epoch": 1.1704514855039942,
"grad_norm": 0.5965277552604675,
"learning_rate": 4.3858329325368536e-05,
"loss": 0.0921,
"step": 3260
},
{
"epoch": 1.1740418274840678,
"grad_norm": 0.503105103969574,
"learning_rate": 4.354748025812639e-05,
"loss": 0.0918,
"step": 3270
},
{
"epoch": 1.1776321694641414,
"grad_norm": 2.0070412158966064,
"learning_rate": 4.323688460584864e-05,
"loss": 0.1008,
"step": 3280
},
{
"epoch": 1.181222511444215,
"grad_norm": 0.5921032428741455,
"learning_rate": 4.292655456680651e-05,
"loss": 0.0992,
"step": 3290
},
{
"epoch": 1.1848128534242885,
"grad_norm": 0.7106916308403015,
"learning_rate": 4.261650232883965e-05,
"loss": 0.0998,
"step": 3300
},
{
"epoch": 1.1884031954043623,
"grad_norm": 0.7483718395233154,
"learning_rate": 4.230674006887734e-05,
"loss": 0.1007,
"step": 3310
},
{
"epoch": 1.191993537384436,
"grad_norm": 0.5854814648628235,
"learning_rate": 4.199727995246041e-05,
"loss": 0.1001,
"step": 3320
},
{
"epoch": 1.1955838793645095,
"grad_norm": 1.022163987159729,
"learning_rate": 4.1688134133263285e-05,
"loss": 0.0989,
"step": 3330
},
{
"epoch": 1.199174221344583,
"grad_norm": 0.6698512434959412,
"learning_rate": 4.1379314752616784e-05,
"loss": 0.0929,
"step": 3340
},
{
"epoch": 1.2027645633246566,
"grad_norm": 0.8445412516593933,
"learning_rate": 4.107083393903126e-05,
"loss": 0.0865,
"step": 3350
},
{
"epoch": 1.2063549053047302,
"grad_norm": 0.9410879611968994,
"learning_rate": 4.076270380772021e-05,
"loss": 0.0942,
"step": 3360
},
{
"epoch": 1.2099452472848038,
"grad_norm": 0.4104284346103668,
"learning_rate": 4.04549364601245e-05,
"loss": 0.0957,
"step": 3370
},
{
"epoch": 1.2135355892648776,
"grad_norm": 0.8418083786964417,
"learning_rate": 4.014754398343716e-05,
"loss": 0.0925,
"step": 3380
},
{
"epoch": 1.2171259312449512,
"grad_norm": 0.5773093700408936,
"learning_rate": 3.984053845012858e-05,
"loss": 0.0921,
"step": 3390
},
{
"epoch": 1.2207162732250247,
"grad_norm": 1.2288339138031006,
"learning_rate": 3.953393191747239e-05,
"loss": 0.089,
"step": 3400
},
{
"epoch": 1.2243066152050983,
"grad_norm": 0.5901492238044739,
"learning_rate": 3.9227736427071995e-05,
"loss": 0.0903,
"step": 3410
},
{
"epoch": 1.2278969571851719,
"grad_norm": 0.6220996379852295,
"learning_rate": 3.892196400438755e-05,
"loss": 0.0958,
"step": 3420
},
{
"epoch": 1.2314872991652455,
"grad_norm": 0.6737645864486694,
"learning_rate": 3.8616626658263825e-05,
"loss": 0.0892,
"step": 3430
},
{
"epoch": 1.235077641145319,
"grad_norm": 0.5661391019821167,
"learning_rate": 3.831173638045839e-05,
"loss": 0.0888,
"step": 3440
},
{
"epoch": 1.2386679831253926,
"grad_norm": 0.7712500095367432,
"learning_rate": 3.800730514517077e-05,
"loss": 0.0859,
"step": 3450
},
{
"epoch": 1.2422583251054662,
"grad_norm": 0.7590687274932861,
"learning_rate": 3.770334490857217e-05,
"loss": 0.0868,
"step": 3460
},
{
"epoch": 1.24584866708554,
"grad_norm": 0.5650063753128052,
"learning_rate": 3.7399867608335895e-05,
"loss": 0.0974,
"step": 3470
},
{
"epoch": 1.2494390090656136,
"grad_norm": 0.8975266218185425,
"learning_rate": 3.709688516316844e-05,
"loss": 0.095,
"step": 3480
},
{
"epoch": 1.2530293510456871,
"grad_norm": 0.5311192274093628,
"learning_rate": 3.679440947234152e-05,
"loss": 0.0925,
"step": 3490
},
{
"epoch": 1.2566196930257607,
"grad_norm": 1.0144147872924805,
"learning_rate": 3.649245241522468e-05,
"loss": 0.0903,
"step": 3500
},
{
"epoch": 1.2602100350058343,
"grad_norm": 0.6833083629608154,
"learning_rate": 3.619102585081872e-05,
"loss": 0.0929,
"step": 3510
},
{
"epoch": 1.2638003769859079,
"grad_norm": 0.6380596160888672,
"learning_rate": 3.589014161728999e-05,
"loss": 0.0787,
"step": 3520
},
{
"epoch": 1.2673907189659814,
"grad_norm": 0.7181170582771301,
"learning_rate": 3.558981153150542e-05,
"loss": 0.0859,
"step": 3530
},
{
"epoch": 1.2709810609460552,
"grad_norm": 0.6842727661132812,
"learning_rate": 3.529004738856853e-05,
"loss": 0.0823,
"step": 3540
},
{
"epoch": 1.2745714029261288,
"grad_norm": 1.5806798934936523,
"learning_rate": 3.4990860961356044e-05,
"loss": 0.085,
"step": 3550
},
{
"epoch": 1.2781617449062024,
"grad_norm": 0.6149685978889465,
"learning_rate": 3.4692264000055594e-05,
"loss": 0.0818,
"step": 3560
},
{
"epoch": 1.281752086886276,
"grad_norm": 0.797741174697876,
"learning_rate": 3.4394268231704266e-05,
"loss": 0.0787,
"step": 3570
},
{
"epoch": 1.2853424288663495,
"grad_norm": 0.5583544373512268,
"learning_rate": 3.4096885359728036e-05,
"loss": 0.0879,
"step": 3580
},
{
"epoch": 1.288932770846423,
"grad_norm": 1.2549068927764893,
"learning_rate": 3.380012706348209e-05,
"loss": 0.085,
"step": 3590
},
{
"epoch": 1.2925231128264967,
"grad_norm": 0.56533282995224,
"learning_rate": 3.350400499779214e-05,
"loss": 0.0932,
"step": 3600
},
{
"epoch": 1.2961134548065703,
"grad_norm": 0.9718196392059326,
"learning_rate": 3.32085307924967e-05,
"loss": 0.0901,
"step": 3610
},
{
"epoch": 1.2997037967866438,
"grad_norm": 0.6769024133682251,
"learning_rate": 3.2913716051990394e-05,
"loss": 0.0845,
"step": 3620
},
{
"epoch": 1.3032941387667174,
"grad_norm": 1.1620076894760132,
"learning_rate": 3.261957235476813e-05,
"loss": 0.0831,
"step": 3630
},
{
"epoch": 1.3068844807467912,
"grad_norm": 0.5092564225196838,
"learning_rate": 3.232611125297035e-05,
"loss": 0.0804,
"step": 3640
},
{
"epoch": 1.3104748227268648,
"grad_norm": 0.42432501912117004,
"learning_rate": 3.2033344271929476e-05,
"loss": 0.0866,
"step": 3650
},
{
"epoch": 1.3140651647069383,
"grad_norm": 0.5998629331588745,
"learning_rate": 3.17412829097171e-05,
"loss": 0.0865,
"step": 3660
},
{
"epoch": 1.317655506687012,
"grad_norm": 0.5421279072761536,
"learning_rate": 3.144993863669251e-05,
"loss": 0.0849,
"step": 3670
},
{
"epoch": 1.3212458486670855,
"grad_norm": 0.6406755447387695,
"learning_rate": 3.115932289505213e-05,
"loss": 0.0814,
"step": 3680
},
{
"epoch": 1.324836190647159,
"grad_norm": 0.9076423048973083,
"learning_rate": 3.086944709838028e-05,
"loss": 0.0898,
"step": 3690
},
{
"epoch": 1.3284265326272329,
"grad_norm": 0.7807140350341797,
"learning_rate": 3.0580322631200756e-05,
"loss": 0.0828,
"step": 3700
},
{
"epoch": 1.3320168746073064,
"grad_norm": 0.6127801537513733,
"learning_rate": 3.029196084852981e-05,
"loss": 0.08,
"step": 3710
},
{
"epoch": 1.33560721658738,
"grad_norm": 0.6226149797439575,
"learning_rate": 3.000437307543017e-05,
"loss": 0.0774,
"step": 3720
},
{
"epoch": 1.3391975585674536,
"grad_norm": 0.4141993820667267,
"learning_rate": 2.9717570606566287e-05,
"loss": 0.0817,
"step": 3730
},
{
"epoch": 1.3427879005475272,
"grad_norm": 0.6416285634040833,
"learning_rate": 2.943156470576073e-05,
"loss": 0.0792,
"step": 3740
},
{
"epoch": 1.3463782425276007,
"grad_norm": 0.6912229657173157,
"learning_rate": 2.914636660555178e-05,
"loss": 0.0743,
"step": 3750
},
{
"epoch": 1.3499685845076743,
"grad_norm": 0.8113506436347961,
"learning_rate": 2.886198750675233e-05,
"loss": 0.0843,
"step": 3760
},
{
"epoch": 1.353558926487748,
"grad_norm": 0.6693570613861084,
"learning_rate": 2.8578438578010053e-05,
"loss": 0.0718,
"step": 3770
},
{
"epoch": 1.3571492684678215,
"grad_norm": 0.6286030411720276,
"learning_rate": 2.8295730955368573e-05,
"loss": 0.0821,
"step": 3780
},
{
"epoch": 1.360739610447895,
"grad_norm": 0.5432600975036621,
"learning_rate": 2.8013875741830264e-05,
"loss": 0.0779,
"step": 3790
},
{
"epoch": 1.3643299524279688,
"grad_norm": 0.5628815293312073,
"learning_rate": 2.7732884006920225e-05,
"loss": 0.076,
"step": 3800
},
{
"epoch": 1.3679202944080424,
"grad_norm": 0.761500895023346,
"learning_rate": 2.745276678625141e-05,
"loss": 0.0869,
"step": 3810
},
{
"epoch": 1.371510636388116,
"grad_norm": 0.5888515710830688,
"learning_rate": 2.717353508109125e-05,
"loss": 0.0812,
"step": 3820
},
{
"epoch": 1.3751009783681896,
"grad_norm": 0.5477086305618286,
"learning_rate": 2.6895199857929643e-05,
"loss": 0.0772,
"step": 3830
},
{
"epoch": 1.3786913203482631,
"grad_norm": 0.5078212022781372,
"learning_rate": 2.6617772048048284e-05,
"loss": 0.0707,
"step": 3840
},
{
"epoch": 1.3822816623283367,
"grad_norm": 0.5893701910972595,
"learning_rate": 2.634126254709125e-05,
"loss": 0.081,
"step": 3850
},
{
"epoch": 1.3858720043084105,
"grad_norm": 0.9726279973983765,
"learning_rate": 2.6065682214637123e-05,
"loss": 0.0868,
"step": 3860
},
{
"epoch": 1.389462346288484,
"grad_norm": 0.5375906229019165,
"learning_rate": 2.5791041873772513e-05,
"loss": 0.0754,
"step": 3870
},
{
"epoch": 1.3930526882685577,
"grad_norm": 0.5937024354934692,
"learning_rate": 2.5517352310667053e-05,
"loss": 0.07,
"step": 3880
},
{
"epoch": 1.3966430302486312,
"grad_norm": 0.5695418119430542,
"learning_rate": 2.524462427414967e-05,
"loss": 0.0712,
"step": 3890
},
{
"epoch": 1.4002333722287048,
"grad_norm": 0.6219804883003235,
"learning_rate": 2.497286847528646e-05,
"loss": 0.0771,
"step": 3900
},
{
"epoch": 1.4038237142087784,
"grad_norm": 0.7533654570579529,
"learning_rate": 2.4702095586960085e-05,
"loss": 0.073,
"step": 3910
},
{
"epoch": 1.407414056188852,
"grad_norm": 0.5750814080238342,
"learning_rate": 2.443231624345061e-05,
"loss": 0.0753,
"step": 3920
},
{
"epoch": 1.4110043981689255,
"grad_norm": 0.5853593349456787,
"learning_rate": 2.416354104001779e-05,
"loss": 0.0754,
"step": 3930
},
{
"epoch": 1.414594740148999,
"grad_norm": 0.4552966356277466,
"learning_rate": 2.389578053248493e-05,
"loss": 0.0753,
"step": 3940
},
{
"epoch": 1.4181850821290727,
"grad_norm": 0.718437671661377,
"learning_rate": 2.362904523682447e-05,
"loss": 0.0758,
"step": 3950
},
{
"epoch": 1.4217754241091463,
"grad_norm": 0.7326009273529053,
"learning_rate": 2.3363345628744832e-05,
"loss": 0.0756,
"step": 3960
},
{
"epoch": 1.42536576608922,
"grad_norm": 0.9607858657836914,
"learning_rate": 2.3098692143279066e-05,
"loss": 0.0719,
"step": 3970
},
{
"epoch": 1.4289561080692936,
"grad_norm": 0.7754957675933838,
"learning_rate": 2.283509517437496e-05,
"loss": 0.0717,
"step": 3980
},
{
"epoch": 1.4325464500493672,
"grad_norm": 0.8900684714317322,
"learning_rate": 2.2572565074486972e-05,
"loss": 0.0757,
"step": 3990
},
{
"epoch": 1.4361367920294408,
"grad_norm": 0.6538607478141785,
"learning_rate": 2.2311112154169507e-05,
"loss": 0.0709,
"step": 4000
},
{
"epoch": 1.4397271340095144,
"grad_norm": 0.6442373991012573,
"learning_rate": 2.2050746681672056e-05,
"loss": 0.0736,
"step": 4010
},
{
"epoch": 1.443317475989588,
"grad_norm": 0.9824745655059814,
"learning_rate": 2.179147888253584e-05,
"loss": 0.0741,
"step": 4020
},
{
"epoch": 1.4469078179696617,
"grad_norm": 0.6084447503089905,
"learning_rate": 2.1533318939192394e-05,
"loss": 0.0675,
"step": 4030
},
{
"epoch": 1.4504981599497353,
"grad_norm": 0.6071482300758362,
"learning_rate": 2.127627699056345e-05,
"loss": 0.0721,
"step": 4040
},
{
"epoch": 1.4540885019298089,
"grad_norm": 0.5101909637451172,
"learning_rate": 2.102036313166289e-05,
"loss": 0.0691,
"step": 4050
},
{
"epoch": 1.4576788439098824,
"grad_norm": 0.5907676815986633,
"learning_rate": 2.076558741320016e-05,
"loss": 0.0624,
"step": 4060
},
{
"epoch": 1.461269185889956,
"grad_norm": 0.7201829552650452,
"learning_rate": 2.0511959841185713e-05,
"loss": 0.0749,
"step": 4070
},
{
"epoch": 1.4648595278700296,
"grad_norm": 0.5254886150360107,
"learning_rate": 2.0259490376537865e-05,
"loss": 0.078,
"step": 4080
},
{
"epoch": 1.4684498698501032,
"grad_norm": 0.4855566620826721,
"learning_rate": 2.0008188934691614e-05,
"loss": 0.0727,
"step": 4090
},
{
"epoch": 1.4720402118301767,
"grad_norm": 0.68084716796875,
"learning_rate": 1.975806538520937e-05,
"loss": 0.0679,
"step": 4100
},
{
"epoch": 1.4756305538102503,
"grad_norm": 0.5893229842185974,
"learning_rate": 1.9509129551393145e-05,
"loss": 0.0709,
"step": 4110
},
{
"epoch": 1.479220895790324,
"grad_norm": 0.5513525605201721,
"learning_rate": 1.9261391209898912e-05,
"loss": 0.0664,
"step": 4120
},
{
"epoch": 1.4828112377703977,
"grad_norm": 0.45056793093681335,
"learning_rate": 1.9014860090352476e-05,
"loss": 0.0635,
"step": 4130
},
{
"epoch": 1.4864015797504713,
"grad_norm": 0.6190094947814941,
"learning_rate": 1.8769545874967566e-05,
"loss": 0.0693,
"step": 4140
},
{
"epoch": 1.4899919217305448,
"grad_norm": 0.6586858034133911,
"learning_rate": 1.852545819816539e-05,
"loss": 0.0652,
"step": 4150
},
{
"epoch": 1.4935822637106184,
"grad_norm": 0.9752713441848755,
"learning_rate": 1.8282606646196353e-05,
"loss": 0.0744,
"step": 4160
},
{
"epoch": 1.497172605690692,
"grad_norm": 0.6681696176528931,
"learning_rate": 1.8041000756763493e-05,
"loss": 0.0671,
"step": 4170
},
{
"epoch": 1.5007629476707658,
"grad_norm": 0.5906854867935181,
"learning_rate": 1.7800650018648024e-05,
"loss": 0.0736,
"step": 4180
},
{
"epoch": 1.5043532896508394,
"grad_norm": 0.6534956097602844,
"learning_rate": 1.7561563871336545e-05,
"loss": 0.0674,
"step": 4190
},
{
"epoch": 1.507943631630913,
"grad_norm": 0.5932891964912415,
"learning_rate": 1.732375170465041e-05,
"loss": 0.0672,
"step": 4200
},
{
"epoch": 1.5115339736109865,
"grad_norm": 0.504921019077301,
"learning_rate": 1.7087222858376834e-05,
"loss": 0.07,
"step": 4210
},
{
"epoch": 1.51512431559106,
"grad_norm": 0.6252205967903137,
"learning_rate": 1.6851986621902265e-05,
"loss": 0.0637,
"step": 4220
},
{
"epoch": 1.5187146575711337,
"grad_norm": 0.47223180532455444,
"learning_rate": 1.6618052233847404e-05,
"loss": 0.0697,
"step": 4230
},
{
"epoch": 1.5223049995512072,
"grad_norm": 0.4429969787597656,
"learning_rate": 1.6385428881704405e-05,
"loss": 0.0664,
"step": 4240
},
{
"epoch": 1.5258953415312808,
"grad_norm": 0.44724294543266296,
"learning_rate": 1.6154125701476092e-05,
"loss": 0.0642,
"step": 4250
},
{
"epoch": 1.5294856835113544,
"grad_norm": 0.49648982286453247,
"learning_rate": 1.59241517773171e-05,
"loss": 0.0616,
"step": 4260
},
{
"epoch": 1.533076025491428,
"grad_norm": 0.3683583736419678,
"learning_rate": 1.5695516141177142e-05,
"loss": 0.0631,
"step": 4270
},
{
"epoch": 1.5366663674715015,
"grad_norm": 0.7180688977241516,
"learning_rate": 1.546822777244627e-05,
"loss": 0.0658,
"step": 4280
},
{
"epoch": 1.5402567094515751,
"grad_norm": 0.6510112881660461,
"learning_rate": 1.5242295597602225e-05,
"loss": 0.0624,
"step": 4290
},
{
"epoch": 1.543847051431649,
"grad_norm": 0.6626403331756592,
"learning_rate": 1.5017728489859862e-05,
"loss": 0.0596,
"step": 4300
},
{
"epoch": 1.5474373934117225,
"grad_norm": 0.7510163187980652,
"learning_rate": 1.4794535268822673e-05,
"loss": 0.0666,
"step": 4310
},
{
"epoch": 1.551027735391796,
"grad_norm": 0.48777294158935547,
"learning_rate": 1.4572724700136386e-05,
"loss": 0.0623,
"step": 4320
},
{
"epoch": 1.5546180773718696,
"grad_norm": 0.6740663647651672,
"learning_rate": 1.4352305495144736e-05,
"loss": 0.0699,
"step": 4330
},
{
"epoch": 1.5582084193519434,
"grad_norm": 0.513523519039154,
"learning_rate": 1.4133286310547294e-05,
"loss": 0.0686,
"step": 4340
},
{
"epoch": 1.561798761332017,
"grad_norm": 0.689508318901062,
"learning_rate": 1.3915675748059537e-05,
"loss": 0.0643,
"step": 4350
},
{
"epoch": 1.5653891033120906,
"grad_norm": 0.7558987736701965,
"learning_rate": 1.3699482354074989e-05,
"loss": 0.0638,
"step": 4360
},
{
"epoch": 1.5689794452921642,
"grad_norm": 1.4819414615631104,
"learning_rate": 1.3484714619329574e-05,
"loss": 0.0579,
"step": 4370
},
{
"epoch": 1.5725697872722377,
"grad_norm": 0.45672255754470825,
"learning_rate": 1.3271380978568187e-05,
"loss": 0.0597,
"step": 4380
},
{
"epoch": 1.5761601292523113,
"grad_norm": 0.7070518136024475,
"learning_rate": 1.3059489810213371e-05,
"loss": 0.0653,
"step": 4390
},
{
"epoch": 1.5797504712323849,
"grad_norm": 0.4744075536727905,
"learning_rate": 1.2849049436036326e-05,
"loss": 0.0609,
"step": 4400
},
{
"epoch": 1.5833408132124585,
"grad_norm": 0.5028963088989258,
"learning_rate": 1.2640068120830035e-05,
"loss": 0.0614,
"step": 4410
},
{
"epoch": 1.586931155192532,
"grad_norm": 1.222612977027893,
"learning_rate": 1.24325540720847e-05,
"loss": 0.058,
"step": 4420
},
{
"epoch": 1.5905214971726056,
"grad_norm": 0.4024209976196289,
"learning_rate": 1.2226515439665392e-05,
"loss": 0.0599,
"step": 4430
},
{
"epoch": 1.5941118391526792,
"grad_norm": 0.5114520788192749,
"learning_rate": 1.2021960315491975e-05,
"loss": 0.0525,
"step": 4440
},
{
"epoch": 1.5977021811327528,
"grad_norm": 0.6782193779945374,
"learning_rate": 1.1818896733221318e-05,
"loss": 0.0605,
"step": 4450
},
{
"epoch": 1.6012925231128265,
"grad_norm": 0.4370103180408478,
"learning_rate": 1.1617332667931763e-05,
"loss": 0.0569,
"step": 4460
},
{
"epoch": 1.6048828650929001,
"grad_norm": 0.5159808993339539,
"learning_rate": 1.1417276035809926e-05,
"loss": 0.0583,
"step": 4470
},
{
"epoch": 1.6084732070729737,
"grad_norm": 0.45791277289390564,
"learning_rate": 1.1218734693839794e-05,
"loss": 0.0639,
"step": 4480
},
{
"epoch": 1.6120635490530473,
"grad_norm": 0.6834966540336609,
"learning_rate": 1.1021716439494156e-05,
"loss": 0.0626,
"step": 4490
},
{
"epoch": 1.615653891033121,
"grad_norm": 0.4611278176307678,
"learning_rate": 1.0826229010428369e-05,
"loss": 0.056,
"step": 4500
},
{
"epoch": 1.6192442330131946,
"grad_norm": 0.6188788414001465,
"learning_rate": 1.0632280084176444e-05,
"loss": 0.0578,
"step": 4510
},
{
"epoch": 1.6228345749932682,
"grad_norm": 0.5647935271263123,
"learning_rate": 1.0439877277849575e-05,
"loss": 0.0586,
"step": 4520
},
{
"epoch": 1.6264249169733418,
"grad_norm": 0.6752751469612122,
"learning_rate": 1.024902814783692e-05,
"loss": 0.0555,
"step": 4530
},
{
"epoch": 1.6300152589534154,
"grad_norm": 0.49796855449676514,
"learning_rate": 1.0059740189508881e-05,
"loss": 0.0556,
"step": 4540
},
{
"epoch": 1.633605600933489,
"grad_norm": 0.6069309115409851,
"learning_rate": 9.872020836922724e-06,
"loss": 0.0564,
"step": 4550
},
{
"epoch": 1.6371959429135625,
"grad_norm": 0.6443465948104858,
"learning_rate": 9.68587746253059e-06,
"loss": 0.0559,
"step": 4560
},
{
"epoch": 1.640786284893636,
"grad_norm": 0.48786768317222595,
"learning_rate": 9.501317376889985e-06,
"loss": 0.0551,
"step": 4570
},
{
"epoch": 1.6443766268737097,
"grad_norm": 0.6036781072616577,
"learning_rate": 9.318347828376639e-06,
"loss": 0.06,
"step": 4580
},
{
"epoch": 1.6479669688537832,
"grad_norm": 0.7226144075393677,
"learning_rate": 9.136976002899855e-06,
"loss": 0.0616,
"step": 4590
},
{
"epoch": 1.6515573108338568,
"grad_norm": 0.4328902065753937,
"learning_rate": 8.957209023620277e-06,
"loss": 0.0504,
"step": 4600
},
{
"epoch": 1.6551476528139304,
"grad_norm": 0.506410539150238,
"learning_rate": 8.779053950670146e-06,
"loss": 0.059,
"step": 4610
},
{
"epoch": 1.658737994794004,
"grad_norm": 0.6660659909248352,
"learning_rate": 8.602517780876007e-06,
"loss": 0.0528,
"step": 4620
},
{
"epoch": 1.6623283367740778,
"grad_norm": 0.5838719606399536,
"learning_rate": 8.427607447483943e-06,
"loss": 0.0561,
"step": 4630
},
{
"epoch": 1.6659186787541513,
"grad_norm": 0.7501543760299683,
"learning_rate": 8.254329819887252e-06,
"loss": 0.0527,
"step": 4640
},
{
"epoch": 1.669509020734225,
"grad_norm": 0.4832637906074524,
"learning_rate": 8.082691703356688e-06,
"loss": 0.0512,
"step": 4650
},
{
"epoch": 1.6730993627142985,
"grad_norm": 0.5931252241134644,
"learning_rate": 7.912699838773151e-06,
"loss": 0.0513,
"step": 4660
},
{
"epoch": 1.6766897046943723,
"grad_norm": 0.5244051218032837,
"learning_rate": 7.744360902363002e-06,
"loss": 0.0544,
"step": 4670
},
{
"epoch": 1.6802800466744459,
"grad_norm": 0.6513102054595947,
"learning_rate": 7.577681505435813e-06,
"loss": 0.054,
"step": 4680
},
{
"epoch": 1.6838703886545194,
"grad_norm": 0.8317810297012329,
"learning_rate": 7.412668194124728e-06,
"loss": 0.0507,
"step": 4690
},
{
"epoch": 1.687460730634593,
"grad_norm": 0.4875124394893646,
"learning_rate": 7.2493274491294285e-06,
"loss": 0.0488,
"step": 4700
},
{
"epoch": 1.6910510726146666,
"grad_norm": 0.4913179576396942,
"learning_rate": 7.087665685461497e-06,
"loss": 0.0551,
"step": 4710
},
{
"epoch": 1.6946414145947402,
"grad_norm": 0.47164708375930786,
"learning_rate": 6.9276892521925816e-06,
"loss": 0.0548,
"step": 4720
},
{
"epoch": 1.6982317565748137,
"grad_norm": 0.39257460832595825,
"learning_rate": 6.769404432204973e-06,
"loss": 0.0532,
"step": 4730
},
{
"epoch": 1.7018220985548873,
"grad_norm": 0.548692524433136,
"learning_rate": 6.61281744194494e-06,
"loss": 0.0503,
"step": 4740
},
{
"epoch": 1.7054124405349609,
"grad_norm": 0.476531445980072,
"learning_rate": 6.4579344311784475e-06,
"loss": 0.0514,
"step": 4750
},
{
"epoch": 1.7090027825150345,
"grad_norm": 0.47037366032600403,
"learning_rate": 6.304761482749777e-06,
"loss": 0.0497,
"step": 4760
},
{
"epoch": 1.712593124495108,
"grad_norm": 0.7144917845726013,
"learning_rate": 6.153304612342514e-06,
"loss": 0.0529,
"step": 4770
},
{
"epoch": 1.7161834664751816,
"grad_norm": 0.7041458487510681,
"learning_rate": 6.003569768243411e-06,
"loss": 0.0493,
"step": 4780
},
{
"epoch": 1.7197738084552554,
"grad_norm": 0.5702252984046936,
"learning_rate": 5.855562831108624e-06,
"loss": 0.0491,
"step": 4790
},
{
"epoch": 1.723364150435329,
"grad_norm": 0.697307288646698,
"learning_rate": 5.709289613732888e-06,
"loss": 0.0533,
"step": 4800
},
{
"epoch": 1.7269544924154026,
"grad_norm": 0.6015498638153076,
"learning_rate": 5.564755860821147e-06,
"loss": 0.0521,
"step": 4810
},
{
"epoch": 1.7305448343954761,
"grad_norm": 0.6062167882919312,
"learning_rate": 5.421967248763021e-06,
"loss": 0.0547,
"step": 4820
},
{
"epoch": 1.73413517637555,
"grad_norm": 0.45276394486427307,
"learning_rate": 5.2809293854097495e-06,
"loss": 0.0553,
"step": 4830
},
{
"epoch": 1.7377255183556235,
"grad_norm": 0.4024350047111511,
"learning_rate": 5.14164780985405e-06,
"loss": 0.0512,
"step": 4840
},
{
"epoch": 1.741315860335697,
"grad_norm": 0.6370827555656433,
"learning_rate": 5.0041279922125705e-06,
"loss": 0.0562,
"step": 4850
},
{
"epoch": 1.7449062023157706,
"grad_norm": 0.5606709122657776,
"learning_rate": 4.868375333411002e-06,
"loss": 0.0556,
"step": 4860
},
{
"epoch": 1.7484965442958442,
"grad_norm": 0.8585699796676636,
"learning_rate": 4.734395164971978e-06,
"loss": 0.0459,
"step": 4870
},
{
"epoch": 1.7520868862759178,
"grad_norm": 0.4308234453201294,
"learning_rate": 4.6021927488057334e-06,
"loss": 0.0471,
"step": 4880
},
{
"epoch": 1.7556772282559914,
"grad_norm": 0.4660848081111908,
"learning_rate": 4.471773277003427e-06,
"loss": 0.0524,
"step": 4890
},
{
"epoch": 1.759267570236065,
"grad_norm": 0.6825345158576965,
"learning_rate": 4.343141871633188e-06,
"loss": 0.0521,
"step": 4900
},
{
"epoch": 1.7628579122161385,
"grad_norm": 0.6137758493423462,
"learning_rate": 4.216303584538988e-06,
"loss": 0.0539,
"step": 4910
},
{
"epoch": 1.766448254196212,
"grad_norm": 0.7231915593147278,
"learning_rate": 4.0912633971422425e-06,
"loss": 0.0466,
"step": 4920
},
{
"epoch": 1.7700385961762857,
"grad_norm": 0.6705979108810425,
"learning_rate": 3.968026220246174e-06,
"loss": 0.047,
"step": 4930
},
{
"epoch": 1.7736289381563592,
"grad_norm": 0.5974612832069397,
"learning_rate": 3.846596893842891e-06,
"loss": 0.0499,
"step": 4940
},
{
"epoch": 1.777219280136433,
"grad_norm": 0.6848942637443542,
"learning_rate": 3.7269801869233845e-06,
"loss": 0.0545,
"step": 4950
},
{
"epoch": 1.7808096221165066,
"grad_norm": 0.6268109083175659,
"learning_rate": 3.6091807972901624e-06,
"loss": 0.0519,
"step": 4960
},
{
"epoch": 1.7843999640965802,
"grad_norm": 0.8246615529060364,
"learning_rate": 3.49320335137282e-06,
"loss": 0.0495,
"step": 4970
},
{
"epoch": 1.7879903060766538,
"grad_norm": 0.7163103222846985,
"learning_rate": 3.3790524040462566e-06,
"loss": 0.0465,
"step": 4980
},
{
"epoch": 1.7915806480567276,
"grad_norm": 0.5779036283493042,
"learning_rate": 3.266732438451842e-06,
"loss": 0.0493,
"step": 4990
},
{
"epoch": 1.7951709900368011,
"grad_norm": 0.5178433060646057,
"learning_rate": 3.1562478658213656e-06,
"loss": 0.0499,
"step": 5000
},
{
"epoch": 1.7987613320168747,
"grad_norm": 0.7967355847358704,
"learning_rate": 3.0476030253037415e-06,
"loss": 0.0502,
"step": 5010
},
{
"epoch": 1.8023516739969483,
"grad_norm": 0.8158264756202698,
"learning_rate": 2.9408021837945942e-06,
"loss": 0.0481,
"step": 5020
},
{
"epoch": 1.8059420159770219,
"grad_norm": 0.43987634778022766,
"learning_rate": 2.8358495357687364e-06,
"loss": 0.0456,
"step": 5030
},
{
"epoch": 1.8095323579570954,
"grad_norm": 0.45231232047080994,
"learning_rate": 2.7327492031153866e-06,
"loss": 0.0474,
"step": 5040
},
{
"epoch": 1.813122699937169,
"grad_norm": 0.799350917339325,
"learning_rate": 2.631505234976311e-06,
"loss": 0.0489,
"step": 5050
},
{
"epoch": 1.8167130419172426,
"grad_norm": 0.5466026663780212,
"learning_rate": 2.5321216075867626e-06,
"loss": 0.0474,
"step": 5060
},
{
"epoch": 1.8203033838973162,
"grad_norm": 0.7424982190132141,
"learning_rate": 2.4346022241193643e-06,
"loss": 0.0452,
"step": 5070
},
{
"epoch": 1.8238937258773897,
"grad_norm": 0.7979154586791992,
"learning_rate": 2.3389509145308076e-06,
"loss": 0.05,
"step": 5080
},
{
"epoch": 1.8274840678574633,
"grad_norm": 0.6414862275123596,
"learning_rate": 2.245171435411414e-06,
"loss": 0.0487,
"step": 5090
},
{
"epoch": 1.8310744098375369,
"grad_norm": 0.5069670081138611,
"learning_rate": 2.1532674698376e-06,
"loss": 0.0464,
"step": 5100
},
{
"epoch": 1.8346647518176105,
"grad_norm": 0.4745350480079651,
"learning_rate": 2.0632426272272464e-06,
"loss": 0.0467,
"step": 5110
},
{
"epoch": 1.8382550937976843,
"grad_norm": 0.5952518582344055,
"learning_rate": 1.975100443197958e-06,
"loss": 0.0508,
"step": 5120
},
{
"epoch": 1.8418454357777578,
"grad_norm": 0.5413398146629333,
"learning_rate": 1.8888443794281618e-06,
"loss": 0.0426,
"step": 5130
},
{
"epoch": 1.8454357777578314,
"grad_norm": 0.6297146677970886,
"learning_rate": 1.8044778235211723e-06,
"loss": 0.0523,
"step": 5140
},
{
"epoch": 1.849026119737905,
"grad_norm": 0.458870530128479,
"learning_rate": 1.72200408887217e-06,
"loss": 0.0462,
"step": 5150
},
{
"epoch": 1.8526164617179788,
"grad_norm": 0.6490904688835144,
"learning_rate": 1.6414264145380442e-06,
"loss": 0.0484,
"step": 5160
},
{
"epoch": 1.8562068036980524,
"grad_norm": 0.7383233904838562,
"learning_rate": 1.562747965110195e-06,
"loss": 0.0484,
"step": 5170
},
{
"epoch": 1.859797145678126,
"grad_norm": 2.4921016693115234,
"learning_rate": 1.4859718305902326e-06,
"loss": 0.046,
"step": 5180
},
{
"epoch": 1.8633874876581995,
"grad_norm": 1.6146339178085327,
"learning_rate": 1.411101026268652e-06,
"loss": 0.043,
"step": 5190
},
{
"epoch": 1.866977829638273,
"grad_norm": 0.47561097145080566,
"learning_rate": 1.3381384926063833e-06,
"loss": 0.0467,
"step": 5200
},
{
"epoch": 1.8705681716183467,
"grad_norm": 0.5113374590873718,
"learning_rate": 1.2670870951193292e-06,
"loss": 0.0475,
"step": 5210
},
{
"epoch": 1.8741585135984202,
"grad_norm": 0.5401134490966797,
"learning_rate": 1.197949624265776e-06,
"loss": 0.0482,
"step": 5220
},
{
"epoch": 1.8777488555784938,
"grad_norm": 0.4193181097507477,
"learning_rate": 1.1307287953368995e-06,
"loss": 0.0472,
"step": 5230
},
{
"epoch": 1.8813391975585674,
"grad_norm": 0.45812806487083435,
"learning_rate": 1.065427248350015e-06,
"loss": 0.0477,
"step": 5240
},
{
"epoch": 1.884929539538641,
"grad_norm": 0.8749078512191772,
"learning_rate": 1.0020475479449731e-06,
"loss": 0.0507,
"step": 5250
},
{
"epoch": 1.8885198815187145,
"grad_norm": 0.48960697650909424,
"learning_rate": 9.405921832833841e-07,
"loss": 0.046,
"step": 5260
},
{
"epoch": 1.892110223498788,
"grad_norm": 0.7578288316726685,
"learning_rate": 8.810635679509071e-07,
"loss": 0.0471,
"step": 5270
},
{
"epoch": 1.895700565478862,
"grad_norm": 0.6842608451843262,
"learning_rate": 8.23464039862426e-07,
"loss": 0.0445,
"step": 5280
},
{
"epoch": 1.8992909074589355,
"grad_norm": 0.5089036226272583,
"learning_rate": 7.67795861170234e-07,
"loss": 0.0457,
"step": 5290
},
{
"epoch": 1.902881249439009,
"grad_norm": 0.5393949151039124,
"learning_rate": 7.140612181752048e-07,
"loss": 0.0456,
"step": 5300
},
{
"epoch": 1.9064715914190826,
"grad_norm": 0.9976809024810791,
"learning_rate": 6.622622212409058e-07,
"loss": 0.047,
"step": 5310
},
{
"epoch": 1.9100619333991564,
"grad_norm": 0.5556519031524658,
"learning_rate": 6.124009047107471e-07,
"loss": 0.0517,
"step": 5320
},
{
"epoch": 1.91365227537923,
"grad_norm": 0.534712553024292,
"learning_rate": 5.644792268280574e-07,
"loss": 0.0427,
"step": 5330
},
{
"epoch": 1.9172426173593036,
"grad_norm": 0.7053726315498352,
"learning_rate": 5.18499069659184e-07,
"loss": 0.0455,
"step": 5340
},
{
"epoch": 1.9208329593393771,
"grad_norm": 0.5793641209602356,
"learning_rate": 4.744622390195963e-07,
"loss": 0.0513,
"step": 5350
},
{
"epoch": 1.9244233013194507,
"grad_norm": 0.4043155908584595,
"learning_rate": 4.323704644029203e-07,
"loss": 0.0501,
"step": 5360
},
{
"epoch": 1.9280136432995243,
"grad_norm": 0.4776788353919983,
"learning_rate": 3.9222539891307086e-07,
"loss": 0.0415,
"step": 5370
},
{
"epoch": 1.9316039852795979,
"grad_norm": 0.6649408340454102,
"learning_rate": 3.5402861919928697e-07,
"loss": 0.0451,
"step": 5380
},
{
"epoch": 1.9351943272596714,
"grad_norm": 3.3624627590179443,
"learning_rate": 3.1778162539421453e-07,
"loss": 0.0472,
"step": 5390
},
{
"epoch": 1.938784669239745,
"grad_norm": 0.5529268980026245,
"learning_rate": 2.8348584105501453e-07,
"loss": 0.045,
"step": 5400
},
{
"epoch": 1.9423750112198186,
"grad_norm": 0.6905925273895264,
"learning_rate": 2.511426131074246e-07,
"loss": 0.0452,
"step": 5410
},
{
"epoch": 1.9459653531998922,
"grad_norm": 0.6144551038742065,
"learning_rate": 2.2075321179289565e-07,
"loss": 0.0422,
"step": 5420
},
{
"epoch": 1.9495556951799657,
"grad_norm": 1.2887723445892334,
"learning_rate": 1.9231883061866517e-07,
"loss": 0.0441,
"step": 5430
},
{
"epoch": 1.9531460371600395,
"grad_norm": 0.7968602776527405,
"learning_rate": 1.6584058631090582e-07,
"loss": 0.0455,
"step": 5440
},
{
"epoch": 1.9567363791401131,
"grad_norm": 0.7239225506782532,
"learning_rate": 1.4131951877087158e-07,
"loss": 0.0461,
"step": 5450
},
{
"epoch": 1.9603267211201867,
"grad_norm": 0.6258605718612671,
"learning_rate": 1.1875659103404157e-07,
"loss": 0.0449,
"step": 5460
},
{
"epoch": 1.9639170631002603,
"grad_norm": 0.7048450708389282,
"learning_rate": 9.815268923230592e-08,
"loss": 0.0469,
"step": 5470
},
{
"epoch": 1.967507405080334,
"grad_norm": 0.6698242425918579,
"learning_rate": 7.95086225591657e-08,
"loss": 0.0469,
"step": 5480
},
{
"epoch": 1.9710977470604076,
"grad_norm": 0.612483561038971,
"learning_rate": 6.282512323795287e-08,
"loss": 0.0432,
"step": 5490
},
{
"epoch": 1.9746880890404812,
"grad_norm": 1.0906122922897339,
"learning_rate": 4.81028464930755e-08,
"loss": 0.0439,
"step": 5500
},
{
"epoch": 1.9782784310205548,
"grad_norm": 0.5854030847549438,
"learning_rate": 3.534237052426059e-08,
"loss": 0.0461,
"step": 5510
},
{
"epoch": 1.9818687730006284,
"grad_norm": 0.5965482592582703,
"learning_rate": 2.4544196483888837e-08,
"loss": 0.0449,
"step": 5520
},
{
"epoch": 1.985459114980702,
"grad_norm": 1.0227429866790771,
"learning_rate": 1.5708748457271548e-08,
"loss": 0.0476,
"step": 5530
},
{
"epoch": 1.9890494569607755,
"grad_norm": 0.506277859210968,
"learning_rate": 8.836373446019507e-09,
"loss": 0.0477,
"step": 5540
},
{
"epoch": 1.992639798940849,
"grad_norm": 0.4811525344848633,
"learning_rate": 3.927341354420522e-09,
"loss": 0.0468,
"step": 5550
},
{
"epoch": 1.9962301409209227,
"grad_norm": 0.4584663212299347,
"learning_rate": 9.818449787979412e-10,
"loss": 0.0424,
"step": 5560
},
{
"epoch": 1.9998204829009962,
"grad_norm": 0.6924448609352112,
"learning_rate": 0.0,
"loss": 0.0443,
"step": 5570
}
],
"logging_steps": 10,
"max_steps": 5570,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.008182835124896e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}