AnonyResearcher's picture
Upload folder using huggingface_hub
14dd23d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 95.72649572649573,
"eval_steps": 200,
"global_step": 2100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.045584045584045586,
"grad_norm": 62.2914085542995,
"learning_rate": 2.3809523809523814e-05,
"loss": 43.092,
"loss_layer_12_head": 8.308940887451172,
"loss_layer_18_head": 7.555350303649902,
"loss_layer_24_head": 6.16204833984375,
"loss_layer_30_head": 5.288188934326172,
"loss_layer_36_head": 3.8764476776123047,
"loss_layer_42_head": 2.1485373973846436,
"loss_layer_6_head": 9.3965425491333,
"step": 1
},
{
"epoch": 0.22792022792022792,
"grad_norm": 19.455193806314377,
"learning_rate": 0.00011904761904761905,
"loss": 33.3367,
"loss_layer_12_head": 6.647928714752197,
"loss_layer_18_head": 5.974323272705078,
"loss_layer_24_head": 4.755464553833008,
"loss_layer_30_head": 3.9858086109161377,
"loss_layer_36_head": 2.8238697052001953,
"loss_layer_42_head": 1.47384774684906,
"loss_layer_6_head": 7.75436544418335,
"step": 5
},
{
"epoch": 0.45584045584045585,
"grad_norm": 19.77362433069107,
"learning_rate": 0.0002380952380952381,
"loss": 24.502,
"loss_layer_12_head": 5.146442413330078,
"loss_layer_18_head": 4.827411651611328,
"loss_layer_24_head": 3.496838331222534,
"loss_layer_30_head": 2.7353873252868652,
"loss_layer_36_head": 1.7639620304107666,
"loss_layer_42_head": 1.1026753187179565,
"loss_layer_6_head": 5.473020553588867,
"step": 10
},
{
"epoch": 0.6837606837606838,
"grad_norm": 5.825213351919792,
"learning_rate": 0.00035714285714285714,
"loss": 19.6598,
"loss_layer_12_head": 4.174715042114258,
"loss_layer_18_head": 3.8590176105499268,
"loss_layer_24_head": 2.7612555027008057,
"loss_layer_30_head": 2.1735005378723145,
"loss_layer_36_head": 1.4102507829666138,
"loss_layer_42_head": 0.7683950066566467,
"loss_layer_6_head": 4.8184590339660645,
"step": 15
},
{
"epoch": 0.9116809116809117,
"grad_norm": 3.6611272833257194,
"learning_rate": 0.0004761904761904762,
"loss": 15.5497,
"loss_layer_12_head": 3.2854111194610596,
"loss_layer_18_head": 3.0585737228393555,
"loss_layer_24_head": 2.142367362976074,
"loss_layer_30_head": 1.661052942276001,
"loss_layer_36_head": 1.0766279697418213,
"loss_layer_42_head": 0.6817001104354858,
"loss_layer_6_head": 3.832777738571167,
"step": 20
},
{
"epoch": 1.1396011396011396,
"grad_norm": 2.0539370731960522,
"learning_rate": 0.0005952380952380952,
"loss": 13.1226,
"loss_layer_12_head": 2.8252596855163574,
"loss_layer_18_head": 2.6267945766448975,
"loss_layer_24_head": 1.7749992609024048,
"loss_layer_30_head": 1.3407100439071655,
"loss_layer_36_head": 0.85447758436203,
"loss_layer_42_head": 0.5364646911621094,
"loss_layer_6_head": 3.3687548637390137,
"step": 25
},
{
"epoch": 1.3675213675213675,
"grad_norm": 1.3678909297039978,
"learning_rate": 0.0007142857142857143,
"loss": 11.2545,
"loss_layer_12_head": 2.3465054035186768,
"loss_layer_18_head": 2.1682493686676025,
"loss_layer_24_head": 1.4258038997650146,
"loss_layer_30_head": 1.066014051437378,
"loss_layer_36_head": 0.7038318514823914,
"loss_layer_42_head": 0.4449593424797058,
"loss_layer_6_head": 2.874427080154419,
"step": 30
},
{
"epoch": 1.5954415954415955,
"grad_norm": 1.0597925214891006,
"learning_rate": 0.0008333333333333333,
"loss": 10.0245,
"loss_layer_12_head": 2.145514965057373,
"loss_layer_18_head": 1.9670956134796143,
"loss_layer_24_head": 1.2736284732818604,
"loss_layer_30_head": 0.9368025064468384,
"loss_layer_36_head": 0.5962619185447693,
"loss_layer_42_head": 0.37413108348846436,
"loss_layer_6_head": 2.68680477142334,
"step": 35
},
{
"epoch": 1.8233618233618234,
"grad_norm": 0.8067240935550286,
"learning_rate": 0.0009523809523809524,
"loss": 8.9233,
"loss_layer_12_head": 1.9344780445098877,
"loss_layer_18_head": 1.770326852798462,
"loss_layer_24_head": 1.1205565929412842,
"loss_layer_30_head": 0.8123494982719421,
"loss_layer_36_head": 0.5198163390159607,
"loss_layer_42_head": 0.3280097246170044,
"loss_layer_6_head": 2.4362289905548096,
"step": 40
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.6437708908708522,
"learning_rate": 0.0010714285714285715,
"loss": 8.1196,
"loss_layer_12_head": 1.773280143737793,
"loss_layer_18_head": 1.600731611251831,
"loss_layer_24_head": 0.9964693188667297,
"loss_layer_30_head": 0.7136993408203125,
"loss_layer_36_head": 0.4567667543888092,
"loss_layer_42_head": 0.28257232904434204,
"loss_layer_6_head": 2.308173656463623,
"step": 45
},
{
"epoch": 2.2792022792022792,
"grad_norm": 0.5437222788803687,
"learning_rate": 0.0011904761904761904,
"loss": 7.3482,
"loss_layer_12_head": 1.6736904382705688,
"loss_layer_18_head": 1.4971253871917725,
"loss_layer_24_head": 0.9068229794502258,
"loss_layer_30_head": 0.6316335797309875,
"loss_layer_36_head": 0.4000738561153412,
"loss_layer_42_head": 0.24324941635131836,
"loss_layer_6_head": 2.207364082336426,
"step": 50
},
{
"epoch": 2.5071225071225074,
"grad_norm": 0.47033294183192215,
"learning_rate": 0.0013095238095238097,
"loss": 6.782,
"loss_layer_12_head": 1.4841620922088623,
"loss_layer_18_head": 1.3070929050445557,
"loss_layer_24_head": 0.774025559425354,
"loss_layer_30_head": 0.5281109809875488,
"loss_layer_36_head": 0.3339698314666748,
"loss_layer_42_head": 0.1966492384672165,
"loss_layer_6_head": 2.0062077045440674,
"step": 55
},
{
"epoch": 2.735042735042735,
"grad_norm": 0.41848236144680057,
"learning_rate": 0.0014285714285714286,
"loss": 6.3554,
"loss_layer_12_head": 1.410228967666626,
"loss_layer_18_head": 1.23647141456604,
"loss_layer_24_head": 0.727624237537384,
"loss_layer_30_head": 0.49607592821121216,
"loss_layer_36_head": 0.3141021132469177,
"loss_layer_42_head": 0.18274283409118652,
"loss_layer_6_head": 1.926108956336975,
"step": 60
},
{
"epoch": 2.962962962962963,
"grad_norm": 0.36960333783062255,
"learning_rate": 0.0015476190476190477,
"loss": 5.8987,
"loss_layer_12_head": 1.3281461000442505,
"loss_layer_18_head": 1.1566855907440186,
"loss_layer_24_head": 0.6787080764770508,
"loss_layer_30_head": 0.45799437165260315,
"loss_layer_36_head": 0.29089364409446716,
"loss_layer_42_head": 0.1687871664762497,
"loss_layer_6_head": 1.8334720134735107,
"step": 65
},
{
"epoch": 3.190883190883191,
"grad_norm": 0.34506211668913744,
"learning_rate": 0.0016666666666666666,
"loss": 5.4375,
"loss_layer_12_head": 1.199660062789917,
"loss_layer_18_head": 1.032221794128418,
"loss_layer_24_head": 0.593693733215332,
"loss_layer_30_head": 0.3999592959880829,
"loss_layer_36_head": 0.258480966091156,
"loss_layer_42_head": 0.1496810019016266,
"loss_layer_6_head": 1.7067362070083618,
"step": 70
},
{
"epoch": 3.4188034188034186,
"grad_norm": 0.3151835312525889,
"learning_rate": 0.0017857142857142859,
"loss": 5.1469,
"loss_layer_12_head": 1.2106449604034424,
"loss_layer_18_head": 1.0451469421386719,
"loss_layer_24_head": 0.5938726663589478,
"loss_layer_30_head": 0.39307016134262085,
"loss_layer_36_head": 0.2464793175458908,
"loss_layer_42_head": 0.13884057104587555,
"loss_layer_6_head": 1.7400537729263306,
"step": 75
},
{
"epoch": 3.646723646723647,
"grad_norm": 0.3082388550908825,
"learning_rate": 0.0019047619047619048,
"loss": 5.0216,
"loss_layer_12_head": 1.1534507274627686,
"loss_layer_18_head": 0.9869141578674316,
"loss_layer_24_head": 0.5700754523277283,
"loss_layer_30_head": 0.37610307335853577,
"loss_layer_36_head": 0.23609893023967743,
"loss_layer_42_head": 0.13269147276878357,
"loss_layer_6_head": 1.6694886684417725,
"step": 80
},
{
"epoch": 3.8746438746438745,
"grad_norm": 0.3122891528340703,
"learning_rate": 0.002023809523809524,
"loss": 4.7593,
"loss_layer_12_head": 1.0834763050079346,
"loss_layer_18_head": 0.9217953681945801,
"loss_layer_24_head": 0.5253178477287292,
"loss_layer_30_head": 0.3411971926689148,
"loss_layer_36_head": 0.2131468951702118,
"loss_layer_42_head": 0.12173072248697281,
"loss_layer_6_head": 1.5824086666107178,
"step": 85
},
{
"epoch": 4.102564102564102,
"grad_norm": 0.3089845680262722,
"learning_rate": 0.002142857142857143,
"loss": 4.4801,
"loss_layer_12_head": 1.0101158618927002,
"loss_layer_18_head": 0.8577353358268738,
"loss_layer_24_head": 0.49227556586265564,
"loss_layer_30_head": 0.32580453157424927,
"loss_layer_36_head": 0.21231690049171448,
"loss_layer_42_head": 0.1251501739025116,
"loss_layer_6_head": 1.4942306280136108,
"step": 90
},
{
"epoch": 4.330484330484331,
"grad_norm": 0.3021335552511279,
"learning_rate": 0.002261904761904762,
"loss": 4.2004,
"loss_layer_12_head": 0.9157131314277649,
"loss_layer_18_head": 0.7664994597434998,
"loss_layer_24_head": 0.4360291361808777,
"loss_layer_30_head": 0.28324925899505615,
"loss_layer_36_head": 0.18157120048999786,
"loss_layer_42_head": 0.10644565522670746,
"loss_layer_6_head": 1.3922828435897827,
"step": 95
},
{
"epoch": 4.5584045584045585,
"grad_norm": 0.35656956962665187,
"learning_rate": 0.0023809523809523807,
"loss": 4.0681,
"loss_layer_12_head": 0.9036925435066223,
"loss_layer_18_head": 0.7571278810501099,
"loss_layer_24_head": 0.43527936935424805,
"loss_layer_30_head": 0.28879284858703613,
"loss_layer_36_head": 0.18665608763694763,
"loss_layer_42_head": 0.110997773706913,
"loss_layer_6_head": 1.3775255680084229,
"step": 100
},
{
"epoch": 4.786324786324786,
"grad_norm": 0.7537150260193732,
"learning_rate": 0.0025,
"loss": 4.1325,
"loss_layer_12_head": 0.9238117933273315,
"loss_layer_18_head": 0.7721032500267029,
"loss_layer_24_head": 0.43952664732933044,
"loss_layer_30_head": 0.28288325667381287,
"loss_layer_36_head": 0.18152464926242828,
"loss_layer_42_head": 0.21891792118549347,
"loss_layer_6_head": 1.3878661394119263,
"step": 105
},
{
"epoch": 5.014245014245014,
"grad_norm": 0.7856472586064206,
"learning_rate": 0.0026190476190476194,
"loss": 4.0448,
"loss_layer_12_head": 0.8572763204574585,
"loss_layer_18_head": 0.7265270948410034,
"loss_layer_24_head": 0.40768003463745117,
"loss_layer_30_head": 0.2640893757343292,
"loss_layer_36_head": 0.17229382693767548,
"loss_layer_42_head": 0.22290952503681183,
"loss_layer_6_head": 1.2932946681976318,
"step": 110
},
{
"epoch": 5.2421652421652425,
"grad_norm": 1.8914483067832557,
"learning_rate": 0.0027380952380952383,
"loss": 4.0412,
"loss_layer_12_head": 0.8094813227653503,
"loss_layer_18_head": 0.9888504147529602,
"loss_layer_24_head": 0.42250028252601624,
"loss_layer_30_head": 0.25133341550827026,
"loss_layer_36_head": 0.1654709130525589,
"loss_layer_42_head": 0.1770133078098297,
"loss_layer_6_head": 1.2574403285980225,
"step": 115
},
{
"epoch": 5.47008547008547,
"grad_norm": 1.9425500774924052,
"learning_rate": 0.002857142857142857,
"loss": 4.6802,
"loss_layer_12_head": 0.8291244506835938,
"loss_layer_18_head": 0.9608370065689087,
"loss_layer_24_head": 0.9025907516479492,
"loss_layer_30_head": 0.3785189986228943,
"loss_layer_36_head": 0.2427733689546585,
"loss_layer_42_head": 0.1588081568479538,
"loss_layer_6_head": 1.2310930490493774,
"step": 120
},
{
"epoch": 5.698005698005698,
"grad_norm": 1.409437774928848,
"learning_rate": 0.002976190476190476,
"loss": 4.6225,
"loss_layer_12_head": 1.0028345584869385,
"loss_layer_18_head": 0.9019104242324829,
"loss_layer_24_head": 0.7798857688903809,
"loss_layer_30_head": 0.373049795627594,
"loss_layer_36_head": 0.24940261244773865,
"loss_layer_42_head": 0.16313040256500244,
"loss_layer_6_head": 1.1742385625839233,
"step": 125
},
{
"epoch": 5.925925925925926,
"grad_norm": 1.0378215832119297,
"learning_rate": 0.0030952380952380953,
"loss": 4.4125,
"loss_layer_12_head": 1.0007030963897705,
"loss_layer_18_head": 0.8221645355224609,
"loss_layer_24_head": 0.6742190718650818,
"loss_layer_30_head": 0.3384473919868469,
"loss_layer_36_head": 0.2220737487077713,
"loss_layer_42_head": 0.12549471855163574,
"loss_layer_6_head": 1.171250581741333,
"step": 130
},
{
"epoch": 6.153846153846154,
"grad_norm": 1.4926191542432983,
"learning_rate": 0.0032142857142857147,
"loss": 3.9975,
"loss_layer_12_head": 0.9102523922920227,
"loss_layer_18_head": 0.7440574765205383,
"loss_layer_24_head": 0.5891538262367249,
"loss_layer_30_head": 0.3084535002708435,
"loss_layer_36_head": 0.19867992401123047,
"loss_layer_42_head": 0.12676474452018738,
"loss_layer_6_head": 1.178501844406128,
"step": 135
},
{
"epoch": 6.381766381766382,
"grad_norm": 1.2078362363188435,
"learning_rate": 0.003333333333333333,
"loss": 3.7208,
"loss_layer_12_head": 0.82643061876297,
"loss_layer_18_head": 0.6635754704475403,
"loss_layer_24_head": 0.49496936798095703,
"loss_layer_30_head": 0.25951477885246277,
"loss_layer_36_head": 0.1663007289171219,
"loss_layer_42_head": 0.09452775865793228,
"loss_layer_6_head": 1.1378257274627686,
"step": 140
},
{
"epoch": 6.60968660968661,
"grad_norm": 1.3611292633005208,
"learning_rate": 0.0034523809523809524,
"loss": 3.621,
"loss_layer_12_head": 0.7876842617988586,
"loss_layer_18_head": 0.6401599645614624,
"loss_layer_24_head": 0.46546655893325806,
"loss_layer_30_head": 0.2504613399505615,
"loss_layer_36_head": 0.15819665789604187,
"loss_layer_42_head": 0.09289596974849701,
"loss_layer_6_head": 1.1508190631866455,
"step": 145
},
{
"epoch": 6.837606837606837,
"grad_norm": 2.068883579723556,
"learning_rate": 0.0035714285714285718,
"loss": 3.618,
"loss_layer_12_head": 0.8008605241775513,
"loss_layer_18_head": 0.6416077613830566,
"loss_layer_24_head": 0.4489217698574066,
"loss_layer_30_head": 0.25658518075942993,
"loss_layer_36_head": 0.18197140097618103,
"loss_layer_42_head": 0.09827812016010284,
"loss_layer_6_head": 1.2267687320709229,
"step": 150
},
{
"epoch": 7.065527065527066,
"grad_norm": 1.8566033890581124,
"learning_rate": 0.0036904761904761906,
"loss": 3.6802,
"loss_layer_12_head": 0.769829273223877,
"loss_layer_18_head": 0.6047574281692505,
"loss_layer_24_head": 0.404893159866333,
"loss_layer_30_head": 0.23501157760620117,
"loss_layer_36_head": 0.21422222256660461,
"loss_layer_42_head": 0.13609819114208221,
"loss_layer_6_head": 1.2630399465560913,
"step": 155
},
{
"epoch": 7.293447293447294,
"grad_norm": 1.3020341739606298,
"learning_rate": 0.0038095238095238095,
"loss": 3.5461,
"loss_layer_12_head": 0.6948034763336182,
"loss_layer_18_head": 0.5865488052368164,
"loss_layer_24_head": 0.36058539152145386,
"loss_layer_30_head": 0.254138708114624,
"loss_layer_36_head": 0.1751880794763565,
"loss_layer_42_head": 0.15007896721363068,
"loss_layer_6_head": 1.2036831378936768,
"step": 160
},
{
"epoch": 7.521367521367521,
"grad_norm": 1.2579419895110615,
"learning_rate": 0.003928571428571429,
"loss": 3.4722,
"loss_layer_12_head": 0.7162200212478638,
"loss_layer_18_head": 0.6314468383789062,
"loss_layer_24_head": 0.3721396327018738,
"loss_layer_30_head": 0.24703256785869598,
"loss_layer_36_head": 0.17532584071159363,
"loss_layer_42_head": 0.1474941372871399,
"loss_layer_6_head": 1.1787517070770264,
"step": 165
},
{
"epoch": 7.749287749287749,
"grad_norm": 1.0118058129375738,
"learning_rate": 0.004047619047619048,
"loss": 3.447,
"loss_layer_12_head": 0.7314745187759399,
"loss_layer_18_head": 0.6032330989837646,
"loss_layer_24_head": 0.3913646340370178,
"loss_layer_30_head": 0.23757806420326233,
"loss_layer_36_head": 0.178132563829422,
"loss_layer_42_head": 0.1304202377796173,
"loss_layer_6_head": 1.1239389181137085,
"step": 170
},
{
"epoch": 7.977207977207978,
"grad_norm": 1.312823729396693,
"learning_rate": 0.004166666666666667,
"loss": 3.4955,
"loss_layer_12_head": 0.8259456753730774,
"loss_layer_18_head": 0.5874773263931274,
"loss_layer_24_head": 0.3818410336971283,
"loss_layer_30_head": 0.2627549171447754,
"loss_layer_36_head": 0.1600283682346344,
"loss_layer_42_head": 0.11221656948328018,
"loss_layer_6_head": 1.097424864768982,
"step": 175
},
{
"epoch": 8.205128205128204,
"grad_norm": 1.2764585863517615,
"learning_rate": 0.004285714285714286,
"loss": 3.2662,
"loss_layer_12_head": 0.8447945713996887,
"loss_layer_18_head": 0.57573401927948,
"loss_layer_24_head": 0.3404631018638611,
"loss_layer_30_head": 0.23921921849250793,
"loss_layer_36_head": 0.13854867219924927,
"loss_layer_42_head": 0.09624181687831879,
"loss_layer_6_head": 1.0239530801773071,
"step": 180
},
{
"epoch": 8.433048433048434,
"grad_norm": 2.174485752198311,
"learning_rate": 0.004404761904761904,
"loss": 3.8994,
"loss_layer_12_head": 0.8033272624015808,
"loss_layer_18_head": 1.1929363012313843,
"loss_layer_24_head": 0.34211626648902893,
"loss_layer_30_head": 0.24058055877685547,
"loss_layer_36_head": 0.19761431217193604,
"loss_layer_42_head": 0.09599171578884125,
"loss_layer_6_head": 1.062518835067749,
"step": 185
},
{
"epoch": 8.660968660968662,
"grad_norm": 1.7449271648991171,
"learning_rate": 0.004523809523809524,
"loss": 4.0437,
"loss_layer_12_head": 0.7386378049850464,
"loss_layer_18_head": 1.2550582885742188,
"loss_layer_24_head": 0.32292360067367554,
"loss_layer_30_head": 0.22687847912311554,
"loss_layer_36_head": 0.2144986093044281,
"loss_layer_42_head": 0.130828857421875,
"loss_layer_6_head": 1.0966918468475342,
"step": 190
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.7126154778538007,
"learning_rate": 0.004642857142857143,
"loss": 3.9797,
"loss_layer_12_head": 0.7142958641052246,
"loss_layer_18_head": 1.0787105560302734,
"loss_layer_24_head": 0.3428736925125122,
"loss_layer_30_head": 0.2274586260318756,
"loss_layer_36_head": 0.1928836703300476,
"loss_layer_42_head": 0.14408032596111298,
"loss_layer_6_head": 1.2979519367218018,
"step": 195
},
{
"epoch": 9.116809116809117,
"grad_norm": 1.1725132505587743,
"learning_rate": 0.0047619047619047615,
"loss": 3.7028,
"loss_layer_12_head": 0.6508289575576782,
"loss_layer_18_head": 0.9081010818481445,
"loss_layer_24_head": 0.3285003900527954,
"loss_layer_30_head": 0.22624602913856506,
"loss_layer_36_head": 0.16938458383083344,
"loss_layer_42_head": 0.11536189168691635,
"loss_layer_6_head": 1.2216272354125977,
"step": 200
},
{
"epoch": 9.116809116809117,
"eval_loss": 4.635208606719971,
"eval_loss_layer_12_head": 0.8311116099357605,
"eval_loss_layer_18_head": 1.042922019958496,
"eval_loss_layer_24_head": 0.49305030703544617,
"eval_loss_layer_30_head": 0.4456557333469391,
"eval_loss_layer_36_head": 0.23485398292541504,
"eval_loss_layer_42_head": 0.1599181890487671,
"eval_loss_layer_6_head": 1.4034830331802368,
"eval_runtime": 5.0189,
"eval_samples_per_second": 6.575,
"eval_steps_per_second": 0.598,
"step": 200
},
{
"epoch": 9.344729344729345,
"grad_norm": 1.127047766220966,
"learning_rate": 0.004880952380952381,
"loss": 3.4979,
"loss_layer_12_head": 0.6423342227935791,
"loss_layer_18_head": 0.8167837262153625,
"loss_layer_24_head": 0.3226773142814636,
"loss_layer_30_head": 0.3336021602153778,
"loss_layer_36_head": 0.1563200056552887,
"loss_layer_42_head": 0.10249624401330948,
"loss_layer_6_head": 1.1420187950134277,
"step": 205
},
{
"epoch": 9.572649572649572,
"grad_norm": 0.8579363052095977,
"learning_rate": 0.005,
"loss": 3.3994,
"loss_layer_12_head": 0.6976842880249023,
"loss_layer_18_head": 0.7718448638916016,
"loss_layer_24_head": 0.33865469694137573,
"loss_layer_30_head": 0.30089613795280457,
"loss_layer_36_head": 0.15890909731388092,
"loss_layer_42_head": 0.10226695239543915,
"loss_layer_6_head": 1.0943472385406494,
"step": 210
},
{
"epoch": 9.8005698005698,
"grad_norm": 0.6413467845692354,
"learning_rate": 0.004999913657690942,
"loss": 3.3095,
"loss_layer_12_head": 0.6870448589324951,
"loss_layer_18_head": 0.6909510493278503,
"loss_layer_24_head": 0.323662132024765,
"loss_layer_30_head": 0.26362109184265137,
"loss_layer_36_head": 0.16587287187576294,
"loss_layer_42_head": 0.09277474135160446,
"loss_layer_6_head": 1.0225787162780762,
"step": 215
},
{
"epoch": 10.028490028490028,
"grad_norm": 0.7352495226591444,
"learning_rate": 0.004999654636727764,
"loss": 3.1636,
"loss_layer_12_head": 0.6777123212814331,
"loss_layer_18_head": 0.6364808678627014,
"loss_layer_24_head": 0.302184134721756,
"loss_layer_30_head": 0.227763369679451,
"loss_layer_36_head": 0.14396648108959198,
"loss_layer_42_head": 0.09543080627918243,
"loss_layer_6_head": 0.9637772440910339,
"step": 220
},
{
"epoch": 10.256410256410255,
"grad_norm": 0.8688354723248357,
"learning_rate": 0.00499922295500204,
"loss": 2.9773,
"loss_layer_12_head": 0.6373997926712036,
"loss_layer_18_head": 0.597356915473938,
"loss_layer_24_head": 0.30082494020462036,
"loss_layer_30_head": 0.21303267776966095,
"loss_layer_36_head": 0.14231160283088684,
"loss_layer_42_head": 0.15631356835365295,
"loss_layer_6_head": 0.9340046644210815,
"step": 225
},
{
"epoch": 10.484330484330485,
"grad_norm": 0.8921544784848862,
"learning_rate": 0.004998618642331689,
"loss": 3.0212,
"loss_layer_12_head": 0.6571947932243347,
"loss_layer_18_head": 0.5933902263641357,
"loss_layer_24_head": 0.3129543364048004,
"loss_layer_30_head": 0.20653457939624786,
"loss_layer_36_head": 0.146720290184021,
"loss_layer_42_head": 0.20062248408794403,
"loss_layer_6_head": 0.9515364766120911,
"step": 230
},
{
"epoch": 10.712250712250713,
"grad_norm": 1.037926434379806,
"learning_rate": 0.004997841740458911,
"loss": 2.9444,
"loss_layer_12_head": 0.6910628080368042,
"loss_layer_18_head": 0.5505853891372681,
"loss_layer_24_head": 0.28936266899108887,
"loss_layer_30_head": 0.19097623229026794,
"loss_layer_36_head": 0.14262089133262634,
"loss_layer_42_head": 0.16720648109912872,
"loss_layer_6_head": 0.9268558621406555,
"step": 235
},
{
"epoch": 10.94017094017094,
"grad_norm": 1.4783942588373633,
"learning_rate": 0.004996892303047305,
"loss": 3.0655,
"loss_layer_12_head": 0.7266325354576111,
"loss_layer_18_head": 0.5471276640892029,
"loss_layer_24_head": 0.39032667875289917,
"loss_layer_30_head": 0.195367231965065,
"loss_layer_36_head": 0.14416465163230896,
"loss_layer_42_head": 0.14725883305072784,
"loss_layer_6_head": 0.9646366834640503,
"step": 240
},
{
"epoch": 11.168091168091168,
"grad_norm": 1.1422412037330358,
"learning_rate": 0.004995770395678171,
"loss": 2.9034,
"loss_layer_12_head": 0.6590501070022583,
"loss_layer_18_head": 0.50871741771698,
"loss_layer_24_head": 0.3766937851905823,
"loss_layer_30_head": 0.18061670660972595,
"loss_layer_36_head": 0.13145729899406433,
"loss_layer_42_head": 0.11728329956531525,
"loss_layer_6_head": 0.9400504231452942,
"step": 245
},
{
"epoch": 11.396011396011396,
"grad_norm": 0.8093002240508176,
"learning_rate": 0.0049944760958459625,
"loss": 2.7821,
"loss_layer_12_head": 0.6128442883491516,
"loss_layer_18_head": 0.48932427167892456,
"loss_layer_24_head": 0.3477482199668884,
"loss_layer_30_head": 0.20216746628284454,
"loss_layer_36_head": 0.14225037395954132,
"loss_layer_42_head": 0.10360412299633026,
"loss_layer_6_head": 0.8851297497749329,
"step": 250
},
{
"epoch": 11.623931623931623,
"grad_norm": 0.7434506470981133,
"learning_rate": 0.00499300949295295,
"loss": 2.7255,
"loss_layer_12_head": 0.6115326881408691,
"loss_layer_18_head": 0.4955017566680908,
"loss_layer_24_head": 0.33017319440841675,
"loss_layer_30_head": 0.1856774091720581,
"loss_layer_36_head": 0.14168860018253326,
"loss_layer_42_head": 0.09496969729661942,
"loss_layer_6_head": 0.905809760093689,
"step": 255
},
{
"epoch": 11.851851851851851,
"grad_norm": 1.0265273242733213,
"learning_rate": 0.004991370688303039,
"loss": 2.7819,
"loss_layer_12_head": 0.6071752309799194,
"loss_layer_18_head": 0.48383840918540955,
"loss_layer_24_head": 0.30922985076904297,
"loss_layer_30_head": 0.18128779530525208,
"loss_layer_36_head": 0.14182403683662415,
"loss_layer_42_head": 0.0852092057466507,
"loss_layer_6_head": 0.9614272117614746,
"step": 260
},
{
"epoch": 12.079772079772079,
"grad_norm": 1.0046729839259028,
"learning_rate": 0.00498955979509477,
"loss": 2.7013,
"loss_layer_12_head": 0.580025315284729,
"loss_layer_18_head": 0.4724946618080139,
"loss_layer_24_head": 0.29289859533309937,
"loss_layer_30_head": 0.18664391338825226,
"loss_layer_36_head": 0.128468319773674,
"loss_layer_42_head": 0.07929748296737671,
"loss_layer_6_head": 0.9460482597351074,
"step": 265
},
{
"epoch": 12.307692307692308,
"grad_norm": 1.298003181437895,
"learning_rate": 0.004987576938413504,
"loss": 2.6403,
"loss_layer_12_head": 0.5524693727493286,
"loss_layer_18_head": 0.4450896382331848,
"loss_layer_24_head": 0.28416186571121216,
"loss_layer_30_head": 0.18391993641853333,
"loss_layer_36_head": 0.11984305083751678,
"loss_layer_42_head": 0.0738661140203476,
"loss_layer_6_head": 0.9863630533218384,
"step": 270
},
{
"epoch": 12.535612535612536,
"grad_norm": 1.0033885173931771,
"learning_rate": 0.00498542225522278,
"loss": 2.749,
"loss_layer_12_head": 0.6306111812591553,
"loss_layer_18_head": 0.44906359910964966,
"loss_layer_24_head": 0.28689223527908325,
"loss_layer_30_head": 0.18012277781963348,
"loss_layer_36_head": 0.1199340969324112,
"loss_layer_42_head": 0.07653049379587173,
"loss_layer_6_head": 1.0442602634429932,
"step": 275
},
{
"epoch": 12.763532763532764,
"grad_norm": 0.843782334866487,
"learning_rate": 0.004983095894354857,
"loss": 2.7597,
"loss_layer_12_head": 0.618486762046814,
"loss_layer_18_head": 0.45755448937416077,
"loss_layer_24_head": 0.3018384575843811,
"loss_layer_30_head": 0.1906372755765915,
"loss_layer_36_head": 0.12202297151088715,
"loss_layer_42_head": 0.07890411466360092,
"loss_layer_6_head": 1.005176305770874,
"step": 280
},
{
"epoch": 12.991452991452991,
"grad_norm": 0.6576558320387813,
"learning_rate": 0.0049805980165004305,
"loss": 2.7936,
"loss_layer_12_head": 0.6078914999961853,
"loss_layer_18_head": 0.4747782349586487,
"loss_layer_24_head": 0.3116241991519928,
"loss_layer_30_head": 0.19092823565006256,
"loss_layer_36_head": 0.1737132966518402,
"loss_layer_42_head": 0.08486510813236237,
"loss_layer_6_head": 0.964220404624939,
"step": 285
},
{
"epoch": 13.21937321937322,
"grad_norm": 0.7017471319323249,
"learning_rate": 0.004977928794197532,
"loss": 2.5574,
"loss_layer_12_head": 0.5619921088218689,
"loss_layer_18_head": 0.4452199339866638,
"loss_layer_24_head": 0.27957409620285034,
"loss_layer_30_head": 0.1817333996295929,
"loss_layer_36_head": 0.1468358337879181,
"loss_layer_42_head": 0.08508309721946716,
"loss_layer_6_head": 0.884550929069519,
"step": 290
},
{
"epoch": 13.447293447293447,
"grad_norm": 0.6115425707737043,
"learning_rate": 0.004975088411819616,
"loss": 2.5032,
"loss_layer_12_head": 0.5326579809188843,
"loss_layer_18_head": 0.42704278230667114,
"loss_layer_24_head": 0.2711140513420105,
"loss_layer_30_head": 0.1879303902387619,
"loss_layer_36_head": 0.13557665050029755,
"loss_layer_42_head": 0.09177286922931671,
"loss_layer_6_head": 0.8427483439445496,
"step": 295
},
{
"epoch": 13.675213675213675,
"grad_norm": 0.9450445323425971,
"learning_rate": 0.004972077065562821,
"loss": 2.4842,
"loss_layer_12_head": 0.5639079809188843,
"loss_layer_18_head": 0.4394643306732178,
"loss_layer_24_head": 0.2671627402305603,
"loss_layer_30_head": 0.18647949397563934,
"loss_layer_36_head": 0.1306968778371811,
"loss_layer_42_head": 0.08405078202486038,
"loss_layer_6_head": 0.8649594187736511,
"step": 300
},
{
"epoch": 13.903133903133902,
"grad_norm": 0.9138974651639769,
"learning_rate": 0.004968894963432419,
"loss": 2.5697,
"loss_layer_12_head": 0.5812464356422424,
"loss_layer_18_head": 0.450472891330719,
"loss_layer_24_head": 0.26817426085472107,
"loss_layer_30_head": 0.18351063132286072,
"loss_layer_36_head": 0.1325109750032425,
"loss_layer_42_head": 0.08811412006616592,
"loss_layer_6_head": 0.8798761367797852,
"step": 305
},
{
"epoch": 14.131054131054132,
"grad_norm": 0.9905055131763821,
"learning_rate": 0.004965542325228446,
"loss": 2.4857,
"loss_layer_12_head": 0.5650662183761597,
"loss_layer_18_head": 0.4344004690647125,
"loss_layer_24_head": 0.2506847381591797,
"loss_layer_30_head": 0.1754450798034668,
"loss_layer_36_head": 0.11748027801513672,
"loss_layer_42_head": 0.08022721111774445,
"loss_layer_6_head": 0.8539366722106934,
"step": 310
},
{
"epoch": 14.35897435897436,
"grad_norm": 1.0702212782131248,
"learning_rate": 0.00496201938253052,
"loss": 2.4271,
"loss_layer_12_head": 0.5588186383247375,
"loss_layer_18_head": 0.41167354583740234,
"loss_layer_24_head": 0.24384205043315887,
"loss_layer_30_head": 0.16517134010791779,
"loss_layer_36_head": 0.10888339579105377,
"loss_layer_42_head": 0.0690896213054657,
"loss_layer_6_head": 0.8617145419120789,
"step": 315
},
{
"epoch": 14.586894586894587,
"grad_norm": 1.162558917354217,
"learning_rate": 0.004958326378681849,
"loss": 2.5164,
"loss_layer_12_head": 0.6251641511917114,
"loss_layer_18_head": 0.4351302981376648,
"loss_layer_24_head": 0.27556923031806946,
"loss_layer_30_head": 0.18502500653266907,
"loss_layer_36_head": 0.12545715272426605,
"loss_layer_42_head": 0.08420990407466888,
"loss_layer_6_head": 0.8722649812698364,
"step": 320
},
{
"epoch": 14.814814814814815,
"grad_norm": 0.9669550279988619,
"learning_rate": 0.004954463568772415,
"loss": 2.6015,
"loss_layer_12_head": 0.6321030855178833,
"loss_layer_18_head": 0.4239681661128998,
"loss_layer_24_head": 0.2637138366699219,
"loss_layer_30_head": 0.18219289183616638,
"loss_layer_36_head": 0.11929179728031158,
"loss_layer_42_head": 0.08808918297290802,
"loss_layer_6_head": 0.874514102935791,
"step": 325
},
{
"epoch": 15.042735042735043,
"grad_norm": 1.0543343861851295,
"learning_rate": 0.00495043121962136,
"loss": 2.6238,
"loss_layer_12_head": 0.6399250030517578,
"loss_layer_18_head": 0.4572976231575012,
"loss_layer_24_head": 0.2698724567890167,
"loss_layer_30_head": 0.20202596485614777,
"loss_layer_36_head": 0.12312252819538116,
"loss_layer_42_head": 0.0965074896812439,
"loss_layer_6_head": 0.9252855181694031,
"step": 330
},
{
"epoch": 15.27065527065527,
"grad_norm": 0.8727229528851698,
"learning_rate": 0.0049462296097585534,
"loss": 2.5011,
"loss_layer_12_head": 0.5516534447669983,
"loss_layer_18_head": 0.4133722186088562,
"loss_layer_24_head": 0.26039832830429077,
"loss_layer_30_head": 0.17739339172840118,
"loss_layer_36_head": 0.10702526569366455,
"loss_layer_42_head": 0.08984313905239105,
"loss_layer_6_head": 0.8428100347518921,
"step": 335
},
{
"epoch": 15.498575498575498,
"grad_norm": 0.7580466664772497,
"learning_rate": 0.004941859029405353,
"loss": 2.4557,
"loss_layer_12_head": 0.5529354214668274,
"loss_layer_18_head": 0.43304672837257385,
"loss_layer_24_head": 0.2537849247455597,
"loss_layer_30_head": 0.17200490832328796,
"loss_layer_36_head": 0.15178406238555908,
"loss_layer_42_head": 0.0801372081041336,
"loss_layer_6_head": 0.8668233156204224,
"step": 340
},
{
"epoch": 15.726495726495726,
"grad_norm": 0.9418340108215619,
"learning_rate": 0.0049373197804545585,
"loss": 2.4453,
"loss_layer_12_head": 0.5447560548782349,
"loss_layer_18_head": 0.43586069345474243,
"loss_layer_24_head": 0.2588791251182556,
"loss_layer_30_head": 0.17809347808361053,
"loss_layer_36_head": 0.15249352157115936,
"loss_layer_42_head": 0.08157460391521454,
"loss_layer_6_head": 0.8784462213516235,
"step": 345
},
{
"epoch": 15.954415954415955,
"grad_norm": 0.9047489961689316,
"learning_rate": 0.004932612176449559,
"loss": 2.4695,
"loss_layer_12_head": 0.5187823176383972,
"loss_layer_18_head": 0.41048678755760193,
"loss_layer_24_head": 0.2688165605068207,
"loss_layer_30_head": 0.1661987006664276,
"loss_layer_36_head": 0.13160857558250427,
"loss_layer_42_head": 0.07198281586170197,
"loss_layer_6_head": 0.835620105266571,
"step": 350
},
{
"epoch": 16.182336182336183,
"grad_norm": 0.8657986887322702,
"learning_rate": 0.004927736542562676,
"loss": 2.4342,
"loss_layer_12_head": 0.5429137349128723,
"loss_layer_18_head": 0.4332647919654846,
"loss_layer_24_head": 0.27726346254348755,
"loss_layer_30_head": 0.17736102640628815,
"loss_layer_36_head": 0.11637071520090103,
"loss_layer_42_head": 0.06702034920454025,
"loss_layer_6_head": 0.8331397771835327,
"step": 355
},
{
"epoch": 16.41025641025641,
"grad_norm": 0.9425666175764251,
"learning_rate": 0.004922693215572695,
"loss": 2.4022,
"loss_layer_12_head": 0.510245680809021,
"loss_layer_18_head": 0.42118319869041443,
"loss_layer_24_head": 0.2536671757698059,
"loss_layer_30_head": 0.18339803814888,
"loss_layer_36_head": 0.1120036393404007,
"loss_layer_42_head": 0.06707726418972015,
"loss_layer_6_head": 0.8400261998176575,
"step": 360
},
{
"epoch": 16.63817663817664,
"grad_norm": 0.8077761031458355,
"learning_rate": 0.004917482543841618,
"loss": 2.3971,
"loss_layer_12_head": 0.5046784281730652,
"loss_layer_18_head": 0.4156663417816162,
"loss_layer_24_head": 0.24197836220264435,
"loss_layer_30_head": 0.17227289080619812,
"loss_layer_36_head": 0.10577581822872162,
"loss_layer_42_head": 0.08296042680740356,
"loss_layer_6_head": 0.8249191045761108,
"step": 365
},
{
"epoch": 16.866096866096868,
"grad_norm": 0.8166472620009376,
"learning_rate": 0.004912104887290587,
"loss": 2.3838,
"loss_layer_12_head": 0.5049887895584106,
"loss_layer_18_head": 0.4009923040866852,
"loss_layer_24_head": 0.24655649065971375,
"loss_layer_30_head": 0.16778674721717834,
"loss_layer_36_head": 0.10913994163274765,
"loss_layer_42_head": 0.0756555050611496,
"loss_layer_6_head": 0.8108115196228027,
"step": 370
},
{
"epoch": 17.094017094017094,
"grad_norm": 0.7008030273100762,
"learning_rate": 0.0049065606173750295,
"loss": 2.323,
"loss_layer_12_head": 0.5143665671348572,
"loss_layer_18_head": 0.42088404297828674,
"loss_layer_24_head": 0.2502315938472748,
"loss_layer_30_head": 0.17210659384727478,
"loss_layer_36_head": 0.10463279485702515,
"loss_layer_42_head": 0.07540477812290192,
"loss_layer_6_head": 0.8134105801582336,
"step": 375
},
{
"epoch": 17.321937321937323,
"grad_norm": 1.0537921241381873,
"learning_rate": 0.004900850117059,
"loss": 2.3176,
"loss_layer_12_head": 0.5064232349395752,
"loss_layer_18_head": 0.4487612247467041,
"loss_layer_24_head": 0.23967449367046356,
"loss_layer_30_head": 0.15723897516727448,
"loss_layer_36_head": 0.09650365263223648,
"loss_layer_42_head": 0.06906923651695251,
"loss_layer_6_head": 0.779255747795105,
"step": 380
},
{
"epoch": 17.54985754985755,
"grad_norm": 1.2282578627276648,
"learning_rate": 0.004894973780788722,
"loss": 2.5449,
"loss_layer_12_head": 0.5091427564620972,
"loss_layer_18_head": 0.5503803491592407,
"loss_layer_24_head": 0.24137809872627258,
"loss_layer_30_head": 0.16684108972549438,
"loss_layer_36_head": 0.18911299109458923,
"loss_layer_42_head": 0.07031677663326263,
"loss_layer_6_head": 0.8027304410934448,
"step": 385
},
{
"epoch": 17.77777777777778,
"grad_norm": 1.0765867107263354,
"learning_rate": 0.004888932014465352,
"loss": 2.6286,
"loss_layer_12_head": 0.5079230070114136,
"loss_layer_18_head": 0.5281995534896851,
"loss_layer_24_head": 0.2384219467639923,
"loss_layer_30_head": 0.18661737442016602,
"loss_layer_36_head": 0.2386232614517212,
"loss_layer_42_head": 0.06579138338565826,
"loss_layer_6_head": 0.8466068506240845,
"step": 390
},
{
"epoch": 18.005698005698004,
"grad_norm": 1.3603813767101454,
"learning_rate": 0.0048827252354169326,
"loss": 2.6561,
"loss_layer_12_head": 0.5154815912246704,
"loss_layer_18_head": 0.4759330749511719,
"loss_layer_24_head": 0.23246505856513977,
"loss_layer_30_head": 0.17445430159568787,
"loss_layer_36_head": 0.18358901143074036,
"loss_layer_42_head": 0.07926751673221588,
"loss_layer_6_head": 0.9083824157714844,
"step": 395
},
{
"epoch": 18.233618233618234,
"grad_norm": 1.0979057750891,
"learning_rate": 0.004876353872369572,
"loss": 2.736,
"loss_layer_12_head": 0.4919304847717285,
"loss_layer_18_head": 0.4315822124481201,
"loss_layer_24_head": 0.47708845138549805,
"loss_layer_30_head": 0.16244928538799286,
"loss_layer_36_head": 0.16019582748413086,
"loss_layer_42_head": 0.10476633161306381,
"loss_layer_6_head": 0.8798778653144836,
"step": 400
},
{
"epoch": 18.233618233618234,
"eval_loss": 4.721885681152344,
"eval_loss_layer_12_head": 0.8316348195075989,
"eval_loss_layer_18_head": 0.7489945292472839,
"eval_loss_layer_24_head": 1.0868966579437256,
"eval_loss_layer_30_head": 0.32381412386894226,
"eval_loss_layer_36_head": 0.26655933260917664,
"eval_loss_layer_42_head": 0.17233143746852875,
"eval_loss_layer_6_head": 1.2158176898956299,
"eval_runtime": 4.9318,
"eval_samples_per_second": 6.691,
"eval_steps_per_second": 0.608,
"step": 400
},
{
"epoch": 18.46153846153846,
"grad_norm": 1.4771278422236476,
"learning_rate": 0.004869818365417829,
"loss": 2.9782,
"loss_layer_12_head": 0.5835316181182861,
"loss_layer_18_head": 0.41122907400131226,
"loss_layer_24_head": 0.703028678894043,
"loss_layer_30_head": 0.16457121074199677,
"loss_layer_36_head": 0.1471795290708542,
"loss_layer_42_head": 0.09325657039880753,
"loss_layer_6_head": 0.8624021410942078,
"step": 405
},
{
"epoch": 18.68945868945869,
"grad_norm": 1.0091974995950315,
"learning_rate": 0.004863119165994312,
"loss": 2.9112,
"loss_layer_12_head": 0.670836329460144,
"loss_layer_18_head": 0.4119151532649994,
"loss_layer_24_head": 0.5785427093505859,
"loss_layer_30_head": 0.16393141448497772,
"loss_layer_36_head": 0.14178720116615295,
"loss_layer_42_head": 0.08272372931241989,
"loss_layer_6_head": 0.8412971496582031,
"step": 410
},
{
"epoch": 18.91737891737892,
"grad_norm": 0.6656147383639003,
"learning_rate": 0.004856256736838498,
"loss": 2.747,
"loss_layer_12_head": 0.6682761311531067,
"loss_layer_18_head": 0.411543607711792,
"loss_layer_24_head": 0.4885958731174469,
"loss_layer_30_head": 0.16745702922344208,
"loss_layer_36_head": 0.12891222536563873,
"loss_layer_42_head": 0.07578006386756897,
"loss_layer_6_head": 0.8457491993904114,
"step": 415
},
{
"epoch": 19.145299145299145,
"grad_norm": 0.5054801842815198,
"learning_rate": 0.0048492315519647715,
"loss": 2.4882,
"loss_layer_12_head": 0.5891987085342407,
"loss_layer_18_head": 0.3886004090309143,
"loss_layer_24_head": 0.42113596200942993,
"loss_layer_30_head": 0.15418270230293274,
"loss_layer_36_head": 0.11842542886734009,
"loss_layer_42_head": 0.068417988717556,
"loss_layer_6_head": 0.7943364381790161,
"step": 420
},
{
"epoch": 19.373219373219374,
"grad_norm": 0.4952359696534927,
"learning_rate": 0.0048420440966296776,
"loss": 2.376,
"loss_layer_12_head": 0.5409587621688843,
"loss_layer_18_head": 0.38072651624679565,
"loss_layer_24_head": 0.37133434414863586,
"loss_layer_30_head": 0.15104272961616516,
"loss_layer_36_head": 0.11313710361719131,
"loss_layer_42_head": 0.06553231179714203,
"loss_layer_6_head": 0.7570014595985413,
"step": 425
},
{
"epoch": 19.6011396011396,
"grad_norm": 0.7006459133032029,
"learning_rate": 0.004834694867298409,
"loss": 2.3375,
"loss_layer_12_head": 0.5223572254180908,
"loss_layer_18_head": 0.38262003660202026,
"loss_layer_24_head": 0.34287339448928833,
"loss_layer_30_head": 0.17433951795101166,
"loss_layer_36_head": 0.11228135973215103,
"loss_layer_42_head": 0.06561323255300522,
"loss_layer_6_head": 0.7609976530075073,
"step": 430
},
{
"epoch": 19.82905982905983,
"grad_norm": 0.5491943779867444,
"learning_rate": 0.004827184371610511,
"loss": 2.3932,
"loss_layer_12_head": 0.4984654486179352,
"loss_layer_18_head": 0.37037283182144165,
"loss_layer_24_head": 0.3063350319862366,
"loss_layer_30_head": 0.23311960697174072,
"loss_layer_36_head": 0.10572105646133423,
"loss_layer_42_head": 0.06688721477985382,
"loss_layer_6_head": 0.7638763785362244,
"step": 435
},
{
"epoch": 20.056980056980056,
"grad_norm": 0.5945511861137925,
"learning_rate": 0.004819513128344813,
"loss": 2.3253,
"loss_layer_12_head": 0.49806079268455505,
"loss_layer_18_head": 0.38933131098747253,
"loss_layer_24_head": 0.29588833451271057,
"loss_layer_30_head": 0.21775169670581818,
"loss_layer_36_head": 0.11467882245779037,
"loss_layer_42_head": 0.06367787718772888,
"loss_layer_6_head": 0.7606059312820435,
"step": 440
},
{
"epoch": 20.284900284900285,
"grad_norm": 1.131592778867395,
"learning_rate": 0.004811681667383604,
"loss": 2.2407,
"loss_layer_12_head": 0.47209444642066956,
"loss_layer_18_head": 0.3742205798625946,
"loss_layer_24_head": 0.27469268441200256,
"loss_layer_30_head": 0.19076983630657196,
"loss_layer_36_head": 0.10397826135158539,
"loss_layer_42_head": 0.06761151552200317,
"loss_layer_6_head": 0.7645944356918335,
"step": 445
},
{
"epoch": 20.51282051282051,
"grad_norm": 0.7245636198542537,
"learning_rate": 0.004803690529676019,
"loss": 2.2039,
"loss_layer_12_head": 0.4624738097190857,
"loss_layer_18_head": 0.36616355180740356,
"loss_layer_24_head": 0.25559544563293457,
"loss_layer_30_head": 0.17066636681556702,
"loss_layer_36_head": 0.10047119855880737,
"loss_layer_42_head": 0.0659947544336319,
"loss_layer_6_head": 0.770186185836792,
"step": 450
},
{
"epoch": 20.74074074074074,
"grad_norm": 0.8633596477338317,
"learning_rate": 0.004795540267200685,
"loss": 2.2908,
"loss_layer_12_head": 0.48264575004577637,
"loss_layer_18_head": 0.40934857726097107,
"loss_layer_24_head": 0.2605716288089752,
"loss_layer_30_head": 0.16646702587604523,
"loss_layer_36_head": 0.11609245836734772,
"loss_layer_42_head": 0.06701932847499847,
"loss_layer_6_head": 0.8158668279647827,
"step": 455
},
{
"epoch": 20.96866096866097,
"grad_norm": 0.8163615583946534,
"learning_rate": 0.004787231442927586,
"loss": 2.295,
"loss_layer_12_head": 0.4884544014930725,
"loss_layer_18_head": 0.3945980668067932,
"loss_layer_24_head": 0.2505979835987091,
"loss_layer_30_head": 0.15824952721595764,
"loss_layer_36_head": 0.11044065654277802,
"loss_layer_42_head": 0.06734587997198105,
"loss_layer_6_head": 0.8474823236465454,
"step": 460
},
{
"epoch": 21.196581196581196,
"grad_norm": 0.9965972018686945,
"learning_rate": 0.004778764630779183,
"loss": 2.2109,
"loss_layer_12_head": 0.45411452651023865,
"loss_layer_18_head": 0.3743075430393219,
"loss_layer_24_head": 0.23483021557331085,
"loss_layer_30_head": 0.1525890976190567,
"loss_layer_36_head": 0.10491526126861572,
"loss_layer_42_head": 0.06145526096224785,
"loss_layer_6_head": 0.8141587972640991,
"step": 465
},
{
"epoch": 21.424501424501425,
"grad_norm": 0.9579834031557086,
"learning_rate": 0.004770140415590762,
"loss": 2.2449,
"loss_layer_12_head": 0.4716685712337494,
"loss_layer_18_head": 0.4065755307674408,
"loss_layer_24_head": 0.23640041053295135,
"loss_layer_30_head": 0.15385356545448303,
"loss_layer_36_head": 0.10922032594680786,
"loss_layer_42_head": 0.07777702063322067,
"loss_layer_6_head": 0.8294760584831238,
"step": 470
},
{
"epoch": 21.65242165242165,
"grad_norm": 1.0807303975242237,
"learning_rate": 0.0047613593930700485,
"loss": 2.3935,
"loss_layer_12_head": 0.5475557446479797,
"loss_layer_18_head": 0.46753430366516113,
"loss_layer_24_head": 0.2278400957584381,
"loss_layer_30_head": 0.14627912640571594,
"loss_layer_36_head": 0.10156891494989395,
"loss_layer_42_head": 0.06986488401889801,
"loss_layer_6_head": 0.7917695045471191,
"step": 475
},
{
"epoch": 21.88034188034188,
"grad_norm": 0.8264421749483553,
"learning_rate": 0.004752422169756048,
"loss": 2.4382,
"loss_layer_12_head": 0.6330815553665161,
"loss_layer_18_head": 0.48085513710975647,
"loss_layer_24_head": 0.2386564314365387,
"loss_layer_30_head": 0.15691904723644257,
"loss_layer_36_head": 0.10863230377435684,
"loss_layer_42_head": 0.07321296632289886,
"loss_layer_6_head": 0.7897067070007324,
"step": 480
},
{
"epoch": 22.108262108262107,
"grad_norm": 0.6980195048286758,
"learning_rate": 0.00474332936297716,
"loss": 2.389,
"loss_layer_12_head": 0.671896755695343,
"loss_layer_18_head": 0.44162482023239136,
"loss_layer_24_head": 0.22745053470134735,
"loss_layer_30_head": 0.14352098107337952,
"loss_layer_36_head": 0.09909910708665848,
"loss_layer_42_head": 0.06609858572483063,
"loss_layer_6_head": 0.7521198391914368,
"step": 485
},
{
"epoch": 22.336182336182336,
"grad_norm": 0.8316424914651728,
"learning_rate": 0.004734081600808531,
"loss": 2.3102,
"loss_layer_12_head": 0.5933720469474792,
"loss_layer_18_head": 0.3834800124168396,
"loss_layer_24_head": 0.20532111823558807,
"loss_layer_30_head": 0.13171999156475067,
"loss_layer_36_head": 0.08968774229288101,
"loss_layer_42_head": 0.07913189381361008,
"loss_layer_6_head": 0.7101460099220276,
"step": 490
},
{
"epoch": 22.564102564102566,
"grad_norm": 0.8957636763389422,
"learning_rate": 0.004724679522028672,
"loss": 2.24,
"loss_layer_12_head": 0.5700436234474182,
"loss_layer_18_head": 0.3944259285926819,
"loss_layer_24_head": 0.21593594551086426,
"loss_layer_30_head": 0.13537552952766418,
"loss_layer_36_head": 0.09649913012981415,
"loss_layer_42_head": 0.07921244949102402,
"loss_layer_6_head": 0.7655854821205139,
"step": 495
},
{
"epoch": 22.79202279202279,
"grad_norm": 0.6557657532360657,
"learning_rate": 0.004715123776075337,
"loss": 2.2258,
"loss_layer_12_head": 0.5300887823104858,
"loss_layer_18_head": 0.3781338632106781,
"loss_layer_24_head": 0.20915941894054413,
"loss_layer_30_head": 0.13727469742298126,
"loss_layer_36_head": 0.08927856385707855,
"loss_layer_42_head": 0.07049344480037689,
"loss_layer_6_head": 0.752504289150238,
"step": 500
},
{
"epoch": 23.01994301994302,
"grad_norm": 0.7830740281704984,
"learning_rate": 0.0047054150230006605,
"loss": 2.2806,
"loss_layer_12_head": 0.5240658521652222,
"loss_layer_18_head": 0.38662710785865784,
"loss_layer_24_head": 0.22086063027381897,
"loss_layer_30_head": 0.19454047083854675,
"loss_layer_36_head": 0.09448827058076859,
"loss_layer_42_head": 0.06740959733724594,
"loss_layer_6_head": 0.7966665029525757,
"step": 505
},
{
"epoch": 23.247863247863247,
"grad_norm": 0.5579028205723976,
"learning_rate": 0.004695553933425571,
"loss": 2.1436,
"loss_layer_12_head": 0.4735882878303528,
"loss_layer_18_head": 0.36018824577331543,
"loss_layer_24_head": 0.20643365383148193,
"loss_layer_30_head": 0.17400547862052917,
"loss_layer_36_head": 0.0954669862985611,
"loss_layer_42_head": 0.06432937830686569,
"loss_layer_6_head": 0.731139063835144,
"step": 510
},
{
"epoch": 23.475783475783476,
"grad_norm": 0.552522051234654,
"learning_rate": 0.004685541188493464,
"loss": 2.0865,
"loss_layer_12_head": 0.46377283334732056,
"loss_layer_18_head": 0.3589307963848114,
"loss_layer_24_head": 0.20729784667491913,
"loss_layer_30_head": 0.15942244231700897,
"loss_layer_36_head": 0.0918920561671257,
"loss_layer_42_head": 0.059663545340299606,
"loss_layer_6_head": 0.7291909456253052,
"step": 515
},
{
"epoch": 23.703703703703702,
"grad_norm": 0.529577925725112,
"learning_rate": 0.004675377479823153,
"loss": 2.1213,
"loss_layer_12_head": 0.4666662812232971,
"loss_layer_18_head": 0.37829747796058655,
"loss_layer_24_head": 0.20896704494953156,
"loss_layer_30_head": 0.15137134492397308,
"loss_layer_36_head": 0.10702455043792725,
"loss_layer_42_head": 0.060252584517002106,
"loss_layer_6_head": 0.74061518907547,
"step": 520
},
{
"epoch": 23.931623931623932,
"grad_norm": 0.7782049747581867,
"learning_rate": 0.004665063509461097,
"loss": 2.1262,
"loss_layer_12_head": 0.462003231048584,
"loss_layer_18_head": 0.37864863872528076,
"loss_layer_24_head": 0.20960676670074463,
"loss_layer_30_head": 0.1473332941532135,
"loss_layer_36_head": 0.09850483387708664,
"loss_layer_42_head": 0.05810556560754776,
"loss_layer_6_head": 0.7602113485336304,
"step": 525
},
{
"epoch": 24.159544159544158,
"grad_norm": 0.6360759879888129,
"learning_rate": 0.00465459998983291,
"loss": 2.0674,
"loss_layer_12_head": 0.44862040877342224,
"loss_layer_18_head": 0.3798063099384308,
"loss_layer_24_head": 0.20599885284900665,
"loss_layer_30_head": 0.1415323168039322,
"loss_layer_36_head": 0.09703875333070755,
"loss_layer_42_head": 0.05768008157610893,
"loss_layer_6_head": 0.7395282983779907,
"step": 530
},
{
"epoch": 24.387464387464387,
"grad_norm": 1.0659611300691207,
"learning_rate": 0.004643987643694149,
"loss": 2.1265,
"loss_layer_12_head": 0.46460556983947754,
"loss_layer_18_head": 0.4132619798183441,
"loss_layer_24_head": 0.21071580052375793,
"loss_layer_30_head": 0.1440211832523346,
"loss_layer_36_head": 0.09940258413553238,
"loss_layer_42_head": 0.08096981793642044,
"loss_layer_6_head": 0.7652676105499268,
"step": 535
},
{
"epoch": 24.615384615384617,
"grad_norm": 0.932438216545257,
"learning_rate": 0.004633227204080389,
"loss": 2.1919,
"loss_layer_12_head": 0.46289700269699097,
"loss_layer_18_head": 0.3990364670753479,
"loss_layer_24_head": 0.20558461546897888,
"loss_layer_30_head": 0.13726839423179626,
"loss_layer_36_head": 0.09494920074939728,
"loss_layer_42_head": 0.07333710789680481,
"loss_layer_6_head": 0.8040269017219543,
"step": 540
},
{
"epoch": 24.843304843304843,
"grad_norm": 1.0248927444527722,
"learning_rate": 0.004622319414256594,
"loss": 2.2581,
"loss_layer_12_head": 0.4594665467739105,
"loss_layer_18_head": 0.387908935546875,
"loss_layer_24_head": 0.21216896176338196,
"loss_layer_30_head": 0.13931572437286377,
"loss_layer_36_head": 0.09683094173669815,
"loss_layer_42_head": 0.0691598504781723,
"loss_layer_6_head": 0.872686505317688,
"step": 545
},
{
"epoch": 25.071225071225072,
"grad_norm": 0.8098869912341418,
"learning_rate": 0.00461126502766577,
"loss": 2.2181,
"loss_layer_12_head": 0.45522379875183105,
"loss_layer_18_head": 0.37695351243019104,
"loss_layer_24_head": 0.20662946999073029,
"loss_layer_30_head": 0.13180062174797058,
"loss_layer_36_head": 0.10556745529174805,
"loss_layer_42_head": 0.0616777129471302,
"loss_layer_6_head": 0.8408571481704712,
"step": 550
},
{
"epoch": 25.299145299145298,
"grad_norm": 0.6911311577236631,
"learning_rate": 0.0046000648078769295,
"loss": 2.1218,
"loss_layer_12_head": 0.47091466188430786,
"loss_layer_18_head": 0.37433698773384094,
"loss_layer_24_head": 0.21443676948547363,
"loss_layer_30_head": 0.15783022344112396,
"loss_layer_36_head": 0.10397765785455704,
"loss_layer_42_head": 0.06332050263881683,
"loss_layer_6_head": 0.81425541639328,
"step": 555
},
{
"epoch": 25.527065527065528,
"grad_norm": 0.5483359227112778,
"learning_rate": 0.004588719528532341,
"loss": 2.0836,
"loss_layer_12_head": 0.4457060396671295,
"loss_layer_18_head": 0.35422247648239136,
"loss_layer_24_head": 0.2071726769208908,
"loss_layer_30_head": 0.1477786749601364,
"loss_layer_36_head": 0.10025700181722641,
"loss_layer_42_head": 0.0617125928401947,
"loss_layer_6_head": 0.7575004696846008,
"step": 560
},
{
"epoch": 25.754985754985753,
"grad_norm": 0.5184834946968752,
"learning_rate": 0.004577229973294099,
"loss": 2.0984,
"loss_layer_12_head": 0.46223968267440796,
"loss_layer_18_head": 0.39067524671554565,
"loss_layer_24_head": 0.212998628616333,
"loss_layer_30_head": 0.14908693730831146,
"loss_layer_36_head": 0.09892690926790237,
"loss_layer_42_head": 0.05941791459918022,
"loss_layer_6_head": 0.760570764541626,
"step": 565
},
{
"epoch": 25.982905982905983,
"grad_norm": 0.551240787416797,
"learning_rate": 0.004565596935789987,
"loss": 2.1022,
"loss_layer_12_head": 0.46231111884117126,
"loss_layer_18_head": 0.38874131441116333,
"loss_layer_24_head": 0.21234926581382751,
"loss_layer_30_head": 0.14975954592227936,
"loss_layer_36_head": 0.09737460315227509,
"loss_layer_42_head": 0.060064684599637985,
"loss_layer_6_head": 0.738222599029541,
"step": 570
},
{
"epoch": 26.210826210826212,
"grad_norm": 0.6796340352770581,
"learning_rate": 0.004553821219558661,
"loss": 2.0061,
"loss_layer_12_head": 0.46215319633483887,
"loss_layer_18_head": 0.3686721920967102,
"loss_layer_24_head": 0.2038687914609909,
"loss_layer_30_head": 0.1499582827091217,
"loss_layer_36_head": 0.08974792063236237,
"loss_layer_42_head": 0.06425337493419647,
"loss_layer_6_head": 0.7155905961990356,
"step": 575
},
{
"epoch": 26.43874643874644,
"grad_norm": 0.8866787700250957,
"learning_rate": 0.004541903637994142,
"loss": 2.0603,
"loss_layer_12_head": 0.44424566626548767,
"loss_layer_18_head": 0.3662102520465851,
"loss_layer_24_head": 0.20653650164604187,
"loss_layer_30_head": 0.14670081436634064,
"loss_layer_36_head": 0.10450087487697601,
"loss_layer_42_head": 0.06536010652780533,
"loss_layer_6_head": 0.7095087170600891,
"step": 580
},
{
"epoch": 26.666666666666668,
"grad_norm": 0.6486151029335254,
"learning_rate": 0.004529845014289642,
"loss": 2.0393,
"loss_layer_12_head": 0.4427577555179596,
"loss_layer_18_head": 0.36848098039627075,
"loss_layer_24_head": 0.20641179382801056,
"loss_layer_30_head": 0.14615444839000702,
"loss_layer_36_head": 0.10037078708410263,
"loss_layer_42_head": 0.06394441425800323,
"loss_layer_6_head": 0.7171773910522461,
"step": 585
},
{
"epoch": 26.894586894586894,
"grad_norm": 0.8622308299226761,
"learning_rate": 0.00451764618138069,
"loss": 2.101,
"loss_layer_12_head": 0.47314929962158203,
"loss_layer_18_head": 0.3822045922279358,
"loss_layer_24_head": 0.2136635035276413,
"loss_layer_30_head": 0.14691603183746338,
"loss_layer_36_head": 0.11097769439220428,
"loss_layer_42_head": 0.06998325139284134,
"loss_layer_6_head": 0.7319896221160889,
"step": 590
},
{
"epoch": 27.122507122507123,
"grad_norm": 0.7390178601634367,
"learning_rate": 0.0045053079818876095,
"loss": 2.0397,
"loss_layer_12_head": 0.44820213317871094,
"loss_layer_18_head": 0.3551773428916931,
"loss_layer_24_head": 0.20566609501838684,
"loss_layer_30_head": 0.1313147395849228,
"loss_layer_36_head": 0.09519216418266296,
"loss_layer_42_head": 0.0580265149474144,
"loss_layer_6_head": 0.7102463245391846,
"step": 595
},
{
"epoch": 27.35042735042735,
"grad_norm": 0.8258874084634017,
"learning_rate": 0.0044928312680573065,
"loss": 2.0128,
"loss_layer_12_head": 0.43636664748191833,
"loss_layer_18_head": 0.34100016951560974,
"loss_layer_24_head": 0.2000311315059662,
"loss_layer_30_head": 0.14223387837409973,
"loss_layer_36_head": 0.09040534496307373,
"loss_layer_42_head": 0.0646161437034607,
"loss_layer_6_head": 0.713534414768219,
"step": 600
},
{
"epoch": 27.35042735042735,
"eval_loss": 3.8952760696411133,
"eval_loss_layer_12_head": 0.8030363917350769,
"eval_loss_layer_18_head": 0.7230358719825745,
"eval_loss_layer_24_head": 0.44514450430870056,
"eval_loss_layer_30_head": 0.349967360496521,
"eval_loss_layer_36_head": 0.2027323693037033,
"eval_loss_layer_42_head": 0.14586174488067627,
"eval_loss_layer_6_head": 1.159847617149353,
"eval_runtime": 4.9499,
"eval_samples_per_second": 6.667,
"eval_steps_per_second": 0.606,
"step": 600
},
{
"epoch": 27.57834757834758,
"grad_norm": 0.6217870505507969,
"learning_rate": 0.004480216901704406,
"loss": 2.0692,
"loss_layer_12_head": 0.46979862451553345,
"loss_layer_18_head": 0.35027259588241577,
"loss_layer_24_head": 0.2092142403125763,
"loss_layer_30_head": 0.1553209125995636,
"loss_layer_36_head": 0.09953723847866058,
"loss_layer_42_head": 0.05826183035969734,
"loss_layer_6_head": 0.7447252869606018,
"step": 605
},
{
"epoch": 27.806267806267805,
"grad_norm": 1.3117503463089188,
"learning_rate": 0.004467465754151723,
"loss": 2.1562,
"loss_layer_12_head": 0.446768581867218,
"loss_layer_18_head": 0.40118208527565,
"loss_layer_24_head": 0.20322206616401672,
"loss_layer_30_head": 0.14313673973083496,
"loss_layer_36_head": 0.09695640206336975,
"loss_layer_42_head": 0.0757862776517868,
"loss_layer_6_head": 0.7276732921600342,
"step": 610
},
{
"epoch": 28.034188034188034,
"grad_norm": 0.725245663795549,
"learning_rate": 0.0044545787061700745,
"loss": 7.8264,
"loss_layer_12_head": 0.46167078614234924,
"loss_layer_18_head": 6.237041473388672,
"loss_layer_24_head": 0.2191043347120285,
"loss_layer_30_head": 0.14354541897773743,
"loss_layer_36_head": 0.09405554085969925,
"loss_layer_42_head": 0.07178852707147598,
"loss_layer_6_head": 0.749311625957489,
"step": 615
},
{
"epoch": 28.262108262108264,
"grad_norm": 0.6454953978078362,
"learning_rate": 0.004441556647917446,
"loss": 7.5221,
"loss_layer_12_head": 0.4319098889827728,
"loss_layer_18_head": 5.769500255584717,
"loss_layer_24_head": 0.21817514300346375,
"loss_layer_30_head": 0.13674825429916382,
"loss_layer_36_head": 0.09862224757671356,
"loss_layer_42_head": 0.0651412308216095,
"loss_layer_6_head": 0.7003488540649414,
"step": 620
},
{
"epoch": 28.49002849002849,
"grad_norm": 0.5903394788169665,
"learning_rate": 0.004428400478877499,
"loss": 6.8023,
"loss_layer_12_head": 0.44873490929603577,
"loss_layer_18_head": 5.282101631164551,
"loss_layer_24_head": 0.216207355260849,
"loss_layer_30_head": 0.13955923914909363,
"loss_layer_36_head": 0.09675300121307373,
"loss_layer_42_head": 0.06318524479866028,
"loss_layer_6_head": 0.7136842012405396,
"step": 625
},
{
"epoch": 28.71794871794872,
"grad_norm": 0.798968656911552,
"learning_rate": 0.004415111107797445,
"loss": 6.3915,
"loss_layer_12_head": 0.4504212737083435,
"loss_layer_18_head": 4.649975776672363,
"loss_layer_24_head": 0.21879024803638458,
"loss_layer_30_head": 0.15180036425590515,
"loss_layer_36_head": 0.10117790848016739,
"loss_layer_42_head": 0.06038238853216171,
"loss_layer_6_head": 0.7129833102226257,
"step": 630
},
{
"epoch": 28.945868945868945,
"grad_norm": 0.5813942239901363,
"learning_rate": 0.004401689452625272,
"loss": 6.0419,
"loss_layer_12_head": 0.4498574733734131,
"loss_layer_18_head": 4.355905532836914,
"loss_layer_24_head": 0.20642943680286407,
"loss_layer_30_head": 0.1372959315776825,
"loss_layer_36_head": 0.10397912561893463,
"loss_layer_42_head": 0.055586397647857666,
"loss_layer_6_head": 0.7128573656082153,
"step": 635
},
{
"epoch": 29.173789173789174,
"grad_norm": 0.7269276575425652,
"learning_rate": 0.004388136440446337,
"loss": 5.7069,
"loss_layer_12_head": 0.4492161273956299,
"loss_layer_18_head": 4.068719387054443,
"loss_layer_24_head": 0.20855550467967987,
"loss_layer_30_head": 0.1389748752117157,
"loss_layer_36_head": 0.10276387631893158,
"loss_layer_42_head": 0.05752943828701973,
"loss_layer_6_head": 0.7084957957267761,
"step": 640
},
{
"epoch": 29.4017094017094,
"grad_norm": 0.7160396858444902,
"learning_rate": 0.0043744530074193355,
"loss": 5.4769,
"loss_layer_12_head": 0.43505144119262695,
"loss_layer_18_head": 3.8150742053985596,
"loss_layer_24_head": 0.20362111926078796,
"loss_layer_30_head": 0.13056820631027222,
"loss_layer_36_head": 0.09140172600746155,
"loss_layer_42_head": 0.05409618094563484,
"loss_layer_6_head": 0.6898916959762573,
"step": 645
},
{
"epoch": 29.62962962962963,
"grad_norm": 0.9710847170559996,
"learning_rate": 0.004360640098711629,
"loss": 5.3281,
"loss_layer_12_head": 0.4550530016422272,
"loss_layer_18_head": 3.626317262649536,
"loss_layer_24_head": 0.20697829127311707,
"loss_layer_30_head": 0.14250314235687256,
"loss_layer_36_head": 0.09626954793930054,
"loss_layer_42_head": 0.08632789552211761,
"loss_layer_6_head": 0.7261134386062622,
"step": 650
},
{
"epoch": 29.85754985754986,
"grad_norm": 0.7802876477742213,
"learning_rate": 0.004346698668433964,
"loss": 5.1986,
"loss_layer_12_head": 0.4772264063358307,
"loss_layer_18_head": 3.4601082801818848,
"loss_layer_24_head": 0.20182755589485168,
"loss_layer_30_head": 0.13202452659606934,
"loss_layer_36_head": 0.08688151091337204,
"loss_layer_42_head": 0.0689874067902565,
"loss_layer_6_head": 0.7271126508712769,
"step": 655
},
{
"epoch": 30.085470085470085,
"grad_norm": 0.8296241303467062,
"learning_rate": 0.004332629679574566,
"loss": 5.0954,
"loss_layer_12_head": 0.5482393503189087,
"loss_layer_18_head": 3.2373409271240234,
"loss_layer_24_head": 0.20368175208568573,
"loss_layer_30_head": 0.1347997486591339,
"loss_layer_36_head": 0.088289774954319,
"loss_layer_42_head": 0.062674880027771,
"loss_layer_6_head": 0.7211120128631592,
"step": 660
},
{
"epoch": 30.313390313390315,
"grad_norm": 0.997608350407498,
"learning_rate": 0.0043184341039326215,
"loss": 4.9774,
"loss_layer_12_head": 0.541628897190094,
"loss_layer_18_head": 3.1904382705688477,
"loss_layer_24_head": 0.1936037689447403,
"loss_layer_30_head": 0.13256603479385376,
"loss_layer_36_head": 0.10135439783334732,
"loss_layer_42_head": 0.059117190539836884,
"loss_layer_6_head": 0.7407132387161255,
"step": 665
},
{
"epoch": 30.54131054131054,
"grad_norm": 0.8596300884902531,
"learning_rate": 0.004304112922051156,
"loss": 4.8952,
"loss_layer_12_head": 0.5269235968589783,
"loss_layer_18_head": 3.068875789642334,
"loss_layer_24_head": 0.20888909697532654,
"loss_layer_30_head": 0.17197871208190918,
"loss_layer_36_head": 0.09767322242259979,
"loss_layer_42_head": 0.05949018523097038,
"loss_layer_6_head": 0.7554638981819153,
"step": 670
},
{
"epoch": 30.76923076923077,
"grad_norm": 0.5476872326854121,
"learning_rate": 0.004289667123149296,
"loss": 4.7437,
"loss_layer_12_head": 0.4962801933288574,
"loss_layer_18_head": 2.9493045806884766,
"loss_layer_24_head": 0.21296346187591553,
"loss_layer_30_head": 0.1855914145708084,
"loss_layer_36_head": 0.09603692591190338,
"loss_layer_42_head": 0.05795098468661308,
"loss_layer_6_head": 0.734602153301239,
"step": 675
},
{
"epoch": 30.997150997150996,
"grad_norm": 0.5685599681812281,
"learning_rate": 0.00427509770505395,
"loss": 4.6172,
"loss_layer_12_head": 0.4792402684688568,
"loss_layer_18_head": 2.841484785079956,
"loss_layer_24_head": 0.20963506400585175,
"loss_layer_30_head": 0.17262789607048035,
"loss_layer_36_head": 0.09381120651960373,
"loss_layer_42_head": 0.056700825691223145,
"loss_layer_6_head": 0.7296031713485718,
"step": 680
},
{
"epoch": 31.225071225071225,
"grad_norm": 0.4938244163515248,
"learning_rate": 0.00426040567413088,
"loss": 4.4604,
"loss_layer_12_head": 0.42545080184936523,
"loss_layer_18_head": 2.7222859859466553,
"loss_layer_24_head": 0.19360534846782684,
"loss_layer_30_head": 0.1514999121427536,
"loss_layer_36_head": 0.08772268146276474,
"loss_layer_42_head": 0.06376411765813828,
"loss_layer_6_head": 0.6847577095031738,
"step": 685
},
{
"epoch": 31.45299145299145,
"grad_norm": 0.5301807150076605,
"learning_rate": 0.004245592045215182,
"loss": 4.3332,
"loss_layer_12_head": 0.42786169052124023,
"loss_layer_18_head": 2.699221134185791,
"loss_layer_24_head": 0.19219763576984406,
"loss_layer_30_head": 0.14631637930870056,
"loss_layer_36_head": 0.08851729333400726,
"loss_layer_42_head": 0.054960232228040695,
"loss_layer_6_head": 0.6882950663566589,
"step": 690
},
{
"epoch": 31.68091168091168,
"grad_norm": 0.4495253512587936,
"learning_rate": 0.004230657841541199,
"loss": 4.2686,
"loss_layer_12_head": 0.4296341836452484,
"loss_layer_18_head": 2.653172254562378,
"loss_layer_24_head": 0.19859598577022552,
"loss_layer_30_head": 0.14528068900108337,
"loss_layer_36_head": 0.08939908444881439,
"loss_layer_42_head": 0.05449342727661133,
"loss_layer_6_head": 0.6941906213760376,
"step": 695
},
{
"epoch": 31.90883190883191,
"grad_norm": 0.7842110163127093,
"learning_rate": 0.004215604094671834,
"loss": 4.2071,
"loss_layer_12_head": 0.4337303042411804,
"loss_layer_18_head": 2.5614027976989746,
"loss_layer_24_head": 0.2046552449464798,
"loss_layer_30_head": 0.13873878121376038,
"loss_layer_36_head": 0.08663885295391083,
"loss_layer_42_head": 0.06045306846499443,
"loss_layer_6_head": 0.7004567384719849,
"step": 700
},
{
"epoch": 32.136752136752136,
"grad_norm": 0.7965796662169216,
"learning_rate": 0.004200431844427298,
"loss": 4.1225,
"loss_layer_12_head": 0.4378797113895416,
"loss_layer_18_head": 2.602318525314331,
"loss_layer_24_head": 0.19969557225704193,
"loss_layer_30_head": 0.1381944864988327,
"loss_layer_36_head": 0.08641330897808075,
"loss_layer_42_head": 0.057823099195957184,
"loss_layer_6_head": 0.7061583995819092,
"step": 705
},
{
"epoch": 32.364672364672366,
"grad_norm": 0.605583079609419,
"learning_rate": 0.004185142138813288,
"loss": 4.1027,
"loss_layer_12_head": 0.42314305901527405,
"loss_layer_18_head": 2.5009102821350098,
"loss_layer_24_head": 0.19693370163440704,
"loss_layer_30_head": 0.1453002393245697,
"loss_layer_36_head": 0.08666378259658813,
"loss_layer_42_head": 0.054904550313949585,
"loss_layer_6_head": 0.7133627533912659,
"step": 710
},
{
"epoch": 32.592592592592595,
"grad_norm": 0.5948001098340642,
"learning_rate": 0.004169736033948593,
"loss": 4.0341,
"loss_layer_12_head": 0.41084012389183044,
"loss_layer_18_head": 2.4099440574645996,
"loss_layer_24_head": 0.18852706253528595,
"loss_layer_30_head": 0.1356324553489685,
"loss_layer_36_head": 0.08520137518644333,
"loss_layer_42_head": 0.05375955253839493,
"loss_layer_6_head": 0.6864619255065918,
"step": 715
},
{
"epoch": 32.82051282051282,
"grad_norm": 0.534772333872558,
"learning_rate": 0.004154214593992149,
"loss": 3.9781,
"loss_layer_12_head": 0.4393290579319,
"loss_layer_18_head": 2.4041430950164795,
"loss_layer_24_head": 0.2018025815486908,
"loss_layer_30_head": 0.14416970312595367,
"loss_layer_36_head": 0.09272973984479904,
"loss_layer_42_head": 0.07788576930761337,
"loss_layer_6_head": 0.6994581818580627,
"step": 720
},
{
"epoch": 33.04843304843305,
"grad_norm": 0.9032624922336923,
"learning_rate": 0.004138578891069526,
"loss": 4.08,
"loss_layer_12_head": 0.4454478621482849,
"loss_layer_18_head": 2.357255458831787,
"loss_layer_24_head": 0.32440370321273804,
"loss_layer_30_head": 0.13958565890789032,
"loss_layer_36_head": 0.1198217049241066,
"loss_layer_42_head": 0.07531634718179703,
"loss_layer_6_head": 0.7142508029937744,
"step": 725
},
{
"epoch": 33.27635327635328,
"grad_norm": 0.5157652034690633,
"learning_rate": 0.00412283000519888,
"loss": 4.047,
"loss_layer_12_head": 0.4016711115837097,
"loss_layer_18_head": 2.2794528007507324,
"loss_layer_24_head": 0.3679157495498657,
"loss_layer_30_head": 0.1256381869316101,
"loss_layer_36_head": 0.10068968683481216,
"loss_layer_42_head": 0.06270381063222885,
"loss_layer_6_head": 0.6438853144645691,
"step": 730
},
{
"epoch": 33.504273504273506,
"grad_norm": 0.7863247890392172,
"learning_rate": 0.004106969024216348,
"loss": 3.9464,
"loss_layer_12_head": 0.45711761713027954,
"loss_layer_18_head": 2.191732406616211,
"loss_layer_24_head": 0.33409491181373596,
"loss_layer_30_head": 0.1311335265636444,
"loss_layer_36_head": 0.10090503841638565,
"loss_layer_42_head": 0.0615161657333374,
"loss_layer_6_head": 0.6720151901245117,
"step": 735
},
{
"epoch": 33.732193732193736,
"grad_norm": 0.7890469543055019,
"learning_rate": 0.004090997043700909,
"loss": 3.9344,
"loss_layer_12_head": 0.47232159972190857,
"loss_layer_18_head": 2.2468361854553223,
"loss_layer_24_head": 0.29550954699516296,
"loss_layer_30_head": 0.13495633006095886,
"loss_layer_36_head": 0.09402237832546234,
"loss_layer_42_head": 0.05762190371751785,
"loss_layer_6_head": 0.6706913113594055,
"step": 740
},
{
"epoch": 33.96011396011396,
"grad_norm": 0.6929213281119304,
"learning_rate": 0.004074915166898703,
"loss": 3.8988,
"loss_layer_12_head": 0.48199015855789185,
"loss_layer_18_head": 2.14357328414917,
"loss_layer_24_head": 0.2758108973503113,
"loss_layer_30_head": 0.14659464359283447,
"loss_layer_36_head": 0.09513117372989655,
"loss_layer_42_head": 0.05680239200592041,
"loss_layer_6_head": 0.7120590806007385,
"step": 745
},
{
"epoch": 34.18803418803419,
"grad_norm": 0.7353968166938382,
"learning_rate": 0.004058724504646834,
"loss": 3.7431,
"loss_layer_12_head": 0.426646888256073,
"loss_layer_18_head": 2.1235299110412598,
"loss_layer_24_head": 0.23976945877075195,
"loss_layer_30_head": 0.12898829579353333,
"loss_layer_36_head": 0.08852064609527588,
"loss_layer_42_head": 0.054186951369047165,
"loss_layer_6_head": 0.6639636158943176,
"step": 750
},
{
"epoch": 34.41595441595442,
"grad_norm": 0.6556313337678951,
"learning_rate": 0.004042426175296631,
"loss": 3.69,
"loss_layer_12_head": 0.40987634658813477,
"loss_layer_18_head": 2.0121848583221436,
"loss_layer_24_head": 0.2265264093875885,
"loss_layer_30_head": 0.13357605040073395,
"loss_layer_36_head": 0.08718468993902206,
"loss_layer_42_head": 0.054076552391052246,
"loss_layer_6_head": 0.6570177674293518,
"step": 755
},
{
"epoch": 34.643874643874646,
"grad_norm": 0.503404286314858,
"learning_rate": 0.004026021304636408,
"loss": 3.6462,
"loss_layer_12_head": 0.43029722571372986,
"loss_layer_18_head": 2.0399160385131836,
"loss_layer_24_head": 0.22090141475200653,
"loss_layer_30_head": 0.13173122704029083,
"loss_layer_36_head": 0.0885952040553093,
"loss_layer_42_head": 0.053800784051418304,
"loss_layer_6_head": 0.702730119228363,
"step": 760
},
{
"epoch": 34.87179487179487,
"grad_norm": 0.477295528591719,
"learning_rate": 0.0040095110258136935,
"loss": 3.6617,
"loss_layer_12_head": 0.42810750007629395,
"loss_layer_18_head": 2.095346689224243,
"loss_layer_24_head": 0.21871677041053772,
"loss_layer_30_head": 0.1337631195783615,
"loss_layer_36_head": 0.0910869836807251,
"loss_layer_42_head": 0.056711532175540924,
"loss_layer_6_head": 0.6914973258972168,
"step": 765
},
{
"epoch": 35.0997150997151,
"grad_norm": 0.4481247294956435,
"learning_rate": 0.003992896479256966,
"loss": 3.5825,
"loss_layer_12_head": 0.4251008927822113,
"loss_layer_18_head": 1.9943554401397705,
"loss_layer_24_head": 0.2084464132785797,
"loss_layer_30_head": 0.12890145182609558,
"loss_layer_36_head": 0.08945901691913605,
"loss_layer_42_head": 0.052441976964473724,
"loss_layer_6_head": 0.6897009015083313,
"step": 770
},
{
"epoch": 35.32763532763533,
"grad_norm": 0.4019794924847544,
"learning_rate": 0.003976178812596875,
"loss": 3.4959,
"loss_layer_12_head": 0.404086172580719,
"loss_layer_18_head": 1.9240039587020874,
"loss_layer_24_head": 0.19800055027008057,
"loss_layer_30_head": 0.12846828997135162,
"loss_layer_36_head": 0.0879533439874649,
"loss_layer_42_head": 0.0561065748333931,
"loss_layer_6_head": 0.6614921689033508,
"step": 775
},
{
"epoch": 35.55555555555556,
"grad_norm": 0.4836393513656827,
"learning_rate": 0.003959359180586975,
"loss": 3.4916,
"loss_layer_12_head": 0.40365737676620483,
"loss_layer_18_head": 1.9353296756744385,
"loss_layer_24_head": 0.1916879415512085,
"loss_layer_30_head": 0.13406383991241455,
"loss_layer_36_head": 0.08549612015485764,
"loss_layer_42_head": 0.05212582275271416,
"loss_layer_6_head": 0.6601444482803345,
"step": 780
},
{
"epoch": 35.78347578347579,
"grad_norm": 0.7383438714425403,
"learning_rate": 0.003942438745023957,
"loss": 3.4015,
"loss_layer_12_head": 0.41292792558670044,
"loss_layer_18_head": 1.8313575983047485,
"loss_layer_24_head": 0.19283311069011688,
"loss_layer_30_head": 0.1311044692993164,
"loss_layer_36_head": 0.08634034544229507,
"loss_layer_42_head": 0.05280064791440964,
"loss_layer_6_head": 0.6659265160560608,
"step": 785
},
{
"epoch": 36.01139601139601,
"grad_norm": 0.4124024976711123,
"learning_rate": 0.003925418674667404,
"loss": 3.4839,
"loss_layer_12_head": 0.4237042963504791,
"loss_layer_18_head": 1.8229494094848633,
"loss_layer_24_head": 0.19902099668979645,
"loss_layer_30_head": 0.13239210844039917,
"loss_layer_36_head": 0.08935176581144333,
"loss_layer_42_head": 0.05493398755788803,
"loss_layer_6_head": 0.6895285844802856,
"step": 790
},
{
"epoch": 36.23931623931624,
"grad_norm": 0.580340353250173,
"learning_rate": 0.003908300145159055,
"loss": 3.3939,
"loss_layer_12_head": 0.3949509263038635,
"loss_layer_18_head": 1.807559609413147,
"loss_layer_24_head": 0.18570482730865479,
"loss_layer_30_head": 0.12840019166469574,
"loss_layer_36_head": 0.09089195728302002,
"loss_layer_42_head": 0.052501481026411057,
"loss_layer_6_head": 0.6542239189147949,
"step": 795
},
{
"epoch": 36.46723646723647,
"grad_norm": 0.8135204155958197,
"learning_rate": 0.003891084338941603,
"loss": 3.3605,
"loss_layer_12_head": 0.39419084787368774,
"loss_layer_18_head": 1.7904040813446045,
"loss_layer_24_head": 0.1839243322610855,
"loss_layer_30_head": 0.12546224892139435,
"loss_layer_36_head": 0.08821476995944977,
"loss_layer_42_head": 0.04802712798118591,
"loss_layer_6_head": 0.6653536558151245,
"step": 800
},
{
"epoch": 36.46723646723647,
"eval_loss": 4.920344829559326,
"eval_loss_layer_12_head": 0.8174667954444885,
"eval_loss_layer_18_head": 1.66545832157135,
"eval_loss_layer_24_head": 0.4409888684749603,
"eval_loss_layer_30_head": 0.30910196900367737,
"eval_loss_layer_36_head": 0.20546264946460724,
"eval_loss_layer_42_head": 0.13648581504821777,
"eval_loss_layer_6_head": 1.103848934173584,
"eval_runtime": 4.9515,
"eval_samples_per_second": 6.665,
"eval_steps_per_second": 0.606,
"step": 800
},
{
"epoch": 36.6951566951567,
"grad_norm": 0.8790506399999926,
"learning_rate": 0.003873772445177015,
"loss": 3.3515,
"loss_layer_12_head": 0.41721591353416443,
"loss_layer_18_head": 1.7524770498275757,
"loss_layer_24_head": 0.19004251062870026,
"loss_layer_30_head": 0.12846611440181732,
"loss_layer_36_head": 0.0869293063879013,
"loss_layer_42_head": 0.05517953634262085,
"loss_layer_6_head": 0.6763615012168884,
"step": 805
},
{
"epoch": 36.92307692307692,
"grad_norm": 0.6459144380295287,
"learning_rate": 0.0038563656596643987,
"loss": 3.4019,
"loss_layer_12_head": 0.4642051160335541,
"loss_layer_18_head": 1.7727333307266235,
"loss_layer_24_head": 0.18619823455810547,
"loss_layer_30_head": 0.12455607950687408,
"loss_layer_36_head": 0.08845292031764984,
"loss_layer_42_head": 0.07540778815746307,
"loss_layer_6_head": 0.6746565699577332,
"step": 810
},
{
"epoch": 37.15099715099715,
"grad_norm": 0.9964625761499913,
"learning_rate": 0.0038388651847573963,
"loss": 3.3717,
"loss_layer_12_head": 0.47130832076072693,
"loss_layer_18_head": 1.7519481182098389,
"loss_layer_24_head": 0.1870810091495514,
"loss_layer_30_head": 0.1326059103012085,
"loss_layer_36_head": 0.08662253618240356,
"loss_layer_42_head": 0.0689396858215332,
"loss_layer_6_head": 0.6901426315307617,
"step": 815
},
{
"epoch": 37.37891737891738,
"grad_norm": 0.7408034008604258,
"learning_rate": 0.0038212722292811385,
"loss": 3.3646,
"loss_layer_12_head": 0.4616113603115082,
"loss_layer_18_head": 1.7593231201171875,
"loss_layer_24_head": 0.18609504401683807,
"loss_layer_30_head": 0.12701207399368286,
"loss_layer_36_head": 0.08557876199483871,
"loss_layer_42_head": 0.06022990494966507,
"loss_layer_6_head": 0.6876755952835083,
"step": 820
},
{
"epoch": 37.60683760683761,
"grad_norm": 0.5628531711786754,
"learning_rate": 0.0038035880084487453,
"loss": 3.3382,
"loss_layer_12_head": 0.4506490230560303,
"loss_layer_18_head": 1.7187092304229736,
"loss_layer_24_head": 0.18839967250823975,
"loss_layer_30_head": 0.12567642331123352,
"loss_layer_36_head": 0.08187349885702133,
"loss_layer_42_head": 0.053150080144405365,
"loss_layer_6_head": 0.6763015985488892,
"step": 825
},
{
"epoch": 37.83475783475784,
"grad_norm": 0.39395241311283635,
"learning_rate": 0.003785813743777384,
"loss": 3.2585,
"loss_layer_12_head": 0.46294349431991577,
"loss_layer_18_head": 1.6916673183441162,
"loss_layer_24_head": 0.18889106810092926,
"loss_layer_30_head": 0.12418197095394135,
"loss_layer_36_head": 0.08617638051509857,
"loss_layer_42_head": 0.05219808965921402,
"loss_layer_6_head": 0.6769475936889648,
"step": 830
},
{
"epoch": 38.06267806267806,
"grad_norm": 0.4833603204338285,
"learning_rate": 0.003767950663003898,
"loss": 3.2325,
"loss_layer_12_head": 0.45097413659095764,
"loss_layer_18_head": 1.6738840341567993,
"loss_layer_24_head": 0.1965884268283844,
"loss_layer_30_head": 0.13039812445640564,
"loss_layer_36_head": 0.0916060358285904,
"loss_layer_42_head": 0.05350957438349724,
"loss_layer_6_head": 0.6804872751235962,
"step": 835
},
{
"epoch": 38.29059829059829,
"grad_norm": 0.4615221255562949,
"learning_rate": 0.00375,
"loss": 3.1414,
"loss_layer_12_head": 0.41619110107421875,
"loss_layer_18_head": 1.646401047706604,
"loss_layer_24_head": 0.18583571910858154,
"loss_layer_30_head": 0.1233031377196312,
"loss_layer_36_head": 0.0816354975104332,
"loss_layer_42_head": 0.052404772490262985,
"loss_layer_6_head": 0.6518012881278992,
"step": 840
},
{
"epoch": 38.51851851851852,
"grad_norm": 0.6077690682168392,
"learning_rate": 0.0037319629946870442,
"loss": 3.1353,
"loss_layer_12_head": 0.4147290289402008,
"loss_layer_18_head": 1.6200926303863525,
"loss_layer_24_head": 0.19160741567611694,
"loss_layer_30_head": 0.12613636255264282,
"loss_layer_36_head": 0.08432339131832123,
"loss_layer_42_head": 0.05299381539225578,
"loss_layer_6_head": 0.6604996919631958,
"step": 845
},
{
"epoch": 38.74643874643875,
"grad_norm": 0.5471138428591616,
"learning_rate": 0.0037138408929503802,
"loss": 3.1415,
"loss_layer_12_head": 0.4221917688846588,
"loss_layer_18_head": 1.5912879705429077,
"loss_layer_24_head": 0.19453270733356476,
"loss_layer_30_head": 0.1206267923116684,
"loss_layer_36_head": 0.0878443568944931,
"loss_layer_42_head": 0.049645692110061646,
"loss_layer_6_head": 0.68048095703125,
"step": 850
},
{
"epoch": 38.97435897435897,
"grad_norm": 0.6581590592047735,
"learning_rate": 0.0036956349465532955,
"loss": 3.082,
"loss_layer_12_head": 0.4116048216819763,
"loss_layer_18_head": 1.5232570171356201,
"loss_layer_24_head": 0.19163082540035248,
"loss_layer_30_head": 0.12269763648509979,
"loss_layer_36_head": 0.08780638873577118,
"loss_layer_42_head": 0.053409360349178314,
"loss_layer_6_head": 0.6515310406684875,
"step": 855
},
{
"epoch": 39.2022792022792,
"grad_norm": 0.41965636629777003,
"learning_rate": 0.0036773464130505505,
"loss": 3.0273,
"loss_layer_12_head": 0.4055556654930115,
"loss_layer_18_head": 1.5530657768249512,
"loss_layer_24_head": 0.18856747448444366,
"loss_layer_30_head": 0.11959820985794067,
"loss_layer_36_head": 0.08429338783025742,
"loss_layer_42_head": 0.050204742699861526,
"loss_layer_6_head": 0.6473517417907715,
"step": 860
},
{
"epoch": 39.43019943019943,
"grad_norm": 0.5358440091372462,
"learning_rate": 0.0036589765557015143,
"loss": 2.9578,
"loss_layer_12_head": 0.3931970000267029,
"loss_layer_18_head": 1.5244758129119873,
"loss_layer_24_head": 0.18152835965156555,
"loss_layer_30_head": 0.11682531982660294,
"loss_layer_36_head": 0.08009742945432663,
"loss_layer_42_head": 0.04725899547338486,
"loss_layer_6_head": 0.6312907338142395,
"step": 865
},
{
"epoch": 39.65811965811966,
"grad_norm": 0.6894125877078248,
"learning_rate": 0.0036405266433829075,
"loss": 3.0322,
"loss_layer_12_head": 0.39468279480934143,
"loss_layer_18_head": 1.5026872158050537,
"loss_layer_24_head": 0.17886415123939514,
"loss_layer_30_head": 0.13253279030323029,
"loss_layer_36_head": 0.08210141956806183,
"loss_layer_42_head": 0.0509297177195549,
"loss_layer_6_head": 0.6466023325920105,
"step": 870
},
{
"epoch": 39.88603988603989,
"grad_norm": 0.8054205400910994,
"learning_rate": 0.0036219979505011557,
"loss": 3.1048,
"loss_layer_12_head": 0.426312118768692,
"loss_layer_18_head": 1.4662295579910278,
"loss_layer_24_head": 0.19096367061138153,
"loss_layer_30_head": 0.17445117235183716,
"loss_layer_36_head": 0.08434946835041046,
"loss_layer_42_head": 0.06008179858326912,
"loss_layer_6_head": 0.689109742641449,
"step": 875
},
{
"epoch": 40.11396011396011,
"grad_norm": 0.5198267607735384,
"learning_rate": 0.00360339175690436,
"loss": 3.0127,
"loss_layer_12_head": 0.4104757308959961,
"loss_layer_18_head": 1.467791199684143,
"loss_layer_24_head": 0.18111568689346313,
"loss_layer_30_head": 0.1645602285861969,
"loss_layer_36_head": 0.0877971202135086,
"loss_layer_42_head": 0.05487741157412529,
"loss_layer_6_head": 0.658033549785614,
"step": 880
},
{
"epoch": 40.34188034188034,
"grad_norm": 0.5315864148753724,
"learning_rate": 0.0035847093477938954,
"loss": 2.9515,
"loss_layer_12_head": 0.39946484565734863,
"loss_layer_18_head": 1.433086633682251,
"loss_layer_24_head": 0.1778954565525055,
"loss_layer_30_head": 0.14781329035758972,
"loss_layer_36_head": 0.08687538653612137,
"loss_layer_42_head": 0.051437150686979294,
"loss_layer_6_head": 0.6485563516616821,
"step": 885
},
{
"epoch": 40.56980056980057,
"grad_norm": 0.5423520974281545,
"learning_rate": 0.003565952013635635,
"loss": 2.9617,
"loss_layer_12_head": 0.3888796865940094,
"loss_layer_18_head": 1.4096533060073853,
"loss_layer_24_head": 0.1837933510541916,
"loss_layer_30_head": 0.13355405628681183,
"loss_layer_36_head": 0.0816507488489151,
"loss_layer_42_head": 0.05050645396113396,
"loss_layer_6_head": 0.6469558477401733,
"step": 890
},
{
"epoch": 40.7977207977208,
"grad_norm": 0.4530753536523479,
"learning_rate": 0.0035471210500708124,
"loss": 2.9395,
"loss_layer_12_head": 0.41357699036598206,
"loss_layer_18_head": 1.437182068824768,
"loss_layer_24_head": 0.1888083517551422,
"loss_layer_30_head": 0.13409912586212158,
"loss_layer_36_head": 0.08812478184700012,
"loss_layer_42_head": 0.05342671275138855,
"loss_layer_6_head": 0.6669130325317383,
"step": 895
},
{
"epoch": 41.02564102564103,
"grad_norm": 0.5159717906721856,
"learning_rate": 0.0035282177578265296,
"loss": 2.8916,
"loss_layer_12_head": 0.41983136534690857,
"loss_layer_18_head": 1.4326869249343872,
"loss_layer_24_head": 0.192642480134964,
"loss_layer_30_head": 0.13045620918273926,
"loss_layer_36_head": 0.08625391125679016,
"loss_layer_42_head": 0.05178719013929367,
"loss_layer_6_head": 0.6592742204666138,
"step": 900
},
{
"epoch": 41.25356125356125,
"grad_norm": 0.6128247487167494,
"learning_rate": 0.0035092434426259055,
"loss": 2.8484,
"loss_layer_12_head": 0.39752325415611267,
"loss_layer_18_head": 1.4147056341171265,
"loss_layer_24_head": 0.17930717766284943,
"loss_layer_30_head": 0.12136325985193253,
"loss_layer_36_head": 0.07983438670635223,
"loss_layer_42_head": 0.04938334599137306,
"loss_layer_6_head": 0.6326755285263062,
"step": 905
},
{
"epoch": 41.48148148148148,
"grad_norm": 0.5370777148494995,
"learning_rate": 0.003490199415097892,
"loss": 2.8121,
"loss_layer_12_head": 0.4000681936740875,
"loss_layer_18_head": 1.3680517673492432,
"loss_layer_24_head": 0.17827217280864716,
"loss_layer_30_head": 0.12119412422180176,
"loss_layer_36_head": 0.07962208241224289,
"loss_layer_42_head": 0.04752824455499649,
"loss_layer_6_head": 0.6287747621536255,
"step": 910
},
{
"epoch": 41.70940170940171,
"grad_norm": 0.7708829574678,
"learning_rate": 0.003471086990686737,
"loss": 2.8402,
"loss_layer_12_head": 0.4188622534275055,
"loss_layer_18_head": 1.355021595954895,
"loss_layer_24_head": 0.19683413207530975,
"loss_layer_30_head": 0.12976667284965515,
"loss_layer_36_head": 0.0859081894159317,
"loss_layer_42_head": 0.056226927787065506,
"loss_layer_6_head": 0.6642878651618958,
"step": 915
},
{
"epoch": 41.93732193732194,
"grad_norm": 0.7449833212059208,
"learning_rate": 0.003451907489561124,
"loss": 2.8444,
"loss_layer_12_head": 0.41541725397109985,
"loss_layer_18_head": 1.3178496360778809,
"loss_layer_24_head": 0.18967287242412567,
"loss_layer_30_head": 0.12230970710515976,
"loss_layer_36_head": 0.08014537394046783,
"loss_layer_42_head": 0.05525387078523636,
"loss_layer_6_head": 0.6513751745223999,
"step": 920
},
{
"epoch": 42.16524216524216,
"grad_norm": 0.5477559692126953,
"learning_rate": 0.0034326622365229847,
"loss": 2.7887,
"loss_layer_12_head": 0.3959849774837494,
"loss_layer_18_head": 1.2648799419403076,
"loss_layer_24_head": 0.18191051483154297,
"loss_layer_30_head": 0.1164868101477623,
"loss_layer_36_head": 0.08249407261610031,
"loss_layer_42_head": 0.05099458247423172,
"loss_layer_6_head": 0.6313720941543579,
"step": 925
},
{
"epoch": 42.39316239316239,
"grad_norm": 0.4794834286147715,
"learning_rate": 0.0034133525609159883,
"loss": 2.7665,
"loss_layer_12_head": 0.4049781858921051,
"loss_layer_18_head": 1.2870421409606934,
"loss_layer_24_head": 0.19015896320343018,
"loss_layer_30_head": 0.11839592456817627,
"loss_layer_36_head": 0.08894439786672592,
"loss_layer_42_head": 0.0506242997944355,
"loss_layer_6_head": 0.6355887651443481,
"step": 930
},
{
"epoch": 42.62108262108262,
"grad_norm": 0.49916006862369783,
"learning_rate": 0.0033939797965337154,
"loss": 2.7637,
"loss_layer_12_head": 0.41396409273147583,
"loss_layer_18_head": 1.2755236625671387,
"loss_layer_24_head": 0.19518128037452698,
"loss_layer_30_head": 0.12190593779087067,
"loss_layer_36_head": 0.09020756930112839,
"loss_layer_42_head": 0.050995856523513794,
"loss_layer_6_head": 0.6555052995681763,
"step": 935
},
{
"epoch": 42.84900284900285,
"grad_norm": 0.4882390844422191,
"learning_rate": 0.0033745452815275375,
"loss": 2.7417,
"loss_layer_12_head": 0.40585416555404663,
"loss_layer_18_head": 1.2714842557907104,
"loss_layer_24_head": 0.18599747121334076,
"loss_layer_30_head": 0.11775036156177521,
"loss_layer_36_head": 0.08614761382341385,
"loss_layer_42_head": 0.048269741237163544,
"loss_layer_6_head": 0.6390920281410217,
"step": 940
},
{
"epoch": 43.07692307692308,
"grad_norm": 0.39823098594662537,
"learning_rate": 0.003355050358314172,
"loss": 2.6637,
"loss_layer_12_head": 0.3847056031227112,
"loss_layer_18_head": 1.2039114236831665,
"loss_layer_24_head": 0.176110178232193,
"loss_layer_30_head": 0.11044038832187653,
"loss_layer_36_head": 0.08465754985809326,
"loss_layer_42_head": 0.04742031544446945,
"loss_layer_6_head": 0.5980373620986938,
"step": 945
},
{
"epoch": 43.3048433048433,
"grad_norm": 0.6900260033440537,
"learning_rate": 0.0033354963734829692,
"loss": 2.7018,
"loss_layer_12_head": 0.3878379762172699,
"loss_layer_18_head": 1.2014033794403076,
"loss_layer_24_head": 0.17556259036064148,
"loss_layer_30_head": 0.11799554526805878,
"loss_layer_36_head": 0.08010242134332657,
"loss_layer_42_head": 0.04850541055202484,
"loss_layer_6_head": 0.6182578206062317,
"step": 950
},
{
"epoch": 43.53276353276353,
"grad_norm": 0.47429116139160615,
"learning_rate": 0.0033158846777028893,
"loss": 2.6455,
"loss_layer_12_head": 0.3960720896720886,
"loss_layer_18_head": 1.1679050922393799,
"loss_layer_24_head": 0.17802616953849792,
"loss_layer_30_head": 0.12192656844854355,
"loss_layer_36_head": 0.0852232277393341,
"loss_layer_42_head": 0.049923814833164215,
"loss_layer_6_head": 0.6219567060470581,
"step": 955
},
{
"epoch": 43.76068376068376,
"grad_norm": 0.4188547473795177,
"learning_rate": 0.0032962166256292114,
"loss": 2.6593,
"loss_layer_12_head": 0.39642995595932007,
"loss_layer_18_head": 1.1589549779891968,
"loss_layer_24_head": 0.1802503764629364,
"loss_layer_30_head": 0.11738133430480957,
"loss_layer_36_head": 0.08084283769130707,
"loss_layer_42_head": 0.0486336275935173,
"loss_layer_6_head": 0.624832034111023,
"step": 960
},
{
"epoch": 43.98860398860399,
"grad_norm": 0.6326866995847439,
"learning_rate": 0.0032764935758099597,
"loss": 2.6418,
"loss_layer_12_head": 0.3885257840156555,
"loss_layer_18_head": 1.1653964519500732,
"loss_layer_24_head": 0.18046359717845917,
"loss_layer_30_head": 0.1161709874868393,
"loss_layer_36_head": 0.07991600781679153,
"loss_layer_42_head": 0.04998940974473953,
"loss_layer_6_head": 0.6206977963447571,
"step": 965
},
{
"epoch": 44.21652421652421,
"grad_norm": 0.6425900494900251,
"learning_rate": 0.003256716890592065,
"loss": 2.5966,
"loss_layer_12_head": 0.3927614986896515,
"loss_layer_18_head": 1.167649745941162,
"loss_layer_24_head": 0.18743163347244263,
"loss_layer_30_head": 0.11512549221515656,
"loss_layer_36_head": 0.07711974531412125,
"loss_layer_42_head": 0.047446832060813904,
"loss_layer_6_head": 0.6326077580451965,
"step": 970
},
{
"epoch": 44.44444444444444,
"grad_norm": 0.6079576741884587,
"learning_rate": 0.003236887936027261,
"loss": 2.5795,
"loss_layer_12_head": 0.39777034521102905,
"loss_layer_18_head": 1.1651580333709717,
"loss_layer_24_head": 0.18374374508857727,
"loss_layer_30_head": 0.11680477857589722,
"loss_layer_36_head": 0.07859492301940918,
"loss_layer_42_head": 0.047878436744213104,
"loss_layer_6_head": 0.6312351226806641,
"step": 975
},
{
"epoch": 44.67236467236467,
"grad_norm": 0.5747495318763676,
"learning_rate": 0.003217008081777726,
"loss": 2.5885,
"loss_layer_12_head": 0.401299387216568,
"loss_layer_18_head": 1.1307117938995361,
"loss_layer_24_head": 0.18022188544273376,
"loss_layer_30_head": 0.11754100024700165,
"loss_layer_36_head": 0.07774122804403305,
"loss_layer_42_head": 0.05039285495877266,
"loss_layer_6_head": 0.6277211904525757,
"step": 980
},
{
"epoch": 44.9002849002849,
"grad_norm": 0.6882788247944827,
"learning_rate": 0.003197078701021476,
"loss": 2.6245,
"loss_layer_12_head": 0.4062952399253845,
"loss_layer_18_head": 1.1433289051055908,
"loss_layer_24_head": 0.18742702901363373,
"loss_layer_30_head": 0.12414474785327911,
"loss_layer_36_head": 0.0802634060382843,
"loss_layer_42_head": 0.05628126859664917,
"loss_layer_6_head": 0.6465874910354614,
"step": 985
},
{
"epoch": 45.12820512820513,
"grad_norm": 0.6548183936802736,
"learning_rate": 0.003177101170357513,
"loss": 2.5927,
"loss_layer_12_head": 0.39534568786621094,
"loss_layer_18_head": 1.1005592346191406,
"loss_layer_24_head": 0.18521855771541595,
"loss_layer_30_head": 0.1259688138961792,
"loss_layer_36_head": 0.07717721909284592,
"loss_layer_42_head": 0.05370775982737541,
"loss_layer_6_head": 0.6309455633163452,
"step": 990
},
{
"epoch": 45.356125356125354,
"grad_norm": 0.40696849199308827,
"learning_rate": 0.0031570768697107383,
"loss": 2.5165,
"loss_layer_12_head": 0.3873128294944763,
"loss_layer_18_head": 1.0747811794281006,
"loss_layer_24_head": 0.17662569880485535,
"loss_layer_30_head": 0.1168486624956131,
"loss_layer_36_head": 0.07445703446865082,
"loss_layer_42_head": 0.05355915427207947,
"loss_layer_6_head": 0.6133853793144226,
"step": 995
},
{
"epoch": 45.58404558404558,
"grad_norm": 0.3913225073967932,
"learning_rate": 0.003137007182236637,
"loss": 2.5177,
"loss_layer_12_head": 0.38922491669654846,
"loss_layer_18_head": 1.0804240703582764,
"loss_layer_24_head": 0.17917446792125702,
"loss_layer_30_head": 0.1220487579703331,
"loss_layer_36_head": 0.0791519358754158,
"loss_layer_42_head": 0.049967654049396515,
"loss_layer_6_head": 0.6184254884719849,
"step": 1000
},
{
"epoch": 45.58404558404558,
"eval_loss": 4.238807201385498,
"eval_loss_layer_12_head": 0.8041837215423584,
"eval_loss_layer_18_head": 1.1114610433578491,
"eval_loss_layer_24_head": 0.4403406083583832,
"eval_loss_layer_30_head": 0.303775817155838,
"eval_loss_layer_36_head": 0.22167228162288666,
"eval_loss_layer_42_head": 0.1412229984998703,
"eval_loss_layer_6_head": 1.0907320976257324,
"eval_runtime": 4.9434,
"eval_samples_per_second": 6.676,
"eval_steps_per_second": 0.607,
"step": 1000
},
{
"epoch": 45.81196581196581,
"grad_norm": 0.698286057801242,
"learning_rate": 0.0031168934942257336,
"loss": 2.5466,
"loss_layer_12_head": 0.39911216497421265,
"loss_layer_18_head": 1.0675843954086304,
"loss_layer_24_head": 0.18049687147140503,
"loss_layer_30_head": 0.11798091977834702,
"loss_layer_36_head": 0.09117527306079865,
"loss_layer_42_head": 0.0478927418589592,
"loss_layer_6_head": 0.6361227035522461,
"step": 1005
},
{
"epoch": 46.03988603988604,
"grad_norm": 0.5973347868112177,
"learning_rate": 0.003096737195007845,
"loss": 2.5263,
"loss_layer_12_head": 0.3921283185482025,
"loss_layer_18_head": 1.0628966093063354,
"loss_layer_24_head": 0.18103823065757751,
"loss_layer_30_head": 0.11626795679330826,
"loss_layer_36_head": 0.08541289716959,
"loss_layer_42_head": 0.048044394701719284,
"loss_layer_6_head": 0.6301760673522949,
"step": 1010
},
{
"epoch": 46.267806267806264,
"grad_norm": 0.43531050494934503,
"learning_rate": 0.0030765396768561003,
"loss": 2.4678,
"loss_layer_12_head": 0.37627744674682617,
"loss_layer_18_head": 1.0077391862869263,
"loss_layer_24_head": 0.1717483401298523,
"loss_layer_30_head": 0.11306069046258926,
"loss_layer_36_head": 0.07940587401390076,
"loss_layer_42_head": 0.046035267412662506,
"loss_layer_6_head": 0.6145161390304565,
"step": 1015
},
{
"epoch": 46.495726495726494,
"grad_norm": 0.36523473062420453,
"learning_rate": 0.003056302334890786,
"loss": 2.4591,
"loss_layer_12_head": 0.3900749087333679,
"loss_layer_18_head": 1.0151443481445312,
"loss_layer_24_head": 0.17755632102489471,
"loss_layer_30_head": 0.12042151391506195,
"loss_layer_36_head": 0.08054070919752121,
"loss_layer_42_head": 0.04895460605621338,
"loss_layer_6_head": 0.6219097375869751,
"step": 1020
},
{
"epoch": 46.72364672364672,
"grad_norm": 0.4448392594194299,
"learning_rate": 0.003036026566982969,
"loss": 2.4762,
"loss_layer_12_head": 0.4033172130584717,
"loss_layer_18_head": 1.0222870111465454,
"loss_layer_24_head": 0.18822529911994934,
"loss_layer_30_head": 0.12556472420692444,
"loss_layer_36_head": 0.08045311272144318,
"loss_layer_42_head": 0.04705699533224106,
"loss_layer_6_head": 0.6353141069412231,
"step": 1025
},
{
"epoch": 46.95156695156695,
"grad_norm": 0.4351453730838485,
"learning_rate": 0.0030157137736579443,
"loss": 2.4562,
"loss_layer_12_head": 0.3938303589820862,
"loss_layer_18_head": 0.969444751739502,
"loss_layer_24_head": 0.1871107816696167,
"loss_layer_30_head": 0.11789570748806,
"loss_layer_36_head": 0.07739175856113434,
"loss_layer_42_head": 0.046495065093040466,
"loss_layer_6_head": 0.6121574640274048,
"step": 1030
},
{
"epoch": 47.17948717948718,
"grad_norm": 0.45366609315296125,
"learning_rate": 0.002995365357998494,
"loss": 2.3923,
"loss_layer_12_head": 0.3830070495605469,
"loss_layer_18_head": 1.0128097534179688,
"loss_layer_24_head": 0.17866311967372894,
"loss_layer_30_head": 0.11667140573263168,
"loss_layer_36_head": 0.07569454610347748,
"loss_layer_42_head": 0.045610617846250534,
"loss_layer_6_head": 0.5986355543136597,
"step": 1035
},
{
"epoch": 47.407407407407405,
"grad_norm": 0.46986912758472016,
"learning_rate": 0.0029749827255479756,
"loss": 2.4144,
"loss_layer_12_head": 0.3797675669193268,
"loss_layer_18_head": 0.964137852191925,
"loss_layer_24_head": 0.1742846965789795,
"loss_layer_30_head": 0.11462786048650742,
"loss_layer_36_head": 0.0758369192481041,
"loss_layer_42_head": 0.0473390556871891,
"loss_layer_6_head": 0.6031911373138428,
"step": 1040
},
{
"epoch": 47.635327635327634,
"grad_norm": 0.5575118739748411,
"learning_rate": 0.002954567284213227,
"loss": 2.3859,
"loss_layer_12_head": 0.3897952139377594,
"loss_layer_18_head": 0.9797992706298828,
"loss_layer_24_head": 0.182371586561203,
"loss_layer_30_head": 0.12018144130706787,
"loss_layer_36_head": 0.07999955862760544,
"loss_layer_42_head": 0.04792632535099983,
"loss_layer_6_head": 0.6161386370658875,
"step": 1045
},
{
"epoch": 47.863247863247864,
"grad_norm": 0.40576783272964373,
"learning_rate": 0.0029341204441673263,
"loss": 2.3962,
"loss_layer_12_head": 0.3958156704902649,
"loss_layer_18_head": 0.9727069139480591,
"loss_layer_24_head": 0.17981596291065216,
"loss_layer_30_head": 0.1175287589430809,
"loss_layer_36_head": 0.08136500418186188,
"loss_layer_42_head": 0.045958392322063446,
"loss_layer_6_head": 0.6214891672134399,
"step": 1050
},
{
"epoch": 48.09116809116809,
"grad_norm": 0.41333807616852153,
"learning_rate": 0.002913643617752178,
"loss": 2.3637,
"loss_layer_12_head": 0.38183653354644775,
"loss_layer_18_head": 0.9352052807807922,
"loss_layer_24_head": 0.17474476993083954,
"loss_layer_30_head": 0.11326322704553604,
"loss_layer_36_head": 0.07785725593566895,
"loss_layer_42_head": 0.05166046693921089,
"loss_layer_6_head": 0.6003078818321228,
"step": 1055
},
{
"epoch": 48.319088319088316,
"grad_norm": 0.4902476881076566,
"learning_rate": 0.0028931382193809634,
"loss": 2.3199,
"loss_layer_12_head": 0.3981599509716034,
"loss_layer_18_head": 0.959009051322937,
"loss_layer_24_head": 0.175415500998497,
"loss_layer_30_head": 0.11643791198730469,
"loss_layer_36_head": 0.07993616163730621,
"loss_layer_42_head": 0.05208883807063103,
"loss_layer_6_head": 0.61378014087677,
"step": 1060
},
{
"epoch": 48.547008547008545,
"grad_norm": 0.41588329915773303,
"learning_rate": 0.0028726056654404357,
"loss": 2.3284,
"loss_layer_12_head": 0.39041703939437866,
"loss_layer_18_head": 0.8856471180915833,
"loss_layer_24_head": 0.16735294461250305,
"loss_layer_30_head": 0.10891375690698624,
"loss_layer_36_head": 0.07393185794353485,
"loss_layer_42_head": 0.046171437948942184,
"loss_layer_6_head": 0.587515652179718,
"step": 1065
},
{
"epoch": 48.774928774928775,
"grad_norm": 0.5548900668516289,
"learning_rate": 0.002852047374193092,
"loss": 2.3587,
"loss_layer_12_head": 0.4144614338874817,
"loss_layer_18_head": 0.9471640586853027,
"loss_layer_24_head": 0.18089620769023895,
"loss_layer_30_head": 0.11908881366252899,
"loss_layer_36_head": 0.08078965544700623,
"loss_layer_42_head": 0.05113198235630989,
"loss_layer_6_head": 0.631020724773407,
"step": 1070
},
{
"epoch": 49.002849002849004,
"grad_norm": 0.41121797339069344,
"learning_rate": 0.0028314647656791985,
"loss": 2.367,
"loss_layer_12_head": 0.3957933783531189,
"loss_layer_18_head": 0.9304808378219604,
"loss_layer_24_head": 0.175583153963089,
"loss_layer_30_head": 0.11424344778060913,
"loss_layer_36_head": 0.0768514946103096,
"loss_layer_42_head": 0.04772614315152168,
"loss_layer_6_head": 0.6173755526542664,
"step": 1075
},
{
"epoch": 49.23076923076923,
"grad_norm": 0.5259763425718715,
"learning_rate": 0.0028108592616187134,
"loss": 2.2938,
"loss_layer_12_head": 0.3790958821773529,
"loss_layer_18_head": 0.8951139450073242,
"loss_layer_24_head": 0.17159327864646912,
"loss_layer_30_head": 0.11063919961452484,
"loss_layer_36_head": 0.07651374489068985,
"loss_layer_42_head": 0.04569822549819946,
"loss_layer_6_head": 0.6034574508666992,
"step": 1080
},
{
"epoch": 49.458689458689456,
"grad_norm": 0.5088721335099942,
"learning_rate": 0.002790232285313076,
"loss": 2.3113,
"loss_layer_12_head": 0.37154486775398254,
"loss_layer_18_head": 0.853110134601593,
"loss_layer_24_head": 0.17178836464881897,
"loss_layer_30_head": 0.11050143092870712,
"loss_layer_36_head": 0.07364407926797867,
"loss_layer_42_head": 0.04579634219408035,
"loss_layer_6_head": 0.6029828786849976,
"step": 1085
},
{
"epoch": 49.686609686609685,
"grad_norm": 0.478302427669861,
"learning_rate": 0.0027695852615468967,
"loss": 2.2896,
"loss_layer_12_head": 0.3795209527015686,
"loss_layer_18_head": 0.8706048727035522,
"loss_layer_24_head": 0.17759330570697784,
"loss_layer_30_head": 0.11326000839471817,
"loss_layer_36_head": 0.07385507971048355,
"loss_layer_42_head": 0.0455007366836071,
"loss_layer_6_head": 0.6164635419845581,
"step": 1090
},
{
"epoch": 49.914529914529915,
"grad_norm": 0.5235830286531326,
"learning_rate": 0.002748919616489542,
"loss": 2.2904,
"loss_layer_12_head": 0.37914353609085083,
"loss_layer_18_head": 0.8292545080184937,
"loss_layer_24_head": 0.17843201756477356,
"loss_layer_30_head": 0.11026974022388458,
"loss_layer_36_head": 0.0743885412812233,
"loss_layer_42_head": 0.04483931511640549,
"loss_layer_6_head": 0.605587363243103,
"step": 1095
},
{
"epoch": 50.142450142450144,
"grad_norm": 0.434150702314289,
"learning_rate": 0.002728236777596621,
"loss": 2.2794,
"loss_layer_12_head": 0.38162827491760254,
"loss_layer_18_head": 0.8877264261245728,
"loss_layer_24_head": 0.17784467339515686,
"loss_layer_30_head": 0.11130378395318985,
"loss_layer_36_head": 0.07626891881227493,
"loss_layer_42_head": 0.044149916619062424,
"loss_layer_6_head": 0.6123658418655396,
"step": 1100
},
{
"epoch": 50.370370370370374,
"grad_norm": 0.6419194201898322,
"learning_rate": 0.0027075381735113878,
"loss": 2.2657,
"loss_layer_12_head": 0.38760074973106384,
"loss_layer_18_head": 0.8854343295097351,
"loss_layer_24_head": 0.17738394439220428,
"loss_layer_30_head": 0.11329641193151474,
"loss_layer_36_head": 0.07881636917591095,
"loss_layer_42_head": 0.043652262538671494,
"loss_layer_6_head": 0.6228241920471191,
"step": 1105
},
{
"epoch": 50.598290598290596,
"grad_norm": 0.6461273738487177,
"learning_rate": 0.002686825233966061,
"loss": 2.2678,
"loss_layer_12_head": 0.38489505648612976,
"loss_layer_18_head": 0.8524206280708313,
"loss_layer_24_head": 0.17580567300319672,
"loss_layer_30_head": 0.11257897317409515,
"loss_layer_36_head": 0.07540023326873779,
"loss_layer_42_head": 0.04346206784248352,
"loss_layer_6_head": 0.6352558135986328,
"step": 1110
},
{
"epoch": 50.826210826210826,
"grad_norm": 0.522207949826169,
"learning_rate": 0.002666099389683061,
"loss": 2.2585,
"loss_layer_12_head": 0.3890642821788788,
"loss_layer_18_head": 0.8386470079421997,
"loss_layer_24_head": 0.17881402373313904,
"loss_layer_30_head": 0.11530689895153046,
"loss_layer_36_head": 0.07785026729106903,
"loss_layer_42_head": 0.044162243604660034,
"loss_layer_6_head": 0.6546280384063721,
"step": 1115
},
{
"epoch": 51.054131054131055,
"grad_norm": 0.5318721661025899,
"learning_rate": 0.0026453620722761894,
"loss": 2.3236,
"loss_layer_12_head": 0.3961235582828522,
"loss_layer_18_head": 0.8308088183403015,
"loss_layer_24_head": 0.1806981861591339,
"loss_layer_30_head": 0.11701737344264984,
"loss_layer_36_head": 0.08028064668178558,
"loss_layer_42_head": 0.046638570725917816,
"loss_layer_6_head": 0.6974906921386719,
"step": 1120
},
{
"epoch": 51.282051282051285,
"grad_norm": 0.7094454068405147,
"learning_rate": 0.002624614714151743,
"loss": 2.2551,
"loss_layer_12_head": 0.3696451485157013,
"loss_layer_18_head": 0.8046764135360718,
"loss_layer_24_head": 0.17042198777198792,
"loss_layer_30_head": 0.11222568899393082,
"loss_layer_36_head": 0.07389844954013824,
"loss_layer_42_head": 0.048138294368982315,
"loss_layer_6_head": 0.6442863345146179,
"step": 1125
},
{
"epoch": 51.50997150997151,
"grad_norm": 0.43402133001757,
"learning_rate": 0.002603858748409567,
"loss": 2.2282,
"loss_layer_12_head": 0.38198503851890564,
"loss_layer_18_head": 0.8142977952957153,
"loss_layer_24_head": 0.17379167675971985,
"loss_layer_30_head": 0.11386610567569733,
"loss_layer_36_head": 0.07543236017227173,
"loss_layer_42_head": 0.046159930527210236,
"loss_layer_6_head": 0.6551558375358582,
"step": 1130
},
{
"epoch": 51.73789173789174,
"grad_norm": 0.38917348088482573,
"learning_rate": 0.0025830956087440665,
"loss": 2.2275,
"loss_layer_12_head": 0.38181477785110474,
"loss_layer_18_head": 0.7957919836044312,
"loss_layer_24_head": 0.17541435360908508,
"loss_layer_30_head": 0.11523507535457611,
"loss_layer_36_head": 0.07723738998174667,
"loss_layer_42_head": 0.047955431044101715,
"loss_layer_6_head": 0.6321656703948975,
"step": 1135
},
{
"epoch": 51.965811965811966,
"grad_norm": 0.3637496388028228,
"learning_rate": 0.0025623267293451825,
"loss": 2.2314,
"loss_layer_12_head": 0.3790660500526428,
"loss_layer_18_head": 0.8120620846748352,
"loss_layer_24_head": 0.17084448039531708,
"loss_layer_30_head": 0.11187763512134552,
"loss_layer_36_head": 0.07578536123037338,
"loss_layer_42_head": 0.045785531401634216,
"loss_layer_6_head": 0.6161836385726929,
"step": 1140
},
{
"epoch": 52.193732193732195,
"grad_norm": 0.4536575231743723,
"learning_rate": 0.002541553544799316,
"loss": 2.1509,
"loss_layer_12_head": 0.3612454831600189,
"loss_layer_18_head": 0.7751725912094116,
"loss_layer_24_head": 0.1629917174577713,
"loss_layer_30_head": 0.10584640502929688,
"loss_layer_36_head": 0.07269952446222305,
"loss_layer_42_head": 0.04338030144572258,
"loss_layer_6_head": 0.5827122926712036,
"step": 1145
},
{
"epoch": 52.421652421652425,
"grad_norm": 0.3246994919822453,
"learning_rate": 0.002520777489990243,
"loss": 2.1598,
"loss_layer_12_head": 0.37068501114845276,
"loss_layer_18_head": 0.783240795135498,
"loss_layer_24_head": 0.1731734573841095,
"loss_layer_30_head": 0.11189240217208862,
"loss_layer_36_head": 0.0775907039642334,
"loss_layer_42_head": 0.044106461107730865,
"loss_layer_6_head": 0.5865408182144165,
"step": 1150
},
{
"epoch": 52.64957264957265,
"grad_norm": 0.2825614436603337,
"learning_rate": 0.0025,
"loss": 2.1276,
"loss_layer_12_head": 0.3710993230342865,
"loss_layer_18_head": 0.7801668643951416,
"loss_layer_24_head": 0.17115965485572815,
"loss_layer_30_head": 0.10953103005886078,
"loss_layer_36_head": 0.07413998991250992,
"loss_layer_42_head": 0.04302388057112694,
"loss_layer_6_head": 0.5841787457466125,
"step": 1155
},
{
"epoch": 52.87749287749288,
"grad_norm": 0.2616737808720529,
"learning_rate": 0.0024792225100097576,
"loss": 2.1808,
"loss_layer_12_head": 0.3781898617744446,
"loss_layer_18_head": 0.7769525051116943,
"loss_layer_24_head": 0.17470547556877136,
"loss_layer_30_head": 0.11191209405660629,
"loss_layer_36_head": 0.07527685910463333,
"loss_layer_42_head": 0.04384452477097511,
"loss_layer_6_head": 0.5916733741760254,
"step": 1160
},
{
"epoch": 53.105413105413106,
"grad_norm": 0.3311507726816672,
"learning_rate": 0.002458446455200685,
"loss": 2.1317,
"loss_layer_12_head": 0.3670315444469452,
"loss_layer_18_head": 0.7406850457191467,
"loss_layer_24_head": 0.16683974862098694,
"loss_layer_30_head": 0.10634462535381317,
"loss_layer_36_head": 0.07278958708047867,
"loss_layer_42_head": 0.042678773403167725,
"loss_layer_6_head": 0.5726101994514465,
"step": 1165
},
{
"epoch": 53.333333333333336,
"grad_norm": 0.2509448396894879,
"learning_rate": 0.0024376732706548184,
"loss": 2.1037,
"loss_layer_12_head": 0.3671357333660126,
"loss_layer_18_head": 0.7627917528152466,
"loss_layer_24_head": 0.17304837703704834,
"loss_layer_30_head": 0.10864245891571045,
"loss_layer_36_head": 0.07365027070045471,
"loss_layer_42_head": 0.04330287128686905,
"loss_layer_6_head": 0.5780169367790222,
"step": 1170
},
{
"epoch": 53.56125356125356,
"grad_norm": 0.2947012291464038,
"learning_rate": 0.0024169043912559336,
"loss": 2.1094,
"loss_layer_12_head": 0.3702814280986786,
"loss_layer_18_head": 0.7762826085090637,
"loss_layer_24_head": 0.1664721667766571,
"loss_layer_30_head": 0.10503309965133667,
"loss_layer_36_head": 0.07054396718740463,
"loss_layer_42_head": 0.0412781648337841,
"loss_layer_6_head": 0.5821161270141602,
"step": 1175
},
{
"epoch": 53.78917378917379,
"grad_norm": 0.5137967208925616,
"learning_rate": 0.0023961412515904335,
"loss": 2.1099,
"loss_layer_12_head": 0.37117379903793335,
"loss_layer_18_head": 0.758410632610321,
"loss_layer_24_head": 0.17216971516609192,
"loss_layer_30_head": 0.1102384701371193,
"loss_layer_36_head": 0.07311711460351944,
"loss_layer_42_head": 0.042940251529216766,
"loss_layer_6_head": 0.5829381942749023,
"step": 1180
},
{
"epoch": 54.01709401709402,
"grad_norm": 0.35742509879703566,
"learning_rate": 0.0023753852858482568,
"loss": 2.1136,
"loss_layer_12_head": 0.3729603886604309,
"loss_layer_18_head": 0.7423059940338135,
"loss_layer_24_head": 0.1714349091053009,
"loss_layer_30_head": 0.1128513440489769,
"loss_layer_36_head": 0.0720224529504776,
"loss_layer_42_head": 0.04282630234956741,
"loss_layer_6_head": 0.5869891047477722,
"step": 1185
},
{
"epoch": 54.24501424501425,
"grad_norm": 0.3567652126619958,
"learning_rate": 0.0023546379277238107,
"loss": 2.0666,
"loss_layer_12_head": 0.36165767908096313,
"loss_layer_18_head": 0.7423025965690613,
"loss_layer_24_head": 0.17272017896175385,
"loss_layer_30_head": 0.11087025701999664,
"loss_layer_36_head": 0.07229898124933243,
"loss_layer_42_head": 0.04461061209440231,
"loss_layer_6_head": 0.5694043040275574,
"step": 1190
},
{
"epoch": 54.472934472934476,
"grad_norm": 0.4399155733365801,
"learning_rate": 0.0023339006103169396,
"loss": 2.0613,
"loss_layer_12_head": 0.36041295528411865,
"loss_layer_18_head": 0.7265509366989136,
"loss_layer_24_head": 0.16570261120796204,
"loss_layer_30_head": 0.10996762663125992,
"loss_layer_36_head": 0.07162176072597504,
"loss_layer_42_head": 0.04219638928771019,
"loss_layer_6_head": 0.5656975507736206,
"step": 1195
},
{
"epoch": 54.7008547008547,
"grad_norm": 0.2864521752954199,
"learning_rate": 0.0023131747660339393,
"loss": 2.0743,
"loss_layer_12_head": 0.36955124139785767,
"loss_layer_18_head": 0.7258505821228027,
"loss_layer_24_head": 0.1719922125339508,
"loss_layer_30_head": 0.11259003728628159,
"loss_layer_36_head": 0.07511717826128006,
"loss_layer_42_head": 0.04532603174448013,
"loss_layer_6_head": 0.5806865096092224,
"step": 1200
},
{
"epoch": 54.7008547008547,
"eval_loss": 3.922147750854492,
"eval_loss_layer_12_head": 0.8050315976142883,
"eval_loss_layer_18_head": 0.8688571453094482,
"eval_loss_layer_24_head": 0.44181978702545166,
"eval_loss_layer_30_head": 0.30123329162597656,
"eval_loss_layer_36_head": 0.20439112186431885,
"eval_loss_layer_42_head": 0.1361953616142273,
"eval_loss_layer_6_head": 1.0727375745773315,
"eval_runtime": 4.9363,
"eval_samples_per_second": 6.685,
"eval_steps_per_second": 0.608,
"step": 1200
},
{
"epoch": 54.92877492877493,
"grad_norm": 0.2610712781022057,
"learning_rate": 0.002292461826488612,
"loss": 2.1013,
"loss_layer_12_head": 0.3735349178314209,
"loss_layer_18_head": 0.7252658605575562,
"loss_layer_24_head": 0.17007118463516235,
"loss_layer_30_head": 0.10975446552038193,
"loss_layer_36_head": 0.07351495325565338,
"loss_layer_42_head": 0.043339770287275314,
"loss_layer_6_head": 0.5827316641807556,
"step": 1205
},
{
"epoch": 55.15669515669516,
"grad_norm": 0.23380581342639142,
"learning_rate": 0.0022717632224033796,
"loss": 2.0471,
"loss_layer_12_head": 0.35890263319015503,
"loss_layer_18_head": 0.7042995691299438,
"loss_layer_24_head": 0.16659076511859894,
"loss_layer_30_head": 0.10641211271286011,
"loss_layer_36_head": 0.07390512526035309,
"loss_layer_42_head": 0.044583845883607864,
"loss_layer_6_head": 0.5554832816123962,
"step": 1210
},
{
"epoch": 55.38461538461539,
"grad_norm": 0.49307442680376706,
"learning_rate": 0.0022510803835104586,
"loss": 2.0553,
"loss_layer_12_head": 0.3678162693977356,
"loss_layer_18_head": 0.7129090428352356,
"loss_layer_24_head": 0.17019148170948029,
"loss_layer_30_head": 0.10706619173288345,
"loss_layer_36_head": 0.07150779664516449,
"loss_layer_42_head": 0.04218541830778122,
"loss_layer_6_head": 0.5814077854156494,
"step": 1215
},
{
"epoch": 55.61253561253561,
"grad_norm": 0.3918313570013889,
"learning_rate": 0.002230414738453104,
"loss": 2.044,
"loss_layer_12_head": 0.36497271060943604,
"loss_layer_18_head": 0.6976231932640076,
"loss_layer_24_head": 0.1705760508775711,
"loss_layer_30_head": 0.1071806401014328,
"loss_layer_36_head": 0.07460781186819077,
"loss_layer_42_head": 0.04227457195520401,
"loss_layer_6_head": 0.5821677446365356,
"step": 1220
},
{
"epoch": 55.84045584045584,
"grad_norm": 0.4489143425230452,
"learning_rate": 0.0022097677146869243,
"loss": 2.0616,
"loss_layer_12_head": 0.3607775568962097,
"loss_layer_18_head": 0.690375566482544,
"loss_layer_24_head": 0.1676177680492401,
"loss_layer_30_head": 0.10613974183797836,
"loss_layer_36_head": 0.0728280320763588,
"loss_layer_42_head": 0.043055903166532516,
"loss_layer_6_head": 0.5777379274368286,
"step": 1225
},
{
"epoch": 56.06837606837607,
"grad_norm": 0.4573486183341264,
"learning_rate": 0.002189140738381288,
"loss": 2.0511,
"loss_layer_12_head": 0.37861520051956177,
"loss_layer_18_head": 0.7308498620986938,
"loss_layer_24_head": 0.1784064620733261,
"loss_layer_30_head": 0.11279511451721191,
"loss_layer_36_head": 0.07710321247577667,
"loss_layer_42_head": 0.045412637293338776,
"loss_layer_6_head": 0.5905116200447083,
"step": 1230
},
{
"epoch": 56.2962962962963,
"grad_norm": 0.5031971348141655,
"learning_rate": 0.0021685352343208016,
"loss": 2.0235,
"loss_layer_12_head": 0.3556618094444275,
"loss_layer_18_head": 0.6965985298156738,
"loss_layer_24_head": 0.1655425876379013,
"loss_layer_30_head": 0.10600040853023529,
"loss_layer_36_head": 0.0712694302201271,
"loss_layer_42_head": 0.041733045130968094,
"loss_layer_6_head": 0.5654771327972412,
"step": 1235
},
{
"epoch": 56.52421652421653,
"grad_norm": 0.44714368064818283,
"learning_rate": 0.0021479526258069083,
"loss": 2.0147,
"loss_layer_12_head": 0.37383121252059937,
"loss_layer_18_head": 0.7061088681221008,
"loss_layer_24_head": 0.1718534529209137,
"loss_layer_30_head": 0.1083112508058548,
"loss_layer_36_head": 0.07109373807907104,
"loss_layer_42_head": 0.041331950575113297,
"loss_layer_6_head": 0.5899810791015625,
"step": 1240
},
{
"epoch": 56.75213675213675,
"grad_norm": 0.47642493251870227,
"learning_rate": 0.0021273943345595635,
"loss": 2.0125,
"loss_layer_12_head": 0.38709744811058044,
"loss_layer_18_head": 0.7057448029518127,
"loss_layer_24_head": 0.17811325192451477,
"loss_layer_30_head": 0.11347751319408417,
"loss_layer_36_head": 0.07625190913677216,
"loss_layer_42_head": 0.043919991701841354,
"loss_layer_6_head": 0.6019116640090942,
"step": 1245
},
{
"epoch": 56.98005698005698,
"grad_norm": 0.3972745328767057,
"learning_rate": 0.002106861780619037,
"loss": 2.0611,
"loss_layer_12_head": 0.37597453594207764,
"loss_layer_18_head": 0.6905866861343384,
"loss_layer_24_head": 0.17333197593688965,
"loss_layer_30_head": 0.11129488050937653,
"loss_layer_36_head": 0.0752146914601326,
"loss_layer_42_head": 0.04286158084869385,
"loss_layer_6_head": 0.5845304131507874,
"step": 1250
},
{
"epoch": 57.20797720797721,
"grad_norm": 0.48469221114870253,
"learning_rate": 0.002086356382247822,
"loss": 1.9903,
"loss_layer_12_head": 0.368541955947876,
"loss_layer_18_head": 0.6999972462654114,
"loss_layer_24_head": 0.16970302164554596,
"loss_layer_30_head": 0.10926713794469833,
"loss_layer_36_head": 0.07249046862125397,
"loss_layer_42_head": 0.041749563068151474,
"loss_layer_6_head": 0.5862393379211426,
"step": 1255
},
{
"epoch": 57.43589743589744,
"grad_norm": 0.611574187970635,
"learning_rate": 0.0020658795558326742,
"loss": 2.0046,
"loss_layer_12_head": 0.36997026205062866,
"loss_layer_18_head": 0.6786366701126099,
"loss_layer_24_head": 0.1694273203611374,
"loss_layer_30_head": 0.10953061282634735,
"loss_layer_36_head": 0.07315023243427277,
"loss_layer_42_head": 0.04222399741411209,
"loss_layer_6_head": 0.5881815552711487,
"step": 1260
},
{
"epoch": 57.66381766381767,
"grad_norm": 0.6153677450824064,
"learning_rate": 0.0020454327157867734,
"loss": 2.0109,
"loss_layer_12_head": 0.367452472448349,
"loss_layer_18_head": 0.6759993433952332,
"loss_layer_24_head": 0.1668408215045929,
"loss_layer_30_head": 0.10686562955379486,
"loss_layer_36_head": 0.07048901170492172,
"loss_layer_42_head": 0.04098617285490036,
"loss_layer_6_head": 0.5859954357147217,
"step": 1265
},
{
"epoch": 57.89173789173789,
"grad_norm": 0.2935217424002303,
"learning_rate": 0.002025017274452026,
"loss": 2.0181,
"loss_layer_12_head": 0.37350600957870483,
"loss_layer_18_head": 0.6737070679664612,
"loss_layer_24_head": 0.17168107628822327,
"loss_layer_30_head": 0.11016984283924103,
"loss_layer_36_head": 0.07335834950208664,
"loss_layer_42_head": 0.04219553619623184,
"loss_layer_6_head": 0.5844029188156128,
"step": 1270
},
{
"epoch": 58.11965811965812,
"grad_norm": 0.2621978314625234,
"learning_rate": 0.0020046346420015066,
"loss": 1.9957,
"loss_layer_12_head": 0.3653263449668884,
"loss_layer_18_head": 0.6525477170944214,
"loss_layer_24_head": 0.16379979252815247,
"loss_layer_30_head": 0.10514315217733383,
"loss_layer_36_head": 0.0698147565126419,
"loss_layer_42_head": 0.041251130402088165,
"loss_layer_6_head": 0.5668857097625732,
"step": 1275
},
{
"epoch": 58.34757834757835,
"grad_norm": 0.26002559793149466,
"learning_rate": 0.0019842862263420562,
"loss": 1.9703,
"loss_layer_12_head": 0.36825767159461975,
"loss_layer_18_head": 0.6790739297866821,
"loss_layer_24_head": 0.17250864207744598,
"loss_layer_30_head": 0.10994887351989746,
"loss_layer_36_head": 0.07393065840005875,
"loss_layer_42_head": 0.04338192567229271,
"loss_layer_6_head": 0.5710390210151672,
"step": 1280
},
{
"epoch": 58.57549857549858,
"grad_norm": 0.3664467976121708,
"learning_rate": 0.001963973433017031,
"loss": 1.9765,
"loss_layer_12_head": 0.35958853363990784,
"loss_layer_18_head": 0.6540105938911438,
"loss_layer_24_head": 0.16704809665679932,
"loss_layer_30_head": 0.10820546001195908,
"loss_layer_36_head": 0.07247563451528549,
"loss_layer_42_head": 0.042074285447597504,
"loss_layer_6_head": 0.5653725862503052,
"step": 1285
},
{
"epoch": 58.8034188034188,
"grad_norm": 0.36752505491576265,
"learning_rate": 0.001943697665109214,
"loss": 1.9711,
"loss_layer_12_head": 0.36526957154273987,
"loss_layer_18_head": 0.6480482816696167,
"loss_layer_24_head": 0.16833284497261047,
"loss_layer_30_head": 0.10793248564004898,
"loss_layer_36_head": 0.07179885357618332,
"loss_layer_42_head": 0.04152452200651169,
"loss_layer_6_head": 0.5726978182792664,
"step": 1290
},
{
"epoch": 59.03133903133903,
"grad_norm": 0.27873606157572917,
"learning_rate": 0.0019234603231438995,
"loss": 1.9802,
"loss_layer_12_head": 0.36396297812461853,
"loss_layer_18_head": 0.6509535312652588,
"loss_layer_24_head": 0.1683129370212555,
"loss_layer_30_head": 0.10660085827112198,
"loss_layer_36_head": 0.07140365988016129,
"loss_layer_42_head": 0.0416494682431221,
"loss_layer_6_head": 0.5673893690109253,
"step": 1295
},
{
"epoch": 59.25925925925926,
"grad_norm": 0.25089726312834165,
"learning_rate": 0.0019032628049921558,
"loss": 1.9404,
"loss_layer_12_head": 0.3589734435081482,
"loss_layer_18_head": 0.6603430509567261,
"loss_layer_24_head": 0.16413089632987976,
"loss_layer_30_head": 0.1043197512626648,
"loss_layer_36_head": 0.06929105520248413,
"loss_layer_42_head": 0.041358429938554764,
"loss_layer_6_head": 0.5630621910095215,
"step": 1300
},
{
"epoch": 59.48717948717949,
"grad_norm": 0.22103248903884454,
"learning_rate": 0.0018831065057742658,
"loss": 1.9406,
"loss_layer_12_head": 0.359492689371109,
"loss_layer_18_head": 0.6408952474594116,
"loss_layer_24_head": 0.16286614537239075,
"loss_layer_30_head": 0.10376974195241928,
"loss_layer_36_head": 0.06853290647268295,
"loss_layer_42_head": 0.039769940078258514,
"loss_layer_6_head": 0.5613786578178406,
"step": 1305
},
{
"epoch": 59.71509971509972,
"grad_norm": 0.27744975382692894,
"learning_rate": 0.0018629928177633637,
"loss": 1.9501,
"loss_layer_12_head": 0.3579030930995941,
"loss_layer_18_head": 0.6238123178482056,
"loss_layer_24_head": 0.1625664085149765,
"loss_layer_30_head": 0.10378286987543106,
"loss_layer_36_head": 0.06863966584205627,
"loss_layer_42_head": 0.0397348552942276,
"loss_layer_6_head": 0.5519455671310425,
"step": 1310
},
{
"epoch": 59.94301994301994,
"grad_norm": 0.3067961859601614,
"learning_rate": 0.0018429231302892618,
"loss": 1.9753,
"loss_layer_12_head": 0.3674226701259613,
"loss_layer_18_head": 0.6348339915275574,
"loss_layer_24_head": 0.16799281537532806,
"loss_layer_30_head": 0.10703931003808975,
"loss_layer_36_head": 0.07145868241786957,
"loss_layer_42_head": 0.042139433324337006,
"loss_layer_6_head": 0.5692054033279419,
"step": 1315
},
{
"epoch": 60.17094017094017,
"grad_norm": 0.23589211575663038,
"learning_rate": 0.0018228988296424876,
"loss": 1.9278,
"loss_layer_12_head": 0.35995879769325256,
"loss_layer_18_head": 0.6400879621505737,
"loss_layer_24_head": 0.1672223061323166,
"loss_layer_30_head": 0.10712926089763641,
"loss_layer_36_head": 0.07134952396154404,
"loss_layer_42_head": 0.041246697306632996,
"loss_layer_6_head": 0.5575507879257202,
"step": 1320
},
{
"epoch": 60.3988603988604,
"grad_norm": 0.33319843426515144,
"learning_rate": 0.001802921298978524,
"loss": 1.9185,
"loss_layer_12_head": 0.3579893708229065,
"loss_layer_18_head": 0.6205559968948364,
"loss_layer_24_head": 0.16404248774051666,
"loss_layer_30_head": 0.10560892522335052,
"loss_layer_36_head": 0.07029516994953156,
"loss_layer_42_head": 0.040432218462228775,
"loss_layer_6_head": 0.5526586771011353,
"step": 1325
},
{
"epoch": 60.62678062678063,
"grad_norm": 0.3388940110950527,
"learning_rate": 0.0017829919182222752,
"loss": 1.9358,
"loss_layer_12_head": 0.36018261313438416,
"loss_layer_18_head": 0.6270455121994019,
"loss_layer_24_head": 0.1631208062171936,
"loss_layer_30_head": 0.10345069319009781,
"loss_layer_36_head": 0.06899687647819519,
"loss_layer_42_head": 0.040876902639865875,
"loss_layer_6_head": 0.5665128827095032,
"step": 1330
},
{
"epoch": 60.85470085470085,
"grad_norm": 0.23376441286656932,
"learning_rate": 0.0017631120639727393,
"loss": 1.9364,
"loss_layer_12_head": 0.36492669582366943,
"loss_layer_18_head": 0.617691695690155,
"loss_layer_24_head": 0.166006401181221,
"loss_layer_30_head": 0.10586857795715332,
"loss_layer_36_head": 0.07113579660654068,
"loss_layer_42_head": 0.042698584496974945,
"loss_layer_6_head": 0.5667874217033386,
"step": 1335
},
{
"epoch": 61.08262108262108,
"grad_norm": 0.261608716597069,
"learning_rate": 0.0017432831094079354,
"loss": 1.9645,
"loss_layer_12_head": 0.36654841899871826,
"loss_layer_18_head": 0.638596773147583,
"loss_layer_24_head": 0.16740819811820984,
"loss_layer_30_head": 0.10700327157974243,
"loss_layer_36_head": 0.07154614478349686,
"loss_layer_42_head": 0.04158513993024826,
"loss_layer_6_head": 0.5753756761550903,
"step": 1340
},
{
"epoch": 61.31054131054131,
"grad_norm": 0.26498707042333075,
"learning_rate": 0.0017235064241900406,
"loss": 1.9308,
"loss_layer_12_head": 0.35222604870796204,
"loss_layer_18_head": 0.619999885559082,
"loss_layer_24_head": 0.16033700108528137,
"loss_layer_30_head": 0.10242090374231339,
"loss_layer_36_head": 0.06886529922485352,
"loss_layer_42_head": 0.040125004947185516,
"loss_layer_6_head": 0.5601471662521362,
"step": 1345
},
{
"epoch": 61.53846153846154,
"grad_norm": 0.22226419828206537,
"learning_rate": 0.0017037833743707893,
"loss": 1.9023,
"loss_layer_12_head": 0.36284562945365906,
"loss_layer_18_head": 0.624627947807312,
"loss_layer_24_head": 0.16515551507472992,
"loss_layer_30_head": 0.10532643646001816,
"loss_layer_36_head": 0.07034877687692642,
"loss_layer_42_head": 0.04081105440855026,
"loss_layer_6_head": 0.5715882182121277,
"step": 1350
},
{
"epoch": 61.76638176638177,
"grad_norm": 0.24395119376035693,
"learning_rate": 0.0016841153222971112,
"loss": 1.9328,
"loss_layer_12_head": 0.36113080382347107,
"loss_layer_18_head": 0.6124555468559265,
"loss_layer_24_head": 0.16591724753379822,
"loss_layer_30_head": 0.10665629804134369,
"loss_layer_36_head": 0.07249001413583755,
"loss_layer_42_head": 0.04333646222949028,
"loss_layer_6_head": 0.5653160214424133,
"step": 1355
},
{
"epoch": 61.99430199430199,
"grad_norm": 0.2713219570569318,
"learning_rate": 0.0016645036265170313,
"loss": 1.9178,
"loss_layer_12_head": 0.36808109283447266,
"loss_layer_18_head": 0.6035963892936707,
"loss_layer_24_head": 0.16664768755435944,
"loss_layer_30_head": 0.10622652620077133,
"loss_layer_36_head": 0.07072083652019501,
"loss_layer_42_head": 0.04039480537176132,
"loss_layer_6_head": 0.5705283880233765,
"step": 1360
},
{
"epoch": 62.22222222222222,
"grad_norm": 0.16434355933361142,
"learning_rate": 0.0016449496416858283,
"loss": 1.8723,
"loss_layer_12_head": 0.3438248336315155,
"loss_layer_18_head": 0.6018158793449402,
"loss_layer_24_head": 0.15885809063911438,
"loss_layer_30_head": 0.10279610008001328,
"loss_layer_36_head": 0.06658101826906204,
"loss_layer_42_head": 0.038769569247961044,
"loss_layer_6_head": 0.5343616604804993,
"step": 1365
},
{
"epoch": 62.45014245014245,
"grad_norm": 0.17031052389682674,
"learning_rate": 0.001625454718472464,
"loss": 1.8919,
"loss_layer_12_head": 0.354342520236969,
"loss_layer_18_head": 0.6070439219474792,
"loss_layer_24_head": 0.16174617409706116,
"loss_layer_30_head": 0.10284841060638428,
"loss_layer_36_head": 0.06826045364141464,
"loss_layer_42_head": 0.0396745428442955,
"loss_layer_6_head": 0.5540146231651306,
"step": 1370
},
{
"epoch": 62.67806267806268,
"grad_norm": 0.17239300408220518,
"learning_rate": 0.0016060202034662847,
"loss": 1.892,
"loss_layer_12_head": 0.35568657517433167,
"loss_layer_18_head": 0.6011655330657959,
"loss_layer_24_head": 0.16301211714744568,
"loss_layer_30_head": 0.10400134325027466,
"loss_layer_36_head": 0.06935641914606094,
"loss_layer_42_head": 0.04025629162788391,
"loss_layer_6_head": 0.5542250275611877,
"step": 1375
},
{
"epoch": 62.9059829059829,
"grad_norm": 0.17793556083173895,
"learning_rate": 0.0015866474390840125,
"loss": 1.914,
"loss_layer_12_head": 0.3635668158531189,
"loss_layer_18_head": 0.6096671223640442,
"loss_layer_24_head": 0.16600359976291656,
"loss_layer_30_head": 0.10514751821756363,
"loss_layer_36_head": 0.06970134377479553,
"loss_layer_42_head": 0.040074046701192856,
"loss_layer_6_head": 0.565592348575592,
"step": 1380
},
{
"epoch": 63.13390313390313,
"grad_norm": 0.15331853155042946,
"learning_rate": 0.001567337763477015,
"loss": 1.8954,
"loss_layer_12_head": 0.353837251663208,
"loss_layer_18_head": 0.6123312711715698,
"loss_layer_24_head": 0.16154246032238007,
"loss_layer_30_head": 0.1025996059179306,
"loss_layer_36_head": 0.06770062446594238,
"loss_layer_42_head": 0.03924650326371193,
"loss_layer_6_head": 0.5534176230430603,
"step": 1385
},
{
"epoch": 63.36182336182336,
"grad_norm": 0.13314106837890913,
"learning_rate": 0.0015480925104388763,
"loss": 1.8684,
"loss_layer_12_head": 0.34603065252304077,
"loss_layer_18_head": 0.5800515413284302,
"loss_layer_24_head": 0.15730763971805573,
"loss_layer_30_head": 0.09923694282770157,
"loss_layer_36_head": 0.06509393453598022,
"loss_layer_42_head": 0.037624310702085495,
"loss_layer_6_head": 0.5397533178329468,
"step": 1390
},
{
"epoch": 63.58974358974359,
"grad_norm": 0.1821027004311222,
"learning_rate": 0.0015289130093132633,
"loss": 1.8867,
"loss_layer_12_head": 0.35637348890304565,
"loss_layer_18_head": 0.5976595282554626,
"loss_layer_24_head": 0.1641477346420288,
"loss_layer_30_head": 0.10457787662744522,
"loss_layer_36_head": 0.06947280466556549,
"loss_layer_42_head": 0.039845842868089676,
"loss_layer_6_head": 0.5539056658744812,
"step": 1395
},
{
"epoch": 63.81766381766382,
"grad_norm": 0.14339663827541982,
"learning_rate": 0.001509800584902108,
"loss": 1.8844,
"loss_layer_12_head": 0.36510032415390015,
"loss_layer_18_head": 0.6054537892341614,
"loss_layer_24_head": 0.16671153903007507,
"loss_layer_30_head": 0.10565783828496933,
"loss_layer_36_head": 0.06983356922864914,
"loss_layer_42_head": 0.03976988047361374,
"loss_layer_6_head": 0.56358402967453,
"step": 1400
},
{
"epoch": 63.81766381766382,
"eval_loss": 3.8140172958374023,
"eval_loss_layer_12_head": 0.8028413653373718,
"eval_loss_layer_18_head": 0.7729310989379883,
"eval_loss_layer_24_head": 0.43893590569496155,
"eval_loss_layer_30_head": 0.30446603894233704,
"eval_loss_layer_36_head": 0.20355641841888428,
"eval_loss_layer_42_head": 0.13499176502227783,
"eval_loss_layer_6_head": 1.072322964668274,
"eval_runtime": 4.9557,
"eval_samples_per_second": 6.659,
"eval_steps_per_second": 0.605,
"step": 1400
},
{
"epoch": 64.04558404558405,
"grad_norm": 0.1706143168596619,
"learning_rate": 0.0014907565573740943,
"loss": 1.8565,
"loss_layer_12_head": 0.36537107825279236,
"loss_layer_18_head": 0.5950644016265869,
"loss_layer_24_head": 0.16890759766101837,
"loss_layer_30_head": 0.10786622762680054,
"loss_layer_36_head": 0.0724274069070816,
"loss_layer_42_head": 0.042885925620794296,
"loss_layer_6_head": 0.5630357265472412,
"step": 1405
},
{
"epoch": 64.27350427350427,
"grad_norm": 0.1616402364897859,
"learning_rate": 0.0014717822421734716,
"loss": 1.8474,
"loss_layer_12_head": 0.3484862744808197,
"loss_layer_18_head": 0.5841726660728455,
"loss_layer_24_head": 0.15990933775901794,
"loss_layer_30_head": 0.10199542343616486,
"loss_layer_36_head": 0.06788065284490585,
"loss_layer_42_head": 0.03943866491317749,
"loss_layer_6_head": 0.544236421585083,
"step": 1410
},
{
"epoch": 64.5014245014245,
"grad_norm": 0.1564142623602139,
"learning_rate": 0.0014528789499291884,
"loss": 1.884,
"loss_layer_12_head": 0.3577747642993927,
"loss_layer_18_head": 0.5889385342597961,
"loss_layer_24_head": 0.1639334261417389,
"loss_layer_30_head": 0.10324330627918243,
"loss_layer_36_head": 0.06843547523021698,
"loss_layer_42_head": 0.03951374441385269,
"loss_layer_6_head": 0.5521500706672668,
"step": 1415
},
{
"epoch": 64.72934472934473,
"grad_norm": 0.1668705051362201,
"learning_rate": 0.0014340479863643658,
"loss": 1.8727,
"loss_layer_12_head": 0.3684167265892029,
"loss_layer_18_head": 0.6115270853042603,
"loss_layer_24_head": 0.16978159546852112,
"loss_layer_30_head": 0.10723569244146347,
"loss_layer_36_head": 0.07100329548120499,
"loss_layer_42_head": 0.04143214598298073,
"loss_layer_6_head": 0.5671561360359192,
"step": 1420
},
{
"epoch": 64.95726495726495,
"grad_norm": 0.1722312849981684,
"learning_rate": 0.001415290652206105,
"loss": 1.8603,
"loss_layer_12_head": 0.3622528910636902,
"loss_layer_18_head": 0.5668131113052368,
"loss_layer_24_head": 0.16173304617404938,
"loss_layer_30_head": 0.10281785577535629,
"loss_layer_36_head": 0.0677582398056984,
"loss_layer_42_head": 0.03934178501367569,
"loss_layer_6_head": 0.5578157901763916,
"step": 1425
},
{
"epoch": 65.18518518518519,
"grad_norm": 0.1565891848572285,
"learning_rate": 0.0013966082430956401,
"loss": 1.8396,
"loss_layer_12_head": 0.3468489646911621,
"loss_layer_18_head": 0.5685579776763916,
"loss_layer_24_head": 0.1579982042312622,
"loss_layer_30_head": 0.09962925314903259,
"loss_layer_36_head": 0.06661403924226761,
"loss_layer_42_head": 0.0386619046330452,
"loss_layer_6_head": 0.537671685218811,
"step": 1430
},
{
"epoch": 65.41310541310541,
"grad_norm": 0.20295485671464988,
"learning_rate": 0.0013780020494988446,
"loss": 1.8453,
"loss_layer_12_head": 0.3568614423274994,
"loss_layer_18_head": 0.594135582447052,
"loss_layer_24_head": 0.16297490894794464,
"loss_layer_30_head": 0.1031971201300621,
"loss_layer_36_head": 0.06798653304576874,
"loss_layer_42_head": 0.0389096699655056,
"loss_layer_6_head": 0.5536288022994995,
"step": 1435
},
{
"epoch": 65.64102564102564,
"grad_norm": 0.2043733204414482,
"learning_rate": 0.0013594733566170926,
"loss": 1.8431,
"loss_layer_12_head": 0.35691505670547485,
"loss_layer_18_head": 0.571416437625885,
"loss_layer_24_head": 0.16371139883995056,
"loss_layer_30_head": 0.10467123985290527,
"loss_layer_36_head": 0.06964066624641418,
"loss_layer_42_head": 0.03979448229074478,
"loss_layer_6_head": 0.5498124361038208,
"step": 1440
},
{
"epoch": 65.86894586894587,
"grad_norm": 0.1854719778610673,
"learning_rate": 0.0013410234442984858,
"loss": 1.8603,
"loss_layer_12_head": 0.3696306347846985,
"loss_layer_18_head": 0.5896201729774475,
"loss_layer_24_head": 0.16896311938762665,
"loss_layer_30_head": 0.1075272187590599,
"loss_layer_36_head": 0.07201769948005676,
"loss_layer_42_head": 0.041474007070064545,
"loss_layer_6_head": 0.5694519281387329,
"step": 1445
},
{
"epoch": 66.0968660968661,
"grad_norm": 0.13340586562162032,
"learning_rate": 0.0013226535869494504,
"loss": 1.854,
"loss_layer_12_head": 0.3521641194820404,
"loss_layer_18_head": 0.5745565891265869,
"loss_layer_24_head": 0.16140435636043549,
"loss_layer_30_head": 0.10265658050775528,
"loss_layer_36_head": 0.06814004480838776,
"loss_layer_42_head": 0.03969401866197586,
"loss_layer_6_head": 0.5429982542991638,
"step": 1450
},
{
"epoch": 66.32478632478633,
"grad_norm": 0.17959825279064529,
"learning_rate": 0.0013043650534467052,
"loss": 1.8141,
"loss_layer_12_head": 0.35965582728385925,
"loss_layer_18_head": 0.5807262659072876,
"loss_layer_24_head": 0.16504819691181183,
"loss_layer_30_head": 0.10508060455322266,
"loss_layer_36_head": 0.06974060833454132,
"loss_layer_42_head": 0.040343768894672394,
"loss_layer_6_head": 0.5519964694976807,
"step": 1455
},
{
"epoch": 66.55270655270655,
"grad_norm": 0.15133186076029295,
"learning_rate": 0.0012861591070496192,
"loss": 1.8282,
"loss_layer_12_head": 0.33240434527397156,
"loss_layer_18_head": 0.533875048160553,
"loss_layer_24_head": 0.1510429084300995,
"loss_layer_30_head": 0.09637447446584702,
"loss_layer_36_head": 0.06486562639474869,
"loss_layer_42_head": 0.038768868893384933,
"loss_layer_6_head": 0.5180043578147888,
"step": 1460
},
{
"epoch": 66.78062678062678,
"grad_norm": 0.12781398423219267,
"learning_rate": 0.0012680370053129552,
"loss": 1.8658,
"loss_layer_12_head": 0.3577564060688019,
"loss_layer_18_head": 0.5643961429595947,
"loss_layer_24_head": 0.16104556620121002,
"loss_layer_30_head": 0.10230318456888199,
"loss_layer_36_head": 0.0684160441160202,
"loss_layer_42_head": 0.03966595605015755,
"loss_layer_6_head": 0.5522262454032898,
"step": 1465
},
{
"epoch": 67.00854700854701,
"grad_norm": 0.1434611582862995,
"learning_rate": 0.0012500000000000007,
"loss": 1.8492,
"loss_layer_12_head": 0.36101633310317993,
"loss_layer_18_head": 0.5714535117149353,
"loss_layer_24_head": 0.16426679491996765,
"loss_layer_30_head": 0.10345318168401718,
"loss_layer_36_head": 0.06851635128259659,
"loss_layer_42_head": 0.039524100720882416,
"loss_layer_6_head": 0.5529091358184814,
"step": 1470
},
{
"epoch": 67.23646723646723,
"grad_norm": 0.15891028079726016,
"learning_rate": 0.0012320493369961025,
"loss": 1.8095,
"loss_layer_12_head": 0.34979498386383057,
"loss_layer_18_head": 0.571343719959259,
"loss_layer_24_head": 0.15945219993591309,
"loss_layer_30_head": 0.10051491111516953,
"loss_layer_36_head": 0.06638234108686447,
"loss_layer_42_head": 0.03841354325413704,
"loss_layer_6_head": 0.5429816842079163,
"step": 1475
},
{
"epoch": 67.46438746438747,
"grad_norm": 0.1449020760026797,
"learning_rate": 0.0012141862562226164,
"loss": 1.8208,
"loss_layer_12_head": 0.3603321611881256,
"loss_layer_18_head": 0.574971079826355,
"loss_layer_24_head": 0.16558215022087097,
"loss_layer_30_head": 0.10478846728801727,
"loss_layer_36_head": 0.06960511952638626,
"loss_layer_42_head": 0.04023158177733421,
"loss_layer_6_head": 0.5533859133720398,
"step": 1480
},
{
"epoch": 67.6923076923077,
"grad_norm": 0.1515034944495946,
"learning_rate": 0.001196411991551255,
"loss": 1.8069,
"loss_layer_12_head": 0.35250595211982727,
"loss_layer_18_head": 0.5499317646026611,
"loss_layer_24_head": 0.16539210081100464,
"loss_layer_30_head": 0.10548852384090424,
"loss_layer_36_head": 0.07063941657543182,
"loss_layer_42_head": 0.03947348892688751,
"loss_layer_6_head": 0.5477828979492188,
"step": 1485
},
{
"epoch": 67.92022792022792,
"grad_norm": 0.13885251819035915,
"learning_rate": 0.0011787277707188614,
"loss": 1.8487,
"loss_layer_12_head": 0.3500252366065979,
"loss_layer_18_head": 0.5536776781082153,
"loss_layer_24_head": 0.16161292791366577,
"loss_layer_30_head": 0.10370011627674103,
"loss_layer_36_head": 0.06909648329019547,
"loss_layer_42_head": 0.04028378799557686,
"loss_layer_6_head": 0.5405044555664062,
"step": 1490
},
{
"epoch": 68.14814814814815,
"grad_norm": 0.1477900835869908,
"learning_rate": 0.001161134815242604,
"loss": 1.8287,
"loss_layer_12_head": 0.36902493238449097,
"loss_layer_18_head": 0.577628493309021,
"loss_layer_24_head": 0.1691674441099167,
"loss_layer_30_head": 0.10768872499465942,
"loss_layer_36_head": 0.07162035256624222,
"loss_layer_42_head": 0.041386984288692474,
"loss_layer_6_head": 0.5679258108139038,
"step": 1495
},
{
"epoch": 68.37606837606837,
"grad_norm": 0.1497415381552831,
"learning_rate": 0.0011436343403356016,
"loss": 1.8166,
"loss_layer_12_head": 0.3537401258945465,
"loss_layer_18_head": 0.5527002811431885,
"loss_layer_24_head": 0.16221362352371216,
"loss_layer_30_head": 0.10519299656152725,
"loss_layer_36_head": 0.0677286759018898,
"loss_layer_42_head": 0.038980789482593536,
"loss_layer_6_head": 0.5500974059104919,
"step": 1500
},
{
"epoch": 68.6039886039886,
"grad_norm": 0.16417939418429925,
"learning_rate": 0.001126227554822985,
"loss": 1.8257,
"loss_layer_12_head": 0.35889554023742676,
"loss_layer_18_head": 0.566818118095398,
"loss_layer_24_head": 0.16498246788978577,
"loss_layer_30_head": 0.10522939264774323,
"loss_layer_36_head": 0.0697682723402977,
"loss_layer_42_head": 0.039750516414642334,
"loss_layer_6_head": 0.5557876825332642,
"step": 1505
},
{
"epoch": 68.83190883190883,
"grad_norm": 0.13478519747161247,
"learning_rate": 0.0011089156610583984,
"loss": 1.8031,
"loss_layer_12_head": 0.33702507615089417,
"loss_layer_18_head": 0.5199332237243652,
"loss_layer_24_head": 0.15418687462806702,
"loss_layer_30_head": 0.09883591532707214,
"loss_layer_36_head": 0.06565789133310318,
"loss_layer_42_head": 0.03810978680849075,
"loss_layer_6_head": 0.5240939855575562,
"step": 1510
},
{
"epoch": 69.05982905982906,
"grad_norm": 0.15061647322783647,
"learning_rate": 0.0010916998548409448,
"loss": 1.8214,
"loss_layer_12_head": 0.36763328313827515,
"loss_layer_18_head": 0.5576991438865662,
"loss_layer_24_head": 0.17053815722465515,
"loss_layer_30_head": 0.10889364778995514,
"loss_layer_36_head": 0.0726301372051239,
"loss_layer_42_head": 0.04174993932247162,
"loss_layer_6_head": 0.5595671534538269,
"step": 1515
},
{
"epoch": 69.28774928774929,
"grad_norm": 0.1703599863224025,
"learning_rate": 0.0010745813253325955,
"loss": 1.8176,
"loss_layer_12_head": 0.3379597067832947,
"loss_layer_18_head": 0.5326976180076599,
"loss_layer_24_head": 0.15527421236038208,
"loss_layer_30_head": 0.09891333431005478,
"loss_layer_36_head": 0.06529764831066132,
"loss_layer_42_head": 0.037422697991132736,
"loss_layer_6_head": 0.5266030430793762,
"step": 1520
},
{
"epoch": 69.51566951566952,
"grad_norm": 0.15303418906286098,
"learning_rate": 0.0010575612549760425,
"loss": 1.7911,
"loss_layer_12_head": 0.34493082761764526,
"loss_layer_18_head": 0.540948748588562,
"loss_layer_24_head": 0.15642331540584564,
"loss_layer_30_head": 0.09965353459119797,
"loss_layer_36_head": 0.06654352694749832,
"loss_layer_42_head": 0.038676489144563675,
"loss_layer_6_head": 0.5338304042816162,
"step": 1525
},
{
"epoch": 69.74358974358974,
"grad_norm": 0.15597397432669702,
"learning_rate": 0.001040640819413026,
"loss": 1.801,
"loss_layer_12_head": 0.3541865944862366,
"loss_layer_18_head": 0.5545614957809448,
"loss_layer_24_head": 0.16169731318950653,
"loss_layer_30_head": 0.10177583992481232,
"loss_layer_36_head": 0.06731664389371872,
"loss_layer_42_head": 0.03906581178307533,
"loss_layer_6_head": 0.5432997941970825,
"step": 1530
},
{
"epoch": 69.97150997150997,
"grad_norm": 0.1381659058057447,
"learning_rate": 0.0010238211874031258,
"loss": 1.819,
"loss_layer_12_head": 0.35804328322410583,
"loss_layer_18_head": 0.5514500141143799,
"loss_layer_24_head": 0.16211315989494324,
"loss_layer_30_head": 0.10239671170711517,
"loss_layer_36_head": 0.06772608309984207,
"loss_layer_42_head": 0.03940841555595398,
"loss_layer_6_head": 0.5507400631904602,
"step": 1535
},
{
"epoch": 70.1994301994302,
"grad_norm": 0.12515170214336985,
"learning_rate": 0.0010071035207430351,
"loss": 1.8026,
"loss_layer_12_head": 0.36129826307296753,
"loss_layer_18_head": 0.5562499165534973,
"loss_layer_24_head": 0.16456067562103271,
"loss_layer_30_head": 0.10447216033935547,
"loss_layer_36_head": 0.06889413297176361,
"loss_layer_42_head": 0.03956874459981918,
"loss_layer_6_head": 0.5565100312232971,
"step": 1540
},
{
"epoch": 70.42735042735043,
"grad_norm": 0.1265262115143159,
"learning_rate": 0.000990488974186306,
"loss": 1.7814,
"loss_layer_12_head": 0.3376021087169647,
"loss_layer_18_head": 0.5280048847198486,
"loss_layer_24_head": 0.15099617838859558,
"loss_layer_30_head": 0.09582580626010895,
"loss_layer_36_head": 0.06339169293642044,
"loss_layer_42_head": 0.03644992783665657,
"loss_layer_6_head": 0.526264488697052,
"step": 1545
},
{
"epoch": 70.65527065527066,
"grad_norm": 0.14518716953911703,
"learning_rate": 0.0009739786953635924,
"loss": 1.7955,
"loss_layer_12_head": 0.3329188823699951,
"loss_layer_18_head": 0.522331714630127,
"loss_layer_24_head": 0.15017689764499664,
"loss_layer_30_head": 0.09456716477870941,
"loss_layer_36_head": 0.06272949278354645,
"loss_layer_42_head": 0.037007980048656464,
"loss_layer_6_head": 0.5194771885871887,
"step": 1550
},
{
"epoch": 70.88319088319088,
"grad_norm": 0.14492257597202254,
"learning_rate": 0.0009575738247033688,
"loss": 1.8043,
"loss_layer_12_head": 0.3527730107307434,
"loss_layer_18_head": 0.5289863348007202,
"loss_layer_24_head": 0.16144096851348877,
"loss_layer_30_head": 0.10272853076457977,
"loss_layer_36_head": 0.06866233050823212,
"loss_layer_42_head": 0.04029922932386398,
"loss_layer_6_head": 0.5439268946647644,
"step": 1555
},
{
"epoch": 71.11111111111111,
"grad_norm": 0.12908560052498289,
"learning_rate": 0.0009412754953531664,
"loss": 1.7955,
"loss_layer_12_head": 0.3465037941932678,
"loss_layer_18_head": 0.5363790988922119,
"loss_layer_24_head": 0.15984053909778595,
"loss_layer_30_head": 0.10168828815221786,
"loss_layer_36_head": 0.06739744544029236,
"loss_layer_42_head": 0.03855326771736145,
"loss_layer_6_head": 0.533501386642456,
"step": 1560
},
{
"epoch": 71.33903133903134,
"grad_norm": 0.1309584047726663,
"learning_rate": 0.0009250848331012968,
"loss": 1.778,
"loss_layer_12_head": 0.3392285704612732,
"loss_layer_18_head": 0.5243316888809204,
"loss_layer_24_head": 0.15213483572006226,
"loss_layer_30_head": 0.09613216668367386,
"loss_layer_36_head": 0.06358274072408676,
"loss_layer_42_head": 0.037117063999176025,
"loss_layer_6_head": 0.5279943346977234,
"step": 1565
},
{
"epoch": 71.56695156695157,
"grad_norm": 0.13119649502099703,
"learning_rate": 0.0009090029562990911,
"loss": 1.7932,
"loss_layer_12_head": 0.3541892170906067,
"loss_layer_18_head": 0.535491406917572,
"loss_layer_24_head": 0.16096696257591248,
"loss_layer_30_head": 0.1018461138010025,
"loss_layer_36_head": 0.06726451963186264,
"loss_layer_42_head": 0.03884906694293022,
"loss_layer_6_head": 0.5449596643447876,
"step": 1570
},
{
"epoch": 71.7948717948718,
"grad_norm": 0.15395268449321947,
"learning_rate": 0.0008930309757836516,
"loss": 1.7906,
"loss_layer_12_head": 0.35688620805740356,
"loss_layer_18_head": 0.5362941026687622,
"loss_layer_24_head": 0.16272303462028503,
"loss_layer_30_head": 0.10348813235759735,
"loss_layer_36_head": 0.06898694485425949,
"loss_layer_42_head": 0.0402650460600853,
"loss_layer_6_head": 0.5459010004997253,
"step": 1575
},
{
"epoch": 72.02279202279202,
"grad_norm": 0.1204264850535832,
"learning_rate": 0.0008771699948011203,
"loss": 1.791,
"loss_layer_12_head": 0.35216954350471497,
"loss_layer_18_head": 0.5342555642127991,
"loss_layer_24_head": 0.16132056713104248,
"loss_layer_30_head": 0.10252640396356583,
"loss_layer_36_head": 0.06832431256771088,
"loss_layer_42_head": 0.039825376123189926,
"loss_layer_6_head": 0.5416141152381897,
"step": 1580
},
{
"epoch": 72.25071225071225,
"grad_norm": 0.12330961195318141,
"learning_rate": 0.0008614211089304744,
"loss": 1.7645,
"loss_layer_12_head": 0.3503497242927551,
"loss_layer_18_head": 0.5363430976867676,
"loss_layer_24_head": 0.1589389592409134,
"loss_layer_30_head": 0.10142697393894196,
"loss_layer_36_head": 0.06693203747272491,
"loss_layer_42_head": 0.038655836135149,
"loss_layer_6_head": 0.5450853109359741,
"step": 1585
},
{
"epoch": 72.47863247863248,
"grad_norm": 0.1368428580674561,
"learning_rate": 0.0008457854060078521,
"loss": 1.7539,
"loss_layer_12_head": 0.3376067876815796,
"loss_layer_18_head": 0.5092514753341675,
"loss_layer_24_head": 0.1555960476398468,
"loss_layer_30_head": 0.10002340376377106,
"loss_layer_36_head": 0.06791587173938751,
"loss_layer_42_head": 0.04093862324953079,
"loss_layer_6_head": 0.516821563243866,
"step": 1590
},
{
"epoch": 72.70655270655271,
"grad_norm": 0.1166360566610833,
"learning_rate": 0.0008302639660514069,
"loss": 1.7834,
"loss_layer_12_head": 0.36099570989608765,
"loss_layer_18_head": 0.5416703224182129,
"loss_layer_24_head": 0.164410799741745,
"loss_layer_30_head": 0.10419974476099014,
"loss_layer_36_head": 0.0690721794962883,
"loss_layer_42_head": 0.039205193519592285,
"loss_layer_6_head": 0.552871584892273,
"step": 1595
},
{
"epoch": 72.93447293447294,
"grad_norm": 0.12201088430110535,
"learning_rate": 0.0008148578611867113,
"loss": 1.8019,
"loss_layer_12_head": 0.35458269715309143,
"loss_layer_18_head": 0.531002402305603,
"loss_layer_24_head": 0.16304495930671692,
"loss_layer_30_head": 0.10449066013097763,
"loss_layer_36_head": 0.06930046528577805,
"loss_layer_42_head": 0.039028845727443695,
"loss_layer_6_head": 0.5448761582374573,
"step": 1600
},
{
"epoch": 72.93447293447294,
"eval_loss": 3.777650833129883,
"eval_loss_layer_12_head": 0.8038034439086914,
"eval_loss_layer_18_head": 0.7375771999359131,
"eval_loss_layer_24_head": 0.44012030959129333,
"eval_loss_layer_30_head": 0.3041973412036896,
"eval_loss_layer_36_head": 0.20319151878356934,
"eval_loss_layer_42_head": 0.13452239334583282,
"eval_loss_layer_6_head": 1.0726211071014404,
"eval_runtime": 4.946,
"eval_samples_per_second": 6.672,
"eval_steps_per_second": 0.607,
"step": 1600
},
{
"epoch": 73.16239316239316,
"grad_norm": 0.12807746671894205,
"learning_rate": 0.0007995681555727011,
"loss": 1.7822,
"loss_layer_12_head": 0.35767659544944763,
"loss_layer_18_head": 0.5432697534561157,
"loss_layer_24_head": 0.16505815088748932,
"loss_layer_30_head": 0.10571058839559555,
"loss_layer_36_head": 0.07003127038478851,
"loss_layer_42_head": 0.040247105062007904,
"loss_layer_6_head": 0.5464234352111816,
"step": 1605
},
{
"epoch": 73.3903133903134,
"grad_norm": 0.130365878874844,
"learning_rate": 0.0007843959053281663,
"loss": 1.7653,
"loss_layer_12_head": 0.3459833562374115,
"loss_layer_18_head": 0.5274468064308167,
"loss_layer_24_head": 0.1586887389421463,
"loss_layer_30_head": 0.10026203095912933,
"loss_layer_36_head": 0.0662410706281662,
"loss_layer_42_head": 0.03850514441728592,
"loss_layer_6_head": 0.5331538915634155,
"step": 1610
},
{
"epoch": 73.61823361823362,
"grad_norm": 0.10767490237253707,
"learning_rate": 0.0007693421584588012,
"loss": 1.7654,
"loss_layer_12_head": 0.3585582375526428,
"loss_layer_18_head": 0.5424355864524841,
"loss_layer_24_head": 0.16505910456180573,
"loss_layer_30_head": 0.10474319756031036,
"loss_layer_36_head": 0.0691712498664856,
"loss_layer_42_head": 0.039669524878263474,
"loss_layer_6_head": 0.5465718507766724,
"step": 1615
},
{
"epoch": 73.84615384615384,
"grad_norm": 0.11493378296733746,
"learning_rate": 0.0007544079547848182,
"loss": 1.7795,
"loss_layer_12_head": 0.3523768484592438,
"loss_layer_18_head": 0.5339788198471069,
"loss_layer_24_head": 0.1609235554933548,
"loss_layer_30_head": 0.10195883363485336,
"loss_layer_36_head": 0.06740550696849823,
"loss_layer_42_head": 0.03874468803405762,
"loss_layer_6_head": 0.543295681476593,
"step": 1620
},
{
"epoch": 74.07407407407408,
"grad_norm": 0.10561384049680506,
"learning_rate": 0.0007395943258691206,
"loss": 1.7689,
"loss_layer_12_head": 0.3443649411201477,
"loss_layer_18_head": 0.5210028886795044,
"loss_layer_24_head": 0.1560635268688202,
"loss_layer_30_head": 0.09920267760753632,
"loss_layer_36_head": 0.06585382670164108,
"loss_layer_42_head": 0.038234781473875046,
"loss_layer_6_head": 0.5328817367553711,
"step": 1625
},
{
"epoch": 74.3019943019943,
"grad_norm": 0.10268010300616287,
"learning_rate": 0.0007249022949460493,
"loss": 1.749,
"loss_layer_12_head": 0.33767470717430115,
"loss_layer_18_head": 0.5087472200393677,
"loss_layer_24_head": 0.1519826352596283,
"loss_layer_30_head": 0.09580295532941818,
"loss_layer_36_head": 0.06339939683675766,
"loss_layer_42_head": 0.03678145259618759,
"loss_layer_6_head": 0.5183311104774475,
"step": 1630
},
{
"epoch": 74.52991452991454,
"grad_norm": 0.1142043088181504,
"learning_rate": 0.000710332876850704,
"loss": 1.7564,
"loss_layer_12_head": 0.33524709939956665,
"loss_layer_18_head": 0.5125759840011597,
"loss_layer_24_head": 0.1531374603509903,
"loss_layer_30_head": 0.0977383702993393,
"loss_layer_36_head": 0.06496452540159225,
"loss_layer_42_head": 0.037830810993909836,
"loss_layer_6_head": 0.5194955468177795,
"step": 1635
},
{
"epoch": 74.75783475783476,
"grad_norm": 0.10734174486274735,
"learning_rate": 0.0006958870779488446,
"loss": 1.7828,
"loss_layer_12_head": 0.3413275182247162,
"loss_layer_18_head": 0.5117852091789246,
"loss_layer_24_head": 0.1564275622367859,
"loss_layer_30_head": 0.10126455128192902,
"loss_layer_36_head": 0.06778749823570251,
"loss_layer_42_head": 0.03929626941680908,
"loss_layer_6_head": 0.5240119695663452,
"step": 1640
},
{
"epoch": 74.98575498575498,
"grad_norm": 0.11059460221593216,
"learning_rate": 0.0006815658960673781,
"loss": 1.7701,
"loss_layer_12_head": 0.35752952098846436,
"loss_layer_18_head": 0.5365008115768433,
"loss_layer_24_head": 0.16379977762699127,
"loss_layer_30_head": 0.10386929661035538,
"loss_layer_36_head": 0.06902356445789337,
"loss_layer_42_head": 0.03992991894483566,
"loss_layer_6_head": 0.5477330088615417,
"step": 1645
},
{
"epoch": 75.21367521367522,
"grad_norm": 0.10623496987136612,
"learning_rate": 0.0006673703204254347,
"loss": 1.7388,
"loss_layer_12_head": 0.35095706582069397,
"loss_layer_18_head": 0.5278469324111938,
"loss_layer_24_head": 0.16206221282482147,
"loss_layer_30_head": 0.10317299515008926,
"loss_layer_36_head": 0.06801508367061615,
"loss_layer_42_head": 0.03888120502233505,
"loss_layer_6_head": 0.5396238565444946,
"step": 1650
},
{
"epoch": 75.44159544159544,
"grad_norm": 0.10416826399412087,
"learning_rate": 0.0006533013315660366,
"loss": 1.7447,
"loss_layer_12_head": 0.335065633058548,
"loss_layer_18_head": 0.5077509880065918,
"loss_layer_24_head": 0.1524048149585724,
"loss_layer_30_head": 0.0960657149553299,
"loss_layer_36_head": 0.06350886076688766,
"loss_layer_42_head": 0.03666612505912781,
"loss_layer_6_head": 0.5169146060943604,
"step": 1655
},
{
"epoch": 75.66951566951568,
"grad_norm": 0.11154842762703716,
"learning_rate": 0.0006393599012883708,
"loss": 1.7684,
"loss_layer_12_head": 0.3467431664466858,
"loss_layer_18_head": 0.5151144862174988,
"loss_layer_24_head": 0.1578586995601654,
"loss_layer_30_head": 0.1004854291677475,
"loss_layer_36_head": 0.06669998914003372,
"loss_layer_42_head": 0.03827175125479698,
"loss_layer_6_head": 0.5322891473770142,
"step": 1660
},
{
"epoch": 75.8974358974359,
"grad_norm": 0.11122823408841152,
"learning_rate": 0.0006255469925806642,
"loss": 1.7743,
"loss_layer_12_head": 0.35086023807525635,
"loss_layer_18_head": 0.5303332209587097,
"loss_layer_24_head": 0.16013801097869873,
"loss_layer_30_head": 0.10204042494297028,
"loss_layer_36_head": 0.06783459335565567,
"loss_layer_42_head": 0.03844980522990227,
"loss_layer_6_head": 0.5410701632499695,
"step": 1665
},
{
"epoch": 76.12535612535612,
"grad_norm": 0.10142797226124593,
"learning_rate": 0.0006118635595536634,
"loss": 1.7619,
"loss_layer_12_head": 0.33657488226890564,
"loss_layer_18_head": 0.5111474990844727,
"loss_layer_24_head": 0.1535833477973938,
"loss_layer_30_head": 0.09838329255580902,
"loss_layer_36_head": 0.06594778597354889,
"loss_layer_42_head": 0.038476187735795975,
"loss_layer_6_head": 0.5193209648132324,
"step": 1670
},
{
"epoch": 76.35327635327636,
"grad_norm": 0.09615619261603243,
"learning_rate": 0.0005983105473747291,
"loss": 1.7578,
"loss_layer_12_head": 0.3366927206516266,
"loss_layer_18_head": 0.5104350447654724,
"loss_layer_24_head": 0.15251997113227844,
"loss_layer_30_head": 0.09642209112644196,
"loss_layer_36_head": 0.06284157931804657,
"loss_layer_42_head": 0.03592481091618538,
"loss_layer_6_head": 0.5190488696098328,
"step": 1675
},
{
"epoch": 76.58119658119658,
"grad_norm": 0.11055504444111074,
"learning_rate": 0.0005848888922025552,
"loss": 1.7644,
"loss_layer_12_head": 0.3554447889328003,
"loss_layer_18_head": 0.5308834314346313,
"loss_layer_24_head": 0.16566328704357147,
"loss_layer_30_head": 0.10649579763412476,
"loss_layer_36_head": 0.07171137630939484,
"loss_layer_42_head": 0.04246705397963524,
"loss_layer_6_head": 0.542263388633728,
"step": 1680
},
{
"epoch": 76.80911680911682,
"grad_norm": 0.09715150491168917,
"learning_rate": 0.0005715995211225008,
"loss": 1.7372,
"loss_layer_12_head": 0.34149521589279175,
"loss_layer_18_head": 0.5083609223365784,
"loss_layer_24_head": 0.1550835371017456,
"loss_layer_30_head": 0.09850551933050156,
"loss_layer_36_head": 0.06518407166004181,
"loss_layer_42_head": 0.036875925958156586,
"loss_layer_6_head": 0.5298780202865601,
"step": 1685
},
{
"epoch": 77.03703703703704,
"grad_norm": 0.0994729317287691,
"learning_rate": 0.0005584433520825541,
"loss": 1.7448,
"loss_layer_12_head": 0.337735652923584,
"loss_layer_18_head": 0.4945538640022278,
"loss_layer_24_head": 0.15409395098686218,
"loss_layer_30_head": 0.09803880751132965,
"loss_layer_36_head": 0.06489689648151398,
"loss_layer_42_head": 0.037099629640579224,
"loss_layer_6_head": 0.5176650285720825,
"step": 1690
},
{
"epoch": 77.26495726495726,
"grad_norm": 0.12507262576134806,
"learning_rate": 0.0005454212938299255,
"loss": 1.7432,
"loss_layer_12_head": 0.34350135922431946,
"loss_layer_18_head": 0.5194068551063538,
"loss_layer_24_head": 0.15946084260940552,
"loss_layer_30_head": 0.10177260637283325,
"loss_layer_36_head": 0.06773774325847626,
"loss_layer_42_head": 0.03955072909593582,
"loss_layer_6_head": 0.5315379500389099,
"step": 1695
},
{
"epoch": 77.4928774928775,
"grad_norm": 0.10998347798638555,
"learning_rate": 0.0005325342458482779,
"loss": 1.736,
"loss_layer_12_head": 0.3438124656677246,
"loss_layer_18_head": 0.5158748030662537,
"loss_layer_24_head": 0.15715071558952332,
"loss_layer_30_head": 0.09998907148838043,
"loss_layer_36_head": 0.06680642068386078,
"loss_layer_42_head": 0.03813738375902176,
"loss_layer_6_head": 0.5303772687911987,
"step": 1700
},
{
"epoch": 77.72079772079772,
"grad_norm": 0.10073524573554277,
"learning_rate": 0.0005197830982955945,
"loss": 1.761,
"loss_layer_12_head": 0.3479636311531067,
"loss_layer_18_head": 0.5083318948745728,
"loss_layer_24_head": 0.15731409192085266,
"loss_layer_30_head": 0.09888915717601776,
"loss_layer_36_head": 0.06497863680124283,
"loss_layer_42_head": 0.03769425302743912,
"loss_layer_6_head": 0.5358933806419373,
"step": 1705
},
{
"epoch": 77.94871794871794,
"grad_norm": 0.10383088041264718,
"learning_rate": 0.0005071687319426946,
"loss": 1.7467,
"loss_layer_12_head": 0.35118889808654785,
"loss_layer_18_head": 0.5108534097671509,
"loss_layer_24_head": 0.16018807888031006,
"loss_layer_30_head": 0.10246168076992035,
"loss_layer_36_head": 0.06786641478538513,
"loss_layer_42_head": 0.039132293313741684,
"loss_layer_6_head": 0.5406389832496643,
"step": 1710
},
{
"epoch": 78.17663817663818,
"grad_norm": 0.0921569642386526,
"learning_rate": 0.0004946920181123904,
"loss": 1.7411,
"loss_layer_12_head": 0.34307050704956055,
"loss_layer_18_head": 0.5181266069412231,
"loss_layer_24_head": 0.15736958384513855,
"loss_layer_30_head": 0.10029349476099014,
"loss_layer_36_head": 0.06683328002691269,
"loss_layer_42_head": 0.0385957770049572,
"loss_layer_6_head": 0.5270929932594299,
"step": 1715
},
{
"epoch": 78.4045584045584,
"grad_norm": 0.10382660941610025,
"learning_rate": 0.00048235381861930964,
"loss": 1.7253,
"loss_layer_12_head": 0.3350295126438141,
"loss_layer_18_head": 0.5014669895172119,
"loss_layer_24_head": 0.15213629603385925,
"loss_layer_30_head": 0.0972825437784195,
"loss_layer_36_head": 0.06519439071416855,
"loss_layer_42_head": 0.03851339966058731,
"loss_layer_6_head": 0.5170751214027405,
"step": 1720
},
{
"epoch": 78.63247863247864,
"grad_norm": 0.09529554195297314,
"learning_rate": 0.00047015498571035874,
"loss": 1.7461,
"loss_layer_12_head": 0.3465036153793335,
"loss_layer_18_head": 0.5140535235404968,
"loss_layer_24_head": 0.16019120812416077,
"loss_layer_30_head": 0.10193805396556854,
"loss_layer_36_head": 0.067210853099823,
"loss_layer_42_head": 0.03818202763795853,
"loss_layer_6_head": 0.5324119329452515,
"step": 1725
},
{
"epoch": 78.86039886039886,
"grad_norm": 0.09986232916758815,
"learning_rate": 0.0004580963620058587,
"loss": 1.7606,
"loss_layer_12_head": 0.35431206226348877,
"loss_layer_18_head": 0.5250551104545593,
"loss_layer_24_head": 0.1629064977169037,
"loss_layer_30_head": 0.10397347062826157,
"loss_layer_36_head": 0.06930160522460938,
"loss_layer_42_head": 0.03968958184123039,
"loss_layer_6_head": 0.544308066368103,
"step": 1730
},
{
"epoch": 79.08831908831908,
"grad_norm": 0.09300841263629599,
"learning_rate": 0.0004461787804413406,
"loss": 1.7324,
"loss_layer_12_head": 0.3431733548641205,
"loss_layer_18_head": 0.5113198757171631,
"loss_layer_24_head": 0.15744714438915253,
"loss_layer_30_head": 0.09923293441534042,
"loss_layer_36_head": 0.0657310038805008,
"loss_layer_42_head": 0.03722255676984787,
"loss_layer_6_head": 0.5301375389099121,
"step": 1735
},
{
"epoch": 79.31623931623932,
"grad_norm": 0.0925736410309516,
"learning_rate": 0.0004344030642100133,
"loss": 1.7243,
"loss_layer_12_head": 0.34314194321632385,
"loss_layer_18_head": 0.5142655372619629,
"loss_layer_24_head": 0.1579989492893219,
"loss_layer_30_head": 0.1012849360704422,
"loss_layer_36_head": 0.06685040891170502,
"loss_layer_42_head": 0.03779185935854912,
"loss_layer_6_head": 0.5264319181442261,
"step": 1740
},
{
"epoch": 79.54415954415954,
"grad_norm": 0.10584606451044351,
"learning_rate": 0.00042277002670590034,
"loss": 1.7449,
"loss_layer_12_head": 0.33877480030059814,
"loss_layer_18_head": 0.5086591839790344,
"loss_layer_24_head": 0.15485888719558716,
"loss_layer_30_head": 0.09807880222797394,
"loss_layer_36_head": 0.06451396644115448,
"loss_layer_42_head": 0.03756994009017944,
"loss_layer_6_head": 0.5224930047988892,
"step": 1745
},
{
"epoch": 79.77207977207978,
"grad_norm": 0.10981655707411507,
"learning_rate": 0.0004112804714676593,
"loss": 1.7254,
"loss_layer_12_head": 0.35386496782302856,
"loss_layer_18_head": 0.5217627286911011,
"loss_layer_24_head": 0.16211065649986267,
"loss_layer_30_head": 0.10369504988193512,
"loss_layer_36_head": 0.06905995309352875,
"loss_layer_42_head": 0.03959353640675545,
"loss_layer_6_head": 0.5423368811607361,
"step": 1750
},
{
"epoch": 80.0,
"grad_norm": 0.09352915096510195,
"learning_rate": 0.0003999351921230715,
"loss": 1.7481,
"loss_layer_12_head": 0.3435395658016205,
"loss_layer_18_head": 0.5069267153739929,
"loss_layer_24_head": 0.1569208800792694,
"loss_layer_30_head": 0.09992153942584991,
"loss_layer_36_head": 0.0660344809293747,
"loss_layer_42_head": 0.03770904988050461,
"loss_layer_6_head": 0.5285703539848328,
"step": 1755
},
{
"epoch": 80.22792022792022,
"grad_norm": 0.08847018489420039,
"learning_rate": 0.0003887349723342304,
"loss": 1.7213,
"loss_layer_12_head": 0.3444362282752991,
"loss_layer_18_head": 0.5083240270614624,
"loss_layer_24_head": 0.1559758186340332,
"loss_layer_30_head": 0.09910304844379425,
"loss_layer_36_head": 0.06528599560260773,
"loss_layer_42_head": 0.03764251619577408,
"loss_layer_6_head": 0.5300508737564087,
"step": 1760
},
{
"epoch": 80.45584045584046,
"grad_norm": 0.0942284279992955,
"learning_rate": 0.0003776805857434068,
"loss": 1.7415,
"loss_layer_12_head": 0.34919023513793945,
"loss_layer_18_head": 0.5226079225540161,
"loss_layer_24_head": 0.1596161425113678,
"loss_layer_30_head": 0.10055674612522125,
"loss_layer_36_head": 0.06677110493183136,
"loss_layer_42_head": 0.03786572068929672,
"loss_layer_6_head": 0.5389742851257324,
"step": 1765
},
{
"epoch": 80.68376068376068,
"grad_norm": 0.09201895112943784,
"learning_rate": 0.000366772795919611,
"loss": 1.7281,
"loss_layer_12_head": 0.3336860239505768,
"loss_layer_18_head": 0.49141353368759155,
"loss_layer_24_head": 0.15047487616539001,
"loss_layer_30_head": 0.09469149261713028,
"loss_layer_36_head": 0.06228718161582947,
"loss_layer_42_head": 0.036014530807733536,
"loss_layer_6_head": 0.511943519115448,
"step": 1770
},
{
"epoch": 80.91168091168092,
"grad_norm": 0.08533194295665472,
"learning_rate": 0.0003560123563058512,
"loss": 1.7367,
"loss_layer_12_head": 0.3416348993778229,
"loss_layer_18_head": 0.5056973695755005,
"loss_layer_24_head": 0.15707023441791534,
"loss_layer_30_head": 0.09985633194446564,
"loss_layer_36_head": 0.06595893204212189,
"loss_layer_42_head": 0.03782585263252258,
"loss_layer_6_head": 0.5238116979598999,
"step": 1775
},
{
"epoch": 81.13960113960114,
"grad_norm": 0.08323069486037522,
"learning_rate": 0.0003454000101670901,
"loss": 1.7297,
"loss_layer_12_head": 0.3427240252494812,
"loss_layer_18_head": 0.504900336265564,
"loss_layer_24_head": 0.15762865543365479,
"loss_layer_30_head": 0.10089079290628433,
"loss_layer_36_head": 0.06669814139604568,
"loss_layer_42_head": 0.03817524015903473,
"loss_layer_6_head": 0.5277188420295715,
"step": 1780
},
{
"epoch": 81.36752136752136,
"grad_norm": 0.08706863161635335,
"learning_rate": 0.00033493649053890326,
"loss": 1.7228,
"loss_layer_12_head": 0.3384056091308594,
"loss_layer_18_head": 0.5061665773391724,
"loss_layer_24_head": 0.15349029004573822,
"loss_layer_30_head": 0.09709183871746063,
"loss_layer_36_head": 0.0645359456539154,
"loss_layer_42_head": 0.03722700849175453,
"loss_layer_6_head": 0.5220716595649719,
"step": 1785
},
{
"epoch": 81.5954415954416,
"grad_norm": 0.079555862137296,
"learning_rate": 0.00032462252017684794,
"loss": 1.7134,
"loss_layer_12_head": 0.3370179831981659,
"loss_layer_18_head": 0.4958283305168152,
"loss_layer_24_head": 0.15366677939891815,
"loss_layer_30_head": 0.097067691385746,
"loss_layer_36_head": 0.06432225555181503,
"loss_layer_42_head": 0.037587955594062805,
"loss_layer_6_head": 0.5160147547721863,
"step": 1790
},
{
"epoch": 81.82336182336182,
"grad_norm": 0.08166362138692365,
"learning_rate": 0.0003144588115065364,
"loss": 1.7479,
"loss_layer_12_head": 0.35182663798332214,
"loss_layer_18_head": 0.5176635384559631,
"loss_layer_24_head": 0.16193841397762299,
"loss_layer_30_head": 0.10320702940225601,
"loss_layer_36_head": 0.06898337602615356,
"loss_layer_42_head": 0.039763785898685455,
"loss_layer_6_head": 0.5365123748779297,
"step": 1795
},
{
"epoch": 82.05128205128206,
"grad_norm": 0.0855469720098283,
"learning_rate": 0.00030444606657442836,
"loss": 1.7339,
"loss_layer_12_head": 0.34548094868659973,
"loss_layer_18_head": 0.5053902268409729,
"loss_layer_24_head": 0.1566828191280365,
"loss_layer_30_head": 0.09957839548587799,
"loss_layer_36_head": 0.06610305607318878,
"loss_layer_42_head": 0.037855364382267,
"loss_layer_6_head": 0.5325480699539185,
"step": 1800
},
{
"epoch": 82.05128205128206,
"eval_loss": 3.7662460803985596,
"eval_loss_layer_12_head": 0.8055973052978516,
"eval_loss_layer_18_head": 0.7246161103248596,
"eval_loss_layer_24_head": 0.4393809735774994,
"eval_loss_layer_30_head": 0.3041135370731354,
"eval_loss_layer_36_head": 0.2042381763458252,
"eval_loss_layer_42_head": 0.13466167449951172,
"eval_loss_layer_6_head": 1.070326805114746,
"eval_runtime": 4.9444,
"eval_samples_per_second": 6.674,
"eval_steps_per_second": 0.607,
"step": 1800
},
{
"epoch": 82.27920227920228,
"grad_norm": 0.08718972153802979,
"learning_rate": 0.0002945849769993395,
"loss": 1.7251,
"loss_layer_12_head": 0.3301844298839569,
"loss_layer_18_head": 0.492474228143692,
"loss_layer_24_head": 0.15127244591712952,
"loss_layer_30_head": 0.09562569856643677,
"loss_layer_36_head": 0.06327711790800095,
"loss_layer_42_head": 0.036252304911613464,
"loss_layer_6_head": 0.5087629556655884,
"step": 1805
},
{
"epoch": 82.5071225071225,
"grad_norm": 0.08511606581369867,
"learning_rate": 0.0002848762239246644,
"loss": 1.7251,
"loss_layer_12_head": 0.34044864773750305,
"loss_layer_18_head": 0.5026866793632507,
"loss_layer_24_head": 0.15664085745811462,
"loss_layer_30_head": 0.09928475320339203,
"loss_layer_36_head": 0.06595821678638458,
"loss_layer_42_head": 0.03762565180659294,
"loss_layer_6_head": 0.5205121040344238,
"step": 1810
},
{
"epoch": 82.73504273504274,
"grad_norm": 0.08722784207394887,
"learning_rate": 0.00027532047797132865,
"loss": 1.7193,
"loss_layer_12_head": 0.33828800916671753,
"loss_layer_18_head": 0.5036657452583313,
"loss_layer_24_head": 0.15435315668582916,
"loss_layer_30_head": 0.09803181886672974,
"loss_layer_36_head": 0.06477385014295578,
"loss_layer_42_head": 0.0378272645175457,
"loss_layer_6_head": 0.5202800035476685,
"step": 1815
},
{
"epoch": 82.96296296296296,
"grad_norm": 0.08171497505981262,
"learning_rate": 0.0002659183991914696,
"loss": 1.7291,
"loss_layer_12_head": 0.35178667306900024,
"loss_layer_18_head": 0.5139197111129761,
"loss_layer_24_head": 0.16265292465686798,
"loss_layer_30_head": 0.10370488464832306,
"loss_layer_36_head": 0.06842222809791565,
"loss_layer_42_head": 0.038670577108860016,
"loss_layer_6_head": 0.5341295599937439,
"step": 1820
},
{
"epoch": 83.19088319088318,
"grad_norm": 0.08754226537159704,
"learning_rate": 0.00025667063702284026,
"loss": 1.7084,
"loss_layer_12_head": 0.3361435532569885,
"loss_layer_18_head": 0.4977208077907562,
"loss_layer_24_head": 0.15316614508628845,
"loss_layer_30_head": 0.09671608358621597,
"loss_layer_36_head": 0.06395326554775238,
"loss_layer_42_head": 0.03646908327937126,
"loss_layer_6_head": 0.5177844762802124,
"step": 1825
},
{
"epoch": 83.41880341880342,
"grad_norm": 0.0844836769084174,
"learning_rate": 0.00024757783024395244,
"loss": 1.723,
"loss_layer_12_head": 0.33938589692115784,
"loss_layer_18_head": 0.5006296634674072,
"loss_layer_24_head": 0.1563412994146347,
"loss_layer_30_head": 0.09939941018819809,
"loss_layer_36_head": 0.06653161346912384,
"loss_layer_42_head": 0.03902815654873848,
"loss_layer_6_head": 0.5227442383766174,
"step": 1830
},
{
"epoch": 83.64672364672364,
"grad_norm": 0.08116633475530488,
"learning_rate": 0.0002386406069299521,
"loss": 1.7133,
"loss_layer_12_head": 0.3282950818538666,
"loss_layer_18_head": 0.4761618673801422,
"loss_layer_24_head": 0.14734062552452087,
"loss_layer_30_head": 0.09337802231311798,
"loss_layer_36_head": 0.062004633247852325,
"loss_layer_42_head": 0.03645525127649307,
"loss_layer_6_head": 0.5092923641204834,
"step": 1835
},
{
"epoch": 83.87464387464388,
"grad_norm": 0.08130594229307465,
"learning_rate": 0.0002298595844092377,
"loss": 1.7324,
"loss_layer_12_head": 0.34668418765068054,
"loss_layer_18_head": 0.5105066299438477,
"loss_layer_24_head": 0.15807229280471802,
"loss_layer_30_head": 0.09954778850078583,
"loss_layer_36_head": 0.06574113667011261,
"loss_layer_42_head": 0.03756564110517502,
"loss_layer_6_head": 0.5319226980209351,
"step": 1840
},
{
"epoch": 84.1025641025641,
"grad_norm": 0.07435832563814537,
"learning_rate": 0.00022123536922081716,
"loss": 1.7374,
"loss_layer_12_head": 0.3403926193714142,
"loss_layer_18_head": 0.5032271146774292,
"loss_layer_24_head": 0.15673916041851044,
"loss_layer_30_head": 0.09963904321193695,
"loss_layer_36_head": 0.06623657792806625,
"loss_layer_42_head": 0.03874701261520386,
"loss_layer_6_head": 0.5237872004508972,
"step": 1845
},
{
"epoch": 84.33048433048432,
"grad_norm": 0.07520798715370941,
"learning_rate": 0.0002127685570724136,
"loss": 1.7008,
"loss_layer_12_head": 0.32764607667922974,
"loss_layer_18_head": 0.4850924015045166,
"loss_layer_24_head": 0.1496036797761917,
"loss_layer_30_head": 0.0952225849032402,
"loss_layer_36_head": 0.0631745234131813,
"loss_layer_42_head": 0.03642912209033966,
"loss_layer_6_head": 0.5045855045318604,
"step": 1850
},
{
"epoch": 84.55840455840456,
"grad_norm": 0.08345443449589855,
"learning_rate": 0.0002044597327993153,
"loss": 1.7105,
"loss_layer_12_head": 0.33524253964424133,
"loss_layer_18_head": 0.4937031865119934,
"loss_layer_24_head": 0.15422554314136505,
"loss_layer_30_head": 0.09784449636936188,
"loss_layer_36_head": 0.06489607691764832,
"loss_layer_42_head": 0.03717579320073128,
"loss_layer_6_head": 0.5167443156242371,
"step": 1855
},
{
"epoch": 84.78632478632478,
"grad_norm": 0.07382268007768587,
"learning_rate": 0.00019630947032398067,
"loss": 1.7347,
"loss_layer_12_head": 0.3385489583015442,
"loss_layer_18_head": 0.49440544843673706,
"loss_layer_24_head": 0.15322643518447876,
"loss_layer_30_head": 0.09731130301952362,
"loss_layer_36_head": 0.06442172825336456,
"loss_layer_42_head": 0.03728144243359566,
"loss_layer_6_head": 0.5216220617294312,
"step": 1860
},
{
"epoch": 85.01424501424502,
"grad_norm": 0.07869223889512078,
"learning_rate": 0.00018831833261639619,
"loss": 1.7207,
"loss_layer_12_head": 0.33598074316978455,
"loss_layer_18_head": 0.5042511224746704,
"loss_layer_24_head": 0.15242783725261688,
"loss_layer_30_head": 0.09582826495170593,
"loss_layer_36_head": 0.06362007558345795,
"loss_layer_42_head": 0.037337448447942734,
"loss_layer_6_head": 0.519189715385437,
"step": 1865
},
{
"epoch": 85.24216524216524,
"grad_norm": 0.07623838374572518,
"learning_rate": 0.00018048687165518663,
"loss": 1.7014,
"loss_layer_12_head": 0.34467798471450806,
"loss_layer_18_head": 0.5166942477226257,
"loss_layer_24_head": 0.1603996604681015,
"loss_layer_30_head": 0.10134941339492798,
"loss_layer_36_head": 0.06730696558952332,
"loss_layer_42_head": 0.03929843753576279,
"loss_layer_6_head": 0.5304808616638184,
"step": 1870
},
{
"epoch": 85.47008547008546,
"grad_norm": 0.07719263949161988,
"learning_rate": 0.00017281562838948966,
"loss": 1.7085,
"loss_layer_12_head": 0.34138795733451843,
"loss_layer_18_head": 0.49956589937210083,
"loss_layer_24_head": 0.15558582544326782,
"loss_layer_30_head": 0.09915374964475632,
"loss_layer_36_head": 0.06627882272005081,
"loss_layer_42_head": 0.03831201046705246,
"loss_layer_6_head": 0.5223523378372192,
"step": 1875
},
{
"epoch": 85.6980056980057,
"grad_norm": 0.07328975312789718,
"learning_rate": 0.00016530513270159115,
"loss": 1.7316,
"loss_layer_12_head": 0.343252569437027,
"loss_layer_18_head": 0.5099059343338013,
"loss_layer_24_head": 0.15889784693717957,
"loss_layer_30_head": 0.10200633853673935,
"loss_layer_36_head": 0.0683726891875267,
"loss_layer_42_head": 0.03963029757142067,
"loss_layer_6_head": 0.5271897315979004,
"step": 1880
},
{
"epoch": 85.92592592592592,
"grad_norm": 0.07726370100149342,
"learning_rate": 0.0001579559033703229,
"loss": 1.726,
"loss_layer_12_head": 0.3393276631832123,
"loss_layer_18_head": 0.49352455139160156,
"loss_layer_24_head": 0.15607452392578125,
"loss_layer_30_head": 0.09996481984853745,
"loss_layer_36_head": 0.06702710688114166,
"loss_layer_42_head": 0.03989189863204956,
"loss_layer_6_head": 0.5197093486785889,
"step": 1885
},
{
"epoch": 86.15384615384616,
"grad_norm": 0.07692397896958651,
"learning_rate": 0.00015076844803522921,
"loss": 1.7156,
"loss_layer_12_head": 0.35214799642562866,
"loss_layer_18_head": 0.5165507793426514,
"loss_layer_24_head": 0.1624716967344284,
"loss_layer_30_head": 0.1034996509552002,
"loss_layer_36_head": 0.06864960491657257,
"loss_layer_42_head": 0.039609383791685104,
"loss_layer_6_head": 0.540605902671814,
"step": 1890
},
{
"epoch": 86.38176638176638,
"grad_norm": 0.07830104898036093,
"learning_rate": 0.00014374326316150184,
"loss": 1.7007,
"loss_layer_12_head": 0.33503252267837524,
"loss_layer_18_head": 0.49698352813720703,
"loss_layer_24_head": 0.15384428203105927,
"loss_layer_30_head": 0.09767549484968185,
"loss_layer_36_head": 0.06486734002828598,
"loss_layer_42_head": 0.0373041145503521,
"loss_layer_6_head": 0.5164640545845032,
"step": 1895
},
{
"epoch": 86.6096866096866,
"grad_norm": 0.07492748364571503,
"learning_rate": 0.0001368808340056879,
"loss": 1.71,
"loss_layer_12_head": 0.3400861918926239,
"loss_layer_18_head": 0.4963332712650299,
"loss_layer_24_head": 0.15496981143951416,
"loss_layer_30_head": 0.09840646386146545,
"loss_layer_36_head": 0.06502757966518402,
"loss_layer_42_head": 0.037685126066207886,
"loss_layer_6_head": 0.523513913154602,
"step": 1900
},
{
"epoch": 86.83760683760684,
"grad_norm": 0.07229919889886265,
"learning_rate": 0.00013018163458217074,
"loss": 1.7187,
"loss_layer_12_head": 0.3341769576072693,
"loss_layer_18_head": 0.48553091287612915,
"loss_layer_24_head": 0.1514834463596344,
"loss_layer_30_head": 0.09563258290290833,
"loss_layer_36_head": 0.06288423389196396,
"loss_layer_42_head": 0.036126021295785904,
"loss_layer_6_head": 0.5115571022033691,
"step": 1905
},
{
"epoch": 87.06552706552706,
"grad_norm": 0.074891654205402,
"learning_rate": 0.00012364612763042792,
"loss": 1.7278,
"loss_layer_12_head": 0.34246379137039185,
"loss_layer_18_head": 0.49604225158691406,
"loss_layer_24_head": 0.15869124233722687,
"loss_layer_30_head": 0.10183699429035187,
"loss_layer_36_head": 0.06769458204507828,
"loss_layer_42_head": 0.03991634026169777,
"loss_layer_6_head": 0.5265407562255859,
"step": 1910
},
{
"epoch": 87.29344729344729,
"grad_norm": 0.08032085767903621,
"learning_rate": 0.0001172747645830674,
"loss": 1.707,
"loss_layer_12_head": 0.32982274889945984,
"loss_layer_18_head": 0.4943769872188568,
"loss_layer_24_head": 0.1514527052640915,
"loss_layer_30_head": 0.0964231789112091,
"loss_layer_36_head": 0.06376294791698456,
"loss_layer_42_head": 0.03660362958908081,
"loss_layer_6_head": 0.5084652304649353,
"step": 1915
},
{
"epoch": 87.52136752136752,
"grad_norm": 0.07366761201441889,
"learning_rate": 0.00011106798553464803,
"loss": 1.732,
"loss_layer_12_head": 0.3319392800331116,
"loss_layer_18_head": 0.49802669882774353,
"loss_layer_24_head": 0.15054012835025787,
"loss_layer_30_head": 0.09539338946342468,
"loss_layer_36_head": 0.062421299517154694,
"loss_layer_42_head": 0.03593217581510544,
"loss_layer_6_head": 0.5121363401412964,
"step": 1920
},
{
"epoch": 87.74928774928775,
"grad_norm": 0.07241943481895129,
"learning_rate": 0.00010502621921127774,
"loss": 1.7148,
"loss_layer_12_head": 0.34008413553237915,
"loss_layer_18_head": 0.5034884214401245,
"loss_layer_24_head": 0.15469662845134735,
"loss_layer_30_head": 0.0977717787027359,
"loss_layer_36_head": 0.06479165703058243,
"loss_layer_42_head": 0.03823380544781685,
"loss_layer_6_head": 0.5221525430679321,
"step": 1925
},
{
"epoch": 87.97720797720798,
"grad_norm": 0.07008688713358005,
"learning_rate": 9.914988294100064e-05,
"loss": 1.6985,
"loss_layer_12_head": 0.3371519446372986,
"loss_layer_18_head": 0.49330615997314453,
"loss_layer_24_head": 0.15630921721458435,
"loss_layer_30_head": 0.0999789759516716,
"loss_layer_36_head": 0.06621178239583969,
"loss_layer_42_head": 0.037286750972270966,
"loss_layer_6_head": 0.5132678747177124,
"step": 1930
},
{
"epoch": 88.2051282051282,
"grad_norm": 0.07024357018257298,
"learning_rate": 9.343938262496993e-05,
"loss": 1.7005,
"loss_layer_12_head": 0.3292502760887146,
"loss_layer_18_head": 0.4820357859134674,
"loss_layer_24_head": 0.14815860986709595,
"loss_layer_30_head": 0.09420295804738998,
"loss_layer_36_head": 0.062933050096035,
"loss_layer_42_head": 0.03677560016512871,
"loss_layer_6_head": 0.5083265900611877,
"step": 1935
},
{
"epoch": 88.43304843304843,
"grad_norm": 0.07038272442189607,
"learning_rate": 8.78951127094127e-05,
"loss": 1.7068,
"loss_layer_12_head": 0.3309806287288666,
"loss_layer_18_head": 0.48024678230285645,
"loss_layer_24_head": 0.15456709265708923,
"loss_layer_30_head": 0.09934371709823608,
"loss_layer_36_head": 0.06677098572254181,
"loss_layer_42_head": 0.0396689735352993,
"loss_layer_6_head": 0.5079755187034607,
"step": 1940
},
{
"epoch": 88.66096866096866,
"grad_norm": 0.07190615122554372,
"learning_rate": 8.251745615838191e-05,
"loss": 1.7283,
"loss_layer_12_head": 0.33875033259391785,
"loss_layer_18_head": 0.4983670711517334,
"loss_layer_24_head": 0.15512457489967346,
"loss_layer_30_head": 0.09894488751888275,
"loss_layer_36_head": 0.06597268581390381,
"loss_layer_42_head": 0.03837330639362335,
"loss_layer_6_head": 0.519954264163971,
"step": 1945
},
{
"epoch": 88.88888888888889,
"grad_norm": 0.07393034546819678,
"learning_rate": 7.730678442730537e-05,
"loss": 1.7069,
"loss_layer_12_head": 0.33714374899864197,
"loss_layer_18_head": 0.48974722623825073,
"loss_layer_24_head": 0.1527237594127655,
"loss_layer_30_head": 0.09619395434856415,
"loss_layer_36_head": 0.06359319388866425,
"loss_layer_42_head": 0.03700066730380058,
"loss_layer_6_head": 0.5218413472175598,
"step": 1950
},
{
"epoch": 89.11680911680912,
"grad_norm": 0.06980538304609592,
"learning_rate": 7.226345743732543e-05,
"loss": 1.7225,
"loss_layer_12_head": 0.3432984948158264,
"loss_layer_18_head": 0.5042284727096558,
"loss_layer_24_head": 0.1560019701719284,
"loss_layer_30_head": 0.09774313867092133,
"loss_layer_36_head": 0.0642099529504776,
"loss_layer_42_head": 0.037248644977808,
"loss_layer_6_head": 0.5292196273803711,
"step": 1955
},
{
"epoch": 89.34472934472934,
"grad_norm": 0.06841991765442745,
"learning_rate": 6.738782355044049e-05,
"loss": 1.7048,
"loss_layer_12_head": 0.336143434047699,
"loss_layer_18_head": 0.4922635555267334,
"loss_layer_24_head": 0.15440890192985535,
"loss_layer_30_head": 0.09838224947452545,
"loss_layer_36_head": 0.06481974571943283,
"loss_layer_42_head": 0.03767440468072891,
"loss_layer_6_head": 0.5202369093894958,
"step": 1960
},
{
"epoch": 89.57264957264957,
"grad_norm": 0.06877525391557446,
"learning_rate": 6.268021954544095e-05,
"loss": 1.706,
"loss_layer_12_head": 0.35166916251182556,
"loss_layer_18_head": 0.5136804580688477,
"loss_layer_24_head": 0.16127237677574158,
"loss_layer_30_head": 0.10277248919010162,
"loss_layer_36_head": 0.0681212991476059,
"loss_layer_42_head": 0.039175014942884445,
"loss_layer_6_head": 0.5358132123947144,
"step": 1965
},
{
"epoch": 89.8005698005698,
"grad_norm": 0.06941929581347202,
"learning_rate": 5.8140970594647015e-05,
"loss": 1.7074,
"loss_layer_12_head": 0.33122771978378296,
"loss_layer_18_head": 0.4838249087333679,
"loss_layer_24_head": 0.14897865056991577,
"loss_layer_30_head": 0.09341166168451309,
"loss_layer_36_head": 0.061882950365543365,
"loss_layer_42_head": 0.03573702648282051,
"loss_layer_6_head": 0.5114135146141052,
"step": 1970
},
{
"epoch": 90.02849002849003,
"grad_norm": 0.07797584867466725,
"learning_rate": 5.3770390241446866e-05,
"loss": 1.7153,
"loss_layer_12_head": 0.32591190934181213,
"loss_layer_18_head": 0.48162513971328735,
"loss_layer_24_head": 0.14960148930549622,
"loss_layer_30_head": 0.09565792977809906,
"loss_layer_36_head": 0.06360563635826111,
"loss_layer_42_head": 0.03752583637833595,
"loss_layer_6_head": 0.5001431703567505,
"step": 1975
},
{
"epoch": 90.25641025641026,
"grad_norm": 0.07008181121373384,
"learning_rate": 4.9568780378640435e-05,
"loss": 1.7156,
"loss_layer_12_head": 0.33082446455955505,
"loss_layer_18_head": 0.48348742723464966,
"loss_layer_24_head": 0.15005210041999817,
"loss_layer_30_head": 0.0951717272400856,
"loss_layer_36_head": 0.06267333030700684,
"loss_layer_42_head": 0.036025770008563995,
"loss_layer_6_head": 0.5086785554885864,
"step": 1980
},
{
"epoch": 90.48433048433048,
"grad_norm": 0.06871537369194806,
"learning_rate": 4.553643122758549e-05,
"loss": 1.7142,
"loss_layer_12_head": 0.3386912941932678,
"loss_layer_18_head": 0.4927671551704407,
"loss_layer_24_head": 0.15460585057735443,
"loss_layer_30_head": 0.09897184371948242,
"loss_layer_36_head": 0.06582445651292801,
"loss_layer_42_head": 0.03793860599398613,
"loss_layer_6_head": 0.5209053754806519,
"step": 1985
},
{
"epoch": 90.71225071225071,
"grad_norm": 0.0727769744619384,
"learning_rate": 4.16736213181515e-05,
"loss": 1.692,
"loss_layer_12_head": 0.32157278060913086,
"loss_layer_18_head": 0.4727444648742676,
"loss_layer_24_head": 0.14894722402095795,
"loss_layer_30_head": 0.09573666006326675,
"loss_layer_36_head": 0.06382396072149277,
"loss_layer_42_head": 0.03748338297009468,
"loss_layer_6_head": 0.4971844553947449,
"step": 1990
},
{
"epoch": 90.94017094017094,
"grad_norm": 0.06996748861179324,
"learning_rate": 3.798061746947995e-05,
"loss": 1.7028,
"loss_layer_12_head": 0.33345505595207214,
"loss_layer_18_head": 0.4878571629524231,
"loss_layer_24_head": 0.15110231935977936,
"loss_layer_30_head": 0.09630671888589859,
"loss_layer_36_head": 0.0633818656206131,
"loss_layer_42_head": 0.03600713983178139,
"loss_layer_6_head": 0.5108103156089783,
"step": 1995
},
{
"epoch": 91.16809116809117,
"grad_norm": 0.07222187273768782,
"learning_rate": 3.445767477155443e-05,
"loss": 1.6981,
"loss_layer_12_head": 0.3441750407218933,
"loss_layer_18_head": 0.5018490552902222,
"loss_layer_24_head": 0.15837827324867249,
"loss_layer_30_head": 0.10115264356136322,
"loss_layer_36_head": 0.06739296019077301,
"loss_layer_42_head": 0.038907259702682495,
"loss_layer_6_head": 0.5249325633049011,
"step": 2000
},
{
"epoch": 91.16809116809117,
"eval_loss": 3.761641025543213,
"eval_loss_layer_12_head": 0.8046773076057434,
"eval_loss_layer_18_head": 0.721228301525116,
"eval_loss_layer_24_head": 0.4395800828933716,
"eval_loss_layer_30_head": 0.30423250794410706,
"eval_loss_layer_36_head": 0.20396752655506134,
"eval_loss_layer_42_head": 0.1346331238746643,
"eval_loss_layer_6_head": 1.0709084272384644,
"eval_runtime": 4.9296,
"eval_samples_per_second": 6.694,
"eval_steps_per_second": 0.609,
"step": 2000
},
{
"epoch": 91.3960113960114,
"grad_norm": 0.06956615545566998,
"learning_rate": 3.110503656758079e-05,
"loss": 1.7282,
"loss_layer_12_head": 0.346196711063385,
"loss_layer_18_head": 0.4989009499549866,
"loss_layer_24_head": 0.1574573814868927,
"loss_layer_30_head": 0.0996241420507431,
"loss_layer_36_head": 0.06551764905452728,
"loss_layer_42_head": 0.03770477697253227,
"loss_layer_6_head": 0.527807354927063,
"step": 2005
},
{
"epoch": 91.62393162393163,
"grad_norm": 0.06754759103070487,
"learning_rate": 2.7922934437178695e-05,
"loss": 1.7122,
"loss_layer_12_head": 0.33628371357917786,
"loss_layer_18_head": 0.4885508418083191,
"loss_layer_24_head": 0.15223391354084015,
"loss_layer_30_head": 0.0969274565577507,
"loss_layer_36_head": 0.06416411697864532,
"loss_layer_42_head": 0.03708343952894211,
"loss_layer_6_head": 0.5166887044906616,
"step": 2010
},
{
"epoch": 91.85185185185185,
"grad_norm": 0.06613244713287736,
"learning_rate": 2.4911588180384083e-05,
"loss": 1.7041,
"loss_layer_12_head": 0.3314448893070221,
"loss_layer_18_head": 0.49047932028770447,
"loss_layer_24_head": 0.15160222351551056,
"loss_layer_30_head": 0.09629616886377335,
"loss_layer_36_head": 0.06369747221469879,
"loss_layer_42_head": 0.03670356050133705,
"loss_layer_6_head": 0.509692907333374,
"step": 2015
},
{
"epoch": 92.07977207977208,
"grad_norm": 0.06373357880238427,
"learning_rate": 2.2071205802468298e-05,
"loss": 1.6886,
"loss_layer_12_head": 0.3328821063041687,
"loss_layer_18_head": 0.4800987243652344,
"loss_layer_24_head": 0.151140958070755,
"loss_layer_30_head": 0.09596490859985352,
"loss_layer_36_head": 0.06343450397253036,
"loss_layer_42_head": 0.03617607057094574,
"loss_layer_6_head": 0.5120709538459778,
"step": 2020
},
{
"epoch": 92.3076923076923,
"grad_norm": 0.06615553804228982,
"learning_rate": 1.9401983499569843e-05,
"loss": 1.7013,
"loss_layer_12_head": 0.3357425928115845,
"loss_layer_18_head": 0.49065274000167847,
"loss_layer_24_head": 0.1525392085313797,
"loss_layer_30_head": 0.09637175500392914,
"loss_layer_36_head": 0.06411570310592651,
"loss_layer_42_head": 0.03685402125120163,
"loss_layer_6_head": 0.5137232542037964,
"step": 2025
},
{
"epoch": 92.53561253561253,
"grad_norm": 0.06891724751157484,
"learning_rate": 1.690410564514244e-05,
"loss": 1.7192,
"loss_layer_12_head": 0.3366279602050781,
"loss_layer_18_head": 0.48958778381347656,
"loss_layer_24_head": 0.15377403795719147,
"loss_layer_30_head": 0.09771725535392761,
"loss_layer_36_head": 0.06452822685241699,
"loss_layer_42_head": 0.03708551451563835,
"loss_layer_6_head": 0.5164296627044678,
"step": 2030
},
{
"epoch": 92.76353276353277,
"grad_norm": 0.06761410588330152,
"learning_rate": 1.4577744777219682e-05,
"loss": 1.7225,
"loss_layer_12_head": 0.33475255966186523,
"loss_layer_18_head": 0.49240392446517944,
"loss_layer_24_head": 0.15306416153907776,
"loss_layer_30_head": 0.0966690331697464,
"loss_layer_36_head": 0.0640743225812912,
"loss_layer_42_head": 0.03736594319343567,
"loss_layer_6_head": 0.5156325101852417,
"step": 2035
},
{
"epoch": 92.99145299145299,
"grad_norm": 0.06987564491458538,
"learning_rate": 1.2423061586496476e-05,
"loss": 1.7058,
"loss_layer_12_head": 0.34400632977485657,
"loss_layer_18_head": 0.49989375472068787,
"loss_layer_24_head": 0.15913555026054382,
"loss_layer_30_head": 0.10079015791416168,
"loss_layer_36_head": 0.06650825589895248,
"loss_layer_42_head": 0.03808695822954178,
"loss_layer_6_head": 0.5247647762298584,
"step": 2040
},
{
"epoch": 93.21937321937322,
"grad_norm": 0.06884001051952174,
"learning_rate": 1.0440204905230455e-05,
"loss": 1.7043,
"loss_layer_12_head": 0.35419824719429016,
"loss_layer_18_head": 0.5190885663032532,
"loss_layer_24_head": 0.1636066883802414,
"loss_layer_30_head": 0.1028328388929367,
"loss_layer_36_head": 0.0677228644490242,
"loss_layer_42_head": 0.03939133882522583,
"loss_layer_6_head": 0.5393562316894531,
"step": 2045
},
{
"epoch": 93.44729344729345,
"grad_norm": 0.06869857148762144,
"learning_rate": 8.629311696961295e-06,
"loss": 1.7002,
"loss_layer_12_head": 0.33199894428253174,
"loss_layer_18_head": 0.4830838143825531,
"loss_layer_24_head": 0.14906497299671173,
"loss_layer_30_head": 0.09444169700145721,
"loss_layer_36_head": 0.06229046732187271,
"loss_layer_42_head": 0.03576675057411194,
"loss_layer_6_head": 0.5107249021530151,
"step": 2050
},
{
"epoch": 93.67521367521367,
"grad_norm": 0.0647256792540863,
"learning_rate": 6.990507047049677e-06,
"loss": 1.7197,
"loss_layer_12_head": 0.33809465169906616,
"loss_layer_18_head": 0.49668869376182556,
"loss_layer_24_head": 0.15616574883460999,
"loss_layer_30_head": 0.09987618774175644,
"loss_layer_36_head": 0.06613916158676147,
"loss_layer_42_head": 0.03771573677659035,
"loss_layer_6_head": 0.5225865244865417,
"step": 2055
},
{
"epoch": 93.9031339031339,
"grad_norm": 0.06487422745793997,
"learning_rate": 5.523904154037529e-06,
"loss": 1.7034,
"loss_layer_12_head": 0.34006571769714355,
"loss_layer_18_head": 0.4934326112270355,
"loss_layer_24_head": 0.15561194717884064,
"loss_layer_30_head": 0.09845955669879913,
"loss_layer_36_head": 0.06562654674053192,
"loss_layer_42_head": 0.038369424641132355,
"loss_layer_6_head": 0.5245457887649536,
"step": 2060
},
{
"epoch": 94.13105413105413,
"grad_norm": 0.06340076159086788,
"learning_rate": 4.229604321829561e-06,
"loss": 1.6996,
"loss_layer_12_head": 0.3449529707431793,
"loss_layer_18_head": 0.5078026056289673,
"loss_layer_24_head": 0.1566687673330307,
"loss_layer_30_head": 0.09894686192274094,
"loss_layer_36_head": 0.06501217186450958,
"loss_layer_42_head": 0.0374465249478817,
"loss_layer_6_head": 0.5327596068382263,
"step": 2065
},
{
"epoch": 94.35897435897436,
"grad_norm": 0.06309615709636959,
"learning_rate": 3.107696952694139e-06,
"loss": 1.7155,
"loss_layer_12_head": 0.3498449921607971,
"loss_layer_18_head": 0.5132280588150024,
"loss_layer_24_head": 0.15977954864501953,
"loss_layer_30_head": 0.10074914991855621,
"loss_layer_36_head": 0.06633107364177704,
"loss_layer_42_head": 0.03793049603700638,
"loss_layer_6_head": 0.5361655950546265,
"step": 2070
},
{
"epoch": 94.58689458689459,
"grad_norm": 0.062173287336752,
"learning_rate": 2.1582595410896134e-06,
"loss": 1.7073,
"loss_layer_12_head": 0.3325282335281372,
"loss_layer_18_head": 0.47551655769348145,
"loss_layer_24_head": 0.1513689160346985,
"loss_layer_30_head": 0.09635426104068756,
"loss_layer_36_head": 0.06406603753566742,
"loss_layer_42_head": 0.03743938356637955,
"loss_layer_6_head": 0.5106655359268188,
"step": 2075
},
{
"epoch": 94.81481481481481,
"grad_norm": 0.06493713768466465,
"learning_rate": 1.3813576683111006e-06,
"loss": 1.7118,
"loss_layer_12_head": 0.33602645993232727,
"loss_layer_18_head": 0.4942130446434021,
"loss_layer_24_head": 0.1509321630001068,
"loss_layer_30_head": 0.09551974385976791,
"loss_layer_36_head": 0.06285407394170761,
"loss_layer_42_head": 0.03567957133054733,
"loss_layer_6_head": 0.5194729566574097,
"step": 2080
},
{
"epoch": 95.04273504273505,
"grad_norm": 0.06187259503088811,
"learning_rate": 7.770449979593864e-07,
"loss": 1.696,
"loss_layer_12_head": 0.3397129476070404,
"loss_layer_18_head": 0.4978001117706299,
"loss_layer_24_head": 0.15636396408081055,
"loss_layer_30_head": 0.09891422092914581,
"loss_layer_36_head": 0.06569842994213104,
"loss_layer_42_head": 0.03745325654745102,
"loss_layer_6_head": 0.5212146639823914,
"step": 2085
},
{
"epoch": 95.27065527065527,
"grad_norm": 0.06519728300513385,
"learning_rate": 3.453632722358324e-07,
"loss": 1.6931,
"loss_layer_12_head": 0.32379400730133057,
"loss_layer_18_head": 0.473306804895401,
"loss_layer_24_head": 0.14877896010875702,
"loss_layer_30_head": 0.09489592164754868,
"loss_layer_36_head": 0.06364993751049042,
"loss_layer_42_head": 0.03758970648050308,
"loss_layer_6_head": 0.4982914924621582,
"step": 2090
},
{
"epoch": 95.4985754985755,
"grad_norm": 0.06085731874844273,
"learning_rate": 8.634230905774088e-08,
"loss": 1.7092,
"loss_layer_12_head": 0.3473987877368927,
"loss_layer_18_head": 0.5032440423965454,
"loss_layer_24_head": 0.16124534606933594,
"loss_layer_30_head": 0.10328890383243561,
"loss_layer_36_head": 0.06872069090604782,
"loss_layer_42_head": 0.039262156933546066,
"loss_layer_6_head": 0.5293939709663391,
"step": 2095
},
{
"epoch": 95.72649572649573,
"grad_norm": 0.06407971588377558,
"learning_rate": 0.0,
"loss": 1.7188,
"loss_layer_12_head": 0.34668242931365967,
"loss_layer_18_head": 0.5045074224472046,
"loss_layer_24_head": 0.1579972803592682,
"loss_layer_30_head": 0.0999143049120903,
"loss_layer_36_head": 0.0661567822098732,
"loss_layer_42_head": 0.038395121693611145,
"loss_layer_6_head": 0.5313812494277954,
"step": 2100
},
{
"epoch": 95.72649572649573,
"step": 2100,
"total_flos": 2.2690272796968223e+18,
"train_loss": 2.834041218984695,
"train_runtime": 32024.3772,
"train_samples_per_second": 8.75,
"train_steps_per_second": 0.066
}
],
"logging_steps": 5,
"max_steps": 2100,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2690272796968223e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}