| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 95.72649572649573, | |
| "eval_steps": 200, | |
| "global_step": 2100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.045584045584045586, | |
| "grad_norm": 62.2914085542995, | |
| "learning_rate": 2.3809523809523814e-05, | |
| "loss": 43.092, | |
| "loss_layer_12_head": 8.308940887451172, | |
| "loss_layer_18_head": 7.555350303649902, | |
| "loss_layer_24_head": 6.16204833984375, | |
| "loss_layer_30_head": 5.288188934326172, | |
| "loss_layer_36_head": 3.8764476776123047, | |
| "loss_layer_42_head": 2.1485373973846436, | |
| "loss_layer_6_head": 9.3965425491333, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.22792022792022792, | |
| "grad_norm": 19.455193806314377, | |
| "learning_rate": 0.00011904761904761905, | |
| "loss": 33.3367, | |
| "loss_layer_12_head": 6.647928714752197, | |
| "loss_layer_18_head": 5.974323272705078, | |
| "loss_layer_24_head": 4.755464553833008, | |
| "loss_layer_30_head": 3.9858086109161377, | |
| "loss_layer_36_head": 2.8238697052001953, | |
| "loss_layer_42_head": 1.47384774684906, | |
| "loss_layer_6_head": 7.75436544418335, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.45584045584045585, | |
| "grad_norm": 19.77362433069107, | |
| "learning_rate": 0.0002380952380952381, | |
| "loss": 24.502, | |
| "loss_layer_12_head": 5.146442413330078, | |
| "loss_layer_18_head": 4.827411651611328, | |
| "loss_layer_24_head": 3.496838331222534, | |
| "loss_layer_30_head": 2.7353873252868652, | |
| "loss_layer_36_head": 1.7639620304107666, | |
| "loss_layer_42_head": 1.1026753187179565, | |
| "loss_layer_6_head": 5.473020553588867, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.6837606837606838, | |
| "grad_norm": 5.825213351919792, | |
| "learning_rate": 0.00035714285714285714, | |
| "loss": 19.6598, | |
| "loss_layer_12_head": 4.174715042114258, | |
| "loss_layer_18_head": 3.8590176105499268, | |
| "loss_layer_24_head": 2.7612555027008057, | |
| "loss_layer_30_head": 2.1735005378723145, | |
| "loss_layer_36_head": 1.4102507829666138, | |
| "loss_layer_42_head": 0.7683950066566467, | |
| "loss_layer_6_head": 4.8184590339660645, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.9116809116809117, | |
| "grad_norm": 3.6611272833257194, | |
| "learning_rate": 0.0004761904761904762, | |
| "loss": 15.5497, | |
| "loss_layer_12_head": 3.2854111194610596, | |
| "loss_layer_18_head": 3.0585737228393555, | |
| "loss_layer_24_head": 2.142367362976074, | |
| "loss_layer_30_head": 1.661052942276001, | |
| "loss_layer_36_head": 1.0766279697418213, | |
| "loss_layer_42_head": 0.6817001104354858, | |
| "loss_layer_6_head": 3.832777738571167, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.1396011396011396, | |
| "grad_norm": 2.0539370731960522, | |
| "learning_rate": 0.0005952380952380952, | |
| "loss": 13.1226, | |
| "loss_layer_12_head": 2.8252596855163574, | |
| "loss_layer_18_head": 2.6267945766448975, | |
| "loss_layer_24_head": 1.7749992609024048, | |
| "loss_layer_30_head": 1.3407100439071655, | |
| "loss_layer_36_head": 0.85447758436203, | |
| "loss_layer_42_head": 0.5364646911621094, | |
| "loss_layer_6_head": 3.3687548637390137, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.3675213675213675, | |
| "grad_norm": 1.3678909297039978, | |
| "learning_rate": 0.0007142857142857143, | |
| "loss": 11.2545, | |
| "loss_layer_12_head": 2.3465054035186768, | |
| "loss_layer_18_head": 2.1682493686676025, | |
| "loss_layer_24_head": 1.4258038997650146, | |
| "loss_layer_30_head": 1.066014051437378, | |
| "loss_layer_36_head": 0.7038318514823914, | |
| "loss_layer_42_head": 0.4449593424797058, | |
| "loss_layer_6_head": 2.874427080154419, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.5954415954415955, | |
| "grad_norm": 1.0597925214891006, | |
| "learning_rate": 0.0008333333333333333, | |
| "loss": 10.0245, | |
| "loss_layer_12_head": 2.145514965057373, | |
| "loss_layer_18_head": 1.9670956134796143, | |
| "loss_layer_24_head": 1.2736284732818604, | |
| "loss_layer_30_head": 0.9368025064468384, | |
| "loss_layer_36_head": 0.5962619185447693, | |
| "loss_layer_42_head": 0.37413108348846436, | |
| "loss_layer_6_head": 2.68680477142334, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.8233618233618234, | |
| "grad_norm": 0.8067240935550286, | |
| "learning_rate": 0.0009523809523809524, | |
| "loss": 8.9233, | |
| "loss_layer_12_head": 1.9344780445098877, | |
| "loss_layer_18_head": 1.770326852798462, | |
| "loss_layer_24_head": 1.1205565929412842, | |
| "loss_layer_30_head": 0.8123494982719421, | |
| "loss_layer_36_head": 0.5198163390159607, | |
| "loss_layer_42_head": 0.3280097246170044, | |
| "loss_layer_6_head": 2.4362289905548096, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.051282051282051, | |
| "grad_norm": 0.6437708908708522, | |
| "learning_rate": 0.0010714285714285715, | |
| "loss": 8.1196, | |
| "loss_layer_12_head": 1.773280143737793, | |
| "loss_layer_18_head": 1.600731611251831, | |
| "loss_layer_24_head": 0.9964693188667297, | |
| "loss_layer_30_head": 0.7136993408203125, | |
| "loss_layer_36_head": 0.4567667543888092, | |
| "loss_layer_42_head": 0.28257232904434204, | |
| "loss_layer_6_head": 2.308173656463623, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 2.2792022792022792, | |
| "grad_norm": 0.5437222788803687, | |
| "learning_rate": 0.0011904761904761904, | |
| "loss": 7.3482, | |
| "loss_layer_12_head": 1.6736904382705688, | |
| "loss_layer_18_head": 1.4971253871917725, | |
| "loss_layer_24_head": 0.9068229794502258, | |
| "loss_layer_30_head": 0.6316335797309875, | |
| "loss_layer_36_head": 0.4000738561153412, | |
| "loss_layer_42_head": 0.24324941635131836, | |
| "loss_layer_6_head": 2.207364082336426, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 2.5071225071225074, | |
| "grad_norm": 0.47033294183192215, | |
| "learning_rate": 0.0013095238095238097, | |
| "loss": 6.782, | |
| "loss_layer_12_head": 1.4841620922088623, | |
| "loss_layer_18_head": 1.3070929050445557, | |
| "loss_layer_24_head": 0.774025559425354, | |
| "loss_layer_30_head": 0.5281109809875488, | |
| "loss_layer_36_head": 0.3339698314666748, | |
| "loss_layer_42_head": 0.1966492384672165, | |
| "loss_layer_6_head": 2.0062077045440674, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 2.735042735042735, | |
| "grad_norm": 0.41848236144680057, | |
| "learning_rate": 0.0014285714285714286, | |
| "loss": 6.3554, | |
| "loss_layer_12_head": 1.410228967666626, | |
| "loss_layer_18_head": 1.23647141456604, | |
| "loss_layer_24_head": 0.727624237537384, | |
| "loss_layer_30_head": 0.49607592821121216, | |
| "loss_layer_36_head": 0.3141021132469177, | |
| "loss_layer_42_head": 0.18274283409118652, | |
| "loss_layer_6_head": 1.926108956336975, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.962962962962963, | |
| "grad_norm": 0.36960333783062255, | |
| "learning_rate": 0.0015476190476190477, | |
| "loss": 5.8987, | |
| "loss_layer_12_head": 1.3281461000442505, | |
| "loss_layer_18_head": 1.1566855907440186, | |
| "loss_layer_24_head": 0.6787080764770508, | |
| "loss_layer_30_head": 0.45799437165260315, | |
| "loss_layer_36_head": 0.29089364409446716, | |
| "loss_layer_42_head": 0.1687871664762497, | |
| "loss_layer_6_head": 1.8334720134735107, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 3.190883190883191, | |
| "grad_norm": 0.34506211668913744, | |
| "learning_rate": 0.0016666666666666666, | |
| "loss": 5.4375, | |
| "loss_layer_12_head": 1.199660062789917, | |
| "loss_layer_18_head": 1.032221794128418, | |
| "loss_layer_24_head": 0.593693733215332, | |
| "loss_layer_30_head": 0.3999592959880829, | |
| "loss_layer_36_head": 0.258480966091156, | |
| "loss_layer_42_head": 0.1496810019016266, | |
| "loss_layer_6_head": 1.7067362070083618, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 3.4188034188034186, | |
| "grad_norm": 0.3151835312525889, | |
| "learning_rate": 0.0017857142857142859, | |
| "loss": 5.1469, | |
| "loss_layer_12_head": 1.2106449604034424, | |
| "loss_layer_18_head": 1.0451469421386719, | |
| "loss_layer_24_head": 0.5938726663589478, | |
| "loss_layer_30_head": 0.39307016134262085, | |
| "loss_layer_36_head": 0.2464793175458908, | |
| "loss_layer_42_head": 0.13884057104587555, | |
| "loss_layer_6_head": 1.7400537729263306, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 3.646723646723647, | |
| "grad_norm": 0.3082388550908825, | |
| "learning_rate": 0.0019047619047619048, | |
| "loss": 5.0216, | |
| "loss_layer_12_head": 1.1534507274627686, | |
| "loss_layer_18_head": 0.9869141578674316, | |
| "loss_layer_24_head": 0.5700754523277283, | |
| "loss_layer_30_head": 0.37610307335853577, | |
| "loss_layer_36_head": 0.23609893023967743, | |
| "loss_layer_42_head": 0.13269147276878357, | |
| "loss_layer_6_head": 1.6694886684417725, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 3.8746438746438745, | |
| "grad_norm": 0.3122891528340703, | |
| "learning_rate": 0.002023809523809524, | |
| "loss": 4.7593, | |
| "loss_layer_12_head": 1.0834763050079346, | |
| "loss_layer_18_head": 0.9217953681945801, | |
| "loss_layer_24_head": 0.5253178477287292, | |
| "loss_layer_30_head": 0.3411971926689148, | |
| "loss_layer_36_head": 0.2131468951702118, | |
| "loss_layer_42_head": 0.12173072248697281, | |
| "loss_layer_6_head": 1.5824086666107178, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 4.102564102564102, | |
| "grad_norm": 0.3089845680262722, | |
| "learning_rate": 0.002142857142857143, | |
| "loss": 4.4801, | |
| "loss_layer_12_head": 1.0101158618927002, | |
| "loss_layer_18_head": 0.8577353358268738, | |
| "loss_layer_24_head": 0.49227556586265564, | |
| "loss_layer_30_head": 0.32580453157424927, | |
| "loss_layer_36_head": 0.21231690049171448, | |
| "loss_layer_42_head": 0.1251501739025116, | |
| "loss_layer_6_head": 1.4942306280136108, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 4.330484330484331, | |
| "grad_norm": 0.3021335552511279, | |
| "learning_rate": 0.002261904761904762, | |
| "loss": 4.2004, | |
| "loss_layer_12_head": 0.9157131314277649, | |
| "loss_layer_18_head": 0.7664994597434998, | |
| "loss_layer_24_head": 0.4360291361808777, | |
| "loss_layer_30_head": 0.28324925899505615, | |
| "loss_layer_36_head": 0.18157120048999786, | |
| "loss_layer_42_head": 0.10644565522670746, | |
| "loss_layer_6_head": 1.3922828435897827, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 4.5584045584045585, | |
| "grad_norm": 0.35656956962665187, | |
| "learning_rate": 0.0023809523809523807, | |
| "loss": 4.0681, | |
| "loss_layer_12_head": 0.9036925435066223, | |
| "loss_layer_18_head": 0.7571278810501099, | |
| "loss_layer_24_head": 0.43527936935424805, | |
| "loss_layer_30_head": 0.28879284858703613, | |
| "loss_layer_36_head": 0.18665608763694763, | |
| "loss_layer_42_head": 0.110997773706913, | |
| "loss_layer_6_head": 1.3775255680084229, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 4.786324786324786, | |
| "grad_norm": 0.7537150260193732, | |
| "learning_rate": 0.0025, | |
| "loss": 4.1325, | |
| "loss_layer_12_head": 0.9238117933273315, | |
| "loss_layer_18_head": 0.7721032500267029, | |
| "loss_layer_24_head": 0.43952664732933044, | |
| "loss_layer_30_head": 0.28288325667381287, | |
| "loss_layer_36_head": 0.18152464926242828, | |
| "loss_layer_42_head": 0.21891792118549347, | |
| "loss_layer_6_head": 1.3878661394119263, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 5.014245014245014, | |
| "grad_norm": 0.7856472586064206, | |
| "learning_rate": 0.0026190476190476194, | |
| "loss": 4.0448, | |
| "loss_layer_12_head": 0.8572763204574585, | |
| "loss_layer_18_head": 0.7265270948410034, | |
| "loss_layer_24_head": 0.40768003463745117, | |
| "loss_layer_30_head": 0.2640893757343292, | |
| "loss_layer_36_head": 0.17229382693767548, | |
| "loss_layer_42_head": 0.22290952503681183, | |
| "loss_layer_6_head": 1.2932946681976318, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 5.2421652421652425, | |
| "grad_norm": 1.8914483067832557, | |
| "learning_rate": 0.0027380952380952383, | |
| "loss": 4.0412, | |
| "loss_layer_12_head": 0.8094813227653503, | |
| "loss_layer_18_head": 0.9888504147529602, | |
| "loss_layer_24_head": 0.42250028252601624, | |
| "loss_layer_30_head": 0.25133341550827026, | |
| "loss_layer_36_head": 0.1654709130525589, | |
| "loss_layer_42_head": 0.1770133078098297, | |
| "loss_layer_6_head": 1.2574403285980225, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 5.47008547008547, | |
| "grad_norm": 1.9425500774924052, | |
| "learning_rate": 0.002857142857142857, | |
| "loss": 4.6802, | |
| "loss_layer_12_head": 0.8291244506835938, | |
| "loss_layer_18_head": 0.9608370065689087, | |
| "loss_layer_24_head": 0.9025907516479492, | |
| "loss_layer_30_head": 0.3785189986228943, | |
| "loss_layer_36_head": 0.2427733689546585, | |
| "loss_layer_42_head": 0.1588081568479538, | |
| "loss_layer_6_head": 1.2310930490493774, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 5.698005698005698, | |
| "grad_norm": 1.409437774928848, | |
| "learning_rate": 0.002976190476190476, | |
| "loss": 4.6225, | |
| "loss_layer_12_head": 1.0028345584869385, | |
| "loss_layer_18_head": 0.9019104242324829, | |
| "loss_layer_24_head": 0.7798857688903809, | |
| "loss_layer_30_head": 0.373049795627594, | |
| "loss_layer_36_head": 0.24940261244773865, | |
| "loss_layer_42_head": 0.16313040256500244, | |
| "loss_layer_6_head": 1.1742385625839233, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 5.925925925925926, | |
| "grad_norm": 1.0378215832119297, | |
| "learning_rate": 0.0030952380952380953, | |
| "loss": 4.4125, | |
| "loss_layer_12_head": 1.0007030963897705, | |
| "loss_layer_18_head": 0.8221645355224609, | |
| "loss_layer_24_head": 0.6742190718650818, | |
| "loss_layer_30_head": 0.3384473919868469, | |
| "loss_layer_36_head": 0.2220737487077713, | |
| "loss_layer_42_head": 0.12549471855163574, | |
| "loss_layer_6_head": 1.171250581741333, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 6.153846153846154, | |
| "grad_norm": 1.4926191542432983, | |
| "learning_rate": 0.0032142857142857147, | |
| "loss": 3.9975, | |
| "loss_layer_12_head": 0.9102523922920227, | |
| "loss_layer_18_head": 0.7440574765205383, | |
| "loss_layer_24_head": 0.5891538262367249, | |
| "loss_layer_30_head": 0.3084535002708435, | |
| "loss_layer_36_head": 0.19867992401123047, | |
| "loss_layer_42_head": 0.12676474452018738, | |
| "loss_layer_6_head": 1.178501844406128, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 6.381766381766382, | |
| "grad_norm": 1.2078362363188435, | |
| "learning_rate": 0.003333333333333333, | |
| "loss": 3.7208, | |
| "loss_layer_12_head": 0.82643061876297, | |
| "loss_layer_18_head": 0.6635754704475403, | |
| "loss_layer_24_head": 0.49496936798095703, | |
| "loss_layer_30_head": 0.25951477885246277, | |
| "loss_layer_36_head": 0.1663007289171219, | |
| "loss_layer_42_head": 0.09452775865793228, | |
| "loss_layer_6_head": 1.1378257274627686, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 6.60968660968661, | |
| "grad_norm": 1.3611292633005208, | |
| "learning_rate": 0.0034523809523809524, | |
| "loss": 3.621, | |
| "loss_layer_12_head": 0.7876842617988586, | |
| "loss_layer_18_head": 0.6401599645614624, | |
| "loss_layer_24_head": 0.46546655893325806, | |
| "loss_layer_30_head": 0.2504613399505615, | |
| "loss_layer_36_head": 0.15819665789604187, | |
| "loss_layer_42_head": 0.09289596974849701, | |
| "loss_layer_6_head": 1.1508190631866455, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 6.837606837606837, | |
| "grad_norm": 2.068883579723556, | |
| "learning_rate": 0.0035714285714285718, | |
| "loss": 3.618, | |
| "loss_layer_12_head": 0.8008605241775513, | |
| "loss_layer_18_head": 0.6416077613830566, | |
| "loss_layer_24_head": 0.4489217698574066, | |
| "loss_layer_30_head": 0.25658518075942993, | |
| "loss_layer_36_head": 0.18197140097618103, | |
| "loss_layer_42_head": 0.09827812016010284, | |
| "loss_layer_6_head": 1.2267687320709229, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 7.065527065527066, | |
| "grad_norm": 1.8566033890581124, | |
| "learning_rate": 0.0036904761904761906, | |
| "loss": 3.6802, | |
| "loss_layer_12_head": 0.769829273223877, | |
| "loss_layer_18_head": 0.6047574281692505, | |
| "loss_layer_24_head": 0.404893159866333, | |
| "loss_layer_30_head": 0.23501157760620117, | |
| "loss_layer_36_head": 0.21422222256660461, | |
| "loss_layer_42_head": 0.13609819114208221, | |
| "loss_layer_6_head": 1.2630399465560913, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 7.293447293447294, | |
| "grad_norm": 1.3020341739606298, | |
| "learning_rate": 0.0038095238095238095, | |
| "loss": 3.5461, | |
| "loss_layer_12_head": 0.6948034763336182, | |
| "loss_layer_18_head": 0.5865488052368164, | |
| "loss_layer_24_head": 0.36058539152145386, | |
| "loss_layer_30_head": 0.254138708114624, | |
| "loss_layer_36_head": 0.1751880794763565, | |
| "loss_layer_42_head": 0.15007896721363068, | |
| "loss_layer_6_head": 1.2036831378936768, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 7.521367521367521, | |
| "grad_norm": 1.2579419895110615, | |
| "learning_rate": 0.003928571428571429, | |
| "loss": 3.4722, | |
| "loss_layer_12_head": 0.7162200212478638, | |
| "loss_layer_18_head": 0.6314468383789062, | |
| "loss_layer_24_head": 0.3721396327018738, | |
| "loss_layer_30_head": 0.24703256785869598, | |
| "loss_layer_36_head": 0.17532584071159363, | |
| "loss_layer_42_head": 0.1474941372871399, | |
| "loss_layer_6_head": 1.1787517070770264, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 7.749287749287749, | |
| "grad_norm": 1.0118058129375738, | |
| "learning_rate": 0.004047619047619048, | |
| "loss": 3.447, | |
| "loss_layer_12_head": 0.7314745187759399, | |
| "loss_layer_18_head": 0.6032330989837646, | |
| "loss_layer_24_head": 0.3913646340370178, | |
| "loss_layer_30_head": 0.23757806420326233, | |
| "loss_layer_36_head": 0.178132563829422, | |
| "loss_layer_42_head": 0.1304202377796173, | |
| "loss_layer_6_head": 1.1239389181137085, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 7.977207977207978, | |
| "grad_norm": 1.312823729396693, | |
| "learning_rate": 0.004166666666666667, | |
| "loss": 3.4955, | |
| "loss_layer_12_head": 0.8259456753730774, | |
| "loss_layer_18_head": 0.5874773263931274, | |
| "loss_layer_24_head": 0.3818410336971283, | |
| "loss_layer_30_head": 0.2627549171447754, | |
| "loss_layer_36_head": 0.1600283682346344, | |
| "loss_layer_42_head": 0.11221656948328018, | |
| "loss_layer_6_head": 1.097424864768982, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 8.205128205128204, | |
| "grad_norm": 1.2764585863517615, | |
| "learning_rate": 0.004285714285714286, | |
| "loss": 3.2662, | |
| "loss_layer_12_head": 0.8447945713996887, | |
| "loss_layer_18_head": 0.57573401927948, | |
| "loss_layer_24_head": 0.3404631018638611, | |
| "loss_layer_30_head": 0.23921921849250793, | |
| "loss_layer_36_head": 0.13854867219924927, | |
| "loss_layer_42_head": 0.09624181687831879, | |
| "loss_layer_6_head": 1.0239530801773071, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 8.433048433048434, | |
| "grad_norm": 2.174485752198311, | |
| "learning_rate": 0.004404761904761904, | |
| "loss": 3.8994, | |
| "loss_layer_12_head": 0.8033272624015808, | |
| "loss_layer_18_head": 1.1929363012313843, | |
| "loss_layer_24_head": 0.34211626648902893, | |
| "loss_layer_30_head": 0.24058055877685547, | |
| "loss_layer_36_head": 0.19761431217193604, | |
| "loss_layer_42_head": 0.09599171578884125, | |
| "loss_layer_6_head": 1.062518835067749, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 8.660968660968662, | |
| "grad_norm": 1.7449271648991171, | |
| "learning_rate": 0.004523809523809524, | |
| "loss": 4.0437, | |
| "loss_layer_12_head": 0.7386378049850464, | |
| "loss_layer_18_head": 1.2550582885742188, | |
| "loss_layer_24_head": 0.32292360067367554, | |
| "loss_layer_30_head": 0.22687847912311554, | |
| "loss_layer_36_head": 0.2144986093044281, | |
| "loss_layer_42_head": 0.130828857421875, | |
| "loss_layer_6_head": 1.0966918468475342, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 1.7126154778538007, | |
| "learning_rate": 0.004642857142857143, | |
| "loss": 3.9797, | |
| "loss_layer_12_head": 0.7142958641052246, | |
| "loss_layer_18_head": 1.0787105560302734, | |
| "loss_layer_24_head": 0.3428736925125122, | |
| "loss_layer_30_head": 0.2274586260318756, | |
| "loss_layer_36_head": 0.1928836703300476, | |
| "loss_layer_42_head": 0.14408032596111298, | |
| "loss_layer_6_head": 1.2979519367218018, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 9.116809116809117, | |
| "grad_norm": 1.1725132505587743, | |
| "learning_rate": 0.0047619047619047615, | |
| "loss": 3.7028, | |
| "loss_layer_12_head": 0.6508289575576782, | |
| "loss_layer_18_head": 0.9081010818481445, | |
| "loss_layer_24_head": 0.3285003900527954, | |
| "loss_layer_30_head": 0.22624602913856506, | |
| "loss_layer_36_head": 0.16938458383083344, | |
| "loss_layer_42_head": 0.11536189168691635, | |
| "loss_layer_6_head": 1.2216272354125977, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 9.116809116809117, | |
| "eval_loss": 4.635208606719971, | |
| "eval_loss_layer_12_head": 0.8311116099357605, | |
| "eval_loss_layer_18_head": 1.042922019958496, | |
| "eval_loss_layer_24_head": 0.49305030703544617, | |
| "eval_loss_layer_30_head": 0.4456557333469391, | |
| "eval_loss_layer_36_head": 0.23485398292541504, | |
| "eval_loss_layer_42_head": 0.1599181890487671, | |
| "eval_loss_layer_6_head": 1.4034830331802368, | |
| "eval_runtime": 5.0189, | |
| "eval_samples_per_second": 6.575, | |
| "eval_steps_per_second": 0.598, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 9.344729344729345, | |
| "grad_norm": 1.127047766220966, | |
| "learning_rate": 0.004880952380952381, | |
| "loss": 3.4979, | |
| "loss_layer_12_head": 0.6423342227935791, | |
| "loss_layer_18_head": 0.8167837262153625, | |
| "loss_layer_24_head": 0.3226773142814636, | |
| "loss_layer_30_head": 0.3336021602153778, | |
| "loss_layer_36_head": 0.1563200056552887, | |
| "loss_layer_42_head": 0.10249624401330948, | |
| "loss_layer_6_head": 1.1420187950134277, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 9.572649572649572, | |
| "grad_norm": 0.8579363052095977, | |
| "learning_rate": 0.005, | |
| "loss": 3.3994, | |
| "loss_layer_12_head": 0.6976842880249023, | |
| "loss_layer_18_head": 0.7718448638916016, | |
| "loss_layer_24_head": 0.33865469694137573, | |
| "loss_layer_30_head": 0.30089613795280457, | |
| "loss_layer_36_head": 0.15890909731388092, | |
| "loss_layer_42_head": 0.10226695239543915, | |
| "loss_layer_6_head": 1.0943472385406494, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 9.8005698005698, | |
| "grad_norm": 0.6413467845692354, | |
| "learning_rate": 0.004999913657690942, | |
| "loss": 3.3095, | |
| "loss_layer_12_head": 0.6870448589324951, | |
| "loss_layer_18_head": 0.6909510493278503, | |
| "loss_layer_24_head": 0.323662132024765, | |
| "loss_layer_30_head": 0.26362109184265137, | |
| "loss_layer_36_head": 0.16587287187576294, | |
| "loss_layer_42_head": 0.09277474135160446, | |
| "loss_layer_6_head": 1.0225787162780762, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 10.028490028490028, | |
| "grad_norm": 0.7352495226591444, | |
| "learning_rate": 0.004999654636727764, | |
| "loss": 3.1636, | |
| "loss_layer_12_head": 0.6777123212814331, | |
| "loss_layer_18_head": 0.6364808678627014, | |
| "loss_layer_24_head": 0.302184134721756, | |
| "loss_layer_30_head": 0.227763369679451, | |
| "loss_layer_36_head": 0.14396648108959198, | |
| "loss_layer_42_head": 0.09543080627918243, | |
| "loss_layer_6_head": 0.9637772440910339, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 10.256410256410255, | |
| "grad_norm": 0.8688354723248357, | |
| "learning_rate": 0.00499922295500204, | |
| "loss": 2.9773, | |
| "loss_layer_12_head": 0.6373997926712036, | |
| "loss_layer_18_head": 0.597356915473938, | |
| "loss_layer_24_head": 0.30082494020462036, | |
| "loss_layer_30_head": 0.21303267776966095, | |
| "loss_layer_36_head": 0.14231160283088684, | |
| "loss_layer_42_head": 0.15631356835365295, | |
| "loss_layer_6_head": 0.9340046644210815, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 10.484330484330485, | |
| "grad_norm": 0.8921544784848862, | |
| "learning_rate": 0.004998618642331689, | |
| "loss": 3.0212, | |
| "loss_layer_12_head": 0.6571947932243347, | |
| "loss_layer_18_head": 0.5933902263641357, | |
| "loss_layer_24_head": 0.3129543364048004, | |
| "loss_layer_30_head": 0.20653457939624786, | |
| "loss_layer_36_head": 0.146720290184021, | |
| "loss_layer_42_head": 0.20062248408794403, | |
| "loss_layer_6_head": 0.9515364766120911, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 10.712250712250713, | |
| "grad_norm": 1.037926434379806, | |
| "learning_rate": 0.004997841740458911, | |
| "loss": 2.9444, | |
| "loss_layer_12_head": 0.6910628080368042, | |
| "loss_layer_18_head": 0.5505853891372681, | |
| "loss_layer_24_head": 0.28936266899108887, | |
| "loss_layer_30_head": 0.19097623229026794, | |
| "loss_layer_36_head": 0.14262089133262634, | |
| "loss_layer_42_head": 0.16720648109912872, | |
| "loss_layer_6_head": 0.9268558621406555, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 10.94017094017094, | |
| "grad_norm": 1.4783942588373633, | |
| "learning_rate": 0.004996892303047305, | |
| "loss": 3.0655, | |
| "loss_layer_12_head": 0.7266325354576111, | |
| "loss_layer_18_head": 0.5471276640892029, | |
| "loss_layer_24_head": 0.39032667875289917, | |
| "loss_layer_30_head": 0.195367231965065, | |
| "loss_layer_36_head": 0.14416465163230896, | |
| "loss_layer_42_head": 0.14725883305072784, | |
| "loss_layer_6_head": 0.9646366834640503, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 11.168091168091168, | |
| "grad_norm": 1.1422412037330358, | |
| "learning_rate": 0.004995770395678171, | |
| "loss": 2.9034, | |
| "loss_layer_12_head": 0.6590501070022583, | |
| "loss_layer_18_head": 0.50871741771698, | |
| "loss_layer_24_head": 0.3766937851905823, | |
| "loss_layer_30_head": 0.18061670660972595, | |
| "loss_layer_36_head": 0.13145729899406433, | |
| "loss_layer_42_head": 0.11728329956531525, | |
| "loss_layer_6_head": 0.9400504231452942, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 11.396011396011396, | |
| "grad_norm": 0.8093002240508176, | |
| "learning_rate": 0.0049944760958459625, | |
| "loss": 2.7821, | |
| "loss_layer_12_head": 0.6128442883491516, | |
| "loss_layer_18_head": 0.48932427167892456, | |
| "loss_layer_24_head": 0.3477482199668884, | |
| "loss_layer_30_head": 0.20216746628284454, | |
| "loss_layer_36_head": 0.14225037395954132, | |
| "loss_layer_42_head": 0.10360412299633026, | |
| "loss_layer_6_head": 0.8851297497749329, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 11.623931623931623, | |
| "grad_norm": 0.7434506470981133, | |
| "learning_rate": 0.00499300949295295, | |
| "loss": 2.7255, | |
| "loss_layer_12_head": 0.6115326881408691, | |
| "loss_layer_18_head": 0.4955017566680908, | |
| "loss_layer_24_head": 0.33017319440841675, | |
| "loss_layer_30_head": 0.1856774091720581, | |
| "loss_layer_36_head": 0.14168860018253326, | |
| "loss_layer_42_head": 0.09496969729661942, | |
| "loss_layer_6_head": 0.905809760093689, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 11.851851851851851, | |
| "grad_norm": 1.0265273242733213, | |
| "learning_rate": 0.004991370688303039, | |
| "loss": 2.7819, | |
| "loss_layer_12_head": 0.6071752309799194, | |
| "loss_layer_18_head": 0.48383840918540955, | |
| "loss_layer_24_head": 0.30922985076904297, | |
| "loss_layer_30_head": 0.18128779530525208, | |
| "loss_layer_36_head": 0.14182403683662415, | |
| "loss_layer_42_head": 0.0852092057466507, | |
| "loss_layer_6_head": 0.9614272117614746, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 12.079772079772079, | |
| "grad_norm": 1.0046729839259028, | |
| "learning_rate": 0.00498955979509477, | |
| "loss": 2.7013, | |
| "loss_layer_12_head": 0.580025315284729, | |
| "loss_layer_18_head": 0.4724946618080139, | |
| "loss_layer_24_head": 0.29289859533309937, | |
| "loss_layer_30_head": 0.18664391338825226, | |
| "loss_layer_36_head": 0.128468319773674, | |
| "loss_layer_42_head": 0.07929748296737671, | |
| "loss_layer_6_head": 0.9460482597351074, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 12.307692307692308, | |
| "grad_norm": 1.298003181437895, | |
| "learning_rate": 0.004987576938413504, | |
| "loss": 2.6403, | |
| "loss_layer_12_head": 0.5524693727493286, | |
| "loss_layer_18_head": 0.4450896382331848, | |
| "loss_layer_24_head": 0.28416186571121216, | |
| "loss_layer_30_head": 0.18391993641853333, | |
| "loss_layer_36_head": 0.11984305083751678, | |
| "loss_layer_42_head": 0.0738661140203476, | |
| "loss_layer_6_head": 0.9863630533218384, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 12.535612535612536, | |
| "grad_norm": 1.0033885173931771, | |
| "learning_rate": 0.00498542225522278, | |
| "loss": 2.749, | |
| "loss_layer_12_head": 0.6306111812591553, | |
| "loss_layer_18_head": 0.44906359910964966, | |
| "loss_layer_24_head": 0.28689223527908325, | |
| "loss_layer_30_head": 0.18012277781963348, | |
| "loss_layer_36_head": 0.1199340969324112, | |
| "loss_layer_42_head": 0.07653049379587173, | |
| "loss_layer_6_head": 1.0442602634429932, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 12.763532763532764, | |
| "grad_norm": 0.843782334866487, | |
| "learning_rate": 0.004983095894354857, | |
| "loss": 2.7597, | |
| "loss_layer_12_head": 0.618486762046814, | |
| "loss_layer_18_head": 0.45755448937416077, | |
| "loss_layer_24_head": 0.3018384575843811, | |
| "loss_layer_30_head": 0.1906372755765915, | |
| "loss_layer_36_head": 0.12202297151088715, | |
| "loss_layer_42_head": 0.07890411466360092, | |
| "loss_layer_6_head": 1.005176305770874, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 12.991452991452991, | |
| "grad_norm": 0.6576558320387813, | |
| "learning_rate": 0.0049805980165004305, | |
| "loss": 2.7936, | |
| "loss_layer_12_head": 0.6078914999961853, | |
| "loss_layer_18_head": 0.4747782349586487, | |
| "loss_layer_24_head": 0.3116241991519928, | |
| "loss_layer_30_head": 0.19092823565006256, | |
| "loss_layer_36_head": 0.1737132966518402, | |
| "loss_layer_42_head": 0.08486510813236237, | |
| "loss_layer_6_head": 0.964220404624939, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 13.21937321937322, | |
| "grad_norm": 0.7017471319323249, | |
| "learning_rate": 0.004977928794197532, | |
| "loss": 2.5574, | |
| "loss_layer_12_head": 0.5619921088218689, | |
| "loss_layer_18_head": 0.4452199339866638, | |
| "loss_layer_24_head": 0.27957409620285034, | |
| "loss_layer_30_head": 0.1817333996295929, | |
| "loss_layer_36_head": 0.1468358337879181, | |
| "loss_layer_42_head": 0.08508309721946716, | |
| "loss_layer_6_head": 0.884550929069519, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 13.447293447293447, | |
| "grad_norm": 0.6115425707737043, | |
| "learning_rate": 0.004975088411819616, | |
| "loss": 2.5032, | |
| "loss_layer_12_head": 0.5326579809188843, | |
| "loss_layer_18_head": 0.42704278230667114, | |
| "loss_layer_24_head": 0.2711140513420105, | |
| "loss_layer_30_head": 0.1879303902387619, | |
| "loss_layer_36_head": 0.13557665050029755, | |
| "loss_layer_42_head": 0.09177286922931671, | |
| "loss_layer_6_head": 0.8427483439445496, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 13.675213675213675, | |
| "grad_norm": 0.9450445323425971, | |
| "learning_rate": 0.004972077065562821, | |
| "loss": 2.4842, | |
| "loss_layer_12_head": 0.5639079809188843, | |
| "loss_layer_18_head": 0.4394643306732178, | |
| "loss_layer_24_head": 0.2671627402305603, | |
| "loss_layer_30_head": 0.18647949397563934, | |
| "loss_layer_36_head": 0.1306968778371811, | |
| "loss_layer_42_head": 0.08405078202486038, | |
| "loss_layer_6_head": 0.8649594187736511, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 13.903133903133902, | |
| "grad_norm": 0.9138974651639769, | |
| "learning_rate": 0.004968894963432419, | |
| "loss": 2.5697, | |
| "loss_layer_12_head": 0.5812464356422424, | |
| "loss_layer_18_head": 0.450472891330719, | |
| "loss_layer_24_head": 0.26817426085472107, | |
| "loss_layer_30_head": 0.18351063132286072, | |
| "loss_layer_36_head": 0.1325109750032425, | |
| "loss_layer_42_head": 0.08811412006616592, | |
| "loss_layer_6_head": 0.8798761367797852, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 14.131054131054132, | |
| "grad_norm": 0.9905055131763821, | |
| "learning_rate": 0.004965542325228446, | |
| "loss": 2.4857, | |
| "loss_layer_12_head": 0.5650662183761597, | |
| "loss_layer_18_head": 0.4344004690647125, | |
| "loss_layer_24_head": 0.2506847381591797, | |
| "loss_layer_30_head": 0.1754450798034668, | |
| "loss_layer_36_head": 0.11748027801513672, | |
| "loss_layer_42_head": 0.08022721111774445, | |
| "loss_layer_6_head": 0.8539366722106934, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 14.35897435897436, | |
| "grad_norm": 1.0702212782131248, | |
| "learning_rate": 0.00496201938253052, | |
| "loss": 2.4271, | |
| "loss_layer_12_head": 0.5588186383247375, | |
| "loss_layer_18_head": 0.41167354583740234, | |
| "loss_layer_24_head": 0.24384205043315887, | |
| "loss_layer_30_head": 0.16517134010791779, | |
| "loss_layer_36_head": 0.10888339579105377, | |
| "loss_layer_42_head": 0.0690896213054657, | |
| "loss_layer_6_head": 0.8617145419120789, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 14.586894586894587, | |
| "grad_norm": 1.162558917354217, | |
| "learning_rate": 0.004958326378681849, | |
| "loss": 2.5164, | |
| "loss_layer_12_head": 0.6251641511917114, | |
| "loss_layer_18_head": 0.4351302981376648, | |
| "loss_layer_24_head": 0.27556923031806946, | |
| "loss_layer_30_head": 0.18502500653266907, | |
| "loss_layer_36_head": 0.12545715272426605, | |
| "loss_layer_42_head": 0.08420990407466888, | |
| "loss_layer_6_head": 0.8722649812698364, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 14.814814814814815, | |
| "grad_norm": 0.9669550279988619, | |
| "learning_rate": 0.004954463568772415, | |
| "loss": 2.6015, | |
| "loss_layer_12_head": 0.6321030855178833, | |
| "loss_layer_18_head": 0.4239681661128998, | |
| "loss_layer_24_head": 0.2637138366699219, | |
| "loss_layer_30_head": 0.18219289183616638, | |
| "loss_layer_36_head": 0.11929179728031158, | |
| "loss_layer_42_head": 0.08808918297290802, | |
| "loss_layer_6_head": 0.874514102935791, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 15.042735042735043, | |
| "grad_norm": 1.0543343861851295, | |
| "learning_rate": 0.00495043121962136, | |
| "loss": 2.6238, | |
| "loss_layer_12_head": 0.6399250030517578, | |
| "loss_layer_18_head": 0.4572976231575012, | |
| "loss_layer_24_head": 0.2698724567890167, | |
| "loss_layer_30_head": 0.20202596485614777, | |
| "loss_layer_36_head": 0.12312252819538116, | |
| "loss_layer_42_head": 0.0965074896812439, | |
| "loss_layer_6_head": 0.9252855181694031, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 15.27065527065527, | |
| "grad_norm": 0.8727229528851698, | |
| "learning_rate": 0.0049462296097585534, | |
| "loss": 2.5011, | |
| "loss_layer_12_head": 0.5516534447669983, | |
| "loss_layer_18_head": 0.4133722186088562, | |
| "loss_layer_24_head": 0.26039832830429077, | |
| "loss_layer_30_head": 0.17739339172840118, | |
| "loss_layer_36_head": 0.10702526569366455, | |
| "loss_layer_42_head": 0.08984313905239105, | |
| "loss_layer_6_head": 0.8428100347518921, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 15.498575498575498, | |
| "grad_norm": 0.7580466664772497, | |
| "learning_rate": 0.004941859029405353, | |
| "loss": 2.4557, | |
| "loss_layer_12_head": 0.5529354214668274, | |
| "loss_layer_18_head": 0.43304672837257385, | |
| "loss_layer_24_head": 0.2537849247455597, | |
| "loss_layer_30_head": 0.17200490832328796, | |
| "loss_layer_36_head": 0.15178406238555908, | |
| "loss_layer_42_head": 0.0801372081041336, | |
| "loss_layer_6_head": 0.8668233156204224, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 15.726495726495726, | |
| "grad_norm": 0.9418340108215619, | |
| "learning_rate": 0.0049373197804545585, | |
| "loss": 2.4453, | |
| "loss_layer_12_head": 0.5447560548782349, | |
| "loss_layer_18_head": 0.43586069345474243, | |
| "loss_layer_24_head": 0.2588791251182556, | |
| "loss_layer_30_head": 0.17809347808361053, | |
| "loss_layer_36_head": 0.15249352157115936, | |
| "loss_layer_42_head": 0.08157460391521454, | |
| "loss_layer_6_head": 0.8784462213516235, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 15.954415954415955, | |
| "grad_norm": 0.9047489961689316, | |
| "learning_rate": 0.004932612176449559, | |
| "loss": 2.4695, | |
| "loss_layer_12_head": 0.5187823176383972, | |
| "loss_layer_18_head": 0.41048678755760193, | |
| "loss_layer_24_head": 0.2688165605068207, | |
| "loss_layer_30_head": 0.1661987006664276, | |
| "loss_layer_36_head": 0.13160857558250427, | |
| "loss_layer_42_head": 0.07198281586170197, | |
| "loss_layer_6_head": 0.835620105266571, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 16.182336182336183, | |
| "grad_norm": 0.8657986887322702, | |
| "learning_rate": 0.004927736542562676, | |
| "loss": 2.4342, | |
| "loss_layer_12_head": 0.5429137349128723, | |
| "loss_layer_18_head": 0.4332647919654846, | |
| "loss_layer_24_head": 0.27726346254348755, | |
| "loss_layer_30_head": 0.17736102640628815, | |
| "loss_layer_36_head": 0.11637071520090103, | |
| "loss_layer_42_head": 0.06702034920454025, | |
| "loss_layer_6_head": 0.8331397771835327, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 16.41025641025641, | |
| "grad_norm": 0.9425666175764251, | |
| "learning_rate": 0.004922693215572695, | |
| "loss": 2.4022, | |
| "loss_layer_12_head": 0.510245680809021, | |
| "loss_layer_18_head": 0.42118319869041443, | |
| "loss_layer_24_head": 0.2536671757698059, | |
| "loss_layer_30_head": 0.18339803814888, | |
| "loss_layer_36_head": 0.1120036393404007, | |
| "loss_layer_42_head": 0.06707726418972015, | |
| "loss_layer_6_head": 0.8400261998176575, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 16.63817663817664, | |
| "grad_norm": 0.8077761031458355, | |
| "learning_rate": 0.004917482543841618, | |
| "loss": 2.3971, | |
| "loss_layer_12_head": 0.5046784281730652, | |
| "loss_layer_18_head": 0.4156663417816162, | |
| "loss_layer_24_head": 0.24197836220264435, | |
| "loss_layer_30_head": 0.17227289080619812, | |
| "loss_layer_36_head": 0.10577581822872162, | |
| "loss_layer_42_head": 0.08296042680740356, | |
| "loss_layer_6_head": 0.8249191045761108, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 16.866096866096868, | |
| "grad_norm": 0.8166472620009376, | |
| "learning_rate": 0.004912104887290587, | |
| "loss": 2.3838, | |
| "loss_layer_12_head": 0.5049887895584106, | |
| "loss_layer_18_head": 0.4009923040866852, | |
| "loss_layer_24_head": 0.24655649065971375, | |
| "loss_layer_30_head": 0.16778674721717834, | |
| "loss_layer_36_head": 0.10913994163274765, | |
| "loss_layer_42_head": 0.0756555050611496, | |
| "loss_layer_6_head": 0.8108115196228027, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 17.094017094017094, | |
| "grad_norm": 0.7008030273100762, | |
| "learning_rate": 0.0049065606173750295, | |
| "loss": 2.323, | |
| "loss_layer_12_head": 0.5143665671348572, | |
| "loss_layer_18_head": 0.42088404297828674, | |
| "loss_layer_24_head": 0.2502315938472748, | |
| "loss_layer_30_head": 0.17210659384727478, | |
| "loss_layer_36_head": 0.10463279485702515, | |
| "loss_layer_42_head": 0.07540477812290192, | |
| "loss_layer_6_head": 0.8134105801582336, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 17.321937321937323, | |
| "grad_norm": 1.0537921241381873, | |
| "learning_rate": 0.004900850117059, | |
| "loss": 2.3176, | |
| "loss_layer_12_head": 0.5064232349395752, | |
| "loss_layer_18_head": 0.4487612247467041, | |
| "loss_layer_24_head": 0.23967449367046356, | |
| "loss_layer_30_head": 0.15723897516727448, | |
| "loss_layer_36_head": 0.09650365263223648, | |
| "loss_layer_42_head": 0.06906923651695251, | |
| "loss_layer_6_head": 0.779255747795105, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 17.54985754985755, | |
| "grad_norm": 1.2282578627276648, | |
| "learning_rate": 0.004894973780788722, | |
| "loss": 2.5449, | |
| "loss_layer_12_head": 0.5091427564620972, | |
| "loss_layer_18_head": 0.5503803491592407, | |
| "loss_layer_24_head": 0.24137809872627258, | |
| "loss_layer_30_head": 0.16684108972549438, | |
| "loss_layer_36_head": 0.18911299109458923, | |
| "loss_layer_42_head": 0.07031677663326263, | |
| "loss_layer_6_head": 0.8027304410934448, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 17.77777777777778, | |
| "grad_norm": 1.0765867107263354, | |
| "learning_rate": 0.004888932014465352, | |
| "loss": 2.6286, | |
| "loss_layer_12_head": 0.5079230070114136, | |
| "loss_layer_18_head": 0.5281995534896851, | |
| "loss_layer_24_head": 0.2384219467639923, | |
| "loss_layer_30_head": 0.18661737442016602, | |
| "loss_layer_36_head": 0.2386232614517212, | |
| "loss_layer_42_head": 0.06579138338565826, | |
| "loss_layer_6_head": 0.8466068506240845, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 18.005698005698004, | |
| "grad_norm": 1.3603813767101454, | |
| "learning_rate": 0.0048827252354169326, | |
| "loss": 2.6561, | |
| "loss_layer_12_head": 0.5154815912246704, | |
| "loss_layer_18_head": 0.4759330749511719, | |
| "loss_layer_24_head": 0.23246505856513977, | |
| "loss_layer_30_head": 0.17445430159568787, | |
| "loss_layer_36_head": 0.18358901143074036, | |
| "loss_layer_42_head": 0.07926751673221588, | |
| "loss_layer_6_head": 0.9083824157714844, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 18.233618233618234, | |
| "grad_norm": 1.0979057750891, | |
| "learning_rate": 0.004876353872369572, | |
| "loss": 2.736, | |
| "loss_layer_12_head": 0.4919304847717285, | |
| "loss_layer_18_head": 0.4315822124481201, | |
| "loss_layer_24_head": 0.47708845138549805, | |
| "loss_layer_30_head": 0.16244928538799286, | |
| "loss_layer_36_head": 0.16019582748413086, | |
| "loss_layer_42_head": 0.10476633161306381, | |
| "loss_layer_6_head": 0.8798778653144836, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 18.233618233618234, | |
| "eval_loss": 4.721885681152344, | |
| "eval_loss_layer_12_head": 0.8316348195075989, | |
| "eval_loss_layer_18_head": 0.7489945292472839, | |
| "eval_loss_layer_24_head": 1.0868966579437256, | |
| "eval_loss_layer_30_head": 0.32381412386894226, | |
| "eval_loss_layer_36_head": 0.26655933260917664, | |
| "eval_loss_layer_42_head": 0.17233143746852875, | |
| "eval_loss_layer_6_head": 1.2158176898956299, | |
| "eval_runtime": 4.9318, | |
| "eval_samples_per_second": 6.691, | |
| "eval_steps_per_second": 0.608, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 18.46153846153846, | |
| "grad_norm": 1.4771278422236476, | |
| "learning_rate": 0.004869818365417829, | |
| "loss": 2.9782, | |
| "loss_layer_12_head": 0.5835316181182861, | |
| "loss_layer_18_head": 0.41122907400131226, | |
| "loss_layer_24_head": 0.703028678894043, | |
| "loss_layer_30_head": 0.16457121074199677, | |
| "loss_layer_36_head": 0.1471795290708542, | |
| "loss_layer_42_head": 0.09325657039880753, | |
| "loss_layer_6_head": 0.8624021410942078, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 18.68945868945869, | |
| "grad_norm": 1.0091974995950315, | |
| "learning_rate": 0.004863119165994312, | |
| "loss": 2.9112, | |
| "loss_layer_12_head": 0.670836329460144, | |
| "loss_layer_18_head": 0.4119151532649994, | |
| "loss_layer_24_head": 0.5785427093505859, | |
| "loss_layer_30_head": 0.16393141448497772, | |
| "loss_layer_36_head": 0.14178720116615295, | |
| "loss_layer_42_head": 0.08272372931241989, | |
| "loss_layer_6_head": 0.8412971496582031, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 18.91737891737892, | |
| "grad_norm": 0.6656147383639003, | |
| "learning_rate": 0.004856256736838498, | |
| "loss": 2.747, | |
| "loss_layer_12_head": 0.6682761311531067, | |
| "loss_layer_18_head": 0.411543607711792, | |
| "loss_layer_24_head": 0.4885958731174469, | |
| "loss_layer_30_head": 0.16745702922344208, | |
| "loss_layer_36_head": 0.12891222536563873, | |
| "loss_layer_42_head": 0.07578006386756897, | |
| "loss_layer_6_head": 0.8457491993904114, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 19.145299145299145, | |
| "grad_norm": 0.5054801842815198, | |
| "learning_rate": 0.0048492315519647715, | |
| "loss": 2.4882, | |
| "loss_layer_12_head": 0.5891987085342407, | |
| "loss_layer_18_head": 0.3886004090309143, | |
| "loss_layer_24_head": 0.42113596200942993, | |
| "loss_layer_30_head": 0.15418270230293274, | |
| "loss_layer_36_head": 0.11842542886734009, | |
| "loss_layer_42_head": 0.068417988717556, | |
| "loss_layer_6_head": 0.7943364381790161, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 19.373219373219374, | |
| "grad_norm": 0.4952359696534927, | |
| "learning_rate": 0.0048420440966296776, | |
| "loss": 2.376, | |
| "loss_layer_12_head": 0.5409587621688843, | |
| "loss_layer_18_head": 0.38072651624679565, | |
| "loss_layer_24_head": 0.37133434414863586, | |
| "loss_layer_30_head": 0.15104272961616516, | |
| "loss_layer_36_head": 0.11313710361719131, | |
| "loss_layer_42_head": 0.06553231179714203, | |
| "loss_layer_6_head": 0.7570014595985413, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 19.6011396011396, | |
| "grad_norm": 0.7006459133032029, | |
| "learning_rate": 0.004834694867298409, | |
| "loss": 2.3375, | |
| "loss_layer_12_head": 0.5223572254180908, | |
| "loss_layer_18_head": 0.38262003660202026, | |
| "loss_layer_24_head": 0.34287339448928833, | |
| "loss_layer_30_head": 0.17433951795101166, | |
| "loss_layer_36_head": 0.11228135973215103, | |
| "loss_layer_42_head": 0.06561323255300522, | |
| "loss_layer_6_head": 0.7609976530075073, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 19.82905982905983, | |
| "grad_norm": 0.5491943779867444, | |
| "learning_rate": 0.004827184371610511, | |
| "loss": 2.3932, | |
| "loss_layer_12_head": 0.4984654486179352, | |
| "loss_layer_18_head": 0.37037283182144165, | |
| "loss_layer_24_head": 0.3063350319862366, | |
| "loss_layer_30_head": 0.23311960697174072, | |
| "loss_layer_36_head": 0.10572105646133423, | |
| "loss_layer_42_head": 0.06688721477985382, | |
| "loss_layer_6_head": 0.7638763785362244, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 20.056980056980056, | |
| "grad_norm": 0.5945511861137925, | |
| "learning_rate": 0.004819513128344813, | |
| "loss": 2.3253, | |
| "loss_layer_12_head": 0.49806079268455505, | |
| "loss_layer_18_head": 0.38933131098747253, | |
| "loss_layer_24_head": 0.29588833451271057, | |
| "loss_layer_30_head": 0.21775169670581818, | |
| "loss_layer_36_head": 0.11467882245779037, | |
| "loss_layer_42_head": 0.06367787718772888, | |
| "loss_layer_6_head": 0.7606059312820435, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 20.284900284900285, | |
| "grad_norm": 1.131592778867395, | |
| "learning_rate": 0.004811681667383604, | |
| "loss": 2.2407, | |
| "loss_layer_12_head": 0.47209444642066956, | |
| "loss_layer_18_head": 0.3742205798625946, | |
| "loss_layer_24_head": 0.27469268441200256, | |
| "loss_layer_30_head": 0.19076983630657196, | |
| "loss_layer_36_head": 0.10397826135158539, | |
| "loss_layer_42_head": 0.06761151552200317, | |
| "loss_layer_6_head": 0.7645944356918335, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 20.51282051282051, | |
| "grad_norm": 0.7245636198542537, | |
| "learning_rate": 0.004803690529676019, | |
| "loss": 2.2039, | |
| "loss_layer_12_head": 0.4624738097190857, | |
| "loss_layer_18_head": 0.36616355180740356, | |
| "loss_layer_24_head": 0.25559544563293457, | |
| "loss_layer_30_head": 0.17066636681556702, | |
| "loss_layer_36_head": 0.10047119855880737, | |
| "loss_layer_42_head": 0.0659947544336319, | |
| "loss_layer_6_head": 0.770186185836792, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 20.74074074074074, | |
| "grad_norm": 0.8633596477338317, | |
| "learning_rate": 0.004795540267200685, | |
| "loss": 2.2908, | |
| "loss_layer_12_head": 0.48264575004577637, | |
| "loss_layer_18_head": 0.40934857726097107, | |
| "loss_layer_24_head": 0.2605716288089752, | |
| "loss_layer_30_head": 0.16646702587604523, | |
| "loss_layer_36_head": 0.11609245836734772, | |
| "loss_layer_42_head": 0.06701932847499847, | |
| "loss_layer_6_head": 0.8158668279647827, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 20.96866096866097, | |
| "grad_norm": 0.8163615583946534, | |
| "learning_rate": 0.004787231442927586, | |
| "loss": 2.295, | |
| "loss_layer_12_head": 0.4884544014930725, | |
| "loss_layer_18_head": 0.3945980668067932, | |
| "loss_layer_24_head": 0.2505979835987091, | |
| "loss_layer_30_head": 0.15824952721595764, | |
| "loss_layer_36_head": 0.11044065654277802, | |
| "loss_layer_42_head": 0.06734587997198105, | |
| "loss_layer_6_head": 0.8474823236465454, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 21.196581196581196, | |
| "grad_norm": 0.9965972018686945, | |
| "learning_rate": 0.004778764630779183, | |
| "loss": 2.2109, | |
| "loss_layer_12_head": 0.45411452651023865, | |
| "loss_layer_18_head": 0.3743075430393219, | |
| "loss_layer_24_head": 0.23483021557331085, | |
| "loss_layer_30_head": 0.1525890976190567, | |
| "loss_layer_36_head": 0.10491526126861572, | |
| "loss_layer_42_head": 0.06145526096224785, | |
| "loss_layer_6_head": 0.8141587972640991, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 21.424501424501425, | |
| "grad_norm": 0.9579834031557086, | |
| "learning_rate": 0.004770140415590762, | |
| "loss": 2.2449, | |
| "loss_layer_12_head": 0.4716685712337494, | |
| "loss_layer_18_head": 0.4065755307674408, | |
| "loss_layer_24_head": 0.23640041053295135, | |
| "loss_layer_30_head": 0.15385356545448303, | |
| "loss_layer_36_head": 0.10922032594680786, | |
| "loss_layer_42_head": 0.07777702063322067, | |
| "loss_layer_6_head": 0.8294760584831238, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 21.65242165242165, | |
| "grad_norm": 1.0807303975242237, | |
| "learning_rate": 0.0047613593930700485, | |
| "loss": 2.3935, | |
| "loss_layer_12_head": 0.5475557446479797, | |
| "loss_layer_18_head": 0.46753430366516113, | |
| "loss_layer_24_head": 0.2278400957584381, | |
| "loss_layer_30_head": 0.14627912640571594, | |
| "loss_layer_36_head": 0.10156891494989395, | |
| "loss_layer_42_head": 0.06986488401889801, | |
| "loss_layer_6_head": 0.7917695045471191, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 21.88034188034188, | |
| "grad_norm": 0.8264421749483553, | |
| "learning_rate": 0.004752422169756048, | |
| "loss": 2.4382, | |
| "loss_layer_12_head": 0.6330815553665161, | |
| "loss_layer_18_head": 0.48085513710975647, | |
| "loss_layer_24_head": 0.2386564314365387, | |
| "loss_layer_30_head": 0.15691904723644257, | |
| "loss_layer_36_head": 0.10863230377435684, | |
| "loss_layer_42_head": 0.07321296632289886, | |
| "loss_layer_6_head": 0.7897067070007324, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 22.108262108262107, | |
| "grad_norm": 0.6980195048286758, | |
| "learning_rate": 0.00474332936297716, | |
| "loss": 2.389, | |
| "loss_layer_12_head": 0.671896755695343, | |
| "loss_layer_18_head": 0.44162482023239136, | |
| "loss_layer_24_head": 0.22745053470134735, | |
| "loss_layer_30_head": 0.14352098107337952, | |
| "loss_layer_36_head": 0.09909910708665848, | |
| "loss_layer_42_head": 0.06609858572483063, | |
| "loss_layer_6_head": 0.7521198391914368, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 22.336182336182336, | |
| "grad_norm": 0.8316424914651728, | |
| "learning_rate": 0.004734081600808531, | |
| "loss": 2.3102, | |
| "loss_layer_12_head": 0.5933720469474792, | |
| "loss_layer_18_head": 0.3834800124168396, | |
| "loss_layer_24_head": 0.20532111823558807, | |
| "loss_layer_30_head": 0.13171999156475067, | |
| "loss_layer_36_head": 0.08968774229288101, | |
| "loss_layer_42_head": 0.07913189381361008, | |
| "loss_layer_6_head": 0.7101460099220276, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 22.564102564102566, | |
| "grad_norm": 0.8957636763389422, | |
| "learning_rate": 0.004724679522028672, | |
| "loss": 2.24, | |
| "loss_layer_12_head": 0.5700436234474182, | |
| "loss_layer_18_head": 0.3944259285926819, | |
| "loss_layer_24_head": 0.21593594551086426, | |
| "loss_layer_30_head": 0.13537552952766418, | |
| "loss_layer_36_head": 0.09649913012981415, | |
| "loss_layer_42_head": 0.07921244949102402, | |
| "loss_layer_6_head": 0.7655854821205139, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 22.79202279202279, | |
| "grad_norm": 0.6557657532360657, | |
| "learning_rate": 0.004715123776075337, | |
| "loss": 2.2258, | |
| "loss_layer_12_head": 0.5300887823104858, | |
| "loss_layer_18_head": 0.3781338632106781, | |
| "loss_layer_24_head": 0.20915941894054413, | |
| "loss_layer_30_head": 0.13727469742298126, | |
| "loss_layer_36_head": 0.08927856385707855, | |
| "loss_layer_42_head": 0.07049344480037689, | |
| "loss_layer_6_head": 0.752504289150238, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 23.01994301994302, | |
| "grad_norm": 0.7830740281704984, | |
| "learning_rate": 0.0047054150230006605, | |
| "loss": 2.2806, | |
| "loss_layer_12_head": 0.5240658521652222, | |
| "loss_layer_18_head": 0.38662710785865784, | |
| "loss_layer_24_head": 0.22086063027381897, | |
| "loss_layer_30_head": 0.19454047083854675, | |
| "loss_layer_36_head": 0.09448827058076859, | |
| "loss_layer_42_head": 0.06740959733724594, | |
| "loss_layer_6_head": 0.7966665029525757, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 23.247863247863247, | |
| "grad_norm": 0.5579028205723976, | |
| "learning_rate": 0.004695553933425571, | |
| "loss": 2.1436, | |
| "loss_layer_12_head": 0.4735882878303528, | |
| "loss_layer_18_head": 0.36018824577331543, | |
| "loss_layer_24_head": 0.20643365383148193, | |
| "loss_layer_30_head": 0.17400547862052917, | |
| "loss_layer_36_head": 0.0954669862985611, | |
| "loss_layer_42_head": 0.06432937830686569, | |
| "loss_layer_6_head": 0.731139063835144, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 23.475783475783476, | |
| "grad_norm": 0.552522051234654, | |
| "learning_rate": 0.004685541188493464, | |
| "loss": 2.0865, | |
| "loss_layer_12_head": 0.46377283334732056, | |
| "loss_layer_18_head": 0.3589307963848114, | |
| "loss_layer_24_head": 0.20729784667491913, | |
| "loss_layer_30_head": 0.15942244231700897, | |
| "loss_layer_36_head": 0.0918920561671257, | |
| "loss_layer_42_head": 0.059663545340299606, | |
| "loss_layer_6_head": 0.7291909456253052, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 23.703703703703702, | |
| "grad_norm": 0.529577925725112, | |
| "learning_rate": 0.004675377479823153, | |
| "loss": 2.1213, | |
| "loss_layer_12_head": 0.4666662812232971, | |
| "loss_layer_18_head": 0.37829747796058655, | |
| "loss_layer_24_head": 0.20896704494953156, | |
| "loss_layer_30_head": 0.15137134492397308, | |
| "loss_layer_36_head": 0.10702455043792725, | |
| "loss_layer_42_head": 0.060252584517002106, | |
| "loss_layer_6_head": 0.74061518907547, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 23.931623931623932, | |
| "grad_norm": 0.7782049747581867, | |
| "learning_rate": 0.004665063509461097, | |
| "loss": 2.1262, | |
| "loss_layer_12_head": 0.462003231048584, | |
| "loss_layer_18_head": 0.37864863872528076, | |
| "loss_layer_24_head": 0.20960676670074463, | |
| "loss_layer_30_head": 0.1473332941532135, | |
| "loss_layer_36_head": 0.09850483387708664, | |
| "loss_layer_42_head": 0.05810556560754776, | |
| "loss_layer_6_head": 0.7602113485336304, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 24.159544159544158, | |
| "grad_norm": 0.6360759879888129, | |
| "learning_rate": 0.00465459998983291, | |
| "loss": 2.0674, | |
| "loss_layer_12_head": 0.44862040877342224, | |
| "loss_layer_18_head": 0.3798063099384308, | |
| "loss_layer_24_head": 0.20599885284900665, | |
| "loss_layer_30_head": 0.1415323168039322, | |
| "loss_layer_36_head": 0.09703875333070755, | |
| "loss_layer_42_head": 0.05768008157610893, | |
| "loss_layer_6_head": 0.7395282983779907, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 24.387464387464387, | |
| "grad_norm": 1.0659611300691207, | |
| "learning_rate": 0.004643987643694149, | |
| "loss": 2.1265, | |
| "loss_layer_12_head": 0.46460556983947754, | |
| "loss_layer_18_head": 0.4132619798183441, | |
| "loss_layer_24_head": 0.21071580052375793, | |
| "loss_layer_30_head": 0.1440211832523346, | |
| "loss_layer_36_head": 0.09940258413553238, | |
| "loss_layer_42_head": 0.08096981793642044, | |
| "loss_layer_6_head": 0.7652676105499268, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 24.615384615384617, | |
| "grad_norm": 0.932438216545257, | |
| "learning_rate": 0.004633227204080389, | |
| "loss": 2.1919, | |
| "loss_layer_12_head": 0.46289700269699097, | |
| "loss_layer_18_head": 0.3990364670753479, | |
| "loss_layer_24_head": 0.20558461546897888, | |
| "loss_layer_30_head": 0.13726839423179626, | |
| "loss_layer_36_head": 0.09494920074939728, | |
| "loss_layer_42_head": 0.07333710789680481, | |
| "loss_layer_6_head": 0.8040269017219543, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 24.843304843304843, | |
| "grad_norm": 1.0248927444527722, | |
| "learning_rate": 0.004622319414256594, | |
| "loss": 2.2581, | |
| "loss_layer_12_head": 0.4594665467739105, | |
| "loss_layer_18_head": 0.387908935546875, | |
| "loss_layer_24_head": 0.21216896176338196, | |
| "loss_layer_30_head": 0.13931572437286377, | |
| "loss_layer_36_head": 0.09683094173669815, | |
| "loss_layer_42_head": 0.0691598504781723, | |
| "loss_layer_6_head": 0.872686505317688, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 25.071225071225072, | |
| "grad_norm": 0.8098869912341418, | |
| "learning_rate": 0.00461126502766577, | |
| "loss": 2.2181, | |
| "loss_layer_12_head": 0.45522379875183105, | |
| "loss_layer_18_head": 0.37695351243019104, | |
| "loss_layer_24_head": 0.20662946999073029, | |
| "loss_layer_30_head": 0.13180062174797058, | |
| "loss_layer_36_head": 0.10556745529174805, | |
| "loss_layer_42_head": 0.0616777129471302, | |
| "loss_layer_6_head": 0.8408571481704712, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 25.299145299145298, | |
| "grad_norm": 0.6911311577236631, | |
| "learning_rate": 0.0046000648078769295, | |
| "loss": 2.1218, | |
| "loss_layer_12_head": 0.47091466188430786, | |
| "loss_layer_18_head": 0.37433698773384094, | |
| "loss_layer_24_head": 0.21443676948547363, | |
| "loss_layer_30_head": 0.15783022344112396, | |
| "loss_layer_36_head": 0.10397765785455704, | |
| "loss_layer_42_head": 0.06332050263881683, | |
| "loss_layer_6_head": 0.81425541639328, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 25.527065527065528, | |
| "grad_norm": 0.5483359227112778, | |
| "learning_rate": 0.004588719528532341, | |
| "loss": 2.0836, | |
| "loss_layer_12_head": 0.4457060396671295, | |
| "loss_layer_18_head": 0.35422247648239136, | |
| "loss_layer_24_head": 0.2071726769208908, | |
| "loss_layer_30_head": 0.1477786749601364, | |
| "loss_layer_36_head": 0.10025700181722641, | |
| "loss_layer_42_head": 0.0617125928401947, | |
| "loss_layer_6_head": 0.7575004696846008, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 25.754985754985753, | |
| "grad_norm": 0.5184834946968752, | |
| "learning_rate": 0.004577229973294099, | |
| "loss": 2.0984, | |
| "loss_layer_12_head": 0.46223968267440796, | |
| "loss_layer_18_head": 0.39067524671554565, | |
| "loss_layer_24_head": 0.212998628616333, | |
| "loss_layer_30_head": 0.14908693730831146, | |
| "loss_layer_36_head": 0.09892690926790237, | |
| "loss_layer_42_head": 0.05941791459918022, | |
| "loss_layer_6_head": 0.760570764541626, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 25.982905982905983, | |
| "grad_norm": 0.551240787416797, | |
| "learning_rate": 0.004565596935789987, | |
| "loss": 2.1022, | |
| "loss_layer_12_head": 0.46231111884117126, | |
| "loss_layer_18_head": 0.38874131441116333, | |
| "loss_layer_24_head": 0.21234926581382751, | |
| "loss_layer_30_head": 0.14975954592227936, | |
| "loss_layer_36_head": 0.09737460315227509, | |
| "loss_layer_42_head": 0.060064684599637985, | |
| "loss_layer_6_head": 0.738222599029541, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 26.210826210826212, | |
| "grad_norm": 0.6796340352770581, | |
| "learning_rate": 0.004553821219558661, | |
| "loss": 2.0061, | |
| "loss_layer_12_head": 0.46215319633483887, | |
| "loss_layer_18_head": 0.3686721920967102, | |
| "loss_layer_24_head": 0.2038687914609909, | |
| "loss_layer_30_head": 0.1499582827091217, | |
| "loss_layer_36_head": 0.08974792063236237, | |
| "loss_layer_42_head": 0.06425337493419647, | |
| "loss_layer_6_head": 0.7155905961990356, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 26.43874643874644, | |
| "grad_norm": 0.8866787700250957, | |
| "learning_rate": 0.004541903637994142, | |
| "loss": 2.0603, | |
| "loss_layer_12_head": 0.44424566626548767, | |
| "loss_layer_18_head": 0.3662102520465851, | |
| "loss_layer_24_head": 0.20653650164604187, | |
| "loss_layer_30_head": 0.14670081436634064, | |
| "loss_layer_36_head": 0.10450087487697601, | |
| "loss_layer_42_head": 0.06536010652780533, | |
| "loss_layer_6_head": 0.7095087170600891, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 26.666666666666668, | |
| "grad_norm": 0.6486151029335254, | |
| "learning_rate": 0.004529845014289642, | |
| "loss": 2.0393, | |
| "loss_layer_12_head": 0.4427577555179596, | |
| "loss_layer_18_head": 0.36848098039627075, | |
| "loss_layer_24_head": 0.20641179382801056, | |
| "loss_layer_30_head": 0.14615444839000702, | |
| "loss_layer_36_head": 0.10037078708410263, | |
| "loss_layer_42_head": 0.06394441425800323, | |
| "loss_layer_6_head": 0.7171773910522461, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 26.894586894586894, | |
| "grad_norm": 0.8622308299226761, | |
| "learning_rate": 0.00451764618138069, | |
| "loss": 2.101, | |
| "loss_layer_12_head": 0.47314929962158203, | |
| "loss_layer_18_head": 0.3822045922279358, | |
| "loss_layer_24_head": 0.2136635035276413, | |
| "loss_layer_30_head": 0.14691603183746338, | |
| "loss_layer_36_head": 0.11097769439220428, | |
| "loss_layer_42_head": 0.06998325139284134, | |
| "loss_layer_6_head": 0.7319896221160889, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 27.122507122507123, | |
| "grad_norm": 0.7390178601634367, | |
| "learning_rate": 0.0045053079818876095, | |
| "loss": 2.0397, | |
| "loss_layer_12_head": 0.44820213317871094, | |
| "loss_layer_18_head": 0.3551773428916931, | |
| "loss_layer_24_head": 0.20566609501838684, | |
| "loss_layer_30_head": 0.1313147395849228, | |
| "loss_layer_36_head": 0.09519216418266296, | |
| "loss_layer_42_head": 0.0580265149474144, | |
| "loss_layer_6_head": 0.7102463245391846, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 27.35042735042735, | |
| "grad_norm": 0.8258874084634017, | |
| "learning_rate": 0.0044928312680573065, | |
| "loss": 2.0128, | |
| "loss_layer_12_head": 0.43636664748191833, | |
| "loss_layer_18_head": 0.34100016951560974, | |
| "loss_layer_24_head": 0.2000311315059662, | |
| "loss_layer_30_head": 0.14223387837409973, | |
| "loss_layer_36_head": 0.09040534496307373, | |
| "loss_layer_42_head": 0.0646161437034607, | |
| "loss_layer_6_head": 0.713534414768219, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 27.35042735042735, | |
| "eval_loss": 3.8952760696411133, | |
| "eval_loss_layer_12_head": 0.8030363917350769, | |
| "eval_loss_layer_18_head": 0.7230358719825745, | |
| "eval_loss_layer_24_head": 0.44514450430870056, | |
| "eval_loss_layer_30_head": 0.349967360496521, | |
| "eval_loss_layer_36_head": 0.2027323693037033, | |
| "eval_loss_layer_42_head": 0.14586174488067627, | |
| "eval_loss_layer_6_head": 1.159847617149353, | |
| "eval_runtime": 4.9499, | |
| "eval_samples_per_second": 6.667, | |
| "eval_steps_per_second": 0.606, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 27.57834757834758, | |
| "grad_norm": 0.6217870505507969, | |
| "learning_rate": 0.004480216901704406, | |
| "loss": 2.0692, | |
| "loss_layer_12_head": 0.46979862451553345, | |
| "loss_layer_18_head": 0.35027259588241577, | |
| "loss_layer_24_head": 0.2092142403125763, | |
| "loss_layer_30_head": 0.1553209125995636, | |
| "loss_layer_36_head": 0.09953723847866058, | |
| "loss_layer_42_head": 0.05826183035969734, | |
| "loss_layer_6_head": 0.7447252869606018, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 27.806267806267805, | |
| "grad_norm": 1.3117503463089188, | |
| "learning_rate": 0.004467465754151723, | |
| "loss": 2.1562, | |
| "loss_layer_12_head": 0.446768581867218, | |
| "loss_layer_18_head": 0.40118208527565, | |
| "loss_layer_24_head": 0.20322206616401672, | |
| "loss_layer_30_head": 0.14313673973083496, | |
| "loss_layer_36_head": 0.09695640206336975, | |
| "loss_layer_42_head": 0.0757862776517868, | |
| "loss_layer_6_head": 0.7276732921600342, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 28.034188034188034, | |
| "grad_norm": 0.725245663795549, | |
| "learning_rate": 0.0044545787061700745, | |
| "loss": 7.8264, | |
| "loss_layer_12_head": 0.46167078614234924, | |
| "loss_layer_18_head": 6.237041473388672, | |
| "loss_layer_24_head": 0.2191043347120285, | |
| "loss_layer_30_head": 0.14354541897773743, | |
| "loss_layer_36_head": 0.09405554085969925, | |
| "loss_layer_42_head": 0.07178852707147598, | |
| "loss_layer_6_head": 0.749311625957489, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 28.262108262108264, | |
| "grad_norm": 0.6454953978078362, | |
| "learning_rate": 0.004441556647917446, | |
| "loss": 7.5221, | |
| "loss_layer_12_head": 0.4319098889827728, | |
| "loss_layer_18_head": 5.769500255584717, | |
| "loss_layer_24_head": 0.21817514300346375, | |
| "loss_layer_30_head": 0.13674825429916382, | |
| "loss_layer_36_head": 0.09862224757671356, | |
| "loss_layer_42_head": 0.0651412308216095, | |
| "loss_layer_6_head": 0.7003488540649414, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 28.49002849002849, | |
| "grad_norm": 0.5903394788169665, | |
| "learning_rate": 0.004428400478877499, | |
| "loss": 6.8023, | |
| "loss_layer_12_head": 0.44873490929603577, | |
| "loss_layer_18_head": 5.282101631164551, | |
| "loss_layer_24_head": 0.216207355260849, | |
| "loss_layer_30_head": 0.13955923914909363, | |
| "loss_layer_36_head": 0.09675300121307373, | |
| "loss_layer_42_head": 0.06318524479866028, | |
| "loss_layer_6_head": 0.7136842012405396, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 28.71794871794872, | |
| "grad_norm": 0.798968656911552, | |
| "learning_rate": 0.004415111107797445, | |
| "loss": 6.3915, | |
| "loss_layer_12_head": 0.4504212737083435, | |
| "loss_layer_18_head": 4.649975776672363, | |
| "loss_layer_24_head": 0.21879024803638458, | |
| "loss_layer_30_head": 0.15180036425590515, | |
| "loss_layer_36_head": 0.10117790848016739, | |
| "loss_layer_42_head": 0.06038238853216171, | |
| "loss_layer_6_head": 0.7129833102226257, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 28.945868945868945, | |
| "grad_norm": 0.5813942239901363, | |
| "learning_rate": 0.004401689452625272, | |
| "loss": 6.0419, | |
| "loss_layer_12_head": 0.4498574733734131, | |
| "loss_layer_18_head": 4.355905532836914, | |
| "loss_layer_24_head": 0.20642943680286407, | |
| "loss_layer_30_head": 0.1372959315776825, | |
| "loss_layer_36_head": 0.10397912561893463, | |
| "loss_layer_42_head": 0.055586397647857666, | |
| "loss_layer_6_head": 0.7128573656082153, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 29.173789173789174, | |
| "grad_norm": 0.7269276575425652, | |
| "learning_rate": 0.004388136440446337, | |
| "loss": 5.7069, | |
| "loss_layer_12_head": 0.4492161273956299, | |
| "loss_layer_18_head": 4.068719387054443, | |
| "loss_layer_24_head": 0.20855550467967987, | |
| "loss_layer_30_head": 0.1389748752117157, | |
| "loss_layer_36_head": 0.10276387631893158, | |
| "loss_layer_42_head": 0.05752943828701973, | |
| "loss_layer_6_head": 0.7084957957267761, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 29.4017094017094, | |
| "grad_norm": 0.7160396858444902, | |
| "learning_rate": 0.0043744530074193355, | |
| "loss": 5.4769, | |
| "loss_layer_12_head": 0.43505144119262695, | |
| "loss_layer_18_head": 3.8150742053985596, | |
| "loss_layer_24_head": 0.20362111926078796, | |
| "loss_layer_30_head": 0.13056820631027222, | |
| "loss_layer_36_head": 0.09140172600746155, | |
| "loss_layer_42_head": 0.05409618094563484, | |
| "loss_layer_6_head": 0.6898916959762573, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 29.62962962962963, | |
| "grad_norm": 0.9710847170559996, | |
| "learning_rate": 0.004360640098711629, | |
| "loss": 5.3281, | |
| "loss_layer_12_head": 0.4550530016422272, | |
| "loss_layer_18_head": 3.626317262649536, | |
| "loss_layer_24_head": 0.20697829127311707, | |
| "loss_layer_30_head": 0.14250314235687256, | |
| "loss_layer_36_head": 0.09626954793930054, | |
| "loss_layer_42_head": 0.08632789552211761, | |
| "loss_layer_6_head": 0.7261134386062622, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 29.85754985754986, | |
| "grad_norm": 0.7802876477742213, | |
| "learning_rate": 0.004346698668433964, | |
| "loss": 5.1986, | |
| "loss_layer_12_head": 0.4772264063358307, | |
| "loss_layer_18_head": 3.4601082801818848, | |
| "loss_layer_24_head": 0.20182755589485168, | |
| "loss_layer_30_head": 0.13202452659606934, | |
| "loss_layer_36_head": 0.08688151091337204, | |
| "loss_layer_42_head": 0.0689874067902565, | |
| "loss_layer_6_head": 0.7271126508712769, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 30.085470085470085, | |
| "grad_norm": 0.8296241303467062, | |
| "learning_rate": 0.004332629679574566, | |
| "loss": 5.0954, | |
| "loss_layer_12_head": 0.5482393503189087, | |
| "loss_layer_18_head": 3.2373409271240234, | |
| "loss_layer_24_head": 0.20368175208568573, | |
| "loss_layer_30_head": 0.1347997486591339, | |
| "loss_layer_36_head": 0.088289774954319, | |
| "loss_layer_42_head": 0.062674880027771, | |
| "loss_layer_6_head": 0.7211120128631592, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 30.313390313390315, | |
| "grad_norm": 0.997608350407498, | |
| "learning_rate": 0.0043184341039326215, | |
| "loss": 4.9774, | |
| "loss_layer_12_head": 0.541628897190094, | |
| "loss_layer_18_head": 3.1904382705688477, | |
| "loss_layer_24_head": 0.1936037689447403, | |
| "loss_layer_30_head": 0.13256603479385376, | |
| "loss_layer_36_head": 0.10135439783334732, | |
| "loss_layer_42_head": 0.059117190539836884, | |
| "loss_layer_6_head": 0.7407132387161255, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 30.54131054131054, | |
| "grad_norm": 0.8596300884902531, | |
| "learning_rate": 0.004304112922051156, | |
| "loss": 4.8952, | |
| "loss_layer_12_head": 0.5269235968589783, | |
| "loss_layer_18_head": 3.068875789642334, | |
| "loss_layer_24_head": 0.20888909697532654, | |
| "loss_layer_30_head": 0.17197871208190918, | |
| "loss_layer_36_head": 0.09767322242259979, | |
| "loss_layer_42_head": 0.05949018523097038, | |
| "loss_layer_6_head": 0.7554638981819153, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 30.76923076923077, | |
| "grad_norm": 0.5476872326854121, | |
| "learning_rate": 0.004289667123149296, | |
| "loss": 4.7437, | |
| "loss_layer_12_head": 0.4962801933288574, | |
| "loss_layer_18_head": 2.9493045806884766, | |
| "loss_layer_24_head": 0.21296346187591553, | |
| "loss_layer_30_head": 0.1855914145708084, | |
| "loss_layer_36_head": 0.09603692591190338, | |
| "loss_layer_42_head": 0.05795098468661308, | |
| "loss_layer_6_head": 0.734602153301239, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 30.997150997150996, | |
| "grad_norm": 0.5685599681812281, | |
| "learning_rate": 0.00427509770505395, | |
| "loss": 4.6172, | |
| "loss_layer_12_head": 0.4792402684688568, | |
| "loss_layer_18_head": 2.841484785079956, | |
| "loss_layer_24_head": 0.20963506400585175, | |
| "loss_layer_30_head": 0.17262789607048035, | |
| "loss_layer_36_head": 0.09381120651960373, | |
| "loss_layer_42_head": 0.056700825691223145, | |
| "loss_layer_6_head": 0.7296031713485718, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 31.225071225071225, | |
| "grad_norm": 0.4938244163515248, | |
| "learning_rate": 0.00426040567413088, | |
| "loss": 4.4604, | |
| "loss_layer_12_head": 0.42545080184936523, | |
| "loss_layer_18_head": 2.7222859859466553, | |
| "loss_layer_24_head": 0.19360534846782684, | |
| "loss_layer_30_head": 0.1514999121427536, | |
| "loss_layer_36_head": 0.08772268146276474, | |
| "loss_layer_42_head": 0.06376411765813828, | |
| "loss_layer_6_head": 0.6847577095031738, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 31.45299145299145, | |
| "grad_norm": 0.5301807150076605, | |
| "learning_rate": 0.004245592045215182, | |
| "loss": 4.3332, | |
| "loss_layer_12_head": 0.42786169052124023, | |
| "loss_layer_18_head": 2.699221134185791, | |
| "loss_layer_24_head": 0.19219763576984406, | |
| "loss_layer_30_head": 0.14631637930870056, | |
| "loss_layer_36_head": 0.08851729333400726, | |
| "loss_layer_42_head": 0.054960232228040695, | |
| "loss_layer_6_head": 0.6882950663566589, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 31.68091168091168, | |
| "grad_norm": 0.4495253512587936, | |
| "learning_rate": 0.004230657841541199, | |
| "loss": 4.2686, | |
| "loss_layer_12_head": 0.4296341836452484, | |
| "loss_layer_18_head": 2.653172254562378, | |
| "loss_layer_24_head": 0.19859598577022552, | |
| "loss_layer_30_head": 0.14528068900108337, | |
| "loss_layer_36_head": 0.08939908444881439, | |
| "loss_layer_42_head": 0.05449342727661133, | |
| "loss_layer_6_head": 0.6941906213760376, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 31.90883190883191, | |
| "grad_norm": 0.7842110163127093, | |
| "learning_rate": 0.004215604094671834, | |
| "loss": 4.2071, | |
| "loss_layer_12_head": 0.4337303042411804, | |
| "loss_layer_18_head": 2.5614027976989746, | |
| "loss_layer_24_head": 0.2046552449464798, | |
| "loss_layer_30_head": 0.13873878121376038, | |
| "loss_layer_36_head": 0.08663885295391083, | |
| "loss_layer_42_head": 0.06045306846499443, | |
| "loss_layer_6_head": 0.7004567384719849, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 32.136752136752136, | |
| "grad_norm": 0.7965796662169216, | |
| "learning_rate": 0.004200431844427298, | |
| "loss": 4.1225, | |
| "loss_layer_12_head": 0.4378797113895416, | |
| "loss_layer_18_head": 2.602318525314331, | |
| "loss_layer_24_head": 0.19969557225704193, | |
| "loss_layer_30_head": 0.1381944864988327, | |
| "loss_layer_36_head": 0.08641330897808075, | |
| "loss_layer_42_head": 0.057823099195957184, | |
| "loss_layer_6_head": 0.7061583995819092, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 32.364672364672366, | |
| "grad_norm": 0.605583079609419, | |
| "learning_rate": 0.004185142138813288, | |
| "loss": 4.1027, | |
| "loss_layer_12_head": 0.42314305901527405, | |
| "loss_layer_18_head": 2.5009102821350098, | |
| "loss_layer_24_head": 0.19693370163440704, | |
| "loss_layer_30_head": 0.1453002393245697, | |
| "loss_layer_36_head": 0.08666378259658813, | |
| "loss_layer_42_head": 0.054904550313949585, | |
| "loss_layer_6_head": 0.7133627533912659, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 32.592592592592595, | |
| "grad_norm": 0.5948001098340642, | |
| "learning_rate": 0.004169736033948593, | |
| "loss": 4.0341, | |
| "loss_layer_12_head": 0.41084012389183044, | |
| "loss_layer_18_head": 2.4099440574645996, | |
| "loss_layer_24_head": 0.18852706253528595, | |
| "loss_layer_30_head": 0.1356324553489685, | |
| "loss_layer_36_head": 0.08520137518644333, | |
| "loss_layer_42_head": 0.05375955253839493, | |
| "loss_layer_6_head": 0.6864619255065918, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 32.82051282051282, | |
| "grad_norm": 0.534772333872558, | |
| "learning_rate": 0.004154214593992149, | |
| "loss": 3.9781, | |
| "loss_layer_12_head": 0.4393290579319, | |
| "loss_layer_18_head": 2.4041430950164795, | |
| "loss_layer_24_head": 0.2018025815486908, | |
| "loss_layer_30_head": 0.14416970312595367, | |
| "loss_layer_36_head": 0.09272973984479904, | |
| "loss_layer_42_head": 0.07788576930761337, | |
| "loss_layer_6_head": 0.6994581818580627, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 33.04843304843305, | |
| "grad_norm": 0.9032624922336923, | |
| "learning_rate": 0.004138578891069526, | |
| "loss": 4.08, | |
| "loss_layer_12_head": 0.4454478621482849, | |
| "loss_layer_18_head": 2.357255458831787, | |
| "loss_layer_24_head": 0.32440370321273804, | |
| "loss_layer_30_head": 0.13958565890789032, | |
| "loss_layer_36_head": 0.1198217049241066, | |
| "loss_layer_42_head": 0.07531634718179703, | |
| "loss_layer_6_head": 0.7142508029937744, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 33.27635327635328, | |
| "grad_norm": 0.5157652034690633, | |
| "learning_rate": 0.00412283000519888, | |
| "loss": 4.047, | |
| "loss_layer_12_head": 0.4016711115837097, | |
| "loss_layer_18_head": 2.2794528007507324, | |
| "loss_layer_24_head": 0.3679157495498657, | |
| "loss_layer_30_head": 0.1256381869316101, | |
| "loss_layer_36_head": 0.10068968683481216, | |
| "loss_layer_42_head": 0.06270381063222885, | |
| "loss_layer_6_head": 0.6438853144645691, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 33.504273504273506, | |
| "grad_norm": 0.7863247890392172, | |
| "learning_rate": 0.004106969024216348, | |
| "loss": 3.9464, | |
| "loss_layer_12_head": 0.45711761713027954, | |
| "loss_layer_18_head": 2.191732406616211, | |
| "loss_layer_24_head": 0.33409491181373596, | |
| "loss_layer_30_head": 0.1311335265636444, | |
| "loss_layer_36_head": 0.10090503841638565, | |
| "loss_layer_42_head": 0.0615161657333374, | |
| "loss_layer_6_head": 0.6720151901245117, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 33.732193732193736, | |
| "grad_norm": 0.7890469543055019, | |
| "learning_rate": 0.004090997043700909, | |
| "loss": 3.9344, | |
| "loss_layer_12_head": 0.47232159972190857, | |
| "loss_layer_18_head": 2.2468361854553223, | |
| "loss_layer_24_head": 0.29550954699516296, | |
| "loss_layer_30_head": 0.13495633006095886, | |
| "loss_layer_36_head": 0.09402237832546234, | |
| "loss_layer_42_head": 0.05762190371751785, | |
| "loss_layer_6_head": 0.6706913113594055, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 33.96011396011396, | |
| "grad_norm": 0.6929213281119304, | |
| "learning_rate": 0.004074915166898703, | |
| "loss": 3.8988, | |
| "loss_layer_12_head": 0.48199015855789185, | |
| "loss_layer_18_head": 2.14357328414917, | |
| "loss_layer_24_head": 0.2758108973503113, | |
| "loss_layer_30_head": 0.14659464359283447, | |
| "loss_layer_36_head": 0.09513117372989655, | |
| "loss_layer_42_head": 0.05680239200592041, | |
| "loss_layer_6_head": 0.7120590806007385, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 34.18803418803419, | |
| "grad_norm": 0.7353968166938382, | |
| "learning_rate": 0.004058724504646834, | |
| "loss": 3.7431, | |
| "loss_layer_12_head": 0.426646888256073, | |
| "loss_layer_18_head": 2.1235299110412598, | |
| "loss_layer_24_head": 0.23976945877075195, | |
| "loss_layer_30_head": 0.12898829579353333, | |
| "loss_layer_36_head": 0.08852064609527588, | |
| "loss_layer_42_head": 0.054186951369047165, | |
| "loss_layer_6_head": 0.6639636158943176, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 34.41595441595442, | |
| "grad_norm": 0.6556313337678951, | |
| "learning_rate": 0.004042426175296631, | |
| "loss": 3.69, | |
| "loss_layer_12_head": 0.40987634658813477, | |
| "loss_layer_18_head": 2.0121848583221436, | |
| "loss_layer_24_head": 0.2265264093875885, | |
| "loss_layer_30_head": 0.13357605040073395, | |
| "loss_layer_36_head": 0.08718468993902206, | |
| "loss_layer_42_head": 0.054076552391052246, | |
| "loss_layer_6_head": 0.6570177674293518, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 34.643874643874646, | |
| "grad_norm": 0.503404286314858, | |
| "learning_rate": 0.004026021304636408, | |
| "loss": 3.6462, | |
| "loss_layer_12_head": 0.43029722571372986, | |
| "loss_layer_18_head": 2.0399160385131836, | |
| "loss_layer_24_head": 0.22090141475200653, | |
| "loss_layer_30_head": 0.13173122704029083, | |
| "loss_layer_36_head": 0.0885952040553093, | |
| "loss_layer_42_head": 0.053800784051418304, | |
| "loss_layer_6_head": 0.702730119228363, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 34.87179487179487, | |
| "grad_norm": 0.477295528591719, | |
| "learning_rate": 0.0040095110258136935, | |
| "loss": 3.6617, | |
| "loss_layer_12_head": 0.42810750007629395, | |
| "loss_layer_18_head": 2.095346689224243, | |
| "loss_layer_24_head": 0.21871677041053772, | |
| "loss_layer_30_head": 0.1337631195783615, | |
| "loss_layer_36_head": 0.0910869836807251, | |
| "loss_layer_42_head": 0.056711532175540924, | |
| "loss_layer_6_head": 0.6914973258972168, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 35.0997150997151, | |
| "grad_norm": 0.4481247294956435, | |
| "learning_rate": 0.003992896479256966, | |
| "loss": 3.5825, | |
| "loss_layer_12_head": 0.4251008927822113, | |
| "loss_layer_18_head": 1.9943554401397705, | |
| "loss_layer_24_head": 0.2084464132785797, | |
| "loss_layer_30_head": 0.12890145182609558, | |
| "loss_layer_36_head": 0.08945901691913605, | |
| "loss_layer_42_head": 0.052441976964473724, | |
| "loss_layer_6_head": 0.6897009015083313, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 35.32763532763533, | |
| "grad_norm": 0.4019794924847544, | |
| "learning_rate": 0.003976178812596875, | |
| "loss": 3.4959, | |
| "loss_layer_12_head": 0.404086172580719, | |
| "loss_layer_18_head": 1.9240039587020874, | |
| "loss_layer_24_head": 0.19800055027008057, | |
| "loss_layer_30_head": 0.12846828997135162, | |
| "loss_layer_36_head": 0.0879533439874649, | |
| "loss_layer_42_head": 0.0561065748333931, | |
| "loss_layer_6_head": 0.6614921689033508, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 35.55555555555556, | |
| "grad_norm": 0.4836393513656827, | |
| "learning_rate": 0.003959359180586975, | |
| "loss": 3.4916, | |
| "loss_layer_12_head": 0.40365737676620483, | |
| "loss_layer_18_head": 1.9353296756744385, | |
| "loss_layer_24_head": 0.1916879415512085, | |
| "loss_layer_30_head": 0.13406383991241455, | |
| "loss_layer_36_head": 0.08549612015485764, | |
| "loss_layer_42_head": 0.05212582275271416, | |
| "loss_layer_6_head": 0.6601444482803345, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 35.78347578347579, | |
| "grad_norm": 0.7383438714425403, | |
| "learning_rate": 0.003942438745023957, | |
| "loss": 3.4015, | |
| "loss_layer_12_head": 0.41292792558670044, | |
| "loss_layer_18_head": 1.8313575983047485, | |
| "loss_layer_24_head": 0.19283311069011688, | |
| "loss_layer_30_head": 0.1311044692993164, | |
| "loss_layer_36_head": 0.08634034544229507, | |
| "loss_layer_42_head": 0.05280064791440964, | |
| "loss_layer_6_head": 0.6659265160560608, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 36.01139601139601, | |
| "grad_norm": 0.4124024976711123, | |
| "learning_rate": 0.003925418674667404, | |
| "loss": 3.4839, | |
| "loss_layer_12_head": 0.4237042963504791, | |
| "loss_layer_18_head": 1.8229494094848633, | |
| "loss_layer_24_head": 0.19902099668979645, | |
| "loss_layer_30_head": 0.13239210844039917, | |
| "loss_layer_36_head": 0.08935176581144333, | |
| "loss_layer_42_head": 0.05493398755788803, | |
| "loss_layer_6_head": 0.6895285844802856, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 36.23931623931624, | |
| "grad_norm": 0.580340353250173, | |
| "learning_rate": 0.003908300145159055, | |
| "loss": 3.3939, | |
| "loss_layer_12_head": 0.3949509263038635, | |
| "loss_layer_18_head": 1.807559609413147, | |
| "loss_layer_24_head": 0.18570482730865479, | |
| "loss_layer_30_head": 0.12840019166469574, | |
| "loss_layer_36_head": 0.09089195728302002, | |
| "loss_layer_42_head": 0.052501481026411057, | |
| "loss_layer_6_head": 0.6542239189147949, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 36.46723646723647, | |
| "grad_norm": 0.8135204155958197, | |
| "learning_rate": 0.003891084338941603, | |
| "loss": 3.3605, | |
| "loss_layer_12_head": 0.39419084787368774, | |
| "loss_layer_18_head": 1.7904040813446045, | |
| "loss_layer_24_head": 0.1839243322610855, | |
| "loss_layer_30_head": 0.12546224892139435, | |
| "loss_layer_36_head": 0.08821476995944977, | |
| "loss_layer_42_head": 0.04802712798118591, | |
| "loss_layer_6_head": 0.6653536558151245, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 36.46723646723647, | |
| "eval_loss": 4.920344829559326, | |
| "eval_loss_layer_12_head": 0.8174667954444885, | |
| "eval_loss_layer_18_head": 1.66545832157135, | |
| "eval_loss_layer_24_head": 0.4409888684749603, | |
| "eval_loss_layer_30_head": 0.30910196900367737, | |
| "eval_loss_layer_36_head": 0.20546264946460724, | |
| "eval_loss_layer_42_head": 0.13648581504821777, | |
| "eval_loss_layer_6_head": 1.103848934173584, | |
| "eval_runtime": 4.9515, | |
| "eval_samples_per_second": 6.665, | |
| "eval_steps_per_second": 0.606, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 36.6951566951567, | |
| "grad_norm": 0.8790506399999926, | |
| "learning_rate": 0.003873772445177015, | |
| "loss": 3.3515, | |
| "loss_layer_12_head": 0.41721591353416443, | |
| "loss_layer_18_head": 1.7524770498275757, | |
| "loss_layer_24_head": 0.19004251062870026, | |
| "loss_layer_30_head": 0.12846611440181732, | |
| "loss_layer_36_head": 0.0869293063879013, | |
| "loss_layer_42_head": 0.05517953634262085, | |
| "loss_layer_6_head": 0.6763615012168884, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 36.92307692307692, | |
| "grad_norm": 0.6459144380295287, | |
| "learning_rate": 0.0038563656596643987, | |
| "loss": 3.4019, | |
| "loss_layer_12_head": 0.4642051160335541, | |
| "loss_layer_18_head": 1.7727333307266235, | |
| "loss_layer_24_head": 0.18619823455810547, | |
| "loss_layer_30_head": 0.12455607950687408, | |
| "loss_layer_36_head": 0.08845292031764984, | |
| "loss_layer_42_head": 0.07540778815746307, | |
| "loss_layer_6_head": 0.6746565699577332, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 37.15099715099715, | |
| "grad_norm": 0.9964625761499913, | |
| "learning_rate": 0.0038388651847573963, | |
| "loss": 3.3717, | |
| "loss_layer_12_head": 0.47130832076072693, | |
| "loss_layer_18_head": 1.7519481182098389, | |
| "loss_layer_24_head": 0.1870810091495514, | |
| "loss_layer_30_head": 0.1326059103012085, | |
| "loss_layer_36_head": 0.08662253618240356, | |
| "loss_layer_42_head": 0.0689396858215332, | |
| "loss_layer_6_head": 0.6901426315307617, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 37.37891737891738, | |
| "grad_norm": 0.7408034008604258, | |
| "learning_rate": 0.0038212722292811385, | |
| "loss": 3.3646, | |
| "loss_layer_12_head": 0.4616113603115082, | |
| "loss_layer_18_head": 1.7593231201171875, | |
| "loss_layer_24_head": 0.18609504401683807, | |
| "loss_layer_30_head": 0.12701207399368286, | |
| "loss_layer_36_head": 0.08557876199483871, | |
| "loss_layer_42_head": 0.06022990494966507, | |
| "loss_layer_6_head": 0.6876755952835083, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 37.60683760683761, | |
| "grad_norm": 0.5628531711786754, | |
| "learning_rate": 0.0038035880084487453, | |
| "loss": 3.3382, | |
| "loss_layer_12_head": 0.4506490230560303, | |
| "loss_layer_18_head": 1.7187092304229736, | |
| "loss_layer_24_head": 0.18839967250823975, | |
| "loss_layer_30_head": 0.12567642331123352, | |
| "loss_layer_36_head": 0.08187349885702133, | |
| "loss_layer_42_head": 0.053150080144405365, | |
| "loss_layer_6_head": 0.6763015985488892, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 37.83475783475784, | |
| "grad_norm": 0.39395241311283635, | |
| "learning_rate": 0.003785813743777384, | |
| "loss": 3.2585, | |
| "loss_layer_12_head": 0.46294349431991577, | |
| "loss_layer_18_head": 1.6916673183441162, | |
| "loss_layer_24_head": 0.18889106810092926, | |
| "loss_layer_30_head": 0.12418197095394135, | |
| "loss_layer_36_head": 0.08617638051509857, | |
| "loss_layer_42_head": 0.05219808965921402, | |
| "loss_layer_6_head": 0.6769475936889648, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 38.06267806267806, | |
| "grad_norm": 0.4833603204338285, | |
| "learning_rate": 0.003767950663003898, | |
| "loss": 3.2325, | |
| "loss_layer_12_head": 0.45097413659095764, | |
| "loss_layer_18_head": 1.6738840341567993, | |
| "loss_layer_24_head": 0.1965884268283844, | |
| "loss_layer_30_head": 0.13039812445640564, | |
| "loss_layer_36_head": 0.0916060358285904, | |
| "loss_layer_42_head": 0.05350957438349724, | |
| "loss_layer_6_head": 0.6804872751235962, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 38.29059829059829, | |
| "grad_norm": 0.4615221255562949, | |
| "learning_rate": 0.00375, | |
| "loss": 3.1414, | |
| "loss_layer_12_head": 0.41619110107421875, | |
| "loss_layer_18_head": 1.646401047706604, | |
| "loss_layer_24_head": 0.18583571910858154, | |
| "loss_layer_30_head": 0.1233031377196312, | |
| "loss_layer_36_head": 0.0816354975104332, | |
| "loss_layer_42_head": 0.052404772490262985, | |
| "loss_layer_6_head": 0.6518012881278992, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 38.51851851851852, | |
| "grad_norm": 0.6077690682168392, | |
| "learning_rate": 0.0037319629946870442, | |
| "loss": 3.1353, | |
| "loss_layer_12_head": 0.4147290289402008, | |
| "loss_layer_18_head": 1.6200926303863525, | |
| "loss_layer_24_head": 0.19160741567611694, | |
| "loss_layer_30_head": 0.12613636255264282, | |
| "loss_layer_36_head": 0.08432339131832123, | |
| "loss_layer_42_head": 0.05299381539225578, | |
| "loss_layer_6_head": 0.6604996919631958, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 38.74643874643875, | |
| "grad_norm": 0.5471138428591616, | |
| "learning_rate": 0.0037138408929503802, | |
| "loss": 3.1415, | |
| "loss_layer_12_head": 0.4221917688846588, | |
| "loss_layer_18_head": 1.5912879705429077, | |
| "loss_layer_24_head": 0.19453270733356476, | |
| "loss_layer_30_head": 0.1206267923116684, | |
| "loss_layer_36_head": 0.0878443568944931, | |
| "loss_layer_42_head": 0.049645692110061646, | |
| "loss_layer_6_head": 0.68048095703125, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 38.97435897435897, | |
| "grad_norm": 0.6581590592047735, | |
| "learning_rate": 0.0036956349465532955, | |
| "loss": 3.082, | |
| "loss_layer_12_head": 0.4116048216819763, | |
| "loss_layer_18_head": 1.5232570171356201, | |
| "loss_layer_24_head": 0.19163082540035248, | |
| "loss_layer_30_head": 0.12269763648509979, | |
| "loss_layer_36_head": 0.08780638873577118, | |
| "loss_layer_42_head": 0.053409360349178314, | |
| "loss_layer_6_head": 0.6515310406684875, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 39.2022792022792, | |
| "grad_norm": 0.41965636629777003, | |
| "learning_rate": 0.0036773464130505505, | |
| "loss": 3.0273, | |
| "loss_layer_12_head": 0.4055556654930115, | |
| "loss_layer_18_head": 1.5530657768249512, | |
| "loss_layer_24_head": 0.18856747448444366, | |
| "loss_layer_30_head": 0.11959820985794067, | |
| "loss_layer_36_head": 0.08429338783025742, | |
| "loss_layer_42_head": 0.050204742699861526, | |
| "loss_layer_6_head": 0.6473517417907715, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 39.43019943019943, | |
| "grad_norm": 0.5358440091372462, | |
| "learning_rate": 0.0036589765557015143, | |
| "loss": 2.9578, | |
| "loss_layer_12_head": 0.3931970000267029, | |
| "loss_layer_18_head": 1.5244758129119873, | |
| "loss_layer_24_head": 0.18152835965156555, | |
| "loss_layer_30_head": 0.11682531982660294, | |
| "loss_layer_36_head": 0.08009742945432663, | |
| "loss_layer_42_head": 0.04725899547338486, | |
| "loss_layer_6_head": 0.6312907338142395, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 39.65811965811966, | |
| "grad_norm": 0.6894125877078248, | |
| "learning_rate": 0.0036405266433829075, | |
| "loss": 3.0322, | |
| "loss_layer_12_head": 0.39468279480934143, | |
| "loss_layer_18_head": 1.5026872158050537, | |
| "loss_layer_24_head": 0.17886415123939514, | |
| "loss_layer_30_head": 0.13253279030323029, | |
| "loss_layer_36_head": 0.08210141956806183, | |
| "loss_layer_42_head": 0.0509297177195549, | |
| "loss_layer_6_head": 0.6466023325920105, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 39.88603988603989, | |
| "grad_norm": 0.8054205400910994, | |
| "learning_rate": 0.0036219979505011557, | |
| "loss": 3.1048, | |
| "loss_layer_12_head": 0.426312118768692, | |
| "loss_layer_18_head": 1.4662295579910278, | |
| "loss_layer_24_head": 0.19096367061138153, | |
| "loss_layer_30_head": 0.17445117235183716, | |
| "loss_layer_36_head": 0.08434946835041046, | |
| "loss_layer_42_head": 0.06008179858326912, | |
| "loss_layer_6_head": 0.689109742641449, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 40.11396011396011, | |
| "grad_norm": 0.5198267607735384, | |
| "learning_rate": 0.00360339175690436, | |
| "loss": 3.0127, | |
| "loss_layer_12_head": 0.4104757308959961, | |
| "loss_layer_18_head": 1.467791199684143, | |
| "loss_layer_24_head": 0.18111568689346313, | |
| "loss_layer_30_head": 0.1645602285861969, | |
| "loss_layer_36_head": 0.0877971202135086, | |
| "loss_layer_42_head": 0.05487741157412529, | |
| "loss_layer_6_head": 0.658033549785614, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 40.34188034188034, | |
| "grad_norm": 0.5315864148753724, | |
| "learning_rate": 0.0035847093477938954, | |
| "loss": 2.9515, | |
| "loss_layer_12_head": 0.39946484565734863, | |
| "loss_layer_18_head": 1.433086633682251, | |
| "loss_layer_24_head": 0.1778954565525055, | |
| "loss_layer_30_head": 0.14781329035758972, | |
| "loss_layer_36_head": 0.08687538653612137, | |
| "loss_layer_42_head": 0.051437150686979294, | |
| "loss_layer_6_head": 0.6485563516616821, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 40.56980056980057, | |
| "grad_norm": 0.5423520974281545, | |
| "learning_rate": 0.003565952013635635, | |
| "loss": 2.9617, | |
| "loss_layer_12_head": 0.3888796865940094, | |
| "loss_layer_18_head": 1.4096533060073853, | |
| "loss_layer_24_head": 0.1837933510541916, | |
| "loss_layer_30_head": 0.13355405628681183, | |
| "loss_layer_36_head": 0.0816507488489151, | |
| "loss_layer_42_head": 0.05050645396113396, | |
| "loss_layer_6_head": 0.6469558477401733, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 40.7977207977208, | |
| "grad_norm": 0.4530753536523479, | |
| "learning_rate": 0.0035471210500708124, | |
| "loss": 2.9395, | |
| "loss_layer_12_head": 0.41357699036598206, | |
| "loss_layer_18_head": 1.437182068824768, | |
| "loss_layer_24_head": 0.1888083517551422, | |
| "loss_layer_30_head": 0.13409912586212158, | |
| "loss_layer_36_head": 0.08812478184700012, | |
| "loss_layer_42_head": 0.05342671275138855, | |
| "loss_layer_6_head": 0.6669130325317383, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 41.02564102564103, | |
| "grad_norm": 0.5159717906721856, | |
| "learning_rate": 0.0035282177578265296, | |
| "loss": 2.8916, | |
| "loss_layer_12_head": 0.41983136534690857, | |
| "loss_layer_18_head": 1.4326869249343872, | |
| "loss_layer_24_head": 0.192642480134964, | |
| "loss_layer_30_head": 0.13045620918273926, | |
| "loss_layer_36_head": 0.08625391125679016, | |
| "loss_layer_42_head": 0.05178719013929367, | |
| "loss_layer_6_head": 0.6592742204666138, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 41.25356125356125, | |
| "grad_norm": 0.6128247487167494, | |
| "learning_rate": 0.0035092434426259055, | |
| "loss": 2.8484, | |
| "loss_layer_12_head": 0.39752325415611267, | |
| "loss_layer_18_head": 1.4147056341171265, | |
| "loss_layer_24_head": 0.17930717766284943, | |
| "loss_layer_30_head": 0.12136325985193253, | |
| "loss_layer_36_head": 0.07983438670635223, | |
| "loss_layer_42_head": 0.04938334599137306, | |
| "loss_layer_6_head": 0.6326755285263062, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 41.48148148148148, | |
| "grad_norm": 0.5370777148494995, | |
| "learning_rate": 0.003490199415097892, | |
| "loss": 2.8121, | |
| "loss_layer_12_head": 0.4000681936740875, | |
| "loss_layer_18_head": 1.3680517673492432, | |
| "loss_layer_24_head": 0.17827217280864716, | |
| "loss_layer_30_head": 0.12119412422180176, | |
| "loss_layer_36_head": 0.07962208241224289, | |
| "loss_layer_42_head": 0.04752824455499649, | |
| "loss_layer_6_head": 0.6287747621536255, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 41.70940170940171, | |
| "grad_norm": 0.7708829574678, | |
| "learning_rate": 0.003471086990686737, | |
| "loss": 2.8402, | |
| "loss_layer_12_head": 0.4188622534275055, | |
| "loss_layer_18_head": 1.355021595954895, | |
| "loss_layer_24_head": 0.19683413207530975, | |
| "loss_layer_30_head": 0.12976667284965515, | |
| "loss_layer_36_head": 0.0859081894159317, | |
| "loss_layer_42_head": 0.056226927787065506, | |
| "loss_layer_6_head": 0.6642878651618958, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 41.93732193732194, | |
| "grad_norm": 0.7449833212059208, | |
| "learning_rate": 0.003451907489561124, | |
| "loss": 2.8444, | |
| "loss_layer_12_head": 0.41541725397109985, | |
| "loss_layer_18_head": 1.3178496360778809, | |
| "loss_layer_24_head": 0.18967287242412567, | |
| "loss_layer_30_head": 0.12230970710515976, | |
| "loss_layer_36_head": 0.08014537394046783, | |
| "loss_layer_42_head": 0.05525387078523636, | |
| "loss_layer_6_head": 0.6513751745223999, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 42.16524216524216, | |
| "grad_norm": 0.5477559692126953, | |
| "learning_rate": 0.0034326622365229847, | |
| "loss": 2.7887, | |
| "loss_layer_12_head": 0.3959849774837494, | |
| "loss_layer_18_head": 1.2648799419403076, | |
| "loss_layer_24_head": 0.18191051483154297, | |
| "loss_layer_30_head": 0.1164868101477623, | |
| "loss_layer_36_head": 0.08249407261610031, | |
| "loss_layer_42_head": 0.05099458247423172, | |
| "loss_layer_6_head": 0.6313720941543579, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 42.39316239316239, | |
| "grad_norm": 0.4794834286147715, | |
| "learning_rate": 0.0034133525609159883, | |
| "loss": 2.7665, | |
| "loss_layer_12_head": 0.4049781858921051, | |
| "loss_layer_18_head": 1.2870421409606934, | |
| "loss_layer_24_head": 0.19015896320343018, | |
| "loss_layer_30_head": 0.11839592456817627, | |
| "loss_layer_36_head": 0.08894439786672592, | |
| "loss_layer_42_head": 0.0506242997944355, | |
| "loss_layer_6_head": 0.6355887651443481, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 42.62108262108262, | |
| "grad_norm": 0.49916006862369783, | |
| "learning_rate": 0.0033939797965337154, | |
| "loss": 2.7637, | |
| "loss_layer_12_head": 0.41396409273147583, | |
| "loss_layer_18_head": 1.2755236625671387, | |
| "loss_layer_24_head": 0.19518128037452698, | |
| "loss_layer_30_head": 0.12190593779087067, | |
| "loss_layer_36_head": 0.09020756930112839, | |
| "loss_layer_42_head": 0.050995856523513794, | |
| "loss_layer_6_head": 0.6555052995681763, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 42.84900284900285, | |
| "grad_norm": 0.4882390844422191, | |
| "learning_rate": 0.0033745452815275375, | |
| "loss": 2.7417, | |
| "loss_layer_12_head": 0.40585416555404663, | |
| "loss_layer_18_head": 1.2714842557907104, | |
| "loss_layer_24_head": 0.18599747121334076, | |
| "loss_layer_30_head": 0.11775036156177521, | |
| "loss_layer_36_head": 0.08614761382341385, | |
| "loss_layer_42_head": 0.048269741237163544, | |
| "loss_layer_6_head": 0.6390920281410217, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 43.07692307692308, | |
| "grad_norm": 0.39823098594662537, | |
| "learning_rate": 0.003355050358314172, | |
| "loss": 2.6637, | |
| "loss_layer_12_head": 0.3847056031227112, | |
| "loss_layer_18_head": 1.2039114236831665, | |
| "loss_layer_24_head": 0.176110178232193, | |
| "loss_layer_30_head": 0.11044038832187653, | |
| "loss_layer_36_head": 0.08465754985809326, | |
| "loss_layer_42_head": 0.04742031544446945, | |
| "loss_layer_6_head": 0.5980373620986938, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 43.3048433048433, | |
| "grad_norm": 0.6900260033440537, | |
| "learning_rate": 0.0033354963734829692, | |
| "loss": 2.7018, | |
| "loss_layer_12_head": 0.3878379762172699, | |
| "loss_layer_18_head": 1.2014033794403076, | |
| "loss_layer_24_head": 0.17556259036064148, | |
| "loss_layer_30_head": 0.11799554526805878, | |
| "loss_layer_36_head": 0.08010242134332657, | |
| "loss_layer_42_head": 0.04850541055202484, | |
| "loss_layer_6_head": 0.6182578206062317, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 43.53276353276353, | |
| "grad_norm": 0.47429116139160615, | |
| "learning_rate": 0.0033158846777028893, | |
| "loss": 2.6455, | |
| "loss_layer_12_head": 0.3960720896720886, | |
| "loss_layer_18_head": 1.1679050922393799, | |
| "loss_layer_24_head": 0.17802616953849792, | |
| "loss_layer_30_head": 0.12192656844854355, | |
| "loss_layer_36_head": 0.0852232277393341, | |
| "loss_layer_42_head": 0.049923814833164215, | |
| "loss_layer_6_head": 0.6219567060470581, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 43.76068376068376, | |
| "grad_norm": 0.4188547473795177, | |
| "learning_rate": 0.0032962166256292114, | |
| "loss": 2.6593, | |
| "loss_layer_12_head": 0.39642995595932007, | |
| "loss_layer_18_head": 1.1589549779891968, | |
| "loss_layer_24_head": 0.1802503764629364, | |
| "loss_layer_30_head": 0.11738133430480957, | |
| "loss_layer_36_head": 0.08084283769130707, | |
| "loss_layer_42_head": 0.0486336275935173, | |
| "loss_layer_6_head": 0.624832034111023, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 43.98860398860399, | |
| "grad_norm": 0.6326866995847439, | |
| "learning_rate": 0.0032764935758099597, | |
| "loss": 2.6418, | |
| "loss_layer_12_head": 0.3885257840156555, | |
| "loss_layer_18_head": 1.1653964519500732, | |
| "loss_layer_24_head": 0.18046359717845917, | |
| "loss_layer_30_head": 0.1161709874868393, | |
| "loss_layer_36_head": 0.07991600781679153, | |
| "loss_layer_42_head": 0.04998940974473953, | |
| "loss_layer_6_head": 0.6206977963447571, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 44.21652421652421, | |
| "grad_norm": 0.6425900494900251, | |
| "learning_rate": 0.003256716890592065, | |
| "loss": 2.5966, | |
| "loss_layer_12_head": 0.3927614986896515, | |
| "loss_layer_18_head": 1.167649745941162, | |
| "loss_layer_24_head": 0.18743163347244263, | |
| "loss_layer_30_head": 0.11512549221515656, | |
| "loss_layer_36_head": 0.07711974531412125, | |
| "loss_layer_42_head": 0.047446832060813904, | |
| "loss_layer_6_head": 0.6326077580451965, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 44.44444444444444, | |
| "grad_norm": 0.6079576741884587, | |
| "learning_rate": 0.003236887936027261, | |
| "loss": 2.5795, | |
| "loss_layer_12_head": 0.39777034521102905, | |
| "loss_layer_18_head": 1.1651580333709717, | |
| "loss_layer_24_head": 0.18374374508857727, | |
| "loss_layer_30_head": 0.11680477857589722, | |
| "loss_layer_36_head": 0.07859492301940918, | |
| "loss_layer_42_head": 0.047878436744213104, | |
| "loss_layer_6_head": 0.6312351226806641, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 44.67236467236467, | |
| "grad_norm": 0.5747495318763676, | |
| "learning_rate": 0.003217008081777726, | |
| "loss": 2.5885, | |
| "loss_layer_12_head": 0.401299387216568, | |
| "loss_layer_18_head": 1.1307117938995361, | |
| "loss_layer_24_head": 0.18022188544273376, | |
| "loss_layer_30_head": 0.11754100024700165, | |
| "loss_layer_36_head": 0.07774122804403305, | |
| "loss_layer_42_head": 0.05039285495877266, | |
| "loss_layer_6_head": 0.6277211904525757, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 44.9002849002849, | |
| "grad_norm": 0.6882788247944827, | |
| "learning_rate": 0.003197078701021476, | |
| "loss": 2.6245, | |
| "loss_layer_12_head": 0.4062952399253845, | |
| "loss_layer_18_head": 1.1433289051055908, | |
| "loss_layer_24_head": 0.18742702901363373, | |
| "loss_layer_30_head": 0.12414474785327911, | |
| "loss_layer_36_head": 0.0802634060382843, | |
| "loss_layer_42_head": 0.05628126859664917, | |
| "loss_layer_6_head": 0.6465874910354614, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 45.12820512820513, | |
| "grad_norm": 0.6548183936802736, | |
| "learning_rate": 0.003177101170357513, | |
| "loss": 2.5927, | |
| "loss_layer_12_head": 0.39534568786621094, | |
| "loss_layer_18_head": 1.1005592346191406, | |
| "loss_layer_24_head": 0.18521855771541595, | |
| "loss_layer_30_head": 0.1259688138961792, | |
| "loss_layer_36_head": 0.07717721909284592, | |
| "loss_layer_42_head": 0.05370775982737541, | |
| "loss_layer_6_head": 0.6309455633163452, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 45.356125356125354, | |
| "grad_norm": 0.40696849199308827, | |
| "learning_rate": 0.0031570768697107383, | |
| "loss": 2.5165, | |
| "loss_layer_12_head": 0.3873128294944763, | |
| "loss_layer_18_head": 1.0747811794281006, | |
| "loss_layer_24_head": 0.17662569880485535, | |
| "loss_layer_30_head": 0.1168486624956131, | |
| "loss_layer_36_head": 0.07445703446865082, | |
| "loss_layer_42_head": 0.05355915427207947, | |
| "loss_layer_6_head": 0.6133853793144226, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 45.58404558404558, | |
| "grad_norm": 0.3913225073967932, | |
| "learning_rate": 0.003137007182236637, | |
| "loss": 2.5177, | |
| "loss_layer_12_head": 0.38922491669654846, | |
| "loss_layer_18_head": 1.0804240703582764, | |
| "loss_layer_24_head": 0.17917446792125702, | |
| "loss_layer_30_head": 0.1220487579703331, | |
| "loss_layer_36_head": 0.0791519358754158, | |
| "loss_layer_42_head": 0.049967654049396515, | |
| "loss_layer_6_head": 0.6184254884719849, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 45.58404558404558, | |
| "eval_loss": 4.238807201385498, | |
| "eval_loss_layer_12_head": 0.8041837215423584, | |
| "eval_loss_layer_18_head": 1.1114610433578491, | |
| "eval_loss_layer_24_head": 0.4403406083583832, | |
| "eval_loss_layer_30_head": 0.303775817155838, | |
| "eval_loss_layer_36_head": 0.22167228162288666, | |
| "eval_loss_layer_42_head": 0.1412229984998703, | |
| "eval_loss_layer_6_head": 1.0907320976257324, | |
| "eval_runtime": 4.9434, | |
| "eval_samples_per_second": 6.676, | |
| "eval_steps_per_second": 0.607, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 45.81196581196581, | |
| "grad_norm": 0.698286057801242, | |
| "learning_rate": 0.0031168934942257336, | |
| "loss": 2.5466, | |
| "loss_layer_12_head": 0.39911216497421265, | |
| "loss_layer_18_head": 1.0675843954086304, | |
| "loss_layer_24_head": 0.18049687147140503, | |
| "loss_layer_30_head": 0.11798091977834702, | |
| "loss_layer_36_head": 0.09117527306079865, | |
| "loss_layer_42_head": 0.0478927418589592, | |
| "loss_layer_6_head": 0.6361227035522461, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 46.03988603988604, | |
| "grad_norm": 0.5973347868112177, | |
| "learning_rate": 0.003096737195007845, | |
| "loss": 2.5263, | |
| "loss_layer_12_head": 0.3921283185482025, | |
| "loss_layer_18_head": 1.0628966093063354, | |
| "loss_layer_24_head": 0.18103823065757751, | |
| "loss_layer_30_head": 0.11626795679330826, | |
| "loss_layer_36_head": 0.08541289716959, | |
| "loss_layer_42_head": 0.048044394701719284, | |
| "loss_layer_6_head": 0.6301760673522949, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 46.267806267806264, | |
| "grad_norm": 0.43531050494934503, | |
| "learning_rate": 0.0030765396768561003, | |
| "loss": 2.4678, | |
| "loss_layer_12_head": 0.37627744674682617, | |
| "loss_layer_18_head": 1.0077391862869263, | |
| "loss_layer_24_head": 0.1717483401298523, | |
| "loss_layer_30_head": 0.11306069046258926, | |
| "loss_layer_36_head": 0.07940587401390076, | |
| "loss_layer_42_head": 0.046035267412662506, | |
| "loss_layer_6_head": 0.6145161390304565, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 46.495726495726494, | |
| "grad_norm": 0.36523473062420453, | |
| "learning_rate": 0.003056302334890786, | |
| "loss": 2.4591, | |
| "loss_layer_12_head": 0.3900749087333679, | |
| "loss_layer_18_head": 1.0151443481445312, | |
| "loss_layer_24_head": 0.17755632102489471, | |
| "loss_layer_30_head": 0.12042151391506195, | |
| "loss_layer_36_head": 0.08054070919752121, | |
| "loss_layer_42_head": 0.04895460605621338, | |
| "loss_layer_6_head": 0.6219097375869751, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 46.72364672364672, | |
| "grad_norm": 0.4448392594194299, | |
| "learning_rate": 0.003036026566982969, | |
| "loss": 2.4762, | |
| "loss_layer_12_head": 0.4033172130584717, | |
| "loss_layer_18_head": 1.0222870111465454, | |
| "loss_layer_24_head": 0.18822529911994934, | |
| "loss_layer_30_head": 0.12556472420692444, | |
| "loss_layer_36_head": 0.08045311272144318, | |
| "loss_layer_42_head": 0.04705699533224106, | |
| "loss_layer_6_head": 0.6353141069412231, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 46.95156695156695, | |
| "grad_norm": 0.4351453730838485, | |
| "learning_rate": 0.0030157137736579443, | |
| "loss": 2.4562, | |
| "loss_layer_12_head": 0.3938303589820862, | |
| "loss_layer_18_head": 0.969444751739502, | |
| "loss_layer_24_head": 0.1871107816696167, | |
| "loss_layer_30_head": 0.11789570748806, | |
| "loss_layer_36_head": 0.07739175856113434, | |
| "loss_layer_42_head": 0.046495065093040466, | |
| "loss_layer_6_head": 0.6121574640274048, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 47.17948717948718, | |
| "grad_norm": 0.45366609315296125, | |
| "learning_rate": 0.002995365357998494, | |
| "loss": 2.3923, | |
| "loss_layer_12_head": 0.3830070495605469, | |
| "loss_layer_18_head": 1.0128097534179688, | |
| "loss_layer_24_head": 0.17866311967372894, | |
| "loss_layer_30_head": 0.11667140573263168, | |
| "loss_layer_36_head": 0.07569454610347748, | |
| "loss_layer_42_head": 0.045610617846250534, | |
| "loss_layer_6_head": 0.5986355543136597, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 47.407407407407405, | |
| "grad_norm": 0.46986912758472016, | |
| "learning_rate": 0.0029749827255479756, | |
| "loss": 2.4144, | |
| "loss_layer_12_head": 0.3797675669193268, | |
| "loss_layer_18_head": 0.964137852191925, | |
| "loss_layer_24_head": 0.1742846965789795, | |
| "loss_layer_30_head": 0.11462786048650742, | |
| "loss_layer_36_head": 0.0758369192481041, | |
| "loss_layer_42_head": 0.0473390556871891, | |
| "loss_layer_6_head": 0.6031911373138428, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 47.635327635327634, | |
| "grad_norm": 0.5575118739748411, | |
| "learning_rate": 0.002954567284213227, | |
| "loss": 2.3859, | |
| "loss_layer_12_head": 0.3897952139377594, | |
| "loss_layer_18_head": 0.9797992706298828, | |
| "loss_layer_24_head": 0.182371586561203, | |
| "loss_layer_30_head": 0.12018144130706787, | |
| "loss_layer_36_head": 0.07999955862760544, | |
| "loss_layer_42_head": 0.04792632535099983, | |
| "loss_layer_6_head": 0.6161386370658875, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 47.863247863247864, | |
| "grad_norm": 0.40576783272964373, | |
| "learning_rate": 0.0029341204441673263, | |
| "loss": 2.3962, | |
| "loss_layer_12_head": 0.3958156704902649, | |
| "loss_layer_18_head": 0.9727069139480591, | |
| "loss_layer_24_head": 0.17981596291065216, | |
| "loss_layer_30_head": 0.1175287589430809, | |
| "loss_layer_36_head": 0.08136500418186188, | |
| "loss_layer_42_head": 0.045958392322063446, | |
| "loss_layer_6_head": 0.6214891672134399, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 48.09116809116809, | |
| "grad_norm": 0.41333807616852153, | |
| "learning_rate": 0.002913643617752178, | |
| "loss": 2.3637, | |
| "loss_layer_12_head": 0.38183653354644775, | |
| "loss_layer_18_head": 0.9352052807807922, | |
| "loss_layer_24_head": 0.17474476993083954, | |
| "loss_layer_30_head": 0.11326322704553604, | |
| "loss_layer_36_head": 0.07785725593566895, | |
| "loss_layer_42_head": 0.05166046693921089, | |
| "loss_layer_6_head": 0.6003078818321228, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 48.319088319088316, | |
| "grad_norm": 0.4902476881076566, | |
| "learning_rate": 0.0028931382193809634, | |
| "loss": 2.3199, | |
| "loss_layer_12_head": 0.3981599509716034, | |
| "loss_layer_18_head": 0.959009051322937, | |
| "loss_layer_24_head": 0.175415500998497, | |
| "loss_layer_30_head": 0.11643791198730469, | |
| "loss_layer_36_head": 0.07993616163730621, | |
| "loss_layer_42_head": 0.05208883807063103, | |
| "loss_layer_6_head": 0.61378014087677, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 48.547008547008545, | |
| "grad_norm": 0.41588329915773303, | |
| "learning_rate": 0.0028726056654404357, | |
| "loss": 2.3284, | |
| "loss_layer_12_head": 0.39041703939437866, | |
| "loss_layer_18_head": 0.8856471180915833, | |
| "loss_layer_24_head": 0.16735294461250305, | |
| "loss_layer_30_head": 0.10891375690698624, | |
| "loss_layer_36_head": 0.07393185794353485, | |
| "loss_layer_42_head": 0.046171437948942184, | |
| "loss_layer_6_head": 0.587515652179718, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 48.774928774928775, | |
| "grad_norm": 0.5548900668516289, | |
| "learning_rate": 0.002852047374193092, | |
| "loss": 2.3587, | |
| "loss_layer_12_head": 0.4144614338874817, | |
| "loss_layer_18_head": 0.9471640586853027, | |
| "loss_layer_24_head": 0.18089620769023895, | |
| "loss_layer_30_head": 0.11908881366252899, | |
| "loss_layer_36_head": 0.08078965544700623, | |
| "loss_layer_42_head": 0.05113198235630989, | |
| "loss_layer_6_head": 0.631020724773407, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 49.002849002849004, | |
| "grad_norm": 0.41121797339069344, | |
| "learning_rate": 0.0028314647656791985, | |
| "loss": 2.367, | |
| "loss_layer_12_head": 0.3957933783531189, | |
| "loss_layer_18_head": 0.9304808378219604, | |
| "loss_layer_24_head": 0.175583153963089, | |
| "loss_layer_30_head": 0.11424344778060913, | |
| "loss_layer_36_head": 0.0768514946103096, | |
| "loss_layer_42_head": 0.04772614315152168, | |
| "loss_layer_6_head": 0.6173755526542664, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 49.23076923076923, | |
| "grad_norm": 0.5259763425718715, | |
| "learning_rate": 0.0028108592616187134, | |
| "loss": 2.2938, | |
| "loss_layer_12_head": 0.3790958821773529, | |
| "loss_layer_18_head": 0.8951139450073242, | |
| "loss_layer_24_head": 0.17159327864646912, | |
| "loss_layer_30_head": 0.11063919961452484, | |
| "loss_layer_36_head": 0.07651374489068985, | |
| "loss_layer_42_head": 0.04569822549819946, | |
| "loss_layer_6_head": 0.6034574508666992, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 49.458689458689456, | |
| "grad_norm": 0.5088721335099942, | |
| "learning_rate": 0.002790232285313076, | |
| "loss": 2.3113, | |
| "loss_layer_12_head": 0.37154486775398254, | |
| "loss_layer_18_head": 0.853110134601593, | |
| "loss_layer_24_head": 0.17178836464881897, | |
| "loss_layer_30_head": 0.11050143092870712, | |
| "loss_layer_36_head": 0.07364407926797867, | |
| "loss_layer_42_head": 0.04579634219408035, | |
| "loss_layer_6_head": 0.6029828786849976, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 49.686609686609685, | |
| "grad_norm": 0.478302427669861, | |
| "learning_rate": 0.0027695852615468967, | |
| "loss": 2.2896, | |
| "loss_layer_12_head": 0.3795209527015686, | |
| "loss_layer_18_head": 0.8706048727035522, | |
| "loss_layer_24_head": 0.17759330570697784, | |
| "loss_layer_30_head": 0.11326000839471817, | |
| "loss_layer_36_head": 0.07385507971048355, | |
| "loss_layer_42_head": 0.0455007366836071, | |
| "loss_layer_6_head": 0.6164635419845581, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 49.914529914529915, | |
| "grad_norm": 0.5235830286531326, | |
| "learning_rate": 0.002748919616489542, | |
| "loss": 2.2904, | |
| "loss_layer_12_head": 0.37914353609085083, | |
| "loss_layer_18_head": 0.8292545080184937, | |
| "loss_layer_24_head": 0.17843201756477356, | |
| "loss_layer_30_head": 0.11026974022388458, | |
| "loss_layer_36_head": 0.0743885412812233, | |
| "loss_layer_42_head": 0.04483931511640549, | |
| "loss_layer_6_head": 0.605587363243103, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 50.142450142450144, | |
| "grad_norm": 0.434150702314289, | |
| "learning_rate": 0.002728236777596621, | |
| "loss": 2.2794, | |
| "loss_layer_12_head": 0.38162827491760254, | |
| "loss_layer_18_head": 0.8877264261245728, | |
| "loss_layer_24_head": 0.17784467339515686, | |
| "loss_layer_30_head": 0.11130378395318985, | |
| "loss_layer_36_head": 0.07626891881227493, | |
| "loss_layer_42_head": 0.044149916619062424, | |
| "loss_layer_6_head": 0.6123658418655396, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 50.370370370370374, | |
| "grad_norm": 0.6419194201898322, | |
| "learning_rate": 0.0027075381735113878, | |
| "loss": 2.2657, | |
| "loss_layer_12_head": 0.38760074973106384, | |
| "loss_layer_18_head": 0.8854343295097351, | |
| "loss_layer_24_head": 0.17738394439220428, | |
| "loss_layer_30_head": 0.11329641193151474, | |
| "loss_layer_36_head": 0.07881636917591095, | |
| "loss_layer_42_head": 0.043652262538671494, | |
| "loss_layer_6_head": 0.6228241920471191, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 50.598290598290596, | |
| "grad_norm": 0.6461273738487177, | |
| "learning_rate": 0.002686825233966061, | |
| "loss": 2.2678, | |
| "loss_layer_12_head": 0.38489505648612976, | |
| "loss_layer_18_head": 0.8524206280708313, | |
| "loss_layer_24_head": 0.17580567300319672, | |
| "loss_layer_30_head": 0.11257897317409515, | |
| "loss_layer_36_head": 0.07540023326873779, | |
| "loss_layer_42_head": 0.04346206784248352, | |
| "loss_layer_6_head": 0.6352558135986328, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 50.826210826210826, | |
| "grad_norm": 0.522207949826169, | |
| "learning_rate": 0.002666099389683061, | |
| "loss": 2.2585, | |
| "loss_layer_12_head": 0.3890642821788788, | |
| "loss_layer_18_head": 0.8386470079421997, | |
| "loss_layer_24_head": 0.17881402373313904, | |
| "loss_layer_30_head": 0.11530689895153046, | |
| "loss_layer_36_head": 0.07785026729106903, | |
| "loss_layer_42_head": 0.044162243604660034, | |
| "loss_layer_6_head": 0.6546280384063721, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 51.054131054131055, | |
| "grad_norm": 0.5318721661025899, | |
| "learning_rate": 0.0026453620722761894, | |
| "loss": 2.3236, | |
| "loss_layer_12_head": 0.3961235582828522, | |
| "loss_layer_18_head": 0.8308088183403015, | |
| "loss_layer_24_head": 0.1806981861591339, | |
| "loss_layer_30_head": 0.11701737344264984, | |
| "loss_layer_36_head": 0.08028064668178558, | |
| "loss_layer_42_head": 0.046638570725917816, | |
| "loss_layer_6_head": 0.6974906921386719, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 51.282051282051285, | |
| "grad_norm": 0.7094454068405147, | |
| "learning_rate": 0.002624614714151743, | |
| "loss": 2.2551, | |
| "loss_layer_12_head": 0.3696451485157013, | |
| "loss_layer_18_head": 0.8046764135360718, | |
| "loss_layer_24_head": 0.17042198777198792, | |
| "loss_layer_30_head": 0.11222568899393082, | |
| "loss_layer_36_head": 0.07389844954013824, | |
| "loss_layer_42_head": 0.048138294368982315, | |
| "loss_layer_6_head": 0.6442863345146179, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 51.50997150997151, | |
| "grad_norm": 0.43402133001757, | |
| "learning_rate": 0.002603858748409567, | |
| "loss": 2.2282, | |
| "loss_layer_12_head": 0.38198503851890564, | |
| "loss_layer_18_head": 0.8142977952957153, | |
| "loss_layer_24_head": 0.17379167675971985, | |
| "loss_layer_30_head": 0.11386610567569733, | |
| "loss_layer_36_head": 0.07543236017227173, | |
| "loss_layer_42_head": 0.046159930527210236, | |
| "loss_layer_6_head": 0.6551558375358582, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 51.73789173789174, | |
| "grad_norm": 0.38917348088482573, | |
| "learning_rate": 0.0025830956087440665, | |
| "loss": 2.2275, | |
| "loss_layer_12_head": 0.38181477785110474, | |
| "loss_layer_18_head": 0.7957919836044312, | |
| "loss_layer_24_head": 0.17541435360908508, | |
| "loss_layer_30_head": 0.11523507535457611, | |
| "loss_layer_36_head": 0.07723738998174667, | |
| "loss_layer_42_head": 0.047955431044101715, | |
| "loss_layer_6_head": 0.6321656703948975, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 51.965811965811966, | |
| "grad_norm": 0.3637496388028228, | |
| "learning_rate": 0.0025623267293451825, | |
| "loss": 2.2314, | |
| "loss_layer_12_head": 0.3790660500526428, | |
| "loss_layer_18_head": 0.8120620846748352, | |
| "loss_layer_24_head": 0.17084448039531708, | |
| "loss_layer_30_head": 0.11187763512134552, | |
| "loss_layer_36_head": 0.07578536123037338, | |
| "loss_layer_42_head": 0.045785531401634216, | |
| "loss_layer_6_head": 0.6161836385726929, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 52.193732193732195, | |
| "grad_norm": 0.4536575231743723, | |
| "learning_rate": 0.002541553544799316, | |
| "loss": 2.1509, | |
| "loss_layer_12_head": 0.3612454831600189, | |
| "loss_layer_18_head": 0.7751725912094116, | |
| "loss_layer_24_head": 0.1629917174577713, | |
| "loss_layer_30_head": 0.10584640502929688, | |
| "loss_layer_36_head": 0.07269952446222305, | |
| "loss_layer_42_head": 0.04338030144572258, | |
| "loss_layer_6_head": 0.5827122926712036, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 52.421652421652425, | |
| "grad_norm": 0.3246994919822453, | |
| "learning_rate": 0.002520777489990243, | |
| "loss": 2.1598, | |
| "loss_layer_12_head": 0.37068501114845276, | |
| "loss_layer_18_head": 0.783240795135498, | |
| "loss_layer_24_head": 0.1731734573841095, | |
| "loss_layer_30_head": 0.11189240217208862, | |
| "loss_layer_36_head": 0.0775907039642334, | |
| "loss_layer_42_head": 0.044106461107730865, | |
| "loss_layer_6_head": 0.5865408182144165, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 52.64957264957265, | |
| "grad_norm": 0.2825614436603337, | |
| "learning_rate": 0.0025, | |
| "loss": 2.1276, | |
| "loss_layer_12_head": 0.3710993230342865, | |
| "loss_layer_18_head": 0.7801668643951416, | |
| "loss_layer_24_head": 0.17115965485572815, | |
| "loss_layer_30_head": 0.10953103005886078, | |
| "loss_layer_36_head": 0.07413998991250992, | |
| "loss_layer_42_head": 0.04302388057112694, | |
| "loss_layer_6_head": 0.5841787457466125, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 52.87749287749288, | |
| "grad_norm": 0.2616737808720529, | |
| "learning_rate": 0.0024792225100097576, | |
| "loss": 2.1808, | |
| "loss_layer_12_head": 0.3781898617744446, | |
| "loss_layer_18_head": 0.7769525051116943, | |
| "loss_layer_24_head": 0.17470547556877136, | |
| "loss_layer_30_head": 0.11191209405660629, | |
| "loss_layer_36_head": 0.07527685910463333, | |
| "loss_layer_42_head": 0.04384452477097511, | |
| "loss_layer_6_head": 0.5916733741760254, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 53.105413105413106, | |
| "grad_norm": 0.3311507726816672, | |
| "learning_rate": 0.002458446455200685, | |
| "loss": 2.1317, | |
| "loss_layer_12_head": 0.3670315444469452, | |
| "loss_layer_18_head": 0.7406850457191467, | |
| "loss_layer_24_head": 0.16683974862098694, | |
| "loss_layer_30_head": 0.10634462535381317, | |
| "loss_layer_36_head": 0.07278958708047867, | |
| "loss_layer_42_head": 0.042678773403167725, | |
| "loss_layer_6_head": 0.5726101994514465, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 53.333333333333336, | |
| "grad_norm": 0.2509448396894879, | |
| "learning_rate": 0.0024376732706548184, | |
| "loss": 2.1037, | |
| "loss_layer_12_head": 0.3671357333660126, | |
| "loss_layer_18_head": 0.7627917528152466, | |
| "loss_layer_24_head": 0.17304837703704834, | |
| "loss_layer_30_head": 0.10864245891571045, | |
| "loss_layer_36_head": 0.07365027070045471, | |
| "loss_layer_42_head": 0.04330287128686905, | |
| "loss_layer_6_head": 0.5780169367790222, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 53.56125356125356, | |
| "grad_norm": 0.2947012291464038, | |
| "learning_rate": 0.0024169043912559336, | |
| "loss": 2.1094, | |
| "loss_layer_12_head": 0.3702814280986786, | |
| "loss_layer_18_head": 0.7762826085090637, | |
| "loss_layer_24_head": 0.1664721667766571, | |
| "loss_layer_30_head": 0.10503309965133667, | |
| "loss_layer_36_head": 0.07054396718740463, | |
| "loss_layer_42_head": 0.0412781648337841, | |
| "loss_layer_6_head": 0.5821161270141602, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 53.78917378917379, | |
| "grad_norm": 0.5137967208925616, | |
| "learning_rate": 0.0023961412515904335, | |
| "loss": 2.1099, | |
| "loss_layer_12_head": 0.37117379903793335, | |
| "loss_layer_18_head": 0.758410632610321, | |
| "loss_layer_24_head": 0.17216971516609192, | |
| "loss_layer_30_head": 0.1102384701371193, | |
| "loss_layer_36_head": 0.07311711460351944, | |
| "loss_layer_42_head": 0.042940251529216766, | |
| "loss_layer_6_head": 0.5829381942749023, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 54.01709401709402, | |
| "grad_norm": 0.35742509879703566, | |
| "learning_rate": 0.0023753852858482568, | |
| "loss": 2.1136, | |
| "loss_layer_12_head": 0.3729603886604309, | |
| "loss_layer_18_head": 0.7423059940338135, | |
| "loss_layer_24_head": 0.1714349091053009, | |
| "loss_layer_30_head": 0.1128513440489769, | |
| "loss_layer_36_head": 0.0720224529504776, | |
| "loss_layer_42_head": 0.04282630234956741, | |
| "loss_layer_6_head": 0.5869891047477722, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 54.24501424501425, | |
| "grad_norm": 0.3567652126619958, | |
| "learning_rate": 0.0023546379277238107, | |
| "loss": 2.0666, | |
| "loss_layer_12_head": 0.36165767908096313, | |
| "loss_layer_18_head": 0.7423025965690613, | |
| "loss_layer_24_head": 0.17272017896175385, | |
| "loss_layer_30_head": 0.11087025701999664, | |
| "loss_layer_36_head": 0.07229898124933243, | |
| "loss_layer_42_head": 0.04461061209440231, | |
| "loss_layer_6_head": 0.5694043040275574, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 54.472934472934476, | |
| "grad_norm": 0.4399155733365801, | |
| "learning_rate": 0.0023339006103169396, | |
| "loss": 2.0613, | |
| "loss_layer_12_head": 0.36041295528411865, | |
| "loss_layer_18_head": 0.7265509366989136, | |
| "loss_layer_24_head": 0.16570261120796204, | |
| "loss_layer_30_head": 0.10996762663125992, | |
| "loss_layer_36_head": 0.07162176072597504, | |
| "loss_layer_42_head": 0.04219638928771019, | |
| "loss_layer_6_head": 0.5656975507736206, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 54.7008547008547, | |
| "grad_norm": 0.2864521752954199, | |
| "learning_rate": 0.0023131747660339393, | |
| "loss": 2.0743, | |
| "loss_layer_12_head": 0.36955124139785767, | |
| "loss_layer_18_head": 0.7258505821228027, | |
| "loss_layer_24_head": 0.1719922125339508, | |
| "loss_layer_30_head": 0.11259003728628159, | |
| "loss_layer_36_head": 0.07511717826128006, | |
| "loss_layer_42_head": 0.04532603174448013, | |
| "loss_layer_6_head": 0.5806865096092224, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 54.7008547008547, | |
| "eval_loss": 3.922147750854492, | |
| "eval_loss_layer_12_head": 0.8050315976142883, | |
| "eval_loss_layer_18_head": 0.8688571453094482, | |
| "eval_loss_layer_24_head": 0.44181978702545166, | |
| "eval_loss_layer_30_head": 0.30123329162597656, | |
| "eval_loss_layer_36_head": 0.20439112186431885, | |
| "eval_loss_layer_42_head": 0.1361953616142273, | |
| "eval_loss_layer_6_head": 1.0727375745773315, | |
| "eval_runtime": 4.9363, | |
| "eval_samples_per_second": 6.685, | |
| "eval_steps_per_second": 0.608, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 54.92877492877493, | |
| "grad_norm": 0.2610712781022057, | |
| "learning_rate": 0.002292461826488612, | |
| "loss": 2.1013, | |
| "loss_layer_12_head": 0.3735349178314209, | |
| "loss_layer_18_head": 0.7252658605575562, | |
| "loss_layer_24_head": 0.17007118463516235, | |
| "loss_layer_30_head": 0.10975446552038193, | |
| "loss_layer_36_head": 0.07351495325565338, | |
| "loss_layer_42_head": 0.043339770287275314, | |
| "loss_layer_6_head": 0.5827316641807556, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 55.15669515669516, | |
| "grad_norm": 0.23380581342639142, | |
| "learning_rate": 0.0022717632224033796, | |
| "loss": 2.0471, | |
| "loss_layer_12_head": 0.35890263319015503, | |
| "loss_layer_18_head": 0.7042995691299438, | |
| "loss_layer_24_head": 0.16659076511859894, | |
| "loss_layer_30_head": 0.10641211271286011, | |
| "loss_layer_36_head": 0.07390512526035309, | |
| "loss_layer_42_head": 0.044583845883607864, | |
| "loss_layer_6_head": 0.5554832816123962, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 55.38461538461539, | |
| "grad_norm": 0.49307442680376706, | |
| "learning_rate": 0.0022510803835104586, | |
| "loss": 2.0553, | |
| "loss_layer_12_head": 0.3678162693977356, | |
| "loss_layer_18_head": 0.7129090428352356, | |
| "loss_layer_24_head": 0.17019148170948029, | |
| "loss_layer_30_head": 0.10706619173288345, | |
| "loss_layer_36_head": 0.07150779664516449, | |
| "loss_layer_42_head": 0.04218541830778122, | |
| "loss_layer_6_head": 0.5814077854156494, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 55.61253561253561, | |
| "grad_norm": 0.3918313570013889, | |
| "learning_rate": 0.002230414738453104, | |
| "loss": 2.044, | |
| "loss_layer_12_head": 0.36497271060943604, | |
| "loss_layer_18_head": 0.6976231932640076, | |
| "loss_layer_24_head": 0.1705760508775711, | |
| "loss_layer_30_head": 0.1071806401014328, | |
| "loss_layer_36_head": 0.07460781186819077, | |
| "loss_layer_42_head": 0.04227457195520401, | |
| "loss_layer_6_head": 0.5821677446365356, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 55.84045584045584, | |
| "grad_norm": 0.4489143425230452, | |
| "learning_rate": 0.0022097677146869243, | |
| "loss": 2.0616, | |
| "loss_layer_12_head": 0.3607775568962097, | |
| "loss_layer_18_head": 0.690375566482544, | |
| "loss_layer_24_head": 0.1676177680492401, | |
| "loss_layer_30_head": 0.10613974183797836, | |
| "loss_layer_36_head": 0.0728280320763588, | |
| "loss_layer_42_head": 0.043055903166532516, | |
| "loss_layer_6_head": 0.5777379274368286, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 56.06837606837607, | |
| "grad_norm": 0.4573486183341264, | |
| "learning_rate": 0.002189140738381288, | |
| "loss": 2.0511, | |
| "loss_layer_12_head": 0.37861520051956177, | |
| "loss_layer_18_head": 0.7308498620986938, | |
| "loss_layer_24_head": 0.1784064620733261, | |
| "loss_layer_30_head": 0.11279511451721191, | |
| "loss_layer_36_head": 0.07710321247577667, | |
| "loss_layer_42_head": 0.045412637293338776, | |
| "loss_layer_6_head": 0.5905116200447083, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 56.2962962962963, | |
| "grad_norm": 0.5031971348141655, | |
| "learning_rate": 0.0021685352343208016, | |
| "loss": 2.0235, | |
| "loss_layer_12_head": 0.3556618094444275, | |
| "loss_layer_18_head": 0.6965985298156738, | |
| "loss_layer_24_head": 0.1655425876379013, | |
| "loss_layer_30_head": 0.10600040853023529, | |
| "loss_layer_36_head": 0.0712694302201271, | |
| "loss_layer_42_head": 0.041733045130968094, | |
| "loss_layer_6_head": 0.5654771327972412, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 56.52421652421653, | |
| "grad_norm": 0.44714368064818283, | |
| "learning_rate": 0.0021479526258069083, | |
| "loss": 2.0147, | |
| "loss_layer_12_head": 0.37383121252059937, | |
| "loss_layer_18_head": 0.7061088681221008, | |
| "loss_layer_24_head": 0.1718534529209137, | |
| "loss_layer_30_head": 0.1083112508058548, | |
| "loss_layer_36_head": 0.07109373807907104, | |
| "loss_layer_42_head": 0.041331950575113297, | |
| "loss_layer_6_head": 0.5899810791015625, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 56.75213675213675, | |
| "grad_norm": 0.47642493251870227, | |
| "learning_rate": 0.0021273943345595635, | |
| "loss": 2.0125, | |
| "loss_layer_12_head": 0.38709744811058044, | |
| "loss_layer_18_head": 0.7057448029518127, | |
| "loss_layer_24_head": 0.17811325192451477, | |
| "loss_layer_30_head": 0.11347751319408417, | |
| "loss_layer_36_head": 0.07625190913677216, | |
| "loss_layer_42_head": 0.043919991701841354, | |
| "loss_layer_6_head": 0.6019116640090942, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 56.98005698005698, | |
| "grad_norm": 0.3972745328767057, | |
| "learning_rate": 0.002106861780619037, | |
| "loss": 2.0611, | |
| "loss_layer_12_head": 0.37597453594207764, | |
| "loss_layer_18_head": 0.6905866861343384, | |
| "loss_layer_24_head": 0.17333197593688965, | |
| "loss_layer_30_head": 0.11129488050937653, | |
| "loss_layer_36_head": 0.0752146914601326, | |
| "loss_layer_42_head": 0.04286158084869385, | |
| "loss_layer_6_head": 0.5845304131507874, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 57.20797720797721, | |
| "grad_norm": 0.48469221114870253, | |
| "learning_rate": 0.002086356382247822, | |
| "loss": 1.9903, | |
| "loss_layer_12_head": 0.368541955947876, | |
| "loss_layer_18_head": 0.6999972462654114, | |
| "loss_layer_24_head": 0.16970302164554596, | |
| "loss_layer_30_head": 0.10926713794469833, | |
| "loss_layer_36_head": 0.07249046862125397, | |
| "loss_layer_42_head": 0.041749563068151474, | |
| "loss_layer_6_head": 0.5862393379211426, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 57.43589743589744, | |
| "grad_norm": 0.611574187970635, | |
| "learning_rate": 0.0020658795558326742, | |
| "loss": 2.0046, | |
| "loss_layer_12_head": 0.36997026205062866, | |
| "loss_layer_18_head": 0.6786366701126099, | |
| "loss_layer_24_head": 0.1694273203611374, | |
| "loss_layer_30_head": 0.10953061282634735, | |
| "loss_layer_36_head": 0.07315023243427277, | |
| "loss_layer_42_head": 0.04222399741411209, | |
| "loss_layer_6_head": 0.5881815552711487, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 57.66381766381767, | |
| "grad_norm": 0.6153677450824064, | |
| "learning_rate": 0.0020454327157867734, | |
| "loss": 2.0109, | |
| "loss_layer_12_head": 0.367452472448349, | |
| "loss_layer_18_head": 0.6759993433952332, | |
| "loss_layer_24_head": 0.1668408215045929, | |
| "loss_layer_30_head": 0.10686562955379486, | |
| "loss_layer_36_head": 0.07048901170492172, | |
| "loss_layer_42_head": 0.04098617285490036, | |
| "loss_layer_6_head": 0.5859954357147217, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 57.89173789173789, | |
| "grad_norm": 0.2935217424002303, | |
| "learning_rate": 0.002025017274452026, | |
| "loss": 2.0181, | |
| "loss_layer_12_head": 0.37350600957870483, | |
| "loss_layer_18_head": 0.6737070679664612, | |
| "loss_layer_24_head": 0.17168107628822327, | |
| "loss_layer_30_head": 0.11016984283924103, | |
| "loss_layer_36_head": 0.07335834950208664, | |
| "loss_layer_42_head": 0.04219553619623184, | |
| "loss_layer_6_head": 0.5844029188156128, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 58.11965811965812, | |
| "grad_norm": 0.2621978314625234, | |
| "learning_rate": 0.0020046346420015066, | |
| "loss": 1.9957, | |
| "loss_layer_12_head": 0.3653263449668884, | |
| "loss_layer_18_head": 0.6525477170944214, | |
| "loss_layer_24_head": 0.16379979252815247, | |
| "loss_layer_30_head": 0.10514315217733383, | |
| "loss_layer_36_head": 0.0698147565126419, | |
| "loss_layer_42_head": 0.041251130402088165, | |
| "loss_layer_6_head": 0.5668857097625732, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 58.34757834757835, | |
| "grad_norm": 0.26002559793149466, | |
| "learning_rate": 0.0019842862263420562, | |
| "loss": 1.9703, | |
| "loss_layer_12_head": 0.36825767159461975, | |
| "loss_layer_18_head": 0.6790739297866821, | |
| "loss_layer_24_head": 0.17250864207744598, | |
| "loss_layer_30_head": 0.10994887351989746, | |
| "loss_layer_36_head": 0.07393065840005875, | |
| "loss_layer_42_head": 0.04338192567229271, | |
| "loss_layer_6_head": 0.5710390210151672, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 58.57549857549858, | |
| "grad_norm": 0.3664467976121708, | |
| "learning_rate": 0.001963973433017031, | |
| "loss": 1.9765, | |
| "loss_layer_12_head": 0.35958853363990784, | |
| "loss_layer_18_head": 0.6540105938911438, | |
| "loss_layer_24_head": 0.16704809665679932, | |
| "loss_layer_30_head": 0.10820546001195908, | |
| "loss_layer_36_head": 0.07247563451528549, | |
| "loss_layer_42_head": 0.042074285447597504, | |
| "loss_layer_6_head": 0.5653725862503052, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 58.8034188034188, | |
| "grad_norm": 0.36752505491576265, | |
| "learning_rate": 0.001943697665109214, | |
| "loss": 1.9711, | |
| "loss_layer_12_head": 0.36526957154273987, | |
| "loss_layer_18_head": 0.6480482816696167, | |
| "loss_layer_24_head": 0.16833284497261047, | |
| "loss_layer_30_head": 0.10793248564004898, | |
| "loss_layer_36_head": 0.07179885357618332, | |
| "loss_layer_42_head": 0.04152452200651169, | |
| "loss_layer_6_head": 0.5726978182792664, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 59.03133903133903, | |
| "grad_norm": 0.27873606157572917, | |
| "learning_rate": 0.0019234603231438995, | |
| "loss": 1.9802, | |
| "loss_layer_12_head": 0.36396297812461853, | |
| "loss_layer_18_head": 0.6509535312652588, | |
| "loss_layer_24_head": 0.1683129370212555, | |
| "loss_layer_30_head": 0.10660085827112198, | |
| "loss_layer_36_head": 0.07140365988016129, | |
| "loss_layer_42_head": 0.0416494682431221, | |
| "loss_layer_6_head": 0.5673893690109253, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 59.25925925925926, | |
| "grad_norm": 0.25089726312834165, | |
| "learning_rate": 0.0019032628049921558, | |
| "loss": 1.9404, | |
| "loss_layer_12_head": 0.3589734435081482, | |
| "loss_layer_18_head": 0.6603430509567261, | |
| "loss_layer_24_head": 0.16413089632987976, | |
| "loss_layer_30_head": 0.1043197512626648, | |
| "loss_layer_36_head": 0.06929105520248413, | |
| "loss_layer_42_head": 0.041358429938554764, | |
| "loss_layer_6_head": 0.5630621910095215, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 59.48717948717949, | |
| "grad_norm": 0.22103248903884454, | |
| "learning_rate": 0.0018831065057742658, | |
| "loss": 1.9406, | |
| "loss_layer_12_head": 0.359492689371109, | |
| "loss_layer_18_head": 0.6408952474594116, | |
| "loss_layer_24_head": 0.16286614537239075, | |
| "loss_layer_30_head": 0.10376974195241928, | |
| "loss_layer_36_head": 0.06853290647268295, | |
| "loss_layer_42_head": 0.039769940078258514, | |
| "loss_layer_6_head": 0.5613786578178406, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 59.71509971509972, | |
| "grad_norm": 0.27744975382692894, | |
| "learning_rate": 0.0018629928177633637, | |
| "loss": 1.9501, | |
| "loss_layer_12_head": 0.3579030930995941, | |
| "loss_layer_18_head": 0.6238123178482056, | |
| "loss_layer_24_head": 0.1625664085149765, | |
| "loss_layer_30_head": 0.10378286987543106, | |
| "loss_layer_36_head": 0.06863966584205627, | |
| "loss_layer_42_head": 0.0397348552942276, | |
| "loss_layer_6_head": 0.5519455671310425, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 59.94301994301994, | |
| "grad_norm": 0.3067961859601614, | |
| "learning_rate": 0.0018429231302892618, | |
| "loss": 1.9753, | |
| "loss_layer_12_head": 0.3674226701259613, | |
| "loss_layer_18_head": 0.6348339915275574, | |
| "loss_layer_24_head": 0.16799281537532806, | |
| "loss_layer_30_head": 0.10703931003808975, | |
| "loss_layer_36_head": 0.07145868241786957, | |
| "loss_layer_42_head": 0.042139433324337006, | |
| "loss_layer_6_head": 0.5692054033279419, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 60.17094017094017, | |
| "grad_norm": 0.23589211575663038, | |
| "learning_rate": 0.0018228988296424876, | |
| "loss": 1.9278, | |
| "loss_layer_12_head": 0.35995879769325256, | |
| "loss_layer_18_head": 0.6400879621505737, | |
| "loss_layer_24_head": 0.1672223061323166, | |
| "loss_layer_30_head": 0.10712926089763641, | |
| "loss_layer_36_head": 0.07134952396154404, | |
| "loss_layer_42_head": 0.041246697306632996, | |
| "loss_layer_6_head": 0.5575507879257202, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 60.3988603988604, | |
| "grad_norm": 0.33319843426515144, | |
| "learning_rate": 0.001802921298978524, | |
| "loss": 1.9185, | |
| "loss_layer_12_head": 0.3579893708229065, | |
| "loss_layer_18_head": 0.6205559968948364, | |
| "loss_layer_24_head": 0.16404248774051666, | |
| "loss_layer_30_head": 0.10560892522335052, | |
| "loss_layer_36_head": 0.07029516994953156, | |
| "loss_layer_42_head": 0.040432218462228775, | |
| "loss_layer_6_head": 0.5526586771011353, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 60.62678062678063, | |
| "grad_norm": 0.3388940110950527, | |
| "learning_rate": 0.0017829919182222752, | |
| "loss": 1.9358, | |
| "loss_layer_12_head": 0.36018261313438416, | |
| "loss_layer_18_head": 0.6270455121994019, | |
| "loss_layer_24_head": 0.1631208062171936, | |
| "loss_layer_30_head": 0.10345069319009781, | |
| "loss_layer_36_head": 0.06899687647819519, | |
| "loss_layer_42_head": 0.040876902639865875, | |
| "loss_layer_6_head": 0.5665128827095032, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 60.85470085470085, | |
| "grad_norm": 0.23376441286656932, | |
| "learning_rate": 0.0017631120639727393, | |
| "loss": 1.9364, | |
| "loss_layer_12_head": 0.36492669582366943, | |
| "loss_layer_18_head": 0.617691695690155, | |
| "loss_layer_24_head": 0.166006401181221, | |
| "loss_layer_30_head": 0.10586857795715332, | |
| "loss_layer_36_head": 0.07113579660654068, | |
| "loss_layer_42_head": 0.042698584496974945, | |
| "loss_layer_6_head": 0.5667874217033386, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 61.08262108262108, | |
| "grad_norm": 0.261608716597069, | |
| "learning_rate": 0.0017432831094079354, | |
| "loss": 1.9645, | |
| "loss_layer_12_head": 0.36654841899871826, | |
| "loss_layer_18_head": 0.638596773147583, | |
| "loss_layer_24_head": 0.16740819811820984, | |
| "loss_layer_30_head": 0.10700327157974243, | |
| "loss_layer_36_head": 0.07154614478349686, | |
| "loss_layer_42_head": 0.04158513993024826, | |
| "loss_layer_6_head": 0.5753756761550903, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 61.31054131054131, | |
| "grad_norm": 0.26498707042333075, | |
| "learning_rate": 0.0017235064241900406, | |
| "loss": 1.9308, | |
| "loss_layer_12_head": 0.35222604870796204, | |
| "loss_layer_18_head": 0.619999885559082, | |
| "loss_layer_24_head": 0.16033700108528137, | |
| "loss_layer_30_head": 0.10242090374231339, | |
| "loss_layer_36_head": 0.06886529922485352, | |
| "loss_layer_42_head": 0.040125004947185516, | |
| "loss_layer_6_head": 0.5601471662521362, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 61.53846153846154, | |
| "grad_norm": 0.22226419828206537, | |
| "learning_rate": 0.0017037833743707893, | |
| "loss": 1.9023, | |
| "loss_layer_12_head": 0.36284562945365906, | |
| "loss_layer_18_head": 0.624627947807312, | |
| "loss_layer_24_head": 0.16515551507472992, | |
| "loss_layer_30_head": 0.10532643646001816, | |
| "loss_layer_36_head": 0.07034877687692642, | |
| "loss_layer_42_head": 0.04081105440855026, | |
| "loss_layer_6_head": 0.5715882182121277, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 61.76638176638177, | |
| "grad_norm": 0.24395119376035693, | |
| "learning_rate": 0.0016841153222971112, | |
| "loss": 1.9328, | |
| "loss_layer_12_head": 0.36113080382347107, | |
| "loss_layer_18_head": 0.6124555468559265, | |
| "loss_layer_24_head": 0.16591724753379822, | |
| "loss_layer_30_head": 0.10665629804134369, | |
| "loss_layer_36_head": 0.07249001413583755, | |
| "loss_layer_42_head": 0.04333646222949028, | |
| "loss_layer_6_head": 0.5653160214424133, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 61.99430199430199, | |
| "grad_norm": 0.2713219570569318, | |
| "learning_rate": 0.0016645036265170313, | |
| "loss": 1.9178, | |
| "loss_layer_12_head": 0.36808109283447266, | |
| "loss_layer_18_head": 0.6035963892936707, | |
| "loss_layer_24_head": 0.16664768755435944, | |
| "loss_layer_30_head": 0.10622652620077133, | |
| "loss_layer_36_head": 0.07072083652019501, | |
| "loss_layer_42_head": 0.04039480537176132, | |
| "loss_layer_6_head": 0.5705283880233765, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 62.22222222222222, | |
| "grad_norm": 0.16434355933361142, | |
| "learning_rate": 0.0016449496416858283, | |
| "loss": 1.8723, | |
| "loss_layer_12_head": 0.3438248336315155, | |
| "loss_layer_18_head": 0.6018158793449402, | |
| "loss_layer_24_head": 0.15885809063911438, | |
| "loss_layer_30_head": 0.10279610008001328, | |
| "loss_layer_36_head": 0.06658101826906204, | |
| "loss_layer_42_head": 0.038769569247961044, | |
| "loss_layer_6_head": 0.5343616604804993, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 62.45014245014245, | |
| "grad_norm": 0.17031052389682674, | |
| "learning_rate": 0.001625454718472464, | |
| "loss": 1.8919, | |
| "loss_layer_12_head": 0.354342520236969, | |
| "loss_layer_18_head": 0.6070439219474792, | |
| "loss_layer_24_head": 0.16174617409706116, | |
| "loss_layer_30_head": 0.10284841060638428, | |
| "loss_layer_36_head": 0.06826045364141464, | |
| "loss_layer_42_head": 0.0396745428442955, | |
| "loss_layer_6_head": 0.5540146231651306, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 62.67806267806268, | |
| "grad_norm": 0.17239300408220518, | |
| "learning_rate": 0.0016060202034662847, | |
| "loss": 1.892, | |
| "loss_layer_12_head": 0.35568657517433167, | |
| "loss_layer_18_head": 0.6011655330657959, | |
| "loss_layer_24_head": 0.16301211714744568, | |
| "loss_layer_30_head": 0.10400134325027466, | |
| "loss_layer_36_head": 0.06935641914606094, | |
| "loss_layer_42_head": 0.04025629162788391, | |
| "loss_layer_6_head": 0.5542250275611877, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 62.9059829059829, | |
| "grad_norm": 0.17793556083173895, | |
| "learning_rate": 0.0015866474390840125, | |
| "loss": 1.914, | |
| "loss_layer_12_head": 0.3635668158531189, | |
| "loss_layer_18_head": 0.6096671223640442, | |
| "loss_layer_24_head": 0.16600359976291656, | |
| "loss_layer_30_head": 0.10514751821756363, | |
| "loss_layer_36_head": 0.06970134377479553, | |
| "loss_layer_42_head": 0.040074046701192856, | |
| "loss_layer_6_head": 0.565592348575592, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 63.13390313390313, | |
| "grad_norm": 0.15331853155042946, | |
| "learning_rate": 0.001567337763477015, | |
| "loss": 1.8954, | |
| "loss_layer_12_head": 0.353837251663208, | |
| "loss_layer_18_head": 0.6123312711715698, | |
| "loss_layer_24_head": 0.16154246032238007, | |
| "loss_layer_30_head": 0.1025996059179306, | |
| "loss_layer_36_head": 0.06770062446594238, | |
| "loss_layer_42_head": 0.03924650326371193, | |
| "loss_layer_6_head": 0.5534176230430603, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 63.36182336182336, | |
| "grad_norm": 0.13314106837890913, | |
| "learning_rate": 0.0015480925104388763, | |
| "loss": 1.8684, | |
| "loss_layer_12_head": 0.34603065252304077, | |
| "loss_layer_18_head": 0.5800515413284302, | |
| "loss_layer_24_head": 0.15730763971805573, | |
| "loss_layer_30_head": 0.09923694282770157, | |
| "loss_layer_36_head": 0.06509393453598022, | |
| "loss_layer_42_head": 0.037624310702085495, | |
| "loss_layer_6_head": 0.5397533178329468, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 63.58974358974359, | |
| "grad_norm": 0.1821027004311222, | |
| "learning_rate": 0.0015289130093132633, | |
| "loss": 1.8867, | |
| "loss_layer_12_head": 0.35637348890304565, | |
| "loss_layer_18_head": 0.5976595282554626, | |
| "loss_layer_24_head": 0.1641477346420288, | |
| "loss_layer_30_head": 0.10457787662744522, | |
| "loss_layer_36_head": 0.06947280466556549, | |
| "loss_layer_42_head": 0.039845842868089676, | |
| "loss_layer_6_head": 0.5539056658744812, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 63.81766381766382, | |
| "grad_norm": 0.14339663827541982, | |
| "learning_rate": 0.001509800584902108, | |
| "loss": 1.8844, | |
| "loss_layer_12_head": 0.36510032415390015, | |
| "loss_layer_18_head": 0.6054537892341614, | |
| "loss_layer_24_head": 0.16671153903007507, | |
| "loss_layer_30_head": 0.10565783828496933, | |
| "loss_layer_36_head": 0.06983356922864914, | |
| "loss_layer_42_head": 0.03976988047361374, | |
| "loss_layer_6_head": 0.56358402967453, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 63.81766381766382, | |
| "eval_loss": 3.8140172958374023, | |
| "eval_loss_layer_12_head": 0.8028413653373718, | |
| "eval_loss_layer_18_head": 0.7729310989379883, | |
| "eval_loss_layer_24_head": 0.43893590569496155, | |
| "eval_loss_layer_30_head": 0.30446603894233704, | |
| "eval_loss_layer_36_head": 0.20355641841888428, | |
| "eval_loss_layer_42_head": 0.13499176502227783, | |
| "eval_loss_layer_6_head": 1.072322964668274, | |
| "eval_runtime": 4.9557, | |
| "eval_samples_per_second": 6.659, | |
| "eval_steps_per_second": 0.605, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 64.04558404558405, | |
| "grad_norm": 0.1706143168596619, | |
| "learning_rate": 0.0014907565573740943, | |
| "loss": 1.8565, | |
| "loss_layer_12_head": 0.36537107825279236, | |
| "loss_layer_18_head": 0.5950644016265869, | |
| "loss_layer_24_head": 0.16890759766101837, | |
| "loss_layer_30_head": 0.10786622762680054, | |
| "loss_layer_36_head": 0.0724274069070816, | |
| "loss_layer_42_head": 0.042885925620794296, | |
| "loss_layer_6_head": 0.5630357265472412, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 64.27350427350427, | |
| "grad_norm": 0.1616402364897859, | |
| "learning_rate": 0.0014717822421734716, | |
| "loss": 1.8474, | |
| "loss_layer_12_head": 0.3484862744808197, | |
| "loss_layer_18_head": 0.5841726660728455, | |
| "loss_layer_24_head": 0.15990933775901794, | |
| "loss_layer_30_head": 0.10199542343616486, | |
| "loss_layer_36_head": 0.06788065284490585, | |
| "loss_layer_42_head": 0.03943866491317749, | |
| "loss_layer_6_head": 0.544236421585083, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 64.5014245014245, | |
| "grad_norm": 0.1564142623602139, | |
| "learning_rate": 0.0014528789499291884, | |
| "loss": 1.884, | |
| "loss_layer_12_head": 0.3577747642993927, | |
| "loss_layer_18_head": 0.5889385342597961, | |
| "loss_layer_24_head": 0.1639334261417389, | |
| "loss_layer_30_head": 0.10324330627918243, | |
| "loss_layer_36_head": 0.06843547523021698, | |
| "loss_layer_42_head": 0.03951374441385269, | |
| "loss_layer_6_head": 0.5521500706672668, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 64.72934472934473, | |
| "grad_norm": 0.1668705051362201, | |
| "learning_rate": 0.0014340479863643658, | |
| "loss": 1.8727, | |
| "loss_layer_12_head": 0.3684167265892029, | |
| "loss_layer_18_head": 0.6115270853042603, | |
| "loss_layer_24_head": 0.16978159546852112, | |
| "loss_layer_30_head": 0.10723569244146347, | |
| "loss_layer_36_head": 0.07100329548120499, | |
| "loss_layer_42_head": 0.04143214598298073, | |
| "loss_layer_6_head": 0.5671561360359192, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 64.95726495726495, | |
| "grad_norm": 0.1722312849981684, | |
| "learning_rate": 0.001415290652206105, | |
| "loss": 1.8603, | |
| "loss_layer_12_head": 0.3622528910636902, | |
| "loss_layer_18_head": 0.5668131113052368, | |
| "loss_layer_24_head": 0.16173304617404938, | |
| "loss_layer_30_head": 0.10281785577535629, | |
| "loss_layer_36_head": 0.0677582398056984, | |
| "loss_layer_42_head": 0.03934178501367569, | |
| "loss_layer_6_head": 0.5578157901763916, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 65.18518518518519, | |
| "grad_norm": 0.1565891848572285, | |
| "learning_rate": 0.0013966082430956401, | |
| "loss": 1.8396, | |
| "loss_layer_12_head": 0.3468489646911621, | |
| "loss_layer_18_head": 0.5685579776763916, | |
| "loss_layer_24_head": 0.1579982042312622, | |
| "loss_layer_30_head": 0.09962925314903259, | |
| "loss_layer_36_head": 0.06661403924226761, | |
| "loss_layer_42_head": 0.0386619046330452, | |
| "loss_layer_6_head": 0.537671685218811, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 65.41310541310541, | |
| "grad_norm": 0.20295485671464988, | |
| "learning_rate": 0.0013780020494988446, | |
| "loss": 1.8453, | |
| "loss_layer_12_head": 0.3568614423274994, | |
| "loss_layer_18_head": 0.594135582447052, | |
| "loss_layer_24_head": 0.16297490894794464, | |
| "loss_layer_30_head": 0.1031971201300621, | |
| "loss_layer_36_head": 0.06798653304576874, | |
| "loss_layer_42_head": 0.0389096699655056, | |
| "loss_layer_6_head": 0.5536288022994995, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 65.64102564102564, | |
| "grad_norm": 0.2043733204414482, | |
| "learning_rate": 0.0013594733566170926, | |
| "loss": 1.8431, | |
| "loss_layer_12_head": 0.35691505670547485, | |
| "loss_layer_18_head": 0.571416437625885, | |
| "loss_layer_24_head": 0.16371139883995056, | |
| "loss_layer_30_head": 0.10467123985290527, | |
| "loss_layer_36_head": 0.06964066624641418, | |
| "loss_layer_42_head": 0.03979448229074478, | |
| "loss_layer_6_head": 0.5498124361038208, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 65.86894586894587, | |
| "grad_norm": 0.1854719778610673, | |
| "learning_rate": 0.0013410234442984858, | |
| "loss": 1.8603, | |
| "loss_layer_12_head": 0.3696306347846985, | |
| "loss_layer_18_head": 0.5896201729774475, | |
| "loss_layer_24_head": 0.16896311938762665, | |
| "loss_layer_30_head": 0.1075272187590599, | |
| "loss_layer_36_head": 0.07201769948005676, | |
| "loss_layer_42_head": 0.041474007070064545, | |
| "loss_layer_6_head": 0.5694519281387329, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 66.0968660968661, | |
| "grad_norm": 0.13340586562162032, | |
| "learning_rate": 0.0013226535869494504, | |
| "loss": 1.854, | |
| "loss_layer_12_head": 0.3521641194820404, | |
| "loss_layer_18_head": 0.5745565891265869, | |
| "loss_layer_24_head": 0.16140435636043549, | |
| "loss_layer_30_head": 0.10265658050775528, | |
| "loss_layer_36_head": 0.06814004480838776, | |
| "loss_layer_42_head": 0.03969401866197586, | |
| "loss_layer_6_head": 0.5429982542991638, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 66.32478632478633, | |
| "grad_norm": 0.17959825279064529, | |
| "learning_rate": 0.0013043650534467052, | |
| "loss": 1.8141, | |
| "loss_layer_12_head": 0.35965582728385925, | |
| "loss_layer_18_head": 0.5807262659072876, | |
| "loss_layer_24_head": 0.16504819691181183, | |
| "loss_layer_30_head": 0.10508060455322266, | |
| "loss_layer_36_head": 0.06974060833454132, | |
| "loss_layer_42_head": 0.040343768894672394, | |
| "loss_layer_6_head": 0.5519964694976807, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 66.55270655270655, | |
| "grad_norm": 0.15133186076029295, | |
| "learning_rate": 0.0012861591070496192, | |
| "loss": 1.8282, | |
| "loss_layer_12_head": 0.33240434527397156, | |
| "loss_layer_18_head": 0.533875048160553, | |
| "loss_layer_24_head": 0.1510429084300995, | |
| "loss_layer_30_head": 0.09637447446584702, | |
| "loss_layer_36_head": 0.06486562639474869, | |
| "loss_layer_42_head": 0.038768868893384933, | |
| "loss_layer_6_head": 0.5180043578147888, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 66.78062678062678, | |
| "grad_norm": 0.12781398423219267, | |
| "learning_rate": 0.0012680370053129552, | |
| "loss": 1.8658, | |
| "loss_layer_12_head": 0.3577564060688019, | |
| "loss_layer_18_head": 0.5643961429595947, | |
| "loss_layer_24_head": 0.16104556620121002, | |
| "loss_layer_30_head": 0.10230318456888199, | |
| "loss_layer_36_head": 0.0684160441160202, | |
| "loss_layer_42_head": 0.03966595605015755, | |
| "loss_layer_6_head": 0.5522262454032898, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 67.00854700854701, | |
| "grad_norm": 0.1434611582862995, | |
| "learning_rate": 0.0012500000000000007, | |
| "loss": 1.8492, | |
| "loss_layer_12_head": 0.36101633310317993, | |
| "loss_layer_18_head": 0.5714535117149353, | |
| "loss_layer_24_head": 0.16426679491996765, | |
| "loss_layer_30_head": 0.10345318168401718, | |
| "loss_layer_36_head": 0.06851635128259659, | |
| "loss_layer_42_head": 0.039524100720882416, | |
| "loss_layer_6_head": 0.5529091358184814, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 67.23646723646723, | |
| "grad_norm": 0.15891028079726016, | |
| "learning_rate": 0.0012320493369961025, | |
| "loss": 1.8095, | |
| "loss_layer_12_head": 0.34979498386383057, | |
| "loss_layer_18_head": 0.571343719959259, | |
| "loss_layer_24_head": 0.15945219993591309, | |
| "loss_layer_30_head": 0.10051491111516953, | |
| "loss_layer_36_head": 0.06638234108686447, | |
| "loss_layer_42_head": 0.03841354325413704, | |
| "loss_layer_6_head": 0.5429816842079163, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 67.46438746438747, | |
| "grad_norm": 0.1449020760026797, | |
| "learning_rate": 0.0012141862562226164, | |
| "loss": 1.8208, | |
| "loss_layer_12_head": 0.3603321611881256, | |
| "loss_layer_18_head": 0.574971079826355, | |
| "loss_layer_24_head": 0.16558215022087097, | |
| "loss_layer_30_head": 0.10478846728801727, | |
| "loss_layer_36_head": 0.06960511952638626, | |
| "loss_layer_42_head": 0.04023158177733421, | |
| "loss_layer_6_head": 0.5533859133720398, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 67.6923076923077, | |
| "grad_norm": 0.1515034944495946, | |
| "learning_rate": 0.001196411991551255, | |
| "loss": 1.8069, | |
| "loss_layer_12_head": 0.35250595211982727, | |
| "loss_layer_18_head": 0.5499317646026611, | |
| "loss_layer_24_head": 0.16539210081100464, | |
| "loss_layer_30_head": 0.10548852384090424, | |
| "loss_layer_36_head": 0.07063941657543182, | |
| "loss_layer_42_head": 0.03947348892688751, | |
| "loss_layer_6_head": 0.5477828979492188, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 67.92022792022792, | |
| "grad_norm": 0.13885251819035915, | |
| "learning_rate": 0.0011787277707188614, | |
| "loss": 1.8487, | |
| "loss_layer_12_head": 0.3500252366065979, | |
| "loss_layer_18_head": 0.5536776781082153, | |
| "loss_layer_24_head": 0.16161292791366577, | |
| "loss_layer_30_head": 0.10370011627674103, | |
| "loss_layer_36_head": 0.06909648329019547, | |
| "loss_layer_42_head": 0.04028378799557686, | |
| "loss_layer_6_head": 0.5405044555664062, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 68.14814814814815, | |
| "grad_norm": 0.1477900835869908, | |
| "learning_rate": 0.001161134815242604, | |
| "loss": 1.8287, | |
| "loss_layer_12_head": 0.36902493238449097, | |
| "loss_layer_18_head": 0.577628493309021, | |
| "loss_layer_24_head": 0.1691674441099167, | |
| "loss_layer_30_head": 0.10768872499465942, | |
| "loss_layer_36_head": 0.07162035256624222, | |
| "loss_layer_42_head": 0.041386984288692474, | |
| "loss_layer_6_head": 0.5679258108139038, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 68.37606837606837, | |
| "grad_norm": 0.1497415381552831, | |
| "learning_rate": 0.0011436343403356016, | |
| "loss": 1.8166, | |
| "loss_layer_12_head": 0.3537401258945465, | |
| "loss_layer_18_head": 0.5527002811431885, | |
| "loss_layer_24_head": 0.16221362352371216, | |
| "loss_layer_30_head": 0.10519299656152725, | |
| "loss_layer_36_head": 0.0677286759018898, | |
| "loss_layer_42_head": 0.038980789482593536, | |
| "loss_layer_6_head": 0.5500974059104919, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 68.6039886039886, | |
| "grad_norm": 0.16417939418429925, | |
| "learning_rate": 0.001126227554822985, | |
| "loss": 1.8257, | |
| "loss_layer_12_head": 0.35889554023742676, | |
| "loss_layer_18_head": 0.566818118095398, | |
| "loss_layer_24_head": 0.16498246788978577, | |
| "loss_layer_30_head": 0.10522939264774323, | |
| "loss_layer_36_head": 0.0697682723402977, | |
| "loss_layer_42_head": 0.039750516414642334, | |
| "loss_layer_6_head": 0.5557876825332642, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 68.83190883190883, | |
| "grad_norm": 0.13478519747161247, | |
| "learning_rate": 0.0011089156610583984, | |
| "loss": 1.8031, | |
| "loss_layer_12_head": 0.33702507615089417, | |
| "loss_layer_18_head": 0.5199332237243652, | |
| "loss_layer_24_head": 0.15418687462806702, | |
| "loss_layer_30_head": 0.09883591532707214, | |
| "loss_layer_36_head": 0.06565789133310318, | |
| "loss_layer_42_head": 0.03810978680849075, | |
| "loss_layer_6_head": 0.5240939855575562, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 69.05982905982906, | |
| "grad_norm": 0.15061647322783647, | |
| "learning_rate": 0.0010916998548409448, | |
| "loss": 1.8214, | |
| "loss_layer_12_head": 0.36763328313827515, | |
| "loss_layer_18_head": 0.5576991438865662, | |
| "loss_layer_24_head": 0.17053815722465515, | |
| "loss_layer_30_head": 0.10889364778995514, | |
| "loss_layer_36_head": 0.0726301372051239, | |
| "loss_layer_42_head": 0.04174993932247162, | |
| "loss_layer_6_head": 0.5595671534538269, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 69.28774928774929, | |
| "grad_norm": 0.1703599863224025, | |
| "learning_rate": 0.0010745813253325955, | |
| "loss": 1.8176, | |
| "loss_layer_12_head": 0.3379597067832947, | |
| "loss_layer_18_head": 0.5326976180076599, | |
| "loss_layer_24_head": 0.15527421236038208, | |
| "loss_layer_30_head": 0.09891333431005478, | |
| "loss_layer_36_head": 0.06529764831066132, | |
| "loss_layer_42_head": 0.037422697991132736, | |
| "loss_layer_6_head": 0.5266030430793762, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 69.51566951566952, | |
| "grad_norm": 0.15303418906286098, | |
| "learning_rate": 0.0010575612549760425, | |
| "loss": 1.7911, | |
| "loss_layer_12_head": 0.34493082761764526, | |
| "loss_layer_18_head": 0.540948748588562, | |
| "loss_layer_24_head": 0.15642331540584564, | |
| "loss_layer_30_head": 0.09965353459119797, | |
| "loss_layer_36_head": 0.06654352694749832, | |
| "loss_layer_42_head": 0.038676489144563675, | |
| "loss_layer_6_head": 0.5338304042816162, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 69.74358974358974, | |
| "grad_norm": 0.15597397432669702, | |
| "learning_rate": 0.001040640819413026, | |
| "loss": 1.801, | |
| "loss_layer_12_head": 0.3541865944862366, | |
| "loss_layer_18_head": 0.5545614957809448, | |
| "loss_layer_24_head": 0.16169731318950653, | |
| "loss_layer_30_head": 0.10177583992481232, | |
| "loss_layer_36_head": 0.06731664389371872, | |
| "loss_layer_42_head": 0.03906581178307533, | |
| "loss_layer_6_head": 0.5432997941970825, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 69.97150997150997, | |
| "grad_norm": 0.1381659058057447, | |
| "learning_rate": 0.0010238211874031258, | |
| "loss": 1.819, | |
| "loss_layer_12_head": 0.35804328322410583, | |
| "loss_layer_18_head": 0.5514500141143799, | |
| "loss_layer_24_head": 0.16211315989494324, | |
| "loss_layer_30_head": 0.10239671170711517, | |
| "loss_layer_36_head": 0.06772608309984207, | |
| "loss_layer_42_head": 0.03940841555595398, | |
| "loss_layer_6_head": 0.5507400631904602, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 70.1994301994302, | |
| "grad_norm": 0.12515170214336985, | |
| "learning_rate": 0.0010071035207430351, | |
| "loss": 1.8026, | |
| "loss_layer_12_head": 0.36129826307296753, | |
| "loss_layer_18_head": 0.5562499165534973, | |
| "loss_layer_24_head": 0.16456067562103271, | |
| "loss_layer_30_head": 0.10447216033935547, | |
| "loss_layer_36_head": 0.06889413297176361, | |
| "loss_layer_42_head": 0.03956874459981918, | |
| "loss_layer_6_head": 0.5565100312232971, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 70.42735042735043, | |
| "grad_norm": 0.1265262115143159, | |
| "learning_rate": 0.000990488974186306, | |
| "loss": 1.7814, | |
| "loss_layer_12_head": 0.3376021087169647, | |
| "loss_layer_18_head": 0.5280048847198486, | |
| "loss_layer_24_head": 0.15099617838859558, | |
| "loss_layer_30_head": 0.09582580626010895, | |
| "loss_layer_36_head": 0.06339169293642044, | |
| "loss_layer_42_head": 0.03644992783665657, | |
| "loss_layer_6_head": 0.526264488697052, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 70.65527065527066, | |
| "grad_norm": 0.14518716953911703, | |
| "learning_rate": 0.0009739786953635924, | |
| "loss": 1.7955, | |
| "loss_layer_12_head": 0.3329188823699951, | |
| "loss_layer_18_head": 0.522331714630127, | |
| "loss_layer_24_head": 0.15017689764499664, | |
| "loss_layer_30_head": 0.09456716477870941, | |
| "loss_layer_36_head": 0.06272949278354645, | |
| "loss_layer_42_head": 0.037007980048656464, | |
| "loss_layer_6_head": 0.5194771885871887, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 70.88319088319088, | |
| "grad_norm": 0.14492257597202254, | |
| "learning_rate": 0.0009575738247033688, | |
| "loss": 1.8043, | |
| "loss_layer_12_head": 0.3527730107307434, | |
| "loss_layer_18_head": 0.5289863348007202, | |
| "loss_layer_24_head": 0.16144096851348877, | |
| "loss_layer_30_head": 0.10272853076457977, | |
| "loss_layer_36_head": 0.06866233050823212, | |
| "loss_layer_42_head": 0.04029922932386398, | |
| "loss_layer_6_head": 0.5439268946647644, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 71.11111111111111, | |
| "grad_norm": 0.12908560052498289, | |
| "learning_rate": 0.0009412754953531664, | |
| "loss": 1.7955, | |
| "loss_layer_12_head": 0.3465037941932678, | |
| "loss_layer_18_head": 0.5363790988922119, | |
| "loss_layer_24_head": 0.15984053909778595, | |
| "loss_layer_30_head": 0.10168828815221786, | |
| "loss_layer_36_head": 0.06739744544029236, | |
| "loss_layer_42_head": 0.03855326771736145, | |
| "loss_layer_6_head": 0.533501386642456, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 71.33903133903134, | |
| "grad_norm": 0.1309584047726663, | |
| "learning_rate": 0.0009250848331012968, | |
| "loss": 1.778, | |
| "loss_layer_12_head": 0.3392285704612732, | |
| "loss_layer_18_head": 0.5243316888809204, | |
| "loss_layer_24_head": 0.15213483572006226, | |
| "loss_layer_30_head": 0.09613216668367386, | |
| "loss_layer_36_head": 0.06358274072408676, | |
| "loss_layer_42_head": 0.037117063999176025, | |
| "loss_layer_6_head": 0.5279943346977234, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 71.56695156695157, | |
| "grad_norm": 0.13119649502099703, | |
| "learning_rate": 0.0009090029562990911, | |
| "loss": 1.7932, | |
| "loss_layer_12_head": 0.3541892170906067, | |
| "loss_layer_18_head": 0.535491406917572, | |
| "loss_layer_24_head": 0.16096696257591248, | |
| "loss_layer_30_head": 0.1018461138010025, | |
| "loss_layer_36_head": 0.06726451963186264, | |
| "loss_layer_42_head": 0.03884906694293022, | |
| "loss_layer_6_head": 0.5449596643447876, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 71.7948717948718, | |
| "grad_norm": 0.15395268449321947, | |
| "learning_rate": 0.0008930309757836516, | |
| "loss": 1.7906, | |
| "loss_layer_12_head": 0.35688620805740356, | |
| "loss_layer_18_head": 0.5362941026687622, | |
| "loss_layer_24_head": 0.16272303462028503, | |
| "loss_layer_30_head": 0.10348813235759735, | |
| "loss_layer_36_head": 0.06898694485425949, | |
| "loss_layer_42_head": 0.0402650460600853, | |
| "loss_layer_6_head": 0.5459010004997253, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 72.02279202279202, | |
| "grad_norm": 0.1204264850535832, | |
| "learning_rate": 0.0008771699948011203, | |
| "loss": 1.791, | |
| "loss_layer_12_head": 0.35216954350471497, | |
| "loss_layer_18_head": 0.5342555642127991, | |
| "loss_layer_24_head": 0.16132056713104248, | |
| "loss_layer_30_head": 0.10252640396356583, | |
| "loss_layer_36_head": 0.06832431256771088, | |
| "loss_layer_42_head": 0.039825376123189926, | |
| "loss_layer_6_head": 0.5416141152381897, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 72.25071225071225, | |
| "grad_norm": 0.12330961195318141, | |
| "learning_rate": 0.0008614211089304744, | |
| "loss": 1.7645, | |
| "loss_layer_12_head": 0.3503497242927551, | |
| "loss_layer_18_head": 0.5363430976867676, | |
| "loss_layer_24_head": 0.1589389592409134, | |
| "loss_layer_30_head": 0.10142697393894196, | |
| "loss_layer_36_head": 0.06693203747272491, | |
| "loss_layer_42_head": 0.038655836135149, | |
| "loss_layer_6_head": 0.5450853109359741, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 72.47863247863248, | |
| "grad_norm": 0.1368428580674561, | |
| "learning_rate": 0.0008457854060078521, | |
| "loss": 1.7539, | |
| "loss_layer_12_head": 0.3376067876815796, | |
| "loss_layer_18_head": 0.5092514753341675, | |
| "loss_layer_24_head": 0.1555960476398468, | |
| "loss_layer_30_head": 0.10002340376377106, | |
| "loss_layer_36_head": 0.06791587173938751, | |
| "loss_layer_42_head": 0.04093862324953079, | |
| "loss_layer_6_head": 0.516821563243866, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 72.70655270655271, | |
| "grad_norm": 0.1166360566610833, | |
| "learning_rate": 0.0008302639660514069, | |
| "loss": 1.7834, | |
| "loss_layer_12_head": 0.36099570989608765, | |
| "loss_layer_18_head": 0.5416703224182129, | |
| "loss_layer_24_head": 0.164410799741745, | |
| "loss_layer_30_head": 0.10419974476099014, | |
| "loss_layer_36_head": 0.0690721794962883, | |
| "loss_layer_42_head": 0.039205193519592285, | |
| "loss_layer_6_head": 0.552871584892273, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 72.93447293447294, | |
| "grad_norm": 0.12201088430110535, | |
| "learning_rate": 0.0008148578611867113, | |
| "loss": 1.8019, | |
| "loss_layer_12_head": 0.35458269715309143, | |
| "loss_layer_18_head": 0.531002402305603, | |
| "loss_layer_24_head": 0.16304495930671692, | |
| "loss_layer_30_head": 0.10449066013097763, | |
| "loss_layer_36_head": 0.06930046528577805, | |
| "loss_layer_42_head": 0.039028845727443695, | |
| "loss_layer_6_head": 0.5448761582374573, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 72.93447293447294, | |
| "eval_loss": 3.777650833129883, | |
| "eval_loss_layer_12_head": 0.8038034439086914, | |
| "eval_loss_layer_18_head": 0.7375771999359131, | |
| "eval_loss_layer_24_head": 0.44012030959129333, | |
| "eval_loss_layer_30_head": 0.3041973412036896, | |
| "eval_loss_layer_36_head": 0.20319151878356934, | |
| "eval_loss_layer_42_head": 0.13452239334583282, | |
| "eval_loss_layer_6_head": 1.0726211071014404, | |
| "eval_runtime": 4.946, | |
| "eval_samples_per_second": 6.672, | |
| "eval_steps_per_second": 0.607, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 73.16239316239316, | |
| "grad_norm": 0.12807746671894205, | |
| "learning_rate": 0.0007995681555727011, | |
| "loss": 1.7822, | |
| "loss_layer_12_head": 0.35767659544944763, | |
| "loss_layer_18_head": 0.5432697534561157, | |
| "loss_layer_24_head": 0.16505815088748932, | |
| "loss_layer_30_head": 0.10571058839559555, | |
| "loss_layer_36_head": 0.07003127038478851, | |
| "loss_layer_42_head": 0.040247105062007904, | |
| "loss_layer_6_head": 0.5464234352111816, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 73.3903133903134, | |
| "grad_norm": 0.130365878874844, | |
| "learning_rate": 0.0007843959053281663, | |
| "loss": 1.7653, | |
| "loss_layer_12_head": 0.3459833562374115, | |
| "loss_layer_18_head": 0.5274468064308167, | |
| "loss_layer_24_head": 0.1586887389421463, | |
| "loss_layer_30_head": 0.10026203095912933, | |
| "loss_layer_36_head": 0.0662410706281662, | |
| "loss_layer_42_head": 0.03850514441728592, | |
| "loss_layer_6_head": 0.5331538915634155, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 73.61823361823362, | |
| "grad_norm": 0.10767490237253707, | |
| "learning_rate": 0.0007693421584588012, | |
| "loss": 1.7654, | |
| "loss_layer_12_head": 0.3585582375526428, | |
| "loss_layer_18_head": 0.5424355864524841, | |
| "loss_layer_24_head": 0.16505910456180573, | |
| "loss_layer_30_head": 0.10474319756031036, | |
| "loss_layer_36_head": 0.0691712498664856, | |
| "loss_layer_42_head": 0.039669524878263474, | |
| "loss_layer_6_head": 0.5465718507766724, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 73.84615384615384, | |
| "grad_norm": 0.11493378296733746, | |
| "learning_rate": 0.0007544079547848182, | |
| "loss": 1.7795, | |
| "loss_layer_12_head": 0.3523768484592438, | |
| "loss_layer_18_head": 0.5339788198471069, | |
| "loss_layer_24_head": 0.1609235554933548, | |
| "loss_layer_30_head": 0.10195883363485336, | |
| "loss_layer_36_head": 0.06740550696849823, | |
| "loss_layer_42_head": 0.03874468803405762, | |
| "loss_layer_6_head": 0.543295681476593, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 74.07407407407408, | |
| "grad_norm": 0.10561384049680506, | |
| "learning_rate": 0.0007395943258691206, | |
| "loss": 1.7689, | |
| "loss_layer_12_head": 0.3443649411201477, | |
| "loss_layer_18_head": 0.5210028886795044, | |
| "loss_layer_24_head": 0.1560635268688202, | |
| "loss_layer_30_head": 0.09920267760753632, | |
| "loss_layer_36_head": 0.06585382670164108, | |
| "loss_layer_42_head": 0.038234781473875046, | |
| "loss_layer_6_head": 0.5328817367553711, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 74.3019943019943, | |
| "grad_norm": 0.10268010300616287, | |
| "learning_rate": 0.0007249022949460493, | |
| "loss": 1.749, | |
| "loss_layer_12_head": 0.33767470717430115, | |
| "loss_layer_18_head": 0.5087472200393677, | |
| "loss_layer_24_head": 0.1519826352596283, | |
| "loss_layer_30_head": 0.09580295532941818, | |
| "loss_layer_36_head": 0.06339939683675766, | |
| "loss_layer_42_head": 0.03678145259618759, | |
| "loss_layer_6_head": 0.5183311104774475, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 74.52991452991454, | |
| "grad_norm": 0.1142043088181504, | |
| "learning_rate": 0.000710332876850704, | |
| "loss": 1.7564, | |
| "loss_layer_12_head": 0.33524709939956665, | |
| "loss_layer_18_head": 0.5125759840011597, | |
| "loss_layer_24_head": 0.1531374603509903, | |
| "loss_layer_30_head": 0.0977383702993393, | |
| "loss_layer_36_head": 0.06496452540159225, | |
| "loss_layer_42_head": 0.037830810993909836, | |
| "loss_layer_6_head": 0.5194955468177795, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 74.75783475783476, | |
| "grad_norm": 0.10734174486274735, | |
| "learning_rate": 0.0006958870779488446, | |
| "loss": 1.7828, | |
| "loss_layer_12_head": 0.3413275182247162, | |
| "loss_layer_18_head": 0.5117852091789246, | |
| "loss_layer_24_head": 0.1564275622367859, | |
| "loss_layer_30_head": 0.10126455128192902, | |
| "loss_layer_36_head": 0.06778749823570251, | |
| "loss_layer_42_head": 0.03929626941680908, | |
| "loss_layer_6_head": 0.5240119695663452, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 74.98575498575498, | |
| "grad_norm": 0.11059460221593216, | |
| "learning_rate": 0.0006815658960673781, | |
| "loss": 1.7701, | |
| "loss_layer_12_head": 0.35752952098846436, | |
| "loss_layer_18_head": 0.5365008115768433, | |
| "loss_layer_24_head": 0.16379977762699127, | |
| "loss_layer_30_head": 0.10386929661035538, | |
| "loss_layer_36_head": 0.06902356445789337, | |
| "loss_layer_42_head": 0.03992991894483566, | |
| "loss_layer_6_head": 0.5477330088615417, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 75.21367521367522, | |
| "grad_norm": 0.10623496987136612, | |
| "learning_rate": 0.0006673703204254347, | |
| "loss": 1.7388, | |
| "loss_layer_12_head": 0.35095706582069397, | |
| "loss_layer_18_head": 0.5278469324111938, | |
| "loss_layer_24_head": 0.16206221282482147, | |
| "loss_layer_30_head": 0.10317299515008926, | |
| "loss_layer_36_head": 0.06801508367061615, | |
| "loss_layer_42_head": 0.03888120502233505, | |
| "loss_layer_6_head": 0.5396238565444946, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 75.44159544159544, | |
| "grad_norm": 0.10416826399412087, | |
| "learning_rate": 0.0006533013315660366, | |
| "loss": 1.7447, | |
| "loss_layer_12_head": 0.335065633058548, | |
| "loss_layer_18_head": 0.5077509880065918, | |
| "loss_layer_24_head": 0.1524048149585724, | |
| "loss_layer_30_head": 0.0960657149553299, | |
| "loss_layer_36_head": 0.06350886076688766, | |
| "loss_layer_42_head": 0.03666612505912781, | |
| "loss_layer_6_head": 0.5169146060943604, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 75.66951566951568, | |
| "grad_norm": 0.11154842762703716, | |
| "learning_rate": 0.0006393599012883708, | |
| "loss": 1.7684, | |
| "loss_layer_12_head": 0.3467431664466858, | |
| "loss_layer_18_head": 0.5151144862174988, | |
| "loss_layer_24_head": 0.1578586995601654, | |
| "loss_layer_30_head": 0.1004854291677475, | |
| "loss_layer_36_head": 0.06669998914003372, | |
| "loss_layer_42_head": 0.03827175125479698, | |
| "loss_layer_6_head": 0.5322891473770142, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 75.8974358974359, | |
| "grad_norm": 0.11122823408841152, | |
| "learning_rate": 0.0006255469925806642, | |
| "loss": 1.7743, | |
| "loss_layer_12_head": 0.35086023807525635, | |
| "loss_layer_18_head": 0.5303332209587097, | |
| "loss_layer_24_head": 0.16013801097869873, | |
| "loss_layer_30_head": 0.10204042494297028, | |
| "loss_layer_36_head": 0.06783459335565567, | |
| "loss_layer_42_head": 0.03844980522990227, | |
| "loss_layer_6_head": 0.5410701632499695, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 76.12535612535612, | |
| "grad_norm": 0.10142797226124593, | |
| "learning_rate": 0.0006118635595536634, | |
| "loss": 1.7619, | |
| "loss_layer_12_head": 0.33657488226890564, | |
| "loss_layer_18_head": 0.5111474990844727, | |
| "loss_layer_24_head": 0.1535833477973938, | |
| "loss_layer_30_head": 0.09838329255580902, | |
| "loss_layer_36_head": 0.06594778597354889, | |
| "loss_layer_42_head": 0.038476187735795975, | |
| "loss_layer_6_head": 0.5193209648132324, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 76.35327635327636, | |
| "grad_norm": 0.09615619261603243, | |
| "learning_rate": 0.0005983105473747291, | |
| "loss": 1.7578, | |
| "loss_layer_12_head": 0.3366927206516266, | |
| "loss_layer_18_head": 0.5104350447654724, | |
| "loss_layer_24_head": 0.15251997113227844, | |
| "loss_layer_30_head": 0.09642209112644196, | |
| "loss_layer_36_head": 0.06284157931804657, | |
| "loss_layer_42_head": 0.03592481091618538, | |
| "loss_layer_6_head": 0.5190488696098328, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 76.58119658119658, | |
| "grad_norm": 0.11055504444111074, | |
| "learning_rate": 0.0005848888922025552, | |
| "loss": 1.7644, | |
| "loss_layer_12_head": 0.3554447889328003, | |
| "loss_layer_18_head": 0.5308834314346313, | |
| "loss_layer_24_head": 0.16566328704357147, | |
| "loss_layer_30_head": 0.10649579763412476, | |
| "loss_layer_36_head": 0.07171137630939484, | |
| "loss_layer_42_head": 0.04246705397963524, | |
| "loss_layer_6_head": 0.542263388633728, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 76.80911680911682, | |
| "grad_norm": 0.09715150491168917, | |
| "learning_rate": 0.0005715995211225008, | |
| "loss": 1.7372, | |
| "loss_layer_12_head": 0.34149521589279175, | |
| "loss_layer_18_head": 0.5083609223365784, | |
| "loss_layer_24_head": 0.1550835371017456, | |
| "loss_layer_30_head": 0.09850551933050156, | |
| "loss_layer_36_head": 0.06518407166004181, | |
| "loss_layer_42_head": 0.036875925958156586, | |
| "loss_layer_6_head": 0.5298780202865601, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 77.03703703703704, | |
| "grad_norm": 0.0994729317287691, | |
| "learning_rate": 0.0005584433520825541, | |
| "loss": 1.7448, | |
| "loss_layer_12_head": 0.337735652923584, | |
| "loss_layer_18_head": 0.4945538640022278, | |
| "loss_layer_24_head": 0.15409395098686218, | |
| "loss_layer_30_head": 0.09803880751132965, | |
| "loss_layer_36_head": 0.06489689648151398, | |
| "loss_layer_42_head": 0.037099629640579224, | |
| "loss_layer_6_head": 0.5176650285720825, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 77.26495726495726, | |
| "grad_norm": 0.12507262576134806, | |
| "learning_rate": 0.0005454212938299255, | |
| "loss": 1.7432, | |
| "loss_layer_12_head": 0.34350135922431946, | |
| "loss_layer_18_head": 0.5194068551063538, | |
| "loss_layer_24_head": 0.15946084260940552, | |
| "loss_layer_30_head": 0.10177260637283325, | |
| "loss_layer_36_head": 0.06773774325847626, | |
| "loss_layer_42_head": 0.03955072909593582, | |
| "loss_layer_6_head": 0.5315379500389099, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 77.4928774928775, | |
| "grad_norm": 0.10998347798638555, | |
| "learning_rate": 0.0005325342458482779, | |
| "loss": 1.736, | |
| "loss_layer_12_head": 0.3438124656677246, | |
| "loss_layer_18_head": 0.5158748030662537, | |
| "loss_layer_24_head": 0.15715071558952332, | |
| "loss_layer_30_head": 0.09998907148838043, | |
| "loss_layer_36_head": 0.06680642068386078, | |
| "loss_layer_42_head": 0.03813738375902176, | |
| "loss_layer_6_head": 0.5303772687911987, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 77.72079772079772, | |
| "grad_norm": 0.10073524573554277, | |
| "learning_rate": 0.0005197830982955945, | |
| "loss": 1.761, | |
| "loss_layer_12_head": 0.3479636311531067, | |
| "loss_layer_18_head": 0.5083318948745728, | |
| "loss_layer_24_head": 0.15731409192085266, | |
| "loss_layer_30_head": 0.09888915717601776, | |
| "loss_layer_36_head": 0.06497863680124283, | |
| "loss_layer_42_head": 0.03769425302743912, | |
| "loss_layer_6_head": 0.5358933806419373, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 77.94871794871794, | |
| "grad_norm": 0.10383088041264718, | |
| "learning_rate": 0.0005071687319426946, | |
| "loss": 1.7467, | |
| "loss_layer_12_head": 0.35118889808654785, | |
| "loss_layer_18_head": 0.5108534097671509, | |
| "loss_layer_24_head": 0.16018807888031006, | |
| "loss_layer_30_head": 0.10246168076992035, | |
| "loss_layer_36_head": 0.06786641478538513, | |
| "loss_layer_42_head": 0.039132293313741684, | |
| "loss_layer_6_head": 0.5406389832496643, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 78.17663817663818, | |
| "grad_norm": 0.0921569642386526, | |
| "learning_rate": 0.0004946920181123904, | |
| "loss": 1.7411, | |
| "loss_layer_12_head": 0.34307050704956055, | |
| "loss_layer_18_head": 0.5181266069412231, | |
| "loss_layer_24_head": 0.15736958384513855, | |
| "loss_layer_30_head": 0.10029349476099014, | |
| "loss_layer_36_head": 0.06683328002691269, | |
| "loss_layer_42_head": 0.0385957770049572, | |
| "loss_layer_6_head": 0.5270929932594299, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 78.4045584045584, | |
| "grad_norm": 0.10382660941610025, | |
| "learning_rate": 0.00048235381861930964, | |
| "loss": 1.7253, | |
| "loss_layer_12_head": 0.3350295126438141, | |
| "loss_layer_18_head": 0.5014669895172119, | |
| "loss_layer_24_head": 0.15213629603385925, | |
| "loss_layer_30_head": 0.0972825437784195, | |
| "loss_layer_36_head": 0.06519439071416855, | |
| "loss_layer_42_head": 0.03851339966058731, | |
| "loss_layer_6_head": 0.5170751214027405, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 78.63247863247864, | |
| "grad_norm": 0.09529554195297314, | |
| "learning_rate": 0.00047015498571035874, | |
| "loss": 1.7461, | |
| "loss_layer_12_head": 0.3465036153793335, | |
| "loss_layer_18_head": 0.5140535235404968, | |
| "loss_layer_24_head": 0.16019120812416077, | |
| "loss_layer_30_head": 0.10193805396556854, | |
| "loss_layer_36_head": 0.067210853099823, | |
| "loss_layer_42_head": 0.03818202763795853, | |
| "loss_layer_6_head": 0.5324119329452515, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 78.86039886039886, | |
| "grad_norm": 0.09986232916758815, | |
| "learning_rate": 0.0004580963620058587, | |
| "loss": 1.7606, | |
| "loss_layer_12_head": 0.35431206226348877, | |
| "loss_layer_18_head": 0.5250551104545593, | |
| "loss_layer_24_head": 0.1629064977169037, | |
| "loss_layer_30_head": 0.10397347062826157, | |
| "loss_layer_36_head": 0.06930160522460938, | |
| "loss_layer_42_head": 0.03968958184123039, | |
| "loss_layer_6_head": 0.544308066368103, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 79.08831908831908, | |
| "grad_norm": 0.09300841263629599, | |
| "learning_rate": 0.0004461787804413406, | |
| "loss": 1.7324, | |
| "loss_layer_12_head": 0.3431733548641205, | |
| "loss_layer_18_head": 0.5113198757171631, | |
| "loss_layer_24_head": 0.15744714438915253, | |
| "loss_layer_30_head": 0.09923293441534042, | |
| "loss_layer_36_head": 0.0657310038805008, | |
| "loss_layer_42_head": 0.03722255676984787, | |
| "loss_layer_6_head": 0.5301375389099121, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 79.31623931623932, | |
| "grad_norm": 0.0925736410309516, | |
| "learning_rate": 0.0004344030642100133, | |
| "loss": 1.7243, | |
| "loss_layer_12_head": 0.34314194321632385, | |
| "loss_layer_18_head": 0.5142655372619629, | |
| "loss_layer_24_head": 0.1579989492893219, | |
| "loss_layer_30_head": 0.1012849360704422, | |
| "loss_layer_36_head": 0.06685040891170502, | |
| "loss_layer_42_head": 0.03779185935854912, | |
| "loss_layer_6_head": 0.5264319181442261, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 79.54415954415954, | |
| "grad_norm": 0.10584606451044351, | |
| "learning_rate": 0.00042277002670590034, | |
| "loss": 1.7449, | |
| "loss_layer_12_head": 0.33877480030059814, | |
| "loss_layer_18_head": 0.5086591839790344, | |
| "loss_layer_24_head": 0.15485888719558716, | |
| "loss_layer_30_head": 0.09807880222797394, | |
| "loss_layer_36_head": 0.06451396644115448, | |
| "loss_layer_42_head": 0.03756994009017944, | |
| "loss_layer_6_head": 0.5224930047988892, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 79.77207977207978, | |
| "grad_norm": 0.10981655707411507, | |
| "learning_rate": 0.0004112804714676593, | |
| "loss": 1.7254, | |
| "loss_layer_12_head": 0.35386496782302856, | |
| "loss_layer_18_head": 0.5217627286911011, | |
| "loss_layer_24_head": 0.16211065649986267, | |
| "loss_layer_30_head": 0.10369504988193512, | |
| "loss_layer_36_head": 0.06905995309352875, | |
| "loss_layer_42_head": 0.03959353640675545, | |
| "loss_layer_6_head": 0.5423368811607361, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 80.0, | |
| "grad_norm": 0.09352915096510195, | |
| "learning_rate": 0.0003999351921230715, | |
| "loss": 1.7481, | |
| "loss_layer_12_head": 0.3435395658016205, | |
| "loss_layer_18_head": 0.5069267153739929, | |
| "loss_layer_24_head": 0.1569208800792694, | |
| "loss_layer_30_head": 0.09992153942584991, | |
| "loss_layer_36_head": 0.0660344809293747, | |
| "loss_layer_42_head": 0.03770904988050461, | |
| "loss_layer_6_head": 0.5285703539848328, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 80.22792022792022, | |
| "grad_norm": 0.08847018489420039, | |
| "learning_rate": 0.0003887349723342304, | |
| "loss": 1.7213, | |
| "loss_layer_12_head": 0.3444362282752991, | |
| "loss_layer_18_head": 0.5083240270614624, | |
| "loss_layer_24_head": 0.1559758186340332, | |
| "loss_layer_30_head": 0.09910304844379425, | |
| "loss_layer_36_head": 0.06528599560260773, | |
| "loss_layer_42_head": 0.03764251619577408, | |
| "loss_layer_6_head": 0.5300508737564087, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 80.45584045584046, | |
| "grad_norm": 0.0942284279992955, | |
| "learning_rate": 0.0003776805857434068, | |
| "loss": 1.7415, | |
| "loss_layer_12_head": 0.34919023513793945, | |
| "loss_layer_18_head": 0.5226079225540161, | |
| "loss_layer_24_head": 0.1596161425113678, | |
| "loss_layer_30_head": 0.10055674612522125, | |
| "loss_layer_36_head": 0.06677110493183136, | |
| "loss_layer_42_head": 0.03786572068929672, | |
| "loss_layer_6_head": 0.5389742851257324, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 80.68376068376068, | |
| "grad_norm": 0.09201895112943784, | |
| "learning_rate": 0.000366772795919611, | |
| "loss": 1.7281, | |
| "loss_layer_12_head": 0.3336860239505768, | |
| "loss_layer_18_head": 0.49141353368759155, | |
| "loss_layer_24_head": 0.15047487616539001, | |
| "loss_layer_30_head": 0.09469149261713028, | |
| "loss_layer_36_head": 0.06228718161582947, | |
| "loss_layer_42_head": 0.036014530807733536, | |
| "loss_layer_6_head": 0.511943519115448, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 80.91168091168092, | |
| "grad_norm": 0.08533194295665472, | |
| "learning_rate": 0.0003560123563058512, | |
| "loss": 1.7367, | |
| "loss_layer_12_head": 0.3416348993778229, | |
| "loss_layer_18_head": 0.5056973695755005, | |
| "loss_layer_24_head": 0.15707023441791534, | |
| "loss_layer_30_head": 0.09985633194446564, | |
| "loss_layer_36_head": 0.06595893204212189, | |
| "loss_layer_42_head": 0.03782585263252258, | |
| "loss_layer_6_head": 0.5238116979598999, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 81.13960113960114, | |
| "grad_norm": 0.08323069486037522, | |
| "learning_rate": 0.0003454000101670901, | |
| "loss": 1.7297, | |
| "loss_layer_12_head": 0.3427240252494812, | |
| "loss_layer_18_head": 0.504900336265564, | |
| "loss_layer_24_head": 0.15762865543365479, | |
| "loss_layer_30_head": 0.10089079290628433, | |
| "loss_layer_36_head": 0.06669814139604568, | |
| "loss_layer_42_head": 0.03817524015903473, | |
| "loss_layer_6_head": 0.5277188420295715, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 81.36752136752136, | |
| "grad_norm": 0.08706863161635335, | |
| "learning_rate": 0.00033493649053890326, | |
| "loss": 1.7228, | |
| "loss_layer_12_head": 0.3384056091308594, | |
| "loss_layer_18_head": 0.5061665773391724, | |
| "loss_layer_24_head": 0.15349029004573822, | |
| "loss_layer_30_head": 0.09709183871746063, | |
| "loss_layer_36_head": 0.0645359456539154, | |
| "loss_layer_42_head": 0.03722700849175453, | |
| "loss_layer_6_head": 0.5220716595649719, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 81.5954415954416, | |
| "grad_norm": 0.079555862137296, | |
| "learning_rate": 0.00032462252017684794, | |
| "loss": 1.7134, | |
| "loss_layer_12_head": 0.3370179831981659, | |
| "loss_layer_18_head": 0.4958283305168152, | |
| "loss_layer_24_head": 0.15366677939891815, | |
| "loss_layer_30_head": 0.097067691385746, | |
| "loss_layer_36_head": 0.06432225555181503, | |
| "loss_layer_42_head": 0.037587955594062805, | |
| "loss_layer_6_head": 0.5160147547721863, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 81.82336182336182, | |
| "grad_norm": 0.08166362138692365, | |
| "learning_rate": 0.0003144588115065364, | |
| "loss": 1.7479, | |
| "loss_layer_12_head": 0.35182663798332214, | |
| "loss_layer_18_head": 0.5176635384559631, | |
| "loss_layer_24_head": 0.16193841397762299, | |
| "loss_layer_30_head": 0.10320702940225601, | |
| "loss_layer_36_head": 0.06898337602615356, | |
| "loss_layer_42_head": 0.039763785898685455, | |
| "loss_layer_6_head": 0.5365123748779297, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 82.05128205128206, | |
| "grad_norm": 0.0855469720098283, | |
| "learning_rate": 0.00030444606657442836, | |
| "loss": 1.7339, | |
| "loss_layer_12_head": 0.34548094868659973, | |
| "loss_layer_18_head": 0.5053902268409729, | |
| "loss_layer_24_head": 0.1566828191280365, | |
| "loss_layer_30_head": 0.09957839548587799, | |
| "loss_layer_36_head": 0.06610305607318878, | |
| "loss_layer_42_head": 0.037855364382267, | |
| "loss_layer_6_head": 0.5325480699539185, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 82.05128205128206, | |
| "eval_loss": 3.7662460803985596, | |
| "eval_loss_layer_12_head": 0.8055973052978516, | |
| "eval_loss_layer_18_head": 0.7246161103248596, | |
| "eval_loss_layer_24_head": 0.4393809735774994, | |
| "eval_loss_layer_30_head": 0.3041135370731354, | |
| "eval_loss_layer_36_head": 0.2042381763458252, | |
| "eval_loss_layer_42_head": 0.13466167449951172, | |
| "eval_loss_layer_6_head": 1.070326805114746, | |
| "eval_runtime": 4.9444, | |
| "eval_samples_per_second": 6.674, | |
| "eval_steps_per_second": 0.607, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 82.27920227920228, | |
| "grad_norm": 0.08718972153802979, | |
| "learning_rate": 0.0002945849769993395, | |
| "loss": 1.7251, | |
| "loss_layer_12_head": 0.3301844298839569, | |
| "loss_layer_18_head": 0.492474228143692, | |
| "loss_layer_24_head": 0.15127244591712952, | |
| "loss_layer_30_head": 0.09562569856643677, | |
| "loss_layer_36_head": 0.06327711790800095, | |
| "loss_layer_42_head": 0.036252304911613464, | |
| "loss_layer_6_head": 0.5087629556655884, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 82.5071225071225, | |
| "grad_norm": 0.08511606581369867, | |
| "learning_rate": 0.0002848762239246644, | |
| "loss": 1.7251, | |
| "loss_layer_12_head": 0.34044864773750305, | |
| "loss_layer_18_head": 0.5026866793632507, | |
| "loss_layer_24_head": 0.15664085745811462, | |
| "loss_layer_30_head": 0.09928475320339203, | |
| "loss_layer_36_head": 0.06595821678638458, | |
| "loss_layer_42_head": 0.03762565180659294, | |
| "loss_layer_6_head": 0.5205121040344238, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 82.73504273504274, | |
| "grad_norm": 0.08722784207394887, | |
| "learning_rate": 0.00027532047797132865, | |
| "loss": 1.7193, | |
| "loss_layer_12_head": 0.33828800916671753, | |
| "loss_layer_18_head": 0.5036657452583313, | |
| "loss_layer_24_head": 0.15435315668582916, | |
| "loss_layer_30_head": 0.09803181886672974, | |
| "loss_layer_36_head": 0.06477385014295578, | |
| "loss_layer_42_head": 0.0378272645175457, | |
| "loss_layer_6_head": 0.5202800035476685, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 82.96296296296296, | |
| "grad_norm": 0.08171497505981262, | |
| "learning_rate": 0.0002659183991914696, | |
| "loss": 1.7291, | |
| "loss_layer_12_head": 0.35178667306900024, | |
| "loss_layer_18_head": 0.5139197111129761, | |
| "loss_layer_24_head": 0.16265292465686798, | |
| "loss_layer_30_head": 0.10370488464832306, | |
| "loss_layer_36_head": 0.06842222809791565, | |
| "loss_layer_42_head": 0.038670577108860016, | |
| "loss_layer_6_head": 0.5341295599937439, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 83.19088319088318, | |
| "grad_norm": 0.08754226537159704, | |
| "learning_rate": 0.00025667063702284026, | |
| "loss": 1.7084, | |
| "loss_layer_12_head": 0.3361435532569885, | |
| "loss_layer_18_head": 0.4977208077907562, | |
| "loss_layer_24_head": 0.15316614508628845, | |
| "loss_layer_30_head": 0.09671608358621597, | |
| "loss_layer_36_head": 0.06395326554775238, | |
| "loss_layer_42_head": 0.03646908327937126, | |
| "loss_layer_6_head": 0.5177844762802124, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 83.41880341880342, | |
| "grad_norm": 0.0844836769084174, | |
| "learning_rate": 0.00024757783024395244, | |
| "loss": 1.723, | |
| "loss_layer_12_head": 0.33938589692115784, | |
| "loss_layer_18_head": 0.5006296634674072, | |
| "loss_layer_24_head": 0.1563412994146347, | |
| "loss_layer_30_head": 0.09939941018819809, | |
| "loss_layer_36_head": 0.06653161346912384, | |
| "loss_layer_42_head": 0.03902815654873848, | |
| "loss_layer_6_head": 0.5227442383766174, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 83.64672364672364, | |
| "grad_norm": 0.08116633475530488, | |
| "learning_rate": 0.0002386406069299521, | |
| "loss": 1.7133, | |
| "loss_layer_12_head": 0.3282950818538666, | |
| "loss_layer_18_head": 0.4761618673801422, | |
| "loss_layer_24_head": 0.14734062552452087, | |
| "loss_layer_30_head": 0.09337802231311798, | |
| "loss_layer_36_head": 0.062004633247852325, | |
| "loss_layer_42_head": 0.03645525127649307, | |
| "loss_layer_6_head": 0.5092923641204834, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 83.87464387464388, | |
| "grad_norm": 0.08130594229307465, | |
| "learning_rate": 0.0002298595844092377, | |
| "loss": 1.7324, | |
| "loss_layer_12_head": 0.34668418765068054, | |
| "loss_layer_18_head": 0.5105066299438477, | |
| "loss_layer_24_head": 0.15807229280471802, | |
| "loss_layer_30_head": 0.09954778850078583, | |
| "loss_layer_36_head": 0.06574113667011261, | |
| "loss_layer_42_head": 0.03756564110517502, | |
| "loss_layer_6_head": 0.5319226980209351, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 84.1025641025641, | |
| "grad_norm": 0.07435832563814537, | |
| "learning_rate": 0.00022123536922081716, | |
| "loss": 1.7374, | |
| "loss_layer_12_head": 0.3403926193714142, | |
| "loss_layer_18_head": 0.5032271146774292, | |
| "loss_layer_24_head": 0.15673916041851044, | |
| "loss_layer_30_head": 0.09963904321193695, | |
| "loss_layer_36_head": 0.06623657792806625, | |
| "loss_layer_42_head": 0.03874701261520386, | |
| "loss_layer_6_head": 0.5237872004508972, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 84.33048433048432, | |
| "grad_norm": 0.07520798715370941, | |
| "learning_rate": 0.0002127685570724136, | |
| "loss": 1.7008, | |
| "loss_layer_12_head": 0.32764607667922974, | |
| "loss_layer_18_head": 0.4850924015045166, | |
| "loss_layer_24_head": 0.1496036797761917, | |
| "loss_layer_30_head": 0.0952225849032402, | |
| "loss_layer_36_head": 0.0631745234131813, | |
| "loss_layer_42_head": 0.03642912209033966, | |
| "loss_layer_6_head": 0.5045855045318604, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 84.55840455840456, | |
| "grad_norm": 0.08345443449589855, | |
| "learning_rate": 0.0002044597327993153, | |
| "loss": 1.7105, | |
| "loss_layer_12_head": 0.33524253964424133, | |
| "loss_layer_18_head": 0.4937031865119934, | |
| "loss_layer_24_head": 0.15422554314136505, | |
| "loss_layer_30_head": 0.09784449636936188, | |
| "loss_layer_36_head": 0.06489607691764832, | |
| "loss_layer_42_head": 0.03717579320073128, | |
| "loss_layer_6_head": 0.5167443156242371, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 84.78632478632478, | |
| "grad_norm": 0.07382268007768587, | |
| "learning_rate": 0.00019630947032398067, | |
| "loss": 1.7347, | |
| "loss_layer_12_head": 0.3385489583015442, | |
| "loss_layer_18_head": 0.49440544843673706, | |
| "loss_layer_24_head": 0.15322643518447876, | |
| "loss_layer_30_head": 0.09731130301952362, | |
| "loss_layer_36_head": 0.06442172825336456, | |
| "loss_layer_42_head": 0.03728144243359566, | |
| "loss_layer_6_head": 0.5216220617294312, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 85.01424501424502, | |
| "grad_norm": 0.07869223889512078, | |
| "learning_rate": 0.00018831833261639619, | |
| "loss": 1.7207, | |
| "loss_layer_12_head": 0.33598074316978455, | |
| "loss_layer_18_head": 0.5042511224746704, | |
| "loss_layer_24_head": 0.15242783725261688, | |
| "loss_layer_30_head": 0.09582826495170593, | |
| "loss_layer_36_head": 0.06362007558345795, | |
| "loss_layer_42_head": 0.037337448447942734, | |
| "loss_layer_6_head": 0.519189715385437, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 85.24216524216524, | |
| "grad_norm": 0.07623838374572518, | |
| "learning_rate": 0.00018048687165518663, | |
| "loss": 1.7014, | |
| "loss_layer_12_head": 0.34467798471450806, | |
| "loss_layer_18_head": 0.5166942477226257, | |
| "loss_layer_24_head": 0.1603996604681015, | |
| "loss_layer_30_head": 0.10134941339492798, | |
| "loss_layer_36_head": 0.06730696558952332, | |
| "loss_layer_42_head": 0.03929843753576279, | |
| "loss_layer_6_head": 0.5304808616638184, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 85.47008547008546, | |
| "grad_norm": 0.07719263949161988, | |
| "learning_rate": 0.00017281562838948966, | |
| "loss": 1.7085, | |
| "loss_layer_12_head": 0.34138795733451843, | |
| "loss_layer_18_head": 0.49956589937210083, | |
| "loss_layer_24_head": 0.15558582544326782, | |
| "loss_layer_30_head": 0.09915374964475632, | |
| "loss_layer_36_head": 0.06627882272005081, | |
| "loss_layer_42_head": 0.03831201046705246, | |
| "loss_layer_6_head": 0.5223523378372192, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 85.6980056980057, | |
| "grad_norm": 0.07328975312789718, | |
| "learning_rate": 0.00016530513270159115, | |
| "loss": 1.7316, | |
| "loss_layer_12_head": 0.343252569437027, | |
| "loss_layer_18_head": 0.5099059343338013, | |
| "loss_layer_24_head": 0.15889784693717957, | |
| "loss_layer_30_head": 0.10200633853673935, | |
| "loss_layer_36_head": 0.0683726891875267, | |
| "loss_layer_42_head": 0.03963029757142067, | |
| "loss_layer_6_head": 0.5271897315979004, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 85.92592592592592, | |
| "grad_norm": 0.07726370100149342, | |
| "learning_rate": 0.0001579559033703229, | |
| "loss": 1.726, | |
| "loss_layer_12_head": 0.3393276631832123, | |
| "loss_layer_18_head": 0.49352455139160156, | |
| "loss_layer_24_head": 0.15607452392578125, | |
| "loss_layer_30_head": 0.09996481984853745, | |
| "loss_layer_36_head": 0.06702710688114166, | |
| "loss_layer_42_head": 0.03989189863204956, | |
| "loss_layer_6_head": 0.5197093486785889, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 86.15384615384616, | |
| "grad_norm": 0.07692397896958651, | |
| "learning_rate": 0.00015076844803522921, | |
| "loss": 1.7156, | |
| "loss_layer_12_head": 0.35214799642562866, | |
| "loss_layer_18_head": 0.5165507793426514, | |
| "loss_layer_24_head": 0.1624716967344284, | |
| "loss_layer_30_head": 0.1034996509552002, | |
| "loss_layer_36_head": 0.06864960491657257, | |
| "loss_layer_42_head": 0.039609383791685104, | |
| "loss_layer_6_head": 0.540605902671814, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 86.38176638176638, | |
| "grad_norm": 0.07830104898036093, | |
| "learning_rate": 0.00014374326316150184, | |
| "loss": 1.7007, | |
| "loss_layer_12_head": 0.33503252267837524, | |
| "loss_layer_18_head": 0.49698352813720703, | |
| "loss_layer_24_head": 0.15384428203105927, | |
| "loss_layer_30_head": 0.09767549484968185, | |
| "loss_layer_36_head": 0.06486734002828598, | |
| "loss_layer_42_head": 0.0373041145503521, | |
| "loss_layer_6_head": 0.5164640545845032, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 86.6096866096866, | |
| "grad_norm": 0.07492748364571503, | |
| "learning_rate": 0.0001368808340056879, | |
| "loss": 1.71, | |
| "loss_layer_12_head": 0.3400861918926239, | |
| "loss_layer_18_head": 0.4963332712650299, | |
| "loss_layer_24_head": 0.15496981143951416, | |
| "loss_layer_30_head": 0.09840646386146545, | |
| "loss_layer_36_head": 0.06502757966518402, | |
| "loss_layer_42_head": 0.037685126066207886, | |
| "loss_layer_6_head": 0.523513913154602, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 86.83760683760684, | |
| "grad_norm": 0.07229919889886265, | |
| "learning_rate": 0.00013018163458217074, | |
| "loss": 1.7187, | |
| "loss_layer_12_head": 0.3341769576072693, | |
| "loss_layer_18_head": 0.48553091287612915, | |
| "loss_layer_24_head": 0.1514834463596344, | |
| "loss_layer_30_head": 0.09563258290290833, | |
| "loss_layer_36_head": 0.06288423389196396, | |
| "loss_layer_42_head": 0.036126021295785904, | |
| "loss_layer_6_head": 0.5115571022033691, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 87.06552706552706, | |
| "grad_norm": 0.074891654205402, | |
| "learning_rate": 0.00012364612763042792, | |
| "loss": 1.7278, | |
| "loss_layer_12_head": 0.34246379137039185, | |
| "loss_layer_18_head": 0.49604225158691406, | |
| "loss_layer_24_head": 0.15869124233722687, | |
| "loss_layer_30_head": 0.10183699429035187, | |
| "loss_layer_36_head": 0.06769458204507828, | |
| "loss_layer_42_head": 0.03991634026169777, | |
| "loss_layer_6_head": 0.5265407562255859, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 87.29344729344729, | |
| "grad_norm": 0.08032085767903621, | |
| "learning_rate": 0.0001172747645830674, | |
| "loss": 1.707, | |
| "loss_layer_12_head": 0.32982274889945984, | |
| "loss_layer_18_head": 0.4943769872188568, | |
| "loss_layer_24_head": 0.1514527052640915, | |
| "loss_layer_30_head": 0.0964231789112091, | |
| "loss_layer_36_head": 0.06376294791698456, | |
| "loss_layer_42_head": 0.03660362958908081, | |
| "loss_layer_6_head": 0.5084652304649353, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 87.52136752136752, | |
| "grad_norm": 0.07366761201441889, | |
| "learning_rate": 0.00011106798553464803, | |
| "loss": 1.732, | |
| "loss_layer_12_head": 0.3319392800331116, | |
| "loss_layer_18_head": 0.49802669882774353, | |
| "loss_layer_24_head": 0.15054012835025787, | |
| "loss_layer_30_head": 0.09539338946342468, | |
| "loss_layer_36_head": 0.062421299517154694, | |
| "loss_layer_42_head": 0.03593217581510544, | |
| "loss_layer_6_head": 0.5121363401412964, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 87.74928774928775, | |
| "grad_norm": 0.07241943481895129, | |
| "learning_rate": 0.00010502621921127774, | |
| "loss": 1.7148, | |
| "loss_layer_12_head": 0.34008413553237915, | |
| "loss_layer_18_head": 0.5034884214401245, | |
| "loss_layer_24_head": 0.15469662845134735, | |
| "loss_layer_30_head": 0.0977717787027359, | |
| "loss_layer_36_head": 0.06479165703058243, | |
| "loss_layer_42_head": 0.03823380544781685, | |
| "loss_layer_6_head": 0.5221525430679321, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 87.97720797720798, | |
| "grad_norm": 0.07008688713358005, | |
| "learning_rate": 9.914988294100064e-05, | |
| "loss": 1.6985, | |
| "loss_layer_12_head": 0.3371519446372986, | |
| "loss_layer_18_head": 0.49330615997314453, | |
| "loss_layer_24_head": 0.15630921721458435, | |
| "loss_layer_30_head": 0.0999789759516716, | |
| "loss_layer_36_head": 0.06621178239583969, | |
| "loss_layer_42_head": 0.037286750972270966, | |
| "loss_layer_6_head": 0.5132678747177124, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 88.2051282051282, | |
| "grad_norm": 0.07024357018257298, | |
| "learning_rate": 9.343938262496993e-05, | |
| "loss": 1.7005, | |
| "loss_layer_12_head": 0.3292502760887146, | |
| "loss_layer_18_head": 0.4820357859134674, | |
| "loss_layer_24_head": 0.14815860986709595, | |
| "loss_layer_30_head": 0.09420295804738998, | |
| "loss_layer_36_head": 0.062933050096035, | |
| "loss_layer_42_head": 0.03677560016512871, | |
| "loss_layer_6_head": 0.5083265900611877, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 88.43304843304843, | |
| "grad_norm": 0.07038272442189607, | |
| "learning_rate": 8.78951127094127e-05, | |
| "loss": 1.7068, | |
| "loss_layer_12_head": 0.3309806287288666, | |
| "loss_layer_18_head": 0.48024678230285645, | |
| "loss_layer_24_head": 0.15456709265708923, | |
| "loss_layer_30_head": 0.09934371709823608, | |
| "loss_layer_36_head": 0.06677098572254181, | |
| "loss_layer_42_head": 0.0396689735352993, | |
| "loss_layer_6_head": 0.5079755187034607, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 88.66096866096866, | |
| "grad_norm": 0.07190615122554372, | |
| "learning_rate": 8.251745615838191e-05, | |
| "loss": 1.7283, | |
| "loss_layer_12_head": 0.33875033259391785, | |
| "loss_layer_18_head": 0.4983670711517334, | |
| "loss_layer_24_head": 0.15512457489967346, | |
| "loss_layer_30_head": 0.09894488751888275, | |
| "loss_layer_36_head": 0.06597268581390381, | |
| "loss_layer_42_head": 0.03837330639362335, | |
| "loss_layer_6_head": 0.519954264163971, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 88.88888888888889, | |
| "grad_norm": 0.07393034546819678, | |
| "learning_rate": 7.730678442730537e-05, | |
| "loss": 1.7069, | |
| "loss_layer_12_head": 0.33714374899864197, | |
| "loss_layer_18_head": 0.48974722623825073, | |
| "loss_layer_24_head": 0.1527237594127655, | |
| "loss_layer_30_head": 0.09619395434856415, | |
| "loss_layer_36_head": 0.06359319388866425, | |
| "loss_layer_42_head": 0.03700066730380058, | |
| "loss_layer_6_head": 0.5218413472175598, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 89.11680911680912, | |
| "grad_norm": 0.06980538304609592, | |
| "learning_rate": 7.226345743732543e-05, | |
| "loss": 1.7225, | |
| "loss_layer_12_head": 0.3432984948158264, | |
| "loss_layer_18_head": 0.5042284727096558, | |
| "loss_layer_24_head": 0.1560019701719284, | |
| "loss_layer_30_head": 0.09774313867092133, | |
| "loss_layer_36_head": 0.0642099529504776, | |
| "loss_layer_42_head": 0.037248644977808, | |
| "loss_layer_6_head": 0.5292196273803711, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 89.34472934472934, | |
| "grad_norm": 0.06841991765442745, | |
| "learning_rate": 6.738782355044049e-05, | |
| "loss": 1.7048, | |
| "loss_layer_12_head": 0.336143434047699, | |
| "loss_layer_18_head": 0.4922635555267334, | |
| "loss_layer_24_head": 0.15440890192985535, | |
| "loss_layer_30_head": 0.09838224947452545, | |
| "loss_layer_36_head": 0.06481974571943283, | |
| "loss_layer_42_head": 0.03767440468072891, | |
| "loss_layer_6_head": 0.5202369093894958, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 89.57264957264957, | |
| "grad_norm": 0.06877525391557446, | |
| "learning_rate": 6.268021954544095e-05, | |
| "loss": 1.706, | |
| "loss_layer_12_head": 0.35166916251182556, | |
| "loss_layer_18_head": 0.5136804580688477, | |
| "loss_layer_24_head": 0.16127237677574158, | |
| "loss_layer_30_head": 0.10277248919010162, | |
| "loss_layer_36_head": 0.0681212991476059, | |
| "loss_layer_42_head": 0.039175014942884445, | |
| "loss_layer_6_head": 0.5358132123947144, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 89.8005698005698, | |
| "grad_norm": 0.06941929581347202, | |
| "learning_rate": 5.8140970594647015e-05, | |
| "loss": 1.7074, | |
| "loss_layer_12_head": 0.33122771978378296, | |
| "loss_layer_18_head": 0.4838249087333679, | |
| "loss_layer_24_head": 0.14897865056991577, | |
| "loss_layer_30_head": 0.09341166168451309, | |
| "loss_layer_36_head": 0.061882950365543365, | |
| "loss_layer_42_head": 0.03573702648282051, | |
| "loss_layer_6_head": 0.5114135146141052, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 90.02849002849003, | |
| "grad_norm": 0.07797584867466725, | |
| "learning_rate": 5.3770390241446866e-05, | |
| "loss": 1.7153, | |
| "loss_layer_12_head": 0.32591190934181213, | |
| "loss_layer_18_head": 0.48162513971328735, | |
| "loss_layer_24_head": 0.14960148930549622, | |
| "loss_layer_30_head": 0.09565792977809906, | |
| "loss_layer_36_head": 0.06360563635826111, | |
| "loss_layer_42_head": 0.03752583637833595, | |
| "loss_layer_6_head": 0.5001431703567505, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 90.25641025641026, | |
| "grad_norm": 0.07008181121373384, | |
| "learning_rate": 4.9568780378640435e-05, | |
| "loss": 1.7156, | |
| "loss_layer_12_head": 0.33082446455955505, | |
| "loss_layer_18_head": 0.48348742723464966, | |
| "loss_layer_24_head": 0.15005210041999817, | |
| "loss_layer_30_head": 0.0951717272400856, | |
| "loss_layer_36_head": 0.06267333030700684, | |
| "loss_layer_42_head": 0.036025770008563995, | |
| "loss_layer_6_head": 0.5086785554885864, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 90.48433048433048, | |
| "grad_norm": 0.06871537369194806, | |
| "learning_rate": 4.553643122758549e-05, | |
| "loss": 1.7142, | |
| "loss_layer_12_head": 0.3386912941932678, | |
| "loss_layer_18_head": 0.4927671551704407, | |
| "loss_layer_24_head": 0.15460585057735443, | |
| "loss_layer_30_head": 0.09897184371948242, | |
| "loss_layer_36_head": 0.06582445651292801, | |
| "loss_layer_42_head": 0.03793860599398613, | |
| "loss_layer_6_head": 0.5209053754806519, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 90.71225071225071, | |
| "grad_norm": 0.0727769744619384, | |
| "learning_rate": 4.16736213181515e-05, | |
| "loss": 1.692, | |
| "loss_layer_12_head": 0.32157278060913086, | |
| "loss_layer_18_head": 0.4727444648742676, | |
| "loss_layer_24_head": 0.14894722402095795, | |
| "loss_layer_30_head": 0.09573666006326675, | |
| "loss_layer_36_head": 0.06382396072149277, | |
| "loss_layer_42_head": 0.03748338297009468, | |
| "loss_layer_6_head": 0.4971844553947449, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 90.94017094017094, | |
| "grad_norm": 0.06996748861179324, | |
| "learning_rate": 3.798061746947995e-05, | |
| "loss": 1.7028, | |
| "loss_layer_12_head": 0.33345505595207214, | |
| "loss_layer_18_head": 0.4878571629524231, | |
| "loss_layer_24_head": 0.15110231935977936, | |
| "loss_layer_30_head": 0.09630671888589859, | |
| "loss_layer_36_head": 0.0633818656206131, | |
| "loss_layer_42_head": 0.03600713983178139, | |
| "loss_layer_6_head": 0.5108103156089783, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 91.16809116809117, | |
| "grad_norm": 0.07222187273768782, | |
| "learning_rate": 3.445767477155443e-05, | |
| "loss": 1.6981, | |
| "loss_layer_12_head": 0.3441750407218933, | |
| "loss_layer_18_head": 0.5018490552902222, | |
| "loss_layer_24_head": 0.15837827324867249, | |
| "loss_layer_30_head": 0.10115264356136322, | |
| "loss_layer_36_head": 0.06739296019077301, | |
| "loss_layer_42_head": 0.038907259702682495, | |
| "loss_layer_6_head": 0.5249325633049011, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 91.16809116809117, | |
| "eval_loss": 3.761641025543213, | |
| "eval_loss_layer_12_head": 0.8046773076057434, | |
| "eval_loss_layer_18_head": 0.721228301525116, | |
| "eval_loss_layer_24_head": 0.4395800828933716, | |
| "eval_loss_layer_30_head": 0.30423250794410706, | |
| "eval_loss_layer_36_head": 0.20396752655506134, | |
| "eval_loss_layer_42_head": 0.1346331238746643, | |
| "eval_loss_layer_6_head": 1.0709084272384644, | |
| "eval_runtime": 4.9296, | |
| "eval_samples_per_second": 6.694, | |
| "eval_steps_per_second": 0.609, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 91.3960113960114, | |
| "grad_norm": 0.06956615545566998, | |
| "learning_rate": 3.110503656758079e-05, | |
| "loss": 1.7282, | |
| "loss_layer_12_head": 0.346196711063385, | |
| "loss_layer_18_head": 0.4989009499549866, | |
| "loss_layer_24_head": 0.1574573814868927, | |
| "loss_layer_30_head": 0.0996241420507431, | |
| "loss_layer_36_head": 0.06551764905452728, | |
| "loss_layer_42_head": 0.03770477697253227, | |
| "loss_layer_6_head": 0.527807354927063, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 91.62393162393163, | |
| "grad_norm": 0.06754759103070487, | |
| "learning_rate": 2.7922934437178695e-05, | |
| "loss": 1.7122, | |
| "loss_layer_12_head": 0.33628371357917786, | |
| "loss_layer_18_head": 0.4885508418083191, | |
| "loss_layer_24_head": 0.15223391354084015, | |
| "loss_layer_30_head": 0.0969274565577507, | |
| "loss_layer_36_head": 0.06416411697864532, | |
| "loss_layer_42_head": 0.03708343952894211, | |
| "loss_layer_6_head": 0.5166887044906616, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 91.85185185185185, | |
| "grad_norm": 0.06613244713287736, | |
| "learning_rate": 2.4911588180384083e-05, | |
| "loss": 1.7041, | |
| "loss_layer_12_head": 0.3314448893070221, | |
| "loss_layer_18_head": 0.49047932028770447, | |
| "loss_layer_24_head": 0.15160222351551056, | |
| "loss_layer_30_head": 0.09629616886377335, | |
| "loss_layer_36_head": 0.06369747221469879, | |
| "loss_layer_42_head": 0.03670356050133705, | |
| "loss_layer_6_head": 0.509692907333374, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 92.07977207977208, | |
| "grad_norm": 0.06373357880238427, | |
| "learning_rate": 2.2071205802468298e-05, | |
| "loss": 1.6886, | |
| "loss_layer_12_head": 0.3328821063041687, | |
| "loss_layer_18_head": 0.4800987243652344, | |
| "loss_layer_24_head": 0.151140958070755, | |
| "loss_layer_30_head": 0.09596490859985352, | |
| "loss_layer_36_head": 0.06343450397253036, | |
| "loss_layer_42_head": 0.03617607057094574, | |
| "loss_layer_6_head": 0.5120709538459778, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 92.3076923076923, | |
| "grad_norm": 0.06615553804228982, | |
| "learning_rate": 1.9401983499569843e-05, | |
| "loss": 1.7013, | |
| "loss_layer_12_head": 0.3357425928115845, | |
| "loss_layer_18_head": 0.49065274000167847, | |
| "loss_layer_24_head": 0.1525392085313797, | |
| "loss_layer_30_head": 0.09637175500392914, | |
| "loss_layer_36_head": 0.06411570310592651, | |
| "loss_layer_42_head": 0.03685402125120163, | |
| "loss_layer_6_head": 0.5137232542037964, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 92.53561253561253, | |
| "grad_norm": 0.06891724751157484, | |
| "learning_rate": 1.690410564514244e-05, | |
| "loss": 1.7192, | |
| "loss_layer_12_head": 0.3366279602050781, | |
| "loss_layer_18_head": 0.48958778381347656, | |
| "loss_layer_24_head": 0.15377403795719147, | |
| "loss_layer_30_head": 0.09771725535392761, | |
| "loss_layer_36_head": 0.06452822685241699, | |
| "loss_layer_42_head": 0.03708551451563835, | |
| "loss_layer_6_head": 0.5164296627044678, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 92.76353276353277, | |
| "grad_norm": 0.06761410588330152, | |
| "learning_rate": 1.4577744777219682e-05, | |
| "loss": 1.7225, | |
| "loss_layer_12_head": 0.33475255966186523, | |
| "loss_layer_18_head": 0.49240392446517944, | |
| "loss_layer_24_head": 0.15306416153907776, | |
| "loss_layer_30_head": 0.0966690331697464, | |
| "loss_layer_36_head": 0.0640743225812912, | |
| "loss_layer_42_head": 0.03736594319343567, | |
| "loss_layer_6_head": 0.5156325101852417, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 92.99145299145299, | |
| "grad_norm": 0.06987564491458538, | |
| "learning_rate": 1.2423061586496476e-05, | |
| "loss": 1.7058, | |
| "loss_layer_12_head": 0.34400632977485657, | |
| "loss_layer_18_head": 0.49989375472068787, | |
| "loss_layer_24_head": 0.15913555026054382, | |
| "loss_layer_30_head": 0.10079015791416168, | |
| "loss_layer_36_head": 0.06650825589895248, | |
| "loss_layer_42_head": 0.03808695822954178, | |
| "loss_layer_6_head": 0.5247647762298584, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 93.21937321937322, | |
| "grad_norm": 0.06884001051952174, | |
| "learning_rate": 1.0440204905230455e-05, | |
| "loss": 1.7043, | |
| "loss_layer_12_head": 0.35419824719429016, | |
| "loss_layer_18_head": 0.5190885663032532, | |
| "loss_layer_24_head": 0.1636066883802414, | |
| "loss_layer_30_head": 0.1028328388929367, | |
| "loss_layer_36_head": 0.0677228644490242, | |
| "loss_layer_42_head": 0.03939133882522583, | |
| "loss_layer_6_head": 0.5393562316894531, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 93.44729344729345, | |
| "grad_norm": 0.06869857148762144, | |
| "learning_rate": 8.629311696961295e-06, | |
| "loss": 1.7002, | |
| "loss_layer_12_head": 0.33199894428253174, | |
| "loss_layer_18_head": 0.4830838143825531, | |
| "loss_layer_24_head": 0.14906497299671173, | |
| "loss_layer_30_head": 0.09444169700145721, | |
| "loss_layer_36_head": 0.06229046732187271, | |
| "loss_layer_42_head": 0.03576675057411194, | |
| "loss_layer_6_head": 0.5107249021530151, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 93.67521367521367, | |
| "grad_norm": 0.0647256792540863, | |
| "learning_rate": 6.990507047049677e-06, | |
| "loss": 1.7197, | |
| "loss_layer_12_head": 0.33809465169906616, | |
| "loss_layer_18_head": 0.49668869376182556, | |
| "loss_layer_24_head": 0.15616574883460999, | |
| "loss_layer_30_head": 0.09987618774175644, | |
| "loss_layer_36_head": 0.06613916158676147, | |
| "loss_layer_42_head": 0.03771573677659035, | |
| "loss_layer_6_head": 0.5225865244865417, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 93.9031339031339, | |
| "grad_norm": 0.06487422745793997, | |
| "learning_rate": 5.523904154037529e-06, | |
| "loss": 1.7034, | |
| "loss_layer_12_head": 0.34006571769714355, | |
| "loss_layer_18_head": 0.4934326112270355, | |
| "loss_layer_24_head": 0.15561194717884064, | |
| "loss_layer_30_head": 0.09845955669879913, | |
| "loss_layer_36_head": 0.06562654674053192, | |
| "loss_layer_42_head": 0.038369424641132355, | |
| "loss_layer_6_head": 0.5245457887649536, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 94.13105413105413, | |
| "grad_norm": 0.06340076159086788, | |
| "learning_rate": 4.229604321829561e-06, | |
| "loss": 1.6996, | |
| "loss_layer_12_head": 0.3449529707431793, | |
| "loss_layer_18_head": 0.5078026056289673, | |
| "loss_layer_24_head": 0.1566687673330307, | |
| "loss_layer_30_head": 0.09894686192274094, | |
| "loss_layer_36_head": 0.06501217186450958, | |
| "loss_layer_42_head": 0.0374465249478817, | |
| "loss_layer_6_head": 0.5327596068382263, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 94.35897435897436, | |
| "grad_norm": 0.06309615709636959, | |
| "learning_rate": 3.107696952694139e-06, | |
| "loss": 1.7155, | |
| "loss_layer_12_head": 0.3498449921607971, | |
| "loss_layer_18_head": 0.5132280588150024, | |
| "loss_layer_24_head": 0.15977954864501953, | |
| "loss_layer_30_head": 0.10074914991855621, | |
| "loss_layer_36_head": 0.06633107364177704, | |
| "loss_layer_42_head": 0.03793049603700638, | |
| "loss_layer_6_head": 0.5361655950546265, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 94.58689458689459, | |
| "grad_norm": 0.062173287336752, | |
| "learning_rate": 2.1582595410896134e-06, | |
| "loss": 1.7073, | |
| "loss_layer_12_head": 0.3325282335281372, | |
| "loss_layer_18_head": 0.47551655769348145, | |
| "loss_layer_24_head": 0.1513689160346985, | |
| "loss_layer_30_head": 0.09635426104068756, | |
| "loss_layer_36_head": 0.06406603753566742, | |
| "loss_layer_42_head": 0.03743938356637955, | |
| "loss_layer_6_head": 0.5106655359268188, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 94.81481481481481, | |
| "grad_norm": 0.06493713768466465, | |
| "learning_rate": 1.3813576683111006e-06, | |
| "loss": 1.7118, | |
| "loss_layer_12_head": 0.33602645993232727, | |
| "loss_layer_18_head": 0.4942130446434021, | |
| "loss_layer_24_head": 0.1509321630001068, | |
| "loss_layer_30_head": 0.09551974385976791, | |
| "loss_layer_36_head": 0.06285407394170761, | |
| "loss_layer_42_head": 0.03567957133054733, | |
| "loss_layer_6_head": 0.5194729566574097, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 95.04273504273505, | |
| "grad_norm": 0.06187259503088811, | |
| "learning_rate": 7.770449979593864e-07, | |
| "loss": 1.696, | |
| "loss_layer_12_head": 0.3397129476070404, | |
| "loss_layer_18_head": 0.4978001117706299, | |
| "loss_layer_24_head": 0.15636396408081055, | |
| "loss_layer_30_head": 0.09891422092914581, | |
| "loss_layer_36_head": 0.06569842994213104, | |
| "loss_layer_42_head": 0.03745325654745102, | |
| "loss_layer_6_head": 0.5212146639823914, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 95.27065527065527, | |
| "grad_norm": 0.06519728300513385, | |
| "learning_rate": 3.453632722358324e-07, | |
| "loss": 1.6931, | |
| "loss_layer_12_head": 0.32379400730133057, | |
| "loss_layer_18_head": 0.473306804895401, | |
| "loss_layer_24_head": 0.14877896010875702, | |
| "loss_layer_30_head": 0.09489592164754868, | |
| "loss_layer_36_head": 0.06364993751049042, | |
| "loss_layer_42_head": 0.03758970648050308, | |
| "loss_layer_6_head": 0.4982914924621582, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 95.4985754985755, | |
| "grad_norm": 0.06085731874844273, | |
| "learning_rate": 8.634230905774088e-08, | |
| "loss": 1.7092, | |
| "loss_layer_12_head": 0.3473987877368927, | |
| "loss_layer_18_head": 0.5032440423965454, | |
| "loss_layer_24_head": 0.16124534606933594, | |
| "loss_layer_30_head": 0.10328890383243561, | |
| "loss_layer_36_head": 0.06872069090604782, | |
| "loss_layer_42_head": 0.039262156933546066, | |
| "loss_layer_6_head": 0.5293939709663391, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 95.72649572649573, | |
| "grad_norm": 0.06407971588377558, | |
| "learning_rate": 0.0, | |
| "loss": 1.7188, | |
| "loss_layer_12_head": 0.34668242931365967, | |
| "loss_layer_18_head": 0.5045074224472046, | |
| "loss_layer_24_head": 0.1579972803592682, | |
| "loss_layer_30_head": 0.0999143049120903, | |
| "loss_layer_36_head": 0.0661567822098732, | |
| "loss_layer_42_head": 0.038395121693611145, | |
| "loss_layer_6_head": 0.5313812494277954, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 95.72649572649573, | |
| "step": 2100, | |
| "total_flos": 2.2690272796968223e+18, | |
| "train_loss": 2.834041218984695, | |
| "train_runtime": 32024.3772, | |
| "train_samples_per_second": 8.75, | |
| "train_steps_per_second": 0.066 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2100, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2690272796968223e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |