{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.029864341229962895, "eval_steps": 1000, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.4932170614981446e-05, "grad_norm": 81.5, "learning_rate": 0.0001, "loss": 5.4834, "loss/crossentropy": 3.0311607122421265, "loss/fcd": 4.953125, "loss/idx": 0.0, "loss/logits": 0.5302902162075043, "step": 1 }, { "epoch": 2.9864341229962893e-05, "grad_norm": 77.0, "learning_rate": 0.0001, "loss": 5.4138, "loss/crossentropy": 2.7625315189361572, "loss/fcd": 4.921875, "loss/idx": 0.0, "loss/logits": 0.4918830245733261, "step": 2 }, { "epoch": 4.479651184494434e-05, "grad_norm": 77.0, "learning_rate": 0.0001, "loss": 5.2211, "loss/crossentropy": 3.0718085765838623, "loss/fcd": 4.703125, "loss/idx": 0.0, "loss/logits": 0.5179921388626099, "step": 3 }, { "epoch": 5.9728682459925785e-05, "grad_norm": 66.5, "learning_rate": 0.0001, "loss": 4.9244, "loss/crossentropy": 2.7016193866729736, "loss/fcd": 4.5, "loss/idx": 0.0, "loss/logits": 0.4243808537721634, "step": 4 }, { "epoch": 7.466085307490723e-05, "grad_norm": 55.75, "learning_rate": 0.0001, "loss": 4.6617, "loss/crossentropy": 2.9749388694763184, "loss/fcd": 4.265625, "loss/idx": 0.0, "loss/logits": 0.39606572687625885, "step": 5 }, { "epoch": 8.959302368988868e-05, "grad_norm": 52.25, "learning_rate": 0.0001, "loss": 4.4842, "loss/crossentropy": 2.7478275299072266, "loss/fcd": 4.09375, "loss/idx": 0.0, "loss/logits": 0.39042919874191284, "step": 6 }, { "epoch": 0.00010452519430487013, "grad_norm": 58.25, "learning_rate": 0.0001, "loss": 4.7182, "loss/crossentropy": 3.114223837852478, "loss/fcd": 4.3125, "loss/idx": 0.0, "loss/logits": 0.4056805968284607, "step": 7 }, { "epoch": 0.00011945736491985157, "grad_norm": 47.75, "learning_rate": 0.0001, "loss": 4.4918, "loss/crossentropy": 2.852568507194519, "loss/fcd": 4.109375, "loss/idx": 0.0, "loss/logits": 0.3824669420719147, "step": 8 }, { "epoch": 0.000134389535534833, "grad_norm": 40.25, "learning_rate": 0.0001, "loss": 4.2264, "loss/crossentropy": 2.582517981529236, "loss/fcd": 3.8828125, "loss/idx": 0.0, "loss/logits": 0.3435918539762497, "step": 9 }, { "epoch": 0.00014932170614981446, "grad_norm": 32.25, "learning_rate": 0.0001, "loss": 3.967, "loss/crossentropy": 2.6108912229537964, "loss/fcd": 3.6328125, "loss/idx": 0.0, "loss/logits": 0.33417809009552, "step": 10 }, { "epoch": 0.0001642538767647959, "grad_norm": 33.75, "learning_rate": 0.0001, "loss": 4.1011, "loss/crossentropy": 2.5188854932785034, "loss/fcd": 3.75, "loss/idx": 0.0, "loss/logits": 0.3511316478252411, "step": 11 }, { "epoch": 0.00017918604737977736, "grad_norm": 32.25, "learning_rate": 0.0001, "loss": 3.9782, "loss/crossentropy": 2.6797866821289062, "loss/fcd": 3.6171875, "loss/idx": 0.0, "loss/logits": 0.36099183559417725, "step": 12 }, { "epoch": 0.00019411821799475881, "grad_norm": 30.125, "learning_rate": 0.0001, "loss": 3.7539, "loss/crossentropy": 2.842925786972046, "loss/fcd": 3.4453125, "loss/idx": 0.0, "loss/logits": 0.30860865116119385, "step": 13 }, { "epoch": 0.00020905038860974027, "grad_norm": 25.625, "learning_rate": 0.0001, "loss": 3.6666, "loss/crossentropy": 2.795591711997986, "loss/fcd": 3.375, "loss/idx": 0.0, "loss/logits": 0.29162760078907013, "step": 14 }, { "epoch": 0.0002239825592247217, "grad_norm": 24.25, "learning_rate": 0.0001, "loss": 3.5034, "loss/crossentropy": 2.6695363521575928, "loss/fcd": 3.21875, "loss/idx": 0.0, "loss/logits": 0.28466810286045074, "step": 15 }, { "epoch": 0.00023891472983970314, "grad_norm": 19.0, "grad_norm_var": 420.071875, "learning_rate": 0.0001, "loss": 3.2914, "loss/crossentropy": 2.761489510536194, "loss/fcd": 3.0546875, "loss/idx": 0.0, "loss/logits": 0.2366952747106552, "step": 16 }, { "epoch": 0.0002538469004546846, "grad_norm": 19.75, "grad_norm_var": 375.11015625, "learning_rate": 0.0001, "loss": 3.3177, "loss/crossentropy": 2.822671890258789, "loss/fcd": 3.078125, "loss/idx": 0.0, "loss/logits": 0.23954815417528152, "step": 17 }, { "epoch": 0.000268779071069666, "grad_norm": 18.25, "grad_norm_var": 326.3354166666667, "learning_rate": 0.0001, "loss": 3.3004, "loss/crossentropy": 2.580729126930237, "loss/fcd": 3.03125, "loss/idx": 0.0, "loss/logits": 0.269170880317688, "step": 18 }, { "epoch": 0.00028371124168464747, "grad_norm": 15.125, "grad_norm_var": 256.7582682291667, "learning_rate": 0.0001, "loss": 2.9912, "loss/crossentropy": 2.450445771217346, "loss/fcd": 2.78125, "loss/idx": 0.0, "loss/logits": 0.20998132228851318, "step": 19 }, { "epoch": 0.0002986434122996289, "grad_norm": 13.75, "grad_norm_var": 214.0087890625, "learning_rate": 0.0001, "loss": 3.1281, "loss/crossentropy": 2.683855891227722, "loss/fcd": 2.875, "loss/idx": 0.0, "loss/logits": 0.2530509978532791, "step": 20 }, { "epoch": 0.00031357558291461037, "grad_norm": 12.125, "grad_norm_var": 197.12682291666667, "learning_rate": 0.0001, "loss": 2.864, "loss/crossentropy": 2.523205280303955, "loss/fcd": 2.671875, "loss/idx": 0.0, "loss/logits": 0.1921716332435608, "step": 21 }, { "epoch": 0.0003285077535295918, "grad_norm": 10.5625, "grad_norm_var": 180.245556640625, "learning_rate": 0.0001, "loss": 2.8605, "loss/crossentropy": 2.6747522354125977, "loss/fcd": 2.65625, "loss/idx": 0.0, "loss/logits": 0.20420243591070175, "step": 22 }, { "epoch": 0.0003434399241445733, "grad_norm": 8.875, "grad_norm_var": 127.321728515625, "learning_rate": 0.0001, "loss": 2.7268, "loss/crossentropy": 2.48712956905365, "loss/fcd": 2.515625, "loss/idx": 0.0, "loss/logits": 0.2112211287021637, "step": 23 }, { "epoch": 0.0003583720947595547, "grad_norm": 7.5625, "grad_norm_var": 100.89635416666667, "learning_rate": 0.0001, "loss": 2.7417, "loss/crossentropy": 2.464865803718567, "loss/fcd": 2.53125, "loss/idx": 0.0, "loss/logits": 0.2104937955737114, "step": 24 }, { "epoch": 0.0003733042653745362, "grad_norm": 6.71875, "grad_norm_var": 87.20006103515625, "learning_rate": 0.0001, "loss": 2.5018, "loss/crossentropy": 2.6716034412384033, "loss/fcd": 2.3125, "loss/idx": 0.0, "loss/logits": 0.18925867974758148, "step": 25 }, { "epoch": 0.00038823643598951763, "grad_norm": 6.59375, "grad_norm_var": 84.29034830729167, "learning_rate": 0.0001, "loss": 2.6178, "loss/crossentropy": 2.6646894216537476, "loss/fcd": 2.421875, "loss/idx": 0.0, "loss/logits": 0.19596357643604279, "step": 26 }, { "epoch": 0.0004031686066044991, "grad_norm": 5.25, "grad_norm_var": 74.33019205729167, "learning_rate": 0.0001, "loss": 2.5377, "loss/crossentropy": 2.4073028564453125, "loss/fcd": 2.359375, "loss/idx": 0.0, "loss/logits": 0.17834600806236267, "step": 27 }, { "epoch": 0.00041810077721948053, "grad_norm": 4.9375, "grad_norm_var": 61.733723958333336, "learning_rate": 0.0001, "loss": 2.3672, "loss/crossentropy": 2.63876473903656, "loss/fcd": 2.1875, "loss/idx": 0.0, "loss/logits": 0.17965663224458694, "step": 28 }, { "epoch": 0.000433032947834462, "grad_norm": 4.78125, "grad_norm_var": 48.33915608723958, "learning_rate": 0.0001, "loss": 2.5811, "loss/crossentropy": 2.325456917285919, "loss/fcd": 2.375, "loss/idx": 0.0, "loss/logits": 0.20605524629354477, "step": 29 }, { "epoch": 0.0004479651184494434, "grad_norm": 5.09375, "grad_norm_var": 39.295247395833336, "learning_rate": 0.0001, "loss": 2.5423, "loss/crossentropy": 2.83246386051178, "loss/fcd": 2.359375, "loss/idx": 0.0, "loss/logits": 0.18287606537342072, "step": 30 }, { "epoch": 0.00046289728906442483, "grad_norm": 3.515625, "grad_norm_var": 30.678872680664064, "learning_rate": 0.0001, "loss": 2.3684, "loss/crossentropy": 2.755223870277405, "loss/fcd": 2.1640625, "loss/idx": 0.0, "loss/logits": 0.20429068058729172, "step": 31 }, { "epoch": 0.0004778294596794063, "grad_norm": 5.6875, "grad_norm_var": 25.990029907226564, "learning_rate": 0.0001, "loss": 3.0309, "loss/crossentropy": 2.2701921463012695, "loss/fcd": 2.7265625, "loss/idx": 0.0, "loss/logits": 0.3043238967657089, "step": 32 }, { "epoch": 0.0004927616302943877, "grad_norm": 5.0, "grad_norm_var": 19.008747355143228, "learning_rate": 0.0001, "loss": 2.7374, "loss/crossentropy": 2.8550366163253784, "loss/fcd": 2.4375, "loss/idx": 0.0, "loss/logits": 0.29989800602197647, "step": 33 }, { "epoch": 0.0005076938009093692, "grad_norm": 3.25, "grad_norm_var": 13.29976298014323, "learning_rate": 0.0001, "loss": 2.3173, "loss/crossentropy": 2.6526776552200317, "loss/fcd": 2.125, "loss/idx": 0.0, "loss/logits": 0.19229362159967422, "step": 34 }, { "epoch": 0.0005226259715243506, "grad_norm": 3.15625, "grad_norm_var": 9.967837524414062, "learning_rate": 0.0001, "loss": 2.0892, "loss/crossentropy": 2.480757474899292, "loss/fcd": 1.9375, "loss/idx": 0.0, "loss/logits": 0.15168970823287964, "step": 35 }, { "epoch": 0.000537558142139332, "grad_norm": 4.15625, "grad_norm_var": 6.6749827067057295, "learning_rate": 0.0001, "loss": 2.1965, "loss/crossentropy": 2.3921128511428833, "loss/fcd": 2.01953125, "loss/idx": 0.0, "loss/logits": 0.17700091004371643, "step": 36 }, { "epoch": 0.0005524903127543135, "grad_norm": 3.34375, "grad_norm_var": 4.415640258789063, "learning_rate": 0.0001, "loss": 2.1708, "loss/crossentropy": 2.6723674535751343, "loss/fcd": 2.01171875, "loss/idx": 0.0, "loss/logits": 0.15905070304870605, "step": 37 }, { "epoch": 0.0005674224833692949, "grad_norm": 3.09375, "grad_norm_var": 2.8907704671223957, "learning_rate": 0.0001, "loss": 2.1407, "loss/crossentropy": 2.612633228302002, "loss/fcd": 1.9921875, "loss/idx": 0.0, "loss/logits": 0.14852391928434372, "step": 38 }, { "epoch": 0.0005823546539842764, "grad_norm": 3.640625, "grad_norm_var": 1.9430623372395834, "learning_rate": 0.0001, "loss": 2.616, "loss/crossentropy": 2.366433620452881, "loss/fcd": 2.296875, "loss/idx": 0.0, "loss/logits": 0.31916864961385727, "step": 39 }, { "epoch": 0.0005972868245992578, "grad_norm": 3.171875, "grad_norm_var": 1.4934234619140625, "learning_rate": 0.0001, "loss": 2.2019, "loss/crossentropy": 2.4993181228637695, "loss/fcd": 2.0, "loss/idx": 0.0, "loss/logits": 0.20192894339561462, "step": 40 }, { "epoch": 0.0006122189952142393, "grad_norm": 2.984375, "grad_norm_var": 1.2413045247395833, "learning_rate": 0.0001, "loss": 2.1757, "loss/crossentropy": 2.4306472539901733, "loss/fcd": 1.98046875, "loss/idx": 0.0, "loss/logits": 0.19518503546714783, "step": 41 }, { "epoch": 0.0006271511658292207, "grad_norm": 3.265625, "grad_norm_var": 0.8840077718098959, "learning_rate": 0.0001, "loss": 2.1187, "loss/crossentropy": 2.8436198234558105, "loss/fcd": 1.9375, "loss/idx": 0.0, "loss/logits": 0.18118727207183838, "step": 42 }, { "epoch": 0.0006420833364442022, "grad_norm": 2.65625, "grad_norm_var": 0.8792795817057292, "learning_rate": 0.0001, "loss": 2.2649, "loss/crossentropy": 2.4525002241134644, "loss/fcd": 2.05078125, "loss/idx": 0.0, "loss/logits": 0.21407584100961685, "step": 43 }, { "epoch": 0.0006570155070591836, "grad_norm": 2.828125, "grad_norm_var": 0.8538736979166667, "learning_rate": 0.0001, "loss": 2.1198, "loss/crossentropy": 2.4759527444839478, "loss/fcd": 1.92578125, "loss/idx": 0.0, "loss/logits": 0.1939752697944641, "step": 44 }, { "epoch": 0.0006719476776741652, "grad_norm": 2.875, "grad_norm_var": 0.8129191080729167, "learning_rate": 0.0001, "loss": 2.2037, "loss/crossentropy": 2.444726586341858, "loss/fcd": 1.99609375, "loss/idx": 0.0, "loss/logits": 0.20757701992988586, "step": 45 }, { "epoch": 0.0006868798482891465, "grad_norm": 2.390625, "grad_norm_var": 0.7339019775390625, "learning_rate": 0.0001, "loss": 1.8268, "loss/crossentropy": 2.5968172550201416, "loss/fcd": 1.68359375, "loss/idx": 0.0, "loss/logits": 0.14320842921733856, "step": 46 }, { "epoch": 0.000701812018904128, "grad_norm": 2.328125, "grad_norm_var": 0.8098215738932292, "learning_rate": 0.0001, "loss": 1.93, "loss/crossentropy": 2.68948233127594, "loss/fcd": 1.765625, "loss/idx": 0.0, "loss/logits": 0.1643698811531067, "step": 47 }, { "epoch": 0.0007167441895191095, "grad_norm": 2.3125, "grad_norm_var": 0.47627665201822916, "learning_rate": 0.0001, "loss": 1.9692, "loss/crossentropy": 2.520722985267639, "loss/fcd": 1.8125, "loss/idx": 0.0, "loss/logits": 0.15668785572052002, "step": 48 }, { "epoch": 0.0007316763601340908, "grad_norm": 2.25, "grad_norm_var": 0.2718170166015625, "learning_rate": 0.0001, "loss": 1.8559, "loss/crossentropy": 2.518172264099121, "loss/fcd": 1.69921875, "loss/idx": 0.0, "loss/logits": 0.15663356333971024, "step": 49 }, { "epoch": 0.0007466085307490724, "grad_norm": 2.453125, "grad_norm_var": 0.28297119140625, "learning_rate": 0.0001, "loss": 1.9511, "loss/crossentropy": 2.5404844284057617, "loss/fcd": 1.77734375, "loss/idx": 0.0, "loss/logits": 0.1737901046872139, "step": 50 }, { "epoch": 0.0007615407013640538, "grad_norm": 2.765625, "grad_norm_var": 0.28080952962239586, "learning_rate": 0.0001, "loss": 1.9093, "loss/crossentropy": 2.7360819578170776, "loss/fcd": 1.7421875, "loss/idx": 0.0, "loss/logits": 0.1670687422156334, "step": 51 }, { "epoch": 0.0007764728719790353, "grad_norm": 2.703125, "grad_norm_var": 0.17078450520833333, "learning_rate": 0.0001, "loss": 2.1227, "loss/crossentropy": 2.581022262573242, "loss/fcd": 1.91796875, "loss/idx": 0.0, "loss/logits": 0.20477516949176788, "step": 52 }, { "epoch": 0.0007914050425940167, "grad_norm": 27.25, "grad_norm_var": 37.57099202473958, "learning_rate": 0.0001, "loss": 1.8477, "loss/crossentropy": 2.7782737016677856, "loss/fcd": 1.69921875, "loss/idx": 0.0, "loss/logits": 0.14849026501178741, "step": 53 }, { "epoch": 0.0008063372132089982, "grad_norm": 2.515625, "grad_norm_var": 37.68567606608073, "learning_rate": 0.0001, "loss": 2.2443, "loss/crossentropy": 2.6635544300079346, "loss/fcd": 1.96875, "loss/idx": 0.0, "loss/logits": 0.275559701025486, "step": 54 }, { "epoch": 0.0008212693838239796, "grad_norm": 1.9921875, "grad_norm_var": 37.9948117574056, "learning_rate": 0.0001, "loss": 1.7658, "loss/crossentropy": 2.4618231058120728, "loss/fcd": 1.62109375, "loss/idx": 0.0, "loss/logits": 0.144667848944664, "step": 55 }, { "epoch": 0.0008362015544389611, "grad_norm": 3.015625, "grad_norm_var": 38.01716079711914, "learning_rate": 0.0001, "loss": 1.8995, "loss/crossentropy": 2.638582944869995, "loss/fcd": 1.73828125, "loss/idx": 0.0, "loss/logits": 0.16121716797351837, "step": 56 }, { "epoch": 0.0008511337250539425, "grad_norm": 2.046875, "grad_norm_var": 38.21924819946289, "learning_rate": 0.0001, "loss": 1.9172, "loss/crossentropy": 2.588966488838196, "loss/fcd": 1.74609375, "loss/idx": 0.0, "loss/logits": 0.17108920216560364, "step": 57 }, { "epoch": 0.000866065895668924, "grad_norm": 1.953125, "grad_norm_var": 38.47345962524414, "learning_rate": 0.0001, "loss": 1.7661, "loss/crossentropy": 2.5097049474716187, "loss/fcd": 1.63671875, "loss/idx": 0.0, "loss/logits": 0.12938163056969643, "step": 58 }, { "epoch": 0.0008809980662839054, "grad_norm": 2.953125, "grad_norm_var": 38.424946848551436, "learning_rate": 0.0001, "loss": 1.9371, "loss/crossentropy": 2.5010175704956055, "loss/fcd": 1.765625, "loss/idx": 0.0, "loss/logits": 0.17150144279003143, "step": 59 }, { "epoch": 0.0008959302368988868, "grad_norm": 1.7578125, "grad_norm_var": 38.66942545572917, "learning_rate": 0.0001, "loss": 1.7706, "loss/crossentropy": 2.5571951866149902, "loss/fcd": 1.61328125, "loss/idx": 0.0, "loss/logits": 0.1572706550359726, "step": 60 }, { "epoch": 0.0009108624075138683, "grad_norm": 1.890625, "grad_norm_var": 38.87405497233073, "learning_rate": 0.0001, "loss": 1.7896, "loss/crossentropy": 2.4101879596710205, "loss/fcd": 1.640625, "loss/idx": 0.0, "loss/logits": 0.14897086471319199, "step": 61 }, { "epoch": 0.0009257945781288497, "grad_norm": 1.6953125, "grad_norm_var": 39.04523493448893, "learning_rate": 0.0001, "loss": 1.7102, "loss/crossentropy": 2.5094202756881714, "loss/fcd": 1.56640625, "loss/idx": 0.0, "loss/logits": 0.14381400495767593, "step": 62 }, { "epoch": 0.0009407267487438312, "grad_norm": 1.6953125, "grad_norm_var": 39.20016276041667, "learning_rate": 0.0001, "loss": 1.8086, "loss/crossentropy": 2.76530921459198, "loss/fcd": 1.640625, "loss/idx": 0.0, "loss/logits": 0.1680033802986145, "step": 63 }, { "epoch": 0.0009556589193588126, "grad_norm": 1.9296875, "grad_norm_var": 39.2866818745931, "learning_rate": 0.0001, "loss": 1.9316, "loss/crossentropy": 2.355651021003723, "loss/fcd": 1.75390625, "loss/idx": 0.0, "loss/logits": 0.17767927050590515, "step": 64 }, { "epoch": 0.0009705910899737941, "grad_norm": 1.7578125, "grad_norm_var": 39.40381673177083, "learning_rate": 0.0001, "loss": 1.7647, "loss/crossentropy": 2.4329527616500854, "loss/fcd": 1.61328125, "loss/idx": 0.0, "loss/logits": 0.1514434814453125, "step": 65 }, { "epoch": 0.0009855232605887755, "grad_norm": 2.421875, "grad_norm_var": 39.40937906901042, "learning_rate": 0.0001, "loss": 2.1449, "loss/crossentropy": 2.527339816093445, "loss/fcd": 1.8828125, "loss/idx": 0.0, "loss/logits": 0.26208290457725525, "step": 66 }, { "epoch": 0.0010004554312037569, "grad_norm": 1.7109375, "grad_norm_var": 39.62035090128581, "learning_rate": 0.0001, "loss": 1.7499, "loss/crossentropy": 2.788802742958069, "loss/fcd": 1.5859375, "loss/idx": 0.0, "loss/logits": 0.16401013731956482, "step": 67 }, { "epoch": 0.0010153876018187385, "grad_norm": 1.7578125, "grad_norm_var": 39.80255126953125, "learning_rate": 0.0001, "loss": 1.7925, "loss/crossentropy": 2.6775633096694946, "loss/fcd": 1.6171875, "loss/idx": 0.0, "loss/logits": 0.17531326413154602, "step": 68 }, { "epoch": 0.0010303197724337199, "grad_norm": 1.7578125, "grad_norm_var": 0.19091161092122397, "learning_rate": 0.0001, "loss": 1.8299, "loss/crossentropy": 2.444396138191223, "loss/fcd": 1.671875, "loss/idx": 0.0, "loss/logits": 0.15805941075086594, "step": 69 }, { "epoch": 0.0010452519430487013, "grad_norm": 1.9609375, "grad_norm_var": 0.1759429931640625, "learning_rate": 0.0001, "loss": 1.9339, "loss/crossentropy": 2.4972236156463623, "loss/fcd": 1.7109375, "loss/idx": 0.0, "loss/logits": 0.22294466942548752, "step": 70 }, { "epoch": 0.0010601841136636827, "grad_norm": 1.4296875, "grad_norm_var": 0.1976959228515625, "learning_rate": 0.0001, "loss": 1.6659, "loss/crossentropy": 2.6186927556991577, "loss/fcd": 1.515625, "loss/idx": 0.0, "loss/logits": 0.15028595924377441, "step": 71 }, { "epoch": 0.001075116284278664, "grad_norm": 4.09375, "grad_norm_var": 0.4187255859375, "learning_rate": 0.0001, "loss": 1.8539, "loss/crossentropy": 2.429172396659851, "loss/fcd": 1.6640625, "loss/idx": 0.0, "loss/logits": 0.1898861974477768, "step": 72 }, { "epoch": 0.0010900484548936457, "grad_norm": 1.9296875, "grad_norm_var": 0.4196449279785156, "learning_rate": 0.0001, "loss": 1.9193, "loss/crossentropy": 2.364608407020569, "loss/fcd": 1.7265625, "loss/idx": 0.0, "loss/logits": 0.19276107847690582, "step": 73 }, { "epoch": 0.001104980625508627, "grad_norm": 1.5234375, "grad_norm_var": 0.43635965983072916, "learning_rate": 0.0001, "loss": 1.6468, "loss/crossentropy": 2.489943504333496, "loss/fcd": 1.49609375, "loss/idx": 0.0, "loss/logits": 0.15067894011735916, "step": 74 }, { "epoch": 0.0011199127961236085, "grad_norm": 1.84375, "grad_norm_var": 0.37475179036458334, "learning_rate": 0.0001, "loss": 1.9659, "loss/crossentropy": 2.5017722845077515, "loss/fcd": 1.74609375, "loss/idx": 0.0, "loss/logits": 0.21985302865505219, "step": 75 }, { "epoch": 0.0011348449667385899, "grad_norm": 1.5859375, "grad_norm_var": 0.38093973795572916, "learning_rate": 0.0001, "loss": 1.6673, "loss/crossentropy": 2.523123264312744, "loss/fcd": 1.5234375, "loss/idx": 0.0, "loss/logits": 0.14388950169086456, "step": 76 }, { "epoch": 0.0011497771373535715, "grad_norm": 1.5703125, "grad_norm_var": 0.38931248982747396, "learning_rate": 0.0001, "loss": 1.7218, "loss/crossentropy": 2.501427173614502, "loss/fcd": 1.55859375, "loss/idx": 0.0, "loss/logits": 0.16323504596948624, "step": 77 }, { "epoch": 0.0011647093079685529, "grad_norm": 2.4375, "grad_norm_var": 0.40185139973958334, "learning_rate": 0.0001, "loss": 1.7289, "loss/crossentropy": 2.392626166343689, "loss/fcd": 1.58203125, "loss/idx": 0.0, "loss/logits": 0.1468452885746956, "step": 78 }, { "epoch": 0.0011796414785835343, "grad_norm": 1.8515625, "grad_norm_var": 0.397802734375, "learning_rate": 0.0001, "loss": 1.6939, "loss/crossentropy": 2.572135329246521, "loss/fcd": 1.54296875, "loss/idx": 0.0, "loss/logits": 0.15093941986560822, "step": 79 }, { "epoch": 0.0011945736491985157, "grad_norm": 1.703125, "grad_norm_var": 0.4023089090983073, "learning_rate": 0.0001, "loss": 1.7403, "loss/crossentropy": 2.265039086341858, "loss/fcd": 1.5625, "loss/idx": 0.0, "loss/logits": 0.17782587558031082, "step": 80 }, { "epoch": 0.0012095058198134973, "grad_norm": 1.3203125, "grad_norm_var": 0.4259783426920573, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.432965636253357, "loss/fcd": 1.37890625, "loss/idx": 0.0, "loss/logits": 0.1092129796743393, "step": 81 }, { "epoch": 0.0012244379904284787, "grad_norm": 1.5546875, "grad_norm_var": 0.4162394205729167, "learning_rate": 0.0001, "loss": 1.7744, "loss/crossentropy": 2.387954354286194, "loss/fcd": 1.609375, "loss/idx": 0.0, "loss/logits": 0.16505713760852814, "step": 82 }, { "epoch": 0.00123937016104346, "grad_norm": 2.203125, "grad_norm_var": 0.4204851786295573, "learning_rate": 0.0001, "loss": 1.985, "loss/crossentropy": 2.412451148033142, "loss/fcd": 1.76953125, "loss/idx": 0.0, "loss/logits": 0.21551693975925446, "step": 83 }, { "epoch": 0.0012543023316584415, "grad_norm": 1.6328125, "grad_norm_var": 0.42396011352539065, "learning_rate": 0.0001, "loss": 1.7763, "loss/crossentropy": 2.3627136945724487, "loss/fcd": 1.609375, "loss/idx": 0.0, "loss/logits": 0.1669153794646263, "step": 84 }, { "epoch": 0.0012692345022734229, "grad_norm": 2.1875, "grad_norm_var": 0.4273590087890625, "learning_rate": 0.0001, "loss": 1.8321, "loss/crossentropy": 2.3265154361724854, "loss/fcd": 1.65234375, "loss/idx": 0.0, "loss/logits": 0.17975304275751114, "step": 85 }, { "epoch": 0.0012841666728884045, "grad_norm": 1.8125, "grad_norm_var": 0.4280596415201823, "learning_rate": 0.0001, "loss": 1.8219, "loss/crossentropy": 2.644020676612854, "loss/fcd": 1.65234375, "loss/idx": 0.0, "loss/logits": 0.16957848519086838, "step": 86 }, { "epoch": 0.001299098843503386, "grad_norm": 1.4921875, "grad_norm_var": 0.4242388407389323, "learning_rate": 0.0001, "loss": 1.644, "loss/crossentropy": 2.58134126663208, "loss/fcd": 1.484375, "loss/idx": 0.0, "loss/logits": 0.15963882207870483, "step": 87 }, { "epoch": 0.0013140310141183673, "grad_norm": 1.9765625, "grad_norm_var": 0.09115397135416667, "learning_rate": 0.0001, "loss": 1.7601, "loss/crossentropy": 2.463197350502014, "loss/fcd": 1.58203125, "loss/idx": 0.0, "loss/logits": 0.17805806547403336, "step": 88 }, { "epoch": 0.0013289631847333487, "grad_norm": 1.4609375, "grad_norm_var": 0.09609781901041667, "learning_rate": 0.0001, "loss": 1.6648, "loss/crossentropy": 2.7656772136688232, "loss/fcd": 1.5078125, "loss/idx": 0.0, "loss/logits": 0.15694016218185425, "step": 89 }, { "epoch": 0.0013438953553483303, "grad_norm": 22.5, "grad_norm_var": 26.936128489176433, "learning_rate": 0.0001, "loss": 1.6137, "loss/crossentropy": 2.4002861976623535, "loss/fcd": 1.4765625, "loss/idx": 0.0, "loss/logits": 0.137087844312191, "step": 90 }, { "epoch": 0.0013588275259633117, "grad_norm": 1.8125, "grad_norm_var": 26.941302235921224, "learning_rate": 0.0001, "loss": 1.6349, "loss/crossentropy": 2.4773519039154053, "loss/fcd": 1.484375, "loss/idx": 0.0, "loss/logits": 0.15047870576381683, "step": 91 }, { "epoch": 0.001373759696578293, "grad_norm": 1.765625, "grad_norm_var": 26.907792154947916, "learning_rate": 0.0001, "loss": 1.7613, "loss/crossentropy": 2.472650408744812, "loss/fcd": 1.58203125, "loss/idx": 0.0, "loss/logits": 0.17930901050567627, "step": 92 }, { "epoch": 0.0013886918671932745, "grad_norm": 1.53125, "grad_norm_var": 26.915750885009764, "learning_rate": 0.0001, "loss": 1.6722, "loss/crossentropy": 2.6694291830062866, "loss/fcd": 1.51953125, "loss/idx": 0.0, "loss/logits": 0.15264244377613068, "step": 93 }, { "epoch": 0.001403624037808256, "grad_norm": 0.8984375, "grad_norm_var": 27.195156860351563, "learning_rate": 0.0001, "loss": 1.8717, "loss/crossentropy": 2.4455777406692505, "loss/fcd": 1.6875, "loss/idx": 0.5, "loss/logits": 0.18417862057685852, "step": 94 }, { "epoch": 0.0014185562084232375, "grad_norm": 1.0390625, "grad_norm_var": 27.35882059733073, "learning_rate": 0.0001, "loss": 1.6424, "loss/crossentropy": 2.5914158821105957, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.1463368535041809, "step": 95 }, { "epoch": 0.001433488379038219, "grad_norm": 0.7890625, "grad_norm_var": 27.560646311442056, "learning_rate": 0.0001, "loss": 1.5349, "loss/crossentropy": 2.660555124282837, "loss/fcd": 1.3984375, "loss/idx": 0.5, "loss/logits": 0.1364329755306244, "step": 96 }, { "epoch": 0.0014484205496532003, "grad_norm": 0.76171875, "grad_norm_var": 27.695830726623534, "learning_rate": 0.0001, "loss": 1.6325, "loss/crossentropy": 2.461188316345215, "loss/fcd": 1.48046875, "loss/idx": 0.5, "loss/logits": 0.1520090326666832, "step": 97 }, { "epoch": 0.0014633527202681817, "grad_norm": 0.6953125, "grad_norm_var": 27.88910617828369, "learning_rate": 0.0001, "loss": 1.6549, "loss/crossentropy": 2.6517964601516724, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.15884529054164886, "step": 98 }, { "epoch": 0.0014782848908831633, "grad_norm": 0.8359375, "grad_norm_var": 28.111986223856608, "learning_rate": 0.0001, "loss": 1.5727, "loss/crossentropy": 2.5368658304214478, "loss/fcd": 1.43359375, "loss/idx": 0.5, "loss/logits": 0.13915108144283295, "step": 99 }, { "epoch": 0.0014932170614981447, "grad_norm": 0.84765625, "grad_norm_var": 28.26218058268229, "learning_rate": 0.0001, "loss": 1.9082, "loss/crossentropy": 2.5145949125289917, "loss/fcd": 1.6953125, "loss/idx": 0.5, "loss/logits": 0.21293380111455917, "step": 100 }, { "epoch": 0.001508149232113126, "grad_norm": 0.7890625, "grad_norm_var": 28.47071711222331, "learning_rate": 0.0001, "loss": 1.6666, "loss/crossentropy": 2.6361730098724365, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.17047739028930664, "step": 101 }, { "epoch": 0.0015230814027281075, "grad_norm": 0.90234375, "grad_norm_var": 28.61356601715088, "learning_rate": 0.0001, "loss": 1.8376, "loss/crossentropy": 2.5776859521865845, "loss/fcd": 1.640625, "loss/idx": 0.5, "loss/logits": 0.19692759215831757, "step": 102 }, { "epoch": 0.0015380135733430891, "grad_norm": 7.6875, "grad_norm_var": 30.174897702534995, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.398188829421997, "loss/fcd": 1.4765625, "loss/idx": 0.5, "loss/logits": 0.1287732645869255, "step": 103 }, { "epoch": 0.0015529457439580705, "grad_norm": 0.84765625, "grad_norm_var": 30.39253921508789, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.597610116004944, "loss/fcd": 1.46875, "loss/idx": 0.5, "loss/logits": 0.14246898889541626, "step": 104 }, { "epoch": 0.001567877914573052, "grad_norm": 0.92578125, "grad_norm_var": 30.507610003153484, "learning_rate": 0.0001, "loss": 1.9468, "loss/crossentropy": 2.6051762104034424, "loss/fcd": 1.73828125, "loss/idx": 0.5, "loss/logits": 0.20850396901369095, "step": 105 }, { "epoch": 0.0015828100851880333, "grad_norm": 0.7734375, "grad_norm_var": 2.9109150568644204, "learning_rate": 0.0001, "loss": 1.754, "loss/crossentropy": 2.248218297958374, "loss/fcd": 1.5703125, "loss/idx": 0.5, "loss/logits": 0.18368404731154442, "step": 106 }, { "epoch": 0.0015977422558030147, "grad_norm": 0.828125, "grad_norm_var": 2.921457354227702, "learning_rate": 0.0001, "loss": 1.592, "loss/crossentropy": 2.530861020088196, "loss/fcd": 1.453125, "loss/idx": 0.5, "loss/logits": 0.1389201432466507, "step": 107 }, { "epoch": 0.0016126744264179963, "grad_norm": 0.7734375, "grad_norm_var": 2.930629920959473, "learning_rate": 0.0001, "loss": 1.6142, "loss/crossentropy": 2.6938165426254272, "loss/fcd": 1.45703125, "loss/idx": 0.5, "loss/logits": 0.1571795791387558, "step": 108 }, { "epoch": 0.0016276065970329777, "grad_norm": 0.72265625, "grad_norm_var": 2.947409820556641, "learning_rate": 0.0001, "loss": 1.6045, "loss/crossentropy": 2.629867672920227, "loss/fcd": 1.4453125, "loss/idx": 0.5, "loss/logits": 0.15913766622543335, "step": 109 }, { "epoch": 0.0016425387676479591, "grad_norm": 0.63671875, "grad_norm_var": 2.9642145156860353, "learning_rate": 0.0001, "loss": 1.5314, "loss/crossentropy": 2.1994539499282837, "loss/fcd": 1.41015625, "loss/idx": 0.5, "loss/logits": 0.12125381454825401, "step": 110 }, { "epoch": 0.0016574709382629405, "grad_norm": 1.1015625, "grad_norm_var": 2.9627761205037433, "learning_rate": 0.0001, "loss": 1.7658, "loss/crossentropy": 2.5422849655151367, "loss/fcd": 1.57421875, "loss/idx": 0.5, "loss/logits": 0.19153974950313568, "step": 111 }, { "epoch": 0.0016724031088779221, "grad_norm": 0.95703125, "grad_norm_var": 2.9543312072753904, "learning_rate": 0.0001, "loss": 1.6532, "loss/crossentropy": 2.4952173233032227, "loss/fcd": 1.5078125, "loss/idx": 0.5, "loss/logits": 0.14537867531180382, "step": 112 }, { "epoch": 0.0016873352794929035, "grad_norm": 1.1640625, "grad_norm_var": 2.9379663467407227, "learning_rate": 0.0001, "loss": 1.6542, "loss/crossentropy": 2.6476502418518066, "loss/fcd": 1.49609375, "loss/idx": 0.5, "loss/logits": 0.1580853909254074, "step": 113 }, { "epoch": 0.001702267450107885, "grad_norm": 0.8359375, "grad_norm_var": 2.9282297134399413, "learning_rate": 0.0001, "loss": 1.5115, "loss/crossentropy": 2.4073606729507446, "loss/fcd": 1.37890625, "loss/idx": 0.5, "loss/logits": 0.1326112598180771, "step": 114 }, { "epoch": 0.0017171996207228663, "grad_norm": 0.71484375, "grad_norm_var": 2.9364662170410156, "learning_rate": 0.0001, "loss": 1.6048, "loss/crossentropy": 2.5658940076828003, "loss/fcd": 1.453125, "loss/idx": 0.5, "loss/logits": 0.15171286836266518, "step": 115 }, { "epoch": 0.001732131791337848, "grad_norm": 0.65625, "grad_norm_var": 2.949834124247233, "learning_rate": 0.0001, "loss": 1.5579, "loss/crossentropy": 2.807284355163574, "loss/fcd": 1.41015625, "loss/idx": 0.5, "loss/logits": 0.14778884127736092, "step": 116 }, { "epoch": 0.0017470639619528293, "grad_norm": 0.5703125, "grad_norm_var": 2.966845639546712, "learning_rate": 0.0001, "loss": 1.5297, "loss/crossentropy": 2.722702145576477, "loss/fcd": 1.3984375, "loss/idx": 0.5, "loss/logits": 0.1313047930598259, "step": 117 }, { "epoch": 0.0017619961325678107, "grad_norm": 0.609375, "grad_norm_var": 2.986028798421224, "learning_rate": 0.0001, "loss": 1.5226, "loss/crossentropy": 2.434728503227234, "loss/fcd": 1.38671875, "loss/idx": 0.5, "loss/logits": 0.13587873429059982, "step": 118 }, { "epoch": 0.0017769283031827921, "grad_norm": 0.796875, "grad_norm_var": 0.027905019124348958, "learning_rate": 0.0001, "loss": 1.5845, "loss/crossentropy": 2.431328058242798, "loss/fcd": 1.43359375, "loss/idx": 0.5, "loss/logits": 0.1509154662489891, "step": 119 }, { "epoch": 0.0017918604737977735, "grad_norm": 0.76171875, "grad_norm_var": 0.0279022216796875, "learning_rate": 0.0001, "loss": 1.6357, "loss/crossentropy": 2.545639991760254, "loss/fcd": 1.48046875, "loss/idx": 0.5, "loss/logits": 0.1552198976278305, "step": 120 }, { "epoch": 0.0018067926444127551, "grad_norm": 0.609375, "grad_norm_var": 0.028927040100097657, "learning_rate": 0.0001, "loss": 1.4819, "loss/crossentropy": 2.671551823616028, "loss/fcd": 1.35546875, "loss/idx": 0.5, "loss/logits": 0.1264294758439064, "step": 121 }, { "epoch": 0.0018217248150277365, "grad_norm": 0.7265625, "grad_norm_var": 0.029117774963378907, "learning_rate": 0.0001, "loss": 1.5531, "loss/crossentropy": 2.7223843336105347, "loss/fcd": 1.40625, "loss/idx": 0.5, "loss/logits": 0.14684423804283142, "step": 122 }, { "epoch": 0.001836656985642718, "grad_norm": 0.8125, "grad_norm_var": 0.029030799865722656, "learning_rate": 0.0001, "loss": 1.7009, "loss/crossentropy": 2.638063430786133, "loss/fcd": 1.53515625, "loss/idx": 0.5, "loss/logits": 0.16576898843050003, "step": 123 }, { "epoch": 0.0018515891562576993, "grad_norm": 0.7578125, "grad_norm_var": 0.02905572255452474, "learning_rate": 0.0001, "loss": 1.73, "loss/crossentropy": 2.351641535758972, "loss/fcd": 1.546875, "loss/idx": 0.5, "loss/logits": 0.18313253670930862, "step": 124 }, { "epoch": 0.001866521326872681, "grad_norm": 0.62890625, "grad_norm_var": 0.03028558095296224, "learning_rate": 0.0001, "loss": 1.5896, "loss/crossentropy": 2.654516100883484, "loss/fcd": 1.4375, "loss/idx": 0.5, "loss/logits": 0.15209884196519852, "step": 125 }, { "epoch": 0.0018814534974876623, "grad_norm": 0.7109375, "grad_norm_var": 0.02929865519205729, "learning_rate": 0.0001, "loss": 1.8795, "loss/crossentropy": 2.376212000846863, "loss/fcd": 1.671875, "loss/idx": 0.5, "loss/logits": 0.20766521990299225, "step": 126 }, { "epoch": 0.0018963856681026437, "grad_norm": 0.5859375, "grad_norm_var": 0.023524729410807292, "learning_rate": 0.0001, "loss": 1.5293, "loss/crossentropy": 2.6014784574508667, "loss/fcd": 1.390625, "loss/idx": 0.5, "loss/logits": 0.1386614888906479, "step": 127 }, { "epoch": 0.0019113178387176251, "grad_norm": 0.828125, "grad_norm_var": 0.02089583079020182, "learning_rate": 0.0001, "loss": 1.5522, "loss/crossentropy": 2.6718069314956665, "loss/fcd": 1.4140625, "loss/idx": 0.5, "loss/logits": 0.13817449286580086, "step": 128 }, { "epoch": 0.0019262500093326065, "grad_norm": 0.7890625, "grad_norm_var": 0.008261553446451823, "learning_rate": 0.0001, "loss": 1.5992, "loss/crossentropy": 2.6475735902786255, "loss/fcd": 1.44921875, "loss/idx": 0.5, "loss/logits": 0.14993533492088318, "step": 129 }, { "epoch": 0.0019411821799475881, "grad_norm": 0.8046875, "grad_norm_var": 0.007806841532389323, "learning_rate": 0.0001, "loss": 1.6639, "loss/crossentropy": 2.257818102836609, "loss/fcd": 1.50390625, "loss/idx": 0.5, "loss/logits": 0.15997718274593353, "step": 130 }, { "epoch": 0.0019561143505625695, "grad_norm": 0.8046875, "grad_norm_var": 0.008366902669270834, "learning_rate": 0.0001, "loss": 1.7219, "loss/crossentropy": 2.6239601373672485, "loss/fcd": 1.55078125, "loss/idx": 0.5, "loss/logits": 0.1711440533399582, "step": 131 }, { "epoch": 0.001971046521177551, "grad_norm": 0.69921875, "grad_norm_var": 0.008141009012858073, "learning_rate": 0.0001, "loss": 1.6674, "loss/crossentropy": 2.5232131481170654, "loss/fcd": 1.50390625, "loss/idx": 0.5, "loss/logits": 0.16352446377277374, "step": 132 }, { "epoch": 0.0019859786917925323, "grad_norm": 0.67578125, "grad_norm_var": 0.0067522684733072914, "learning_rate": 0.0001, "loss": 1.5317, "loss/crossentropy": 2.3178855180740356, "loss/fcd": 1.39453125, "loss/idx": 0.5, "loss/logits": 0.13720271736383438, "step": 133 }, { "epoch": 0.0020009108624075137, "grad_norm": 27.875, "grad_norm_var": 46.04944636027018, "learning_rate": 0.0001, "loss": 2.6756, "loss/crossentropy": 2.7906564474105835, "loss/fcd": 2.4296875, "loss/idx": 1.0, "loss/logits": 0.2459193617105484, "step": 134 }, { "epoch": 0.002015843033022495, "grad_norm": 44.0, "grad_norm_var": 153.3034543355306, "learning_rate": 0.0001, "loss": 3.2793, "loss/crossentropy": 2.7848106622695923, "loss/fcd": 3.0078125, "loss/idx": 1.0, "loss/logits": 0.27151423692703247, "step": 135 }, { "epoch": 0.002030775203637477, "grad_norm": 43.75, "grad_norm_var": 243.7684579849243, "learning_rate": 0.0001, "loss": 3.504, "loss/crossentropy": 2.664074659347534, "loss/fcd": 3.1875, "loss/idx": 1.0, "loss/logits": 0.3164883255958557, "step": 136 }, { "epoch": 0.0020457073742524584, "grad_norm": 40.75, "grad_norm_var": 305.90149377187095, "learning_rate": 0.0001, "loss": 3.3873, "loss/crossentropy": 2.8510701656341553, "loss/fcd": 3.09375, "loss/idx": 1.0, "loss/logits": 0.2935274988412857, "step": 137 }, { "epoch": 0.0020606395448674398, "grad_norm": 37.5, "grad_norm_var": 343.3572509129842, "learning_rate": 0.0001, "loss": 3.3702, "loss/crossentropy": 2.5609227418899536, "loss/fcd": 3.0859375, "loss/idx": 1.0, "loss/logits": 0.2842549532651901, "step": 138 }, { "epoch": 0.002075571715482421, "grad_norm": 37.25, "grad_norm_var": 368.9572041193644, "learning_rate": 0.0001, "loss": 4.486, "loss/crossentropy": 2.607009768486023, "loss/fcd": 3.8515625, "loss/idx": 1.0, "loss/logits": 0.6344500631093979, "step": 139 }, { "epoch": 0.0020905038860974025, "grad_norm": 34.25, "grad_norm_var": 375.90857741038, "learning_rate": 0.0001, "loss": 3.0759, "loss/crossentropy": 2.759778141975403, "loss/fcd": 2.828125, "loss/idx": 1.0, "loss/logits": 0.24779706448316574, "step": 140 }, { "epoch": 0.002105436056712384, "grad_norm": 34.0, "grad_norm_var": 372.6947629292806, "learning_rate": 0.0001, "loss": 3.3188, "loss/crossentropy": 2.3219879865646362, "loss/fcd": 3.0390625, "loss/idx": 1.0, "loss/logits": 0.27973373234272003, "step": 141 }, { "epoch": 0.0021203682273273653, "grad_norm": 33.25, "grad_norm_var": 359.17601114908854, "learning_rate": 0.0001, "loss": 3.1591, "loss/crossentropy": 2.8131046295166016, "loss/fcd": 2.8984375, "loss/idx": 1.0, "loss/logits": 0.2606983706355095, "step": 142 }, { "epoch": 0.0021353003979423467, "grad_norm": 33.0, "grad_norm_var": 336.12636286417643, "learning_rate": 0.0001, "loss": 3.1873, "loss/crossentropy": 2.613773822784424, "loss/fcd": 2.90625, "loss/idx": 1.0, "loss/logits": 0.2810151129961014, "step": 143 }, { "epoch": 0.002150232568557328, "grad_norm": 28.0, "grad_norm_var": 301.4397661844889, "learning_rate": 0.0001, "loss": 3.1588, "loss/crossentropy": 2.1197726130485535, "loss/fcd": 2.90625, "loss/idx": 1.0, "loss/logits": 0.25257067382335663, "step": 144 }, { "epoch": 0.00216516473917231, "grad_norm": 28.5, "grad_norm_var": 260.5796641031901, "learning_rate": 0.0001, "loss": 2.9352, "loss/crossentropy": 2.5937873125076294, "loss/fcd": 2.703125, "loss/idx": 1.0, "loss/logits": 0.23209355771541595, "step": 145 }, { "epoch": 0.0021800969097872914, "grad_norm": 27.25, "grad_norm_var": 213.44209976196288, "learning_rate": 0.0001, "loss": 3.0205, "loss/crossentropy": 2.4897440671920776, "loss/fcd": 2.765625, "loss/idx": 1.0, "loss/logits": 0.254846952855587, "step": 146 }, { "epoch": 0.0021950290804022728, "grad_norm": 31.625, "grad_norm_var": 160.14161987304686, "learning_rate": 0.0001, "loss": 3.1151, "loss/crossentropy": 2.679656982421875, "loss/fcd": 2.84375, "loss/idx": 1.0, "loss/logits": 0.27131521701812744, "step": 147 }, { "epoch": 0.002209961251017254, "grad_norm": 25.75, "grad_norm_var": 100.99951419830322, "learning_rate": 0.0001, "loss": 2.9074, "loss/crossentropy": 2.7504318952560425, "loss/fcd": 2.671875, "loss/idx": 1.0, "loss/logits": 0.23552851378917694, "step": 148 }, { "epoch": 0.0022248934216322356, "grad_norm": 23.875, "grad_norm_var": 38.628580729166664, "learning_rate": 0.0001, "loss": 2.9145, "loss/crossentropy": 2.801925301551819, "loss/fcd": 2.6640625, "loss/idx": 1.0, "loss/logits": 0.2504773437976837, "step": 149 }, { "epoch": 0.002239825592247217, "grad_norm": 22.625, "grad_norm_var": 44.05358072916667, "learning_rate": 0.0001, "loss": 2.8546, "loss/crossentropy": 2.5748791694641113, "loss/fcd": 2.609375, "loss/idx": 1.0, "loss/logits": 0.24526876956224442, "step": 150 }, { "epoch": 0.0022547577628621984, "grad_norm": 22.25, "grad_norm_var": 41.244205729166666, "learning_rate": 0.0001, "loss": 2.6247, "loss/crossentropy": 2.8942376375198364, "loss/fcd": 2.4296875, "loss/idx": 1.0, "loss/logits": 0.1949758157134056, "step": 151 }, { "epoch": 0.0022696899334771798, "grad_norm": 23.25, "grad_norm_var": 33.96243489583333, "learning_rate": 0.0001, "loss": 2.8511, "loss/crossentropy": 2.8165948390960693, "loss/fcd": 2.625, "loss/idx": 1.0, "loss/logits": 0.22610026597976685, "step": 152 }, { "epoch": 0.002284622104092161, "grad_norm": 21.5, "grad_norm_var": 30.0322265625, "learning_rate": 0.0001, "loss": 2.8665, "loss/crossentropy": 2.9259716272354126, "loss/fcd": 2.625, "loss/idx": 1.0, "loss/logits": 0.24152649194002151, "step": 153 }, { "epoch": 0.002299554274707143, "grad_norm": 21.5, "grad_norm_var": 27.8822265625, "learning_rate": 0.0001, "loss": 3.0903, "loss/crossentropy": 2.579954981803894, "loss/fcd": 2.78125, "loss/idx": 1.0, "loss/logits": 0.30902038514614105, "step": 154 }, { "epoch": 0.0023144864453221244, "grad_norm": 17.375, "grad_norm_var": 28.0375, "learning_rate": 0.0001, "loss": 2.7236, "loss/crossentropy": 2.7883050441741943, "loss/fcd": 2.5078125, "loss/idx": 1.0, "loss/logits": 0.21582189947366714, "step": 155 }, { "epoch": 0.0023294186159371058, "grad_norm": 16.25, "grad_norm_var": 30.2875, "learning_rate": 0.0001, "loss": 2.7658, "loss/crossentropy": 2.316504955291748, "loss/fcd": 2.5234375, "loss/idx": 1.0, "loss/logits": 0.24237968027591705, "step": 156 }, { "epoch": 0.002344350786552087, "grad_norm": 15.1875, "grad_norm_var": 31.399593098958334, "learning_rate": 0.0001, "loss": 2.7595, "loss/crossentropy": 2.7337818145751953, "loss/fcd": 2.515625, "loss/idx": 1.0, "loss/logits": 0.24386876821517944, "step": 157 }, { "epoch": 0.0023592829571670686, "grad_norm": 14.3125, "grad_norm_var": 31.591927083333335, "learning_rate": 0.0001, "loss": 2.6083, "loss/crossentropy": 2.8262592554092407, "loss/fcd": 2.40625, "loss/idx": 1.0, "loss/logits": 0.20209631323814392, "step": 158 }, { "epoch": 0.00237421512778205, "grad_norm": 13.5, "grad_norm_var": 30.048177083333332, "learning_rate": 0.0001, "loss": 2.7447, "loss/crossentropy": 2.6527985334396362, "loss/fcd": 2.5, "loss/idx": 1.0, "loss/logits": 0.24472637474536896, "step": 159 }, { "epoch": 0.0023891472983970314, "grad_norm": 13.125, "grad_norm_var": 32.070247395833334, "learning_rate": 0.0001, "loss": 2.453, "loss/crossentropy": 2.714062452316284, "loss/fcd": 2.2578125, "loss/idx": 1.0, "loss/logits": 0.19521619379520416, "step": 160 }, { "epoch": 0.0024040794690120128, "grad_norm": 14.0625, "grad_norm_var": 30.885921223958334, "learning_rate": 0.0001, "loss": 2.7171, "loss/crossentropy": 2.446950674057007, "loss/fcd": 2.5078125, "loss/idx": 1.0, "loss/logits": 0.20926345884799957, "step": 161 }, { "epoch": 0.0024190116396269946, "grad_norm": 13.5, "grad_norm_var": 29.804541015625, "learning_rate": 0.0001, "loss": 2.9389, "loss/crossentropy": 2.5623066425323486, "loss/fcd": 2.6796875, "loss/idx": 1.0, "loss/logits": 0.25920529663562775, "step": 162 }, { "epoch": 0.002433943810241976, "grad_norm": 13.4375, "grad_norm_var": 20.725, "learning_rate": 0.0001, "loss": 2.5012, "loss/crossentropy": 2.540536642074585, "loss/fcd": 2.2890625, "loss/idx": 1.0, "loss/logits": 0.2121095061302185, "step": 163 }, { "epoch": 0.0024488759808569574, "grad_norm": 14.125, "grad_norm_var": 17.4978515625, "learning_rate": 0.0001, "loss": 2.6137, "loss/crossentropy": 2.8521658182144165, "loss/fcd": 2.375, "loss/idx": 1.0, "loss/logits": 0.2387429103255272, "step": 164 }, { "epoch": 0.0024638081514719388, "grad_norm": 13.625, "grad_norm_var": 15.341080729166666, "learning_rate": 0.0001, "loss": 2.9265, "loss/crossentropy": 2.394273519515991, "loss/fcd": 2.640625, "loss/idx": 1.0, "loss/logits": 0.28584469854831696, "step": 165 }, { "epoch": 0.00247874032208692, "grad_norm": 14.0, "grad_norm_var": 13.351041666666667, "learning_rate": 0.0001, "loss": 2.4902, "loss/crossentropy": 2.939634919166565, "loss/fcd": 2.2734375, "loss/idx": 1.0, "loss/logits": 0.21672210842370987, "step": 166 }, { "epoch": 0.0024936724927019016, "grad_norm": 13.3125, "grad_norm_var": 11.267952473958333, "learning_rate": 0.0001, "loss": 2.5294, "loss/crossentropy": 2.645583152770996, "loss/fcd": 2.296875, "loss/idx": 1.0, "loss/logits": 0.23250159621238708, "step": 167 }, { "epoch": 0.002508604663316883, "grad_norm": 11.9375, "grad_norm_var": 7.959635416666667, "learning_rate": 0.0001, "loss": 2.3236, "loss/crossentropy": 2.652674078941345, "loss/fcd": 2.1328125, "loss/idx": 1.0, "loss/logits": 0.1908191666007042, "step": 168 }, { "epoch": 0.0025235368339318644, "grad_norm": 10.875, "grad_norm_var": 5.873372395833333, "learning_rate": 0.0001, "loss": 2.4839, "loss/crossentropy": 2.424571990966797, "loss/fcd": 2.234375, "loss/idx": 1.0, "loss/logits": 0.2495182603597641, "step": 169 }, { "epoch": 0.0025384690045468458, "grad_norm": 12.375, "grad_norm_var": 2.418229166666667, "learning_rate": 0.0001, "loss": 2.5325, "loss/crossentropy": 2.4959967136383057, "loss/fcd": 2.3046875, "loss/idx": 1.0, "loss/logits": 0.22784889489412308, "step": 170 }, { "epoch": 0.0025534011751618276, "grad_norm": 12.0625, "grad_norm_var": 1.6587076822916667, "learning_rate": 0.0001, "loss": 2.4315, "loss/crossentropy": 2.6240097284317017, "loss/fcd": 2.21875, "loss/idx": 1.0, "loss/logits": 0.21272272616624832, "step": 171 }, { "epoch": 0.002568333345776809, "grad_norm": 12.5, "grad_norm_var": 1.1528483072916667, "learning_rate": 0.0001, "loss": 2.6707, "loss/crossentropy": 2.7601370811462402, "loss/fcd": 2.3984375, "loss/idx": 1.0, "loss/logits": 0.27226200699806213, "step": 172 }, { "epoch": 0.0025832655163917904, "grad_norm": 12.5625, "grad_norm_var": 0.9040201822916667, "learning_rate": 0.0001, "loss": 2.596, "loss/crossentropy": 2.7772231101989746, "loss/fcd": 2.3671875, "loss/idx": 1.0, "loss/logits": 0.22885487973690033, "step": 173 }, { "epoch": 0.002598197687006772, "grad_norm": 13.5625, "grad_norm_var": 0.8161295572916667, "learning_rate": 0.0001, "loss": 2.905, "loss/crossentropy": 2.641968846321106, "loss/fcd": 2.625, "loss/idx": 1.0, "loss/logits": 0.2800135463476181, "step": 174 }, { "epoch": 0.002613129857621753, "grad_norm": 12.6875, "grad_norm_var": 0.80703125, "learning_rate": 0.0001, "loss": 2.9335, "loss/crossentropy": 2.4210065603256226, "loss/fcd": 2.6640625, "loss/idx": 1.0, "loss/logits": 0.2694525122642517, "step": 175 }, { "epoch": 0.0026280620282367346, "grad_norm": 12.5, "grad_norm_var": 0.8197265625, "learning_rate": 0.0001, "loss": 2.8887, "loss/crossentropy": 2.781251311302185, "loss/fcd": 2.6015625, "loss/idx": 1.0, "loss/logits": 0.28714829683303833, "step": 176 }, { "epoch": 0.002642994198851716, "grad_norm": 11.0, "grad_norm_var": 0.9497233072916667, "learning_rate": 0.0001, "loss": 2.9995, "loss/crossentropy": 2.67316734790802, "loss/fcd": 2.703125, "loss/idx": 1.0, "loss/logits": 0.29639333486557007, "step": 177 }, { "epoch": 0.0026579263694666974, "grad_norm": 10.375, "grad_norm_var": 1.2492024739583334, "learning_rate": 0.0001, "loss": 2.7215, "loss/crossentropy": 2.549267888069153, "loss/fcd": 2.4765625, "loss/idx": 1.0, "loss/logits": 0.2449359893798828, "step": 178 }, { "epoch": 0.0026728585400816788, "grad_norm": 10.5625, "grad_norm_var": 1.4288899739583334, "learning_rate": 0.0001, "loss": 2.7281, "loss/crossentropy": 2.6656607389450073, "loss/fcd": 2.4921875, "loss/idx": 1.0, "loss/logits": 0.23593086749315262, "step": 179 }, { "epoch": 0.0026877907106966606, "grad_norm": 9.0625, "grad_norm_var": 1.8520833333333333, "learning_rate": 0.0001, "loss": 2.8713, "loss/crossentropy": 2.4903770685195923, "loss/fcd": 2.59375, "loss/idx": 1.0, "loss/logits": 0.2775098979473114, "step": 180 }, { "epoch": 0.002702722881311642, "grad_norm": 8.3125, "grad_norm_var": 2.509228515625, "learning_rate": 0.0001, "loss": 2.7875, "loss/crossentropy": 2.9355252981185913, "loss/fcd": 2.5078125, "loss/idx": 1.0, "loss/logits": 0.2797327786684036, "step": 181 }, { "epoch": 0.0027176550519266234, "grad_norm": 8.5625, "grad_norm_var": 2.71171875, "learning_rate": 0.0001, "loss": 2.6506, "loss/crossentropy": 2.7791460752487183, "loss/fcd": 2.4296875, "loss/idx": 1.0, "loss/logits": 0.22087856382131577, "step": 182 }, { "epoch": 0.002732587222541605, "grad_norm": 7.5625, "grad_norm_var": 3.3046875, "learning_rate": 0.0001, "loss": 2.7138, "loss/crossentropy": 2.7682063579559326, "loss/fcd": 2.46875, "loss/idx": 1.0, "loss/logits": 0.24506456404924393, "step": 183 }, { "epoch": 0.002747519393156586, "grad_norm": 7.15625, "grad_norm_var": 4.15572509765625, "learning_rate": 0.0001, "loss": 2.5292, "loss/crossentropy": 2.508063316345215, "loss/fcd": 2.3046875, "loss/idx": 1.0, "loss/logits": 0.2244826927781105, "step": 184 }, { "epoch": 0.0027624515637715676, "grad_norm": 7.53125, "grad_norm_var": 4.790950520833333, "learning_rate": 0.0001, "loss": 2.6223, "loss/crossentropy": 2.9126468896865845, "loss/fcd": 2.3828125, "loss/idx": 1.0, "loss/logits": 0.23952852189540863, "step": 185 }, { "epoch": 0.002777383734386549, "grad_norm": 7.0, "grad_norm_var": 5.269661458333333, "learning_rate": 0.0001, "loss": 2.615, "loss/crossentropy": 2.417146325111389, "loss/fcd": 2.390625, "loss/idx": 1.0, "loss/logits": 0.22438553720712662, "step": 186 }, { "epoch": 0.0027923159050015304, "grad_norm": 6.5625, "grad_norm_var": 5.785286458333333, "learning_rate": 0.0001, "loss": 2.5657, "loss/crossentropy": 2.8265384435653687, "loss/fcd": 2.3359375, "loss/idx": 1.0, "loss/logits": 0.2297244518995285, "step": 187 }, { "epoch": 0.002807248075616512, "grad_norm": 6.71875, "grad_norm_var": 5.826688639322916, "learning_rate": 0.0001, "loss": 2.4346, "loss/crossentropy": 2.577815532684326, "loss/fcd": 2.2265625, "loss/idx": 1.0, "loss/logits": 0.20803897082805634, "step": 188 }, { "epoch": 0.0028221802462314936, "grad_norm": 6.25, "grad_norm_var": 5.72476806640625, "learning_rate": 0.0001, "loss": 2.5743, "loss/crossentropy": 2.7740859985351562, "loss/fcd": 2.328125, "loss/idx": 1.0, "loss/logits": 0.24621784687042236, "step": 189 }, { "epoch": 0.002837112416846475, "grad_norm": 7.78125, "grad_norm_var": 4.364518229166666, "learning_rate": 0.0001, "loss": 3.1307, "loss/crossentropy": 2.6582623720169067, "loss/fcd": 2.8203125, "loss/idx": 1.0, "loss/logits": 0.3103819936513901, "step": 190 }, { "epoch": 0.0028520445874614564, "grad_norm": 5.375, "grad_norm_var": 3.8446451822916665, "learning_rate": 0.0001, "loss": 2.3101, "loss/crossentropy": 2.8031907081604004, "loss/fcd": 2.109375, "loss/idx": 1.0, "loss/logits": 0.20073574036359787, "step": 191 }, { "epoch": 0.002866976758076438, "grad_norm": 5.53125, "grad_norm_var": 2.949051920572917, "learning_rate": 0.0001, "loss": 2.4215, "loss/crossentropy": 2.6993457078933716, "loss/fcd": 2.21875, "loss/idx": 1.0, "loss/logits": 0.20273278653621674, "step": 192 }, { "epoch": 0.002881908928691419, "grad_norm": 5.21875, "grad_norm_var": 2.597509765625, "learning_rate": 0.0001, "loss": 2.357, "loss/crossentropy": 2.544357180595398, "loss/fcd": 2.1484375, "loss/idx": 1.0, "loss/logits": 0.20859003067016602, "step": 193 }, { "epoch": 0.0028968410993064006, "grad_norm": 6.125, "grad_norm_var": 2.0817545572916667, "learning_rate": 0.0001, "loss": 2.4965, "loss/crossentropy": 2.489700198173523, "loss/fcd": 2.265625, "loss/idx": 1.0, "loss/logits": 0.23092350363731384, "step": 194 }, { "epoch": 0.002911773269921382, "grad_norm": 4.8125, "grad_norm_var": 1.575634765625, "learning_rate": 0.0001, "loss": 2.3381, "loss/crossentropy": 2.7617045640945435, "loss/fcd": 2.1328125, "loss/idx": 1.0, "loss/logits": 0.20526950061321259, "step": 195 }, { "epoch": 0.0029267054405363634, "grad_norm": 4.71875, "grad_norm_var": 1.47213134765625, "learning_rate": 0.0001, "loss": 2.2927, "loss/crossentropy": 2.5566166639328003, "loss/fcd": 2.09375, "loss/idx": 1.0, "loss/logits": 0.1989428475499153, "step": 196 }, { "epoch": 0.0029416376111513452, "grad_norm": 4.40625, "grad_norm_var": 1.5214680989583333, "learning_rate": 0.0001, "loss": 2.3069, "loss/crossentropy": 2.708446979522705, "loss/fcd": 2.09375, "loss/idx": 1.0, "loss/logits": 0.21319883316755295, "step": 197 }, { "epoch": 0.0029565697817663266, "grad_norm": 4.59375, "grad_norm_var": 1.3256144205729166, "learning_rate": 0.0001, "loss": 2.2289, "loss/crossentropy": 2.3388549089431763, "loss/fcd": 2.0390625, "loss/idx": 1.0, "loss/logits": 0.18980170786380768, "step": 198 }, { "epoch": 0.002971501952381308, "grad_norm": 4.15625, "grad_norm_var": 1.3792805989583334, "learning_rate": 0.0001, "loss": 2.2524, "loss/crossentropy": 2.5469167232513428, "loss/fcd": 2.0546875, "loss/idx": 1.0, "loss/logits": 0.19768796861171722, "step": 199 }, { "epoch": 0.0029864341229962894, "grad_norm": 3.921875, "grad_norm_var": 1.4788808186848958, "learning_rate": 0.0001, "loss": 2.1652, "loss/crossentropy": 2.4613125324249268, "loss/fcd": 1.984375, "loss/idx": 1.0, "loss/logits": 0.18085117638111115, "step": 200 }, { "epoch": 0.003001366293611271, "grad_norm": 4.15625, "grad_norm_var": 1.3527577718098958, "learning_rate": 0.0001, "loss": 2.2654, "loss/crossentropy": 2.6382863521575928, "loss/fcd": 2.0546875, "loss/idx": 1.0, "loss/logits": 0.21074112504720688, "step": 201 }, { "epoch": 0.003016298464226252, "grad_norm": 3.78125, "grad_norm_var": 1.338508097330729, "learning_rate": 0.0001, "loss": 2.3743, "loss/crossentropy": 2.652292013168335, "loss/fcd": 2.14453125, "loss/idx": 1.0, "loss/logits": 0.229776993393898, "step": 202 }, { "epoch": 0.0030312306348412336, "grad_norm": 3.40625, "grad_norm_var": 1.4116607666015626, "learning_rate": 0.0001, "loss": 2.236, "loss/crossentropy": 2.683787226676941, "loss/fcd": 2.03125, "loss/idx": 1.0, "loss/logits": 0.20470323413610458, "step": 203 }, { "epoch": 0.003046162805456215, "grad_norm": 3.6875, "grad_norm_var": 1.3153554280598958, "learning_rate": 0.0001, "loss": 2.0852, "loss/crossentropy": 2.662025213241577, "loss/fcd": 1.921875, "loss/idx": 1.0, "loss/logits": 0.16330592334270477, "step": 204 }, { "epoch": 0.0030610949760711964, "grad_norm": 4.0625, "grad_norm_var": 1.2119618733723958, "learning_rate": 0.0001, "loss": 2.6026, "loss/crossentropy": 2.5425872802734375, "loss/fcd": 2.3359375, "loss/idx": 1.0, "loss/logits": 0.26663239300251007, "step": 205 }, { "epoch": 0.0030760271466861782, "grad_norm": 4.21875, "grad_norm_var": 0.5574452718098958, "learning_rate": 0.0001, "loss": 2.2165, "loss/crossentropy": 2.7666549682617188, "loss/fcd": 2.03125, "loss/idx": 1.0, "loss/logits": 0.1852124035358429, "step": 206 }, { "epoch": 0.0030909593173011596, "grad_norm": 3.515625, "grad_norm_var": 0.5592610677083333, "learning_rate": 0.0001, "loss": 2.1929, "loss/crossentropy": 2.853086471557617, "loss/fcd": 2.0078125, "loss/idx": 1.0, "loss/logits": 0.18510984629392624, "step": 207 }, { "epoch": 0.003105891487916141, "grad_norm": 3.03125, "grad_norm_var": 0.5709798177083333, "learning_rate": 0.0001, "loss": 2.3248, "loss/crossentropy": 2.38408362865448, "loss/fcd": 2.125, "loss/idx": 1.0, "loss/logits": 0.1997532695531845, "step": 208 }, { "epoch": 0.0031208236585311224, "grad_norm": 2.59375, "grad_norm_var": 0.6584798177083333, "learning_rate": 0.0001, "loss": 2.2643, "loss/crossentropy": 2.7543792724609375, "loss/fcd": 2.04296875, "loss/idx": 1.0, "loss/logits": 0.22130031883716583, "step": 209 }, { "epoch": 0.003135755829146104, "grad_norm": 5.4375, "grad_norm_var": 0.5000325520833333, "learning_rate": 0.0001, "loss": 2.1592, "loss/crossentropy": 2.7001391649246216, "loss/fcd": 1.9765625, "loss/idx": 1.0, "loss/logits": 0.1826585754752159, "step": 210 }, { "epoch": 0.0031506879997610852, "grad_norm": 4.5, "grad_norm_var": 0.473583984375, "learning_rate": 0.0001, "loss": 2.1441, "loss/crossentropy": 2.70257830619812, "loss/fcd": 1.96875, "loss/idx": 1.0, "loss/logits": 0.17531797289848328, "step": 211 }, { "epoch": 0.0031656201703760666, "grad_norm": 2.609375, "grad_norm_var": 0.5528228759765625, "learning_rate": 0.0001, "loss": 2.0612, "loss/crossentropy": 2.393709421157837, "loss/fcd": 1.890625, "loss/idx": 1.0, "loss/logits": 0.170525424182415, "step": 212 }, { "epoch": 0.003180552340991048, "grad_norm": 4.28125, "grad_norm_var": 0.5450266520182292, "learning_rate": 0.0001, "loss": 2.6544, "loss/crossentropy": 2.5453147888183594, "loss/fcd": 2.375, "loss/idx": 1.0, "loss/logits": 0.27943994104862213, "step": 213 }, { "epoch": 0.0031954845116060294, "grad_norm": 2.125, "grad_norm_var": 0.6883941650390625, "learning_rate": 0.0001, "loss": 2.067, "loss/crossentropy": 2.6982057094573975, "loss/fcd": 1.88671875, "loss/idx": 1.0, "loss/logits": 0.1802719309926033, "step": 214 }, { "epoch": 0.0032104166822210112, "grad_norm": 2.78125, "grad_norm_var": 0.7261708577473959, "learning_rate": 0.0001, "loss": 2.0135, "loss/crossentropy": 2.5155314207077026, "loss/fcd": 1.86328125, "loss/idx": 1.0, "loss/logits": 0.15022382885217667, "step": 215 }, { "epoch": 0.0032253488528359926, "grad_norm": 2.65625, "grad_norm_var": 0.7773396809895833, "learning_rate": 0.0001, "loss": 2.2039, "loss/crossentropy": 2.611793875694275, "loss/fcd": 1.99609375, "loss/idx": 1.0, "loss/logits": 0.20781183242797852, "step": 216 }, { "epoch": 0.003240281023450974, "grad_norm": 2.15625, "grad_norm_var": 0.8664021809895833, "learning_rate": 0.0001, "loss": 1.9168, "loss/crossentropy": 2.9469950199127197, "loss/fcd": 1.765625, "loss/idx": 1.0, "loss/logits": 0.15117117017507553, "step": 217 }, { "epoch": 0.0032552131940659554, "grad_norm": 1.96875, "grad_norm_var": 0.98629150390625, "learning_rate": 0.0001, "loss": 2.033, "loss/crossentropy": 2.321299910545349, "loss/fcd": 1.859375, "loss/idx": 1.0, "loss/logits": 0.1736186519265175, "step": 218 }, { "epoch": 0.003270145364680937, "grad_norm": 2.9375, "grad_norm_var": 0.994287109375, "learning_rate": 0.0001, "loss": 2.3963, "loss/crossentropy": 2.4344794750213623, "loss/fcd": 2.1328125, "loss/idx": 1.0, "loss/logits": 0.26349252462387085, "step": 219 }, { "epoch": 0.0032850775352959182, "grad_norm": 1.9296875, "grad_norm_var": 1.0931068420410157, "learning_rate": 0.0001, "loss": 2.1104, "loss/crossentropy": 2.6011242866516113, "loss/fcd": 1.91796875, "loss/idx": 1.0, "loss/logits": 0.1924804523587227, "step": 220 }, { "epoch": 0.0033000097059108996, "grad_norm": 1.859375, "grad_norm_var": 1.1358497619628907, "learning_rate": 0.0001, "loss": 2.0349, "loss/crossentropy": 2.6590973138809204, "loss/fcd": 1.86328125, "loss/idx": 1.0, "loss/logits": 0.17158202826976776, "step": 221 }, { "epoch": 0.003314941876525881, "grad_norm": 1.828125, "grad_norm_var": 1.1165504455566406, "learning_rate": 0.0001, "loss": 1.9973, "loss/crossentropy": 2.5507930517196655, "loss/fcd": 1.828125, "loss/idx": 1.0, "loss/logits": 0.16922374814748764, "step": 222 }, { "epoch": 0.0033298740471408624, "grad_norm": 4.21875, "grad_norm_var": 1.2062721252441406, "learning_rate": 0.0001, "loss": 2.1822, "loss/crossentropy": 2.44870126247406, "loss/fcd": 2.0078125, "loss/idx": 1.0, "loss/logits": 0.17442134022712708, "step": 223 }, { "epoch": 0.0033448062177558443, "grad_norm": 2.328125, "grad_norm_var": 1.2278785705566406, "learning_rate": 0.0001, "loss": 2.0348, "loss/crossentropy": 2.673762083053589, "loss/fcd": 1.8515625, "loss/idx": 1.0, "loss/logits": 0.18328066915273666, "step": 224 }, { "epoch": 0.0033597383883708257, "grad_norm": 1.6875, "grad_norm_var": 1.3147865295410157, "learning_rate": 0.0001, "loss": 1.8397, "loss/crossentropy": 2.640251398086548, "loss/fcd": 1.70703125, "loss/idx": 1.0, "loss/logits": 0.13263830170035362, "step": 225 }, { "epoch": 0.003374670558985807, "grad_norm": 2.953125, "grad_norm_var": 0.8373207092285156, "learning_rate": 0.0001, "loss": 2.2305, "loss/crossentropy": 2.267286777496338, "loss/fcd": 2.03515625, "loss/idx": 1.0, "loss/logits": 0.19532842934131622, "step": 226 }, { "epoch": 0.0033896027296007884, "grad_norm": 1.625, "grad_norm_var": 0.6548255920410156, "learning_rate": 0.0001, "loss": 1.9428, "loss/crossentropy": 2.665201783180237, "loss/fcd": 1.77734375, "loss/idx": 1.0, "loss/logits": 0.1654854491353035, "step": 227 }, { "epoch": 0.00340453490021577, "grad_norm": 2.125, "grad_norm_var": 0.6622047424316406, "learning_rate": 0.0001, "loss": 2.2125, "loss/crossentropy": 2.5612581968307495, "loss/fcd": 2.0234375, "loss/idx": 1.0, "loss/logits": 0.18906734883785248, "step": 228 }, { "epoch": 0.0034194670708307512, "grad_norm": 1.9765625, "grad_norm_var": 0.43646240234375, "learning_rate": 0.0001, "loss": 2.3429, "loss/crossentropy": 2.2903116941452026, "loss/fcd": 2.1015625, "loss/idx": 1.0, "loss/logits": 0.24134419858455658, "step": 229 }, { "epoch": 0.0034343992414457326, "grad_norm": 1.7109375, "grad_norm_var": 0.45806859334309896, "learning_rate": 0.0001, "loss": 2.0235, "loss/crossentropy": 2.728991985321045, "loss/fcd": 1.84375, "loss/idx": 1.0, "loss/logits": 0.1797611489892006, "step": 230 }, { "epoch": 0.003449331412060714, "grad_norm": 2.140625, "grad_norm_var": 0.44230321248372395, "learning_rate": 0.0001, "loss": 2.0665, "loss/crossentropy": 2.8149020671844482, "loss/fcd": 1.87890625, "loss/idx": 1.0, "loss/logits": 0.1875438541173935, "step": 231 }, { "epoch": 0.003464263582675696, "grad_norm": 2.84375, "grad_norm_var": 0.45449803670247396, "learning_rate": 0.0001, "loss": 2.253, "loss/crossentropy": 2.5558494329452515, "loss/fcd": 2.0390625, "loss/idx": 1.0, "loss/logits": 0.2139800414443016, "step": 232 }, { "epoch": 0.0034791957532906773, "grad_norm": 1.7421875, "grad_norm_var": 0.47138671875, "learning_rate": 0.0001, "loss": 1.9924, "loss/crossentropy": 2.5154411792755127, "loss/fcd": 1.8203125, "loss/idx": 1.0, "loss/logits": 0.17209748923778534, "step": 233 }, { "epoch": 0.0034941279239056587, "grad_norm": 1.6171875, "grad_norm_var": 0.4919288635253906, "learning_rate": 0.0001, "loss": 1.9748, "loss/crossentropy": 2.405826687812805, "loss/fcd": 1.796875, "loss/idx": 1.0, "loss/logits": 0.17791152000427246, "step": 234 }, { "epoch": 0.00350906009452064, "grad_norm": 1.8359375, "grad_norm_var": 0.4624176025390625, "learning_rate": 0.0001, "loss": 2.0062, "loss/crossentropy": 2.6848820447921753, "loss/fcd": 1.82421875, "loss/idx": 1.0, "loss/logits": 0.18202318251132965, "step": 235 }, { "epoch": 0.0035239922651356215, "grad_norm": 1.6640625, "grad_norm_var": 0.4746785481770833, "learning_rate": 0.0001, "loss": 2.138, "loss/crossentropy": 2.6112011671066284, "loss/fcd": 1.93359375, "loss/idx": 1.0, "loss/logits": 0.20442968606948853, "step": 236 }, { "epoch": 0.003538924435750603, "grad_norm": 1.5, "grad_norm_var": 0.4959462483723958, "learning_rate": 0.0001, "loss": 2.1163, "loss/crossentropy": 2.3305634260177612, "loss/fcd": 1.93359375, "loss/idx": 1.0, "loss/logits": 0.1827048435807228, "step": 237 }, { "epoch": 0.0035538566063655842, "grad_norm": 1.40625, "grad_norm_var": 0.5230550130208333, "learning_rate": 0.0001, "loss": 1.9209, "loss/crossentropy": 2.7014966011047363, "loss/fcd": 1.76171875, "loss/idx": 1.0, "loss/logits": 0.15916066616773605, "step": 238 }, { "epoch": 0.0035687887769805656, "grad_norm": 2.671875, "grad_norm_var": 0.23271382649739583, "learning_rate": 0.0001, "loss": 2.1395, "loss/crossentropy": 2.541683793067932, "loss/fcd": 1.91796875, "loss/idx": 1.0, "loss/logits": 0.22150883078575134, "step": 239 }, { "epoch": 0.003583720947595547, "grad_norm": 1.375, "grad_norm_var": 0.24642740885416667, "learning_rate": 0.0001, "loss": 1.9673, "loss/crossentropy": 2.613878011703491, "loss/fcd": 1.796875, "loss/idx": 1.0, "loss/logits": 0.17040642350912094, "step": 240 }, { "epoch": 0.003598653118210529, "grad_norm": 1.6875, "grad_norm_var": 0.24642740885416667, "learning_rate": 0.0001, "loss": 1.8668, "loss/crossentropy": 2.526310443878174, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.15190982818603516, "step": 241 }, { "epoch": 0.0036135852888255103, "grad_norm": 1.4453125, "grad_norm_var": 0.18276748657226563, "learning_rate": 0.0001, "loss": 1.8594, "loss/crossentropy": 2.6557728052139282, "loss/fcd": 1.703125, "loss/idx": 1.0, "loss/logits": 0.1562422439455986, "step": 242 }, { "epoch": 0.0036285174594404917, "grad_norm": 1.6796875, "grad_norm_var": 0.18141988118489583, "learning_rate": 0.0001, "loss": 2.1091, "loss/crossentropy": 2.42824125289917, "loss/fcd": 1.91015625, "loss/idx": 1.0, "loss/logits": 0.19891205430030823, "step": 243 }, { "epoch": 0.003643449630055473, "grad_norm": 1.453125, "grad_norm_var": 0.18400065104166666, "learning_rate": 0.0001, "loss": 2.0081, "loss/crossentropy": 2.31497859954834, "loss/fcd": 1.8359375, "loss/idx": 1.0, "loss/logits": 0.17215853184461594, "step": 244 }, { "epoch": 0.0036583818006704545, "grad_norm": 1.4375, "grad_norm_var": 0.1892473856608073, "learning_rate": 0.0001, "loss": 1.8746, "loss/crossentropy": 2.573368787765503, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.15973138809204102, "step": 245 }, { "epoch": 0.003673313971285436, "grad_norm": 1.3515625, "grad_norm_var": 0.19982274373372397, "learning_rate": 0.0001, "loss": 1.8213, "loss/crossentropy": 2.602314829826355, "loss/fcd": 1.67578125, "loss/idx": 1.0, "loss/logits": 0.14549748599529266, "step": 246 }, { "epoch": 0.0036882461419004173, "grad_norm": 1.3515625, "grad_norm_var": 0.19666341145833333, "learning_rate": 0.0001, "loss": 1.8882, "loss/crossentropy": 2.596967101097107, "loss/fcd": 1.73046875, "loss/idx": 1.0, "loss/logits": 0.1577165126800537, "step": 247 }, { "epoch": 0.0037031783125153987, "grad_norm": 1.5, "grad_norm_var": 0.10305582682291667, "learning_rate": 0.0001, "loss": 1.9983, "loss/crossentropy": 2.6739336252212524, "loss/fcd": 1.8203125, "loss/idx": 1.0, "loss/logits": 0.17800860106945038, "step": 248 }, { "epoch": 0.00371811048313038, "grad_norm": 1.1796875, "grad_norm_var": 0.11272379557291666, "learning_rate": 0.0001, "loss": 1.7476, "loss/crossentropy": 2.737492322921753, "loss/fcd": 1.609375, "loss/idx": 1.0, "loss/logits": 0.13822893053293228, "step": 249 }, { "epoch": 0.003733042653745362, "grad_norm": 1.8671875, "grad_norm_var": 0.11812744140625, "learning_rate": 0.0001, "loss": 1.7779, "loss/crossentropy": 2.7095056772232056, "loss/fcd": 1.63671875, "loss/idx": 1.0, "loss/logits": 0.1412084773182869, "step": 250 }, { "epoch": 0.0037479748243603433, "grad_norm": 1.109375, "grad_norm_var": 0.12709121704101561, "learning_rate": 0.0001, "loss": 1.7714, "loss/crossentropy": 2.606199264526367, "loss/fcd": 1.640625, "loss/idx": 1.0, "loss/logits": 0.1308179721236229, "step": 251 }, { "epoch": 0.0037629069949753247, "grad_norm": 1.671875, "grad_norm_var": 0.1272216796875, "learning_rate": 0.0001, "loss": 1.8573, "loss/crossentropy": 2.528549313545227, "loss/fcd": 1.69140625, "loss/idx": 1.0, "loss/logits": 0.1659131497144699, "step": 252 }, { "epoch": 0.003777839165590306, "grad_norm": 1.40625, "grad_norm_var": 0.12830810546875, "learning_rate": 0.0001, "loss": 1.8859, "loss/crossentropy": 2.6335842609405518, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.1710173338651657, "step": 253 }, { "epoch": 0.0037927713362052875, "grad_norm": 1.1171875, "grad_norm_var": 0.13857396443684897, "learning_rate": 0.0001, "loss": 1.858, "loss/crossentropy": 2.474130392074585, "loss/fcd": 1.69921875, "loss/idx": 1.0, "loss/logits": 0.15880661457777023, "step": 254 }, { "epoch": 0.003807703506820269, "grad_norm": 1.2109375, "grad_norm_var": 0.047408040364583334, "learning_rate": 0.0001, "loss": 1.8397, "loss/crossentropy": 2.475555896759033, "loss/fcd": 1.6875, "loss/idx": 1.0, "loss/logits": 0.15221639722585678, "step": 255 }, { "epoch": 0.0038226356774352503, "grad_norm": 1.3984375, "grad_norm_var": 0.047277577718098956, "learning_rate": 0.0001, "loss": 2.0511, "loss/crossentropy": 2.3949726819992065, "loss/fcd": 1.87890625, "loss/idx": 1.0, "loss/logits": 0.17222578823566437, "step": 256 }, { "epoch": 0.0038375678480502317, "grad_norm": 1.1015625, "grad_norm_var": 0.04855550130208333, "learning_rate": 0.0001, "loss": 1.7403, "loss/crossentropy": 2.4693208932876587, "loss/fcd": 1.6015625, "loss/idx": 1.0, "loss/logits": 0.1387447491288185, "step": 257 }, { "epoch": 0.003852500018665213, "grad_norm": 1.1875, "grad_norm_var": 0.05089696248372396, "learning_rate": 0.0001, "loss": 1.8965, "loss/crossentropy": 2.524168014526367, "loss/fcd": 1.734375, "loss/idx": 1.0, "loss/logits": 0.1620769277215004, "step": 258 }, { "epoch": 0.003867432189280195, "grad_norm": 1.5078125, "grad_norm_var": 0.04579442342122396, "learning_rate": 0.0001, "loss": 1.7984, "loss/crossentropy": 2.66902232170105, "loss/fcd": 1.65625, "loss/idx": 1.0, "loss/logits": 0.1421596258878708, "step": 259 }, { "epoch": 0.0038823643598951763, "grad_norm": 1.4609375, "grad_norm_var": 0.0458892822265625, "learning_rate": 0.0001, "loss": 1.887, "loss/crossentropy": 2.543527603149414, "loss/fcd": 1.71484375, "loss/idx": 1.0, "loss/logits": 0.17219997197389603, "step": 260 }, { "epoch": 0.0038972965305101577, "grad_norm": 1.3046875, "grad_norm_var": 0.04572931925455729, "learning_rate": 0.0001, "loss": 2.0272, "loss/crossentropy": 2.4247206449508667, "loss/fcd": 1.83203125, "loss/idx": 1.0, "loss/logits": 0.195211723446846, "step": 261 }, { "epoch": 0.003912228701125139, "grad_norm": 1.296875, "grad_norm_var": 0.0459625244140625, "learning_rate": 0.0001, "loss": 1.7184, "loss/crossentropy": 2.844157338142395, "loss/fcd": 1.578125, "loss/idx": 1.0, "loss/logits": 0.14025697112083435, "step": 262 }, { "epoch": 0.003927160871740121, "grad_norm": 1.4765625, "grad_norm_var": 0.0468902587890625, "learning_rate": 0.0001, "loss": 1.8431, "loss/crossentropy": 2.630376935005188, "loss/fcd": 1.68359375, "loss/idx": 1.0, "loss/logits": 0.15945688635110855, "step": 263 }, { "epoch": 0.003942093042355102, "grad_norm": 1.2109375, "grad_norm_var": 0.04680557250976562, "learning_rate": 0.0001, "loss": 1.6698, "loss/crossentropy": 2.665648579597473, "loss/fcd": 1.52734375, "loss/idx": 1.5, "loss/logits": 0.14248844236135483, "step": 264 }, { "epoch": 0.003957025212970084, "grad_norm": 0.87890625, "grad_norm_var": 0.05905907948811849, "learning_rate": 0.0001, "loss": 1.66, "loss/crossentropy": 2.4616761207580566, "loss/fcd": 1.51953125, "loss/idx": 1.5, "loss/logits": 0.1404593661427498, "step": 265 }, { "epoch": 0.003971957383585065, "grad_norm": 1.0546875, "grad_norm_var": 0.04162947336832682, "learning_rate": 0.0001, "loss": 1.7816, "loss/crossentropy": 2.717252254486084, "loss/fcd": 1.58984375, "loss/idx": 1.5, "loss/logits": 0.19176837801933289, "step": 266 }, { "epoch": 0.0039868895542000465, "grad_norm": 0.8515625, "grad_norm_var": 0.051465288798014326, "learning_rate": 0.0001, "loss": 1.6546, "loss/crossentropy": 2.7635334730148315, "loss/fcd": 1.5078125, "loss/idx": 1.5, "loss/logits": 0.14682532101869583, "step": 267 }, { "epoch": 0.0040018217248150275, "grad_norm": 1.0234375, "grad_norm_var": 0.04200890858968099, "learning_rate": 0.0001, "loss": 1.8043, "loss/crossentropy": 2.5384750366210938, "loss/fcd": 1.640625, "loss/idx": 1.5, "loss/logits": 0.1637115702033043, "step": 268 }, { "epoch": 0.004016753895430009, "grad_norm": 0.87109375, "grad_norm_var": 0.046477254231770834, "learning_rate": 0.0001, "loss": 1.7611, "loss/crossentropy": 2.486850619316101, "loss/fcd": 1.60546875, "loss/idx": 1.5, "loss/logits": 0.15567568689584732, "step": 269 }, { "epoch": 0.00403168606604499, "grad_norm": 0.9140625, "grad_norm_var": 0.05088094075520833, "learning_rate": 0.0001, "loss": 1.7084, "loss/crossentropy": 2.5260289907455444, "loss/fcd": 1.546875, "loss/idx": 1.5, "loss/logits": 0.16151602566242218, "step": 270 }, { "epoch": 0.004046618236659972, "grad_norm": 1.125, "grad_norm_var": 0.05089492797851562, "learning_rate": 0.0001, "loss": 1.7167, "loss/crossentropy": 2.548484683036804, "loss/fcd": 1.5546875, "loss/idx": 1.5, "loss/logits": 0.16197797656059265, "step": 271 }, { "epoch": 0.004061550407274954, "grad_norm": 0.98046875, "grad_norm_var": 0.048888079325358075, "learning_rate": 0.0001, "loss": 1.6865, "loss/crossentropy": 2.499767541885376, "loss/fcd": 1.53515625, "loss/idx": 1.5, "loss/logits": 0.1513931304216385, "step": 272 }, { "epoch": 0.004076482577889935, "grad_norm": 0.9609375, "grad_norm_var": 0.05085188547770182, "learning_rate": 0.0001, "loss": 1.7289, "loss/crossentropy": 2.458220362663269, "loss/fcd": 1.5625, "loss/idx": 1.5, "loss/logits": 0.16639846563339233, "step": 273 }, { "epoch": 0.004091414748504917, "grad_norm": 0.88671875, "grad_norm_var": 0.054264068603515625, "learning_rate": 0.0001, "loss": 1.7724, "loss/crossentropy": 2.7474247217178345, "loss/fcd": 1.60546875, "loss/idx": 1.5, "loss/logits": 0.16692470759153366, "step": 274 }, { "epoch": 0.004106346919119898, "grad_norm": 1.640625, "grad_norm_var": 0.062361653645833334, "learning_rate": 0.0001, "loss": 1.8246, "loss/crossentropy": 2.3119794130325317, "loss/fcd": 1.6484375, "loss/idx": 1.5, "loss/logits": 0.17614249885082245, "step": 275 }, { "epoch": 0.0041212790897348795, "grad_norm": 0.94921875, "grad_norm_var": 0.05554040273030599, "learning_rate": 0.0001, "loss": 1.767, "loss/crossentropy": 2.567260980606079, "loss/fcd": 1.59375, "loss/idx": 1.5, "loss/logits": 0.17328765988349915, "step": 276 }, { "epoch": 0.0041362112603498605, "grad_norm": 0.9453125, "grad_norm_var": 0.053282610575358075, "learning_rate": 0.0001, "loss": 1.6422, "loss/crossentropy": 2.7794957160949707, "loss/fcd": 1.4921875, "loss/idx": 1.5, "loss/logits": 0.14996777474880219, "step": 277 }, { "epoch": 0.004151143430964842, "grad_norm": 0.96484375, "grad_norm_var": 0.04998067220052083, "learning_rate": 0.0001, "loss": 1.645, "loss/crossentropy": 2.632931113243103, "loss/fcd": 1.5, "loss/idx": 1.5, "loss/logits": 0.14498791843652725, "step": 278 }, { "epoch": 0.004166075601579823, "grad_norm": 0.91796875, "grad_norm_var": 0.03740685780843099, "learning_rate": 0.0001, "loss": 1.6941, "loss/crossentropy": 2.5799747705459595, "loss/fcd": 1.53125, "loss/idx": 1.5, "loss/logits": 0.1628100574016571, "step": 279 }, { "epoch": 0.004181007772194805, "grad_norm": 0.8671875, "grad_norm_var": 0.03562768300374349, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.750143885612488, "loss/fcd": 1.53125, "loss/idx": 1.5, "loss/logits": 0.15750249475240707, "step": 280 }, { "epoch": 0.004195939942809787, "grad_norm": 0.8359375, "grad_norm_var": 0.036376698811848955, "learning_rate": 0.0001, "loss": 1.5538, "loss/crossentropy": 2.39098060131073, "loss/fcd": 1.421875, "loss/idx": 1.5, "loss/logits": 0.13189761340618134, "step": 281 }, { "epoch": 0.004210872113424768, "grad_norm": 0.87109375, "grad_norm_var": 0.036821937561035155, "learning_rate": 0.0001, "loss": 1.6044, "loss/crossentropy": 2.5139598846435547, "loss/fcd": 1.46875, "loss/idx": 1.5, "loss/logits": 0.13561376184225082, "step": 282 }, { "epoch": 0.00422580428403975, "grad_norm": 0.84375, "grad_norm_var": 0.03695468902587891, "learning_rate": 0.0001, "loss": 1.8361, "loss/crossentropy": 2.2974244356155396, "loss/fcd": 1.66015625, "loss/idx": 1.5, "loss/logits": 0.17598910629749298, "step": 283 }, { "epoch": 0.004240736454654731, "grad_norm": 1.34375, "grad_norm_var": 0.04544213612874349, "learning_rate": 0.0001, "loss": 1.9345, "loss/crossentropy": 2.6373846530914307, "loss/fcd": 1.72265625, "loss/idx": 1.5, "loss/logits": 0.21188458800315857, "step": 284 }, { "epoch": 0.0042556686252697125, "grad_norm": 0.91796875, "grad_norm_var": 0.04480584462483724, "learning_rate": 0.0001, "loss": 1.6757, "loss/crossentropy": 2.4831149578094482, "loss/fcd": 1.515625, "loss/idx": 1.5, "loss/logits": 0.16006582230329514, "step": 285 }, { "epoch": 0.0042706007958846935, "grad_norm": 0.97265625, "grad_norm_var": 0.04436620076497396, "learning_rate": 0.0001, "loss": 1.7398, "loss/crossentropy": 2.4333916902542114, "loss/fcd": 1.578125, "loss/idx": 1.5, "loss/logits": 0.16168130189180374, "step": 286 }, { "epoch": 0.004285532966499675, "grad_norm": 0.82421875, "grad_norm_var": 0.04506626129150391, "learning_rate": 0.0001, "loss": 1.643, "loss/crossentropy": 2.5061216354370117, "loss/fcd": 1.48828125, "loss/idx": 1.5, "loss/logits": 0.1546824723482132, "step": 287 }, { "epoch": 0.004300465137114656, "grad_norm": 0.8671875, "grad_norm_var": 0.0459014892578125, "learning_rate": 0.0001, "loss": 1.6412, "loss/crossentropy": 2.4621150493621826, "loss/fcd": 1.484375, "loss/idx": 1.5, "loss/logits": 0.1568301096558571, "step": 288 }, { "epoch": 0.004315397307729638, "grad_norm": 0.80078125, "grad_norm_var": 0.047817420959472653, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.621973991394043, "loss/fcd": 1.40625, "loss/idx": 1.5, "loss/logits": 0.12922291457653046, "step": 289 }, { "epoch": 0.00433032947834462, "grad_norm": 0.8203125, "grad_norm_var": 0.04879124959309896, "learning_rate": 0.0001, "loss": 1.6732, "loss/crossentropy": 2.392053723335266, "loss/fcd": 1.51953125, "loss/idx": 1.5, "loss/logits": 0.15370090305805206, "step": 290 }, { "epoch": 0.004345261648959601, "grad_norm": 0.92578125, "grad_norm_var": 0.01599267323811849, "learning_rate": 0.0001, "loss": 1.7259, "loss/crossentropy": 2.4002796411514282, "loss/fcd": 1.546875, "loss/idx": 1.5, "loss/logits": 0.17903384566307068, "step": 291 }, { "epoch": 0.004360193819574583, "grad_norm": 0.87890625, "grad_norm_var": 0.01599725087483724, "learning_rate": 0.0001, "loss": 1.6282, "loss/crossentropy": 2.4668020009994507, "loss/fcd": 1.48046875, "loss/idx": 1.5, "loss/logits": 0.1476999893784523, "step": 292 }, { "epoch": 0.004375125990189564, "grad_norm": 0.9609375, "grad_norm_var": 0.01608117421468099, "learning_rate": 0.0001, "loss": 1.9094, "loss/crossentropy": 2.46618115901947, "loss/fcd": 1.70703125, "loss/idx": 1.5, "loss/logits": 0.2024083137512207, "step": 293 }, { "epoch": 0.0043900581608045455, "grad_norm": 0.84375, "grad_norm_var": 0.016165924072265626, "learning_rate": 0.0001, "loss": 1.7257, "loss/crossentropy": 2.5607261657714844, "loss/fcd": 1.55859375, "loss/idx": 1.5, "loss/logits": 0.16708557307720184, "step": 294 }, { "epoch": 0.0044049903314195265, "grad_norm": 0.82421875, "grad_norm_var": 0.016562652587890626, "learning_rate": 0.0001, "loss": 1.6713, "loss/crossentropy": 2.642008900642395, "loss/fcd": 1.51171875, "loss/idx": 1.5, "loss/logits": 0.15961749851703644, "step": 295 }, { "epoch": 0.004419922502034508, "grad_norm": 0.8203125, "grad_norm_var": 0.016904449462890624, "learning_rate": 0.0001, "loss": 1.6041, "loss/crossentropy": 2.7051438093185425, "loss/fcd": 1.4609375, "loss/idx": 1.5, "loss/logits": 0.143184632062912, "step": 296 }, { "epoch": 0.004434854672649489, "grad_norm": 0.84765625, "grad_norm_var": 0.016817665100097655, "learning_rate": 0.0001, "loss": 1.6298, "loss/crossentropy": 2.6207375526428223, "loss/fcd": 1.4765625, "loss/idx": 1.5, "loss/logits": 0.15321487188339233, "step": 297 }, { "epoch": 0.004449786843264471, "grad_norm": 0.93359375, "grad_norm_var": 0.016840044657389322, "learning_rate": 0.0001, "loss": 1.8072, "loss/crossentropy": 2.586044192314148, "loss/fcd": 1.62109375, "loss/idx": 1.5, "loss/logits": 0.18614596128463745, "step": 298 }, { "epoch": 0.004464719013879453, "grad_norm": 0.8203125, "grad_norm_var": 0.017055193583170574, "learning_rate": 0.0001, "loss": 1.6846, "loss/crossentropy": 2.5060659646987915, "loss/fcd": 1.5234375, "loss/idx": 1.5, "loss/logits": 0.16112150996923447, "step": 299 }, { "epoch": 0.004479651184494434, "grad_norm": 0.76171875, "grad_norm_var": 0.0038022359212239584, "learning_rate": 0.0001, "loss": 1.5685, "loss/crossentropy": 2.5580883026123047, "loss/fcd": 1.421875, "loss/idx": 1.5, "loss/logits": 0.14667311310768127, "step": 300 }, { "epoch": 0.004494583355109416, "grad_norm": 0.83203125, "grad_norm_var": 0.003642781575520833, "learning_rate": 0.0001, "loss": 1.6437, "loss/crossentropy": 2.6943860054016113, "loss/fcd": 1.48828125, "loss/idx": 1.5, "loss/logits": 0.15546557307243347, "step": 301 }, { "epoch": 0.004509515525724397, "grad_norm": 0.8125, "grad_norm_var": 0.0028060277303059897, "learning_rate": 0.0001, "loss": 1.6621, "loss/crossentropy": 2.6080933809280396, "loss/fcd": 1.5078125, "loss/idx": 1.5, "loss/logits": 0.15433663129806519, "step": 302 }, { "epoch": 0.0045244476963393785, "grad_norm": 1.0, "grad_norm_var": 0.004170735677083333, "learning_rate": 0.0001, "loss": 1.7273, "loss/crossentropy": 2.4456557035446167, "loss/fcd": 1.5625, "loss/idx": 1.5, "loss/logits": 0.16477762162685394, "step": 303 }, { "epoch": 0.0045393798669543595, "grad_norm": 1.78125, "grad_norm_var": 0.057342274983723955, "learning_rate": 0.0001, "loss": 1.7906, "loss/crossentropy": 2.5243054628372192, "loss/fcd": 1.6171875, "loss/idx": 2.0, "loss/logits": 0.17336448282003403, "step": 304 }, { "epoch": 0.004554312037569341, "grad_norm": 2.78125, "grad_norm_var": 0.27192529042561847, "learning_rate": 0.0001, "loss": 2.4082, "loss/crossentropy": 2.0944008827209473, "loss/fcd": 2.1328125, "loss/idx": 2.0, "loss/logits": 0.27535852044820786, "step": 305 }, { "epoch": 0.004569244208184322, "grad_norm": 2.234375, "grad_norm_var": 0.35542494455973306, "learning_rate": 0.0001, "loss": 2.2988, "loss/crossentropy": 2.386541962623596, "loss/fcd": 2.046875, "loss/idx": 2.0, "loss/logits": 0.25191500037908554, "step": 306 }, { "epoch": 0.004584176378799304, "grad_norm": 1.7109375, "grad_norm_var": 0.37271525065104166, "learning_rate": 0.0001, "loss": 2.0799, "loss/crossentropy": 2.5651217699050903, "loss/fcd": 1.875, "loss/idx": 2.0, "loss/logits": 0.2048977091908455, "step": 307 }, { "epoch": 0.004599108549414286, "grad_norm": 1.3984375, "grad_norm_var": 0.36888472239176434, "learning_rate": 0.0001, "loss": 1.888, "loss/crossentropy": 2.3940224647521973, "loss/fcd": 1.70703125, "loss/idx": 2.0, "loss/logits": 0.18101809173822403, "step": 308 }, { "epoch": 0.004614040720029267, "grad_norm": 1.4921875, "grad_norm_var": 0.3688674290974935, "learning_rate": 0.0001, "loss": 1.8624, "loss/crossentropy": 2.7549002170562744, "loss/fcd": 1.6875, "loss/idx": 2.0, "loss/logits": 0.17488256096839905, "step": 309 }, { "epoch": 0.004628972890644249, "grad_norm": 1.5078125, "grad_norm_var": 0.3610422134399414, "learning_rate": 0.0001, "loss": 1.9112, "loss/crossentropy": 2.855328917503357, "loss/fcd": 1.734375, "loss/idx": 2.0, "loss/logits": 0.17682038247585297, "step": 310 }, { "epoch": 0.00464390506125923, "grad_norm": 1.28125, "grad_norm_var": 0.3460235595703125, "learning_rate": 0.0001, "loss": 1.8355, "loss/crossentropy": 2.4556884765625, "loss/fcd": 1.67578125, "loss/idx": 2.0, "loss/logits": 0.15974701941013336, "step": 311 }, { "epoch": 0.0046588372318742115, "grad_norm": 1.4296875, "grad_norm_var": 0.32916259765625, "learning_rate": 0.0001, "loss": 2.0998, "loss/crossentropy": 2.13610577583313, "loss/fcd": 1.8984375, "loss/idx": 2.0, "loss/logits": 0.20135962963104248, "step": 312 }, { "epoch": 0.0046737694024891925, "grad_norm": 1.1796875, "grad_norm_var": 0.31374454498291016, "learning_rate": 0.0001, "loss": 1.7831, "loss/crossentropy": 2.529542326927185, "loss/fcd": 1.62890625, "loss/idx": 2.0, "loss/logits": 0.15422120690345764, "step": 313 }, { "epoch": 0.004688701573104174, "grad_norm": 1.3359375, "grad_norm_var": 0.3003265380859375, "learning_rate": 0.0001, "loss": 1.9462, "loss/crossentropy": 2.286463499069214, "loss/fcd": 1.765625, "loss/idx": 2.0, "loss/logits": 0.18060050159692764, "step": 314 }, { "epoch": 0.004703633743719155, "grad_norm": 1.390625, "grad_norm_var": 0.2767677307128906, "learning_rate": 0.0001, "loss": 1.931, "loss/crossentropy": 2.2394256591796875, "loss/fcd": 1.74609375, "loss/idx": 2.0, "loss/logits": 0.1849333867430687, "step": 315 }, { "epoch": 0.004718565914334137, "grad_norm": 1.3671875, "grad_norm_var": 0.24547926584879556, "learning_rate": 0.0001, "loss": 1.9587, "loss/crossentropy": 2.5843063592910767, "loss/fcd": 1.7734375, "loss/idx": 2.0, "loss/logits": 0.18526208400726318, "step": 316 }, { "epoch": 0.004733498084949119, "grad_norm": 1.359375, "grad_norm_var": 0.21793619791666666, "learning_rate": 0.0001, "loss": 1.9264, "loss/crossentropy": 2.453084349632263, "loss/fcd": 1.73046875, "loss/idx": 2.0, "loss/logits": 0.1959688439965248, "step": 317 }, { "epoch": 0.0047484302555641, "grad_norm": 1.2734375, "grad_norm_var": 0.1887224833170573, "learning_rate": 0.0001, "loss": 1.9691, "loss/crossentropy": 2.449620246887207, "loss/fcd": 1.77734375, "loss/idx": 2.0, "loss/logits": 0.19172405451536179, "step": 318 }, { "epoch": 0.004763362426179082, "grad_norm": 1.1796875, "grad_norm_var": 0.17797749837239582, "learning_rate": 0.0001, "loss": 1.8097, "loss/crossentropy": 2.6453970670700073, "loss/fcd": 1.63671875, "loss/idx": 2.0, "loss/logits": 0.17299381643533707, "step": 319 }, { "epoch": 0.004778294596794063, "grad_norm": 1.2578125, "grad_norm_var": 0.17853978474934895, "learning_rate": 0.0001, "loss": 1.9285, "loss/crossentropy": 2.4863957166671753, "loss/fcd": 1.73828125, "loss/idx": 2.0, "loss/logits": 0.19023225456476212, "step": 320 }, { "epoch": 0.0047932267674090446, "grad_norm": 1.0234375, "grad_norm_var": 0.07399800618489584, "learning_rate": 0.0001, "loss": 1.7707, "loss/crossentropy": 2.5647225379943848, "loss/fcd": 1.609375, "loss/idx": 2.0, "loss/logits": 0.16133707761764526, "step": 321 }, { "epoch": 0.0048081589380240255, "grad_norm": 1.1875, "grad_norm_var": 0.026220703125, "learning_rate": 0.0001, "loss": 1.8213, "loss/crossentropy": 2.834408164024353, "loss/fcd": 1.65625, "loss/idx": 2.0, "loss/logits": 0.165000282227993, "step": 322 }, { "epoch": 0.004823091108639007, "grad_norm": 1.171875, "grad_norm_var": 0.017429351806640625, "learning_rate": 0.0001, "loss": 1.8912, "loss/crossentropy": 2.6110740900039673, "loss/fcd": 1.7109375, "loss/idx": 2.0, "loss/logits": 0.18023446202278137, "step": 323 }, { "epoch": 0.004838023279253989, "grad_norm": 1.0078125, "grad_norm_var": 0.021956125895182293, "learning_rate": 0.0001, "loss": 1.7027, "loss/crossentropy": 2.4298505783081055, "loss/fcd": 1.55859375, "loss/idx": 2.0, "loss/logits": 0.1441233903169632, "step": 324 }, { "epoch": 0.00485295544986897, "grad_norm": 1.4375, "grad_norm_var": 0.020580037434895834, "learning_rate": 0.0001, "loss": 1.9852, "loss/crossentropy": 2.5150561332702637, "loss/fcd": 1.73046875, "loss/idx": 2.0, "loss/logits": 0.25472788512706757, "step": 325 }, { "epoch": 0.004867887620483952, "grad_norm": 1.3046875, "grad_norm_var": 0.016837565104166667, "learning_rate": 0.0001, "loss": 1.7973, "loss/crossentropy": 2.7767333984375, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.17234515398740768, "step": 326 }, { "epoch": 0.004882819791098933, "grad_norm": 1.3671875, "grad_norm_var": 0.01752293904622396, "learning_rate": 0.0001, "loss": 2.1812, "loss/crossentropy": 2.416645646095276, "loss/fcd": 1.96875, "loss/idx": 2.0, "loss/logits": 0.21248316764831543, "step": 327 }, { "epoch": 0.004897751961713915, "grad_norm": 1.046875, "grad_norm_var": 0.018382771809895834, "learning_rate": 0.0001, "loss": 1.9089, "loss/crossentropy": 2.5154889822006226, "loss/fcd": 1.71875, "loss/idx": 2.0, "loss/logits": 0.19012955576181412, "step": 328 }, { "epoch": 0.004912684132328896, "grad_norm": 1.1484375, "grad_norm_var": 0.018708292643229166, "learning_rate": 0.0001, "loss": 1.8608, "loss/crossentropy": 2.5697543621063232, "loss/fcd": 1.6796875, "loss/idx": 2.0, "loss/logits": 0.18107804656028748, "step": 329 }, { "epoch": 0.0049276163029438776, "grad_norm": 1.0546875, "grad_norm_var": 0.020099894205729166, "learning_rate": 0.0001, "loss": 1.8524, "loss/crossentropy": 2.3531534671783447, "loss/fcd": 1.6796875, "loss/idx": 2.0, "loss/logits": 0.1726970225572586, "step": 330 }, { "epoch": 0.0049425484735588585, "grad_norm": 1.015625, "grad_norm_var": 0.020539347330729166, "learning_rate": 0.0001, "loss": 1.6318, "loss/crossentropy": 2.558312177658081, "loss/fcd": 1.4921875, "loss/idx": 2.0, "loss/logits": 0.1396355852484703, "step": 331 }, { "epoch": 0.00495748064417384, "grad_norm": 1.0625, "grad_norm_var": 0.019557444254557292, "learning_rate": 0.0001, "loss": 1.8684, "loss/crossentropy": 2.4718196392059326, "loss/fcd": 1.6875, "loss/idx": 2.0, "loss/logits": 0.18088336288928986, "step": 332 }, { "epoch": 0.004972412814788822, "grad_norm": 1.125, "grad_norm_var": 0.017421213785807292, "learning_rate": 0.0001, "loss": 1.7542, "loss/crossentropy": 2.646172881126404, "loss/fcd": 1.6015625, "loss/idx": 2.0, "loss/logits": 0.15264033526182175, "step": 333 }, { "epoch": 0.004987344985403803, "grad_norm": 0.98828125, "grad_norm_var": 0.018437639872233073, "learning_rate": 0.0001, "loss": 1.8037, "loss/crossentropy": 2.658991813659668, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.17869339883327484, "step": 334 }, { "epoch": 0.005002277156018785, "grad_norm": 1.3359375, "grad_norm_var": 0.020609474182128905, "learning_rate": 0.0001, "loss": 2.0561, "loss/crossentropy": 2.62394380569458, "loss/fcd": 1.84765625, "loss/idx": 2.0, "loss/logits": 0.20848941057920456, "step": 335 }, { "epoch": 0.005017209326633766, "grad_norm": 1.09375, "grad_norm_var": 0.020118141174316408, "learning_rate": 0.0001, "loss": 1.7623, "loss/crossentropy": 2.430611491203308, "loss/fcd": 1.59375, "loss/idx": 2.0, "loss/logits": 0.1685967594385147, "step": 336 }, { "epoch": 0.005032141497248748, "grad_norm": 1.375, "grad_norm_var": 0.021994972229003908, "learning_rate": 0.0001, "loss": 1.8267, "loss/crossentropy": 2.6477928161621094, "loss/fcd": 1.65625, "loss/idx": 2.0, "loss/logits": 0.1704726666212082, "step": 337 }, { "epoch": 0.005047073667863729, "grad_norm": 1.1796875, "grad_norm_var": 0.021980730692545573, "learning_rate": 0.0001, "loss": 1.8154, "loss/crossentropy": 2.4817826747894287, "loss/fcd": 1.65234375, "loss/idx": 2.0, "loss/logits": 0.16309074312448502, "step": 338 }, { "epoch": 0.005062005838478711, "grad_norm": 1.4453125, "grad_norm_var": 0.026733843485514323, "learning_rate": 0.0001, "loss": 2.2624, "loss/crossentropy": 2.9061405658721924, "loss/fcd": 2.02734375, "loss/idx": 2.0, "loss/logits": 0.2350568175315857, "step": 339 }, { "epoch": 0.0050769380090936915, "grad_norm": 1.0859375, "grad_norm_var": 0.025251197814941406, "learning_rate": 0.0001, "loss": 1.7576, "loss/crossentropy": 2.5378034114837646, "loss/fcd": 1.5859375, "loss/idx": 2.0, "loss/logits": 0.17163680493831635, "step": 340 }, { "epoch": 0.005091870179708673, "grad_norm": 1.109375, "grad_norm_var": 0.021224403381347658, "learning_rate": 0.0001, "loss": 1.7415, "loss/crossentropy": 2.383753538131714, "loss/fcd": 1.578125, "loss/idx": 2.0, "loss/logits": 0.16342193633317947, "step": 341 }, { "epoch": 0.005106802350323655, "grad_norm": 1.1640625, "grad_norm_var": 0.019956398010253906, "learning_rate": 0.0001, "loss": 1.707, "loss/crossentropy": 2.3804928064346313, "loss/fcd": 1.546875, "loss/idx": 2.0, "loss/logits": 0.1601005494594574, "step": 342 }, { "epoch": 0.005121734520938636, "grad_norm": 1.0546875, "grad_norm_var": 0.017525164286295573, "learning_rate": 0.0001, "loss": 1.7786, "loss/crossentropy": 2.5734684467315674, "loss/fcd": 1.609375, "loss/idx": 2.0, "loss/logits": 0.16918828338384628, "step": 343 }, { "epoch": 0.005136666691553618, "grad_norm": 1.34375, "grad_norm_var": 0.019235674540201822, "learning_rate": 0.0001, "loss": 1.8536, "loss/crossentropy": 2.6887491941452026, "loss/fcd": 1.6640625, "loss/idx": 2.0, "loss/logits": 0.18957675993442535, "step": 344 }, { "epoch": 0.005151598862168599, "grad_norm": 0.99609375, "grad_norm_var": 0.020949045817057293, "learning_rate": 0.0001, "loss": 1.7255, "loss/crossentropy": 2.3476874828338623, "loss/fcd": 1.5859375, "loss/idx": 2.0, "loss/logits": 0.1395951807498932, "step": 345 }, { "epoch": 0.005166531032783581, "grad_norm": 0.9609375, "grad_norm_var": 0.02271296183268229, "learning_rate": 0.0001, "loss": 1.7556, "loss/crossentropy": 2.597803831100464, "loss/fcd": 1.59765625, "loss/idx": 2.0, "loss/logits": 0.15798340737819672, "step": 346 }, { "epoch": 0.005181463203398562, "grad_norm": 1.7421875, "grad_norm_var": 0.043076578776041666, "learning_rate": 0.0001, "loss": 1.8556, "loss/crossentropy": 2.5390493869781494, "loss/fcd": 1.67578125, "loss/idx": 2.0, "loss/logits": 0.17979220300912857, "step": 347 }, { "epoch": 0.005196395374013544, "grad_norm": 1.25, "grad_norm_var": 0.04205118815104167, "learning_rate": 0.0001, "loss": 1.7972, "loss/crossentropy": 2.640982985496521, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.1721990555524826, "step": 348 }, { "epoch": 0.0052113275446285245, "grad_norm": 1.3671875, "grad_norm_var": 0.04319432576497396, "learning_rate": 0.0001, "loss": 1.8083, "loss/crossentropy": 2.61712646484375, "loss/fcd": 1.64453125, "loss/idx": 2.0, "loss/logits": 0.16379254311323166, "step": 349 }, { "epoch": 0.005226259715243506, "grad_norm": 1.1640625, "grad_norm_var": 0.03973534901936849, "learning_rate": 0.0001, "loss": 1.7109, "loss/crossentropy": 2.527679681777954, "loss/fcd": 1.55859375, "loss/idx": 2.0, "loss/logits": 0.1522795334458351, "step": 350 }, { "epoch": 0.005241191885858488, "grad_norm": 1.140625, "grad_norm_var": 0.039341163635253903, "learning_rate": 0.0001, "loss": 1.7953, "loss/crossentropy": 2.591903328895569, "loss/fcd": 1.625, "loss/idx": 2.0, "loss/logits": 0.17032313346862793, "step": 351 }, { "epoch": 0.005256124056473469, "grad_norm": 1.1953125, "grad_norm_var": 0.03831628163655599, "learning_rate": 0.0001, "loss": 1.7707, "loss/crossentropy": 2.668965458869934, "loss/fcd": 1.609375, "loss/idx": 2.0, "loss/logits": 0.16133900731801987, "step": 352 }, { "epoch": 0.005271056227088451, "grad_norm": 1.0390625, "grad_norm_var": 0.038578732808430986, "learning_rate": 0.0001, "loss": 1.7464, "loss/crossentropy": 2.514343738555908, "loss/fcd": 1.58984375, "loss/idx": 2.0, "loss/logits": 0.1565106362104416, "step": 353 }, { "epoch": 0.005285988397703432, "grad_norm": 1.1171875, "grad_norm_var": 0.03901208241780599, "learning_rate": 0.0001, "loss": 1.8161, "loss/crossentropy": 2.726048231124878, "loss/fcd": 1.6171875, "loss/idx": 2.0, "loss/logits": 0.1989368051290512, "step": 354 }, { "epoch": 0.005300920568318414, "grad_norm": 1.125, "grad_norm_var": 0.03488305409749349, "learning_rate": 0.0001, "loss": 1.9807, "loss/crossentropy": 2.365426182746887, "loss/fcd": 1.78515625, "loss/idx": 2.0, "loss/logits": 0.1955546736717224, "step": 355 }, { "epoch": 0.005315852738933395, "grad_norm": 1.09375, "grad_norm_var": 0.03479048411051432, "learning_rate": 0.0001, "loss": 1.725, "loss/crossentropy": 2.2926045656204224, "loss/fcd": 1.578125, "loss/idx": 2.0, "loss/logits": 0.14692262932658195, "step": 356 }, { "epoch": 0.005330784909548377, "grad_norm": 0.98828125, "grad_norm_var": 0.03683039347330729, "learning_rate": 0.0001, "loss": 1.796, "loss/crossentropy": 2.489909529685974, "loss/fcd": 1.62109375, "loss/idx": 2.0, "loss/logits": 0.17487448453903198, "step": 357 }, { "epoch": 0.0053457170801633576, "grad_norm": 0.62109375, "grad_norm_var": 0.05578657786051432, "learning_rate": 0.0001, "loss": 1.7193, "loss/crossentropy": 2.467299699783325, "loss/fcd": 1.52734375, "loss/idx": 2.5, "loss/logits": 0.19200193881988525, "step": 358 }, { "epoch": 0.005360649250778339, "grad_norm": 0.6328125, "grad_norm_var": 0.07156569163004557, "learning_rate": 0.0001, "loss": 1.586, "loss/crossentropy": 2.6709818840026855, "loss/fcd": 1.41796875, "loss/idx": 2.5, "loss/logits": 0.16798634827136993, "step": 359 }, { "epoch": 0.005375581421393321, "grad_norm": 0.63671875, "grad_norm_var": 0.08087539672851562, "learning_rate": 0.0001, "loss": 1.6455, "loss/crossentropy": 2.360570549964905, "loss/fcd": 1.45703125, "loss/idx": 2.5, "loss/logits": 0.1884693130850792, "step": 360 }, { "epoch": 0.005390513592008302, "grad_norm": 0.69140625, "grad_norm_var": 0.0895538330078125, "learning_rate": 0.0001, "loss": 1.6262, "loss/crossentropy": 2.6428942680358887, "loss/fcd": 1.453125, "loss/idx": 2.5, "loss/logits": 0.173103965818882, "step": 361 }, { "epoch": 0.005405445762623284, "grad_norm": 0.59375, "grad_norm_var": 0.1022356669108073, "learning_rate": 0.0001, "loss": 1.4134, "loss/crossentropy": 2.5077950954437256, "loss/fcd": 1.28125, "loss/idx": 2.5, "loss/logits": 0.13219071924686432, "step": 362 }, { "epoch": 0.005420377933238265, "grad_norm": 0.625, "grad_norm_var": 0.07339680989583333, "learning_rate": 0.0001, "loss": 1.543, "loss/crossentropy": 2.4522061347961426, "loss/fcd": 1.390625, "loss/idx": 2.5, "loss/logits": 0.15238897874951363, "step": 363 }, { "epoch": 0.005435310103853247, "grad_norm": 0.7578125, "grad_norm_var": 0.06918309529622396, "learning_rate": 0.0001, "loss": 1.7256, "loss/crossentropy": 2.442309260368347, "loss/fcd": 1.51953125, "loss/idx": 2.5, "loss/logits": 0.20608609169721603, "step": 364 }, { "epoch": 0.005450242274468228, "grad_norm": 0.5859375, "grad_norm_var": 0.061197662353515626, "learning_rate": 0.0001, "loss": 1.5033, "loss/crossentropy": 2.4856804609298706, "loss/fcd": 1.35546875, "loss/idx": 2.5, "loss/logits": 0.14786505699157715, "step": 365 }, { "epoch": 0.00546517444508321, "grad_norm": 0.59765625, "grad_norm_var": 0.05945529937744141, "learning_rate": 0.0001, "loss": 1.5867, "loss/crossentropy": 2.6083357334136963, "loss/fcd": 1.4296875, "loss/idx": 2.5, "loss/logits": 0.1570557877421379, "step": 366 }, { "epoch": 0.0054801066156981906, "grad_norm": 0.6953125, "grad_norm_var": 0.054004859924316403, "learning_rate": 0.0001, "loss": 1.6414, "loss/crossentropy": 2.6212077140808105, "loss/fcd": 1.45703125, "loss/idx": 2.5, "loss/logits": 0.18436457216739655, "step": 367 }, { "epoch": 0.005495038786313172, "grad_norm": 0.63671875, "grad_norm_var": 0.044976806640625, "learning_rate": 0.0001, "loss": 1.6194, "loss/crossentropy": 2.5251601934432983, "loss/fcd": 1.4453125, "loss/idx": 2.5, "loss/logits": 0.17409738898277283, "step": 368 }, { "epoch": 0.005509970956928154, "grad_norm": 0.6640625, "grad_norm_var": 0.040679931640625, "learning_rate": 0.0001, "loss": 1.609, "loss/crossentropy": 2.6006916761398315, "loss/fcd": 1.4296875, "loss/idx": 2.5, "loss/logits": 0.17928281426429749, "step": 369 }, { "epoch": 0.005524903127543135, "grad_norm": 0.6328125, "grad_norm_var": 0.0318817138671875, "learning_rate": 0.0001, "loss": 1.715, "loss/crossentropy": 2.6712244749069214, "loss/fcd": 1.5234375, "loss/idx": 2.5, "loss/logits": 0.19155558198690414, "step": 370 }, { "epoch": 0.005539835298158117, "grad_norm": 0.8125, "grad_norm_var": 0.0212615966796875, "learning_rate": 0.0001, "loss": 1.6621, "loss/crossentropy": 2.5613213777542114, "loss/fcd": 1.5, "loss/idx": 2.5, "loss/logits": 0.16209547966718674, "step": 371 }, { "epoch": 0.005554767468773098, "grad_norm": 0.65625, "grad_norm_var": 0.0104949951171875, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.641627550125122, "loss/fcd": 1.23828125, "loss/idx": 2.5, "loss/logits": 0.11289479583501816, "step": 372 }, { "epoch": 0.00556969963938808, "grad_norm": 0.6171875, "grad_norm_var": 0.0036879857381184895, "learning_rate": 0.0001, "loss": 1.5787, "loss/crossentropy": 2.66814649105072, "loss/fcd": 1.421875, "loss/idx": 2.5, "loss/logits": 0.15683035552501678, "step": 373 }, { "epoch": 0.005584631810003061, "grad_norm": 0.6484375, "grad_norm_var": 0.0036163330078125, "learning_rate": 0.0001, "loss": 1.6049, "loss/crossentropy": 2.6782963275909424, "loss/fcd": 1.4453125, "loss/idx": 2.5, "loss/logits": 0.1595475971698761, "step": 374 }, { "epoch": 0.005599563980618043, "grad_norm": 0.66015625, "grad_norm_var": 0.0035811742146809895, "learning_rate": 0.0001, "loss": 1.7587, "loss/crossentropy": 2.5838359594345093, "loss/fcd": 1.546875, "loss/idx": 2.5, "loss/logits": 0.2117815613746643, "step": 375 }, { "epoch": 0.005614496151233024, "grad_norm": 0.70703125, "grad_norm_var": 0.0037001927693684896, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.2126421332359314, "loss/fcd": 1.48828125, "loss/idx": 2.5, "loss/logits": 0.16539722681045532, "step": 376 }, { "epoch": 0.005629428321848005, "grad_norm": 0.6484375, "grad_norm_var": 0.0036435445149739583, "learning_rate": 0.0001, "loss": 1.6255, "loss/crossentropy": 2.60499370098114, "loss/fcd": 1.45703125, "loss/idx": 2.5, "loss/logits": 0.1684800684452057, "step": 377 }, { "epoch": 0.005644360492462987, "grad_norm": 0.6953125, "grad_norm_var": 0.0034088134765625, "learning_rate": 0.0001, "loss": 1.5125, "loss/crossentropy": 2.749386191368103, "loss/fcd": 1.37109375, "loss/idx": 2.5, "loss/logits": 0.14144720882177353, "step": 378 }, { "epoch": 0.005659292663077968, "grad_norm": 0.671875, "grad_norm_var": 0.0032958984375, "learning_rate": 0.0001, "loss": 1.5179, "loss/crossentropy": 2.5258721113204956, "loss/fcd": 1.375, "loss/idx": 2.5, "loss/logits": 0.14287371188402176, "step": 379 }, { "epoch": 0.00567422483369295, "grad_norm": 0.828125, "grad_norm_var": 0.004447174072265625, "learning_rate": 0.0001, "loss": 1.5972, "loss/crossentropy": 2.4670597314834595, "loss/fcd": 1.4296875, "loss/idx": 2.5, "loss/logits": 0.1674765683710575, "step": 380 }, { "epoch": 0.005689157004307931, "grad_norm": 0.75, "grad_norm_var": 0.0042388916015625, "learning_rate": 0.0001, "loss": 1.7309, "loss/crossentropy": 2.41486656665802, "loss/fcd": 1.5625, "loss/idx": 2.5, "loss/logits": 0.1684126779437065, "step": 381 }, { "epoch": 0.005704089174922913, "grad_norm": 1.1015625, "grad_norm_var": 0.014400672912597657, "learning_rate": 0.0001, "loss": 1.8496, "loss/crossentropy": 2.5699658393859863, "loss/fcd": 1.62890625, "loss/idx": 2.5, "loss/logits": 0.22069481760263443, "step": 382 }, { "epoch": 0.005719021345537894, "grad_norm": 0.61328125, "grad_norm_var": 0.01502685546875, "learning_rate": 0.0001, "loss": 1.4251, "loss/crossentropy": 2.6866044998168945, "loss/fcd": 1.2890625, "loss/idx": 2.5, "loss/logits": 0.1360424980521202, "step": 383 }, { "epoch": 0.005733953516152876, "grad_norm": 0.671875, "grad_norm_var": 0.014765357971191407, "learning_rate": 0.0001, "loss": 1.5608, "loss/crossentropy": 2.7703367471694946, "loss/fcd": 1.40625, "loss/idx": 2.5, "loss/logits": 0.15454240143299103, "step": 384 }, { "epoch": 0.005748885686767857, "grad_norm": 0.71484375, "grad_norm_var": 0.014607493082682292, "learning_rate": 0.0001, "loss": 1.6469, "loss/crossentropy": 2.5732953548431396, "loss/fcd": 1.4765625, "loss/idx": 2.5, "loss/logits": 0.17037127912044525, "step": 385 }, { "epoch": 0.005763817857382838, "grad_norm": 0.69140625, "grad_norm_var": 0.014185015360514324, "learning_rate": 0.0001, "loss": 1.5475, "loss/crossentropy": 2.9296597242355347, "loss/fcd": 1.3984375, "loss/idx": 2.5, "loss/logits": 0.14904215186834335, "step": 386 }, { "epoch": 0.00577875002799782, "grad_norm": 0.6015625, "grad_norm_var": 0.014308611551920572, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.6140079498291016, "loss/fcd": 1.42578125, "loss/idx": 2.5, "loss/logits": 0.16666851192712784, "step": 387 }, { "epoch": 0.005793682198612801, "grad_norm": 0.9296875, "grad_norm_var": 0.01721032460530599, "learning_rate": 0.0001, "loss": 1.9019, "loss/crossentropy": 2.599937319755554, "loss/fcd": 1.6640625, "loss/idx": 2.5, "loss/logits": 0.2378673255443573, "step": 388 }, { "epoch": 0.005808614369227783, "grad_norm": 0.625, "grad_norm_var": 0.01710503896077474, "learning_rate": 0.0001, "loss": 1.5824, "loss/crossentropy": 2.548129081726074, "loss/fcd": 1.42578125, "loss/idx": 2.5, "loss/logits": 0.1565687358379364, "step": 389 }, { "epoch": 0.005823546539842764, "grad_norm": 0.59375, "grad_norm_var": 0.01783135732014974, "learning_rate": 0.0001, "loss": 1.6191, "loss/crossentropy": 2.7168357372283936, "loss/fcd": 1.44140625, "loss/idx": 2.5, "loss/logits": 0.17769953608512878, "step": 390 }, { "epoch": 0.005838478710457746, "grad_norm": 0.6328125, "grad_norm_var": 0.018092600504557292, "learning_rate": 0.0001, "loss": 1.6704, "loss/crossentropy": 2.4698110818862915, "loss/fcd": 1.48046875, "loss/idx": 2.5, "loss/logits": 0.189944326877594, "step": 391 }, { "epoch": 0.005853410881072727, "grad_norm": 0.65625, "grad_norm_var": 0.018323198954264323, "learning_rate": 0.0001, "loss": 1.5337, "loss/crossentropy": 2.594900608062744, "loss/fcd": 1.3828125, "loss/idx": 2.5, "loss/logits": 0.15086808055639267, "step": 392 }, { "epoch": 0.005868343051687709, "grad_norm": 0.6484375, "grad_norm_var": 0.018323198954264323, "learning_rate": 0.0001, "loss": 1.6741, "loss/crossentropy": 2.6552109718322754, "loss/fcd": 1.49609375, "loss/idx": 2.5, "loss/logits": 0.17803405970335007, "step": 393 }, { "epoch": 0.0058832752223026905, "grad_norm": 0.7578125, "grad_norm_var": 0.018410682678222656, "learning_rate": 0.0001, "loss": 1.7245, "loss/crossentropy": 2.6911847591400146, "loss/fcd": 1.53515625, "loss/idx": 2.5, "loss/logits": 0.18936707079410553, "step": 394 }, { "epoch": 0.005898207392917671, "grad_norm": 0.6328125, "grad_norm_var": 0.018746376037597656, "learning_rate": 0.0001, "loss": 1.6557, "loss/crossentropy": 2.44334614276886, "loss/fcd": 1.49609375, "loss/idx": 2.5, "loss/logits": 0.15957393497228622, "step": 395 }, { "epoch": 0.005913139563532653, "grad_norm": 0.58203125, "grad_norm_var": 0.0188385009765625, "learning_rate": 0.0001, "loss": 1.5261, "loss/crossentropy": 2.562255382537842, "loss/fcd": 1.37890625, "loss/idx": 2.5, "loss/logits": 0.14717654883861542, "step": 396 }, { "epoch": 0.005928071734147634, "grad_norm": 0.76953125, "grad_norm_var": 0.018992042541503905, "learning_rate": 0.0001, "loss": 1.676, "loss/crossentropy": 2.5732717514038086, "loss/fcd": 1.49609375, "loss/idx": 2.5, "loss/logits": 0.17988938093185425, "step": 397 }, { "epoch": 0.005943003904762616, "grad_norm": 1.6171875, "grad_norm_var": 0.06311893463134766, "learning_rate": 0.0001, "loss": 1.7434, "loss/crossentropy": 2.4858436584472656, "loss/fcd": 1.57421875, "loss/idx": 3.0, "loss/logits": 0.16918716579675674, "step": 398 }, { "epoch": 0.005957936075377597, "grad_norm": 2.65625, "grad_norm_var": 0.2911905924479167, "learning_rate": 0.0001, "loss": 2.05, "loss/crossentropy": 2.784856915473938, "loss/fcd": 1.83984375, "loss/idx": 3.0, "loss/logits": 0.21015368402004242, "step": 399 }, { "epoch": 0.005972868245992579, "grad_norm": 2.546875, "grad_norm_var": 0.4635538736979167, "learning_rate": 0.0001, "loss": 2.0473, "loss/crossentropy": 2.734723210334778, "loss/fcd": 1.84765625, "loss/idx": 3.0, "loss/logits": 0.19964434951543808, "step": 400 }, { "epoch": 0.00598780041660756, "grad_norm": 1.78125, "grad_norm_var": 0.4971394220987956, "learning_rate": 0.0001, "loss": 2.0588, "loss/crossentropy": 2.341898798942566, "loss/fcd": 1.86328125, "loss/idx": 3.0, "loss/logits": 0.1955610066652298, "step": 401 }, { "epoch": 0.006002732587222542, "grad_norm": 1.8046875, "grad_norm_var": 0.5220904032389323, "learning_rate": 0.0001, "loss": 2.0501, "loss/crossentropy": 2.6248419284820557, "loss/fcd": 1.828125, "loss/idx": 3.0, "loss/logits": 0.22197691351175308, "step": 402 }, { "epoch": 0.0060176647578375235, "grad_norm": 1.46875, "grad_norm_var": 0.5097544352213542, "learning_rate": 0.0001, "loss": 2.0025, "loss/crossentropy": 2.783551812171936, "loss/fcd": 1.79296875, "loss/idx": 3.0, "loss/logits": 0.2095436304807663, "step": 403 }, { "epoch": 0.006032596928452504, "grad_norm": 1.546875, "grad_norm_var": 0.513873036702474, "learning_rate": 0.0001, "loss": 1.9867, "loss/crossentropy": 2.6072347164154053, "loss/fcd": 1.80078125, "loss/idx": 3.0, "loss/logits": 0.1859557330608368, "step": 404 }, { "epoch": 0.006047529099067486, "grad_norm": 1.3046875, "grad_norm_var": 0.48995564778645834, "learning_rate": 0.0001, "loss": 1.9417, "loss/crossentropy": 2.627634286880493, "loss/fcd": 1.7578125, "loss/idx": 3.0, "loss/logits": 0.18390918523073196, "step": 405 }, { "epoch": 0.006062461269682467, "grad_norm": 2.5625, "grad_norm_var": 0.5599385579427083, "learning_rate": 0.0001, "loss": 2.6781, "loss/crossentropy": 2.818581461906433, "loss/fcd": 2.34765625, "loss/idx": 3.0, "loss/logits": 0.3304808735847473, "step": 406 }, { "epoch": 0.006077393440297449, "grad_norm": 1.5234375, "grad_norm_var": 0.5216115315755209, "learning_rate": 0.0001, "loss": 2.0978, "loss/crossentropy": 2.4685953855514526, "loss/fcd": 1.86328125, "loss/idx": 3.0, "loss/logits": 0.23454776406288147, "step": 407 }, { "epoch": 0.00609232561091243, "grad_norm": 1.3671875, "grad_norm_var": 0.4799781799316406, "learning_rate": 0.0001, "loss": 1.9471, "loss/crossentropy": 2.551527738571167, "loss/fcd": 1.765625, "loss/idx": 3.0, "loss/logits": 0.18149860948324203, "step": 408 }, { "epoch": 0.006107257781527412, "grad_norm": 1.4140625, "grad_norm_var": 0.43242568969726564, "learning_rate": 0.0001, "loss": 1.9248, "loss/crossentropy": 2.543463706970215, "loss/fcd": 1.75390625, "loss/idx": 3.0, "loss/logits": 0.17087971419095993, "step": 409 }, { "epoch": 0.006122189952142393, "grad_norm": 1.3046875, "grad_norm_var": 0.39546890258789064, "learning_rate": 0.0001, "loss": 1.9248, "loss/crossentropy": 2.753381848335266, "loss/fcd": 1.72265625, "loss/idx": 3.0, "loss/logits": 0.20210126042366028, "step": 410 }, { "epoch": 0.006137122122757375, "grad_norm": 1.2734375, "grad_norm_var": 0.342333730061849, "learning_rate": 0.0001, "loss": 1.9107, "loss/crossentropy": 2.6746675968170166, "loss/fcd": 1.71484375, "loss/idx": 3.0, "loss/logits": 0.19586054980754852, "step": 411 }, { "epoch": 0.0061520542933723565, "grad_norm": 1.0859375, "grad_norm_var": 0.29013055165608725, "learning_rate": 0.0001, "loss": 1.7843, "loss/crossentropy": 2.6001042127609253, "loss/fcd": 1.6171875, "loss/idx": 3.0, "loss/logits": 0.16708572953939438, "step": 412 }, { "epoch": 0.0061669864639873374, "grad_norm": 0.99609375, "grad_norm_var": 0.2674448013305664, "learning_rate": 0.0001, "loss": 1.6781, "loss/crossentropy": 2.6679431200027466, "loss/fcd": 1.5234375, "loss/idx": 3.0, "loss/logits": 0.15461371839046478, "step": 413 }, { "epoch": 0.006181918634602319, "grad_norm": 1.6015625, "grad_norm_var": 0.267509396870931, "learning_rate": 0.0001, "loss": 2.2249, "loss/crossentropy": 2.26759135723114, "loss/fcd": 1.953125, "loss/idx": 3.0, "loss/logits": 0.27176591753959656, "step": 414 }, { "epoch": 0.0061968508052173, "grad_norm": 1.1015625, "grad_norm_var": 0.2078927993774414, "learning_rate": 0.0001, "loss": 1.8247, "loss/crossentropy": 2.666002035140991, "loss/fcd": 1.640625, "loss/idx": 3.0, "loss/logits": 0.1840425804257393, "step": 415 }, { "epoch": 0.006211782975832282, "grad_norm": 1.078125, "grad_norm_var": 0.1460733413696289, "learning_rate": 0.0001, "loss": 1.7812, "loss/crossentropy": 2.704661011695862, "loss/fcd": 1.6015625, "loss/idx": 3.0, "loss/logits": 0.17964735627174377, "step": 416 }, { "epoch": 0.006226715146447263, "grad_norm": 1.1328125, "grad_norm_var": 0.1437936782836914, "learning_rate": 0.0001, "loss": 1.7362, "loss/crossentropy": 2.7326667308807373, "loss/fcd": 1.57421875, "loss/idx": 3.0, "loss/logits": 0.16196689009666443, "step": 417 }, { "epoch": 0.006241647317062245, "grad_norm": 1.09375, "grad_norm_var": 0.13800805409749348, "learning_rate": 0.0001, "loss": 1.7713, "loss/crossentropy": 2.713056445121765, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.1775575578212738, "step": 418 }, { "epoch": 0.006256579487677226, "grad_norm": 0.94921875, "grad_norm_var": 0.14775772094726564, "learning_rate": 0.0001, "loss": 1.6976, "loss/crossentropy": 2.5395586490631104, "loss/fcd": 1.546875, "loss/idx": 3.0, "loss/logits": 0.1507459655404091, "step": 419 }, { "epoch": 0.006271511658292208, "grad_norm": 1.015625, "grad_norm_var": 0.15028254191080728, "learning_rate": 0.0001, "loss": 1.9222, "loss/crossentropy": 2.7291557788848877, "loss/fcd": 1.7109375, "loss/idx": 3.0, "loss/logits": 0.21127325296401978, "step": 420 }, { "epoch": 0.0062864438289071895, "grad_norm": 1.203125, "grad_norm_var": 0.15086771647135416, "learning_rate": 0.0001, "loss": 1.8439, "loss/crossentropy": 2.6306716203689575, "loss/fcd": 1.66015625, "loss/idx": 3.0, "loss/logits": 0.18376273661851883, "step": 421 }, { "epoch": 0.0063013759995221704, "grad_norm": 1.0078125, "grad_norm_var": 0.038972727457682294, "learning_rate": 0.0001, "loss": 1.7753, "loss/crossentropy": 2.4223052263259888, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.1815304309129715, "step": 422 }, { "epoch": 0.006316308170137152, "grad_norm": 1.2265625, "grad_norm_var": 0.03155085245768229, "learning_rate": 0.0001, "loss": 2.0225, "loss/crossentropy": 2.6342397928237915, "loss/fcd": 1.7890625, "loss/idx": 3.0, "loss/logits": 0.23343171179294586, "step": 423 }, { "epoch": 0.006331240340752133, "grad_norm": 1.0078125, "grad_norm_var": 0.03056818644205729, "learning_rate": 0.0001, "loss": 1.7615, "loss/crossentropy": 2.7350512742996216, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.16770032793283463, "step": 424 }, { "epoch": 0.006346172511367115, "grad_norm": 1.4453125, "grad_norm_var": 0.031705474853515624, "learning_rate": 0.0001, "loss": 1.7408, "loss/crossentropy": 2.6920334100723267, "loss/fcd": 1.578125, "loss/idx": 3.0, "loss/logits": 0.1627093330025673, "step": 425 }, { "epoch": 0.006361104681982096, "grad_norm": 0.91796875, "grad_norm_var": 0.033474159240722653, "learning_rate": 0.0001, "loss": 1.795, "loss/crossentropy": 2.5012316703796387, "loss/fcd": 1.625, "loss/idx": 3.0, "loss/logits": 0.16997068375349045, "step": 426 }, { "epoch": 0.006376036852597078, "grad_norm": 1.1796875, "grad_norm_var": 0.032274818420410155, "learning_rate": 0.0001, "loss": 1.5979, "loss/crossentropy": 2.5879992246627808, "loss/fcd": 1.4609375, "loss/idx": 3.0, "loss/logits": 0.13699475675821304, "step": 427 }, { "epoch": 0.006390969023212059, "grad_norm": 1.375, "grad_norm_var": 0.03588809967041016, "learning_rate": 0.0001, "loss": 1.8766, "loss/crossentropy": 2.762297749519348, "loss/fcd": 1.6875, "loss/idx": 3.0, "loss/logits": 0.18910983949899673, "step": 428 }, { "epoch": 0.006405901193827041, "grad_norm": 0.98046875, "grad_norm_var": 0.03621514638264974, "learning_rate": 0.0001, "loss": 1.7401, "loss/crossentropy": 2.8022525310516357, "loss/fcd": 1.56640625, "loss/idx": 3.0, "loss/logits": 0.17373749613761902, "step": 429 }, { "epoch": 0.0064208333644420225, "grad_norm": 1.125, "grad_norm_var": 0.021384620666503908, "learning_rate": 0.0001, "loss": 2.0262, "loss/crossentropy": 2.3079699277877808, "loss/fcd": 1.828125, "loss/idx": 3.0, "loss/logits": 0.19811610877513885, "step": 430 }, { "epoch": 0.0064357655350570035, "grad_norm": 1.0703125, "grad_norm_var": 0.021501604715983072, "learning_rate": 0.0001, "loss": 1.8896, "loss/crossentropy": 2.638156533241272, "loss/fcd": 1.6796875, "loss/idx": 3.0, "loss/logits": 0.20995599031448364, "step": 431 }, { "epoch": 0.006450697705671985, "grad_norm": 1.0390625, "grad_norm_var": 0.02177880605061849, "learning_rate": 0.0001, "loss": 1.7512, "loss/crossentropy": 2.2521796226501465, "loss/fcd": 1.5859375, "loss/idx": 3.0, "loss/logits": 0.16525574773550034, "step": 432 }, { "epoch": 0.006465629876286966, "grad_norm": 1.0234375, "grad_norm_var": 0.022202491760253906, "learning_rate": 0.0001, "loss": 1.7417, "loss/crossentropy": 2.7757058143615723, "loss/fcd": 1.5703125, "loss/idx": 3.0, "loss/logits": 0.1713484227657318, "step": 433 }, { "epoch": 0.006480562046901948, "grad_norm": 1.3203125, "grad_norm_var": 0.02510827382405599, "learning_rate": 0.0001, "loss": 1.7516, "loss/crossentropy": 2.52429461479187, "loss/fcd": 1.58984375, "loss/idx": 3.0, "loss/logits": 0.16173189133405685, "step": 434 }, { "epoch": 0.006495494217516929, "grad_norm": 0.88671875, "grad_norm_var": 0.026758257548014322, "learning_rate": 0.0001, "loss": 1.778, "loss/crossentropy": 2.576547145843506, "loss/fcd": 1.5859375, "loss/idx": 3.0, "loss/logits": 0.19203433394432068, "step": 435 }, { "epoch": 0.006510426388131911, "grad_norm": 0.9296875, "grad_norm_var": 0.028347206115722657, "learning_rate": 0.0001, "loss": 1.8188, "loss/crossentropy": 2.698705315589905, "loss/fcd": 1.63671875, "loss/idx": 3.0, "loss/logits": 0.18205714225769043, "step": 436 }, { "epoch": 0.006525358558746892, "grad_norm": 1.03125, "grad_norm_var": 0.028028297424316406, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.384024500846863, "loss/fcd": 1.5, "loss/idx": 3.0, "loss/logits": 0.1624050736427307, "step": 437 }, { "epoch": 0.006540290729361874, "grad_norm": 0.85546875, "grad_norm_var": 0.031308746337890624, "learning_rate": 0.0001, "loss": 1.7476, "loss/crossentropy": 2.4632567167282104, "loss/fcd": 1.5859375, "loss/idx": 3.0, "loss/logits": 0.16169880330562592, "step": 438 }, { "epoch": 0.0065552228999768555, "grad_norm": 0.98828125, "grad_norm_var": 0.03046716054280599, "learning_rate": 0.0001, "loss": 1.7397, "loss/crossentropy": 2.397798180580139, "loss/fcd": 1.58984375, "loss/idx": 3.0, "loss/logits": 0.1498069018125534, "step": 439 }, { "epoch": 0.0065701550705918365, "grad_norm": 1.140625, "grad_norm_var": 0.030406634012858074, "learning_rate": 0.0001, "loss": 2.0347, "loss/crossentropy": 2.70195472240448, "loss/fcd": 1.7890625, "loss/idx": 3.0, "loss/logits": 0.2456425279378891, "step": 440 }, { "epoch": 0.006585087241206818, "grad_norm": 0.8671875, "grad_norm_var": 0.02327416737874349, "learning_rate": 0.0001, "loss": 1.7978, "loss/crossentropy": 2.505584955215454, "loss/fcd": 1.6171875, "loss/idx": 3.0, "loss/logits": 0.1805657595396042, "step": 441 }, { "epoch": 0.006600019411821799, "grad_norm": 0.9140625, "grad_norm_var": 0.023341623942057292, "learning_rate": 0.0001, "loss": 1.7628, "loss/crossentropy": 2.408290147781372, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.16904298961162567, "step": 442 }, { "epoch": 0.006614951582436781, "grad_norm": 0.9140625, "grad_norm_var": 0.022995758056640624, "learning_rate": 0.0001, "loss": 1.8013, "loss/crossentropy": 2.4535306692123413, "loss/fcd": 1.62109375, "loss/idx": 3.0, "loss/logits": 0.1801793947815895, "step": 443 }, { "epoch": 0.006629883753051762, "grad_norm": 0.93359375, "grad_norm_var": 0.014798418680826823, "learning_rate": 0.0001, "loss": 1.7924, "loss/crossentropy": 2.796666383743286, "loss/fcd": 1.60546875, "loss/idx": 3.0, "loss/logits": 0.18690849840641022, "step": 444 }, { "epoch": 0.006644815923666744, "grad_norm": 1.5859375, "grad_norm_var": 0.03603515625, "learning_rate": 0.0001, "loss": 1.8639, "loss/crossentropy": 2.8323148488998413, "loss/fcd": 1.671875, "loss/idx": 3.0, "loss/logits": 0.19198895245790482, "step": 445 }, { "epoch": 0.006659748094281725, "grad_norm": 1.125, "grad_norm_var": 0.03603515625, "learning_rate": 0.0001, "loss": 1.701, "loss/crossentropy": 2.536306142807007, "loss/fcd": 1.54296875, "loss/idx": 3.0, "loss/logits": 0.1580105796456337, "step": 446 }, { "epoch": 0.006674680264896707, "grad_norm": 0.859375, "grad_norm_var": 0.037937164306640625, "learning_rate": 0.0001, "loss": 1.6491, "loss/crossentropy": 2.4709479808807373, "loss/fcd": 1.4921875, "loss/idx": 3.0, "loss/logits": 0.15694593638181686, "step": 447 }, { "epoch": 0.0066896124355116885, "grad_norm": 1.03125, "grad_norm_var": 0.03792724609375, "learning_rate": 0.0001, "loss": 1.9007, "loss/crossentropy": 2.7610350847244263, "loss/fcd": 1.68359375, "loss/idx": 3.0, "loss/logits": 0.21714556217193604, "step": 448 }, { "epoch": 0.0067045446061266695, "grad_norm": 0.91796875, "grad_norm_var": 0.03864994049072266, "learning_rate": 0.0001, "loss": 1.6404, "loss/crossentropy": 2.662341594696045, "loss/fcd": 1.484375, "loss/idx": 3.0, "loss/logits": 0.15599173307418823, "step": 449 }, { "epoch": 0.006719476776741651, "grad_norm": 1.0703125, "grad_norm_var": 0.032505734761555986, "learning_rate": 0.0001, "loss": 1.6828, "loss/crossentropy": 2.77057945728302, "loss/fcd": 1.515625, "loss/idx": 3.0, "loss/logits": 0.16715113073587418, "step": 450 }, { "epoch": 0.006734408947356632, "grad_norm": 1.4921875, "grad_norm_var": 0.04601643880208333, "learning_rate": 0.0001, "loss": 1.7714, "loss/crossentropy": 2.659293055534363, "loss/fcd": 1.59375, "loss/idx": 3.0, "loss/logits": 0.17766769975423813, "step": 451 }, { "epoch": 0.006749341117971614, "grad_norm": 0.875, "grad_norm_var": 0.04701512654622396, "learning_rate": 0.0001, "loss": 1.602, "loss/crossentropy": 2.4067925214767456, "loss/fcd": 1.453125, "loss/idx": 3.0, "loss/logits": 0.14886727929115295, "step": 452 }, { "epoch": 0.006764273288586595, "grad_norm": 1.0703125, "grad_norm_var": 0.04707743326822917, "learning_rate": 0.0001, "loss": 1.8666, "loss/crossentropy": 2.4434362649917603, "loss/fcd": 1.6875, "loss/idx": 3.0, "loss/logits": 0.179054394364357, "step": 453 }, { "epoch": 0.006779205459201577, "grad_norm": 0.8359375, "grad_norm_var": 0.047581926981608076, "learning_rate": 0.0001, "loss": 1.6547, "loss/crossentropy": 2.508568048477173, "loss/fcd": 1.49609375, "loss/idx": 3.0, "loss/logits": 0.15855654329061508, "step": 454 }, { "epoch": 0.006794137629816558, "grad_norm": 0.8515625, "grad_norm_var": 0.04967142740885417, "learning_rate": 0.0001, "loss": 1.6702, "loss/crossentropy": 2.6750309467315674, "loss/fcd": 1.5078125, "loss/idx": 3.0, "loss/logits": 0.16242003440856934, "step": 455 }, { "epoch": 0.00680906980043154, "grad_norm": 0.7890625, "grad_norm_var": 0.052223459879557295, "learning_rate": 0.0001, "loss": 1.7029, "loss/crossentropy": 2.411786437034607, "loss/fcd": 1.53515625, "loss/idx": 3.0, "loss/logits": 0.16775980591773987, "step": 456 }, { "epoch": 0.0068240019710465215, "grad_norm": 1.109375, "grad_norm_var": 0.05133260091145833, "learning_rate": 0.0001, "loss": 1.7541, "loss/crossentropy": 2.7279776334762573, "loss/fcd": 1.58203125, "loss/idx": 3.0, "loss/logits": 0.17208966612815857, "step": 457 }, { "epoch": 0.0068389341416615025, "grad_norm": 0.86328125, "grad_norm_var": 0.05223433176676432, "learning_rate": 0.0001, "loss": 1.8433, "loss/crossentropy": 2.8027533292770386, "loss/fcd": 1.64453125, "loss/idx": 3.0, "loss/logits": 0.19876495003700256, "step": 458 }, { "epoch": 0.006853866312276484, "grad_norm": 0.87890625, "grad_norm_var": 0.05280939737955729, "learning_rate": 0.0001, "loss": 1.7351, "loss/crossentropy": 2.3620423078536987, "loss/fcd": 1.55859375, "loss/idx": 3.0, "loss/logits": 0.176472008228302, "step": 459 }, { "epoch": 0.006868798482891465, "grad_norm": 0.85546875, "grad_norm_var": 0.05407079060872396, "learning_rate": 0.0001, "loss": 1.7763, "loss/crossentropy": 2.96125864982605, "loss/fcd": 1.58984375, "loss/idx": 3.0, "loss/logits": 0.1864861696958542, "step": 460 }, { "epoch": 0.006883730653506447, "grad_norm": 0.53125, "grad_norm_var": 0.043050130208333336, "learning_rate": 0.0001, "loss": 1.5814, "loss/crossentropy": 2.5584583282470703, "loss/fcd": 1.43359375, "loss/idx": 3.5, "loss/logits": 0.14783349633216858, "step": 461 }, { "epoch": 0.006898662824121428, "grad_norm": 0.75390625, "grad_norm_var": 0.042862892150878906, "learning_rate": 0.0001, "loss": 1.8894, "loss/crossentropy": 2.4608118534088135, "loss/fcd": 1.66015625, "loss/idx": 3.5, "loss/logits": 0.22920453548431396, "step": 462 }, { "epoch": 0.00691359499473641, "grad_norm": 0.62890625, "grad_norm_var": 0.04817072550455729, "learning_rate": 0.0001, "loss": 1.5976, "loss/crossentropy": 2.7459927797317505, "loss/fcd": 1.4296875, "loss/idx": 3.5, "loss/logits": 0.16794496029615402, "step": 463 }, { "epoch": 0.006928527165351392, "grad_norm": 0.5625, "grad_norm_var": 0.05430475870768229, "learning_rate": 0.0001, "loss": 1.6139, "loss/crossentropy": 2.567529797554016, "loss/fcd": 1.44140625, "loss/idx": 3.5, "loss/logits": 0.1725279539823532, "step": 464 }, { "epoch": 0.006943459335966373, "grad_norm": 0.63671875, "grad_norm_var": 0.057838694254557295, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.7352017164230347, "loss/fcd": 1.44921875, "loss/idx": 3.5, "loss/logits": 0.16529685258865356, "step": 465 }, { "epoch": 0.0069583915065813545, "grad_norm": 0.55859375, "grad_norm_var": 0.060045814514160155, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.4850025177001953, "loss/fcd": 1.4375, "loss/idx": 3.5, "loss/logits": 0.17964298278093338, "step": 466 }, { "epoch": 0.0069733236771963355, "grad_norm": 1.09375, "grad_norm_var": 0.034832191467285153, "learning_rate": 0.0001, "loss": 2.1071, "loss/crossentropy": 2.3254082202911377, "loss/fcd": 1.8828125, "loss/idx": 3.5, "loss/logits": 0.22428961843252182, "step": 467 }, { "epoch": 0.006988255847811317, "grad_norm": 0.57421875, "grad_norm_var": 0.03771565755208333, "learning_rate": 0.0001, "loss": 1.7908, "loss/crossentropy": 2.8251163959503174, "loss/fcd": 1.59375, "loss/idx": 3.5, "loss/logits": 0.19702401757240295, "step": 468 }, { "epoch": 0.007003188018426298, "grad_norm": 0.7421875, "grad_norm_var": 0.03205464680989583, "learning_rate": 0.0001, "loss": 1.6263, "loss/crossentropy": 2.5575175285339355, "loss/fcd": 1.47265625, "loss/idx": 3.5, "loss/logits": 0.15361517667770386, "step": 469 }, { "epoch": 0.00701812018904128, "grad_norm": 0.7890625, "grad_norm_var": 0.031758626302083336, "learning_rate": 0.0001, "loss": 2.1285, "loss/crossentropy": 2.5669792890548706, "loss/fcd": 1.8671875, "loss/idx": 3.5, "loss/logits": 0.26132843643426895, "step": 470 }, { "epoch": 0.007033052359656261, "grad_norm": 0.66796875, "grad_norm_var": 0.03171380360921224, "learning_rate": 0.0001, "loss": 1.6446, "loss/crossentropy": 2.677261710166931, "loss/fcd": 1.46875, "loss/idx": 3.5, "loss/logits": 0.17586223781108856, "step": 471 }, { "epoch": 0.007047984530271243, "grad_norm": 0.59375, "grad_norm_var": 0.03313795725504557, "learning_rate": 0.0001, "loss": 1.703, "loss/crossentropy": 2.6583694219589233, "loss/fcd": 1.515625, "loss/idx": 3.5, "loss/logits": 0.1873987913131714, "step": 472 }, { "epoch": 0.007062916700886225, "grad_norm": 0.59765625, "grad_norm_var": 0.0243011474609375, "learning_rate": 0.0001, "loss": 1.7603, "loss/crossentropy": 2.4877501726150513, "loss/fcd": 1.5703125, "loss/idx": 3.5, "loss/logits": 0.19003190845251083, "step": 473 }, { "epoch": 0.007077848871501206, "grad_norm": 0.66015625, "grad_norm_var": 0.022674560546875, "learning_rate": 0.0001, "loss": 1.589, "loss/crossentropy": 2.693013906478882, "loss/fcd": 1.421875, "loss/idx": 3.5, "loss/logits": 0.16710253804922104, "step": 474 }, { "epoch": 0.0070927810421161875, "grad_norm": 0.703125, "grad_norm_var": 0.020302772521972656, "learning_rate": 0.0001, "loss": 1.7601, "loss/crossentropy": 2.572099208831787, "loss/fcd": 1.5703125, "loss/idx": 3.5, "loss/logits": 0.18978480249643326, "step": 475 }, { "epoch": 0.0071077132127311685, "grad_norm": 0.50390625, "grad_norm_var": 0.020005226135253906, "learning_rate": 0.0001, "loss": 1.6699, "loss/crossentropy": 2.449280023574829, "loss/fcd": 1.49609375, "loss/idx": 3.5, "loss/logits": 0.17377550154924393, "step": 476 }, { "epoch": 0.00712264538334615, "grad_norm": 0.59375, "grad_norm_var": 0.019156837463378908, "learning_rate": 0.0001, "loss": 1.5723, "loss/crossentropy": 2.59575355052948, "loss/fcd": 1.41015625, "loss/idx": 3.5, "loss/logits": 0.1621238887310028, "step": 477 }, { "epoch": 0.007137577553961131, "grad_norm": 0.8203125, "grad_norm_var": 0.02020848592122396, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.577454090118408, "loss/fcd": 1.47265625, "loss/idx": 3.5, "loss/logits": 0.1692582219839096, "step": 478 }, { "epoch": 0.007152509724576113, "grad_norm": 0.52734375, "grad_norm_var": 0.021415201822916667, "learning_rate": 0.0001, "loss": 1.6061, "loss/crossentropy": 2.471789002418518, "loss/fcd": 1.45703125, "loss/idx": 3.5, "loss/logits": 0.14902313798666, "step": 479 }, { "epoch": 0.007167441895191094, "grad_norm": 0.53515625, "grad_norm_var": 0.02183221181233724, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.617826461791992, "loss/fcd": 1.35546875, "loss/idx": 3.5, "loss/logits": 0.14270956814289093, "step": 480 }, { "epoch": 0.007182374065806076, "grad_norm": 0.484375, "grad_norm_var": 0.02380345662434896, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.6413131952285767, "loss/fcd": 1.421875, "loss/idx": 3.5, "loss/logits": 0.16986460238695145, "step": 481 }, { "epoch": 0.007197306236421058, "grad_norm": 0.7578125, "grad_norm_var": 0.02378075917561849, "learning_rate": 0.0001, "loss": 1.662, "loss/crossentropy": 2.6517175436019897, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.18155179917812347, "step": 482 }, { "epoch": 0.007212238407036039, "grad_norm": 0.55078125, "grad_norm_var": 0.011187489827473958, "learning_rate": 0.0001, "loss": 1.656, "loss/crossentropy": 2.555253267288208, "loss/fcd": 1.47265625, "loss/idx": 3.5, "loss/logits": 0.1832970231771469, "step": 483 }, { "epoch": 0.0072271705776510205, "grad_norm": 0.57421875, "grad_norm_var": 0.011187489827473958, "learning_rate": 0.0001, "loss": 1.4994, "loss/crossentropy": 2.5745534896850586, "loss/fcd": 1.359375, "loss/idx": 3.5, "loss/logits": 0.1399783045053482, "step": 484 }, { "epoch": 0.0072421027482660015, "grad_norm": 0.625, "grad_norm_var": 0.010313924153645833, "learning_rate": 0.0001, "loss": 1.7049, "loss/crossentropy": 2.673888325691223, "loss/fcd": 1.52734375, "loss/idx": 3.5, "loss/logits": 0.1775348037481308, "step": 485 }, { "epoch": 0.007257034918880983, "grad_norm": 0.5625, "grad_norm_var": 0.008536529541015626, "learning_rate": 0.0001, "loss": 1.4905, "loss/crossentropy": 2.710415840148926, "loss/fcd": 1.3515625, "loss/idx": 3.5, "loss/logits": 0.13889621198177338, "step": 486 }, { "epoch": 0.007271967089495964, "grad_norm": 0.5234375, "grad_norm_var": 0.008722368876139324, "learning_rate": 0.0001, "loss": 1.6983, "loss/crossentropy": 2.6029082536697388, "loss/fcd": 1.50390625, "loss/idx": 3.5, "loss/logits": 0.19442546367645264, "step": 487 }, { "epoch": 0.007286899260110946, "grad_norm": 1.015625, "grad_norm_var": 0.019447771708170573, "learning_rate": 0.0001, "loss": 1.8318, "loss/crossentropy": 2.562933087348938, "loss/fcd": 1.66015625, "loss/idx": 3.5, "loss/logits": 0.17163604497909546, "step": 488 }, { "epoch": 0.007301831430725927, "grad_norm": 0.71484375, "grad_norm_var": 0.019844500223795573, "learning_rate": 0.0001, "loss": 1.9322, "loss/crossentropy": 2.3318947553634644, "loss/fcd": 1.70703125, "loss/idx": 3.5, "loss/logits": 0.22517724335193634, "step": 489 }, { "epoch": 0.007316763601340909, "grad_norm": 0.66796875, "grad_norm_var": 0.019875017801920573, "learning_rate": 0.0001, "loss": 1.6361, "loss/crossentropy": 2.6366230249404907, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.1556784212589264, "step": 490 }, { "epoch": 0.007331695771955891, "grad_norm": 0.6328125, "grad_norm_var": 0.01954542795817057, "learning_rate": 0.0001, "loss": 1.6544, "loss/crossentropy": 2.5596346855163574, "loss/fcd": 1.484375, "loss/idx": 3.5, "loss/logits": 0.17000765353441238, "step": 491 }, { "epoch": 0.007346627942570872, "grad_norm": 0.62890625, "grad_norm_var": 0.018410174051920573, "learning_rate": 0.0001, "loss": 1.666, "loss/crossentropy": 2.6965473890304565, "loss/fcd": 1.49609375, "loss/idx": 3.5, "loss/logits": 0.16991965472698212, "step": 492 }, { "epoch": 0.0073615601131858536, "grad_norm": 0.55859375, "grad_norm_var": 0.01869684855143229, "learning_rate": 0.0001, "loss": 1.6618, "loss/crossentropy": 2.6663613319396973, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.18130898475646973, "step": 493 }, { "epoch": 0.0073764922838008345, "grad_norm": 0.57421875, "grad_norm_var": 0.016441790262858073, "learning_rate": 0.0001, "loss": 1.5194, "loss/crossentropy": 2.4370299577713013, "loss/fcd": 1.3671875, "loss/idx": 3.5, "loss/logits": 0.1522502452135086, "step": 494 }, { "epoch": 0.007391424454415816, "grad_norm": 0.54296875, "grad_norm_var": 0.016262245178222657, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.5481144189834595, "loss/fcd": 1.4296875, "loss/idx": 3.5, "loss/logits": 0.1577276885509491, "step": 495 }, { "epoch": 0.007406356625030797, "grad_norm": 0.51953125, "grad_norm_var": 0.01645806630452474, "learning_rate": 0.0001, "loss": 1.6746, "loss/crossentropy": 2.5940628051757812, "loss/fcd": 1.48046875, "loss/idx": 3.5, "loss/logits": 0.19416546076536179, "step": 496 }, { "epoch": 0.007421288795645779, "grad_norm": 0.515625, "grad_norm_var": 0.015950457255045573, "learning_rate": 0.0001, "loss": 1.5521, "loss/crossentropy": 2.6910064220428467, "loss/fcd": 1.390625, "loss/idx": 3.5, "loss/logits": 0.16150504350662231, "step": 497 }, { "epoch": 0.00743622096626076, "grad_norm": 0.51171875, "grad_norm_var": 0.015305582682291667, "learning_rate": 0.0001, "loss": 1.6799, "loss/crossentropy": 2.6522045135498047, "loss/fcd": 1.4921875, "loss/idx": 3.5, "loss/logits": 0.18767046928405762, "step": 498 }, { "epoch": 0.007451153136875742, "grad_norm": 0.53515625, "grad_norm_var": 0.0154388427734375, "learning_rate": 0.0001, "loss": 1.6198, "loss/crossentropy": 2.6188907623291016, "loss/fcd": 1.453125, "loss/idx": 3.5, "loss/logits": 0.1666594073176384, "step": 499 }, { "epoch": 0.007466085307490724, "grad_norm": 0.58203125, "grad_norm_var": 0.015409088134765625, "learning_rate": 0.0001, "loss": 1.7251, "loss/crossentropy": 2.6162188053131104, "loss/fcd": 1.5390625, "loss/idx": 3.5, "loss/logits": 0.1860610470175743, "step": 500 }, { "epoch": 0.007481017478105705, "grad_norm": 1.7890625, "grad_norm_var": 0.10290323893229167, "learning_rate": 0.0001, "loss": 1.859, "loss/crossentropy": 2.9810004234313965, "loss/fcd": 1.671875, "loss/idx": 4.0, "loss/logits": 0.1871199607849121, "step": 501 }, { "epoch": 0.0074959496487206866, "grad_norm": 3.328125, "grad_norm_var": 0.5377329508463542, "learning_rate": 0.0001, "loss": 1.8599, "loss/crossentropy": 2.6053121089935303, "loss/fcd": 1.67578125, "loss/idx": 4.0, "loss/logits": 0.18407516926527023, "step": 502 }, { "epoch": 0.0075108818193356675, "grad_norm": 2.84375, "grad_norm_var": 0.7724077860514323, "learning_rate": 0.0001, "loss": 1.8754, "loss/crossentropy": 2.6646742820739746, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.18401946127414703, "step": 503 }, { "epoch": 0.007525813989950649, "grad_norm": 2.6875, "grad_norm_var": 0.9511329650878906, "learning_rate": 0.0001, "loss": 1.9323, "loss/crossentropy": 2.666251301765442, "loss/fcd": 1.734375, "loss/idx": 4.0, "loss/logits": 0.1978878602385521, "step": 504 }, { "epoch": 0.00754074616056563, "grad_norm": 2.671875, "grad_norm_var": 1.0894693374633788, "learning_rate": 0.0001, "loss": 1.961, "loss/crossentropy": 2.8120644092559814, "loss/fcd": 1.76171875, "loss/idx": 4.0, "loss/logits": 0.19924385100603104, "step": 505 }, { "epoch": 0.007555678331180612, "grad_norm": 2.421875, "grad_norm_var": 1.1516153971354166, "learning_rate": 0.0001, "loss": 1.9617, "loss/crossentropy": 2.4772077798843384, "loss/fcd": 1.7578125, "loss/idx": 4.0, "loss/logits": 0.20385053008794785, "step": 506 }, { "epoch": 0.007570610501795593, "grad_norm": 2.234375, "grad_norm_var": 1.1621986389160157, "learning_rate": 0.0001, "loss": 2.0677, "loss/crossentropy": 2.6911627054214478, "loss/fcd": 1.83984375, "loss/idx": 4.0, "loss/logits": 0.2278064265847206, "step": 507 }, { "epoch": 0.007585542672410575, "grad_norm": 2.0625, "grad_norm_var": 1.136742083231608, "learning_rate": 0.0001, "loss": 1.9551, "loss/crossentropy": 2.6768856048583984, "loss/fcd": 1.7578125, "loss/idx": 4.0, "loss/logits": 0.19726867973804474, "step": 508 }, { "epoch": 0.007600474843025557, "grad_norm": 1.6953125, "grad_norm_var": 1.0712290445963542, "learning_rate": 0.0001, "loss": 1.9038, "loss/crossentropy": 2.9186242818832397, "loss/fcd": 1.7109375, "loss/idx": 4.0, "loss/logits": 0.19286338984966278, "step": 509 }, { "epoch": 0.007615407013640538, "grad_norm": 1.5078125, "grad_norm_var": 0.9986719131469727, "learning_rate": 0.0001, "loss": 1.9477, "loss/crossentropy": 2.547776460647583, "loss/fcd": 1.74609375, "loss/idx": 4.0, "loss/logits": 0.20161531120538712, "step": 510 }, { "epoch": 0.00763033918425552, "grad_norm": 1.3125, "grad_norm_var": 0.9217814127604167, "learning_rate": 0.0001, "loss": 1.8293, "loss/crossentropy": 2.7642080783843994, "loss/fcd": 1.64453125, "loss/idx": 4.0, "loss/logits": 0.1847827509045601, "step": 511 }, { "epoch": 0.0076452713548705005, "grad_norm": 1.5, "grad_norm_var": 0.8273889541625976, "learning_rate": 0.0001, "loss": 1.922, "loss/crossentropy": 2.5652897357940674, "loss/fcd": 1.72265625, "loss/idx": 4.0, "loss/logits": 0.19934390485286713, "step": 512 }, { "epoch": 0.007660203525485482, "grad_norm": 1.28125, "grad_norm_var": 0.7367451349894206, "learning_rate": 0.0001, "loss": 2.081, "loss/crossentropy": 2.3452470302581787, "loss/fcd": 1.84375, "loss/idx": 4.0, "loss/logits": 0.23722243309020996, "step": 513 }, { "epoch": 0.007675135696100463, "grad_norm": 1.7890625, "grad_norm_var": 0.6175554911295573, "learning_rate": 0.0001, "loss": 2.1585, "loss/crossentropy": 2.6333130598068237, "loss/fcd": 1.9453125, "loss/idx": 4.0, "loss/logits": 0.21314261853694916, "step": 514 }, { "epoch": 0.007690067866715445, "grad_norm": 1.203125, "grad_norm_var": 0.5247639338175456, "learning_rate": 0.0001, "loss": 2.0899, "loss/crossentropy": 2.723830461502075, "loss/fcd": 1.84765625, "loss/idx": 4.0, "loss/logits": 0.24222549051046371, "step": 515 }, { "epoch": 0.007705000037330426, "grad_norm": 1.859375, "grad_norm_var": 0.3968424479166667, "learning_rate": 0.0001, "loss": 2.0611, "loss/crossentropy": 2.3409069776535034, "loss/fcd": 1.828125, "loss/idx": 4.0, "loss/logits": 0.23299187421798706, "step": 516 }, { "epoch": 0.007719932207945408, "grad_norm": 1.0625, "grad_norm_var": 0.4514055887858073, "learning_rate": 0.0001, "loss": 1.7604, "loss/crossentropy": 2.5428874492645264, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.16276978701353073, "step": 517 }, { "epoch": 0.00773486437856039, "grad_norm": 0.94921875, "grad_norm_var": 0.37315413157145183, "learning_rate": 0.0001, "loss": 2.0376, "loss/crossentropy": 2.533219337463379, "loss/fcd": 1.80078125, "loss/idx": 4.0, "loss/logits": 0.2368319183588028, "step": 518 }, { "epoch": 0.007749796549175371, "grad_norm": 0.96875, "grad_norm_var": 0.33634993235270183, "learning_rate": 0.0001, "loss": 1.8927, "loss/crossentropy": 2.352767586708069, "loss/fcd": 1.70703125, "loss/idx": 4.0, "loss/logits": 0.18567398935556412, "step": 519 }, { "epoch": 0.007764728719790353, "grad_norm": 1.1171875, "grad_norm_var": 0.28380171457926434, "learning_rate": 0.0001, "loss": 2.0012, "loss/crossentropy": 2.597265362739563, "loss/fcd": 1.76953125, "loss/idx": 4.0, "loss/logits": 0.23170067369937897, "step": 520 }, { "epoch": 0.0077796608904053335, "grad_norm": 1.125, "grad_norm_var": 0.21275221506754557, "learning_rate": 0.0001, "loss": 1.7805, "loss/crossentropy": 2.6241856813430786, "loss/fcd": 1.60546875, "loss/idx": 4.0, "loss/logits": 0.17502596974372864, "step": 521 }, { "epoch": 0.007794593061020315, "grad_norm": 1.203125, "grad_norm_var": 0.15669447580973309, "learning_rate": 0.0001, "loss": 1.915, "loss/crossentropy": 2.6768182516098022, "loss/fcd": 1.7109375, "loss/idx": 4.0, "loss/logits": 0.2040882408618927, "step": 522 }, { "epoch": 0.007809525231635296, "grad_norm": 1.3671875, "grad_norm_var": 0.11062513987223307, "learning_rate": 0.0001, "loss": 1.7797, "loss/crossentropy": 2.5710521936416626, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.16644778847694397, "step": 523 }, { "epoch": 0.007824457402250278, "grad_norm": 1.0859375, "grad_norm_var": 0.08074334462483725, "learning_rate": 0.0001, "loss": 2.0208, "loss/crossentropy": 2.3945902585983276, "loss/fcd": 1.78515625, "loss/idx": 4.0, "loss/logits": 0.23565331101417542, "step": 524 }, { "epoch": 0.007839389572865259, "grad_norm": 0.921875, "grad_norm_var": 0.07882989247639974, "learning_rate": 0.0001, "loss": 1.6759, "loss/crossentropy": 2.6742191314697266, "loss/fcd": 1.51953125, "loss/idx": 4.0, "loss/logits": 0.15633848309516907, "step": 525 }, { "epoch": 0.007854321743480242, "grad_norm": 1.0078125, "grad_norm_var": 0.07832533518473307, "learning_rate": 0.0001, "loss": 1.7404, "loss/crossentropy": 2.4373925924301147, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18178628385066986, "step": 526 }, { "epoch": 0.007869253914095223, "grad_norm": 1.140625, "grad_norm_var": 0.07838687896728516, "learning_rate": 0.0001, "loss": 1.8653, "loss/crossentropy": 2.4041073322296143, "loss/fcd": 1.66796875, "loss/idx": 4.0, "loss/logits": 0.19732805341482162, "step": 527 }, { "epoch": 0.007884186084710204, "grad_norm": 1.0390625, "grad_norm_var": 0.0746957778930664, "learning_rate": 0.0001, "loss": 1.8203, "loss/crossentropy": 2.6815097332000732, "loss/fcd": 1.63671875, "loss/idx": 4.0, "loss/logits": 0.18355443328619003, "step": 528 }, { "epoch": 0.007899118255325185, "grad_norm": 2.0, "grad_norm_var": 0.11524244944254557, "learning_rate": 0.0001, "loss": 2.093, "loss/crossentropy": 2.7026476860046387, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.40154484659433365, "step": 529 }, { "epoch": 0.007914050425940167, "grad_norm": 0.88671875, "grad_norm_var": 0.10007120768229166, "learning_rate": 0.0001, "loss": 1.78, "loss/crossentropy": 2.686976909637451, "loss/fcd": 1.59375, "loss/idx": 4.0, "loss/logits": 0.18629659712314606, "step": 530 }, { "epoch": 0.007928982596555148, "grad_norm": 1.1796875, "grad_norm_var": 0.10004450480143229, "learning_rate": 0.0001, "loss": 1.8566, "loss/crossentropy": 2.3627219200134277, "loss/fcd": 1.671875, "loss/idx": 4.0, "loss/logits": 0.1846960484981537, "step": 531 }, { "epoch": 0.00794391476717013, "grad_norm": 1.0234375, "grad_norm_var": 0.0682342529296875, "learning_rate": 0.0001, "loss": 1.9221, "loss/crossentropy": 2.5744149684906006, "loss/fcd": 1.703125, "loss/idx": 4.0, "loss/logits": 0.21895581483840942, "step": 532 }, { "epoch": 0.00795884693778511, "grad_norm": 0.84375, "grad_norm_var": 0.0731903076171875, "learning_rate": 0.0001, "loss": 1.6618, "loss/crossentropy": 2.7105607986450195, "loss/fcd": 1.5078125, "loss/idx": 4.0, "loss/logits": 0.15397943556308746, "step": 533 }, { "epoch": 0.007973779108400093, "grad_norm": 1.21875, "grad_norm_var": 0.07172946929931641, "learning_rate": 0.0001, "loss": 1.8617, "loss/crossentropy": 2.6265203952789307, "loss/fcd": 1.6875, "loss/idx": 4.0, "loss/logits": 0.17423325031995773, "step": 534 }, { "epoch": 0.007988711279015074, "grad_norm": 0.8203125, "grad_norm_var": 0.07635847727457683, "learning_rate": 0.0001, "loss": 1.7946, "loss/crossentropy": 2.5890642404556274, "loss/fcd": 1.6015625, "loss/idx": 4.0, "loss/logits": 0.1930510774254799, "step": 535 }, { "epoch": 0.008003643449630055, "grad_norm": 1.2578125, "grad_norm_var": 0.07747084299723307, "learning_rate": 0.0001, "loss": 1.91, "loss/crossentropy": 2.8161873817443848, "loss/fcd": 1.69921875, "loss/idx": 4.0, "loss/logits": 0.21079972386360168, "step": 536 }, { "epoch": 0.008018575620245038, "grad_norm": 0.85546875, "grad_norm_var": 0.08228327433268229, "learning_rate": 0.0001, "loss": 1.7658, "loss/crossentropy": 2.6365318298339844, "loss/fcd": 1.57421875, "loss/idx": 4.0, "loss/logits": 0.19160092622041702, "step": 537 }, { "epoch": 0.008033507790860019, "grad_norm": 0.8515625, "grad_norm_var": 0.08591105143229166, "learning_rate": 0.0001, "loss": 1.7883, "loss/crossentropy": 2.6011857986450195, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.19066426903009415, "step": 538 }, { "epoch": 0.008048439961475, "grad_norm": 0.87109375, "grad_norm_var": 0.08320611317952474, "learning_rate": 0.0001, "loss": 1.8642, "loss/crossentropy": 2.594154477119446, "loss/fcd": 1.6640625, "loss/idx": 4.0, "loss/logits": 0.20009979605674744, "step": 539 }, { "epoch": 0.00806337213208998, "grad_norm": 0.96875, "grad_norm_var": 0.08370202382405599, "learning_rate": 0.0001, "loss": 2.02, "loss/crossentropy": 2.350104331970215, "loss/fcd": 1.80078125, "loss/idx": 4.0, "loss/logits": 0.21917122602462769, "step": 540 }, { "epoch": 0.008078304302704963, "grad_norm": 0.703125, "grad_norm_var": 0.09058780670166015, "learning_rate": 0.0001, "loss": 1.6337, "loss/crossentropy": 2.5559120178222656, "loss/fcd": 1.48046875, "loss/idx": 4.0, "loss/logits": 0.1532716527581215, "step": 541 }, { "epoch": 0.008093236473319944, "grad_norm": 0.8359375, "grad_norm_var": 0.09321180979410808, "learning_rate": 0.0001, "loss": 1.8395, "loss/crossentropy": 2.4848121404647827, "loss/fcd": 1.6484375, "loss/idx": 4.0, "loss/logits": 0.1910661906003952, "step": 542 }, { "epoch": 0.008108168643934925, "grad_norm": 0.83984375, "grad_norm_var": 0.09446996053059896, "learning_rate": 0.0001, "loss": 1.7239, "loss/crossentropy": 2.6207433938980103, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.17703650146722794, "step": 543 }, { "epoch": 0.008123100814549908, "grad_norm": 0.91015625, "grad_norm_var": 0.09504693349202474, "learning_rate": 0.0001, "loss": 1.6846, "loss/crossentropy": 2.741000533103943, "loss/fcd": 1.51953125, "loss/idx": 4.0, "loss/logits": 0.1650281846523285, "step": 544 }, { "epoch": 0.008138032985164889, "grad_norm": 0.8359375, "grad_norm_var": 0.025172869364420574, "learning_rate": 0.0001, "loss": 1.8462, "loss/crossentropy": 2.739645481109619, "loss/fcd": 1.6328125, "loss/idx": 4.0, "loss/logits": 0.21342986822128296, "step": 545 }, { "epoch": 0.00815296515577987, "grad_norm": 1.484375, "grad_norm_var": 0.04393717447916667, "learning_rate": 0.0001, "loss": 2.2019, "loss/crossentropy": 2.5596991777420044, "loss/fcd": 1.93359375, "loss/idx": 4.0, "loss/logits": 0.2683301120996475, "step": 546 }, { "epoch": 0.00816789732639485, "grad_norm": 0.89453125, "grad_norm_var": 0.040999285380045575, "learning_rate": 0.0001, "loss": 2.1296, "loss/crossentropy": 2.631078362464905, "loss/fcd": 1.87890625, "loss/idx": 4.0, "loss/logits": 0.25071533769369125, "step": 547 }, { "epoch": 0.008182829497009833, "grad_norm": 0.98046875, "grad_norm_var": 0.04069925944010417, "learning_rate": 0.0001, "loss": 1.766, "loss/crossentropy": 2.6593146324157715, "loss/fcd": 1.5859375, "loss/idx": 4.0, "loss/logits": 0.18003150820732117, "step": 548 }, { "epoch": 0.008197761667624814, "grad_norm": 0.90625, "grad_norm_var": 0.0400726318359375, "learning_rate": 0.0001, "loss": 1.6572, "loss/crossentropy": 2.527645230293274, "loss/fcd": 1.5, "loss/idx": 4.0, "loss/logits": 0.1571880429983139, "step": 549 }, { "epoch": 0.008212693838239795, "grad_norm": 0.8359375, "grad_norm_var": 0.035623931884765626, "learning_rate": 0.0001, "loss": 1.7319, "loss/crossentropy": 2.492905378341675, "loss/fcd": 1.55078125, "loss/idx": 4.0, "loss/logits": 0.18110989034175873, "step": 550 }, { "epoch": 0.008227626008854776, "grad_norm": 0.95703125, "grad_norm_var": 0.034825070699055986, "learning_rate": 0.0001, "loss": 1.6543, "loss/crossentropy": 2.4883817434310913, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.14261526614427567, "step": 551 }, { "epoch": 0.008242558179469759, "grad_norm": 1.171875, "grad_norm_var": 0.03160800933837891, "learning_rate": 0.0001, "loss": 2.1634, "loss/crossentropy": 2.388165831565857, "loss/fcd": 1.89453125, "loss/idx": 4.0, "loss/logits": 0.2688213065266609, "step": 552 }, { "epoch": 0.00825749035008474, "grad_norm": 0.9453125, "grad_norm_var": 0.031202952067057293, "learning_rate": 0.0001, "loss": 1.5087, "loss/crossentropy": 2.4662253856658936, "loss/fcd": 1.37109375, "loss/idx": 4.0, "loss/logits": 0.13757767528295517, "step": 553 }, { "epoch": 0.008272422520699721, "grad_norm": 0.8671875, "grad_norm_var": 0.031040191650390625, "learning_rate": 0.0001, "loss": 1.7147, "loss/crossentropy": 2.6518259048461914, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.16782306134700775, "step": 554 }, { "epoch": 0.008287354691314704, "grad_norm": 0.953125, "grad_norm_var": 0.030729103088378906, "learning_rate": 0.0001, "loss": 1.7046, "loss/crossentropy": 2.50111985206604, "loss/fcd": 1.53515625, "loss/idx": 4.0, "loss/logits": 0.1694774478673935, "step": 555 }, { "epoch": 0.008302286861929685, "grad_norm": 0.87890625, "grad_norm_var": 0.030926513671875, "learning_rate": 0.0001, "loss": 1.8382, "loss/crossentropy": 2.4261611700057983, "loss/fcd": 1.6328125, "loss/idx": 4.0, "loss/logits": 0.20543111115694046, "step": 556 }, { "epoch": 0.008317219032544666, "grad_norm": 1.71875, "grad_norm_var": 0.0636566162109375, "learning_rate": 0.0001, "loss": 2.1674, "loss/crossentropy": 2.5285680294036865, "loss/fcd": 1.9140625, "loss/idx": 4.0, "loss/logits": 0.25333091616630554, "step": 557 }, { "epoch": 0.008332151203159647, "grad_norm": 0.8046875, "grad_norm_var": 0.06440531412760417, "learning_rate": 0.0001, "loss": 1.8293, "loss/crossentropy": 2.547691226005554, "loss/fcd": 1.62890625, "loss/idx": 4.0, "loss/logits": 0.2003782019019127, "step": 558 }, { "epoch": 0.00834708337377463, "grad_norm": 1.015625, "grad_norm_var": 0.06260573069254557, "learning_rate": 0.0001, "loss": 1.7806, "loss/crossentropy": 2.4365475177764893, "loss/fcd": 1.60546875, "loss/idx": 4.0, "loss/logits": 0.1750938817858696, "step": 559 }, { "epoch": 0.00836201554438961, "grad_norm": 1.0234375, "grad_norm_var": 0.061899566650390626, "learning_rate": 0.0001, "loss": 2.228, "loss/crossentropy": 2.2742353677749634, "loss/fcd": 1.96484375, "loss/idx": 4.0, "loss/logits": 0.263118177652359, "step": 560 }, { "epoch": 0.008376947715004591, "grad_norm": 0.8671875, "grad_norm_var": 0.06120580037434896, "learning_rate": 0.0001, "loss": 1.9159, "loss/crossentropy": 2.607572078704834, "loss/fcd": 1.69921875, "loss/idx": 4.0, "loss/logits": 0.2166425883769989, "step": 561 }, { "epoch": 0.008391879885619574, "grad_norm": 0.96484375, "grad_norm_var": 0.045841407775878903, "learning_rate": 0.0001, "loss": 1.9524, "loss/crossentropy": 2.3942710161209106, "loss/fcd": 1.74609375, "loss/idx": 4.0, "loss/logits": 0.20629461109638214, "step": 562 }, { "epoch": 0.008406812056234555, "grad_norm": 0.796875, "grad_norm_var": 0.04763590494791667, "learning_rate": 0.0001, "loss": 1.7762, "loss/crossentropy": 2.674455165863037, "loss/fcd": 1.5859375, "loss/idx": 4.0, "loss/logits": 0.1902579665184021, "step": 563 }, { "epoch": 0.008421744226849536, "grad_norm": 1.0078125, "grad_norm_var": 0.047682634989420575, "learning_rate": 0.0001, "loss": 1.742, "loss/crossentropy": 2.687700867652893, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18337178230285645, "step": 564 }, { "epoch": 0.008436676397464517, "grad_norm": 0.8671875, "grad_norm_var": 0.04817345937093099, "learning_rate": 0.0001, "loss": 1.7458, "loss/crossentropy": 2.496551990509033, "loss/fcd": 1.5703125, "loss/idx": 4.0, "loss/logits": 0.1754860207438469, "step": 565 }, { "epoch": 0.0084516085680795, "grad_norm": 1.046875, "grad_norm_var": 0.046910031636555986, "learning_rate": 0.0001, "loss": 1.8244, "loss/crossentropy": 2.7592931985855103, "loss/fcd": 1.62890625, "loss/idx": 4.0, "loss/logits": 0.19546330720186234, "step": 566 }, { "epoch": 0.00846654073869448, "grad_norm": 0.9765625, "grad_norm_var": 0.04684041341145833, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.685667395591736, "loss/fcd": 1.484375, "loss/idx": 4.0, "loss/logits": 0.1539781242609024, "step": 567 }, { "epoch": 0.008481472909309461, "grad_norm": 0.98046875, "grad_norm_var": 0.04459425608317057, "learning_rate": 0.0001, "loss": 1.8716, "loss/crossentropy": 2.4638712406158447, "loss/fcd": 1.65625, "loss/idx": 4.0, "loss/logits": 0.21538397669792175, "step": 568 }, { "epoch": 0.008496405079924444, "grad_norm": 0.89453125, "grad_norm_var": 0.04500503540039062, "learning_rate": 0.0001, "loss": 1.7427, "loss/crossentropy": 2.3749929666519165, "loss/fcd": 1.5625, "loss/idx": 4.0, "loss/logits": 0.18019939959049225, "step": 569 }, { "epoch": 0.008511337250539425, "grad_norm": 0.87890625, "grad_norm_var": 0.044838905334472656, "learning_rate": 0.0001, "loss": 1.7957, "loss/crossentropy": 2.600667715072632, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.18243755400180817, "step": 570 }, { "epoch": 0.008526269421154406, "grad_norm": 1.484375, "grad_norm_var": 0.06059309641520182, "learning_rate": 0.0001, "loss": 2.0295, "loss/crossentropy": 2.462936758995056, "loss/fcd": 1.8203125, "loss/idx": 4.0, "loss/logits": 0.20919711887836456, "step": 571 }, { "epoch": 0.008541201591769387, "grad_norm": 1.0390625, "grad_norm_var": 0.059334055582682295, "learning_rate": 0.0001, "loss": 1.5455, "loss/crossentropy": 2.69628369808197, "loss/fcd": 1.3984375, "loss/idx": 4.0, "loss/logits": 0.1470213457942009, "step": 572 }, { "epoch": 0.00855613376238437, "grad_norm": 0.88671875, "grad_norm_var": 0.025410906473795573, "learning_rate": 0.0001, "loss": 1.8762, "loss/crossentropy": 2.385751962661743, "loss/fcd": 1.68359375, "loss/idx": 4.0, "loss/logits": 0.19262713938951492, "step": 573 }, { "epoch": 0.00857106593299935, "grad_norm": 0.8984375, "grad_norm_var": 0.023881975809733072, "learning_rate": 0.0001, "loss": 1.7466, "loss/crossentropy": 2.782795786857605, "loss/fcd": 1.578125, "loss/idx": 4.0, "loss/logits": 0.16844282299280167, "step": 574 }, { "epoch": 0.008585998103614332, "grad_norm": 1.046875, "grad_norm_var": 0.024104754130045574, "learning_rate": 0.0001, "loss": 1.8152, "loss/crossentropy": 2.5910396575927734, "loss/fcd": 1.6171875, "loss/idx": 4.0, "loss/logits": 0.19796662032604218, "step": 575 }, { "epoch": 0.008600930274229313, "grad_norm": 0.74609375, "grad_norm_var": 0.02726008097330729, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.625463128089905, "loss/fcd": 1.4453125, "loss/idx": 4.0, "loss/logits": 0.15583696961402893, "step": 576 }, { "epoch": 0.008615862444844295, "grad_norm": 0.86328125, "grad_norm_var": 0.02731011708577474, "learning_rate": 0.0001, "loss": 1.6691, "loss/crossentropy": 2.7770665884017944, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.15351397544145584, "step": 577 }, { "epoch": 0.008630794615459276, "grad_norm": 1.359375, "grad_norm_var": 0.037231190999348955, "learning_rate": 0.0001, "loss": 1.8882, "loss/crossentropy": 2.6627167463302612, "loss/fcd": 1.66015625, "loss/idx": 4.0, "loss/logits": 0.22807708382606506, "step": 578 }, { "epoch": 0.008645726786074257, "grad_norm": 0.91015625, "grad_norm_var": 0.03517907460530599, "learning_rate": 0.0001, "loss": 1.7459, "loss/crossentropy": 2.574559807777405, "loss/fcd": 1.5546875, "loss/idx": 4.0, "loss/logits": 0.19124917685985565, "step": 579 }, { "epoch": 0.00866065895668924, "grad_norm": 0.97265625, "grad_norm_var": 0.035186513264973955, "learning_rate": 0.0001, "loss": 1.7128, "loss/crossentropy": 2.4655394554138184, "loss/fcd": 1.52734375, "loss/idx": 4.0, "loss/logits": 0.18545571714639664, "step": 580 }, { "epoch": 0.00867559112730422, "grad_norm": 0.9140625, "grad_norm_var": 0.034551747639973956, "learning_rate": 0.0001, "loss": 1.6711, "loss/crossentropy": 2.4528276920318604, "loss/fcd": 1.5, "loss/idx": 4.0, "loss/logits": 0.17109280824661255, "step": 581 }, { "epoch": 0.008690523297919202, "grad_norm": 1.234375, "grad_norm_var": 0.03807957967122396, "learning_rate": 0.0001, "loss": 1.926, "loss/crossentropy": 2.6185566186904907, "loss/fcd": 1.69921875, "loss/idx": 4.0, "loss/logits": 0.22679200768470764, "step": 582 }, { "epoch": 0.008705455468534183, "grad_norm": 0.7734375, "grad_norm_var": 0.04143854777018229, "learning_rate": 0.0001, "loss": 1.7634, "loss/crossentropy": 2.3130193948745728, "loss/fcd": 1.5859375, "loss/idx": 4.0, "loss/logits": 0.17749568819999695, "step": 583 }, { "epoch": 0.008720387639149165, "grad_norm": 0.875, "grad_norm_var": 0.04230543772379557, "learning_rate": 0.0001, "loss": 1.7338, "loss/crossentropy": 2.6866395473480225, "loss/fcd": 1.54296875, "loss/idx": 4.0, "loss/logits": 0.1908554881811142, "step": 584 }, { "epoch": 0.008735319809764146, "grad_norm": 0.95703125, "grad_norm_var": 0.04178663889567057, "learning_rate": 0.0001, "loss": 1.6385, "loss/crossentropy": 2.8928191661834717, "loss/fcd": 1.47265625, "loss/idx": 4.0, "loss/logits": 0.16579563915729523, "step": 585 }, { "epoch": 0.008750251980379127, "grad_norm": 0.859375, "grad_norm_var": 0.04209976196289063, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.532406449317932, "loss/fcd": 1.45703125, "loss/idx": 4.0, "loss/logits": 0.16028352081775665, "step": 586 }, { "epoch": 0.00876518415099411, "grad_norm": 0.83203125, "grad_norm_var": 0.025589434305826823, "learning_rate": 0.0001, "loss": 1.7514, "loss/crossentropy": 2.5307902097702026, "loss/fcd": 1.56640625, "loss/idx": 4.0, "loss/logits": 0.18504076451063156, "step": 587 }, { "epoch": 0.008780116321609091, "grad_norm": 1.03125, "grad_norm_var": 0.025498390197753906, "learning_rate": 0.0001, "loss": 1.8988, "loss/crossentropy": 2.645454168319702, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.20737073570489883, "step": 588 }, { "epoch": 0.008795048492224072, "grad_norm": 0.9765625, "grad_norm_var": 0.025274658203125, "learning_rate": 0.0001, "loss": 1.6831, "loss/crossentropy": 2.5572890043258667, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.16751381009817123, "step": 589 }, { "epoch": 0.008809980662839053, "grad_norm": 0.85546875, "grad_norm_var": 0.02570336659749349, "learning_rate": 0.0001, "loss": 1.687, "loss/crossentropy": 2.4566808938980103, "loss/fcd": 1.50390625, "loss/idx": 4.0, "loss/logits": 0.1830824688076973, "step": 590 }, { "epoch": 0.008824912833454036, "grad_norm": 0.83203125, "grad_norm_var": 0.02582575480143229, "learning_rate": 0.0001, "loss": 1.7327, "loss/crossentropy": 2.6418092250823975, "loss/fcd": 1.5546875, "loss/idx": 4.0, "loss/logits": 0.17799954861402512, "step": 591 }, { "epoch": 0.008839845004069017, "grad_norm": 1.046875, "grad_norm_var": 0.02382348378499349, "learning_rate": 0.0001, "loss": 1.7224, "loss/crossentropy": 2.5941314697265625, "loss/fcd": 1.54296875, "loss/idx": 4.0, "loss/logits": 0.17938321828842163, "step": 592 }, { "epoch": 0.008854777174683998, "grad_norm": 0.80078125, "grad_norm_var": 0.024838701883951823, "learning_rate": 0.0001, "loss": 1.7012, "loss/crossentropy": 2.594764471054077, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.18560975790023804, "step": 593 }, { "epoch": 0.008869709345298979, "grad_norm": 0.84765625, "grad_norm_var": 0.0134033203125, "learning_rate": 0.0001, "loss": 1.7647, "loss/crossentropy": 2.653563141822815, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.21787381172180176, "step": 594 }, { "epoch": 0.008884641515913961, "grad_norm": 0.8671875, "grad_norm_var": 0.013574663798014324, "learning_rate": 0.0001, "loss": 1.9553, "loss/crossentropy": 2.704614758491516, "loss/fcd": 1.71484375, "loss/idx": 4.0, "loss/logits": 0.24047966301441193, "step": 595 }, { "epoch": 0.008899573686528942, "grad_norm": 0.65625, "grad_norm_var": 0.017493693033854167, "learning_rate": 0.0001, "loss": 1.6271, "loss/crossentropy": 2.810309410095215, "loss/fcd": 1.46875, "loss/idx": 4.0, "loss/logits": 0.1583174169063568, "step": 596 }, { "epoch": 0.008914505857143923, "grad_norm": 0.796875, "grad_norm_var": 0.018092600504557292, "learning_rate": 0.0001, "loss": 1.6866, "loss/crossentropy": 2.514691710472107, "loss/fcd": 1.51953125, "loss/idx": 4.0, "loss/logits": 0.16707603633403778, "step": 597 }, { "epoch": 0.008929438027758906, "grad_norm": 0.9453125, "grad_norm_var": 0.010047403971354167, "learning_rate": 0.0001, "loss": 1.8422, "loss/crossentropy": 2.545697331428528, "loss/fcd": 1.6328125, "loss/idx": 4.0, "loss/logits": 0.2093753144145012, "step": 598 }, { "epoch": 0.008944370198373887, "grad_norm": 0.8359375, "grad_norm_var": 0.0094696044921875, "learning_rate": 0.0001, "loss": 1.6943, "loss/crossentropy": 2.565446376800537, "loss/fcd": 1.53125, "loss/idx": 4.0, "loss/logits": 0.16300475597381592, "step": 599 }, { "epoch": 0.008959302368988868, "grad_norm": 0.8359375, "grad_norm_var": 0.009570058186848958, "learning_rate": 0.0001, "loss": 1.596, "loss/crossentropy": 2.9047205448150635, "loss/fcd": 1.4375, "loss/idx": 4.0, "loss/logits": 0.15853270143270493, "step": 600 }, { "epoch": 0.008974234539603849, "grad_norm": 0.71875, "grad_norm_var": 0.01046593983968099, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.3885122537612915, "loss/fcd": 1.50390625, "loss/idx": 4.0, "loss/logits": 0.18485693633556366, "step": 601 }, { "epoch": 0.008989166710218831, "grad_norm": 0.78125, "grad_norm_var": 0.01083978017171224, "learning_rate": 0.0001, "loss": 1.7191, "loss/crossentropy": 2.561643123626709, "loss/fcd": 1.546875, "loss/idx": 4.0, "loss/logits": 0.17220281809568405, "step": 602 }, { "epoch": 0.009004098880833812, "grad_norm": 1.0, "grad_norm_var": 0.012116495768229167, "learning_rate": 0.0001, "loss": 2.1397, "loss/crossentropy": 2.338186025619507, "loss/fcd": 1.859375, "loss/idx": 4.0, "loss/logits": 0.2802945226430893, "step": 603 }, { "epoch": 0.009019031051448793, "grad_norm": 1.0625, "grad_norm_var": 0.012873331705729166, "learning_rate": 0.0001, "loss": 1.9177, "loss/crossentropy": 2.7651796340942383, "loss/fcd": 1.68359375, "loss/idx": 4.0, "loss/logits": 0.23406735062599182, "step": 604 }, { "epoch": 0.009033963222063776, "grad_norm": 1.1640625, "grad_norm_var": 0.017829386393229167, "learning_rate": 0.0001, "loss": 1.7051, "loss/crossentropy": 2.51702082157135, "loss/fcd": 1.52734375, "loss/idx": 4.0, "loss/logits": 0.17778894305229187, "step": 605 }, { "epoch": 0.009048895392678757, "grad_norm": 0.9921875, "grad_norm_var": 0.01858819325764974, "learning_rate": 0.0001, "loss": 1.8238, "loss/crossentropy": 2.4436594247817993, "loss/fcd": 1.62109375, "loss/idx": 4.0, "loss/logits": 0.20268025994300842, "step": 606 }, { "epoch": 0.009063827563293738, "grad_norm": 1.09375, "grad_norm_var": 0.020969390869140625, "learning_rate": 0.0001, "loss": 1.815, "loss/crossentropy": 2.816960573196411, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.20562273263931274, "step": 607 }, { "epoch": 0.009078759733908719, "grad_norm": 1.0390625, "grad_norm_var": 0.020823160807291668, "learning_rate": 0.0001, "loss": 1.8813, "loss/crossentropy": 2.362921714782715, "loss/fcd": 1.68359375, "loss/idx": 4.0, "loss/logits": 0.19768549501895905, "step": 608 }, { "epoch": 0.009093691904523702, "grad_norm": 0.828125, "grad_norm_var": 0.020499610900878908, "learning_rate": 0.0001, "loss": 1.6839, "loss/crossentropy": 2.6550045013427734, "loss/fcd": 1.50390625, "loss/idx": 4.0, "loss/logits": 0.17995422333478928, "step": 609 }, { "epoch": 0.009108624075138683, "grad_norm": 1.0703125, "grad_norm_var": 0.021923828125, "learning_rate": 0.0001, "loss": 1.8508, "loss/crossentropy": 2.478885054588318, "loss/fcd": 1.6484375, "loss/idx": 4.0, "loss/logits": 0.20236501097679138, "step": 610 }, { "epoch": 0.009123556245753664, "grad_norm": 0.83203125, "grad_norm_var": 0.022239112854003908, "learning_rate": 0.0001, "loss": 1.8277, "loss/crossentropy": 2.754876732826233, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.21440355479717255, "step": 611 }, { "epoch": 0.009138488416368645, "grad_norm": 0.83203125, "grad_norm_var": 0.0180877685546875, "learning_rate": 0.0001, "loss": 1.7951, "loss/crossentropy": 2.5221598148345947, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.19747836142778397, "step": 612 }, { "epoch": 0.009153420586983627, "grad_norm": 0.828125, "grad_norm_var": 0.017607625325520834, "learning_rate": 0.0001, "loss": 1.7805, "loss/crossentropy": 2.666842222213745, "loss/fcd": 1.59765625, "loss/idx": 4.0, "loss/logits": 0.1828431710600853, "step": 613 }, { "epoch": 0.009168352757598608, "grad_norm": 1.203125, "grad_norm_var": 0.022332509358723957, "learning_rate": 0.0001, "loss": 1.8873, "loss/crossentropy": 2.542251467704773, "loss/fcd": 1.671875, "loss/idx": 4.0, "loss/logits": 0.21547389030456543, "step": 614 }, { "epoch": 0.00918328492821359, "grad_norm": 1.0625, "grad_norm_var": 0.022251383463541666, "learning_rate": 0.0001, "loss": 1.871, "loss/crossentropy": 2.180862307548523, "loss/fcd": 1.6640625, "loss/idx": 4.0, "loss/logits": 0.2069542557001114, "step": 615 }, { "epoch": 0.009198217098828572, "grad_norm": 1.0, "grad_norm_var": 0.02124201456705729, "learning_rate": 0.0001, "loss": 1.8445, "loss/crossentropy": 2.765215754508972, "loss/fcd": 1.625, "loss/idx": 4.0, "loss/logits": 0.21946142613887787, "step": 616 }, { "epoch": 0.009213149269443553, "grad_norm": 1.125, "grad_norm_var": 0.017988840738932293, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.6672626733779907, "loss/fcd": 1.3671875, "loss/idx": 4.0, "loss/logits": 0.1371738463640213, "step": 617 }, { "epoch": 0.009228081440058534, "grad_norm": 0.84375, "grad_norm_var": 0.01645482381184896, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.4921282529830933, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.15389146655797958, "step": 618 }, { "epoch": 0.009243013610673515, "grad_norm": 0.92578125, "grad_norm_var": 0.01678460439046224, "learning_rate": 0.0001, "loss": 1.9129, "loss/crossentropy": 2.4398714303970337, "loss/fcd": 1.703125, "loss/idx": 4.0, "loss/logits": 0.20973220467567444, "step": 619 }, { "epoch": 0.009257945781288498, "grad_norm": 0.87109375, "grad_norm_var": 0.01732355753580729, "learning_rate": 0.0001, "loss": 1.7631, "loss/crossentropy": 2.7551677227020264, "loss/fcd": 1.578125, "loss/idx": 4.0, "loss/logits": 0.18502139300107956, "step": 620 }, { "epoch": 0.009272877951903478, "grad_norm": 0.95703125, "grad_norm_var": 0.01497491200764974, "learning_rate": 0.0001, "loss": 1.7992, "loss/crossentropy": 2.537253499031067, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.18983100354671478, "step": 621 }, { "epoch": 0.00928781012251846, "grad_norm": 1.4765625, "grad_norm_var": 0.031136512756347656, "learning_rate": 0.0001, "loss": 1.6154, "loss/crossentropy": 2.4758503437042236, "loss/fcd": 1.4609375, "loss/idx": 4.0, "loss/logits": 0.15446265041828156, "step": 622 }, { "epoch": 0.009302742293133442, "grad_norm": 0.890625, "grad_norm_var": 0.031156349182128906, "learning_rate": 0.0001, "loss": 1.6562, "loss/crossentropy": 2.649666428565979, "loss/fcd": 1.4921875, "loss/idx": 4.0, "loss/logits": 0.16403885930776596, "step": 623 }, { "epoch": 0.009317674463748423, "grad_norm": 1.046875, "grad_norm_var": 0.03121484120686849, "learning_rate": 0.0001, "loss": 1.6746, "loss/crossentropy": 2.6181329488754272, "loss/fcd": 1.5078125, "loss/idx": 4.0, "loss/logits": 0.1668335720896721, "step": 624 }, { "epoch": 0.009332606634363404, "grad_norm": 1.3671875, "grad_norm_var": 0.037953122456868486, "learning_rate": 0.0001, "loss": 2.1473, "loss/crossentropy": 2.5993008613586426, "loss/fcd": 1.921875, "loss/idx": 4.0, "loss/logits": 0.22541476786136627, "step": 625 }, { "epoch": 0.009347538804978385, "grad_norm": 1.2109375, "grad_norm_var": 0.04011834462483724, "learning_rate": 0.0001, "loss": 1.7797, "loss/crossentropy": 2.519839644432068, "loss/fcd": 1.58984375, "loss/idx": 4.0, "loss/logits": 0.18983253091573715, "step": 626 }, { "epoch": 0.009362470975593368, "grad_norm": 0.9140625, "grad_norm_var": 0.03837865193684896, "learning_rate": 0.0001, "loss": 1.7139, "loss/crossentropy": 2.5223900079727173, "loss/fcd": 1.53515625, "loss/idx": 4.0, "loss/logits": 0.17874927818775177, "step": 627 }, { "epoch": 0.009377403146208349, "grad_norm": 1.6640625, "grad_norm_var": 0.05916589101155599, "learning_rate": 0.0001, "loss": 1.872, "loss/crossentropy": 2.6015422344207764, "loss/fcd": 1.69140625, "loss/idx": 4.0, "loss/logits": 0.18056581169366837, "step": 628 }, { "epoch": 0.00939233531682333, "grad_norm": 0.9921875, "grad_norm_var": 0.05519250233968099, "learning_rate": 0.0001, "loss": 1.7462, "loss/crossentropy": 2.525193214416504, "loss/fcd": 1.56640625, "loss/idx": 4.0, "loss/logits": 0.1797695904970169, "step": 629 }, { "epoch": 0.00940726748743831, "grad_norm": 0.86328125, "grad_norm_var": 0.05759862263997396, "learning_rate": 0.0001, "loss": 1.7776, "loss/crossentropy": 2.4848132133483887, "loss/fcd": 1.58203125, "loss/idx": 4.0, "loss/logits": 0.19558971375226974, "step": 630 }, { "epoch": 0.009422199658053293, "grad_norm": 1.15625, "grad_norm_var": 0.057983144124348955, "learning_rate": 0.0001, "loss": 2.0806, "loss/crossentropy": 2.5745344161987305, "loss/fcd": 1.83203125, "loss/idx": 4.0, "loss/logits": 0.24858521670103073, "step": 631 }, { "epoch": 0.009437131828668274, "grad_norm": 0.8359375, "grad_norm_var": 0.06144917805989583, "learning_rate": 0.0001, "loss": 1.7292, "loss/crossentropy": 2.63934063911438, "loss/fcd": 1.55078125, "loss/idx": 4.0, "loss/logits": 0.1784433200955391, "step": 632 }, { "epoch": 0.009452063999283255, "grad_norm": 0.8671875, "grad_norm_var": 0.06375706990559896, "learning_rate": 0.0001, "loss": 1.5677, "loss/crossentropy": 2.8568373918533325, "loss/fcd": 1.421875, "loss/idx": 4.0, "loss/logits": 0.14580092206597328, "step": 633 }, { "epoch": 0.009466996169898238, "grad_norm": 0.8203125, "grad_norm_var": 0.06445210774739583, "learning_rate": 0.0001, "loss": 1.6701, "loss/crossentropy": 2.530740976333618, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.17399245500564575, "step": 634 }, { "epoch": 0.009481928340513219, "grad_norm": 0.9375, "grad_norm_var": 0.06426080067952473, "learning_rate": 0.0001, "loss": 1.6844, "loss/crossentropy": 2.536529541015625, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.17269806563854218, "step": 635 }, { "epoch": 0.0094968605111282, "grad_norm": 0.890625, "grad_norm_var": 0.06380716959635417, "learning_rate": 0.0001, "loss": 1.792, "loss/crossentropy": 2.396178364753723, "loss/fcd": 1.6015625, "loss/idx": 4.0, "loss/logits": 0.19045118242502213, "step": 636 }, { "epoch": 0.00951179268174318, "grad_norm": 0.953125, "grad_norm_var": 0.063859494527181, "learning_rate": 0.0001, "loss": 1.9673, "loss/crossentropy": 2.3137094974517822, "loss/fcd": 1.734375, "loss/idx": 4.0, "loss/logits": 0.23295484483242035, "step": 637 }, { "epoch": 0.009526724852358164, "grad_norm": 0.8515625, "grad_norm_var": 0.05317834218343099, "learning_rate": 0.0001, "loss": 1.7548, "loss/crossentropy": 2.7743014097213745, "loss/fcd": 1.5703125, "loss/idx": 4.0, "loss/logits": 0.18447993695735931, "step": 638 }, { "epoch": 0.009541657022973144, "grad_norm": 0.87890625, "grad_norm_var": 0.05338338216145833, "learning_rate": 0.0001, "loss": 1.7626, "loss/crossentropy": 2.4832268953323364, "loss/fcd": 1.5625, "loss/idx": 4.0, "loss/logits": 0.20006201416254044, "step": 639 }, { "epoch": 0.009556589193588125, "grad_norm": 0.9375, "grad_norm_var": 0.053675333658854164, "learning_rate": 0.0001, "loss": 1.7401, "loss/crossentropy": 2.3719040155410767, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18149051070213318, "step": 640 }, { "epoch": 0.009571521364203108, "grad_norm": 0.88671875, "grad_norm_var": 0.04514357248942057, "learning_rate": 0.0001, "loss": 1.8719, "loss/crossentropy": 2.3773958683013916, "loss/fcd": 1.67578125, "loss/idx": 4.0, "loss/logits": 0.19614291191101074, "step": 641 }, { "epoch": 0.009586453534818089, "grad_norm": 0.93359375, "grad_norm_var": 0.04136530558268229, "learning_rate": 0.0001, "loss": 1.7711, "loss/crossentropy": 2.8984084129333496, "loss/fcd": 1.5703125, "loss/idx": 4.0, "loss/logits": 0.20078962296247482, "step": 642 }, { "epoch": 0.00960138570543307, "grad_norm": 0.75390625, "grad_norm_var": 0.043979835510253903, "learning_rate": 0.0001, "loss": 1.7898, "loss/crossentropy": 2.430532932281494, "loss/fcd": 1.58984375, "loss/idx": 4.0, "loss/logits": 0.19993210583925247, "step": 643 }, { "epoch": 0.009616317876048051, "grad_norm": 1.0625, "grad_norm_var": 0.009436988830566406, "learning_rate": 0.0001, "loss": 1.6162, "loss/crossentropy": 2.6839109659194946, "loss/fcd": 1.45703125, "loss/idx": 4.0, "loss/logits": 0.15920980274677277, "step": 644 }, { "epoch": 0.009631250046663034, "grad_norm": 0.765625, "grad_norm_var": 0.010277748107910156, "learning_rate": 0.0001, "loss": 1.5942, "loss/crossentropy": 2.341609477996826, "loss/fcd": 1.44140625, "loss/idx": 4.0, "loss/logits": 0.15275691449642181, "step": 645 }, { "epoch": 0.009646182217278015, "grad_norm": 1.015625, "grad_norm_var": 0.0109893798828125, "learning_rate": 0.0001, "loss": 1.6706, "loss/crossentropy": 2.766583561897278, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.174458347260952, "step": 646 }, { "epoch": 0.009661114387892996, "grad_norm": 0.9921875, "grad_norm_var": 0.007266998291015625, "learning_rate": 0.0001, "loss": 1.8037, "loss/crossentropy": 2.591106653213501, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.19436196237802505, "step": 647 }, { "epoch": 0.009676046558507978, "grad_norm": 0.78515625, "grad_norm_var": 0.007854652404785157, "learning_rate": 0.0001, "loss": 1.8771, "loss/crossentropy": 2.5671372413635254, "loss/fcd": 1.66015625, "loss/idx": 4.0, "loss/logits": 0.2169155329465866, "step": 648 }, { "epoch": 0.00969097872912296, "grad_norm": 0.91015625, "grad_norm_var": 0.007806396484375, "learning_rate": 0.0001, "loss": 1.9598, "loss/crossentropy": 2.369256615638733, "loss/fcd": 1.703125, "loss/idx": 4.0, "loss/logits": 0.2566370368003845, "step": 649 }, { "epoch": 0.00970591089973794, "grad_norm": 1.0234375, "grad_norm_var": 0.008269246419270833, "learning_rate": 0.0001, "loss": 1.7494, "loss/crossentropy": 2.582263708114624, "loss/fcd": 1.5625, "loss/idx": 4.0, "loss/logits": 0.1869092434644699, "step": 650 }, { "epoch": 0.009720843070352921, "grad_norm": 0.8125, "grad_norm_var": 0.008806355794270833, "learning_rate": 0.0001, "loss": 1.5369, "loss/crossentropy": 2.713529109954834, "loss/fcd": 1.39453125, "loss/idx": 4.0, "loss/logits": 0.1423557996749878, "step": 651 }, { "epoch": 0.009735775240967904, "grad_norm": 0.84375, "grad_norm_var": 0.009023030598958334, "learning_rate": 0.0001, "loss": 1.7421, "loss/crossentropy": 2.555392622947693, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.18351471424102783, "step": 652 }, { "epoch": 0.009750707411582885, "grad_norm": 0.921875, "grad_norm_var": 0.008864339192708333, "learning_rate": 0.0001, "loss": 1.6934, "loss/crossentropy": 2.4481018781661987, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.1817130595445633, "step": 653 }, { "epoch": 0.009765639582197866, "grad_norm": 0.8671875, "grad_norm_var": 0.008781941731770833, "learning_rate": 0.0001, "loss": 1.7043, "loss/crossentropy": 2.630600929260254, "loss/fcd": 1.53125, "loss/idx": 4.0, "loss/logits": 0.17303074896335602, "step": 654 }, { "epoch": 0.009780571752812847, "grad_norm": 0.9609375, "grad_norm_var": 0.00897820790608724, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 2.6150777339935303, "loss/fcd": 1.41015625, "loss/idx": 4.0, "loss/logits": 0.15385961532592773, "step": 655 }, { "epoch": 0.00979550392342783, "grad_norm": 0.97265625, "grad_norm_var": 0.009209950764973959, "learning_rate": 0.0001, "loss": 1.6253, "loss/crossentropy": 2.5774786472320557, "loss/fcd": 1.46875, "loss/idx": 4.0, "loss/logits": 0.15658701956272125, "step": 656 }, { "epoch": 0.00981043609404281, "grad_norm": 0.76953125, "grad_norm_var": 0.010381062825520834, "learning_rate": 0.0001, "loss": 1.7633, "loss/crossentropy": 2.5385220050811768, "loss/fcd": 1.578125, "loss/idx": 4.0, "loss/logits": 0.18515019118785858, "step": 657 }, { "epoch": 0.009825368264657791, "grad_norm": 0.71484375, "grad_norm_var": 0.0123748779296875, "learning_rate": 0.0001, "loss": 1.532, "loss/crossentropy": 2.4262478351593018, "loss/fcd": 1.3828125, "loss/idx": 4.0, "loss/logits": 0.14915584027767181, "step": 658 }, { "epoch": 0.009840300435272774, "grad_norm": 0.81640625, "grad_norm_var": 0.0115203857421875, "learning_rate": 0.0001, "loss": 1.7338, "loss/crossentropy": 2.755561113357544, "loss/fcd": 1.5390625, "loss/idx": 4.0, "loss/logits": 0.19469743967056274, "step": 659 }, { "epoch": 0.009855232605887755, "grad_norm": 1.0859375, "grad_norm_var": 0.012094879150390625, "learning_rate": 0.0001, "loss": 2.1387, "loss/crossentropy": 2.3337032794952393, "loss/fcd": 1.85546875, "loss/idx": 4.0, "loss/logits": 0.28325171023607254, "step": 660 }, { "epoch": 0.009870164776502736, "grad_norm": 0.75390625, "grad_norm_var": 0.012299537658691406, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.4794669151306152, "loss/fcd": 1.48046875, "loss/idx": 4.0, "loss/logits": 0.1632954254746437, "step": 661 }, { "epoch": 0.009885096947117717, "grad_norm": 0.97265625, "grad_norm_var": 0.0116973876953125, "learning_rate": 0.0001, "loss": 1.7155, "loss/crossentropy": 2.642146348953247, "loss/fcd": 1.53515625, "loss/idx": 4.0, "loss/logits": 0.18031089007854462, "step": 662 }, { "epoch": 0.0099000291177327, "grad_norm": 0.8359375, "grad_norm_var": 0.011046346028645833, "learning_rate": 0.0001, "loss": 1.8485, "loss/crossentropy": 2.4629205465316772, "loss/fcd": 1.65234375, "loss/idx": 4.0, "loss/logits": 0.19618399441242218, "step": 663 }, { "epoch": 0.00991496128834768, "grad_norm": 0.9296875, "grad_norm_var": 0.010564104715983073, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.5848491191864014, "loss/fcd": 1.453125, "loss/idx": 4.0, "loss/logits": 0.15756294131278992, "step": 664 }, { "epoch": 0.009929893458962662, "grad_norm": 0.90234375, "grad_norm_var": 0.010543759663899739, "learning_rate": 0.0001, "loss": 1.6383, "loss/crossentropy": 2.578953504562378, "loss/fcd": 1.48046875, "loss/idx": 4.0, "loss/logits": 0.15785705298185349, "step": 665 }, { "epoch": 0.009944825629577644, "grad_norm": 0.86328125, "grad_norm_var": 0.009222157796223958, "learning_rate": 0.0001, "loss": 1.7285, "loss/crossentropy": 2.6012935638427734, "loss/fcd": 1.55859375, "loss/idx": 4.0, "loss/logits": 0.16986890882253647, "step": 666 }, { "epoch": 0.009959757800192625, "grad_norm": 0.82421875, "grad_norm_var": 0.00913079579671224, "learning_rate": 0.0001, "loss": 1.675, "loss/crossentropy": 2.577793598175049, "loss/fcd": 1.4921875, "loss/idx": 4.0, "loss/logits": 0.18284159153699875, "step": 667 }, { "epoch": 0.009974689970807606, "grad_norm": 0.8671875, "grad_norm_var": 0.00906060536702474, "learning_rate": 0.0001, "loss": 1.6981, "loss/crossentropy": 2.6089415550231934, "loss/fcd": 1.52734375, "loss/idx": 4.0, "loss/logits": 0.17078907787799835, "step": 668 }, { "epoch": 0.009989622141422587, "grad_norm": 0.953125, "grad_norm_var": 0.009301694234212239, "learning_rate": 0.0001, "loss": 1.7175, "loss/crossentropy": 2.7468433380126953, "loss/fcd": 1.5390625, "loss/idx": 4.0, "loss/logits": 0.17840731143951416, "step": 669 }, { "epoch": 0.01000455431203757, "grad_norm": 0.84765625, "grad_norm_var": 0.009360504150390626, "learning_rate": 0.0001, "loss": 1.8898, "loss/crossentropy": 2.278976559638977, "loss/fcd": 1.6875, "loss/idx": 4.0, "loss/logits": 0.2022818624973297, "step": 670 }, { "epoch": 0.010019486482652551, "grad_norm": 0.890625, "grad_norm_var": 0.008905029296875, "learning_rate": 0.0001, "loss": 1.8042, "loss/crossentropy": 2.7631266117095947, "loss/fcd": 1.609375, "loss/idx": 4.0, "loss/logits": 0.194790318608284, "step": 671 }, { "epoch": 0.010034418653267532, "grad_norm": 0.78125, "grad_norm_var": 0.008702532450358073, "learning_rate": 0.0001, "loss": 1.6957, "loss/crossentropy": 2.5823254585266113, "loss/fcd": 1.515625, "loss/idx": 4.0, "loss/logits": 0.1801098957657814, "step": 672 }, { "epoch": 0.010049350823882513, "grad_norm": 0.765625, "grad_norm_var": 0.008752187093098959, "learning_rate": 0.0001, "loss": 1.7467, "loss/crossentropy": 2.3960039615631104, "loss/fcd": 1.5546875, "loss/idx": 4.0, "loss/logits": 0.19199489802122116, "step": 673 }, { "epoch": 0.010064282994497496, "grad_norm": 0.91015625, "grad_norm_var": 0.007283528645833333, "learning_rate": 0.0001, "loss": 1.8449, "loss/crossentropy": 2.629265785217285, "loss/fcd": 1.62890625, "loss/idx": 4.0, "loss/logits": 0.21602004766464233, "step": 674 }, { "epoch": 0.010079215165112477, "grad_norm": 0.63671875, "grad_norm_var": 0.010705312093098959, "learning_rate": 0.0001, "loss": 1.6236, "loss/crossentropy": 2.2417469024658203, "loss/fcd": 1.46484375, "loss/idx": 4.0, "loss/logits": 0.15875373780727386, "step": 675 }, { "epoch": 0.010094147335727457, "grad_norm": 0.86328125, "grad_norm_var": 0.007208188374837239, "learning_rate": 0.0001, "loss": 1.6652, "loss/crossentropy": 2.4856256246566772, "loss/fcd": 1.484375, "loss/idx": 4.0, "loss/logits": 0.1808222532272339, "step": 676 }, { "epoch": 0.01010907950634244, "grad_norm": 0.7890625, "grad_norm_var": 0.0068356831868489586, "learning_rate": 0.0001, "loss": 1.6519, "loss/crossentropy": 2.6791006326675415, "loss/fcd": 1.47265625, "loss/idx": 4.0, "loss/logits": 0.1792703941464424, "step": 677 }, { "epoch": 0.010124011676957421, "grad_norm": 1.0078125, "grad_norm_var": 0.00747826894124349, "learning_rate": 0.0001, "loss": 1.8774, "loss/crossentropy": 2.4157909154891968, "loss/fcd": 1.66796875, "loss/idx": 4.0, "loss/logits": 0.20945309102535248, "step": 678 }, { "epoch": 0.010138943847572402, "grad_norm": 0.98828125, "grad_norm_var": 0.008556874593098958, "learning_rate": 0.0001, "loss": 1.7904, "loss/crossentropy": 2.5772135257720947, "loss/fcd": 1.6015625, "loss/idx": 4.0, "loss/logits": 0.188814677298069, "step": 679 }, { "epoch": 0.010153876018187383, "grad_norm": 1.125, "grad_norm_var": 0.012657674153645833, "learning_rate": 0.0001, "loss": 1.6447, "loss/crossentropy": 2.690971612930298, "loss/fcd": 1.4765625, "loss/idx": 4.0, "loss/logits": 0.16817960143089294, "step": 680 }, { "epoch": 0.010168808188802366, "grad_norm": 0.87890625, "grad_norm_var": 0.012609608968098958, "learning_rate": 0.0001, "loss": 1.7396, "loss/crossentropy": 2.602474093437195, "loss/fcd": 1.55078125, "loss/idx": 4.0, "loss/logits": 0.1888354793190956, "step": 681 }, { "epoch": 0.010183740359417347, "grad_norm": 0.78515625, "grad_norm_var": 0.013108062744140624, "learning_rate": 0.0001, "loss": 1.5378, "loss/crossentropy": 2.501987338066101, "loss/fcd": 1.3984375, "loss/idx": 4.0, "loss/logits": 0.13934527337551117, "step": 682 }, { "epoch": 0.010198672530032328, "grad_norm": 0.90234375, "grad_norm_var": 0.013016510009765624, "learning_rate": 0.0001, "loss": 1.6797, "loss/crossentropy": 2.6341532468795776, "loss/fcd": 1.51171875, "loss/idx": 4.0, "loss/logits": 0.1679832711815834, "step": 683 }, { "epoch": 0.01021360470064731, "grad_norm": 0.7421875, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 1.5106, "loss/crossentropy": 2.5898005962371826, "loss/fcd": 1.3671875, "loss/idx": 4.0, "loss/logits": 0.14340700209140778, "step": 684 }, { "epoch": 0.010228536871262291, "grad_norm": 0.953125, "grad_norm_var": 0.014115142822265624, "learning_rate": 0.0001, "loss": 1.7084, "loss/crossentropy": 2.754904866218567, "loss/fcd": 1.5234375, "loss/idx": 4.0, "loss/logits": 0.18500886112451553, "step": 685 }, { "epoch": 0.010243469041877272, "grad_norm": 0.87109375, "grad_norm_var": 0.0140899658203125, "learning_rate": 0.0001, "loss": 1.8091, "loss/crossentropy": 2.4006221294403076, "loss/fcd": 1.61328125, "loss/idx": 4.0, "loss/logits": 0.19577862322330475, "step": 686 }, { "epoch": 0.010258401212492253, "grad_norm": 1.3125, "grad_norm_var": 0.02647705078125, "learning_rate": 0.0001, "loss": 1.8188, "loss/crossentropy": 2.374703049659729, "loss/fcd": 1.6171875, "loss/idx": 4.0, "loss/logits": 0.20162508636713028, "step": 687 }, { "epoch": 0.010273333383107236, "grad_norm": 0.87890625, "grad_norm_var": 0.02559808095296224, "learning_rate": 0.0001, "loss": 1.671, "loss/crossentropy": 2.316514492034912, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.17492541670799255, "step": 688 }, { "epoch": 0.010288265553722217, "grad_norm": 0.69140625, "grad_norm_var": 0.027278391520182292, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.468592405319214, "loss/fcd": 1.453125, "loss/idx": 4.0, "loss/logits": 0.15006089210510254, "step": 689 }, { "epoch": 0.010303197724337198, "grad_norm": 0.81640625, "grad_norm_var": 0.02765070597330729, "learning_rate": 0.0001, "loss": 1.6631, "loss/crossentropy": 2.5445820093154907, "loss/fcd": 1.49609375, "loss/idx": 4.0, "loss/logits": 0.16699903458356857, "step": 690 }, { "epoch": 0.010318129894952179, "grad_norm": 0.56640625, "grad_norm_var": 0.030335489908854166, "learning_rate": 0.0001, "loss": 1.6255, "loss/crossentropy": 2.490410089492798, "loss/fcd": 1.46484375, "loss/idx": 4.5, "loss/logits": 0.1606893539428711, "step": 691 }, { "epoch": 0.010333062065567162, "grad_norm": 0.7265625, "grad_norm_var": 0.03191318511962891, "learning_rate": 0.0001, "loss": 1.855, "loss/crossentropy": 2.340074300765991, "loss/fcd": 1.640625, "loss/idx": 4.5, "loss/logits": 0.21434715390205383, "step": 692 }, { "epoch": 0.010347994236182143, "grad_norm": 0.62890625, "grad_norm_var": 0.03539835611979167, "learning_rate": 0.0001, "loss": 1.8011, "loss/crossentropy": 2.5316191911697388, "loss/fcd": 1.59375, "loss/idx": 4.5, "loss/logits": 0.20732642710208893, "step": 693 }, { "epoch": 0.010362926406797123, "grad_norm": 0.54296875, "grad_norm_var": 0.04018751780192057, "learning_rate": 0.0001, "loss": 1.5777, "loss/crossentropy": 2.5405019521713257, "loss/fcd": 1.40234375, "loss/idx": 4.5, "loss/logits": 0.17533162981271744, "step": 694 }, { "epoch": 0.010377858577412106, "grad_norm": 0.5859375, "grad_norm_var": 0.04225031534830729, "learning_rate": 0.0001, "loss": 1.5453, "loss/crossentropy": 2.779569625854492, "loss/fcd": 1.37109375, "loss/idx": 4.5, "loss/logits": 0.17417190968990326, "step": 695 }, { "epoch": 0.010392790748027087, "grad_norm": 0.65625, "grad_norm_var": 0.03648249308268229, "learning_rate": 0.0001, "loss": 1.5685, "loss/crossentropy": 2.5351744890213013, "loss/fcd": 1.41015625, "loss/idx": 4.5, "loss/logits": 0.15832393616437912, "step": 696 }, { "epoch": 0.010407722918642068, "grad_norm": 0.73046875, "grad_norm_var": 0.035975138346354164, "learning_rate": 0.0001, "loss": 1.6724, "loss/crossentropy": 2.2647390365600586, "loss/fcd": 1.484375, "loss/idx": 4.5, "loss/logits": 0.1879829317331314, "step": 697 }, { "epoch": 0.010422655089257049, "grad_norm": 0.73828125, "grad_norm_var": 0.03604532877604167, "learning_rate": 0.0001, "loss": 1.8554, "loss/crossentropy": 2.6300876140594482, "loss/fcd": 1.59765625, "loss/idx": 4.5, "loss/logits": 0.25777651369571686, "step": 698 }, { "epoch": 0.010437587259872032, "grad_norm": 0.60546875, "grad_norm_var": 0.0363739013671875, "learning_rate": 0.0001, "loss": 1.5986, "loss/crossentropy": 2.5962857007980347, "loss/fcd": 1.42578125, "loss/idx": 4.5, "loss/logits": 0.17285378277301788, "step": 699 }, { "epoch": 0.010452519430487013, "grad_norm": 0.5859375, "grad_norm_var": 0.03812357584635417, "learning_rate": 0.0001, "loss": 1.6397, "loss/crossentropy": 2.7463815212249756, "loss/fcd": 1.44921875, "loss/idx": 4.5, "loss/logits": 0.19048676639795303, "step": 700 }, { "epoch": 0.010467451601101994, "grad_norm": 0.67578125, "grad_norm_var": 0.03516686757405599, "learning_rate": 0.0001, "loss": 1.6874, "loss/crossentropy": 2.5035320520401, "loss/fcd": 1.5, "loss/idx": 4.5, "loss/logits": 0.18743757903575897, "step": 701 }, { "epoch": 0.010482383771716976, "grad_norm": 0.7109375, "grad_norm_var": 0.03366800944010417, "learning_rate": 0.0001, "loss": 1.5613, "loss/crossentropy": 2.606451153755188, "loss/fcd": 1.39453125, "loss/idx": 4.5, "loss/logits": 0.16672632098197937, "step": 702 }, { "epoch": 0.010497315942331957, "grad_norm": 0.5234375, "grad_norm_var": 0.009806060791015625, "learning_rate": 0.0001, "loss": 1.6531, "loss/crossentropy": 2.852249503135681, "loss/fcd": 1.453125, "loss/idx": 4.5, "loss/logits": 0.20001086592674255, "step": 703 }, { "epoch": 0.010512248112946938, "grad_norm": 0.6796875, "grad_norm_var": 0.006644630432128906, "learning_rate": 0.0001, "loss": 1.5716, "loss/crossentropy": 2.573358654975891, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.16530360281467438, "step": 704 }, { "epoch": 0.01052718028356192, "grad_norm": 0.609375, "grad_norm_var": 0.006656646728515625, "learning_rate": 0.0001, "loss": 1.8231, "loss/crossentropy": 2.376923680305481, "loss/fcd": 1.609375, "loss/idx": 4.5, "loss/logits": 0.2137339860200882, "step": 705 }, { "epoch": 0.010542112454176902, "grad_norm": 0.515625, "grad_norm_var": 0.005594317118326823, "learning_rate": 0.0001, "loss": 1.5644, "loss/crossentropy": 2.464131474494934, "loss/fcd": 1.37890625, "loss/idx": 4.5, "loss/logits": 0.18554290384054184, "step": 706 }, { "epoch": 0.010557044624791883, "grad_norm": 0.5625, "grad_norm_var": 0.005628458658854167, "learning_rate": 0.0001, "loss": 1.6984, "loss/crossentropy": 2.2634644508361816, "loss/fcd": 1.5234375, "loss/idx": 4.5, "loss/logits": 0.1750023141503334, "step": 707 }, { "epoch": 0.010571976795406864, "grad_norm": 0.6328125, "grad_norm_var": 0.004969278971354167, "learning_rate": 0.0001, "loss": 1.7061, "loss/crossentropy": 2.3716477751731873, "loss/fcd": 1.4921875, "loss/idx": 4.5, "loss/logits": 0.21387499570846558, "step": 708 }, { "epoch": 0.010586908966021847, "grad_norm": 0.765625, "grad_norm_var": 0.006226539611816406, "learning_rate": 0.0001, "loss": 1.5934, "loss/crossentropy": 2.4680949449539185, "loss/fcd": 1.4296875, "loss/idx": 4.5, "loss/logits": 0.163712278008461, "step": 709 }, { "epoch": 0.010601841136636828, "grad_norm": 1.0546875, "grad_norm_var": 0.01647923787434896, "learning_rate": 0.0001, "loss": 1.9395, "loss/crossentropy": 2.2741931676864624, "loss/fcd": 1.6953125, "loss/idx": 4.5, "loss/logits": 0.24415750801563263, "step": 710 }, { "epoch": 0.010616773307251809, "grad_norm": 0.52734375, "grad_norm_var": 0.01730798085530599, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.863884925842285, "loss/fcd": 1.37890625, "loss/idx": 4.5, "loss/logits": 0.1737941950559616, "step": 711 }, { "epoch": 0.01063170547786679, "grad_norm": 0.46484375, "grad_norm_var": 0.01971613566080729, "learning_rate": 0.0001, "loss": 1.5012, "loss/crossentropy": 2.589643120765686, "loss/fcd": 1.3359375, "loss/idx": 4.5, "loss/logits": 0.16527117788791656, "step": 712 }, { "epoch": 0.010646637648481772, "grad_norm": 0.9921875, "grad_norm_var": 0.026842689514160155, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.3513693809509277, "loss/fcd": 1.56640625, "loss/idx": 4.5, "loss/logits": 0.1720396801829338, "step": 713 }, { "epoch": 0.010661569819096753, "grad_norm": 0.59765625, "grad_norm_var": 0.026709938049316408, "learning_rate": 0.0001, "loss": 1.4949, "loss/crossentropy": 2.5104587078094482, "loss/fcd": 1.3359375, "loss/idx": 4.5, "loss/logits": 0.15896443277597427, "step": 714 }, { "epoch": 0.010676501989711734, "grad_norm": 0.78125, "grad_norm_var": 0.027445220947265626, "learning_rate": 0.0001, "loss": 1.635, "loss/crossentropy": 2.6453863382339478, "loss/fcd": 1.4453125, "loss/idx": 4.5, "loss/logits": 0.18970200419425964, "step": 715 }, { "epoch": 0.010691434160326715, "grad_norm": 0.68359375, "grad_norm_var": 0.026979509989420572, "learning_rate": 0.0001, "loss": 1.7556, "loss/crossentropy": 2.2782176733016968, "loss/fcd": 1.5625, "loss/idx": 4.5, "loss/logits": 0.19310269504785538, "step": 716 }, { "epoch": 0.010706366330941698, "grad_norm": 0.89453125, "grad_norm_var": 0.030034319559733073, "learning_rate": 0.0001, "loss": 1.9091, "loss/crossentropy": 2.518513798713684, "loss/fcd": 1.64453125, "loss/idx": 4.5, "loss/logits": 0.2645469158887863, "step": 717 }, { "epoch": 0.010721298501556679, "grad_norm": 0.62109375, "grad_norm_var": 0.030255126953125, "learning_rate": 0.0001, "loss": 1.6619, "loss/crossentropy": 2.3726186752319336, "loss/fcd": 1.48046875, "loss/idx": 4.5, "loss/logits": 0.18143515288829803, "step": 718 }, { "epoch": 0.01073623067217166, "grad_norm": 0.71875, "grad_norm_var": 0.028519439697265624, "learning_rate": 0.0001, "loss": 1.8026, "loss/crossentropy": 2.561523675918579, "loss/fcd": 1.59765625, "loss/idx": 4.5, "loss/logits": 0.20491648465394974, "step": 719 }, { "epoch": 0.010751162842786642, "grad_norm": 0.68359375, "grad_norm_var": 0.028513018290201822, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.5928921699523926, "loss/fcd": 1.33203125, "loss/idx": 4.5, "loss/logits": 0.15430676937103271, "step": 720 }, { "epoch": 0.010766095013401623, "grad_norm": 0.69140625, "grad_norm_var": 0.028006998697916667, "learning_rate": 0.0001, "loss": 1.5925, "loss/crossentropy": 2.8197734355926514, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.18625715374946594, "step": 721 }, { "epoch": 0.010781027184016604, "grad_norm": 0.59375, "grad_norm_var": 0.026476033528645835, "learning_rate": 0.0001, "loss": 1.6078, "loss/crossentropy": 2.76321017742157, "loss/fcd": 1.41796875, "loss/idx": 4.5, "loss/logits": 0.18983574956655502, "step": 722 }, { "epoch": 0.010795959354631585, "grad_norm": 0.5859375, "grad_norm_var": 0.02606786092122396, "learning_rate": 0.0001, "loss": 1.5531, "loss/crossentropy": 2.5983407497406006, "loss/fcd": 1.375, "loss/idx": 4.5, "loss/logits": 0.17812784761190414, "step": 723 }, { "epoch": 0.010810891525246568, "grad_norm": 0.69140625, "grad_norm_var": 0.02571404774983724, "learning_rate": 0.0001, "loss": 1.7445, "loss/crossentropy": 2.4067989587783813, "loss/fcd": 1.5546875, "loss/idx": 4.5, "loss/logits": 0.18979395180940628, "step": 724 }, { "epoch": 0.010825823695861549, "grad_norm": 0.54296875, "grad_norm_var": 0.027138264973958333, "learning_rate": 0.0001, "loss": 1.7346, "loss/crossentropy": 2.627017021179199, "loss/fcd": 1.50390625, "loss/idx": 4.5, "loss/logits": 0.23074181377887726, "step": 725 }, { "epoch": 0.01084075586647653, "grad_norm": 0.52734375, "grad_norm_var": 0.01925042470296224, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.517759680747986, "loss/fcd": 1.33203125, "loss/idx": 4.5, "loss/logits": 0.15450593829154968, "step": 726 }, { "epoch": 0.010855688037091513, "grad_norm": 0.71484375, "grad_norm_var": 0.01807244618733724, "learning_rate": 0.0001, "loss": 1.8211, "loss/crossentropy": 2.503226161003113, "loss/fcd": 1.59375, "loss/idx": 4.5, "loss/logits": 0.22732951492071152, "step": 727 }, { "epoch": 0.010870620207706494, "grad_norm": 0.5390625, "grad_norm_var": 0.016346232096354166, "learning_rate": 0.0001, "loss": 1.5924, "loss/crossentropy": 2.5677419900894165, "loss/fcd": 1.4140625, "loss/idx": 4.5, "loss/logits": 0.17830483615398407, "step": 728 }, { "epoch": 0.010885552378321475, "grad_norm": 0.609375, "grad_norm_var": 0.009504954020182291, "learning_rate": 0.0001, "loss": 1.6721, "loss/crossentropy": 2.6440834999084473, "loss/fcd": 1.47265625, "loss/idx": 4.5, "loss/logits": 0.19942344725131989, "step": 729 }, { "epoch": 0.010900484548936456, "grad_norm": 0.53125, "grad_norm_var": 0.010286394755045574, "learning_rate": 0.0001, "loss": 1.5164, "loss/crossentropy": 2.8209099769592285, "loss/fcd": 1.34765625, "loss/idx": 4.5, "loss/logits": 0.16875500977039337, "step": 730 }, { "epoch": 0.010915416719551438, "grad_norm": 1.4765625, "grad_norm_var": 0.05261173248291016, "learning_rate": 0.0001, "loss": 1.7781, "loss/crossentropy": 2.5496203899383545, "loss/fcd": 1.54296875, "loss/idx": 4.5, "loss/logits": 0.2351158782839775, "step": 731 }, { "epoch": 0.01093034889016642, "grad_norm": 0.7265625, "grad_norm_var": 0.05266698201497396, "learning_rate": 0.0001, "loss": 1.834, "loss/crossentropy": 2.629123568534851, "loss/fcd": 1.59765625, "loss/idx": 4.5, "loss/logits": 0.23638265579938889, "step": 732 }, { "epoch": 0.0109452810607814, "grad_norm": 0.63671875, "grad_norm_var": 0.050023396809895836, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.464652180671692, "loss/fcd": 1.46875, "loss/idx": 4.5, "loss/logits": 0.18121907860040665, "step": 733 }, { "epoch": 0.010960213231396381, "grad_norm": 0.984375, "grad_norm_var": 0.055386288960774736, "learning_rate": 0.0001, "loss": 1.8003, "loss/crossentropy": 2.5960421562194824, "loss/fcd": 1.58203125, "loss/idx": 4.5, "loss/logits": 0.2182973176240921, "step": 734 }, { "epoch": 0.010975145402011364, "grad_norm": 0.6015625, "grad_norm_var": 0.056004269917805986, "learning_rate": 0.0001, "loss": 1.5743, "loss/crossentropy": 2.6835473775863647, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.16806279867887497, "step": 735 }, { "epoch": 0.010990077572626345, "grad_norm": 0.671875, "grad_norm_var": 0.05603230794270833, "learning_rate": 0.0001, "loss": 1.6808, "loss/crossentropy": 2.8178036212921143, "loss/fcd": 1.48828125, "loss/idx": 4.5, "loss/logits": 0.19247954338788986, "step": 736 }, { "epoch": 0.011005009743241326, "grad_norm": 0.5390625, "grad_norm_var": 0.05756219228108724, "learning_rate": 0.0001, "loss": 1.5277, "loss/crossentropy": 2.874936580657959, "loss/fcd": 1.3515625, "loss/idx": 4.5, "loss/logits": 0.17609652876853943, "step": 737 }, { "epoch": 0.011019941913856308, "grad_norm": 0.56640625, "grad_norm_var": 0.05794448852539062, "learning_rate": 0.0001, "loss": 1.4942, "loss/crossentropy": 2.6819831132888794, "loss/fcd": 1.3359375, "loss/idx": 4.5, "loss/logits": 0.15826641023159027, "step": 738 }, { "epoch": 0.01103487408447129, "grad_norm": 0.65625, "grad_norm_var": 0.0573333740234375, "learning_rate": 0.0001, "loss": 1.6524, "loss/crossentropy": 2.48427951335907, "loss/fcd": 1.47265625, "loss/idx": 4.5, "loss/logits": 0.17978182435035706, "step": 739 }, { "epoch": 0.01104980625508627, "grad_norm": 0.765625, "grad_norm_var": 0.05770664215087891, "learning_rate": 0.0001, "loss": 1.6315, "loss/crossentropy": 2.6147966384887695, "loss/fcd": 1.46484375, "loss/idx": 4.5, "loss/logits": 0.16669780015945435, "step": 740 }, { "epoch": 0.011064738425701251, "grad_norm": 1.7421875, "grad_norm_var": 0.12358169555664063, "learning_rate": 0.0001, "loss": 1.9483, "loss/crossentropy": 2.789383292198181, "loss/fcd": 1.7265625, "loss/idx": 4.5, "loss/logits": 0.22178317606449127, "step": 741 }, { "epoch": 0.011079670596316234, "grad_norm": 0.59765625, "grad_norm_var": 0.1216339111328125, "learning_rate": 0.0001, "loss": 1.666, "loss/crossentropy": 2.650735855102539, "loss/fcd": 1.48828125, "loss/idx": 4.5, "loss/logits": 0.17772582918405533, "step": 742 }, { "epoch": 0.011094602766931215, "grad_norm": 0.7109375, "grad_norm_var": 0.12166487375895182, "learning_rate": 0.0001, "loss": 1.4375, "loss/crossentropy": 2.542524218559265, "loss/fcd": 1.29296875, "loss/idx": 4.5, "loss/logits": 0.1445121243596077, "step": 743 }, { "epoch": 0.011109534937546196, "grad_norm": 0.455078125, "grad_norm_var": 0.12471655209859213, "learning_rate": 0.0001, "loss": 1.4358, "loss/crossentropy": 2.5641543865203857, "loss/fcd": 1.28515625, "loss/idx": 4.5, "loss/logits": 0.1506011188030243, "step": 744 }, { "epoch": 0.011124467108161179, "grad_norm": 0.8203125, "grad_norm_var": 0.12306516965230306, "learning_rate": 0.0001, "loss": 1.8413, "loss/crossentropy": 2.5019315481185913, "loss/fcd": 1.62890625, "loss/idx": 4.5, "loss/logits": 0.2123931124806404, "step": 745 }, { "epoch": 0.01113939927877616, "grad_norm": 1.34375, "grad_norm_var": 0.13736062049865722, "learning_rate": 0.0001, "loss": 1.845, "loss/crossentropy": 2.5026661157608032, "loss/fcd": 1.63671875, "loss/idx": 4.5, "loss/logits": 0.20829641073942184, "step": 746 }, { "epoch": 0.01115433144939114, "grad_norm": 0.67578125, "grad_norm_var": 0.10850434303283692, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.7267106771469116, "loss/fcd": 1.3203125, "loss/idx": 4.5, "loss/logits": 0.14988812804222107, "step": 747 }, { "epoch": 0.011169263620006122, "grad_norm": 0.6640625, "grad_norm_var": 0.10920116106669107, "learning_rate": 0.0001, "loss": 1.5848, "loss/crossentropy": 2.7606505155563354, "loss/fcd": 1.40234375, "loss/idx": 4.5, "loss/logits": 0.18249479681253433, "step": 748 }, { "epoch": 0.011184195790621104, "grad_norm": 0.6875, "grad_norm_var": 0.1084126631418864, "learning_rate": 0.0001, "loss": 1.6927, "loss/crossentropy": 2.5876625776290894, "loss/fcd": 1.50390625, "loss/idx": 4.5, "loss/logits": 0.1888418346643448, "step": 749 }, { "epoch": 0.011199127961236085, "grad_norm": 0.578125, "grad_norm_var": 0.10766549110412597, "learning_rate": 0.0001, "loss": 1.6173, "loss/crossentropy": 2.8532919883728027, "loss/fcd": 1.421875, "loss/idx": 4.5, "loss/logits": 0.1953836902976036, "step": 750 }, { "epoch": 0.011214060131851066, "grad_norm": 0.6796875, "grad_norm_var": 0.10645114580790202, "learning_rate": 0.0001, "loss": 1.7082, "loss/crossentropy": 2.7735183238983154, "loss/fcd": 1.51953125, "loss/idx": 4.5, "loss/logits": 0.18861962109804153, "step": 751 }, { "epoch": 0.011228992302466047, "grad_norm": 0.92578125, "grad_norm_var": 0.10750908851623535, "learning_rate": 0.0001, "loss": 2.2305, "loss/crossentropy": 2.238771915435791, "loss/fcd": 1.8984375, "loss/idx": 4.5, "loss/logits": 0.3320219963788986, "step": 752 }, { "epoch": 0.01124392447308103, "grad_norm": 0.6328125, "grad_norm_var": 0.1051027774810791, "learning_rate": 0.0001, "loss": 1.5859, "loss/crossentropy": 2.843374013900757, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.17964357882738113, "step": 753 }, { "epoch": 0.01125885664369601, "grad_norm": 0.5625, "grad_norm_var": 0.1052156925201416, "learning_rate": 0.0001, "loss": 1.6273, "loss/crossentropy": 2.5842140913009644, "loss/fcd": 1.43359375, "loss/idx": 4.5, "loss/logits": 0.1937049850821495, "step": 754 }, { "epoch": 0.011273788814310992, "grad_norm": 0.5078125, "grad_norm_var": 0.10906434059143066, "learning_rate": 0.0001, "loss": 1.6215, "loss/crossentropy": 2.3504830598831177, "loss/fcd": 1.44921875, "loss/idx": 4.5, "loss/logits": 0.1723143458366394, "step": 755 }, { "epoch": 0.011288720984925974, "grad_norm": 0.67578125, "grad_norm_var": 0.10964341163635254, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.5767829418182373, "loss/fcd": 1.41796875, "loss/idx": 4.5, "loss/logits": 0.1736106276512146, "step": 756 }, { "epoch": 0.011303653155540955, "grad_norm": 0.9375, "grad_norm_var": 0.045402002334594724, "learning_rate": 0.0001, "loss": 1.7466, "loss/crossentropy": 2.8778876066207886, "loss/fcd": 1.5546875, "loss/idx": 4.5, "loss/logits": 0.19187040627002716, "step": 757 }, { "epoch": 0.011318585326155936, "grad_norm": 0.59765625, "grad_norm_var": 0.045402002334594724, "learning_rate": 0.0001, "loss": 1.736, "loss/crossentropy": 2.4122936725616455, "loss/fcd": 1.5390625, "loss/idx": 4.5, "loss/logits": 0.19690872728824615, "step": 758 }, { "epoch": 0.011333517496770917, "grad_norm": 0.98828125, "grad_norm_var": 0.05002439816792806, "learning_rate": 0.0001, "loss": 1.7155, "loss/crossentropy": 2.531772494316101, "loss/fcd": 1.51171875, "loss/idx": 4.5, "loss/logits": 0.2037365883588791, "step": 759 }, { "epoch": 0.0113484496673859, "grad_norm": 0.58984375, "grad_norm_var": 0.04616063435872396, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.5002293586730957, "loss/fcd": 1.44140625, "loss/idx": 4.5, "loss/logits": 0.17570620775222778, "step": 760 }, { "epoch": 0.011363381838000881, "grad_norm": 0.65625, "grad_norm_var": 0.04612325032552083, "learning_rate": 0.0001, "loss": 1.8533, "loss/crossentropy": 2.38156259059906, "loss/fcd": 1.63671875, "loss/idx": 4.5, "loss/logits": 0.21658504754304886, "step": 761 }, { "epoch": 0.011378314008615862, "grad_norm": 0.625, "grad_norm_var": 0.019731648763020835, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.541592836380005, "loss/fcd": 1.4140625, "loss/idx": 4.5, "loss/logits": 0.17004821822047234, "step": 762 }, { "epoch": 0.011393246179230845, "grad_norm": 0.69140625, "grad_norm_var": 0.019724527994791668, "learning_rate": 0.0001, "loss": 1.6894, "loss/crossentropy": 2.7292309999465942, "loss/fcd": 1.484375, "loss/idx": 4.5, "loss/logits": 0.20505433529615402, "step": 763 }, { "epoch": 0.011408178349845826, "grad_norm": 0.53125, "grad_norm_var": 0.02124201456705729, "learning_rate": 0.0001, "loss": 1.5474, "loss/crossentropy": 2.6666187047958374, "loss/fcd": 1.3828125, "loss/idx": 4.5, "loss/logits": 0.16459019482135773, "step": 764 }, { "epoch": 0.011423110520460807, "grad_norm": 0.6796875, "grad_norm_var": 0.0212371826171875, "learning_rate": 0.0001, "loss": 1.5077, "loss/crossentropy": 2.5730679035186768, "loss/fcd": 1.34375, "loss/idx": 4.5, "loss/logits": 0.16394272446632385, "step": 765 }, { "epoch": 0.011438042691075788, "grad_norm": 0.71875, "grad_norm_var": 0.020587158203125, "learning_rate": 0.0001, "loss": 1.6541, "loss/crossentropy": 2.655824899673462, "loss/fcd": 1.45703125, "loss/idx": 4.5, "loss/logits": 0.19702915847301483, "step": 766 }, { "epoch": 0.01145297486169077, "grad_norm": 0.60546875, "grad_norm_var": 0.021008745829264323, "learning_rate": 0.0001, "loss": 1.4335, "loss/crossentropy": 2.477110981941223, "loss/fcd": 1.2890625, "loss/idx": 4.5, "loss/logits": 0.14447420090436935, "step": 767 }, { "epoch": 0.011467907032305751, "grad_norm": 0.73046875, "grad_norm_var": 0.01706689198811849, "learning_rate": 0.0001, "loss": 1.4393, "loss/crossentropy": 2.8458136320114136, "loss/fcd": 1.2890625, "loss/idx": 4.5, "loss/logits": 0.15027117729187012, "step": 768 }, { "epoch": 0.011482839202920732, "grad_norm": 0.5703125, "grad_norm_var": 0.017626380920410155, "learning_rate": 0.0001, "loss": 1.6885, "loss/crossentropy": 2.53455650806427, "loss/fcd": 1.48046875, "loss/idx": 4.5, "loss/logits": 0.20800386369228363, "step": 769 }, { "epoch": 0.011497771373535713, "grad_norm": 0.59765625, "grad_norm_var": 0.0172149658203125, "learning_rate": 0.0001, "loss": 1.5591, "loss/crossentropy": 2.6400580406188965, "loss/fcd": 1.390625, "loss/idx": 4.5, "loss/logits": 0.16844240576028824, "step": 770 }, { "epoch": 0.011512703544150696, "grad_norm": 0.73046875, "grad_norm_var": 0.015529823303222657, "learning_rate": 0.0001, "loss": 1.5597, "loss/crossentropy": 2.5259393453598022, "loss/fcd": 1.390625, "loss/idx": 4.5, "loss/logits": 0.16906024515628815, "step": 771 }, { "epoch": 0.011527635714765677, "grad_norm": 0.59375, "grad_norm_var": 0.01602783203125, "learning_rate": 0.0001, "loss": 1.5399, "loss/crossentropy": 2.5816057920455933, "loss/fcd": 1.37890625, "loss/idx": 4.5, "loss/logits": 0.16103952378034592, "step": 772 }, { "epoch": 0.011542567885380658, "grad_norm": 0.734375, "grad_norm_var": 0.011571248372395834, "learning_rate": 0.0001, "loss": 1.5382, "loss/crossentropy": 2.8559324741363525, "loss/fcd": 1.38671875, "loss/idx": 4.5, "loss/logits": 0.15143951773643494, "step": 773 }, { "epoch": 0.01155750005599564, "grad_norm": 0.5703125, "grad_norm_var": 0.011863644917805989, "learning_rate": 0.0001, "loss": 1.5398, "loss/crossentropy": 2.618905544281006, "loss/fcd": 1.375, "loss/idx": 4.5, "loss/logits": 0.16479819267988205, "step": 774 }, { "epoch": 0.011572432226610621, "grad_norm": 0.7265625, "grad_norm_var": 0.004805246988932292, "learning_rate": 0.0001, "loss": 1.5362, "loss/crossentropy": 2.4410958290100098, "loss/fcd": 1.375, "loss/idx": 4.5, "loss/logits": 0.16121891140937805, "step": 775 }, { "epoch": 0.011587364397225602, "grad_norm": 0.671875, "grad_norm_var": 0.0046009699503580725, "learning_rate": 0.0001, "loss": 1.7557, "loss/crossentropy": 2.30470609664917, "loss/fcd": 1.5546875, "loss/idx": 4.5, "loss/logits": 0.2010229527950287, "step": 776 }, { "epoch": 0.011602296567840583, "grad_norm": 0.64453125, "grad_norm_var": 0.004603068033854167, "learning_rate": 0.0001, "loss": 1.5888, "loss/crossentropy": 2.371055006980896, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.1825580596923828, "step": 777 }, { "epoch": 0.011617228738455566, "grad_norm": 0.64453125, "grad_norm_var": 0.004558245340983073, "learning_rate": 0.0001, "loss": 1.7082, "loss/crossentropy": 2.692077875137329, "loss/fcd": 1.515625, "loss/idx": 4.5, "loss/logits": 0.1926237791776657, "step": 778 }, { "epoch": 0.011632160909070547, "grad_norm": 0.51171875, "grad_norm_var": 0.0056461970011393225, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.7112133502960205, "loss/fcd": 1.21875, "loss/idx": 4.5, "loss/logits": 0.1343764252960682, "step": 779 }, { "epoch": 0.011647093079685528, "grad_norm": 0.6171875, "grad_norm_var": 0.00484612782796224, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.5949923992156982, "loss/fcd": 1.34375, "loss/idx": 4.5, "loss/logits": 0.15916766971349716, "step": 780 }, { "epoch": 0.01166202525030051, "grad_norm": 0.7265625, "grad_norm_var": 0.00518945058186849, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.5727131366729736, "loss/fcd": 1.40625, "loss/idx": 4.5, "loss/logits": 0.16936737298965454, "step": 781 }, { "epoch": 0.011676957420915492, "grad_norm": 0.61328125, "grad_norm_var": 0.0049130757649739586, "learning_rate": 0.0001, "loss": 1.8494, "loss/crossentropy": 2.4157882928848267, "loss/fcd": 1.640625, "loss/idx": 4.75, "loss/logits": 0.2087443321943283, "step": 782 }, { "epoch": 0.011691889591530473, "grad_norm": 3.34375, "grad_norm_var": 0.45982252756754555, "learning_rate": 0.0001, "loss": 1.911, "loss/crossentropy": 2.7091548442840576, "loss/fcd": 1.703125, "loss/idx": 5.0, "loss/logits": 0.20787205547094345, "step": 783 }, { "epoch": 0.011706821762145454, "grad_norm": 2.828125, "grad_norm_var": 0.7114115397135417, "learning_rate": 0.0001, "loss": 1.8491, "loss/crossentropy": 2.588376045227051, "loss/fcd": 1.67578125, "loss/idx": 5.0, "loss/logits": 0.1733308508992195, "step": 784 }, { "epoch": 0.011721753932760436, "grad_norm": 1.6328125, "grad_norm_var": 0.7288431803385417, "learning_rate": 0.0001, "loss": 1.881, "loss/crossentropy": 2.472332239151001, "loss/fcd": 1.67578125, "loss/idx": 5.0, "loss/logits": 0.20525866746902466, "step": 785 }, { "epoch": 0.011736686103375417, "grad_norm": 1.1875, "grad_norm_var": 0.7180236180623373, "learning_rate": 0.0001, "loss": 1.8963, "loss/crossentropy": 2.606132984161377, "loss/fcd": 1.6875, "loss/idx": 5.0, "loss/logits": 0.2088092416524887, "step": 786 }, { "epoch": 0.011751618273990398, "grad_norm": 1.3515625, "grad_norm_var": 0.7157895406087239, "learning_rate": 0.0001, "loss": 2.0326, "loss/crossentropy": 2.485718846321106, "loss/fcd": 1.8125, "loss/idx": 5.0, "loss/logits": 0.22014504671096802, "step": 787 }, { "epoch": 0.011766550444605381, "grad_norm": 0.99609375, "grad_norm_var": 0.699424680074056, "learning_rate": 0.0001, "loss": 1.7551, "loss/crossentropy": 2.6328842639923096, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.1769460216164589, "step": 788 }, { "epoch": 0.011781482615220362, "grad_norm": 1.0859375, "grad_norm_var": 0.6894225438435873, "learning_rate": 0.0001, "loss": 1.78, "loss/crossentropy": 2.5578906536102295, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.15887843072414398, "step": 789 }, { "epoch": 0.011796414785835343, "grad_norm": 0.88671875, "grad_norm_var": 0.6718770345052083, "learning_rate": 0.0001, "loss": 1.8571, "loss/crossentropy": 2.5024824142456055, "loss/fcd": 1.65234375, "loss/idx": 5.0, "loss/logits": 0.20473603904247284, "step": 790 }, { "epoch": 0.011811346956450324, "grad_norm": 1.0234375, "grad_norm_var": 0.6604543050130208, "learning_rate": 0.0001, "loss": 1.8476, "loss/crossentropy": 2.6586837768554688, "loss/fcd": 1.66015625, "loss/idx": 5.0, "loss/logits": 0.18739935755729675, "step": 791 }, { "epoch": 0.011826279127065306, "grad_norm": 1.1640625, "grad_norm_var": 0.642718251546224, "learning_rate": 0.0001, "loss": 2.1369, "loss/crossentropy": 2.6025651693344116, "loss/fcd": 1.89453125, "loss/idx": 5.0, "loss/logits": 0.24233842641115189, "step": 792 }, { "epoch": 0.011841211297680287, "grad_norm": 0.78125, "grad_norm_var": 0.6336949030558269, "learning_rate": 0.0001, "loss": 1.7646, "loss/crossentropy": 2.4655721187591553, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.18643079698085785, "step": 793 }, { "epoch": 0.011856143468295268, "grad_norm": 0.8125, "grad_norm_var": 0.6227457682291667, "learning_rate": 0.0001, "loss": 1.7683, "loss/crossentropy": 2.915129542350769, "loss/fcd": 1.5703125, "loss/idx": 5.0, "loss/logits": 0.1980355903506279, "step": 794 }, { "epoch": 0.01187107563891025, "grad_norm": 0.77734375, "grad_norm_var": 0.6019765218098958, "learning_rate": 0.0001, "loss": 1.7054, "loss/crossentropy": 2.5375667810440063, "loss/fcd": 1.52734375, "loss/idx": 5.0, "loss/logits": 0.1780666932463646, "step": 795 }, { "epoch": 0.011886007809525232, "grad_norm": 0.8515625, "grad_norm_var": 0.5859700520833333, "learning_rate": 0.0001, "loss": 1.9032, "loss/crossentropy": 2.496041178703308, "loss/fcd": 1.6875, "loss/idx": 5.0, "loss/logits": 0.21571382135152817, "step": 796 }, { "epoch": 0.011900939980140213, "grad_norm": 0.796875, "grad_norm_var": 0.5813351949055989, "learning_rate": 0.0001, "loss": 1.7965, "loss/crossentropy": 2.4716135263442993, "loss/fcd": 1.6015625, "loss/idx": 5.0, "loss/logits": 0.19488991051912308, "step": 797 }, { "epoch": 0.011915872150755194, "grad_norm": 0.75, "grad_norm_var": 0.5707452774047852, "learning_rate": 0.0001, "loss": 1.6519, "loss/crossentropy": 2.6149767637252808, "loss/fcd": 1.48046875, "loss/idx": 5.0, "loss/logits": 0.17144525051116943, "step": 798 }, { "epoch": 0.011930804321370177, "grad_norm": 1.546875, "grad_norm_var": 0.27495110829671227, "learning_rate": 0.0001, "loss": 2.0333, "loss/crossentropy": 2.268043637275696, "loss/fcd": 1.8359375, "loss/idx": 5.0, "loss/logits": 0.19733671098947525, "step": 799 }, { "epoch": 0.011945736491985158, "grad_norm": 0.80859375, "grad_norm_var": 0.07921040852864583, "learning_rate": 0.0001, "loss": 1.7229, "loss/crossentropy": 2.6515145301818848, "loss/fcd": 1.53515625, "loss/idx": 5.0, "loss/logits": 0.1877676323056221, "step": 800 }, { "epoch": 0.011960668662600139, "grad_norm": 0.80859375, "grad_norm_var": 0.055237770080566406, "learning_rate": 0.0001, "loss": 1.9004, "loss/crossentropy": 2.5546780824661255, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.21680361777544022, "step": 801 }, { "epoch": 0.01197560083321512, "grad_norm": 0.76171875, "grad_norm_var": 0.05460713704427083, "learning_rate": 0.0001, "loss": 1.7663, "loss/crossentropy": 2.465883493423462, "loss/fcd": 1.58203125, "loss/idx": 5.0, "loss/logits": 0.18430602550506592, "step": 802 }, { "epoch": 0.011990533003830102, "grad_norm": 0.859375, "grad_norm_var": 0.04340794881184896, "learning_rate": 0.0001, "loss": 1.7138, "loss/crossentropy": 2.5838167667388916, "loss/fcd": 1.53125, "loss/idx": 5.0, "loss/logits": 0.1825515776872635, "step": 803 }, { "epoch": 0.012005465174445083, "grad_norm": 0.66796875, "grad_norm_var": 0.046783192952473955, "learning_rate": 0.0001, "loss": 1.7907, "loss/crossentropy": 2.5992263555526733, "loss/fcd": 1.58984375, "loss/idx": 5.0, "loss/logits": 0.20081757754087448, "step": 804 }, { "epoch": 0.012020397345060064, "grad_norm": 0.67578125, "grad_norm_var": 0.047070248921712236, "learning_rate": 0.0001, "loss": 1.6835, "loss/crossentropy": 2.5859053134918213, "loss/fcd": 1.51171875, "loss/idx": 5.0, "loss/logits": 0.17173586785793304, "step": 805 }, { "epoch": 0.012035329515675047, "grad_norm": 0.6484375, "grad_norm_var": 0.0501922607421875, "learning_rate": 0.0001, "loss": 1.6041, "loss/crossentropy": 2.4893710613250732, "loss/fcd": 1.43359375, "loss/idx": 5.0, "loss/logits": 0.17046400159597397, "step": 806 }, { "epoch": 0.012050261686290028, "grad_norm": 0.7265625, "grad_norm_var": 0.04916788736979167, "learning_rate": 0.0001, "loss": 1.8526, "loss/crossentropy": 2.5315016508102417, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.23148700594902039, "step": 807 }, { "epoch": 0.012065193856905009, "grad_norm": 0.83984375, "grad_norm_var": 0.041722043355305986, "learning_rate": 0.0001, "loss": 1.9535, "loss/crossentropy": 2.791074752807617, "loss/fcd": 1.7109375, "loss/idx": 5.0, "loss/logits": 0.24258101731538773, "step": 808 }, { "epoch": 0.01208012602751999, "grad_norm": 0.73828125, "grad_norm_var": 0.042057037353515625, "learning_rate": 0.0001, "loss": 1.6953, "loss/crossentropy": 2.7802544832229614, "loss/fcd": 1.52734375, "loss/idx": 5.0, "loss/logits": 0.1679670214653015, "step": 809 }, { "epoch": 0.012095058198134973, "grad_norm": 0.79296875, "grad_norm_var": 0.042092323303222656, "learning_rate": 0.0001, "loss": 1.6601, "loss/crossentropy": 2.8163868188858032, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.1679074689745903, "step": 810 }, { "epoch": 0.012109990368749953, "grad_norm": 0.65625, "grad_norm_var": 0.04362767537434896, "learning_rate": 0.0001, "loss": 1.6613, "loss/crossentropy": 2.4688737392425537, "loss/fcd": 1.48828125, "loss/idx": 5.0, "loss/logits": 0.17299792170524597, "step": 811 }, { "epoch": 0.012124922539364934, "grad_norm": 0.77734375, "grad_norm_var": 0.043541908264160156, "learning_rate": 0.0001, "loss": 1.6717, "loss/crossentropy": 2.6986693143844604, "loss/fcd": 1.48828125, "loss/idx": 5.0, "loss/logits": 0.18341868370771408, "step": 812 }, { "epoch": 0.012139854709979915, "grad_norm": 0.9609375, "grad_norm_var": 0.04507999420166016, "learning_rate": 0.0001, "loss": 1.8393, "loss/crossentropy": 2.588726282119751, "loss/fcd": 1.6171875, "loss/idx": 5.0, "loss/logits": 0.22206994891166687, "step": 813 }, { "epoch": 0.012154786880594898, "grad_norm": 0.6796875, "grad_norm_var": 0.045986366271972653, "learning_rate": 0.0001, "loss": 1.766, "loss/crossentropy": 2.444581627845764, "loss/fcd": 1.5625, "loss/idx": 5.0, "loss/logits": 0.20353230834007263, "step": 814 }, { "epoch": 0.012169719051209879, "grad_norm": 0.765625, "grad_norm_var": 0.007305335998535156, "learning_rate": 0.0001, "loss": 1.8629, "loss/crossentropy": 2.46638023853302, "loss/fcd": 1.64453125, "loss/idx": 5.0, "loss/logits": 0.21839633584022522, "step": 815 }, { "epoch": 0.01218465122182486, "grad_norm": 0.69921875, "grad_norm_var": 0.00735162099202474, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.41643226146698, "loss/fcd": 1.45703125, "loss/idx": 5.0, "loss/logits": 0.17698514461517334, "step": 816 }, { "epoch": 0.012199583392439843, "grad_norm": 0.82421875, "grad_norm_var": 0.00748132069905599, "learning_rate": 0.0001, "loss": 1.9725, "loss/crossentropy": 2.3987841606140137, "loss/fcd": 1.73046875, "loss/idx": 5.0, "loss/logits": 0.24200860410928726, "step": 817 }, { "epoch": 0.012214515563054824, "grad_norm": 0.56640625, "grad_norm_var": 0.009681129455566406, "learning_rate": 0.0001, "loss": 1.5978, "loss/crossentropy": 2.526577949523926, "loss/fcd": 1.4453125, "loss/idx": 5.0, "loss/logits": 0.15250347554683685, "step": 818 }, { "epoch": 0.012229447733669805, "grad_norm": 1.234375, "grad_norm_var": 0.024317359924316405, "learning_rate": 0.0001, "loss": 1.9582, "loss/crossentropy": 2.2526134252548218, "loss/fcd": 1.7734375, "loss/idx": 5.0, "loss/logits": 0.18472721427679062, "step": 819 }, { "epoch": 0.012244379904284786, "grad_norm": 0.84765625, "grad_norm_var": 0.02398980458577474, "learning_rate": 0.0001, "loss": 1.9581, "loss/crossentropy": 2.593246102333069, "loss/fcd": 1.734375, "loss/idx": 5.0, "loss/logits": 0.223709836602211, "step": 820 }, { "epoch": 0.012259312074899768, "grad_norm": 0.6953125, "grad_norm_var": 0.023749796549479167, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.491560935974121, "loss/fcd": 1.5078125, "loss/idx": 5.0, "loss/logits": 0.18277642875909805, "step": 821 }, { "epoch": 0.01227424424551475, "grad_norm": 0.6953125, "grad_norm_var": 0.023075358072916666, "learning_rate": 0.0001, "loss": 1.7343, "loss/crossentropy": 2.584471344947815, "loss/fcd": 1.5390625, "loss/idx": 5.0, "loss/logits": 0.19519615173339844, "step": 822 }, { "epoch": 0.01228917641612973, "grad_norm": 0.7578125, "grad_norm_var": 0.022908528645833332, "learning_rate": 0.0001, "loss": 1.7162, "loss/crossentropy": 2.3712323904037476, "loss/fcd": 1.54296875, "loss/idx": 5.0, "loss/logits": 0.17325877398252487, "step": 823 }, { "epoch": 0.012304108586744713, "grad_norm": 0.8515625, "grad_norm_var": 0.02300561269124349, "learning_rate": 0.0001, "loss": 1.8932, "loss/crossentropy": 2.8186241388320923, "loss/fcd": 1.67578125, "loss/idx": 5.0, "loss/logits": 0.21743790805339813, "step": 824 }, { "epoch": 0.012319040757359694, "grad_norm": 0.58984375, "grad_norm_var": 0.025286293029785155, "learning_rate": 0.0001, "loss": 1.5933, "loss/crossentropy": 2.623106360435486, "loss/fcd": 1.43359375, "loss/idx": 5.0, "loss/logits": 0.15974538028240204, "step": 825 }, { "epoch": 0.012333972927974675, "grad_norm": 0.828125, "grad_norm_var": 0.025449371337890624, "learning_rate": 0.0001, "loss": 1.7932, "loss/crossentropy": 2.6241841316223145, "loss/fcd": 1.60546875, "loss/idx": 5.0, "loss/logits": 0.18778088688850403, "step": 826 }, { "epoch": 0.012348905098589656, "grad_norm": 0.74609375, "grad_norm_var": 0.024509112040201824, "learning_rate": 0.0001, "loss": 1.7246, "loss/crossentropy": 2.3002325296401978, "loss/fcd": 1.5546875, "loss/idx": 5.0, "loss/logits": 0.16992325335741043, "step": 827 }, { "epoch": 0.012363837269204639, "grad_norm": 0.76953125, "grad_norm_var": 0.024518267313639323, "learning_rate": 0.0001, "loss": 1.6774, "loss/crossentropy": 2.769911289215088, "loss/fcd": 1.49609375, "loss/idx": 5.0, "loss/logits": 0.18125663697719574, "step": 828 }, { "epoch": 0.01237876943981962, "grad_norm": 0.80078125, "grad_norm_var": 0.022299957275390626, "learning_rate": 0.0001, "loss": 1.6901, "loss/crossentropy": 2.5946470499038696, "loss/fcd": 1.51953125, "loss/idx": 5.0, "loss/logits": 0.17055295407772064, "step": 829 }, { "epoch": 0.0123937016104346, "grad_norm": 0.6484375, "grad_norm_var": 0.022745513916015626, "learning_rate": 0.0001, "loss": 1.6234, "loss/crossentropy": 2.5415327548980713, "loss/fcd": 1.47265625, "loss/idx": 5.0, "loss/logits": 0.15074985474348068, "step": 830 }, { "epoch": 0.012408633781049581, "grad_norm": 0.6328125, "grad_norm_var": 0.02392578125, "learning_rate": 0.0001, "loss": 1.7857, "loss/crossentropy": 2.580002784729004, "loss/fcd": 1.58203125, "loss/idx": 5.0, "loss/logits": 0.2036324143409729, "step": 831 }, { "epoch": 0.012423565951664564, "grad_norm": 0.87890625, "grad_norm_var": 0.02444636027018229, "learning_rate": 0.0001, "loss": 1.7464, "loss/crossentropy": 2.715728998184204, "loss/fcd": 1.5625, "loss/idx": 5.0, "loss/logits": 0.18391364812850952, "step": 832 }, { "epoch": 0.012438498122279545, "grad_norm": 0.86328125, "grad_norm_var": 0.024808756510416665, "learning_rate": 0.0001, "loss": 1.9102, "loss/crossentropy": 2.602025628089905, "loss/fcd": 1.70703125, "loss/idx": 5.0, "loss/logits": 0.2031889334321022, "step": 833 }, { "epoch": 0.012453430292894526, "grad_norm": 0.8125, "grad_norm_var": 0.021736590067545573, "learning_rate": 0.0001, "loss": 1.817, "loss/crossentropy": 2.6534132957458496, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.195940762758255, "step": 834 }, { "epoch": 0.012468362463509509, "grad_norm": 0.59375, "grad_norm_var": 0.00949548085530599, "learning_rate": 0.0001, "loss": 1.6428, "loss/crossentropy": 2.453692674636841, "loss/fcd": 1.47265625, "loss/idx": 5.0, "loss/logits": 0.17016924917697906, "step": 835 }, { "epoch": 0.01248329463412449, "grad_norm": 0.78515625, "grad_norm_var": 0.008931922912597656, "learning_rate": 0.0001, "loss": 1.666, "loss/crossentropy": 2.7629364728927612, "loss/fcd": 1.49609375, "loss/idx": 5.0, "loss/logits": 0.16993117332458496, "step": 836 }, { "epoch": 0.01249822680473947, "grad_norm": 0.67578125, "grad_norm_var": 0.009089914957682292, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.5838505029678345, "loss/fcd": 1.4609375, "loss/idx": 5.0, "loss/logits": 0.16975942254066467, "step": 837 }, { "epoch": 0.012513158975354452, "grad_norm": 0.66796875, "grad_norm_var": 0.00932000478108724, "learning_rate": 0.0001, "loss": 1.6076, "loss/crossentropy": 2.676839232444763, "loss/fcd": 1.453125, "loss/idx": 5.0, "loss/logits": 0.15447519719600677, "step": 838 }, { "epoch": 0.012528091145969434, "grad_norm": 0.73828125, "grad_norm_var": 0.009307607014973959, "learning_rate": 0.0001, "loss": 1.9002, "loss/crossentropy": 2.4299418926239014, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.2166183888912201, "step": 839 }, { "epoch": 0.012543023316584415, "grad_norm": 0.71875, "grad_norm_var": 0.008481852213541667, "learning_rate": 0.0001, "loss": 1.6825, "loss/crossentropy": 2.6252626180648804, "loss/fcd": 1.51171875, "loss/idx": 5.0, "loss/logits": 0.17076187580823898, "step": 840 }, { "epoch": 0.012557955487199396, "grad_norm": 0.94140625, "grad_norm_var": 0.009431711832682292, "learning_rate": 0.0001, "loss": 1.8972, "loss/crossentropy": 2.530099391937256, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.21358779817819595, "step": 841 }, { "epoch": 0.012572887657814379, "grad_norm": 0.7109375, "grad_norm_var": 0.009168497721354167, "learning_rate": 0.0001, "loss": 1.6495, "loss/crossentropy": 2.651611328125, "loss/fcd": 1.47265625, "loss/idx": 5.0, "loss/logits": 0.17685066163539886, "step": 842 }, { "epoch": 0.01258781982842936, "grad_norm": 0.75, "grad_norm_var": 0.009167925516764323, "learning_rate": 0.0001, "loss": 1.6369, "loss/crossentropy": 2.6540257930755615, "loss/fcd": 1.46875, "loss/idx": 5.0, "loss/logits": 0.16816890239715576, "step": 843 }, { "epoch": 0.012602751999044341, "grad_norm": 0.6796875, "grad_norm_var": 0.009429677327473959, "learning_rate": 0.0001, "loss": 1.681, "loss/crossentropy": 2.589913010597229, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.18879953026771545, "step": 844 }, { "epoch": 0.012617684169659322, "grad_norm": 0.69921875, "grad_norm_var": 0.009300740559895833, "learning_rate": 0.0001, "loss": 1.6358, "loss/crossentropy": 2.5078059434890747, "loss/fcd": 1.48046875, "loss/idx": 5.0, "loss/logits": 0.15536697953939438, "step": 845 }, { "epoch": 0.012632616340274305, "grad_norm": 0.96875, "grad_norm_var": 0.011917877197265624, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.7713186740875244, "loss/fcd": 1.4296875, "loss/idx": 5.0, "loss/logits": 0.1565636619925499, "step": 846 }, { "epoch": 0.012647548510889286, "grad_norm": 0.671875, "grad_norm_var": 0.01136474609375, "learning_rate": 0.0001, "loss": 1.5134, "loss/crossentropy": 2.5787158012390137, "loss/fcd": 1.37109375, "loss/idx": 5.0, "loss/logits": 0.14233843982219696, "step": 847 }, { "epoch": 0.012662480681504266, "grad_norm": 0.71875, "grad_norm_var": 0.010423723856608074, "learning_rate": 0.0001, "loss": 1.7599, "loss/crossentropy": 2.6209789514541626, "loss/fcd": 1.57421875, "loss/idx": 5.0, "loss/logits": 0.18572837859392166, "step": 848 }, { "epoch": 0.01267741285211925, "grad_norm": 0.87109375, "grad_norm_var": 0.010545794169108074, "learning_rate": 0.0001, "loss": 1.7243, "loss/crossentropy": 3.124199151992798, "loss/fcd": 1.52734375, "loss/idx": 5.0, "loss/logits": 0.19692812114953995, "step": 849 }, { "epoch": 0.01269234502273423, "grad_norm": 0.70703125, "grad_norm_var": 0.010365549723307292, "learning_rate": 0.0001, "loss": 1.8374, "loss/crossentropy": 2.7884299755096436, "loss/fcd": 1.62109375, "loss/idx": 5.0, "loss/logits": 0.21627703309059143, "step": 850 }, { "epoch": 0.012707277193349211, "grad_norm": 0.7421875, "grad_norm_var": 0.008775838216145833, "learning_rate": 0.0001, "loss": 1.8568, "loss/crossentropy": 2.615527868270874, "loss/fcd": 1.63671875, "loss/idx": 5.0, "loss/logits": 0.22006352990865707, "step": 851 }, { "epoch": 0.012722209363964192, "grad_norm": 0.6484375, "grad_norm_var": 0.00935662587483724, "learning_rate": 0.0001, "loss": 1.7633, "loss/crossentropy": 2.4802143573760986, "loss/fcd": 1.56640625, "loss/idx": 5.0, "loss/logits": 0.19686861336231232, "step": 852 }, { "epoch": 0.012737141534579175, "grad_norm": 0.78515625, "grad_norm_var": 0.009103838602701824, "learning_rate": 0.0001, "loss": 1.7749, "loss/crossentropy": 2.7521532773971558, "loss/fcd": 1.5859375, "loss/idx": 5.0, "loss/logits": 0.18896755576133728, "step": 853 }, { "epoch": 0.012752073705194156, "grad_norm": 0.73046875, "grad_norm_var": 0.008654212951660157, "learning_rate": 0.0001, "loss": 1.9076, "loss/crossentropy": 2.4716649055480957, "loss/fcd": 1.68359375, "loss/idx": 5.0, "loss/logits": 0.22396356612443924, "step": 854 }, { "epoch": 0.012767005875809137, "grad_norm": 0.87109375, "grad_norm_var": 0.009458351135253906, "learning_rate": 0.0001, "loss": 1.7463, "loss/crossentropy": 2.7693766355514526, "loss/fcd": 1.5703125, "loss/idx": 5.0, "loss/logits": 0.17594317346811295, "step": 855 }, { "epoch": 0.012781938046424118, "grad_norm": 0.8203125, "grad_norm_var": 0.009498023986816406, "learning_rate": 0.0001, "loss": 1.8636, "loss/crossentropy": 2.4391822814941406, "loss/fcd": 1.63671875, "loss/idx": 5.0, "loss/logits": 0.22686351835727692, "step": 856 }, { "epoch": 0.0127968702170391, "grad_norm": 0.671875, "grad_norm_var": 0.0078704833984375, "learning_rate": 0.0001, "loss": 1.7254, "loss/crossentropy": 2.591761350631714, "loss/fcd": 1.53125, "loss/idx": 5.0, "loss/logits": 0.19416391849517822, "step": 857 }, { "epoch": 0.012811802387654081, "grad_norm": 0.8203125, "grad_norm_var": 0.008005777994791666, "learning_rate": 0.0001, "loss": 1.8034, "loss/crossentropy": 2.520877718925476, "loss/fcd": 1.60546875, "loss/idx": 5.0, "loss/logits": 0.1978968232870102, "step": 858 }, { "epoch": 0.012826734558269062, "grad_norm": 0.6875, "grad_norm_var": 0.008331298828125, "learning_rate": 0.0001, "loss": 1.7766, "loss/crossentropy": 2.5855950117111206, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.198471337556839, "step": 859 }, { "epoch": 0.012841666728884045, "grad_norm": 0.64453125, "grad_norm_var": 0.008765602111816406, "learning_rate": 0.0001, "loss": 1.73, "loss/crossentropy": 2.7856252193450928, "loss/fcd": 1.53125, "loss/idx": 5.0, "loss/logits": 0.19874712824821472, "step": 860 }, { "epoch": 0.012856598899499026, "grad_norm": 0.6640625, "grad_norm_var": 0.009098052978515625, "learning_rate": 0.0001, "loss": 1.7602, "loss/crossentropy": 2.439252257347107, "loss/fcd": 1.5703125, "loss/idx": 5.0, "loss/logits": 0.18988988548517227, "step": 861 }, { "epoch": 0.012871531070114007, "grad_norm": 0.69921875, "grad_norm_var": 0.005829811096191406, "learning_rate": 0.0001, "loss": 1.585, "loss/crossentropy": 2.6466290950775146, "loss/fcd": 1.421875, "loss/idx": 5.0, "loss/logits": 0.16309361904859543, "step": 862 }, { "epoch": 0.012886463240728988, "grad_norm": 0.64453125, "grad_norm_var": 0.006105295817057292, "learning_rate": 0.0001, "loss": 1.7727, "loss/crossentropy": 2.5705440044403076, "loss/fcd": 1.578125, "loss/idx": 5.0, "loss/logits": 0.19455686211585999, "step": 863 }, { "epoch": 0.01290139541134397, "grad_norm": 0.61328125, "grad_norm_var": 0.006999651590983073, "learning_rate": 0.0001, "loss": 1.6992, "loss/crossentropy": 2.6901649236679077, "loss/fcd": 1.515625, "loss/idx": 5.0, "loss/logits": 0.18361696600914001, "step": 864 }, { "epoch": 0.012916327581958952, "grad_norm": 0.81640625, "grad_norm_var": 0.00613091786702474, "learning_rate": 0.0001, "loss": 1.7852, "loss/crossentropy": 2.6737335920333862, "loss/fcd": 1.58203125, "loss/idx": 5.0, "loss/logits": 0.20320622622966766, "step": 865 }, { "epoch": 0.012931259752573932, "grad_norm": 0.671875, "grad_norm_var": 0.0062825520833333336, "learning_rate": 0.0001, "loss": 1.5996, "loss/crossentropy": 2.6164817810058594, "loss/fcd": 1.42578125, "loss/idx": 5.0, "loss/logits": 0.17384624481201172, "step": 866 }, { "epoch": 0.012946191923188915, "grad_norm": 0.6953125, "grad_norm_var": 0.006285603841145833, "learning_rate": 0.0001, "loss": 1.6747, "loss/crossentropy": 2.4937325716018677, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.18247970193624496, "step": 867 }, { "epoch": 0.012961124093803896, "grad_norm": 0.66015625, "grad_norm_var": 0.006185849507649739, "learning_rate": 0.0001, "loss": 1.716, "loss/crossentropy": 2.7091100215911865, "loss/fcd": 1.5234375, "loss/idx": 5.0, "loss/logits": 0.19258494675159454, "step": 868 }, { "epoch": 0.012976056264418877, "grad_norm": 0.72265625, "grad_norm_var": 0.00587457021077474, "learning_rate": 0.0001, "loss": 1.7992, "loss/crossentropy": 2.482669949531555, "loss/fcd": 1.59375, "loss/idx": 5.0, "loss/logits": 0.20546174049377441, "step": 869 }, { "epoch": 0.012990988435033858, "grad_norm": 0.6484375, "grad_norm_var": 0.006121571858723958, "learning_rate": 0.0001, "loss": 1.6672, "loss/crossentropy": 2.4320571422576904, "loss/fcd": 1.484375, "loss/idx": 5.0, "loss/logits": 0.18284663558006287, "step": 870 }, { "epoch": 0.01300592060564884, "grad_norm": 0.5625, "grad_norm_var": 0.005423418680826823, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.7169127464294434, "loss/fcd": 1.41015625, "loss/idx": 5.0, "loss/logits": 0.16260702162981033, "step": 871 }, { "epoch": 0.013020852776263822, "grad_norm": 0.796875, "grad_norm_var": 0.0050511042277018225, "learning_rate": 0.0001, "loss": 1.6553, "loss/crossentropy": 2.3346874713897705, "loss/fcd": 1.4921875, "loss/idx": 5.0, "loss/logits": 0.1631385162472725, "step": 872 }, { "epoch": 0.013035784946878803, "grad_norm": 0.78515625, "grad_norm_var": 0.005598704020182292, "learning_rate": 0.0001, "loss": 1.5253, "loss/crossentropy": 2.60079562664032, "loss/fcd": 1.3828125, "loss/idx": 5.0, "loss/logits": 0.14248249679803848, "step": 873 }, { "epoch": 0.013050717117493784, "grad_norm": 0.365234375, "grad_norm_var": 0.010987202326456705, "learning_rate": 0.0001, "loss": 1.5183, "loss/crossentropy": 2.696221947669983, "loss/fcd": 1.359375, "loss/idx": 5.5, "loss/logits": 0.15895313769578934, "step": 874 }, { "epoch": 0.013065649288108766, "grad_norm": 1.3828125, "grad_norm_var": 0.043070713678995766, "learning_rate": 0.0001, "loss": 3.0924, "loss/crossentropy": 2.6322977542877197, "loss/fcd": 2.62109375, "loss/idx": 5.5, "loss/logits": 0.47132138907909393, "step": 875 }, { "epoch": 0.013080581458723747, "grad_norm": 0.3828125, "grad_norm_var": 0.04966479937235514, "learning_rate": 0.0001, "loss": 1.6102, "loss/crossentropy": 2.672438383102417, "loss/fcd": 1.43359375, "loss/idx": 5.5, "loss/logits": 0.17659874260425568, "step": 876 }, { "epoch": 0.013095513629338728, "grad_norm": 0.345703125, "grad_norm_var": 0.05728956858317057, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 2.610317587852478, "loss/fcd": 1.390625, "loss/idx": 5.5, "loss/logits": 0.17338179051876068, "step": 877 }, { "epoch": 0.013110445799953711, "grad_norm": 0.486328125, "grad_norm_var": 0.05942228635152181, "learning_rate": 0.0001, "loss": 1.6579, "loss/crossentropy": 2.7957422733306885, "loss/fcd": 1.46484375, "loss/idx": 5.5, "loss/logits": 0.19305869936943054, "step": 878 }, { "epoch": 0.013125377970568692, "grad_norm": 0.484375, "grad_norm_var": 0.06138253211975098, "learning_rate": 0.0001, "loss": 1.8523, "loss/crossentropy": 2.4633307456970215, "loss/fcd": 1.61328125, "loss/idx": 5.5, "loss/logits": 0.23906587064266205, "step": 879 }, { "epoch": 0.013140310141183673, "grad_norm": 0.330078125, "grad_norm_var": 0.06782881418863933, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.651333808898926, "loss/fcd": 1.34375, "loss/idx": 5.5, "loss/logits": 0.16093496978282928, "step": 880 }, { "epoch": 0.013155242311798654, "grad_norm": 0.36328125, "grad_norm_var": 0.06961358388264974, "learning_rate": 0.0001, "loss": 1.6017, "loss/crossentropy": 2.7588586807250977, "loss/fcd": 1.421875, "loss/idx": 5.5, "loss/logits": 0.17987017333507538, "step": 881 }, { "epoch": 0.013170174482413637, "grad_norm": 0.298828125, "grad_norm_var": 0.07499616940816244, "learning_rate": 0.0001, "loss": 1.4945, "loss/crossentropy": 2.5945587158203125, "loss/fcd": 1.328125, "loss/idx": 5.5, "loss/logits": 0.16634615510702133, "step": 882 }, { "epoch": 0.013185106653028618, "grad_norm": 0.369140625, "grad_norm_var": 0.07671356201171875, "learning_rate": 0.0001, "loss": 1.5697, "loss/crossentropy": 2.6372928619384766, "loss/fcd": 1.39453125, "loss/idx": 5.5, "loss/logits": 0.17521393299102783, "step": 883 }, { "epoch": 0.013200038823643599, "grad_norm": 0.47265625, "grad_norm_var": 0.07644500732421874, "learning_rate": 0.0001, "loss": 1.6773, "loss/crossentropy": 2.472917318344116, "loss/fcd": 1.4921875, "loss/idx": 5.5, "loss/logits": 0.18511150032281876, "step": 884 }, { "epoch": 0.013214970994258581, "grad_norm": 0.310546875, "grad_norm_var": 0.07756180763244629, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.264296770095825, "loss/fcd": 1.40625, "loss/idx": 5.5, "loss/logits": 0.18537598103284836, "step": 885 }, { "epoch": 0.013229903164873562, "grad_norm": 0.37890625, "grad_norm_var": 0.07763199806213379, "learning_rate": 0.0001, "loss": 1.6628, "loss/crossentropy": 2.6821300983428955, "loss/fcd": 1.45703125, "loss/idx": 5.5, "loss/logits": 0.20572075992822647, "step": 886 }, { "epoch": 0.013244835335488543, "grad_norm": 0.314453125, "grad_norm_var": 0.07964859008789063, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.712405562400818, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.1682407334446907, "step": 887 }, { "epoch": 0.013259767506103524, "grad_norm": 0.392578125, "grad_norm_var": 0.07341370582580567, "learning_rate": 0.0001, "loss": 1.5617, "loss/crossentropy": 2.6705490350723267, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.17886866629123688, "step": 888 }, { "epoch": 0.013274699676718507, "grad_norm": 0.474609375, "grad_norm_var": 0.06624393463134766, "learning_rate": 0.0001, "loss": 1.7751, "loss/crossentropy": 2.8483877182006836, "loss/fcd": 1.5546875, "loss/idx": 5.5, "loss/logits": 0.22039098292589188, "step": 889 }, { "epoch": 0.013289631847333488, "grad_norm": 0.7421875, "grad_norm_var": 0.07101413408915201, "learning_rate": 0.0001, "loss": 1.7931, "loss/crossentropy": 2.6503173112869263, "loss/fcd": 1.578125, "loss/idx": 5.5, "loss/logits": 0.21500100940465927, "step": 890 }, { "epoch": 0.013304564017948469, "grad_norm": 0.306640625, "grad_norm_var": 0.012502543131510417, "learning_rate": 0.0001, "loss": 1.5623, "loss/crossentropy": 2.57335889339447, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.1794382557272911, "step": 891 }, { "epoch": 0.01331949618856345, "grad_norm": 0.388671875, "grad_norm_var": 0.012488667170206707, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.6493390798568726, "loss/fcd": 1.390625, "loss/idx": 5.5, "loss/logits": 0.17513766884803772, "step": 892 }, { "epoch": 0.013334428359178432, "grad_norm": 0.365234375, "grad_norm_var": 0.012361510594685873, "learning_rate": 0.0001, "loss": 1.6516, "loss/crossentropy": 2.388102173805237, "loss/fcd": 1.46484375, "loss/idx": 5.5, "loss/logits": 0.18672683835029602, "step": 893 }, { "epoch": 0.013349360529793413, "grad_norm": 0.392578125, "grad_norm_var": 0.011893065770467122, "learning_rate": 0.0001, "loss": 1.5088, "loss/crossentropy": 2.585448145866394, "loss/fcd": 1.33984375, "loss/idx": 5.5, "loss/logits": 0.16900470852851868, "step": 894 }, { "epoch": 0.013364292700408394, "grad_norm": 0.333984375, "grad_norm_var": 0.011595662434895833, "learning_rate": 0.0001, "loss": 1.6176, "loss/crossentropy": 2.563124418258667, "loss/fcd": 1.41796875, "loss/idx": 5.5, "loss/logits": 0.19963373243808746, "step": 895 }, { "epoch": 0.013379224871023377, "grad_norm": 0.46875, "grad_norm_var": 0.011696100234985352, "learning_rate": 0.0001, "loss": 1.7444, "loss/crossentropy": 2.6045339107513428, "loss/fcd": 1.53515625, "loss/idx": 5.5, "loss/logits": 0.20921117812395096, "step": 896 }, { "epoch": 0.013394157041638358, "grad_norm": 0.400390625, "grad_norm_var": 0.01160882314046224, "learning_rate": 0.0001, "loss": 1.5898, "loss/crossentropy": 2.5683369636535645, "loss/fcd": 1.4140625, "loss/idx": 5.5, "loss/logits": 0.17578723281621933, "step": 897 }, { "epoch": 0.013409089212253339, "grad_norm": 0.421875, "grad_norm_var": 0.010884841283162435, "learning_rate": 0.0001, "loss": 1.6952, "loss/crossentropy": 2.586871862411499, "loss/fcd": 1.46875, "loss/idx": 5.5, "loss/logits": 0.22648146003484726, "step": 898 }, { "epoch": 0.01342402138286832, "grad_norm": 0.37109375, "grad_norm_var": 0.01087487538655599, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.4116278886795044, "loss/fcd": 1.47265625, "loss/idx": 5.5, "loss/logits": 0.18976984173059464, "step": 899 }, { "epoch": 0.013438953553483303, "grad_norm": 0.37890625, "grad_norm_var": 0.010621579488118489, "learning_rate": 0.0001, "loss": 1.7932, "loss/crossentropy": 2.4072563648223877, "loss/fcd": 1.5703125, "loss/idx": 5.5, "loss/logits": 0.22293731570243835, "step": 900 }, { "epoch": 0.013453885724098284, "grad_norm": 0.396484375, "grad_norm_var": 0.010028521219889322, "learning_rate": 0.0001, "loss": 1.6985, "loss/crossentropy": 2.457371711730957, "loss/fcd": 1.49609375, "loss/idx": 5.5, "loss/logits": 0.202431321144104, "step": 901 }, { "epoch": 0.013468817894713265, "grad_norm": 0.384765625, "grad_norm_var": 0.010007969538370768, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.6801459789276123, "loss/fcd": 1.36328125, "loss/idx": 5.5, "loss/logits": 0.16556568443775177, "step": 902 }, { "epoch": 0.013483750065328247, "grad_norm": 0.345703125, "grad_norm_var": 0.009677871068318685, "learning_rate": 0.0001, "loss": 1.481, "loss/crossentropy": 2.8748459815979004, "loss/fcd": 1.3203125, "loss/idx": 5.5, "loss/logits": 0.1606980860233307, "step": 903 }, { "epoch": 0.013498682235943228, "grad_norm": 0.34765625, "grad_norm_var": 0.009910011291503906, "learning_rate": 0.0001, "loss": 1.5032, "loss/crossentropy": 2.6527421474456787, "loss/fcd": 1.33984375, "loss/idx": 5.5, "loss/logits": 0.1633721962571144, "step": 904 }, { "epoch": 0.01351361440655821, "grad_norm": 0.33984375, "grad_norm_var": 0.009838724136352539, "learning_rate": 0.0001, "loss": 1.5438, "loss/crossentropy": 2.3300899267196655, "loss/fcd": 1.37890625, "loss/idx": 5.5, "loss/logits": 0.16493894159793854, "step": 905 }, { "epoch": 0.01352854657717319, "grad_norm": 0.328125, "grad_norm_var": 0.0016100406646728516, "learning_rate": 0.0001, "loss": 1.4594, "loss/crossentropy": 2.485817790031433, "loss/fcd": 1.30078125, "loss/idx": 5.5, "loss/logits": 0.15857402980327606, "step": 906 }, { "epoch": 0.013543478747788173, "grad_norm": 0.412109375, "grad_norm_var": 0.0013697147369384766, "learning_rate": 0.0001, "loss": 1.6415, "loss/crossentropy": 2.596954822540283, "loss/fcd": 1.44921875, "loss/idx": 5.5, "loss/logits": 0.19229594618082047, "step": 907 }, { "epoch": 0.013558410918403154, "grad_norm": 0.380859375, "grad_norm_var": 0.0013642470041910807, "learning_rate": 0.0001, "loss": 1.5437, "loss/crossentropy": 2.5879993438720703, "loss/fcd": 1.36328125, "loss/idx": 5.5, "loss/logits": 0.18044909089803696, "step": 908 }, { "epoch": 0.013573343089018135, "grad_norm": 0.359375, "grad_norm_var": 0.0013773600260416667, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.525175929069519, "loss/fcd": 1.43359375, "loss/idx": 5.5, "loss/logits": 0.20643934607505798, "step": 909 }, { "epoch": 0.013588275259633116, "grad_norm": 0.34375, "grad_norm_var": 0.00143736203511556, "learning_rate": 0.0001, "loss": 1.5451, "loss/crossentropy": 2.763263463973999, "loss/fcd": 1.3828125, "loss/idx": 5.5, "loss/logits": 0.16232506185770035, "step": 910 }, { "epoch": 0.013603207430248098, "grad_norm": 0.39453125, "grad_norm_var": 0.0013284683227539062, "learning_rate": 0.0001, "loss": 1.5742, "loss/crossentropy": 2.6070960760116577, "loss/fcd": 1.40625, "loss/idx": 5.5, "loss/logits": 0.16791047900915146, "step": 911 }, { "epoch": 0.01361813960086308, "grad_norm": 0.380859375, "grad_norm_var": 0.0007669925689697266, "learning_rate": 0.0001, "loss": 1.6868, "loss/crossentropy": 2.4819244146347046, "loss/fcd": 1.49609375, "loss/idx": 5.5, "loss/logits": 0.1907288283109665, "step": 912 }, { "epoch": 0.01363307177147806, "grad_norm": 0.396484375, "grad_norm_var": 0.0007542769114176432, "learning_rate": 0.0001, "loss": 1.5673, "loss/crossentropy": 2.4561848640441895, "loss/fcd": 1.3984375, "loss/idx": 5.5, "loss/logits": 0.1689068078994751, "step": 913 }, { "epoch": 0.013648003942093043, "grad_norm": 0.50390625, "grad_norm_var": 0.0016995588938395181, "learning_rate": 0.0001, "loss": 1.7706, "loss/crossentropy": 2.5172276496887207, "loss/fcd": 1.52734375, "loss/idx": 5.5, "loss/logits": 0.24328875541687012, "step": 914 }, { "epoch": 0.013662936112708024, "grad_norm": 0.3046875, "grad_norm_var": 0.002045424779256185, "learning_rate": 0.0001, "loss": 1.5093, "loss/crossentropy": 2.4740447998046875, "loss/fcd": 1.3359375, "loss/idx": 5.5, "loss/logits": 0.1733308956027031, "step": 915 }, { "epoch": 0.013677868283323005, "grad_norm": 0.443359375, "grad_norm_var": 0.0023396809895833335, "learning_rate": 0.0001, "loss": 1.8087, "loss/crossentropy": 2.6645225286483765, "loss/fcd": 1.56640625, "loss/idx": 5.5, "loss/logits": 0.24224933236837387, "step": 916 }, { "epoch": 0.013692800453937986, "grad_norm": 0.453125, "grad_norm_var": 0.0026729424794514974, "learning_rate": 0.0001, "loss": 1.7391, "loss/crossentropy": 2.4171453714370728, "loss/fcd": 1.53515625, "loss/idx": 5.5, "loss/logits": 0.20390180498361588, "step": 917 }, { "epoch": 0.013707732624552969, "grad_norm": 0.37890625, "grad_norm_var": 0.0026732762654622395, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.8347936868667603, "loss/fcd": 1.375, "loss/idx": 5.5, "loss/logits": 0.17344320565462112, "step": 918 }, { "epoch": 0.01372266479516795, "grad_norm": 0.3984375, "grad_norm_var": 0.0025913079579671225, "learning_rate": 0.0001, "loss": 1.5296, "loss/crossentropy": 2.494508743286133, "loss/fcd": 1.34765625, "loss/idx": 5.5, "loss/logits": 0.1818997785449028, "step": 919 }, { "epoch": 0.01373759696578293, "grad_norm": 0.39453125, "grad_norm_var": 0.002492888768513997, "learning_rate": 0.0001, "loss": 1.6563, "loss/crossentropy": 2.2331273555755615, "loss/fcd": 1.46875, "loss/idx": 5.5, "loss/logits": 0.1875598356127739, "step": 920 }, { "epoch": 0.013752529136397913, "grad_norm": 0.333984375, "grad_norm_var": 0.0025328954060872396, "learning_rate": 0.0001, "loss": 1.5313, "loss/crossentropy": 2.591843605041504, "loss/fcd": 1.36328125, "loss/idx": 5.5, "loss/logits": 0.16802766174077988, "step": 921 }, { "epoch": 0.013767461307012894, "grad_norm": 0.33203125, "grad_norm_var": 0.002502695719401042, "learning_rate": 0.0001, "loss": 1.6131, "loss/crossentropy": 2.5554168224334717, "loss/fcd": 1.41796875, "loss/idx": 5.5, "loss/logits": 0.19509856402873993, "step": 922 }, { "epoch": 0.013782393477627875, "grad_norm": 0.3046875, "grad_norm_var": 0.0028812249501546225, "learning_rate": 0.0001, "loss": 1.4442, "loss/crossentropy": 2.4917492866516113, "loss/fcd": 1.2890625, "loss/idx": 5.5, "loss/logits": 0.1551196053624153, "step": 923 }, { "epoch": 0.013797325648242856, "grad_norm": 0.345703125, "grad_norm_var": 0.0029613335927327475, "learning_rate": 0.0001, "loss": 1.4584, "loss/crossentropy": 2.6141878366470337, "loss/fcd": 1.30078125, "loss/idx": 5.5, "loss/logits": 0.15758418291807175, "step": 924 }, { "epoch": 0.013812257818857839, "grad_norm": 0.478515625, "grad_norm_var": 0.00353240966796875, "learning_rate": 0.0001, "loss": 1.771, "loss/crossentropy": 2.5590325593948364, "loss/fcd": 1.5625, "loss/idx": 5.5, "loss/logits": 0.20846740901470184, "step": 925 }, { "epoch": 0.01382718998947282, "grad_norm": 0.341796875, "grad_norm_var": 0.0035438378651936847, "learning_rate": 0.0001, "loss": 1.5279, "loss/crossentropy": 2.688356399536133, "loss/fcd": 1.359375, "loss/idx": 5.5, "loss/logits": 0.16853488981723785, "step": 926 }, { "epoch": 0.0138421221600878, "grad_norm": 0.36328125, "grad_norm_var": 0.003571812311808268, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.627697229385376, "loss/fcd": 1.359375, "loss/idx": 5.5, "loss/logits": 0.1708643138408661, "step": 927 }, { "epoch": 0.013857054330702783, "grad_norm": 0.484375, "grad_norm_var": 0.004189300537109375, "learning_rate": 0.0001, "loss": 1.7863, "loss/crossentropy": 2.4670333862304688, "loss/fcd": 1.5859375, "loss/idx": 5.5, "loss/logits": 0.2003796547651291, "step": 928 }, { "epoch": 0.013871986501317764, "grad_norm": 1.3671875, "grad_norm_var": 0.06377600034077963, "learning_rate": 0.0001, "loss": 1.8487, "loss/crossentropy": 2.5073784589767456, "loss/fcd": 1.6484375, "loss/idx": 6.0, "loss/logits": 0.20029612630605698, "step": 929 }, { "epoch": 0.013886918671932745, "grad_norm": 2.390625, "grad_norm_var": 0.2993701775868734, "learning_rate": 0.0001, "loss": 1.9457, "loss/crossentropy": 2.699519634246826, "loss/fcd": 1.73046875, "loss/idx": 6.0, "loss/logits": 0.2152162715792656, "step": 930 }, { "epoch": 0.013901850842547726, "grad_norm": 1.671875, "grad_norm_var": 0.3678853193918864, "learning_rate": 0.0001, "loss": 1.9107, "loss/crossentropy": 2.436430335044861, "loss/fcd": 1.71484375, "loss/idx": 6.0, "loss/logits": 0.19587621092796326, "step": 931 }, { "epoch": 0.013916783013162709, "grad_norm": 1.1875, "grad_norm_var": 0.3814806620279948, "learning_rate": 0.0001, "loss": 1.8237, "loss/crossentropy": 2.57407009601593, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.1987301930785179, "step": 932 }, { "epoch": 0.01393171518377769, "grad_norm": 1.2109375, "grad_norm_var": 0.3922607421875, "learning_rate": 0.0001, "loss": 2.1566, "loss/crossentropy": 2.553762674331665, "loss/fcd": 1.88671875, "loss/idx": 6.0, "loss/logits": 0.26991352438926697, "step": 933 }, { "epoch": 0.013946647354392671, "grad_norm": 0.9375, "grad_norm_var": 0.384196408589681, "learning_rate": 0.0001, "loss": 2.0457, "loss/crossentropy": 2.5711495876312256, "loss/fcd": 1.81640625, "loss/idx": 6.0, "loss/logits": 0.22925003618001938, "step": 934 }, { "epoch": 0.013961579525007652, "grad_norm": 0.76171875, "grad_norm_var": 0.3737721761067708, "learning_rate": 0.0001, "loss": 1.7447, "loss/crossentropy": 2.5488909482955933, "loss/fcd": 1.55078125, "loss/idx": 6.0, "loss/logits": 0.19389048963785172, "step": 935 }, { "epoch": 0.013976511695622635, "grad_norm": 0.8671875, "grad_norm_var": 0.3617634455362956, "learning_rate": 0.0001, "loss": 1.9592, "loss/crossentropy": 2.635095238685608, "loss/fcd": 1.74609375, "loss/idx": 6.0, "loss/logits": 0.21313950419425964, "step": 936 }, { "epoch": 0.013991443866237616, "grad_norm": 0.7109375, "grad_norm_var": 0.3454036553700765, "learning_rate": 0.0001, "loss": 1.7995, "loss/crossentropy": 2.693148612976074, "loss/fcd": 1.6015625, "loss/idx": 6.0, "loss/logits": 0.19793272763490677, "step": 937 }, { "epoch": 0.014006376036852597, "grad_norm": 0.7421875, "grad_norm_var": 0.3270587762196859, "learning_rate": 0.0001, "loss": 1.8292, "loss/crossentropy": 2.7530752420425415, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.20420175790786743, "step": 938 }, { "epoch": 0.01402130820746758, "grad_norm": 0.7109375, "grad_norm_var": 0.30591975847880043, "learning_rate": 0.0001, "loss": 1.9451, "loss/crossentropy": 2.7189706563949585, "loss/fcd": 1.703125, "loss/idx": 6.0, "loss/logits": 0.2419990748167038, "step": 939 }, { "epoch": 0.01403624037808256, "grad_norm": 0.5859375, "grad_norm_var": 0.2914271036783854, "learning_rate": 0.0001, "loss": 1.9409, "loss/crossentropy": 2.3447235822677612, "loss/fcd": 1.71875, "loss/idx": 6.0, "loss/logits": 0.22218556702136993, "step": 940 }, { "epoch": 0.014051172548697541, "grad_norm": 0.65625, "grad_norm_var": 0.28280218442281085, "learning_rate": 0.0001, "loss": 2.0273, "loss/crossentropy": 2.694061040878296, "loss/fcd": 1.7890625, "loss/idx": 6.0, "loss/logits": 0.23818911612033844, "step": 941 }, { "epoch": 0.014066104719312522, "grad_norm": 0.6484375, "grad_norm_var": 0.2643483479817708, "learning_rate": 0.0001, "loss": 1.82, "loss/crossentropy": 2.6287845373153687, "loss/fcd": 1.60546875, "loss/idx": 6.0, "loss/logits": 0.21457893401384354, "step": 942 }, { "epoch": 0.014081036889927505, "grad_norm": 0.95703125, "grad_norm_var": 0.23945414225260417, "learning_rate": 0.0001, "loss": 2.2185, "loss/crossentropy": 2.373252034187317, "loss/fcd": 1.92578125, "loss/idx": 6.0, "loss/logits": 0.2926865443587303, "step": 943 }, { "epoch": 0.014095969060542486, "grad_norm": 0.66015625, "grad_norm_var": 0.22946058909098307, "learning_rate": 0.0001, "loss": 2.0388, "loss/crossentropy": 2.5666359663009644, "loss/fcd": 1.7890625, "loss/idx": 6.0, "loss/logits": 0.24973313510417938, "step": 944 }, { "epoch": 0.014110901231157467, "grad_norm": 0.53515625, "grad_norm_var": 0.23245340983072918, "learning_rate": 0.0001, "loss": 1.6877, "loss/crossentropy": 2.591952323913574, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.17602194100618362, "step": 945 }, { "epoch": 0.01412583340177245, "grad_norm": 0.6796875, "grad_norm_var": 0.08725763956705729, "learning_rate": 0.0001, "loss": 2.0353, "loss/crossentropy": 2.3668471574783325, "loss/fcd": 1.7890625, "loss/idx": 6.0, "loss/logits": 0.24623292684555054, "step": 946 }, { "epoch": 0.01414076557238743, "grad_norm": 0.4921875, "grad_norm_var": 0.044209798177083336, "learning_rate": 0.0001, "loss": 1.6808, "loss/crossentropy": 2.7240748405456543, "loss/fcd": 1.5078125, "loss/idx": 6.0, "loss/logits": 0.17297939211130142, "step": 947 }, { "epoch": 0.014155697743002411, "grad_norm": 0.56640625, "grad_norm_var": 0.03386834462483724, "learning_rate": 0.0001, "loss": 1.7939, "loss/crossentropy": 2.401008129119873, "loss/fcd": 1.60546875, "loss/idx": 6.0, "loss/logits": 0.18840720504522324, "step": 948 }, { "epoch": 0.014170629913617392, "grad_norm": 0.57421875, "grad_norm_var": 0.01860326131184896, "learning_rate": 0.0001, "loss": 1.9822, "loss/crossentropy": 2.2320332527160645, "loss/fcd": 1.7734375, "loss/idx": 6.0, "loss/logits": 0.20871728658676147, "step": 949 }, { "epoch": 0.014185562084232375, "grad_norm": 0.6171875, "grad_norm_var": 0.014568074544270834, "learning_rate": 0.0001, "loss": 1.9696, "loss/crossentropy": 2.5282981395721436, "loss/fcd": 1.73046875, "loss/idx": 6.0, "loss/logits": 0.23909948021173477, "step": 950 }, { "epoch": 0.014200494254847356, "grad_norm": 0.72265625, "grad_norm_var": 0.014200592041015625, "learning_rate": 0.0001, "loss": 2.0531, "loss/crossentropy": 2.437385082244873, "loss/fcd": 1.8203125, "loss/idx": 6.0, "loss/logits": 0.23282259702682495, "step": 951 }, { "epoch": 0.014215426425462337, "grad_norm": 0.61328125, "grad_norm_var": 0.011568133036295574, "learning_rate": 0.0001, "loss": 1.8507, "loss/crossentropy": 2.450563669204712, "loss/fcd": 1.6484375, "loss/idx": 6.0, "loss/logits": 0.20221839100122452, "step": 952 }, { "epoch": 0.014230358596077318, "grad_norm": 0.474609375, "grad_norm_var": 0.013281742731730143, "learning_rate": 0.0001, "loss": 1.6715, "loss/crossentropy": 2.5514947175979614, "loss/fcd": 1.4921875, "loss/idx": 6.0, "loss/logits": 0.17935138195753098, "step": 953 }, { "epoch": 0.0142452907666923, "grad_norm": 0.58984375, "grad_norm_var": 0.012651936213175455, "learning_rate": 0.0001, "loss": 1.9562, "loss/crossentropy": 2.6119589805603027, "loss/fcd": 1.72265625, "loss/idx": 6.0, "loss/logits": 0.23356223851442337, "step": 954 }, { "epoch": 0.014260222937307282, "grad_norm": 0.609375, "grad_norm_var": 0.01220396359761556, "learning_rate": 0.0001, "loss": 1.724, "loss/crossentropy": 2.741728901863098, "loss/fcd": 1.5390625, "loss/idx": 6.0, "loss/logits": 0.1849788874387741, "step": 955 }, { "epoch": 0.014275155107922263, "grad_norm": 0.53515625, "grad_norm_var": 0.012622181574503582, "learning_rate": 0.0001, "loss": 1.7135, "loss/crossentropy": 2.7166521549224854, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.17837309837341309, "step": 956 }, { "epoch": 0.014290087278537245, "grad_norm": 0.55078125, "grad_norm_var": 0.012817875544230143, "learning_rate": 0.0001, "loss": 1.7684, "loss/crossentropy": 2.659182906150818, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.1941850259900093, "step": 957 }, { "epoch": 0.014305019449152226, "grad_norm": 0.65625, "grad_norm_var": 0.012857421239217123, "learning_rate": 0.0001, "loss": 1.9042, "loss/crossentropy": 2.512916922569275, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.20891183614730835, "step": 958 }, { "epoch": 0.014319951619767207, "grad_norm": 0.490234375, "grad_norm_var": 0.0051648457845052086, "learning_rate": 0.0001, "loss": 1.6953, "loss/crossentropy": 2.5976825952529907, "loss/fcd": 1.515625, "loss/idx": 6.0, "loss/logits": 0.1796964481472969, "step": 959 }, { "epoch": 0.014334883790382188, "grad_norm": 0.7109375, "grad_norm_var": 0.0058318456013997395, "learning_rate": 0.0001, "loss": 2.0133, "loss/crossentropy": 2.5360227823257446, "loss/fcd": 1.7734375, "loss/idx": 6.0, "loss/logits": 0.23983266949653625, "step": 960 }, { "epoch": 0.014349815960997171, "grad_norm": 0.478515625, "grad_norm_var": 0.00643614133199056, "learning_rate": 0.0001, "loss": 1.606, "loss/crossentropy": 2.5436811447143555, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.16457198560237885, "step": 961 }, { "epoch": 0.014364748131612152, "grad_norm": 0.466796875, "grad_norm_var": 0.006583404541015625, "learning_rate": 0.0001, "loss": 1.6402, "loss/crossentropy": 2.638171434402466, "loss/fcd": 1.46484375, "loss/idx": 6.0, "loss/logits": 0.17533918470144272, "step": 962 }, { "epoch": 0.014379680302227133, "grad_norm": 0.447265625, "grad_norm_var": 0.007186237970987956, "learning_rate": 0.0001, "loss": 1.6373, "loss/crossentropy": 2.565574049949646, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.17632722109556198, "step": 963 }, { "epoch": 0.014394612472842115, "grad_norm": 0.46484375, "grad_norm_var": 0.007865635553995769, "learning_rate": 0.0001, "loss": 1.682, "loss/crossentropy": 2.6288174390792847, "loss/fcd": 1.5078125, "loss/idx": 6.0, "loss/logits": 0.17416883260011673, "step": 964 }, { "epoch": 0.014409544643457096, "grad_norm": 0.55859375, "grad_norm_var": 0.00785673459370931, "learning_rate": 0.0001, "loss": 1.9074, "loss/crossentropy": 2.8099989891052246, "loss/fcd": 1.6875, "loss/idx": 6.0, "loss/logits": 0.21986465901136398, "step": 965 }, { "epoch": 0.014424476814072077, "grad_norm": 0.5234375, "grad_norm_var": 0.00771177609761556, "learning_rate": 0.0001, "loss": 1.6413, "loss/crossentropy": 2.626309394836426, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.1725175604224205, "step": 966 }, { "epoch": 0.014439408984687058, "grad_norm": 0.5703125, "grad_norm_var": 0.0057727654774983725, "learning_rate": 0.0001, "loss": 1.8253, "loss/crossentropy": 2.5631070137023926, "loss/fcd": 1.62109375, "loss/idx": 6.0, "loss/logits": 0.20422351360321045, "step": 967 }, { "epoch": 0.014454341155302041, "grad_norm": 0.466796875, "grad_norm_var": 0.005804951985677083, "learning_rate": 0.0001, "loss": 1.6936, "loss/crossentropy": 2.622004508972168, "loss/fcd": 1.50390625, "loss/idx": 6.0, "loss/logits": 0.18973329663276672, "step": 968 }, { "epoch": 0.014469273325917022, "grad_norm": 0.482421875, "grad_norm_var": 0.005743662516276042, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.339906692504883, "loss/fcd": 1.4375, "loss/idx": 6.0, "loss/logits": 0.15414663404226303, "step": 969 }, { "epoch": 0.014484205496532003, "grad_norm": 0.484375, "grad_norm_var": 0.005704180399576823, "learning_rate": 0.0001, "loss": 1.7258, "loss/crossentropy": 2.5720293521881104, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.19454781711101532, "step": 970 }, { "epoch": 0.014499137667146984, "grad_norm": 0.45703125, "grad_norm_var": 0.005562845865885417, "learning_rate": 0.0001, "loss": 1.6937, "loss/crossentropy": 2.547956943511963, "loss/fcd": 1.51953125, "loss/idx": 6.0, "loss/logits": 0.17421600222587585, "step": 971 }, { "epoch": 0.014514069837761967, "grad_norm": 0.486328125, "grad_norm_var": 0.00562284787495931, "learning_rate": 0.0001, "loss": 1.7068, "loss/crossentropy": 2.445081949234009, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.18333792686462402, "step": 972 }, { "epoch": 0.014529002008376948, "grad_norm": 0.59765625, "grad_norm_var": 0.005962355931599935, "learning_rate": 0.0001, "loss": 2.2223, "loss/crossentropy": 2.6638505458831787, "loss/fcd": 1.91015625, "loss/idx": 6.0, "loss/logits": 0.3120998740196228, "step": 973 }, { "epoch": 0.014543934178991929, "grad_norm": 0.486328125, "grad_norm_var": 0.0047108968098958336, "learning_rate": 0.0001, "loss": 1.7339, "loss/crossentropy": 2.534387230873108, "loss/fcd": 1.55859375, "loss/idx": 6.0, "loss/logits": 0.17531096935272217, "step": 974 }, { "epoch": 0.014558866349606911, "grad_norm": 0.5546875, "grad_norm_var": 0.00479429562886556, "learning_rate": 0.0001, "loss": 1.7092, "loss/crossentropy": 2.6609139442443848, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.18571606278419495, "step": 975 }, { "epoch": 0.014573798520221892, "grad_norm": 0.48828125, "grad_norm_var": 0.00206907590230306, "learning_rate": 0.0001, "loss": 1.7175, "loss/crossentropy": 2.5186537504196167, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.1823228821158409, "step": 976 }, { "epoch": 0.014588730690836873, "grad_norm": 0.53125, "grad_norm_var": 0.0020858128865559895, "learning_rate": 0.0001, "loss": 1.913, "loss/crossentropy": 2.3601412773132324, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.21770965307950974, "step": 977 }, { "epoch": 0.014603662861451854, "grad_norm": 0.46484375, "grad_norm_var": 0.002095778783162435, "learning_rate": 0.0001, "loss": 1.8019, "loss/crossentropy": 2.6870713233947754, "loss/fcd": 1.58203125, "loss/idx": 6.0, "loss/logits": 0.2198324054479599, "step": 978 }, { "epoch": 0.014618595032066837, "grad_norm": 0.5, "grad_norm_var": 0.0018704732259114583, "learning_rate": 0.0001, "loss": 1.8232, "loss/crossentropy": 2.392784595489502, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.1981583684682846, "step": 979 }, { "epoch": 0.014633527202681818, "grad_norm": 0.4921875, "grad_norm_var": 0.0017623265584309896, "learning_rate": 0.0001, "loss": 1.8108, "loss/crossentropy": 2.4014596939086914, "loss/fcd": 1.609375, "loss/idx": 6.0, "loss/logits": 0.20145010948181152, "step": 980 }, { "epoch": 0.014648459373296799, "grad_norm": 0.6328125, "grad_norm_var": 0.0025970458984375, "learning_rate": 0.0001, "loss": 1.8404, "loss/crossentropy": 2.622117042541504, "loss/fcd": 1.640625, "loss/idx": 6.0, "loss/logits": 0.19979581236839294, "step": 981 }, { "epoch": 0.014663391543911782, "grad_norm": 0.515625, "grad_norm_var": 0.0025906880696614583, "learning_rate": 0.0001, "loss": 1.9272, "loss/crossentropy": 2.487557888031006, "loss/fcd": 1.68359375, "loss/idx": 6.0, "loss/logits": 0.2436397820711136, "step": 982 }, { "epoch": 0.014678323714526762, "grad_norm": 0.484375, "grad_norm_var": 0.0023976643880208332, "learning_rate": 0.0001, "loss": 1.6701, "loss/crossentropy": 2.512000560760498, "loss/fcd": 1.48828125, "loss/idx": 6.0, "loss/logits": 0.18184158951044083, "step": 983 }, { "epoch": 0.014693255885141743, "grad_norm": 0.490234375, "grad_norm_var": 0.002303822835286458, "learning_rate": 0.0001, "loss": 1.5974, "loss/crossentropy": 2.630392551422119, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.16384021937847137, "step": 984 }, { "epoch": 0.014708188055756724, "grad_norm": 0.4609375, "grad_norm_var": 0.0024096012115478516, "learning_rate": 0.0001, "loss": 1.6285, "loss/crossentropy": 2.629890561103821, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.16755110025405884, "step": 985 }, { "epoch": 0.014723120226371707, "grad_norm": 0.53125, "grad_norm_var": 0.0023996829986572266, "learning_rate": 0.0001, "loss": 1.9361, "loss/crossentropy": 2.5041611194610596, "loss/fcd": 1.703125, "loss/idx": 6.0, "loss/logits": 0.23302249610424042, "step": 986 }, { "epoch": 0.014738052396986688, "grad_norm": 0.52734375, "grad_norm_var": 0.0022039890289306642, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.6311824321746826, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.14749648422002792, "step": 987 }, { "epoch": 0.014752984567601669, "grad_norm": 0.498046875, "grad_norm_var": 0.002167367935180664, "learning_rate": 0.0001, "loss": 1.6699, "loss/crossentropy": 2.3241487741470337, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.1698528677225113, "step": 988 }, { "epoch": 0.014767916738216652, "grad_norm": 0.5234375, "grad_norm_var": 0.001703500747680664, "learning_rate": 0.0001, "loss": 1.7241, "loss/crossentropy": 2.490164041519165, "loss/fcd": 1.54296875, "loss/idx": 6.0, "loss/logits": 0.18110015988349915, "step": 989 }, { "epoch": 0.014782848908831633, "grad_norm": 0.44140625, "grad_norm_var": 0.0019795099894205728, "learning_rate": 0.0001, "loss": 1.6575, "loss/crossentropy": 2.446812629699707, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.18092559278011322, "step": 990 }, { "epoch": 0.014797781079446614, "grad_norm": 0.52734375, "grad_norm_var": 0.0018580118815104167, "learning_rate": 0.0001, "loss": 1.6609, "loss/crossentropy": 2.4171838760375977, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.1608925387263298, "step": 991 }, { "epoch": 0.014812713250061595, "grad_norm": 0.4921875, "grad_norm_var": 0.0018493016560872397, "learning_rate": 0.0001, "loss": 1.6547, "loss/crossentropy": 2.6258944272994995, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.1859017238020897, "step": 992 }, { "epoch": 0.014827645420676577, "grad_norm": 0.5078125, "grad_norm_var": 0.0018081029256184896, "learning_rate": 0.0001, "loss": 1.6637, "loss/crossentropy": 2.641685724258423, "loss/fcd": 1.484375, "loss/idx": 6.0, "loss/logits": 0.17929885536432266, "step": 993 }, { "epoch": 0.014842577591291558, "grad_norm": 0.640625, "grad_norm_var": 0.0027837117513020834, "learning_rate": 0.0001, "loss": 1.7782, "loss/crossentropy": 2.4380099773406982, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.1844642013311386, "step": 994 }, { "epoch": 0.01485750976190654, "grad_norm": 0.53515625, "grad_norm_var": 0.0027831395467122397, "learning_rate": 0.0001, "loss": 1.8351, "loss/crossentropy": 2.3071682453155518, "loss/fcd": 1.62109375, "loss/idx": 6.0, "loss/logits": 0.2140112966299057, "step": 995 }, { "epoch": 0.01487244193252152, "grad_norm": 0.48046875, "grad_norm_var": 0.0028333028157552084, "learning_rate": 0.0001, "loss": 1.6957, "loss/crossentropy": 2.662778615951538, "loss/fcd": 1.50390625, "loss/idx": 6.0, "loss/logits": 0.19180986285209656, "step": 996 }, { "epoch": 0.014887374103136503, "grad_norm": 0.4375, "grad_norm_var": 0.00222930908203125, "learning_rate": 0.0001, "loss": 1.7745, "loss/crossentropy": 2.654296398162842, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.2119828313589096, "step": 997 }, { "epoch": 0.014902306273751484, "grad_norm": 0.5, "grad_norm_var": 0.002224222819010417, "learning_rate": 0.0001, "loss": 1.6723, "loss/crossentropy": 2.4404503107070923, "loss/fcd": 1.49609375, "loss/idx": 6.0, "loss/logits": 0.17621850222349167, "step": 998 }, { "epoch": 0.014917238444366465, "grad_norm": 0.53125, "grad_norm_var": 0.0022333780924479168, "learning_rate": 0.0001, "loss": 1.5216, "loss/crossentropy": 2.5146583318710327, "loss/fcd": 1.37890625, "loss/idx": 6.0, "loss/logits": 0.14269014447927475, "step": 999 }, { "epoch": 0.014932170614981448, "grad_norm": 0.49609375, "grad_norm_var": 0.0022217909495035808, "learning_rate": 0.0001, "loss": 1.7702, "loss/crossentropy": 2.4616737365722656, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.20378455519676208, "step": 1000 }, { "epoch": 0.014947102785596428, "grad_norm": 0.58984375, "grad_norm_var": 0.0024483839670817057, "learning_rate": 0.0001, "loss": 1.9032, "loss/crossentropy": 2.3614426851272583, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.20791510492563248, "step": 1001 }, { "epoch": 0.01496203495621141, "grad_norm": 0.5234375, "grad_norm_var": 0.002436558405558268, "learning_rate": 0.0001, "loss": 1.5926, "loss/crossentropy": 2.6121147871017456, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.16682759672403336, "step": 1002 }, { "epoch": 0.01497696712682639, "grad_norm": 0.50390625, "grad_norm_var": 0.0024346510569254556, "learning_rate": 0.0001, "loss": 1.5917, "loss/crossentropy": 2.5289264917373657, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.16205105185508728, "step": 1003 }, { "epoch": 0.014991899297441373, "grad_norm": 0.4921875, "grad_norm_var": 0.002449480692545573, "learning_rate": 0.0001, "loss": 1.5878, "loss/crossentropy": 2.5034509897232056, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.1541755199432373, "step": 1004 }, { "epoch": 0.015006831468056354, "grad_norm": 0.474609375, "grad_norm_var": 0.0025365034739176433, "learning_rate": 0.0001, "loss": 1.6604, "loss/crossentropy": 2.5246294736862183, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.18387333303689957, "step": 1005 }, { "epoch": 0.015021763638671335, "grad_norm": 0.58984375, "grad_norm_var": 0.002538919448852539, "learning_rate": 0.0001, "loss": 1.9237, "loss/crossentropy": 2.468030333518982, "loss/fcd": 1.70703125, "loss/idx": 6.0, "loss/logits": 0.21670957654714584, "step": 1006 }, { "epoch": 0.015036695809286318, "grad_norm": 0.51171875, "grad_norm_var": 0.002539173762003581, "learning_rate": 0.0001, "loss": 1.8183, "loss/crossentropy": 2.726784110069275, "loss/fcd": 1.6015625, "loss/idx": 6.0, "loss/logits": 0.21673081070184708, "step": 1007 }, { "epoch": 0.015051627979901299, "grad_norm": 0.4921875, "grad_norm_var": 0.002539173762003581, "learning_rate": 0.0001, "loss": 1.7091, "loss/crossentropy": 2.7661021947860718, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.18564757704734802, "step": 1008 }, { "epoch": 0.01506656015051628, "grad_norm": 0.5859375, "grad_norm_var": 0.002802387873331706, "learning_rate": 0.0001, "loss": 1.9547, "loss/crossentropy": 2.5440393686294556, "loss/fcd": 1.7265625, "loss/idx": 6.0, "loss/logits": 0.22810395061969757, "step": 1009 }, { "epoch": 0.01508149232113126, "grad_norm": 0.5234375, "grad_norm_var": 0.0018391768137613932, "learning_rate": 0.0001, "loss": 1.8354, "loss/crossentropy": 2.780647039413452, "loss/fcd": 1.609375, "loss/idx": 6.0, "loss/logits": 0.2260511964559555, "step": 1010 }, { "epoch": 0.015096424491746243, "grad_norm": 0.53515625, "grad_norm_var": 0.0018391768137613932, "learning_rate": 0.0001, "loss": 2.0157, "loss/crossentropy": 2.3090795278549194, "loss/fcd": 1.77734375, "loss/idx": 6.0, "loss/logits": 0.23833715170621872, "step": 1011 }, { "epoch": 0.015111356662361224, "grad_norm": 0.52734375, "grad_norm_var": 0.0017499128977457683, "learning_rate": 0.0001, "loss": 2.0326, "loss/crossentropy": 2.5182803869247437, "loss/fcd": 1.77734375, "loss/idx": 6.0, "loss/logits": 0.2552146390080452, "step": 1012 }, { "epoch": 0.015126288832976205, "grad_norm": 0.443359375, "grad_norm_var": 0.0016878763834635416, "learning_rate": 0.0001, "loss": 1.6628, "loss/crossentropy": 2.5835026502609253, "loss/fcd": 1.47265625, "loss/idx": 6.0, "loss/logits": 0.190179705619812, "step": 1013 }, { "epoch": 0.015141221003591186, "grad_norm": 0.439453125, "grad_norm_var": 0.0020786126454671225, "learning_rate": 0.0001, "loss": 1.6838, "loss/crossentropy": 2.706833243370056, "loss/fcd": 1.48828125, "loss/idx": 6.0, "loss/logits": 0.19552963227033615, "step": 1014 }, { "epoch": 0.015156153174206169, "grad_norm": 1.046875, "grad_norm_var": 0.019727691014607748, "learning_rate": 0.0001, "loss": 2.2195, "loss/crossentropy": 2.607192873954773, "loss/fcd": 1.8984375, "loss/idx": 6.0, "loss/logits": 0.3210318982601166, "step": 1015 }, { "epoch": 0.01517108534482115, "grad_norm": 0.55078125, "grad_norm_var": 0.01953275998433431, "learning_rate": 0.0001, "loss": 1.7991, "loss/crossentropy": 2.596633195877075, "loss/fcd": 1.5703125, "loss/idx": 6.0, "loss/logits": 0.22876836359500885, "step": 1016 }, { "epoch": 0.01518601751543613, "grad_norm": 0.45703125, "grad_norm_var": 0.01996293067932129, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.566289782524109, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.1755644902586937, "step": 1017 }, { "epoch": 0.015200949686051114, "grad_norm": 0.58203125, "grad_norm_var": 0.020020151138305665, "learning_rate": 0.0001, "loss": 1.6675, "loss/crossentropy": 2.5668020248413086, "loss/fcd": 1.4921875, "loss/idx": 6.0, "loss/logits": 0.17528380453586578, "step": 1018 }, { "epoch": 0.015215881856666095, "grad_norm": 0.6015625, "grad_norm_var": 0.020051940282185873, "learning_rate": 0.0001, "loss": 1.7784, "loss/crossentropy": 2.85185706615448, "loss/fcd": 1.5859375, "loss/idx": 6.0, "loss/logits": 0.19249311834573746, "step": 1019 }, { "epoch": 0.015230814027281075, "grad_norm": 0.515625, "grad_norm_var": 0.019895156224568684, "learning_rate": 0.0001, "loss": 1.7763, "loss/crossentropy": 2.5789307355880737, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.20993077754974365, "step": 1020 }, { "epoch": 0.015245746197896056, "grad_norm": 0.42578125, "grad_norm_var": 0.020566304524739582, "learning_rate": 0.0001, "loss": 1.5714, "loss/crossentropy": 2.543707847595215, "loss/fcd": 1.41796875, "loss/idx": 6.0, "loss/logits": 0.15347032248973846, "step": 1021 }, { "epoch": 0.01526067836851104, "grad_norm": 0.4765625, "grad_norm_var": 0.02079308827718099, "learning_rate": 0.0001, "loss": 1.6796, "loss/crossentropy": 2.784927010536194, "loss/fcd": 1.4921875, "loss/idx": 6.0, "loss/logits": 0.1873757317662239, "step": 1022 }, { "epoch": 0.01527561053912602, "grad_norm": 0.5625, "grad_norm_var": 0.020731099446614585, "learning_rate": 0.0001, "loss": 1.7326, "loss/crossentropy": 2.5929077863693237, "loss/fcd": 1.55078125, "loss/idx": 6.0, "loss/logits": 0.18176910281181335, "step": 1023 }, { "epoch": 0.015290542709741001, "grad_norm": 0.57421875, "grad_norm_var": 0.02054284413655599, "learning_rate": 0.0001, "loss": 1.8381, "loss/crossentropy": 2.612320065498352, "loss/fcd": 1.62890625, "loss/idx": 6.0, "loss/logits": 0.20919139683246613, "step": 1024 }, { "epoch": 0.015305474880355984, "grad_norm": 0.54296875, "grad_norm_var": 0.02046941121419271, "learning_rate": 0.0001, "loss": 1.7577, "loss/crossentropy": 2.4743690490722656, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.19128717482089996, "step": 1025 }, { "epoch": 0.015320407050970965, "grad_norm": 0.5234375, "grad_norm_var": 0.02046941121419271, "learning_rate": 0.0001, "loss": 1.7694, "loss/crossentropy": 2.4436780214309692, "loss/fcd": 1.5546875, "loss/idx": 6.0, "loss/logits": 0.21474337577819824, "step": 1026 }, { "epoch": 0.015335339221585946, "grad_norm": 0.55078125, "grad_norm_var": 0.020453135172526043, "learning_rate": 0.0001, "loss": 1.9482, "loss/crossentropy": 2.4892067909240723, "loss/fcd": 1.70703125, "loss/idx": 6.0, "loss/logits": 0.24121662974357605, "step": 1027 }, { "epoch": 0.015350271392200927, "grad_norm": 0.53125, "grad_norm_var": 0.020441627502441405, "learning_rate": 0.0001, "loss": 1.8773, "loss/crossentropy": 2.4016976356506348, "loss/fcd": 1.640625, "loss/idx": 6.0, "loss/logits": 0.23671425879001617, "step": 1028 }, { "epoch": 0.01536520356281591, "grad_norm": 0.50390625, "grad_norm_var": 0.019797627131144205, "learning_rate": 0.0001, "loss": 1.7514, "loss/crossentropy": 2.839547872543335, "loss/fcd": 1.546875, "loss/idx": 6.0, "loss/logits": 0.2044782042503357, "step": 1029 }, { "epoch": 0.01538013573343089, "grad_norm": 0.515625, "grad_norm_var": 0.01898371378580729, "learning_rate": 0.0001, "loss": 1.8493, "loss/crossentropy": 2.322245240211487, "loss/fcd": 1.63671875, "loss/idx": 6.0, "loss/logits": 0.21256640553474426, "step": 1030 }, { "epoch": 0.015395067904045871, "grad_norm": 0.498046875, "grad_norm_var": 0.002185678482055664, "learning_rate": 0.0001, "loss": 1.6567, "loss/crossentropy": 2.486846685409546, "loss/fcd": 1.48046875, "loss/idx": 6.0, "loss/logits": 0.17625004798173904, "step": 1031 }, { "epoch": 0.015410000074660852, "grad_norm": 0.62109375, "grad_norm_var": 0.0027292728424072265, "learning_rate": 0.0001, "loss": 2.055, "loss/crossentropy": 2.6589245796203613, "loss/fcd": 1.80859375, "loss/idx": 6.0, "loss/logits": 0.2464069500565529, "step": 1032 }, { "epoch": 0.015424932245275835, "grad_norm": 0.484375, "grad_norm_var": 0.0025094191233317057, "learning_rate": 0.0001, "loss": 1.6775, "loss/crossentropy": 2.2486242055892944, "loss/fcd": 1.5078125, "loss/idx": 6.0, "loss/logits": 0.16964885592460632, "step": 1033 }, { "epoch": 0.015439864415890816, "grad_norm": 0.54296875, "grad_norm_var": 0.002343479792277018, "learning_rate": 0.0001, "loss": 1.7148, "loss/crossentropy": 2.586572051048279, "loss/fcd": 1.51953125, "loss/idx": 6.0, "loss/logits": 0.19529356062412262, "step": 1034 }, { "epoch": 0.015454796586505797, "grad_norm": 0.58984375, "grad_norm_var": 0.002239338556925456, "learning_rate": 0.0001, "loss": 1.8468, "loss/crossentropy": 2.714082717895508, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.22178450226783752, "step": 1035 }, { "epoch": 0.01546972875712078, "grad_norm": 0.47265625, "grad_norm_var": 0.0024295647939046225, "learning_rate": 0.0001, "loss": 1.7294, "loss/crossentropy": 2.320715546607971, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.19423359632492065, "step": 1036 }, { "epoch": 0.01548466092773576, "grad_norm": 0.53515625, "grad_norm_var": 0.001715707778930664, "learning_rate": 0.0001, "loss": 1.7886, "loss/crossentropy": 2.7388995885849, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.21433691680431366, "step": 1037 }, { "epoch": 0.015499593098350741, "grad_norm": 0.49609375, "grad_norm_var": 0.00159300168355306, "learning_rate": 0.0001, "loss": 1.7216, "loss/crossentropy": 2.627174973487854, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.19816239923238754, "step": 1038 }, { "epoch": 0.015514525268965722, "grad_norm": 0.578125, "grad_norm_var": 0.0016675154368082682, "learning_rate": 0.0001, "loss": 1.708, "loss/crossentropy": 2.2871665954589844, "loss/fcd": 1.53515625, "loss/idx": 6.0, "loss/logits": 0.1728159263730049, "step": 1039 }, { "epoch": 0.015529457439580705, "grad_norm": 0.5546875, "grad_norm_var": 0.0015893141428629557, "learning_rate": 0.0001, "loss": 1.6132, "loss/crossentropy": 2.602549910545349, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.1717797815799713, "step": 1040 }, { "epoch": 0.015544389610195686, "grad_norm": 0.5546875, "grad_norm_var": 0.0016122023264567057, "learning_rate": 0.0001, "loss": 1.7748, "loss/crossentropy": 2.658362030982971, "loss/fcd": 1.5546875, "loss/idx": 6.0, "loss/logits": 0.22009392827749252, "step": 1041 }, { "epoch": 0.015559321780810667, "grad_norm": 0.45703125, "grad_norm_var": 0.0019861698150634766, "learning_rate": 0.0001, "loss": 1.6258, "loss/crossentropy": 2.6368483304977417, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.17269417643547058, "step": 1042 }, { "epoch": 0.01557425395142565, "grad_norm": 0.53125, "grad_norm_var": 0.0019569238026936847, "learning_rate": 0.0001, "loss": 1.84, "loss/crossentropy": 2.529175043106079, "loss/fcd": 1.62890625, "loss/idx": 6.0, "loss/logits": 0.21105806529521942, "step": 1043 }, { "epoch": 0.01558918612204063, "grad_norm": 0.5703125, "grad_norm_var": 0.0020630995432535807, "learning_rate": 0.0001, "loss": 1.8264, "loss/crossentropy": 2.712172746658325, "loss/fcd": 1.625, "loss/idx": 6.0, "loss/logits": 0.2013680636882782, "step": 1044 }, { "epoch": 0.015604118292655612, "grad_norm": 0.71484375, "grad_norm_var": 0.004064671198527018, "learning_rate": 0.0001, "loss": 1.8359, "loss/crossentropy": 2.6467713117599487, "loss/fcd": 1.640625, "loss/idx": 6.0, "loss/logits": 0.19526171684265137, "step": 1045 }, { "epoch": 0.015619050463270593, "grad_norm": 0.53125, "grad_norm_var": 0.00401914914449056, "learning_rate": 0.0001, "loss": 1.7747, "loss/crossentropy": 2.707419753074646, "loss/fcd": 1.58984375, "loss/idx": 6.0, "loss/logits": 0.1848393678665161, "step": 1046 }, { "epoch": 0.015633982633885574, "grad_norm": 0.470703125, "grad_norm_var": 0.004239892959594727, "learning_rate": 0.0001, "loss": 1.6991, "loss/crossentropy": 2.640596628189087, "loss/fcd": 1.515625, "loss/idx": 6.0, "loss/logits": 0.18342873454093933, "step": 1047 }, { "epoch": 0.015648914804500556, "grad_norm": 0.51953125, "grad_norm_var": 0.003841511408487956, "learning_rate": 0.0001, "loss": 1.4506, "loss/crossentropy": 2.8776493072509766, "loss/fcd": 1.3125, "loss/idx": 6.0, "loss/logits": 0.13808635622262955, "step": 1048 }, { "epoch": 0.01566384697511554, "grad_norm": 0.462890625, "grad_norm_var": 0.004023170471191407, "learning_rate": 0.0001, "loss": 1.4478, "loss/crossentropy": 2.757124900817871, "loss/fcd": 1.30078125, "loss/idx": 6.0, "loss/logits": 0.14702700823545456, "step": 1049 }, { "epoch": 0.015678779145730518, "grad_norm": 0.5703125, "grad_norm_var": 0.00409393310546875, "learning_rate": 0.0001, "loss": 1.8706, "loss/crossentropy": 2.544234871864319, "loss/fcd": 1.65234375, "loss/idx": 6.0, "loss/logits": 0.21821290254592896, "step": 1050 }, { "epoch": 0.0156937113163455, "grad_norm": 0.5625, "grad_norm_var": 0.00395196278889974, "learning_rate": 0.0001, "loss": 1.9078, "loss/crossentropy": 2.8513458967208862, "loss/fcd": 1.68359375, "loss/idx": 6.0, "loss/logits": 0.2242419719696045, "step": 1051 }, { "epoch": 0.015708643486960484, "grad_norm": 0.4921875, "grad_norm_var": 0.0038098653157552084, "learning_rate": 0.0001, "loss": 1.7442, "loss/crossentropy": 2.4249703884124756, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.18165121227502823, "step": 1052 }, { "epoch": 0.015723575657575463, "grad_norm": 0.546875, "grad_norm_var": 0.0038146336873372396, "learning_rate": 0.0001, "loss": 1.6334, "loss/crossentropy": 2.3974103927612305, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.1802271157503128, "step": 1053 }, { "epoch": 0.015738507828190446, "grad_norm": 0.578125, "grad_norm_var": 0.0037732442220052083, "learning_rate": 0.0001, "loss": 1.7285, "loss/crossentropy": 2.800647497177124, "loss/fcd": 1.5390625, "loss/idx": 6.0, "loss/logits": 0.18946100026369095, "step": 1054 }, { "epoch": 0.015753439998805425, "grad_norm": 0.55078125, "grad_norm_var": 0.0036935806274414062, "learning_rate": 0.0001, "loss": 1.7413, "loss/crossentropy": 2.7701534032821655, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.21001631021499634, "step": 1055 }, { "epoch": 0.015768372169420408, "grad_norm": 1.1015625, "grad_norm_var": 0.02332909901936849, "learning_rate": 0.0001, "loss": 1.9825, "loss/crossentropy": 2.4670101404190063, "loss/fcd": 1.73046875, "loss/idx": 6.0, "loss/logits": 0.25201089680194855, "step": 1056 }, { "epoch": 0.01578330434003539, "grad_norm": 0.46875, "grad_norm_var": 0.02403405507405599, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.56765079498291, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.20711445063352585, "step": 1057 }, { "epoch": 0.01579823651065037, "grad_norm": 0.53515625, "grad_norm_var": 0.02323296864827474, "learning_rate": 0.0001, "loss": 1.6809, "loss/crossentropy": 2.64453125, "loss/fcd": 1.49609375, "loss/idx": 6.0, "loss/logits": 0.1848081573843956, "step": 1058 }, { "epoch": 0.015813168681265352, "grad_norm": 0.515625, "grad_norm_var": 0.023340288798014322, "learning_rate": 0.0001, "loss": 1.7947, "loss/crossentropy": 2.6654093265533447, "loss/fcd": 1.58203125, "loss/idx": 6.0, "loss/logits": 0.212711364030838, "step": 1059 }, { "epoch": 0.015828100851880335, "grad_norm": 0.5234375, "grad_norm_var": 0.023503557840983073, "learning_rate": 0.0001, "loss": 1.64, "loss/crossentropy": 2.512184262275696, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.16342011094093323, "step": 1060 }, { "epoch": 0.015843033022495314, "grad_norm": 0.51953125, "grad_norm_var": 0.02215569814046224, "learning_rate": 0.0001, "loss": 1.7657, "loss/crossentropy": 2.6045628786087036, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.20317083597183228, "step": 1061 }, { "epoch": 0.015857965193110297, "grad_norm": 0.56640625, "grad_norm_var": 0.022101338704427084, "learning_rate": 0.0001, "loss": 1.7964, "loss/crossentropy": 2.3390029668807983, "loss/fcd": 1.61328125, "loss/idx": 6.0, "loss/logits": 0.18314598500728607, "step": 1062 }, { "epoch": 0.01587289736372528, "grad_norm": 0.515625, "grad_norm_var": 0.021683486302693684, "learning_rate": 0.0001, "loss": 1.6747, "loss/crossentropy": 2.6241334676742554, "loss/fcd": 1.48046875, "loss/idx": 6.0, "loss/logits": 0.19422227889299393, "step": 1063 }, { "epoch": 0.01588782953434026, "grad_norm": 0.484375, "grad_norm_var": 0.021970733006795248, "learning_rate": 0.0001, "loss": 1.639, "loss/crossentropy": 2.6619738340377808, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.16240306943655014, "step": 1064 }, { "epoch": 0.01590276170495524, "grad_norm": 0.50390625, "grad_norm_var": 0.02153313954671224, "learning_rate": 0.0001, "loss": 1.7279, "loss/crossentropy": 2.4656083583831787, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.1966322660446167, "step": 1065 }, { "epoch": 0.01591769387557022, "grad_norm": 0.73828125, "grad_norm_var": 0.0234222412109375, "learning_rate": 0.0001, "loss": 1.7699, "loss/crossentropy": 2.4857001304626465, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.19179469347000122, "step": 1066 }, { "epoch": 0.015932626046185203, "grad_norm": 0.498046875, "grad_norm_var": 0.023790979385375978, "learning_rate": 0.0001, "loss": 1.8223, "loss/crossentropy": 2.5338550806045532, "loss/fcd": 1.59765625, "loss/idx": 6.0, "loss/logits": 0.22468920052051544, "step": 1067 }, { "epoch": 0.015947558216800186, "grad_norm": 0.4296875, "grad_norm_var": 0.02469328244527181, "learning_rate": 0.0001, "loss": 1.5337, "loss/crossentropy": 2.4822702407836914, "loss/fcd": 1.37109375, "loss/idx": 6.0, "loss/logits": 0.16255860030651093, "step": 1068 }, { "epoch": 0.015962490387415165, "grad_norm": 0.5234375, "grad_norm_var": 0.02479132016499837, "learning_rate": 0.0001, "loss": 1.7657, "loss/crossentropy": 2.417506456375122, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.18761204183101654, "step": 1069 }, { "epoch": 0.015977422558030148, "grad_norm": 0.51953125, "grad_norm_var": 0.024909575780232746, "learning_rate": 0.0001, "loss": 1.7423, "loss/crossentropy": 2.6273897886276245, "loss/fcd": 1.54296875, "loss/idx": 6.0, "loss/logits": 0.19936949759721756, "step": 1070 }, { "epoch": 0.01599235472864513, "grad_norm": 0.54296875, "grad_norm_var": 0.02492521603902181, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.4808638095855713, "loss/fcd": 1.4453125, "loss/idx": 6.0, "loss/logits": 0.15783234685659409, "step": 1071 }, { "epoch": 0.01600728689926011, "grad_norm": 0.55078125, "grad_norm_var": 0.004235061009724935, "learning_rate": 0.0001, "loss": 1.7248, "loss/crossentropy": 2.5770764350891113, "loss/fcd": 1.53125, "loss/idx": 6.0, "loss/logits": 0.19359392672777176, "step": 1072 }, { "epoch": 0.016022219069875093, "grad_norm": 0.65625, "grad_norm_var": 0.004970534642537435, "learning_rate": 0.0001, "loss": 1.9244, "loss/crossentropy": 2.357397437095642, "loss/fcd": 1.703125, "loss/idx": 6.0, "loss/logits": 0.22131529450416565, "step": 1073 }, { "epoch": 0.016037151240490075, "grad_norm": 0.73828125, "grad_norm_var": 0.007446781794230143, "learning_rate": 0.0001, "loss": 1.9803, "loss/crossentropy": 2.9358640909194946, "loss/fcd": 1.74609375, "loss/idx": 6.0, "loss/logits": 0.23415900021791458, "step": 1074 }, { "epoch": 0.016052083411105054, "grad_norm": 0.48046875, "grad_norm_var": 0.0076928297678629555, "learning_rate": 0.0001, "loss": 1.7242, "loss/crossentropy": 2.5545125007629395, "loss/fcd": 1.52734375, "loss/idx": 6.0, "loss/logits": 0.19681578129529953, "step": 1075 }, { "epoch": 0.016067015581720037, "grad_norm": 0.58984375, "grad_norm_var": 0.007738224665323893, "learning_rate": 0.0001, "loss": 1.9907, "loss/crossentropy": 1.9322530627250671, "loss/fcd": 1.7578125, "loss/idx": 6.0, "loss/logits": 0.23285391926765442, "step": 1076 }, { "epoch": 0.01608194775233502, "grad_norm": 0.486328125, "grad_norm_var": 0.007957903544108073, "learning_rate": 0.0001, "loss": 1.6095, "loss/crossentropy": 2.7540615797042847, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.1758873388171196, "step": 1077 }, { "epoch": 0.01609687992295, "grad_norm": 0.478515625, "grad_norm_var": 0.008266178766886394, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.6025326251983643, "loss/fcd": 1.40625, "loss/idx": 6.0, "loss/logits": 0.16653118282556534, "step": 1078 }, { "epoch": 0.016111812093564982, "grad_norm": 0.4765625, "grad_norm_var": 0.008519856135050456, "learning_rate": 0.0001, "loss": 1.7698, "loss/crossentropy": 2.574973702430725, "loss/fcd": 1.5546875, "loss/idx": 6.0, "loss/logits": 0.21514993906021118, "step": 1079 }, { "epoch": 0.01612674426417996, "grad_norm": 0.4609375, "grad_norm_var": 0.008739201227823894, "learning_rate": 0.0001, "loss": 1.7841, "loss/crossentropy": 2.5307178497314453, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.1903010457754135, "step": 1080 }, { "epoch": 0.016141676434794944, "grad_norm": 0.53515625, "grad_norm_var": 0.00864103635152181, "learning_rate": 0.0001, "loss": 1.7499, "loss/crossentropy": 2.674168348312378, "loss/fcd": 1.546875, "loss/idx": 6.0, "loss/logits": 0.20304765552282333, "step": 1081 }, { "epoch": 0.016156608605409926, "grad_norm": 0.5234375, "grad_norm_var": 0.005962483088175456, "learning_rate": 0.0001, "loss": 1.7734, "loss/crossentropy": 2.5411276817321777, "loss/fcd": 1.58203125, "loss/idx": 6.0, "loss/logits": 0.1913544237613678, "step": 1082 }, { "epoch": 0.016171540776024906, "grad_norm": 0.431640625, "grad_norm_var": 0.006526676813761393, "learning_rate": 0.0001, "loss": 1.5836, "loss/crossentropy": 2.050433099269867, "loss/fcd": 1.4375, "loss/idx": 6.0, "loss/logits": 0.14609526097774506, "step": 1083 }, { "epoch": 0.01618647294663989, "grad_norm": 0.5859375, "grad_norm_var": 0.006035852432250977, "learning_rate": 0.0001, "loss": 1.7856, "loss/crossentropy": 2.2005414366722107, "loss/fcd": 1.6015625, "loss/idx": 6.0, "loss/logits": 0.18407931923866272, "step": 1084 }, { "epoch": 0.01620140511725487, "grad_norm": 0.498046875, "grad_norm_var": 0.006119537353515625, "learning_rate": 0.0001, "loss": 1.79, "loss/crossentropy": 2.430140733718872, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.2274610549211502, "step": 1085 }, { "epoch": 0.01621633728786985, "grad_norm": 0.4296875, "grad_norm_var": 0.00680535634358724, "learning_rate": 0.0001, "loss": 1.5372, "loss/crossentropy": 2.7481919527053833, "loss/fcd": 1.37890625, "loss/idx": 6.0, "loss/logits": 0.1583072543144226, "step": 1086 }, { "epoch": 0.016231269458484833, "grad_norm": 0.57421875, "grad_norm_var": 0.00692437489827474, "learning_rate": 0.0001, "loss": 1.8689, "loss/crossentropy": 2.4764903783798218, "loss/fcd": 1.61328125, "loss/idx": 6.0, "loss/logits": 0.255642831325531, "step": 1087 }, { "epoch": 0.016246201629099816, "grad_norm": 0.42578125, "grad_norm_var": 0.0075713475545247395, "learning_rate": 0.0001, "loss": 1.6201, "loss/crossentropy": 2.584195852279663, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.1786525845527649, "step": 1088 }, { "epoch": 0.016261133799714795, "grad_norm": 0.5859375, "grad_norm_var": 0.0066329320271809895, "learning_rate": 0.0001, "loss": 1.7831, "loss/crossentropy": 2.3816990852355957, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.20887838304042816, "step": 1089 }, { "epoch": 0.016276065970329778, "grad_norm": 0.462890625, "grad_norm_var": 0.0033138116200764974, "learning_rate": 0.0001, "loss": 1.6247, "loss/crossentropy": 2.5489161014556885, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.16374288499355316, "step": 1090 }, { "epoch": 0.016290998140944757, "grad_norm": 0.48828125, "grad_norm_var": 0.0032956282297770183, "learning_rate": 0.0001, "loss": 1.6529, "loss/crossentropy": 2.699458956718445, "loss/fcd": 1.4765625, "loss/idx": 6.0, "loss/logits": 0.176344595849514, "step": 1091 }, { "epoch": 0.01630593031155974, "grad_norm": 0.462890625, "grad_norm_var": 0.0028172810872395832, "learning_rate": 0.0001, "loss": 1.6489, "loss/crossentropy": 2.5479389429092407, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.18014751374721527, "step": 1092 }, { "epoch": 0.016320862482174722, "grad_norm": 0.498046875, "grad_norm_var": 0.0028136571248372397, "learning_rate": 0.0001, "loss": 1.7677, "loss/crossentropy": 2.7490711212158203, "loss/fcd": 1.55859375, "loss/idx": 6.0, "loss/logits": 0.2091096192598343, "step": 1093 }, { "epoch": 0.0163357946527897, "grad_norm": 1.046875, "grad_norm_var": 0.02176359494527181, "learning_rate": 0.0001, "loss": 1.9129, "loss/crossentropy": 2.8259459733963013, "loss/fcd": 1.66015625, "loss/idx": 6.0, "loss/logits": 0.25274983793497086, "step": 1094 }, { "epoch": 0.016350726823404684, "grad_norm": 0.47265625, "grad_norm_var": 0.02179258664449056, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.47867751121521, "loss/fcd": 1.36328125, "loss/idx": 6.0, "loss/logits": 0.14975561946630478, "step": 1095 }, { "epoch": 0.016365658994019667, "grad_norm": 0.435546875, "grad_norm_var": 0.02206719716389974, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.477361798286438, "loss/fcd": 1.390625, "loss/idx": 6.0, "loss/logits": 0.160676509141922, "step": 1096 }, { "epoch": 0.016380591164634646, "grad_norm": 0.4765625, "grad_norm_var": 0.02223027547200521, "learning_rate": 0.0001, "loss": 1.7071, "loss/crossentropy": 2.768059492111206, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.19536980986595154, "step": 1097 }, { "epoch": 0.01639552333524963, "grad_norm": 0.482421875, "grad_norm_var": 0.02234342892964681, "learning_rate": 0.0001, "loss": 1.7626, "loss/crossentropy": 2.445541024208069, "loss/fcd": 1.546875, "loss/idx": 6.0, "loss/logits": 0.2156984806060791, "step": 1098 }, { "epoch": 0.01641045550586461, "grad_norm": 0.50390625, "grad_norm_var": 0.021795908610026043, "learning_rate": 0.0001, "loss": 1.5739, "loss/crossentropy": 2.301167607307434, "loss/fcd": 1.4140625, "loss/idx": 6.0, "loss/logits": 0.15980088710784912, "step": 1099 }, { "epoch": 0.01642538767647959, "grad_norm": 0.40625, "grad_norm_var": 0.02239837646484375, "learning_rate": 0.0001, "loss": 1.6301, "loss/crossentropy": 2.4962133169174194, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.1769598126411438, "step": 1100 }, { "epoch": 0.016440319847094573, "grad_norm": 0.474609375, "grad_norm_var": 0.022487640380859375, "learning_rate": 0.0001, "loss": 1.7041, "loss/crossentropy": 2.6290271282196045, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.2041047364473343, "step": 1101 }, { "epoch": 0.016455252017709553, "grad_norm": 0.43359375, "grad_norm_var": 0.02244459788004557, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.6902036666870117, "loss/fcd": 1.45703125, "loss/idx": 6.0, "loss/logits": 0.17346254736185074, "step": 1102 }, { "epoch": 0.016470184188324535, "grad_norm": 0.46484375, "grad_norm_var": 0.022319984436035157, "learning_rate": 0.0001, "loss": 1.7848, "loss/crossentropy": 2.574817419052124, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.20672458410263062, "step": 1103 }, { "epoch": 0.016485116358939518, "grad_norm": 0.52734375, "grad_norm_var": 0.02185713450113932, "learning_rate": 0.0001, "loss": 1.5013, "loss/crossentropy": 2.7195013761520386, "loss/fcd": 1.359375, "loss/idx": 6.0, "loss/logits": 0.14197393506765366, "step": 1104 }, { "epoch": 0.016500048529554497, "grad_norm": 0.59765625, "grad_norm_var": 0.021978251139322915, "learning_rate": 0.0001, "loss": 1.7151, "loss/crossentropy": 2.817270874977112, "loss/fcd": 1.52734375, "loss/idx": 6.0, "loss/logits": 0.18773505836725235, "step": 1105 }, { "epoch": 0.01651498070016948, "grad_norm": 0.6328125, "grad_norm_var": 0.022610203425089518, "learning_rate": 0.0001, "loss": 1.7907, "loss/crossentropy": 2.5092287063598633, "loss/fcd": 1.5625, "loss/idx": 6.0, "loss/logits": 0.22824940085411072, "step": 1106 }, { "epoch": 0.016529912870784463, "grad_norm": 0.6171875, "grad_norm_var": 0.02301303545633952, "learning_rate": 0.0001, "loss": 1.8864, "loss/crossentropy": 2.5941046476364136, "loss/fcd": 1.6484375, "loss/idx": 6.0, "loss/logits": 0.2380032166838646, "step": 1107 }, { "epoch": 0.016544845041399442, "grad_norm": 0.48828125, "grad_norm_var": 0.02281487782796224, "learning_rate": 0.0001, "loss": 1.8045, "loss/crossentropy": 2.718872547149658, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.21072804927825928, "step": 1108 }, { "epoch": 0.016559777212014425, "grad_norm": 0.498046875, "grad_norm_var": 0.02281487782796224, "learning_rate": 0.0001, "loss": 1.6336, "loss/crossentropy": 2.7627276182174683, "loss/fcd": 1.453125, "loss/idx": 6.0, "loss/logits": 0.18051744997501373, "step": 1109 }, { "epoch": 0.016574709382629407, "grad_norm": 0.62109375, "grad_norm_var": 0.0050809224446614586, "learning_rate": 0.0001, "loss": 1.9104, "loss/crossentropy": 2.403494715690613, "loss/fcd": 1.69921875, "loss/idx": 6.0, "loss/logits": 0.21119916439056396, "step": 1110 }, { "epoch": 0.016589641553244387, "grad_norm": 0.4375, "grad_norm_var": 0.00532525380452474, "learning_rate": 0.0001, "loss": 1.6656, "loss/crossentropy": 2.596889853477478, "loss/fcd": 1.47265625, "loss/idx": 6.0, "loss/logits": 0.19291236251592636, "step": 1111 }, { "epoch": 0.01660457372385937, "grad_norm": 0.443359375, "grad_norm_var": 0.005255572001139323, "learning_rate": 0.0001, "loss": 1.5324, "loss/crossentropy": 2.6120318174362183, "loss/fcd": 1.3671875, "loss/idx": 6.0, "loss/logits": 0.16524401307106018, "step": 1112 }, { "epoch": 0.016619505894474352, "grad_norm": 0.54296875, "grad_norm_var": 0.005265299479166667, "learning_rate": 0.0001, "loss": 1.8961, "loss/crossentropy": 2.5684638023376465, "loss/fcd": 1.6953125, "loss/idx": 6.0, "loss/logits": 0.20081853866577148, "step": 1113 }, { "epoch": 0.01663443806508933, "grad_norm": 0.5546875, "grad_norm_var": 0.0053188165028889975, "learning_rate": 0.0001, "loss": 1.6964, "loss/crossentropy": 2.8103604316711426, "loss/fcd": 1.5, "loss/idx": 6.0, "loss/logits": 0.19644811004400253, "step": 1114 }, { "epoch": 0.016649370235704314, "grad_norm": 0.5078125, "grad_norm_var": 0.005313857396443685, "learning_rate": 0.0001, "loss": 1.7268, "loss/crossentropy": 2.799278974533081, "loss/fcd": 1.5234375, "loss/idx": 6.0, "loss/logits": 0.20339705049991608, "step": 1115 }, { "epoch": 0.016664302406319293, "grad_norm": 0.55078125, "grad_norm_var": 0.004514042536417643, "learning_rate": 0.0001, "loss": 1.7984, "loss/crossentropy": 2.626010537147522, "loss/fcd": 1.59765625, "loss/idx": 6.0, "loss/logits": 0.20072130858898163, "step": 1116 }, { "epoch": 0.016679234576934276, "grad_norm": 0.53125, "grad_norm_var": 0.004337501525878906, "learning_rate": 0.0001, "loss": 1.6067, "loss/crossentropy": 2.4577295780181885, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.17310378700494766, "step": 1117 }, { "epoch": 0.01669416674754926, "grad_norm": 0.44140625, "grad_norm_var": 0.004242897033691406, "learning_rate": 0.0001, "loss": 1.6291, "loss/crossentropy": 2.533613443374634, "loss/fcd": 1.4453125, "loss/idx": 6.0, "loss/logits": 0.18379881232976913, "step": 1118 }, { "epoch": 0.016709098918164238, "grad_norm": 0.48828125, "grad_norm_var": 0.004078102111816406, "learning_rate": 0.0001, "loss": 1.8031, "loss/crossentropy": 2.4966949224472046, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.2093115895986557, "step": 1119 }, { "epoch": 0.01672403108877922, "grad_norm": 0.5078125, "grad_norm_var": 0.004108937581380209, "learning_rate": 0.0001, "loss": 1.6413, "loss/crossentropy": 2.5867191553115845, "loss/fcd": 1.46484375, "loss/idx": 6.0, "loss/logits": 0.17646171152591705, "step": 1120 }, { "epoch": 0.016738963259394203, "grad_norm": 0.494140625, "grad_norm_var": 0.0038284142812093098, "learning_rate": 0.0001, "loss": 1.7152, "loss/crossentropy": 2.971528172492981, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.20348946750164032, "step": 1121 }, { "epoch": 0.016753895430009182, "grad_norm": 0.5546875, "grad_norm_var": 0.0030591169993082683, "learning_rate": 0.0001, "loss": 1.7426, "loss/crossentropy": 2.5865492820739746, "loss/fcd": 1.54296875, "loss/idx": 6.0, "loss/logits": 0.19963253289461136, "step": 1122 }, { "epoch": 0.016768827600624165, "grad_norm": 0.52734375, "grad_norm_var": 0.0023689111073811847, "learning_rate": 0.0001, "loss": 1.8772, "loss/crossentropy": 2.428372383117676, "loss/fcd": 1.65234375, "loss/idx": 6.0, "loss/logits": 0.22489413619041443, "step": 1123 }, { "epoch": 0.016783759771239148, "grad_norm": 0.458984375, "grad_norm_var": 0.0025145848592122394, "learning_rate": 0.0001, "loss": 1.6141, "loss/crossentropy": 2.5533376932144165, "loss/fcd": 1.4453125, "loss/idx": 6.0, "loss/logits": 0.16882772743701935, "step": 1124 }, { "epoch": 0.016798691941854127, "grad_norm": 0.53125, "grad_norm_var": 0.002530527114868164, "learning_rate": 0.0001, "loss": 1.7549, "loss/crossentropy": 2.6719311475753784, "loss/fcd": 1.55078125, "loss/idx": 6.0, "loss/logits": 0.20412559807300568, "step": 1125 }, { "epoch": 0.01681362411246911, "grad_norm": 0.44140625, "grad_norm_var": 0.0019368330637613933, "learning_rate": 0.0001, "loss": 1.6545, "loss/crossentropy": 2.4006348848342896, "loss/fcd": 1.46484375, "loss/idx": 6.0, "loss/logits": 0.18963538110256195, "step": 1126 }, { "epoch": 0.01682855628308409, "grad_norm": 0.40234375, "grad_norm_var": 0.0023110548655192057, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.6121081113815308, "loss/fcd": 1.3515625, "loss/idx": 6.0, "loss/logits": 0.14626048505306244, "step": 1127 }, { "epoch": 0.01684348845369907, "grad_norm": 0.494140625, "grad_norm_var": 0.0020978132883707683, "learning_rate": 0.0001, "loss": 1.869, "loss/crossentropy": 2.564122796058655, "loss/fcd": 1.63671875, "loss/idx": 6.0, "loss/logits": 0.23223726451396942, "step": 1128 }, { "epoch": 0.016858420624314054, "grad_norm": 0.447265625, "grad_norm_var": 0.0021453221638997396, "learning_rate": 0.0001, "loss": 1.6052, "loss/crossentropy": 2.371549606323242, "loss/fcd": 1.43359375, "loss/idx": 6.0, "loss/logits": 0.17157655954360962, "step": 1129 }, { "epoch": 0.016873352794929034, "grad_norm": 0.57421875, "grad_norm_var": 0.0023223876953125, "learning_rate": 0.0001, "loss": 1.8083, "loss/crossentropy": 2.6700668334960938, "loss/fcd": 1.59375, "loss/idx": 6.0, "loss/logits": 0.21453458815813065, "step": 1130 }, { "epoch": 0.016888284965544016, "grad_norm": 0.4375, "grad_norm_var": 0.002530670166015625, "learning_rate": 0.0001, "loss": 1.5809, "loss/crossentropy": 2.5727399587631226, "loss/fcd": 1.41796875, "loss/idx": 6.0, "loss/logits": 0.16292241215705872, "step": 1131 }, { "epoch": 0.016903217136159, "grad_norm": 0.95703125, "grad_norm_var": 0.01599299112955729, "learning_rate": 0.0001, "loss": 1.9989, "loss/crossentropy": 2.248908281326294, "loss/fcd": 1.75390625, "loss/idx": 6.0, "loss/logits": 0.24498894810676575, "step": 1132 }, { "epoch": 0.016918149306773978, "grad_norm": 0.4375, "grad_norm_var": 0.01637751261393229, "learning_rate": 0.0001, "loss": 1.6055, "loss/crossentropy": 2.702947497367859, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.17967890202999115, "step": 1133 }, { "epoch": 0.01693308147738896, "grad_norm": 0.53125, "grad_norm_var": 0.01603387196858724, "learning_rate": 0.0001, "loss": 1.7999, "loss/crossentropy": 2.477932572364807, "loss/fcd": 1.59765625, "loss/idx": 6.0, "loss/logits": 0.2022503912448883, "step": 1134 }, { "epoch": 0.016948013648003944, "grad_norm": 0.486328125, "grad_norm_var": 0.01604180335998535, "learning_rate": 0.0001, "loss": 1.7251, "loss/crossentropy": 2.6051642894744873, "loss/fcd": 1.515625, "loss/idx": 6.0, "loss/logits": 0.20952476561069489, "step": 1135 }, { "epoch": 0.016962945818618923, "grad_norm": 0.41796875, "grad_norm_var": 0.016664743423461914, "learning_rate": 0.0001, "loss": 1.6029, "loss/crossentropy": 2.4434300661087036, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.17716213315725327, "step": 1136 }, { "epoch": 0.016977877989233905, "grad_norm": 0.48046875, "grad_norm_var": 0.016709136962890624, "learning_rate": 0.0001, "loss": 1.5944, "loss/crossentropy": 2.532817840576172, "loss/fcd": 1.42578125, "loss/idx": 6.0, "loss/logits": 0.1686493381857872, "step": 1137 }, { "epoch": 0.016992810159848888, "grad_norm": 0.4921875, "grad_norm_var": 0.01659113566080729, "learning_rate": 0.0001, "loss": 1.7766, "loss/crossentropy": 2.6693389415740967, "loss/fcd": 1.578125, "loss/idx": 6.0, "loss/logits": 0.19849102199077606, "step": 1138 }, { "epoch": 0.017007742330463867, "grad_norm": 0.48046875, "grad_norm_var": 0.01660334269205729, "learning_rate": 0.0001, "loss": 1.5415, "loss/crossentropy": 2.757236123085022, "loss/fcd": 1.375, "loss/idx": 6.0, "loss/logits": 0.16651207208633423, "step": 1139 }, { "epoch": 0.01702267450107885, "grad_norm": 0.431640625, "grad_norm_var": 0.016815630594889323, "learning_rate": 0.0001, "loss": 1.6417, "loss/crossentropy": 2.5029356479644775, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.1807950809597969, "step": 1140 }, { "epoch": 0.01703760667169383, "grad_norm": 0.71484375, "grad_norm_var": 0.019621531168619793, "learning_rate": 0.0001, "loss": 1.8059, "loss/crossentropy": 2.843307852745056, "loss/fcd": 1.60546875, "loss/idx": 6.0, "loss/logits": 0.20043348520994186, "step": 1141 }, { "epoch": 0.017052538842308812, "grad_norm": 0.46484375, "grad_norm_var": 0.019428507486979166, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.6783461570739746, "loss/fcd": 1.46875, "loss/idx": 6.0, "loss/logits": 0.18495432287454605, "step": 1142 }, { "epoch": 0.017067471012923795, "grad_norm": 0.5, "grad_norm_var": 0.018549537658691405, "learning_rate": 0.0001, "loss": 1.6484, "loss/crossentropy": 2.5440350770950317, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.1874828264117241, "step": 1143 }, { "epoch": 0.017082403183538774, "grad_norm": 0.4453125, "grad_norm_var": 0.01887815793355306, "learning_rate": 0.0001, "loss": 1.51, "loss/crossentropy": 2.4569047689437866, "loss/fcd": 1.35546875, "loss/idx": 6.0, "loss/logits": 0.15451618283987045, "step": 1144 }, { "epoch": 0.017097335354153757, "grad_norm": 0.5078125, "grad_norm_var": 0.018530782063802084, "learning_rate": 0.0001, "loss": 1.6079, "loss/crossentropy": 2.605596423149109, "loss/fcd": 1.44140625, "loss/idx": 6.0, "loss/logits": 0.16648275405168533, "step": 1145 }, { "epoch": 0.01711226752476874, "grad_norm": 0.484375, "grad_norm_var": 0.018415260314941406, "learning_rate": 0.0001, "loss": 1.7809, "loss/crossentropy": 2.4507850408554077, "loss/fcd": 1.56640625, "loss/idx": 6.0, "loss/logits": 0.21446262300014496, "step": 1146 }, { "epoch": 0.01712719969538372, "grad_norm": 0.470703125, "grad_norm_var": 0.018132893244425456, "learning_rate": 0.0001, "loss": 1.5468, "loss/crossentropy": 2.5292779207229614, "loss/fcd": 1.3828125, "loss/idx": 6.0, "loss/logits": 0.16397973895072937, "step": 1147 }, { "epoch": 0.0171421318659987, "grad_norm": 0.4453125, "grad_norm_var": 0.004606993993123373, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 2.413945198059082, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.17828668653964996, "step": 1148 }, { "epoch": 0.017157064036613684, "grad_norm": 0.484375, "grad_norm_var": 0.004435332616170248, "learning_rate": 0.0001, "loss": 1.8044, "loss/crossentropy": 2.469904065132141, "loss/fcd": 1.57421875, "loss/idx": 6.0, "loss/logits": 0.23020246624946594, "step": 1149 }, { "epoch": 0.017171996207228663, "grad_norm": 0.48828125, "grad_norm_var": 0.00431364377339681, "learning_rate": 0.0001, "loss": 1.7087, "loss/crossentropy": 2.64601993560791, "loss/fcd": 1.51171875, "loss/idx": 6.0, "loss/logits": 0.19701046496629715, "step": 1150 }, { "epoch": 0.017186928377843646, "grad_norm": 0.4375, "grad_norm_var": 0.004468218485514323, "learning_rate": 0.0001, "loss": 1.715, "loss/crossentropy": 2.6214022636413574, "loss/fcd": 1.51953125, "loss/idx": 6.0, "loss/logits": 0.1954970881342888, "step": 1151 }, { "epoch": 0.017201860548458625, "grad_norm": 0.451171875, "grad_norm_var": 0.004244216283162435, "learning_rate": 0.0001, "loss": 1.6446, "loss/crossentropy": 2.631308913230896, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.1836615949869156, "step": 1152 }, { "epoch": 0.017216792719073608, "grad_norm": 0.46484375, "grad_norm_var": 0.0042714277903238935, "learning_rate": 0.0001, "loss": 1.6015, "loss/crossentropy": 2.698973059654236, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.1718554049730301, "step": 1153 }, { "epoch": 0.01723172488968859, "grad_norm": 0.44921875, "grad_norm_var": 0.004346958796183268, "learning_rate": 0.0001, "loss": 1.6272, "loss/crossentropy": 2.672145128250122, "loss/fcd": 1.45703125, "loss/idx": 6.0, "loss/logits": 0.1701432168483734, "step": 1154 }, { "epoch": 0.01724665706030357, "grad_norm": 0.49609375, "grad_norm_var": 0.00435789426167806, "learning_rate": 0.0001, "loss": 1.6417, "loss/crossentropy": 2.340711236000061, "loss/fcd": 1.4609375, "loss/idx": 6.0, "loss/logits": 0.18080533295869827, "step": 1155 }, { "epoch": 0.017261589230918552, "grad_norm": 0.447265625, "grad_norm_var": 0.0042650699615478516, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.760603189468384, "loss/fcd": 1.4375, "loss/idx": 6.0, "loss/logits": 0.16555795073509216, "step": 1156 }, { "epoch": 0.017276521401533535, "grad_norm": 0.431640625, "grad_norm_var": 0.000579833984375, "learning_rate": 0.0001, "loss": 1.5944, "loss/crossentropy": 2.6986688375473022, "loss/fcd": 1.4296875, "loss/idx": 6.0, "loss/logits": 0.1646973043680191, "step": 1157 }, { "epoch": 0.017291453572148514, "grad_norm": 0.515625, "grad_norm_var": 0.0007277806599934896, "learning_rate": 0.0001, "loss": 1.5676, "loss/crossentropy": 2.542388439178467, "loss/fcd": 1.40625, "loss/idx": 6.0, "loss/logits": 0.1613418385386467, "step": 1158 }, { "epoch": 0.017306385742763497, "grad_norm": 0.458984375, "grad_norm_var": 0.0006687005360921223, "learning_rate": 0.0001, "loss": 1.6341, "loss/crossentropy": 2.4559755325317383, "loss/fcd": 1.45703125, "loss/idx": 6.0, "loss/logits": 0.17710646241903305, "step": 1159 }, { "epoch": 0.01732131791337848, "grad_norm": 0.4375, "grad_norm_var": 0.0006955305735270183, "learning_rate": 0.0001, "loss": 1.5563, "loss/crossentropy": 2.4609339237213135, "loss/fcd": 1.390625, "loss/idx": 6.0, "loss/logits": 0.16569382697343826, "step": 1160 }, { "epoch": 0.01733625008399346, "grad_norm": 0.3359375, "grad_norm_var": 0.0016047000885009766, "learning_rate": 0.0001, "loss": 1.6527, "loss/crossentropy": 2.657406806945801, "loss/fcd": 1.48828125, "loss/idx": 6.25, "loss/logits": 0.16446168720722198, "step": 1161 }, { "epoch": 0.01735118225460844, "grad_norm": 0.314453125, "grad_norm_var": 0.0027704238891601562, "learning_rate": 0.0001, "loss": 1.372, "loss/crossentropy": 2.767332077026367, "loss/fcd": 1.2265625, "loss/idx": 6.5, "loss/logits": 0.14546091854572296, "step": 1162 }, { "epoch": 0.01736611442522342, "grad_norm": 0.3671875, "grad_norm_var": 0.0030930678049723307, "learning_rate": 0.0001, "loss": 1.6071, "loss/crossentropy": 2.622250199317932, "loss/fcd": 1.41796875, "loss/idx": 6.5, "loss/logits": 0.18909041583538055, "step": 1163 }, { "epoch": 0.017381046595838404, "grad_norm": 0.30078125, "grad_norm_var": 0.0042786757151285805, "learning_rate": 0.0001, "loss": 1.4446, "loss/crossentropy": 2.528477191925049, "loss/fcd": 1.28515625, "loss/idx": 6.5, "loss/logits": 0.15940909832715988, "step": 1164 }, { "epoch": 0.017395978766453386, "grad_norm": 0.298828125, "grad_norm_var": 0.005086517333984375, "learning_rate": 0.0001, "loss": 1.459, "loss/crossentropy": 2.553607702255249, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.16601165384054184, "step": 1165 }, { "epoch": 0.017410910937068366, "grad_norm": 0.375, "grad_norm_var": 0.00483392079671224, "learning_rate": 0.0001, "loss": 1.6278, "loss/crossentropy": 2.3345470428466797, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.1863660141825676, "step": 1166 }, { "epoch": 0.017425843107683348, "grad_norm": 0.326171875, "grad_norm_var": 0.005220778783162435, "learning_rate": 0.0001, "loss": 1.7798, "loss/crossentropy": 2.4525299072265625, "loss/fcd": 1.54296875, "loss/idx": 6.5, "loss/logits": 0.23679041117429733, "step": 1167 }, { "epoch": 0.01744077527829833, "grad_norm": 0.3125, "grad_norm_var": 0.005558204650878906, "learning_rate": 0.0001, "loss": 1.5068, "loss/crossentropy": 2.396833300590515, "loss/fcd": 1.33984375, "loss/idx": 6.5, "loss/logits": 0.16692077368497849, "step": 1168 }, { "epoch": 0.01745570744891331, "grad_norm": 0.341796875, "grad_norm_var": 0.005370950698852539, "learning_rate": 0.0001, "loss": 1.5891, "loss/crossentropy": 2.688977599143982, "loss/fcd": 1.3984375, "loss/idx": 6.5, "loss/logits": 0.1906268373131752, "step": 1169 }, { "epoch": 0.017470639619528293, "grad_norm": 0.33203125, "grad_norm_var": 0.005273675918579102, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.6352808475494385, "loss/fcd": 1.39453125, "loss/idx": 6.5, "loss/logits": 0.17897118628025055, "step": 1170 }, { "epoch": 0.017485571790143276, "grad_norm": 0.314453125, "grad_norm_var": 0.004541969299316407, "learning_rate": 0.0001, "loss": 1.5492, "loss/crossentropy": 2.646498441696167, "loss/fcd": 1.375, "loss/idx": 6.5, "loss/logits": 0.17418432235717773, "step": 1171 }, { "epoch": 0.017500503960758255, "grad_norm": 0.396484375, "grad_norm_var": 0.004175821940104167, "learning_rate": 0.0001, "loss": 1.7385, "loss/crossentropy": 2.3632874488830566, "loss/fcd": 1.5, "loss/idx": 6.5, "loss/logits": 0.23853551596403122, "step": 1172 }, { "epoch": 0.017515436131373237, "grad_norm": 0.3515625, "grad_norm_var": 0.0038780053456624348, "learning_rate": 0.0001, "loss": 1.6379, "loss/crossentropy": 2.709755778312683, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.18869873881340027, "step": 1173 }, { "epoch": 0.01753036830198822, "grad_norm": 0.3203125, "grad_norm_var": 0.0022408644358317058, "learning_rate": 0.0001, "loss": 1.6935, "loss/crossentropy": 2.5511062145233154, "loss/fcd": 1.46875, "loss/idx": 6.5, "loss/logits": 0.2247961387038231, "step": 1174 }, { "epoch": 0.0175453004726032, "grad_norm": 0.470703125, "grad_norm_var": 0.002421299616495768, "learning_rate": 0.0001, "loss": 1.7294, "loss/crossentropy": 2.720292091369629, "loss/fcd": 1.53125, "loss/idx": 6.5, "loss/logits": 0.1981579214334488, "step": 1175 }, { "epoch": 0.017560232643218182, "grad_norm": 0.30859375, "grad_norm_var": 0.0019513289133707681, "learning_rate": 0.0001, "loss": 1.4729, "loss/crossentropy": 2.636868119239807, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.17988184094429016, "step": 1176 }, { "epoch": 0.01757516481383316, "grad_norm": 0.3359375, "grad_norm_var": 0.0019513289133707681, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.7590854167938232, "loss/fcd": 1.23828125, "loss/idx": 6.5, "loss/logits": 0.15085097402334213, "step": 1177 }, { "epoch": 0.017590096984448144, "grad_norm": 0.341796875, "grad_norm_var": 0.001898813247680664, "learning_rate": 0.0001, "loss": 1.6978, "loss/crossentropy": 2.5070645809173584, "loss/fcd": 1.46484375, "loss/idx": 6.5, "loss/logits": 0.2329171895980835, "step": 1178 }, { "epoch": 0.017605029155063127, "grad_norm": 0.396484375, "grad_norm_var": 0.002045440673828125, "learning_rate": 0.0001, "loss": 1.8775, "loss/crossentropy": 2.5748108625411987, "loss/fcd": 1.609375, "loss/idx": 6.5, "loss/logits": 0.26812054216861725, "step": 1179 }, { "epoch": 0.017619961325678106, "grad_norm": 0.291015625, "grad_norm_var": 0.0021092573801676433, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.4305708408355713, "loss/fcd": 1.34375, "loss/idx": 6.5, "loss/logits": 0.1850065141916275, "step": 1180 }, { "epoch": 0.01763489349629309, "grad_norm": 0.29296875, "grad_norm_var": 0.0021471659342447917, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.5679367780685425, "loss/fcd": 1.26171875, "loss/idx": 6.5, "loss/logits": 0.1688312292098999, "step": 1181 }, { "epoch": 0.01764982566690807, "grad_norm": 0.4375, "grad_norm_var": 0.002647654215494792, "learning_rate": 0.0001, "loss": 1.9263, "loss/crossentropy": 2.317387104034424, "loss/fcd": 1.66796875, "loss/idx": 6.5, "loss/logits": 0.2583017721772194, "step": 1182 }, { "epoch": 0.01766475783752305, "grad_norm": 0.341796875, "grad_norm_var": 0.002617136637369792, "learning_rate": 0.0001, "loss": 1.6345, "loss/crossentropy": 2.413970947265625, "loss/fcd": 1.4375, "loss/idx": 6.5, "loss/logits": 0.1969573274254799, "step": 1183 }, { "epoch": 0.017679690008138033, "grad_norm": 0.36328125, "grad_norm_var": 0.002530352274576823, "learning_rate": 0.0001, "loss": 1.8208, "loss/crossentropy": 2.495847702026367, "loss/fcd": 1.609375, "loss/idx": 6.5, "loss/logits": 0.21144142746925354, "step": 1184 }, { "epoch": 0.017694622178753016, "grad_norm": 0.453125, "grad_norm_var": 0.0031491438547770183, "learning_rate": 0.0001, "loss": 1.8956, "loss/crossentropy": 2.4860759973526, "loss/fcd": 1.625, "loss/idx": 6.5, "loss/logits": 0.27056364715099335, "step": 1185 }, { "epoch": 0.017709554349367995, "grad_norm": 0.294921875, "grad_norm_var": 0.003369903564453125, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.79799485206604, "loss/fcd": 1.2578125, "loss/idx": 6.5, "loss/logits": 0.15437982231378555, "step": 1186 }, { "epoch": 0.017724486519982978, "grad_norm": 0.33203125, "grad_norm_var": 0.003289651870727539, "learning_rate": 0.0001, "loss": 1.7309, "loss/crossentropy": 2.386631965637207, "loss/fcd": 1.49609375, "loss/idx": 6.5, "loss/logits": 0.2348131462931633, "step": 1187 }, { "epoch": 0.017739418690597957, "grad_norm": 0.30859375, "grad_norm_var": 0.00332183837890625, "learning_rate": 0.0001, "loss": 1.4955, "loss/crossentropy": 2.6373519897460938, "loss/fcd": 1.3125, "loss/idx": 6.5, "loss/logits": 0.1830146610736847, "step": 1188 }, { "epoch": 0.01775435086121294, "grad_norm": 0.296875, "grad_norm_var": 0.0035158793131510415, "learning_rate": 0.0001, "loss": 1.5288, "loss/crossentropy": 2.6739686727523804, "loss/fcd": 1.34375, "loss/idx": 6.5, "loss/logits": 0.1850878894329071, "step": 1189 }, { "epoch": 0.017769283031827923, "grad_norm": 0.283203125, "grad_norm_var": 0.0037444909413655598, "learning_rate": 0.0001, "loss": 1.4293, "loss/crossentropy": 2.4060449600219727, "loss/fcd": 1.26953125, "loss/idx": 6.5, "loss/logits": 0.15978636592626572, "step": 1190 }, { "epoch": 0.017784215202442902, "grad_norm": 0.2890625, "grad_norm_var": 0.0028058369954427082, "learning_rate": 0.0001, "loss": 1.4926, "loss/crossentropy": 2.5885671377182007, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.17616400122642517, "step": 1191 }, { "epoch": 0.017799147373057884, "grad_norm": 0.279296875, "grad_norm_var": 0.0029643853505452473, "learning_rate": 0.0001, "loss": 1.4515, "loss/crossentropy": 2.5950748920440674, "loss/fcd": 1.28515625, "loss/idx": 6.5, "loss/logits": 0.16635886579751968, "step": 1192 }, { "epoch": 0.017814079543672867, "grad_norm": 0.337890625, "grad_norm_var": 0.002965227762858073, "learning_rate": 0.0001, "loss": 1.6403, "loss/crossentropy": 2.4442238807678223, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.19890712201595306, "step": 1193 }, { "epoch": 0.017829011714287846, "grad_norm": 0.796875, "grad_norm_var": 0.01639758745829264, "learning_rate": 0.0001, "loss": 1.904, "loss/crossentropy": 2.715569853782654, "loss/fcd": 1.49609375, "loss/idx": 6.5, "loss/logits": 0.4079201966524124, "step": 1194 }, { "epoch": 0.01784394388490283, "grad_norm": 0.267578125, "grad_norm_var": 0.016846577326456707, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.5312520265579224, "loss/fcd": 1.21875, "loss/idx": 6.5, "loss/logits": 0.14967987686395645, "step": 1195 }, { "epoch": 0.017858876055517812, "grad_norm": 0.302734375, "grad_norm_var": 0.016756550470987955, "learning_rate": 0.0001, "loss": 1.5207, "loss/crossentropy": 2.6301435232162476, "loss/fcd": 1.33984375, "loss/idx": 6.5, "loss/logits": 0.18084491789340973, "step": 1196 }, { "epoch": 0.01787380822613279, "grad_norm": 0.2734375, "grad_norm_var": 0.016941563288370768, "learning_rate": 0.0001, "loss": 1.4427, "loss/crossentropy": 2.401396632194519, "loss/fcd": 1.28515625, "loss/idx": 6.5, "loss/logits": 0.1575082242488861, "step": 1197 }, { "epoch": 0.017888740396747774, "grad_norm": 0.345703125, "grad_norm_var": 0.016441790262858073, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.4098260402679443, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.17468641698360443, "step": 1198 }, { "epoch": 0.017903672567362756, "grad_norm": 0.296875, "grad_norm_var": 0.01660447120666504, "learning_rate": 0.0001, "loss": 1.5407, "loss/crossentropy": 2.5780783891677856, "loss/fcd": 1.359375, "loss/idx": 6.5, "loss/logits": 0.18136019259691238, "step": 1199 }, { "epoch": 0.017918604737977736, "grad_norm": 0.30078125, "grad_norm_var": 0.016697041193644204, "learning_rate": 0.0001, "loss": 1.4706, "loss/crossentropy": 2.601935863494873, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.1659601628780365, "step": 1200 }, { "epoch": 0.01793353690859272, "grad_norm": 0.32421875, "grad_norm_var": 0.015811649958292644, "learning_rate": 0.0001, "loss": 1.5053, "loss/crossentropy": 2.5727096796035767, "loss/fcd": 1.32421875, "loss/idx": 6.5, "loss/logits": 0.18112681806087494, "step": 1201 }, { "epoch": 0.017948469079207698, "grad_norm": 0.341796875, "grad_norm_var": 0.015710179011027017, "learning_rate": 0.0001, "loss": 1.5072, "loss/crossentropy": 2.514320135116577, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.17511937022209167, "step": 1202 }, { "epoch": 0.01796340124982268, "grad_norm": 0.373046875, "grad_norm_var": 0.015793291727701823, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.3392175436019897, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.1891556680202484, "step": 1203 }, { "epoch": 0.017978333420437663, "grad_norm": 0.33203125, "grad_norm_var": 0.015733782450358072, "learning_rate": 0.0001, "loss": 1.6298, "loss/crossentropy": 2.6545450687408447, "loss/fcd": 1.43359375, "loss/idx": 6.5, "loss/logits": 0.1961742267012596, "step": 1204 }, { "epoch": 0.017993265591052642, "grad_norm": 0.353515625, "grad_norm_var": 0.015607945124308268, "learning_rate": 0.0001, "loss": 1.5463, "loss/crossentropy": 2.391342043876648, "loss/fcd": 1.37109375, "loss/idx": 6.5, "loss/logits": 0.17522381246089935, "step": 1205 }, { "epoch": 0.018008197761667625, "grad_norm": 0.5, "grad_norm_var": 0.01679884592692057, "learning_rate": 0.0001, "loss": 1.4859, "loss/crossentropy": 2.374518036842346, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.18116553127765656, "step": 1206 }, { "epoch": 0.018023129932282608, "grad_norm": 0.30859375, "grad_norm_var": 0.016645304361979165, "learning_rate": 0.0001, "loss": 1.6511, "loss/crossentropy": 2.563612222671509, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.2097008004784584, "step": 1207 }, { "epoch": 0.018038062102897587, "grad_norm": 0.36328125, "grad_norm_var": 0.01620036760965983, "learning_rate": 0.0001, "loss": 1.7428, "loss/crossentropy": 2.3562076091766357, "loss/fcd": 1.515625, "loss/idx": 6.5, "loss/logits": 0.2272082269191742, "step": 1208 }, { "epoch": 0.01805299427351257, "grad_norm": 0.357421875, "grad_norm_var": 0.016157134373982748, "learning_rate": 0.0001, "loss": 1.6028, "loss/crossentropy": 2.3315422534942627, "loss/fcd": 1.4140625, "loss/idx": 6.5, "loss/logits": 0.188755564391613, "step": 1209 }, { "epoch": 0.018067926444127552, "grad_norm": 0.296875, "grad_norm_var": 0.002981678644816081, "learning_rate": 0.0001, "loss": 1.5347, "loss/crossentropy": 2.6092514991760254, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.17925221472978592, "step": 1210 }, { "epoch": 0.01808285861474253, "grad_norm": 0.30859375, "grad_norm_var": 0.002725664774576823, "learning_rate": 0.0001, "loss": 1.5431, "loss/crossentropy": 2.524160623550415, "loss/fcd": 1.3671875, "loss/idx": 6.5, "loss/logits": 0.17589685320854187, "step": 1211 }, { "epoch": 0.018097790785357514, "grad_norm": 0.80859375, "grad_norm_var": 0.016463073094685872, "learning_rate": 0.0001, "loss": 1.7323, "loss/crossentropy": 2.454976439476013, "loss/fcd": 1.49609375, "loss/idx": 6.5, "loss/logits": 0.2362159788608551, "step": 1212 }, { "epoch": 0.018112722955972493, "grad_norm": 0.30859375, "grad_norm_var": 0.01609800656636556, "learning_rate": 0.0001, "loss": 1.4605, "loss/crossentropy": 2.6214447021484375, "loss/fcd": 1.296875, "loss/idx": 6.5, "loss/logits": 0.163585864007473, "step": 1213 }, { "epoch": 0.018127655126587476, "grad_norm": 0.384765625, "grad_norm_var": 0.016066853205362955, "learning_rate": 0.0001, "loss": 1.6508, "loss/crossentropy": 2.786266326904297, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.22115909308195114, "step": 1214 }, { "epoch": 0.01814258729720246, "grad_norm": 0.3203125, "grad_norm_var": 0.015865055720011394, "learning_rate": 0.0001, "loss": 1.6344, "loss/crossentropy": 2.6109365224838257, "loss/fcd": 1.41796875, "loss/idx": 6.5, "loss/logits": 0.21646693348884583, "step": 1215 }, { "epoch": 0.018157519467817438, "grad_norm": 0.333984375, "grad_norm_var": 0.015610249837239583, "learning_rate": 0.0001, "loss": 1.5222, "loss/crossentropy": 2.4061062335968018, "loss/fcd": 1.3515625, "loss/idx": 6.5, "loss/logits": 0.1706232950091362, "step": 1216 }, { "epoch": 0.01817245163843242, "grad_norm": 0.279296875, "grad_norm_var": 0.0160463809967041, "learning_rate": 0.0001, "loss": 1.4615, "loss/crossentropy": 2.537352442741394, "loss/fcd": 1.2890625, "loss/idx": 6.5, "loss/logits": 0.17248404771089554, "step": 1217 }, { "epoch": 0.018187383809047403, "grad_norm": 0.376953125, "grad_norm_var": 0.015976572036743165, "learning_rate": 0.0001, "loss": 1.6346, "loss/crossentropy": 2.5680431127548218, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.20487521588802338, "step": 1218 }, { "epoch": 0.018202315979662383, "grad_norm": 0.287109375, "grad_norm_var": 0.016464726130167643, "learning_rate": 0.0001, "loss": 1.4726, "loss/crossentropy": 2.602305054664612, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.16788026690483093, "step": 1219 }, { "epoch": 0.018217248150277365, "grad_norm": 0.306640625, "grad_norm_var": 0.01663354237874349, "learning_rate": 0.0001, "loss": 1.6387, "loss/crossentropy": 2.514701008796692, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.2090064287185669, "step": 1220 }, { "epoch": 0.018232180320892348, "grad_norm": 0.345703125, "grad_norm_var": 0.016652870178222656, "learning_rate": 0.0001, "loss": 1.7284, "loss/crossentropy": 2.7042866945266724, "loss/fcd": 1.5, "loss/idx": 6.5, "loss/logits": 0.22837670892477036, "step": 1221 }, { "epoch": 0.018247112491507327, "grad_norm": 0.34765625, "grad_norm_var": 0.0154205322265625, "learning_rate": 0.0001, "loss": 1.6607, "loss/crossentropy": 2.6267004013061523, "loss/fcd": 1.44140625, "loss/idx": 6.5, "loss/logits": 0.21924810856580734, "step": 1222 }, { "epoch": 0.01826204466212231, "grad_norm": 0.31640625, "grad_norm_var": 0.015372467041015626, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.599629044532776, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.17330200970172882, "step": 1223 }, { "epoch": 0.01827697683273729, "grad_norm": 0.369140625, "grad_norm_var": 0.015378046035766601, "learning_rate": 0.0001, "loss": 1.6224, "loss/crossentropy": 2.4942342042922974, "loss/fcd": 1.43359375, "loss/idx": 6.5, "loss/logits": 0.18882182240486145, "step": 1224 }, { "epoch": 0.018291909003352272, "grad_norm": 0.326171875, "grad_norm_var": 0.015446710586547851, "learning_rate": 0.0001, "loss": 1.5395, "loss/crossentropy": 2.627850890159607, "loss/fcd": 1.36328125, "loss/idx": 6.5, "loss/logits": 0.17617731541395187, "step": 1225 }, { "epoch": 0.018306841173967255, "grad_norm": 0.3359375, "grad_norm_var": 0.015227365493774413, "learning_rate": 0.0001, "loss": 1.6615, "loss/crossentropy": 2.448357105255127, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.2123221904039383, "step": 1226 }, { "epoch": 0.018321773344582234, "grad_norm": 0.333984375, "grad_norm_var": 0.015094502766927084, "learning_rate": 0.0001, "loss": 1.6281, "loss/crossentropy": 2.7975505590438843, "loss/fcd": 1.42578125, "loss/idx": 6.5, "loss/logits": 0.2022945061326027, "step": 1227 }, { "epoch": 0.018336705515197217, "grad_norm": 0.345703125, "grad_norm_var": 0.0008815606435139974, "learning_rate": 0.0001, "loss": 1.594, "loss/crossentropy": 2.6412123441696167, "loss/fcd": 1.3828125, "loss/idx": 6.5, "loss/logits": 0.21123310923576355, "step": 1228 }, { "epoch": 0.0183516376858122, "grad_norm": 0.3046875, "grad_norm_var": 0.0008949120839436849, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.7561702728271484, "loss/fcd": 1.3515625, "loss/idx": 6.5, "loss/logits": 0.1773766726255417, "step": 1229 }, { "epoch": 0.01836656985642718, "grad_norm": 0.328125, "grad_norm_var": 0.000698089599609375, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.3611074686050415, "loss/fcd": 1.41015625, "loss/idx": 6.5, "loss/logits": 0.19514558464288712, "step": 1230 }, { "epoch": 0.01838150202704216, "grad_norm": 0.306640625, "grad_norm_var": 0.0007249037424723307, "learning_rate": 0.0001, "loss": 1.6022, "loss/crossentropy": 2.4678847789764404, "loss/fcd": 1.421875, "loss/idx": 6.5, "loss/logits": 0.18031777441501617, "step": 1231 }, { "epoch": 0.018396434197657144, "grad_norm": 0.3203125, "grad_norm_var": 0.0007252375284830729, "learning_rate": 0.0001, "loss": 1.4916, "loss/crossentropy": 2.404633402824402, "loss/fcd": 1.328125, "loss/idx": 6.5, "loss/logits": 0.16343700140714645, "step": 1232 }, { "epoch": 0.018411366368272123, "grad_norm": 0.349609375, "grad_norm_var": 0.0005879084269205729, "learning_rate": 0.0001, "loss": 1.5751, "loss/crossentropy": 2.3156590461730957, "loss/fcd": 1.38671875, "loss/idx": 6.5, "loss/logits": 0.18834958225488663, "step": 1233 }, { "epoch": 0.018426298538887106, "grad_norm": 0.322265625, "grad_norm_var": 0.00044193267822265623, "learning_rate": 0.0001, "loss": 1.5267, "loss/crossentropy": 2.755373477935791, "loss/fcd": 1.34765625, "loss/idx": 6.5, "loss/logits": 0.17905279248952866, "step": 1234 }, { "epoch": 0.01844123070950209, "grad_norm": 0.302734375, "grad_norm_var": 0.00037225087483723957, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.919349431991577, "loss/fcd": 1.390625, "loss/idx": 6.5, "loss/logits": 0.21244197338819504, "step": 1235 }, { "epoch": 0.018456162880117068, "grad_norm": 0.376953125, "grad_norm_var": 0.0004729588826497396, "learning_rate": 0.0001, "loss": 1.6075, "loss/crossentropy": 2.5358701944351196, "loss/fcd": 1.4140625, "loss/idx": 6.5, "loss/logits": 0.1933974176645279, "step": 1236 }, { "epoch": 0.01847109505073205, "grad_norm": 0.328125, "grad_norm_var": 0.0004630883534749349, "learning_rate": 0.0001, "loss": 1.7203, "loss/crossentropy": 2.363653302192688, "loss/fcd": 1.5, "loss/idx": 6.5, "loss/logits": 0.22028225660324097, "step": 1237 }, { "epoch": 0.01848602722134703, "grad_norm": 0.326171875, "grad_norm_var": 0.0004475275675455729, "learning_rate": 0.0001, "loss": 1.4636, "loss/crossentropy": 2.452838659286499, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.17060783505439758, "step": 1238 }, { "epoch": 0.018500959391962012, "grad_norm": 0.298828125, "grad_norm_var": 0.0005005995432535807, "learning_rate": 0.0001, "loss": 1.5562, "loss/crossentropy": 2.486661434173584, "loss/fcd": 1.375, "loss/idx": 6.5, "loss/logits": 0.18124858289957047, "step": 1239 }, { "epoch": 0.018515891562576995, "grad_norm": 0.298828125, "grad_norm_var": 0.0004399458567301432, "learning_rate": 0.0001, "loss": 1.4547, "loss/crossentropy": 2.6643694639205933, "loss/fcd": 1.28125, "loss/idx": 6.5, "loss/logits": 0.17340320348739624, "step": 1240 }, { "epoch": 0.018530823733191974, "grad_norm": 0.27734375, "grad_norm_var": 0.0005833943684895833, "learning_rate": 0.0001, "loss": 1.4081, "loss/crossentropy": 2.4870904684066772, "loss/fcd": 1.2578125, "loss/idx": 6.5, "loss/logits": 0.1503349393606186, "step": 1241 }, { "epoch": 0.018545755903806957, "grad_norm": 0.326171875, "grad_norm_var": 0.0005715529123942058, "learning_rate": 0.0001, "loss": 1.5116, "loss/crossentropy": 2.79026997089386, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.1795227974653244, "step": 1242 }, { "epoch": 0.01856068807442194, "grad_norm": 0.3515625, "grad_norm_var": 0.0006197611490885417, "learning_rate": 0.0001, "loss": 1.6242, "loss/crossentropy": 2.626877784729004, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.1945071667432785, "step": 1243 }, { "epoch": 0.01857562024503692, "grad_norm": 0.34765625, "grad_norm_var": 0.0006259759267171224, "learning_rate": 0.0001, "loss": 1.8053, "loss/crossentropy": 2.413367986679077, "loss/fcd": 1.546875, "loss/idx": 6.5, "loss/logits": 0.2584308609366417, "step": 1244 }, { "epoch": 0.0185905524156519, "grad_norm": 0.30078125, "grad_norm_var": 0.0006364027659098308, "learning_rate": 0.0001, "loss": 1.6274, "loss/crossentropy": 2.557702898979187, "loss/fcd": 1.4296875, "loss/idx": 6.5, "loss/logits": 0.19771048426628113, "step": 1245 }, { "epoch": 0.018605484586266884, "grad_norm": 0.3125, "grad_norm_var": 0.0006402174631754558, "learning_rate": 0.0001, "loss": 1.7298, "loss/crossentropy": 2.6046078205108643, "loss/fcd": 1.5078125, "loss/idx": 6.5, "loss/logits": 0.2219381481409073, "step": 1246 }, { "epoch": 0.018620416756881863, "grad_norm": 0.431640625, "grad_norm_var": 0.0013665358225504558, "learning_rate": 0.0001, "loss": 1.915, "loss/crossentropy": 2.4264408349990845, "loss/fcd": 1.66796875, "loss/idx": 6.5, "loss/logits": 0.24700388312339783, "step": 1247 }, { "epoch": 0.018635348927496846, "grad_norm": 0.359375, "grad_norm_var": 0.0014142195383707683, "learning_rate": 0.0001, "loss": 1.6276, "loss/crossentropy": 2.7338656187057495, "loss/fcd": 1.4375, "loss/idx": 6.5, "loss/logits": 0.19006990641355515, "step": 1248 }, { "epoch": 0.018650281098111825, "grad_norm": 0.296875, "grad_norm_var": 0.0014635721842447917, "learning_rate": 0.0001, "loss": 1.4963, "loss/crossentropy": 2.4242324829101562, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.1799371838569641, "step": 1249 }, { "epoch": 0.018665213268726808, "grad_norm": 0.3828125, "grad_norm_var": 0.0016414483388264975, "learning_rate": 0.0001, "loss": 1.6714, "loss/crossentropy": 2.492332339286804, "loss/fcd": 1.45703125, "loss/idx": 6.5, "loss/logits": 0.2144075110554695, "step": 1250 }, { "epoch": 0.01868014543934179, "grad_norm": 0.306640625, "grad_norm_var": 0.0016269524892171224, "learning_rate": 0.0001, "loss": 1.4011, "loss/crossentropy": 2.542190194129944, "loss/fcd": 1.25, "loss/idx": 6.5, "loss/logits": 0.15106689184904099, "step": 1251 }, { "epoch": 0.01869507760995677, "grad_norm": 0.326171875, "grad_norm_var": 0.0014880975087483725, "learning_rate": 0.0001, "loss": 1.5506, "loss/crossentropy": 2.5000863075256348, "loss/fcd": 1.37109375, "loss/idx": 6.5, "loss/logits": 0.17947331815958023, "step": 1252 }, { "epoch": 0.018710009780571753, "grad_norm": 0.3203125, "grad_norm_var": 0.0014933109283447265, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.552633047103882, "loss/fcd": 1.30078125, "loss/idx": 6.5, "loss/logits": 0.18402951210737228, "step": 1253 }, { "epoch": 0.018724941951186735, "grad_norm": 0.7578125, "grad_norm_var": 0.012976328531901041, "learning_rate": 0.0001, "loss": 1.7571, "loss/crossentropy": 2.593714118003845, "loss/fcd": 1.5078125, "loss/idx": 6.5, "loss/logits": 0.24928182363510132, "step": 1254 }, { "epoch": 0.018739874121801715, "grad_norm": 0.283203125, "grad_norm_var": 0.013110605875651042, "learning_rate": 0.0001, "loss": 1.5159, "loss/crossentropy": 2.672044038772583, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.1838703230023384, "step": 1255 }, { "epoch": 0.018754806292416697, "grad_norm": 0.3359375, "grad_norm_var": 0.012918837865193685, "learning_rate": 0.0001, "loss": 1.658, "loss/crossentropy": 2.4547826051712036, "loss/fcd": 1.4453125, "loss/idx": 6.5, "loss/logits": 0.21269508451223373, "step": 1256 }, { "epoch": 0.01876973846303168, "grad_norm": 0.310546875, "grad_norm_var": 0.012633768717447917, "learning_rate": 0.0001, "loss": 1.5041, "loss/crossentropy": 2.6580370664596558, "loss/fcd": 1.3359375, "loss/idx": 6.5, "loss/logits": 0.1681847870349884, "step": 1257 }, { "epoch": 0.01878467063364666, "grad_norm": 0.333984375, "grad_norm_var": 0.012602996826171876, "learning_rate": 0.0001, "loss": 1.5575, "loss/crossentropy": 2.8848483562469482, "loss/fcd": 1.3671875, "loss/idx": 6.5, "loss/logits": 0.19028093665838242, "step": 1258 }, { "epoch": 0.018799602804261642, "grad_norm": 0.3828125, "grad_norm_var": 0.012629445393880208, "learning_rate": 0.0001, "loss": 1.678, "loss/crossentropy": 3.0120307207107544, "loss/fcd": 1.4765625, "loss/idx": 6.5, "loss/logits": 0.20148076117038727, "step": 1259 }, { "epoch": 0.01881453497487662, "grad_norm": 0.298828125, "grad_norm_var": 0.01287064552307129, "learning_rate": 0.0001, "loss": 1.477, "loss/crossentropy": 2.5219684839248657, "loss/fcd": 1.3203125, "loss/idx": 6.5, "loss/logits": 0.15666230767965317, "step": 1260 }, { "epoch": 0.018829467145491604, "grad_norm": 0.408203125, "grad_norm_var": 0.012761370340983073, "learning_rate": 0.0001, "loss": 1.6376, "loss/crossentropy": 2.52824604511261, "loss/fcd": 1.44921875, "loss/idx": 6.5, "loss/logits": 0.1883331537246704, "step": 1261 }, { "epoch": 0.018844399316106587, "grad_norm": 0.326171875, "grad_norm_var": 0.012676477432250977, "learning_rate": 0.0001, "loss": 1.5013, "loss/crossentropy": 2.635972738265991, "loss/fcd": 1.3203125, "loss/idx": 6.5, "loss/logits": 0.1809828281402588, "step": 1262 }, { "epoch": 0.018859331486721566, "grad_norm": 0.359375, "grad_norm_var": 0.012373606363932291, "learning_rate": 0.0001, "loss": 1.5878, "loss/crossentropy": 2.927635669708252, "loss/fcd": 1.38671875, "loss/idx": 6.5, "loss/logits": 0.20111427456140518, "step": 1263 }, { "epoch": 0.01887426365733655, "grad_norm": 0.3671875, "grad_norm_var": 0.0123748779296875, "learning_rate": 0.0001, "loss": 1.5985, "loss/crossentropy": 2.4248982667922974, "loss/fcd": 1.41015625, "loss/idx": 6.5, "loss/logits": 0.18835779279470444, "step": 1264 }, { "epoch": 0.01888919582795153, "grad_norm": 0.333984375, "grad_norm_var": 0.012137206395467122, "learning_rate": 0.0001, "loss": 1.4965, "loss/crossentropy": 2.5499627590179443, "loss/fcd": 1.328125, "loss/idx": 6.5, "loss/logits": 0.16840286552906036, "step": 1265 }, { "epoch": 0.01890412799856651, "grad_norm": 0.33984375, "grad_norm_var": 0.012148396174112955, "learning_rate": 0.0001, "loss": 1.6923, "loss/crossentropy": 2.817864418029785, "loss/fcd": 1.48046875, "loss/idx": 6.5, "loss/logits": 0.21178434789180756, "step": 1266 }, { "epoch": 0.018919060169181493, "grad_norm": 0.337890625, "grad_norm_var": 0.011979023615519205, "learning_rate": 0.0001, "loss": 1.5916, "loss/crossentropy": 2.808348774909973, "loss/fcd": 1.40234375, "loss/idx": 6.5, "loss/logits": 0.189209446310997, "step": 1267 }, { "epoch": 0.018933992339796476, "grad_norm": 0.27734375, "grad_norm_var": 0.012373606363932291, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.685523271560669, "loss/fcd": 1.23828125, "loss/idx": 6.5, "loss/logits": 0.16152724623680115, "step": 1268 }, { "epoch": 0.018948924510411455, "grad_norm": 0.28125, "grad_norm_var": 0.0126800537109375, "learning_rate": 0.0001, "loss": 1.4709, "loss/crossentropy": 2.490466594696045, "loss/fcd": 1.296875, "loss/idx": 6.5, "loss/logits": 0.17405615001916885, "step": 1269 }, { "epoch": 0.018963856681026438, "grad_norm": 0.31640625, "grad_norm_var": 0.0013503392537434896, "learning_rate": 0.0001, "loss": 1.5713, "loss/crossentropy": 2.6393624544143677, "loss/fcd": 1.3671875, "loss/idx": 6.5, "loss/logits": 0.20406648516654968, "step": 1270 }, { "epoch": 0.01897878885164142, "grad_norm": 0.30078125, "grad_norm_var": 0.001258071263631185, "learning_rate": 0.0001, "loss": 1.4812, "loss/crossentropy": 2.5566498041152954, "loss/fcd": 1.30859375, "loss/idx": 6.5, "loss/logits": 0.172604039311409, "step": 1271 }, { "epoch": 0.0189937210222564, "grad_norm": 0.369140625, "grad_norm_var": 0.0013448079427083334, "learning_rate": 0.0001, "loss": 1.6218, "loss/crossentropy": 2.7665399312973022, "loss/fcd": 1.421875, "loss/idx": 6.5, "loss/logits": 0.1998923420906067, "step": 1272 }, { "epoch": 0.019008653192871382, "grad_norm": 0.359375, "grad_norm_var": 0.00134123166402181, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.761102795600891, "loss/fcd": 1.3046875, "loss/idx": 6.5, "loss/logits": 0.1654694825410843, "step": 1273 }, { "epoch": 0.01902358536348636, "grad_norm": 0.404296875, "grad_norm_var": 0.0016216119130452475, "learning_rate": 0.0001, "loss": 1.776, "loss/crossentropy": 2.5646800994873047, "loss/fcd": 1.55078125, "loss/idx": 6.5, "loss/logits": 0.22524471580982208, "step": 1274 }, { "epoch": 0.019038517534101344, "grad_norm": 0.2890625, "grad_norm_var": 0.0016536553700764974, "learning_rate": 0.0001, "loss": 1.5461, "loss/crossentropy": 2.4658591747283936, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.19066184759140015, "step": 1275 }, { "epoch": 0.019053449704716327, "grad_norm": 0.333984375, "grad_norm_var": 0.001558669408162435, "learning_rate": 0.0001, "loss": 1.6102, "loss/crossentropy": 2.6678355932235718, "loss/fcd": 1.4140625, "loss/idx": 6.5, "loss/logits": 0.19614477455615997, "step": 1276 }, { "epoch": 0.019068381875331306, "grad_norm": 0.30859375, "grad_norm_var": 0.0012433369954427083, "learning_rate": 0.0001, "loss": 1.6021, "loss/crossentropy": 2.784413456916809, "loss/fcd": 1.3984375, "loss/idx": 6.5, "loss/logits": 0.2036561220884323, "step": 1277 }, { "epoch": 0.01908331404594629, "grad_norm": 0.341796875, "grad_norm_var": 0.001247406005859375, "learning_rate": 0.0001, "loss": 1.6353, "loss/crossentropy": 2.580705761909485, "loss/fcd": 1.4453125, "loss/idx": 6.5, "loss/logits": 0.18997424840927124, "step": 1278 }, { "epoch": 0.01909824621656127, "grad_norm": 0.32421875, "grad_norm_var": 0.0011987686157226562, "learning_rate": 0.0001, "loss": 1.3843, "loss/crossentropy": 2.869256019592285, "loss/fcd": 1.234375, "loss/idx": 6.5, "loss/logits": 0.14993099868297577, "step": 1279 }, { "epoch": 0.01911317838717625, "grad_norm": 0.3515625, "grad_norm_var": 0.001137224833170573, "learning_rate": 0.0001, "loss": 1.6721, "loss/crossentropy": 2.6417490243911743, "loss/fcd": 1.46875, "loss/idx": 6.5, "loss/logits": 0.20330028980970383, "step": 1280 }, { "epoch": 0.019128110557791234, "grad_norm": 0.33984375, "grad_norm_var": 0.0011429945627848307, "learning_rate": 0.0001, "loss": 1.7471, "loss/crossentropy": 2.4359676837921143, "loss/fcd": 1.52734375, "loss/idx": 6.5, "loss/logits": 0.2198041006922722, "step": 1281 }, { "epoch": 0.019143042728406216, "grad_norm": 0.310546875, "grad_norm_var": 0.001157061258951823, "learning_rate": 0.0001, "loss": 1.4947, "loss/crossentropy": 2.415672779083252, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.1626281514763832, "step": 1282 }, { "epoch": 0.019157974899021196, "grad_norm": 0.322265625, "grad_norm_var": 0.0011514663696289063, "learning_rate": 0.0001, "loss": 1.607, "loss/crossentropy": 2.599458336830139, "loss/fcd": 1.40234375, "loss/idx": 6.5, "loss/logits": 0.20462485402822495, "step": 1283 }, { "epoch": 0.019172907069636178, "grad_norm": 0.33984375, "grad_norm_var": 0.0009826024373372395, "learning_rate": 0.0001, "loss": 1.5397, "loss/crossentropy": 2.6432963609695435, "loss/fcd": 1.36328125, "loss/idx": 6.5, "loss/logits": 0.1764075607061386, "step": 1284 }, { "epoch": 0.019187839240251157, "grad_norm": 0.310546875, "grad_norm_var": 0.0008426507314046224, "learning_rate": 0.0001, "loss": 1.5456, "loss/crossentropy": 2.562265396118164, "loss/fcd": 1.35546875, "loss/idx": 6.5, "loss/logits": 0.1901203915476799, "step": 1285 }, { "epoch": 0.01920277141086614, "grad_norm": 0.32421875, "grad_norm_var": 0.0008295536041259766, "learning_rate": 0.0001, "loss": 1.6828, "loss/crossentropy": 2.4806153774261475, "loss/fcd": 1.4765625, "loss/idx": 6.5, "loss/logits": 0.20623362064361572, "step": 1286 }, { "epoch": 0.019217703581481123, "grad_norm": 0.30078125, "grad_norm_var": 0.0008295536041259766, "learning_rate": 0.0001, "loss": 1.4943, "loss/crossentropy": 2.600746989250183, "loss/fcd": 1.31640625, "loss/idx": 6.5, "loss/logits": 0.1778869926929474, "step": 1287 }, { "epoch": 0.019232635752096102, "grad_norm": 0.306640625, "grad_norm_var": 0.0007736047108968098, "learning_rate": 0.0001, "loss": 1.6329, "loss/crossentropy": 2.6411110162734985, "loss/fcd": 1.421875, "loss/idx": 6.5, "loss/logits": 0.21099568903446198, "step": 1288 }, { "epoch": 0.019247567922711085, "grad_norm": 0.259765625, "grad_norm_var": 0.0009932835896809896, "learning_rate": 0.0001, "loss": 1.4852, "loss/crossentropy": 2.5829832553863525, "loss/fcd": 1.30859375, "loss/idx": 6.5, "loss/logits": 0.17659380286931992, "step": 1289 }, { "epoch": 0.019262500093326067, "grad_norm": 0.314453125, "grad_norm_var": 0.0005238850911458334, "learning_rate": 0.0001, "loss": 1.5055, "loss/crossentropy": 2.631233811378479, "loss/fcd": 1.3203125, "loss/idx": 6.5, "loss/logits": 0.185147225856781, "step": 1290 }, { "epoch": 0.019277432263941047, "grad_norm": 0.2890625, "grad_norm_var": 0.0005238850911458334, "learning_rate": 0.0001, "loss": 1.4458, "loss/crossentropy": 2.445384979248047, "loss/fcd": 1.2890625, "loss/idx": 6.5, "loss/logits": 0.15676811337471008, "step": 1291 }, { "epoch": 0.01929236443455603, "grad_norm": 0.341796875, "grad_norm_var": 0.0005449930826822917, "learning_rate": 0.0001, "loss": 1.6089, "loss/crossentropy": 2.795995831489563, "loss/fcd": 1.41796875, "loss/idx": 6.5, "loss/logits": 0.190910205245018, "step": 1292 }, { "epoch": 0.019307296605171012, "grad_norm": 0.3203125, "grad_norm_var": 0.0005390803019205729, "learning_rate": 0.0001, "loss": 1.7172, "loss/crossentropy": 2.6674411296844482, "loss/fcd": 1.48828125, "loss/idx": 6.5, "loss/logits": 0.22890527546405792, "step": 1293 }, { "epoch": 0.01932222877578599, "grad_norm": 0.310546875, "grad_norm_var": 0.0005034764607747395, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.369943857192993, "loss/fcd": 1.33203125, "loss/idx": 6.5, "loss/logits": 0.18096201121807098, "step": 1294 }, { "epoch": 0.019337160946400974, "grad_norm": 0.3203125, "grad_norm_var": 0.00050048828125, "learning_rate": 0.0001, "loss": 1.4702, "loss/crossentropy": 2.60583758354187, "loss/fcd": 1.30078125, "loss/idx": 6.5, "loss/logits": 0.1694255843758583, "step": 1295 }, { "epoch": 0.019352093117015957, "grad_norm": 0.361328125, "grad_norm_var": 0.000552225112915039, "learning_rate": 0.0001, "loss": 1.4583, "loss/crossentropy": 2.734460473060608, "loss/fcd": 1.29296875, "loss/idx": 6.5, "loss/logits": 0.16532278805971146, "step": 1296 }, { "epoch": 0.019367025287630936, "grad_norm": 0.60546875, "grad_norm_var": 0.005770476659138998, "learning_rate": 0.0001, "loss": 1.6362, "loss/crossentropy": 2.6057682037353516, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1831134930253029, "step": 1297 }, { "epoch": 0.01938195745824592, "grad_norm": 0.86328125, "grad_norm_var": 0.023164876302083335, "learning_rate": 0.0001, "loss": 1.7227, "loss/crossentropy": 2.6192381381988525, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.19534087181091309, "step": 1298 }, { "epoch": 0.019396889628860898, "grad_norm": 0.7734375, "grad_norm_var": 0.033126052220662436, "learning_rate": 0.0001, "loss": 1.8283, "loss/crossentropy": 2.6694098711013794, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.21504506468772888, "step": 1299 }, { "epoch": 0.01941182179947588, "grad_norm": 0.67578125, "grad_norm_var": 0.03764786720275879, "learning_rate": 0.0001, "loss": 1.842, "loss/crossentropy": 2.5446014404296875, "loss/fcd": 1.62109375, "loss/idx": 7.0, "loss/logits": 0.22087354958057404, "step": 1300 }, { "epoch": 0.019426753970090863, "grad_norm": 0.7734375, "grad_norm_var": 0.04444732666015625, "learning_rate": 0.0001, "loss": 1.7877, "loss/crossentropy": 2.3532973527908325, "loss/fcd": 1.609375, "loss/idx": 7.0, "loss/logits": 0.17832990735769272, "step": 1301 }, { "epoch": 0.019441686140705843, "grad_norm": 0.6171875, "grad_norm_var": 0.04504337310791016, "learning_rate": 0.0001, "loss": 1.6963, "loss/crossentropy": 2.371212661266327, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.17282748222351074, "step": 1302 }, { "epoch": 0.019456618311320825, "grad_norm": 0.609375, "grad_norm_var": 0.04425481160481771, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.8181079626083374, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.17750699073076248, "step": 1303 }, { "epoch": 0.019471550481935808, "grad_norm": 0.6328125, "grad_norm_var": 0.043195708592732744, "learning_rate": 0.0001, "loss": 1.6662, "loss/crossentropy": 2.6716779470443726, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.17791379988193512, "step": 1304 }, { "epoch": 0.019486482652550787, "grad_norm": 0.55859375, "grad_norm_var": 0.03903477986653646, "learning_rate": 0.0001, "loss": 2.0067, "loss/crossentropy": 2.527832865715027, "loss/fcd": 1.73828125, "loss/idx": 7.0, "loss/logits": 0.26841960847377777, "step": 1305 }, { "epoch": 0.01950141482316577, "grad_norm": 0.56640625, "grad_norm_var": 0.0359981377919515, "learning_rate": 0.0001, "loss": 1.6607, "loss/crossentropy": 2.5668392181396484, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.18021684139966965, "step": 1306 }, { "epoch": 0.019516346993780753, "grad_norm": 0.58203125, "grad_norm_var": 0.03161123593648275, "learning_rate": 0.0001, "loss": 1.8421, "loss/crossentropy": 2.521816849708557, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.21707086265087128, "step": 1307 }, { "epoch": 0.019531279164395732, "grad_norm": 0.4921875, "grad_norm_var": 0.02870941162109375, "learning_rate": 0.0001, "loss": 1.5868, "loss/crossentropy": 2.608349084854126, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.172712080180645, "step": 1308 }, { "epoch": 0.019546211335010714, "grad_norm": 0.60546875, "grad_norm_var": 0.024434852600097656, "learning_rate": 0.0001, "loss": 1.9755, "loss/crossentropy": 2.6307021379470825, "loss/fcd": 1.71875, "loss/idx": 7.0, "loss/logits": 0.2567453756928444, "step": 1309 }, { "epoch": 0.019561143505625694, "grad_norm": 0.61328125, "grad_norm_var": 0.0191158135732015, "learning_rate": 0.0001, "loss": 1.7592, "loss/crossentropy": 2.666864514350891, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.2044796496629715, "step": 1310 }, { "epoch": 0.019576075676240676, "grad_norm": 0.498046875, "grad_norm_var": 0.01438751220703125, "learning_rate": 0.0001, "loss": 1.5998, "loss/crossentropy": 2.6031850576400757, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.1700783297419548, "step": 1311 }, { "epoch": 0.01959100784685566, "grad_norm": 0.65234375, "grad_norm_var": 0.009866444269816081, "learning_rate": 0.0001, "loss": 1.7639, "loss/crossentropy": 2.525311231613159, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.1935998499393463, "step": 1312 }, { "epoch": 0.01960594001747064, "grad_norm": 0.53515625, "grad_norm_var": 0.010428349177042643, "learning_rate": 0.0001, "loss": 1.6331, "loss/crossentropy": 2.5671015977859497, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17994633316993713, "step": 1313 }, { "epoch": 0.01962087218808562, "grad_norm": 0.439453125, "grad_norm_var": 0.008362325032552083, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.5429428815841675, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1670430600643158, "step": 1314 }, { "epoch": 0.019635804358700604, "grad_norm": 0.56640625, "grad_norm_var": 0.006296730041503907, "learning_rate": 0.0001, "loss": 1.8112, "loss/crossentropy": 2.7133418321609497, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.22525541484355927, "step": 1315 }, { "epoch": 0.019650736529315583, "grad_norm": 0.62890625, "grad_norm_var": 0.005889320373535156, "learning_rate": 0.0001, "loss": 1.7866, "loss/crossentropy": 2.3733800649642944, "loss/fcd": 1.58203125, "loss/idx": 7.0, "loss/logits": 0.2045341208577156, "step": 1316 }, { "epoch": 0.019665668699930566, "grad_norm": 0.61328125, "grad_norm_var": 0.0034833272298177083, "learning_rate": 0.0001, "loss": 1.9862, "loss/crossentropy": 2.2801279425621033, "loss/fcd": 1.75390625, "loss/idx": 7.0, "loss/logits": 0.23227345198392868, "step": 1317 }, { "epoch": 0.01968060087054555, "grad_norm": 0.44921875, "grad_norm_var": 0.004317156473795573, "learning_rate": 0.0001, "loss": 1.7337, "loss/crossentropy": 2.816790819168091, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.21028374135494232, "step": 1318 }, { "epoch": 0.019695533041160528, "grad_norm": 0.66015625, "grad_norm_var": 0.00477752685546875, "learning_rate": 0.0001, "loss": 1.9427, "loss/crossentropy": 2.5892642736434937, "loss/fcd": 1.71875, "loss/idx": 7.0, "loss/logits": 0.22399146854877472, "step": 1319 }, { "epoch": 0.01971046521177551, "grad_norm": 0.45703125, "grad_norm_var": 0.005198097229003907, "learning_rate": 0.0001, "loss": 1.6056, "loss/crossentropy": 2.3007309436798096, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.1798677295446396, "step": 1320 }, { "epoch": 0.01972539738239049, "grad_norm": 0.45703125, "grad_norm_var": 0.005826250712076823, "learning_rate": 0.0001, "loss": 1.6315, "loss/crossentropy": 2.612833857536316, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.18622201681137085, "step": 1321 }, { "epoch": 0.019740329553005472, "grad_norm": 0.546875, "grad_norm_var": 0.0058100382486979164, "learning_rate": 0.0001, "loss": 1.904, "loss/crossentropy": 2.3265438079833984, "loss/fcd": 1.6875, "loss/idx": 7.0, "loss/logits": 0.21648503094911575, "step": 1322 }, { "epoch": 0.019755261723620455, "grad_norm": 0.419921875, "grad_norm_var": 0.006755940119425456, "learning_rate": 0.0001, "loss": 1.7704, "loss/crossentropy": 2.4403117895126343, "loss/fcd": 1.546875, "loss/idx": 7.0, "loss/logits": 0.22352954745292664, "step": 1323 }, { "epoch": 0.019770193894235434, "grad_norm": 0.458984375, "grad_norm_var": 0.007035064697265625, "learning_rate": 0.0001, "loss": 1.6172, "loss/crossentropy": 2.571325421333313, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.17575164139270782, "step": 1324 }, { "epoch": 0.019785126064850417, "grad_norm": 0.423828125, "grad_norm_var": 0.007453393936157226, "learning_rate": 0.0001, "loss": 1.8181, "loss/crossentropy": 2.5124125480651855, "loss/fcd": 1.58203125, "loss/idx": 7.0, "loss/logits": 0.23606669902801514, "step": 1325 }, { "epoch": 0.0198000582354654, "grad_norm": 0.4765625, "grad_norm_var": 0.007035048802693685, "learning_rate": 0.0001, "loss": 1.6305, "loss/crossentropy": 2.6320416927337646, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.18130720406770706, "step": 1326 }, { "epoch": 0.01981499040608038, "grad_norm": 0.470703125, "grad_norm_var": 0.007153431574503581, "learning_rate": 0.0001, "loss": 1.7774, "loss/crossentropy": 2.6659634113311768, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.2227148711681366, "step": 1327 }, { "epoch": 0.01982992257669536, "grad_norm": 0.54296875, "grad_norm_var": 0.005912637710571289, "learning_rate": 0.0001, "loss": 1.828, "loss/crossentropy": 2.837485194206238, "loss/fcd": 1.59765625, "loss/idx": 7.0, "loss/logits": 0.23030738532543182, "step": 1328 }, { "epoch": 0.019844854747310344, "grad_norm": 0.453125, "grad_norm_var": 0.006048822402954101, "learning_rate": 0.0001, "loss": 1.6038, "loss/crossentropy": 2.6289491653442383, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18195120990276337, "step": 1329 }, { "epoch": 0.019859786917925323, "grad_norm": 0.455078125, "grad_norm_var": 0.00592954953511556, "learning_rate": 0.0001, "loss": 1.6263, "loss/crossentropy": 2.5986313819885254, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.18099741637706757, "step": 1330 }, { "epoch": 0.019874719088540306, "grad_norm": 0.4453125, "grad_norm_var": 0.005854654312133789, "learning_rate": 0.0001, "loss": 1.6214, "loss/crossentropy": 2.4408994913101196, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.17999741435050964, "step": 1331 }, { "epoch": 0.01988965125915529, "grad_norm": 0.4609375, "grad_norm_var": 0.004673624038696289, "learning_rate": 0.0001, "loss": 1.5941, "loss/crossentropy": 2.6180654764175415, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.18002255260944366, "step": 1332 }, { "epoch": 0.019904583429770268, "grad_norm": 0.423828125, "grad_norm_var": 0.003725433349609375, "learning_rate": 0.0001, "loss": 1.5672, "loss/crossentropy": 2.5988270044326782, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.17657187581062317, "step": 1333 }, { "epoch": 0.01991951560038525, "grad_norm": 0.5703125, "grad_norm_var": 0.004224077860514323, "learning_rate": 0.0001, "loss": 1.8835, "loss/crossentropy": 2.5439051389694214, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.2389221414923668, "step": 1334 }, { "epoch": 0.01993444777100023, "grad_norm": 0.5625, "grad_norm_var": 0.0025090535481770834, "learning_rate": 0.0001, "loss": 1.7545, "loss/crossentropy": 2.7638096809387207, "loss/fcd": 1.5390625, "loss/idx": 7.0, "loss/logits": 0.21539074927568436, "step": 1335 }, { "epoch": 0.019949379941615213, "grad_norm": 0.5, "grad_norm_var": 0.002512550354003906, "learning_rate": 0.0001, "loss": 1.5921, "loss/crossentropy": 2.695431113243103, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.1780625656247139, "step": 1336 }, { "epoch": 0.019964312112230195, "grad_norm": 0.47265625, "grad_norm_var": 0.002481524149576823, "learning_rate": 0.0001, "loss": 1.7354, "loss/crossentropy": 2.447667717933655, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.22754760086536407, "step": 1337 }, { "epoch": 0.019979244282845175, "grad_norm": 0.63671875, "grad_norm_var": 0.003784434000651042, "learning_rate": 0.0001, "loss": 1.9037, "loss/crossentropy": 2.5905754566192627, "loss/fcd": 1.66796875, "loss/idx": 7.0, "loss/logits": 0.2357194721698761, "step": 1338 }, { "epoch": 0.019994176453460157, "grad_norm": 0.453125, "grad_norm_var": 0.003561512629191081, "learning_rate": 0.0001, "loss": 1.7016, "loss/crossentropy": 2.56851863861084, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.18602270632982254, "step": 1339 }, { "epoch": 0.02000910862407514, "grad_norm": 0.5625, "grad_norm_var": 0.003831926981608073, "learning_rate": 0.0001, "loss": 1.9636, "loss/crossentropy": 2.431572914123535, "loss/fcd": 1.7265625, "loss/idx": 7.0, "loss/logits": 0.23702546209096909, "step": 1340 }, { "epoch": 0.02002404079469012, "grad_norm": 0.78515625, "grad_norm_var": 0.008592589696248373, "learning_rate": 0.0001, "loss": 2.0499, "loss/crossentropy": 2.5989317893981934, "loss/fcd": 1.765625, "loss/idx": 7.0, "loss/logits": 0.2842986583709717, "step": 1341 }, { "epoch": 0.020038972965305102, "grad_norm": 0.44921875, "grad_norm_var": 0.008786630630493165, "learning_rate": 0.0001, "loss": 1.7025, "loss/crossentropy": 2.804073929786682, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.21030206978321075, "step": 1342 }, { "epoch": 0.020053905135920085, "grad_norm": 0.5546875, "grad_norm_var": 0.008728535970052083, "learning_rate": 0.0001, "loss": 1.8545, "loss/crossentropy": 2.5097142457962036, "loss/fcd": 1.6328125, "loss/idx": 7.0, "loss/logits": 0.22171549499034882, "step": 1343 }, { "epoch": 0.020068837306535064, "grad_norm": 0.494140625, "grad_norm_var": 0.008731317520141602, "learning_rate": 0.0001, "loss": 1.8018, "loss/crossentropy": 2.5837600231170654, "loss/fcd": 1.5625, "loss/idx": 7.0, "loss/logits": 0.239334374666214, "step": 1344 }, { "epoch": 0.020083769477150046, "grad_norm": 0.44921875, "grad_norm_var": 0.008765776952107748, "learning_rate": 0.0001, "loss": 1.5829, "loss/crossentropy": 2.578310489654541, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1648903787136078, "step": 1345 }, { "epoch": 0.020098701647765026, "grad_norm": 0.44921875, "grad_norm_var": 0.00881646474202474, "learning_rate": 0.0001, "loss": 1.5699, "loss/crossentropy": 2.5372471809387207, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.1597445085644722, "step": 1346 }, { "epoch": 0.02011363381838001, "grad_norm": 0.44921875, "grad_norm_var": 0.008780161539713541, "learning_rate": 0.0001, "loss": 1.6982, "loss/crossentropy": 2.603432536125183, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.19433686882257462, "step": 1347 }, { "epoch": 0.02012856598899499, "grad_norm": 0.515625, "grad_norm_var": 0.008557637532552084, "learning_rate": 0.0001, "loss": 1.8823, "loss/crossentropy": 2.557613253593445, "loss/fcd": 1.6640625, "loss/idx": 7.0, "loss/logits": 0.21824145317077637, "step": 1348 }, { "epoch": 0.02014349815960997, "grad_norm": 0.4765625, "grad_norm_var": 0.008051665623982747, "learning_rate": 0.0001, "loss": 1.6971, "loss/crossentropy": 2.5176939964294434, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.18926101177930832, "step": 1349 }, { "epoch": 0.020158430330224953, "grad_norm": 0.482421875, "grad_norm_var": 0.007989438374837239, "learning_rate": 0.0001, "loss": 1.6574, "loss/crossentropy": 2.6994282007217407, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.19250737130641937, "step": 1350 }, { "epoch": 0.020173362500839936, "grad_norm": 0.447265625, "grad_norm_var": 0.00814042091369629, "learning_rate": 0.0001, "loss": 1.6293, "loss/crossentropy": 2.5995413064956665, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.18009082227945328, "step": 1351 }, { "epoch": 0.020188294671454915, "grad_norm": 0.51171875, "grad_norm_var": 0.008131647109985351, "learning_rate": 0.0001, "loss": 1.8459, "loss/crossentropy": 2.503365993499756, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.23260920494794846, "step": 1352 }, { "epoch": 0.020203226842069898, "grad_norm": 0.455078125, "grad_norm_var": 0.0082427978515625, "learning_rate": 0.0001, "loss": 1.5196, "loss/crossentropy": 2.564800977706909, "loss/fcd": 1.36328125, "loss/idx": 7.0, "loss/logits": 0.15636174380779266, "step": 1353 }, { "epoch": 0.02021815901268488, "grad_norm": 0.4921875, "grad_norm_var": 0.007120704650878907, "learning_rate": 0.0001, "loss": 1.7418, "loss/crossentropy": 2.3416577577590942, "loss/fcd": 1.546875, "loss/idx": 7.0, "loss/logits": 0.1948934942483902, "step": 1354 }, { "epoch": 0.02023309118329986, "grad_norm": 0.4765625, "grad_norm_var": 0.007003211975097656, "learning_rate": 0.0001, "loss": 1.6993, "loss/crossentropy": 2.5845792293548584, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.20708955079317093, "step": 1355 }, { "epoch": 0.020248023353914842, "grad_norm": 0.412109375, "grad_norm_var": 0.007227182388305664, "learning_rate": 0.0001, "loss": 1.6796, "loss/crossentropy": 2.7967952489852905, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.20696670562028885, "step": 1356 }, { "epoch": 0.020262955524529825, "grad_norm": 0.40234375, "grad_norm_var": 0.00151365598042806, "learning_rate": 0.0001, "loss": 1.7359, "loss/crossentropy": 2.4202345609664917, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.21634161472320557, "step": 1357 }, { "epoch": 0.020277887695144804, "grad_norm": 0.408203125, "grad_norm_var": 0.0017316182454427083, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.58681583404541, "loss/fcd": 1.3125, "loss/idx": 7.0, "loss/logits": 0.15658579766750336, "step": 1358 }, { "epoch": 0.020292819865759787, "grad_norm": 0.5546875, "grad_norm_var": 0.0017316182454427083, "learning_rate": 0.0001, "loss": 1.67, "loss/crossentropy": 2.6079635620117188, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.1777733862400055, "step": 1359 }, { "epoch": 0.020307752036374766, "grad_norm": 0.470703125, "grad_norm_var": 0.0016820271809895833, "learning_rate": 0.0001, "loss": 1.7681, "loss/crossentropy": 2.5722819566726685, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.20172813534736633, "step": 1360 }, { "epoch": 0.02032268420698975, "grad_norm": 0.490234375, "grad_norm_var": 0.0016963799794514974, "learning_rate": 0.0001, "loss": 1.7869, "loss/crossentropy": 2.726900815963745, "loss/fcd": 1.57421875, "loss/idx": 7.0, "loss/logits": 0.21265853196382523, "step": 1361 }, { "epoch": 0.02033761637760473, "grad_norm": 0.5078125, "grad_norm_var": 0.0017612298329671224, "learning_rate": 0.0001, "loss": 1.726, "loss/crossentropy": 2.5604859590530396, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.20253180712461472, "step": 1362 }, { "epoch": 0.02035254854821971, "grad_norm": 0.416015625, "grad_norm_var": 0.0019311904907226562, "learning_rate": 0.0001, "loss": 1.6416, "loss/crossentropy": 2.589616298675537, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.19624745845794678, "step": 1363 }, { "epoch": 0.020367480718834693, "grad_norm": 0.515625, "grad_norm_var": 0.0019311904907226562, "learning_rate": 0.0001, "loss": 1.9452, "loss/crossentropy": 2.620610475540161, "loss/fcd": 1.7109375, "loss/idx": 7.0, "loss/logits": 0.2342279702425003, "step": 1364 }, { "epoch": 0.020382412889449676, "grad_norm": 0.44921875, "grad_norm_var": 0.001953887939453125, "learning_rate": 0.0001, "loss": 1.7039, "loss/crossentropy": 2.894443392753601, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.18823960423469543, "step": 1365 }, { "epoch": 0.020397345060064655, "grad_norm": 0.431640625, "grad_norm_var": 0.002019182840983073, "learning_rate": 0.0001, "loss": 1.7526, "loss/crossentropy": 2.560238003730774, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.2174069955945015, "step": 1366 }, { "epoch": 0.020412277230679638, "grad_norm": 0.59375, "grad_norm_var": 0.003012196222941081, "learning_rate": 0.0001, "loss": 1.9253, "loss/crossentropy": 2.3431705236434937, "loss/fcd": 1.69140625, "loss/idx": 7.0, "loss/logits": 0.23390838503837585, "step": 1367 }, { "epoch": 0.02042720940129462, "grad_norm": 0.48046875, "grad_norm_var": 0.0029170831044514974, "learning_rate": 0.0001, "loss": 1.6624, "loss/crossentropy": 2.6611838340759277, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.18582233786582947, "step": 1368 }, { "epoch": 0.0204421415719096, "grad_norm": 0.515625, "grad_norm_var": 0.0030072530110677085, "learning_rate": 0.0001, "loss": 1.7996, "loss/crossentropy": 2.379249095916748, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.21370600908994675, "step": 1369 }, { "epoch": 0.020457073742524583, "grad_norm": 0.43359375, "grad_norm_var": 0.0030959447224934897, "learning_rate": 0.0001, "loss": 1.8179, "loss/crossentropy": 2.414939045906067, "loss/fcd": 1.59765625, "loss/idx": 7.0, "loss/logits": 0.22020775079727173, "step": 1370 }, { "epoch": 0.020472005913139562, "grad_norm": 0.41796875, "grad_norm_var": 0.0032780965169270835, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.5957616567611694, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.1795307919383049, "step": 1371 }, { "epoch": 0.020486938083754545, "grad_norm": 0.46875, "grad_norm_var": 0.0030508518218994142, "learning_rate": 0.0001, "loss": 1.5796, "loss/crossentropy": 2.551032066345215, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.16551107168197632, "step": 1372 }, { "epoch": 0.020501870254369527, "grad_norm": 0.58984375, "grad_norm_var": 0.003499460220336914, "learning_rate": 0.0001, "loss": 1.902, "loss/crossentropy": 2.4808170795440674, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.245725117623806, "step": 1373 }, { "epoch": 0.020516802424984507, "grad_norm": 0.5390625, "grad_norm_var": 0.0032470703125, "learning_rate": 0.0001, "loss": 1.6204, "loss/crossentropy": 2.8979157209396362, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.17895907908678055, "step": 1374 }, { "epoch": 0.02053173459559949, "grad_norm": 0.43359375, "grad_norm_var": 0.0031544367472330728, "learning_rate": 0.0001, "loss": 1.6163, "loss/crossentropy": 2.526008129119873, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.17878473550081253, "step": 1375 }, { "epoch": 0.020546666766214472, "grad_norm": 0.51953125, "grad_norm_var": 0.0032128492991129556, "learning_rate": 0.0001, "loss": 1.6617, "loss/crossentropy": 2.5934818983078003, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.20080163329839706, "step": 1376 }, { "epoch": 0.02056159893682945, "grad_norm": 0.439453125, "grad_norm_var": 0.0033566633860270183, "learning_rate": 0.0001, "loss": 1.7143, "loss/crossentropy": 2.480017900466919, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.1908370852470398, "step": 1377 }, { "epoch": 0.020576531107444434, "grad_norm": 0.5, "grad_norm_var": 0.003336191177368164, "learning_rate": 0.0001, "loss": 1.6938, "loss/crossentropy": 2.504163980484009, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.1859615296125412, "step": 1378 }, { "epoch": 0.020591463278059417, "grad_norm": 0.458984375, "grad_norm_var": 0.0030620416005452474, "learning_rate": 0.0001, "loss": 1.5957, "loss/crossentropy": 2.5576053857803345, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.17774324119091034, "step": 1379 }, { "epoch": 0.020606395448674396, "grad_norm": 0.400390625, "grad_norm_var": 0.0034474690755208334, "learning_rate": 0.0001, "loss": 1.5509, "loss/crossentropy": 2.699017286300659, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.16415952146053314, "step": 1380 }, { "epoch": 0.02062132761928938, "grad_norm": 0.396484375, "grad_norm_var": 0.0038341363271077473, "learning_rate": 0.0001, "loss": 1.6093, "loss/crossentropy": 2.818260431289673, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.19137796759605408, "step": 1381 }, { "epoch": 0.020636259789904358, "grad_norm": 0.408203125, "grad_norm_var": 0.004007705052693685, "learning_rate": 0.0001, "loss": 1.7416, "loss/crossentropy": 2.5644644498825073, "loss/fcd": 1.51171875, "loss/idx": 7.0, "loss/logits": 0.22985026240348816, "step": 1382 }, { "epoch": 0.02065119196051934, "grad_norm": 0.439453125, "grad_norm_var": 0.0030471165974934895, "learning_rate": 0.0001, "loss": 1.8247, "loss/crossentropy": 2.451001286506653, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.22313947975635529, "step": 1383 }, { "epoch": 0.020666124131134323, "grad_norm": 0.5, "grad_norm_var": 0.0031110127766927082, "learning_rate": 0.0001, "loss": 1.7051, "loss/crossentropy": 2.5355674028396606, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.2051396146416664, "step": 1384 }, { "epoch": 0.020681056301749302, "grad_norm": 0.458984375, "grad_norm_var": 0.0029390811920166015, "learning_rate": 0.0001, "loss": 1.6715, "loss/crossentropy": 2.6771148443222046, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.2105863317847252, "step": 1385 }, { "epoch": 0.020695988472364285, "grad_norm": 0.41796875, "grad_norm_var": 0.00301512082417806, "learning_rate": 0.0001, "loss": 1.6794, "loss/crossentropy": 2.6094021797180176, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.1910756528377533, "step": 1386 }, { "epoch": 0.020710920642979268, "grad_norm": 0.443359375, "grad_norm_var": 0.002907053629557292, "learning_rate": 0.0001, "loss": 1.7082, "loss/crossentropy": 2.6047022342681885, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.21599744260311127, "step": 1387 }, { "epoch": 0.020725852813594247, "grad_norm": 0.46875, "grad_norm_var": 0.002907053629557292, "learning_rate": 0.0001, "loss": 1.6647, "loss/crossentropy": 2.523642063140869, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.18419228494167328, "step": 1388 }, { "epoch": 0.02074078498420923, "grad_norm": 0.4453125, "grad_norm_var": 0.0017755508422851562, "learning_rate": 0.0001, "loss": 1.6581, "loss/crossentropy": 2.8733028173446655, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.18934316188097, "step": 1389 }, { "epoch": 0.020755717154824212, "grad_norm": 0.51953125, "grad_norm_var": 0.0015787760416666667, "learning_rate": 0.0001, "loss": 1.8048, "loss/crossentropy": 2.4046658277511597, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.20321223884820938, "step": 1390 }, { "epoch": 0.02077064932543919, "grad_norm": 0.455078125, "grad_norm_var": 0.001551675796508789, "learning_rate": 0.0001, "loss": 1.7306, "loss/crossentropy": 2.4122198820114136, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.2071341872215271, "step": 1391 }, { "epoch": 0.020785581496054174, "grad_norm": 0.431640625, "grad_norm_var": 0.0012720108032226562, "learning_rate": 0.0001, "loss": 1.5693, "loss/crossentropy": 2.4567726850509644, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.1747429072856903, "step": 1392 }, { "epoch": 0.020800513666669157, "grad_norm": 0.435546875, "grad_norm_var": 0.001277923583984375, "learning_rate": 0.0001, "loss": 1.5542, "loss/crossentropy": 2.69111704826355, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.1674729585647583, "step": 1393 }, { "epoch": 0.020815445837284136, "grad_norm": 0.44140625, "grad_norm_var": 0.0010919570922851562, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.527459144592285, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.19319240003824234, "step": 1394 }, { "epoch": 0.02083037800789912, "grad_norm": 0.4453125, "grad_norm_var": 0.0010782718658447266, "learning_rate": 0.0001, "loss": 1.6287, "loss/crossentropy": 2.6566779613494873, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.1950736939907074, "step": 1395 }, { "epoch": 0.020845310178514098, "grad_norm": 0.482421875, "grad_norm_var": 0.0010195255279541015, "learning_rate": 0.0001, "loss": 1.7712, "loss/crossentropy": 2.42236065864563, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.2125757485628128, "step": 1396 }, { "epoch": 0.02086024234912908, "grad_norm": 0.470703125, "grad_norm_var": 0.0008407433827718099, "learning_rate": 0.0001, "loss": 1.7156, "loss/crossentropy": 2.53904128074646, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.2156321182847023, "step": 1397 }, { "epoch": 0.020875174519744064, "grad_norm": 0.43359375, "grad_norm_var": 0.0007260640462239584, "learning_rate": 0.0001, "loss": 1.573, "loss/crossentropy": 2.6393632888793945, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.1746109426021576, "step": 1398 }, { "epoch": 0.020890106690359043, "grad_norm": 0.4375, "grad_norm_var": 0.0007304986317952474, "learning_rate": 0.0001, "loss": 1.6177, "loss/crossentropy": 2.65228533744812, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.18801546841859818, "step": 1399 }, { "epoch": 0.020905038860974025, "grad_norm": 0.388671875, "grad_norm_var": 0.0008437474568684896, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.6110557317733765, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.19048527628183365, "step": 1400 }, { "epoch": 0.020919971031589008, "grad_norm": 0.421875, "grad_norm_var": 0.0008778731028238932, "learning_rate": 0.0001, "loss": 1.5752, "loss/crossentropy": 2.5177189111709595, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.1728520169854164, "step": 1401 }, { "epoch": 0.020934903202203987, "grad_norm": 0.55078125, "grad_norm_var": 0.0014809767405192058, "learning_rate": 0.0001, "loss": 1.6969, "loss/crossentropy": 2.8006900548934937, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.1812676638364792, "step": 1402 }, { "epoch": 0.02094983537281897, "grad_norm": 0.546875, "grad_norm_var": 0.00199737548828125, "learning_rate": 0.0001, "loss": 1.6337, "loss/crossentropy": 2.6444613933563232, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.18452748656272888, "step": 1403 }, { "epoch": 0.020964767543433953, "grad_norm": 0.515625, "grad_norm_var": 0.00218353271484375, "learning_rate": 0.0001, "loss": 1.6804, "loss/crossentropy": 2.775398850440979, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.20384299010038376, "step": 1404 }, { "epoch": 0.020979699714048932, "grad_norm": 0.43359375, "grad_norm_var": 0.0022211074829101562, "learning_rate": 0.0001, "loss": 1.5523, "loss/crossentropy": 2.7846622467041016, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.1734142228960991, "step": 1405 }, { "epoch": 0.020994631884663915, "grad_norm": 0.416015625, "grad_norm_var": 0.0021124362945556642, "learning_rate": 0.0001, "loss": 1.4762, "loss/crossentropy": 2.86823308467865, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.15978942066431046, "step": 1406 }, { "epoch": 0.021009564055278894, "grad_norm": 0.51953125, "grad_norm_var": 0.0023584365844726562, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.6583101749420166, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18122605979442596, "step": 1407 }, { "epoch": 0.021024496225893877, "grad_norm": 0.5078125, "grad_norm_var": 0.002426004409790039, "learning_rate": 0.0001, "loss": 1.6574, "loss/crossentropy": 2.7387313842773438, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.19252792745828629, "step": 1408 }, { "epoch": 0.02103942839650886, "grad_norm": 0.416015625, "grad_norm_var": 0.002527729670206706, "learning_rate": 0.0001, "loss": 1.5176, "loss/crossentropy": 2.737845778465271, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.15820758044719696, "step": 1409 }, { "epoch": 0.02105436056712384, "grad_norm": 0.431640625, "grad_norm_var": 0.0025634129842122396, "learning_rate": 0.0001, "loss": 1.6793, "loss/crossentropy": 2.6955381631851196, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.2027088850736618, "step": 1410 }, { "epoch": 0.02106929273773882, "grad_norm": 0.41796875, "grad_norm_var": 0.0026769002278645834, "learning_rate": 0.0001, "loss": 1.6027, "loss/crossentropy": 2.7337403297424316, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.19255182147026062, "step": 1411 }, { "epoch": 0.021084224908353804, "grad_norm": 0.48828125, "grad_norm_var": 0.002695067723592122, "learning_rate": 0.0001, "loss": 1.7799, "loss/crossentropy": 2.895545721054077, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.221343994140625, "step": 1412 }, { "epoch": 0.021099157078968783, "grad_norm": 0.5625, "grad_norm_var": 0.0033248265584309897, "learning_rate": 0.0001, "loss": 1.8734, "loss/crossentropy": 2.3325068950653076, "loss/fcd": 1.65234375, "loss/idx": 7.0, "loss/logits": 0.22110049426555634, "step": 1413 }, { "epoch": 0.021114089249583766, "grad_norm": 0.50390625, "grad_norm_var": 0.0033110936482747396, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.468624234199524, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.1866515427827835, "step": 1414 }, { "epoch": 0.02112902142019875, "grad_norm": 0.43359375, "grad_norm_var": 0.003330230712890625, "learning_rate": 0.0001, "loss": 1.5433, "loss/crossentropy": 2.677880644798279, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.16050750017166138, "step": 1415 }, { "epoch": 0.021143953590813728, "grad_norm": 0.64453125, "grad_norm_var": 0.004573297500610351, "learning_rate": 0.0001, "loss": 2.297, "loss/crossentropy": 2.67445969581604, "loss/fcd": 2.01171875, "loss/idx": 7.0, "loss/logits": 0.28523435443639755, "step": 1416 }, { "epoch": 0.02115888576142871, "grad_norm": 0.52734375, "grad_norm_var": 0.004336404800415039, "learning_rate": 0.0001, "loss": 1.7373, "loss/crossentropy": 2.577675700187683, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.2216956913471222, "step": 1417 }, { "epoch": 0.021173817932043693, "grad_norm": 0.462890625, "grad_norm_var": 0.00416259765625, "learning_rate": 0.0001, "loss": 1.6366, "loss/crossentropy": 2.5695481300354004, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1835166960954666, "step": 1418 }, { "epoch": 0.021188750102658672, "grad_norm": 0.91015625, "grad_norm_var": 0.015201759338378907, "learning_rate": 0.0001, "loss": 1.6032, "loss/crossentropy": 2.8455076217651367, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.17739622294902802, "step": 1419 }, { "epoch": 0.021203682273273655, "grad_norm": 0.4921875, "grad_norm_var": 0.015224647521972657, "learning_rate": 0.0001, "loss": 1.7873, "loss/crossentropy": 2.5857309103012085, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.24437353014945984, "step": 1420 }, { "epoch": 0.021218614443888634, "grad_norm": 0.447265625, "grad_norm_var": 0.015096139907836915, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.4495460987091064, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.1749189794063568, "step": 1421 }, { "epoch": 0.021233546614503617, "grad_norm": 0.51953125, "grad_norm_var": 0.014450009663899739, "learning_rate": 0.0001, "loss": 1.7511, "loss/crossentropy": 2.408402919769287, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.21988791227340698, "step": 1422 }, { "epoch": 0.0212484787851186, "grad_norm": 0.55859375, "grad_norm_var": 0.014554278055826823, "learning_rate": 0.0001, "loss": 1.6551, "loss/crossentropy": 2.7392570972442627, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.19412636011838913, "step": 1423 }, { "epoch": 0.02126341095573358, "grad_norm": 0.455078125, "grad_norm_var": 0.014815632502237957, "learning_rate": 0.0001, "loss": 1.6443, "loss/crossentropy": 2.530651807785034, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.1716858074069023, "step": 1424 }, { "epoch": 0.02127834312634856, "grad_norm": 0.455078125, "grad_norm_var": 0.014385207494099935, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.560731530189514, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.16241511702537537, "step": 1425 }, { "epoch": 0.021293275296963544, "grad_norm": 0.470703125, "grad_norm_var": 0.014023447036743164, "learning_rate": 0.0001, "loss": 1.7572, "loss/crossentropy": 2.5136619806289673, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.2141972780227661, "step": 1426 }, { "epoch": 0.021308207467578524, "grad_norm": 0.51953125, "grad_norm_var": 0.01326139767964681, "learning_rate": 0.0001, "loss": 1.7078, "loss/crossentropy": 2.8287333250045776, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.20389527082443237, "step": 1427 }, { "epoch": 0.021323139638193506, "grad_norm": 0.494140625, "grad_norm_var": 0.01323235829671224, "learning_rate": 0.0001, "loss": 1.7222, "loss/crossentropy": 2.6399881839752197, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.20652509480714798, "step": 1428 }, { "epoch": 0.02133807180880849, "grad_norm": 0.451171875, "grad_norm_var": 0.013503249486287434, "learning_rate": 0.0001, "loss": 1.5779, "loss/crossentropy": 2.389400362968445, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17167968302965164, "step": 1429 }, { "epoch": 0.02135300397942347, "grad_norm": 0.443359375, "grad_norm_var": 0.013875261942545573, "learning_rate": 0.0001, "loss": 1.758, "loss/crossentropy": 2.4448758363723755, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.2306400090456009, "step": 1430 }, { "epoch": 0.02136793615003845, "grad_norm": 0.423828125, "grad_norm_var": 0.01399089495340983, "learning_rate": 0.0001, "loss": 1.6016, "loss/crossentropy": 2.542250633239746, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1797719970345497, "step": 1431 }, { "epoch": 0.02138286832065343, "grad_norm": 0.4609375, "grad_norm_var": 0.012980890274047852, "learning_rate": 0.0001, "loss": 1.788, "loss/crossentropy": 2.531215786933899, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.24498464167118073, "step": 1432 }, { "epoch": 0.021397800491268413, "grad_norm": 0.474609375, "grad_norm_var": 0.013002777099609375, "learning_rate": 0.0001, "loss": 1.6529, "loss/crossentropy": 2.677714705467224, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.18806182593107224, "step": 1433 }, { "epoch": 0.021412732661883396, "grad_norm": 0.56640625, "grad_norm_var": 0.013126611709594727, "learning_rate": 0.0001, "loss": 1.822, "loss/crossentropy": 2.5922250747680664, "loss/fcd": 1.59375, "loss/idx": 7.0, "loss/logits": 0.2282187044620514, "step": 1434 }, { "epoch": 0.021427664832498375, "grad_norm": 0.515625, "grad_norm_var": 0.0017478783925374349, "learning_rate": 0.0001, "loss": 1.7681, "loss/crossentropy": 2.8243170976638794, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.209464393556118, "step": 1435 }, { "epoch": 0.021442597003113358, "grad_norm": 0.5, "grad_norm_var": 0.001759958267211914, "learning_rate": 0.0001, "loss": 1.6335, "loss/crossentropy": 2.674209237098694, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1803731545805931, "step": 1436 }, { "epoch": 0.02145752917372834, "grad_norm": 0.466796875, "grad_norm_var": 0.0016862074534098306, "learning_rate": 0.0001, "loss": 1.6252, "loss/crossentropy": 2.531915068626404, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18767689168453217, "step": 1437 }, { "epoch": 0.02147246134434332, "grad_norm": 0.62109375, "grad_norm_var": 0.002785476048787435, "learning_rate": 0.0001, "loss": 1.7866, "loss/crossentropy": 2.2939316034317017, "loss/fcd": 1.6171875, "loss/idx": 7.0, "loss/logits": 0.16944652050733566, "step": 1438 }, { "epoch": 0.021487393514958302, "grad_norm": 0.474609375, "grad_norm_var": 0.0024840672810872394, "learning_rate": 0.0001, "loss": 1.7564, "loss/crossentropy": 2.6811925172805786, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.21343431621789932, "step": 1439 }, { "epoch": 0.021502325685573285, "grad_norm": 0.51953125, "grad_norm_var": 0.00246885617574056, "learning_rate": 0.0001, "loss": 1.8515, "loss/crossentropy": 2.4621083736419678, "loss/fcd": 1.62890625, "loss/idx": 7.0, "loss/logits": 0.2225768268108368, "step": 1440 }, { "epoch": 0.021517257856188264, "grad_norm": 0.6015625, "grad_norm_var": 0.0031066258748372396, "learning_rate": 0.0001, "loss": 2.0026, "loss/crossentropy": 2.472296357154846, "loss/fcd": 1.74609375, "loss/idx": 7.0, "loss/logits": 0.25650446116924286, "step": 1441 }, { "epoch": 0.021532190026803247, "grad_norm": 0.458984375, "grad_norm_var": 0.0031613667805989584, "learning_rate": 0.0001, "loss": 1.6266, "loss/crossentropy": 2.822437882423401, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18909186869859695, "step": 1442 }, { "epoch": 0.021547122197418226, "grad_norm": 0.50390625, "grad_norm_var": 0.003134918212890625, "learning_rate": 0.0001, "loss": 1.5838, "loss/crossentropy": 2.8150917291641235, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17753300070762634, "step": 1443 }, { "epoch": 0.02156205436803321, "grad_norm": 0.71875, "grad_norm_var": 0.006156396865844726, "learning_rate": 0.0001, "loss": 1.9223, "loss/crossentropy": 2.175217628479004, "loss/fcd": 1.69140625, "loss/idx": 7.0, "loss/logits": 0.2308632880449295, "step": 1444 }, { "epoch": 0.02157698653864819, "grad_norm": 0.412109375, "grad_norm_var": 0.006571563084920248, "learning_rate": 0.0001, "loss": 1.5694, "loss/crossentropy": 2.54789400100708, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.170965775847435, "step": 1445 }, { "epoch": 0.02159191870926317, "grad_norm": 0.4921875, "grad_norm_var": 0.006285858154296875, "learning_rate": 0.0001, "loss": 1.6893, "loss/crossentropy": 2.5268924236297607, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.18932264298200607, "step": 1446 }, { "epoch": 0.021606850879878153, "grad_norm": 0.447265625, "grad_norm_var": 0.00604095458984375, "learning_rate": 0.0001, "loss": 1.6026, "loss/crossentropy": 2.735979676246643, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18071038275957108, "step": 1447 }, { "epoch": 0.021621783050493136, "grad_norm": 0.609375, "grad_norm_var": 0.0063550313313802086, "learning_rate": 0.0001, "loss": 1.685, "loss/crossentropy": 2.3301628828048706, "loss/fcd": 1.51171875, "loss/idx": 7.0, "loss/logits": 0.1733197569847107, "step": 1448 }, { "epoch": 0.021636715221108115, "grad_norm": 0.45703125, "grad_norm_var": 0.0064899285634358725, "learning_rate": 0.0001, "loss": 1.5976, "loss/crossentropy": 2.946329712867737, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.18357960879802704, "step": 1449 }, { "epoch": 0.021651647391723098, "grad_norm": 0.546875, "grad_norm_var": 0.006400283177693685, "learning_rate": 0.0001, "loss": 1.8002, "loss/crossentropy": 2.6100656986236572, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.21426425874233246, "step": 1450 }, { "epoch": 0.02166657956233808, "grad_norm": 0.546875, "grad_norm_var": 0.006436395645141602, "learning_rate": 0.0001, "loss": 1.8881, "loss/crossentropy": 2.5633318424224854, "loss/fcd": 1.63671875, "loss/idx": 7.0, "loss/logits": 0.2513733506202698, "step": 1451 }, { "epoch": 0.02168151173295306, "grad_norm": 0.439453125, "grad_norm_var": 0.00685571034749349, "learning_rate": 0.0001, "loss": 1.6285, "loss/crossentropy": 2.5173341035842896, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.19103511422872543, "step": 1452 }, { "epoch": 0.021696443903568043, "grad_norm": 0.6171875, "grad_norm_var": 0.007206964492797852, "learning_rate": 0.0001, "loss": 2.1584, "loss/crossentropy": 2.5997310876846313, "loss/fcd": 1.87890625, "loss/idx": 7.0, "loss/logits": 0.27947692573070526, "step": 1453 }, { "epoch": 0.021711376074183025, "grad_norm": 0.48828125, "grad_norm_var": 0.006681680679321289, "learning_rate": 0.0001, "loss": 1.7712, "loss/crossentropy": 2.172030210494995, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.20478252321481705, "step": 1454 }, { "epoch": 0.021726308244798005, "grad_norm": 0.498046875, "grad_norm_var": 0.006571435928344726, "learning_rate": 0.0001, "loss": 1.7373, "loss/crossentropy": 2.5943312644958496, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.20607079565525055, "step": 1455 }, { "epoch": 0.021741240415412987, "grad_norm": 0.5390625, "grad_norm_var": 0.006587966283162435, "learning_rate": 0.0001, "loss": 1.7219, "loss/crossentropy": 2.6256524324417114, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.20627319812774658, "step": 1456 }, { "epoch": 0.021756172586027966, "grad_norm": 0.48828125, "grad_norm_var": 0.00621183713277181, "learning_rate": 0.0001, "loss": 1.7206, "loss/crossentropy": 2.4680287837982178, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.19329256564378738, "step": 1457 }, { "epoch": 0.02177110475664295, "grad_norm": 0.53125, "grad_norm_var": 0.005984242757161458, "learning_rate": 0.0001, "loss": 1.7452, "loss/crossentropy": 2.1940962076187134, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.20221325010061264, "step": 1458 }, { "epoch": 0.021786036927257932, "grad_norm": 0.546875, "grad_norm_var": 0.006001726786295573, "learning_rate": 0.0001, "loss": 1.8579, "loss/crossentropy": 2.3581950664520264, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.21334326267242432, "step": 1459 }, { "epoch": 0.02180096909787291, "grad_norm": 0.53125, "grad_norm_var": 0.0033222834269205728, "learning_rate": 0.0001, "loss": 1.5861, "loss/crossentropy": 2.622925043106079, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17985717952251434, "step": 1460 }, { "epoch": 0.021815901268487894, "grad_norm": 0.447265625, "grad_norm_var": 0.0029314676920572918, "learning_rate": 0.0001, "loss": 1.5527, "loss/crossentropy": 2.6179198026657104, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.16600077599287033, "step": 1461 }, { "epoch": 0.021830833439102876, "grad_norm": 0.474609375, "grad_norm_var": 0.003002278010050456, "learning_rate": 0.0001, "loss": 1.71, "loss/crossentropy": 2.601523756980896, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.19441629946231842, "step": 1462 }, { "epoch": 0.021845765609717856, "grad_norm": 0.478515625, "grad_norm_var": 0.002789163589477539, "learning_rate": 0.0001, "loss": 1.5912, "loss/crossentropy": 2.3901792764663696, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.18108409643173218, "step": 1463 }, { "epoch": 0.02186069778033284, "grad_norm": 0.4375, "grad_norm_var": 0.0024730523427327475, "learning_rate": 0.0001, "loss": 1.5628, "loss/crossentropy": 2.7158310413360596, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.18390782922506332, "step": 1464 }, { "epoch": 0.02187562995094782, "grad_norm": 0.431640625, "grad_norm_var": 0.0026732762654622395, "learning_rate": 0.0001, "loss": 1.5699, "loss/crossentropy": 2.8458696603775024, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.17150548100471497, "step": 1465 }, { "epoch": 0.0218905621215628, "grad_norm": 0.5, "grad_norm_var": 0.0025344212849934896, "learning_rate": 0.0001, "loss": 1.5382, "loss/crossentropy": 2.5054997205734253, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.15543527156114578, "step": 1466 }, { "epoch": 0.021905494292177783, "grad_norm": 0.392578125, "grad_norm_var": 0.003053013483683268, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.427425265312195, "loss/fcd": 1.328125, "loss/idx": 7.0, "loss/logits": 0.15352777391672134, "step": 1467 }, { "epoch": 0.021920426462792762, "grad_norm": 0.490234375, "grad_norm_var": 0.0028711795806884766, "learning_rate": 0.0001, "loss": 1.5268, "loss/crossentropy": 2.5839306116104126, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.1674559861421585, "step": 1468 }, { "epoch": 0.021935358633407745, "grad_norm": 0.478515625, "grad_norm_var": 0.0017821629842122396, "learning_rate": 0.0001, "loss": 1.7259, "loss/crossentropy": 2.6946284770965576, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.19463558495044708, "step": 1469 }, { "epoch": 0.021950290804022728, "grad_norm": 0.44140625, "grad_norm_var": 0.0018966039021809896, "learning_rate": 0.0001, "loss": 1.6243, "loss/crossentropy": 2.6268084049224854, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.17897958308458328, "step": 1470 }, { "epoch": 0.021965222974637707, "grad_norm": 0.48828125, "grad_norm_var": 0.0018812656402587891, "learning_rate": 0.0001, "loss": 1.7699, "loss/crossentropy": 2.4048478603363037, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.19957569986581802, "step": 1471 }, { "epoch": 0.02198015514525269, "grad_norm": 0.5, "grad_norm_var": 0.001674636205037435, "learning_rate": 0.0001, "loss": 1.6641, "loss/crossentropy": 2.7484490871429443, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.19537898898124695, "step": 1472 }, { "epoch": 0.021995087315867672, "grad_norm": 0.42578125, "grad_norm_var": 0.0018384138743082683, "learning_rate": 0.0001, "loss": 1.5943, "loss/crossentropy": 2.5969239473342896, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.18410242348909378, "step": 1473 }, { "epoch": 0.02201001948648265, "grad_norm": 0.44921875, "grad_norm_var": 0.0016408125559488932, "learning_rate": 0.0001, "loss": 1.5954, "loss/crossentropy": 2.6116435527801514, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1774698942899704, "step": 1474 }, { "epoch": 0.022024951657097634, "grad_norm": 0.44140625, "grad_norm_var": 0.0012494246164957681, "learning_rate": 0.0001, "loss": 1.6374, "loss/crossentropy": 2.5894148349761963, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.1920812875032425, "step": 1475 }, { "epoch": 0.022039883827712617, "grad_norm": 0.63671875, "grad_norm_var": 0.002904240290323893, "learning_rate": 0.0001, "loss": 2.0558, "loss/crossentropy": 2.587377429008484, "loss/fcd": 1.8125, "loss/idx": 7.0, "loss/logits": 0.24327364563941956, "step": 1476 }, { "epoch": 0.022054815998327596, "grad_norm": 0.51171875, "grad_norm_var": 0.002971903483072917, "learning_rate": 0.0001, "loss": 1.828, "loss/crossentropy": 2.747790813446045, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.2263966202735901, "step": 1477 }, { "epoch": 0.02206974816894258, "grad_norm": 0.46875, "grad_norm_var": 0.0029732863108317058, "learning_rate": 0.0001, "loss": 1.768, "loss/crossentropy": 2.6247669458389282, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.213314987719059, "step": 1478 }, { "epoch": 0.02208468033955756, "grad_norm": 0.6640625, "grad_norm_var": 0.005254872639973958, "learning_rate": 0.0001, "loss": 1.8736, "loss/crossentropy": 2.40477979183197, "loss/fcd": 1.66015625, "loss/idx": 7.0, "loss/logits": 0.213446743786335, "step": 1479 }, { "epoch": 0.02209961251017254, "grad_norm": 0.46875, "grad_norm_var": 0.005118560791015625, "learning_rate": 0.0001, "loss": 1.6391, "loss/crossentropy": 2.8101799488067627, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.19768545031547546, "step": 1480 }, { "epoch": 0.022114544680787523, "grad_norm": 0.48828125, "grad_norm_var": 0.004902378718058268, "learning_rate": 0.0001, "loss": 1.6937, "loss/crossentropy": 2.3495898246765137, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.18976981192827225, "step": 1481 }, { "epoch": 0.022129476851402503, "grad_norm": 0.47265625, "grad_norm_var": 0.004913949966430664, "learning_rate": 0.0001, "loss": 1.6884, "loss/crossentropy": 2.4331902265548706, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.1962270364165306, "step": 1482 }, { "epoch": 0.022144409022017485, "grad_norm": 0.416015625, "grad_norm_var": 0.004648065567016602, "learning_rate": 0.0001, "loss": 1.6145, "loss/crossentropy": 2.6590933799743652, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1965487003326416, "step": 1483 }, { "epoch": 0.022159341192632468, "grad_norm": 0.55078125, "grad_norm_var": 0.0048781712849934895, "learning_rate": 0.0001, "loss": 1.7436, "loss/crossentropy": 2.599213719367981, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.22408346831798553, "step": 1484 }, { "epoch": 0.022174273363247447, "grad_norm": 0.4609375, "grad_norm_var": 0.0049335320790608725, "learning_rate": 0.0001, "loss": 1.65, "loss/crossentropy": 2.6557412147521973, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.19689878821372986, "step": 1485 }, { "epoch": 0.02218920553386243, "grad_norm": 0.486328125, "grad_norm_var": 0.004751841227213542, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.529180407524109, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.20145351439714432, "step": 1486 }, { "epoch": 0.022204137704477413, "grad_norm": 0.46875, "grad_norm_var": 0.004794756571451823, "learning_rate": 0.0001, "loss": 1.6538, "loss/crossentropy": 2.4987757205963135, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.17723794281482697, "step": 1487 }, { "epoch": 0.022219069875092392, "grad_norm": 0.439453125, "grad_norm_var": 0.004978545506795247, "learning_rate": 0.0001, "loss": 1.5982, "loss/crossentropy": 2.609381675720215, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.18409955501556396, "step": 1488 }, { "epoch": 0.022234002045707375, "grad_norm": 0.474609375, "grad_norm_var": 0.004705556233723958, "learning_rate": 0.0001, "loss": 1.7746, "loss/crossentropy": 2.508987545967102, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.2199431136250496, "step": 1489 }, { "epoch": 0.022248934216322357, "grad_norm": 0.62890625, "grad_norm_var": 0.005658976236979167, "learning_rate": 0.0001, "loss": 1.89, "loss/crossentropy": 2.3111783266067505, "loss/fcd": 1.67578125, "loss/idx": 7.0, "loss/logits": 0.21425354480743408, "step": 1490 }, { "epoch": 0.022263866386937337, "grad_norm": 0.640625, "grad_norm_var": 0.006453386942545573, "learning_rate": 0.0001, "loss": 1.7514, "loss/crossentropy": 2.5704126358032227, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.19285417348146439, "step": 1491 }, { "epoch": 0.02227879855755232, "grad_norm": 0.498046875, "grad_norm_var": 0.005447880427042643, "learning_rate": 0.0001, "loss": 1.7764, "loss/crossentropy": 2.5152299404144287, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.22169267386198044, "step": 1492 }, { "epoch": 0.0222937307281673, "grad_norm": 0.412109375, "grad_norm_var": 0.0060274759928385414, "learning_rate": 0.0001, "loss": 1.5753, "loss/crossentropy": 2.641311526298523, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.1807536482810974, "step": 1493 }, { "epoch": 0.02230866289878228, "grad_norm": 0.44921875, "grad_norm_var": 0.006139055887858073, "learning_rate": 0.0001, "loss": 1.7143, "loss/crossentropy": 2.45189368724823, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.22211898863315582, "step": 1494 }, { "epoch": 0.022323595069397264, "grad_norm": 0.365234375, "grad_norm_var": 0.005231968561808268, "learning_rate": 0.0001, "loss": 1.4379, "loss/crossentropy": 2.4117982387542725, "loss/fcd": 1.2890625, "loss/idx": 7.0, "loss/logits": 0.14883895218372345, "step": 1495 }, { "epoch": 0.022338527240012243, "grad_norm": 0.6171875, "grad_norm_var": 0.006336069107055664, "learning_rate": 0.0001, "loss": 1.7348, "loss/crossentropy": 2.4826900959014893, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.19185729324817657, "step": 1496 }, { "epoch": 0.022353459410627226, "grad_norm": 0.439453125, "grad_norm_var": 0.0065081278483072914, "learning_rate": 0.0001, "loss": 1.5233, "loss/crossentropy": 2.655819535255432, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.1639203429222107, "step": 1497 }, { "epoch": 0.02236839158124221, "grad_norm": 0.478515625, "grad_norm_var": 0.006497685114542643, "learning_rate": 0.0001, "loss": 1.7682, "loss/crossentropy": 2.4737117290496826, "loss/fcd": 1.5625, "loss/idx": 7.0, "loss/logits": 0.20565108954906464, "step": 1498 }, { "epoch": 0.022383323751857188, "grad_norm": 0.419921875, "grad_norm_var": 0.00646055539449056, "learning_rate": 0.0001, "loss": 1.7136, "loss/crossentropy": 2.436096429824829, "loss/fcd": 1.484375, "loss/idx": 7.0, "loss/logits": 0.22923439741134644, "step": 1499 }, { "epoch": 0.02239825592247217, "grad_norm": 0.44140625, "grad_norm_var": 0.006312799453735351, "learning_rate": 0.0001, "loss": 1.6022, "loss/crossentropy": 2.6730234622955322, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1842484176158905, "step": 1500 }, { "epoch": 0.022413188093087153, "grad_norm": 0.439453125, "grad_norm_var": 0.006403541564941407, "learning_rate": 0.0001, "loss": 1.6707, "loss/crossentropy": 2.520161986351013, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.20194847136735916, "step": 1501 }, { "epoch": 0.022428120263702132, "grad_norm": 0.44140625, "grad_norm_var": 0.006498956680297851, "learning_rate": 0.0001, "loss": 1.6007, "loss/crossentropy": 2.7842148542404175, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.1944282054901123, "step": 1502 }, { "epoch": 0.022443052434317115, "grad_norm": 0.48046875, "grad_norm_var": 0.006492471694946289, "learning_rate": 0.0001, "loss": 1.6758, "loss/crossentropy": 2.7076008319854736, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.19530197978019714, "step": 1503 }, { "epoch": 0.022457984604932094, "grad_norm": 0.46875, "grad_norm_var": 0.006391143798828125, "learning_rate": 0.0001, "loss": 1.6203, "loss/crossentropy": 2.8342201709747314, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.19059840589761734, "step": 1504 }, { "epoch": 0.022472916775547077, "grad_norm": 0.46875, "grad_norm_var": 0.006398248672485352, "learning_rate": 0.0001, "loss": 1.7523, "loss/crossentropy": 2.3385519981384277, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.20937157422304153, "step": 1505 }, { "epoch": 0.02248784894616206, "grad_norm": 0.400390625, "grad_norm_var": 0.005142974853515625, "learning_rate": 0.0001, "loss": 1.5736, "loss/crossentropy": 2.586688756942749, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.16738758236169815, "step": 1506 }, { "epoch": 0.02250278111677704, "grad_norm": 0.443359375, "grad_norm_var": 0.0029901981353759764, "learning_rate": 0.0001, "loss": 1.662, "loss/crossentropy": 2.3330858945846558, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.18151338398456573, "step": 1507 }, { "epoch": 0.02251771328739202, "grad_norm": 0.51171875, "grad_norm_var": 0.0030822118123372396, "learning_rate": 0.0001, "loss": 1.7309, "loss/crossentropy": 2.3610141277313232, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.18796861171722412, "step": 1508 }, { "epoch": 0.022532645458007004, "grad_norm": 0.4375, "grad_norm_var": 0.002977863947550456, "learning_rate": 0.0001, "loss": 1.6468, "loss/crossentropy": 2.5573190450668335, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.19369368255138397, "step": 1509 }, { "epoch": 0.022547577628621984, "grad_norm": 0.515625, "grad_norm_var": 0.003189706802368164, "learning_rate": 0.0001, "loss": 1.8248, "loss/crossentropy": 2.914300322532654, "loss/fcd": 1.6015625, "loss/idx": 7.0, "loss/logits": 0.22323106229305267, "step": 1510 }, { "epoch": 0.022562509799236966, "grad_norm": 0.435546875, "grad_norm_var": 0.0026049137115478514, "learning_rate": 0.0001, "loss": 1.5476, "loss/crossentropy": 2.729630470275879, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.16865114867687225, "step": 1511 }, { "epoch": 0.02257744196985195, "grad_norm": 0.451171875, "grad_norm_var": 0.0009579976399739583, "learning_rate": 0.0001, "loss": 1.6299, "loss/crossentropy": 2.6284605264663696, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17675121873617172, "step": 1512 }, { "epoch": 0.022592374140466928, "grad_norm": 0.5234375, "grad_norm_var": 0.0012293338775634765, "learning_rate": 0.0001, "loss": 1.8771, "loss/crossentropy": 2.4401475191116333, "loss/fcd": 1.6484375, "loss/idx": 7.0, "loss/logits": 0.22864650189876556, "step": 1513 }, { "epoch": 0.02260730631108191, "grad_norm": 0.48828125, "grad_norm_var": 0.001259613037109375, "learning_rate": 0.0001, "loss": 1.6196, "loss/crossentropy": 2.6447492837905884, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.18992742151021957, "step": 1514 }, { "epoch": 0.022622238481696894, "grad_norm": 0.392578125, "grad_norm_var": 0.0014540990193684896, "learning_rate": 0.0001, "loss": 1.5479, "loss/crossentropy": 2.427100419998169, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.16507157683372498, "step": 1515 }, { "epoch": 0.022637170652311873, "grad_norm": 0.39453125, "grad_norm_var": 0.0016997655232747395, "learning_rate": 0.0001, "loss": 1.6024, "loss/crossentropy": 2.499715566635132, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18047834187746048, "step": 1516 }, { "epoch": 0.022652102822926855, "grad_norm": 0.5078125, "grad_norm_var": 0.0018427371978759766, "learning_rate": 0.0001, "loss": 1.7189, "loss/crossentropy": 2.6804046630859375, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.2150314822793007, "step": 1517 }, { "epoch": 0.022667034993541835, "grad_norm": 0.458984375, "grad_norm_var": 0.0018182754516601562, "learning_rate": 0.0001, "loss": 1.6226, "loss/crossentropy": 2.502648949623108, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.18898165971040726, "step": 1518 }, { "epoch": 0.022681967164156817, "grad_norm": 0.49609375, "grad_norm_var": 0.0018737157185872396, "learning_rate": 0.0001, "loss": 1.619, "loss/crossentropy": 2.562047839164734, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.16982071101665497, "step": 1519 }, { "epoch": 0.0226968993347718, "grad_norm": 0.474609375, "grad_norm_var": 0.0018810113271077475, "learning_rate": 0.0001, "loss": 1.7214, "loss/crossentropy": 2.667098641395569, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.1979236751794815, "step": 1520 }, { "epoch": 0.02271183150538678, "grad_norm": 0.63671875, "grad_norm_var": 0.0037837823232014975, "learning_rate": 0.0001, "loss": 1.818, "loss/crossentropy": 3.0402382612228394, "loss/fcd": 1.59375, "loss/idx": 7.0, "loss/logits": 0.22424130141735077, "step": 1521 }, { "epoch": 0.022726763676001762, "grad_norm": 0.455078125, "grad_norm_var": 0.0034410953521728516, "learning_rate": 0.0001, "loss": 1.6237, "loss/crossentropy": 2.690005898475647, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18618075549602509, "step": 1522 }, { "epoch": 0.022741695846616745, "grad_norm": 0.419921875, "grad_norm_var": 0.003578805923461914, "learning_rate": 0.0001, "loss": 1.5849, "loss/crossentropy": 2.6825876235961914, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.19041255861520767, "step": 1523 }, { "epoch": 0.022756628017231724, "grad_norm": 0.4453125, "grad_norm_var": 0.0035290877024332684, "learning_rate": 0.0001, "loss": 1.7102, "loss/crossentropy": 2.556373953819275, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.19456663727760315, "step": 1524 }, { "epoch": 0.022771560187846707, "grad_norm": 0.99609375, "grad_norm_var": 0.02054874102274577, "learning_rate": 0.0001, "loss": 1.6541, "loss/crossentropy": 2.5576895475387573, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.20096850395202637, "step": 1525 }, { "epoch": 0.02278649235846169, "grad_norm": 0.484375, "grad_norm_var": 0.02056857744852702, "learning_rate": 0.0001, "loss": 1.7278, "loss/crossentropy": 2.709898829460144, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.21212925761938095, "step": 1526 }, { "epoch": 0.02280142452907667, "grad_norm": 0.6953125, "grad_norm_var": 0.02242253621419271, "learning_rate": 0.0001, "loss": 1.7308, "loss/crossentropy": 2.7060130834579468, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.2113034650683403, "step": 1527 }, { "epoch": 0.02281635669969165, "grad_norm": 0.515625, "grad_norm_var": 0.02209051450093587, "learning_rate": 0.0001, "loss": 1.7012, "loss/crossentropy": 2.7240923643112183, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.2090420499444008, "step": 1528 }, { "epoch": 0.02283128887030663, "grad_norm": 0.6015625, "grad_norm_var": 0.022465626398722332, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.8262499570846558, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.21227449923753738, "step": 1529 }, { "epoch": 0.022846221040921613, "grad_norm": 0.392578125, "grad_norm_var": 0.02355677286783854, "learning_rate": 0.0001, "loss": 1.4653, "loss/crossentropy": 2.6713192462921143, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.1488954946398735, "step": 1530 }, { "epoch": 0.022861153211536596, "grad_norm": 0.447265625, "grad_norm_var": 0.022793070475260416, "learning_rate": 0.0001, "loss": 1.6075, "loss/crossentropy": 2.5342416763305664, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.1817667856812477, "step": 1531 }, { "epoch": 0.022876085382151575, "grad_norm": 0.48828125, "grad_norm_var": 0.021694437662760416, "learning_rate": 0.0001, "loss": 1.6303, "loss/crossentropy": 2.8033546209335327, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.20447959005832672, "step": 1532 }, { "epoch": 0.022891017552766558, "grad_norm": 0.5234375, "grad_norm_var": 0.021658833821614584, "learning_rate": 0.0001, "loss": 1.7876, "loss/crossentropy": 2.538213014602661, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2211550772190094, "step": 1533 }, { "epoch": 0.02290594972338154, "grad_norm": 0.4453125, "grad_norm_var": 0.021805810928344726, "learning_rate": 0.0001, "loss": 1.7903, "loss/crossentropy": 2.6131246089935303, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.23564165830612183, "step": 1534 }, { "epoch": 0.02292088189399652, "grad_norm": 0.4375, "grad_norm_var": 0.02230362892150879, "learning_rate": 0.0001, "loss": 1.5504, "loss/crossentropy": 2.6005011796951294, "loss/fcd": 1.375, "loss/idx": 7.0, "loss/logits": 0.17541643232107162, "step": 1535 }, { "epoch": 0.022935814064611502, "grad_norm": 0.5078125, "grad_norm_var": 0.02213312784830729, "learning_rate": 0.0001, "loss": 1.8733, "loss/crossentropy": 2.4328452348709106, "loss/fcd": 1.63671875, "loss/idx": 7.0, "loss/logits": 0.23656418919563293, "step": 1536 }, { "epoch": 0.022950746235226485, "grad_norm": 0.4296875, "grad_norm_var": 0.02188714345296224, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.3456164598464966, "loss/fcd": 1.3359375, "loss/idx": 7.0, "loss/logits": 0.1623057723045349, "step": 1537 }, { "epoch": 0.022965678405841464, "grad_norm": 0.3984375, "grad_norm_var": 0.02256150245666504, "learning_rate": 0.0001, "loss": 1.6528, "loss/crossentropy": 2.3263497352600098, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.18014327436685562, "step": 1538 }, { "epoch": 0.022980610576456447, "grad_norm": 0.40234375, "grad_norm_var": 0.022801971435546874, "learning_rate": 0.0001, "loss": 1.6256, "loss/crossentropy": 2.741413116455078, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18808145076036453, "step": 1539 }, { "epoch": 0.022995542747071426, "grad_norm": 0.4140625, "grad_norm_var": 0.02314580281575521, "learning_rate": 0.0001, "loss": 1.5412, "loss/crossentropy": 2.5992331504821777, "loss/fcd": 1.36328125, "loss/idx": 7.0, "loss/logits": 0.1778806746006012, "step": 1540 }, { "epoch": 0.02301047491768641, "grad_norm": 0.46875, "grad_norm_var": 0.0064345677693684895, "learning_rate": 0.0001, "loss": 1.6861, "loss/crossentropy": 2.645154595375061, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.2056177482008934, "step": 1541 }, { "epoch": 0.02302540708830139, "grad_norm": 0.50390625, "grad_norm_var": 0.00647430419921875, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 3.0556046962738037, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1861376166343689, "step": 1542 }, { "epoch": 0.02304033925891637, "grad_norm": 0.455078125, "grad_norm_var": 0.003168344497680664, "learning_rate": 0.0001, "loss": 1.8426, "loss/crossentropy": 2.5418970584869385, "loss/fcd": 1.59765625, "loss/idx": 7.0, "loss/logits": 0.2449493408203125, "step": 1543 }, { "epoch": 0.023055271429531354, "grad_norm": 0.427734375, "grad_norm_var": 0.0030517578125, "learning_rate": 0.0001, "loss": 1.6139, "loss/crossentropy": 2.835118532180786, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.19205392152071, "step": 1544 }, { "epoch": 0.023070203600146336, "grad_norm": 0.412109375, "grad_norm_var": 0.0016934553782145183, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.5481245517730713, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.15329091250896454, "step": 1545 }, { "epoch": 0.023085135770761316, "grad_norm": 0.44921875, "grad_norm_var": 0.0014818827311197916, "learning_rate": 0.0001, "loss": 1.9153, "loss/crossentropy": 2.778249740600586, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.2590373530983925, "step": 1546 }, { "epoch": 0.023100067941376298, "grad_norm": 0.392578125, "grad_norm_var": 0.0016937255859375, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.767631411552429, "loss/fcd": 1.375, "loss/idx": 7.0, "loss/logits": 0.18715695291757584, "step": 1547 }, { "epoch": 0.02311500011199128, "grad_norm": 0.546875, "grad_norm_var": 0.0022287368774414062, "learning_rate": 0.0001, "loss": 1.8284, "loss/crossentropy": 2.524918556213379, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.21513129025697708, "step": 1548 }, { "epoch": 0.02312993228260626, "grad_norm": 0.44140625, "grad_norm_var": 0.001856231689453125, "learning_rate": 0.0001, "loss": 1.6473, "loss/crossentropy": 2.6472318172454834, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.19806896150112152, "step": 1549 }, { "epoch": 0.023144864453221243, "grad_norm": 0.453125, "grad_norm_var": 0.0018595377604166666, "learning_rate": 0.0001, "loss": 1.565, "loss/crossentropy": 2.3890823125839233, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.16657334566116333, "step": 1550 }, { "epoch": 0.023159796623836226, "grad_norm": 0.408203125, "grad_norm_var": 0.0019475142161051431, "learning_rate": 0.0001, "loss": 1.7483, "loss/crossentropy": 2.484150528907776, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.2170763462781906, "step": 1551 }, { "epoch": 0.023174728794451205, "grad_norm": 0.58203125, "grad_norm_var": 0.0029187361399332684, "learning_rate": 0.0001, "loss": 1.863, "loss/crossentropy": 2.5539822578430176, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.23795197159051895, "step": 1552 }, { "epoch": 0.023189660965066188, "grad_norm": 0.392578125, "grad_norm_var": 0.0031008402506510417, "learning_rate": 0.0001, "loss": 1.4736, "loss/crossentropy": 2.5587610006332397, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.15721678733825684, "step": 1553 }, { "epoch": 0.023204593135681167, "grad_norm": 0.48828125, "grad_norm_var": 0.003026262919108073, "learning_rate": 0.0001, "loss": 1.709, "loss/crossentropy": 2.83974826335907, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.19336502254009247, "step": 1554 }, { "epoch": 0.02321952530629615, "grad_norm": 0.46484375, "grad_norm_var": 0.0028533299763997396, "learning_rate": 0.0001, "loss": 1.6188, "loss/crossentropy": 2.6520248651504517, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.1890881061553955, "step": 1555 }, { "epoch": 0.023234457476911132, "grad_norm": 0.51171875, "grad_norm_var": 0.002899424235026042, "learning_rate": 0.0001, "loss": 1.7069, "loss/crossentropy": 2.49362576007843, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.20692860335111618, "step": 1556 }, { "epoch": 0.02324938964752611, "grad_norm": 0.4375, "grad_norm_var": 0.0029340108235677083, "learning_rate": 0.0001, "loss": 1.6047, "loss/crossentropy": 2.834295630455017, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.17499014735221863, "step": 1557 }, { "epoch": 0.023264321818141094, "grad_norm": 0.59765625, "grad_norm_var": 0.004026540120442708, "learning_rate": 0.0001, "loss": 1.8205, "loss/crossentropy": 2.5558085441589355, "loss/fcd": 1.609375, "loss/idx": 7.0, "loss/logits": 0.21112027019262314, "step": 1558 }, { "epoch": 0.023279253988756077, "grad_norm": 0.59375, "grad_norm_var": 0.005020761489868164, "learning_rate": 0.0001, "loss": 1.7496, "loss/crossentropy": 2.4718828201293945, "loss/fcd": 1.5390625, "loss/idx": 7.0, "loss/logits": 0.2105449065566063, "step": 1559 }, { "epoch": 0.023294186159371056, "grad_norm": 0.447265625, "grad_norm_var": 0.004921579360961914, "learning_rate": 0.0001, "loss": 1.5735, "loss/crossentropy": 2.622591733932495, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17899316549301147, "step": 1560 }, { "epoch": 0.02330911832998604, "grad_norm": 0.49609375, "grad_norm_var": 0.004644775390625, "learning_rate": 0.0001, "loss": 1.6309, "loss/crossentropy": 2.7571672201156616, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.20515643060207367, "step": 1561 }, { "epoch": 0.02332405050060102, "grad_norm": 0.5625, "grad_norm_var": 0.004960060119628906, "learning_rate": 0.0001, "loss": 1.6672, "loss/crossentropy": 2.7672977447509766, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.19848168641328812, "step": 1562 }, { "epoch": 0.023338982671216, "grad_norm": 0.47265625, "grad_norm_var": 0.004336404800415039, "learning_rate": 0.0001, "loss": 1.6023, "loss/crossentropy": 2.674835681915283, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.17654051631689072, "step": 1563 }, { "epoch": 0.023353914841830983, "grad_norm": 0.45703125, "grad_norm_var": 0.004201873143513998, "learning_rate": 0.0001, "loss": 1.5478, "loss/crossentropy": 2.5982450246810913, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.16496731340885162, "step": 1564 }, { "epoch": 0.023368847012445963, "grad_norm": 0.5, "grad_norm_var": 0.004053099950154623, "learning_rate": 0.0001, "loss": 1.7248, "loss/crossentropy": 2.3847527503967285, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.2014065459370613, "step": 1565 }, { "epoch": 0.023383779183060945, "grad_norm": 0.462890625, "grad_norm_var": 0.0040089925130208336, "learning_rate": 0.0001, "loss": 1.5863, "loss/crossentropy": 2.4995174407958984, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.164415180683136, "step": 1566 }, { "epoch": 0.023398711353675928, "grad_norm": 0.53125, "grad_norm_var": 0.003577407201131185, "learning_rate": 0.0001, "loss": 1.6788, "loss/crossentropy": 2.8225643634796143, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.20224997401237488, "step": 1567 }, { "epoch": 0.023413643524290907, "grad_norm": 0.482421875, "grad_norm_var": 0.0031064351399739585, "learning_rate": 0.0001, "loss": 1.6304, "loss/crossentropy": 2.5911120176315308, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.1968412771821022, "step": 1568 }, { "epoch": 0.02342857569490589, "grad_norm": 0.451171875, "grad_norm_var": 0.0025313695271809897, "learning_rate": 0.0001, "loss": 1.6428, "loss/crossentropy": 2.5418704748153687, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.1935887709259987, "step": 1569 }, { "epoch": 0.023443507865520873, "grad_norm": 0.5, "grad_norm_var": 0.0025258382161458333, "learning_rate": 0.0001, "loss": 1.6968, "loss/crossentropy": 2.611220955848694, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.20849105715751648, "step": 1570 }, { "epoch": 0.023458440036135852, "grad_norm": 0.39453125, "grad_norm_var": 0.0031461079915364584, "learning_rate": 0.0001, "loss": 1.446, "loss/crossentropy": 2.6295387744903564, "loss/fcd": 1.28515625, "loss/idx": 7.0, "loss/logits": 0.16082587838172913, "step": 1571 }, { "epoch": 0.023473372206750834, "grad_norm": 0.451171875, "grad_norm_var": 0.003229379653930664, "learning_rate": 0.0001, "loss": 1.6323, "loss/crossentropy": 2.550940990447998, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17917423695325851, "step": 1572 }, { "epoch": 0.023488304377365817, "grad_norm": 0.419921875, "grad_norm_var": 0.003371429443359375, "learning_rate": 0.0001, "loss": 1.6075, "loss/crossentropy": 2.503952383995056, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1895056962966919, "step": 1573 }, { "epoch": 0.023503236547980796, "grad_norm": 0.6171875, "grad_norm_var": 0.0036788304646809896, "learning_rate": 0.0001, "loss": 1.623, "loss/crossentropy": 2.799260377883911, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.17770560830831528, "step": 1574 }, { "epoch": 0.02351816871859578, "grad_norm": 0.400390625, "grad_norm_var": 0.0033405145009358724, "learning_rate": 0.0001, "loss": 1.4874, "loss/crossentropy": 2.6628782749176025, "loss/fcd": 1.328125, "loss/idx": 7.0, "loss/logits": 0.15926603972911835, "step": 1575 }, { "epoch": 0.023533100889210762, "grad_norm": 0.5390625, "grad_norm_var": 0.0034921646118164064, "learning_rate": 0.0001, "loss": 1.7877, "loss/crossentropy": 2.4713010787963867, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.21734385192394257, "step": 1576 }, { "epoch": 0.02354803305982574, "grad_norm": 0.404296875, "grad_norm_var": 0.003866434097290039, "learning_rate": 0.0001, "loss": 1.5796, "loss/crossentropy": 2.6206711530685425, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.18898864090442657, "step": 1577 }, { "epoch": 0.023562965230440724, "grad_norm": 0.490234375, "grad_norm_var": 0.0033777236938476564, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.5034207105636597, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.18877746164798737, "step": 1578 }, { "epoch": 0.023577897401055703, "grad_norm": 0.53515625, "grad_norm_var": 0.0036157608032226563, "learning_rate": 0.0001, "loss": 2.0868, "loss/crossentropy": 2.7469301223754883, "loss/fcd": 1.80859375, "loss/idx": 7.0, "loss/logits": 0.2782081812620163, "step": 1579 }, { "epoch": 0.023592829571670686, "grad_norm": 0.451171875, "grad_norm_var": 0.003633737564086914, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.4556583166122437, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.18522104620933533, "step": 1580 }, { "epoch": 0.02360776174228567, "grad_norm": 0.46875, "grad_norm_var": 0.003598642349243164, "learning_rate": 0.0001, "loss": 1.7168, "loss/crossentropy": 2.6500853300094604, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.1973070204257965, "step": 1581 }, { "epoch": 0.023622693912900648, "grad_norm": 0.447265625, "grad_norm_var": 0.003639078140258789, "learning_rate": 0.0001, "loss": 1.7504, "loss/crossentropy": 2.4983482360839844, "loss/fcd": 1.5234375, "loss/idx": 7.0, "loss/logits": 0.22697453200817108, "step": 1582 }, { "epoch": 0.02363762608351563, "grad_norm": 0.515625, "grad_norm_var": 0.0035350640614827474, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.6754297018051147, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.17720526456832886, "step": 1583 }, { "epoch": 0.023652558254130613, "grad_norm": 0.43359375, "grad_norm_var": 0.003622881571451823, "learning_rate": 0.0001, "loss": 1.6579, "loss/crossentropy": 2.661411762237549, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.20476175099611282, "step": 1584 }, { "epoch": 0.023667490424745592, "grad_norm": 0.458984375, "grad_norm_var": 0.0036071141560872397, "learning_rate": 0.0001, "loss": 1.5381, "loss/crossentropy": 2.626299500465393, "loss/fcd": 1.3671875, "loss/idx": 7.0, "loss/logits": 0.17094596475362778, "step": 1585 }, { "epoch": 0.023682422595360575, "grad_norm": 0.443359375, "grad_norm_var": 0.0035845279693603516, "learning_rate": 0.0001, "loss": 1.5842, "loss/crossentropy": 2.8080815076828003, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.1818404197692871, "step": 1586 }, { "epoch": 0.023697354765975558, "grad_norm": 0.4296875, "grad_norm_var": 0.003322458267211914, "learning_rate": 0.0001, "loss": 1.5698, "loss/crossentropy": 2.596649646759033, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17525531351566315, "step": 1587 }, { "epoch": 0.023712286936590537, "grad_norm": 0.55078125, "grad_norm_var": 0.003704261779785156, "learning_rate": 0.0001, "loss": 1.7572, "loss/crossentropy": 2.7423954010009766, "loss/fcd": 1.546875, "loss/idx": 7.0, "loss/logits": 0.21033258736133575, "step": 1588 }, { "epoch": 0.02372721910720552, "grad_norm": 0.4921875, "grad_norm_var": 0.003496662775675456, "learning_rate": 0.0001, "loss": 1.6647, "loss/crossentropy": 2.879364013671875, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.20377619564533234, "step": 1589 }, { "epoch": 0.0237421512778205, "grad_norm": 0.5078125, "grad_norm_var": 0.002241627375284831, "learning_rate": 0.0001, "loss": 1.8962, "loss/crossentropy": 2.434022903442383, "loss/fcd": 1.6484375, "loss/idx": 7.0, "loss/logits": 0.24780651926994324, "step": 1590 }, { "epoch": 0.02375708344843548, "grad_norm": 0.6796875, "grad_norm_var": 0.004412269592285157, "learning_rate": 0.0001, "loss": 1.8758, "loss/crossentropy": 2.5361167192459106, "loss/fcd": 1.66015625, "loss/idx": 7.0, "loss/logits": 0.21565672755241394, "step": 1591 }, { "epoch": 0.023772015619050464, "grad_norm": 0.41796875, "grad_norm_var": 0.004544321695963542, "learning_rate": 0.0001, "loss": 1.5075, "loss/crossentropy": 2.738878607749939, "loss/fcd": 1.33984375, "loss/idx": 7.0, "loss/logits": 0.16762951761484146, "step": 1592 }, { "epoch": 0.023786947789665443, "grad_norm": 0.486328125, "grad_norm_var": 0.0041050593058268225, "learning_rate": 0.0001, "loss": 1.7113, "loss/crossentropy": 2.573231339454651, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.20347873866558075, "step": 1593 }, { "epoch": 0.023801879960280426, "grad_norm": 0.44140625, "grad_norm_var": 0.004239765803019205, "learning_rate": 0.0001, "loss": 1.6489, "loss/crossentropy": 2.4706650972366333, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1958089843392372, "step": 1594 }, { "epoch": 0.02381681213089541, "grad_norm": 0.453125, "grad_norm_var": 0.004111591974894206, "learning_rate": 0.0001, "loss": 1.6444, "loss/crossentropy": 2.6910659074783325, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.17950844019651413, "step": 1595 }, { "epoch": 0.023831744301510388, "grad_norm": 0.470703125, "grad_norm_var": 0.004060729344685873, "learning_rate": 0.0001, "loss": 1.6975, "loss/crossentropy": 2.5878102779388428, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.19749021530151367, "step": 1596 }, { "epoch": 0.02384667647212537, "grad_norm": 0.484375, "grad_norm_var": 0.004050302505493164, "learning_rate": 0.0001, "loss": 1.5805, "loss/crossentropy": 2.41681444644928, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.17812514305114746, "step": 1597 }, { "epoch": 0.023861608642740353, "grad_norm": 0.4140625, "grad_norm_var": 0.004273223876953125, "learning_rate": 0.0001, "loss": 1.536, "loss/crossentropy": 2.4718927145004272, "loss/fcd": 1.37109375, "loss/idx": 7.0, "loss/logits": 0.1648888885974884, "step": 1598 }, { "epoch": 0.023876540813355333, "grad_norm": 0.63671875, "grad_norm_var": 0.005765215555826823, "learning_rate": 0.0001, "loss": 1.9009, "loss/crossentropy": 2.596011519432068, "loss/fcd": 1.66015625, "loss/idx": 7.0, "loss/logits": 0.24071332067251205, "step": 1599 }, { "epoch": 0.023891472983970315, "grad_norm": 0.474609375, "grad_norm_var": 0.005575291315714518, "learning_rate": 0.0001, "loss": 1.617, "loss/crossentropy": 2.4079898595809937, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.18339631706476212, "step": 1600 }, { "epoch": 0.023906405154585295, "grad_norm": 0.44921875, "grad_norm_var": 0.005621782938639323, "learning_rate": 0.0001, "loss": 1.6684, "loss/crossentropy": 2.572506904602051, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.19576621800661087, "step": 1601 }, { "epoch": 0.023921337325200277, "grad_norm": 0.484375, "grad_norm_var": 0.005474583307902018, "learning_rate": 0.0001, "loss": 1.9182, "loss/crossentropy": 2.5226889848709106, "loss/fcd": 1.671875, "loss/idx": 7.0, "loss/logits": 0.24628648161888123, "step": 1602 }, { "epoch": 0.02393626949581526, "grad_norm": 0.3984375, "grad_norm_var": 0.005795526504516602, "learning_rate": 0.0001, "loss": 1.6642, "loss/crossentropy": 2.602216124534607, "loss/fcd": 1.45703125, "loss/idx": 7.0, "loss/logits": 0.20719221979379654, "step": 1603 }, { "epoch": 0.02395120166643024, "grad_norm": 0.50390625, "grad_norm_var": 0.005553674697875976, "learning_rate": 0.0001, "loss": 1.6675, "loss/crossentropy": 2.4609646797180176, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.1987275555729866, "step": 1604 }, { "epoch": 0.023966133837045222, "grad_norm": 0.5546875, "grad_norm_var": 0.00583952267964681, "learning_rate": 0.0001, "loss": 1.687, "loss/crossentropy": 2.5550639629364014, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.1792164072394371, "step": 1605 }, { "epoch": 0.023981066007660205, "grad_norm": 0.40625, "grad_norm_var": 0.00625774065653483, "learning_rate": 0.0001, "loss": 1.3797, "loss/crossentropy": 2.5962554216384888, "loss/fcd": 1.2421875, "loss/idx": 7.0, "loss/logits": 0.13753189146518707, "step": 1606 }, { "epoch": 0.023995998178275184, "grad_norm": 0.458984375, "grad_norm_var": 0.003565406799316406, "learning_rate": 0.0001, "loss": 1.67, "loss/crossentropy": 2.744762420654297, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.19737379252910614, "step": 1607 }, { "epoch": 0.024010930348890167, "grad_norm": 0.380859375, "grad_norm_var": 0.003913609186808268, "learning_rate": 0.0001, "loss": 1.5864, "loss/crossentropy": 2.4816367626190186, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.17619794607162476, "step": 1608 }, { "epoch": 0.02402586251950515, "grad_norm": 0.45703125, "grad_norm_var": 0.0038981119791666667, "learning_rate": 0.0001, "loss": 1.679, "loss/crossentropy": 2.2741174697875977, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.19848152250051498, "step": 1609 }, { "epoch": 0.02404079469012013, "grad_norm": 0.376953125, "grad_norm_var": 0.004375950495402018, "learning_rate": 0.0001, "loss": 1.5897, "loss/crossentropy": 2.5791796445846558, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.19519494473934174, "step": 1610 }, { "epoch": 0.02405572686073511, "grad_norm": 0.44140625, "grad_norm_var": 0.004399601618448893, "learning_rate": 0.0001, "loss": 1.6222, "loss/crossentropy": 2.7612199783325195, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.1729477047920227, "step": 1611 }, { "epoch": 0.024070659031350094, "grad_norm": 0.416015625, "grad_norm_var": 0.004523324966430664, "learning_rate": 0.0001, "loss": 1.6367, "loss/crossentropy": 2.3753793239593506, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.18361905962228775, "step": 1612 }, { "epoch": 0.024085591201965073, "grad_norm": 0.46484375, "grad_norm_var": 0.00448009173075358, "learning_rate": 0.0001, "loss": 1.5807, "loss/crossentropy": 2.741746187210083, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.17443695664405823, "step": 1613 }, { "epoch": 0.024100523372580056, "grad_norm": 0.474609375, "grad_norm_var": 0.0043593724568684895, "learning_rate": 0.0001, "loss": 1.7266, "loss/crossentropy": 2.6350860595703125, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.21098782122135162, "step": 1614 }, { "epoch": 0.024115455543195035, "grad_norm": 0.490234375, "grad_norm_var": 0.0022720177968343098, "learning_rate": 0.0001, "loss": 1.6602, "loss/crossentropy": 2.6208122968673706, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.1992257758975029, "step": 1615 }, { "epoch": 0.024130387713810018, "grad_norm": 0.51171875, "grad_norm_var": 0.0024698257446289064, "learning_rate": 0.0001, "loss": 1.7546, "loss/crossentropy": 2.395472764968872, "loss/fcd": 1.5390625, "loss/idx": 7.0, "loss/logits": 0.21557673066854477, "step": 1616 }, { "epoch": 0.024145319884425, "grad_norm": 0.392578125, "grad_norm_var": 0.002709054946899414, "learning_rate": 0.0001, "loss": 1.6455, "loss/crossentropy": 2.7418118715286255, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.19631459563970566, "step": 1617 }, { "epoch": 0.02416025205503998, "grad_norm": 0.453125, "grad_norm_var": 0.002630217870076497, "learning_rate": 0.0001, "loss": 1.7756, "loss/crossentropy": 2.547677159309387, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.21696195006370544, "step": 1618 }, { "epoch": 0.024175184225654962, "grad_norm": 0.455078125, "grad_norm_var": 0.0024499893188476562, "learning_rate": 0.0001, "loss": 1.7295, "loss/crossentropy": 2.486993432044983, "loss/fcd": 1.52734375, "loss/idx": 7.0, "loss/logits": 0.20217935740947723, "step": 1619 }, { "epoch": 0.024190116396269945, "grad_norm": 0.447265625, "grad_norm_var": 0.0022614638010660807, "learning_rate": 0.0001, "loss": 1.6641, "loss/crossentropy": 2.610501766204834, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.1914554387331009, "step": 1620 }, { "epoch": 0.024205048566884924, "grad_norm": 0.79296875, "grad_norm_var": 0.009172550837198893, "learning_rate": 0.0001, "loss": 1.8343, "loss/crossentropy": 2.6146063804626465, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.209325909614563, "step": 1621 }, { "epoch": 0.024219980737499907, "grad_norm": 0.46875, "grad_norm_var": 0.008937565485636394, "learning_rate": 0.0001, "loss": 1.8125, "loss/crossentropy": 2.703667640686035, "loss/fcd": 1.59375, "loss/idx": 7.0, "loss/logits": 0.2187758833169937, "step": 1622 }, { "epoch": 0.02423491290811489, "grad_norm": 0.458984375, "grad_norm_var": 0.008937565485636394, "learning_rate": 0.0001, "loss": 1.7127, "loss/crossentropy": 2.48435115814209, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.2049325406551361, "step": 1623 }, { "epoch": 0.02424984507872987, "grad_norm": 0.427734375, "grad_norm_var": 0.008532444636027018, "learning_rate": 0.0001, "loss": 1.4806, "loss/crossentropy": 2.588418126106262, "loss/fcd": 1.31640625, "loss/idx": 7.0, "loss/logits": 0.1641799509525299, "step": 1624 }, { "epoch": 0.02426477724934485, "grad_norm": 0.5, "grad_norm_var": 0.008570210138956705, "learning_rate": 0.0001, "loss": 1.5556, "loss/crossentropy": 2.860697865486145, "loss/fcd": 1.3828125, "loss/idx": 7.0, "loss/logits": 0.17278952151536942, "step": 1625 }, { "epoch": 0.02427970941995983, "grad_norm": 0.51171875, "grad_norm_var": 0.007974688212076824, "learning_rate": 0.0001, "loss": 1.776, "loss/crossentropy": 2.5550949573516846, "loss/fcd": 1.55078125, "loss/idx": 7.0, "loss/logits": 0.22521134465932846, "step": 1626 }, { "epoch": 0.024294641590574814, "grad_norm": 0.87109375, "grad_norm_var": 0.017206255594889322, "learning_rate": 0.0001, "loss": 1.8529, "loss/crossentropy": 2.4149361848831177, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.1966421902179718, "step": 1627 }, { "epoch": 0.024309573761189796, "grad_norm": 0.61328125, "grad_norm_var": 0.017204650243123374, "learning_rate": 0.0001, "loss": 1.8448, "loss/crossentropy": 2.2909621596336365, "loss/fcd": 1.62109375, "loss/idx": 7.0, "loss/logits": 0.22369665652513504, "step": 1628 }, { "epoch": 0.024324505931804775, "grad_norm": 0.447265625, "grad_norm_var": 0.01735528310139974, "learning_rate": 0.0001, "loss": 1.5968, "loss/crossentropy": 2.7739763259887695, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.18666711449623108, "step": 1629 }, { "epoch": 0.024339438102419758, "grad_norm": 0.5, "grad_norm_var": 0.0172426700592041, "learning_rate": 0.0001, "loss": 1.7953, "loss/crossentropy": 2.588120698928833, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2289050817489624, "step": 1630 }, { "epoch": 0.02435437027303474, "grad_norm": 0.408203125, "grad_norm_var": 0.01800370216369629, "learning_rate": 0.0001, "loss": 1.5356, "loss/crossentropy": 2.6186710596084595, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.17625487595796585, "step": 1631 }, { "epoch": 0.02436930244364972, "grad_norm": 0.8359375, "grad_norm_var": 0.024378315607706705, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.7367137670516968, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.20829469710588455, "step": 1632 }, { "epoch": 0.024384234614264703, "grad_norm": 0.447265625, "grad_norm_var": 0.02351581255594889, "learning_rate": 0.0001, "loss": 1.748, "loss/crossentropy": 2.6904454231262207, "loss/fcd": 1.515625, "loss/idx": 7.0, "loss/logits": 0.2323940247297287, "step": 1633 }, { "epoch": 0.024399166784879685, "grad_norm": 0.71875, "grad_norm_var": 0.02485171953837077, "learning_rate": 0.0001, "loss": 1.8111, "loss/crossentropy": 2.787333369255066, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.22514434158802032, "step": 1634 }, { "epoch": 0.024414098955494665, "grad_norm": 0.4453125, "grad_norm_var": 0.024989763895670574, "learning_rate": 0.0001, "loss": 1.5703, "loss/crossentropy": 2.5433605909347534, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.1679105907678604, "step": 1635 }, { "epoch": 0.024429031126109647, "grad_norm": 0.41796875, "grad_norm_var": 0.025467793146769207, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.6077455282211304, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.1736254319548607, "step": 1636 }, { "epoch": 0.02444396329672463, "grad_norm": 0.48046875, "grad_norm_var": 0.02161749203999837, "learning_rate": 0.0001, "loss": 1.7188, "loss/crossentropy": 2.7809062004089355, "loss/fcd": 1.51953125, "loss/idx": 7.0, "loss/logits": 0.1992294043302536, "step": 1637 }, { "epoch": 0.02445889546733961, "grad_norm": 0.45703125, "grad_norm_var": 0.021728881200154624, "learning_rate": 0.0001, "loss": 1.622, "loss/crossentropy": 2.7618008852005005, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.20014109462499619, "step": 1638 }, { "epoch": 0.024473827637954592, "grad_norm": 0.455078125, "grad_norm_var": 0.021768808364868164, "learning_rate": 0.0001, "loss": 1.6655, "loss/crossentropy": 2.623804807662964, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.1967843845486641, "step": 1639 }, { "epoch": 0.02448875980856957, "grad_norm": 0.462890625, "grad_norm_var": 0.02134995460510254, "learning_rate": 0.0001, "loss": 1.6911, "loss/crossentropy": 2.4398980140686035, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.20278441905975342, "step": 1640 }, { "epoch": 0.024503691979184554, "grad_norm": 0.4453125, "grad_norm_var": 0.021797672907511393, "learning_rate": 0.0001, "loss": 1.742, "loss/crossentropy": 2.558762311935425, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.21071960031986237, "step": 1641 }, { "epoch": 0.024518624149799537, "grad_norm": 0.466796875, "grad_norm_var": 0.02204736073811849, "learning_rate": 0.0001, "loss": 1.6663, "loss/crossentropy": 2.6475770473480225, "loss/fcd": 1.48828125, "loss/idx": 7.0, "loss/logits": 0.17798765748739243, "step": 1642 }, { "epoch": 0.024533556320414516, "grad_norm": 0.40234375, "grad_norm_var": 0.01443322499593099, "learning_rate": 0.0001, "loss": 1.4809, "loss/crossentropy": 2.598748207092285, "loss/fcd": 1.30859375, "loss/idx": 7.0, "loss/logits": 0.172304205596447, "step": 1643 }, { "epoch": 0.0245484884910295, "grad_norm": 0.458984375, "grad_norm_var": 0.01359569231669108, "learning_rate": 0.0001, "loss": 1.5578, "loss/crossentropy": 2.6750913858413696, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.1672048419713974, "step": 1644 }, { "epoch": 0.02456342066164448, "grad_norm": 0.61328125, "grad_norm_var": 0.014359029134114583, "learning_rate": 0.0001, "loss": 2.1439, "loss/crossentropy": 2.3801056146621704, "loss/fcd": 1.8671875, "loss/idx": 7.0, "loss/logits": 0.27668944001197815, "step": 1645 }, { "epoch": 0.02457835283225946, "grad_norm": 0.48046875, "grad_norm_var": 0.014385414123535157, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.666161894798279, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17872051894664764, "step": 1646 }, { "epoch": 0.024593285002874443, "grad_norm": 0.40625, "grad_norm_var": 0.014409494400024415, "learning_rate": 0.0001, "loss": 1.6064, "loss/crossentropy": 2.5731054544448853, "loss/fcd": 1.4296875, "loss/idx": 7.0, "loss/logits": 0.17671120166778564, "step": 1647 }, { "epoch": 0.024608217173489426, "grad_norm": 0.4375, "grad_norm_var": 0.006465387344360351, "learning_rate": 0.0001, "loss": 1.5815, "loss/crossentropy": 2.5400450229644775, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.1869390457868576, "step": 1648 }, { "epoch": 0.024623149344104405, "grad_norm": 0.4296875, "grad_norm_var": 0.006549072265625, "learning_rate": 0.0001, "loss": 1.6781, "loss/crossentropy": 2.5286325216293335, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.21328392624855042, "step": 1649 }, { "epoch": 0.024638081514719388, "grad_norm": 0.43359375, "grad_norm_var": 0.0023116429646809896, "learning_rate": 0.0001, "loss": 1.6307, "loss/crossentropy": 2.507314920425415, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.1893395036458969, "step": 1650 }, { "epoch": 0.024653013685334367, "grad_norm": 0.419921875, "grad_norm_var": 0.0023874759674072264, "learning_rate": 0.0001, "loss": 1.6497, "loss/crossentropy": 2.6344709396362305, "loss/fcd": 1.44140625, "loss/idx": 7.0, "loss/logits": 0.20828261971473694, "step": 1651 }, { "epoch": 0.02466794585594935, "grad_norm": 0.390625, "grad_norm_var": 0.0025663852691650392, "learning_rate": 0.0001, "loss": 1.6056, "loss/crossentropy": 2.3970154523849487, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1837209090590477, "step": 1652 }, { "epoch": 0.024682878026564332, "grad_norm": 0.51171875, "grad_norm_var": 0.0027438958485921225, "learning_rate": 0.0001, "loss": 1.7946, "loss/crossentropy": 2.633846640586853, "loss/fcd": 1.5546875, "loss/idx": 7.0, "loss/logits": 0.23988796770572662, "step": 1653 }, { "epoch": 0.02469781019717931, "grad_norm": 0.91015625, "grad_norm_var": 0.015731414159138996, "learning_rate": 0.0001, "loss": 1.7568, "loss/crossentropy": 2.801383137702942, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.22162751853466034, "step": 1654 }, { "epoch": 0.024712742367794294, "grad_norm": 0.44921875, "grad_norm_var": 0.015755208333333333, "learning_rate": 0.0001, "loss": 1.5121, "loss/crossentropy": 2.677790641784668, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.16054877638816833, "step": 1655 }, { "epoch": 0.024727674538409277, "grad_norm": 0.498046875, "grad_norm_var": 0.01574090321858724, "learning_rate": 0.0001, "loss": 1.5935, "loss/crossentropy": 2.7032641172409058, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.17940928041934967, "step": 1656 }, { "epoch": 0.024742606709024256, "grad_norm": 0.515625, "grad_norm_var": 0.01568139394124349, "learning_rate": 0.0001, "loss": 1.7322, "loss/crossentropy": 2.6352341175079346, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.22824689745903015, "step": 1657 }, { "epoch": 0.02475753887963924, "grad_norm": 0.51953125, "grad_norm_var": 0.015698989232381184, "learning_rate": 0.0001, "loss": 1.6144, "loss/crossentropy": 2.6113274097442627, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.1807587519288063, "step": 1658 }, { "epoch": 0.02477247105025422, "grad_norm": 0.53125, "grad_norm_var": 0.015191253026326497, "learning_rate": 0.0001, "loss": 1.9066, "loss/crossentropy": 2.7532520294189453, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.26209019124507904, "step": 1659 }, { "epoch": 0.0247874032208692, "grad_norm": 0.703125, "grad_norm_var": 0.017569478352864584, "learning_rate": 0.0001, "loss": 1.9952, "loss/crossentropy": 2.48457670211792, "loss/fcd": 1.75390625, "loss/idx": 7.0, "loss/logits": 0.2413245141506195, "step": 1660 }, { "epoch": 0.024802335391484184, "grad_norm": 0.3984375, "grad_norm_var": 0.017656898498535155, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.5506786108016968, "loss/fcd": 1.2421875, "loss/idx": 7.0, "loss/logits": 0.13581915944814682, "step": 1661 }, { "epoch": 0.024817267562099163, "grad_norm": 0.443359375, "grad_norm_var": 0.017850478490193684, "learning_rate": 0.0001, "loss": 1.6875, "loss/crossentropy": 2.6211588382720947, "loss/fcd": 1.48046875, "loss/idx": 7.0, "loss/logits": 0.20701348036527634, "step": 1662 }, { "epoch": 0.024832199732714146, "grad_norm": 0.43359375, "grad_norm_var": 0.017555856704711915, "learning_rate": 0.0001, "loss": 1.6533, "loss/crossentropy": 2.3935290575027466, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.18453489243984222, "step": 1663 }, { "epoch": 0.024847131903329128, "grad_norm": 0.447265625, "grad_norm_var": 0.017478370666503908, "learning_rate": 0.0001, "loss": 1.634, "loss/crossentropy": 2.560220956802368, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1809101179242134, "step": 1664 }, { "epoch": 0.024862064073944107, "grad_norm": 0.6015625, "grad_norm_var": 0.017663002014160156, "learning_rate": 0.0001, "loss": 1.8313, "loss/crossentropy": 2.6589072942733765, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2649317979812622, "step": 1665 }, { "epoch": 0.02487699624455909, "grad_norm": 0.458984375, "grad_norm_var": 0.01743467648824056, "learning_rate": 0.0001, "loss": 1.6906, "loss/crossentropy": 2.545647144317627, "loss/fcd": 1.484375, "loss/idx": 7.0, "loss/logits": 0.20625202357769012, "step": 1666 }, { "epoch": 0.024891928415174073, "grad_norm": 0.416015625, "grad_norm_var": 0.01748490333557129, "learning_rate": 0.0001, "loss": 1.5305, "loss/crossentropy": 2.803003668785095, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.17893870919942856, "step": 1667 }, { "epoch": 0.024906860585789052, "grad_norm": 0.5390625, "grad_norm_var": 0.016414626439412435, "learning_rate": 0.0001, "loss": 1.6153, "loss/crossentropy": 2.668912410736084, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.16222340613603592, "step": 1668 }, { "epoch": 0.024921792756404035, "grad_norm": 0.404296875, "grad_norm_var": 0.017305437723795572, "learning_rate": 0.0001, "loss": 1.5889, "loss/crossentropy": 2.715358018875122, "loss/fcd": 1.390625, "loss/idx": 7.0, "loss/logits": 0.19830583781003952, "step": 1669 }, { "epoch": 0.024936724927019017, "grad_norm": 0.451171875, "grad_norm_var": 0.006402317682902018, "learning_rate": 0.0001, "loss": 1.5415, "loss/crossentropy": 2.7629430294036865, "loss/fcd": 1.375, "loss/idx": 7.0, "loss/logits": 0.16647836565971375, "step": 1670 }, { "epoch": 0.024951657097633997, "grad_norm": 0.408203125, "grad_norm_var": 0.006720415751139323, "learning_rate": 0.0001, "loss": 1.4606, "loss/crossentropy": 2.637349843978882, "loss/fcd": 1.3046875, "loss/idx": 7.0, "loss/logits": 0.15592695027589798, "step": 1671 }, { "epoch": 0.02496658926824898, "grad_norm": 0.50390625, "grad_norm_var": 0.006732288996378581, "learning_rate": 0.0001, "loss": 1.6183, "loss/crossentropy": 2.631614923477173, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18078526109457016, "step": 1672 }, { "epoch": 0.024981521438863962, "grad_norm": 0.48828125, "grad_norm_var": 0.0066708723704020185, "learning_rate": 0.0001, "loss": 1.6249, "loss/crossentropy": 2.4787803888320923, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.17176026105880737, "step": 1673 }, { "epoch": 0.02499645360947894, "grad_norm": 0.458984375, "grad_norm_var": 0.006615193684895834, "learning_rate": 0.0001, "loss": 1.6468, "loss/crossentropy": 2.4326345920562744, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.1936572641134262, "step": 1674 }, { "epoch": 0.025011385780093924, "grad_norm": 0.482421875, "grad_norm_var": 0.006433598200480143, "learning_rate": 0.0001, "loss": 1.646, "loss/crossentropy": 2.4791451692581177, "loss/fcd": 1.45703125, "loss/idx": 7.0, "loss/logits": 0.1889389008283615, "step": 1675 }, { "epoch": 0.025026317950708903, "grad_norm": 0.74609375, "grad_norm_var": 0.007842111587524413, "learning_rate": 0.0001, "loss": 1.9245, "loss/crossentropy": 2.682641386985779, "loss/fcd": 1.68359375, "loss/idx": 7.0, "loss/logits": 0.24086102843284607, "step": 1676 }, { "epoch": 0.025041250121323886, "grad_norm": 0.42578125, "grad_norm_var": 0.007591104507446289, "learning_rate": 0.0001, "loss": 1.6518, "loss/crossentropy": 2.6392557621002197, "loss/fcd": 1.44921875, "loss/idx": 7.0, "loss/logits": 0.20261266082525253, "step": 1677 }, { "epoch": 0.02505618229193887, "grad_norm": 0.416015625, "grad_norm_var": 0.007778024673461914, "learning_rate": 0.0001, "loss": 1.6168, "loss/crossentropy": 2.868458390235901, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.1949130743741989, "step": 1678 }, { "epoch": 0.025071114462553848, "grad_norm": 0.55078125, "grad_norm_var": 0.007909631729125977, "learning_rate": 0.0001, "loss": 1.6456, "loss/crossentropy": 2.4708521366119385, "loss/fcd": 1.47265625, "loss/idx": 7.0, "loss/logits": 0.17297638207674026, "step": 1679 }, { "epoch": 0.02508604663316883, "grad_norm": 0.5078125, "grad_norm_var": 0.007814534505208333, "learning_rate": 0.0001, "loss": 1.7805, "loss/crossentropy": 2.543753504753113, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.214086152613163, "step": 1680 }, { "epoch": 0.025100978803783813, "grad_norm": 0.51171875, "grad_norm_var": 0.006997108459472656, "learning_rate": 0.0001, "loss": 1.625, "loss/crossentropy": 2.4634329080581665, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.20312698930501938, "step": 1681 }, { "epoch": 0.025115910974398793, "grad_norm": 0.609375, "grad_norm_var": 0.007877079645792644, "learning_rate": 0.0001, "loss": 1.8753, "loss/crossentropy": 2.5432770252227783, "loss/fcd": 1.65234375, "loss/idx": 7.0, "loss/logits": 0.22290775179862976, "step": 1682 }, { "epoch": 0.025130843145013775, "grad_norm": 0.451171875, "grad_norm_var": 0.007584110895792643, "learning_rate": 0.0001, "loss": 1.6209, "loss/crossentropy": 2.307678699493408, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.18343796581029892, "step": 1683 }, { "epoch": 0.025145775315628758, "grad_norm": 0.50390625, "grad_norm_var": 0.007465092341105143, "learning_rate": 0.0001, "loss": 1.8394, "loss/crossentropy": 2.988747477531433, "loss/fcd": 1.61328125, "loss/idx": 7.0, "loss/logits": 0.22612474113702774, "step": 1684 }, { "epoch": 0.025160707486243737, "grad_norm": 0.38671875, "grad_norm_var": 0.007696978251139323, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.45614755153656, "loss/fcd": 1.359375, "loss/idx": 7.0, "loss/logits": 0.1672608107328415, "step": 1685 }, { "epoch": 0.02517563965685872, "grad_norm": 0.458984375, "grad_norm_var": 0.007656288146972656, "learning_rate": 0.0001, "loss": 1.884, "loss/crossentropy": 2.2561362981796265, "loss/fcd": 1.625, "loss/idx": 7.0, "loss/logits": 0.25896310061216354, "step": 1686 }, { "epoch": 0.0251905718274737, "grad_norm": 0.45703125, "grad_norm_var": 0.007244221369425456, "learning_rate": 0.0001, "loss": 1.5452, "loss/crossentropy": 2.6483774185180664, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.16627098619937897, "step": 1687 }, { "epoch": 0.025205503998088682, "grad_norm": 0.5, "grad_norm_var": 0.00724180539449056, "learning_rate": 0.0001, "loss": 1.8459, "loss/crossentropy": 2.4387799501419067, "loss/fcd": 1.5859375, "loss/idx": 7.0, "loss/logits": 0.2599783390760422, "step": 1688 }, { "epoch": 0.025220436168703664, "grad_norm": 0.55078125, "grad_norm_var": 0.007411686579386393, "learning_rate": 0.0001, "loss": 1.9159, "loss/crossentropy": 2.43993878364563, "loss/fcd": 1.65625, "loss/idx": 7.0, "loss/logits": 0.2596488744020462, "step": 1689 }, { "epoch": 0.025235368339318644, "grad_norm": 0.396484375, "grad_norm_var": 0.008006779352823894, "learning_rate": 0.0001, "loss": 1.5128, "loss/crossentropy": 2.69704270362854, "loss/fcd": 1.34375, "loss/idx": 7.0, "loss/logits": 0.16903749108314514, "step": 1690 }, { "epoch": 0.025250300509933626, "grad_norm": 0.546875, "grad_norm_var": 0.008139483133951823, "learning_rate": 0.0001, "loss": 1.6848, "loss/crossentropy": 2.6188812255859375, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.18483464419841766, "step": 1691 }, { "epoch": 0.02526523268054861, "grad_norm": 0.412109375, "grad_norm_var": 0.0042065779368082685, "learning_rate": 0.0001, "loss": 1.6088, "loss/crossentropy": 2.7869977951049805, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.19087275862693787, "step": 1692 }, { "epoch": 0.02528016485116359, "grad_norm": 0.6015625, "grad_norm_var": 0.0048588911692301435, "learning_rate": 0.0001, "loss": 1.6886, "loss/crossentropy": 2.6109875440597534, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.2120508998632431, "step": 1693 }, { "epoch": 0.02529509702177857, "grad_norm": 0.51171875, "grad_norm_var": 0.004470252990722656, "learning_rate": 0.0001, "loss": 1.5881, "loss/crossentropy": 2.454450845718384, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.16236257553100586, "step": 1694 }, { "epoch": 0.025310029192393554, "grad_norm": 0.41796875, "grad_norm_var": 0.004625892639160157, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.6954513788223267, "loss/fcd": 1.26953125, "loss/idx": 7.0, "loss/logits": 0.15342209488153458, "step": 1695 }, { "epoch": 0.025324961363008533, "grad_norm": 0.466796875, "grad_norm_var": 0.004628229141235352, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.597328305244446, "loss/fcd": 1.40234375, "loss/idx": 7.0, "loss/logits": 0.17098169773817062, "step": 1696 }, { "epoch": 0.025339893533623516, "grad_norm": 0.47265625, "grad_norm_var": 0.004591989517211914, "learning_rate": 0.0001, "loss": 1.6754, "loss/crossentropy": 2.6136374473571777, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.1988101452589035, "step": 1697 }, { "epoch": 0.0253548257042385, "grad_norm": 0.47265625, "grad_norm_var": 0.0034749190012613933, "learning_rate": 0.0001, "loss": 1.6458, "loss/crossentropy": 2.4211668968200684, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.192709781229496, "step": 1698 }, { "epoch": 0.025369757874853478, "grad_norm": 0.48828125, "grad_norm_var": 0.0034407933553059896, "learning_rate": 0.0001, "loss": 1.8869, "loss/crossentropy": 2.555441379547119, "loss/fcd": 1.64453125, "loss/idx": 7.0, "loss/logits": 0.24241416156291962, "step": 1699 }, { "epoch": 0.02538469004546846, "grad_norm": 0.453125, "grad_norm_var": 0.0034250895182291667, "learning_rate": 0.0001, "loss": 1.6706, "loss/crossentropy": 2.854443907737732, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.2057504653930664, "step": 1700 }, { "epoch": 0.02539962221608344, "grad_norm": 0.63671875, "grad_norm_var": 0.004401652018229166, "learning_rate": 0.0001, "loss": 1.8133, "loss/crossentropy": 2.7695144414901733, "loss/fcd": 1.56640625, "loss/idx": 7.0, "loss/logits": 0.2468656525015831, "step": 1701 }, { "epoch": 0.025414554386698422, "grad_norm": 0.42578125, "grad_norm_var": 0.004608901341756185, "learning_rate": 0.0001, "loss": 1.6345, "loss/crossentropy": 2.57673442363739, "loss/fcd": 1.4453125, "loss/idx": 7.0, "loss/logits": 0.18916824460029602, "step": 1702 }, { "epoch": 0.025429486557313405, "grad_norm": 0.439453125, "grad_norm_var": 0.00470116933186849, "learning_rate": 0.0001, "loss": 1.6128, "loss/crossentropy": 2.578126311302185, "loss/fcd": 1.42578125, "loss/idx": 7.0, "loss/logits": 0.1870439350605011, "step": 1703 }, { "epoch": 0.025444418727928384, "grad_norm": 0.51953125, "grad_norm_var": 0.004758707682291667, "learning_rate": 0.0001, "loss": 1.6175, "loss/crossentropy": 2.347869634628296, "loss/fcd": 1.453125, "loss/idx": 7.0, "loss/logits": 0.164364293217659, "step": 1704 }, { "epoch": 0.025459350898543367, "grad_norm": 0.455078125, "grad_norm_var": 0.004533624649047852, "learning_rate": 0.0001, "loss": 1.733, "loss/crossentropy": 2.692792534828186, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.22913546860218048, "step": 1705 }, { "epoch": 0.02547428306915835, "grad_norm": 0.6875, "grad_norm_var": 0.006496938069661459, "learning_rate": 0.0001, "loss": 1.8966, "loss/crossentropy": 2.581258535385132, "loss/fcd": 1.671875, "loss/idx": 7.0, "loss/logits": 0.22475934028625488, "step": 1706 }, { "epoch": 0.02548921523977333, "grad_norm": 0.41796875, "grad_norm_var": 0.00673821767171224, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.708857297897339, "loss/fcd": 1.37109375, "loss/idx": 7.0, "loss/logits": 0.18387839198112488, "step": 1707 }, { "epoch": 0.02550414741038831, "grad_norm": 0.46875, "grad_norm_var": 0.006332127253214518, "learning_rate": 0.0001, "loss": 1.5831, "loss/crossentropy": 2.8452978134155273, "loss/fcd": 1.3984375, "loss/idx": 7.0, "loss/logits": 0.18468762934207916, "step": 1708 }, { "epoch": 0.025519079581003294, "grad_norm": 0.41015625, "grad_norm_var": 0.005927133560180664, "learning_rate": 0.0001, "loss": 1.684, "loss/crossentropy": 2.57491397857666, "loss/fcd": 1.46875, "loss/idx": 7.0, "loss/logits": 0.21522057056427002, "step": 1709 }, { "epoch": 0.025534011751618273, "grad_norm": 0.4765625, "grad_norm_var": 0.005874490737915039, "learning_rate": 0.0001, "loss": 1.7576, "loss/crossentropy": 2.5990965366363525, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.22246869653463364, "step": 1710 }, { "epoch": 0.025548943922233256, "grad_norm": 0.423828125, "grad_norm_var": 0.005826759338378906, "learning_rate": 0.0001, "loss": 1.6086, "loss/crossentropy": 2.581049919128418, "loss/fcd": 1.421875, "loss/idx": 7.0, "loss/logits": 0.18676753342151642, "step": 1711 }, { "epoch": 0.025563876092848235, "grad_norm": 0.478515625, "grad_norm_var": 0.005811309814453125, "learning_rate": 0.0001, "loss": 1.6942, "loss/crossentropy": 2.5850048065185547, "loss/fcd": 1.5, "loss/idx": 7.0, "loss/logits": 0.1942325085401535, "step": 1712 }, { "epoch": 0.025578808263463218, "grad_norm": 0.43359375, "grad_norm_var": 0.0059600830078125, "learning_rate": 0.0001, "loss": 1.592, "loss/crossentropy": 2.736702561378479, "loss/fcd": 1.40625, "loss/idx": 7.0, "loss/logits": 0.1857914999127388, "step": 1713 }, { "epoch": 0.0255937404340782, "grad_norm": 0.4765625, "grad_norm_var": 0.005956967671712239, "learning_rate": 0.0001, "loss": 1.3808, "loss/crossentropy": 2.79103684425354, "loss/fcd": 1.234375, "loss/idx": 7.0, "loss/logits": 0.14638280868530273, "step": 1714 }, { "epoch": 0.02560867260469318, "grad_norm": 0.4375, "grad_norm_var": 0.00606689453125, "learning_rate": 0.0001, "loss": 1.5415, "loss/crossentropy": 2.5501586198806763, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.16254615038633347, "step": 1715 }, { "epoch": 0.025623604775308163, "grad_norm": 0.47265625, "grad_norm_var": 0.00602715810139974, "learning_rate": 0.0001, "loss": 1.5126, "loss/crossentropy": 2.6981340646743774, "loss/fcd": 1.33984375, "loss/idx": 7.0, "loss/logits": 0.17272990942001343, "step": 1716 }, { "epoch": 0.025638536945923145, "grad_norm": 0.53125, "grad_norm_var": 0.0045010884602864586, "learning_rate": 0.0001, "loss": 1.7696, "loss/crossentropy": 2.4307806491851807, "loss/fcd": 1.5703125, "loss/idx": 7.0, "loss/logits": 0.19927946478128433, "step": 1717 }, { "epoch": 0.025653469116538125, "grad_norm": 0.53125, "grad_norm_var": 0.00454400380452474, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.473902702331543, "loss/fcd": 1.37109375, "loss/idx": 7.0, "loss/logits": 0.1577570140361786, "step": 1718 }, { "epoch": 0.025668401287153107, "grad_norm": 0.404296875, "grad_norm_var": 0.004805501302083333, "learning_rate": 0.0001, "loss": 1.6124, "loss/crossentropy": 2.475058436393738, "loss/fcd": 1.41015625, "loss/idx": 7.0, "loss/logits": 0.20222157984972, "step": 1719 }, { "epoch": 0.02568333345776809, "grad_norm": 0.390625, "grad_norm_var": 0.00510552724202474, "learning_rate": 0.0001, "loss": 1.5999, "loss/crossentropy": 2.394508481025696, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.18192025274038315, "step": 1720 }, { "epoch": 0.02569826562838307, "grad_norm": 0.5, "grad_norm_var": 0.005151224136352539, "learning_rate": 0.0001, "loss": 1.6383, "loss/crossentropy": 2.6393444538116455, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.20466701686382294, "step": 1721 }, { "epoch": 0.025713197798998052, "grad_norm": 0.439453125, "grad_norm_var": 0.001846758524576823, "learning_rate": 0.0001, "loss": 1.6449, "loss/crossentropy": 2.59916889667511, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.20742526650428772, "step": 1722 }, { "epoch": 0.02572812996961303, "grad_norm": 0.376953125, "grad_norm_var": 0.0021588484446207683, "learning_rate": 0.0001, "loss": 1.5319, "loss/crossentropy": 2.5505361557006836, "loss/fcd": 1.3671875, "loss/idx": 7.0, "loss/logits": 0.16474661231040955, "step": 1723 }, { "epoch": 0.025743062140228014, "grad_norm": 0.439453125, "grad_norm_var": 0.002151934305826823, "learning_rate": 0.0001, "loss": 1.7384, "loss/crossentropy": 2.5282329320907593, "loss/fcd": 1.53125, "loss/idx": 7.0, "loss/logits": 0.2071186602115631, "step": 1724 }, { "epoch": 0.025757994310842997, "grad_norm": 0.6328125, "grad_norm_var": 0.004025522867838542, "learning_rate": 0.0001, "loss": 1.8401, "loss/crossentropy": 3.063611149787903, "loss/fcd": 1.609375, "loss/idx": 7.0, "loss/logits": 0.2307065650820732, "step": 1725 }, { "epoch": 0.025772926481457976, "grad_norm": 0.44140625, "grad_norm_var": 0.004050127665201823, "learning_rate": 0.0001, "loss": 1.6793, "loss/crossentropy": 2.620327115058899, "loss/fcd": 1.4765625, "loss/idx": 7.0, "loss/logits": 0.20271999388933182, "step": 1726 }, { "epoch": 0.02578785865207296, "grad_norm": 0.4296875, "grad_norm_var": 0.004021565119425456, "learning_rate": 0.0001, "loss": 1.5943, "loss/crossentropy": 2.5267714262008667, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.19974888116121292, "step": 1727 }, { "epoch": 0.02580279082268794, "grad_norm": 0.359375, "grad_norm_var": 0.004670206705729167, "learning_rate": 0.0001, "loss": 1.4743, "loss/crossentropy": 2.359518885612488, "loss/fcd": 1.3125, "loss/idx": 7.0, "loss/logits": 0.16182270646095276, "step": 1728 }, { "epoch": 0.02581772299330292, "grad_norm": 0.5234375, "grad_norm_var": 0.00490563710530599, "learning_rate": 0.0001, "loss": 1.6289, "loss/crossentropy": 2.655366063117981, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.19535569846630096, "step": 1729 }, { "epoch": 0.025832655163917903, "grad_norm": 0.4765625, "grad_norm_var": 0.00490563710530599, "learning_rate": 0.0001, "loss": 1.6928, "loss/crossentropy": 2.4279606342315674, "loss/fcd": 1.50390625, "loss/idx": 7.0, "loss/logits": 0.18886201083660126, "step": 1730 }, { "epoch": 0.025847587334532886, "grad_norm": 0.62109375, "grad_norm_var": 0.006420644124348959, "learning_rate": 0.0001, "loss": 1.7304, "loss/crossentropy": 2.588435649871826, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.19522760808467865, "step": 1731 }, { "epoch": 0.025862519505147865, "grad_norm": 0.423828125, "grad_norm_var": 0.006572834650675456, "learning_rate": 0.0001, "loss": 1.5089, "loss/crossentropy": 2.426050543785095, "loss/fcd": 1.34375, "loss/idx": 7.0, "loss/logits": 0.16515402495861053, "step": 1732 }, { "epoch": 0.025877451675762848, "grad_norm": 0.43359375, "grad_norm_var": 0.006372563044230143, "learning_rate": 0.0001, "loss": 1.5332, "loss/crossentropy": 2.7584487199783325, "loss/fcd": 1.35546875, "loss/idx": 7.0, "loss/logits": 0.17777415364980698, "step": 1733 }, { "epoch": 0.02589238384637783, "grad_norm": 0.41796875, "grad_norm_var": 0.006158685684204102, "learning_rate": 0.0001, "loss": 1.6417, "loss/crossentropy": 2.219975471496582, "loss/fcd": 1.4375, "loss/idx": 7.0, "loss/logits": 0.20416638255119324, "step": 1734 }, { "epoch": 0.02590731601699281, "grad_norm": 0.423828125, "grad_norm_var": 0.00604551633199056, "learning_rate": 0.0001, "loss": 1.5733, "loss/crossentropy": 2.7796677350997925, "loss/fcd": 1.39453125, "loss/idx": 7.0, "loss/logits": 0.17880570888519287, "step": 1735 }, { "epoch": 0.025922248187607792, "grad_norm": 0.421875, "grad_norm_var": 0.0058252811431884766, "learning_rate": 0.0001, "loss": 1.6237, "loss/crossentropy": 2.6636239290237427, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.19006356596946716, "step": 1736 }, { "epoch": 0.02593718035822277, "grad_norm": 0.388671875, "grad_norm_var": 0.00600738525390625, "learning_rate": 0.0001, "loss": 1.4821, "loss/crossentropy": 2.5613255500793457, "loss/fcd": 1.32421875, "loss/idx": 7.0, "loss/logits": 0.15791697800159454, "step": 1737 }, { "epoch": 0.025952112528837754, "grad_norm": 0.435546875, "grad_norm_var": 0.006015459696451823, "learning_rate": 0.0001, "loss": 1.5669, "loss/crossentropy": 2.7683653831481934, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.1879609152674675, "step": 1738 }, { "epoch": 0.025967044699452737, "grad_norm": 0.484375, "grad_norm_var": 0.005649169286092122, "learning_rate": 0.0001, "loss": 1.6172, "loss/crossentropy": 2.6315919160842896, "loss/fcd": 1.43359375, "loss/idx": 7.0, "loss/logits": 0.18363645672798157, "step": 1739 }, { "epoch": 0.025981976870067716, "grad_norm": 0.453125, "grad_norm_var": 0.005624135335286458, "learning_rate": 0.0001, "loss": 1.6165, "loss/crossentropy": 2.885563850402832, "loss/fcd": 1.4140625, "loss/idx": 7.0, "loss/logits": 0.20242498070001602, "step": 1740 }, { "epoch": 0.0259969090406827, "grad_norm": 0.453125, "grad_norm_var": 0.0035125732421875, "learning_rate": 0.0001, "loss": 1.5578, "loss/crossentropy": 2.66191029548645, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.17110195010900497, "step": 1741 }, { "epoch": 0.02601184121129768, "grad_norm": 0.380859375, "grad_norm_var": 0.003804763158162435, "learning_rate": 0.0001, "loss": 1.516, "loss/crossentropy": 2.490989565849304, "loss/fcd": 1.3515625, "loss/idx": 7.0, "loss/logits": 0.1644165813922882, "step": 1742 }, { "epoch": 0.02602677338191266, "grad_norm": 0.42578125, "grad_norm_var": 0.003813918431599935, "learning_rate": 0.0001, "loss": 1.6632, "loss/crossentropy": 2.6029820442199707, "loss/fcd": 1.46484375, "loss/idx": 7.0, "loss/logits": 0.19833851605653763, "step": 1743 }, { "epoch": 0.026041705552527643, "grad_norm": 0.474609375, "grad_norm_var": 0.003325335184733073, "learning_rate": 0.0001, "loss": 1.6439, "loss/crossentropy": 2.4745417833328247, "loss/fcd": 1.4609375, "loss/idx": 7.0, "loss/logits": 0.18294240534305573, "step": 1744 }, { "epoch": 0.026056637723142626, "grad_norm": 0.447265625, "grad_norm_var": 0.002966419855753581, "learning_rate": 0.0001, "loss": 1.7305, "loss/crossentropy": 2.6282153129577637, "loss/fcd": 1.53515625, "loss/idx": 7.0, "loss/logits": 0.1953415721654892, "step": 1745 }, { "epoch": 0.026071569893757605, "grad_norm": 0.7734375, "grad_norm_var": 0.009620014826456707, "learning_rate": 0.0001, "loss": 2.1886, "loss/crossentropy": 2.5288267135620117, "loss/fcd": 1.87890625, "loss/idx": 7.0, "loss/logits": 0.3096487820148468, "step": 1746 }, { "epoch": 0.026086502064372588, "grad_norm": 0.43359375, "grad_norm_var": 0.007944599787394205, "learning_rate": 0.0001, "loss": 1.5432, "loss/crossentropy": 2.5799275636672974, "loss/fcd": 1.3671875, "loss/idx": 7.0, "loss/logits": 0.1759936362504959, "step": 1747 }, { "epoch": 0.026101434234987567, "grad_norm": 0.455078125, "grad_norm_var": 0.007877969741821289, "learning_rate": 0.0001, "loss": 1.5568, "loss/crossentropy": 2.489788770675659, "loss/fcd": 1.38671875, "loss/idx": 7.0, "loss/logits": 0.17003405839204788, "step": 1748 }, { "epoch": 0.02611636640560255, "grad_norm": 0.54296875, "grad_norm_var": 0.008292754491170248, "learning_rate": 0.0001, "loss": 1.6785, "loss/crossentropy": 2.467368721961975, "loss/fcd": 1.4921875, "loss/idx": 7.0, "loss/logits": 0.1863422393798828, "step": 1749 }, { "epoch": 0.026131298576217533, "grad_norm": 0.376953125, "grad_norm_var": 0.008645566304524739, "learning_rate": 0.0001, "loss": 1.5403, "loss/crossentropy": 2.6278090476989746, "loss/fcd": 1.36328125, "loss/idx": 7.0, "loss/logits": 0.17704641819000244, "step": 1750 }, { "epoch": 0.026146230746832512, "grad_norm": 0.416015625, "grad_norm_var": 0.008687782287597656, "learning_rate": 0.0001, "loss": 1.5574, "loss/crossentropy": 2.709711790084839, "loss/fcd": 1.37890625, "loss/idx": 7.0, "loss/logits": 0.17853021621704102, "step": 1751 }, { "epoch": 0.026161162917447495, "grad_norm": 0.423828125, "grad_norm_var": 0.008678038914998373, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.553797960281372, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.1872834861278534, "step": 1752 }, { "epoch": 0.026176095088062477, "grad_norm": 0.421875, "grad_norm_var": 0.008429718017578126, "learning_rate": 0.0001, "loss": 1.7212, "loss/crossentropy": 2.6268469095230103, "loss/fcd": 1.5078125, "loss/idx": 7.0, "loss/logits": 0.213436096906662, "step": 1753 }, { "epoch": 0.026191027258677457, "grad_norm": 0.4296875, "grad_norm_var": 0.008452844619750977, "learning_rate": 0.0001, "loss": 1.7586, "loss/crossentropy": 2.3185973167419434, "loss/fcd": 1.54296875, "loss/idx": 7.0, "loss/logits": 0.2156294584274292, "step": 1754 }, { "epoch": 0.02620595942929244, "grad_norm": 0.44921875, "grad_norm_var": 0.008425378799438476, "learning_rate": 0.0001, "loss": 1.7854, "loss/crossentropy": 2.4275808334350586, "loss/fcd": 1.55859375, "loss/idx": 7.0, "loss/logits": 0.22683294117450714, "step": 1755 }, { "epoch": 0.026220891599907422, "grad_norm": 0.45703125, "grad_norm_var": 0.00842283566792806, "learning_rate": 0.0001, "loss": 1.59, "loss/crossentropy": 2.330631732940674, "loss/fcd": 1.41796875, "loss/idx": 7.0, "loss/logits": 0.17204724997282028, "step": 1756 }, { "epoch": 0.0262358237705224, "grad_norm": 0.275390625, "grad_norm_var": 0.01056207021077474, "learning_rate": 0.0001, "loss": 1.5004, "loss/crossentropy": 2.5538136959075928, "loss/fcd": 1.34375, "loss/idx": 7.25, "loss/logits": 0.15666767954826355, "step": 1757 }, { "epoch": 0.026250755941137384, "grad_norm": 0.2490234375, "grad_norm_var": 0.01284570296605428, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.707225203514099, "loss/fcd": 1.20703125, "loss/idx": 7.5, "loss/logits": 0.17551743984222412, "step": 1758 }, { "epoch": 0.026265688111752363, "grad_norm": 0.353515625, "grad_norm_var": 0.013316182295481364, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.4359453916549683, "loss/fcd": 1.4453125, "loss/idx": 7.5, "loss/logits": 0.20481543242931366, "step": 1759 }, { "epoch": 0.026280620282367346, "grad_norm": 0.34375, "grad_norm_var": 0.013716598351796469, "learning_rate": 0.0001, "loss": 1.7616, "loss/crossentropy": 2.471827507019043, "loss/fcd": 1.515625, "loss/idx": 7.5, "loss/logits": 0.24594175815582275, "step": 1760 }, { "epoch": 0.02629555245298233, "grad_norm": 0.28515625, "grad_norm_var": 0.01494350035985311, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.588927745819092, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.15835162997245789, "step": 1761 }, { "epoch": 0.026310484623597308, "grad_norm": 0.294921875, "grad_norm_var": 0.006571034590403239, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.506476879119873, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.19668720662593842, "step": 1762 }, { "epoch": 0.02632541679421229, "grad_norm": 0.31640625, "grad_norm_var": 0.0067169467608133955, "learning_rate": 0.0001, "loss": 1.6674, "loss/crossentropy": 2.1519944071769714, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.2064507007598877, "step": 1763 }, { "epoch": 0.026340348964827273, "grad_norm": 0.322265625, "grad_norm_var": 0.0065018614133199055, "learning_rate": 0.0001, "loss": 1.6031, "loss/crossentropy": 2.115912914276123, "loss/fcd": 1.40625, "loss/idx": 7.5, "loss/logits": 0.19682101160287857, "step": 1764 }, { "epoch": 0.026355281135442252, "grad_norm": 0.294921875, "grad_norm_var": 0.004705297946929932, "learning_rate": 0.0001, "loss": 1.4445, "loss/crossentropy": 2.650931715965271, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.16712762415409088, "step": 1765 }, { "epoch": 0.026370213306057235, "grad_norm": 0.546875, "grad_norm_var": 0.00696483850479126, "learning_rate": 0.0001, "loss": 1.827, "loss/crossentropy": 2.122111737728119, "loss/fcd": 1.6015625, "loss/idx": 7.5, "loss/logits": 0.2254578173160553, "step": 1766 }, { "epoch": 0.026385145476672218, "grad_norm": 0.267578125, "grad_norm_var": 0.007381594181060791, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.469594359397888, "loss/fcd": 1.31640625, "loss/idx": 7.5, "loss/logits": 0.18926545977592468, "step": 1767 }, { "epoch": 0.026400077647287197, "grad_norm": 0.263671875, "grad_norm_var": 0.007583614190419515, "learning_rate": 0.0001, "loss": 1.3253, "loss/crossentropy": 2.4640984535217285, "loss/fcd": 1.171875, "loss/idx": 7.5, "loss/logits": 0.15345098823308945, "step": 1768 }, { "epoch": 0.02641500981790218, "grad_norm": 0.6484375, "grad_norm_var": 0.013017205397288005, "learning_rate": 0.0001, "loss": 1.9709, "loss/crossentropy": 2.5803475379943848, "loss/fcd": 1.71875, "loss/idx": 7.5, "loss/logits": 0.2521095424890518, "step": 1769 }, { "epoch": 0.026429941988517162, "grad_norm": 0.306640625, "grad_norm_var": 0.012858990828196208, "learning_rate": 0.0001, "loss": 1.5404, "loss/crossentropy": 2.5042024850845337, "loss/fcd": 1.33984375, "loss/idx": 7.5, "loss/logits": 0.20056728273630142, "step": 1770 }, { "epoch": 0.02644487415913214, "grad_norm": 0.30078125, "grad_norm_var": 0.01236492395401001, "learning_rate": 0.0001, "loss": 1.4027, "loss/crossentropy": 2.4148744344711304, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1605551317334175, "step": 1771 }, { "epoch": 0.026459806329747124, "grad_norm": 0.26953125, "grad_norm_var": 0.01177135705947876, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.6074177026748657, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.16225765645503998, "step": 1772 }, { "epoch": 0.026474738500362104, "grad_norm": 0.291015625, "grad_norm_var": 0.011665181318918864, "learning_rate": 0.0001, "loss": 1.4159, "loss/crossentropy": 2.6354998350143433, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1736658662557602, "step": 1773 }, { "epoch": 0.026489670670977086, "grad_norm": 0.314453125, "grad_norm_var": 0.011185693740844726, "learning_rate": 0.0001, "loss": 1.4989, "loss/crossentropy": 2.5636308193206787, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.1903422474861145, "step": 1774 }, { "epoch": 0.02650460284159207, "grad_norm": 0.296875, "grad_norm_var": 0.01127465565999349, "learning_rate": 0.0001, "loss": 1.5055, "loss/crossentropy": 2.777296304702759, "loss/fcd": 1.31640625, "loss/idx": 7.5, "loss/logits": 0.18906784802675247, "step": 1775 }, { "epoch": 0.026519535012207048, "grad_norm": 0.318359375, "grad_norm_var": 0.011286020278930664, "learning_rate": 0.0001, "loss": 1.6954, "loss/crossentropy": 2.39978289604187, "loss/fcd": 1.43359375, "loss/idx": 7.5, "loss/logits": 0.26185186207294464, "step": 1776 }, { "epoch": 0.02653446718282203, "grad_norm": 0.306640625, "grad_norm_var": 0.011176045735677083, "learning_rate": 0.0001, "loss": 1.4712, "loss/crossentropy": 2.3673434257507324, "loss/fcd": 1.30078125, "loss/idx": 7.5, "loss/logits": 0.17040134966373444, "step": 1777 }, { "epoch": 0.026549399353437014, "grad_norm": 0.32421875, "grad_norm_var": 0.011073287328084309, "learning_rate": 0.0001, "loss": 1.5825, "loss/crossentropy": 2.438999891281128, "loss/fcd": 1.38671875, "loss/idx": 7.5, "loss/logits": 0.1957414448261261, "step": 1778 }, { "epoch": 0.026564331524051993, "grad_norm": 0.318359375, "grad_norm_var": 0.011068216959635417, "learning_rate": 0.0001, "loss": 1.4316, "loss/crossentropy": 2.551382541656494, "loss/fcd": 1.2578125, "loss/idx": 7.5, "loss/logits": 0.1737765148282051, "step": 1779 }, { "epoch": 0.026579263694666976, "grad_norm": 0.349609375, "grad_norm_var": 0.011061541239420573, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.7158048152923584, "loss/fcd": 1.38671875, "loss/idx": 7.5, "loss/logits": 0.19656657427549362, "step": 1780 }, { "epoch": 0.026594195865281958, "grad_norm": 1.3359375, "grad_norm_var": 0.07272782325744628, "learning_rate": 0.0001, "loss": 2.2707, "loss/crossentropy": 2.624470829963684, "loss/fcd": 1.8203125, "loss/idx": 7.5, "loss/logits": 0.4503566026687622, "step": 1781 }, { "epoch": 0.026609128035896937, "grad_norm": 0.359375, "grad_norm_var": 0.07134537696838379, "learning_rate": 0.0001, "loss": 1.5925, "loss/crossentropy": 2.7444372177124023, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.209727481007576, "step": 1782 }, { "epoch": 0.02662406020651192, "grad_norm": 0.310546875, "grad_norm_var": 0.07074812253316244, "learning_rate": 0.0001, "loss": 1.5231, "loss/crossentropy": 2.5853861570358276, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.19111276417970657, "step": 1783 }, { "epoch": 0.0266389923771269, "grad_norm": 0.314453125, "grad_norm_var": 0.07002243995666504, "learning_rate": 0.0001, "loss": 1.6429, "loss/crossentropy": 2.1288896799087524, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.18192436546087265, "step": 1784 }, { "epoch": 0.026653924547741882, "grad_norm": 0.26171875, "grad_norm_var": 0.06644730567932129, "learning_rate": 0.0001, "loss": 1.3741, "loss/crossentropy": 2.598435163497925, "loss/fcd": 1.21484375, "loss/idx": 7.5, "loss/logits": 0.15930332243442535, "step": 1785 }, { "epoch": 0.026668856718356865, "grad_norm": 0.73046875, "grad_norm_var": 0.0738870620727539, "learning_rate": 0.0001, "loss": 1.798, "loss/crossentropy": 2.725283145904541, "loss/fcd": 1.51171875, "loss/idx": 7.5, "loss/logits": 0.2863228842616081, "step": 1786 }, { "epoch": 0.026683788888971844, "grad_norm": 0.287109375, "grad_norm_var": 0.07407987912495931, "learning_rate": 0.0001, "loss": 1.4838, "loss/crossentropy": 2.539591908454895, "loss/fcd": 1.30078125, "loss/idx": 7.5, "loss/logits": 0.18304357677698135, "step": 1787 }, { "epoch": 0.026698721059586827, "grad_norm": 0.287109375, "grad_norm_var": 0.07379506429036459, "learning_rate": 0.0001, "loss": 1.4466, "loss/crossentropy": 2.545486330986023, "loss/fcd": 1.2734375, "loss/idx": 7.5, "loss/logits": 0.17317884415388107, "step": 1788 }, { "epoch": 0.02671365323020181, "grad_norm": 0.322265625, "grad_norm_var": 0.07340037027994792, "learning_rate": 0.0001, "loss": 1.5762, "loss/crossentropy": 2.6705424785614014, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.1934109628200531, "step": 1789 }, { "epoch": 0.02672858540081679, "grad_norm": 0.365234375, "grad_norm_var": 0.07296644846598307, "learning_rate": 0.0001, "loss": 1.6593, "loss/crossentropy": 2.9419108629226685, "loss/fcd": 1.4296875, "loss/idx": 7.5, "loss/logits": 0.22962473332881927, "step": 1790 }, { "epoch": 0.02674351757143177, "grad_norm": 0.306640625, "grad_norm_var": 0.07283094724019369, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.606139898300171, "loss/fcd": 1.2578125, "loss/idx": 7.5, "loss/logits": 0.16449995338916779, "step": 1791 }, { "epoch": 0.026758449742046754, "grad_norm": 0.322265625, "grad_norm_var": 0.07278618812561036, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.622813105583191, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.20424649119377136, "step": 1792 }, { "epoch": 0.026773381912661733, "grad_norm": 0.3359375, "grad_norm_var": 0.07245025634765626, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 2.721924066543579, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.20560920238494873, "step": 1793 }, { "epoch": 0.026788314083276716, "grad_norm": 0.306640625, "grad_norm_var": 0.07266640663146973, "learning_rate": 0.0001, "loss": 1.6546, "loss/crossentropy": 2.4652721881866455, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.22881120443344116, "step": 1794 }, { "epoch": 0.0268032462538917, "grad_norm": 0.47265625, "grad_norm_var": 0.07232863108317057, "learning_rate": 0.0001, "loss": 1.8245, "loss/crossentropy": 2.977765440940857, "loss/fcd": 1.5546875, "loss/idx": 7.5, "loss/logits": 0.26981621235609055, "step": 1795 }, { "epoch": 0.026818178424506678, "grad_norm": 0.2734375, "grad_norm_var": 0.07337314287821452, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.5598191022872925, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.1760355606675148, "step": 1796 }, { "epoch": 0.02683311059512166, "grad_norm": 0.3125, "grad_norm_var": 0.01275645891825358, "learning_rate": 0.0001, "loss": 1.5425, "loss/crossentropy": 2.583548426628113, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.19874022156000137, "step": 1797 }, { "epoch": 0.02684804276573664, "grad_norm": 0.25390625, "grad_norm_var": 0.013292042414347331, "learning_rate": 0.0001, "loss": 1.3822, "loss/crossentropy": 2.473318338394165, "loss/fcd": 1.22265625, "loss/idx": 7.5, "loss/logits": 0.15951504558324814, "step": 1798 }, { "epoch": 0.026862974936351623, "grad_norm": 0.31640625, "grad_norm_var": 0.013270060221354166, "learning_rate": 0.0001, "loss": 1.6083, "loss/crossentropy": 2.5721927881240845, "loss/fcd": 1.3984375, "loss/idx": 7.5, "loss/logits": 0.2098878100514412, "step": 1799 }, { "epoch": 0.026877907106966605, "grad_norm": 0.33984375, "grad_norm_var": 0.013217782974243164, "learning_rate": 0.0001, "loss": 1.5284, "loss/crossentropy": 2.5014588832855225, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.1963837966322899, "step": 1800 }, { "epoch": 0.026892839277581584, "grad_norm": 0.3046875, "grad_norm_var": 0.012865304946899414, "learning_rate": 0.0001, "loss": 1.5965, "loss/crossentropy": 2.6525572538375854, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.22152644395828247, "step": 1801 }, { "epoch": 0.026907771448196567, "grad_norm": 0.27734375, "grad_norm_var": 0.002473815282185872, "learning_rate": 0.0001, "loss": 1.5907, "loss/crossentropy": 2.556833505630493, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.207870252430439, "step": 1802 }, { "epoch": 0.02692270361881155, "grad_norm": 0.302734375, "grad_norm_var": 0.0024252414703369142, "learning_rate": 0.0001, "loss": 1.4564, "loss/crossentropy": 2.721066951751709, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.17513567954301834, "step": 1803 }, { "epoch": 0.02693763578942653, "grad_norm": 0.294921875, "grad_norm_var": 0.0023961226145426434, "learning_rate": 0.0001, "loss": 1.5316, "loss/crossentropy": 2.4543330669403076, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.1878231167793274, "step": 1804 }, { "epoch": 0.026952567960041512, "grad_norm": 0.55078125, "grad_norm_var": 0.0057528177897135414, "learning_rate": 0.0001, "loss": 2.158, "loss/crossentropy": 2.319103717803955, "loss/fcd": 1.8203125, "loss/idx": 7.5, "loss/logits": 0.3376483768224716, "step": 1805 }, { "epoch": 0.026967500130656494, "grad_norm": 0.265625, "grad_norm_var": 0.005951420466105143, "learning_rate": 0.0001, "loss": 1.3878, "loss/crossentropy": 2.5267797708511353, "loss/fcd": 1.21875, "loss/idx": 7.5, "loss/logits": 0.16907892376184464, "step": 1806 }, { "epoch": 0.026982432301271474, "grad_norm": 0.353515625, "grad_norm_var": 0.005959812800089518, "learning_rate": 0.0001, "loss": 1.6583, "loss/crossentropy": 2.6328471899032593, "loss/fcd": 1.41796875, "loss/idx": 7.5, "loss/logits": 0.24038030207157135, "step": 1807 }, { "epoch": 0.026997364471886456, "grad_norm": 0.314453125, "grad_norm_var": 0.005971892674763998, "learning_rate": 0.0001, "loss": 1.4136, "loss/crossentropy": 2.696038246154785, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.17528624087572098, "step": 1808 }, { "epoch": 0.027012296642501436, "grad_norm": 0.337890625, "grad_norm_var": 0.00597375233968099, "learning_rate": 0.0001, "loss": 1.517, "loss/crossentropy": 2.461454749107361, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.19280918687582016, "step": 1809 }, { "epoch": 0.02702722881311642, "grad_norm": 0.271484375, "grad_norm_var": 0.006159718831380208, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.67915940284729, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.17423474788665771, "step": 1810 }, { "epoch": 0.0270421609837314, "grad_norm": 0.30859375, "grad_norm_var": 0.004669698079427084, "learning_rate": 0.0001, "loss": 1.3907, "loss/crossentropy": 2.724575161933899, "loss/fcd": 1.22265625, "loss/idx": 7.5, "loss/logits": 0.16803188621997833, "step": 1811 }, { "epoch": 0.02705709315434638, "grad_norm": 0.4296875, "grad_norm_var": 0.005280049641927084, "learning_rate": 0.0001, "loss": 1.9115, "loss/crossentropy": 2.6169649362564087, "loss/fcd": 1.625, "loss/idx": 7.5, "loss/logits": 0.28652864694595337, "step": 1812 }, { "epoch": 0.027072025324961363, "grad_norm": 0.296875, "grad_norm_var": 0.005325826009114584, "learning_rate": 0.0001, "loss": 1.5192, "loss/crossentropy": 2.7209300994873047, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.19105875492095947, "step": 1813 }, { "epoch": 0.027086957495576346, "grad_norm": 0.9140625, "grad_norm_var": 0.026202837626139324, "learning_rate": 0.0001, "loss": 1.7825, "loss/crossentropy": 2.6711992025375366, "loss/fcd": 1.5078125, "loss/idx": 7.5, "loss/logits": 0.2746375799179077, "step": 1814 }, { "epoch": 0.027101889666191325, "grad_norm": 0.357421875, "grad_norm_var": 0.026028935114542642, "learning_rate": 0.0001, "loss": 1.6117, "loss/crossentropy": 2.5210726261138916, "loss/fcd": 1.3984375, "loss/idx": 7.5, "loss/logits": 0.21324608474969864, "step": 1815 }, { "epoch": 0.027116821836806308, "grad_norm": 0.3828125, "grad_norm_var": 0.02597158749898275, "learning_rate": 0.0001, "loss": 1.5266, "loss/crossentropy": 2.500070095062256, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.1906418353319168, "step": 1816 }, { "epoch": 0.02713175400742129, "grad_norm": 0.302734375, "grad_norm_var": 0.025989532470703125, "learning_rate": 0.0001, "loss": 1.5851, "loss/crossentropy": 2.55086088180542, "loss/fcd": 1.39453125, "loss/idx": 7.5, "loss/logits": 0.19061411917209625, "step": 1817 }, { "epoch": 0.02714668617803627, "grad_norm": 0.31640625, "grad_norm_var": 0.0255889892578125, "learning_rate": 0.0001, "loss": 1.6096, "loss/crossentropy": 2.4247604608535767, "loss/fcd": 1.390625, "loss/idx": 7.5, "loss/logits": 0.21899522095918655, "step": 1818 }, { "epoch": 0.027161618348651252, "grad_norm": 0.2734375, "grad_norm_var": 0.0259249210357666, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.612613081932068, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.16881398856639862, "step": 1819 }, { "epoch": 0.02717655051926623, "grad_norm": 0.291015625, "grad_norm_var": 0.025966628392537435, "learning_rate": 0.0001, "loss": 1.4047, "loss/crossentropy": 2.3704020977020264, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.166450597345829, "step": 1820 }, { "epoch": 0.027191482689881214, "grad_norm": 0.365234375, "grad_norm_var": 0.02371826171875, "learning_rate": 0.0001, "loss": 1.5727, "loss/crossentropy": 2.353190541267395, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.1898830384016037, "step": 1821 }, { "epoch": 0.027206414860496197, "grad_norm": 0.283203125, "grad_norm_var": 0.023513269424438477, "learning_rate": 0.0001, "loss": 1.4686, "loss/crossentropy": 2.5452089309692383, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.19126763939857483, "step": 1822 }, { "epoch": 0.027221347031111176, "grad_norm": 1.3671875, "grad_norm_var": 0.086529541015625, "learning_rate": 0.0001, "loss": 1.8668, "loss/crossentropy": 3.4751185178756714, "loss/fcd": 1.6484375, "loss/idx": 7.5, "loss/logits": 0.21837294846773148, "step": 1823 }, { "epoch": 0.02723627920172616, "grad_norm": 0.30859375, "grad_norm_var": 0.08661866188049316, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.515225887298584, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.1766873598098755, "step": 1824 }, { "epoch": 0.02725121137234114, "grad_norm": 0.296875, "grad_norm_var": 0.08720245361328124, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.6789597272872925, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.16845793277025223, "step": 1825 }, { "epoch": 0.02726614354295612, "grad_norm": 0.333984375, "grad_norm_var": 0.08618520100911459, "learning_rate": 0.0001, "loss": 1.3844, "loss/crossentropy": 2.8141207695007324, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.1539573296904564, "step": 1826 }, { "epoch": 0.027281075713571103, "grad_norm": 0.357421875, "grad_norm_var": 0.08556491533915202, "learning_rate": 0.0001, "loss": 1.6948, "loss/crossentropy": 2.623816967010498, "loss/fcd": 1.47265625, "loss/idx": 7.5, "loss/logits": 0.22218744456768036, "step": 1827 }, { "epoch": 0.027296007884186086, "grad_norm": 0.306640625, "grad_norm_var": 0.08651320139567058, "learning_rate": 0.0001, "loss": 1.4101, "loss/crossentropy": 2.72926127910614, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.17186040431261063, "step": 1828 }, { "epoch": 0.027310940054801065, "grad_norm": 0.263671875, "grad_norm_var": 0.08713657061258952, "learning_rate": 0.0001, "loss": 1.4489, "loss/crossentropy": 2.571180582046509, "loss/fcd": 1.26953125, "loss/idx": 7.5, "loss/logits": 0.17934715747833252, "step": 1829 }, { "epoch": 0.027325872225416048, "grad_norm": 0.294921875, "grad_norm_var": 0.07031275431315104, "learning_rate": 0.0001, "loss": 1.5624, "loss/crossentropy": 2.6993069648742676, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.20297697931528091, "step": 1830 }, { "epoch": 0.02734080439603103, "grad_norm": 0.65234375, "grad_norm_var": 0.07480810483296713, "learning_rate": 0.0001, "loss": 2.2399, "loss/crossentropy": 2.70441734790802, "loss/fcd": 1.86328125, "loss/idx": 7.5, "loss/logits": 0.3765818625688553, "step": 1831 }, { "epoch": 0.02735573656664601, "grad_norm": 0.294921875, "grad_norm_var": 0.0754897435506185, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 2.6646634340286255, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.18530730158090591, "step": 1832 }, { "epoch": 0.027370668737260993, "grad_norm": 0.29296875, "grad_norm_var": 0.07561491330464681, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.571444511413574, "loss/fcd": 1.25, "loss/idx": 7.5, "loss/logits": 0.17866399139165878, "step": 1833 }, { "epoch": 0.027385600907875972, "grad_norm": 0.31640625, "grad_norm_var": 0.07561491330464681, "learning_rate": 0.0001, "loss": 1.6586, "loss/crossentropy": 2.553916811943054, "loss/fcd": 1.46484375, "loss/idx": 7.5, "loss/logits": 0.19379810988903046, "step": 1834 }, { "epoch": 0.027400533078490955, "grad_norm": 0.29296875, "grad_norm_var": 0.07532563209533691, "learning_rate": 0.0001, "loss": 1.4102, "loss/crossentropy": 2.4229013919830322, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1679960861802101, "step": 1835 }, { "epoch": 0.027415465249105937, "grad_norm": 0.328125, "grad_norm_var": 0.07489770253499349, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.3115060329437256, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.17078139632940292, "step": 1836 }, { "epoch": 0.027430397419720916, "grad_norm": 0.279296875, "grad_norm_var": 0.07572574615478515, "learning_rate": 0.0001, "loss": 1.517, "loss/crossentropy": 2.46151602268219, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.19276980310678482, "step": 1837 }, { "epoch": 0.0274453295903359, "grad_norm": 0.271484375, "grad_norm_var": 0.07590408325195312, "learning_rate": 0.0001, "loss": 1.4096, "loss/crossentropy": 2.487010359764099, "loss/fcd": 1.24609375, "loss/idx": 7.5, "loss/logits": 0.16347003728151321, "step": 1838 }, { "epoch": 0.027460261760950882, "grad_norm": 0.310546875, "grad_norm_var": 0.008170048395792643, "learning_rate": 0.0001, "loss": 1.4457, "loss/crossentropy": 2.558194160461426, "loss/fcd": 1.26953125, "loss/idx": 7.5, "loss/logits": 0.1762079894542694, "step": 1839 }, { "epoch": 0.02747519393156586, "grad_norm": 0.283203125, "grad_norm_var": 0.008266131083170572, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.67186176776886, "loss/fcd": 1.37109375, "loss/idx": 7.5, "loss/logits": 0.20449146628379822, "step": 1840 }, { "epoch": 0.027490126102180844, "grad_norm": 0.302734375, "grad_norm_var": 0.00824748675028483, "learning_rate": 0.0001, "loss": 1.5622, "loss/crossentropy": 2.7830512523651123, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.20284771919250488, "step": 1841 }, { "epoch": 0.027505058272795826, "grad_norm": 0.251953125, "grad_norm_var": 0.00855724016825358, "learning_rate": 0.0001, "loss": 1.4079, "loss/crossentropy": 2.5222907066345215, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.16962562501430511, "step": 1842 }, { "epoch": 0.027519990443410806, "grad_norm": 0.328125, "grad_norm_var": 0.008459726969401041, "learning_rate": 0.0001, "loss": 1.6206, "loss/crossentropy": 2.350256323814392, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.21821796149015427, "step": 1843 }, { "epoch": 0.02753492261402579, "grad_norm": 0.359375, "grad_norm_var": 0.008561436335245769, "learning_rate": 0.0001, "loss": 1.7676, "loss/crossentropy": 2.657059669494629, "loss/fcd": 1.53125, "loss/idx": 7.5, "loss/logits": 0.23639147728681564, "step": 1844 }, { "epoch": 0.027549854784640768, "grad_norm": 0.330078125, "grad_norm_var": 0.008336623509724935, "learning_rate": 0.0001, "loss": 1.5589, "loss/crossentropy": 2.825732111930847, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.20341219007968903, "step": 1845 }, { "epoch": 0.02756478695525575, "grad_norm": 0.369140625, "grad_norm_var": 0.008389774958292644, "learning_rate": 0.0001, "loss": 1.7813, "loss/crossentropy": 2.3357163667678833, "loss/fcd": 1.546875, "loss/idx": 7.5, "loss/logits": 0.2344117909669876, "step": 1846 }, { "epoch": 0.027579719125870733, "grad_norm": 0.326171875, "grad_norm_var": 0.0009760538736979167, "learning_rate": 0.0001, "loss": 1.6673, "loss/crossentropy": 2.734041929244995, "loss/fcd": 1.44921875, "loss/idx": 7.5, "loss/logits": 0.2180958315730095, "step": 1847 }, { "epoch": 0.027594651296485712, "grad_norm": 0.3046875, "grad_norm_var": 0.000964212417602539, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.763646125793457, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.16384299844503403, "step": 1848 }, { "epoch": 0.027609583467100695, "grad_norm": 0.267578125, "grad_norm_var": 0.0010594685872395834, "learning_rate": 0.0001, "loss": 1.5765, "loss/crossentropy": 2.650328516960144, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.2092689573764801, "step": 1849 }, { "epoch": 0.027624515637715678, "grad_norm": 0.306640625, "grad_norm_var": 0.0010539849599202475, "learning_rate": 0.0001, "loss": 1.6693, "loss/crossentropy": 2.6345717906951904, "loss/fcd": 1.421875, "loss/idx": 7.5, "loss/logits": 0.24741078913211823, "step": 1850 }, { "epoch": 0.027639447808330657, "grad_norm": 0.271484375, "grad_norm_var": 0.001123046875, "learning_rate": 0.0001, "loss": 1.3875, "loss/crossentropy": 2.402801752090454, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.15703191608190536, "step": 1851 }, { "epoch": 0.02765437997894564, "grad_norm": 0.296875, "grad_norm_var": 0.0010904947916666667, "learning_rate": 0.0001, "loss": 1.5771, "loss/crossentropy": 2.4666026830673218, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.21776925027370453, "step": 1852 }, { "epoch": 0.027669312149560622, "grad_norm": 0.322265625, "grad_norm_var": 0.0010660171508789062, "learning_rate": 0.0001, "loss": 1.6152, "loss/crossentropy": 2.6620728969573975, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.21284056454896927, "step": 1853 }, { "epoch": 0.0276842443201756, "grad_norm": 0.29296875, "grad_norm_var": 0.00099485715230306, "learning_rate": 0.0001, "loss": 1.5247, "loss/crossentropy": 2.656482458114624, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.19264397770166397, "step": 1854 }, { "epoch": 0.027699176490790584, "grad_norm": 0.30859375, "grad_norm_var": 0.0009943644205729167, "learning_rate": 0.0001, "loss": 1.3658, "loss/crossentropy": 2.710697650909424, "loss/fcd": 1.19921875, "loss/idx": 7.5, "loss/logits": 0.16661225259304047, "step": 1855 }, { "epoch": 0.027714108661405567, "grad_norm": 0.30078125, "grad_norm_var": 0.0009564558664957682, "learning_rate": 0.0001, "loss": 1.5226, "loss/crossentropy": 2.712221622467041, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.1944480687379837, "step": 1856 }, { "epoch": 0.027729040832020546, "grad_norm": 0.4140625, "grad_norm_var": 0.0016422907511393229, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.5411465167999268, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.2180405557155609, "step": 1857 }, { "epoch": 0.02774397300263553, "grad_norm": 0.263671875, "grad_norm_var": 0.0015513102213541667, "learning_rate": 0.0001, "loss": 1.4363, "loss/crossentropy": 2.5804349184036255, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.17066682875156403, "step": 1858 }, { "epoch": 0.027758905173250508, "grad_norm": 0.298828125, "grad_norm_var": 0.0015591780344645183, "learning_rate": 0.0001, "loss": 1.4123, "loss/crossentropy": 2.710708737373352, "loss/fcd": 1.234375, "loss/idx": 7.5, "loss/logits": 0.17788437753915787, "step": 1859 }, { "epoch": 0.02777383734386549, "grad_norm": 0.369140625, "grad_norm_var": 0.0016234715779622395, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.003714084625244, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.16729778051376343, "step": 1860 }, { "epoch": 0.027788769514480473, "grad_norm": 0.302734375, "grad_norm_var": 0.00161590576171875, "learning_rate": 0.0001, "loss": 1.577, "loss/crossentropy": 2.538116216659546, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.2019786238670349, "step": 1861 }, { "epoch": 0.027803701685095453, "grad_norm": 0.287109375, "grad_norm_var": 0.0014276504516601562, "learning_rate": 0.0001, "loss": 1.4848, "loss/crossentropy": 2.833705425262451, "loss/fcd": 1.29296875, "loss/idx": 7.5, "loss/logits": 0.19186238199472427, "step": 1862 }, { "epoch": 0.027818633855710435, "grad_norm": 0.32421875, "grad_norm_var": 0.0014232476552327475, "learning_rate": 0.0001, "loss": 1.6606, "loss/crossentropy": 2.7050453424453735, "loss/fcd": 1.4453125, "loss/idx": 7.5, "loss/logits": 0.21526382863521576, "step": 1863 }, { "epoch": 0.027833566026325418, "grad_norm": 0.283203125, "grad_norm_var": 0.001462237040201823, "learning_rate": 0.0001, "loss": 1.4428, "loss/crossentropy": 2.618694305419922, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.17713554948568344, "step": 1864 }, { "epoch": 0.027848498196940397, "grad_norm": 0.298828125, "grad_norm_var": 0.0013594945271809896, "learning_rate": 0.0001, "loss": 1.418, "loss/crossentropy": 2.6274200677871704, "loss/fcd": 1.25, "loss/idx": 7.5, "loss/logits": 0.16798733174800873, "step": 1865 }, { "epoch": 0.02786343036755538, "grad_norm": 0.3359375, "grad_norm_var": 0.0014045556386311849, "learning_rate": 0.0001, "loss": 1.6525, "loss/crossentropy": 2.531873106956482, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.2267523929476738, "step": 1866 }, { "epoch": 0.027878362538170363, "grad_norm": 0.294921875, "grad_norm_var": 0.0013164361317952475, "learning_rate": 0.0001, "loss": 1.6755, "loss/crossentropy": 2.2504754066467285, "loss/fcd": 1.45703125, "loss/idx": 7.5, "loss/logits": 0.21843808144330978, "step": 1867 }, { "epoch": 0.027893294708785342, "grad_norm": 0.310546875, "grad_norm_var": 0.0013003031412760417, "learning_rate": 0.0001, "loss": 1.5554, "loss/crossentropy": 2.297727584838867, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.18825874477624893, "step": 1868 }, { "epoch": 0.027908226879400325, "grad_norm": 0.265625, "grad_norm_var": 0.0014307498931884766, "learning_rate": 0.0001, "loss": 1.3449, "loss/crossentropy": 2.520071268081665, "loss/fcd": 1.19140625, "loss/idx": 7.5, "loss/logits": 0.15346477925777435, "step": 1869 }, { "epoch": 0.027923159050015304, "grad_norm": 0.265625, "grad_norm_var": 0.0015375614166259766, "learning_rate": 0.0001, "loss": 1.4415, "loss/crossentropy": 2.5880095958709717, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.17981453239917755, "step": 1870 }, { "epoch": 0.027938091220630287, "grad_norm": 0.28125, "grad_norm_var": 0.0015811761220296224, "learning_rate": 0.0001, "loss": 1.5538, "loss/crossentropy": 2.5557409524917603, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.210035502910614, "step": 1871 }, { "epoch": 0.02795302339124527, "grad_norm": 0.27734375, "grad_norm_var": 0.001631911595662435, "learning_rate": 0.0001, "loss": 1.3297, "loss/crossentropy": 2.4933377504348755, "loss/fcd": 1.18359375, "loss/idx": 7.5, "loss/logits": 0.14611776173114777, "step": 1872 }, { "epoch": 0.02796795556186025, "grad_norm": 0.427734375, "grad_norm_var": 0.0018431981404622396, "learning_rate": 0.0001, "loss": 1.6354, "loss/crossentropy": 2.6224533319473267, "loss/fcd": 1.421875, "loss/idx": 7.5, "loss/logits": 0.21356236189603806, "step": 1873 }, { "epoch": 0.02798288773247523, "grad_norm": 0.2734375, "grad_norm_var": 0.0017947991689046224, "learning_rate": 0.0001, "loss": 1.5447, "loss/crossentropy": 2.4397172927856445, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.2009110450744629, "step": 1874 }, { "epoch": 0.027997819903090214, "grad_norm": 0.30859375, "grad_norm_var": 0.0017913818359375, "learning_rate": 0.0001, "loss": 1.5605, "loss/crossentropy": 2.469580054283142, "loss/fcd": 1.36328125, "loss/idx": 7.5, "loss/logits": 0.19724100083112717, "step": 1875 }, { "epoch": 0.028012752073705193, "grad_norm": 0.302734375, "grad_norm_var": 0.0015136082967122397, "learning_rate": 0.0001, "loss": 1.5029, "loss/crossentropy": 2.7592979669570923, "loss/fcd": 1.30078125, "loss/idx": 7.5, "loss/logits": 0.20207630097866058, "step": 1876 }, { "epoch": 0.028027684244320176, "grad_norm": 0.2578125, "grad_norm_var": 0.0016382694244384765, "learning_rate": 0.0001, "loss": 1.4717, "loss/crossentropy": 2.464821934700012, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.18653883039951324, "step": 1877 }, { "epoch": 0.02804261641493516, "grad_norm": 0.330078125, "grad_norm_var": 0.0016816298166910807, "learning_rate": 0.0001, "loss": 1.7049, "loss/crossentropy": 2.4362906217575073, "loss/fcd": 1.47265625, "loss/idx": 7.5, "loss/logits": 0.23222223669290543, "step": 1878 }, { "epoch": 0.028057548585550138, "grad_norm": 0.2734375, "grad_norm_var": 0.0016948541005452475, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.5615785121917725, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.1710590422153473, "step": 1879 }, { "epoch": 0.02807248075616512, "grad_norm": 0.275390625, "grad_norm_var": 0.0017153263092041016, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.6645255088806152, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.17376437038183212, "step": 1880 }, { "epoch": 0.0280874129267801, "grad_norm": 0.298828125, "grad_norm_var": 0.0017153263092041016, "learning_rate": 0.0001, "loss": 1.5619, "loss/crossentropy": 2.543599843978882, "loss/fcd": 1.36328125, "loss/idx": 7.5, "loss/logits": 0.1986057609319687, "step": 1881 }, { "epoch": 0.028102345097395082, "grad_norm": 0.294921875, "grad_norm_var": 0.0016168594360351563, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.642424702644348, "loss/fcd": 1.33203125, "loss/idx": 7.5, "loss/logits": 0.19652438908815384, "step": 1882 }, { "epoch": 0.028117277268010065, "grad_norm": 0.318359375, "grad_norm_var": 0.0016473770141601563, "learning_rate": 0.0001, "loss": 1.5587, "loss/crossentropy": 2.63770067691803, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.19933994114398956, "step": 1883 }, { "epoch": 0.028132209438625044, "grad_norm": 0.310546875, "grad_norm_var": 0.0016473770141601563, "learning_rate": 0.0001, "loss": 1.5717, "loss/crossentropy": 2.5321085453033447, "loss/fcd": 1.38671875, "loss/idx": 7.5, "loss/logits": 0.18496868759393692, "step": 1884 }, { "epoch": 0.028147141609240027, "grad_norm": 0.283203125, "grad_norm_var": 0.0015917301177978515, "learning_rate": 0.0001, "loss": 1.4883, "loss/crossentropy": 2.5472614765167236, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.17972340434789658, "step": 1885 }, { "epoch": 0.02816207377985501, "grad_norm": 0.30859375, "grad_norm_var": 0.0015175978342692057, "learning_rate": 0.0001, "loss": 1.6146, "loss/crossentropy": 2.5076574087142944, "loss/fcd": 1.3828125, "loss/idx": 7.5, "loss/logits": 0.2317841500043869, "step": 1886 }, { "epoch": 0.02817700595046999, "grad_norm": 0.322265625, "grad_norm_var": 0.0015125910441080729, "learning_rate": 0.0001, "loss": 1.6261, "loss/crossentropy": 2.5463353395462036, "loss/fcd": 1.40625, "loss/idx": 7.5, "loss/logits": 0.21981056034564972, "step": 1887 }, { "epoch": 0.02819193812108497, "grad_norm": 0.314453125, "grad_norm_var": 0.001466989517211914, "learning_rate": 0.0001, "loss": 1.5593, "loss/crossentropy": 2.4339152574539185, "loss/fcd": 1.36328125, "loss/idx": 7.5, "loss/logits": 0.19599353522062302, "step": 1888 }, { "epoch": 0.028206870291699954, "grad_norm": 0.373046875, "grad_norm_var": 0.0007682641347249349, "learning_rate": 0.0001, "loss": 1.6727, "loss/crossentropy": 2.5966817140579224, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.24687369167804718, "step": 1889 }, { "epoch": 0.028221802462314934, "grad_norm": 0.392578125, "grad_norm_var": 0.0011880874633789062, "learning_rate": 0.0001, "loss": 1.503, "loss/crossentropy": 2.6890329122543335, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.1788247525691986, "step": 1890 }, { "epoch": 0.028236734632929916, "grad_norm": 0.259765625, "grad_norm_var": 0.0013482252756754558, "learning_rate": 0.0001, "loss": 1.4354, "loss/crossentropy": 2.4243786334991455, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.16975828260183334, "step": 1891 }, { "epoch": 0.0282516668035449, "grad_norm": 0.30078125, "grad_norm_var": 0.001349639892578125, "learning_rate": 0.0001, "loss": 1.499, "loss/crossentropy": 2.6386592388153076, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.1942928582429886, "step": 1892 }, { "epoch": 0.028266598974159878, "grad_norm": 0.326171875, "grad_norm_var": 0.0011922041575113931, "learning_rate": 0.0001, "loss": 1.6796, "loss/crossentropy": 2.382668614387512, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.21862833201885223, "step": 1893 }, { "epoch": 0.02828153114477486, "grad_norm": 0.291015625, "grad_norm_var": 0.0011902968088785806, "learning_rate": 0.0001, "loss": 1.4194, "loss/crossentropy": 2.457250952720642, "loss/fcd": 1.2578125, "loss/idx": 7.5, "loss/logits": 0.16157541424036026, "step": 1894 }, { "epoch": 0.02829646331538984, "grad_norm": 0.251953125, "grad_norm_var": 0.0013209025065104166, "learning_rate": 0.0001, "loss": 1.3678, "loss/crossentropy": 2.478179931640625, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.16465666145086288, "step": 1895 }, { "epoch": 0.028311395486004823, "grad_norm": 0.2412109375, "grad_norm_var": 0.0015407840410868326, "learning_rate": 0.0001, "loss": 1.5487, "loss/crossentropy": 2.4185166358947754, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.2049560472369194, "step": 1896 }, { "epoch": 0.028326327656619806, "grad_norm": 0.322265625, "grad_norm_var": 0.0015543262163798014, "learning_rate": 0.0001, "loss": 1.5515, "loss/crossentropy": 2.4423259496688843, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.1999405100941658, "step": 1897 }, { "epoch": 0.028341259827234785, "grad_norm": 0.291015625, "grad_norm_var": 0.0015615423520406087, "learning_rate": 0.0001, "loss": 1.6507, "loss/crossentropy": 2.385899782180786, "loss/fcd": 1.44921875, "loss/idx": 7.5, "loss/logits": 0.20146413892507553, "step": 1898 }, { "epoch": 0.028356191997849767, "grad_norm": 0.296875, "grad_norm_var": 0.0015569965044657389, "learning_rate": 0.0001, "loss": 1.4639, "loss/crossentropy": 2.6253907680511475, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.17485372722148895, "step": 1899 }, { "epoch": 0.02837112416846475, "grad_norm": 0.314453125, "grad_norm_var": 0.001560652256011963, "learning_rate": 0.0001, "loss": 1.7989, "loss/crossentropy": 2.2645241022109985, "loss/fcd": 1.5234375, "loss/idx": 7.5, "loss/logits": 0.275456503033638, "step": 1900 }, { "epoch": 0.02838605633907973, "grad_norm": 0.2734375, "grad_norm_var": 0.0015957792599995931, "learning_rate": 0.0001, "loss": 1.4147, "loss/crossentropy": 2.492580533027649, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.160808227956295, "step": 1901 }, { "epoch": 0.028400988509694712, "grad_norm": 0.28125, "grad_norm_var": 0.0016293803850809733, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.5176279544830322, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.1665561944246292, "step": 1902 }, { "epoch": 0.028415920680309695, "grad_norm": 0.30078125, "grad_norm_var": 0.001603853702545166, "learning_rate": 0.0001, "loss": 1.4826, "loss/crossentropy": 2.77282452583313, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.18571852147579193, "step": 1903 }, { "epoch": 0.028430852850924674, "grad_norm": 0.2265625, "grad_norm_var": 0.0019400238990783692, "learning_rate": 0.0001, "loss": 1.3425, "loss/crossentropy": 2.3292288780212402, "loss/fcd": 1.18359375, "loss/idx": 7.5, "loss/logits": 0.15895090252161026, "step": 1904 }, { "epoch": 0.028445785021539657, "grad_norm": 0.255859375, "grad_norm_var": 0.0016014695167541504, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.572773575782776, "loss/fcd": 1.21484375, "loss/idx": 7.5, "loss/logits": 0.16017340123653412, "step": 1905 }, { "epoch": 0.028460717192154636, "grad_norm": 0.29296875, "grad_norm_var": 0.0008475899696350098, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.3504263162612915, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.16164566576480865, "step": 1906 }, { "epoch": 0.02847564936276962, "grad_norm": 0.318359375, "grad_norm_var": 0.0008814454078674317, "learning_rate": 0.0001, "loss": 1.6815, "loss/crossentropy": 2.7879390716552734, "loss/fcd": 1.4375, "loss/idx": 7.5, "loss/logits": 0.2440466284751892, "step": 1907 }, { "epoch": 0.0284905815333846, "grad_norm": 0.515625, "grad_norm_var": 0.004173688093821208, "learning_rate": 0.0001, "loss": 1.8784, "loss/crossentropy": 2.7805620431900024, "loss/fcd": 1.53125, "loss/idx": 7.5, "loss/logits": 0.34715893864631653, "step": 1908 }, { "epoch": 0.02850551370399958, "grad_norm": 0.275390625, "grad_norm_var": 0.004157570997873942, "learning_rate": 0.0001, "loss": 1.5323, "loss/crossentropy": 2.494640588760376, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.20422004163265228, "step": 1909 }, { "epoch": 0.028520445874614563, "grad_norm": 0.376953125, "grad_norm_var": 0.004552710056304932, "learning_rate": 0.0001, "loss": 1.602, "loss/crossentropy": 2.690505266189575, "loss/fcd": 1.3984375, "loss/idx": 7.5, "loss/logits": 0.20359589159488678, "step": 1910 }, { "epoch": 0.028535378045229546, "grad_norm": 0.34765625, "grad_norm_var": 0.0044841726620992025, "learning_rate": 0.0001, "loss": 1.4909, "loss/crossentropy": 2.656151294708252, "loss/fcd": 1.31640625, "loss/idx": 7.5, "loss/logits": 0.17448563873767853, "step": 1911 }, { "epoch": 0.028550310215844525, "grad_norm": 0.302734375, "grad_norm_var": 0.004171498616536458, "learning_rate": 0.0001, "loss": 1.5417, "loss/crossentropy": 2.5565932989120483, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.19017393887043, "step": 1912 }, { "epoch": 0.028565242386459508, "grad_norm": 0.2890625, "grad_norm_var": 0.004195006688435873, "learning_rate": 0.0001, "loss": 1.3229, "loss/crossentropy": 2.700557827949524, "loss/fcd": 1.17578125, "loss/idx": 7.5, "loss/logits": 0.14714757353067398, "step": 1913 }, { "epoch": 0.02858017455707449, "grad_norm": 0.3359375, "grad_norm_var": 0.004207801818847656, "learning_rate": 0.0001, "loss": 1.507, "loss/crossentropy": 2.620209574699402, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.18275465071201324, "step": 1914 }, { "epoch": 0.02859510672768947, "grad_norm": 0.435546875, "grad_norm_var": 0.00511625607808431, "learning_rate": 0.0001, "loss": 1.716, "loss/crossentropy": 2.38981294631958, "loss/fcd": 1.51171875, "loss/idx": 7.5, "loss/logits": 0.2043071985244751, "step": 1915 }, { "epoch": 0.028610038898304452, "grad_norm": 0.296875, "grad_norm_var": 0.005151875813802083, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.3913618326187134, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.17914563417434692, "step": 1916 }, { "epoch": 0.028624971068919435, "grad_norm": 0.2890625, "grad_norm_var": 0.0050694783528645836, "learning_rate": 0.0001, "loss": 1.4389, "loss/crossentropy": 2.6311800479888916, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.1772254854440689, "step": 1917 }, { "epoch": 0.028639903239534414, "grad_norm": 0.26953125, "grad_norm_var": 0.005140622456868489, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.5183615684509277, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.1736455112695694, "step": 1918 }, { "epoch": 0.028654835410149397, "grad_norm": 0.33984375, "grad_norm_var": 0.005132993062337239, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.4683438539505005, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.1911604180932045, "step": 1919 }, { "epoch": 0.028669767580764376, "grad_norm": 0.33203125, "grad_norm_var": 0.004472096761067708, "learning_rate": 0.0001, "loss": 1.6351, "loss/crossentropy": 2.519033670425415, "loss/fcd": 1.41015625, "loss/idx": 7.5, "loss/logits": 0.2249896377325058, "step": 1920 }, { "epoch": 0.02868469975137936, "grad_norm": 0.2734375, "grad_norm_var": 0.004318602879842122, "learning_rate": 0.0001, "loss": 1.4548, "loss/crossentropy": 2.7004140615463257, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.17741407454013824, "step": 1921 }, { "epoch": 0.028699631921994342, "grad_norm": 0.291015625, "grad_norm_var": 0.004328664143880208, "learning_rate": 0.0001, "loss": 1.5395, "loss/crossentropy": 2.0922393798828125, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.18795033544301987, "step": 1922 }, { "epoch": 0.02871456409260932, "grad_norm": 0.326171875, "grad_norm_var": 0.00431976318359375, "learning_rate": 0.0001, "loss": 1.6011, "loss/crossentropy": 2.483629822731018, "loss/fcd": 1.40234375, "loss/idx": 7.5, "loss/logits": 0.1987830325961113, "step": 1923 }, { "epoch": 0.028729496263224304, "grad_norm": 0.2451171875, "grad_norm_var": 0.0022361397743225096, "learning_rate": 0.0001, "loss": 1.5739, "loss/crossentropy": 2.404491424560547, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.19889789074659348, "step": 1924 }, { "epoch": 0.028744428433839286, "grad_norm": 0.310546875, "grad_norm_var": 0.0021317124366760252, "learning_rate": 0.0001, "loss": 1.4945, "loss/crossentropy": 2.7174516916275024, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.1820102334022522, "step": 1925 }, { "epoch": 0.028759360604454266, "grad_norm": 0.287109375, "grad_norm_var": 0.0019101738929748536, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.6432723999023438, "loss/fcd": 1.265625, "loss/idx": 7.5, "loss/logits": 0.17692308872938156, "step": 1926 }, { "epoch": 0.02877429277506925, "grad_norm": 0.296875, "grad_norm_var": 0.0018213232358296712, "learning_rate": 0.0001, "loss": 1.6536, "loss/crossentropy": 2.570428252220154, "loss/fcd": 1.43359375, "loss/idx": 7.5, "loss/logits": 0.22002588212490082, "step": 1927 }, { "epoch": 0.02878922494568423, "grad_norm": 0.359375, "grad_norm_var": 0.001985418796539307, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.6276506185531616, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.1739986315369606, "step": 1928 }, { "epoch": 0.02880415711629921, "grad_norm": 0.267578125, "grad_norm_var": 0.002077384789784749, "learning_rate": 0.0001, "loss": 1.434, "loss/crossentropy": 2.4807995557785034, "loss/fcd": 1.25, "loss/idx": 7.5, "loss/logits": 0.18403884023427963, "step": 1929 }, { "epoch": 0.028819089286914193, "grad_norm": 0.2890625, "grad_norm_var": 0.0020510633786519367, "learning_rate": 0.0001, "loss": 1.5259, "loss/crossentropy": 2.31441867351532, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.18998070061206818, "step": 1930 }, { "epoch": 0.028834021457529172, "grad_norm": 0.29296875, "grad_norm_var": 0.0008745153745015462, "learning_rate": 0.0001, "loss": 1.7168, "loss/crossentropy": 2.4237349033355713, "loss/fcd": 1.46484375, "loss/idx": 7.5, "loss/logits": 0.251992866396904, "step": 1931 }, { "epoch": 0.028848953628144155, "grad_norm": 0.283203125, "grad_norm_var": 0.0008880893389383951, "learning_rate": 0.0001, "loss": 1.4073, "loss/crossentropy": 2.7782613039016724, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.17678172141313553, "step": 1932 }, { "epoch": 0.028863885798759138, "grad_norm": 0.28515625, "grad_norm_var": 0.0008932073911031087, "learning_rate": 0.0001, "loss": 1.5001, "loss/crossentropy": 2.5270785093307495, "loss/fcd": 1.3125, "loss/idx": 7.5, "loss/logits": 0.18764495104551315, "step": 1933 }, { "epoch": 0.028878817969374117, "grad_norm": 0.322265625, "grad_norm_var": 0.0008751829465230306, "learning_rate": 0.0001, "loss": 1.5552, "loss/crossentropy": 2.506539225578308, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.20360128581523895, "step": 1934 }, { "epoch": 0.0288937501399891, "grad_norm": 0.306640625, "grad_norm_var": 0.0007681806882222493, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.6506450176239014, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.20314672589302063, "step": 1935 }, { "epoch": 0.028908682310604082, "grad_norm": 0.279296875, "grad_norm_var": 0.0007029493649800618, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 2.7774420976638794, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.19157731533050537, "step": 1936 }, { "epoch": 0.02892361448121906, "grad_norm": 0.3828125, "grad_norm_var": 0.001139986515045166, "learning_rate": 0.0001, "loss": 2.0609, "loss/crossentropy": 2.687682032585144, "loss/fcd": 1.69140625, "loss/idx": 7.5, "loss/logits": 0.3694523721933365, "step": 1937 }, { "epoch": 0.028938546651834044, "grad_norm": 0.34375, "grad_norm_var": 0.0012395501136779786, "learning_rate": 0.0001, "loss": 1.6571, "loss/crossentropy": 2.470700263977051, "loss/fcd": 1.44921875, "loss/idx": 7.5, "loss/logits": 0.20788709819316864, "step": 1938 }, { "epoch": 0.028953478822449027, "grad_norm": 0.330078125, "grad_norm_var": 0.0012515981992085774, "learning_rate": 0.0001, "loss": 1.5023, "loss/crossentropy": 2.6560239791870117, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.17812135815620422, "step": 1939 }, { "epoch": 0.028968410993064006, "grad_norm": 0.30078125, "grad_norm_var": 0.0009999593098958333, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.846352458000183, "loss/fcd": 1.28515625, "loss/idx": 7.5, "loss/logits": 0.18193431943655014, "step": 1940 }, { "epoch": 0.02898334316367899, "grad_norm": 0.29296875, "grad_norm_var": 0.00101469357808431, "learning_rate": 0.0001, "loss": 1.5437, "loss/crossentropy": 2.766597867012024, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.2155774012207985, "step": 1941 }, { "epoch": 0.028998275334293968, "grad_norm": 0.2890625, "grad_norm_var": 0.0010096232096354167, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.692010760307312, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.21035537868738174, "step": 1942 }, { "epoch": 0.02901320750490895, "grad_norm": 0.29296875, "grad_norm_var": 0.0010161717732747396, "learning_rate": 0.0001, "loss": 1.5556, "loss/crossentropy": 2.5992730855941772, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.1961953192949295, "step": 1943 }, { "epoch": 0.029028139675523933, "grad_norm": 0.33203125, "grad_norm_var": 0.0008733113606770833, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.6337616443634033, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.19485964626073837, "step": 1944 }, { "epoch": 0.029043071846138913, "grad_norm": 0.255859375, "grad_norm_var": 0.0009414037068684896, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.5774269104003906, "loss/fcd": 1.25390625, "loss/idx": 7.5, "loss/logits": 0.16705790162086487, "step": 1945 }, { "epoch": 0.029058004016753895, "grad_norm": 0.27734375, "grad_norm_var": 0.0009747823079427083, "learning_rate": 0.0001, "loss": 1.5458, "loss/crossentropy": 2.6940470933914185, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.21767432987689972, "step": 1946 }, { "epoch": 0.029072936187368878, "grad_norm": 0.353515625, "grad_norm_var": 0.0011132399241129557, "learning_rate": 0.0001, "loss": 1.5958, "loss/crossentropy": 2.9539268016815186, "loss/fcd": 1.375, "loss/idx": 7.5, "loss/logits": 0.22076428681612015, "step": 1947 }, { "epoch": 0.029087868357983857, "grad_norm": 0.265625, "grad_norm_var": 0.001190630594889323, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.6203267574310303, "loss/fcd": 1.2265625, "loss/idx": 7.5, "loss/logits": 0.16960347443819046, "step": 1948 }, { "epoch": 0.02910280052859884, "grad_norm": 0.298828125, "grad_norm_var": 0.00116270383199056, "learning_rate": 0.0001, "loss": 1.4679, "loss/crossentropy": 2.611543297767639, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.17885488271713257, "step": 1949 }, { "epoch": 0.029117732699213823, "grad_norm": 0.255859375, "grad_norm_var": 0.0013096968332926432, "learning_rate": 0.0001, "loss": 1.5832, "loss/crossentropy": 2.619404435157776, "loss/fcd": 1.359375, "loss/idx": 7.5, "loss/logits": 0.22386223822832108, "step": 1950 }, { "epoch": 0.029132664869828802, "grad_norm": 0.275390625, "grad_norm_var": 0.00135801633199056, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.743823528289795, "loss/fcd": 1.23828125, "loss/idx": 7.5, "loss/logits": 0.16652023047208786, "step": 1951 }, { "epoch": 0.029147597040443785, "grad_norm": 0.326171875, "grad_norm_var": 0.0013557275136311848, "learning_rate": 0.0001, "loss": 1.7229, "loss/crossentropy": 2.8115499019622803, "loss/fcd": 1.46875, "loss/idx": 7.5, "loss/logits": 0.2541176527738571, "step": 1952 }, { "epoch": 0.029162529211058767, "grad_norm": 0.294921875, "grad_norm_var": 0.0009215672810872396, "learning_rate": 0.0001, "loss": 1.5088, "loss/crossentropy": 2.8607544898986816, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.18454967439174652, "step": 1953 }, { "epoch": 0.029177461381673746, "grad_norm": 0.3125, "grad_norm_var": 0.0007964452107747396, "learning_rate": 0.0001, "loss": 1.5189, "loss/crossentropy": 2.498580813407898, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.1829310953617096, "step": 1954 }, { "epoch": 0.02919239355228873, "grad_norm": 0.31640625, "grad_norm_var": 0.0007480462392171224, "learning_rate": 0.0001, "loss": 1.4689, "loss/crossentropy": 2.5287530422210693, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.17985684424638748, "step": 1955 }, { "epoch": 0.02920732572290371, "grad_norm": 0.390625, "grad_norm_var": 0.0013066450754801431, "learning_rate": 0.0001, "loss": 1.6387, "loss/crossentropy": 2.7909224033355713, "loss/fcd": 1.44140625, "loss/idx": 7.5, "loss/logits": 0.19726381450891495, "step": 1956 }, { "epoch": 0.02922225789351869, "grad_norm": 0.296875, "grad_norm_var": 0.001302957534790039, "learning_rate": 0.0001, "loss": 1.5289, "loss/crossentropy": 2.6084100008010864, "loss/fcd": 1.32421875, "loss/idx": 7.5, "loss/logits": 0.20470361411571503, "step": 1957 }, { "epoch": 0.029237190064133674, "grad_norm": 0.314453125, "grad_norm_var": 0.0012990315755208334, "learning_rate": 0.0001, "loss": 1.7085, "loss/crossentropy": 2.459324359893799, "loss/fcd": 1.4609375, "loss/idx": 7.5, "loss/logits": 0.24756821244955063, "step": 1958 }, { "epoch": 0.029252122234748653, "grad_norm": 0.28125, "grad_norm_var": 0.0013243993123372396, "learning_rate": 0.0001, "loss": 1.3924, "loss/crossentropy": 2.470743775367737, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.16190864145755768, "step": 1959 }, { "epoch": 0.029267054405363636, "grad_norm": 0.306640625, "grad_norm_var": 0.001266336441040039, "learning_rate": 0.0001, "loss": 1.6331, "loss/crossentropy": 2.5596195459365845, "loss/fcd": 1.39453125, "loss/idx": 7.5, "loss/logits": 0.2385719120502472, "step": 1960 }, { "epoch": 0.02928198657597862, "grad_norm": 0.30078125, "grad_norm_var": 0.0011197408040364584, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.8690768480300903, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.21691838651895523, "step": 1961 }, { "epoch": 0.029296918746593598, "grad_norm": 0.28125, "grad_norm_var": 0.001106707255045573, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.466285824775696, "loss/fcd": 1.21484375, "loss/idx": 7.5, "loss/logits": 0.16250278055667877, "step": 1962 }, { "epoch": 0.02931185091720858, "grad_norm": 0.365234375, "grad_norm_var": 0.0011919657389322916, "learning_rate": 0.0001, "loss": 1.7176, "loss/crossentropy": 2.648834228515625, "loss/fcd": 1.4765625, "loss/idx": 7.5, "loss/logits": 0.24099178612232208, "step": 1963 }, { "epoch": 0.029326783087823563, "grad_norm": 0.296875, "grad_norm_var": 0.0010882059733072916, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.532212018966675, "loss/fcd": 1.2109375, "loss/idx": 7.5, "loss/logits": 0.15824826806783676, "step": 1964 }, { "epoch": 0.029341715258438542, "grad_norm": 0.3203125, "grad_norm_var": 0.001093276341756185, "learning_rate": 0.0001, "loss": 1.6864, "loss/crossentropy": 2.49002468585968, "loss/fcd": 1.42578125, "loss/idx": 7.5, "loss/logits": 0.26066526770591736, "step": 1965 }, { "epoch": 0.029356647429053525, "grad_norm": 0.28515625, "grad_norm_var": 0.0009414037068684896, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.5018492937088013, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.1980273500084877, "step": 1966 }, { "epoch": 0.029371579599668504, "grad_norm": 0.287109375, "grad_norm_var": 0.0008954366048177083, "learning_rate": 0.0001, "loss": 1.434, "loss/crossentropy": 2.5343021154403687, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.17231228947639465, "step": 1967 }, { "epoch": 0.029386511770283487, "grad_norm": 0.37109375, "grad_norm_var": 0.001112222671508789, "learning_rate": 0.0001, "loss": 1.6817, "loss/crossentropy": 2.352415919303894, "loss/fcd": 1.453125, "loss/idx": 7.5, "loss/logits": 0.22859769314527512, "step": 1968 }, { "epoch": 0.02940144394089847, "grad_norm": 0.30859375, "grad_norm_var": 0.0010894139607747395, "learning_rate": 0.0001, "loss": 1.4004, "loss/crossentropy": 2.6096965074539185, "loss/fcd": 1.234375, "loss/idx": 7.5, "loss/logits": 0.16599421203136444, "step": 1969 }, { "epoch": 0.02941637611151345, "grad_norm": 0.26171875, "grad_norm_var": 0.0012654622395833334, "learning_rate": 0.0001, "loss": 1.3373, "loss/crossentropy": 2.629533290863037, "loss/fcd": 1.18359375, "loss/idx": 7.5, "loss/logits": 0.15375210344791412, "step": 1970 }, { "epoch": 0.02943130828212843, "grad_norm": 0.27734375, "grad_norm_var": 0.0013353983561197917, "learning_rate": 0.0001, "loss": 1.2961, "loss/crossentropy": 2.737032413482666, "loss/fcd": 1.14453125, "loss/idx": 7.5, "loss/logits": 0.15156958997249603, "step": 1971 }, { "epoch": 0.029446240452743414, "grad_norm": 0.33203125, "grad_norm_var": 0.0009129206339518229, "learning_rate": 0.0001, "loss": 1.4493, "loss/crossentropy": 2.472022771835327, "loss/fcd": 1.2890625, "loss/idx": 7.5, "loss/logits": 0.16024889796972275, "step": 1972 }, { "epoch": 0.029461172623358393, "grad_norm": 0.2890625, "grad_norm_var": 0.0009256362915039063, "learning_rate": 0.0001, "loss": 1.5485, "loss/crossentropy": 2.567961573600769, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.21256595849990845, "step": 1973 }, { "epoch": 0.029476104793973376, "grad_norm": 0.287109375, "grad_norm_var": 0.000937652587890625, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.444976806640625, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.1844826564192772, "step": 1974 }, { "epoch": 0.02949103696458836, "grad_norm": 0.25390625, "grad_norm_var": 0.0010644912719726563, "learning_rate": 0.0001, "loss": 1.4066, "loss/crossentropy": 2.696893095970154, "loss/fcd": 1.234375, "loss/idx": 7.5, "loss/logits": 0.17225909233093262, "step": 1975 }, { "epoch": 0.029505969135203338, "grad_norm": 0.298828125, "grad_norm_var": 0.0010629653930664062, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.5711617469787598, "loss/fcd": 1.19921875, "loss/idx": 7.5, "loss/logits": 0.15751300007104874, "step": 1976 }, { "epoch": 0.02952090130581832, "grad_norm": 0.28125, "grad_norm_var": 0.0010874430338541667, "learning_rate": 0.0001, "loss": 1.4822, "loss/crossentropy": 2.373708963394165, "loss/fcd": 1.296875, "loss/idx": 7.5, "loss/logits": 0.18534115701913834, "step": 1977 }, { "epoch": 0.029535833476433303, "grad_norm": 0.302734375, "grad_norm_var": 0.0010631402333577474, "learning_rate": 0.0001, "loss": 1.7471, "loss/crossentropy": 2.311202645301819, "loss/fcd": 1.49609375, "loss/idx": 7.5, "loss/logits": 0.2510295584797859, "step": 1978 }, { "epoch": 0.029550765647048283, "grad_norm": 0.8828125, "grad_norm_var": 0.022228749593098958, "learning_rate": 0.0001, "loss": 1.7238, "loss/crossentropy": 2.7768548727035522, "loss/fcd": 1.48828125, "loss/idx": 7.5, "loss/logits": 0.2354755625128746, "step": 1979 }, { "epoch": 0.029565697817663265, "grad_norm": 0.328125, "grad_norm_var": 0.022137196858723958, "learning_rate": 0.0001, "loss": 1.5399, "loss/crossentropy": 2.6792709827423096, "loss/fcd": 1.3359375, "loss/idx": 7.5, "loss/logits": 0.20392261445522308, "step": 1980 }, { "epoch": 0.029580629988278245, "grad_norm": 0.2490234375, "grad_norm_var": 0.02259870767593384, "learning_rate": 0.0001, "loss": 1.3753, "loss/crossentropy": 2.567617893218994, "loss/fcd": 1.203125, "loss/idx": 7.5, "loss/logits": 0.17222069203853607, "step": 1981 }, { "epoch": 0.029595562158893227, "grad_norm": 0.3515625, "grad_norm_var": 0.022468467553456623, "learning_rate": 0.0001, "loss": 1.6633, "loss/crossentropy": 2.656588912010193, "loss/fcd": 1.4453125, "loss/idx": 7.5, "loss/logits": 0.2179877832531929, "step": 1982 }, { "epoch": 0.02961049432950821, "grad_norm": 0.28125, "grad_norm_var": 0.022508140405019125, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.546592593193054, "loss/fcd": 1.28125, "loss/idx": 7.5, "loss/logits": 0.18279610574245453, "step": 1983 }, { "epoch": 0.02962542650012319, "grad_norm": 0.3203125, "grad_norm_var": 0.022423422336578368, "learning_rate": 0.0001, "loss": 1.6634, "loss/crossentropy": 2.407332420349121, "loss/fcd": 1.453125, "loss/idx": 7.5, "loss/logits": 0.21023399382829666, "step": 1984 }, { "epoch": 0.029640358670738172, "grad_norm": 0.298828125, "grad_norm_var": 0.022459344069163004, "learning_rate": 0.0001, "loss": 1.4897, "loss/crossentropy": 2.2389989495277405, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.18500903248786926, "step": 1985 }, { "epoch": 0.029655290841353155, "grad_norm": 0.287109375, "grad_norm_var": 0.02226511240005493, "learning_rate": 0.0001, "loss": 1.5513, "loss/crossentropy": 2.5988144874572754, "loss/fcd": 1.35546875, "loss/idx": 7.5, "loss/logits": 0.1958768144249916, "step": 1986 }, { "epoch": 0.029670223011968134, "grad_norm": 0.263671875, "grad_norm_var": 0.02237748702367147, "learning_rate": 0.0001, "loss": 1.5422, "loss/crossentropy": 2.69570255279541, "loss/fcd": 1.34375, "loss/idx": 7.5, "loss/logits": 0.1984579861164093, "step": 1987 }, { "epoch": 0.029685155182583117, "grad_norm": 0.314453125, "grad_norm_var": 0.02239608367284139, "learning_rate": 0.0001, "loss": 1.5841, "loss/crossentropy": 2.6110461950302124, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.20521266758441925, "step": 1988 }, { "epoch": 0.0297000873531981, "grad_norm": 0.31640625, "grad_norm_var": 0.022291274865468343, "learning_rate": 0.0001, "loss": 1.3848, "loss/crossentropy": 2.859120726585388, "loss/fcd": 1.23046875, "loss/idx": 7.5, "loss/logits": 0.15436360239982605, "step": 1989 }, { "epoch": 0.02971501952381308, "grad_norm": 0.31640625, "grad_norm_var": 0.022168250878651936, "learning_rate": 0.0001, "loss": 1.5489, "loss/crossentropy": 2.4848233461380005, "loss/fcd": 1.3515625, "loss/idx": 7.5, "loss/logits": 0.1973266825079918, "step": 1990 }, { "epoch": 0.02972995169442806, "grad_norm": 0.26953125, "grad_norm_var": 0.02201629877090454, "learning_rate": 0.0001, "loss": 1.311, "loss/crossentropy": 2.53887939453125, "loss/fcd": 1.16015625, "loss/idx": 7.5, "loss/logits": 0.15087512135505676, "step": 1991 }, { "epoch": 0.02974488386504304, "grad_norm": 0.32421875, "grad_norm_var": 0.021933646996816, "learning_rate": 0.0001, "loss": 1.4663, "loss/crossentropy": 2.592678189277649, "loss/fcd": 1.27734375, "loss/idx": 7.5, "loss/logits": 0.188975490629673, "step": 1992 }, { "epoch": 0.029759816035658023, "grad_norm": 0.306640625, "grad_norm_var": 0.02178611358006795, "learning_rate": 0.0001, "loss": 1.5695, "loss/crossentropy": 2.6848541498184204, "loss/fcd": 1.37890625, "loss/idx": 7.5, "loss/logits": 0.1906232088804245, "step": 1993 }, { "epoch": 0.029774748206273006, "grad_norm": 0.291015625, "grad_norm_var": 0.021850295861562095, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.8219099044799805, "loss/fcd": 1.26171875, "loss/idx": 7.5, "loss/logits": 0.17891719192266464, "step": 1994 }, { "epoch": 0.029789680376887985, "grad_norm": 0.3203125, "grad_norm_var": 0.0007336576779683431, "learning_rate": 0.0001, "loss": 1.5743, "loss/crossentropy": 2.4911776781082153, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.20711027085781097, "step": 1995 }, { "epoch": 0.029804612547502968, "grad_norm": 0.31640625, "grad_norm_var": 0.0007020910580952962, "learning_rate": 0.0001, "loss": 1.566, "loss/crossentropy": 2.567915916442871, "loss/fcd": 1.3671875, "loss/idx": 7.5, "loss/logits": 0.1987716257572174, "step": 1996 }, { "epoch": 0.02981954471811795, "grad_norm": 0.287109375, "grad_norm_var": 0.0005252679189046224, "learning_rate": 0.0001, "loss": 1.4267, "loss/crossentropy": 2.614750385284424, "loss/fcd": 1.2421875, "loss/idx": 7.5, "loss/logits": 0.1844882145524025, "step": 1997 }, { "epoch": 0.02983447688873293, "grad_norm": 0.33984375, "grad_norm_var": 0.0004596551259358724, "learning_rate": 0.0001, "loss": 1.4933, "loss/crossentropy": 2.769694447517395, "loss/fcd": 1.3046875, "loss/idx": 7.5, "loss/logits": 0.18861094117164612, "step": 1998 }, { "epoch": 0.029849409059347912, "grad_norm": 0.27734375, "grad_norm_var": 0.0004721164703369141, "learning_rate": 0.0001, "loss": 1.5203, "loss/crossentropy": 2.5993359088897705, "loss/fcd": 1.328125, "loss/idx": 7.5, "loss/logits": 0.19222304224967957, "step": 1999 }, { "epoch": 0.029864341229962895, "grad_norm": 0.26171875, "grad_norm_var": 0.000552225112915039, "learning_rate": 0.0001, "loss": 1.5111, "loss/crossentropy": 2.7430869340896606, "loss/fcd": 1.30859375, "loss/idx": 7.5, "loss/logits": 0.2025001049041748, "step": 2000 } ], "logging_steps": 1, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.80565014069248e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }